From b9bb8960b88d4c3e4da349ba3e6df6b0c1fb07f1 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 5 Apr 2024 16:55:55 -0700 Subject: [PATCH 001/120] KVM: x86/pmu: Do not mask LVTPC when handling a PMI on AMD platforms commit 49ff3b4aec51e3abfc9369997cc603319b02af9a upstream. On AMD and Hygon platforms, the local APIC does not automatically set the mask bit of the LVTPC register when handling a PMI and there is no need to clear it in the kernel's PMI handler. For guests, the mask bit is currently set by kvm_apic_local_deliver() and unless it is cleared by the guest kernel's PMI handler, PMIs stop arriving and break use-cases like sampling with perf record. This does not affect non-PerfMonV2 guests because PMIs are handled in the guest kernel by x86_pmu_handle_irq() which always clears the LVTPC mask bit irrespective of the vendor. Before: $ perf record -e cycles:u true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.001 MB perf.data (1 samples) ] After: $ perf record -e cycles:u true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.002 MB perf.data (19 samples) ] Fixes: a16eb25b09c0 ("KVM: x86: Mask LVTPC when handling a PMI") Cc: stable@vger.kernel.org Signed-off-by: Sandipan Das Reviewed-by: Jim Mattson [sean: use is_intel_compatible instead of !is_amd_or_hygon()] Signed-off-by: Sean Christopherson Message-ID: <20240405235603.1173076-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c706c97f1dd32..dcb4ed5f4afba 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2425,7 +2425,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); - if (r && lvt_type == APIC_LVTPC) + if (r && lvt_type == APIC_LVTPC && + guest_cpuid_is_intel_compatible(apic->vcpu)) kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); return r; } From 06477fab1ee2a3e62d0d205af369a4e31cee7c4d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Apr 2024 16:55:54 -0700 Subject: [PATCH 002/120] KVM: x86: Snapshot if a vCPU's vendor model is AMD vs. Intel compatible commit fd706c9b1674e2858766bfbf7430534c2b26fbef upstream. Add kvm_vcpu_arch.is_amd_compatible to cache if a vCPU's vendor model is compatible with AMD, i.e. if the vCPU vendor is AMD or Hygon, along with helpers to check if a vCPU is compatible AMD vs. Intel. To handle Intel vs. AMD behavior related to masking the LVTPC entry, KVM will need to check for vendor compatibility on every PMI injection, i.e. querying for AMD will soon be a moderately hot path. Note! This subtly (or maybe not-so-subtly) makes "Intel compatible" KVM's default behavior, both if userspace omits (or never sets) CPUID 0x0 and if userspace sets a completely unknown vendor. One could argue that KVM should treat such vCPUs as not being compatible with Intel *or* AMD, but that would add useless complexity to KVM. KVM needs to do *something* in the face of vendor specific behavior, and so unless KVM conjured up a magic third option, choosing to treat unknown vendors as neither Intel nor AMD means that checks on AMD compatibility would yield Intel behavior, and checks for Intel compatibility would yield AMD behavior. And that's far worse as it would effectively yield random behavior depending on whether KVM checked for AMD vs. Intel vs. !AMD vs. !Intel. And practically speaking, all x86 CPUs follow either Intel or AMD architecture, i.e. "supporting" an unknown third architecture adds no value. Deliberately don't convert any of the existing guest_cpuid_is_intel() checks, as the Intel side of things is messier due to some flows explicitly checking for exactly vendor==Intel, versus some flows assuming anything that isn't "AMD compatible" gets Intel behavior. The Intel code will be cleaned up in the future. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20240405235603.1173076-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/cpuid.h | 10 ++++++++++ arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/x86.c | 2 +- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5f51700b938ba..63a6e4b614baa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -737,6 +737,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; u32 kvm_cpuid_base; + bool is_amd_compatible; u64 reserved_gpa_bits; int maxphyaddr; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9e6225bab6d9a..17278119c2e64 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -243,6 +243,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_update_pv_runtime(vcpu); + vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 8a770b481d9de..a4c4d48f4407e 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -123,6 +123,16 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } +static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.is_amd_compatible; +} + +static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu) +{ + return !guest_cpuid_is_amd_compatible(vcpu); +} + static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index be6d70ddc0b46..ef56cc4f76017 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4351,7 +4351,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->root_level, is_efer_nx(context), guest_can_use_gbpages(vcpu), is_cr4_pse(context), - guest_cpuid_is_amd_or_hygon(vcpu)); + guest_cpuid_is_amd_compatible(vcpu)); } static void diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 593fb5f80aa8c..d4abaf543f3e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3161,7 +3161,7 @@ static void kvmclock_sync_fn(struct work_struct *work) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd_or_hygon(vcpu)) + if (guest_cpuid_is_amd_compatible(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; From bbe3e8647d4f6a99b004f64146294a9d2fe89e89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 29 Sep 2021 15:24:26 -0700 Subject: [PATCH 003/120] KVM: x86: Manually retrieve CPUID.0x1 when getting FMS for RESET/INIT commit 25b9784586a41f1fccc4d2cf7f210252b9df149c upstream. Manually look for a CPUID.0x1 entry instead of bouncing through kvm_cpuid() when retrieving the Family-Model-Stepping information for vCPU RESET/INIT. This fixes a potential undefined behavior bug due to kvm_cpuid() using the uninitialized "dummy" param as the ECX _input_, a.k.a. the index. A more minimal fix would be to simply zero "dummy", but the extra work in kvm_cpuid() is wasteful, and KVM should be treating the FMS retrieval as an out-of-band access, e.g. same as how KVM computes guest.MAXPHYADDR. Both Intel's SDM and AMD's APM describe the RDX value at RESET/INIT as holding the CPU's FMS information, not as holding CPUID.0x1.EAX. KVM's usage of CPUID entries to get FMS is simply a pragmatic approach to avoid having yet another way for userspace to provide inconsistent data. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Message-Id: <20210929222426.1855730-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d4abaf543f3e6..c52f361e6a781 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11380,9 +11380,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + struct kvm_cpuid_entry2 *cpuid_0x1; unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long new_cr0; - u32 eax, dummy; /* * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's @@ -11460,13 +11460,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) * if no CPUID match is found. Note, it's impossible to get a match at * RESET since KVM emulates RESET before exposing the vCPU to userspace, - * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. - * But, go through the motions in case that's ever remedied. + * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry + * on RESET. But, go through the motions in case that's ever remedied. */ - eax = 1; - if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) - eax = 0x600; - kvm_rdx_write(vcpu, eax); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); static_call(kvm_x86_vcpu_reset)(vcpu, init_event); From 15650b8a8798550cad276f120a19d0239d9cca68 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 19 Oct 2021 01:12:39 -0700 Subject: [PATCH 004/120] KVM: vPMU: Fill get_msr MSR_CORE_PERF_GLOBAL_OVF_CTRL w/ 0 commit 540c7abe61cc5e81a3d17fe02bce94f6cbf9dce0 upstream. SDM section 18.2.3 mentioned that: "IA32_PERF_GLOBAL_OVF_CTL MSR allows software to clear overflow indicator(s) of any general-purpose or fixed-function counters via a single WRMSR." It is R/W mentioned by SDM, we read this msr on bare-metal during perf testing, the value is always 0 for ICX/SKX boxes on hands. Let's fill get_msr MSR_CORE_PERF_GLOBAL_OVF_CTRL w/ 0 as hardware behavior and drop global_ovf_ctrl variable. Tested-by: Like Xu Signed-off-by: Wanpeng Li Message-Id: <1634631160-67276-2-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx/pmu_intel.c | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 63a6e4b614baa..463cd56b7df0a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -505,7 +505,6 @@ struct kvm_pmu { u64 fixed_ctr_ctrl_mask; u64 global_ctrl; u64 global_status; - u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; u64 global_ovf_ctrl_mask; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 46b1e630eb3cb..c33a8cf6684bf 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -369,7 +369,7 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = pmu->global_ctrl; return 0; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - msr_info->data = pmu->global_ovf_ctrl; + msr_info->data = 0; return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || @@ -428,7 +428,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & pmu->global_ovf_ctrl_mask)) { if (!msr_info->host_initiated) pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; return 0; } break; @@ -602,8 +601,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmc->counter = 0; } - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; intel_pmu_release_guest_lbr_event(vcpu); } From 065bdc24feaa86bf34bcb91de812914e580e49c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:31 +0000 Subject: [PATCH 005/120] KVM: x86: Drop current_vcpu for kvm_running_vcpu + kvm_arch_vcpu variable commit 73cd107b9685c5308e864061772e4a78a629e4a0 upstream. Use the generic kvm_running_vcpu plus a new 'handling_intr_from_guest' variable in kvm_arch_vcpu instead of the semi-redundant current_vcpu. kvm_before/after_interrupt() must be called while the vCPU is loaded, (which protects against preemption), thus kvm_running_vcpu is guaranteed to be non-NULL when handling_intr_from_guest is non-zero. Switching to kvm_get_running_vcpu() will allows moving KVM's perf callbacks to generic code, and the new flag will be used in a future patch to more precisely identify the "NMI from guest" case. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-11-seanjc@google.com Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/x86.c | 21 ++++++++++++--------- arch/x86/kvm/x86.h | 10 ++++++---- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 463cd56b7df0a..22cf1770d97d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -785,6 +785,7 @@ struct kvm_vcpu_arch { unsigned nmi_pending; /* NMI queued after currently running handler */ bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ + u8 handling_intr_from_guest; struct kvm_mtrr mtrr_state; u64 pat; @@ -1922,8 +1923,6 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); -unsigned int kvm_guest_state(void); - void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b4e56b2556e2d..b904c21b71d14 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -87,7 +87,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, * woken up. So we should wake it, but this is impossible from * NMI context. Do it from irq work instead. */ - if (!kvm_guest_state()) + if (!kvm_handling_nmi_from_guest(pmc->vcpu)) irq_work_queue(&pmc_to_pmu(pmc)->irq_work); else kvm_make_request(KVM_REQ_PMI, pmc->vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c52f361e6a781..3b2ad24097c43 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8821,15 +8821,17 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); +static inline bool kvm_pmi_in_guest(struct kvm_vcpu *vcpu) +{ + return vcpu && vcpu->arch.handling_intr_from_guest; +} -unsigned int kvm_guest_state(void) +static unsigned int kvm_guest_state(void) { - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); unsigned int state; - if (!vcpu) + if (!kvm_pmi_in_guest(vcpu)) return 0; state = PERF_GUEST_ACTIVE; @@ -8841,9 +8843,10 @@ unsigned int kvm_guest_state(void) static unsigned long kvm_guest_get_ip(void) { - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - if (WARN_ON_ONCE(!vcpu)) + /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ + if (WARN_ON_ONCE(!kvm_pmi_in_guest(vcpu))) return 0; return kvm_rip_read(vcpu); @@ -8851,10 +8854,10 @@ static unsigned long kvm_guest_get_ip(void) static unsigned int kvm_handle_intel_pt_intr(void) { - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); /* '0' on failure so that the !PT case can use a RET0 static call. */ - if (!vcpu) + if (!kvm_pmi_in_guest(vcpu)) return 0; kvm_make_request(KVM_REQ_PMI, vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4873b043944b6..50e38c7e4436e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -389,8 +389,6 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } -DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); - static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) { return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; @@ -398,14 +396,18 @@ static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) { - __this_cpu_write(current_vcpu, vcpu); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 1); } static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { - __this_cpu_write(current_vcpu, NULL); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); } +static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu) +{ + return !!vcpu->arch.handling_intr_from_guest; +} static inline bool kvm_pat_valid(u64 data) { From 0baf69eceb78d67465560ac83fe471fd8dcf68e0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:33 +0000 Subject: [PATCH 006/120] KVM: Move x86's perf guest info callbacks to generic KVM commit e1bfc24577cc65c95dc519d7621a9c985b97e567 upstream. Move x86's perf guest callbacks into common KVM, as they are semantically identical to arm64's callbacks (the only other such KVM callbacks). arm64 will convert to the common versions in a future patch. Implement the necessary arm64 arch hooks now to avoid having to provide stubs or a temporary #define (from x86) to avoid arm64 compilation errors when CONFIG_GUEST_PERF_EVENTS=y. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20211111020738.2512932-13-seanjc@google.com Signed-off-by: PvsNarasimha --- arch/arm64/include/asm/kvm_host.h | 10 ++++++ arch/arm64/kvm/arm.c | 5 +++ arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/x86.c | 53 +++++++------------------------ include/linux/kvm_host.h | 10 ++++++ virt/kvm/kvm_main.c | 44 +++++++++++++++++++++++++ 6 files changed, 83 insertions(+), 42 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b1cc8111bc6db..5e96175ef9fb2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -765,6 +765,16 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); void kvm_perf_init(void); void kvm_perf_teardown(void); +/* + * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event, + * arrived in guest context. For arm64, any event that arrives while a vCPU is + * loaded is considered to be "in guest". + */ +static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu) +{ + return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu; +} + long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); void kvm_update_stolen_time(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e6e8afe256022..587236240fd55 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -505,6 +505,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return vcpu_mode_priv(vcpu); } +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return *vcpu_pc(vcpu); +} + /* Just ensure a guest exit from a particular CPU */ static void exit_vm_noop(void *info) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 22cf1770d97d2..89bffa51743ad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1591,6 +1591,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) return -ENOTSUPP; } +#define kvm_arch_pmi_in_guest(vcpu) \ + ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + void __init kvm_mmu_x86_module_init(void); int kvm_mmu_vendor_module_init(void); void kvm_mmu_vendor_module_exit(void); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b2ad24097c43..3e38d352cd029 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8821,43 +8821,12 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -static inline bool kvm_pmi_in_guest(struct kvm_vcpu *vcpu) -{ - return vcpu && vcpu->arch.handling_intr_from_guest; -} - -static unsigned int kvm_guest_state(void) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - unsigned int state; - - if (!kvm_pmi_in_guest(vcpu)) - return 0; - - state = PERF_GUEST_ACTIVE; - if (static_call(kvm_x86_get_cpl)(vcpu)) - state |= PERF_GUEST_USER; - - return state; -} - -static unsigned long kvm_guest_get_ip(void) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - - /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ - if (WARN_ON_ONCE(!kvm_pmi_in_guest(vcpu))) - return 0; - - return kvm_rip_read(vcpu); -} - static unsigned int kvm_handle_intel_pt_intr(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); /* '0' on failure so that the !PT case can use a RET0 static call. */ - if (!kvm_pmi_in_guest(vcpu)) + if (!kvm_arch_pmi_in_guest(vcpu)) return 0; kvm_make_request(KVM_REQ_PMI, vcpu); @@ -8866,12 +8835,6 @@ static unsigned int kvm_handle_intel_pt_intr(void) return 1; } -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .state = kvm_guest_state, - .get_ip = kvm_guest_get_ip, - .handle_intel_pt_intr = NULL, -}; - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -11644,9 +11607,11 @@ int kvm_arch_hardware_setup(void *opaque) memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); kvm_ops_static_call_update(); + /* Temporary ugliness. */ if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) - kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr; - perf_register_guest_info_callbacks(&kvm_guest_cbs); + kvm_register_perf_callbacks(kvm_handle_intel_pt_intr); + else + kvm_register_perf_callbacks(NULL); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; @@ -11675,8 +11640,7 @@ int kvm_arch_hardware_setup(void *opaque) void kvm_arch_hardware_unsetup(void) { - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - kvm_guest_cbs.handle_intel_pt_intr = NULL; + kvm_unregister_perf_callbacks(); static_call(kvm_x86_hardware_unsetup)(); } @@ -12310,6 +12274,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return vcpu->arch.preempted_in_kernel; } +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return kvm_rip_read(vcpu); +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e2423ffaf593..a45e535e832bf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1264,6 +1264,16 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) } #endif +#ifdef CONFIG_GUEST_PERF_EVENTS +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void kvm_unregister_perf_callbacks(void); +#else +static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_unregister_perf_callbacks(void) {} +#endif /* CONFIG_GUEST_PERF_EVENTS */ + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2a5a924b32d5a..a113bcc4c9c60 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5611,6 +5611,50 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) return &kvm_running_vcpu; } +#ifdef CONFIG_GUEST_PERF_EVENTS +static unsigned int kvm_guest_state(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + unsigned int state; + + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; + + state = PERF_GUEST_ACTIVE; + if (!kvm_arch_vcpu_in_kernel(vcpu)) + state |= PERF_GUEST_USER; + + return state; +} + +static unsigned long kvm_guest_get_ip(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ + if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu))) + return 0; + + return kvm_arch_vcpu_get_ip(vcpu); +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .state = kvm_guest_state, + .get_ip = kvm_guest_get_ip, + .handle_intel_pt_intr = NULL, +}; + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) +{ + kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; + perf_register_guest_info_callbacks(&kvm_guest_cbs); +} +void kvm_unregister_perf_callbacks(void) +{ + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); +} +#endif + struct kvm_cpu_compat_check { void *opaque; int *ret; From 689f6d115024dd2e39c5c5b188873aaf203b9855 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 17 Nov 2021 16:03:04 +0800 Subject: [PATCH 007/120] KVM: x86/svm: Add module param to control PMU virtualization commit b1d66dad65dcc8a6e5942db27027a086aa4f5c16 upstream. For Intel, the guest PMU can be disabled via clearing the PMU CPUID. For AMD, all hw implementations support the base set of four performance counters, with current mainstream hardware indicating the presence of two additional counters via X86_FEATURE_PERFCTR_CORE. In the virtualized world, the AMD guest driver may detect the presence of at least one counter MSR. Most hypervisor vendors would introduce a module param (like lbrv for svm) to disable PMU for all guests. Another control proposal per-VM is to pass PMU disable information via MSR_IA32_PERF_CAPABILITIES or one bit in CPUID Fn4000_00[FF:00]. Both of methods require some guest-side changes, so a module parameter may not be sufficiently granular, but practical enough. Signed-off-by: Like Xu Message-Id: <20211117080304.38989-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm/pmu.c | 4 ++++ arch/x86/kvm/svm/svm.c | 11 +++++++++++ arch/x86/kvm/svm/svm.h | 1 + 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 17278119c2e64..aeef6dd59fca3 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -542,7 +542,7 @@ void kvm_set_cpu_caps(void) F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT) | F(PERFCTR_CORE) + F(TOPOEXT) | 0 /* PERFCTR_CORE */ ); kvm_cpu_cap_mask(CPUID_8000_0001_EDX, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index d35c94e13afb0..ef5faa9124c81 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -16,6 +16,7 @@ #include "cpuid.h" #include "lapic.h" #include "pmu.h" +#include "svm.h" enum pmu_type { PMU_TYPE_COUNTER = 0, @@ -116,6 +117,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + if (!pmu) + return NULL; + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8a317e34c0231..786593220af1a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -187,6 +187,10 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* enable/disable PMU virtualization */ +bool pmu = true; +module_param(pmu, bool, 0444); + /* * enable / disable AVIC. Because the defaults differ for APICv * support between VMX and SVM we cannot use module_param_named. @@ -957,6 +961,10 @@ static __init void svm_set_cpu_caps(void) boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + /* AMD PMU PERFCTR_CORE CPUID */ + if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); } @@ -1078,6 +1086,9 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + if (!pmu) + pr_info("PMU virtualization is disabled\n"); + svm_set_cpu_caps(); /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1d9b1a9e4398f..7113910b33a86 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -34,6 +34,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern bool intercept_smi; +extern bool pmu; /* * Clean bits in VMCB. From 4f17fb1ac251781aa5972605e7ffbfae74dbb0d5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Dec 2021 14:10:04 -0500 Subject: [PATCH 008/120] KVM: x86: avoid out of bounds indices for fixed performance counters commit 006a0f0607e1504950dd8fa3b6ca8e438ec6c9d2 upstream. Because IceLake has 4 fixed performance counters but KVM only supports 3, it is possible for reprogram_fixed_counters to pass to reprogram_fixed_counter an index that is out of bounds for the fixed_pmc_events array. Ultimately intel_find_fixed_event, which is the only place that uses fixed_pmc_events, handles this correctly because it checks against the size of fixed_pmc_events anyway. Every other place operates on the fixed_counters[] array which is sized according to INTEL_PMC_MAX_FIXED. However, it is cleaner if the unsupported performance counters are culled early on in reprogram_fixed_counters. Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index c33a8cf6684bf..52bfcb20dead0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -92,7 +92,7 @@ static unsigned intel_find_fixed_event(int idx) u32 event; size_t size = ARRAY_SIZE(fixed_pmc_events); - if (idx >= size) + if (WARN_ON_ONCE(idx >= size)) return PERF_COUNT_HW_MAX; event = fixed_pmc_events[array_index_nospec(idx, size)]; @@ -511,8 +511,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = 0; } else { pmu->nr_arch_fixed_counters = - min_t(int, edx.split.num_counters_fixed, - x86_pmu.num_counters_fixed); + min3(ARRAY_SIZE(fixed_pmc_events), + (size_t) edx.split.num_counters_fixed, + (size_t) x86_pmu.num_counters_fixed); edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = From 65ac83003874e2782be3febf1894a29d6cda24f9 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 30 Nov 2021 15:42:16 +0800 Subject: [PATCH 009/120] KVM: x86/pmu: Setup pmc->eventsel for fixed PMCs commit 761875634a5e2c3fed36c439fc4acac6f85a96eb upstream. The current pmc->eventsel for fixed counter is underutilised. The pmc->eventsel can be setup for all known available fixed counters since we have mapping between fixed pmc index and the intel_arch_events array. Either gp or fixed counter, it will simplify the later checks for consistency between eventsel and perf_hw_id. Signed-off-by: Like Xu Message-Id: <20211130074221.93635-2-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 52bfcb20dead0..a53c27c6e2110 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -465,6 +465,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } +static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) +{ + size_t size = ARRAY_SIZE(fixed_pmc_events); + struct kvm_pmc *pmc; + u32 event; + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + event = fixed_pmc_events[array_index_nospec(i, size)]; + pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | + intel_arch_events[event].eventsel; + } +} + static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -518,6 +533,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; + setup_fixed_pmc_eventsel(pmu); } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) From bc4d8bed369f3fb128c81367f3bae76163035647 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 30 Nov 2021 15:42:18 +0800 Subject: [PATCH 010/120] KVM: x86/pmu: Reuse pmc_perf_hw_id() and drop find_fixed_event() commit 6ed1298eb0bf6641b0a66c2c38369f5767a2575c upstream. Since we set the same semantic event value for the fixed counter in pmc->eventsel, returning the perf_hw_id for the fixed counter via find_fixed_event() can be painlessly replaced by pmc_perf_hw_id() with the help of pmc_is_fixed() check. Signed-off-by: Like Xu Message-Id: <20211130074221.93635-4-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/pmu.h | 1 - arch/x86/kvm/svm/pmu.c | 11 ++++------- arch/x86/kvm/vmx/pmu_intel.c | 19 +++---------------- 4 files changed, 8 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b904c21b71d14..3ce2584c5ffd4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -259,7 +259,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - kvm_x86_ops.pmu_ops->find_fixed_event(idx), + kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ pmi); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 84aa808ddfbf8..b26b40092c1b9 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -23,7 +23,6 @@ struct kvm_event_hw_type_mapping { struct kvm_pmu_ops { unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); - unsigned (*find_fixed_event)(int idx); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index ef5faa9124c81..5ea8a6b42e756 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -166,6 +166,10 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) else event_mapping = amd_event_mapping; + /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ + if (WARN_ON(pmc_is_fixed(pmc))) + return PERF_COUNT_HW_MAX; + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) if (event_mapping[i].eventsel == event_select && event_mapping[i].unit_mask == unit_mask) @@ -177,12 +181,6 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) return event_mapping[i].event_type; } -/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ -static unsigned amd_find_fixed_event(int idx) -{ - return PERF_COUNT_HW_MAX; -} - /* check if a PMC is enabled by comparing it against global_ctrl bits. Because * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). */ @@ -347,7 +345,6 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu_ops amd_pmu_ops = { .pmc_perf_hw_id = amd_pmc_perf_hw_id, - .find_fixed_event = amd_find_fixed_event, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index a53c27c6e2110..629037aeb9712 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -76,9 +76,9 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) int i; for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select - && intel_arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) + if (intel_arch_events[i].eventsel == event_select && + intel_arch_events[i].unit_mask == unit_mask && + (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) break; if (i == ARRAY_SIZE(intel_arch_events)) @@ -87,18 +87,6 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) return intel_arch_events[i].event_type; } -static unsigned intel_find_fixed_event(int idx) -{ - u32 event; - size_t size = ARRAY_SIZE(fixed_pmc_events); - - if (WARN_ON_ONCE(idx >= size)) - return PERF_COUNT_HW_MAX; - - event = fixed_pmc_events[array_index_nospec(idx, size)]; - return intel_arch_events[event].event_type; -} - /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { @@ -736,7 +724,6 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) struct kvm_pmu_ops intel_pmu_ops = { .pmc_perf_hw_id = intel_pmc_perf_hw_id, - .find_fixed_event = intel_find_fixed_event, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, From 45822f300b8d0665cbfc925b40edaa628a6b33c2 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 30 Nov 2021 15:42:19 +0800 Subject: [PATCH 011/120] KVM: x86/pmu: Add pmc->intr to refactor kvm_perf_overflow{_intr}() commit 40ccb96d5483c7ef773f50db15f82f0ab587cf8a upstream. Depending on whether intr should be triggered or not, KVM registers two different event overflow callbacks in the perf_event context. The code skeleton of these two functions is very similar, so the pmc->intr can be stored into pmc from pmc_reprogram_counter() which provides smaller instructions footprint against the u-architecture branch predictor. The __kvm_perf_overflow() can be called in non-nmi contexts and a flag is needed to distinguish the caller context and thus avoid a check on kvm_is_in_guest(), otherwise we might get warnings from suspicious RCU or check_preemption_disabled(). [Backport Changes] - In commit b9f5621c9547, kvm_is_in_guest() was changed to kvm_guest_state(). - In commit 73cd107b9685, kvm_guest_state() was updated to kvm_handling_nmi_from_guest(). - In commit 40ccb96d5483, kvm_is_in_guest() was removed, but instead of removing kvm_handling_nmi_from_guest(pmc->vcpu) was retained for compatibility - This backported patch adds kvm_handling_nmi_from_guest(pmc->vcpu) instead of kvm_is_in_guest() for compatibility. Suggested-by: Paolo Bonzini Signed-off-by: Like Xu Message-Id: <20211130074221.93635-5-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/pmu.c | 58 ++++++++++++++++----------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 89bffa51743ad..435184d5ec180 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -494,6 +494,7 @@ struct kvm_pmc { */ u64 current_config; bool is_paused; + bool intr; }; #define KVM_PMC_MAX_FIXED 3 diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3ce2584c5ffd4..93f2144b4f993 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -55,43 +55,41 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) kvm_pmu_deliver_pmi(vcpu); } -static void kvm_perf_overflow(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { - struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - } + /* Ignore counters that have been reprogrammed already. */ + if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) + return; + + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + + if (!pmc->intr) + return; + + /* + * Inject PMI. If vcpu was in a guest mode during NMI PMI + * can be ejected on a guest mode re-entry. Otherwise we can't + * be sure that vcpu wasn't executing hlt instruction at the + * time of vmexit and is not going to re-enter guest mode until + * woken up. So we should wake it, but this is impossible from + * NMI context. Do it from irq work instead. + */ + if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu)) + irq_work_queue(&pmc_to_pmu(pmc)->irq_work); + else + kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } -static void kvm_perf_overflow_intr(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void kvm_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. - */ - if (!kvm_handling_nmi_from_guest(pmc->vcpu)) - irq_work_queue(&pmc_to_pmu(pmc)->irq_work); - else - kvm_make_request(KVM_REQ_PMI, pmc->vcpu); - } + __kvm_perf_overflow(pmc, true); } static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, @@ -123,7 +121,6 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, } event = perf_event_create_kernel_counter(&attr, -1, current, - intr ? kvm_perf_overflow_intr : kvm_perf_overflow, pmc); if (IS_ERR(event)) { pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", @@ -135,6 +132,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; + pmc->intr = intr; } static void pmc_pause_counter(struct kvm_pmc *pmc) From 3ec65bcbdcafed888d7351b5cae0265d9cecfdc0 Mon Sep 17 00:00:00 2001 From: Eric Hankland Date: Tue, 30 Nov 2021 15:42:20 +0800 Subject: [PATCH 012/120] KVM: x86: Update vPMCs when retiring instructions commit 9cd803d496e72cd1dd3287c9a6cb4afa636ee16a upstream. When KVM retires a guest instruction through emulation, increment any vPMCs that are configured to monitor "instructions retired," and update the sample period of those counters so that they will overflow at the right time. Signed-off-by: Eric Hankland [jmattson: - Split the code to increment "branch instructions retired" into a separate commit. - Added 'static' to kvm_pmu_incr_counter() definition. - Modified kvm_pmu_incr_counter() to check pmc->perf_event->state == PERF_EVENT_STATE_ACTIVE. ] Fixes: f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests") Signed-off-by: Jim Mattson [likexu: - Drop checks for pmc->perf_event or event state or event type - Increase a counter once its umask bits and the first 8 select bits are matched - Rewrite kvm_pmu_incr_counter() with a less invasive approach to the host perf; - Rename kvm_pmu_record_event to kvm_pmu_trigger_event; - Add counter enable and CPL check for kvm_pmu_trigger_event(); ] Cc: Peter Zijlstra Signed-off-by: Like Xu Message-Id: <20211130074221.93635-6-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/x86.c | 3 +++ 3 files changed, 64 insertions(+) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 93f2144b4f993..eb78a23b23214 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -479,6 +479,66 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); } +static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 prev_count; + + prev_count = pmc->counter; + pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + + reprogram_counter(pmu, pmc->idx); + if (pmc->counter < prev_count) + __kvm_perf_overflow(pmc, false); +} + +static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, + unsigned int perf_hw_id) +{ + u64 old_eventsel = pmc->eventsel; + unsigned int config; + + pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); + config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); + pmc->eventsel = old_eventsel; + return config == perf_hw_id; +} + +static inline bool cpl_is_matched(struct kvm_pmc *pmc) +{ + bool select_os, select_user; + u64 config = pmc->current_config; + + if (pmc_is_gp(pmc)) { + select_os = config & ARCH_PERFMON_EVENTSEL_OS; + select_user = config & ARCH_PERFMON_EVENTSEL_USR; + } else { + select_os = config & 0x1; + select_user = config & 0x2; + } + + return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user; +} + +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { + pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + + if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) + continue; + + /* Ignore checks for edge detect, pin control, invert and CMASK bits */ + if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc)) + kvm_pmu_incr_counter(pmc); + } +} +EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); + int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { struct kvm_pmu_event_filter tmp, *filter; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index b26b40092c1b9..e6636ad8cad17 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -164,6 +164,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3e38d352cd029..c9f52a620fb38 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8241,6 +8241,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) if (unlikely(!r)) return 0; + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + /* * rflags is the old, "raw" value of the flags. The new value has * not been saved yet. @@ -8511,6 +8513,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); From a839a97857f6b378fdfe0283420f385f13117ad5 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 16 Dec 2021 11:13:54 -0600 Subject: [PATCH 013/120] KVM: SVM: include CR3 in initial VMSA state for SEV-ES guests commit 405329fc9aeef1e3e2eccaadf32b539ad6c7120f upstream. Normally guests will set up CR3 themselves, but some guests, such as kselftests, and potentially CONFIG_PVH guests, rely on being booted with paging enabled and CR3 initialized to a pre-allocated page table. Currently CR3 updates via KVM_SET_SREGS* are not loaded into the guest VMCB until just prior to entering the guest. For SEV-ES/SEV-SNP, this is too late, since it will have switched over to using the VMSA page prior to that point, with the VMSA CR3 copied from the VMCB initial CR3 value: 0. Address this by sync'ing the CR3 value into the VMCB save area immediately when KVM_SET_SREGS* is issued so it will find it's way into the initial VMSA. Suggested-by: Tom Lendacky Signed-off-by: Michael Roth Message-Id: <20211216171358.61140-10-michael.roth@amd.com> [Remove vmx_post_set_cr3; add a remark about kvm_set_cr3 not calling the new hook. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 19 +++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 2 ++ 5 files changed, 24 insertions(+) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 56cf3950a2660..fa85ad0fd93fe 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -37,6 +37,7 @@ KVM_X86_OP(set_segment) KVM_X86_OP_NULL(get_cs_db_l_bits) KVM_X86_OP(is_valid_cr0) KVM_X86_OP(set_cr0) +KVM_X86_OP_NULL(post_set_cr3) KVM_X86_OP(is_valid_cr4) KVM_X86_OP(set_cr4) KVM_X86_OP(set_efer) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 435184d5ec180..6d01bfca5d005 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1363,6 +1363,7 @@ struct kvm_x86_ops { bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 786593220af1a..61d9fa9ae1c23 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1755,6 +1755,24 @@ static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return true; } +static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * For guests that don't set guest_state_protected, the cr3 update is + * handled via kvm_mmu_load() while entering the guest. For guests + * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to + * VMCB save area now, since the save area will become the initial + * contents of the VMSA, and future VMCB save area updates won't be + * seen. + */ + if (sev_es_guest(vcpu->kvm)) { + svm->vmcb->save.cr3 = cr3; + vmcb_mark_dirty(svm->vmcb, VMCB_CR); + } +} + void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4621,6 +4639,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .is_valid_cr0 = svm_is_valid_cr0, .set_cr0 = svm_set_cr0, + .post_set_cr3 = svm_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 48b19346af7b8..ef020141df452 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3288,6 +3288,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcs_writel(GUEST_CR3, guest_cr3); } + static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9f52a620fb38..899d7e3501b65 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1200,6 +1200,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + /* Do not call post_set_cr3, we do not get here for confidential guests. */ handle_tlb_flush: /* @@ -10885,6 +10886,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3); kvm_set_cr8(vcpu, sregs->cr8); From 9b1055f1362d1b0c73de74be5b09a2102e0d5b28 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Jan 2022 16:05:39 +0100 Subject: [PATCH 014/120] KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries commit ee3a5f9e3d9bf94159f3cc80da542fbe83502dd8 upstream. kvm_update_cpuid_runtime() mangles CPUID data coming from userspace VMM after updating 'vcpu->arch.cpuid_entries', this makes it impossible to compare an update with what was previously supplied. Introduce __kvm_update_cpuid_runtime() version which can be used to tweak the input before it goes to 'vcpu->arch.cpuid_entries' so the upcoming update check can compare tweaked data. No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220117150542.2176196-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index aeef6dd59fca3..999a2c5cae2b3 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -138,14 +138,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) } } -static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, + struct kvm_cpuid_entry2 *entries, int nent) { u32 base = vcpu->arch.kvm_cpuid_base; if (!base) return NULL; - return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); +} + +static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +{ + return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) @@ -160,11 +167,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) vcpu->arch.pv_cpuid.features = best->eax; } -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, + int nent) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, 0); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -175,33 +183,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); } - best = kvm_find_cpuid_entry(vcpu, 7, 0); + best = cpuid_entry2_find(entries, nent, 7, 0); if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) cpuid_entry_change(best, X86_FEATURE_OSPKE, kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + best = cpuid_entry2_find(entries, nent, 0xD, 0); if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - best = kvm_find_cpuid_entry(vcpu, 0xD, 1); + best = cpuid_entry2_find(entries, nent, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = kvm_find_kvm_cpuid_features(vcpu); + best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, 0); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } } + +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +{ + __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); +} EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) @@ -292,6 +305,8 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, { int r; + __kvm_update_cpuid_runtime(vcpu, e2, nent); + r = kvm_check_cpuid(vcpu, e2, nent); if (r) return r; @@ -301,7 +316,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_nent = nent; kvm_update_kvm_cpuid_base(vcpu); - kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); return 0; From 5b9fef6888e05b51daad3e00e24797a3f33f866e Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 11 Jan 2022 15:38:23 +0800 Subject: [PATCH 015/120] KVM: x86: Making the module parameter of vPMU more common commit 4732f2444acd9b7eabc744f20eb5cc9694c0bfbd upstream. The new module parameter to control PMU virtualization should apply to Intel as well as AMD, for situations where userspace is not trusted. If the module parameter allows PMU virtualization, there could be a new KVM_CAP or guest CPUID bits whereby userspace can enable/disable PMU virtualization on a per-VM basis. If the module parameter does not allow PMU virtualization, there should be no userspace override, since we have no precedent for authorizing that kind of override. If it's false, other counter-based profiling features (such as LBR including the associated CPUID bits if any) will not be exposed. Change its name from "pmu" to "enable_pmu" as we have temporary variables with the same name in our code like "struct kvm_pmu *pmu". Fixes: b1d66dad65dc ("KVM: x86/svm: Add module param to control PMU virtualization") Suggested-by : Jim Mattson Signed-off-by: Like Xu Message-Id: <20220111073823.21885-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 6 +++--- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/svm/svm.c | 8 ++------ arch/x86/kvm/svm/svm.h | 1 - arch/x86/kvm/vmx/capabilities.h | 4 ++++ arch/x86/kvm/vmx/pmu_intel.c | 2 +- arch/x86/kvm/x86.c | 5 +++++ arch/x86/kvm/x86.h | 1 + 8 files changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 999a2c5cae2b3..1ead0c63d8a24 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -816,10 +816,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) perf_get_x86_pmu_capability(&cap); /* - * Only support guest architectural pmu on a host - * with architectural pmu. + * The guest architecture pmu is only supported if the architecture + * pmu exists on the host and the module parameters allow it. */ - if (!cap.version) + if (!cap.version || !enable_pmu) memset(&cap, 0, sizeof(cap)); eax.split.version_id = min(cap.version, 2); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 5ea8a6b42e756..59ebce60351f5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -117,7 +117,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - if (!pmu) + if (!enable_pmu) return NULL; switch (msr) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 61d9fa9ae1c23..2eba876d44519 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -187,10 +187,6 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); -/* enable/disable PMU virtualization */ -bool pmu = true; -module_param(pmu, bool, 0444); - /* * enable / disable AVIC. Because the defaults differ for APICv * support between VMX and SVM we cannot use module_param_named. @@ -962,7 +958,7 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); /* AMD PMU PERFCTR_CORE CPUID */ - if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); /* CPUID 0x8000001F (SME/SEV features) */ @@ -1086,7 +1082,7 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } - if (!pmu) + if (!enable_pmu) pr_info("PMU virtualization is disabled\n"); svm_set_cpu_caps(); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7113910b33a86..1d9b1a9e4398f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -34,7 +34,6 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern bool intercept_smi; -extern bool pmu; /* * Clean bits in VMCB. diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 1fb65099dfcda..dc44c58be2dae 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -5,6 +5,7 @@ #include #include "lapic.h" +#include "x86.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -393,6 +394,9 @@ static inline u64 vmx_get_perf_capabilities(void) { u64 perf_cap = 0; + if (!enable_pmu) + return perf_cap; + if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 629037aeb9712..f76b418c83533 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -491,7 +491,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->fixed_ctr_ctrl_mask = ~0ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) + if (!entry || !enable_pmu) return; eax.full = entry->eax; edx.full = entry->edx; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 899d7e3501b65..ed32b83af079f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -194,6 +194,11 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); bool __read_mostly mitigate_smt_rsb; module_param(mitigate_smt_rsb, bool, 0444); +/* Enable/disable PMU virtualization */ +bool __read_mostly enable_pmu = true; +EXPORT_SYMBOL_GPL(enable_pmu); +module_param(enable_pmu, bool, 0444); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 50e38c7e4436e..868d0712525ac 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -332,6 +332,7 @@ extern u64 host_xcr0; extern u64 supported_xcr0; extern u64 host_xss; extern u64 supported_xss; +extern bool enable_pmu; static inline bool kvm_mpx_supported(void) { From 5d7ac14aa73697e4b4f96c49d7f147c35f84f205 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:26 -0800 Subject: [PATCH 016/120] KVM: x86/pmu: Use binary search to check filtered events commit 7ff775aca48adc854436b92c060e5eebfffb6a4a upstream. The PMU event filter may contain up to 300 events. Replace the linear search in reprogram_gp_counter() with a binary search. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-2-jmattson@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eb78a23b23214..f8821ee5c1b4d 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include "x86.h" #include "cpuid.h" @@ -166,13 +168,17 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } +static int cmp_u64(const void *a, const void *b) +{ + return *(__u64 *)a - *(__u64 *)b; +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { u64 config; u32 type = PERF_TYPE_RAW; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; - int i; struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); bool allow_event = true; @@ -188,16 +194,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (filter) { - for (i = 0; i < filter->nevents; i++) - if (filter->events[i] == - (eventsel & AMD64_RAW_EVENT_MASK_NB)) - break; - if (filter->action == KVM_PMU_EVENT_ALLOW && - i == filter->nevents) - allow_event = false; - if (filter->action == KVM_PMU_EVENT_DENY && - i < filter->nevents) - allow_event = false; + __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; + + if (bsearch(&key, filter->events, filter->nevents, + sizeof(__u64), cmp_u64)) + allow_event = filter->action == KVM_PMU_EVENT_ALLOW; + else + allow_event = filter->action == KVM_PMU_EVENT_DENY; } if (!allow_event) return; @@ -570,6 +573,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) /* Ensure nevents can't be changed between the user copies. */ *filter = tmp; + /* + * Sort the in-kernel list so that we can search it with bsearch. + */ + sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); + mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); From 80ff672b338825a0eebf0f799945d68c75d6bd53 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:19 +0000 Subject: [PATCH 017/120] KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks commit c3e8abf0f3536a46a235b0533149c2b2c2bbac27 upstream. Drop kvm_x86_ops' pre/post_block() now that all implementations are nops. No functional change intended. [Backport Changes] Definitions of pi_{pre, post}_block() were removed in the commit: d76fb40 Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-10-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm-x86-ops.h | 2 -- arch/x86/include/asm/kvm_host.h | 12 ------------ arch/x86/kvm/vmx/posted_intr.h | 2 -- arch/x86/kvm/vmx/vmx.c | 16 ---------------- arch/x86/kvm/x86.c | 6 +----- 5 files changed, 1 insertion(+), 37 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index fa85ad0fd93fe..5bd78efff044b 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -100,8 +100,6 @@ KVM_X86_OP(handle_exit_irqoff) KVM_X86_OP_NULL(request_immediate_exit) KVM_X86_OP(sched_in) KVM_X86_OP_NULL(update_cpu_dirty_logging) -KVM_X86_OP_NULL(pre_block) -KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) KVM_X86_OP_NULL(update_pi_irte) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6d01bfca5d005..f43aa717074c5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1469,18 +1469,6 @@ struct kvm_x86_ops { const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; - /* - * Architecture specific hooks for vCPU blocking due to - * HLT instruction. - * Returns for .pre_block(): - * - 0 means continue to block the vCPU. - * - 1 means we cannot block the vCPU since some event - * happens during this period, such as, 'ON' bit in - * posted-interrupts descriptor is set. - */ - int (*pre_block)(struct kvm_vcpu *vcpu); - void (*post_block)(struct kvm_vcpu *vcpu); - void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index bf9b8f6a7242f..0bfea6bde3313 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -90,8 +90,6 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); -int pi_pre_block(struct kvm_vcpu *vcpu); -void pi_post_block(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ef020141df452..d56b4f9570bf9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7825,19 +7825,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - if (pi_pre_block(vcpu)) - return 1; - - return 0; -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - pi_post_block(vcpu); -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -8045,9 +8032,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ed32b83af079f..390858f02d336 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10300,8 +10300,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { bool hv_timer; - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { + if (!kvm_arch_vcpu_runnable(vcpu)) { /* * Switch to the software timer before halt-polling/blocking as * the guest's timer may be a break event for the vCPU, and the @@ -10320,9 +10319,6 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) if (hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); - if (kvm_x86_ops.post_block) - static_call(kvm_x86_post_block)(vcpu); - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } From e394c703062eb9bedab5d6743de5ef61b2eef7da Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 24 Jan 2022 11:36:05 +0100 Subject: [PATCH 018/120] KVM: x86: Move CPUID.(EAX=0x12,ECX=1) mangling to __kvm_update_cpuid_runtime() commit 5c89be1dd5cfb697614bc13626ba3bd0781aa160 upstream. Full equality check of CPUID data on update (kvm_cpuid_check_equal()) may fail for SGX enabled CPUs as CPUID.(EAX=0x12,ECX=1) is currently being mangled in kvm_vcpu_after_set_cpuid(). Move it to __kvm_update_cpuid_runtime() and split off cpuid_get_supported_xcr0() helper as 'vcpu->arch.guest_supported_xcr0' update needs (logically) to stay in kvm_vcpu_after_set_cpuid(). Cc: stable@vger.kernel.org Fixes: feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN") Signed-off-by: Vitaly Kuznetsov Message-Id: <20220124103606.2630588-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 54 +++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1ead0c63d8a24..043a4fd4ce9b7 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -167,10 +167,26 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) vcpu->arch.pv_cpuid.features = best->eax; } +/* + * Calculate guest's supported XCR0 taking into account guest CPUID data and + * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0). + */ +static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) +{ + struct kvm_cpuid_entry2 *best; + + best = cpuid_entry2_find(entries, nent, 0xd, 0); + if (!best) + return 0; + + return (best->eax | ((u64)best->edx << 32)) & supported_xcr0; +} + static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; + u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); best = cpuid_entry2_find(entries, nent, 1, 0); if (best) { @@ -209,6 +225,21 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } + + /* + * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate + * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's + * requested XCR0 value. The enclave's XFRM must be a subset of XCRO + * at the time of EENTER, thus adjust the allowed XFRM by the guest's + * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to + * '1' even on CPUs that don't support XSAVE. + */ + best = cpuid_entry2_find(entries, nent, 0x12, 0x1); + if (best) { + best->ecx &= guest_supported_xcr0 & 0xffffffff; + best->edx &= guest_supported_xcr0 >> 32; + best->ecx |= XFEATURE_MASK_FPSSE; + } } void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) @@ -232,27 +263,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); - if (!best) - vcpu->arch.guest_supported_xcr0 = 0; - else - vcpu->arch.guest_supported_xcr0 = - (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - - /* - * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate - * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's - * requested XCR0 value. The enclave's XFRM must be a subset of XCRO - * at the time of EENTER, thus adjust the allowed XFRM by the guest's - * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to - * '1' even on CPUs that don't support XSAVE. - */ - best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1); - if (best) { - best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff; - best->edx &= vcpu->arch.guest_supported_xcr0 >> 32; - best->ecx |= XFEATURE_MASK_FPSSE; - } + vcpu->arch.guest_supported_xcr0 = + cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); kvm_update_pv_runtime(vcpu); From 7ff75510cdebb4d85eb9fef3cb6ff6771123f243 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Oct 2021 13:15:55 -0400 Subject: [PATCH 019/120] KVM: x86: skip host CPUID call for hypervisor leaves commit 2746a6b72ab9a92bd188c4ac3e4122ee1c18f754 upstream. Hypervisor leaves are always synthesized by __do_cpuid_func; just return zeroes and do not ask the host. Even on nested virtualization, a value from another hypervisor would be bogus, because all hypercalls and MSRs are processed by KVM. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 043a4fd4ce9b7..ce579dba9ac90 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -672,9 +672,17 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, if (!entry) return NULL; + memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; - entry->flags = 0; + switch (function & 0xC0000000) { + case 0x40000000: + /* Hypervisor leaves are always synthesized by __do_cpuid_func. */ + return entry; + + default: + break; + } cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); From c1903168b838b6564ad9a6bc456978bcb07fe777 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jan 2022 00:51:49 +0000 Subject: [PATCH 020/120] KVM: x86: Drop export for .tlb_flush_current() static_call key commit feee3d9d5b9fd38b469b51b5b89f1b359420ecaf upstream. Remove the export of kvm_x86_tlb_flush_current() as there are no longer any users outside of common x86 code. Signed-off-by: Sean Christopherson Message-Id: <20220128005208.4008533-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 390858f02d336..c6918f697c40c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -134,7 +134,6 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); #include EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); -EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); From 0a132d2320e954c693961f25ad39e79db21ca492 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jan 2022 00:51:50 +0000 Subject: [PATCH 021/120] KVM: x86: Rename kvm_x86_ops pointers to align w/ preferred vendor names commit e27bc0440ebd145ada7e761975d8ea840e994d2f upstream. Rename a variety of kvm_x86_op function pointers so that preferred name for vendor implementations follows the pattern _, e.g. rename .run() to .vcpu_run() to match {svm,vmx}_vcpu_run(). This will allow vendor implementations to be wired up via the KVM_X86_OP macro. In many cases, VMX and SVM "disagree" on the preferred name, though in reality it's VMX and x86 that disagree as SVM blindly prepended _svm to the kvm_x86_ops name. Justification for using the VMX nomenclature: - set_{irq,nmi} => inject_{irq,nmi} because the helper is injecting an event that has already been "set" in e.g. the vIRR. SVM's relevant VMCB field is even named event_inj, and KVM's stat is irq_injections. - prepare_guest_switch => prepare_switch_to_guest because the former is ambiguous, e.g. it could mean switching between multiple guests, switching from the guest to host, etc... - update_pi_irte => pi_update_irte to allow for matching match the rest of VMX's posted interrupt naming scheme, which is vmx_pi_(). - start_assignment => pi_start_assignment to again follow VMX's posted interrupt naming scheme, and to provide context for what bit of code might care about an otherwise undescribed "assignment". The "tlb_flush" => "flush_tlb" creates an inconsistency with respect to Hyper-V's "tlb_remote_flush" hooks, but Hyper-V really is the one that's wrong. x86, VMX, and SVM all use flush_tlb, and even common KVM is on a variant of the bandwagon with "kvm_flush_remote_tlbs", e.g. a more appropriate name for the Hyper-V hooks would be flush_remote_tlbs. Leave that change for another time as the Hyper-V hooks always start as NULL, i.e. the name doesn't matter for using kvm-x86-ops.h, and changing all names requires an astounding amount of churn. VMX and SVM function names are intentionally left as is to minimize the diff. Both VMX and SVM will need to rename even more functions in order to fully utilize KVM_X86_OPS, i.e. an additional patch for each is inevitable. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220128005208.4008533-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm-x86-ops.h | 20 +++++++++---------- arch/x86/include/asm/kvm_host.h | 20 +++++++++---------- arch/x86/kvm/mmu/mmu.c | 6 +++--- arch/x86/kvm/svm/svm.c | 18 ++++++++--------- arch/x86/kvm/vmx/vmx.c | 20 +++++++++---------- arch/x86/kvm/x86.c | 31 ++++++++++++++---------------- 6 files changed, 56 insertions(+), 59 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 5bd78efff044b..a5a665ef2afb3 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -24,7 +24,7 @@ KVM_X86_OP_NULL(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) -KVM_X86_OP(prepare_guest_switch) +KVM_X86_OP(prepare_switch_to_guest) KVM_X86_OP(vcpu_load) KVM_X86_OP(vcpu_put) KVM_X86_OP(update_exception_bitmap) @@ -51,21 +51,21 @@ KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) -KVM_X86_OP(tlb_flush_all) -KVM_X86_OP(tlb_flush_current) +KVM_X86_OP(flush_tlb_all) +KVM_X86_OP(flush_tlb_current) KVM_X86_OP_NULL(tlb_remote_flush) KVM_X86_OP_NULL(tlb_remote_flush_with_range) -KVM_X86_OP(tlb_flush_gva) -KVM_X86_OP(tlb_flush_guest) -KVM_X86_OP(run) +KVM_X86_OP(flush_tlb_gva) +KVM_X86_OP(flush_tlb_guest) +KVM_X86_OP(vcpu_run) KVM_X86_OP_NULL(handle_exit) KVM_X86_OP_NULL(skip_emulated_instruction) KVM_X86_OP_NULL(update_emulated_instruction) KVM_X86_OP(set_interrupt_shadow) KVM_X86_OP(get_interrupt_shadow) KVM_X86_OP(patch_hypercall) -KVM_X86_OP(set_irq) -KVM_X86_OP(set_nmi) +KVM_X86_OP(inject_irq) +KVM_X86_OP(inject_nmi) KVM_X86_OP(queue_exception) KVM_X86_OP(cancel_injection) KVM_X86_OP(interrupt_allowed) @@ -102,8 +102,8 @@ KVM_X86_OP(sched_in) KVM_X86_OP_NULL(update_cpu_dirty_logging) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) -KVM_X86_OP_NULL(update_pi_irte) -KVM_X86_OP_NULL(start_assignment) +KVM_X86_OP_NULL(pi_update_irte) +KVM_X86_OP_NULL(pi_start_assignment) KVM_X86_OP_NULL(apicv_post_state_restore) KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) KVM_X86_OP_NULL(set_hv_timer) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f43aa717074c5..9bdfbba6e746d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1346,7 +1346,7 @@ struct kvm_x86_ops { void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); + void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); @@ -1377,8 +1377,8 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); bool (*get_if_flag)(struct kvm_vcpu *vcpu); - void (*tlb_flush_all)(struct kvm_vcpu *vcpu); - void (*tlb_flush_current)(struct kvm_vcpu *vcpu); + void (*flush_tlb_all)(struct kvm_vcpu *vcpu); + void (*flush_tlb_current)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); @@ -1389,15 +1389,15 @@ struct kvm_x86_ops { * Can potentially get non-canonical addresses through INVLPGs, which * the implementation may choose to ignore if appropriate. */ - void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); + void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr); /* * Flush any TLB entries created by the guest. Like tlb_flush_gva(), * does not need to flush GPA->HPA mappings. */ - void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); - enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1406,8 +1406,8 @@ struct kvm_x86_ops { u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*set_irq)(struct kvm_vcpu *vcpu); - void (*set_nmi)(struct kvm_vcpu *vcpu); + void (*inject_irq)(struct kvm_vcpu *vcpu); + void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); @@ -1472,9 +1472,9 @@ struct kvm_x86_ops { void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); - int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); - void (*start_assignment)(struct kvm *kvm); + void (*pi_start_assignment)(struct kvm *kvm); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ef56cc4f76017..8858d0dc5ecde 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5080,7 +5080,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); kvm_mmu_load_pgd(vcpu); - static_call(kvm_x86_tlb_flush_current)(vcpu); + static_call(kvm_x86_flush_tlb_current)(vcpu); out: return r; } @@ -5343,7 +5343,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (is_noncanonical_address(gva, vcpu)) return; - static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); } if (!mmu->invlpg) @@ -5401,7 +5401,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (tlb_flush) - static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); ++vcpu->stat.invlpg; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2eba876d44519..938f941003452 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4618,7 +4618,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vm_init = svm_vm_init, .vm_destroy = svm_vm_destroy, - .prepare_guest_switch = svm_prepare_guest_switch, + .prepare_switch_to_guest = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, .vcpu_blocking = svm_vcpu_blocking, @@ -4650,20 +4650,20 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_rflags = svm_set_rflags, .get_if_flag = svm_get_if_flag, - .tlb_flush_all = svm_flush_tlb, - .tlb_flush_current = svm_flush_tlb, - .tlb_flush_gva = svm_flush_tlb_gva, - .tlb_flush_guest = svm_flush_tlb, + .flush_tlb_all = svm_flush_tlb, + .flush_tlb_current = svm_flush_tlb, + .flush_tlb_gva = svm_flush_tlb_gva, + .flush_tlb_guest = svm_flush_tlb, - .run = svm_vcpu_run, + .vcpu_run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, .update_emulated_instruction = NULL, .set_interrupt_shadow = svm_set_interrupt_shadow, .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, - .set_irq = svm_set_irq, - .set_nmi = svm_inject_nmi, + .inject_irq = svm_set_irq, + .inject_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, @@ -4710,7 +4710,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .deliver_posted_interrupt = svm_deliver_avic_intr, .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, - .update_pi_irte = svm_update_pi_irte, + .pi_update_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, .smi_allowed = svm_smi_allowed, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d56b4f9570bf9..e51ae788732bd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7940,7 +7940,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_prepare_switch_to_guest, + .prepare_switch_to_guest = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -7969,20 +7969,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_rflags = vmx_set_rflags, .get_if_flag = vmx_get_if_flag, - .tlb_flush_all = vmx_flush_tlb_all, - .tlb_flush_current = vmx_flush_tlb_current, - .tlb_flush_gva = vmx_flush_tlb_gva, - .tlb_flush_guest = vmx_flush_tlb_guest, + .flush_tlb_all = vmx_flush_tlb_all, + .flush_tlb_current = vmx_flush_tlb_current, + .flush_tlb_gva = vmx_flush_tlb_gva, + .flush_tlb_guest = vmx_flush_tlb_guest, - .run = vmx_vcpu_run, + .vcpu_run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, - .set_irq = vmx_inject_irq, - .set_nmi = vmx_inject_nmi, + .inject_irq = vmx_inject_irq, + .inject_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, @@ -8035,8 +8035,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, - .update_pi_irte = pi_update_irte, - .start_assignment = vmx_pi_start_assignment, + .pi_update_irte = pi_update_irte, + .pi_start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c6918f697c40c..efa55e470866d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3296,7 +3296,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_all)(vcpu); + static_call(kvm_x86_flush_tlb_all)(vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) @@ -3315,14 +3315,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) return; } - static_call(kvm_x86_tlb_flush_guest)(vcpu); + static_call(kvm_x86_flush_tlb_guest)(vcpu); } static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_current)(vcpu); + static_call(kvm_x86_flush_tlb_current)(vcpu); } /* @@ -9381,10 +9381,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) */ else if (!vcpu->arch.exception.pending) { if (vcpu->arch.nmi_injected) { - static_call(kvm_x86_set_nmi)(vcpu); + static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_set_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu); can_inject = false; } } @@ -9471,7 +9471,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - static_call(kvm_x86_set_nmi)(vcpu); + static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); } @@ -9485,7 +9485,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_set_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu); WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) @@ -10107,7 +10107,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - static_call(kvm_x86_prepare_guest_switch)(vcpu); + static_call(kvm_x86_prepare_switch_to_guest)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -10175,7 +10175,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } for (;;) { - exit_fastpath = static_call(kvm_x86_run)(vcpu); + exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -10485,10 +10485,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - /* - * Exclude PKRU from restore as restored separately in - * kvm_x86_ops.run(). - */ + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } @@ -12557,7 +12554,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - static_call_cond(kvm_x86_start_assignment)(kvm); + static_call_cond(kvm_x86_pi_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); @@ -12605,7 +12602,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) @@ -12630,7 +12627,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); @@ -12641,7 +12638,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); + return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set); } bool kvm_vector_hashing_enabled(void) From 43b1779983716b11c1263bb8fb11c93763781923 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jan 2022 00:51:54 +0000 Subject: [PATCH 022/120] KVM: nVMX: Refactor PMU refresh to avoid referencing kvm_x86_ops.pmu_ops commit 0bcd556e15f97f5b6636f146acbf22663ab38ccf upstream. Refactor the nested VMX PMU refresh helper to pass it a flag stating whether or not the vCPU has PERF_GLOBAL_CTRL instead of having the nVMX helper query the information by bouncing through kvm_x86_ops.pmu_ops. This will allow a future patch to use static_call() for the PMU ops without having to export any static call definitions from common x86, and it is also a step toward unexported kvm_x86_ops. Alternatively, nVMX could call kvm_pmu_is_valid_msr() to indirectly use kvm_x86_ops.pmu_ops, but that would incur an extra layer of indirection and would require exporting kvm_pmu_is_valid_msr(). Opportunistically rename the helper to keep line lengths somewhat reasonable, and to better capture its high-level role. No functional change intended. Cc: Like Xu Signed-off-by: Sean Christopherson Message-Id: <20220128005208.4008533-9-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/nested.c | 5 +++-- arch/x86/kvm/vmx/nested.h | 3 ++- arch/x86/kvm/vmx/pmu_intel.c | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0210ba16f7073..e000b57c605bb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4862,7 +4862,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, + bool vcpu_has_perf_global_ctrl) { struct vcpu_vmx *vmx; @@ -4870,7 +4871,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) return; vmx = to_vmx(vcpu); - if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + if (vcpu_has_perf_global_ctrl) { vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high |= diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 2d0ac8a86d4a4..129ae4e01f7c1 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -32,7 +32,8 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); +void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, + bool vcpu_has_perf_global_ctrl); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index f76b418c83533..b276632ceb14d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -549,7 +549,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - nested_vmx_pmu_entry_exit_ctls_update(vcpu); + nested_vmx_pmu_refresh(vcpu, + intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); if (intel_pmu_lbr_is_compatible(vcpu)) x86_perf_get_lbr(&lbr_desc->records); From 626d0d99bab0119966ca0fc31b3ce2af3db9418d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jan 2022 00:52:05 +0000 Subject: [PATCH 023/120] KVM: x86: Use more verbose names for mem encrypt kvm_x86_ops hooks commit 03d004cd071525894fb1d5638ebaf25cd6177435 upstream. Use slightly more verbose names for the so called "memory encrypt", a.k.a. "mem enc", kvm_x86_ops hooks to bridge the gap between the current super short kvm_x86_ops names and SVM's more verbose, but non-conforming names. This is a step toward using kvm-x86-ops.h with KVM_X86_CVM_OP() to fill svm_x86_ops. Opportunistically rename mem_enc_op() to mem_enc_ioctl() to better reflect its true nature, as it really is a full fledged ioctl() of its own. Ideally, the hook would be named confidential_vm_ioctl() or so, as the ioctl() is a gateway to more than just memory encryption, and because its underlying purpose to support Confidential VMs, which can be provided without memory encryption, e.g. if the TCB of the guest includes the host kernel but not host userspace, or by isolation in hardware without encrypting memory. But, diverging from KVM_MEMORY_ENCRYPT_OP even further is undeseriable, and short of creating alises for all related ioctl()s, which introduces a different flavor of divergence, KVM is stuck with the nomenclature. Defer renaming SVM's functions to a future commit as there are additional changes needed to make SVM fully conforming and to match reality (looking at you, svm_vm_copy_asid_from()). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220128005208.4008533-20-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm-x86-ops.h | 6 +++--- arch/x86/include/asm/kvm_host.h | 6 +++--- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.c | 6 +++--- arch/x86/kvm/svm/svm.h | 2 +- arch/x86/kvm/x86.c | 18 ++++++++++++------ 6 files changed, 23 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index a5a665ef2afb3..365e79dc43da5 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -113,9 +113,9 @@ KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) -KVM_X86_OP_NULL(mem_enc_op) -KVM_X86_OP_NULL(mem_enc_reg_region) -KVM_X86_OP_NULL(mem_enc_unreg_region) +KVM_X86_OP_NULL(mem_enc_ioctl) +KVM_X86_OP_NULL(mem_enc_register_region) +KVM_X86_OP_NULL(mem_enc_unregister_region) KVM_X86_OP_NULL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9bdfbba6e746d..f8791116b6257 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1489,9 +1489,9 @@ struct kvm_x86_ops { int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); void (*enable_smi_window)(struct kvm_vcpu *vcpu); - int (*mem_enc_op)(struct kvm *kvm, void __user *argp); - int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); - int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); + int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 854b3779b8a3a..7b11e15b9f5b1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1537,7 +1537,7 @@ static bool cmd_allowed_from_miror(u32 cmd_id) return false; } -int svm_mem_enc_op(struct kvm *kvm, void __user *argp) +int svm_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; int r; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 938f941003452..f252ba1ddf616 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4718,9 +4718,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, - .mem_enc_op = svm_mem_enc_op, - .mem_enc_reg_region = svm_register_enc_region, - .mem_enc_unreg_region = svm_unregister_enc_region, + .mem_enc_ioctl = svm_mem_enc_ioctl, + .mem_enc_register_region = svm_register_enc_region, + .mem_enc_unregister_region = svm_unregister_enc_region, .guest_memory_reclaimed = sev_guest_memory_reclaimed, .vm_copy_enc_context_from = svm_vm_copy_asid_from, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1d9b1a9e4398f..02ac73275758f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -550,7 +550,7 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); extern unsigned int max_sev_asid; void sev_vm_destroy(struct kvm *kvm); -int svm_mem_enc_op(struct kvm *kvm, void __user *argp); +int svm_mem_enc_ioctl(struct kvm *kvm, void __user *argp); int svm_register_enc_region(struct kvm *kvm, struct kvm_enc_region *range); int svm_unregister_enc_region(struct kvm *kvm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index efa55e470866d..8430c6c62b459 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6625,8 +6625,10 @@ long kvm_arch_vm_ioctl(struct file *filp, break; case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; - if (kvm_x86_ops.mem_enc_op) - r = static_call(kvm_x86_mem_enc_op)(kvm, argp); + if (!kvm_x86_ops.mem_enc_ioctl) + goto out; + + r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -6637,8 +6639,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_reg_region) - r = static_call(kvm_x86_mem_enc_reg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_register_region) + goto out; + + r = static_call(kvm_x86_mem_enc_register_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -6649,8 +6653,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_unreg_region) - r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_unregister_region) + goto out; + + r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { From 6a1ddb08a66f4e442c56b71d87efc6e8f76d1e94 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 15 Feb 2022 13:16:36 -0500 Subject: [PATCH 024/120] KVM: x86: return 1 unconditionally for availability of KVM_CAP_VAPIC commit 8a2897853c53fd3d0e381a46b194889cf6da3391 upstream. The two ioctls used to implement userspace-accelerated TPR, KVM_TPR_ACCESS_REPORTING and KVM_SET_VAPIC_ADDR, are available even if hardware-accelerated TPR can be used. So there is no reason not to report KVM_CAP_VAPIC. [Backport changes] - In commit 58fccda47e4bd, report_flexpriority() is renamed to vmx_cpu_has_accelerated_tpr(). Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm-x86-ops.h | 1 - arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm/svm.c | 6 ------ arch/x86/kvm/vmx/vmx.c | 6 ------ arch/x86/kvm/x86.c | 4 +--- 5 files changed, 1 insertion(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 365e79dc43da5..6a8d4178b35da 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -15,7 +15,6 @@ BUILD_BUG_ON(1) KVM_X86_OP_NULL(hardware_enable) KVM_X86_OP_NULL(hardware_disable) KVM_X86_OP_NULL(hardware_unsetup) -KVM_X86_OP_NULL(cpu_has_accelerated_tpr) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8791116b6257..efd8d0aa7fe12 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1332,7 +1332,6 @@ struct kvm_x86_ops { int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); - bool (*cpu_has_accelerated_tpr)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f252ba1ddf616..24eabcfc8ab49 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4022,11 +4022,6 @@ static int __init svm_check_processor_compat(void) return 0; } -static bool svm_cpu_has_accelerated_tpr(void) -{ - return false; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. @@ -4607,7 +4602,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .hardware_unsetup = svm_hardware_teardown, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, - .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_create_vcpu, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e51ae788732bd..0a82b1bf09253 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -601,11 +601,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) return flexpriority_enabled && lapic_in_kernel(vcpu); } -static inline bool report_flexpriority(void) -{ - return flexpriority_enabled; -} - static int possible_passthrough_msr_slot(u32 msr) { u32 i; @@ -7928,7 +7923,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = report_flexpriority, .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8430c6c62b459..7334dbc572b4b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4264,6 +4264,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXIT_ON_EMULATION_FAILURE: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_SYS_ATTRIBUTES: + case KVM_CAP_VAPIC: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4308,9 +4309,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; - case KVM_CAP_VAPIC: - r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); - break; case KVM_CAP_NR_VCPUS: r = KVM_SOFT_MAX_VCPUS; break; From 5526ccbd2ebda3ec65feb69c0f2cfbb8985ed2f0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Feb 2022 14:18:07 -0500 Subject: [PATCH 025/120] KVM: x86: use static_call_cond for optional callbacks commit 2a89061451c799bd36dbe1b90613c35212fc1f64 upstream. SVM implements neither update_emulated_instruction nor set_apic_access_page_addr. Remove an "if" by calling them with static_call_cond(). Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7334dbc572b4b..109890473fd85 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8526,8 +8526,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); - if (kvm_x86_ops.update_emulated_instruction) - static_call(kvm_x86_update_emulated_instruction)(vcpu); + static_call_cond(kvm_x86_update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -9918,10 +9917,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (!kvm_x86_ops.set_apic_access_page_addr) - return; - - static_call(kvm_x86_set_apic_access_page_addr)(vcpu); + static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) From 73771ebea6d207c378932520bff65288a6753b40 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Dec 2021 08:12:28 -0500 Subject: [PATCH 026/120] KVM: x86: remove KVM_X86_OP_NULL and mark optional kvm_x86_ops commit e4fc23bad813591417f466beb7e833cdd2089cf6 upstream. The original use of KVM_X86_OP_NULL, which was to mark calls that do not follow a specific naming convention, is not in use anymore. Instead, let's mark calls that are optional because they are always invoked within conditionals or with static_call_cond. Those that are _not_, i.e. those that are defined with KVM_X86_OP, must be defined by both vendor modules or some kind of NULL pointer dereference is bound to happen at runtime. [Backport Changes] Replace KVM_X86_OP_NULL with KVM_X86_OP_OPTIONAL for guest_memory_reclaimed() API ensuring better alignment with upstream. changes. Notably, APIs such as vm_copy_enc_context_from() and vm_move_enc_context_from() are not part of our kernel, so they are excluded from this change. The backport commit f349144228479 uses KVM_X86_OP_NULL in the vcpu_precreate() function, whereas the upstream. commit d588bb9be1da has updated vcpu_precreate() to use KVM_X86_OP_OPTIONAL_RET0 instead, which is consistent with this change. This update ensures consistency with the upstream. implementation and eliminates legacy null operations. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm-x86-ops.h | 86 +++++++++++++++--------------- arch/x86/include/asm/kvm_host.h | 4 +- arch/x86/kvm/x86.c | 2 +- 3 files changed, 46 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 6a8d4178b35da..13fc8cf5d7d87 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -1,25 +1,25 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL) +#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_OPTIONAL) BUILD_BUG_ON(1) #endif /* - * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate - * "static_call()"s. They are also intended for use when defining - * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those - * functions that follow the [svm|vmx]_func_name convention. - * KVM_X86_OP_NULL() can leave a NULL definition for the - * case where there is no definition or a function name that - * doesn't match the typical naming convention is supplied. + * KVM_X86_OP() and KVM_X86_OP_OPTIONAL() are used to help generate + * both DECLARE/DEFINE_STATIC_CALL() invocations and + * "static_call_update()" calls. + * + * KVM_X86_OP_OPTIONAL() can be used for those functions that can have + * a NULL definition, for example if "static_call_cond()" will be used + * at the call sites. */ -KVM_X86_OP_NULL(hardware_enable) -KVM_X86_OP_NULL(hardware_disable) -KVM_X86_OP_NULL(hardware_unsetup) +KVM_X86_OP(hardware_enable) +KVM_X86_OP(hardware_disable) +KVM_X86_OP(hardware_unsetup) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) -KVM_X86_OP_NULL(vm_destroy) -KVM_X86_OP_NULL(vcpu_precreate) +KVM_X86_OP_OPTIONAL(vm_destroy) +KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) @@ -33,10 +33,10 @@ KVM_X86_OP(get_segment_base) KVM_X86_OP(get_segment) KVM_X86_OP(get_cpl) KVM_X86_OP(set_segment) -KVM_X86_OP_NULL(get_cs_db_l_bits) +KVM_X86_OP(get_cs_db_l_bits) KVM_X86_OP(is_valid_cr0) KVM_X86_OP(set_cr0) -KVM_X86_OP_NULL(post_set_cr3) +KVM_X86_OP_OPTIONAL(post_set_cr3) KVM_X86_OP(is_valid_cr4) KVM_X86_OP(set_cr4) KVM_X86_OP(set_efer) @@ -52,14 +52,14 @@ KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) KVM_X86_OP(flush_tlb_all) KVM_X86_OP(flush_tlb_current) -KVM_X86_OP_NULL(tlb_remote_flush) -KVM_X86_OP_NULL(tlb_remote_flush_with_range) +KVM_X86_OP_OPTIONAL(tlb_remote_flush) +KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range) KVM_X86_OP(flush_tlb_gva) KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_run) -KVM_X86_OP_NULL(handle_exit) -KVM_X86_OP_NULL(skip_emulated_instruction) -KVM_X86_OP_NULL(update_emulated_instruction) +KVM_X86_OP(handle_exit) +KVM_X86_OP(skip_emulated_instruction) +KVM_X86_OP_OPTIONAL(update_emulated_instruction) KVM_X86_OP(set_interrupt_shadow) KVM_X86_OP(get_interrupt_shadow) KVM_X86_OP(patch_hypercall) @@ -73,22 +73,22 @@ KVM_X86_OP(get_nmi_mask) KVM_X86_OP(set_nmi_mask) KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) -KVM_X86_OP(update_cr8_intercept) +KVM_X86_OP_OPTIONAL(update_cr8_intercept) KVM_X86_OP(check_apicv_inhibit_reasons) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP(hwapic_irr_update) KVM_X86_OP(hwapic_isr_update) -KVM_X86_OP_NULL(guest_apic_has_interrupt) +KVM_X86_OP_OPTIONAL(guest_apic_has_interrupt) KVM_X86_OP(load_eoi_exitmap) KVM_X86_OP(set_virtual_apic_mode) -KVM_X86_OP_NULL(set_apic_access_page_addr) +KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) KVM_X86_OP(deliver_posted_interrupt) -KVM_X86_OP_NULL(sync_pir_to_irr) +KVM_X86_OP_OPTIONAL(sync_pir_to_irr) KVM_X86_OP(set_tss_addr) KVM_X86_OP(set_identity_map_addr) KVM_X86_OP(get_mt_mask) KVM_X86_OP(load_mmu_pgd) -KVM_X86_OP_NULL(has_wbinvd_exit) +KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) KVM_X86_OP(get_l2_tsc_multiplier) KVM_X86_OP(write_tsc_offset) @@ -96,33 +96,33 @@ KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) -KVM_X86_OP_NULL(request_immediate_exit) +KVM_X86_OP(request_immediate_exit) KVM_X86_OP(sched_in) -KVM_X86_OP_NULL(update_cpu_dirty_logging) -KVM_X86_OP_NULL(vcpu_blocking) -KVM_X86_OP_NULL(vcpu_unblocking) -KVM_X86_OP_NULL(pi_update_irte) -KVM_X86_OP_NULL(pi_start_assignment) -KVM_X86_OP_NULL(apicv_post_state_restore) -KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) -KVM_X86_OP_NULL(set_hv_timer) -KVM_X86_OP_NULL(cancel_hv_timer) +KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) +KVM_X86_OP_OPTIONAL(vcpu_blocking) +KVM_X86_OP_OPTIONAL(vcpu_unblocking) +KVM_X86_OP_OPTIONAL(pi_update_irte) +KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP(apicv_post_state_restore) +KVM_X86_OP_OPTIONAL(dy_apicv_has_pending_interrupt) +KVM_X86_OP_OPTIONAL(set_hv_timer) +KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) -KVM_X86_OP_NULL(mem_enc_ioctl) -KVM_X86_OP_NULL(mem_enc_register_region) -KVM_X86_OP_NULL(mem_enc_unregister_region) -KVM_X86_OP_NULL(guest_memory_reclaimed) +KVM_X86_OP_OPTIONAL(mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(mem_enc_register_region) +KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) +KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) -KVM_X86_OP_NULL(enable_direct_tlbflush) -KVM_X86_OP_NULL(migrate_timers) +KVM_X86_OP_OPTIONAL(enable_direct_tlbflush) +KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) -KVM_X86_OP_NULL(complete_emulated_msr) +KVM_X86_OP(complete_emulated_msr) #undef KVM_X86_OP -#undef KVM_X86_OP_NULL +#undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index efd8d0aa7fe12..98ad9aec20956 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1552,14 +1552,14 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP #include static inline void kvm_ops_static_call_update(void) { #define KVM_X86_OP(func) \ static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP #include } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 109890473fd85..76aff1c023d4c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -130,7 +130,7 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); #define KVM_X86_OP(func) \ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ *(((struct kvm_x86_ops *)0)->func)); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP #include EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); From ba2c69b5135fb363912b61b67484129d35d08c35 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Dec 2021 08:12:48 -0500 Subject: [PATCH 027/120] KVM: x86: warn on incorrectly NULL members of kvm_x86_ops commit dd2319c61888018a5295264c9b631e151dad364d upstream. Use the newly corrected KVM_X86_OP annotations to warn about possible NULL pointer dereferences as soon as the vendor module is loaded. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 98ad9aec20956..74afd6a12a4f0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1557,10 +1557,13 @@ extern struct kvm_x86_ops kvm_x86_ops; static inline void kvm_ops_static_call_update(void) { -#define KVM_X86_OP(func) \ +#define __KVM_X86_OP(func) \ static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP #include +#undef __KVM_X86_OP } #define __KVM_HAVE_ARCH_VM_ALLOC From 50696fbef73348211c2e7e5993a7a165b4e52d30 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 15 Feb 2022 13:07:10 -0500 Subject: [PATCH 028/120] KVM: x86: allow defining return-0 static calls commit 5be2226f417d5b06d17e6c52d6e341cf43c29e48 upstream. A few vendor callbacks are only used by VMX, but they return an integer or bool value. Introduce KVM_X86_OP_OPTIONAL_RET0 for them: if a func is NULL in struct kvm_x86_ops, it will be changed to __static_call_return0 when updating static calls. [Backport changes] In this commit f0f101b17c4 in file of "kernel/static_call.c" added the EXPORT_SYMBOL_GPL(__static_call_return0); Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm-x86-ops.h | 15 +++++++++------ arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/svm/avic.c | 5 ----- arch/x86/kvm/svm/svm.c | 20 -------------------- arch/x86/kvm/x86.c | 4 ++-- 5 files changed, 15 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 13fc8cf5d7d87..bf8705c566109 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -10,7 +10,9 @@ BUILD_BUG_ON(1) * * KVM_X86_OP_OPTIONAL() can be used for those functions that can have * a NULL definition, for example if "static_call_cond()" will be used - * at the call sites. + * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise + * to make a definition optional, but in this case the default will + * be __static_call_return0. */ KVM_X86_OP(hardware_enable) KVM_X86_OP(hardware_disable) @@ -78,15 +80,15 @@ KVM_X86_OP(check_apicv_inhibit_reasons) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP(hwapic_irr_update) KVM_X86_OP(hwapic_isr_update) -KVM_X86_OP_OPTIONAL(guest_apic_has_interrupt) +KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt) KVM_X86_OP(load_eoi_exitmap) KVM_X86_OP(set_virtual_apic_mode) KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) KVM_X86_OP(deliver_posted_interrupt) KVM_X86_OP_OPTIONAL(sync_pir_to_irr) -KVM_X86_OP(set_tss_addr) -KVM_X86_OP(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) +KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) @@ -104,7 +106,7 @@ KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) KVM_X86_OP_OPTIONAL(pi_start_assignment) KVM_X86_OP(apicv_post_state_restore) -KVM_X86_OP_OPTIONAL(dy_apicv_has_pending_interrupt) +KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) @@ -126,3 +128,4 @@ KVM_X86_OP(complete_emulated_msr) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL +#undef KVM_X86_OP_OPTIONAL_RET0 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74afd6a12a4f0..41d855fb43bcc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1553,6 +1553,7 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); #define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include static inline void kvm_ops_static_call_update(void) @@ -1562,6 +1563,9 @@ static inline void kvm_ops_static_call_update(void) #define KVM_X86_OP(func) \ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) #define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func ? : \ + (void *) __static_call_return0); #include #undef __KVM_X86_OP } diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b595a33860d70..d01996deeff34 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -686,11 +686,6 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) return 0; } -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return false; -} - static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 24eabcfc8ab49..3500e67e48b69 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3625,16 +3625,6 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - -static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) -{ - return 0; -} - void svm_flush_tlb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4044,11 +4034,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4675,10 +4660,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .hwapic_isr_update = svm_hwapic_isr_update, .apicv_post_state_restore = avic_post_state_restore, - .set_tss_addr = svm_set_tss_addr, - .set_identity_map_addr = svm_set_identity_map_addr, - .get_mt_mask = svm_get_mt_mask, - .get_exit_info = svm_get_exit_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -4703,7 +4684,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .nested_ops = &svm_nested_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, - .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .pi_update_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76aff1c023d4c..7b669f9052540 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ *(((struct kvm_x86_ops *)0)->func)); #define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); @@ -12198,8 +12199,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && - kvm_x86_ops.guest_apic_has_interrupt && - static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); + static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) From 481ae8b7e3d81149510f27cab9c34a0e4b680e6b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Feb 2022 16:23:55 +0000 Subject: [PATCH 029/120] KVM: x86: Fix pointer mistmatch warning when patching RET0 static calls commit 925088781eede43cf6616a1197c31dee451b7948 upstream. Cast kvm_x86_ops.func to 'void *' when updating KVM static calls that are conditionally patched to __static_call_return0(). clang complains about using mismatching pointers in the ternary operator, which breaks the build when compiling with CONFIG_KVM_WERROR=y. >> arch/x86/include/asm/kvm-x86-ops.h:82:1: warning: pointer type mismatch ('bool (*)(struct kvm_vcpu *)' and 'void *') [-Wpointer-type-mismatch] Fixes: 5be2226f417d ("KVM: x86: allow defining return-0 static calls") Reported-by: Like Xu Reported-by: kernel test robot Signed-off-by: Sean Christopherson Reviewed-by: David Dunn Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Message-Id: <20220223162355.3174907-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 41d855fb43bcc..57a23b277cf5d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1564,8 +1564,8 @@ static inline void kvm_ops_static_call_update(void) WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) #define KVM_X86_OP_OPTIONAL __KVM_X86_OP #define KVM_X86_OP_OPTIONAL_RET0(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func ? : \ - (void *) __static_call_return0); + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); #include #undef __KVM_X86_OP } From c7f1184735a522a02e1dec6f49ebff29e74eb7ae Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Oct 2021 13:26:38 -0400 Subject: [PATCH 030/120] KVM: x86: add support for CPUID leaf 0x80000021 commit 58b3d12c0a860cda34ed9d2378078ea5134e6812 upstream. CPUID leaf 0x80000021 defines some features (or lack of bugs) of AMD processors. Expose the ones that make sense via KVM_GET_SUPPORTED_CPUID. Signed-off-by: Paolo Bonzini Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ce579dba9ac90..d2185f46c1369 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1020,7 +1020,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; break; case 0x80000000: - entry->eax = min(entry->eax, 0x8000001f); + entry->eax = min(entry->eax, 0x80000021); break; case 0x80000001: entry->ebx &= ~GENMASK(27, 16); @@ -1101,6 +1101,23 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx &= ~GENMASK(11, 6); } break; + case 0x80000020: + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + case 0x80000021: + entry->ebx = entry->ecx = entry->edx = 0; + /* + * Pass down these bits: + * EAX 0 NNDBP, Processor ignores nested data breakpoints + * EAX 2 LAS, LFENCE always serializing + * EAX 6 NSCB, Null selector clear base + * + * Other defined bits are for MSRs that KVM does not expose: + * EAX 3 SPCL, SMM page configuration lock + * EAX 13 PCMSR, Prefetch control MSR + */ + entry->eax &= BIT(0) | BIT(2) | BIT(6); + break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ From d033ce50f6a80dd30fe5d8dbec331e543df39fa4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Oct 2021 17:19:27 -0400 Subject: [PATCH 031/120] KVM: x86: synthesize CPUID leaf 0x80000021h if useful commit f144c49e8c3950add1b051e76ebf40a258984c9d upstream. Guests X86_BUG_NULL_SEG if and only if the host has them. Use the info from static_cpu_has_bug to form the 0x80000021 CPUID leaf that was defined for Zen3. Userspace can then set the bit even on older CPUs that do not have the bug, such as Zen2. Do the same for X86_FEATURE_LFENCE_RDTSC as well, since various processors have had very different ways of detecting it and not all of them are available to userspace. Signed-off-by: Paolo Bonzini Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d2185f46c1369..20a75e3a1d1d2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -680,6 +680,19 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, /* Hypervisor leaves are always synthesized by __do_cpuid_func. */ return entry; + case 0x80000000: + /* + * 0x80000021 is sometimes synthesized by __do_cpuid_func, which + * would result in out-of-bounds calls to do_host_cpuid. + */ + { + static int max_cpuid_80000000; + if (!READ_ONCE(max_cpuid_80000000)) + WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000)); + if (function > READ_ONCE(max_cpuid_80000000)) + return entry; + } + default: break; } @@ -1021,6 +1034,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; case 0x80000000: entry->eax = min(entry->eax, 0x80000021); + /* + * Serializing LFENCE is reported in a multitude of ways, + * and NullSegClearsBase is not reported in CPUID on Zen2; + * help userspace by providing the CPUID leaf ourselves. + */ + if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC) + || !static_cpu_has_bug(X86_BUG_NULL_SEG)) + entry->eax = max(entry->eax, 0x80000021); break; case 0x80000001: entry->ebx &= ~GENMASK(27, 16); @@ -1117,6 +1138,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * EAX 13 PCMSR, Prefetch control MSR */ entry->eax &= BIT(0) | BIT(2) | BIT(6); + if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)) + entry->eax |= BIT(2); + if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) + entry->eax |= BIT(6); break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: From cfdc7870e4a40865cea6f91b92c942bb37bfcbdd Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 22 Mar 2022 08:29:06 -0700 Subject: [PATCH 032/120] KVM: x86: Fix clang -Wimplicit-fallthrough in do_host_cpuid() commit 07ea4ab1f9b83953ff5c3f6ccfb84d581bfe0046 upstream. Clang warns: arch/x86/kvm/cpuid.c:739:2: error: unannotated fall-through between switch labels [-Werror,-Wimplicit-fallthrough] default: ^ arch/x86/kvm/cpuid.c:739:2: note: insert 'break;' to avoid fall-through default: ^ break; 1 error generated. Clang is a little more pedantic than GCC, which does not warn when falling through to a case that is just break or return. Clang's version is more in line with the kernel's own stance in deprecated.rst, which states that all switch/case blocks must end in either break, fallthrough, continue, goto, or return. Add the missing break to silence the warning. Fixes: f144c49e8c39 ("KVM: x86: synthesize CPUID leaf 0x80000021h if useful") Reported-by: kernel test robot Signed-off-by: Nathan Chancellor Message-Id: <20220322152906.112164-1-nathan@kernel.org> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 20a75e3a1d1d2..ee57b1d2e5633 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -692,6 +692,7 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, if (function > READ_ONCE(max_cpuid_80000000)) return entry; } + break; default: break; From 8e15d786fa816499d42995cf4969de2507297474 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:10 -0600 Subject: [PATCH 033/120] KVM: x86: Move lookup of indexed CPUID leafs to helper commit b66370db9a90b3fa4c4a1a732af3e7e38d6d4c7c upstream. Determining which CPUID leafs have significant ECX/index values is also needed by guest kernel code when doing SEV-SNP-validated CPUID lookups. Move this to common code to keep future updates in sync. Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-31-brijesh.singh@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/include/asm/cpuid.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/cpuid.c | 19 ++----------------- 2 files changed, 36 insertions(+), 17 deletions(-) create mode 100644 arch/x86/include/asm/cpuid.h diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h new file mode 100644 index 0000000000000..70b2db18165eb --- /dev/null +++ b/arch/x86/include/asm/cpuid.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * CPUID-related helpers/definitions + * + * Derived from arch/x86/kvm/cpuid.c + */ + +#ifndef _ASM_X86_CPUID_H +#define _ASM_X86_CPUID_H + +static __always_inline bool cpuid_function_is_indexed(u32 function) +{ + switch (function) { + case 4: + case 7: + case 0xb: + case 0xd: + case 0xf: + case 0x10: + case 0x12: + case 0x14: + case 0x17: + case 0x18: + case 0x1d: + case 0x1e: + case 0x1f: + case 0x8000001d: + return true; + } + + return false; +} + +#endif /* _ASM_X86_CPUID_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ee57b1d2e5633..e764bd4f6bf98 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -701,24 +702,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - switch (function) { - case 4: - case 7: - case 0xb: - case 0xd: - case 0xf: - case 0x10: - case 0x12: - case 0x14: - case 0x17: - case 0x18: - case 0x1d: - case 0x1e: - case 0x1f: - case 0x8000001d: + if (cpuid_function_is_indexed(function)) entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - break; - } return entry; } From 6ac0f64a9f3a6f56269c976d07d430152daceab5 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 29 Mar 2022 23:50:51 +0000 Subject: [PATCH 034/120] KVM: x86: Move kvm_ops_static_call_update() to x86.c commit fdc298da866165ec0288fe227982f1aa0467bf5c upstream. The kvm_ops_static_call_update() is defined in kvm_host.h. That's completely unnecessary, it should have exactly one caller, kvm_arch_hardware_setup(). Move the helper to x86.c and have it do the actual memcpy() of the ops in addition to the static call updates. This will also allow for cleanly giving kvm_pmu_ops static_call treatment. Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: Move memcpy() into the helper and rename accordingly] Signed-off-by: Sean Christopherson Message-Id: <20220329235054.3534728-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 14 -------------- arch/x86/kvm/x86.c | 19 +++++++++++++++++-- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 57a23b277cf5d..775f85a8864d5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1556,20 +1556,6 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include -static inline void kvm_ops_static_call_update(void) -{ -#define __KVM_X86_OP(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP(func) \ - WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP -#define KVM_X86_OP_OPTIONAL_RET0(func) \ - static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ - (void *)__static_call_return0); -#include -#undef __KVM_X86_OP -} - #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7b669f9052540..01a9b23121cc1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11593,6 +11593,22 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) +{ + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); +#include +#undef __KVM_X86_OP +} + int kvm_arch_hardware_setup(void *opaque) { struct kvm_x86_init_ops *ops = opaque; @@ -11607,8 +11623,7 @@ int kvm_arch_hardware_setup(void *opaque) if (r != 0) return r; - memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - kvm_ops_static_call_update(); + kvm_ops_update(ops); /* Temporary ugliness. */ if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) From d0f446dec2ceaa632d2b7ef39e804d730e26cdaa Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 29 Mar 2022 23:50:52 +0000 Subject: [PATCH 035/120] KVM: x86: Copy kvm_pmu_ops by value to eliminate layer of indirection commit 8f969c0c3443183972ac7311d2bd656806082579 upstream. Replace the kvm_pmu_ops pointer in common x86 with an instance of the struct to save one pointer dereference when invoking functions. Copy the struct by value to set the ops during kvm_init(). Signed-off-by: Like Xu [sean: Move pmc_is_enabled(), make kvm_pmu_ops static] Signed-off-by: Sean Christopherson Message-Id: <20220329235054.3534728-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 54 ++++++++++++++++++++++++++++------------------ arch/x86/kvm/pmu.h | 7 ++---- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 37 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f8821ee5c1b4d..77cf7af21bbb2 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -49,6 +49,18 @@ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters */ +static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; + +void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) +{ + memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops)); +} + +static inline bool pmc_is_enabled(struct kvm_pmc *pmc) +{ + return kvm_pmu_ops.pmc_is_enabled(pmc); +} + static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); @@ -210,7 +222,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) ARCH_PERFMON_EVENTSEL_CMASK | HSW_IN_TX | HSW_IN_TX_CHECKPOINTED))) { - config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); + config = kvm_pmu_ops.pmc_perf_hw_id(pmc); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; } @@ -260,7 +272,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc), + kvm_pmu_ops.pmc_perf_hw_id(pmc), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ pmi); @@ -269,7 +281,7 @@ EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx); + struct kvm_pmc *pmc = kvm_pmu_ops.pmc_idx_to_pmc(pmu, pmc_idx); if (!pmc) return; @@ -291,7 +303,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) int bit; for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit); + struct kvm_pmc *pmc = kvm_pmu_ops.pmc_idx_to_pmc(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { clear_bit(bit, pmu->reprogram_pmi); @@ -313,7 +325,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) /* check if idx is a valid index to access PMU */ int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { - return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); + return kvm_pmu_ops.is_valid_rdpmc_ecx(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -363,7 +375,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask); + pmc = kvm_pmu_ops.rdpmc_ecx_to_pmc(vcpu, idx, &mask); if (!pmc) return 1; @@ -379,22 +391,22 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { - if (kvm_x86_ops.pmu_ops->deliver_pmi) - kvm_x86_ops.pmu_ops->deliver_pmi(vcpu); + if (kvm_pmu_ops.deliver_pmi) + kvm_pmu_ops.deliver_pmi(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } } bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) || - kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr); + return kvm_pmu_ops.msr_idx_to_pmc(vcpu, msr) || + kvm_pmu_ops.is_valid_msr(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr); + struct kvm_pmc *pmc = kvm_pmu_ops.msr_idx_to_pmc(vcpu, msr); if (pmc) __set_bit(pmc->idx, pmu->pmc_in_use); @@ -402,13 +414,13 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info); + return kvm_pmu_ops.get_msr(vcpu, msr_info); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info); + return kvm_pmu_ops.set_msr(vcpu, msr_info); } /* refresh PMU settings. This function generally is called when underlying @@ -417,7 +429,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { - kvm_x86_ops.pmu_ops->refresh(vcpu); + kvm_pmu_ops.refresh(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) @@ -425,7 +437,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); irq_work_sync(&pmu->irq_work); - kvm_x86_ops.pmu_ops->reset(vcpu); + kvm_pmu_ops.reset(vcpu); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -433,7 +445,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - kvm_x86_ops.pmu_ops->init(vcpu); + kvm_pmu_ops.init(vcpu); init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); pmu->event_count = 0; pmu->need_cleanup = false; @@ -465,14 +477,14 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { - pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + pmc = kvm_pmu_ops.pmc_idx_to_pmc(pmu, i); if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) pmc_stop_counter(pmc); } - if (kvm_x86_ops.pmu_ops->cleanup) - kvm_x86_ops.pmu_ops->cleanup(vcpu); + if (kvm_pmu_ops.cleanup) + kvm_pmu_ops.cleanup(vcpu); bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } @@ -502,7 +514,7 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, unsigned int config; pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); - config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); + config = kvm_pmu_ops.pmc_perf_hw_id(pmc); pmc->eventsel = old_eventsel; return config == perf_hw_id; } @@ -530,7 +542,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) int i; for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { - pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + pmc = kvm_pmu_ops.pmc_idx_to_pmc(pmu, i); if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) continue; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index e6636ad8cad17..34496b0a20097 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -39,6 +39,8 @@ struct kvm_pmu_ops { void (*cleanup)(struct kvm_vcpu *vcpu); }; +void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); + static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -86,11 +88,6 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc) return pmc->type == KVM_PMC_FIXED; } -static inline bool pmc_is_enabled(struct kvm_pmc *pmc) -{ - return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc); -} - static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, u64 data) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01a9b23121cc1..17e3d689a07ca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11607,6 +11607,8 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) (void *)__static_call_return0); #include #undef __KVM_X86_OP + + kvm_pmu_ops_update(ops->runtime_ops->pmu_ops); } int kvm_arch_hardware_setup(void *opaque) From a93af2eaf6f987faf69de7400ef6bb1dbd0a2019 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 29 Mar 2022 23:50:53 +0000 Subject: [PATCH 036/120] KVM: x86: Move .pmu_ops to kvm_x86_init_ops and tag as __initdata commit 34886e796c413273908b036df57cc9ae731badae upstream. The pmu_ops should be moved to kvm_x86_init_ops and tagged as __initdata. That'll save those precious few bytes, and more importantly make the original ops unreachable, i.e. make it harder to sneak in post-init modification bugs. Suggested-by: Sean Christopherson Signed-off-by: Like Xu Reviewed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20220329235054.3534728-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 775f85a8864d5..e4ca107e07ecd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1464,8 +1464,6 @@ struct kvm_x86_ops { int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); - /* pmu operations of sub-arch */ - const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; void (*vcpu_blocking)(struct kvm_vcpu *vcpu); @@ -1535,6 +1533,7 @@ struct kvm_x86_init_ops { bool (*intel_pt_intr_in_guest)(void); struct kvm_x86_ops *runtime_ops; + struct kvm_pmu_ops *pmu_ops; }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 59ebce60351f5..06bb327fa7089 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -343,7 +343,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } } -struct kvm_pmu_ops amd_pmu_ops = { +struct kvm_pmu_ops amd_pmu_ops __initdata = { .pmc_perf_hw_id = amd_pmc_perf_hw_id, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3500e67e48b69..737ecbc3b2a42 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4680,7 +4680,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .sched_in = svm_sched_in, - .pmu_ops = &amd_pmu_ops, .nested_ops = &svm_nested_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, @@ -4716,6 +4715,7 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, + .pmu_ops = &amd_pmu_ops, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b276632ceb14d..299cda5109cba 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -723,7 +723,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } -struct kvm_pmu_ops intel_pmu_ops = { +struct kvm_pmu_ops intel_pmu_ops __initdata = { .pmc_perf_hw_id = intel_pmc_perf_hw_id, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0a82b1bf09253..30d447e5ecc7d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8026,7 +8026,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, .pi_update_irte = pi_update_irte, @@ -8278,6 +8277,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest, .runtime_ops = &vmx_x86_ops, + .pmu_ops = &intel_pmu_ops, }; static void vmx_cleanup_l1d_flush(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 17e3d689a07ca..8c3e15446265a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11608,7 +11608,7 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) #include #undef __KVM_X86_OP - kvm_pmu_ops_update(ops->runtime_ops->pmu_ops); + kvm_pmu_ops_update(ops->pmu_ops); } int kvm_arch_hardware_setup(void *opaque) From 3d5304abdb57c880cdba49accfbe538d30a6539e Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 29 Mar 2022 23:50:54 +0000 Subject: [PATCH 037/120] KVM: x86: Use static calls to reduce kvm_pmu_ops overhead commit 1921f3aa9263977f6c31b1c3eb2814bff2e84a12 upstream. Use static calls to improve kvm_pmu_ops performance, following the same pattern and naming scheme used by kvm-x86-ops.h. Here are the worst fenced_rdtsc() cycles numbers for the kvm_pmu_ops functions that is most often called (up to 7 digits of calls) when running a single perf test case in a guest on an ICX 2.70GHz host (mitigations=on): | legacy | static call ------------------------------------------------------------ .pmc_idx_to_pmc | 1304840 | 994872 (+23%) .pmc_is_enabled | 978670 | 1011750 (-3%) .msr_idx_to_pmc | 47828 | 41690 (+12%) .is_valid_msr | 28786 | 30108 (-4%) Signed-off-by: Like Xu [sean: Handle static call updates in pmu.c, tweak changelog] Signed-off-by: Sean Christopherson Message-Id: <20220329235054.3534728-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 31 ++++++++++++++ arch/x86/kvm/pmu.c | 56 ++++++++++++++++---------- 2 files changed, 65 insertions(+), 22 deletions(-) create mode 100644 arch/x86/include/asm/kvm-x86-pmu-ops.h diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h new file mode 100644 index 0000000000000..fdfd8e06fee6d --- /dev/null +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_X86_PMU_OP) || !defined(KVM_X86_PMU_OP_OPTIONAL) +BUILD_BUG_ON(1) +#endif + +/* + * KVM_X86_PMU_OP() and KVM_X86_PMU_OP_OPTIONAL() are used to help generate + * both DECLARE/DEFINE_STATIC_CALL() invocations and + * "static_call_update()" calls. + * + * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have + * a NULL definition, for example if "static_call_cond()" will be used + * at the call sites. + */ +KVM_X86_PMU_OP(pmc_perf_hw_id) +KVM_X86_PMU_OP(pmc_is_enabled) +KVM_X86_PMU_OP(pmc_idx_to_pmc) +KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) +KVM_X86_PMU_OP(msr_idx_to_pmc) +KVM_X86_PMU_OP(is_valid_rdpmc_ecx) +KVM_X86_PMU_OP(is_valid_msr) +KVM_X86_PMU_OP(get_msr) +KVM_X86_PMU_OP(set_msr) +KVM_X86_PMU_OP(refresh) +KVM_X86_PMU_OP(init) +KVM_X86_PMU_OP(reset) +KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) +KVM_X86_PMU_OP_OPTIONAL(cleanup) + +#undef KVM_X86_PMU_OP +#undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 77cf7af21bbb2..4526c9714b7eb 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -51,14 +51,28 @@ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; +#define KVM_X86_PMU_OP(func) \ + DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \ + *(((struct kvm_pmu_ops *)0)->func)); +#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP +#include + void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) { memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops)); + +#define __KVM_X86_PMU_OP(func) \ + static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func); +#define KVM_X86_PMU_OP(func) \ + WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func) +#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP +#include +#undef __KVM_X86_PMU_OP } static inline bool pmc_is_enabled(struct kvm_pmc *pmc) { - return kvm_pmu_ops.pmc_is_enabled(pmc); + return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); } static void kvm_pmi_trigger_fn(struct irq_work *irq_work) @@ -222,7 +236,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) ARCH_PERFMON_EVENTSEL_CMASK | HSW_IN_TX | HSW_IN_TX_CHECKPOINTED))) { - config = kvm_pmu_ops.pmc_perf_hw_id(pmc); + config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; } @@ -272,7 +286,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - kvm_pmu_ops.pmc_perf_hw_id(pmc), + static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ pmi); @@ -281,7 +295,7 @@ EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { - struct kvm_pmc *pmc = kvm_pmu_ops.pmc_idx_to_pmc(pmu, pmc_idx); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx); if (!pmc) return; @@ -303,7 +317,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) int bit; for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = kvm_pmu_ops.pmc_idx_to_pmc(pmu, bit); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { clear_bit(bit, pmu->reprogram_pmi); @@ -325,7 +339,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) /* check if idx is a valid index to access PMU */ int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { - return kvm_pmu_ops.is_valid_rdpmc_ecx(vcpu, idx); + return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -375,7 +389,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_pmu_ops.rdpmc_ecx_to_pmc(vcpu, idx, &mask); + pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask); if (!pmc) return 1; @@ -391,22 +405,21 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { - if (kvm_pmu_ops.deliver_pmi) - kvm_pmu_ops.deliver_pmi(vcpu); + static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } } bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return kvm_pmu_ops.msr_idx_to_pmc(vcpu, msr) || - kvm_pmu_ops.is_valid_msr(vcpu, msr); + return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || + static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = kvm_pmu_ops.msr_idx_to_pmc(vcpu, msr); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr); if (pmc) __set_bit(pmc->idx, pmu->pmc_in_use); @@ -414,13 +427,13 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return kvm_pmu_ops.get_msr(vcpu, msr_info); + return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return kvm_pmu_ops.set_msr(vcpu, msr_info); + return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); } /* refresh PMU settings. This function generally is called when underlying @@ -429,7 +442,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { - kvm_pmu_ops.refresh(vcpu); + static_call(kvm_x86_pmu_refresh)(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) @@ -437,7 +450,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); irq_work_sync(&pmu->irq_work); - kvm_pmu_ops.reset(vcpu); + static_call(kvm_x86_pmu_reset)(vcpu); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -445,7 +458,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - kvm_pmu_ops.init(vcpu); + static_call(kvm_x86_pmu_init)(vcpu); init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); pmu->event_count = 0; pmu->need_cleanup = false; @@ -477,14 +490,13 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { - pmc = kvm_pmu_ops.pmc_idx_to_pmc(pmu, i); + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) pmc_stop_counter(pmc); } - if (kvm_pmu_ops.cleanup) - kvm_pmu_ops.cleanup(vcpu); + static_call_cond(kvm_x86_pmu_cleanup)(vcpu); bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } @@ -514,7 +526,7 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, unsigned int config; pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); - config = kvm_pmu_ops.pmc_perf_hw_id(pmc); + config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); pmc->eventsel = old_eventsel; return config == perf_hw_id; } @@ -542,7 +554,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) int i; for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { - pmc = kvm_pmu_ops.pmc_idx_to_pmc(pmu, i); + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) continue; From 5bdaf00f3b33c41d1d1b4026998771bacc5d0012 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 29 Apr 2022 14:43:04 -0400 Subject: [PATCH 038/120] KVM: x86: work around QEMU issue with synthetic CPUID leaves commit f751d8eac17692905cdd6935f72d523d8adf3b65 upstream. Synthesizing AMD leaves up to 0x80000021 caused problems with QEMU, which assumes the *host* CPUID[0x80000000].EAX is higher or equal to what KVM_GET_SUPPORTED_CPUID reports. This causes QEMU to issue bogus host CPUIDs when preparing the input to KVM_SET_CPUID2. It can even get into an infinite loop, which is only terminated by an abort(): cpuid_data is full, no space for cpuid(eax:0x8000001d,ecx:0x3e) To work around this, only synthesize those leaves if 0x8000001d exists on the host. The synthetic 0x80000021 leaf is mostly useful on Zen2, which satisfies the condition. Fixes: f144c49e8c39 ("KVM: x86: synthesize CPUID leaf 0x80000021h if useful") Reported-by: Maxim Levitsky Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e764bd4f6bf98..65a547df39a42 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1021,12 +1021,21 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 0x80000000: entry->eax = min(entry->eax, 0x80000021); /* - * Serializing LFENCE is reported in a multitude of ways, - * and NullSegClearsBase is not reported in CPUID on Zen2; - * help userspace by providing the CPUID leaf ourselves. + * Serializing LFENCE is reported in a multitude of ways, and + * NullSegClearsBase is not reported in CPUID on Zen2; help + * userspace by providing the CPUID leaf ourselves. + * + * However, only do it if the host has CPUID leaf 0x8000001d. + * QEMU thinks that it can query the host blindly for that + * CPUID leaf if KVM reports that it supports 0x8000001d or + * above. The processor merrily returns values from the + * highest Intel leaf which QEMU tries to use as the guest's + * 0x8000001d. Even worse, this can result in an infinite + * loop if said highest leaf has no subleaves indexed by ECX. */ - if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC) - || !static_cpu_has_bug(X86_BUG_NULL_SEG)) + if (entry->eax >= 0x8000001d && + (static_cpu_has(X86_FEATURE_LFENCE_RDTSC) + || !static_cpu_has_bug(X86_BUG_NULL_SEG))) entry->eax = max(entry->eax, 0x80000021); break; case 0x80000001: From 58e417fae7eba72e449b4b835aedc1cf7f1dab5b Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Tue, 17 May 2022 05:12:36 +0000 Subject: [PATCH 039/120] kvm: x86/pmu: Fix the compare function used by the pmu event filter commit 4ac19ead0dfbabd8e0bfc731f507cfb0b95d6c99 upstream. When returning from the compare function the u64 is truncated to an int. This results in a loss of the high nybble[1] in the event select and its sign if that nybble is in use. Switch from using a result that can end up being truncated to a result that can only be: 1, 0, -1. [1] bits 35:32 in the event select register and bits 11:8 in the event select. Fixes: 7ff775aca48ad ("KVM: x86/pmu: Use binary search to check filtered events") Signed-off-by: Aaron Lewis Reviewed-by: Sean Christopherson Message-Id: <20220517051238.2566934-1-aaronlewis@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 4526c9714b7eb..0fb26018e89b7 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -194,9 +194,12 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } -static int cmp_u64(const void *a, const void *b) +static int cmp_u64(const void *pa, const void *pb) { - return *(__u64 *)a - *(__u64 *)b; + u64 a = *(u64 *)pa; + u64 b = *(u64 *)pb; + + return (a > b) - (a < b); } void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) From 0508bde1dbe75b3986f31a5b337ad22a5f9b8c1b Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:36 +0800 Subject: [PATCH 040/120] KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS commit c59a1f106f5cd4843c097069ff1bb2ad72103a67 upstream. If IA32_PERF_CAPABILITIES.PEBS_BASELINE [bit 14] is set, the IA32_PEBS_ENABLE MSR exists and all architecturally enumerated fixed and general-purpose counters have corresponding bits in IA32_PEBS_ENABLE that enable generation of PEBS records. The general-purpose counter bits start at bit IA32_PEBS_ENABLE[0], and the fixed counter bits start at bit IA32_PEBS_ENABLE[32]. When guest PEBS is enabled, the IA32_PEBS_ENABLE MSR will be added to the perf_guest_switch_msr() and atomically switched during the VMX transitions just like CORE_PERF_GLOBAL_CTRL MSR. Based on whether the platform supports x86_pmu.pebs_ept, it has also refactored the way to add more msrs to arr[] in intel_guest_get_msrs() for extensibility. Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-8-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/events/intel/core.c | 75 ++++++++++++++++++++++++-------- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/include/asm/msr-index.h | 6 +++ arch/x86/kvm/vmx/pmu_intel.c | 31 +++++++++++++ arch/x86/kvm/x86.c | 1 + 5 files changed, 98 insertions(+), 18 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ddd5817042f7a..cc81172faa0c5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3970,33 +3970,72 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } +/* + * Currently, the only caller of this function is the atomic_switch_perf_msrs(). + * The host perf conext helps to prepare the values of the real hardware for + * a set of msrs that need to be switched atomically in a vmx transaction. + * + * For example, the pseudocode needed to add a new msr should look like: + * + * arr[(*nr)++] = (struct perf_guest_switch_msr){ + * .msr = the hardware msr address, + * .host = the value the hardware has when it doesn't run a guest, + * .guest = the value the hardware has when it runs a guest, + * }; + * + * These values have nothing to do with the emulated values the guest sees + * when it uses {RD,WR}MSR, which should be handled by the KVM context, + * specifically in the intel_pmu_{get,set}_msr(). + */ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); + u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; + int global_ctrl, pebs_enable; + + *nr = 0; + global_ctrl = (*nr)++; + arr[global_ctrl] = (struct perf_guest_switch_msr){ + .msr = MSR_CORE_PERF_GLOBAL_CTRL, + .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask, + .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), + }; - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; - arr[0].guest &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); - *nr = 1; + if (!x86_pmu.pebs) + return arr; - if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) { - /* - * If PMU counter has PEBS enabled it is not enough to - * disable counter on a guest entry since PEBS memory - * write can overshoot guest entry and corrupt guest - * memory. Disabling PEBS solves the problem. - * - * Don't do this if the CPU already enforces it. - */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - *nr = 2; + /* + * If PMU counter has PEBS enabled it is not enough to + * disable counter on a guest entry since PEBS memory + * write can overshoot guest entry and corrupt guest + * memory. Disabling PEBS solves the problem. + * + * Don't do this if the CPU already enforces it. + */ + if (x86_pmu.pebs_no_isolation) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled, + .guest = 0, + }; + return arr; } + if (!x86_pmu.pebs_ept) + return arr; + pebs_enable = (*nr)++; + + arr[pebs_enable] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, + .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, + }; + + /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ + arr[0].guest |= arr[*nr].guest; + return arr; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e4ca107e07ecd..f01840e653eee 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -519,6 +519,9 @@ struct kvm_pmu { DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + u64 pebs_enable; + u64 pebs_enable_mask; + /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 45aa98bf804f9..0c6a5cec945ec 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -254,6 +254,12 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 299cda5109cba..457b5f3bbb0d6 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -212,6 +212,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: return intel_pmu_has_perf_global_ctrl(pmu); break; + case MSR_IA32_PEBS_ENABLE: + ret = vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT; + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -359,6 +362,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; return 0; + case MSR_IA32_PEBS_ENABLE: + msr_info->data = pmu->pebs_enable; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -419,6 +425,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PEBS_ENABLE: + if (pmu->pebs_enable == data) + return 0; + if (!(data & pmu->pebs_enable_mask)) { + pmu->pebs_enable = data; + return 0; + } + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -489,6 +503,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ctrl_mask = ~0ull; pmu->global_ovf_ctrl_mask = ~0ull; pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry || !enable_pmu) @@ -559,6 +574,22 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_BASELINE) { + pmu->pebs_enable_mask = ~pmu->global_ctrl; + pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmu->fixed_ctr_ctrl_mask &= + ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + } + } else { + pmu->pebs_enable_mask = + ~((1ull << pmu->nr_arch_gp_counters) - 1); + } + } else { + vcpu->arch.perf_capabilities &= ~PERF_CAP_PEBS_MASK; + } } static void intel_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8c3e15446265a..00cab9744063c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1386,6 +1386,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + MSR_IA32_PEBS_ENABLE, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, From e24353d174f8e5c2d8e5106cb06e80227d0f324b Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:37 +0800 Subject: [PATCH 041/120] KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter commit 79f3e3b58386a2fc05054367b905619f741beeb4 upstream. When a guest counter is configured as a PEBS counter through IA32_PEBS_ENABLE, a guest PEBS event will be reprogrammed by configuring a non-zero precision level in the perf_event_attr. The guest PEBS overflow PMI bit would be set in the guest GLOBAL_STATUS MSR when PEBS facility generates a PEBS overflow PMI based on guest IA32_DS_AREA MSR. Even with the same counter index and the same event code and mask, guest PEBS events will not be reused for non-PEBS events. Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-9-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 0fb26018e89b7..56a8d65fd448a 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -86,15 +86,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); + bool skip_pmi = false; /* Ignore counters that have been reprogrammed already. */ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) return; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { + /* Indicate PEBS overflow PMI to guest. */ + skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, + (unsigned long *)&pmu->global_status); + } else { + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + } kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - if (!pmc->intr) + if (!pmc->intr || skip_pmi) return; /* @@ -124,6 +131,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; struct perf_event_attr attr = { .type = type, @@ -135,6 +143,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable); attr.sample_period = get_sample_period(pmc, pmc->counter); @@ -147,6 +156,23 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, */ attr.sample_period = 0; } + if (pebs) { + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + * + * For most PEBS hardware events, the difference in the software + * precision levels of guest and host PEBS events will not affect + * the accuracy of the PEBS profiling result, because the "event IP" + * in the PEBS record is calibrated on the guest side. + * + * On Icelake everything is fine. Other hardware (GLC+, TNT+) that + * could possibly care here is unsupported and needs changes. + */ + attr.precise_ip = 1; + } event = perf_event_create_kernel_counter(&attr, -1, current, kvm_perf_overflow, pmc); @@ -160,7 +186,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; - pmc->intr = intr; + pmc->intr = intr || pebs; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -186,6 +212,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter))) return false; + if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) && + pmc->perf_event->attr.precise_ip) + return false; + /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); pmc->is_paused = false; From 2b3ba8a6acdd42bd2577aba1bb320e6bc0078cb3 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:39 +0800 Subject: [PATCH 042/120] KVM: x86/pmu: Add IA32_DS_AREA MSR emulation to support guest DS commit 8183a538cd95f72f11871b35726256ec3bcb9439 upstream. When CPUID.01H:EDX.DS[21] is set, the IA32_DS_AREA MSR exists and points to the linear address of the first byte of the DS buffer management area, which is used to manage the PEBS records. When guest PEBS is enabled, the MSR_IA32_DS_AREA MSR will be added to the perf_guest_switch_msr() and switched during the VMX transitions just like CORE_PERF_GLOBAL_CTRL MSR. The WRMSR to IA32_DS_AREA MSR brings a #GP(0) if the source register contains a non-canonical address. Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-11-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/events/intel/core.c | 10 +++++++++- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/pmu_intel.c | 11 +++++++++++ arch/x86/kvm/x86.c | 2 +- 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cc81172faa0c5..5a8c2767035a8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -3991,6 +3992,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; int global_ctrl, pebs_enable; @@ -4023,8 +4025,14 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) return arr; } - if (!x86_pmu.pebs_ept) + if (!kvm_pmu || !x86_pmu.pebs_ept) return arr; + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_DS_AREA, + .host = (unsigned long)cpuc->ds, + .guest = kvm_pmu->ds_area, + }; + pebs_enable = (*nr)++; arr[pebs_enable] = (struct perf_guest_switch_msr){ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f01840e653eee..fa73f0c3fcf4f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -519,6 +519,7 @@ struct kvm_pmu { DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + u64 ds_area; u64 pebs_enable; u64 pebs_enable_mask; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 457b5f3bbb0d6..05322cfa10e0d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -215,6 +215,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_IA32_PEBS_ENABLE: ret = vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT; break; + case MSR_IA32_DS_AREA: + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -365,6 +368,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_PEBS_ENABLE: msr_info->data = pmu->pebs_enable; return 0; + case MSR_IA32_DS_AREA: + msr_info->data = pmu->ds_area; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -433,6 +439,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_DS_AREA: + if (is_noncanonical_address(data, vcpu)) + return 1; + pmu->ds_area = data; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00cab9744063c..5993d8a92f0fb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1386,7 +1386,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - MSR_IA32_PEBS_ENABLE, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, From 6a2eb1beab2168fcb5d1f9139c034566413f7cbf Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:40 +0800 Subject: [PATCH 043/120] KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 902caeb6841a64072791b1c18f9f56089566865d upstream. If IA32_PERF_CAPABILITIES.PEBS_BASELINE [bit 14] is set, the adaptive PEBS is supported. The PEBS_DATA_CFG MSR and adaptive record enable bits (IA32_PERFEVTSELx.Adaptive_Record and IA32_FIXED_CTR_CTRL. FCx_Adaptive_Record) are also supported. Adaptive PEBS provides software the capability to configure the PEBS records to capture only the data of interest, keeping the record size compact. An overflow of PMCx results in generation of an adaptive PEBS record with state information based on the selections specified in MSR_PEBS_DATA_CFG.By default, the record only contain the Basic group. When guest adaptive PEBS is enabled, the IA32_PEBS_ENABLE MSR will be added to the perf_guest_switch_msr() and switched during the VMX transitions just like CORE_PERF_GLOBAL_CTRL MSR. According to Intel SDM, software is recommended to PEBS Baseline when the following is true. IA32_PERF_CAPABILITIES.PEBS_BASELINE[14] && IA32_PERF_CAPABILITIES.PEBS_FMT[11:8] ≥ 4. Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Message-Id: <20220411101946.20262-12-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/events/intel/core.c | 8 ++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/vmx/pmu_intel.c | 20 +++++++++++++++++++- arch/x86/kvm/x86.c | 2 +- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5a8c2767035a8..99e4b43a5a7b3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4033,6 +4033,14 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) .guest = kvm_pmu->ds_area, }; + if (x86_pmu.intel_cap.pebs_baseline) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_PEBS_DATA_CFG, + .host = cpuc->pebs_data_cfg, + .guest = kvm_pmu->pebs_data_cfg, + }; + } + pebs_enable = (*nr)++; arr[pebs_enable] = (struct perf_guest_switch_msr){ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fa73f0c3fcf4f..1fb0207b0bd49 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -522,6 +522,8 @@ struct kvm_pmu { u64 ds_area; u64 pebs_enable; u64 pebs_enable_mask; + u64 pebs_data_cfg; + u64 pebs_data_cfg_mask; /* * The gate to release perf_events not marked in diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 05322cfa10e0d..5072a34e8e2ad 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -203,6 +203,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 perf_capabilities = vcpu->arch.perf_capabilities; int ret; switch (msr) { @@ -213,11 +214,15 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) return intel_pmu_has_perf_global_ctrl(pmu); break; case MSR_IA32_PEBS_ENABLE: - ret = vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT; + ret = perf_capabilities & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); break; + case MSR_PEBS_DATA_CFG: + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -371,6 +376,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_DS_AREA: msr_info->data = pmu->ds_area; return 0; + case MSR_PEBS_DATA_CFG: + msr_info->data = pmu->pebs_data_cfg; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -444,6 +452,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; pmu->ds_area = data; return 0; + case MSR_PEBS_DATA_CFG: + if (pmu->pebs_data_cfg == data) + return 0; + if (!(data & pmu->pebs_data_cfg_mask)) { + pmu->pebs_data_cfg = data; + return 0; + } + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -515,6 +531,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask = ~0ull; pmu->fixed_ctr_ctrl_mask = ~0ull; pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry || !enable_pmu) @@ -594,6 +611,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->fixed_ctr_ctrl_mask &= ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); } + pmu->pebs_data_cfg_mask = ~0xff00000full; } else { pmu->pebs_enable_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5993d8a92f0fb..11c32dc9f44e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1386,7 +1386,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, From 5ea4f6a5e4ce2eb021654bcad4849ac732a0e46a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:42 +0800 Subject: [PATCH 044/120] KVM: x86/pmu: Move pmc_speculative_in_use() to arch/x86/kvm/pmu.h commit 63f21f326fc9e068d04c2c1d0a722e8db65588ba upstream. It allows this inline function to be reused by more callers in more files, such as pmu_intel.c. Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-14-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 11 ----------- arch/x86/kvm/pmu.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 56a8d65fd448a..4ee04e0b5239c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -498,17 +498,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (pmc_is_fixed(pmc)) - return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; - - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; -} - /* Release perf_events for vPMCs that have been unused for a full time slice. */ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 34496b0a20097..bda8cc7e6544b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -144,6 +144,17 @@ static inline void pmc_update_sample_period(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter)); } +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); From 4c224c15399a37d22f858bd380ff7ef1c484313b Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:43 +0800 Subject: [PATCH 045/120] KVM: x86/pmu: Disable guest PEBS temporarily in two rare situations commit 854250329c02c0616a42532d65e81365272326d1 upstream. The guest PEBS will be disabled when some users try to perf KVM and its user-space through the same PEBS facility OR when the host perf doesn't schedule the guest PEBS counter in a one-to-one mapping manner (neither of these are typical scenarios). The PEBS records in the guest DS buffer are still accurate and the above two restrictions will be checked before each vm-entry only if guest PEBS is deemed to be enabled. Suggested-by: Wei Wang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-15-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/events/intel/core.c | 11 +++++++++-- arch/x86/include/asm/kvm_host.h | 9 +++++++++ arch/x86/kvm/vmx/pmu_intel.c | 20 ++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 4 ++++ arch/x86/kvm/vmx/vmx.h | 1 + 5 files changed, 43 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 99e4b43a5a7b3..cf0d79d509ec9 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4049,8 +4049,15 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, }; - /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ - arr[0].guest |= arr[*nr].guest; + if (arr[pebs_enable].host) { + /* Disable guest PEBS if host PEBS is enabled. */ + arr[pebs_enable].guest = 0; + } else { + /* Disable guest PEBS for cross-mapped PEBS counters. */ + arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ + arr[global_ctrl].guest |= arr[pebs_enable].guest; + } return arr; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1fb0207b0bd49..f156be53b90bb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -525,6 +525,15 @@ struct kvm_pmu { u64 pebs_data_cfg; u64 pebs_data_cfg_mask; + /* + * If a guest counter is cross-mapped to host counter with different + * index, its PEBS capability will be temporarily disabled. + * + * The user should make sure that this mask is updated + * after disabling interrupts and before perf_guest_get_msrs(); + */ + u64 host_cross_mapped_mask; + /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5072a34e8e2ad..614a6de1de6f1 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -783,6 +783,26 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) +{ + struct kvm_pmc *pmc = NULL; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + + if (!pmc || !pmc_speculative_in_use(pmc) || + !intel_pmc_is_enabled(pmc)) + continue; + + if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) { + pmu->host_cross_mapped_mask |= + BIT_ULL(pmc->perf_event->hw.idx); + } + } +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { .pmc_perf_hw_id = intel_pmc_perf_hw_id, .pmc_is_enabled = intel_pmc_is_enabled, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 30d447e5ecc7d..0e0534b5b5dec 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6843,6 +6843,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) struct perf_guest_switch_msr *msrs; struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c80627cd3f0bd..f22dd34e4f2cb 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -107,6 +107,7 @@ static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); From 85058c30845d000a1ce021954a0496580d8c4a4b Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:44 +0800 Subject: [PATCH 046/120] KVM: x86/pmu: Add kvm_pmu_cap to optimize perf_get_x86_pmu_capability commit 968635abd5f5986f3cb6f15602d365cf1b551c5d upstream. The information obtained from the interface perf_get_x86_pmu_capability() doesn't change, so an exported "struct x86_pmu_capability" is introduced for all guests in the KVM, and it's initialized before hardware_setup(). Signed-off-by: Like Xu Message-Id: <20220411101946.20262-16-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 27 ++++++++------------------- arch/x86/kvm/pmu.c | 3 +++ arch/x86/kvm/pmu.h | 19 +++++++++++++++++++ arch/x86/kvm/vmx/pmu_intel.c | 18 ++++++++---------- arch/x86/kvm/x86.c | 9 ++++----- 5 files changed, 42 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 65a547df39a42..dd2df30cba652 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -823,7 +823,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - struct x86_pmu_capability cap; union cpuid10_eax eax; union cpuid10_edx edx; @@ -832,30 +831,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } - perf_get_x86_pmu_capability(&cap); + eax.split.version_id = kvm_pmu_cap.version; + eax.split.num_counters = kvm_pmu_cap.num_counters_gp; + eax.split.bit_width = kvm_pmu_cap.bit_width_gp; + eax.split.mask_length = kvm_pmu_cap.events_mask_len; + edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed; + edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed; - /* - * The guest architecture pmu is only supported if the architecture - * pmu exists on the host and the module parameters allow it. - */ - if (!cap.version || !enable_pmu) - memset(&cap, 0, sizeof(cap)); - - eax.split.version_id = min(cap.version, 2); - eax.split.num_counters = cap.num_counters_gp; - eax.split.bit_width = cap.bit_width_gp; - eax.split.mask_length = cap.events_mask_len; - - edx.split.num_counters_fixed = - min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED); - edx.split.bit_width_fixed = cap.bit_width_fixed; - if (cap.version) + if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; entry->eax = eax.full; - entry->ebx = cap.events_mask; + entry->ebx = kvm_pmu_cap.events_mask; entry->ecx = 0; entry->edx = edx.full; break; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 4ee04e0b5239c..f77e6c0cda263 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -24,6 +24,9 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +struct x86_pmu_capability __read_mostly kvm_pmu_cap; +EXPORT_SYMBOL_GPL(kvm_pmu_cap); + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index bda8cc7e6544b..b9c55289ff651 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -155,6 +155,24 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; } +extern struct x86_pmu_capability kvm_pmu_cap; + +static inline void kvm_init_pmu_capability(void) +{ + perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * Only support guest architectural pmu on + * a host with architectural pmu. + */ + if (!kvm_pmu_cap.version) + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_PMC_MAX_FIXED); +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); @@ -173,6 +191,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); +void kvm_init_pmu_capability(void); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 614a6de1de6f1..5d3d0eb6d3b88 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -513,8 +513,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - - struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; @@ -543,13 +541,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; - perf_get_x86_pmu_capability(&x86_pmu); - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - x86_pmu.num_counters_gp); - eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); + kvm_pmu_cap.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, + kvm_pmu_cap.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); + eax.split.mask_length = min_t(int, eax.split.mask_length, + kvm_pmu_cap.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -559,9 +557,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = min3(ARRAY_SIZE(fixed_pmc_events), (size_t) edx.split.num_counters_fixed, - (size_t) x86_pmu.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, - edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); + (size_t)kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; setup_fixed_pmc_eventsel(pmu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 11c32dc9f44e5..933a395e95836 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6690,15 +6690,12 @@ long kvm_arch_vm_ioctl(struct file *filp, static void kvm_init_msr_list(void) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, "Please update the fixed PMCs in msrs_to_saved_all[]"); - perf_get_x86_pmu_capability(&x86_pmu); - num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; @@ -6750,12 +6747,12 @@ static void kvm_init_msr_list(void) break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 7: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 7: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_IA32_XFD: @@ -11622,6 +11619,8 @@ int kvm_arch_hardware_setup(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_pmu_capability(); + r = ops->hardware_setup(); if (r != 0) return r; From ba68602cb8bfc9d88d3ddd6d57fc292aaba545c9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 20 May 2022 08:00:05 -0400 Subject: [PATCH 047/120] KVM: x86/pmu: remove useless prototype commit ec4036edf924f741bc717d9afa25053cf63fa218 upstream. Signed-off-by: Paolo Bonzini Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index b9c55289ff651..ecc219a677fd5 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -191,7 +191,6 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); -void kvm_init_pmu_capability(void); bool is_vmware_backdoor_pmc(u32 pmc_idx); From cc0a5bb1ccb509628f5fe12df7e7fbcf76570dec Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 10 May 2022 12:44:07 +0800 Subject: [PATCH 048/120] KVM: x86/pmu: Don't overwrite the pmu->global_ctrl when refreshing commit c49467a45fe013ad7a892bf1479b1438315058f3 upstream. Assigning a value to pmu->global_ctrl just to set the value of pmu->global_ctrl_mask is more readable but does not conform to the specification. The value is reset to zero on Power up and Reset but stays unchanged on INIT, like most other MSRs. Signed-off-by: Like Xu Message-Id: <20220510044407.26445-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5d3d0eb6d3b88..08e3960b5667e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -516,6 +516,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 counter_mask; int i; pmu->nr_arch_gp_counters = 0; @@ -567,9 +568,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) for (i = 0; i < pmu->nr_arch_fixed_counters; i++) pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); - pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; + counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + pmu->global_ctrl_mask = counter_mask; pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); @@ -603,7 +604,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_BASELINE) { - pmu->pebs_enable_mask = ~pmu->global_ctrl; + pmu->pebs_enable_mask = counter_mask; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { pmu->fixed_ctr_ctrl_mask &= From ae72d73fd7b68d1f947a22ae1dc7a5dea5a81c2c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 25 May 2022 04:39:22 -0400 Subject: [PATCH 049/120] KVM: x86: always allow host-initiated writes to PMU MSRs commit d1c88a4020567ba4da52f778bcd9619d87e4ea75 upstream. Whenever an MSR is part of KVM_GET_MSR_INDEX_LIST, it has to be always retrievable and settable with KVM_GET_MSR and KVM_SET_MSR. Accept the PMU MSRs unconditionally in intel_is_valid_msr, if the access was host-initiated. [Backport changes] The line/ret-value 'ret = pmu->version > 1' in 'arch/x86/kvm/vmx/pmu_intel.c: intel_is_valid_msr()' has already been deleted as part of upstream. commit b663f0b Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 4 ++-- arch/x86/kvm/pmu.h | 4 ++-- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 26 +++++++++++++++++--------- arch/x86/kvm/x86.c | 10 +++++----- 5 files changed, 27 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f77e6c0cda263..8dcaae7b780aa 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -446,10 +446,10 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) } } -bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) { return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || - static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); + static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr, host_initiated); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ecc219a677fd5..60af7d5b42713 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -29,7 +29,7 @@ struct kvm_pmu_ops { unsigned int idx, u64 *mask); struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); - bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); + bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void (*refresh)(struct kvm_vcpu *vcpu); @@ -181,7 +181,7 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); -bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 06bb327fa7089..0aca8f29c1039 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -230,7 +230,7 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[idx]; } -static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) { /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ return false; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 08e3960b5667e..797e962dc41b1 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -200,11 +200,10 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) return ret; } -static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); u64 perf_capabilities = vcpu->arch.perf_capabilities; - int ret; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: @@ -212,26 +211,35 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_CTRL: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: return intel_pmu_has_perf_global_ctrl(pmu); + if (host_initiated) + return true; + return pmu->version > 1; break; case MSR_IA32_PEBS_ENABLE: - ret = perf_capabilities & PERF_CAP_PEBS_FORMAT; + if (host_initiated) + return true; + return perf_capabilities & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: - ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + if (host_initiated) + return true; + return guest_cpuid_has(vcpu, X86_FEATURE_DS); break; case MSR_PEBS_DATA_CFG: - ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + if (host_initiated) + return true; + return (perf_capabilities & PERF_CAP_PEBS_BASELINE) && ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; default: - ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || + if (host_initiated) + return true; + return get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) || intel_pmu_is_valid_lbr_msr(vcpu, msr); break; } - - return ret; } static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) @@ -592,7 +600,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); nested_vmx_pmu_refresh(vcpu, - intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); + intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, false)); if (intel_pmu_lbr_is_compatible(vcpu)) x86_perf_get_lbr(&lbr_desc->records); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 933a395e95836..319e4a1ba744c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3672,7 +3672,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) @@ -3757,7 +3757,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) return kvm_pmu_set_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } @@ -3838,7 +3838,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) return kvm_pmu_get_msr(vcpu, msr_info); if (!msr_info->host_initiated) return 1; @@ -3848,7 +3848,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; @@ -4094,7 +4094,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) return kvm_pmu_get_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } From c53fe82eb693028ec86c312c794c552b2ccad50b Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:03 +0800 Subject: [PATCH 050/120] KVM: x86/pmu: Extract check_pmu_event_filter() handling both GP and fixed counters commit 89cb454ea984d0411523dc10e70e9bf0aca1b527 upstream. Checking the kvm->arch.pmu_event_filter policy in both gp and fixed code paths was somewhat redundant, so common parts can be extracted, which reduces code footprint and improves readability. Signed-off-by: Like Xu Reviewed-by: Wanpeng Li Message-Id: <20220518132512.37864-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 63 +++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 8dcaae7b780aa..e16ae2a2c5f49 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -235,14 +235,44 @@ static int cmp_u64(const void *pa, const void *pb) return (a > b) - (a < b); } +static bool check_pmu_event_filter(struct kvm_pmc *pmc) +{ + struct kvm_pmu_event_filter *filter; + struct kvm *kvm = pmc->vcpu->kvm; + bool allow_event = true; + __u64 key; + int idx; + + filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); + if (!filter) + goto out; + + if (pmc_is_gp(pmc)) { + key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; + if (bsearch(&key, filter->events, filter->nevents, + sizeof(__u64), cmp_u64)) + allow_event = filter->action == KVM_PMU_EVENT_ALLOW; + else + allow_event = filter->action == KVM_PMU_EVENT_DENY; + } else { + idx = pmc->idx - INTEL_PMC_IDX_FIXED; + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; + } + +out: + return allow_event; +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { u64 config; u32 type = PERF_TYPE_RAW; - struct kvm *kvm = pmc->vcpu->kvm; - struct kvm_pmu_event_filter *filter; - struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); - bool allow_event = true; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -254,17 +284,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; - - if (bsearch(&key, filter->events, filter->nevents, - sizeof(__u64), cmp_u64)) - allow_event = filter->action == KVM_PMU_EVENT_ALLOW; - else - allow_event = filter->action == KVM_PMU_EVENT_DENY; - } - if (!allow_event) + if (!check_pmu_event_filter(pmc)) return; if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | @@ -297,23 +317,14 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) { unsigned en_field = ctrl & 0x3; bool pmi = ctrl & 0x8; - struct kvm_pmu_event_filter *filter; - struct kvm *kvm = pmc->vcpu->kvm; pmc_pause_counter(pmc); if (!en_field || !pmc_is_enabled(pmc)) return; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - } + if (!check_pmu_event_filter(pmc)) + return; if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) return; From 887728cf89de40156692233227ed5186f84fb219 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:05 +0800 Subject: [PATCH 051/120] KVM: x86/pmu: Pass only "struct kvm_pmc *pmc" to reprogram_counter() commit a40239b4cf33b2de872b04759c3e5ab87cc72a7f upstream. Passing the reference "struct kvm_pmc *pmc" when creating pmc->perf_event is sufficient. This change helps to simplify the calling convention by replacing reprogram_{gp, fixed}_counter() with reprogram_counter() seamlessly. No functional change intended. Signed-off-by: Like Xu Message-Id: <20220518132512.37864-5-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 17 +++++------------ arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 32 ++++++++++++++++++-------------- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e16ae2a2c5f49..735903c9dd98b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -340,18 +340,13 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) } EXPORT_SYMBOL_GPL(reprogram_fixed_counter); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) +void reprogram_counter(struct kvm_pmc *pmc) { - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx); - - if (!pmc) - return; - if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int idx = pmc_idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); + int idx = pmc->idx - INTEL_PMC_IDX_FIXED; + u8 ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, idx); reprogram_fixed_counter(pmc, ctrl, idx); } @@ -370,8 +365,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) clear_bit(bit, pmu->reprogram_pmi); continue; } - - reprogram_counter(pmu, bit); + reprogram_counter(pmc); } /* @@ -544,13 +538,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 prev_count; prev_count = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - reprogram_counter(pmu, pmc->idx); + reprogram_counter(pmc); if (pmc->counter < prev_count) __kvm_perf_overflow(pmc, false); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 60af7d5b42713..37084d37604d5 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -175,7 +175,7 @@ static inline void kvm_init_pmu_capability(void) void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); +void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 797e962dc41b1..682951ef27728 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -56,16 +56,32 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmu->fixed_ctr_ctrl = data; } +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) { + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + } else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } +} + /* function is called when global control register has been updated. */ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) { int bit; u64 diff = pmu->global_ctrl ^ data; + struct kvm_pmc *pmc; pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_counter(pmu, bit); + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + if (pmc) + reprogram_counter(pmc); + } } static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) @@ -98,18 +114,6 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; - - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } -} - /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { From 77e6533db73dbe4ca45fe484e15b63de46512ffb Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:06 +0800 Subject: [PATCH 052/120] KVM: x86/pmu: Drop "u64 eventsel" for reprogram_gp_counter() commit fb121aaf19cd5047a01599debbb85a2c15275727 upstream. Because inside reprogram_gp_counter() it is bound to assign the requested eventel to pmc->eventsel, this assignment step can be moved forward, thus simplifying the passing of parameters to "struct kvm_pmc *pmc" only. No functional change intended. Signed-off-by: Like Xu Message-Id: <20220518132512.37864-6-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 7 +++---- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/svm/pmu.c | 6 ++++-- arch/x86/kvm/vmx/pmu_intel.c | 3 ++- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 735903c9dd98b..bc65c61d1bf07 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -268,17 +268,16 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return allow_event; } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +void reprogram_gp_counter(struct kvm_pmc *pmc) { u64 config; u32 type = PERF_TYPE_RAW; struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 eventsel = pmc->eventsel; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); - pmc->eventsel = eventsel; - pmc_pause_counter(pmc); if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) @@ -343,7 +342,7 @@ EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmc *pmc) { if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc, pmc->eventsel); + reprogram_gp_counter(pmc); else { int idx = pmc->idx - INTEL_PMC_IDX_FIXED; u8 ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, idx); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 37084d37604d5..ce8fed5f569d6 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -173,7 +173,7 @@ static inline void kvm_init_pmu_capability(void) KVM_PMC_MAX_FIXED); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); +void reprogram_gp_counter(struct kvm_pmc *pmc); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); void reprogram_counter(struct kvm_pmc *pmc); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 0aca8f29c1039..89c9503fa3c3c 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -287,8 +287,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { data &= ~pmu->reserved_bits; - if (data != pmc->eventsel) - reprogram_gp_counter(pmc, data); + if (data != pmc->eventsel) { + pmc->eventsel = data; + reprogram_gp_counter(pmc); + } return 0; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 682951ef27728..cdf8c42ed8fae 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -496,7 +496,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { - reprogram_gp_counter(pmc, data); + pmc->eventsel = data; + reprogram_gp_counter(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) From ca71f8cc27e1b3c849adfde6f422c1be1df64c8f Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:07 +0800 Subject: [PATCH 053/120] KVM: x86/pmu: Drop "u8 ctrl, int idx" for reprogram_fixed_counter() commit 76d287b2342e1906e399fd19d6500013aa074a50 upstream. Since afrer reprogram_fixed_counter() is called, it's bound to assign the requested fixed_ctr_ctrl to pmu->fixed_ctr_ctrl, this assignment step can be moved forward (the stale value for diff is saved extra early), thus simplifying the passing of parameters. No functional change intended. Signed-off-by: Like Xu Message-Id: <20220518132512.37864-7-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 13 ++++++------- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 14 +++++++------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index bc65c61d1bf07..592e0f53b2187 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -312,8 +312,11 @@ void reprogram_gp_counter(struct kvm_pmc *pmc) } EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) +void reprogram_fixed_counter(struct kvm_pmc *pmc) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + int idx = pmc->idx - INTEL_PMC_IDX_FIXED; + u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); unsigned en_field = ctrl & 0x3; bool pmi = ctrl & 0x8; @@ -343,12 +346,8 @@ void reprogram_counter(struct kvm_pmc *pmc) { if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc); - else { - int idx = pmc->idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, idx); - - reprogram_fixed_counter(pmc, ctrl, idx); - } + else + reprogram_fixed_counter(pmc); } EXPORT_SYMBOL_GPL(reprogram_counter); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ce8fed5f569d6..b7873894acfc8 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -174,7 +174,7 @@ static inline void kvm_init_pmu_capability(void) } void reprogram_gp_counter(struct kvm_pmc *pmc); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); +void reprogram_fixed_counter(struct kvm_pmc *pmc); void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index cdf8c42ed8fae..08f68d15634ce 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -37,23 +37,23 @@ static int fixed_pmc_events[] = {1, 0, 7}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { + struct kvm_pmc *pmc; + u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; + pmu->fixed_ctr_ctrl = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); - u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); - struct kvm_pmc *pmc; - - pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); if (old_ctrl == new_ctrl) continue; + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_fixed_counter(pmc, new_ctrl, i); + reprogram_fixed_counter(pmc); } - - pmu->fixed_ctr_ctrl = data; } static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) From f19cfed7c7490bc2c04215ca7f69a8f017d04b4c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 25 May 2022 05:28:56 -0400 Subject: [PATCH 054/120] KVM: x86/pmu: Use only the uniform interface reprogram_counter() commit e99fae6edebcdf53658f531ee3c913ca74536355 upstream. Since reprogram_counter(), reprogram_{gp, fixed}_counter() currently have the same incoming parameter "struct kvm_pmc *pmc", the callers can simplify the conetxt by using uniformly exported interface, which makes reprogram_ {gp, fixed}_counter() static and eliminates EXPORT_SYMBOL_GPL. Signed-off-by: Like Xu Message-Id: <20220518132512.37864-8-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 6 ++---- arch/x86/kvm/pmu.h | 2 -- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 592e0f53b2187..ca5664dfffd83 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -268,7 +268,7 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return allow_event; } -void reprogram_gp_counter(struct kvm_pmc *pmc) +static void reprogram_gp_counter(struct kvm_pmc *pmc) { u64 config; u32 type = PERF_TYPE_RAW; @@ -310,9 +310,8 @@ void reprogram_gp_counter(struct kvm_pmc *pmc) !(eventsel & ARCH_PERFMON_EVENTSEL_OS), eventsel & ARCH_PERFMON_EVENTSEL_INT); } -EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc) +static void reprogram_fixed_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); int idx = pmc->idx - INTEL_PMC_IDX_FIXED; @@ -340,7 +339,6 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc) !(en_field & 0x1), /* exclude kernel */ pmi); } -EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmc *pmc) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index b7873894acfc8..f9128b2180e4c 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -173,8 +173,6 @@ static inline void kvm_init_pmu_capability(void) KVM_PMC_MAX_FIXED); } -void reprogram_gp_counter(struct kvm_pmc *pmc); -void reprogram_fixed_counter(struct kvm_pmc *pmc); void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 89c9503fa3c3c..9a9add3a995f6 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -289,7 +289,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - reprogram_gp_counter(pmc); + reprogram_counter(pmc); } return 0; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 08f68d15634ce..45a4dbdf2ec85 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_fixed_counter(pmc); + reprogram_counter(pmc); } } @@ -497,7 +497,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { pmc->eventsel = data; - reprogram_gp_counter(pmc); + reprogram_counter(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) From 3852adca1c9abb374cab15d7032580bf283b06ee Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 18 May 2022 21:25:09 +0800 Subject: [PATCH 055/120] KVM: x86/pmu: Use PERF_TYPE_RAW to merge reprogram_{gp,fixed}counter() commit 02791a5c362b7d71447efb1d0131d8368bb821f2 upstream. The code sketch for reprogram_{gp, fixed}_counter() is similar, while the fixed counter using the PERF_TYPE_HARDWAR type and the gp being able to use either PERF_TYPE_HARDWAR or PERF_TYPE_RAW type depending on the pmc->eventsel value. After 'commit 761875634a5e ("KVM: x86/pmu: Setup pmc->eventsel for fixed PMCs")', the pmc->eventsel of the fixed counter will also have been setup with the same semantic value and will not be changed during the guest runtime. The original story of using the PERF_TYPE_HARDWARE type is to emulate guest architecture PMU on a host without architecture PMU (the Pentium 4), for which the guest vPMC needs to be reprogrammed using the kernel generic perf_hw_id. But essentially, "the HARDWARE is just a convenience wrapper over RAW IIRC", quoated from Peterz. So it could be pretty safe to use the PERF_TYPE_RAW type only in practice to program both gp and fixed counters naturally in the reprogram_counter(). To make the gp and fixed counters more semantically symmetrical, the selection of EVENTSEL_{USER, OS, INT} bits is temporarily translated via fixed_ctr_ctrl before the pmc_reprogram_counter() call. Cc: Peter Zijlstra Suggested-by: Jim Mattson Signed-off-by: Like Xu Message-Id: <20220518132512.37864-9-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 79 ++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 58 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index ca5664dfffd83..b623b2e8bb25a 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -268,85 +268,48 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return allow_event; } -static void reprogram_gp_counter(struct kvm_pmc *pmc) +void reprogram_counter(struct kvm_pmc *pmc) { - u64 config; - u32 type = PERF_TYPE_RAW; struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; - - if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) - printk_once("kvm pmu: pin control bit is ignored\n"); + u64 new_config = eventsel; + u8 fixed_ctr_ctrl; pmc_pause_counter(pmc); - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) + if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) return; if (!check_pmu_event_filter(pmc)) return; - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - if (config != PERF_COUNT_HW_MAX) - type = PERF_TYPE_HARDWARE; - } + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); - if (type == PERF_TYPE_RAW) - config = eventsel & pmu->raw_event_mask; + if (pmc_is_fixed(pmc)) { + fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); + if (fixed_ctr_ctrl & 0x1) + eventsel |= ARCH_PERFMON_EVENTSEL_OS; + if (fixed_ctr_ctrl & 0x2) + eventsel |= ARCH_PERFMON_EVENTSEL_USR; + if (fixed_ctr_ctrl & 0x8) + eventsel |= ARCH_PERFMON_EVENTSEL_INT; + new_config = (u64)fixed_ctr_ctrl; + } - if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) + if (pmc->current_config == new_config && pmc_resume_counter(pmc)) return; pmc_release_perf_event(pmc); - pmc->current_config = eventsel; - pmc_reprogram_counter(pmc, type, config, + pmc->current_config = new_config; + pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), eventsel & ARCH_PERFMON_EVENTSEL_INT); } - -static void reprogram_fixed_counter(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - int idx = pmc->idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); - unsigned en_field = ctrl & 0x3; - bool pmi = ctrl & 0x8; - - pmc_pause_counter(pmc); - - if (!en_field || !pmc_is_enabled(pmc)) - return; - - if (!check_pmu_event_filter(pmc)) - return; - - if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) - return; - - pmc_release_perf_event(pmc); - - pmc->current_config = (u64)ctrl; - pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc), - !(en_field & 0x2), /* exclude user */ - !(en_field & 0x1), /* exclude kernel */ - pmi); -} - -void reprogram_counter(struct kvm_pmc *pmc) -{ - if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc); - else - reprogram_fixed_counter(pmc); -} EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) From 4e304d88e40a5e19ae707e00026444792fd1600b Mon Sep 17 00:00:00 2001 From: Like Xu Date: Thu, 19 May 2022 01:01:18 +0800 Subject: [PATCH 056/120] KVM: x86/pmu: Update global enable_pmu when PMU is undetected commit d7808f739162c003d249168bfe4c571aba18fb8a upstream. On some virt platforms (L1 guest w/o PMU), the value of module parameter 'enable_pmu' for nested L2 guests should be updated at initialisation. Considering that there is no concept of "architecture pmu" in AMD or Hygon and that the versions (prior to Zen 4) are all 0, but that the theoretical available counters are at least AMD64_NUM_COUNTERS, the utility check_hw_exists() is reused in the initialisation call path. Opportunistically update Intel specific comments. Fixes: 8eeac7e999e8 ("KVM: x86/pmu: Add kvm_pmu_cap to optimize perf_get_x86_pmu_capability") Signed-off-by: Like Xu Message-Id: <20220518170118.66263-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index f9128b2180e4c..45a4ebe9c921b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -159,14 +159,19 @@ extern struct x86_pmu_capability kvm_pmu_cap; static inline void kvm_init_pmu_capability(void) { + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + perf_get_x86_pmu_capability(&kvm_pmu_cap); - /* - * Only support guest architectural pmu on - * a host with architectural pmu. - */ - if (!kvm_pmu_cap.version) + /* + * For Intel, only support guest architectural pmu + * on a host with architectural pmu. + */ + if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) { memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + enable_pmu = false; + return; + } kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, From bee5ae154764560e56b5ed3cc524c91d2afc144e Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 1 Jun 2022 11:19:24 +0800 Subject: [PATCH 057/120] KVM: x86/pmu: Restrict advanced features based on module enable_pmu commit 6ef25aa0a961298278301ae1d88106c701eb73fa upstream. Once vPMU is disabled, the KVM would not expose features like: PEBS (via clear kvm_pmu_cap.pebs_ept), legacy LBR and ARCH_LBR, CPUID 0xA leaf, PDCM bit and MSR_IA32_PERF_CAPABILITIES, plus PT_MODE_HOST_GUEST mode. What this group of features has in common is that their use relies on the underlying PMU counter and the host perf_event as a back-end resource requester or sharing part of the irq delivery path. Signed-off-by: Like Xu Message-Id: <20220601031925.59693-2-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.h | 6 ++++-- arch/x86/kvm/vmx/capabilities.h | 4 ++++ arch/x86/kvm/vmx/vmx.c | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 45a4ebe9c921b..c9f1b71b15234 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -167,9 +167,11 @@ static inline void kvm_init_pmu_capability(void) * For Intel, only support guest architectural pmu * on a host with architectural pmu. */ - if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) { - memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) enable_pmu = false; + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); return; } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index dc44c58be2dae..14a9eddc499d6 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -6,6 +6,7 @@ #include "lapic.h" #include "x86.h" +#include "cpuid.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -397,6 +398,9 @@ static inline u64 vmx_get_perf_capabilities(void) if (!enable_pmu) return perf_cap; + if (!enable_pmu) + return 0; + if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0e0534b5b5dec..af0482c59ca86 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7605,6 +7605,9 @@ static __init void vmx_set_cpu_caps(void) if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); + if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); @@ -8248,7 +8251,7 @@ static __init int hardware_setup(void) if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; - if (!enable_ept || !cpu_has_vmx_intel_pt()) + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; setup_default_sgx_lepubkeyhash(); From 289fdcef4a18607eea2780a6195c6435195ee67e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:52 +0000 Subject: [PATCH 058/120] Revert "KVM: x86: always allow host-initiated writes to PMU MSRs" commit 545feb96c052809dab5ec04b95f976acca9f9364 upstream. Revert the hack to allow host-initiated accesses to all "PMU" MSRs, as intel_is_valid_msr() returns true for _all_ MSRs, regardless of whether or not it has a snowball's chance in hell of actually being a PMU MSR. That mostly gets papered over by the actual get/set helpers only handling MSRs that they knows about, except there's the minor detail that kvm_pmu_{g,s}et_msr() eat reads and writes when the PMU is disabled. I.e. KVM will happy allow reads and writes to _any_ MSR if the PMU is disabled, either via module param or capability. This reverts commit d1c88a4020567ba4da52f778bcd9619d87e4ea75. Fixes: d1c88a402056 ("KVM: x86: always allow host-initiated writes to PMU MSRs") Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 4 ++-- arch/x86/kvm/pmu.h | 4 ++-- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 27 ++++++++++----------------- arch/x86/kvm/x86.c | 10 +++++----- 5 files changed, 20 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b623b2e8bb25a..e833729cfee50 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -410,10 +410,10 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) } } -bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || - static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr, host_initiated); + static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index c9f1b71b15234..27c3846194475 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -29,7 +29,7 @@ struct kvm_pmu_ops { unsigned int idx, u64 *mask); struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); - bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated); + bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void (*refresh)(struct kvm_vcpu *vcpu); @@ -186,7 +186,7 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); -bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated); +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 9a9add3a995f6..dc6e264d9e8f5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -230,7 +230,7 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[idx]; } -static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ return false; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 45a4dbdf2ec85..8e1d0655753bb 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -204,10 +204,11 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) return ret; } -static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated) +static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); u64 perf_capabilities = vcpu->arch.perf_capabilities; + int ret; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: @@ -215,35 +216,27 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiat case MSR_CORE_PERF_GLOBAL_CTRL: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: return intel_pmu_has_perf_global_ctrl(pmu); - if (host_initiated) - return true; - return pmu->version > 1; + ret = pmu->version > 1; break; case MSR_IA32_PEBS_ENABLE: - if (host_initiated) - return true; - return perf_capabilities & PERF_CAP_PEBS_FORMAT; + ret = perf_capabilities & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: - if (host_initiated) - return true; - return guest_cpuid_has(vcpu, X86_FEATURE_DS); + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); break; case MSR_PEBS_DATA_CFG: - if (host_initiated) - return true; - return (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; default: - if (host_initiated) - return true; - return get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) || intel_pmu_is_valid_lbr_msr(vcpu, msr); break; } + + return ret; } static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) @@ -605,7 +598,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); nested_vmx_pmu_refresh(vcpu, - intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, false)); + intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); if (intel_pmu_lbr_is_compatible(vcpu)) x86_perf_get_lbr(&lbr_desc->records); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 319e4a1ba744c..933a395e95836 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3672,7 +3672,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) @@ -3757,7 +3757,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } @@ -3838,7 +3838,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); if (!msr_info->host_initiated) return 1; @@ -3848,7 +3848,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; @@ -4094,7 +4094,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } From 1b5ae32babf919ae01f1b312e3d35eba68c8fb2e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:53 +0000 Subject: [PATCH 059/120] KVM: VMX: Use vcpu_get_perf_capabilities() to get guest-visible value commit 3f7999b988bde6c50cb7b20d6c742d6512d1f0bd upstream. Use vcpu_get_perf_capabilities() when querying MSR_IA32_PERF_CAPABILITIES from the guest's perspective, e.g. to update the vPMU and to determine which MSRs exist. If userspace ignores MSR_IA32_PERF_CAPABILITIES but clear X86_FEATURE_PDCM, the guest should see '0'. Fixes: 902caeb6841a ("KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS") Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-6-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 8e1d0655753bb..dfea8e6e52e96 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -207,7 +207,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - u64 perf_capabilities = vcpu->arch.perf_capabilities; + u64 perf_capabilities; int ret; switch (msr) { @@ -219,12 +219,13 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) ret = pmu->version > 1; break; case MSR_IA32_PEBS_ENABLE: - ret = perf_capabilities & PERF_CAP_PEBS_FORMAT; + ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); break; case MSR_PEBS_DATA_CFG: + perf_capabilities = vcpu_get_perf_capabilities(vcpu); ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; @@ -522,6 +523,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 perf_capabilities; u64 counter_mask; int i; @@ -608,8 +610,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); - if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) { - if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_BASELINE) { + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { pmu->pebs_enable_mask = counter_mask; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { From b5ba1a0e37883620abca6ebae0851726ebb93ca5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:54 +0000 Subject: [PATCH 060/120] KVM: x86: Ignore benign host accesses to "unsupported" PEBS and BTS MSRs commit 157fc497b54fd1dfcdedbca6199adca4bf5ee6ff upstream. Ignore host userspace reads and writes of '0' to PEBS and BTS MSRs that KVM reports in the MSR-to-save list, but the MSRs are ultimately unsupported. All MSRs in said list must be writable by userspace, e.g. if userspace sends the list back at KVM without filtering out the MSRs it doesn't need. Fixes: 8183a538cd95 ("KVM: x86/pmu: Add IA32_DS_AREA MSR emulation to support guest DS") Fixes: 902caeb6841a ("KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS") Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-7-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 933a395e95836..a034b924722f7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3756,6 +3756,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + /* + * Userspace is allowed to write '0' to MSRs that KVM reports + * as to-be-saved, even if an MSRs isn't fully supported. + */ + return !msr_info->host_initiated || data; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -3837,9 +3847,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + /* + * Userspace is allowed to read MSRs that KVM reports in + * KVM_GET_MSR_INDEX_LIST, even if an MSR isn't fully supported. + */ if (!msr_info->host_initiated) return 1; msr_info->data = 0; From 07bbb2a7ba789916525e52bcfb0b9c699dba7757 Mon Sep 17 00:00:00 2001 From: David Dunn Date: Wed, 23 Feb 2022 22:57:41 +0000 Subject: [PATCH 061/120] KVM: x86: Provide per VM capability for disabling PMU virtualization commit ba7bb663f5547ef474c98df99a97bb4a13c5715f upstream. Add a new capability, KVM_CAP_PMU_CAPABILITY, that takes a bitmask of settings/features to allow userspace to configure PMU virtualization on a per-VM basis. For now, support a single flag, KVM_PMU_CAP_DISABLE, to allow disabling PMU virtualization for a VM even when KVM is configured with enable_pmu=true a module level. To keep KVM simple, disallow changing VM's PMU configuration after vCPUs have been created. [Backport changes] In the upstream. commit, a blank line was added above section "8.35 KVM_CAP_PMU_CAPABILITY". Since the upstream. patch cde363a was backported before the current patch "9. Known KVM API Problems" section was added before 8.35. Therefore, placed 8.35 above section 9 to maintain the correct order similar to linux upstream., which led to blank line appearing below section 8.35 instead of above, but this glitch does not effect any core functionality of patch. Signed-off-by: David Dunn Message-Id: <20220223225743.2703915-2-daviddunn@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- Documentation/virt/kvm/api.rst | 22 ++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 2 +- arch/x86/kvm/x86.c | 18 ++++++++++++++++++ include/uapi/linux/kvm.h | 3 +++ tools/include/uapi/linux/kvm.h | 4 ++++ 7 files changed, 50 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index cdd146ce94979..ce4ae0b2689a7 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7424,6 +7424,28 @@ of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace the hypercalls whose corresponding bit is in the argument, and return ENOSYS for the others. +8.35 KVM_CAP_PMU_CAPABILITY +--------------------------- + +:Capability KVM_CAP_PMU_CAPABILITY +:Architectures: x86 +:Type: vm +:Parameters: arg[0] is bitmask of PMU virtualization capabilities. +:Returns 0 on success, -EINVAL when arg[0] contains invalid bits + +This capability alters PMU virtualization in KVM. + +Calling KVM_CHECK_EXTENSION for this capability returns a bitmask of +PMU virtualization capabilities that can be adjusted on a VM. + +The argument to KVM_ENABLE_CAP is also a bitmask and selects specific +PMU virtualization capabilities to be applied to the VM. This can +only be invoked on a VM prior to the creation of VCPUs. + +At this time, KVM_PMU_CAP_DISABLE is the only capability. Setting +this capability will disable PMU virtualization for that VM. Usermode +should adjust CPUID leaf 0xA to reflect that the PMU is disabled. + 9. Known KVM API problems ========================= diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f156be53b90bb..3b4f4031ba527 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1174,6 +1174,7 @@ struct kvm_arch { u32 notify_window; u32 notify_vmexit_flags; + bool enable_pmu; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index dc6e264d9e8f5..ab6f0f4744574 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -117,7 +117,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - if (!enable_pmu) + if (!vcpu->kvm->arch.enable_pmu) return NULL; switch (msr) { diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index dfea8e6e52e96..9fc7ccabff550 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -541,7 +541,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_data_cfg_mask = ~0ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry || !enable_pmu) + if (!entry || !vcpu->kvm->arch.enable_pmu) return; eax.full = entry->eax; edx.full = entry->edx; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a034b924722f7..009eb8a20f14b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -110,6 +110,8 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) +#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE + #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -4385,6 +4387,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; + case KVM_CAP_PMU_CAPABILITY: + r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; + break; } default: break; @@ -6130,6 +6135,18 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->lock); break; + case KVM_CAP_PMU_CAPABILITY: + r = -EINVAL; + if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK)) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -11764,6 +11781,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); kvm->arch.guest_can_read_msr_platform_info = true; + kvm->arch.enable_pmu = enable_pmu; #if IS_ENABLED(CONFIG_HYPERV) spin_lock_init(&kvm->arch.hv_root_tdp_lock); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 1b6e80914a7b5..8f924848a14f9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1122,6 +1122,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SYS_ATTRIBUTES 209 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #define KVM_CAP_X86_NOTIFY_VMEXIT 219 +#define KVM_CAP_PMU_CAPABILITY 212 #ifdef KVM_CAP_IRQ_ROUTING @@ -1947,6 +1948,8 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +#define KVM_PMU_CAP_DISABLE (1 << 0) + /** * struct kvm_stats_header - Header of per vm/vcpu binary statistics data. * @flags: Some extra information for header, always 0 for now. diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 7eafe88dcea77..2d8705dc03627 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1115,6 +1115,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_S390_MEM_OP_EXTENSION 211 +#define KVM_CAP_PMU_CAPABILITY 212 #ifdef KVM_CAP_IRQ_ROUTING @@ -1949,6 +1951,8 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +#define KVM_PMU_CAP_DISABLE (1 << 0) + /** * struct kvm_stats_header - Header of per vm/vcpu binary statistics data. * @flags: Some extra information for header, always 0 for now. From b4211a7ff486dcd0402399db9942c7037023824b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 12 Jul 2022 02:06:45 +0200 Subject: [PATCH 062/120] KVM: x86: Add dedicated helper to get CPUID entry with significant index commit 277ad7d58611b455662f2e3f7bd24ce5bfeb2fdc upstream. Add a second CPUID helper, kvm_find_cpuid_entry_index(), to handle KVM queries for CPUID leaves whose index _may_ be significant, and drop the index param from the existing kvm_find_cpuid_entry(). Add a WARN in the inner helper, cpuid_entry2_find(), to detect attempts to retrieve a CPUID entry whose index is significant without explicitly providing an index. Using an explicit magic number and letting callers omit the index avoids confusion by eliminating the myriad cases where KVM specifies '0' as a dummy value. [Backport changes] Added blank line to align with upstream. commit style. No functional change was made to the code in this section. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 81 +++++++++++++++++++++++++++--------- arch/x86/kvm/cpuid.h | 16 +++---- arch/x86/kvm/hyperv.c | 8 ++-- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 +- arch/x86/kvm/vmx/sgx.c | 8 ++-- arch/x86/kvm/vmx/vmx.c | 6 +-- arch/x86/kvm/x86.c | 2 +- 8 files changed, 85 insertions(+), 42 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dd2df30cba652..fb2c5fd449d31 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -61,8 +61,17 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() + * to avoid false positives when processing guest CPUID input. + */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull + static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( - struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; @@ -70,9 +79,31 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( for (i = 0; i < nent; i++) { e = &entries[i]; - if (e->function == function && - (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)) + if (e->function != function) + continue; + + /* + * If the index isn't significant, use the first entry with a + * matching function. It's userspace's responsibilty to not + * provide "duplicate" entries in all cases. + */ + if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index) return e; + + + /* + * Similarly, use the first matching entry if KVM is doing a + * lookup (as opposed to emulating CPUID) for a function that's + * architecturally defined as not having a significant index. + */ + if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) { + /* + * Direct lookups from KVM should not diverge from what + * KVM defines internally (the architectural behavior). + */ + WARN_ON_ONCE(cpuid_function_is_indexed(function)); + return e; + } } return NULL; @@ -89,7 +120,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = cpuid_entry2_find(entries, nent, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -121,7 +153,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) vcpu->arch.kvm_cpuid_base = 0; for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function, 0); + entry = kvm_find_cpuid_entry(vcpu, function); if (entry) { u32 signature[3]; @@ -147,7 +179,8 @@ static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *v if (!base) return NULL; - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) @@ -189,7 +222,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); - best = cpuid_entry2_find(entries, nent, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -220,7 +253,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = cpuid_entry2_find(entries, nent, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & @@ -254,7 +287,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -293,10 +326,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return best->eax & 0xff; not_found: @@ -1259,12 +1292,20 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, return r; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) { return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, function, index); } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* @@ -1301,7 +1342,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) struct kvm_cpuid_entry2 *basic, *class; u32 function = *fn_ptr; - basic = kvm_find_cpuid_entry(vcpu, 0, 0); + basic = kvm_find_cpuid_entry(vcpu, 0); if (!basic) return NULL; @@ -1310,11 +1351,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) return NULL; if (function >= 0x40000000 && function <= 0x4fffffff) - class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00); else if (function >= 0xc0000000) - class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0); + class = kvm_find_cpuid_entry(vcpu, 0xc0000000); else - class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0x80000000); if (class && function <= class->eax) return NULL; @@ -1332,7 +1373,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) * the effective CPUID entry is the max basic leaf. Note, the index of * the original requested leaf is observed! */ - return kvm_find_cpuid_entry(vcpu, basic->eax, index); + return kvm_find_cpuid_entry_index(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, @@ -1342,7 +1383,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; - entry = kvm_find_cpuid_entry(vcpu, function, index); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; if (!entry && !exact_only) { @@ -1371,7 +1412,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, * exists. EDX can be copied from any existing index. */ if (function == 0xb || function == 0x1f) { - entry = kvm_find_cpuid_entry(vcpu, function, 1); + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { *ecx = index & 0xff; *edx = entry->edx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a4c4d48f4407e..134f00aae8ce8 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -13,8 +13,10 @@ void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); + u32 function); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -76,7 +78,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; @@ -109,7 +111,7 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); @@ -119,7 +121,7 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } @@ -137,7 +139,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; @@ -148,7 +150,7 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; @@ -159,7 +161,7 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 29a96d1c7e2b8..077a7080be953 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1989,7 +1989,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; struct kvm_vcpu_hv *hv_vcpu; - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { vcpu->arch.hyperv_enabled = true; } else { @@ -2002,7 +2002,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu = to_hv_vcpu(vcpu); - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; @@ -2013,7 +2013,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.features_edx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; @@ -2022,7 +2022,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.enlightenments_ebx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; else diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 737ecbc3b2a42..d9086fd1a5296 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4051,7 +4051,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9fc7ccabff550..1d736e7578e90 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -540,7 +540,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + entry = kvm_find_cpuid_entry(vcpu, 0xa); if (!entry || !vcpu->kvm->arch.enable_pmu) return; eax.full = entry->eax; @@ -586,7 +586,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry(vcpu, 7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index b8cf9a59c145e..17b91a65a0bae 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -152,8 +152,8 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, u8 max_size_log2; int trapnr, ret; - sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); - sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; @@ -439,7 +439,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (!vcpu->kvm->arch.sgx_provisioning_allowed) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); if (!guest_cpuid) return true; @@ -447,7 +447,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!guest_cpuid) return true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index af0482c59ca86..36fb846776b61 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7412,7 +7412,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + entry = kvm_find_cpuid_entry(vcpu, 0x1); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); @@ -7428,7 +7428,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); @@ -7463,7 +7463,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { - best = kvm_find_cpuid_entry(vcpu, 0x14, i); + best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 009eb8a20f14b..2680ff4bc99a1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11464,7 +11464,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry * on RESET. But, go through the motions in case that's ever remedied. */ - cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); static_call(kvm_x86_vcpu_reset)(vcpu, init_event); From e88a189c10f50a804b8d0e77b8bf2c2600edf8c6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Jul 2022 23:34:22 +0000 Subject: [PATCH 063/120] KVM: x86: Refresh PMU after writes to MSR_IA32_PERF_CAPABILITIES commit 17a024a8b981243fa1d17cf0084de43cc5f9b603 upstream. Refresh the PMU if userspace modifies MSR_IA32_PERF_CAPABILITIES. KVM consumes the vCPU's PERF_CAPABILITIES when enumerating PEBS support, but relies on CPUID updates to refresh the PMU. I.e. KVM will do the wrong thing if userspace stuffs PERF_CAPABILITIES _after_ setting guest CPUID. Opportunistically fix a curly-brace indentation. Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Cc: Like Xu Signed-off-by: Sean Christopherson Message-Id: <20220727233424.2968356-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2680ff4bc99a1..2ff766586f308 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3483,9 +3483,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.perf_capabilities = data; - + kvm_pmu_refresh(vcpu); return 0; - } + } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: From 3f6e2675de29bd186950059355556bbdb0ada314 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 31 Aug 2022 11:35:24 +0800 Subject: [PATCH 064/120] perf/x86/core: Completely disable guest PEBS via guest's global_ctrl commit f2aeea57504cbbc58da3c59b939fc16150087648 upstream. When a guest PEBS counter is cross-mapped by a host counter, software will remove the corresponding bit in the arr[global_ctrl].guest and expect hardware to perform a change of state "from enable to disable" via the msr_slot[] switch during the vmx transaction. The real world is that if user adjust the counter overflow value small enough, it still opens a tiny race window for the previously PEBS-enabled counter to write cross-mapped PEBS records into the guest's PEBS buffer, when arr[global_ctrl].guest has been prioritised (switch_msr_special stuff) to switch into the enabled state, while the arr[pebs_enable].guest has not. Close this window by clearing invalid bits in the arr[global_ctrl].guest. Fixes: 854250329c02 ("KVM: x86/pmu: Disable guest PEBS temporarily in two rare situations") Signed-off-by: Like Xu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220831033524.58561-1-likexu@tencent.com Signed-off-by: PvsNarasimha --- arch/x86/events/intel/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cf0d79d509ec9..8e908f803e2a5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4053,8 +4053,9 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) /* Disable guest PEBS if host PEBS is enabled. */ arr[pebs_enable].guest = 0; } else { - /* Disable guest PEBS for cross-mapped PEBS counters. */ + /* Disable guest PEBS thoroughly for cross-mapped PEBS counters. */ arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + arr[global_ctrl].guest &= ~kvm_pmu->host_cross_mapped_mask; /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ arr[global_ctrl].guest |= arr[pebs_enable].guest; } From e88f3dd4f0277957710586b4a90003e29868eccc Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 31 Aug 2022 16:53:22 +0800 Subject: [PATCH 065/120] KVM: x86/pmu: Avoid setting BIT_ULL(-1) to pmu->host_cross_mapped_mask commit c23981df6642eec1da94a8125ec0ec402f7b1b7b upstream. In the extreme case of host counters multiplexing and contention, the perf_event requested by the guest's pebs counter is not allocated to any actual physical counter, in which case hw.idx is bookkept as -1, resulting in an out-of-bounds access to host_cross_mapped_mask. Fixes: 854250329c02 ("KVM: x86/pmu: Disable guest PEBS temporarily in two rare situations") Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-2-likexu@tencent.com [sean: expand comment to explain how a negative idx can be encountered] Signed-off-by: Sean Christopherson Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 1d736e7578e90..ef35b79882d2f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -794,20 +794,23 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) { struct kvm_pmc *pmc = NULL; - int bit; + int bit, hw_idx; for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX) { pmc = intel_pmc_idx_to_pmc(pmu, bit); if (!pmc || !pmc_speculative_in_use(pmc) || - !intel_pmc_is_enabled(pmc)) + !intel_pmc_is_enabled(pmc) || !pmc->perf_event) continue; - if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) { - pmu->host_cross_mapped_mask |= - BIT_ULL(pmc->perf_event->hw.idx); - } + /* + * A negative index indicates the event isn't mapped to a + * physical counter in the host, e.g. due to contention. + */ + hw_idx = pmc->perf_event->hw.idx; + if (hw_idx != pmc->idx && hw_idx > -1) + pmu->host_cross_mapped_mask |= BIT_ULL(hw_idx); } } From 24586b2e5c6b25397c2f4928172108b8153c6eb4 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 31 Aug 2022 16:53:23 +0800 Subject: [PATCH 066/120] KVM: x86/pmu: Don't generate PEBS records for emulated instructions commit f331601c65ad217a5c000ce20c26266d3f0aceb3 upstream. KVM will accumulate an enabled counter for at least INSTRUCTIONS or BRANCH_INSTRUCTION hw event from any KVM emulated instructions, generating emulated overflow interrupt on counter overflow, which in theory should also happen when the PEBS counter overflows but it currently lacks this part of the underlying support (e.g. through software injection of records in the irq context or a lazy approach). In this case, KVM skips the injection of this BUFFER_OVF PMI (effectively dropping one PEBS record) and let the overflow counter move on. The loss of a single sample does not introduce a loss of accuracy, but is easily noticeable for certain specific instructions. This issue is expected to be addressed along with the issue of PEBS cross-mapped counters with a slow-path proposal. Fixes: 79f3e3b58386 ("KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter") Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-3-likexu@tencent.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e833729cfee50..85c30a700f20f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -96,9 +96,19 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) return; if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { - /* Indicate PEBS overflow PMI to guest. */ - skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, - (unsigned long *)&pmu->global_status); + if (!in_pmi) { + /* + * TODO: KVM is currently _choosing_ to not generate records + * for emulated instructions, avoiding BUFFER_OVF PMI when + * there are no records. Strictly speaking, it should be done + * as well in the right context to improve sampling accuracy. + */ + skip_pmi = true; + } else { + /* Indicate PEBS overflow PMI to guest. */ + skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, + (unsigned long *)&pmu->global_status); + } } else { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } From ca032a723b35c811300f5762b6b0fc76b7282ea2 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Thu, 22 Sep 2022 13:40:38 -0700 Subject: [PATCH 067/120] KVM: x86/pmu: Refactor PERF_GLOBAL_CTRL update helper for reuse by PEBS commit c0245b774203f7341ddb1cce29a6ee607857f325 upstream. Extract the "global ctrl" specific bits out of global_ctrl_changed() so that the helper only deals with reprogramming general purpose counters, and rename the helper accordingly. PEBS needs the same logic, i.e needs to reprogram counters associated when PEBS_ENABLE bits are toggled, and will use the helper in a future fix. No functional change intended. Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-4-likexu@tencent.com [sean: split to separate patch, write changelog] Signed-off-by: Sean Christopherson Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index ef35b79882d2f..83e9c29671aea 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -68,15 +68,11 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -/* function is called when global control register has been updated. */ -static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) { int bit; - u64 diff = pmu->global_ctrl ^ data; struct kvm_pmc *pmc; - pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { pmc = intel_pmc_idx_to_pmc(pmu, bit); if (pmc) @@ -413,7 +409,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct kvm_pmc *pmc; u32 msr = msr_info->index; u64 data = msr_info->data; - u64 reserved_bits; + u64 reserved_bits, diff; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: @@ -434,7 +430,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->global_ctrl == data) return 0; if (kvm_valid_perf_global_ctrl(pmu, data)) { - global_ctrl_changed(pmu, data); + diff = pmu->global_ctrl ^ data; + pmu->global_ctrl = data; + reprogram_counters(pmu, diff); return 0; } break; From dfb3f5b63b16ce1fe8e27ebe541dffb647ef87ef Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 31 Aug 2022 16:53:24 +0800 Subject: [PATCH 068/120] KVM: x86/pmu: Avoid using PEBS perf_events for normal counters commit cf52de619c67bd1f6b1cf2751c3827815f74a5a5 upstream. The check logic in the pmc_resume_counter() to determine whether a perf_event is reusable is partial and flawed, especially when it comes to a pseudocode sequence (contrived, but valid) like: - enabling a counter and its PEBS bit - enable global_ctrl - run workload - disable only the PEBS bit, leaving the global_ctrl bit enabled In this corner case, a perf_event created for PEBS can be reused by a normal counter before it has been released and recreated, and when this normal counter overflows, it triggers a PEBS interrupt (precise_ip != 0). To address this issue, reprogram all affected counters when PEBS_ENABLE change and reuse a counter if and only if PEBS exactly matches precise. Fixes: 79f3e3b58386 ("KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter") Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-4-likexu@tencent.com Signed-off-by: Sean Christopherson Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 4 ++-- arch/x86/kvm/vmx/pmu_intel.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 85c30a700f20f..989dd35c81a99 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -225,8 +225,8 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter))) return false; - if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) && - pmc->perf_event->attr.precise_ip) + if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) != + (!!pmc->perf_event->attr.precise_ip)) return false; /* reuse perf_event to serve as pmc_reprogram_counter() does*/ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 83e9c29671aea..96e14f5c3c966 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -447,7 +447,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->pebs_enable == data) return 0; if (!(data & pmu->pebs_enable_mask)) { + diff = pmu->pebs_enable ^ data; pmu->pebs_enable = data; + reprogram_counters(pmu, diff); return 0; } break; From c0a38b37fd3e1d0ba3253c36a79e65322647a06a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 19 Sep 2022 17:10:07 +0800 Subject: [PATCH 069/120] KVM: x86/pmu: Limit the maximum number of supported Intel GP counters commit 4f1fa2a1bbeb2feca436d2c86bf6f78dc4e5e4c4 upstream. The Intel Architectural IA32_PMCx MSRs addresses range allows for a maximum of 8 GP counters, and KVM cannot address any more. Introduce a local macro (named KVM_INTEL_PMC_MAX_GENERIC) and use it consistently to refer to the number of counters supported by KVM, thus avoiding possible out-of-bound accesses. Suggested-by: Jim Mattson Signed-off-by: Like Xu Reviewed-by: Jim Mattson Message-Id: <20220919091008.60695-2-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 6 +++++- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- arch/x86/kvm/x86.c | 12 +++++++----- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3b4f4031ba527..7498e963b6227 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -497,6 +497,10 @@ struct kvm_pmc { bool intr; }; +/* More counters may conflict with other existing Architectural MSRs */ +#define KVM_INTEL_PMC_MAX_GENERIC 8 +#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) +#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define KVM_PMC_MAX_FIXED 3 struct kvm_pmu { unsigned nr_arch_gp_counters; @@ -512,7 +516,7 @@ struct kvm_pmu { u64 reserved_bits; u64 raw_event_mask; u8 version; - struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; + struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; struct irq_work irq_work; DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 989dd35c81a99..f92e7d4916789 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -47,7 +47,7 @@ EXPORT_SYMBOL_GPL(kvm_pmu_cap); * code. Each pmc, stored in kvm_pmc.idx field, is unique across * all perf counters (both gp and fixed). The mapping relationship * between pmc and perf counters is as the following: - * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters + * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 96e14f5c3c966..8f04f4c874135 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -635,7 +635,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -661,7 +661,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmc *pmc = NULL; int i; - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2ff766586f308..e9dfcd2c35953 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1380,6 +1380,9 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, + + /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, @@ -1388,7 +1391,6 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, @@ -6779,14 +6781,14 @@ static void kvm_init_msr_list(void) intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 7: + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) + min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 7: + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) + min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_IA32_XFD: From e938c68c13b561285d74728e06504533442eb6dd Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 19 Sep 2022 17:10:08 +0800 Subject: [PATCH 070/120] KVM: x86/pmu: Limit the maximum number of supported AMD GP counters commit 556f3c9ad7c101aa16a43ef4539f3aabc1d7b32e upstream. The AMD PerfMonV2 specification allows for a maximum of 16 GP counters, but currently only 6 pairs of MSRs are accepted by KVM. While AMD64_NUM_COUNTERS_CORE is already equal to 6, increasing without adjusting msrs_to_save_all[] could result in out-of-bounds accesses. Therefore introduce a macro (named KVM_AMD_PMC_MAX_GENERIC) to refer to the number of counters supported by KVM. Signed-off-by: Like Xu Reviewed-by: Jim Mattson Message-Id: <20220919091008.60695-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/pmu.c | 7 ++++--- arch/x86/kvm/x86.c | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7498e963b6227..5fa7e17673a98 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -502,6 +502,7 @@ struct kvm_pmc { #define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define KVM_PMC_MAX_FIXED 3 +#define KVM_AMD_PMC_MAX_GENERIC 6 struct kvm_pmu { unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index ab6f0f4744574..b4c9872fd364d 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -322,9 +322,10 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC); + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE); + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC); - for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) { + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -337,7 +338,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) { + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e9dfcd2c35953..f17a439a92340 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1394,10 +1394,13 @@ static const u32 msrs_to_save_all[] = { MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + + /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; From d238017c77be0e7c545d74ec320a1ad6b89beb49 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 23 Sep 2022 00:13:54 +0000 Subject: [PATCH 071/120] KVM: x86/pmu: Defer reprogram_counter() to kvm_pmu_handle_event() commit 68fb4757e8678894530ee0b15c29a3567207b970 upstream. Batch reprogramming PMU counters by setting KVM_REQ_PMU and thus deferring reprogramming kvm_pmu_handle_event() to avoid reprogramming a counter multiple times during a single VM-Exit. Deferring programming will also allow KVM to fix a bug where immediately reprogramming a counter can result in sleeping (taking a mutex) while interrupts are disabled in the VM-Exit fastpath. Introduce kvm_pmu_request_counter_reprogam() to make it obvious that KVM is _requesting_ a reprogram and not actually doing the reprogram. Opportunistically refine related comments to avoid misunderstandings. Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-5-likexu@tencent.com Signed-off-by: Sean Christopherson Message-Id: <20220923001355.3741194-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: malathi Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/pmu.c | 17 ++++++++++++----- arch/x86/kvm/pmu.h | 6 +++++- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 6 +++--- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5fa7e17673a98..a900d57f3500c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -489,6 +489,7 @@ struct kvm_pmc { struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* + * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters. */ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f92e7d4916789..e581edce36143 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -91,7 +91,11 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) struct kvm_pmu *pmu = pmc_to_pmu(pmc); bool skip_pmi = false; - /* Ignore counters that have been reprogrammed already. */ + /* + * Ignore overflow events for counters that are scheduled to be + * reprogrammed, e.g. if a PMI for the previous event races with KVM's + * handling of a related guest WRMSR. + */ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) return; @@ -278,7 +282,7 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return allow_event; } -void reprogram_counter(struct kvm_pmc *pmc) +static void reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; @@ -320,7 +324,6 @@ void reprogram_counter(struct kvm_pmc *pmc) !(eventsel & ARCH_PERFMON_EVENTSEL_OS), eventsel & ARCH_PERFMON_EVENTSEL_INT); } -EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { @@ -330,10 +333,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); - if (unlikely(!pmc || !pmc->perf_event)) { + if (unlikely(!pmc)) { clear_bit(bit, pmu->reprogram_pmi); continue; } + reprogram_counter(pmc); } @@ -532,12 +536,15 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, static inline bool cpl_is_matched(struct kvm_pmc *pmc) { bool select_os, select_user; - u64 config = pmc->current_config; + u64 config; if (pmc_is_gp(pmc)) { + config = pmc->eventsel; select_os = config & ARCH_PERFMON_EVENTSEL_OS; select_user = config & ARCH_PERFMON_EVENTSEL_USR; } else { + config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); select_os = config & 0x1; select_user = config & 0x2; } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 27c3846194475..394237b682827 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -180,7 +180,11 @@ static inline void kvm_init_pmu_capability(void) KVM_PMC_MAX_FIXED); } -void reprogram_counter(struct kvm_pmc *pmc); +static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc) +{ + set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); +} void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index b4c9872fd364d..9bb81cac7682d 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -289,7 +289,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } return 0; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 8f04f4c874135..3d71c3ad0a0f5 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } } @@ -76,7 +76,7 @@ static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { pmc = intel_pmc_idx_to_pmc(pmu, bit); if (pmc) - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } } @@ -491,7 +491,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { pmc->eventsel = data; - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) From 4b2c4402f985cedf5e382512a117d2b6d4e5c754 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Sep 2022 00:13:53 +0000 Subject: [PATCH 072/120] KVM: x86/pmu: Clear "reprogram" bit if counter is disabled or disallowed commit dcbb816a2842e41d3ec22605c6760837d800b20a upstream. When reprogramming a counter, clear the counter's "reprogram pending" bit if the counter is disabled (by the guest) or is disallowed (by the userspace filter). In both cases, there's no need to re-attempt programming on the next coincident KVM_REQ_PMU as enabling the counter by either method will trigger reprogramming. Signed-off-by: Sean Christopherson Message-Id: <20220923001355.3741194-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e581edce36143..9064ca72d933a 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -144,9 +144,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event, __kvm_perf_overflow(pmc, true); } -static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, - u64 config, bool exclude_user, - bool exclude_kernel, bool intr) +static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, + bool exclude_user, bool exclude_kernel, + bool intr) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; @@ -196,14 +196,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, if (IS_ERR(event)) { pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", PTR_ERR(event), pmc->idx); - return; + return PTR_ERR(event); } pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; - clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; pmc->intr = intr || pebs; + return 0; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -237,7 +237,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) perf_event_enable(pmc->perf_event); pmc->is_paused = false; - clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; } @@ -292,10 +291,10 @@ static void reprogram_counter(struct kvm_pmc *pmc) pmc_pause_counter(pmc); if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) - return; + goto reprogram_complete; if (!check_pmu_event_filter(pmc)) - return; + goto reprogram_complete; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -313,16 +312,27 @@ static void reprogram_counter(struct kvm_pmc *pmc) } if (pmc->current_config == new_config && pmc_resume_counter(pmc)) - return; + goto reprogram_complete; pmc_release_perf_event(pmc); pmc->current_config = new_config; - pmc_reprogram_counter(pmc, PERF_TYPE_RAW, - (eventsel & pmu->raw_event_mask), - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + + /* + * If reprogramming fails, e.g. due to contention, leave the counter's + * regprogram bit set, i.e. opportunistically try again on the next PMU + * refresh. Don't make a new request as doing so can stall the guest + * if reprogramming repeatedly fails. + */ + if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT)) + return; + +reprogram_complete: + clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); } void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) From c9311e64cd840872a02e2bccdd7e4bdd4b030b45 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 23 Sep 2022 00:13:55 +0000 Subject: [PATCH 073/120] KVM: x86/pmu: Defer counter emulated overflow via pmc->prev_counter commit de0f619564f4713bd548b82d535a954ffa1ee7d8 upstream. Defer reprogramming counters and handling overflow via KVM_REQ_PMU when incrementing counters. KVM skips emulated WRMSR in the VM-Exit fastpath, the fastpath runs with IRQs disabled, skipping instructions can increment and reprogram counters, reprogramming counters can sleep, and sleeping is disallowed while IRQs are disabled. [*] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580 [*] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2981888, name: CPU 15/KVM [*] preempt_count: 1, expected: 0 [*] RCU nest depth: 0, expected: 0 [*] INFO: lockdep is turned off. [*] irq event stamp: 0 [*] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [*] hardirqs last disabled at (0): [] copy_process+0x146a/0x62d0 [*] softirqs last enabled at (0): [] copy_process+0x14a9/0x62d0 [*] softirqs last disabled at (0): [<0000000000000000>] 0x0 [*] Preemption disabled at: [*] [] vcpu_enter_guest+0x1001/0x3dc0 [kvm] [*] CPU: 17 PID: 2981888 Comm: CPU 15/KVM Kdump: 5.19.0-rc1-g239111db364c-dirty #2 [*] Call Trace: [*] [*] dump_stack_lvl+0x6c/0x9b [*] __might_resched.cold+0x22e/0x297 [*] __mutex_lock+0xc0/0x23b0 [*] perf_event_ctx_lock_nested+0x18f/0x340 [*] perf_event_pause+0x1a/0x110 [*] reprogram_counter+0x2af/0x1490 [kvm] [*] kvm_pmu_trigger_event+0x429/0x950 [kvm] [*] kvm_skip_emulated_instruction+0x48/0x90 [kvm] [*] handle_fastpath_set_msr_irqoff+0x349/0x3b0 [kvm] [*] vmx_vcpu_run+0x268e/0x3b80 [kvm_intel] [*] vcpu_enter_guest+0x1d22/0x3dc0 [kvm] Add a field to kvm_pmc to track the previous counter value in order to defer overflow detection to kvm_pmu_handle_event() (the counter must be paused before handling overflow, and that may increment the counter). Opportunistically shrink sizeof(struct kvm_pmc) a bit. Suggested-by: Wanpeng Li Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions") Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-6-likexu@tencent.com [sean: avoid re-triggering KVM_REQ_PMU on overflow, tweak changelog] Signed-off-by: Sean Christopherson Message-Id: <20220923001355.3741194-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/pmu.c | 32 ++++++++++++++++---------------- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a900d57f3500c..b45e3d8f5eb4d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -484,7 +484,10 @@ enum pmc_type { struct kvm_pmc { enum pmc_type type; u8 idx; + bool is_paused; + bool intr; u64 counter; + u64 prev_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; @@ -494,8 +497,6 @@ struct kvm_pmc { * ctrl value for fixed counters. */ u64 current_config; - bool is_paused; - bool intr; }; /* More counters may conflict with other existing Architectural MSRs */ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 9064ca72d933a..b50cedfe86bcc 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -91,14 +91,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) struct kvm_pmu *pmu = pmc_to_pmu(pmc); bool skip_pmi = false; - /* - * Ignore overflow events for counters that are scheduled to be - * reprogrammed, e.g. if a PMI for the previous event races with KVM's - * handling of a related guest WRMSR. - */ - if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) - return; - if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { if (!in_pmi) { /* @@ -116,7 +108,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) } else { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); if (!pmc->intr || skip_pmi) return; @@ -141,7 +132,17 @@ static void kvm_perf_overflow(struct perf_event *perf_event, { struct kvm_pmc *pmc = perf_event->overflow_handler_context; + /* + * Ignore overflow events for counters that are scheduled to be + * reprogrammed, e.g. if a PMI for the previous event races with KVM's + * handling of a related guest WRMSR. + */ + if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi)) + return; + __kvm_perf_overflow(pmc, true); + + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, @@ -296,6 +297,9 @@ static void reprogram_counter(struct kvm_pmc *pmc) if (!check_pmu_event_filter(pmc)) goto reprogram_complete; + if (pmc->counter < pmc->prev_counter) + __kvm_perf_overflow(pmc, false); + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -333,6 +337,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) reprogram_complete: clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); + pmc->prev_counter = 0; } void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) @@ -521,14 +526,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - u64 prev_count; - - prev_count = pmc->counter; + pmc->prev_counter = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - - reprogram_counter(pmc); - if (pmc->counter < prev_count) - __kvm_perf_overflow(pmc, false); + kvm_pmu_request_counter_reprogam(pmc); } static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 9bb81cac7682d..2e94c2344b5a3 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -342,7 +342,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmc *pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 3d71c3ad0a0f5..2a934c99a73c0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -665,14 +665,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmc = &pmu->fixed_counters[i]; pmc_stop_counter(pmc); - pmc->counter = 0; + pmc->counter = pmc->prev_counter = 0; } pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; From 3735770e135b26a5c2f710abf8d0dc7c9f1b9053 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Nov 2022 20:58:39 +0800 Subject: [PATCH 074/120] KVM: x86: Update KVM-only leaf handling to allow for 100% KVM-only leafs commit 047c7229906152fb85c23dc18fd25a00cd7cb4de upstream. Rename kvm_cpu_cap_init_scattered() to kvm_cpu_cap_init_kvm_defined() in anticipation of adding KVM-only CPUID leafs that aren't recognized by the kernel and thus not scattered, i.e. for leafs that are 100% KVM-defined. Adjust/add comments to kvm_only_cpuid_leafs and KVM_X86_FEATURE to document how to create new kvm_only_cpuid_leafs entries for scattered features as well as features that are entirely unknown to the kernel. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221125125845.1182922-3-jiaxi.chen@linux.intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 8 ++++---- arch/x86/kvm/reverse_cpuid.h | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index fb2c5fd449d31..c2315479027e0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -471,9 +471,9 @@ static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) } static __always_inline -void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) +void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask) { - /* Use kvm_cpu_cap_mask for non-scattered leafs. */ + /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */ BUILD_BUG_ON(leaf < NCAPINTS); kvm_cpu_caps[leaf] = mask; @@ -483,7 +483,7 @@ void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) { - /* Use kvm_cpu_cap_init_scattered for scattered leafs. */ + /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */ BUILD_BUG_ON(leaf >= NCAPINTS); kvm_cpu_caps[leaf] &= mask; @@ -593,7 +593,7 @@ void kvm_set_cpu_caps(void) F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); - kvm_cpu_cap_init_scattered(CPUID_12_EAX, + kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, SF(SGX1) | SF(SGX2) ); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 94c0eb4dc8e66..96f19b7b14f80 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -7,9 +7,9 @@ #include /* - * Hardware-defined CPUID leafs that are scattered in the kernel, but need to - * be directly used by KVM. Note, these word values conflict with the kernel's - * "bug" caps, but KVM doesn't use those. + * Hardware-defined CPUID leafs that are either scattered by the kernel or are + * unknown to the kernel, but need to be directly used by KVM. Note, these + * word values conflict with the kernel's "bug" caps, but KVM doesn't use those. */ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, @@ -18,6 +18,18 @@ enum kvm_only_cpuid_leafs { NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, }; +/* + * Define a KVM-only feature flag. + * + * For features that are scattered by cpufeatures.h, __feature_translate() also + * needs to be updated to translate the kernel-defined feature into the + * KVM-defined feature. + * + * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h, + * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so + * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is + * needed in this case. + */ #define KVM_X86_FEATURE(w, f) ((w)*32 + (f)) /* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */ From 0c925f451fc81b89c3bf2a6a7363b0fbcc466555 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 7 Oct 2022 15:16:44 -0700 Subject: [PATCH 075/120] KVM: x86: Advertise that the SMM_CTL MSR is not supported commit 74bee0cad8dcd8ddec5e763c369239fc5990676a upstream. CPUID.80000021H:EAX[bit 9] indicates that the SMM_CTL MSR (0xc0010116) is not supported. This defeature can be advertised by KVM_GET_SUPPORTED_CPUID regardless of whether or not the host enumerates it; currently it will be included only if the host enumerates at least leaf 8000001DH, due to a preexisting bug in QEMU that KVM has to work around (commit f751d8eac176, "KVM: x86: work around QEMU issue with synthetic CPUID leaves", 2022-04-29). Signed-off-by: Jim Mattson Message-Id: <20221007221644.138355-1-jmattson@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c2315479027e0..a06f9b9d0b7ce 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1153,8 +1153,12 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * Other defined bits are for MSRs that KVM does not expose: * EAX 3 SPCL, SMM page configuration lock * EAX 13 PCMSR, Prefetch control MSR + * + * KVM doesn't support SMM_CTL. + * EAX 9 SMM_CTL MSR is not supported */ entry->eax &= BIT(0) | BIT(2) | BIT(6); + entry->eax |= BIT(9); if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)) entry->eax |= BIT(2); if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) From 9ea190029d39ba1b313c118e2e16d449f28a8971 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:34 +0000 Subject: [PATCH 076/120] KVM: x86: Move Intel Processor Trace interrupt handler to vmx.c commit 33271a9e2b52e07e278a67c900d2d2afb5c55bd5 upstream. Now that all state needed for VMX's PT interrupt handler is exposed to vmx.c (specifically the currently running vCPU), move the handler into vmx.c where it belongs. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211111020738.2512932-14-seanjc@google.com Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx/vmx.c | 22 +++++++++++++++++++++- arch/x86/kvm/x86.c | 20 +------------------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b45e3d8f5eb4d..7b9641be32152 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1553,7 +1553,7 @@ struct kvm_x86_init_ops { int (*disabled_by_bios)(void); int (*check_processor_compatibility)(void); int (*hardware_setup)(void); - bool (*intel_pt_intr_in_guest)(void); + unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; struct kvm_pmu_ops *pmu_ops; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 36fb846776b61..a5944ba7835b8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8060,6 +8060,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; +static unsigned int vmx_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* '0' on failure so that the !PT case can use a RET0 static call. */ + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); + return 1; +} + static __init void vmx_setup_user_return_msrs(void) { @@ -8086,6 +8100,8 @@ static __init void vmx_setup_user_return_msrs(void) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } +static struct kvm_x86_init_ops vmx_init_ops __initdata; + static __init int hardware_setup(void) { unsigned long host_bndcfgs; @@ -8253,6 +8269,10 @@ static __init int hardware_setup(void) return -EINVAL; if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + if (pt_mode == PT_MODE_HOST_GUEST) + vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + else + vmx_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); @@ -8281,7 +8301,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, - .intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest, + .handle_intel_pt_intr = NULL, .runtime_ops = &vmx_x86_ops, .pmu_ops = &intel_pmu_ops, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f17a439a92340..4e695d70c7711 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8870,20 +8870,6 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -static unsigned int kvm_handle_intel_pt_intr(void) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - - /* '0' on failure so that the !PT case can use a RET0 static call. */ - if (!kvm_arch_pmi_in_guest(vcpu)) - return 0; - - kvm_make_request(KVM_REQ_PMI, vcpu); - __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, - (unsigned long *)&vcpu->arch.pmu.global_status); - return 1; -} - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -11666,11 +11652,7 @@ int kvm_arch_hardware_setup(void *opaque) kvm_ops_update(ops); - /* Temporary ugliness. */ - if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) - kvm_register_perf_callbacks(kvm_handle_intel_pt_intr); - else - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; From 87f4331094a0564ef22c3b5bee642867abacbc2c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:52 +0000 Subject: [PATCH 077/120] KVM: x86: Move hardware setup/unsetup to init/exit commit b7483387e374977396152298bfbddee1ca0a062e upstream. Now that kvm_arch_hardware_setup() is called immediately after kvm_arch_init(), fold the guts of kvm_arch_hardware_(un)setup() into kvm_arch_{init,exit}() as a step towards dropping one of the hooks. To avoid having to unwind various setup, e.g registration of several notifiers, slot in the vendor hardware setup before the registration of said notifiers and callbacks. Introducing a functional change while moving code is less than ideal, but the alternative is adding a pile of unwinding code, which is much more error prone, e.g. several attempts to move the setup code verbatim all introduced bugs. Add a comment to document that kvm_ops_update() is effectively the point of no return, e.g. it sets the kvm_x86_ops.hardware_enable canary and so needs to be unwound. [Backport changes] Backport commit 938c8745bcf2f73: Skipped due to conflicts. Retained kvm_has_tsc_control, supported_xss, kvm_max_tsc_scaling_ratio, kvm_max_guest_tsc_khz, kvm_default_tsc_scaling_ratio and kvm_tsc_scaling_ratio_frac_bits variables instead of using struct kvm_caps for compatibility. In commit 880993138396f, the line: kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; was moved outside the conditional check. We didn't include this commit due to conflicts and kept it inside the check for backport compatibility. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-9-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 124 +++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 60 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e695d70c7711..58f4594f760db 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8927,6 +8927,24 @@ static struct notifier_block pvclock_gtod_notifier = { }; #endif +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) +{ + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); +#include +#undef __KVM_X86_OP + + kvm_pmu_ops_update(ops->pmu_ops); +} + int kvm_arch_init(void *opaque) { struct kvm_x86_init_ops *ops = opaque; @@ -8979,6 +8997,24 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; + rdmsrl_safe(MSR_EFER, &host_efer); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + kvm_init_pmu_capability(); + + r = ops->hardware_setup(); + if (r != 0) + goto out_mmu_exit; + + /* + * Point of no return! DO NOT add error paths below this point unless + * absolutely necessary, as most operations from this point forward + * require unwinding. + */ + kvm_ops_update(ops); + kvm_timer_init(); if (boot_cpu_has(X86_FEATURE_XSAVE)) { @@ -8995,8 +9031,33 @@ int kvm_arch_init(void *opaque) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + supported_xss = 0; + +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); +#undef __kvm_cpu_cap_has + + if (kvm_has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated because it will always + * be 1 on all machines. + */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); + kvm_max_guest_tsc_khz = max; + + kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + } + kvm_init_msr_list(); return 0; +out_mmu_exit: + kvm_mmu_vendor_module_exit(); out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: @@ -9007,6 +9068,8 @@ int kvm_arch_init(void *opaque) void kvm_arch_exit(void) { + kvm_unregister_perf_callbacks(); + #ifdef CONFIG_X86_64 if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) clear_hv_tscchange_cb(); @@ -9022,6 +9085,7 @@ void kvm_arch_exit(void) irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif + static_call(kvm_x86_hardware_unsetup)(); kvm_x86_ops.hardware_enable = NULL; kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); @@ -11616,74 +11680,14 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } -static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) -{ - memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - -#define __KVM_X86_OP(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP(func) \ - WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP -#define KVM_X86_OP_OPTIONAL_RET0(func) \ - static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ - (void *)__static_call_return0); -#include -#undef __KVM_X86_OP - - kvm_pmu_ops_update(ops->pmu_ops); -} - int kvm_arch_hardware_setup(void *opaque) { - struct kvm_x86_init_ops *ops = opaque; - int r; - - rdmsrl_safe(MSR_EFER, &host_efer); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - - kvm_init_pmu_capability(); - - r = ops->hardware_setup(); - if (r != 0) - return r; - - kvm_ops_update(ops); - - kvm_register_perf_callbacks(ops->handle_intel_pt_intr); - - if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - supported_xss = 0; - -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) - cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); -#undef __kvm_cpu_cap_has - - if (kvm_has_tsc_control) { - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated because it will always - * be 1 on all machines. - */ - u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; - - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; - } - - kvm_init_msr_list(); return 0; } void kvm_arch_hardware_unsetup(void) { - kvm_unregister_perf_callbacks(); - static_call(kvm_x86_hardware_unsetup)(); } int kvm_arch_check_processor_compat(void *opaque) From ff51bcea6f97087b89172588e5e98b0ba9b4ad38 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:57 +0000 Subject: [PATCH 078/120] KVM: x86: Move guts of kvm_arch_init() to standalone helper commit 4f8396b96a9fc672964842fe7adbe8ddca8a3adf upstream. Move the guts of kvm_arch_init() to a new helper, kvm_x86_vendor_init(), so that VMX can do _all_ arch and vendor initialization before calling kvm_init(). Calling kvm_init() must be the _very_ last step during init, as kvm_init() exposes /dev/kvm to userspace, i.e. allows creating VMs. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-14-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm/svm.c | 23 +++++++++++++++++++++-- arch/x86/kvm/vmx/vmx.c | 21 +++++++++++++++------ arch/x86/kvm/x86.c | 15 +++++++++++++-- 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7b9641be32152..9104aed5b9c18 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1578,6 +1578,9 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); +void kvm_x86_vendor_exit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9086fd1a5296..8da7ae7c22518 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4720,15 +4720,34 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static int __init svm_init(void) { + int r; + __unused_size_checks(); - return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + r = kvm_x86_vendor_init(&svm_init_ops); + if (r) + return r; + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), + __alignof__(struct vcpu_svm), THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + kvm_x86_vendor_exit(); + return r; } static void __exit svm_exit(void) { kvm_exit(); + kvm_x86_vendor_exit(); } module_init(svm_init) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a5944ba7835b8..f2d1104d69d6d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8325,6 +8325,7 @@ static void vmx_exit(void) #endif kvm_exit(); + kvm_x86_vendor_exit(); #if IS_ENABLED(CONFIG_HYPERV) if (static_branch_unlikely(&enable_evmcs)) { @@ -8393,23 +8394,25 @@ static int __init vmx_init(void) } #endif + r = kvm_x86_vendor_init(&vmx_init_ops); + if (r) + return r; + r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - return r; + goto err_kvm_init; /* - * Must be called after kvm_init() so enable_ept is properly set + * Must be called after common x86 init so enable_ept is properly set * up. Hand the parameter mitigation value in which was stored in * the pre module init parser. If no parameter was given, it will * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + if (r) + goto err_l1d_flush; vmx_setup_fb_clear_ctrl(); @@ -8434,5 +8437,11 @@ static int __init vmx_init(void) allow_smaller_maxphyaddr = true; return 0; + +err_l1d_flush: + vmx_exit(); +err_kvm_init: + kvm_x86_vendor_exit(); + return r; } module_init(vmx_init); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 58f4594f760db..c86a82099cbf5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8947,7 +8947,16 @@ static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) int kvm_arch_init(void *opaque) { - struct kvm_x86_init_ops *ops = opaque; + return 0; +} + +void kvm_arch_exit(void) +{ + +} + +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +{ int r; if (kvm_x86_ops.hardware_enable) { @@ -9065,8 +9074,9 @@ int kvm_arch_init(void *opaque) out: return r; } +EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); -void kvm_arch_exit(void) +void kvm_x86_vendor_exit(void) { kvm_unregister_perf_callbacks(); @@ -9095,6 +9105,7 @@ void kvm_arch_exit(void) WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif } +EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) { From 156277704cdaca69b3da5870596293757cc89f8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:59 +0000 Subject: [PATCH 079/120] KVM: x86: Serialize vendor module initialization (hardware setup) commit 3af4a9e61e71117d5df2df3e1605fa062e04b869 upstream. Acquire a new mutex, vendor_module_lock, in kvm_x86_vendor_init() while doing hardware setup to ensure that concurrent calls are fully serialized. KVM rejects attempts to load vendor modules if a different module has already been loaded, but doesn't handle the case where multiple vendor modules are loaded at the same time, and module_init() doesn't run under the global module_mutex. Note, in practice, this is likely a benign bug as no platform exists that supports both SVM and VMX, i.e. barring a weird VM setup, one of the vendor modules is guaranteed to fail a support check before modifying common KVM state. Alternatively, KVM could perform an atomic CMPXCHG on .hardware_enable, but that comes with its own ugliness as it would require setting .hardware_enable before success is guaranteed, e.g. attempting to load the "wrong" could result in spurious failure to load the "right" module. Introduce a new mutex as using kvm_lock is extremely deadlock prone due to kvm_lock being taken under cpus_write_lock(), and in the future, under under cpus_read_lock(). Any operation that takes cpus_read_lock() while holding kvm_lock would potentially deadlock, e.g. kvm_timer_init() takes cpus_read_lock() to register a callback. In theory, KVM could avoid such problematic paths, i.e. do less setup under kvm_lock, but avoiding all calls to cpus_read_lock() is subtly difficult and thus fragile. E.g. updating static calls also acquires cpus_read_lock(). Inverting the lock ordering, i.e. always taking kvm_lock outside cpus_read_lock(), is not a viable option as kvm_lock is taken in various callbacks that may be invoked under cpus_read_lock(), e.g. x86's kvmclock_cpufreq_notifier(). The lockdep splat below is dependent on future patches to take cpus_read_lock() in hardware_enable_all(), but as above, deadlock is already is already possible. ====================================================== WARNING: possible circular locking dependency detected 6.0.0-smp--7ec93244f194-init2 #27 Tainted: G O ------------------------------------------------------ stable/251833 is trying to acquire lock: ffffffffc097ea28 (kvm_lock){+.+.}-{3:3}, at: hardware_enable_all+0x1f/0xc0 [kvm] but task is already holding lock: ffffffffa2456828 (cpu_hotplug_lock){++++}-{0:0}, at: hardware_enable_all+0xf/0xc0 [kvm] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (cpu_hotplug_lock){++++}-{0:0}: cpus_read_lock+0x2a/0xa0 __cpuhp_setup_state+0x2b/0x60 __kvm_x86_vendor_init+0x16a/0x1870 [kvm] kvm_x86_vendor_init+0x23/0x40 [kvm] 0xffffffffc0a4d02b do_one_initcall+0x110/0x200 do_init_module+0x4f/0x250 load_module+0x1730/0x18f0 __se_sys_finit_module+0xca/0x100 __x64_sys_finit_module+0x1d/0x20 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (kvm_lock){+.+.}-{3:3}: __lock_acquire+0x16f4/0x30d0 lock_acquire+0xb2/0x190 __mutex_lock+0x98/0x6f0 mutex_lock_nested+0x1b/0x20 hardware_enable_all+0x1f/0xc0 [kvm] kvm_dev_ioctl+0x45e/0x930 [kvm] __se_sys_ioctl+0x77/0xc0 __x64_sys_ioctl+0x1d/0x20 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpu_hotplug_lock); lock(kvm_lock); lock(cpu_hotplug_lock); lock(kvm_lock); *** DEADLOCK *** 1 lock held by stable/251833: #0: ffffffffa2456828 (cpu_hotplug_lock){++++}-{0:0}, at: hardware_enable_all+0xf/0xc0 [kvm] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-16-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- Documentation/virt/kvm/locking.rst | 6 ++++++ arch/x86/kvm/x86.c | 18 ++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 5d27da3568361..33469fa9e7938 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -257,3 +257,9 @@ time it will be set using the Dirty tracking mechanism described above. wakeup notification event since external interrupts from the assigned devices happens, we will find the vCPU on the list to wakeup. + +``vendor_module_lock`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:Type: mutex +:Arch: x86 +:Protects: loading a vendor module (kvm_amd or kvm_intel) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c86a82099cbf5..2370b4d3d11ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -126,6 +126,7 @@ static int sync_regs(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); +static DEFINE_MUTEX(vendor_module_lock); struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -8955,7 +8956,7 @@ void kvm_arch_exit(void) } -int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { int r; @@ -9074,6 +9075,17 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) out: return r; } + +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +{ + int r; + + mutex_lock(&vendor_module_lock); + r = __kvm_x86_vendor_init(ops); + mutex_unlock(&vendor_module_lock); + + return r; +} EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); void kvm_x86_vendor_exit(void) @@ -9096,7 +9108,6 @@ void kvm_x86_vendor_exit(void) cancel_work_sync(&pvclock_gtod_work); #endif static_call(kvm_x86_hardware_unsetup)(); - kvm_x86_ops.hardware_enable = NULL; kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); @@ -9104,6 +9115,9 @@ void kvm_x86_vendor_exit(void) static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif + mutex_lock(&vendor_module_lock); + kvm_x86_ops.hardware_enable = NULL; + mutex_unlock(&vendor_module_lock); } EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); From 1b908fa5be556243ef1fe09269f999561b0e48fa Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 24 Jan 2023 10:33:13 -0600 Subject: [PATCH 080/120] KVM: x86: Move open-coded CPUID leaf 0x80000021 EAX bit propagation code commit c35ac8c4bf600ee23bacb20f863aa7830efb23fb upstream. Move code from __do_cpuid_func() to kvm_set_cpu_caps() in preparation for adding the features in their native leaf. Also drop the bit description comments as it will be more self-describing once the individual features are added. Whilst there, switch to using the more efficient cpu_feature_enabled() instead of static_cpu_has(). Note, LFENCE_RDTSC and "NULL selector clears base" are currently synthetic, Linux-defined feature flags as Linux tracking of the features predates AMD's definition. Keep the manual propagation of the flags from their synthetic counterparts until the kernel fully converts to AMD's definition, otherwise KVM would stop synthesizing the flags as intended. Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov (AMD) Acked-by: Sean Christopherson Link: https://lore.kernel.org/r/20230124163319.2277355-3-kim.phillips@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a06f9b9d0b7ce..2436165c19d1d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -658,6 +658,17 @@ void kvm_set_cpu_caps(void) 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) | F(SME_COHERENT)); + kvm_cpu_cap_mask(CPUID_8000_0021_EAX, + BIT(0) /* NO_NESTED_DATA_BP */ | + BIT(2) /* LFENCE Always serializing */ | 0 /* SmmPgCfgLock */ | + BIT(6) /* NULL_SEL_CLR_BASE */ | 0 /* PrefetchCtlMsr */ + ); + if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) + kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(2) /* LFENCE Always serializing */; + if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) + kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(6) /* NULL_SEL_CLR_BASE */; + kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(9) /* NO_SMM_CTL_MSR */; + kvm_cpu_cap_mask(CPUID_C000_0001_EDX, F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | @@ -1144,25 +1155,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; case 0x80000021: entry->ebx = entry->ecx = entry->edx = 0; - /* - * Pass down these bits: - * EAX 0 NNDBP, Processor ignores nested data breakpoints - * EAX 2 LAS, LFENCE always serializing - * EAX 6 NSCB, Null selector clear base - * - * Other defined bits are for MSRs that KVM does not expose: - * EAX 3 SPCL, SMM page configuration lock - * EAX 13 PCMSR, Prefetch control MSR - * - * KVM doesn't support SMM_CTL. - * EAX 9 SMM_CTL MSR is not supported - */ - entry->eax &= BIT(0) | BIT(2) | BIT(6); - entry->eax |= BIT(9); - if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)) - entry->eax |= BIT(2); - if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) - entry->eax |= BIT(6); + cpuid_entry_override(entry, CPUID_8000_0021_EAX); break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: From 13c34a71d9ef84bf7fde2e45e30142e8aab2b0c9 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 24 Jan 2023 10:33:16 -0600 Subject: [PATCH 081/120] x86/cpu, kvm: Add the Null Selector Clears Base feature commit 5b909d4ae59aedc711b7a432da021be0e82c95a0 upstream. The Null Selector Clears Base feature was being open-coded for KVM. Add it to its newly added native CPUID leaf 0x80000021 EAX proper. Also drop the bit description comments now it's more self-describing. [ bp: Convert test in check_null_seg_clears_base() too. ] Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov (AMD) Acked-by: Sean Christopherson Link: https://lore.kernel.org/r/20230124163319.2277355-6-kim.phillips@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/common.c | 4 +--- arch/x86/kvm/cpuid.c | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index bef9b899727e1..ec9dc6377945e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -444,6 +444,7 @@ * Reuse free bits when adding new feature flags! */ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ +#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ /* * BUG word(s) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 919f391543025..f3dc98523c434 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1544,9 +1544,7 @@ void check_null_seg_clears_base(struct cpuinfo_x86 *c) if (!IS_ENABLED(CONFIG_X86_64)) return; - /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ - if (c->extended_cpuid_level >= 0x80000021 && - cpuid_eax(0x80000021) & BIT(6)) + if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE)) return; /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2436165c19d1d..5004d7896f372 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -661,12 +661,12 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_0021_EAX, BIT(0) /* NO_NESTED_DATA_BP */ | BIT(2) /* LFENCE Always serializing */ | 0 /* SmmPgCfgLock */ | - BIT(6) /* NULL_SEL_CLR_BASE */ | 0 /* PrefetchCtlMsr */ + F(NULL_SEL_CLR_BASE) | 0 /* PrefetchCtlMsr */ ); if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(2) /* LFENCE Always serializing */; if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) - kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(6) /* NULL_SEL_CLR_BASE */; + kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE); kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(9) /* NO_SMM_CTL_MSR */; kvm_cpu_cap_mask(CPUID_C000_0001_EDX, From 3d55887fcf697fdba7ba8a18b66d10e00828e164 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 24 Jan 2023 10:33:18 -0600 Subject: [PATCH 082/120] x86/cpu: Support AMD Automatic IBRS commit e7862eda309ecfccc36bb5558d937ed3ace07f3f upstream. The AMD Zen4 core supports a new feature called Automatic IBRS. It is a "set-and-forget" feature that means that, like Intel's Enhanced IBRS, h/w manages its IBRS mitigation resources automatically across CPL transitions. The feature is advertised by CPUID_Fn80000021_EAX bit 8 and is enabled by setting MSR C000_0080 (EFER) bit 21. Enable Automatic IBRS by default if the CPU feature is present. It typically provides greater performance over the incumbent generic retpolines mitigation. Reuse the SPECTRE_V2_EIBRS spectre_v2_mitigation enum. AMD Automatic IBRS and Intel Enhanced IBRS have similar enablement. Add NO_EIBRS_PBRSB to cpu_vuln_whitelist, since AMD Automatic IBRS isn't affected by PBRSB-eIBRS. The kernel command line option spectre_v2=eibrs is used to select AMD Automatic IBRS, if available. Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov (AMD) Acked-by: Sean Christopherson Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20230124163319.2277355-8-kim.phillips@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- Documentation/admin-guide/hw-vuln/spectre.rst | 6 +++--- .../admin-guide/kernel-parameters.txt | 6 +++--- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/bugs.c | 20 +++++++++++-------- arch/x86/kernel/cpu/common.c | 19 ++++++++++-------- 6 files changed, 32 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 0fba3758d0da8..990aabab37e7b 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -622,9 +622,9 @@ kernel command line. retpoline,generic Retpolines retpoline,lfence LFENCE; indirect branch retpoline,amd alias for retpoline,lfence - eibrs enhanced IBRS - eibrs,retpoline enhanced IBRS + Retpolines - eibrs,lfence enhanced IBRS + LFENCE + eibrs Enhanced/Auto IBRS + eibrs,retpoline Enhanced/Auto IBRS + Retpolines + eibrs,lfence Enhanced/Auto IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0c0eede472ac8..ac58b1b82fff7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5403,9 +5403,9 @@ retpoline,generic - Retpolines retpoline,lfence - LFENCE; indirect branch retpoline,amd - alias for retpoline,lfence - eibrs - enhanced IBRS - eibrs,retpoline - enhanced IBRS + Retpolines - eibrs,lfence - enhanced IBRS + LFENCE + eibrs - Enhanced/Auto IBRS + eibrs,retpoline - Enhanced/Auto IBRS + Retpolines + eibrs,lfence - Enhanced/Auto IBRS + LFENCE ibrs - use IBRS to protect kernel Not specifying this option is equivalent to diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ec9dc6377945e..766ac61355f2d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -445,6 +445,7 @@ */ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ +#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ /* * BUG word(s) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 0c6a5cec945ec..9c0194109482c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -25,6 +25,7 @@ #define _EFER_SVME 12 /* Enable virtualization */ #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ +#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ #define EFER_SCE (1<<_EFER_SCE) #define EFER_LME (1<<_EFER_LME) @@ -33,6 +34,7 @@ #define EFER_SVME (1<<_EFER_SVME) #define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) +#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) /* Intel MSRs. Some also available on other CPUs */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7e0e1889ce0b3..8b3c6931387bf 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1330,9 +1330,9 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", - [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", - [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", - [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; @@ -1401,7 +1401,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -1586,8 +1586,12 @@ static void __init spectre_v2_select_mitigation(void) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); if (spectre_v2_in_ibrs_mode(mode)) { - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - update_spec_ctrl(x86_spec_ctrl_base); + if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { + msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); + } else { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + update_spec_ctrl(x86_spec_ctrl_base); + } } switch (mode) { @@ -1671,8 +1675,8 @@ static void __init spectre_v2_select_mitigation(void) /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS * and Enhanced IBRS protect firmware too, so enable IBRS around - * firmware calls only when IBRS / Enhanced IBRS aren't otherwise - * enabled. + * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't + * otherwise enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f3dc98523c434..359e2e3202f51 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1105,8 +1105,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* Zhaoxin Family 7 */ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), @@ -1226,8 +1226,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); - if (ia32_cap & ARCH_CAP_IBRS_ALL) + /* + * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature + * flag and protect from vendor-specific bugs via the whitelist. + */ + if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { @@ -1289,11 +1297,6 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_RETBLEED); } - if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && - !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) - setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); - if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) setup_force_cpu_bug(X86_BUG_SMT_RSB); From 3128bcfedd882aab6eb63fe966115d9dc2b78bba Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 24 Jan 2023 10:33:19 -0600 Subject: [PATCH 083/120] KVM: x86: Propagate the AMD Automatic IBRS feature to the guest commit 8c19b6f257fa71ed3a7a9df6ce466c6be31ca04c upstream. Add the AMD Automatic IBRS feature bit to those being propagated to the guest, and enable the guest EFER bit. Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov (AMD) Acked-by: Sean Christopherson Link: https://lore.kernel.org/r/20230124163319.2277355-9-kim.phillips@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm/svm.c | 3 +++ arch/x86/kvm/x86.c | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5004d7896f372..afec997697d52 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -661,7 +661,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_0021_EAX, BIT(0) /* NO_NESTED_DATA_BP */ | BIT(2) /* LFENCE Always serializing */ | 0 /* SmmPgCfgLock */ | - F(NULL_SEL_CLR_BASE) | 0 /* PrefetchCtlMsr */ + F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ ); if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(2) /* LFENCE Always serializing */; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8da7ae7c22518..d0c7324e72e24 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1007,6 +1007,9 @@ static __init int svm_hardware_setup(void) tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); + if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) + kvm_enable_efer_bits(EFER_AUTOIBRS); + /* Check for pause filtering support */ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { pause_filter_count = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2370b4d3d11ba..b1b052224476e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1627,6 +1627,9 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { + if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS)) + return false; + if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) return false; From bd2de9c616e26216d076b26748737fb0512a3535 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Jan 2023 23:49:00 +0000 Subject: [PATCH 084/120] KVM: x86/pmu: Cap kvm_pmu_cap.num_counters_gp at KVM's internal max commit 8911ce66697e04ce7e92753dbf5645f748e27e12 upstream. Limit kvm_pmu_cap.num_counters_gp during kvm_init_pmu_capability() based on the vendor PMU capabilities so that consuming num_counters_gp naturally does the right thing. This fixes a mostly theoretical bug where KVM could over-report its PMU support in KVM_GET_SUPPORTED_CPUID for leaf 0xA, e.g. if the number of counters reported by perf is greater than KVM's hardcoded internal limit. Incorporating input from the AMD PMU also avoids over-reporting MSRs to save when running on AMD. Link: https://lore.kernel.org/r/20230124234905.3774678-2-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.h | 5 ++++- arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 1 + arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 394237b682827..f255d959f12d3 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -37,6 +37,7 @@ struct kvm_pmu_ops { void (*reset)(struct kvm_vcpu *vcpu); void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + const int MAX_NR_GP_COUNTERS; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -157,7 +158,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; -static inline void kvm_init_pmu_capability(void) +static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; @@ -176,6 +177,8 @@ static inline void kvm_init_pmu_capability(void) } kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, KVM_PMC_MAX_FIXED); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 2e94c2344b5a3..3e5ce5f08c04c 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -359,4 +359,5 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .refresh = amd_pmu_refresh, .init = amd_pmu_init, .reset = amd_pmu_reset, + .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 2a934c99a73c0..87d07ebe601c6 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -829,4 +829,5 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, + .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b1b052224476e..8d2c569884a4a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6790,12 +6790,12 @@ static void kvm_init_msr_list(void) break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) + kvm_pmu_cap.num_counters_gp) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) + kvm_pmu_cap.num_counters_gp) continue; break; case MSR_IA32_XFD: @@ -9015,7 +9015,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); - kvm_init_pmu_capability(); + kvm_init_pmu_capability(ops->pmu_ops); r = ops->hardware_setup(); if (r != 0) From 4770c2f4e28873912c4f6f5965688ddb22acc2fe Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Jan 2023 23:49:02 +0000 Subject: [PATCH 085/120] KVM: x86/pmu: Use separate array for defining "PMU MSRs to save" commit 2374b7310b662e29e3468d510bfaded60fbae99c upstream. Move all potential to-be-saved PMU MSRs into a separate array so that a future patch can easily omit all PMU MSRs from the list when the PMU is disabled. No functional change intended. Link: https://lore.kernel.org/r/20230124234905.3774678-4-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 153 ++++++++++++++++++++++++--------------------- 1 file changed, 82 insertions(+), 71 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d2c569884a4a..5654c6a0f700c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1360,7 +1360,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); * may depend on host virtualization features rather than host cpu features. */ -static const u32 msrs_to_save_all[] = { +static const u32 msrs_to_save_base[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1377,6 +1377,10 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, +}; + +static const u32 msrs_to_save_pmu[] = { MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, @@ -1401,11 +1405,10 @@ static const u32 msrs_to_save_all[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - - MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; static unsigned num_msrs_to_save; static const u32 emulated_msrs_all[] = { @@ -6731,84 +6734,92 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } -static void kvm_init_msr_list(void) +static void kvm_probe_msr_to_save(u32 msr_index) { u32 dummy[2]; + + if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) + return; + + /* + * Even MSRs that are valid in the host may not be exposed to guests in + * some cases. + */ + switch (msr_index) { + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported()) + return; + break; + case MSR_TSC_AUX: + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) + return; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + return; + break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) + return; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + return; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + return; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (msr_index - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) + return; + break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: + if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + return; + break; + default: + break; + } + + msrs_to_save[num_msrs_to_save++] = msr_index; +} + +static void kvm_init_msr_list(void) +{ unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, - "Please update the fixed PMCs in msrs_to_saved_all[]"); + "Please update the fixed PMCs in msrs_to_save_pmu[]"); num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; - for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { - if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) - continue; - - /* - * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. - */ - switch (msrs_to_save_all[i]) { - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) - continue; - break; - case MSR_TSC_AUX: - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && - !kvm_cpu_cap_has(X86_FEATURE_RDPID)) - continue; - break; - case MSR_IA32_UMWAIT_CONTROL: - if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) - continue; - break; - case MSR_IA32_RTIT_CTL: - case MSR_IA32_RTIT_STATUS: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) - continue; - break; - case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) - continue; - break; - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && - !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) - continue; - break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= - intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) - continue; - break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - kvm_pmu_cap.num_counters_gp) - continue; - break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - kvm_pmu_cap.num_counters_gp) - continue; - break; - case MSR_IA32_XFD: - case MSR_IA32_XFD_ERR: - if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) - continue; - break; - default: - break; - } + for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) + kvm_probe_msr_to_save(msrs_to_save_base[i]); - msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; - } + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) From c69fec708c8fea332e7b441cb308c64871438dbf Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Fri, 31 Mar 2023 10:31:16 +0100 Subject: [PATCH 086/120] docs: kvm: x86: Fix broken field list commit fb5015bc8b733323b58f015b88e4f316010ec856 upstream. Add a missing ":" to fix a broken field list. Signed-off-by: Takahiro Itazuri Fixes: ba7bb663f554 ("KVM: x86: Provide per VM capability for disabling PMU virtualization") Message-Id: <20230331093116.99820-1-itazur@amazon.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- Documentation/virt/kvm/api.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index ce4ae0b2689a7..cdb72015843e1 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7427,11 +7427,11 @@ ENOSYS for the others. 8.35 KVM_CAP_PMU_CAPABILITY --------------------------- -:Capability KVM_CAP_PMU_CAPABILITY +:Capability: KVM_CAP_PMU_CAPABILITY :Architectures: x86 :Type: vm :Parameters: arg[0] is bitmask of PMU virtualization capabilities. -:Returns 0 on success, -EINVAL when arg[0] contains invalid bits +:Returns: 0 on success, -EINVAL when arg[0] contains invalid bits This capability alters PMU virtualization in KVM. From 650bb0d08b1735a56ac39982ad45c370be7f8c14 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 14 Feb 2023 13:07:46 +0800 Subject: [PATCH 087/120] KVM: x86/pmu: Rename pmc_is_enabled() to pmc_is_globally_enabled() commit cdd2fbf6360e84c02ac4cea11733083c2ca8dace upstream. The name of function pmc_is_enabled() is a bit misleading. A PMC can be disabled either by PERF_CLOBAL_CTRL or by its corresponding EVTSEL. Append global semantics to its name. Suggested-by: Jim Mattson Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230214050757.9623-2-likexu@tencent.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b50cedfe86bcc..67106b2938bbf 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -73,7 +73,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static inline bool pmc_is_enabled(struct kvm_pmc *pmc) +static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) { return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); } @@ -291,7 +291,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) pmc_pause_counter(pmc); - if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) + if (!pmc_speculative_in_use(pmc) || !pmc_is_globally_enabled(pmc)) goto reprogram_complete; if (!check_pmu_event_filter(pmc)) @@ -571,7 +571,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); - if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) + if (!pmc || !pmc_is_globally_enabled(pmc) || !pmc_speculative_in_use(pmc)) continue; /* Ignore checks for edge detect, pin control, invert and CMASK bits */ From 0c7b90ec037ed29e7b4b8856ad164ade4b905122 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 Jan 2023 17:08:03 -0800 Subject: [PATCH 088/120] KVM: VMX: Refactor intel_pmu_{g,}set_msr() to align with other helpers commit 8bca8c5ce40b03862d782b9991da146909199327 upstream. Invert the flows in intel_pmu_{g,s}et_msr()'s case statements so that they follow the kernel's preferred style of: if () return return which is also the style used by every other {g,s}et_msr() helper (except AMD's PMU variant, which doesn't use a switch statement). Modify the "set" paths with costly side effects, i.e. that reprogram counters, to skip only the side effects, i.e. to perform reserved bits checks even if the value is unchanged. None of the reserved bits checks are expensive, so there's no strong justification for skipping them, and guarding only the side effect makes it slightly more obvious what is being skipped and why. No functional change intended (assuming no reserved bit bugs). Link: https://lkml.kernel.org/r/Y%2B6cfen%2FCpO3%2FdLO%40google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 109 ++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 87d07ebe601c6..a73cc9cd07fbc 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -362,45 +362,47 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: msr_info->data = pmu->fixed_ctr_ctrl; - return 0; + break; case MSR_CORE_PERF_GLOBAL_STATUS: msr_info->data = pmu->global_status; - return 0; + break; case MSR_CORE_PERF_GLOBAL_CTRL: msr_info->data = pmu->global_ctrl; - return 0; + break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; - return 0; + break; case MSR_IA32_PEBS_ENABLE: msr_info->data = pmu->pebs_enable; - return 0; + break; case MSR_IA32_DS_AREA: msr_info->data = pmu->ds_area; - return 0; + break; case MSR_PEBS_DATA_CFG: msr_info->data = pmu->pebs_data_cfg; - return 0; + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { u64 val = pmc_read_counter(pmc); msr_info->data = val & pmu->counter_bitmask[KVM_PMC_GP]; - return 0; + break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { u64 val = pmc_read_counter(pmc); msr_info->data = val & pmu->counter_bitmask[KVM_PMC_FIXED]; - return 0; + break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { msr_info->data = pmc->eventsel; - return 0; - } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) - return 0; + break; + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) { + break; + } + return 1; } - return 1; + return 0; } static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -413,58 +415,56 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (pmu->fixed_ctr_ctrl == data) - return 0; - if (!(data & pmu->fixed_ctr_ctrl_mask)) { + if (data & pmu->fixed_ctr_ctrl_mask) + return 1; + + if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); - return 0; - } break; case MSR_CORE_PERF_GLOBAL_STATUS: - if (msr_info->host_initiated) { - pmu->global_status = data; - return 0; - } - break; /* RO MSR */ + if (!msr_info->host_initiated) + return 1; /* RO MSR */ + + pmu->global_status = data; + break; case MSR_CORE_PERF_GLOBAL_CTRL: - if (pmu->global_ctrl == data) - return 0; - if (kvm_valid_perf_global_ctrl(pmu, data)) { + if (!kvm_valid_perf_global_ctrl(pmu, data)) + return 1; + + if (pmu->global_ctrl != data) { diff = pmu->global_ctrl ^ data; pmu->global_ctrl = data; reprogram_counters(pmu, diff); - return 0; } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & pmu->global_ovf_ctrl_mask)) { - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - return 0; - } + if (data & pmu->global_ovf_ctrl_mask) + return 1; + + if (!msr_info->host_initiated) + pmu->global_status &= ~data; break; case MSR_IA32_PEBS_ENABLE: - if (pmu->pebs_enable == data) - return 0; - if (!(data & pmu->pebs_enable_mask)) { + if (data & pmu->pebs_enable_mask) + return 1; + + if (pmu->pebs_enable != data) { diff = pmu->pebs_enable ^ data; pmu->pebs_enable = data; reprogram_counters(pmu, diff); - return 0; } break; case MSR_IA32_DS_AREA: if (is_noncanonical_address(data, vcpu)) return 1; + pmu->ds_area = data; - return 0; + break; case MSR_PEBS_DATA_CFG: - if (pmu->pebs_data_cfg == data) - return 0; - if (!(data & pmu->pebs_data_cfg_mask)) { - pmu->pebs_data_cfg = data; - return 0; - } + if (data & pmu->pebs_data_cfg_mask) + return 1; + + pmu->pebs_data_cfg = data; break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || @@ -472,33 +472,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((msr & MSR_PMC_FULL_WIDTH_BIT) && (data & ~pmu->counter_bitmask[KVM_PMC_GP])) return 1; + if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); pmc_update_sample_period(pmc); - return 0; + break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); pmc_update_sample_period(pmc); - return 0; + break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { - if (data == pmc->eventsel) - return 0; reserved_bits = pmu->reserved_bits; if ((pmc->idx == 2) && (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; - if (!(data & reserved_bits)) { + if (data & reserved_bits) + return 1; + + if (data != pmc->eventsel) { pmc->eventsel = data; kvm_pmu_request_counter_reprogam(pmc); - return 0; } - } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) - return 0; + break; + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) { + break; + } + /* Not a known PMU MSR. */ + return 1; } - return 1; + return 0; } static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) From 9e4068ff65b6b6b178097d6c024a7e7df5c595cc Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 14 Feb 2023 13:07:48 +0800 Subject: [PATCH 089/120] KVM: x86/pmu: Rewrite reprogram_counters() to improve performance commit 649bccd7fac98225525c79cf4b1cecc4bafdfc54 upstream. A valid pmc is always tested before using pmu->reprogram_pmi. Eliminate this part of the redundancy by setting the counter's bitmask directly, and in addition, trigger KVM_REQ_PMU only once to save more cpu cycles. Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230214050757.9623-4-likexu@tencent.com Signed-off-by: Sean Christopherson Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index a73cc9cd07fbc..68c4fd2afd0cd 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -71,13 +71,13 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) { int bit; - struct kvm_pmc *pmc; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { - pmc = intel_pmc_idx_to_pmc(pmu, bit); - if (pmc) - kvm_pmu_request_counter_reprogam(pmc); - } + if (!diff) + return; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + set_bit(bit, pmu->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); } static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) From 0bd1d8f64731a5ee8446822983a24eb21bea85ee Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 10 Mar 2023 19:33:49 +0800 Subject: [PATCH 090/120] KVM: x86/pmu: Fix a typo in kvm_pmu_request_counter_reprogam() commit 4fa5843d81fdce9ba7e03f8a666e2f8e592aec2b upstream. Fix a "reprogam" => "reprogram" typo in kvm_pmu_request_counter_reprogam(). Fixes: 68fb4757e867 ("KVM: x86/pmu: Defer reprogram_counter() to kvm_pmu_handle_event()") Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230310113349.31799-1-likexu@tencent.com [sean: trim the changelog] Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 67106b2938bbf..4ce34ceb9deec 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -528,7 +528,7 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { pmc->prev_counter = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index f255d959f12d3..fc27da1e1b0bf 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -183,7 +183,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) KVM_PMC_MAX_FIXED); } -static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc) +static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 3e5ce5f08c04c..9c4b01ff2595d 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -289,7 +289,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } return 0; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 68c4fd2afd0cd..976ae2da53df2 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } } @@ -493,7 +493,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data != pmc->eventsel) { pmc->eventsel = data; - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } break; } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) { From 69aa68e6638c396c1348563ce72d0579a2685eeb Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Tue, 7 Mar 2023 14:13:56 +0000 Subject: [PATCH 091/120] KVM: x86/pmu: Prevent the PMU from counting disallowed events commit dfdeda67ea2dac57d2d7506d65cfe5a0878ad285 upstream. When counting "Instructions Retired" (0xc0) in a guest, KVM will occasionally increment the PMU counter regardless of if that event is being filtered. This is because some PMU events are incremented via kvm_pmu_trigger_event(), which doesn't know about the event filter. Add the event filter to kvm_pmu_trigger_event(), so events that are disallowed do not increment their counters. Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions") Signed-off-by: Aaron Lewis Reviewed-by: Like Xu Link: https://lore.kernel.org/r/20230307141400.1486314-2-aaronlewis@google.com [sean: prepend "pmc" to the new function] Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 4ce34ceb9deec..8fb82bceead8e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -282,6 +282,12 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return allow_event; } +static bool pmc_event_is_allowed(struct kvm_pmc *pmc) +{ + return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && + check_pmu_event_filter(pmc); +} + static void reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -291,10 +297,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) pmc_pause_counter(pmc); - if (!pmc_speculative_in_use(pmc) || !pmc_is_globally_enabled(pmc)) - goto reprogram_complete; - - if (!check_pmu_event_filter(pmc)) + if (!pmc_event_is_allowed(pmc)) goto reprogram_complete; if (pmc->counter < pmc->prev_counter) @@ -571,7 +574,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); - if (!pmc || !pmc_is_globally_enabled(pmc) || !pmc_speculative_in_use(pmc)) + if (!pmc || !pmc_event_is_allowed(pmc)) continue; /* Ignore checks for edge detect, pin control, invert and CMASK bits */ From 178162078a84e1cab6e91b6ec06f18751a182d7a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jun 2023 18:10:47 -0700 Subject: [PATCH 092/120] KVM: x86/pmu: Rename global_ovf_ctrl_mask to global_status_mask commit 53550b89220beda42934a76a61313ecf308b2b03 upstream. Rename global_ovf_ctrl_mask to global_status_mask to avoid confusion now that Intel has renamed GLOBAL_OVF_CTRL to GLOBAL_STATUS_RESET in PMU v4. GLOBAL_OVF_CTRL and GLOBAL_STATUS_RESET are the same MSR index, i.e. are just different names for the same thing, but the SDM provides different entries in the IA-32 Architectural MSRs table, which gets really confusing when looking at PMU v4 definitions since it *looks* like GLOBAL_STATUS has bits that don't exist in GLOBAL_OVF_CTRL, but in reality the bits are simply defined in the GLOBAL_STATUS_RESET entry. No functional change intended. Cc: Like Xu Link: https://lore.kernel.org/r/20230603011058.1038821-2-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9104aed5b9c18..8d96d97b34adf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -515,7 +515,7 @@ struct kvm_pmu { u64 global_status; u64 counter_bitmask[2]; u64 global_ctrl_mask; - u64 global_ovf_ctrl_mask; + u64 global_status_mask; u64 reserved_bits; u64 raw_event_mask; u8 version; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 976ae2da53df2..a395bde4be013 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -438,7 +438,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (data & pmu->global_ovf_ctrl_mask) + /* + * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in + * GLOBAL_STATUS, and so the set of reserved bits is the same. + */ + if (data & pmu->global_status_mask) return 1; if (!msr_info->host_initiated) @@ -540,7 +544,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; pmu->global_ctrl_mask = ~0ull; - pmu->global_ovf_ctrl_mask = ~0ull; + pmu->global_status_mask = ~0ull; pmu->fixed_ctr_ctrl_mask = ~0ull; pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; @@ -584,11 +588,17 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); pmu->global_ctrl_mask = counter_mask; - pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask + + /* + * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) + * share reserved bit definitions. The kernel just happens to use + * OVF_CTRL for the names. + */ + pmu->global_status_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_ovf_ctrl_mask &= + pmu->global_status_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); From cc6aed805ddae41105f93603e128e6358b0df735 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:48 -0700 Subject: [PATCH 093/120] KVM: x86/pmu: Move reprogram_counters() to pmu.h commit 8de18543dfe34a6aac9deefb0256db2609cfa351 upstream. Move reprogram_counters() out of Intel specific PMU code and into pmu.h so that it can be used to implement AMD PMU v2 support. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: rewrite changelog] Link: https://lore.kernel.org/r/20230603011058.1038821-3-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.h | 12 ++++++++++++ arch/x86/kvm/vmx/pmu_intel.c | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index fc27da1e1b0bf..b0152148a5316 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -189,6 +189,18 @@ static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } +static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff) +{ + int bit; + + if (!diff) + return; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + set_bit(bit, pmu->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); +} + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index a395bde4be013..b16b9bf2003dd 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -68,18 +68,6 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) -{ - int bit; - - if (!diff) - return; - - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - set_bit(bit, pmu->reprogram_pmi); - kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); -} - static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); From e5cac0e00bea9d1b6439f9a66a8661cecc8f8033 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:49 -0700 Subject: [PATCH 094/120] KVM: x86/pmu: Reject userspace attempts to set reserved GLOBAL_STATUS bits commit 30dab5c0b65ea580878f36d6ae7ff85f06813387 upstream. Reject userspace writes to MSR_CORE_PERF_GLOBAL_STATUS that attempt to set reserved bits. Allowing userspace to stuff reserved bits doesn't harm KVM itself, but it's architecturally wrong and the guest can't clear the unsupported bits, e.g. makes the guest's PMI handler very confused. Signed-off-by: Like Xu [sean: rewrite changelog to avoid use of #GP, rebase on name change] Link: https://lore.kernel.org/r/20230603011058.1038821-4-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b16b9bf2003dd..5ef3df41fa2c8 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -413,6 +413,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) return 1; /* RO MSR */ + if (data & pmu->global_status_mask) + return 1; + pmu->global_status = data; break; case MSR_CORE_PERF_GLOBAL_CTRL: From b819b154ad060661d2315eaf7d143f6c70aef38a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:50 -0700 Subject: [PATCH 095/120] KVM: x86/pmu: Move handling PERF_GLOBAL_CTRL and friends to common x86 commit c85cdc1cc1ea27573ab15c76f13a06f12530aa54 upstream. Move the handling of GLOBAL_CTRL, GLOBAL_STATUS, and GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, from Intel PMU code to generic x86 PMU code. AMD PerfMonV2 defines three registers that have the same semantics as Intel's variants, just with different names and indices. Conveniently, since KVM virtualizes GLOBAL_CTRL on Intel only for PMU v2 and above, and AMD's version shows up in v2, KVM can use common code for the existence check as well. [Backport changes] This change removes the condition that returns the value of pmu->version > 1 from the file `arch/x86/kvm/vmx/pmu_intel.c`, which was included in upstream commit b663f0b. Signed-off-by: Like Xu Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230603011058.1038821-5-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 71 ++++++++++++++++++++++++++++++++++-- arch/x86/kvm/pmu.h | 14 +++++++ arch/x86/kvm/vmx/nested.c | 4 +- arch/x86/kvm/vmx/pmu_intel.c | 48 +----------------------- arch/x86/kvm/vmx/vmx.h | 12 ------ 5 files changed, 86 insertions(+), 63 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 8fb82bceead8e..e9461d114e405 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -444,6 +444,14 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); + default: + break; + } return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } @@ -459,13 +467,70 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + msr_info->data = pmu->global_status; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + msr_info->data = pmu->global_ctrl; + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + msr_info->data = 0; + break; + default: + return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + } + + return 0; } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + u64 data = msr_info->data; + u64 diff; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + if (!msr_info->host_initiated) + return 1; /* RO MSR */ + + if (data & pmu->global_status_mask) + return 1; + + pmu->global_status = data; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (!kvm_valid_perf_global_ctrl(pmu, data)) + return 1; + + if (pmu->global_ctrl != data) { + diff = pmu->global_ctrl ^ data; + pmu->global_ctrl = data; + reprogram_counters(pmu, diff); + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + /* + * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in + * GLOBAL_STATUS, and so the set of reserved bits is the same. + */ + if (data & pmu->global_status_mask) + return 1; + + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + break; + default: + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); + return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + } + + return 0; } /* refresh PMU settings. This function generally is called when underlying diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index b0152148a5316..87edcec64ba00 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -42,6 +42,20 @@ struct kvm_pmu_ops { void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); +static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + * + * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2. + */ + return pmu->version > 1; +} + static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e000b57c605bb..ec044440ae389 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2621,7 +2621,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -4348,7 +4348,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5ef3df41fa2c8..1e9d2fd6baaa0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -92,7 +92,7 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!intel_pmu_has_perf_global_ctrl(pmu)) + if (!kvm_pmu_has_perf_global_ctrl(pmu)) return true; return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); @@ -196,12 +196,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - return intel_pmu_has_perf_global_ctrl(pmu); - ret = pmu->version > 1; - break; + return kvm_pmu_has_perf_global_ctrl(pmu); case MSR_IA32_PEBS_ENABLE: ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; break; @@ -351,15 +346,6 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_FIXED_CTR_CTRL: msr_info->data = pmu->fixed_ctr_ctrl; break; - case MSR_CORE_PERF_GLOBAL_STATUS: - msr_info->data = pmu->global_status; - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - msr_info->data = pmu->global_ctrl; - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - msr_info->data = 0; - break; case MSR_IA32_PEBS_ENABLE: msr_info->data = pmu->pebs_enable; break; @@ -409,36 +395,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); break; - case MSR_CORE_PERF_GLOBAL_STATUS: - if (!msr_info->host_initiated) - return 1; /* RO MSR */ - - if (data & pmu->global_status_mask) - return 1; - - pmu->global_status = data; - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (!kvm_valid_perf_global_ctrl(pmu, data)) - return 1; - - if (pmu->global_ctrl != data) { - diff = pmu->global_ctrl ^ data; - pmu->global_ctrl = data; - reprogram_counters(pmu, diff); - } - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - /* - * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in - * GLOBAL_STATUS, and so the set of reserved bits is the same. - */ - if (data & pmu->global_status_mask) - return 1; - - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - break; case MSR_IA32_PEBS_ENABLE: if (data & pmu->pebs_enable_mask) return 1; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f22dd34e4f2cb..1196aa6ddf45d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -92,18 +92,6 @@ union vmx_exit_reason { u32 full; }; -static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) -{ - /* - * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is - * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is - * greater than zero. However, KVM only exposes and emulates the MSR - * to/for the guest if the guest PMU supports at least "Architectural - * Performance Monitoring Version 2". - */ - return pmu->version > 1; -} - #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) From f6927af9fd488c4de868047f7d03325f4cc88ee1 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:51 -0700 Subject: [PATCH 096/120] KVM: x86/pmu: Provide Intel PMU's pmc_is_enabled() as generic x86 code commit 13afa29ae489d9b7c1038179f1e2bec74873e471 upstream. Move the Intel PMU implementation of pmc_is_enabled() to common x86 code as pmc_is_globally_enabled(), and drop AMD's implementation. AMD PMU currently supports only v1, and thus not PERF_GLOBAL_CONTROL, thus the semantics for AMD are unchanged. And when support for AMD PMU v2 comes along, the common behavior will also Just Work. Signed-off-by: Like Xu Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230603011058.1038821-6-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 1 - arch/x86/kvm/pmu.c | 5 ----- arch/x86/kvm/pmu.h | 16 +++++++++++++++- arch/x86/kvm/svm/pmu.c | 9 --------- arch/x86/kvm/vmx/pmu_intel.c | 14 +------------- 5 files changed, 16 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index fdfd8e06fee6d..b2f2b8c21b5b9 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -13,7 +13,6 @@ BUILD_BUG_ON(1) * at the call sites. */ KVM_X86_PMU_OP(pmc_perf_hw_id) -KVM_X86_PMU_OP(pmc_is_enabled) KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) KVM_X86_PMU_OP(msr_idx_to_pmc) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e9461d114e405..c5cc352722c4c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -73,11 +73,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) -{ - return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); -} - static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 87edcec64ba00..b89580d64c9e9 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -23,7 +23,6 @@ struct kvm_event_hw_type_mapping { struct kvm_pmu_ops { unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); - bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); @@ -215,6 +214,21 @@ static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff) kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); } +/* + * Check if a PMC is enabled by comparing it against global_ctrl bits. + * + * If the vPMU doesn't have global_ctrl MSR, all vPMCs are enabled. + */ +static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!kvm_pmu_has_perf_global_ctrl(pmu)) + return true; + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 9c4b01ff2595d..5acb7f0870756 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -181,14 +181,6 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) return event_mapping[i].event_type; } -/* check if a PMC is enabled by comparing it against global_ctrl bits. Because - * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). - */ -static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) -{ - return true; -} - static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER); @@ -348,7 +340,6 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu_ops amd_pmu_ops __initdata = { .pmc_perf_hw_id = amd_pmc_perf_hw_id, - .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 1e9d2fd6baaa0..747a6467d2275 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -87,17 +87,6 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) return intel_arch_events[i].event_type; } -/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ -static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (!kvm_pmu_has_perf_global_ctrl(pmu)) - return true; - - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { @@ -763,7 +752,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) pmc = intel_pmc_idx_to_pmc(pmu, bit); if (!pmc || !pmc_speculative_in_use(pmc) || - !intel_pmc_is_enabled(pmc) || !pmc->perf_event) + !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; /* @@ -778,7 +767,6 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) struct kvm_pmu_ops intel_pmu_ops __initdata = { .pmc_perf_hw_id = intel_pmc_perf_hw_id, - .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, From a9e0df8afccf59932c255d798ff2f8d72af594d0 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:52 -0700 Subject: [PATCH 097/120] KVM: x86: Explicitly zero cpuid "0xa" leaf when PMU is disabled commit 6593039d33c119968b9e2d85eaca853b391e8a55 upstream. Add an explicit !enable_pmu check as relying on kvm_pmu_cap to be zeroed isn't obvious. Although when !enable_pmu, KVM will have zero-padded kvm_pmu_cap to do subsequent CPUID leaf assignments. Suggested-by: Sean Christopherson Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230603011058.1038821-7-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index afec997697d52..6f7fe251c60c0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -870,7 +870,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) union cpuid10_eax eax; union cpuid10_edx edx; - if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { + if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } From b381765ff570ff8d95ec5831b46a6c25b1aee6dc Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:53 -0700 Subject: [PATCH 098/120] KVM: x86/pmu: Disable vPMU if the minimum num of counters isn't met commit 6a08083f294cb623fbc882417b28c5646b01f4bf upstream. Disable PMU support when running on AMD and perf reports fewer than four general purpose counters. All AMD PMUs must define at least four counters due to AMD's legacy architecture hardcoding the number of counters without providing a way to enumerate the number of counters to software, e.g. from AMD's APM: The legacy architecture defines four performance counters (PerfCtrn) and corresponding event-select registers (PerfEvtSeln). Virtualizing fewer than four counters can lead to guest instability as software expects four counters to be available. Rather than bleed AMD details into the common code, just define a const unsigned int and provide a convenient location to document why Intel and AMD have different mins (in particular, AMD's lack of any way to enumerate less than four counters to the guest). Keep the minimum number of counters at Intel at one, even though old P6 and Core Solo/Duo processor effectively require a minimum of two counters. KVM can, and more importantly has up until this point, supported a vPMU so long as the CPU has at least one counter. Perf's support for P6/Core CPUs does require two counters, but perf will happily chug along with a single counter when running on a modern CPU. [Backport changes] Adjusted tab space to align with upstream. commit style. No functional change was made to the code in this section. Cc: Jim Mattson Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: set Intel min to '1', not '2'] Link: https://lore.kernel.org/r/20230603011058.1038821-8-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.h | 17 ++++++++++++----- arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index b89580d64c9e9..37ff07eab8e70 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -37,6 +37,7 @@ struct kvm_pmu_ops { void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); const int MAX_NR_GP_COUNTERS; + const int MIN_NR_GP_COUNTERS; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -174,14 +175,20 @@ extern struct x86_pmu_capability kvm_pmu_cap; static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; perf_get_x86_pmu_capability(&kvm_pmu_cap); - /* - * For Intel, only support guest architectural pmu - * on a host with architectural pmu. - */ - if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) + /* + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. + */ + if (!kvm_pmu_cap.num_counters_gp || + WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_pmu_cap.version) enable_pmu = false; if (!enable_pmu) { diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 5acb7f0870756..3f703767b5349 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -351,4 +351,5 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .init = amd_pmu_init, .reset = amd_pmu_reset, .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, }; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 747a6467d2275..9101b2589ba4a 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -780,4 +780,5 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = 1, }; From a848f7eb348bc0765e904bef6568dd5cea83e6b7 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:54 -0700 Subject: [PATCH 099/120] KVM: x86/pmu: Advertise PERFCTR_CORE iff the min nr of counters is met commit d338d8789e64b2d8ed2f82f9364c415d6efa118d upstream. Enable and advertise PERFCTR_CORE if and only if the minimum number of required counters are available, i.e. if perf says there are less than six general purpose counters. Opportunistically, use kvm_cpu_cap_check_and_set() instead of open coding the check for host support. Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: massage shortlog and changelog] Link: https://lore.kernel.org/r/20230603011058.1038821-9-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/svm/svm.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d0c7324e72e24..6a3c3f0e40e30 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -957,9 +957,18 @@ static __init void svm_set_cpu_caps(void) boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - /* AMD PMU PERFCTR_CORE CPUID */ - if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + if (enable_pmu) { + /* + * Enumerate support for PERFCTR_CORE if and only if KVM has + * access to enough counters to virtualize "core" support, + * otherwise limit vPMU support to the legacy number of counters. + */ + if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE) + kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS, + kvm_pmu_cap.num_counters_gp); + else + kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); + } /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); From 23912f4275ec32de88bab252155dce3ccc873ee2 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:55 -0700 Subject: [PATCH 100/120] KVM: x86/pmu: Constrain the num of guest counters with kvm_pmu_cap commit 1c2bf8a6b045a6ac4e75a7a07fde70db63e5a380 upstream. Cap the number of general purpose counters enumerated on AMD to what KVM actually supports, i.e. don't allow userspace to coerce KVM into thinking there are more counters than actually exist, e.g. by enumerating X86_FEATURE_PERFCTR_CORE in guest CPUID when its not supported. Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: massage changelog] Link: https://lore.kernel.org/r/20230603011058.1038821-10-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/svm/pmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 3f703767b5349..d8d7434e8ed94 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -298,6 +298,9 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) else pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters, + kvm_pmu_cap.num_counters_gp); + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; From 821d1f3379e338acd83cdac865821c740b086778 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:56 -0700 Subject: [PATCH 101/120] KVM: x86/cpuid: Add a KVM-only leaf to redirect AMD PerfMonV2 flag commit fe8d76c1a6f04387953727ee3e63469956e7360d upstream. Add a KVM-only leaf for AMD's PerfMonV2 to redirect the kernel's scattered version to its architectural location, e.g. so that KVM can query guest support via guest_cpuid_has(). Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: massage changelog] Link: https://lore.kernel.org/r/20230603011058.1038821-11-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/kvm/reverse_cpuid.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 96f19b7b14f80..a43b180a9e4fe 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -13,6 +13,7 @@ */ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, + CPUID_8000_0022_EAX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -36,6 +37,9 @@ enum kvm_only_cpuid_leafs { #define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) +/* CPUID level 0x80000022 (EAX) */ +#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0) + struct cpuid_reg { u32 function; u32 index; @@ -61,6 +65,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, + [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, }; /* @@ -93,6 +98,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX1; else if (x86_feature == X86_FEATURE_SGX2) return KVM_X86_FEATURE_SGX2; + else if (x86_feature == X86_FEATURE_PERFMON_V2) + return KVM_X86_FEATURE_PERFMON_V2; return x86_feature; } From 7184c38e2ff5af11b7fcd607a514d877cdcad7d4 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:57 -0700 Subject: [PATCH 102/120] KVM: x86/svm/pmu: Add AMD PerfMonV2 support commit 4a2771895ca63a055df815be5e307cce8e85308c upstream. If AMD Performance Monitoring Version 2 (PerfMonV2) is detected by the guest, it can use a new scheme to manage the Core PMCs using the new global control and status registers. In addition to benefiting from the PerfMonV2 functionality in the same way as the host (higher precision), the guest also can reduce the number of vm-exits by lowering the total number of MSRs accesses. In terms of implementation details, amd_is_valid_msr() is resurrected since three newly added MSRs could not be mapped to one vPMC. The possibility of emulating PerfMonV2 on the mainframe has also been eliminated for reasons of precision. Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Like Xu [sean: drop "Based on the observed HW." comments] Link: https://lore.kernel.org/r/20230603011058.1038821-12-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 18 +++++++++++++- arch/x86/kvm/svm/pmu.c | 55 ++++++++++++++++++++++++++++++++++-------- arch/x86/kvm/x86.c | 10 ++++++++ 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c5cc352722c4c..28d922f710665 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -467,11 +467,14 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: msr_info->data = pmu->global_status; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_CORE_PERF_GLOBAL_CTRL: msr_info->data = pmu->global_ctrl; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; break; @@ -489,16 +492,28 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) u64 data = msr_info->data; u64 diff; + /* + * Note, AMD ignores writes to reserved bits and read-only PMU MSRs, + * whereas Intel generates #GP on attempts to write reserved/RO MSRs. + */ switch (msr) { case MSR_CORE_PERF_GLOBAL_STATUS: if (!msr_info->host_initiated) return 1; /* RO MSR */ + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + /* Per PPR, Read-only MSR. Writes are ignored. */ + if (!msr_info->host_initiated) + break; if (data & pmu->global_status_mask) return 1; pmu->global_status = data; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + data &= ~pmu->global_ctrl_mask; + fallthrough; case MSR_CORE_PERF_GLOBAL_CTRL: if (!kvm_valid_perf_global_ctrl(pmu, data)) return 1; @@ -516,7 +531,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ if (data & pmu->global_status_mask) return 1; - + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: if (!msr_info->host_initiated) pmu->global_status &= ~data; break; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index d8d7434e8ed94..c14e5881f51ea 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -222,12 +222,6 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[idx]; } -static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ - return false; -} - static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -239,6 +233,29 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + switch (msr) { + case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: + return pmu->version > 0; + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + return pmu->version > 1; + default: + if (msr > MSR_F15H_PERF_CTR5 && + msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters) + return pmu->version > 1; + break; + } + + return amd_msr_idx_to_pmc(vcpu, msr); +} + static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -292,23 +309,39 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void amd_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + union cpuid_0x80000022_ebx ebx; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + pmu->version = 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) { + pmu->version = 2; + /* + * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest + * CPUID entry is guaranteed to be non-NULL. + */ + BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 || + x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index); + ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx; + pmu->nr_arch_gp_counters = ebx.split.num_core_pmc; + } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; - else + } else { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + } pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters, kvm_pmu_cap.num_counters_gp); + if (pmu->version > 1) { + pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_mask = pmu->global_ctrl_mask; + } + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; - pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; - pmu->global_status = 0; bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } @@ -339,6 +372,8 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } + + pmu->global_ctrl = pmu->global_status = 0; } struct kvm_pmu_ops amd_pmu_ops __initdata = { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5654c6a0f700c..5a6e26db86fa1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1405,6 +1405,10 @@ static const u32 msrs_to_save_pmu[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + @@ -6792,6 +6796,12 @@ static void kvm_probe_msr_to_save(u32 msr_index) kvm_pmu_cap.num_counters_gp) return; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + return; + break; case MSR_IA32_XFD: case MSR_IA32_XFD_ERR: if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) From a801530e1b1b9c0dd7b489b9ecd36a8127ee67b3 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:58 -0700 Subject: [PATCH 103/120] KVM: x86/cpuid: Add AMD CPUID ExtPerfMonAndDbg leaf 0x80000022 commit 94cdeebd82111d7b7da5bd4da053eed9e0f65d72 upstream. CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 0 of EAX indicates support for Performance Monitoring Version 2 (PerfMonV2) features. If found to be set during PMU initialization, the EBX bits of the same CPUID function can be used to determine the number of available PMCs for different PMU types. Expose the relevant bits via KVM_GET_SUPPORTED_CPUID so that guests can make use of the PerfMonV2 features. Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230603011058.1038821-13-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 29 ++++++++++++++++++++++++++++- arch/x86/kvm/svm/svm.c | 4 ++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6f7fe251c60c0..30d82fd58d189 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -663,6 +663,11 @@ void kvm_set_cpu_caps(void) BIT(2) /* LFENCE Always serializing */ | 0 /* SmmPgCfgLock */ | F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ ); + + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, + F(PERFMON_V2) + ); + if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(2) /* LFENCE Always serializing */; if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) @@ -1052,7 +1057,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; break; case 0x80000000: - entry->eax = min(entry->eax, 0x80000021); + entry->eax = min(entry->eax, 0x80000022); /* * Serializing LFENCE is reported in a multitude of ways, and * NullSegClearsBase is not reported in CPUID on Zen2; help @@ -1157,6 +1162,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = entry->ecx = entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0021_EAX); break; + /* AMD Extended Performance Monitoring and Debug */ + case 0x80000022: { + union cpuid_0x80000022_ebx ebx; + + entry->ecx = entry->edx = 0; + if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { + entry->eax = entry->ebx; + break; + } + + cpuid_entry_override(entry, CPUID_8000_0022_EAX); + + if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; + else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; + else + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; + + entry->ebx = ebx.full; + break; + } /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6a3c3f0e40e30..698853607925e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -968,6 +968,10 @@ static __init void svm_set_cpu_caps(void) kvm_pmu_cap.num_counters_gp); else kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); + + if (kvm_pmu_cap.version != 2 || + !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2); } /* CPUID 0x8000001F (SME/SEV features) */ From 4293e10ffb7fb23d3e43513a287bfff6586646db Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Thu, 20 Jul 2023 14:47:27 -0500 Subject: [PATCH 104/120] x86/cpu: Enable STIBP on AMD if Automatic IBRS is enabled commit fd470a8beed88440b160d690344fbae05a0b9b1b upstream. Unlike Intel's Enhanced IBRS feature, AMD's Automatic IBRS does not provide protection to processes running at CPL3/user mode, see section "Extended Feature Enable Register (EFER)" in the APM v2 at https://bugzilla.kernel.org/attachment.cgi?id=304652 Explicitly enable STIBP to protect against cross-thread CPL3 branch target injections on systems with Automatic IBRS enabled. Also update the relevant documentation. Fixes: e7862eda309e ("x86/cpu: Support AMD Automatic IBRS") Reported-by: Tom Lendacky Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230720194727.67022-1-kim.phillips@amd.com Signed-off-by: PvsNarasimha --- Documentation/admin-guide/hw-vuln/spectre.rst | 11 +++++++---- arch/x86/kernel/cpu/bugs.c | 15 +++++++++------ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 990aabab37e7b..6c3904bc4670c 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -484,11 +484,14 @@ Spectre variant 2 Systems which support enhanced IBRS (eIBRS) enable IBRS protection once at boot, by setting the IBRS bit, and they're automatically protected against - Spectre v2 variant attacks, including cross-thread branch target injections - on SMT systems (STIBP). In other words, eIBRS enables STIBP too. + Spectre v2 variant attacks. - Legacy IBRS systems clear the IBRS bit on exit to userspace and - therefore explicitly enable STIBP for that + On Intel's enhanced IBRS systems, this includes cross-thread branch target + injections on SMT systems (STIBP). In other words, Intel eIBRS enables + STIBP, too. + + AMD Automatic IBRS does not protect userspace, and Legacy IBRS systems clear + the IBRS bit on exit to userspace, therefore both explicitly enable STIBP. The retpoline mitigation is turned on by default on vulnerable CPUs. It can be forced on or off by the administrator diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 8b3c6931387bf..9d0d460508eba 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1288,19 +1288,21 @@ spectre_v2_user_select_mitigation(void) } /* - * If no STIBP, enhanced IBRS is enabled, or SMT impossible, STIBP + * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP * is not required. * - * Enhanced IBRS also protects against cross-thread branch target + * Intel's Enhanced IBRS also protects against cross-thread branch target * injection in user-mode as the IBRS bit remains always set which * implicitly enables cross-thread protections. However, in legacy IBRS * mode, the IBRS bit is set only on kernel entry and cleared on return - * to userspace. This disables the implicit cross-thread protection, - * so allow for STIBP to be selected in that case. + * to userspace. AMD Automatic IBRS also does not protect userspace. + * These modes therefore disable the implicit cross-thread protection, + * so allow for STIBP to be selected in those cases. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) return; /* @@ -2595,7 +2597,8 @@ static ssize_t mmio_stale_data_show_state(char *buf) static char *stibp_state(void) { - if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS)) return ""; switch (spectre_v2_user_stibp) { From 4c921110a14e8fff8668344c370034551e6af7db Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 15:43:36 -0700 Subject: [PATCH 105/120] KVM: x86: Acquire SRCU read lock when handling fastpath MSR writes commit 3f2739bd1e0b7e9669333852b4e618294d5a1e54 upstream. Temporarily acquire kvm->srcu for read when potentially emulating WRMSR in the VM-Exit fastpath handler, as several of the common helpers used during emulation expect the caller to provide SRCU protection. E.g. if the guest is counting instructions retired, KVM will query the PMU event filter when stepping over the WRMSR. dump_stack+0x85/0xdf lockdep_rcu_suspicious+0x109/0x120 pmc_event_is_allowed+0x165/0x170 kvm_pmu_trigger_event+0xa5/0x190 handle_fastpath_set_msr_irqoff+0xca/0x1e0 svm_vcpu_run+0x5c3/0x7b0 [kvm_amd] vcpu_enter_guest+0x2108/0x2580 Alternatively, check_pmu_event_filter() could acquire kvm->srcu, but this isn't the first bug of this nature, e.g. see commit 5c30e8101e8d ("KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid"). Providing protection for the entirety of WRMSR emulation will allow reverting the aforementioned commit, and will avoid having to play whack-a-mole when new uses of SRCU-protected structures are inevitably added in common emulation helpers. [Backport changes] Retain old srcu_read_lock/unlock() for compatibility due to upstream conflict Upstream commit 2031f287689 renames srcu_read_lock/unlock() to kvm_vcpu_srcu_read_lock/unlock(). To avoid conflicts, the old implementation is retained for compatibility until the issue is resolved. Fixes: dfdeda67ea2d ("KVM: x86/pmu: Prevent the PMU from counting disallowed events") Reported-by: Greg Thelen Reported-by: Aaron Lewis Signed-off-by: Sean Christopherson Message-Id: <20230721224337.2335137-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a6e26db86fa1..336086b42b0f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2085,10 +2085,13 @@ static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; u32 msr = kvm_rcx_read(vcpu); u64 data; fastpath_t ret = EXIT_FASTPATH_NONE; + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); @@ -2111,6 +2114,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) if (ret != EXIT_FASTPATH_NONE) trace_kvm_msr_write(msr, data); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + return ret; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); From 5e24dfc85c49f8587be68daf753b7234247086f6 Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Thu, 4 May 2023 14:00:42 +0200 Subject: [PATCH 106/120] KVM: x86/pmu: Truncate counter value to allowed width on write commit b29a2acd36dd7a33c63f260df738fb96baa3d4f8 upstream. Performance counters are defined to have width less than 64 bits. The vPMU code maintains the counters in u64 variables but assumes the value to fit within the defined width. However, for Intel non-full-width counters (MSR_IA32_PERFCTRx) the value receieved from the guest is truncated to 32 bits and then sign-extended to full 64 bits. If a negative value is set, it's sign-extended to 64 bits, but then in kvm_pmu_incr_counter() it's incremented, truncated, and compared to the previous value for overflow detection. That previous value is not truncated, so it always evaluates bigger than the truncated new one, and a PMI is injected. If the PMI handler writes a negative counter value itself, the vCPU never quits the PMI loop. Turns out that Linux PMI handler actually does write the counter with the value just read with RDPMC, so when no full-width support is exposed via MSR_IA32_PERF_CAPABILITIES, and the guest initializes the counter to a negative value, it locks up. This has been observed in the field, for example, when the guest configures atop to use perfevents and runs two instances of it simultaneously. To address the problem, maintain the invariant that the counter value always fits in the defined bit width, by truncating the received value in the respective set_msr methods. For better readability, factor the out into a helper function, pmc_write_counter(), shared by vmx and svm parts. Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions") Cc: stable@vger.kernel.org Signed-off-by: Roman Kagan Link: https://lore.kernel.org/all/20230504120042.785651-1-rkagan@amazon.de Tested-by: Like Xu [sean: tweak changelog, s/set/write in the helper] Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.h | 6 ++++++ arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 37ff07eab8e70..e9fe7f07e1fab 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -75,6 +75,12 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } +static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) +{ + pmc->counter += val - pmc_read_counter(pmc); + pmc->counter &= pmc_bitmask(pmc); +} + static inline void pmc_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index c14e5881f51ea..045853230a347 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -288,7 +288,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_PERFCTRn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); return 0; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9101b2589ba4a..53354ef7f41d4 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -416,11 +416,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { From 37b9b3c676d1b76182d933d10cbcc5c173e5b67a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Thu, 23 Nov 2023 15:58:18 +0800 Subject: [PATCH 107/120] KVM: x86: Get CPL directly when checking if loaded vCPU is in kernel mode commit 547c91929f437b42e6a9c90fec1fb5aec3e64aac upstream. When querying whether or not a vCPU "is" running in kernel mode, directly get the CPL if the vCPU is the currently loaded vCPU. In scenarios where a guest is profiled via perf-kvm, querying vcpu->arch.preempted_in_kernel from kvm_guest_state() is wrong if vCPU is actively running, i.e. isn't scheduled out due to being preempted and so preempted_in_kernel is stale. This affects perf/core's ability to accurately tag guest RIP with PERF_RECORD_MISC_GUEST_{KERNEL|USER} and record it in the sample. This causes perf/tool to fail to connect the vCPU RIPs to the guest kernel space symbols when parsing these samples due to incorrect PERF_RECORD_MISC flags: Before (perf-report of a cpu-cycles sample): 1.23% :58945 [unknown] [u] 0xffffffff818012e0 After: 1.35% :60703 [kernel.vmlinux] [g] asm_exc_page_fault Note, checking preempted_in_kernel in kvm_arch_vcpu_in_kernel() is awful as nothing in the API's suggests that it's safe to use if and only if the vCPU was preempted. That can be cleaned up in the future, for now just fix the glaring correctness bug. Note #2, checking vcpu->preempted is NOT safe, as getting the CPL on VMX requires VMREAD, i.e. is correct if and only if the vCPU is loaded. If the target vCPU *was* preempted, then it can be scheduled back in after the check on vcpu->preempted in kvm_vcpu_on_spin(), i.e. KVM could end up trying to do VMREAD on a VMCS that isn't loaded on the current pCPU. Signed-off-by: Like Xu Fixes: e1bfc24577cc ("KVM: Move x86's perf guest info callbacks to generic KVM") Link: https://lore.kernel.org/r/20231123075818.12521-1-likexu@tencent.com [sean: massage changelong, add Fixes] Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 336086b42b0f5..62f5127b3e6e3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12370,7 +12370,10 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return true; - return vcpu->arch.preempted_in_kernel; + if (vcpu != kvm_get_running_vcpu()) + return vcpu->arch.preempted_in_kernel; + + return static_call(kvm_x86_get_cpl)(vcpu) == 0; } unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) From 9e0db4ba48253d3c2499afc484fcda0f29d89c1e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jan 2024 16:15:17 +0100 Subject: [PATCH 108/120] KVM: x86/pmu: fix masking logic for MSR_CORE_PERF_GLOBAL_CTRL commit 971079464001c6856186ca137778e534d983174a upstream. When commit c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") switched the initialization of cpuc->guest_switch_msrs to use compound literals, it screwed up the boolean logic: + u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; ... - arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; - arr[0].guest &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); + .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), Before the patch, the value of arr[0].guest would have been intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask. The intent is to always treat PEBS events as host-only because, while the guest runs, there is no way to tell the processor about the virtual address where to put PEBS records intended for the host. Unfortunately, the new expression can be expanded to (intel_ctrl & ~cpuc->intel_ctrl_host_mask) | (intel_ctrl & ~pebs_mask) which makes no sense; it includes any bit that isn't *both* marked as exclude_guest and using PEBS. So, reinstate the old logic. Another way to write it could be "intel_ctrl & ~(cpuc->intel_ctrl_host_mask | pebs_mask)", presumably the intention of the author of the faulty. However, I personally find the repeated application of A AND NOT B to be a bit more readable. This shows up as guest failures when running concurrent long-running perf workloads on the host, and was reported to happen with rcutorture. All guests on a given host would die simultaneously with something like an instruction fault or a segmentation violation. Reported-by: Paul E. McKenney Analyzed-by: Sean Christopherson Tested-by: Paul E. McKenney Cc: stable@vger.kernel.org Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/events/intel/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8e908f803e2a5..a6bf3686d023d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3997,12 +3997,17 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; int global_ctrl, pebs_enable; + /* + * In addition to obeying exclude_guest/exclude_host, remove bits being + * used for PEBS when running a guest, because PEBS writes to virtual + * addresses (not physical addresses). + */ *nr = 0; global_ctrl = (*nr)++; arr[global_ctrl] = (struct perf_guest_switch_msr){ .msr = MSR_CORE_PERF_GLOBAL_CTRL, .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask, - .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), + .guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask, }; if (!x86_pmu.pebs) From 03f92a49ffcc943f4ccc3f65bc73685aa88162bf Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 4 Apr 2023 15:17:59 +0800 Subject: [PATCH 109/120] KVM: x86/pmu: Zero out pmu->all_valid_pmc_idx each time it's refreshed commit 7e768ce8278bafe43e2a4771a82b61856190a3fc upstream. The kvm_pmu_refresh() may be called repeatedly (e.g. configure guest CPUID repeatedly or update MSR_IA32_PERF_CAPABILITIES) and each call will use the last pmu->all_valid_pmc_idx value, with the residual bits introducing additional overhead later in the vPMU emulation. Fixes: b35e5548b411 ("KVM: x86/vPMU: Add lazy mechanism to release perf_event per vPMC") Suggested-by: Sean Christopherson Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230404071759.75376-1-likexu@tencent.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 28d922f710665..fc19ac7e5c6e9 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -550,6 +550,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { + bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); static_call(kvm_x86_pmu_refresh)(vcpu); } From 481c9aefb5bf4b3e5d13339a410f5393180d619e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Mar 2023 16:46:04 -0800 Subject: [PATCH 110/120] KVM: x86/pmu: WARN and bug the VM if PMU is refreshed after vCPU has run commit 3a6de51a437fb4d2433f8a99fb59f43866cdbb98 upstream. Now that KVM disallows changing feature MSRs, i.e. PERF_CAPABILITIES, after running a vCPU, WARN and bug the VM if the PMU is refreshed after the vCPU has run. Note, KVM has disallowed CPUID updates after running a vCPU since commit feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN"), i.e. PERF_CAPABILITIES was the only remaining way to trigger a PMU refresh after KVM_RUN. [Backport changes] Upstream commit fb3146b4dc adds kvm_vcpu_has_run(), but due to conflicts, the patch is skipped. The API definition is added for backport compatibility. Cc: Like Xu Link: https://lore.kernel.org/r/20230311004618.920745-8-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 3 +++ arch/x86/kvm/x86.c | 8 ++++++++ arch/x86/kvm/x86.h | 5 +++++ 3 files changed, 16 insertions(+) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index fc19ac7e5c6e9..1cdc278a420d3 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -550,6 +550,9 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) + return; + bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); static_call(kvm_x86_pmu_refresh)(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 62f5127b3e6e3..3029404267f6b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3503,6 +3503,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~msr_ent.data) return 1; + /* + * Note, this is not just a performance optimization! KVM + * disallows changing feature MSRs after the vCPU has run; PMU + * refresh will bug the VM if called after the vCPU has run. + */ + if (vcpu->arch.perf_capabilities == data) + return 0; + vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); return 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 868d0712525ac..6d2c43b00b394 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -106,6 +106,11 @@ static inline unsigned int __shrink_ple_window(unsigned int val, void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); +static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.last_vmentry_cpu != -1; +} + static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { vcpu->arch.exception.pending = false; From 7f339d699da3e87158d5de1da5cb917e2e3d5635 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Nov 2023 18:28:48 -0800 Subject: [PATCH 111/120] KVM: x86/pmu: Zero out PMU metadata on AMD if PMU is disabled commit f933b88e20150f15787390e2a1754a7e412754ed upstream. Move the purging of common PMU metadata from intel_pmu_refresh() to kvm_pmu_refresh(), and invoke the vendor refresh() hook if and only if the VM is supposed to have a vPMU. KVM already denies access to the PMU based on kvm->arch.enable_pmu, as get_gp_pmc_amd() returns NULL for all PMCs in that case, i.e. KVM already violates AMD's architecture by not virtualizing a PMU (kernels have long since learned to not panic when the PMU is unavailable). But configuring the PMU as if it were enabled causes unwanted side effects, e.g. calls to kvm_pmu_trigger_event() waste an absurd number of cycles due to the all_valid_pmc_idx bitmap being non-zero. Fixes: b1d66dad65dc ("KVM: x86/svm: Add module param to control PMU virtualization") Reported-by: Konstantin Khorenko Closes: https://lore.kernel.org/all/20231109180646.2963718-2-khorenko@virtuozzo.com Link: https://lore.kernel.org/r/20231110022857.1273836-2-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/pmu.c | 20 ++++++++++++++++++-- arch/x86/kvm/vmx/pmu_intel.c | 15 ++------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 1cdc278a420d3..b6f217e493f7e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -550,11 +550,27 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) return; - bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); - static_call(kvm_x86_pmu_refresh)(vcpu); + pmu->version = 0; + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_mask = ~0ull; + pmu->global_status_mask = ~0ull; + pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; + bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); + + if (vcpu->kvm->arch.enable_pmu) + static_call(kvm_x86_pmu_refresh)(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 53354ef7f41d4..564da48c53593 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -472,22 +472,11 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) u64 counter_mask; int i; - pmu->nr_arch_gp_counters = 0; - pmu->nr_arch_fixed_counters = 0; - pmu->counter_bitmask[KVM_PMC_GP] = 0; - pmu->counter_bitmask[KVM_PMC_FIXED] = 0; - pmu->version = 0; - pmu->reserved_bits = 0xffffffff00200000ull; - pmu->raw_event_mask = X86_RAW_EVENT_MASK; - pmu->global_ctrl_mask = ~0ull; - pmu->global_status_mask = ~0ull; - pmu->fixed_ctr_ctrl_mask = ~0ull; - pmu->pebs_enable_mask = ~0ull; - pmu->pebs_data_cfg_mask = ~0ull; entry = kvm_find_cpuid_entry(vcpu, 0xa); - if (!entry || !vcpu->kvm->arch.enable_pmu) + if (!entry) return; + eax.full = entry->eax; edx.full = entry->edx; From e6945763a7872c9022c947efe31e0ecb044fdac4 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Tue, 23 Jan 2024 22:12:20 +0000 Subject: [PATCH 112/120] KVM: x86/pmu: Fix type length error when reading pmu->fixed_ctr_ctrl commit 05519c86d6997cfb9bb6c82ce1595d1015b718dc upstream. Use a u64 instead of a u8 when taking a snapshot of pmu->fixed_ctr_ctrl when reprogramming fixed counters, as truncating the value results in KVM thinking fixed counter 2 is already disabled (the bug also affects fixed counters 3+, but KVM doesn't yet support those). As a result, if the guest disables fixed counter 2, KVM will get a false negative and fail to reprogram/disable emulation of the counter, which can leads to incorrect counts and spurious PMIs in the guest. Fixes: 76d287b2342e ("KVM: x86/pmu: Drop "u8 ctrl, int idx" for reprogram_fixed_counter()") Cc: stable@vger.kernel.org Signed-off-by: Mingwei Zhang Link: https://lore.kernel.org/r/20240123221220.3911317-1-mizhang@google.com [sean: rewrite changelog to call out the effects of the bug] Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/pmu_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 564da48c53593..5d42193c7a70f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -38,7 +38,7 @@ static int fixed_pmc_events[] = {1, 0, 7}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { struct kvm_pmc *pmc; - u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; + u64 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; pmu->fixed_ctr_ctrl = data; From 2f14160d41bfa1079ce2a7117d793e547ca7f6df Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 25 Sep 2023 17:34:46 +0000 Subject: [PATCH 113/120] KVM: x86/pmu: Synthesize at most one PMI per VM-exit commit 73554b29bd70546c1a9efc9c160641ef1b849358 upstream. When the irq_work callback, kvm_pmi_trigger_fn(), is invoked during a VM-exit that also invokes __kvm_perf_overflow() as a result of instruction emulation, kvm_pmu_deliver_pmi() will be called twice before the next VM-entry. Calling kvm_pmu_deliver_pmi() twice is unlikely to be problematic now that KVM sets the LVTPC mask bit when delivering a PMI. But using IRQ work to trigger the PMI is still broken, albeit very theoretically. E.g. if the self-IPI to trigger IRQ work is be delayed long enough for the vCPU to be migrated to a different pCPU, then it's possible for kvm_pmi_trigger_fn() to race with the kvm_pmu_deliver_pmi() from KVM_REQ_PMI and still generate two PMIs. KVM could set the mask bit using an atomic operation, but that'd just be piling on unnecessary code to workaround what is effectively a hack. The *only* reason KVM uses IRQ work is to ensure the PMI is treated as a wake event, e.g. if the vCPU just executed HLT. Remove the irq_work callback for synthesizing a PMI, and all of the logic for invoking it. Instead, to prevent a vcpu from leaving C0 with a PMI pending, add a check for KVM_REQ_PMI to kvm_vcpu_has_events(). Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions") Signed-off-by: Jim Mattson Tested-by: Mingwei Zhang Tested-by: Dapeng Mi Signed-off-by: Mingwei Zhang Link: https://lore.kernel.org/r/20230925173448.3518223-2-mizhang@google.com [sean: massage changelog] Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/pmu.c | 27 +-------------------------- arch/x86/kvm/x86.c | 3 +++ 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8d96d97b34adf..708d488adfbe0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -521,7 +521,6 @@ struct kvm_pmu { u8 version; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; - struct irq_work irq_work; DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b6f217e493f7e..7a84d92a51532 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -73,14 +73,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static void kvm_pmi_trigger_fn(struct irq_work *irq_work) -{ - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - - kvm_pmu_deliver_pmi(vcpu); -} - static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -104,20 +96,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } - if (!pmc->intr || skip_pmi) - return; - - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. - */ - if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu)) - irq_work_queue(&pmc_to_pmu(pmc)->irq_work); - else + if (pmc->intr && !skip_pmi) kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } @@ -575,9 +554,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) void kvm_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - irq_work_sync(&pmu->irq_work); static_call(kvm_x86_pmu_reset)(vcpu); } @@ -587,7 +563,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) memset(pmu, 0, sizeof(*pmu)); static_call(kvm_x86_pmu_init)(vcpu); - init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); pmu->event_count = 0; pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3029404267f6b..1231ccea0c990 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12331,6 +12331,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) static_call(kvm_x86_smi_allowed)(vcpu, false))) return true; + if (kvm_test_request(KVM_REQ_PMI, vcpu)) + return true; + if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || kvm_guest_apic_has_interrupt(vcpu))) From 20ed35e7c8933b56373467b5960d97f1e91a70ca Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Feb 2024 11:18:36 +0100 Subject: [PATCH 114/120] KVM: x86: Use actual kvm_cpuid.base for clearing KVM_FEATURE_PV_UNHALT commit 4736d85f0d18ad0469439a0ebc7ccb0cd94bd754 upstream. Commit ee3a5f9e3d9b ("KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries") moved tweaking of the supplied CPUID data earlier in kvm_set_cpuid() but __kvm_update_cpuid_runtime() actually uses 'vcpu->arch.kvm_cpuid' (though __kvm_find_kvm_cpuid_features()) which gets set later in kvm_set_cpuid(). In some cases, e.g. when kvm_set_cpuid() is called for the first time and 'vcpu->arch.kvm_cpuid' is clear, __kvm_find_kvm_cpuid_features() fails to find KVM PV feature entry and the logic which clears KVM_FEATURE_PV_UNHALT after enabling KVM_X86_DISABLE_EXITS_HLT does not work. The logic, introduced by the commit ee3a5f9e3d9b ("KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries") must stay: the supplied CPUID data is tweaked by KVM first (__kvm_update_cpuid_runtime()) and checked later (kvm_check_cpuid()) and the actual data (vcpu->arch.cpuid_*, vcpu->arch.kvm_cpuid, vcpu->arch.xen.cpuid,..) is only updated on success. Switch to searching for KVM_SIGNATURE in the supplied CPUID data to discover KVM PV feature entry instead of using stale 'vcpu->arch.kvm_cpuid'. While on it, drop pointless "&& (best->eax & (1 << KVM_FEATURE_PV_UNHALT)" check when clearing KVM_FEATURE_PV_UNHALT bit. Fixes: ee3a5f9e3d9b ("KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries") Reported-and-tested-by: Li RongQing Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240228101837.93642-3-vkuznets@redhat.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 30d82fd58d189..08bf0bf36a261 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -171,22 +171,22 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) } } -static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, - struct kvm_cpuid_entry2 *entries, int nent) +static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries, + int nent, u32 kvm_cpuid_base) { - u32 base = vcpu->arch.kvm_cpuid_base; - - if (!base) - return NULL; - - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, + return cpuid_entry2_find(entries, nent, kvm_cpuid_base | KVM_CPUID_FEATURES, KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) { - return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent); + u32 base = vcpu->arch.kvm_cpuid_base; + + if (!base) + return NULL; + + return __kvm_find_kvm_cpuid_features(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, base); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) @@ -220,6 +220,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e int nent) { struct kvm_cpuid_entry2 *best; + struct kvm_hypervisor_cpuid kvm_cpuid; u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); @@ -247,10 +248,12 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); - if (kvm_hlt_in_guest(vcpu->kvm) && best && - (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) - best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + kvm_cpuid = __kvm_get_hypervisor_cpuid(entries, nent, KVM_SIGNATURE); + if (kvm_cpuid.base) { + best = __kvm_find_kvm_cpuid_features(entries, nent, kvm_cpuid.base); + if (kvm_hlt_in_guest(vcpu->kvm) && best) + best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + } if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); From 25c7a9e9b0f4936da9bddec890c62c55a2b872e8 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:46 +0800 Subject: [PATCH 115/120] KVM: x86/pmu: Expose CPUIDs feature bits PDCM, DS, DTES64 commit cf8e55fe50df0c0292b389a165daa81193cd39d1 upstream. The CPUID features PDCM, DS and DTES64 are required for PEBS feature. KVM would expose CPUID feature PDCM, DS and DTES64 to guest when PEBS is supported in the KVM on the Ice Lake server platforms. Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Message-Id: <20220411101946.20262-18-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/capabilities.h | 28 +++++++++++++++++----------- arch/x86/kvm/vmx/vmx.c | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 14a9eddc499d6..c1bd0fb4e3e93 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -7,6 +7,7 @@ #include "lapic.h" #include "x86.h" #include "cpuid.h" +#include "pmu.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -391,26 +392,31 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } -static inline u64 vmx_get_perf_capabilities(void) +static inline bool vmx_pebs_supported(void) { - u64 perf_cap = 0; + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; +} - if (!enable_pmu) - return perf_cap; +static inline u64 vmx_get_perf_capabilities(void) +{ + u64 perf_cap = PMU_CAP_FW_WRITES; + u64 host_perf_cap = 0; if (!enable_pmu) return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - perf_cap &= PMU_CAP_LBR_FMT; + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; - /* - * Since counters are virtualized, KVM would support full - * width counting unconditionally, even if the host lacks it. - */ - return PMU_CAP_FW_WRITES | perf_cap; + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; } static inline u64 vmx_supported_debugctl(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f2d1104d69d6d..e71521251f3ce 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2334,6 +2334,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!intel_pmu_lbr_is_compatible(vcpu)) return 1; } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } ret = kvm_set_msr_common(vcpu, msr_info); break; @@ -7604,6 +7615,10 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } if (!enable_pmu) kvm_cpu_cap_clear(X86_FEATURE_PDCM); From dc8fd6c76f2e18002f6b49f03eb348579091c0d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Mar 2024 16:58:33 -0800 Subject: [PATCH 116/120] KVM: x86/pmu: Disable support for adaptive PEBS commit 9e985cbf2942a1bb8fcef9adc2a17d90fd7ca8ee upstream. Drop support for virtualizing adaptive PEBS, as KVM's implementation is architecturally broken without an obvious/easy path forward, and because exposing adaptive PEBS can leak host LBRs to the guest, i.e. can leak host kernel addresses to the guest. Bug #1 is that KVM doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL when (re)programming fixed counters, e.g fixed_ctrl_field() drops the upper bits, reprogram_fixed_counters() stores local variables as u8s and truncates the upper bits too, etc. Bug #2 is that, because KVM _always_ sets precise_ip to a non-zero value for PEBS events, perf will _always_ generate an adaptive record, even if the guest requested a basic record. Note, KVM will also enable adaptive PEBS in individual *counter*, even if adaptive PEBS isn't exposed to the guest, but this is benign as MSR_PEBS_DATA_CFG is guaranteed to be zero, i.e. the guest will only ever see Basic records. Bug #3 is in perf. intel_pmu_disable_fixed() doesn't clear the upper bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set, and intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either. I.e. perf _always_ enables ADAPTIVE counters, regardless of what KVM requests. Bug #4 is that adaptive PEBS *might* effectively bypass event filters set by the host, as "Updated Memory Access Info Group" records information that might be disallowed by userspace via KVM_SET_PMU_EVENT_FILTER. Bug #5 is that KVM doesn't ensure LBR MSRs hold guest values (or at least zeros) when entering a vCPU with adaptive PEBS, which allows the guest to read host LBRs, i.e. host RIPs/addresses, by enabling "LBR Entries" records. Disable adaptive PEBS support as an immediate fix due to the severity of the LBR leak in particular, and because fixing all of the bugs will be non-trivial, e.g. not suitable for backporting to stable kernels. Note! This will break live migration, but trying to make KVM play nice with live migration would be quite complicated, wouldn't be guaranteed to work (i.e. KVM might still kill/confuse the guest), and it's not clear that there are any publicly available VMMs that support adaptive PEBS, let alone live migrate VMs that support adaptive PEBS, e.g. QEMU doesn't support PEBS in any capacity. [Backport changes] Retain changes in capabilities.h for vmx_get_perf_capabilities() Upstream changes were made in arch/x86/kvm/vmx/vmx.c. For backport compatibility, the changes are applied in arch/x86/kvm/vmx/capabilities.h within vmx_get_perf_capabilities(). Link: https://lore.kernel.org/all/20240306230153.786365-1-seanjc@google.com Link: https://lore.kernel.org/all/ZeepGjHCeSfadANM@google.com Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Cc: stable@vger.kernel.org Cc: Like Xu Cc: Mingwei Zhang Cc: Zhenyu Wang Cc: Zhang Xiong Cc: Lv Zhiyuan Cc: Dapeng Mi Cc: Jim Mattson Acked-by: Like Xu Link: https://lore.kernel.org/r/20240307005833.827147-1-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/vmx/capabilities.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index c1bd0fb4e3e93..28800f778f9e7 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -412,8 +412,28 @@ static inline u64 vmx_get_perf_capabilities(void) if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; - if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) - perf_cap &= ~PERF_CAP_PEBS_BASELINE; + + /* + * Disallow adaptive PEBS as it is functionally broken, can be + * used by the guest to read *host* LBRs, and can be used to + * bypass userspace event filters. To correctly and safely + * support adaptive PEBS, KVM needs to: + * + * 1. Account for the ADAPTIVE flag when (re)programming fixed + * counters. + * + * 2. Gain support from perf (or take direct control of counter + * programming) to support events without adaptive PEBS + * enabled for the hardware counter. + * + * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with + * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. + * + * 4. Document which PMU events are effectively exposed to the + * guest via adaptive PEBS, and make adaptive PEBS mutually + * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. + */ + perf_cap &= ~PERF_CAP_PEBS_BASELINE; } return perf_cap; From b40d73f1235daa7efcf9b5ea5888c8c56a67744b Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:20 +0000 Subject: [PATCH 117/120] KVM: x86: Fix errant brace in KVM capability handling commit 1c4dc57328bf218e999951824dce75c6125c4f3c upstream. The braces around the KVM_CAP_XSAVE2 block also surround the KVM_CAP_PMU_CAPABILITY block, likely the result of a merge issue. Simply move the curly brace back to where it belongs. Fixes: ba7bb663f5547 ("KVM: x86: Provide per VM capability for disabling PMU virtualization") Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-8-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1231ccea0c990..4af97530ee451 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4416,10 +4416,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; + } case KVM_CAP_PMU_CAPABILITY: r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; - } default: break; } From c558c52307481518a10c7e8391314ed60187fe25 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Fri, 6 Jan 2023 10:35:59 +0000 Subject: [PATCH 118/120] KVM: x86/cpuid: generalize kvm_update_kvm_cpuid_base() and also capture limit commit 48639df8a9e3a1dfcb5880f77556681bd1e53657 upstream. A subsequent patch will need to acquire the CPUID leaf range for emulated Xen so explicitly pass the signature of the hypervisor we're interested in to the new function. Also introduce a new kvm_hypervisor_cpuid structure so we can neatly store both the base and limit leaf indices. Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20230106103600.528-2-pdurrant@amazon.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 7 ++++++- arch/x86/kvm/cpuid.c | 24 +++++++++++++----------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 708d488adfbe0..4f7c9eb032a13 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -621,6 +621,11 @@ struct kvm_vcpu_hv { } cpuid_cache; }; +struct kvm_hypervisor_cpuid { + u32 base; + u32 limit; +}; + /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -757,7 +762,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; - u32 kvm_cpuid_base; + struct kvm_hypervisor_cpuid kvm_cpuid; bool is_amd_compatible; u64 reserved_gpa_bits; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 08bf0bf36a261..596ba88d9d38d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -145,15 +145,15 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } -static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) { - u32 function; + struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; + u32 base; - vcpu->arch.kvm_cpuid_base = 0; - - for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function); + for_each_possible_hypervisor_cpuid_base(base) { + entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { u32 signature[3]; @@ -162,13 +162,15 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) signature[1] = entry->ecx; signature[2] = entry->edx; - BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE)); - if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) { - vcpu->arch.kvm_cpuid_base = function; + if (!memcmp(signature, sig, sizeof(signature))) { + cpuid.base = base; + cpuid.limit = entry->eax; break; } } } + + return cpuid; } static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries, @@ -180,7 +182,7 @@ static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_e static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) { - u32 base = vcpu->arch.kvm_cpuid_base; + u32 base = vcpu->arch.kvm_cpuid.base; if (!base) return NULL; @@ -364,7 +366,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = nent; - kvm_update_kvm_cpuid_base(vcpu); + vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); kvm_vcpu_after_set_cpuid(vcpu); return 0; From 355bcda44d4bb996c4c46e09c94d93b3d3827bd7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Feb 2024 11:18:35 +0100 Subject: [PATCH 119/120] KVM: x86: Introduce __kvm_get_hypervisor_cpuid() helper commit 92e82cf632e85474e1e19a6f1cae813e87b07965 upstream. Similar to kvm_find_kvm_cpuid_features()/__kvm_find_kvm_cpuid_features(), introduce a helper to search for the specific hypervisor signature in any struct kvm_cpuid_entry2 array, not only in vcpu->arch.cpuid_entries. No functional change intended. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240228101837.93642-2-vkuznets@redhat.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 596ba88d9d38d..f2740bd966c00 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -145,15 +145,15 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } -static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, - const char *sig) +static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_entry2 *entries, + int nent, const char *sig) { struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; u32 base; for_each_possible_hypervisor_cpuid_base(base) { - entry = kvm_find_cpuid_entry(vcpu, base); + entry = cpuid_entry2_find(entries, nent, base, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (entry) { u32 signature[3]; @@ -173,6 +173,13 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp return cpuid; } +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) +{ + return __kvm_get_hypervisor_cpuid(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, sig); +} + static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries, int nent, u32 kvm_cpuid_base) { From e7235adbfc723e78d7f6dbf5f8d7e2dd80a04dbe Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:45 +0800 Subject: [PATCH 120/120] KVM: x86/cpuid: Refactor host/guest CPU model consistency check commit 59cc99f6e971bb24b40e27f695daab98e2eff4b8 upstream. For the same purpose, the leagcy intel_pmu_lbr_is_compatible() can be renamed for reuse by more callers, and remove the comment about LBR use case can be deleted by the way. Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-17-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: PvsNarasimha --- arch/x86/kvm/cpuid.h | 5 +++++ arch/x86/kvm/vmx/pmu_intel.c | 12 +----------- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 1 - 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 134f00aae8ce8..18fd2e845989a 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -157,6 +157,11 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) return x86_model(best->eax); } +static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu) +{ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5d42193c7a70f..07d3dad405448 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -142,16 +142,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) -{ - /* - * As a first step, a guest could only enable LBR feature if its - * cpu model is the same as the host because the LBR registers - * would be pass-through to the guest and they're model specific. - */ - return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); -} - bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) { struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); @@ -542,7 +532,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) nested_vmx_pmu_refresh(vcpu, intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); - if (intel_pmu_lbr_is_compatible(vcpu)) + if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); else lbr_desc->records.nr = 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e71521251f3ce..325473beaa8f1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2331,7 +2331,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; - if (!intel_pmu_lbr_is_compatible(vcpu)) + if (!cpuid_model_is_consistent(vcpu)) return 1; } if (data & PERF_CAP_PEBS_FORMAT) { diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 1196aa6ddf45d..41b3eaf93988b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -96,7 +96,6 @@ union vmx_exit_reason { #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);