diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 0fba3758d0da8..6c3904bc4670c 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -484,11 +484,14 @@ Spectre variant 2 Systems which support enhanced IBRS (eIBRS) enable IBRS protection once at boot, by setting the IBRS bit, and they're automatically protected against - Spectre v2 variant attacks, including cross-thread branch target injections - on SMT systems (STIBP). In other words, eIBRS enables STIBP too. + Spectre v2 variant attacks. - Legacy IBRS systems clear the IBRS bit on exit to userspace and - therefore explicitly enable STIBP for that + On Intel's enhanced IBRS systems, this includes cross-thread branch target + injections on SMT systems (STIBP). In other words, Intel eIBRS enables + STIBP, too. + + AMD Automatic IBRS does not protect userspace, and Legacy IBRS systems clear + the IBRS bit on exit to userspace, therefore both explicitly enable STIBP. The retpoline mitigation is turned on by default on vulnerable CPUs. It can be forced on or off by the administrator @@ -622,9 +625,9 @@ kernel command line. retpoline,generic Retpolines retpoline,lfence LFENCE; indirect branch retpoline,amd alias for retpoline,lfence - eibrs enhanced IBRS - eibrs,retpoline enhanced IBRS + Retpolines - eibrs,lfence enhanced IBRS + LFENCE + eibrs Enhanced/Auto IBRS + eibrs,retpoline Enhanced/Auto IBRS + Retpolines + eibrs,lfence Enhanced/Auto IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0c0eede472ac8..ac58b1b82fff7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5403,9 +5403,9 @@ retpoline,generic - Retpolines retpoline,lfence - LFENCE; indirect branch retpoline,amd - alias for retpoline,lfence - eibrs - enhanced IBRS - eibrs,retpoline - enhanced IBRS + Retpolines - eibrs,lfence - enhanced IBRS + LFENCE + eibrs - Enhanced/Auto IBRS + eibrs,retpoline - Enhanced/Auto IBRS + Retpolines + eibrs,lfence - Enhanced/Auto IBRS + LFENCE ibrs - use IBRS to protect kernel Not specifying this option is equivalent to diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index cdd146ce94979..cdb72015843e1 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7424,6 +7424,28 @@ of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace the hypercalls whose corresponding bit is in the argument, and return ENOSYS for the others. +8.35 KVM_CAP_PMU_CAPABILITY +--------------------------- + +:Capability: KVM_CAP_PMU_CAPABILITY +:Architectures: x86 +:Type: vm +:Parameters: arg[0] is bitmask of PMU virtualization capabilities. +:Returns: 0 on success, -EINVAL when arg[0] contains invalid bits + +This capability alters PMU virtualization in KVM. + +Calling KVM_CHECK_EXTENSION for this capability returns a bitmask of +PMU virtualization capabilities that can be adjusted on a VM. + +The argument to KVM_ENABLE_CAP is also a bitmask and selects specific +PMU virtualization capabilities to be applied to the VM. This can +only be invoked on a VM prior to the creation of VCPUs. + +At this time, KVM_PMU_CAP_DISABLE is the only capability. Setting +this capability will disable PMU virtualization for that VM. Usermode +should adjust CPUID leaf 0xA to reflect that the PMU is disabled. 
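As an illustration only (not part of this patch), a minimal userspace sketch
of the flow described above might look as follows. The ``vm_fd`` handle and
the ``disable_vm_pmu()`` helper are assumptions made for the example, and
error handling is omitted::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /*
   * Hypothetical helper: disable PMU virtualization for a freshly created
   * VM.  Must run before the first KVM_CREATE_VCPU on this VM.
   */
  static int disable_vm_pmu(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_PMU_CAPABILITY,
                  .args[0] = KVM_PMU_CAP_DISABLE,
          };

          /* Query the bitmask of adjustable PMU capabilities. */
          if (!(ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PMU_CAPABILITY) &
                KVM_PMU_CAP_DISABLE))
                  return -1;

          /* Apply the selected capabilities to the VM. */
          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }

After this succeeds, userspace would also clear CPUID leaf 0xA in the vCPUs'
CPUID so the guest does not see a PMU it cannot use.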
+ 9. Known KVM API problems ========================= diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 5d27da3568361..33469fa9e7938 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -257,3 +257,9 @@ time it will be set using the Dirty tracking mechanism described above. wakeup notification event since external interrupts from the assigned devices happens, we will find the vCPU on the list to wakeup. + +``vendor_module_lock`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:Type: mutex +:Arch: x86 +:Protects: loading a vendor module (kvm_amd or kvm_intel) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b1cc8111bc6db..5e96175ef9fb2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -765,6 +765,16 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); void kvm_perf_init(void); void kvm_perf_teardown(void); +/* + * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event, + * arrived in guest context. For arm64, any event that arrives while a vCPU is + * loaded is considered to be "in guest". + */ +static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu) +{ + return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu; +} + long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); void kvm_update_stolen_time(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e6e8afe256022..587236240fd55 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -505,6 +505,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return vcpu_mode_priv(vcpu); } +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return *vcpu_pc(vcpu); +} + /* Just ensure a guest exit from a particular CPU */ static void exit_vm_noop(void *info) { diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ddd5817042f7a..a6bf3686d023d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -3970,31 +3971,98 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } +/* + * Currently, the only caller of this function is the atomic_switch_perf_msrs(). + * The host perf conext helps to prepare the values of the real hardware for + * a set of msrs that need to be switched atomically in a vmx transaction. + * + * For example, the pseudocode needed to add a new msr should look like: + * + * arr[(*nr)++] = (struct perf_guest_switch_msr){ + * .msr = the hardware msr address, + * .host = the value the hardware has when it doesn't run a guest, + * .guest = the value the hardware has when it runs a guest, + * }; + * + * These values have nothing to do with the emulated values the guest sees + * when it uses {RD,WR}MSR, which should be handled by the KVM context, + * specifically in the intel_pmu_{get,set}_msr(). 
+ */ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); + u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; + int global_ctrl, pebs_enable; - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; - arr[0].guest &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); - *nr = 1; + /* + * In addition to obeying exclude_guest/exclude_host, remove bits being + * used for PEBS when running a guest, because PEBS writes to virtual + * addresses (not physical addresses). + */ + *nr = 0; + global_ctrl = (*nr)++; + arr[global_ctrl] = (struct perf_guest_switch_msr){ + .msr = MSR_CORE_PERF_GLOBAL_CTRL, + .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask, + .guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask, + }; - if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) { - /* - * If PMU counter has PEBS enabled it is not enough to - * disable counter on a guest entry since PEBS memory - * write can overshoot guest entry and corrupt guest - * memory. Disabling PEBS solves the problem. - * - * Don't do this if the CPU already enforces it. - */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - *nr = 2; + if (!x86_pmu.pebs) + return arr; + + /* + * If PMU counter has PEBS enabled it is not enough to + * disable counter on a guest entry since PEBS memory + * write can overshoot guest entry and corrupt guest + * memory. Disabling PEBS solves the problem. + * + * Don't do this if the CPU already enforces it. + */ + if (x86_pmu.pebs_no_isolation) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled, + .guest = 0, + }; + return arr; + } + + if (!kvm_pmu || !x86_pmu.pebs_ept) + return arr; + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_DS_AREA, + .host = (unsigned long)cpuc->ds, + .guest = kvm_pmu->ds_area, + }; + + if (x86_pmu.intel_cap.pebs_baseline) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_PEBS_DATA_CFG, + .host = cpuc->pebs_data_cfg, + .guest = kvm_pmu->pebs_data_cfg, + }; + } + + pebs_enable = (*nr)++; + + arr[pebs_enable] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, + .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, + }; + + if (arr[pebs_enable].host) { + /* Disable guest PEBS if host PEBS is enabled. */ + arr[pebs_enable].guest = 0; + } else { + /* Disable guest PEBS thoroughly for cross-mapped PEBS counters. */ + arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + arr[global_ctrl].guest &= ~kvm_pmu->host_cross_mapped_mask; + /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ + arr[global_ctrl].guest |= arr[pebs_enable].guest; } return arr; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index bef9b899727e1..766ac61355f2d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -444,6 +444,8 @@ * Reuse free bits when adding new feature flags! 
*/ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ +#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ +#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ /* * BUG word(s) diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h new file mode 100644 index 0000000000000..70b2db18165eb --- /dev/null +++ b/arch/x86/include/asm/cpuid.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * CPUID-related helpers/definitions + * + * Derived from arch/x86/kvm/cpuid.c + */ + +#ifndef _ASM_X86_CPUID_H +#define _ASM_X86_CPUID_H + +static __always_inline bool cpuid_function_is_indexed(u32 function) +{ + switch (function) { + case 4: + case 7: + case 0xb: + case 0xd: + case 0xf: + case 0x10: + case 0x12: + case 0x14: + case 0x17: + case 0x18: + case 0x1d: + case 0x1e: + case 0x1f: + case 0x8000001d: + return true; + } + + return false; +} + +#endif /* _ASM_X86_CPUID_H */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 56cf3950a2660..bf8705c566109 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -1,30 +1,31 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL) +#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_OPTIONAL) BUILD_BUG_ON(1) #endif /* - * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate - * "static_call()"s. They are also intended for use when defining - * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those - * functions that follow the [svm|vmx]_func_name convention. - * KVM_X86_OP_NULL() can leave a NULL definition for the - * case where there is no definition or a function name that - * doesn't match the typical naming convention is supplied. + * KVM_X86_OP() and KVM_X86_OP_OPTIONAL() are used to help generate + * both DECLARE/DEFINE_STATIC_CALL() invocations and + * "static_call_update()" calls. + * + * KVM_X86_OP_OPTIONAL() can be used for those functions that can have + * a NULL definition, for example if "static_call_cond()" will be used + * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise + * to make a definition optional, but in this case the default will + * be __static_call_return0. 
*/ -KVM_X86_OP_NULL(hardware_enable) -KVM_X86_OP_NULL(hardware_disable) -KVM_X86_OP_NULL(hardware_unsetup) -KVM_X86_OP_NULL(cpu_has_accelerated_tpr) +KVM_X86_OP(hardware_enable) +KVM_X86_OP(hardware_disable) +KVM_X86_OP(hardware_unsetup) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) -KVM_X86_OP_NULL(vm_destroy) -KVM_X86_OP_NULL(vcpu_precreate) +KVM_X86_OP_OPTIONAL(vm_destroy) +KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) -KVM_X86_OP(prepare_guest_switch) +KVM_X86_OP(prepare_switch_to_guest) KVM_X86_OP(vcpu_load) KVM_X86_OP(vcpu_put) KVM_X86_OP(update_exception_bitmap) @@ -34,9 +35,10 @@ KVM_X86_OP(get_segment_base) KVM_X86_OP(get_segment) KVM_X86_OP(get_cpl) KVM_X86_OP(set_segment) -KVM_X86_OP_NULL(get_cs_db_l_bits) +KVM_X86_OP(get_cs_db_l_bits) KVM_X86_OP(is_valid_cr0) KVM_X86_OP(set_cr0) +KVM_X86_OP_OPTIONAL(post_set_cr3) KVM_X86_OP(is_valid_cr4) KVM_X86_OP(set_cr4) KVM_X86_OP(set_efer) @@ -50,21 +52,21 @@ KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) -KVM_X86_OP(tlb_flush_all) -KVM_X86_OP(tlb_flush_current) -KVM_X86_OP_NULL(tlb_remote_flush) -KVM_X86_OP_NULL(tlb_remote_flush_with_range) -KVM_X86_OP(tlb_flush_gva) -KVM_X86_OP(tlb_flush_guest) -KVM_X86_OP(run) -KVM_X86_OP_NULL(handle_exit) -KVM_X86_OP_NULL(skip_emulated_instruction) -KVM_X86_OP_NULL(update_emulated_instruction) +KVM_X86_OP(flush_tlb_all) +KVM_X86_OP(flush_tlb_current) +KVM_X86_OP_OPTIONAL(tlb_remote_flush) +KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range) +KVM_X86_OP(flush_tlb_gva) +KVM_X86_OP(flush_tlb_guest) +KVM_X86_OP(vcpu_run) +KVM_X86_OP(handle_exit) +KVM_X86_OP(skip_emulated_instruction) +KVM_X86_OP_OPTIONAL(update_emulated_instruction) KVM_X86_OP(set_interrupt_shadow) KVM_X86_OP(get_interrupt_shadow) KVM_X86_OP(patch_hypercall) -KVM_X86_OP(set_irq) -KVM_X86_OP(set_nmi) +KVM_X86_OP(inject_irq) +KVM_X86_OP(inject_nmi) KVM_X86_OP(queue_exception) KVM_X86_OP(cancel_injection) KVM_X86_OP(interrupt_allowed) @@ -73,22 +75,22 @@ KVM_X86_OP(get_nmi_mask) KVM_X86_OP(set_nmi_mask) KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) -KVM_X86_OP(update_cr8_intercept) +KVM_X86_OP_OPTIONAL(update_cr8_intercept) KVM_X86_OP(check_apicv_inhibit_reasons) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP(hwapic_irr_update) KVM_X86_OP(hwapic_isr_update) -KVM_X86_OP_NULL(guest_apic_has_interrupt) +KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt) KVM_X86_OP(load_eoi_exitmap) KVM_X86_OP(set_virtual_apic_mode) -KVM_X86_OP_NULL(set_apic_access_page_addr) +KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) KVM_X86_OP(deliver_posted_interrupt) -KVM_X86_OP_NULL(sync_pir_to_irr) -KVM_X86_OP(set_tss_addr) -KVM_X86_OP(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL(sync_pir_to_irr) +KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) +KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) -KVM_X86_OP_NULL(has_wbinvd_exit) +KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) KVM_X86_OP(get_l2_tsc_multiplier) KVM_X86_OP(write_tsc_offset) @@ -96,35 +98,34 @@ KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) -KVM_X86_OP_NULL(request_immediate_exit) +KVM_X86_OP(request_immediate_exit) KVM_X86_OP(sched_in) -KVM_X86_OP_NULL(update_cpu_dirty_logging) -KVM_X86_OP_NULL(pre_block) -KVM_X86_OP_NULL(post_block) -KVM_X86_OP_NULL(vcpu_blocking) 
-KVM_X86_OP_NULL(vcpu_unblocking) -KVM_X86_OP_NULL(update_pi_irte) -KVM_X86_OP_NULL(start_assignment) -KVM_X86_OP_NULL(apicv_post_state_restore) -KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) -KVM_X86_OP_NULL(set_hv_timer) -KVM_X86_OP_NULL(cancel_hv_timer) +KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) +KVM_X86_OP_OPTIONAL(vcpu_blocking) +KVM_X86_OP_OPTIONAL(vcpu_unblocking) +KVM_X86_OP_OPTIONAL(pi_update_irte) +KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP(apicv_post_state_restore) +KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) +KVM_X86_OP_OPTIONAL(set_hv_timer) +KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) -KVM_X86_OP_NULL(mem_enc_op) -KVM_X86_OP_NULL(mem_enc_reg_region) -KVM_X86_OP_NULL(mem_enc_unreg_region) -KVM_X86_OP_NULL(guest_memory_reclaimed) +KVM_X86_OP_OPTIONAL(mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(mem_enc_register_region) +KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) +KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) -KVM_X86_OP_NULL(enable_direct_tlbflush) -KVM_X86_OP_NULL(migrate_timers) +KVM_X86_OP_OPTIONAL(enable_direct_tlbflush) +KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) -KVM_X86_OP_NULL(complete_emulated_msr) +KVM_X86_OP(complete_emulated_msr) #undef KVM_X86_OP -#undef KVM_X86_OP_NULL +#undef KVM_X86_OP_OPTIONAL +#undef KVM_X86_OP_OPTIONAL_RET0 diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h new file mode 100644 index 0000000000000..b2f2b8c21b5b9 --- /dev/null +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_X86_PMU_OP) || !defined(KVM_X86_PMU_OP_OPTIONAL) +BUILD_BUG_ON(1) +#endif + +/* + * KVM_X86_PMU_OP() and KVM_X86_PMU_OP_OPTIONAL() are used to help generate + * both DECLARE/DEFINE_STATIC_CALL() invocations and + * "static_call_update()" calls. + * + * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have + * a NULL definition, for example if "static_call_cond()" will be used + * at the call sites. + */ +KVM_X86_PMU_OP(pmc_perf_hw_id) +KVM_X86_PMU_OP(pmc_idx_to_pmc) +KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) +KVM_X86_PMU_OP(msr_idx_to_pmc) +KVM_X86_PMU_OP(is_valid_rdpmc_ecx) +KVM_X86_PMU_OP(is_valid_msr) +KVM_X86_PMU_OP(get_msr) +KVM_X86_PMU_OP(set_msr) +KVM_X86_PMU_OP(refresh) +KVM_X86_PMU_OP(init) +KVM_X86_PMU_OP(reset) +KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) +KVM_X86_PMU_OP_OPTIONAL(cleanup) + +#undef KVM_X86_PMU_OP +#undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5f51700b938ba..4f7c9eb032a13 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -484,19 +484,27 @@ enum pmc_type { struct kvm_pmc { enum pmc_type type; u8 idx; + bool is_paused; + bool intr; u64 counter; + u64 prev_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* + * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters. 
*/ u64 current_config; - bool is_paused; }; +/* More counters may conflict with other existing Architectural MSRs */ +#define KVM_INTEL_PMC_MAX_GENERIC 8 +#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) +#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define KVM_PMC_MAX_FIXED 3 +#define KVM_AMD_PMC_MAX_GENERIC 6 struct kvm_pmu { unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; @@ -505,20 +513,33 @@ struct kvm_pmu { u64 fixed_ctr_ctrl_mask; u64 global_ctrl; u64 global_status; - u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; - u64 global_ovf_ctrl_mask; + u64 global_status_mask; u64 reserved_bits; u64 raw_event_mask; u8 version; - struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; + struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; - struct irq_work irq_work; DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + u64 ds_area; + u64 pebs_enable; + u64 pebs_enable_mask; + u64 pebs_data_cfg; + u64 pebs_data_cfg_mask; + + /* + * If a guest counter is cross-mapped to host counter with different + * index, its PEBS capability will be temporarily disabled. + * + * The user should make sure that this mask is updated + * after disabling interrupts and before perf_guest_get_msrs(); + */ + u64 host_cross_mapped_mask; + /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. @@ -600,6 +621,11 @@ struct kvm_vcpu_hv { } cpuid_cache; }; +struct kvm_hypervisor_cpuid { + u32 base; + u32 limit; +}; + /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -736,7 +762,8 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; - u32 kvm_cpuid_base; + struct kvm_hypervisor_cpuid kvm_cpuid; + bool is_amd_compatible; u64 reserved_gpa_bits; int maxphyaddr; @@ -785,6 +812,7 @@ struct kvm_vcpu_arch { unsigned nmi_pending; /* NMI queued after currently running handler */ bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ + u8 handling_intr_from_guest; struct kvm_mtrr mtrr_state; u64 pat; @@ -1157,6 +1185,7 @@ struct kvm_arch { u32 notify_window; u32 notify_vmexit_flags; + bool enable_pmu; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace @@ -1330,7 +1359,6 @@ struct kvm_x86_ops { int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); - bool (*cpu_has_accelerated_tpr)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); @@ -1344,7 +1372,7 @@ struct kvm_x86_ops { void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); + void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); @@ -1361,6 +1389,7 @@ struct kvm_x86_ops { bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned 
long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -1374,8 +1403,8 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); bool (*get_if_flag)(struct kvm_vcpu *vcpu); - void (*tlb_flush_all)(struct kvm_vcpu *vcpu); - void (*tlb_flush_current)(struct kvm_vcpu *vcpu); + void (*flush_tlb_all)(struct kvm_vcpu *vcpu); + void (*flush_tlb_current)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); @@ -1386,15 +1415,15 @@ struct kvm_x86_ops { * Can potentially get non-canonical addresses through INVLPGs, which * the implementation may choose to ignore if appropriate. */ - void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); + void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr); /* * Flush any TLB entries created by the guest. Like tlb_flush_gva(), * does not need to flush GPA->HPA mappings. */ - void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); - enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1403,8 +1432,8 @@ struct kvm_x86_ops { u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*set_irq)(struct kvm_vcpu *vcpu); - void (*set_nmi)(struct kvm_vcpu *vcpu); + void (*inject_irq)(struct kvm_vcpu *vcpu); + void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); @@ -1462,28 +1491,14 @@ struct kvm_x86_ops { int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); - /* pmu operations of sub-arch */ - const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; - /* - * Architecture specific hooks for vCPU blocking due to - * HLT instruction. - * Returns for .pre_block(): - * - 0 means continue to block the vCPU. - * - 1 means we cannot block the vCPU since some event - * happens during this period, such as, 'ON' bit in - * posted-interrupts descriptor is set. 
- */ - int (*pre_block)(struct kvm_vcpu *vcpu); - void (*post_block)(struct kvm_vcpu *vcpu); - void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); - int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); - void (*start_assignment)(struct kvm *kvm); + void (*pi_start_assignment)(struct kvm *kvm); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); @@ -1498,9 +1513,9 @@ struct kvm_x86_ops { int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); void (*enable_smi_window)(struct kvm_vcpu *vcpu); - int (*mem_enc_op)(struct kvm *kvm, void __user *argp); - int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); - int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); + int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); @@ -1542,9 +1557,10 @@ struct kvm_x86_init_ops { int (*disabled_by_bios)(void); int (*check_processor_compatibility)(void); int (*hardware_setup)(void); - bool (*intel_pt_intr_in_guest)(void); + unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; + struct kvm_pmu_ops *pmu_ops; }; struct kvm_arch_async_pf { @@ -1562,16 +1578,12 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include -static inline void kvm_ops_static_call_update(void) -{ -#define KVM_X86_OP(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP_NULL KVM_X86_OP -#include -} +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); +void kvm_x86_vendor_exit(void); #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) @@ -1590,6 +1602,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) return -ENOTSUPP; } +#define kvm_arch_pmi_in_guest(vcpu) \ + ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + void __init kvm_mmu_x86_module_init(void); int kvm_mmu_vendor_module_init(void); void kvm_mmu_vendor_module_exit(void); @@ -1922,8 +1937,6 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); -unsigned int kvm_guest_state(void); - void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 45aa98bf804f9..9c0194109482c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -25,6 +25,7 @@ #define _EFER_SVME 12 /* Enable virtualization */ #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ +#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ #define EFER_SCE (1<<_EFER_SCE) #define EFER_LME (1<<_EFER_LME) @@ -33,6 +34,7 @@ #define EFER_SVME (1<<_EFER_SVME) #define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR 
(1<<_EFER_FFXSR) +#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) /* Intel MSRs. Some also available on other CPUs */ @@ -254,6 +256,12 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7e0e1889ce0b3..9d0d460508eba 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1288,19 +1288,21 @@ spectre_v2_user_select_mitigation(void) } /* - * If no STIBP, enhanced IBRS is enabled, or SMT impossible, STIBP + * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP * is not required. * - * Enhanced IBRS also protects against cross-thread branch target + * Intel's Enhanced IBRS also protects against cross-thread branch target * injection in user-mode as the IBRS bit remains always set which * implicitly enables cross-thread protections. However, in legacy IBRS * mode, the IBRS bit is set only on kernel entry and cleared on return - * to userspace. This disables the implicit cross-thread protection, - * so allow for STIBP to be selected in that case. + * to userspace. AMD Automatic IBRS also does not protect userspace. + * These modes therefore disable the implicit cross-thread protection, + * so allow for STIBP to be selected in those cases. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) return; /* @@ -1330,9 +1332,9 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", - [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", - [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", - [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; @@ -1401,7 +1403,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -1586,8 +1588,12 @@ static void __init spectre_v2_select_mitigation(void) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); if (spectre_v2_in_ibrs_mode(mode)) { - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - update_spec_ctrl(x86_spec_ctrl_base); + if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { + msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); + } else { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + update_spec_ctrl(x86_spec_ctrl_base); + } } switch (mode) { @@ -1671,8 +1677,8 @@ static void __init spectre_v2_select_mitigation(void) /* * Retpoline protects the kernel, but doesn't protect firmware. 
IBRS * and Enhanced IBRS protect firmware too, so enable IBRS around - * firmware calls only when IBRS / Enhanced IBRS aren't otherwise - * enabled. + * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't + * otherwise enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if @@ -2591,7 +2597,8 @@ static ssize_t mmio_stale_data_show_state(char *buf) static char *stibp_state(void) { - if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS)) return ""; switch (spectre_v2_user_stibp) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 919f391543025..359e2e3202f51 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1105,8 +1105,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* Zhaoxin Family 7 */ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), @@ -1226,8 +1226,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); - if (ia32_cap & ARCH_CAP_IBRS_ALL) + /* + * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature + * flag and protect from vendor-specific bugs via the whitelist. + */ + if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { @@ -1289,11 +1297,6 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_RETBLEED); } - if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && - !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) - setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); - if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) setup_force_cpu_bug(X86_BUG_SMT_RSB); @@ -1544,9 +1547,7 @@ void check_null_seg_clears_base(struct cpuinfo_x86 *c) if (!IS_ENABLED(CONFIG_X86_64)) return; - /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ - if (c->extended_cpuid_level >= 0x80000021 && - cpuid_eax(0x80000021) & BIT(6)) + if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE)) return; /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9e6225bab6d9a..f2740bd966c00 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -60,8 +61,17 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? 
F(name) : 0) +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() + * to avoid false positives when processing guest CPUID input. + */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull + static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( - struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; @@ -69,9 +79,31 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( for (i = 0; i < nent; i++) { e = &entries[i]; - if (e->function == function && - (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)) + if (e->function != function) + continue; + + /* + * If the index isn't significant, use the first entry with a + * matching function. It's userspace's responsibilty to not + * provide "duplicate" entries in all cases. + */ + if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index) return e; + + + /* + * Similarly, use the first matching entry if KVM is doing a + * lookup (as opposed to emulating CPUID) for a function that's + * architecturally defined as not having a significant index. + */ + if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) { + /* + * Direct lookups from KVM should not diverge from what + * KVM defines internally (the architectural behavior). + */ + WARN_ON_ONCE(cpuid_function_is_indexed(function)); + return e; + } } return NULL; @@ -88,7 +120,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. 
*/ - best = cpuid_entry2_find(entries, nent, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -112,15 +145,15 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } -static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) +static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_entry2 *entries, + int nent, const char *sig) { - u32 function; + struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; + u32 base; - vcpu->arch.kvm_cpuid_base = 0; - - for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function, 0); + for_each_possible_hypervisor_cpuid_base(base) { + entry = cpuid_entry2_find(entries, nent, base, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (entry) { u32 signature[3]; @@ -129,23 +162,40 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) signature[1] = entry->ecx; signature[2] = entry->edx; - BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE)); - if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) { - vcpu->arch.kvm_cpuid_base = function; + if (!memcmp(signature, sig, sizeof(signature))) { + cpuid.base = base; + cpuid.limit = entry->eax; break; } } } + + return cpuid; +} + +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) +{ + return __kvm_get_hypervisor_cpuid(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, sig); +} + +static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries, + int nent, u32 kvm_cpuid_base) +{ + return cpuid_entry2_find(entries, nent, kvm_cpuid_base | KVM_CPUID_FEATURES, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) { - u32 base = vcpu->arch.kvm_cpuid_base; + u32 base = vcpu->arch.kvm_cpuid.base; if (!base) return NULL; - return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); + return __kvm_find_kvm_cpuid_features(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, base); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) @@ -160,11 +210,29 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) vcpu->arch.pv_cpuid.features = best->eax; } -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +/* + * Calculate guest's supported XCR0 taking into account guest CPUID data and + * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0). 
+ */ +static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) +{ + struct kvm_cpuid_entry2 *best; + + best = cpuid_entry2_find(entries, nent, 0xd, 0); + if (!best) + return 0; + + return (best->eax | ((u64)best->edx << 32)) & supported_xcr0; +} + +static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, + int nent) { struct kvm_cpuid_entry2 *best; + struct kvm_hypervisor_cpuid kvm_cpuid; + u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -175,32 +243,54 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); } - best = kvm_find_cpuid_entry(vcpu, 7, 0); + best = cpuid_entry2_find(entries, nent, 7, 0); if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) cpuid_entry_change(best, X86_FEATURE_OSPKE, kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + best = cpuid_entry2_find(entries, nent, 0xD, 0); if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - best = kvm_find_cpuid_entry(vcpu, 0xD, 1); + best = cpuid_entry2_find(entries, nent, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = kvm_find_kvm_cpuid_features(vcpu); - if (kvm_hlt_in_guest(vcpu->kvm) && best && - (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) - best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + kvm_cpuid = __kvm_get_hypervisor_cpuid(entries, nent, KVM_SIGNATURE); + if (kvm_cpuid.base) { + best = __kvm_find_kvm_cpuid_features(entries, nent, kvm_cpuid.base); + if (kvm_hlt_in_guest(vcpu->kvm) && best) + best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + } if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } + + /* + * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate + * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's + * requested XCR0 value. The enclave's XFRM must be a subset of XCRO + * at the time of EENTER, thus adjust the allowed XFRM by the guest's + * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to + * '1' even on CPUs that don't support XSAVE. 
+ */ + best = cpuid_entry2_find(entries, nent, 0x12, 0x1); + if (best) { + best->ecx &= guest_supported_xcr0 & 0xffffffff; + best->edx &= guest_supported_xcr0 >> 32; + best->ecx |= XFEATURE_MASK_FPSSE; + } +} + +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +{ + __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); } EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); @@ -209,7 +299,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -219,30 +309,12 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); - if (!best) - vcpu->arch.guest_supported_xcr0 = 0; - else - vcpu->arch.guest_supported_xcr0 = - (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - - /* - * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate - * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's - * requested XCR0 value. The enclave's XFRM must be a subset of XCRO - * at the time of EENTER, thus adjust the allowed XFRM by the guest's - * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to - * '1' even on CPUs that don't support XSAVE. - */ - best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1); - if (best) { - best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff; - best->edx &= vcpu->arch.guest_supported_xcr0 >> 32; - best->ecx |= XFEATURE_MASK_FPSSE; - } + vcpu->arch.guest_supported_xcr0 = + cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); kvm_update_pv_runtime(vcpu); + vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); @@ -266,10 +338,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return best->eax & 0xff; not_found: @@ -291,6 +363,8 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, { int r; + __kvm_update_cpuid_runtime(vcpu, e2, nent); + r = kvm_check_cpuid(vcpu, e2, nent); if (r) return r; @@ -299,8 +373,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = nent; - kvm_update_kvm_cpuid_base(vcpu); - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); kvm_vcpu_after_set_cpuid(vcpu); return 0; @@ -410,9 +483,9 @@ static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) } static __always_inline -void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) +void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask) { - /* Use kvm_cpu_cap_mask for non-scattered leafs. */ + /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. 
*/ BUILD_BUG_ON(leaf < NCAPINTS); kvm_cpu_caps[leaf] = mask; @@ -422,7 +495,7 @@ void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) { - /* Use kvm_cpu_cap_init_scattered for scattered leafs. */ + /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */ BUILD_BUG_ON(leaf >= NCAPINTS); kvm_cpu_caps[leaf] &= mask; @@ -532,7 +605,7 @@ void kvm_set_cpu_caps(void) F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); - kvm_cpu_cap_init_scattered(CPUID_12_EAX, + kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, SF(SGX1) | SF(SGX2) ); @@ -541,7 +614,7 @@ void kvm_set_cpu_caps(void) F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT) | F(PERFCTR_CORE) + F(TOPOEXT) | 0 /* PERFCTR_CORE */ ); kvm_cpu_cap_mask(CPUID_8000_0001_EDX, @@ -597,6 +670,22 @@ void kvm_set_cpu_caps(void) 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) | F(SME_COHERENT)); + kvm_cpu_cap_mask(CPUID_8000_0021_EAX, + BIT(0) /* NO_NESTED_DATA_BP */ | + BIT(2) /* LFENCE Always serializing */ | 0 /* SmmPgCfgLock */ | + F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ + ); + + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, + F(PERFMON_V2) + ); + + if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) + kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(2) /* LFENCE Always serializing */; + if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) + kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE); + kvm_cpu_caps[CPUID_8000_0021_EAX] |= BIT(9) /* NO_SMM_CTL_MSR */; + kvm_cpu_cap_mask(CPUID_C000_0001_EDX, F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | @@ -645,31 +734,37 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, if (!entry) return NULL; + memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; - entry->flags = 0; + switch (function & 0xC0000000) { + case 0x40000000: + /* Hypervisor leaves are always synthesized by __do_cpuid_func. */ + return entry; + + case 0x80000000: + /* + * 0x80000021 is sometimes synthesized by __do_cpuid_func, which + * would result in out-of-bounds calls to do_host_cpuid. 
+ */ + { + static int max_cpuid_80000000; + if (!READ_ONCE(max_cpuid_80000000)) + WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000)); + if (function > READ_ONCE(max_cpuid_80000000)) + return entry; + } + break; + + default: + break; + } cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - switch (function) { - case 4: - case 7: - case 0xb: - case 0xd: - case 0xf: - case 0x10: - case 0x12: - case 0x14: - case 0x17: - case 0x18: - case 0x1d: - case 0x1e: - case 0x1f: - case 0x8000001d: + if (cpuid_function_is_indexed(function)) entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - break; - } return entry; } @@ -789,39 +884,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - struct x86_pmu_capability cap; union cpuid10_eax eax; union cpuid10_edx edx; - if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { + if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } - perf_get_x86_pmu_capability(&cap); + eax.split.version_id = kvm_pmu_cap.version; + eax.split.num_counters = kvm_pmu_cap.num_counters_gp; + eax.split.bit_width = kvm_pmu_cap.bit_width_gp; + eax.split.mask_length = kvm_pmu_cap.events_mask_len; + edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed; + edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed; - /* - * Only support guest architectural pmu on a host - * with architectural pmu. - */ - if (!cap.version) - memset(&cap, 0, sizeof(cap)); - - eax.split.version_id = min(cap.version, 2); - eax.split.num_counters = cap.num_counters_gp; - eax.split.bit_width = cap.bit_width_gp; - eax.split.mask_length = cap.events_mask_len; - - edx.split.num_counters_fixed = - min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED); - edx.split.bit_width_fixed = cap.bit_width_fixed; - if (cap.version) + if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; entry->eax = eax.full; - entry->ebx = cap.events_mask; + entry->ebx = kvm_pmu_cap.events_mask; entry->ecx = 0; entry->edx = edx.full; break; @@ -985,7 +1069,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; break; case 0x80000000: - entry->eax = min(entry->eax, 0x8000001f); + entry->eax = min(entry->eax, 0x80000022); + /* + * Serializing LFENCE is reported in a multitude of ways, and + * NullSegClearsBase is not reported in CPUID on Zen2; help + * userspace by providing the CPUID leaf ourselves. + * + * However, only do it if the host has CPUID leaf 0x8000001d. + * QEMU thinks that it can query the host blindly for that + * CPUID leaf if KVM reports that it supports 0x8000001d or + * above. The processor merrily returns values from the + * highest Intel leaf which QEMU tries to use as the guest's + * 0x8000001d. Even worse, this can result in an infinite + * loop if said highest leaf has no subleaves indexed by ECX. 
+ */ + if (entry->eax >= 0x8000001d && + (static_cpu_has(X86_FEATURE_LFENCE_RDTSC) + || !static_cpu_has_bug(X86_BUG_NULL_SEG))) + entry->eax = max(entry->eax, 0x80000021); break; case 0x80000001: entry->ebx &= ~GENMASK(27, 16); @@ -1066,6 +1167,35 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx &= ~GENMASK(11, 6); } break; + case 0x80000020: + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + case 0x80000021: + entry->ebx = entry->ecx = entry->edx = 0; + cpuid_entry_override(entry, CPUID_8000_0021_EAX); + break; + /* AMD Extended Performance Monitoring and Debug */ + case 0x80000022: { + union cpuid_0x80000022_ebx ebx; + + entry->ecx = entry->edx = 0; + if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { + entry->eax = entry->ebx; + break; + } + + cpuid_entry_override(entry, CPUID_8000_0022_EAX); + + if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; + else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; + else + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; + + entry->ebx = ebx.full; + break; + } /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ @@ -1198,12 +1328,20 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, return r; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) { return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, function, index); } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* @@ -1240,7 +1378,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) struct kvm_cpuid_entry2 *basic, *class; u32 function = *fn_ptr; - basic = kvm_find_cpuid_entry(vcpu, 0, 0); + basic = kvm_find_cpuid_entry(vcpu, 0); if (!basic) return NULL; @@ -1249,11 +1387,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) return NULL; if (function >= 0x40000000 && function <= 0x4fffffff) - class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00); else if (function >= 0xc0000000) - class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0); + class = kvm_find_cpuid_entry(vcpu, 0xc0000000); else - class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0x80000000); if (class && function <= class->eax) return NULL; @@ -1271,7 +1409,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) * the effective CPUID entry is the max basic leaf. Note, the index of * the original requested leaf is observed! 
*/ - return kvm_find_cpuid_entry(vcpu, basic->eax, index); + return kvm_find_cpuid_entry_index(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, @@ -1281,7 +1419,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; - entry = kvm_find_cpuid_entry(vcpu, function, index); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; if (!entry && !exact_only) { @@ -1310,7 +1448,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, * exists. EDX can be copied from any existing index. */ if (function == 0xb || function == 0x1f) { - entry = kvm_find_cpuid_entry(vcpu, function, 1); + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { *ecx = index & 0xff; *edx = entry->edx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 8a770b481d9de..18fd2e845989a 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -13,8 +13,10 @@ void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); + u32 function); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -76,7 +78,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; @@ -109,7 +111,7 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); @@ -119,15 +121,25 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } +static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.is_amd_compatible; +} + +static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu) +{ + return !guest_cpuid_is_amd_compatible(vcpu); +} + static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; @@ -138,18 +150,23 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_model(best->eax); } +static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu) +{ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 
29a96d1c7e2b8..077a7080be953 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1989,7 +1989,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; struct kvm_vcpu_hv *hv_vcpu; - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { vcpu->arch.hyperv_enabled = true; } else { @@ -2002,7 +2002,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu = to_hv_vcpu(vcpu); - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; @@ -2013,7 +2013,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.features_edx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; @@ -2022,7 +2022,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.enlightenments_ebx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; else diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c706c97f1dd32..dcb4ed5f4afba 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2425,7 +2425,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); - if (r && lvt_type == APIC_LVTPC) + if (r && lvt_type == APIC_LVTPC && + guest_cpuid_is_intel_compatible(apic->vcpu)) kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); return r; } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index be6d70ddc0b46..8858d0dc5ecde 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4351,7 +4351,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->root_level, is_efer_nx(context), guest_can_use_gbpages(vcpu), is_cr4_pse(context), - guest_cpuid_is_amd_or_hygon(vcpu)); + guest_cpuid_is_amd_compatible(vcpu)); } static void @@ -5080,7 +5080,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); kvm_mmu_load_pgd(vcpu); - static_call(kvm_x86_tlb_flush_current)(vcpu); + static_call(kvm_x86_flush_tlb_current)(vcpu); out: return r; } @@ -5343,7 +5343,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (is_noncanonical_address(gva, vcpu)) return; - static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); } if (!mmu->invlpg) @@ -5401,7 +5401,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (tlb_flush) - static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); ++vcpu->stat.invlpg; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b4e56b2556e2d..7a84d92a51532 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include "x86.h" #include "cpuid.h" @@ -22,6 +24,9 @@ /* This is enough to filter the vast majority of currently defined events. 
*/ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +struct x86_pmu_capability __read_mostly kvm_pmu_cap; +EXPORT_SYMBOL_GPL(kvm_pmu_cap); + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. @@ -42,62 +47,83 @@ * code. Each pmc, stored in kvm_pmc.idx field, is unique across * all perf counters (both gp and fixed). The mapping relationship * between pmc and perf counters is as the following: - * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters + * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters */ -static void kvm_pmi_trigger_fn(struct irq_work *irq_work) -{ - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); +static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; + +#define KVM_X86_PMU_OP(func) \ + DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \ + *(((struct kvm_pmu_ops *)0)->func)); +#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP +#include - kvm_pmu_deliver_pmi(vcpu); +void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) +{ + memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops)); + +#define __KVM_X86_PMU_OP(func) \ + static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func); +#define KVM_X86_PMU_OP(func) \ + WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func) +#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP +#include +#undef __KVM_X86_PMU_OP } -static void kvm_perf_overflow(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { - struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { + bool skip_pmi = false; + + if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { + if (!in_pmi) { + /* + * TODO: KVM is currently _choosing_ to not generate records + * for emulated instructions, avoiding BUFFER_OVF PMI when + * there are no records. Strictly speaking, it should be done + * as well in the right context to improve sampling accuracy. + */ + skip_pmi = true; + } else { + /* Indicate PEBS overflow PMI to guest. */ + skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, + (unsigned long *)&pmu->global_status); + } + } else { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } + + if (pmc->intr && !skip_pmi) + kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } -static void kvm_perf_overflow_intr(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void kvm_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + /* + * Ignore overflow events for counters that are scheduled to be + * reprogrammed, e.g. if a PMI for the previous event races with KVM's + * handling of a related guest WRMSR. + */ + if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi)) + return; - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. 
Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. - */ - if (!kvm_guest_state()) - irq_work_queue(&pmc_to_pmu(pmc)->irq_work); - else - kvm_make_request(KVM_REQ_PMI, pmc->vcpu); - } + __kvm_perf_overflow(pmc, true); + + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } -static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, - u64 config, bool exclude_user, - bool exclude_kernel, bool intr) +static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, + bool exclude_user, bool exclude_kernel, + bool intr) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; struct perf_event_attr attr = { .type = type, @@ -109,6 +135,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable); attr.sample_period = get_sample_period(pmc, pmc->counter); @@ -121,20 +148,37 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, */ attr.sample_period = 0; } + if (pebs) { + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + * + * For most PEBS hardware events, the difference in the software + * precision levels of guest and host PEBS events will not affect + * the accuracy of the PEBS profiling result, because the "event IP" + * in the PEBS record is calibrated on the guest side. + * + * On Icelake everything is fine. Other hardware (GLC+, TNT+) that + * could possibly care here is unsupported and needs changes. + */ + attr.precise_ip = 1; + } event = perf_event_create_kernel_counter(&attr, -1, current, - intr ? 
kvm_perf_overflow_intr : kvm_perf_overflow, pmc); if (IS_ERR(event)) { pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", PTR_ERR(event), pmc->idx); - return; + return PTR_ERR(event); } pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; - clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; + pmc->intr = intr || pebs; + return 0; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -160,129 +204,118 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter))) return false; + if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) != + (!!pmc->perf_event->attr.precise_ip)) + return false; + /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); pmc->is_paused = false; - clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +static int cmp_u64(const void *pa, const void *pb) { - u64 config; - u32 type = PERF_TYPE_RAW; - struct kvm *kvm = pmc->vcpu->kvm; - struct kvm_pmu_event_filter *filter; - int i; - struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); - bool allow_event = true; - - if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) - printk_once("kvm pmu: pin control bit is ignored\n"); - - pmc->eventsel = eventsel; + u64 a = *(u64 *)pa; + u64 b = *(u64 *)pb; - pmc_pause_counter(pmc); + return (a > b) - (a < b); +} - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) - return; +static bool check_pmu_event_filter(struct kvm_pmc *pmc) +{ + struct kvm_pmu_event_filter *filter; + struct kvm *kvm = pmc->vcpu->kvm; + bool allow_event = true; + __u64 key; + int idx; filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - for (i = 0; i < filter->nevents; i++) - if (filter->events[i] == - (eventsel & AMD64_RAW_EVENT_MASK_NB)) - break; - if (filter->action == KVM_PMU_EVENT_ALLOW && - i == filter->nevents) - allow_event = false; + if (!filter) + goto out; + + if (pmc_is_gp(pmc)) { + key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; + if (bsearch(&key, filter->events, filter->nevents, + sizeof(__u64), cmp_u64)) + allow_event = filter->action == KVM_PMU_EVENT_ALLOW; + else + allow_event = filter->action == KVM_PMU_EVENT_DENY; + } else { + idx = pmc->idx - INTEL_PMC_IDX_FIXED; if (filter->action == KVM_PMU_EVENT_DENY && - i < filter->nevents) + test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) allow_event = false; } - if (!allow_event) - return; - - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { - config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); - if (config != PERF_COUNT_HW_MAX) - type = PERF_TYPE_HARDWARE; - } - - if (type == PERF_TYPE_RAW) - config = eventsel & pmu->raw_event_mask; - - if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) - return; - pmc_release_perf_event(pmc); +out: + return allow_event; +} - pmc->current_config = eventsel; - pmc_reprogram_counter(pmc, type, config, - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); +static bool pmc_event_is_allowed(struct kvm_pmc *pmc) +{ + return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && + check_pmu_event_filter(pmc); } 
-EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) +static void reprogram_counter(struct kvm_pmc *pmc) { - unsigned en_field = ctrl & 0x3; - bool pmi = ctrl & 0x8; - struct kvm_pmu_event_filter *filter; - struct kvm *kvm = pmc->vcpu->kvm; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 eventsel = pmc->eventsel; + u64 new_config = eventsel; + u8 fixed_ctr_ctrl; pmc_pause_counter(pmc); - if (!en_field || !pmc_is_enabled(pmc)) - return; + if (!pmc_event_is_allowed(pmc)) + goto reprogram_complete; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; + if (pmc->counter < pmc->prev_counter) + __kvm_perf_overflow(pmc, false); + + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); + + if (pmc_is_fixed(pmc)) { + fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); + if (fixed_ctr_ctrl & 0x1) + eventsel |= ARCH_PERFMON_EVENTSEL_OS; + if (fixed_ctr_ctrl & 0x2) + eventsel |= ARCH_PERFMON_EVENTSEL_USR; + if (fixed_ctr_ctrl & 0x8) + eventsel |= ARCH_PERFMON_EVENTSEL_INT; + new_config = (u64)fixed_ctr_ctrl; } - if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) - return; + if (pmc->current_config == new_config && pmc_resume_counter(pmc)) + goto reprogram_complete; pmc_release_perf_event(pmc); - pmc->current_config = (u64)ctrl; - pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - kvm_x86_ops.pmu_ops->find_fixed_event(idx), - !(en_field & 0x2), /* exclude user */ - !(en_field & 0x1), /* exclude kernel */ - pmi); -} -EXPORT_SYMBOL_GPL(reprogram_fixed_counter); - -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) -{ - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx); + pmc->current_config = new_config; - if (!pmc) + /* + * If reprogramming fails, e.g. due to contention, leave the counter's + * regprogram bit set, i.e. opportunistically try again on the next PMU + * refresh. Don't make a new request as doing so can stall the guest + * if reprogramming repeatedly fails. 
+ */ + if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT)) return; - if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc, pmc->eventsel); - else { - int idx = pmc_idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); - - reprogram_fixed_counter(pmc, ctrl, idx); - } +reprogram_complete: + clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); + pmc->prev_counter = 0; } -EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { @@ -290,14 +323,14 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) int bit; for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); - if (unlikely(!pmc || !pmc->perf_event)) { + if (unlikely(!pmc)) { clear_bit(bit, pmu->reprogram_pmi); continue; } - reprogram_counter(pmu, bit); + reprogram_counter(pmc); } /* @@ -312,7 +345,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) /* check if idx is a valid index to access PMU */ int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { - return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); + return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -362,7 +395,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask); + pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask); if (!pmc) return 1; @@ -378,22 +411,29 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { - if (kvm_x86_ops.pmu_ops->deliver_pmi) - kvm_x86_ops.pmu_ops->deliver_pmi(vcpu); + static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } } bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) || - kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr); + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); + default: + break; + } + return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || + static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr); if (pmc) __set_bit(pmc->idx, pmu->pmc_in_use); @@ -401,13 +441,86 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + msr_info->data = pmu->global_status; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_CORE_PERF_GLOBAL_CTRL: + msr_info->data = pmu->global_ctrl; + break; + case 
MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + msr_info->data = 0; + break; + default: + return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + } + + return 0; } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + u64 data = msr_info->data; + u64 diff; + + /* + * Note, AMD ignores writes to reserved bits and read-only PMU MSRs, + * whereas Intel generates #GP on attempts to write reserved/RO MSRs. + */ + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + if (!msr_info->host_initiated) + return 1; /* RO MSR */ + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + /* Per PPR, Read-only MSR. Writes are ignored. */ + if (!msr_info->host_initiated) + break; + + if (data & pmu->global_status_mask) + return 1; + + pmu->global_status = data; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + data &= ~pmu->global_ctrl_mask; + fallthrough; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (!kvm_valid_perf_global_ctrl(pmu, data)) + return 1; + + if (pmu->global_ctrl != data) { + diff = pmu->global_ctrl ^ data; + pmu->global_ctrl = data; + reprogram_counters(pmu, diff); + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + /* + * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in + * GLOBAL_STATUS, and so the set of reserved bits is the same. + */ + if (data & pmu->global_status_mask) + return 1; + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + break; + default: + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); + return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + } + + return 0; } /* refresh PMU settings. 
This function generally is called when underlying @@ -416,15 +529,32 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { - kvm_x86_ops.pmu_ops->refresh(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) + return; + + pmu->version = 0; + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_mask = ~0ull; + pmu->global_status_mask = ~0ull; + pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; + bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); + + if (vcpu->kvm->arch.enable_pmu) + static_call(kvm_x86_pmu_refresh)(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - irq_work_sync(&pmu->irq_work); - kvm_x86_ops.pmu_ops->reset(vcpu); + static_call(kvm_x86_pmu_reset)(vcpu); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -432,24 +562,12 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - kvm_x86_ops.pmu_ops->init(vcpu); - init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); + static_call(kvm_x86_pmu_init)(vcpu); pmu->event_count = 0; pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (pmc_is_fixed(pmc)) - return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; - - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; -} - /* Release perf_events for vPMCs that have been unused for a full time slice. */ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) { @@ -464,14 +582,13 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { - pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) pmc_stop_counter(pmc); } - if (kvm_x86_ops.pmu_ops->cleanup) - kvm_x86_ops.pmu_ops->cleanup(vcpu); + static_call_cond(kvm_x86_pmu_cleanup)(vcpu); bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } @@ -481,6 +598,63 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); } +static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) +{ + pmc->prev_counter = pmc->counter; + pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + kvm_pmu_request_counter_reprogram(pmc); +} + +static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, + unsigned int perf_hw_id) +{ + u64 old_eventsel = pmc->eventsel; + unsigned int config; + + pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); + config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); + pmc->eventsel = old_eventsel; + return config == perf_hw_id; +} + +static inline bool cpl_is_matched(struct kvm_pmc *pmc) +{ + bool select_os, select_user; + u64 config; + + if (pmc_is_gp(pmc)) { + config = pmc->eventsel; + select_os = config & ARCH_PERFMON_EVENTSEL_OS; + select_user = config & ARCH_PERFMON_EVENTSEL_USR; + } else { + config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); + select_os = config & 0x1; + select_user = config & 0x2; + } + + return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? 
select_os : select_user; +} + +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); + + if (!pmc || !pmc_event_is_allowed(pmc)) + continue; + + /* Ignore checks for edge detect, pin control, invert and CMASK bits */ + if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc)) + kvm_pmu_incr_counter(pmc); + } +} +EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); + int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { struct kvm_pmu_event_filter tmp, *filter; @@ -512,6 +686,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) /* Ensure nevents can't be changed between the user copies. */ *filter = tmp; + /* + * Sort the in-kernel list so that we can search it with bsearch. + */ + sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); + mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 84aa808ddfbf8..e9fe7f07e1fab 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -23,8 +23,6 @@ struct kvm_event_hw_type_mapping { struct kvm_pmu_ops { unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); - unsigned (*find_fixed_event)(int idx); - bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); @@ -38,8 +36,26 @@ struct kvm_pmu_ops { void (*reset)(struct kvm_vcpu *vcpu); void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + const int MAX_NR_GP_COUNTERS; + const int MIN_NR_GP_COUNTERS; }; +void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); + +static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + * + * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2. 
+ */ + return pmu->version > 1; +} + static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -59,6 +75,12 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } +static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) +{ + pmc->counter += val - pmc_read_counter(pmc); + pmc->counter &= pmc_bitmask(pmc); +} + static inline void pmc_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { @@ -87,11 +109,6 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc) return pmc->type == KVM_PMC_FIXED; } -static inline bool pmc_is_enabled(struct kvm_pmc *pmc) -{ - return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc); -} - static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, u64 data) { @@ -148,9 +165,82 @@ static inline void pmc_update_sample_period(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter)); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + +extern struct x86_pmu_capability kvm_pmu_cap; + +static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) +{ + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; + + perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. + */ + if (!kvm_pmu_cap.num_counters_gp || + WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_pmu_cap.version) + enable_pmu = false; + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_PMC_MAX_FIXED); +} + +static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) +{ + set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); +} + +static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff) +{ + int bit; + + if (!diff) + return; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + set_bit(bit, pmu->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); +} + +/* + * Check if a PMC is enabled by comparing it against global_ctrl bits. + * + * If the vPMU doesn't have global_ctrl MSR, all vPMCs are enabled. 
+ */ +static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!kvm_pmu_has_perf_global_ctrl(pmu)) + return true; + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); @@ -165,6 +255,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 94c0eb4dc8e66..a43b180a9e4fe 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -7,23 +7,39 @@ #include /* - * Hardware-defined CPUID leafs that are scattered in the kernel, but need to - * be directly used by KVM. Note, these word values conflict with the kernel's - * "bug" caps, but KVM doesn't use those. + * Hardware-defined CPUID leafs that are either scattered by the kernel or are + * unknown to the kernel, but need to be directly used by KVM. Note, these + * word values conflict with the kernel's "bug" caps, but KVM doesn't use those. */ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, + CPUID_8000_0022_EAX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, }; +/* + * Define a KVM-only feature flag. + * + * For features that are scattered by cpufeatures.h, __feature_translate() also + * needs to be updated to translate the kernel-defined feature into the + * KVM-defined feature. + * + * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h, + * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so + * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is + * needed in this case. + */ #define KVM_X86_FEATURE(w, f) ((w)*32 + (f)) /* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). 
*/ #define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) +/* CPUID level 0x80000022 (EAX) */ +#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0) + struct cpuid_reg { u32 function; u32 index; @@ -49,6 +65,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, + [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, }; /* @@ -81,6 +98,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX1; else if (x86_feature == X86_FEATURE_SGX2) return KVM_X86_FEATURE_SGX2; + else if (x86_feature == X86_FEATURE_PERFMON_V2) + return KVM_X86_FEATURE_PERFMON_V2; return x86_feature; } diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b595a33860d70..d01996deeff34 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -686,11 +686,6 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) return 0; } -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return false; -} - static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index d35c94e13afb0..045853230a347 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -16,6 +16,7 @@ #include "cpuid.h" #include "lapic.h" #include "pmu.h" +#include "svm.h" enum pmu_type { PMU_TYPE_COUNTER = 0, @@ -116,6 +117,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + if (!vcpu->kvm->arch.enable_pmu) + return NULL; + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: @@ -162,6 +166,10 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) else event_mapping = amd_event_mapping; + /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ + if (WARN_ON(pmc_is_fixed(pmc))) + return PERF_COUNT_HW_MAX; + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) if (event_mapping[i].eventsel == event_select && event_mapping[i].unit_mask == unit_mask) @@ -173,20 +181,6 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) return event_mapping[i].event_type; } -/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ -static unsigned amd_find_fixed_event(int idx) -{ - return PERF_COUNT_HW_MAX; -} - -/* check if a PMC is enabled by comparing it against global_ctrl bits. Because - * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). - */ -static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) -{ - return true; -} - static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER); @@ -228,12 +222,6 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[idx]; } -static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ - return false; -} - static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -245,6 +233,29 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + switch (msr) { + case MSR_K7_EVNTSEL0 ... 
MSR_K7_PERFCTR3: + return pmu->version > 0; + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + return pmu->version > 1; + default: + if (msr > MSR_F15H_PERF_CTR5 && + msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters) + return pmu->version > 1; + break; + } + + return amd_msr_idx_to_pmc(vcpu, msr); +} + static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -277,7 +288,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_PERFCTRn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); return 0; } @@ -285,8 +296,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { data &= ~pmu->reserved_bits; - if (data != pmc->eventsel) - reprogram_gp_counter(pmc, data); + if (data != pmc->eventsel) { + pmc->eventsel = data; + kvm_pmu_request_counter_reprogram(pmc); + } return 0; } @@ -296,20 +309,39 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void amd_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + union cpuid_0x80000022_ebx ebx; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + pmu->version = 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) { + pmu->version = 2; + /* + * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest + * CPUID entry is guaranteed to be non-NULL. + */ + BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 || + x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index); + ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx; + pmu->nr_arch_gp_counters = ebx.split.num_core_pmc; + } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; - else + } else { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + } + + pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters, + kvm_pmu_cap.num_counters_gp); + + if (pmu->version > 1) { + pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_mask = pmu->global_ctrl_mask; + } pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; - pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; - pmu->global_status = 0; bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } @@ -318,9 +350,10 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC); + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE); + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC); - for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) { + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -333,18 +366,18 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - for (i = 0; i < 
AMD64_NUM_COUNTERS_CORE; i++) { + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } + + pmu->global_ctrl = pmu->global_status = 0; } -struct kvm_pmu_ops amd_pmu_ops = { +struct kvm_pmu_ops amd_pmu_ops __initdata = { .pmc_perf_hw_id = amd_pmc_perf_hw_id, - .find_fixed_event = amd_find_fixed_event, - .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -355,4 +388,6 @@ struct kvm_pmu_ops amd_pmu_ops = { .refresh = amd_pmu_refresh, .init = amd_pmu_init, .reset = amd_pmu_reset, + .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 854b3779b8a3a..7b11e15b9f5b1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1537,7 +1537,7 @@ static bool cmd_allowed_from_miror(u32 cmd_id) return false; } -int svm_mem_enc_op(struct kvm *kvm, void __user *argp) +int svm_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; int r; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8a317e34c0231..698853607925e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -957,6 +957,23 @@ static __init void svm_set_cpu_caps(void) boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + if (enable_pmu) { + /* + * Enumerate support for PERFCTR_CORE if and only if KVM has + * access to enough counters to virtualize "core" support, + * otherwise limit vPMU support to the legacy number of counters. + */ + if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE) + kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS, + kvm_pmu_cap.num_counters_gp); + else + kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); + + if (kvm_pmu_cap.version != 2 || + !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2); + } + /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); } @@ -1003,6 +1020,9 @@ static __init int svm_hardware_setup(void) tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); + if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) + kvm_enable_efer_bits(EFER_AUTOIBRS); + /* Check for pause filtering support */ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { pause_filter_count = 0; @@ -1078,6 +1098,9 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + if (!enable_pmu) + pr_info("PMU virtualization is disabled\n"); + svm_set_cpu_caps(); /* @@ -1744,6 +1767,24 @@ static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return true; } +static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * For guests that don't set guest_state_protected, the cr3 update is + * handled via kvm_mmu_load() while entering the guest. For guests + * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to + * VMCB save area now, since the save area will become the initial + * contents of the VMSA, and future VMCB save area updates won't be + * seen. 
+ */ + if (sev_es_guest(vcpu->kvm)) { + svm->vmcb->save.cr3 = cr3; + vmcb_mark_dirty(svm->vmcb, VMCB_CR); + } +} + void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3600,16 +3641,6 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - -static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) -{ - return 0; -} - void svm_flush_tlb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3997,11 +4028,6 @@ static int __init svm_check_processor_compat(void) return 0; } -static bool svm_cpu_has_accelerated_tpr(void) -{ - return false; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. @@ -4024,11 +4050,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4046,7 +4067,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } @@ -4582,7 +4603,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .hardware_unsetup = svm_hardware_teardown, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, - .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_create_vcpu, @@ -4593,7 +4613,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vm_init = svm_vm_init, .vm_destroy = svm_vm_destroy, - .prepare_guest_switch = svm_prepare_guest_switch, + .prepare_switch_to_guest = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, .vcpu_blocking = svm_vcpu_blocking, @@ -4610,6 +4630,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .is_valid_cr0 = svm_is_valid_cr0, .set_cr0 = svm_set_cr0, + .post_set_cr3 = svm_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, @@ -4624,20 +4645,20 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_rflags = svm_set_rflags, .get_if_flag = svm_get_if_flag, - .tlb_flush_all = svm_flush_tlb, - .tlb_flush_current = svm_flush_tlb, - .tlb_flush_gva = svm_flush_tlb_gva, - .tlb_flush_guest = svm_flush_tlb, + .flush_tlb_all = svm_flush_tlb, + .flush_tlb_current = svm_flush_tlb, + .flush_tlb_gva = svm_flush_tlb_gva, + .flush_tlb_guest = svm_flush_tlb, - .run = svm_vcpu_run, + .vcpu_run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, .update_emulated_instruction = NULL, .set_interrupt_shadow = svm_set_interrupt_shadow, .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, - .set_irq = svm_set_irq, - .set_nmi = svm_inject_nmi, + .inject_irq = svm_set_irq, + .inject_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, @@ -4655,10 +4676,6 @@ static struct kvm_x86_ops 
svm_x86_ops __initdata = { .hwapic_isr_update = svm_hwapic_isr_update, .apicv_post_state_restore = avic_post_state_restore, - .set_tss_addr = svm_set_tss_addr, - .set_identity_map_addr = svm_set_identity_map_addr, - .get_mt_mask = svm_get_mt_mask, - .get_exit_info = svm_get_exit_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -4679,12 +4696,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .sched_in = svm_sched_in, - .pmu_ops = &amd_pmu_ops, .nested_ops = &svm_nested_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, - .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, - .update_pi_irte = svm_update_pi_irte, + .pi_update_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, .smi_allowed = svm_smi_allowed, @@ -4692,9 +4707,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, - .mem_enc_op = svm_mem_enc_op, - .mem_enc_reg_region = svm_register_enc_region, - .mem_enc_unreg_region = svm_unregister_enc_region, + .mem_enc_ioctl = svm_mem_enc_ioctl, + .mem_enc_register_region = svm_register_enc_region, + .mem_enc_unregister_region = svm_unregister_enc_region, .guest_memory_reclaimed = sev_guest_memory_reclaimed, .vm_copy_enc_context_from = svm_vm_copy_asid_from, @@ -4716,19 +4731,39 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, + .pmu_ops = &amd_pmu_ops, }; static int __init svm_init(void) { + int r; + __unused_size_checks(); - return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + r = kvm_x86_vendor_init(&svm_init_ops); + if (r) + return r; + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! 
+ */ + r = kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), + __alignof__(struct vcpu_svm), THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + kvm_x86_vendor_exit(); + return r; } static void __exit svm_exit(void) { kvm_exit(); + kvm_x86_vendor_exit(); } module_init(svm_init) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1d9b1a9e4398f..02ac73275758f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -550,7 +550,7 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); extern unsigned int max_sev_asid; void sev_vm_destroy(struct kvm *kvm); -int svm_mem_enc_op(struct kvm *kvm, void __user *argp); +int svm_mem_enc_ioctl(struct kvm *kvm, void __user *argp); int svm_register_enc_region(struct kvm *kvm, struct kvm_enc_region *range); int svm_unregister_enc_region(struct kvm *kvm, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 1fb65099dfcda..28800f778f9e7 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -5,6 +5,9 @@ #include #include "lapic.h" +#include "x86.h" +#include "cpuid.h" +#include "pmu.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -389,20 +392,51 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } -static inline u64 vmx_get_perf_capabilities(void) +static inline bool vmx_pebs_supported(void) { - u64 perf_cap = 0; + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; +} - if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); +static inline u64 vmx_get_perf_capabilities(void) +{ + u64 perf_cap = PMU_CAP_FW_WRITES; + u64 host_perf_cap = 0; - perf_cap &= PMU_CAP_LBR_FMT; + if (!enable_pmu) + return 0; - /* - * Since counters are virtualized, KVM would support full - * width counting unconditionally, even if the host lacks it. - */ - return PMU_CAP_FW_WRITES | perf_cap; + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + + /* + * Disallow adaptive PEBS as it is functionally broken, can be + * used by the guest to read *host* LBRs, and can be used to + * bypass userspace event filters. To correctly and safely + * support adaptive PEBS, KVM needs to: + * + * 1. Account for the ADAPTIVE flag when (re)programming fixed + * counters. + * + * 2. Gain support from perf (or take direct control of counter + * programming) to support events without adaptive PEBS + * enabled for the hardware counter. + * + * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with + * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. + * + * 4. Document which PMU events are effectively exposed to the + * guest via adaptive PEBS, and make adaptive PEBS mutually + * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. 
+ */ + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; } static inline u64 vmx_supported_debugctl(void) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0210ba16f7073..ec044440ae389 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2621,7 +2621,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -4348,7 +4348,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); @@ -4862,7 +4862,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, + bool vcpu_has_perf_global_ctrl) { struct vcpu_vmx *vmx; @@ -4870,7 +4871,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) return; vmx = to_vmx(vcpu); - if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + if (vcpu_has_perf_global_ctrl) { vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high |= diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 2d0ac8a86d4a4..129ae4e01f7c1 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -32,7 +32,8 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); +void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, + bool vcpu_has_perf_global_ctrl); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 46b1e630eb3cb..07d3dad405448 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -37,35 +37,35 @@ static int fixed_pmc_events[] = {1, 0, 7}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { + struct kvm_pmc *pmc; + u64 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; + pmu->fixed_ctr_ctrl = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); - u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); - struct kvm_pmc *pmc; - - pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); if (old_ctrl == new_ctrl) continue; + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_fixed_counter(pmc, new_ctrl, i); + kvm_pmu_request_counter_reprogram(pmc); } - - pmu->fixed_ctr_ctrl 
= data; } -/* function is called when global control register has been updated. */ -static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { - int bit; - u64 diff = pmu->global_ctrl ^ data; - - pmu->global_ctrl = data; + if (pmc_idx < INTEL_PMC_IDX_FIXED) { + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + } else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_counter(pmu, bit); + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } } static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) @@ -76,9 +76,9 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) int i; for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select - && intel_arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) + if (intel_arch_events[i].eventsel == event_select && + intel_arch_events[i].unit_mask == unit_mask && + (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) break; if (i == ARRAY_SIZE(intel_arch_events)) @@ -87,41 +87,6 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) return intel_arch_events[i].event_type; } -static unsigned intel_find_fixed_event(int idx) -{ - u32 event; - size_t size = ARRAY_SIZE(fixed_pmc_events); - - if (idx >= size) - return PERF_COUNT_HW_MAX; - - event = fixed_pmc_events[array_index_nospec(idx, size)]; - return intel_arch_events[event].event_type; -} - -/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ -static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (!intel_pmu_has_perf_global_ctrl(pmu)) - return true; - - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; - - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } -} - /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { @@ -177,16 +142,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) -{ - /* - * As a first step, a guest could only enable LBR feature if its - * cpu model is the same as the host because the LBR registers - * would be pass-through to the guest and they're model specific. 
- */ - return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); -} - bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) { struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); @@ -215,14 +170,22 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 perf_capabilities; int ret; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - return intel_pmu_has_perf_global_ctrl(pmu); + return kvm_pmu_has_perf_global_ctrl(pmu); + case MSR_IA32_PEBS_ENABLE: + ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; + break; + case MSR_IA32_DS_AREA: + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + break; + case MSR_PEBS_DATA_CFG: + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || @@ -361,36 +324,38 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: msr_info->data = pmu->fixed_ctr_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_STATUS: - msr_info->data = pmu->global_status; - return 0; - case MSR_CORE_PERF_GLOBAL_CTRL: - msr_info->data = pmu->global_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - msr_info->data = pmu->global_ovf_ctrl; - return 0; + break; + case MSR_IA32_PEBS_ENABLE: + msr_info->data = pmu->pebs_enable; + break; + case MSR_IA32_DS_AREA: + msr_info->data = pmu->ds_area; + break; + case MSR_PEBS_DATA_CFG: + msr_info->data = pmu->pebs_data_cfg; + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { u64 val = pmc_read_counter(pmc); msr_info->data = val & pmu->counter_bitmask[KVM_PMC_GP]; - return 0; + break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { u64 val = pmc_read_counter(pmc); msr_info->data = val & pmu->counter_bitmask[KVM_PMC_FIXED]; - return 0; + break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { msr_info->data = pmc->eventsel; - return 0; - } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) - return 0; + break; + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) { + break; + } + return 1; } - return 1; + return 0; } static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -399,38 +364,37 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct kvm_pmc *pmc; u32 msr = msr_info->index; u64 data = msr_info->data; - u64 reserved_bits; + u64 reserved_bits, diff; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (pmu->fixed_ctr_ctrl == data) - return 0; - if (!(data & pmu->fixed_ctr_ctrl_mask)) { + if (data & pmu->fixed_ctr_ctrl_mask) + return 1; + + if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); - return 0; - } break; - case MSR_CORE_PERF_GLOBAL_STATUS: - if (msr_info->host_initiated) { - pmu->global_status = data; - return 0; - } - break; /* RO MSR */ - case MSR_CORE_PERF_GLOBAL_CTRL: - if (pmu->global_ctrl == data) - return 0; - if (kvm_valid_perf_global_ctrl(pmu, data)) { - global_ctrl_changed(pmu, data); - return 0; + case MSR_IA32_PEBS_ENABLE: + if (data & pmu->pebs_enable_mask) + return 1; + + if (pmu->pebs_enable != data) { + diff = pmu->pebs_enable ^ data; + pmu->pebs_enable 
= data; + reprogram_counters(pmu, diff); } break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & pmu->global_ovf_ctrl_mask)) { - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; - return 0; - } + case MSR_IA32_DS_AREA: + if (is_noncanonical_address(data, vcpu)) + return 1; + + pmu->ds_area = data; + break; + case MSR_PEBS_DATA_CFG: + if (data & pmu->pebs_data_cfg_mask) + return 1; + + pmu->pebs_data_cfg = data; break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || @@ -438,59 +402,71 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((msr & MSR_PMC_FULL_WIDTH_BIT) && (data & ~pmu->counter_bitmask[KVM_PMC_GP])) return 1; + if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); - return 0; + break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); - return 0; + break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { - if (data == pmc->eventsel) - return 0; reserved_bits = pmu->reserved_bits; if ((pmc->idx == 2) && (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; - if (!(data & reserved_bits)) { - reprogram_gp_counter(pmc, data); - return 0; + if (data & reserved_bits) + return 1; + + if (data != pmc->eventsel) { + pmc->eventsel = data; + kvm_pmu_request_counter_reprogram(pmc); } - } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) - return 0; + break; + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) { + break; + } + /* Not a known PMU MSR. 
*/ + return 1; } - return 1; + return 0; +} + +static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) +{ + size_t size = ARRAY_SIZE(fixed_pmc_events); + struct kvm_pmc *pmc; + u32 event; + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + event = fixed_pmc_events[array_index_nospec(i, size)]; + pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | + intel_arch_events[event].eventsel; + } } static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - - struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 perf_capabilities; + u64 counter_mask; int i; - pmu->nr_arch_gp_counters = 0; - pmu->nr_arch_fixed_counters = 0; - pmu->counter_bitmask[KVM_PMC_GP] = 0; - pmu->counter_bitmask[KVM_PMC_FIXED] = 0; - pmu->version = 0; - pmu->reserved_bits = 0xffffffff00200000ull; - pmu->raw_event_mask = X86_RAW_EVENT_MASK; - pmu->global_ctrl_mask = ~0ull; - pmu->global_ovf_ctrl_mask = ~0ull; - pmu->fixed_ctr_ctrl_mask = ~0ull; - - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + + entry = kvm_find_cpuid_entry(vcpu, 0xa); if (!entry) return; + eax.full = entry->eax; edx.full = entry->edx; @@ -498,13 +474,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; - perf_get_x86_pmu_capability(&x86_pmu); - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - x86_pmu.num_counters_gp); - eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); + kvm_pmu_cap.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, + kvm_pmu_cap.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); + eax.split.mask_length = min_t(int, eax.split.mask_length, + kvm_pmu_cap.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -512,27 +488,35 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = 0; } else { pmu->nr_arch_fixed_counters = - min_t(int, edx.split.num_counters_fixed, - x86_pmu.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, - edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); + min3(ARRAY_SIZE(fixed_pmc_events), + (size_t) edx.split.num_counters_fixed, + (size_t)kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; + setup_fixed_pmc_eventsel(pmu); } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); - pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; - pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask + counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + pmu->global_ctrl_mask = counter_mask; + + /* + * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) + * share reserved bit definitions. The kernel just happens to use + * OVF_CTRL for the names. 
+ */ + pmu->global_status_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_ovf_ctrl_mask &= + pmu->global_status_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry(vcpu, 7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { @@ -545,15 +529,34 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - nested_vmx_pmu_entry_exit_ctls_update(vcpu); + nested_vmx_pmu_refresh(vcpu, + intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); - if (intel_pmu_lbr_is_compatible(vcpu)) + if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); else lbr_desc->records.nr = 0; if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { + pmu->pebs_enable_mask = counter_mask; + pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmu->fixed_ctr_ctrl_mask &= + ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + } + pmu->pebs_data_cfg_mask = ~0xff00000full; + } else { + pmu->pebs_enable_mask = + ~((1ull << pmu->nr_arch_gp_counters) - 1); + } + } else { + vcpu->arch.perf_capabilities &= ~PERF_CAP_PEBS_MASK; + } } static void intel_pmu_init(struct kvm_vcpu *vcpu) @@ -562,7 +565,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -588,22 +591,21 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmc *pmc = NULL; int i; - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmc = &pmu->fixed_counters[i]; pmc_stop_counter(pmc); - pmc->counter = 0; + pmc->counter = pmc->prev_counter = 0; } - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; intel_pmu_release_guest_lbr_event(vcpu); } @@ -719,10 +721,31 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } -struct kvm_pmu_ops intel_pmu_ops = { +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) +{ + struct kvm_pmc *pmc = NULL; + int bit, hw_idx; + + for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + + if (!pmc || !pmc_speculative_in_use(pmc) || + !pmc_is_globally_enabled(pmc) || !pmc->perf_event) + continue; + + /* + * A negative index indicates the event isn't mapped to a + * physical counter in the host, e.g. due to contention. 
+ */ + hw_idx = pmc->perf_event->hw.idx; + if (hw_idx != pmc->idx && hw_idx > -1) + pmu->host_cross_mapped_mask |= BIT_ULL(hw_idx); + } +} + +struct kvm_pmu_ops intel_pmu_ops __initdata = { .pmc_perf_hw_id = intel_pmc_perf_hw_id, - .find_fixed_event = intel_find_fixed_event, - .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, @@ -735,4 +758,6 @@ struct kvm_pmu_ops intel_pmu_ops = { .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, + .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = 1, }; diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index bf9b8f6a7242f..0bfea6bde3313 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -90,8 +90,6 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); -int pi_pre_block(struct kvm_vcpu *vcpu); -void pi_post_block(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index b8cf9a59c145e..17b91a65a0bae 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -152,8 +152,8 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, u8 max_size_log2; int trapnr, ret; - sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); - sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; @@ -439,7 +439,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (!vcpu->kvm->arch.sgx_provisioning_allowed) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); if (!guest_cpuid) return true; @@ -447,7 +447,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!guest_cpuid) return true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 48b19346af7b8..325473beaa8f1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -601,11 +601,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) return flexpriority_enabled && lapic_in_kernel(vcpu); } -static inline bool report_flexpriority(void) -{ - return flexpriority_enabled; -} - static int possible_passthrough_msr_slot(u32 msr) { u32 i; @@ -2336,7 +2331,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; - if (!intel_pmu_lbr_is_compatible(vcpu)) + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); @@ 
-3288,6 +3294,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcs_writel(GUEST_CR3, guest_cr3); } + static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* @@ -6847,6 +6854,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) struct perf_guest_switch_msr *msrs; struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) @@ -7412,7 +7423,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + entry = kvm_find_cpuid_entry(vcpu, 0x1); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); @@ -7428,7 +7439,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); @@ -7463,7 +7474,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { - best = kvm_find_cpuid_entry(vcpu, 0x14, i); + best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; @@ -7604,6 +7615,13 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } + + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); @@ -7824,19 +7842,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - if (pi_pre_block(vcpu)) - return 1; - - return 0; -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - pi_post_block(vcpu); -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7940,7 +7945,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = report_flexpriority, .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), @@ -7952,7 +7956,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_prepare_switch_to_guest, + .prepare_switch_to_guest = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -7981,20 +7985,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_rflags = vmx_set_rflags, .get_if_flag = vmx_get_if_flag, - .tlb_flush_all = vmx_flush_tlb_all, - .tlb_flush_current = vmx_flush_tlb_current, - .tlb_flush_gva = vmx_flush_tlb_gva, - .tlb_flush_guest = vmx_flush_tlb_guest, + 
.flush_tlb_all = vmx_flush_tlb_all, + .flush_tlb_current = vmx_flush_tlb_current, + .flush_tlb_gva = vmx_flush_tlb_gva, + .flush_tlb_guest = vmx_flush_tlb_guest, - .run = vmx_vcpu_run, + .vcpu_run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, - .set_irq = vmx_inject_irq, - .set_nmi = vmx_inject_nmi, + .inject_irq = vmx_inject_irq, + .inject_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, @@ -8044,14 +8048,10 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, - .update_pi_irte = pi_update_irte, - .start_assignment = vmx_pi_start_assignment, + .pi_update_irte = pi_update_irte, + .pi_start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, @@ -8075,6 +8075,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; +static unsigned int vmx_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* '0' on failure so that the !PT case can use a RET0 static call. */ + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); + return 1; +} + static __init void vmx_setup_user_return_msrs(void) { @@ -8101,6 +8115,8 @@ static __init void vmx_setup_user_return_msrs(void) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } +static struct kvm_x86_init_ops vmx_init_ops __initdata; + static __init int hardware_setup(void) { unsigned long host_bndcfgs; @@ -8266,8 +8282,12 @@ static __init int hardware_setup(void) if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; - if (!enable_ept || !cpu_has_vmx_intel_pt()) + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + if (pt_mode == PT_MODE_HOST_GUEST) + vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + else + vmx_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); @@ -8296,9 +8316,10 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, - .intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest, + .handle_intel_pt_intr = NULL, .runtime_ops = &vmx_x86_ops, + .pmu_ops = &intel_pmu_ops, }; static void vmx_cleanup_l1d_flush(void) @@ -8319,6 +8340,7 @@ static void vmx_exit(void) #endif kvm_exit(); + kvm_x86_vendor_exit(); #if IS_ENABLED(CONFIG_HYPERV) if (static_branch_unlikely(&enable_evmcs)) { @@ -8387,23 +8409,25 @@ static int __init vmx_init(void) } #endif + r = kvm_x86_vendor_init(&vmx_init_ops); + if (r) + return r; + r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - return r; + goto err_kvm_init; /* - * Must be called after kvm_init() so enable_ept is properly set + * Must be called after common x86 init so enable_ept is 
properly set * up. Hand the parameter mitigation value in which was stored in * the pre module init parser. If no parameter was given, it will * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + if (r) + goto err_l1d_flush; vmx_setup_fb_clear_ctrl(); @@ -8428,5 +8452,11 @@ static int __init vmx_init(void) allow_smaller_maxphyaddr = true; return 0; + +err_l1d_flush: + vmx_exit(); +err_kvm_init: + kvm_x86_vendor_exit(); + return r; } module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c80627cd3f0bd..41b3eaf93988b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -92,22 +92,10 @@ union vmx_exit_reason { u32 full; }; -static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) -{ - /* - * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is - * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is - * greater than zero. However, KVM only exposes and emulates the MSR - * to/for the guest if the guest PMU supports at least "Architectural - * Performance Monitoring Version 2". - */ - return pmu->version > 1; -} - #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 593fb5f80aa8c..4af97530ee451 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -110,6 +110,8 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) +#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE + #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -124,17 +126,18 @@ static int sync_regs(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); +static DEFINE_MUTEX(vendor_module_lock); struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); #define KVM_X86_OP(func) \ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ *(((struct kvm_x86_ops *)0)->func)); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); -EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); @@ -194,6 +197,11 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); bool __read_mostly mitigate_smt_rsb; module_param(mitigate_smt_rsb, bool, 0444); +/* Enable/disable PMU virtualization */ +bool __read_mostly enable_pmu = true; +EXPORT_SYMBOL_GPL(enable_pmu); +module_param(enable_pmu, bool, 0444); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. 
SYSCALL MSRs and TSC_AUX, can be deferred until the CPU @@ -1200,6 +1208,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + /* Do not call post_set_cr3, we do not get here for confidential guests. */ handle_tlb_flush: /* @@ -1351,7 +1360,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); * may depend on host virtualization features rather than host cpu features. */ -static const u32 msrs_to_save_all[] = { +static const u32 msrs_to_save_base[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1368,10 +1377,17 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, +}; + +static const u32 msrs_to_save_pmu[] = { MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, + + /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, @@ -1383,14 +1399,20 @@ static const u32 msrs_to_save_all[] = { MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + + /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - MSR_IA32_XFD, MSR_IA32_XFD_ERR, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, }; -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; static unsigned num_msrs_to_save; static const u32 emulated_msrs_all[] = { @@ -1612,6 +1634,9 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { + if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS)) + return false; + if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) return false; @@ -2060,10 +2085,13 @@ static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; u32 msr = kvm_rcx_read(vcpu); u64 data; fastpath_t ret = EXIT_FASTPATH_NONE; + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); @@ -2086,6 +2114,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) if (ret != EXIT_FASTPATH_NONE) trace_kvm_msr_write(msr, data); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + return ret; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); @@ -3161,7 +3191,7 @@ static void kvmclock_sync_fn(struct work_struct *work) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? 
*/ - if (guest_cpuid_is_amd_or_hygon(vcpu)) + if (guest_cpuid_is_amd_compatible(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; @@ -3291,7 +3321,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_all)(vcpu); + static_call(kvm_x86_flush_tlb_all)(vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) @@ -3310,14 +3340,14 @@ return; } - static_call(kvm_x86_tlb_flush_guest)(vcpu); + static_call(kvm_x86_flush_tlb_guest)(vcpu); } static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_current)(vcpu); + static_call(kvm_x86_flush_tlb_current)(vcpu); } /* @@ -3473,10 +3503,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~msr_ent.data) return 1; - vcpu->arch.perf_capabilities = data; + /* + * Note, this is not just a performance optimization! KVM + * disallows changing feature MSRs after the vCPU has run; PMU + * refresh will bug the VM if called after the vCPU has run. + */ + if (vcpu->arch.perf_capabilities == data) + return 0; + vcpu->arch.perf_capabilities = data; + kvm_pmu_refresh(vcpu); return 0; - } + } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -3749,6 +3787,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + /* + * Userspace is allowed to write '0' to MSRs that KVM reports + * as to-be-saved, even if an MSR isn't fully supported. + */ + return !msr_info->host_initiated || data; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -3830,9 +3878,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + /* + * Userspace is allowed to read MSRs that KVM reports in + * KVM_GET_MSR_INDEX_LIST, even if an MSR isn't fully supported. + */ if (!msr_info->host_initiated) return 1; msr_info->data = 0; @@ -4259,6 +4314,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXIT_ON_EMULATION_FAILURE: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_SYS_ATTRIBUTES: + case KVM_CAP_VAPIC: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4303,9 +4359,6 @@ */ r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; - case KVM_CAP_VAPIC: - r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); - break; case KVM_CAP_NR_VCPUS: r = KVM_SOFT_MAX_VCPUS; break; @@ -4364,6 +4417,9 @@ r = sizeof(struct kvm_xsave); break; } + case KVM_CAP_PMU_CAPABILITY: + r = enable_pmu ?
KVM_CAP_PMU_VALID_MASK : 0; + break; default: break; } @@ -6108,6 +6164,18 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->lock); break; + case KVM_CAP_PMU_CAPABILITY: + r = -EINVAL; + if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK)) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -6620,8 +6688,10 @@ long kvm_arch_vm_ioctl(struct file *filp, break; case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; - if (kvm_x86_ops.mem_enc_op) - r = static_call(kvm_x86_mem_enc_op)(kvm, argp); + if (!kvm_x86_ops.mem_enc_ioctl) + goto out; + + r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -6632,8 +6702,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_reg_region) - r = static_call(kvm_x86_mem_enc_reg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_register_region) + goto out; + + r = static_call(kvm_x86_mem_enc_register_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -6644,8 +6716,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_unreg_region) - r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_unregister_region) + goto out; + + r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -6677,87 +6751,98 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } -static void kvm_init_msr_list(void) +static void kvm_probe_msr_to_save(u32 msr_index) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; + + if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) + return; + + /* + * Even MSRs that are valid in the host may not be exposed to guests in + * some cases. + */ + switch (msr_index) { + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported()) + return; + break; + case MSR_TSC_AUX: + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) + return; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + return; + break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) + return; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + return; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + return; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (msr_index - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) + return; + break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... 
MSR_ARCH_PERFMON_EVENTSEL_MAX: + if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + return; + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + return; + break; + default: + break; + } + + msrs_to_save[num_msrs_to_save++] = msr_index; +} + +static void kvm_init_msr_list(void) +{ unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, - "Please update the fixed PMCs in msrs_to_saved_all[]"); - - perf_get_x86_pmu_capability(&x86_pmu); + "Please update the fixed PMCs in msrs_to_save_pmu[]"); num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; - for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { - if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) - continue; + for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) + kvm_probe_msr_to_save(msrs_to_save_base[i]); - /* - * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. - */ - switch (msrs_to_save_all[i]) { - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) - continue; - break; - case MSR_TSC_AUX: - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && - !kvm_cpu_cap_has(X86_FEATURE_RDPID)) - continue; - break; - case MSR_IA32_UMWAIT_CONTROL: - if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) - continue; - break; - case MSR_IA32_RTIT_CTL: - case MSR_IA32_RTIT_STATUS: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) - continue; - break; - case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) - continue; - break; - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && - !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) - continue; - break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= - intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) - continue; - break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 7: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) - continue; - break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 7: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) - continue; - break; - case MSR_IA32_XFD: - case MSR_IA32_XFD_ERR: - if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) - continue; - break; - default: - break; - } - - msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; - } + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) @@ -8241,6 +8326,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) if (unlikely(!r)) return 0; + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + /* * rflags is the old, "raw" value of the flags. The new value has * not been saved yet. 
@@ -8511,11 +8598,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); - if (kvm_x86_ops.update_emulated_instruction) - static_call(kvm_x86_update_emulated_instruction)(vcpu); + static_call_cond(kvm_x86_update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -8821,54 +8908,6 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); - -unsigned int kvm_guest_state(void) -{ - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); - unsigned int state; - - if (!vcpu) - return 0; - - state = PERF_GUEST_ACTIVE; - if (static_call(kvm_x86_get_cpl)(vcpu)) - state |= PERF_GUEST_USER; - - return state; -} - -static unsigned long kvm_guest_get_ip(void) -{ - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); - - if (WARN_ON_ONCE(!vcpu)) - return 0; - - return kvm_rip_read(vcpu); -} - -static unsigned int kvm_handle_intel_pt_intr(void) -{ - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); - - /* '0' on failure so that the !PT case can use a RET0 static call. */ - if (!vcpu) - return 0; - - kvm_make_request(KVM_REQ_PMI, vcpu); - __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, - (unsigned long *)&vcpu->arch.pmu.global_status); - return 1; -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .state = kvm_guest_state, - .get_ip = kvm_guest_get_ip, - .handle_intel_pt_intr = NULL, -}; - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -8926,9 +8965,36 @@ static struct notifier_block pvclock_gtod_notifier = { }; #endif +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) +{ + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); +#include +#undef __KVM_X86_OP + + kvm_pmu_ops_update(ops->pmu_ops); +} + int kvm_arch_init(void *opaque) { - struct kvm_x86_init_ops *ops = opaque; + return 0; +} + +void kvm_arch_exit(void) +{ + +} + +static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +{ int r; if (kvm_x86_ops.hardware_enable) { @@ -8978,6 +9044,24 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; + rdmsrl_safe(MSR_EFER, &host_efer); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + kvm_init_pmu_capability(ops->pmu_ops); + + r = ops->hardware_setup(); + if (r != 0) + goto out_mmu_exit; + + /* + * Point of no return! DO NOT add error paths below this point unless + * absolutely necessary, as most operations from this point forward + * require unwinding. 
+ */ + kvm_ops_update(ops); + kvm_timer_init(); if (boot_cpu_has(X86_FEATURE_XSAVE)) { @@ -8994,8 +9078,33 @@ int kvm_arch_init(void *opaque) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + supported_xss = 0; + +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); +#undef __kvm_cpu_cap_has + + if (kvm_has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated because it will always + * be 1 on all machines. + */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); + kvm_max_guest_tsc_khz = max; + + kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + } + kvm_init_msr_list(); return 0; +out_mmu_exit: + kvm_mmu_vendor_module_exit(); out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: @@ -9004,8 +9113,22 @@ int kvm_arch_init(void *opaque) return r; } -void kvm_arch_exit(void) +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +{ + int r; + + mutex_lock(&vendor_module_lock); + r = __kvm_x86_vendor_init(ops); + mutex_unlock(&vendor_module_lock); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); + +void kvm_x86_vendor_exit(void) { + kvm_unregister_perf_callbacks(); + #ifdef CONFIG_X86_64 if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) clear_hv_tscchange_cb(); @@ -9021,7 +9144,7 @@ void kvm_arch_exit(void) irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif - kvm_x86_ops.hardware_enable = NULL; + static_call(kvm_x86_hardware_unsetup)(); kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); @@ -9029,7 +9152,11 @@ void kvm_arch_exit(void) static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif + mutex_lock(&vendor_module_lock); + kvm_x86_ops.hardware_enable = NULL; + mutex_unlock(&vendor_module_lock); } +EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) { @@ -9407,10 +9534,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) */ else if (!vcpu->arch.exception.pending) { if (vcpu->arch.nmi_injected) { - static_call(kvm_x86_set_nmi)(vcpu); + static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_set_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu); can_inject = false; } } @@ -9497,7 +9624,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - static_call(kvm_x86_set_nmi)(vcpu); + static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); } @@ -9511,7 +9638,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_set_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu); WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) @@ -9940,10 +10067,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (!kvm_x86_ops.set_apic_access_page_addr) - return; - - 
static_call(kvm_x86_set_apic_access_page_addr)(vcpu); + static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -10133,7 +10257,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - static_call(kvm_x86_prepare_guest_switch)(vcpu); + static_call(kvm_x86_prepare_switch_to_guest)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -10201,7 +10325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } for (;;) { - exit_fastpath = static_call(kvm_x86_run)(vcpu); + exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -10325,8 +10449,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { bool hv_timer; - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { + if (!kvm_arch_vcpu_runnable(vcpu)) { /* * Switch to the software timer before halt-polling/blocking as * the guest's timer may be a break event for the vCPU, and the @@ -10345,9 +10468,6 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) if (hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); - if (kvm_x86_ops.post_block) - static_call(kvm_x86_post_block)(vcpu); - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } @@ -10515,10 +10635,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - /* - * Exclude PKRU from restore as restored separately in - * kvm_x86_ops.run(). - */ + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } @@ -10916,6 +11033,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3); kvm_set_cr8(vcpu, sregs->cr8); @@ -11380,9 +11498,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + struct kvm_cpuid_entry2 *cpuid_0x1; unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long new_cr0; - u32 eax, dummy; /* * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's @@ -11460,13 +11578,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) * if no CPUID match is found. Note, it's impossible to get a match at * RESET since KVM emulates RESET before exposing the vCPU to userspace, - * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. - * But, go through the motions in case that's ever remedied. + * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry + * on RESET. But, go through the motions in case that's ever remedied. */ - eax = 1; - if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) - eax = 0x600; - kvm_rdx_write(vcpu, eax); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); + kvm_rdx_write(vcpu, cpuid_0x1 ? 
cpuid_0x1->eax : 0x600); static_call(kvm_x86_vcpu_reset)(vcpu, init_event); @@ -11628,56 +11744,12 @@ void kvm_arch_hardware_disable(void) int kvm_arch_hardware_setup(void *opaque) { - struct kvm_x86_init_ops *ops = opaque; - int r; - - rdmsrl_safe(MSR_EFER, &host_efer); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - - r = ops->hardware_setup(); - if (r != 0) - return r; - - memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - kvm_ops_static_call_update(); - - if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) - kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr; - perf_register_guest_info_callbacks(&kvm_guest_cbs); - - if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - supported_xss = 0; - -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) - cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); -#undef __kvm_cpu_cap_has - - if (kvm_has_tsc_control) { - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated because it will always - * be 1 on all machines. - */ - u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; - - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; - } - - kvm_init_msr_list(); return 0; } void kvm_arch_hardware_unsetup(void) { - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - kvm_guest_cbs.handle_intel_pt_intr = NULL; - static_call(kvm_x86_hardware_unsetup)(); } int kvm_arch_check_processor_compat(void *opaque) @@ -11762,6 +11834,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); kvm->arch.guest_can_read_msr_platform_info = true; + kvm->arch.enable_pmu = enable_pmu; #if IS_ENABLED(CONFIG_HYPERV) spin_lock_init(&kvm->arch.hv_root_tdp_lock); @@ -12231,8 +12304,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && - kvm_x86_ops.guest_apic_has_interrupt && - static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); + static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) @@ -12259,6 +12331,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) static_call(kvm_x86_smi_allowed)(vcpu, false))) return true; + if (kvm_test_request(KVM_REQ_PMI, vcpu)) + return true; + if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || kvm_guest_apic_has_interrupt(vcpu))) @@ -12306,7 +12381,15 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return true; - return vcpu->arch.preempted_in_kernel; + if (vcpu != kvm_get_running_vcpu()) + return vcpu->arch.preempted_in_kernel; + + return static_call(kvm_x86_get_cpl)(vcpu) == 0; +} + +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return kvm_rip_read(vcpu); } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) @@ -12582,7 +12665,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - static_call_cond(kvm_x86_start_assignment)(kvm); + static_call_cond(kvm_x86_pi_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); @@ -12630,7 +12713,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, irqfd->producer = prod; 
kvm_arch_start_assignment(irqfd->kvm); - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) @@ -12655,7 +12738,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * in this case) doesn't want to receive the interrupts. */ - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); @@ -12666,7 +12749,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); + return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set); } bool kvm_vector_hashing_enabled(void) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4873b043944b6..6d2c43b00b394 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -106,6 +106,11 @@ static inline unsigned int __shrink_ple_window(unsigned int val, void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); +static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.last_vmentry_cpu != -1; +} + static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { vcpu->arch.exception.pending = false; @@ -332,6 +337,7 @@ extern u64 host_xcr0; extern u64 supported_xcr0; extern u64 host_xss; extern u64 supported_xss; +extern bool enable_pmu; static inline bool kvm_mpx_supported(void) { @@ -389,8 +395,6 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } -DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); - static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) { return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; @@ -398,14 +402,18 @@ static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) { - __this_cpu_write(current_vcpu, vcpu); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 1); } static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { - __this_cpu_write(current_vcpu, NULL); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); } +static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu) +{ + return !!vcpu->arch.handling_intr_from_guest; +} static inline bool kvm_pat_valid(u64 data) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e2423ffaf593..a45e535e832bf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1264,6 +1264,16 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) } #endif +#ifdef CONFIG_GUEST_PERF_EVENTS +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void kvm_unregister_perf_callbacks(void); +#else +static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_unregister_perf_callbacks(void) {} +#endif /* CONFIG_GUEST_PERF_EVENTS */ + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); diff --git a/include/uapi/linux/kvm.h
b/include/uapi/linux/kvm.h index 1b6e80914a7b5..8f924848a14f9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1122,6 +1122,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SYS_ATTRIBUTES 209 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #define KVM_CAP_X86_NOTIFY_VMEXIT 219 +#define KVM_CAP_PMU_CAPABILITY 212 #ifdef KVM_CAP_IRQ_ROUTING @@ -1947,6 +1948,8 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +#define KVM_PMU_CAP_DISABLE (1 << 0) + /** * struct kvm_stats_header - Header of per vm/vcpu binary statistics data. * @flags: Some extra information for header, always 0 for now. diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 7eafe88dcea77..2d8705dc03627 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1115,6 +1115,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_S390_MEM_OP_EXTENSION 211 +#define KVM_CAP_PMU_CAPABILITY 212 #ifdef KVM_CAP_IRQ_ROUTING @@ -1949,6 +1951,8 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +#define KVM_PMU_CAP_DISABLE (1 << 0) + /** * struct kvm_stats_header - Header of per vm/vcpu binary statistics data. * @flags: Some extra information for header, always 0 for now. diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2a5a924b32d5a..a113bcc4c9c60 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5611,6 +5611,50 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) return &kvm_running_vcpu; } +#ifdef CONFIG_GUEST_PERF_EVENTS +static unsigned int kvm_guest_state(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + unsigned int state; + + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; + + state = PERF_GUEST_ACTIVE; + if (!kvm_arch_vcpu_in_kernel(vcpu)) + state |= PERF_GUEST_USER; + + return state; +} + +static unsigned long kvm_guest_get_ip(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ + if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu))) + return 0; + + return kvm_arch_vcpu_get_ip(vcpu); +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .state = kvm_guest_state, + .get_ip = kvm_guest_get_ip, + .handle_intel_pt_intr = NULL, +}; + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) +{ + kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; + perf_register_guest_info_callbacks(&kvm_guest_cbs); +} +void kvm_unregister_perf_callbacks(void) +{ + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); +} +#endif + struct kvm_cpu_compat_check { void *opaque; int *ret;