From 15d6a51ea37836502fcc92640f57410d93af643b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2023 05:58:03 -0700 Subject: [PATCH 01/89] perf/x86/intel: Factor out the initialization code for ADL e-core commit d87d221f854b62f5e8026505497d33404ef6050c upstream. From PMU's perspective, the ADL e-core and newer SRF/GRR have a similar uarch. Most of the initialization code can be shared. Factor out intel_pmu_init_grt() for the common initialization code. The common part of the ADL e-core will be replaced by the later patch. Intel-SIG: commit d87d221f854b perf/x86/intel: Factor out the initialization code for ADL e-core Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230829125806.3016082-4-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 58 +++++++++++++----------------------- 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5b6d2899046c6..7443170f09e1d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6190,6 +6190,25 @@ static __always_inline void intel_pmu_init_glc(struct pmu *pmu) hybrid(pmu, pebs_constraints) = intel_glc_pebs_event_constraints; } +static __always_inline void intel_pmu_init_grt(struct pmu *pmu) +{ + x86_pmu.mid_ack = true; + x86_pmu.limit_period = glc_limit_period; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + + memcpy(hybrid_var(pmu, hw_cache_event_ids), glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hybrid_var(pmu, hw_cache_extra_regs), tnt_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + hybrid_var(pmu, hw_cache_event_ids)[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + hybrid(pmu, event_constraints) = intel_slm_event_constraints; + hybrid(pmu, pebs_constraints) = intel_grt_pebs_event_constraints; + hybrid(pmu, extra_regs) = intel_grt_extra_regs; +} + __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; @@ -6468,28 +6487,10 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GRACEMONT: - x86_pmu.mid_ack = true; - memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints; - x86_pmu.extra_regs = intel_grt_extra_regs; - - x86_pmu.pebs_aliases = NULL; - x86_pmu.pebs_prec_dist = true; - x86_pmu.pebs_block = true; - x86_pmu.lbr_pt_coexist = true; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_INSTR_LATENCY; - + intel_pmu_init_grt(NULL); intel_pmu_pebs_data_source_grt(); x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.get_event_constraints = tnt_get_event_constraints; - x86_pmu.limit_period = glc_limit_period; td_attr = tnt_events_attrs; mem_attr = grt_mem_attrs; extra_attr = nhm_format_attr; @@ -6499,28 +6500,11 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_CRESTMONT: case INTEL_FAM6_ATOM_CRESTMONT_X: - x86_pmu.mid_ack = true; - memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints; + intel_pmu_init_grt(NULL); x86_pmu.extra_regs = intel_cmt_extra_regs; - - x86_pmu.pebs_aliases = NULL; - x86_pmu.pebs_prec_dist = true; - x86_pmu.lbr_pt_coexist = true; - x86_pmu.pebs_block = true; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_INSTR_LATENCY; - intel_pmu_pebs_data_source_cmt(); x86_pmu.pebs_latency_data = mtl_latency_data_small; x86_pmu.get_event_constraints = cmt_get_event_constraints; - x86_pmu.limit_period = glc_limit_period; td_attr = cmt_events_attrs; mem_attr = grt_mem_attrs; extra_attr = cmt_format_attr; From 5d749e61fd00663d3641c73dc02d62095087a186 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2023 05:58:04 -0700 Subject: [PATCH 02/89] perf/x86/intel: Apply the common initialization code for ADL commit 299a5fc8e783eed705015e83e381912dbbf3eabc upstream. Use the intel_pmu_init_glc() and intel_pmu_init_grt() to replace the duplicate code for ADL. The current code already checks the PERF_X86_EVENT_TOPDOWN flag before invoking the Topdown metrics functions. (The PERF_X86_EVENT_TOPDOWN flag is to indicate the Topdown metric feature, which is only available for the p-core.) Drop the unnecessary adl_set_topdown_event_period() and adl_update_topdown_event(). Intel-SIG: commit 299a5fc8e783 perf/x86/intel: Apply the common initialization code for ADL Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230829125806.3016082-5-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 53 ++---------------------------------- 1 file changed, 2 insertions(+), 51 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7443170f09e1d..a245a9b218f32 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2561,16 +2561,6 @@ static int icl_set_topdown_event_period(struct perf_event *event) return 0; } -static int adl_set_topdown_event_period(struct perf_event *event) -{ - struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - - if (pmu->cpu_type != hybrid_big) - return 0; - - return icl_set_topdown_event_period(event); -} - DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period); static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) @@ -2713,16 +2703,6 @@ static u64 icl_update_topdown_event(struct perf_event *event) x86_pmu.num_topdown_events - 1); } -static u64 adl_update_topdown_event(struct perf_event *event) -{ - struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - - if (pmu->cpu_type != hybrid_big) - return 0; - - return icl_update_topdown_event(event); -} - DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); static void intel_pmu_read_event(struct perf_event *event) @@ -6867,32 +6847,11 @@ __init int intel_pmu_init(void) static_branch_enable(&perf_is_hybrid); x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS; - x86_pmu.pebs_aliases = NULL; - x86_pmu.pebs_prec_dist = true; - x86_pmu.pebs_block = true; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.flags |= PMU_FL_INSTR_LATENCY; - x86_pmu.lbr_pt_coexist = true; x86_pmu.pebs_latency_data = adl_latency_data_small; - x86_pmu.num_topdown_events = 8; - static_call_update(intel_pmu_update_topdown_event, - &adl_update_topdown_event); - static_call_update(intel_pmu_set_topdown_event_period, - &adl_set_topdown_event_period); - x86_pmu.filter = intel_pmu_filter; x86_pmu.get_event_constraints = adl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; - x86_pmu.limit_period = glc_limit_period; x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type; - /* - * The rtm_abort_event is used to check whether to enable GPRs - * for the RTM abort event. Atom doesn't have the RTM abort - * event. There is no harmful to set it in the common - * x86_pmu.rtm_abort_event. - */ - x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); td_attr = adl_hybrid_events_attrs; mem_attr = adl_hybrid_mem_attrs; @@ -6904,6 +6863,7 @@ __init int intel_pmu_init(void) pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; pmu->name = "cpu_core"; pmu->cpu_type = hybrid_big; + intel_pmu_init_glc(&pmu->pmu); pmu->late_ack = true; if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { pmu->num_counters = x86_pmu.num_counters + 2; @@ -6933,16 +6893,13 @@ __init int intel_pmu_init(void) pmu->intel_cap.perf_metrics = 1; pmu->intel_cap.pebs_output_pt_available = 0; - memcpy(pmu->hw_cache_event_ids, glc_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids)); - memcpy(pmu->hw_cache_extra_regs, glc_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs)); - pmu->event_constraints = intel_glc_event_constraints; - pmu->pebs_constraints = intel_glc_pebs_event_constraints; pmu->extra_regs = intel_glc_extra_regs; /* Initialize Atom core specific PerfMon capabilities.*/ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; pmu->name = "cpu_atom"; pmu->cpu_type = hybrid_small; + intel_pmu_init_grt(&pmu->pmu); pmu->mid_ack = true; pmu->num_counters = x86_pmu.num_counters; pmu->num_counters_fixed = x86_pmu.num_counters_fixed; @@ -6954,12 +6911,6 @@ __init int intel_pmu_init(void) pmu->intel_cap.perf_metrics = 0; pmu->intel_cap.pebs_output_pt_available = 1; - memcpy(pmu->hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids)); - memcpy(pmu->hw_cache_extra_regs, tnt_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs)); - pmu->hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; - pmu->event_constraints = intel_slm_event_constraints; - pmu->pebs_constraints = intel_grt_pebs_event_constraints; - pmu->extra_regs = intel_grt_extra_regs; if (is_mtl(boot_cpu_data.x86_model)) { x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_rwc_extra_regs; x86_pmu.pebs_latency_data = mtl_latency_data_small; From 1e7dc12169dbfd055c8bd2e2bdfa014205e7af03 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2023 05:58:05 -0700 Subject: [PATCH 03/89] perf/x86/intel: Clean up the hybrid CPU type handling code commit b0560bfd4b70277a4936c82e50e940aa253c95bf upstream. There is a fairly long list of grievances about the current code. The main beefs: 1. hybrid_big_small assumes that the *HARDWARE* (CPUID) provided core types are a bitmap. They are not. If Intel happened to make a core type of 0xff, hilarity would ensue. 2. adl_get_hybrid_cpu_type() utterly inscrutable. There are precisely zero comments and zero changelog about what it is attempting to do. According to Kan, the adl_get_hybrid_cpu_type() is there because some Alder Lake (ADL) CPUs can do some silly things. Some ADL models are *supposed* to be hybrid CPUs with big and little cores, but there are some SKUs that only have big cores. CPUID(0x1a) on those CPUs does not say that the CPUs are big cores. It apparently just returns 0x0. It confuses perf because it expects to see either 0x40 (Core) or 0x20 (Atom). The perf workaround for this is to watch for a CPU core saying it is type 0x0. If that happens on an Alder Lake, it calls x86_pmu.get_hybrid_cpu_type() and just assumes that the core is a Core (0x40) CPU. To fix up the mess, separate out the CPU types and the 'pmu' types. This allows 'hybrid_pmu_type' bitmaps without worrying that some future CPU type will set multiple bits. Since the types are now separate, add a function to glue them back together again. Actual comment on the situation in the glue function (find_hybrid_pmu_for_cpu()). Also, give ->get_hybrid_cpu_type() a real return type and make it clear that it is overriding the *CPU* type, not the PMU type. Rename cpu_type to pmu_type in the struct x86_hybrid_pmu to reflect the change. Intel-SIG: commit b0560bfd4b70 perf/x86/intel: Clean up the hybrid CPU type handling code Backport CWF PMU support and dependency Originally-by: Dave Hansen Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230829125806.3016082-6-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun [jz: fix pmu->cpu_type in hybrid_td_is_visible() due to difference between 6.6 stable branch and vanilla upstream] Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 6 +-- arch/x86/events/intel/core.c | 71 ++++++++++++++++++++++++------------ arch/x86/events/intel/ds.c | 2 +- arch/x86/events/perf_event.h | 35 +++++++++++------- 4 files changed, 73 insertions(+), 41 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 271a5f894bea5..26f83ad477260 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1890,9 +1890,9 @@ ssize_t events_hybrid_sysfs_show(struct device *dev, str = pmu_attr->event_str; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type)) + if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type)) continue; - if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) { + if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) { next_str = strchr(str, ';'); if (next_str) return snprintf(page, next_str - str + 1, "%s", str); @@ -2172,7 +2172,7 @@ static int __init init_hw_perf_events(void) hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, - (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1); + (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1); if (err) break; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a245a9b218f32..656adc9413ed0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3869,7 +3869,7 @@ static inline bool require_mem_loads_aux_event(struct perf_event *event) return false; if (is_hybrid()) - return hybrid_pmu(event->pmu)->cpu_type == hybrid_big; + return hybrid_pmu(event->pmu)->pmu_type == hybrid_big; return true; } @@ -4506,9 +4506,9 @@ adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - if (pmu->cpu_type == hybrid_big) + if (pmu->pmu_type == hybrid_big) return glc_get_event_constraints(cpuc, idx, event); - else if (pmu->cpu_type == hybrid_small) + else if (pmu->pmu_type == hybrid_small) return tnt_get_event_constraints(cpuc, idx, event); WARN_ON(1); @@ -4583,9 +4583,9 @@ mtl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - if (pmu->cpu_type == hybrid_big) + if (pmu->pmu_type == hybrid_big) return rwc_get_event_constraints(cpuc, idx, event); - if (pmu->cpu_type == hybrid_small) + if (pmu->pmu_type == hybrid_small) return cmt_get_event_constraints(cpuc, idx, event); WARN_ON(1); @@ -4596,18 +4596,18 @@ static int adl_hw_config(struct perf_event *event) { struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - if (pmu->cpu_type == hybrid_big) + if (pmu->pmu_type == hybrid_big) return hsw_hw_config(event); - else if (pmu->cpu_type == hybrid_small) + else if (pmu->pmu_type == hybrid_small) return intel_pmu_hw_config(event); WARN_ON(1); return -EOPNOTSUPP; } -static u8 adl_get_hybrid_cpu_type(void) +static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) { - return hybrid_big; + return HYBRID_INTEL_CORE; } static inline bool erratum_hsw11(struct perf_event *event) @@ -4804,22 +4804,47 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) } } -static bool init_hybrid_pmu(int cpu) +static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) { - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); u8 cpu_type = get_this_hybrid_cpu_type(); - struct x86_hybrid_pmu *pmu = NULL; int i; - if (!cpu_type && x86_pmu.get_hybrid_cpu_type) - cpu_type = x86_pmu.get_hybrid_cpu_type(); + /* + * This is running on a CPU model that is known to have hybrid + * configurations. But the CPU told us it is not hybrid, shame + * on it. There should be a fixup function provided for these + * troublesome CPUs (->get_hybrid_cpu_type). + */ + if (cpu_type == HYBRID_INTEL_NONE) { + if (x86_pmu.get_hybrid_cpu_type) + cpu_type = x86_pmu.get_hybrid_cpu_type(); + else + return NULL; + } + /* + * This essentially just maps between the 'hybrid_cpu_type' + * and 'hybrid_pmu_type' enums: + */ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - if (x86_pmu.hybrid_pmu[i].cpu_type == cpu_type) { - pmu = &x86_pmu.hybrid_pmu[i]; - break; - } + enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type; + + if (cpu_type == HYBRID_INTEL_CORE && + pmu_type == hybrid_big) + return &x86_pmu.hybrid_pmu[i]; + if (cpu_type == HYBRID_INTEL_ATOM && + pmu_type == hybrid_small) + return &x86_pmu.hybrid_pmu[i]; } + + return NULL; +} + +static bool init_hybrid_pmu(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct x86_hybrid_pmu *pmu = find_hybrid_pmu_for_cpu(); + if (WARN_ON_ONCE(!pmu || (pmu->pmu.type == -1))) { cpuc->pmu = NULL; return false; @@ -5911,7 +5936,7 @@ static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr) struct perf_pmu_events_hybrid_attr *pmu_attr = container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr); - return pmu->cpu_type & pmu_attr->pmu_type; + return pmu->pmu_type & pmu_attr->pmu_type; } static umode_t hybrid_events_is_visible(struct kobject *kobj, @@ -5948,7 +5973,7 @@ static umode_t hybrid_format_is_visible(struct kobject *kobj, container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr); int cpu = hybrid_find_supported_cpu(pmu); - return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0; + return (cpu >= 0) && (pmu->pmu_type & pmu_attr->pmu_type) ? attr->mode : 0; } static umode_t hybrid_td_is_visible(struct kobject *kobj, @@ -5963,7 +5988,7 @@ static umode_t hybrid_td_is_visible(struct kobject *kobj, /* Only the big core supports perf metrics */ - if (pmu->cpu_type == hybrid_big) + if (pmu->pmu_type == hybrid_big) return pmu->intel_cap.perf_metrics ? attr->mode : 0; return attr->mode; @@ -6862,7 +6887,7 @@ __init int intel_pmu_init(void) /* Initialize big core specific PerfMon capabilities.*/ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; pmu->name = "cpu_core"; - pmu->cpu_type = hybrid_big; + pmu->pmu_type = hybrid_big; intel_pmu_init_glc(&pmu->pmu); pmu->late_ack = true; if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { @@ -6898,7 +6923,7 @@ __init int intel_pmu_init(void) /* Initialize Atom core specific PerfMon capabilities.*/ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; pmu->name = "cpu_atom"; - pmu->cpu_type = hybrid_small; + pmu->pmu_type = hybrid_small; intel_pmu_init_grt(&pmu->pmu); pmu->mid_ack = true; pmu->num_counters = x86_pmu.num_counters; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 3ffe8dd9abfd2..33a0cba6f241f 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -261,7 +261,7 @@ static u64 __adl_latency_data_small(struct perf_event *event, u64 status, { u64 val; - WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big); + WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big); dse &= PERF_PEBS_DATA_SOURCE_MASK; val = hybrid_var(event->pmu, pebs_data_source)[dse]; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 8a9d0ba5b0bac..79ed3ef2dda1e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -658,10 +658,29 @@ enum { #define PERF_PEBS_DATA_SOURCE_MAX 0x10 #define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1) +enum hybrid_cpu_type { + HYBRID_INTEL_NONE, + HYBRID_INTEL_ATOM = 0x20, + HYBRID_INTEL_CORE = 0x40, +}; + +enum hybrid_pmu_type { + not_hybrid, + hybrid_small = BIT(0), + hybrid_big = BIT(1), + + hybrid_big_small = hybrid_big | hybrid_small, /* only used for matching */ +}; + +#define X86_HYBRID_PMU_ATOM_IDX 0 +#define X86_HYBRID_PMU_CORE_IDX 1 + +#define X86_HYBRID_NUM_PMUS 2 + struct x86_hybrid_pmu { struct pmu pmu; const char *name; - u8 cpu_type; + enum hybrid_pmu_type pmu_type; cpumask_t supported_cpus; union perf_capabilities intel_cap; u64 intel_ctrl; @@ -727,18 +746,6 @@ extern struct static_key_false perf_is_hybrid; __Fp; \ }) -enum hybrid_pmu_type { - hybrid_big = 0x40, - hybrid_small = 0x20, - - hybrid_big_small = hybrid_big | hybrid_small, -}; - -#define X86_HYBRID_PMU_ATOM_IDX 0 -#define X86_HYBRID_PMU_CORE_IDX 1 - -#define X86_HYBRID_NUM_PMUS 2 - /* * struct x86_pmu - generic x86 pmu */ @@ -947,7 +954,7 @@ struct x86_pmu { */ int num_hybrid_pmus; struct x86_hybrid_pmu *hybrid_pmu; - u8 (*get_hybrid_cpu_type) (void); + enum hybrid_cpu_type (*get_hybrid_cpu_type) (void); }; struct x86_perf_task_context_opt { From 1c16f24cbc5c1a329bd509423882bf2d98c2a400 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2023 05:58:06 -0700 Subject: [PATCH 04/89] perf/x86/intel: Add common intel_pmu_init_hybrid() commit 97588df87b56e27fd2b5d928d61c7a53e38afbb0 upstream. The current hybrid initialization codes aren't well organized and are hard to read. Factor out intel_pmu_init_hybrid() to do a common setup for each hybrid PMU. The PMU-specific capability will be updated later via either hard code (ADL) or CPUID hybrid enumeration (MTL). Splitting the ADL and MTL initialization codes, since they have different uarches. The hard code PMU capabilities are not required for MTL either. They can be enumerated by the new leaf 0x23 and IA32_PERF_CAPABILITIES MSR. The hybrid enumeration of the IA32_PERF_CAPABILITIES MSR is broken on MTL. Using the default value. Intel-SIG: commit 97588df87b56 perf/x86/intel: Add common intel_pmu_init_hybrid() Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230829125806.3016082-7-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun [jz: resolve conflicts in update_pmu_cap() due to following commit 47a973fd7563 ("perf/x86/intel: Fix ARCH_PERFMON_NUM_COUNTER_LEAF") already been backported by stable branch] Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 161 ++++++++++++++++++++++++----------- 1 file changed, 110 insertions(+), 51 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 656adc9413ed0..7af64c966c141 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4786,6 +4786,16 @@ static void intel_pmu_check_num_counters(int *num_counters, int *num_counters_fixed, u64 *intel_ctrl, u64 fixed_mask); +static inline bool intel_pmu_broken_perf_cap(void) +{ + /* The Perf Metric (Bit 15) is always cleared */ + if ((boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE) || + (boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE_L)) + return true; + + return false; +} + static void update_pmu_cap(struct x86_hybrid_pmu *pmu) { unsigned int cntr, fixed_cntr, ecx, edx; @@ -4801,7 +4811,26 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) pmu->num_counters_fixed = fls(fixed_cntr); intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed, &pmu->intel_ctrl, fixed_cntr); + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, + 0, pmu->num_counters, 0, 0); + } + + if (!intel_pmu_broken_perf_cap()) { + /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ + rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); } + + if (pmu->intel_cap.perf_metrics) + pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + else + pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + + if (pmu->intel_cap.pebs_output_pt_available) + pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + else + pmu->pmu.capabilities |= ~PERF_PMU_CAP_AUX_OUTPUT; } static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) @@ -6165,10 +6194,52 @@ static void intel_pmu_check_hybrid_pmus(u64 fixed_mask) } } -static __always_inline bool is_mtl(u8 x86_model) +static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = { + { hybrid_small, "cpu_atom" }, + { hybrid_big, "cpu_core" }, +}; + +static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) { - return (x86_model == INTEL_FAM6_METEORLAKE) || - (x86_model == INTEL_FAM6_METEORLAKE_L); + unsigned long pmus_mask = pmus; + struct x86_hybrid_pmu *pmu; + int idx = 0, bit; + + x86_pmu.num_hybrid_pmus = hweight_long(pmus_mask); + x86_pmu.hybrid_pmu = kcalloc(x86_pmu.num_hybrid_pmus, + sizeof(struct x86_hybrid_pmu), + GFP_KERNEL); + if (!x86_pmu.hybrid_pmu) + return -ENOMEM; + + static_branch_enable(&perf_is_hybrid); + x86_pmu.filter = intel_pmu_filter; + + for_each_set_bit(bit, &pmus_mask, ARRAY_SIZE(intel_hybrid_pmu_type_map)) { + pmu = &x86_pmu.hybrid_pmu[idx++]; + pmu->pmu_type = intel_hybrid_pmu_type_map[bit].id; + pmu->name = intel_hybrid_pmu_type_map[bit].name; + + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, + 0, pmu->num_counters, 0, 0); + + pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; + if (pmu->pmu_type & hybrid_small) { + pmu->intel_cap.perf_metrics = 0; + pmu->intel_cap.pebs_output_pt_available = 1; + pmu->mid_ack = true; + } else if (pmu->pmu_type & hybrid_big) { + pmu->intel_cap.perf_metrics = 1; + pmu->intel_cap.pebs_output_pt_available = 0; + pmu->late_ack = true; + } + } + + return 0; } static __always_inline void intel_pmu_init_glc(struct pmu *pmu) @@ -6857,23 +6928,14 @@ __init int intel_pmu_init(void) case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: case INTEL_FAM6_RAPTORLAKE_S: - case INTEL_FAM6_METEORLAKE: - case INTEL_FAM6_METEORLAKE_L: /* * Alder Lake has 2 types of CPU, core and atom. * * Initialize the common PerfMon capabilities here. */ - x86_pmu.hybrid_pmu = kcalloc(X86_HYBRID_NUM_PMUS, - sizeof(struct x86_hybrid_pmu), - GFP_KERNEL); - if (!x86_pmu.hybrid_pmu) - return -ENOMEM; - static_branch_enable(&perf_is_hybrid); - x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS; + intel_pmu_init_hybrid(hybrid_big_small); x86_pmu.pebs_latency_data = adl_latency_data_small; - x86_pmu.filter = intel_pmu_filter; x86_pmu.get_event_constraints = adl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type; @@ -6886,10 +6948,7 @@ __init int intel_pmu_init(void) /* Initialize big core specific PerfMon capabilities.*/ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; - pmu->name = "cpu_core"; - pmu->pmu_type = hybrid_big; intel_pmu_init_glc(&pmu->pmu); - pmu->late_ack = true; if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { pmu->num_counters = x86_pmu.num_counters + 2; pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1; @@ -6914,45 +6973,45 @@ __init int intel_pmu_init(void) pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, 0, pmu->num_counters, 0, 0); - pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; - pmu->intel_cap.perf_metrics = 1; - pmu->intel_cap.pebs_output_pt_available = 0; - pmu->extra_regs = intel_glc_extra_regs; /* Initialize Atom core specific PerfMon capabilities.*/ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; - pmu->name = "cpu_atom"; - pmu->pmu_type = hybrid_small; intel_pmu_init_grt(&pmu->pmu); - pmu->mid_ack = true; - pmu->num_counters = x86_pmu.num_counters; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed; - pmu->max_pebs_events = x86_pmu.max_pebs_events; - pmu->unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, - 0, pmu->num_counters, 0, 0); - pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; - pmu->intel_cap.perf_metrics = 0; - pmu->intel_cap.pebs_output_pt_available = 1; - - if (is_mtl(boot_cpu_data.x86_model)) { - x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_rwc_extra_regs; - x86_pmu.pebs_latency_data = mtl_latency_data_small; - extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? - mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; - mem_attr = mtl_hybrid_mem_attrs; - intel_pmu_pebs_data_source_mtl(); - x86_pmu.get_event_constraints = mtl_get_event_constraints; - pmu->extra_regs = intel_cmt_extra_regs; - pr_cont("Meteorlake Hybrid events, "); - name = "meteorlake_hybrid"; - } else { - x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; - intel_pmu_pebs_data_source_adl(); - pr_cont("Alderlake Hybrid events, "); - name = "alderlake_hybrid"; - } + + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + intel_pmu_pebs_data_source_adl(); + pr_cont("Alderlake Hybrid events, "); + name = "alderlake_hybrid"; + break; + + case INTEL_FAM6_METEORLAKE: + case INTEL_FAM6_METEORLAKE_L: + intel_pmu_init_hybrid(hybrid_big_small); + + x86_pmu.pebs_latency_data = mtl_latency_data_small; + x86_pmu.get_event_constraints = mtl_get_event_constraints; + x86_pmu.hw_config = adl_hw_config; + + td_attr = adl_hybrid_events_attrs; + mem_attr = mtl_hybrid_mem_attrs; + tsx_attr = adl_hybrid_tsx_attrs; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; + + /* Initialize big core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; + intel_pmu_init_glc(&pmu->pmu); + pmu->extra_regs = intel_rwc_extra_regs; + + /* Initialize Atom core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; + intel_pmu_init_grt(&pmu->pmu); + pmu->extra_regs = intel_cmt_extra_regs; + + intel_pmu_pebs_data_source_mtl(); + pr_cont("Meteorlake Hybrid events, "); + name = "meteorlake_hybrid"; break; default: @@ -7064,7 +7123,7 @@ __init int intel_pmu_init(void) if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; - if (is_hybrid()) + if (is_hybrid() && !boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) intel_pmu_check_hybrid_pmus((u64)fixed_mask); if (x86_pmu.intel_cap.pebs_timing_info) From a3798d18704901ce8d8d68fa974db44111165f35 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 11 Sep 2023 06:51:28 -0700 Subject: [PATCH 05/89] perf/x86/intel: Fix broken fixed event constraints extension commit 950ecdc672aec9cd29036b2e2535b07c103af494 upstream. Unnecessary multiplexing is triggered when running an "instructions" event on an MTL. perf stat -e cpu_core/instructions/,cpu_core/instructions/ -a sleep 1 Performance counter stats for 'system wide': 115,489,000 cpu_core/instructions/ (50.02%) 127,433,777 cpu_core/instructions/ (49.98%) 1.002294504 seconds time elapsed Linux architectural perf events, e.g., cycles and instructions, usually have dedicated fixed counters. These events also have equivalent events which can be used in the general-purpose counters. The counters are precious. In the intel_pmu_check_event_constraints(), perf check/extend the event constraints of these events. So these events can utilize both fixed counters and general-purpose counters. The following cleanup commit: 97588df87b56 ("perf/x86/intel: Add common intel_pmu_init_hybrid()") forgot adding the intel_pmu_check_event_constraints() into update_pmu_cap(). The architectural perf events cannot utilize the general-purpose counters. The code to check and update the counters, event constraints and extra_regs is the same among hybrid systems. Move intel_pmu_check_hybrid_pmus() to init_hybrid_pmu(), and emove the duplicate check in update_pmu_cap(). Intel-SIG: commit 950ecdc672ae perf/x86/intel: Fix broken fixed event constraints extension Backport CWF PMU support and dependency Fixes: 97588df87b56 ("perf/x86/intel: Add common intel_pmu_init_hybrid()") Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230911135128.2322833-1-kan.liang@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun [jz: resolve context conflicts in update_pmu_cap() due to following commit 47a973fd7563 ("perf/x86/intel: Fix ARCH_PERFMON_NUM_COUNTER_LEAF") already been backported by stable branch] Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 65 +++++++++++++++--------------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7af64c966c141..ad89186ef3688 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4786,6 +4786,13 @@ static void intel_pmu_check_num_counters(int *num_counters, int *num_counters_fixed, u64 *intel_ctrl, u64 fixed_mask); +static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints, + int num_counters, + int num_counters_fixed, + u64 intel_ctrl); + +static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs); + static inline bool intel_pmu_broken_perf_cap(void) { /* The Perf Metric (Bit 15) is always cleared */ @@ -4809,18 +4816,22 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) &cntr, &fixed_cntr, &ecx, &edx); pmu->num_counters = fls(cntr); pmu->num_counters_fixed = fls(fixed_cntr); - intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed, - &pmu->intel_ctrl, fixed_cntr); - pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); - pmu->unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, - 0, pmu->num_counters, 0, 0); } if (!intel_pmu_broken_perf_cap()) { /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); } +} + +static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) +{ + intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed, + &pmu->intel_ctrl, (1ULL << pmu->num_counters_fixed) - 1); + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, + 0, pmu->num_counters, 0, 0); if (pmu->intel_cap.perf_metrics) pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; @@ -4831,6 +4842,13 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; else pmu->pmu.capabilities |= ~PERF_PMU_CAP_AUX_OUTPUT; + + intel_pmu_check_event_constraints(pmu->event_constraints, + pmu->num_counters, + pmu->num_counters_fixed, + pmu->intel_ctrl); + + intel_pmu_check_extra_regs(pmu->extra_regs); } static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) @@ -4886,6 +4904,8 @@ static bool init_hybrid_pmu(int cpu) if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) update_pmu_cap(pmu); + intel_pmu_check_hybrid_pmus(pmu); + if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed)) return false; @@ -6164,36 +6184,6 @@ static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs) } } -static void intel_pmu_check_hybrid_pmus(u64 fixed_mask) -{ - struct x86_hybrid_pmu *pmu; - int i; - - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - pmu = &x86_pmu.hybrid_pmu[i]; - - intel_pmu_check_num_counters(&pmu->num_counters, - &pmu->num_counters_fixed, - &pmu->intel_ctrl, - fixed_mask); - - if (pmu->intel_cap.perf_metrics) { - pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; - pmu->intel_ctrl |= INTEL_PMC_MSK_FIXED_SLOTS; - } - - if (pmu->intel_cap.pebs_output_pt_available) - pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; - - intel_pmu_check_event_constraints(pmu->event_constraints, - pmu->num_counters, - pmu->num_counters_fixed, - pmu->intel_ctrl); - - intel_pmu_check_extra_regs(pmu->extra_regs); - } -} - static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = { { hybrid_small, "cpu_atom" }, { hybrid_big, "cpu_core" }, @@ -7123,9 +7113,6 @@ __init int intel_pmu_init(void) if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; - if (is_hybrid() && !boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) - intel_pmu_check_hybrid_pmus((u64)fixed_mask); - if (x86_pmu.intel_cap.pebs_timing_info) x86_pmu.flags |= PMU_FL_RETIRE_LATENCY; From 87181be9ac1ee066912408d55895e86add8f9a46 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:24 -0700 Subject: [PATCH 06/89] tools headers UAPI: Sync include/uapi/linux/perf_event.h header with the kernel commit 76db7aab1fca6688ddf9f388157521c442e0ffb8 upstream. Sync the new sample type for the branch counters feature. Intel-SIG: commit 76db7aab1fca tools headers UAPI: Sync include/uapi/linux/perf_event.h header with the kernel Backport CWF PMU support and dependency Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tinghao Zhang Link: https://lore.kernel.org/r/20231025201626.3000228-6-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Jason Zeng --- tools/include/uapi/linux/perf_event.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 39c6a250dd1b9..3a64499b0f5d6 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_COUNTERS_SHIFT = 19, /* save occurrences of events on a branch */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -235,6 +237,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -982,6 +986,12 @@ enum perf_event_type { * { u64 nr; * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX * { u64 from, to, flags } lbr[nr]; + * # + * # The format of the counters is decided by the + * # "branch_counter_nr" and "branch_counter_width", + * # which are defined in the ABI. + * # + * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi @@ -1427,6 +1437,9 @@ struct perf_branch_entry { reserved:31; }; +/* Size of used info bits in struct perf_branch_entry */ +#define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 + union perf_sample_weight { __u64 full; #if defined(__LITTLE_ENDIAN_BITFIELD) From c781572dfb60024657787ff268806885f3734c3d Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 21 Nov 2023 09:46:28 +0800 Subject: [PATCH 07/89] perf/x86/intel: Correct incorrect 'or' operation for PMU capabilities commit e8df9d9f4209c04161321d8c12640ae560f65939 upstream. When running perf-stat command on Intel hybrid platform, perf-stat reports the following errors: sudo taskset -c 7 ./perf stat -vvvv -e cpu_atom/instructions/ sleep 1 Opening: cpu/cycles/:HG ------------------------------------------------------------ perf_event_attr: type 0 (PERF_TYPE_HARDWARE) config 0xa00000000 disabled 1 ------------------------------------------------------------ sys_perf_event_open: pid 0 cpu -1 group_fd -1 flags 0x8 sys_perf_event_open failed, error -16 Performance counter stats for 'sleep 1': cpu_atom/instructions/ It looks the cpu_atom/instructions/ event can't be enabled on atom PMU even when the process is pinned on atom core. Investigation shows that exclusive_event_init() helper always returns -EBUSY error in the perf event creation. That's strange since the atom PMU should not be an exclusive PMU. Further investigation shows the issue was introduced by commit: 97588df87b56 ("perf/x86/intel: Add common intel_pmu_init_hybrid()") The commit originally intents to clear the bit PERF_PMU_CAP_AUX_OUTPUT from PMU capabilities if intel_cap.pebs_output_pt_available is not set, but it incorrectly uses 'or' operation and leads to all PMU capabilities bits are set to 1 except bit PERF_PMU_CAP_AUX_OUTPUT. Testing this fix on Intel hybrid platforms, the observed issues disappear. Intel-SIG: commit e8df9d9f4209 perf/x86/intel: Correct incorrect 'or' operation for PMU capabilities Backport CWF PMU support and dependency Fixes: 97588df87b56 ("perf/x86/intel: Add common intel_pmu_init_hybrid()") Signed-off-by: Dapeng Mi Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20231121014628.729989-1-dapeng1.mi@linux.intel.com [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ad89186ef3688..a5046c19c7ef2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4841,7 +4841,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) if (pmu->intel_cap.pebs_output_pt_available) pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; else - pmu->pmu.capabilities |= ~PERF_PMU_CAP_AUX_OUTPUT; + pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT; intel_pmu_check_event_constraints(pmu->event_constraints, pmu->num_counters, From 52e0519e1aed01ca54485d69b3e940882d2029c8 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 8 Apr 2024 11:51:39 +0800 Subject: [PATCH 08/89] powercap: intel_rapl: Sort header files commit 72b8b94155d957f82697802555d53c142d82dece upstream. Sort header files alphabetically. Intel-SIG: commit 72b8b94155d9 powercap: intel_rapl: Sort header files Backport CWF PMU support and dependency Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Jason Zeng --- drivers/powercap/intel_rapl_common.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index f1de4111e98d9..e98e56a7cb1ed 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -5,27 +5,27 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include +#include +#include +#include +#include #include -#include #include -#include -#include -#include #include -#include -#include -#include -#include +#include +#include #include -#include -#include #include -#include +#include +#include +#include +#include -#include #include #include +#include /* bitmasks for RAPL MSRs, used by primitive access functions */ #define ENERGY_STATUS_MASK 0xffffffff From cf5f8dd0bf5689f8bddd7370700ccf8929161665 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 28 Apr 2024 17:24:26 +0800 Subject: [PATCH 09/89] powercap: intel_rapl: Introduce APIs for PMU support commit 575024a8aa7cf1dff49b94092f774ed1c90586be upstream. Introduce two new APIs rapl_package_add_pmu()/rapl_package_remove_pmu(). RAPL driver can invoke these APIs to expose its supported energy counters via perf PMU. The new RAPL PMU is fully compatible with current MSR RAPL PMU, including using the same PMU name and events name/id/unit/scale, etc. For example, use below command perf stat -e power/energy-pkg/ -e power/energy-ram/ FOO to get the energy consumption if power/energy-pkg/ and power/energy-ram/ events are available in the "perf list" output. This does not introduce any conflict because TPMI RAPL is the only user of these APIs currently, and it never co-exists with MSR RAPL. Note that RAPL Packages can be probed/removed dynamically, and the events supported by each TPMI RAPL device can be different. Thus the RAPL PMU support is done on demand, which means 1. PMU is registered only if it is needed by a RAPL Package. PMU events for unsupported counters are not exposed. 2. PMU is unregistered and registered when a new RAPL Package is probed and supports new counters that are not supported by current PMU. For example, on a dual-package system using TPMI RAPL, it is possible that Package 1 behaves as TPMI domain root and supports Psys domain. In this case, register PMU without Psys event when probing Package 0, and re-register the PMU with Psys event when probing Package 1. 3. PMU is unregistered when all registered RAPL Packages don't need PMU. Intel-SIG: commit 575024a8aa7c powercap: intel_rapl: Introduce APIs for PMU support Backport CWF PMU support and dependency Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Jason Zeng --- drivers/powercap/intel_rapl_common.c | 582 +++++++++++++++++++++++++++ include/linux/intel_rapl.h | 32 ++ 2 files changed, 614 insertions(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index e98e56a7cb1ed..3350054632f1e 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -1505,6 +1507,586 @@ static int rapl_detect_domains(struct rapl_package *rp) return 0; } +#ifdef CONFIG_PERF_EVENTS + +/* + * Support for RAPL PMU + * + * Register a PMU if any of the registered RAPL Packages have the requirement + * of exposing its energy counters via Perf PMU. + * + * PMU Name: + * power + * + * Events: + * Name Event id RAPL Domain + * energy_cores 0x01 RAPL_DOMAIN_PP0 + * energy_pkg 0x02 RAPL_DOMAIN_PACKAGE + * energy_ram 0x03 RAPL_DOMAIN_DRAM + * energy_gpu 0x04 RAPL_DOMAIN_PP1 + * energy_psys 0x05 RAPL_DOMAIN_PLATFORM + * + * Unit: + * Joules + * + * Scale: + * 2.3283064365386962890625e-10 + * The same RAPL domain in different RAPL Packages may have different + * energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as + * the fixed unit for all energy counters, and covert each hardware + * counter increase to N times of PMU event counter increases. + * + * This is fully compatible with the current MSR RAPL PMU. This means that + * userspace programs like turbostat can use the same code to handle RAPL Perf + * PMU, no matter what RAPL Interface driver (MSR/TPMI, etc) is running + * underlying on the platform. + * + * Note that RAPL Packages can be probed/removed dynamically, and the events + * supported by each TPMI RAPL device can be different. Thus the RAPL PMU + * support is done on demand, which means + * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for + * unsupported counters are not exposed. + * 2. PMU is unregistered and registered when a new RAPL Package is probed and + * supports new counters that are not supported by current PMU. + * 3. PMU is unregistered when all registered RAPL Packages don't need PMU. + */ + +struct rapl_pmu { + struct pmu pmu; /* Perf PMU structure */ + u64 timer_ms; /* Maximum expiration time to avoid counter overflow */ + unsigned long domain_map; /* Events supported by current registered PMU */ + bool registered; /* Whether the PMU has been registered or not */ +}; + +static struct rapl_pmu rapl_pmu; + +/* PMU helpers */ + +static int get_pmu_cpu(struct rapl_package *rp) +{ + int cpu; + + if (!rp->has_pmu) + return nr_cpu_ids; + + /* Only TPMI RAPL is supported for now */ + if (rp->priv->type != RAPL_IF_TPMI) + return nr_cpu_ids; + + /* TPMI RAPL uses any CPU in the package for PMU */ + for_each_online_cpu(cpu) + if (topology_physical_package_id(cpu) == rp->id) + return cpu; + + return nr_cpu_ids; +} + +static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu) +{ + if (!rp->has_pmu) + return false; + + /* Only TPMI RAPL is supported for now */ + if (rp->priv->type != RAPL_IF_TPMI) + return false; + + /* TPMI RAPL uses any CPU in the package for PMU */ + return topology_physical_package_id(cpu) == rp->id; +} + +static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event) +{ + struct rapl_package *rp = event->pmu_private; + + return &rp->pmu_data; +} + +/* PMU event callbacks */ + +static u64 event_read_counter(struct perf_event *event) +{ + struct rapl_package *rp = event->pmu_private; + u64 val; + int ret; + + /* Return 0 for unsupported events */ + if (event->hw.idx < 0) + return 0; + + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val); + + /* Return 0 for failed read */ + if (ret) + return 0; + + return val; +} + +static void __rapl_pmu_event_start(struct perf_event *event) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &data->active_list); + + local64_set(&event->hw.prev_count, event_read_counter(event)); + if (++data->n_active == 1) + hrtimer_start(&data->hrtimer, data->timer_interval, + HRTIMER_MODE_REL_PINNED); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + __rapl_pmu_event_start(event); + raw_spin_unlock_irqrestore(&data->lock, flags); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + + /* + * Follow the generic code to drain hwc->prev_count. + * The loop is not expected to run for multiple times. + */ + prev_raw_count = local64_read(&hwc->prev_count); + do { + new_raw_count = event_read_counter(event); + } while (!local64_try_cmpxchg(&hwc->prev_count, + &prev_raw_count, new_raw_count)); + + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + */ + delta = new_raw_count - prev_raw_count; + + /* + * Scale delta to smallest unit (2^-32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + sdelta = delta * data->scale[event->hw.flags]; + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + + /* Mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(data->n_active <= 0); + if (--data->n_active == 0) + hrtimer_cancel(&data->hrtimer); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* Check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + raw_spin_unlock_irqrestore(&data->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(event); + + raw_spin_unlock_irqrestore(&data->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +/* RAPL PMU event ids, same as shown in sysfs */ +enum perf_rapl_events { + PERF_RAPL_PP0 = 1, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + PERF_RAPL_MAX +}; +#define RAPL_EVENT_MASK GENMASK(7, 0) + +static const int event_to_domain[PERF_RAPL_MAX] = { + [PERF_RAPL_PP0] = RAPL_DOMAIN_PP0, + [PERF_RAPL_PKG] = RAPL_DOMAIN_PACKAGE, + [PERF_RAPL_RAM] = RAPL_DOMAIN_DRAM, + [PERF_RAPL_PP1] = RAPL_DOMAIN_PP1, + [PERF_RAPL_PSYS] = RAPL_DOMAIN_PLATFORM, +}; + +static int rapl_pmu_event_init(struct perf_event *event) +{ + struct rapl_package *pos, *rp = NULL; + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int domain, idx; + + /* Only look at RAPL events */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Check for supported events only */ + if (!cfg || cfg >= PERF_RAPL_MAX) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + /* Find out which Package the event belongs to */ + list_for_each_entry(pos, &rapl_packages, plist) { + if (is_rp_pmu_cpu(pos, event->cpu)) { + rp = pos; + break; + } + } + if (!rp) + return -ENODEV; + + /* Find out which RAPL Domain the event belongs to */ + domain = event_to_domain[cfg]; + + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + event->pmu_private = rp; /* Which package */ + event->hw.flags = domain; /* Which domain */ + + event->hw.idx = -1; + /* Find out the index in rp->domains[] to get domain pointer */ + for (idx = 0; idx < rp->nr_domains; idx++) { + if (rp->domains[idx].id == domain) { + event->hw.idx = idx; + break; + } + } + + return 0; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_package_pmu_data *data = + container_of(hrtimer, struct rapl_package_pmu_data, hrtimer); + struct perf_event *event; + unsigned long flags; + + if (!data->n_active) + return HRTIMER_NORESTART; + + raw_spin_lock_irqsave(&data->lock, flags); + + list_for_each_entry(event, &data->active_list, active_entry) + rapl_event_update(event); + + raw_spin_unlock_irqrestore(&data->lock, flags); + + hrtimer_forward_now(hrtimer, data->timer_interval); + + return HRTIMER_RESTART; +} + +/* PMU sysfs attributes */ + +/* + * There are no default events, but we need to create "events" group (with + * empty attrs) before updating it with detected events. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group pmu_events_group = { + .name = "events", + .attrs = attrs_empty, +}; + +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rapl_package *rp; + cpumask_var_t cpu_mask; + int cpu; + int ret; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + cpus_read_lock(); + + cpumask_clear(cpu_mask); + + /* Choose a cpu for each RAPL Package */ + list_for_each_entry(rp, &rapl_packages, plist) { + cpu = get_pmu_cpu(rp); + if (cpu < nr_cpu_ids) + cpumask_set_cpu(cpu, cpu_mask); + } + cpus_read_unlock(); + + ret = cpumap_print_to_pagebuf(true, buf, cpu_mask); + + free_cpumask_var(cpu_mask); + + return ret; +} + +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static struct attribute_group pmu_cpumask_group = { + .attrs = pmu_cpumask_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); +static struct attribute *pmu_format_attr[] = { + &format_attr_event.attr, + NULL +}; + +static struct attribute_group pmu_format_group = { + .name = "format", + .attrs = pmu_format_attr, +}; + +static const struct attribute_group *pmu_attr_groups[] = { + &pmu_events_group, + &pmu_cpumask_group, + &pmu_format_group, + NULL +}; + +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .event_str = str, \ +} + +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02"); +RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03"); +RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04"); +RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); + +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores, "Joules"); +RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_unit_pkg, "Joules"); +RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_unit_ram, "Joules"); +RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_unit_gpu, "Joules"); +RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_unit_psys, "Joules"); + +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_scale_cores, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_scale_pkg, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_scale_ram, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_scale_gpu, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys, "2.3283064365386962890625e-10"); + +#define RAPL_EVENT_GROUP(_name, domain) \ +static struct attribute *pmu_attr_##_name[] = { \ + &event_attr_rapl_##_name.attr.attr, \ + &event_attr_rapl_unit_##_name.attr.attr, \ + &event_attr_rapl_scale_##_name.attr.attr, \ + NULL \ +}; \ +static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event) \ +{ \ + return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0; \ +} \ +static struct attribute_group pmu_group_##_name = { \ + .name = "events", \ + .attrs = pmu_attr_##_name, \ + .is_visible = is_visible_##_name, \ +} + +RAPL_EVENT_GROUP(cores, RAPL_DOMAIN_PP0); +RAPL_EVENT_GROUP(pkg, RAPL_DOMAIN_PACKAGE); +RAPL_EVENT_GROUP(ram, RAPL_DOMAIN_DRAM); +RAPL_EVENT_GROUP(gpu, RAPL_DOMAIN_PP1); +RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM); + +static const struct attribute_group *pmu_attr_update[] = { + &pmu_group_cores, + &pmu_group_pkg, + &pmu_group_ram, + &pmu_group_gpu, + &pmu_group_psys, + NULL +}; + +static int rapl_pmu_update(struct rapl_package *rp) +{ + int ret = 0; + + /* Return if PMU already covers all events supported by current RAPL Package */ + if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map))) + goto end; + + /* Unregister previous registered PMU */ + if (rapl_pmu.registered) + perf_pmu_unregister(&rapl_pmu.pmu); + + rapl_pmu.registered = false; + rapl_pmu.domain_map |= rp->domain_map; + + memset(&rapl_pmu.pmu, 0, sizeof(struct pmu)); + rapl_pmu.pmu.attr_groups = pmu_attr_groups; + rapl_pmu.pmu.attr_update = pmu_attr_update; + rapl_pmu.pmu.task_ctx_nr = perf_invalid_context; + rapl_pmu.pmu.event_init = rapl_pmu_event_init; + rapl_pmu.pmu.add = rapl_pmu_event_add; + rapl_pmu.pmu.del = rapl_pmu_event_del; + rapl_pmu.pmu.start = rapl_pmu_event_start; + rapl_pmu.pmu.stop = rapl_pmu_event_stop; + rapl_pmu.pmu.read = rapl_pmu_event_read; + rapl_pmu.pmu.module = THIS_MODULE; + rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT; + ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1); + if (ret) { + pr_info("Failed to register PMU\n"); + return ret; + } + + rapl_pmu.registered = true; +end: + rp->has_pmu = true; + return ret; +} + +int rapl_package_add_pmu(struct rapl_package *rp) +{ + struct rapl_package_pmu_data *data = &rp->pmu_data; + int idx; + + if (rp->has_pmu) + return -EEXIST; + + guard(cpus_read_lock)(); + + for (idx = 0; idx < rp->nr_domains; idx++) { + struct rapl_domain *rd = &rp->domains[idx]; + int domain = rd->id; + u64 val; + + if (!test_bit(domain, &rp->domain_map)) + continue; + + /* + * The RAPL PMU granularity is 2^-32 Joules + * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase + */ + val = rd->energy_unit * (1ULL << 32); + do_div(val, ENERGY_UNIT_SCALE * 1000000); + data->scale[domain] = val; + + if (!rapl_pmu.timer_ms) { + struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER); + + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter + * overflows. + * + * max_count = rpi->mask >> rpi->shift + 1 + * max_energy_pj = max_count * rd->energy_unit + * max_time_sec = (max_energy_pj / 1000000000) / 200w + * + * rapl_pmu.timer_ms = max_time_sec * 1000 / 2 + */ + val = (rpi->mask >> rpi->shift) + 1; + val *= rd->energy_unit; + do_div(val, 1000000 * 200 * 2); + rapl_pmu.timer_ms = val; + + pr_debug("%llu ms overflow timer\n", rapl_pmu.timer_ms); + } + + pr_debug("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]); + } + + /* Initialize per package PMU data */ + raw_spin_lock_init(&data->lock); + INIT_LIST_HEAD(&data->active_list); + data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms); + hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + data->hrtimer.function = rapl_hrtimer_handle; + + return rapl_pmu_update(rp); +} +EXPORT_SYMBOL_GPL(rapl_package_add_pmu); + +void rapl_package_remove_pmu(struct rapl_package *rp) +{ + struct rapl_package *pos; + + if (!rp->has_pmu) + return; + + guard(cpus_read_lock)(); + + list_for_each_entry(pos, &rapl_packages, plist) { + /* PMU is still needed */ + if (pos->has_pmu && pos != rp) + return; + } + + perf_pmu_unregister(&rapl_pmu.pmu); + memset(&rapl_pmu, 0, sizeof(struct rapl_pmu)); +} +EXPORT_SYMBOL_GPL(rapl_package_remove_pmu); +#endif + /* called from CPU hotplug notifier, hotplug lock held */ void rapl_remove_package_cpuslocked(struct rapl_package *rp) { diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index f3196f82fd8a1..c0397423d3a89 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -158,6 +158,26 @@ struct rapl_if_priv { void *rpi; }; +#ifdef CONFIG_PERF_EVENTS +/** + * struct rapl_package_pmu_data: Per package data for PMU support + * @scale: Scale of 2^-32 Joules for each energy counter increase. + * @lock: Lock to protect n_active and active_list. + * @n_active: Number of active events. + * @active_list: List of active events. + * @timer_interval: Maximum timer expiration time before counter overflow. + * @hrtimer: Periodically update the counter to prevent overflow. + */ +struct rapl_package_pmu_data { + u64 scale[RAPL_DOMAIN_MAX]; + raw_spinlock_t lock; + int n_active; + struct list_head active_list; + ktime_t timer_interval; + struct hrtimer hrtimer; +}; +#endif + /* maximum rapl package domain name: package-%d-die-%d */ #define PACKAGE_DOMAIN_NAME_LENGTH 30 @@ -176,6 +196,10 @@ struct rapl_package { struct cpumask cpumask; char name[PACKAGE_DOMAIN_NAME_LENGTH]; struct rapl_if_priv *priv; +#ifdef CONFIG_PERF_EVENTS + bool has_pmu; + struct rapl_package_pmu_data pmu_data; +#endif }; struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv, @@ -188,4 +212,12 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); +#ifdef CONFIG_PERF_EVENTS +int rapl_package_add_pmu(struct rapl_package *rp); +void rapl_package_remove_pmu(struct rapl_package *rp); +#else +static inline int rapl_package_add_pmu(struct rapl_package *rp) { return 0; } +static inline void rapl_package_remove_pmu(struct rapl_package *rp) { } +#endif + #endif /* __INTEL_RAPL_H__ */ From 4968941862bc736aeb5bd3460034f4578f2f5813 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 28 Apr 2024 17:24:27 +0800 Subject: [PATCH 10/89] powercap: intel_rapl_tpmi: Enable PMU support commit 963a9ad3c589dc0f922697faea53c69098083945 upstream. Enable RAPL PMU support for TPMI RAPL driver. Intel-SIG: commit 963a9ad3c589 powercap: intel_rapl_tpmi: Enable PMU support Backport CWF PMU support and dependency Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki [ Yunying Sun: amend commit log ] Signed-off-by: Yunying Sun Signed-off-by: Jason Zeng --- drivers/powercap/intel_rapl_tpmi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 1c48dba0ba96a..645fd1dc51a98 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -313,6 +313,8 @@ static int intel_rapl_tpmi_probe(struct auxiliary_device *auxdev, goto err; } + rapl_package_add_pmu(trp->rp); + auxiliary_set_drvdata(auxdev, trp); return 0; @@ -325,6 +327,7 @@ static void intel_rapl_tpmi_remove(struct auxiliary_device *auxdev) { struct tpmi_rapl_package *trp = auxiliary_get_drvdata(auxdev); + rapl_package_remove_pmu(trp->rp); rapl_remove_package(trp->rp); trp_release(trp); } From fab4768935032c51fd816896570cb092267d4aa3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:33 -0700 Subject: [PATCH 11/89] perf/x86/intel: Support the PEBS event mask commit a23eb2fc1d818cdac9b31f032842d55483a6a040 upstream. The current perf assumes that the counters that support PEBS are contiguous. But it's not guaranteed with the new leaf 0x23 introduced. The counters are enumerated with a counter mask. There may be holes in the counter mask for future platforms or in a virtualization environment. Store the PEBS event mask rather than the maximum number of PEBS counters in the x86 PMU structures. Intel-SIG: commit a23eb2fc1d81 perf/x86/intel: Support the PEBS event mask Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-2-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 8 ++++---- arch/x86/events/intel/ds.c | 15 ++++++++------- arch/x86/events/perf_event.h | 15 +++++++++++++-- arch/x86/include/asm/intel_ds.h | 1 + 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a5046c19c7ef2..fa42b547ea457 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4828,7 +4828,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) { intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed, &pmu->intel_ctrl, (1ULL << pmu->num_counters_fixed) - 1); - pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, 0, pmu->num_counters, 0, 0); @@ -6212,7 +6212,7 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) pmu->num_counters = x86_pmu.num_counters; pmu->num_counters_fixed = x86_pmu.num_counters_fixed; - pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, 0, pmu->num_counters, 0, 0); @@ -6325,7 +6325,7 @@ __init int intel_pmu_init(void) x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; - x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(x86_pmu.num_counters - 1, 0)); x86_pmu.pebs_capable = PEBS_COUNTER_MASK; /* @@ -6959,7 +6959,7 @@ __init int intel_pmu_init(void) pmu->num_counters_fixed = x86_pmu.num_counters_fixed; } - pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, 0, pmu->num_counters, 0, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 33a0cba6f241f..4a26e9ec8b085 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1136,7 +1136,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sche static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; - int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events); + int max_pebs_events = intel_pmu_max_num_pebs(cpuc->pmu); int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); u64 threshold; int reserved; @@ -2153,6 +2153,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d void *base, *at, *top; short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + int max_pebs_events = intel_pmu_max_num_pebs(NULL); int bit, i, size; u64 mask; @@ -2164,8 +2165,8 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d ds->pebs_index = ds->pebs_buffer_base; - mask = (1ULL << x86_pmu.max_pebs_events) - 1; - size = x86_pmu.max_pebs_events; + mask = x86_pmu.pebs_events_mask; + size = max_pebs_events; if (x86_pmu.flags & PMU_FL_PEBS_ALL) { mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED; size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; @@ -2204,8 +2205,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d pebs_status = p->status = cpuc->pebs_enabled; bit = find_first_bit((unsigned long *)&pebs_status, - x86_pmu.max_pebs_events); - if (bit >= x86_pmu.max_pebs_events) + max_pebs_events); + + if (!(x86_pmu.pebs_events_mask & (1 << bit))) continue; /* @@ -2263,7 +2265,6 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events); int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct debug_store *ds = cpuc->ds; struct perf_event *event; @@ -2279,7 +2280,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d ds->pebs_index = ds->pebs_buffer_base; - mask = ((1ULL << max_pebs_events) - 1) | + mask = hybrid(cpuc->pmu, pebs_events_mask) | (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); size = INTEL_PMC_IDX_FIXED + num_counters_fixed; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 79ed3ef2dda1e..0e9c56759c472 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -684,7 +684,7 @@ struct x86_hybrid_pmu { cpumask_t supported_cpus; union perf_capabilities intel_cap; u64 intel_ctrl; - int max_pebs_events; + u64 pebs_events_mask; int num_counters; int num_counters_fixed; struct event_constraint unconstrained; @@ -852,7 +852,7 @@ struct x86_pmu { pebs_ept :1; int pebs_record_size; int pebs_buffer_size; - int max_pebs_events; + u64 pebs_events_mask; void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); @@ -1649,6 +1649,17 @@ static inline int is_ht_workaround_enabled(void) return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); } +static inline u64 intel_pmu_pebs_mask(u64 cntr_mask) +{ + return MAX_PEBS_EVENTS_MASK & cntr_mask; +} + +static inline int intel_pmu_max_num_pebs(struct pmu *pmu) +{ + static_assert(MAX_PEBS_EVENTS == 32); + return fls((u32)hybrid(pmu, pebs_events_mask)); +} + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 2f9eeb5c3069a..5dbeac48a5b93 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -9,6 +9,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS_FMT4 8 #define MAX_PEBS_EVENTS 32 +#define MAX_PEBS_EVENTS_MASK GENMASK_ULL(MAX_PEBS_EVENTS - 1, 0) #define MAX_FIXED_PEBS_EVENTS 16 /* From 7f166a0c74e04474e8f1a4eac50fc794713ce324 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:34 -0700 Subject: [PATCH 12/89] perf/x86: Support counter mask commit 722e42e45c2f1c6d1adec7813651dba5139f52f4 upstream. The current perf assumes that both GP and fixed counters are contiguous. But it's not guaranteed on newer Intel platforms or in a virtualization environment. Use the counter mask to replace the number of counters for both GP and the fixed counters. For the other ARCHs or old platforms which don't support a counter mask, using GENMASK_ULL(num_counter - 1, 0) to replace. There is no functional change for them. The interface to KVM is not changed. The number of counters still be passed to KVM. It can be updated later separately. Intel-SIG: commit 722e42e45c2f perf/x86: Support counter mask Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-3-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi [jz: resolve conflicts in update_pmu_cap() due to following commit 47a973fd7563 ("perf/x86/intel: Fix ARCH_PERFMON_NUM_COUNTER_LEAF") already been backported by stable branch] Signed-off-by: Jason Zeng --- arch/x86/events/amd/core.c | 24 ++--- arch/x86/events/core.c | 98 ++++++++++---------- arch/x86/events/intel/core.c | 163 ++++++++++++++++----------------- arch/x86/events/intel/ds.c | 19 ++-- arch/x86/events/intel/knc.c | 2 +- arch/x86/events/intel/p4.c | 10 +- arch/x86/events/intel/p6.c | 2 +- arch/x86/events/perf_event.h | 47 ++++++++-- arch/x86/events/zhaoxin/core.c | 12 +-- 9 files changed, 199 insertions(+), 178 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index ba70e0ada25c8..29104ba0a2f1a 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -432,7 +432,7 @@ static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, * be removed on one CPU at a time AND PMU is disabled * when we come here */ - for (i = 0; i < x86_pmu.num_counters; i++) { + for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (cmpxchg(nb->owners + i, event, NULL) == event) break; } @@ -499,7 +499,7 @@ __amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *ev * because of successive calls to x86_schedule_events() from * hw_perf_group_sched_in() without hw_perf_enable() */ - for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { + for_each_set_bit(idx, c->idxmsk, x86_pmu_max_num_counters(NULL)) { if (new == -1 || hwc->idx == idx) /* assign free slot, prefer hwc->idx */ old = cmpxchg(nb->owners + idx, NULL, event); @@ -542,7 +542,7 @@ static struct amd_nb *amd_alloc_nb(int cpu) /* * initialize all possible NB constraints */ - for (i = 0; i < x86_pmu.num_counters; i++) { + for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { __set_bit(i, nb->event_constraints[i].idxmsk); nb->event_constraints[i].weight = 1; } @@ -735,7 +735,7 @@ static void amd_pmu_check_overflow(void) * counters are always enabled when this function is called and * ARCH_PERFMON_EVENTSEL_INT is always set. */ - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -755,7 +755,7 @@ static void amd_pmu_enable_all(int added) amd_brs_enable_all(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { /* only activate events which are marked as active */ if (!test_bit(idx, cpuc->active_mask)) continue; @@ -948,7 +948,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) /* Clear any reserved bits set by buggy microcode */ status &= amd_pmu_global_cntr_mask; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -1288,7 +1288,7 @@ static __initconst const struct x86_pmu amd_pmu = { .addr_offset = amd_pmu_addr_offset, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS, + .cntr_mask64 = GENMASK_ULL(AMD64_NUM_COUNTERS - 1, 0), .add = amd_pmu_add_event, .del = amd_pmu_del_event, .cntval_bits = 48, @@ -1387,7 +1387,7 @@ static int __init amd_core_pmu_init(void) */ x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; - x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + x86_pmu.cntr_mask64 = GENMASK_ULL(AMD64_NUM_COUNTERS_CORE - 1, 0); /* Check for Performance Monitoring v2 support */ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { @@ -1397,9 +1397,9 @@ static int __init amd_core_pmu_init(void) x86_pmu.version = 2; /* Find the number of available Core PMCs */ - x86_pmu.num_counters = ebx.split.num_core_pmc; + x86_pmu.cntr_mask64 = GENMASK_ULL(ebx.split.num_core_pmc - 1, 0); - amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + amd_pmu_global_cntr_mask = x86_pmu.cntr_mask64; /* Update PMC handling functions */ x86_pmu.enable_all = amd_pmu_v2_enable_all; @@ -1427,12 +1427,12 @@ static int __init amd_core_pmu_init(void) * even numbered counter that has a consecutive adjacent odd * numbered counter following it. */ - for (i = 0; i < x86_pmu.num_counters - 1; i += 2) + for (i = 0; i < x86_pmu_max_num_counters(NULL) - 1; i += 2) even_ctr_mask |= BIT_ULL(i); pair_constraint = (struct event_constraint) __EVENT_CONSTRAINT(0, even_ctr_mask, 0, - x86_pmu.num_counters / 2, 0, + x86_pmu_max_num_counters(NULL) / 2, 0, PERF_X86_EVENT_PAIR); x86_pmu.get_event_constraints = amd_get_event_constraints_f17h; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 26f83ad477260..59aaa8965ba1d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -191,29 +191,31 @@ static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC -static inline int get_possible_num_counters(void) +static inline u64 get_possible_counter_mask(void) { - int i, num_counters = x86_pmu.num_counters; + u64 cntr_mask = x86_pmu.cntr_mask64; + int i; if (!is_hybrid()) - return num_counters; + return cntr_mask; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) - num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters); + cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64; - return num_counters; + return cntr_mask; } static bool reserve_pmc_hardware(void) { - int i, num_counters = get_possible_num_counters(); + u64 cntr_mask = get_possible_counter_mask(); + int i, end; - for (i = 0; i < num_counters; i++) { + for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } - for (i = 0; i < num_counters; i++) { + for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } @@ -221,13 +223,14 @@ static bool reserve_pmc_hardware(void) return true; eventsel_fail: - for (i--; i >= 0; i--) + end = i; + for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_evntsel_nmi(x86_pmu_config_addr(i)); - - i = num_counters; + i = X86_PMC_IDX_MAX; perfctr_fail: - for (i--; i >= 0; i--) + end = i; + for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_perfctr_nmi(x86_pmu_event_addr(i)); return false; @@ -235,9 +238,10 @@ static bool reserve_pmc_hardware(void) static void release_pmc_hardware(void) { - int i, num_counters = get_possible_num_counters(); + u64 cntr_mask = get_possible_counter_mask(); + int i; - for (i = 0; i < num_counters; i++) { + for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } @@ -250,7 +254,8 @@ static void release_pmc_hardware(void) {} #endif -bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) +bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, + unsigned long *fixed_cntr_mask) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; @@ -261,7 +266,7 @@ bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) * Check to see if the BIOS enabled any of the counters, if so * complain and bail. */ - for (i = 0; i < num_counters; i++) { + for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) { reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) @@ -275,12 +280,12 @@ bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) } } - if (num_counters_fixed) { + if (*(u64 *)fixed_cntr_mask) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; - for (i = 0; i < num_counters_fixed; i++) { + for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(i, pmu)) continue; if (val & (0x03ULL << i*4)) { @@ -681,7 +686,7 @@ void x86_pmu_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; u64 val; @@ -738,7 +743,7 @@ void x86_pmu_enable_all(int added) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) @@ -977,7 +982,6 @@ EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - int num_counters = hybrid(cpuc->pmu, num_counters); struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; @@ -1053,7 +1057,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) { - int gpmax = num_counters; + int gpmax = x86_pmu_max_num_counters(cpuc->pmu); /* * Do not allow scheduling of more than half the available @@ -1074,7 +1078,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * the extra Merge events needed by large increment events. */ if (x86_pmu.flags & PMU_FL_PAIR) { - gpmax = num_counters - cpuc->n_pair; + gpmax -= cpuc->n_pair; WARN_ON(gpmax <= 0); } @@ -1159,12 +1163,10 @@ static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { - int num_counters = hybrid(cpuc->pmu, num_counters); - int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct perf_event *event; int n, max_count; - max_count = num_counters + num_counters_fixed; + max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu); /* current number of events already accepted */ n = cpuc->n_events; @@ -1524,13 +1526,13 @@ void perf_event_print_debug(void) u64 pebs, debugctl; int cpu = smp_processor_id(); struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int num_counters = hybrid(cpuc->pmu, num_counters); - int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); + unsigned long *cntr_mask = hybrid(cpuc->pmu, cntr_mask); + unsigned long *fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); unsigned long flags; int idx; - if (!num_counters) + if (!*(u64 *)cntr_mask) return; local_irq_save(flags); @@ -1557,7 +1559,7 @@ void perf_event_print_debug(void) } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); - for (idx = 0; idx < num_counters; idx++) { + for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) { rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrl(x86_pmu_event_addr(idx), pmc_count); @@ -1570,7 +1572,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < num_counters_fixed; idx++) { + for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); @@ -1684,7 +1686,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) */ apic_write(APIC_LVTPC, APIC_DM_NMI); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -2040,18 +2042,15 @@ static void _x86_pmu_read(struct perf_event *event) static_call(x86_pmu_update)(event); } -void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, - u64 intel_ctrl) +void x86_pmu_show_pmu_cap(struct pmu *pmu) { pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", num_counters); + pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %lu\n", - hweight64((((1ULL << num_counters_fixed) - 1) - << INTEL_PMC_IDX_FIXED) & intel_ctrl)); - pr_info("... event mask: %016Lx\n", intel_ctrl); + pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); + pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) @@ -2088,7 +2087,7 @@ static int __init init_hw_perf_events(void) pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) + if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask)) goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -2099,14 +2098,14 @@ static int __init init_hw_perf_events(void) quirk->func(); if (!x86_pmu.intel_ctrl) - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters, 0, 0); + __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64, + 0, x86_pmu_num_counters(NULL), 0, 0); x86_pmu_format_group.attrs = x86_pmu.format_attrs; @@ -2115,11 +2114,8 @@ static int __init init_hw_perf_events(void) pmu.attr_update = x86_pmu.attr_update; - if (!is_hybrid()) { - x86_pmu_show_pmu_cap(x86_pmu.num_counters, - x86_pmu.num_counters_fixed, - x86_pmu.intel_ctrl); - } + if (!is_hybrid()) + x86_pmu_show_pmu_cap(NULL); if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; @@ -2483,7 +2479,7 @@ void perf_clear_dirty_counters(void) for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { if (i >= INTEL_PMC_IDX_FIXED) { /* Metrics and fake events don't have corresponding HW counters. */ - if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed)) + if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask))) continue; wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0); @@ -3049,8 +3045,8 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) * base PMU holds the correct number of counters for P-cores. */ cap->version = x86_pmu.version; - cap->num_counters_gp = x86_pmu.num_counters; - cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->num_counters_gp = x86_pmu_num_counters(NULL); + cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); cap->bit_width_gp = x86_pmu.cntval_bits; cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fa42b547ea457..d84063ca3c865 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2869,23 +2869,23 @@ static void intel_pmu_reset(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); - int num_counters = hybrid(cpuc->pmu, num_counters); + unsigned long *cntr_mask = hybrid(cpuc->pmu, cntr_mask); + unsigned long *fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); unsigned long flags; int idx; - if (!num_counters) + if (!*(u64 *)cntr_mask) return; local_irq_save(flags); pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); - for (idx = 0; idx < num_counters; idx++) { + for_each_set_bit(idx, cntr_mask, INTEL_PMC_MAX_GENERIC) { wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < num_counters_fixed; idx++) { + for_each_set_bit(idx, fixed_cntr_mask, INTEL_PMC_MAX_FIXED) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); @@ -2935,8 +2935,7 @@ static void x86_pmu_handle_guest_pebs(struct pt_regs *regs, !guest_pebs_idxs) return; - for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, - INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) { + for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, X86_PMC_IDX_MAX) { event = cpuc->events[bit]; if (!event->attr.precise_ip) continue; @@ -4279,7 +4278,7 @@ static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data) struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[idx]; arr[idx].msr = x86_pmu_config_addr(idx); @@ -4297,7 +4296,7 @@ static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data) arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; } - *nr = x86_pmu.num_counters; + *nr = x86_pmu_max_num_counters(cpuc->pmu); return arr; } @@ -4312,7 +4311,7 @@ static void core_pmu_enable_all(int added) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask) || @@ -4782,13 +4781,33 @@ static void flip_smm_bit(void *data) } } -static void intel_pmu_check_num_counters(int *num_counters, - int *num_counters_fixed, - u64 *intel_ctrl, u64 fixed_mask); +static void intel_pmu_check_counters_mask(u64 *cntr_mask, + u64 *fixed_cntr_mask, + u64 *intel_ctrl) +{ + unsigned int bit; + + bit = fls64(*cntr_mask); + if (bit > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + bit, INTEL_PMC_MAX_GENERIC); + *cntr_mask &= GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + } + *intel_ctrl = *cntr_mask; + + bit = fls64(*fixed_cntr_mask); + if (bit > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + bit, INTEL_PMC_MAX_FIXED); + *fixed_cntr_mask &= GENMASK_ULL(INTEL_PMC_MAX_FIXED - 1, 0); + } + + *intel_ctrl |= *fixed_cntr_mask << INTEL_PMC_IDX_FIXED; +} static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints, - int num_counters, - int num_counters_fixed, + u64 cntr_mask, + u64 fixed_cntr_mask, u64 intel_ctrl); static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs); @@ -4814,8 +4833,8 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, &cntr, &fixed_cntr, &ecx, &edx); - pmu->num_counters = fls(cntr); - pmu->num_counters_fixed = fls(fixed_cntr); + pmu->cntr_mask64 = cntr; + pmu->fixed_cntr_mask64 = fixed_cntr; } if (!intel_pmu_broken_perf_cap()) { @@ -4826,12 +4845,12 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) { - intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed, - &pmu->intel_ctrl, (1ULL << pmu->num_counters_fixed) - 1); - pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); + intel_pmu_check_counters_mask(&pmu->cntr_mask64, &pmu->fixed_cntr_mask64, + &pmu->intel_ctrl); + pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64); pmu->unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, - 0, pmu->num_counters, 0, 0); + __EVENT_CONSTRAINT(0, pmu->cntr_mask64, + 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); if (pmu->intel_cap.perf_metrics) pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; @@ -4844,8 +4863,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT; intel_pmu_check_event_constraints(pmu->event_constraints, - pmu->num_counters, - pmu->num_counters_fixed, + pmu->cntr_mask64, + pmu->fixed_cntr_mask64, pmu->intel_ctrl); intel_pmu_check_extra_regs(pmu->extra_regs); @@ -4906,7 +4925,7 @@ static bool init_hybrid_pmu(int cpu) intel_pmu_check_hybrid_pmus(pmu); - if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed)) + if (!check_hw_exists(&pmu->pmu, pmu->cntr_mask, pmu->fixed_cntr_mask)) return false; pr_info("%s PMU driver: ", pmu->name); @@ -4916,8 +4935,7 @@ static bool init_hybrid_pmu(int cpu) pr_cont("\n"); - x86_pmu_show_pmu_cap(pmu->num_counters, pmu->num_counters_fixed, - pmu->intel_ctrl); + x86_pmu_show_pmu_cap(&pmu->pmu); end: cpumask_set_cpu(cpu, &pmu->supported_cpus); @@ -6097,29 +6115,9 @@ static const struct attribute_group *hybrid_attr_update[] = { static struct attribute *empty_attrs; -static void intel_pmu_check_num_counters(int *num_counters, - int *num_counters_fixed, - u64 *intel_ctrl, u64 fixed_mask) -{ - if (*num_counters > INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - *num_counters, INTEL_PMC_MAX_GENERIC); - *num_counters = INTEL_PMC_MAX_GENERIC; - } - *intel_ctrl = (1ULL << *num_counters) - 1; - - if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - *num_counters_fixed, INTEL_PMC_MAX_FIXED); - *num_counters_fixed = INTEL_PMC_MAX_FIXED; - } - - *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED; -} - static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints, - int num_counters, - int num_counters_fixed, + u64 cntr_mask, + u64 fixed_cntr_mask, u64 intel_ctrl) { struct event_constraint *c; @@ -6156,10 +6154,9 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con * generic counters */ if (!use_fixed_pseudo_encoding(c->code)) - c->idxmsk64 |= (1ULL << num_counters) - 1; + c->idxmsk64 |= cntr_mask; } - c->idxmsk64 &= - ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed)); + c->idxmsk64 &= cntr_mask | (fixed_cntr_mask << INTEL_PMC_IDX_FIXED); c->weight = hweight64(c->idxmsk64); } } @@ -6210,12 +6207,12 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) pmu->pmu_type = intel_hybrid_pmu_type_map[bit].id; pmu->name = intel_hybrid_pmu_type_map[bit].name; - pmu->num_counters = x86_pmu.num_counters; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed; - pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); + pmu->cntr_mask64 = x86_pmu.cntr_mask64; + pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64; + pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64); pmu->unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, - 0, pmu->num_counters, 0, 0); + __EVENT_CONSTRAINT(0, pmu->cntr_mask64, + 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; if (pmu->pmu_type & hybrid_small) { @@ -6318,14 +6315,14 @@ __init int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntr_mask64 = GENMASK_ULL(eax.split.num_counters - 1, 0); x86_pmu.cntval_bits = eax.split.bit_width; x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; - x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(x86_pmu.num_counters - 1, 0)); + x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(x86_pmu.cntr_mask64); x86_pmu.pebs_capable = PEBS_COUNTER_MASK; /* @@ -6335,12 +6332,10 @@ __init int intel_pmu_init(void) if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); - x86_pmu.num_counters_fixed = - max((int)edx.split.num_counters_fixed, assume); - - fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1; + x86_pmu.fixed_cntr_mask64 = + GENMASK_ULL(max((int)edx.split.num_counters_fixed, assume) - 1, 0); } else if (version >= 5) - x86_pmu.num_counters_fixed = fls(fixed_mask); + x86_pmu.fixed_cntr_mask64 = fixed_mask; if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -6940,11 +6935,13 @@ __init int intel_pmu_init(void) pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; intel_pmu_init_glc(&pmu->pmu); if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { - pmu->num_counters = x86_pmu.num_counters + 2; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1; + pmu->cntr_mask64 <<= 2; + pmu->cntr_mask64 |= 0x3; + pmu->fixed_cntr_mask64 <<= 1; + pmu->fixed_cntr_mask64 |= 0x1; } else { - pmu->num_counters = x86_pmu.num_counters; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + pmu->cntr_mask64 = x86_pmu.cntr_mask64; + pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64; } /* @@ -6954,15 +6951,16 @@ __init int intel_pmu_init(void) * mistakenly add extra counters for P-cores. Correct the number of * counters here. */ - if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) { - pmu->num_counters = x86_pmu.num_counters; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + if ((x86_pmu_num_counters(&pmu->pmu) > 8) || (x86_pmu_num_counters_fixed(&pmu->pmu) > 4)) { + pmu->cntr_mask64 = x86_pmu.cntr_mask64; + pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64; } - pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); + pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64); pmu->unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, - 0, pmu->num_counters, 0, 0); + __EVENT_CONSTRAINT(0, pmu->cntr_mask64, + 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); + pmu->extra_regs = intel_glc_extra_regs; /* Initialize Atom core specific PerfMon capabilities.*/ @@ -7029,9 +7027,9 @@ __init int intel_pmu_init(void) * The constraints may be cut according to the CPUID enumeration * by inserting the EVENT_CONSTRAINT_END. */ - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; - intel_v5_gen_event_constraints[x86_pmu.num_counters_fixed].weight = -1; + if (fls64(x86_pmu.fixed_cntr_mask64) > INTEL_PMC_MAX_FIXED) + x86_pmu.fixed_cntr_mask64 &= GENMASK_ULL(INTEL_PMC_MAX_FIXED - 1, 0); + intel_v5_gen_event_constraints[fls64(x86_pmu.fixed_cntr_mask64)].weight = -1; x86_pmu.event_constraints = intel_v5_gen_event_constraints; pr_cont("generic architected perfmon, "); name = "generic_arch_v5+"; @@ -7058,18 +7056,17 @@ __init int intel_pmu_init(void) x86_pmu.attr_update = hybrid_attr_update; } - intel_pmu_check_num_counters(&x86_pmu.num_counters, - &x86_pmu.num_counters_fixed, - &x86_pmu.intel_ctrl, - (u64)fixed_mask); + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, + &x86_pmu.fixed_cntr_mask64, + &x86_pmu.intel_ctrl); /* AnyThread may be deprecated on arch perfmon v5 or later */ if (x86_pmu.intel_cap.anythread_deprecated) x86_pmu.format_attrs = intel_arch_formats_attr; intel_pmu_check_event_constraints(x86_pmu.event_constraints, - x86_pmu.num_counters, - x86_pmu.num_counters_fixed, + x86_pmu.cntr_mask64, + x86_pmu.fixed_cntr_mask64, x86_pmu.intel_ctrl); /* * Access LBR MSR may cause #GP under certain circumstances. diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4a26e9ec8b085..24aa3b13bc57f 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1137,7 +1137,6 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; int max_pebs_events = intel_pmu_max_num_pebs(cpuc->pmu); - int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); u64 threshold; int reserved; @@ -1145,7 +1144,7 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) return; if (x86_pmu.flags & PMU_FL_PEBS_ALL) - reserved = max_pebs_events + num_counters_fixed; + reserved = max_pebs_events + x86_pmu_max_num_counters_fixed(cpuc->pmu); else reserved = max_pebs_events; @@ -2168,8 +2167,8 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d mask = x86_pmu.pebs_events_mask; size = max_pebs_events; if (x86_pmu.flags & PMU_FL_PEBS_ALL) { - mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED; - size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; + mask |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED; + size = INTEL_PMC_IDX_FIXED + x86_pmu_max_num_counters_fixed(NULL); } if (unlikely(base >= top)) { @@ -2265,11 +2264,10 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct debug_store *ds = cpuc->ds; struct perf_event *event; void *base, *at, *top; - int bit, size; + int bit; u64 mask; if (!x86_pmu.pebs_active) @@ -2281,11 +2279,10 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d ds->pebs_index = ds->pebs_buffer_base; mask = hybrid(cpuc->pmu, pebs_events_mask) | - (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); - size = INTEL_PMC_IDX_FIXED + num_counters_fixed; + (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, size); + intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); return; } @@ -2295,11 +2292,11 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; pebs_status &= mask; - for_each_set_bit(bit, (unsigned long *)&pebs_status, size) + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) counts[bit]++; } - for_each_set_bit(bit, (unsigned long *)&mask, size) { + for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { if (counts[bit] == 0) continue; diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c index 618001c208e81..034a1f6a457c6 100644 --- a/arch/x86/events/intel/knc.c +++ b/arch/x86/events/intel/knc.c @@ -303,7 +303,7 @@ static const struct x86_pmu knc_pmu __initconst = { .apic = 1, .max_period = (1ULL << 39) - 1, .version = 0, - .num_counters = 2, + .cntr_mask64 = 0x3, .cntval_bits = 40, .cntval_mask = (1ULL << 40) - 1, .get_event_constraints = x86_get_event_constraints, diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 35936188db01b..844bc4fc4724d 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -919,7 +919,7 @@ static void p4_pmu_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[idx]; if (!test_bit(idx, cpuc->active_mask)) continue; @@ -998,7 +998,7 @@ static void p4_pmu_enable_all(int added) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[idx]; if (!test_bit(idx, cpuc->active_mask)) continue; @@ -1040,7 +1040,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) cpuc = this_cpu_ptr(&cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { int overflow; if (!test_bit(idx, cpuc->active_mask)) { @@ -1353,7 +1353,7 @@ static __initconst const struct x86_pmu p4_pmu = { * though leave it restricted at moment assuming * HT is on */ - .num_counters = ARCH_P4_MAX_CCCR, + .cntr_mask64 = GENMASK_ULL(ARCH_P4_MAX_CCCR - 1, 0), .apic = 1, .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, @@ -1395,7 +1395,7 @@ __init int p4_pmu_init(void) * * Solve this by zero'ing out the registers to mimic a reset. */ - for (i = 0; i < x86_pmu.num_counters; i++) { + for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { reg = x86_pmu_config_addr(i); wrmsrl_safe(reg, 0ULL); } diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c index 408879b0c0d4e..a6cffb4f4ef52 100644 --- a/arch/x86/events/intel/p6.c +++ b/arch/x86/events/intel/p6.c @@ -214,7 +214,7 @@ static __initconst const struct x86_pmu p6_pmu = { .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, - .num_counters = 2, + .cntr_mask64 = 0x3, /* * Events have 40 bits implemented. However they are designed such * that bits [32-39] are sign extensions of bit 31. As such the diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 0e9c56759c472..e447c6431c66c 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -685,8 +685,14 @@ struct x86_hybrid_pmu { union perf_capabilities intel_cap; u64 intel_ctrl; u64 pebs_events_mask; - int num_counters; - int num_counters_fixed; + union { + u64 cntr_mask64; + unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 fixed_cntr_mask64; + unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; struct event_constraint unconstrained; u64 hw_cache_event_ids @@ -774,8 +780,14 @@ struct x86_pmu { int (*rdpmc_index)(int index); u64 (*event_map)(int); int max_events; - int num_counters; - int num_counters_fixed; + union { + u64 cntr_mask64; + unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 fixed_cntr_mask64; + unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; int cntval_bits; u64 cntval_mask; union { @@ -1126,8 +1138,8 @@ static inline int x86_pmu_rdpmc_index(int index) return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; } -bool check_hw_exists(struct pmu *pmu, int num_counters, - int num_counters_fixed); +bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, + unsigned long *fixed_cntr_mask); int x86_add_exclusive(unsigned int what); @@ -1198,8 +1210,27 @@ void x86_pmu_enable_event(struct perf_event *event); int x86_pmu_handle_irq(struct pt_regs *regs); -void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, - u64 intel_ctrl); +void x86_pmu_show_pmu_cap(struct pmu *pmu); + +static inline int x86_pmu_num_counters(struct pmu *pmu) +{ + return hweight64(hybrid(pmu, cntr_mask64)); +} + +static inline int x86_pmu_max_num_counters(struct pmu *pmu) +{ + return fls64(hybrid(pmu, cntr_mask64)); +} + +static inline int x86_pmu_num_counters_fixed(struct pmu *pmu) +{ + return hweight64(hybrid(pmu, fixed_cntr_mask64)); +} + +static inline int x86_pmu_max_num_counters_fixed(struct pmu *pmu) +{ + return fls64(hybrid(pmu, fixed_cntr_mask64)); +} extern struct event_constraint emptyconstraint; diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c index 3e9acdaeed1ec..2fd9b0cf9a5e5 100644 --- a/arch/x86/events/zhaoxin/core.c +++ b/arch/x86/events/zhaoxin/core.c @@ -530,13 +530,13 @@ __init int zhaoxin_pmu_init(void) pr_info("Version check pass!\n"); x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntr_mask64 = GENMASK_ULL(eax.split.num_counters - 1, 0); x86_pmu.cntval_bits = eax.split.bit_width; x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; - x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_pmu.fixed_cntr_mask64 = GENMASK_ULL(edx.split.num_counters_fixed - 1, 0); x86_add_quirk(zhaoxin_arch_events_quirk); switch (boot_cpu_data.x86) { @@ -604,13 +604,13 @@ __init int zhaoxin_pmu_init(void) return -ENODEV; } - x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1; - x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; + x86_pmu.intel_ctrl |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED; if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; + c->idxmsk64 |= x86_pmu.cntr_mask64; + c->weight += x86_pmu_num_counters(NULL); } } From 4321c87d2e2646a378d8078ae9f7eab9a28388dd Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:35 -0700 Subject: [PATCH 13/89] perf/x86: Add Lunar Lake and Arrow Lake support commit a932aa0e868f92c5219d84b58c1131fc05ddcf79 upstream. From PMU's perspective, Lunar Lake and Arrow Lake are similar to the previous generation Meteor Lake. Both are hybrid platforms, with e-core and p-core. The key differences include: - The e-core supports 3 new fixed counters - The p-core supports an updated PEBS Data Source format - More GP counters (Updated event constraint table) - New Architectural performance monitoring V6 (New Perfmon MSRs aliasing, umask2, eq). - New PEBS format V6 (Counters Snapshotting group) - New RDPMC metrics clear mode The legacy features, the 3 new fixed counters and updated event constraint table are enabled in this patch. The new PEBS data source format, the architectural performance monitoring V6, the PEBS format V6, and the new RDPMC metrics clear mode are supported in the following patches. Intel-SIG: commit a932aa0e868f perf/x86: Add Lunar Lake and Arrow Lake support Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-4-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 117 ++++++++++++++++++++++++++++++ arch/x86/events/intel/ds.c | 24 ++++++ arch/x86/events/perf_event.h | 2 + arch/x86/include/asm/perf_event.h | 4 + 4 files changed, 147 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d84063ca3c865..3896e8a98fd92 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -211,6 +211,17 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_skt_event_constraints[] __read_mostly = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */ + FIXED_EVENT_CONSTRAINT(0x0073, 4), /* TOPDOWN_BAD_SPECULATION.ALL */ + FIXED_EVENT_CONSTRAINT(0x019c, 5), /* TOPDOWN_FE_BOUND.ALL */ + FIXED_EVENT_CONSTRAINT(0x02c2, 6), /* TOPDOWN_RETIRING.ALL */ + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_skl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -360,6 +371,55 @@ static struct extra_reg intel_rwc_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct event_constraint intel_lnc_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + + INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), + INTEL_UEVENT_CONSTRAINT(0x0175, 0x4), + + INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff), + INTEL_EVENT_CONSTRAINT(0x3c, 0x3ff), + /* + * Generally event codes < 0x90 are restricted to counters 0-3. + * The 0x2E and 0x3C are exception, which has no restriction. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), + INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), + INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x10a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8), + INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3), + INTEL_EVENT_CONSTRAINT(0xce, 0x1), + + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), + /* + * Generally event codes >= 0x90 are likely to have no restrictions. + * The exception are defined as above. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0x3ff), + + EVENT_CONSTRAINT_END +}; + + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -5915,6 +5975,23 @@ static struct attribute *adl_hybrid_events_attrs[] = { NULL, }; +EVENT_ATTR_STR_HYBRID(topdown-retiring, td_retiring_lnl, "event=0xc2,umask=0x02;event=0x00,umask=0x80", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-fe-bound, td_fe_bound_lnl, "event=0x9c,umask=0x01;event=0x00,umask=0x82", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-be-bound, td_be_bound_lnl, "event=0xa4,umask=0x02;event=0x00,umask=0x83", hybrid_big_small); + +static struct attribute *lnl_hybrid_events_attrs[] = { + EVENT_PTR(slots_adl), + EVENT_PTR(td_retiring_lnl), + EVENT_PTR(td_bad_spec_adl), + EVENT_PTR(td_fe_bound_lnl), + EVENT_PTR(td_be_bound_lnl), + EVENT_PTR(td_heavy_ops_adl), + EVENT_PTR(td_br_mis_adl), + EVENT_PTR(td_fetch_lat_adl), + EVENT_PTR(td_mem_bound_adl), + NULL +}; + /* Must be in IDX order */ EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small); EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small); @@ -6272,6 +6349,21 @@ static __always_inline void intel_pmu_init_grt(struct pmu *pmu) hybrid(pmu, extra_regs) = intel_grt_extra_regs; } +static __always_inline void intel_pmu_init_lnc(struct pmu *pmu) +{ + intel_pmu_init_glc(pmu); + hybrid(pmu, event_constraints) = intel_lnc_event_constraints; + hybrid(pmu, pebs_constraints) = intel_lnc_pebs_event_constraints; + hybrid(pmu, extra_regs) = intel_rwc_extra_regs; +} + +static __always_inline void intel_pmu_init_skt(struct pmu *pmu) +{ + intel_pmu_init_grt(pmu); + hybrid(pmu, event_constraints) = intel_skt_event_constraints; + hybrid(pmu, extra_regs) = intel_cmt_extra_regs; +} + __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; @@ -7002,6 +7094,31 @@ __init int intel_pmu_init(void) name = "meteorlake_hybrid"; break; + case INTEL_FAM6_LUNARLAKE_M: + case INTEL_FAM6_ARROWLAKE: + intel_pmu_init_hybrid(hybrid_big_small); + + x86_pmu.get_event_constraints = mtl_get_event_constraints; + x86_pmu.hw_config = adl_hw_config; + + td_attr = lnl_hybrid_events_attrs; + mem_attr = mtl_hybrid_mem_attrs; + tsx_attr = adl_hybrid_tsx_attrs; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; + + /* Initialize big core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; + intel_pmu_init_lnc(&pmu->pmu); + + /* Initialize Atom core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; + intel_pmu_init_skt(&pmu->pmu); + + pr_cont("Lunarlake Hybrid events, "); + name = "lunarlake_hybrid"; + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 24aa3b13bc57f..6c6841cc87bfd 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1085,6 +1085,30 @@ struct event_constraint intel_glc_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_lnc_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. + */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e447c6431c66c..f2a2eed45010a 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1570,6 +1570,8 @@ extern struct event_constraint intel_icl_pebs_event_constraints[]; extern struct event_constraint intel_glc_pebs_event_constraints[]; +extern struct event_constraint intel_lnc_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ccf1b23772e2a..30ee476708b29 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -331,6 +331,10 @@ struct x86_pmu_capability { #define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3) #define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS) +/* TOPDOWN_BAD_SPECULATION.ALL: fixed counter 4 (Atom only) */ +/* TOPDOWN_FE_BOUND.ALL: fixed counter 5 (Atom only) */ +/* TOPDOWN_RETIRING.ALL: fixed counter 6 (Atom only) */ + static inline bool use_fixed_pseudo_encoding(u64 code) { return !(code & 0xff); From e4263d46144b6418f2baf7884f2e5c903f18c29d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:36 -0700 Subject: [PATCH 14/89] perf/x86/intel: Rename model-specific pebs_latency_data functions commit 090262439f66df03d4e9d0e52e14104b729e2ef8 upstream. The model-specific pebs_latency_data functions of ADL and MTL use the "small" as a postfix to indicate the e-core. The postfix is too generic for a model-specific function. It cannot provide useful information that can directly map it to a specific uarch, which can facilitate the development and maintenance. Use the abbr of the uarch to rename the model-specific functions. Intel-SIG: commit 090262439f66 perf/x86/intel: Rename model-specific pebs_latency_data functions Backport CWF PMU support and dependency Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-5-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 8 ++++---- arch/x86/events/intel/ds.c | 20 ++++++++++---------- arch/x86/events/perf_event.h | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3896e8a98fd92..e3d4daa5b6e3d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6642,7 +6642,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_GRACEMONT: intel_pmu_init_grt(NULL); intel_pmu_pebs_data_source_grt(); - x86_pmu.pebs_latency_data = adl_latency_data_small; + x86_pmu.pebs_latency_data = grt_latency_data; x86_pmu.get_event_constraints = tnt_get_event_constraints; td_attr = tnt_events_attrs; mem_attr = grt_mem_attrs; @@ -6656,7 +6656,7 @@ __init int intel_pmu_init(void) intel_pmu_init_grt(NULL); x86_pmu.extra_regs = intel_cmt_extra_regs; intel_pmu_pebs_data_source_cmt(); - x86_pmu.pebs_latency_data = mtl_latency_data_small; + x86_pmu.pebs_latency_data = cmt_latency_data; x86_pmu.get_event_constraints = cmt_get_event_constraints; td_attr = cmt_events_attrs; mem_attr = grt_mem_attrs; @@ -7012,7 +7012,7 @@ __init int intel_pmu_init(void) */ intel_pmu_init_hybrid(hybrid_big_small); - x86_pmu.pebs_latency_data = adl_latency_data_small; + x86_pmu.pebs_latency_data = grt_latency_data; x86_pmu.get_event_constraints = adl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type; @@ -7069,7 +7069,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_METEORLAKE_L: intel_pmu_init_hybrid(hybrid_big_small); - x86_pmu.pebs_latency_data = mtl_latency_data_small; + x86_pmu.pebs_latency_data = cmt_latency_data; x86_pmu.get_event_constraints = mtl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 6c6841cc87bfd..fe4fac03e562e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -256,8 +256,8 @@ static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock) } /* Retrieve the latency data for e-core of ADL */ -static u64 __adl_latency_data_small(struct perf_event *event, u64 status, - u8 dse, bool tlb, bool lock, bool blk) +static u64 __grt_latency_data(struct perf_event *event, u64 status, + u8 dse, bool tlb, bool lock, bool blk) { u64 val; @@ -276,27 +276,27 @@ static u64 __adl_latency_data_small(struct perf_event *event, u64 status, return val; } -u64 adl_latency_data_small(struct perf_event *event, u64 status) +u64 grt_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; dse.val = status; - return __adl_latency_data_small(event, status, dse.ld_dse, - dse.ld_locked, dse.ld_stlb_miss, - dse.ld_data_blk); + return __grt_latency_data(event, status, dse.ld_dse, + dse.ld_locked, dse.ld_stlb_miss, + dse.ld_data_blk); } /* Retrieve the latency data for e-core of MTL */ -u64 mtl_latency_data_small(struct perf_event *event, u64 status) +u64 cmt_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; dse.val = status; - return __adl_latency_data_small(event, status, dse.mtl_dse, - dse.mtl_stlb_miss, dse.mtl_locked, - dse.mtl_fwd_blk); + return __grt_latency_data(event, status, dse.mtl_dse, + dse.mtl_stlb_miss, dse.mtl_locked, + dse.mtl_fwd_blk); } static u64 load_latency_data(struct perf_event *event, u64 status) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f2a2eed45010a..7298a61d8538f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1536,9 +1536,9 @@ void intel_pmu_disable_bts(void); int intel_pmu_drain_bts_buffer(void); -u64 adl_latency_data_small(struct perf_event *event, u64 status); +u64 grt_latency_data(struct perf_event *event, u64 status); -u64 mtl_latency_data_small(struct perf_event *event, u64 status); +u64 cmt_latency_data(struct perf_event *event, u64 status); extern struct event_constraint intel_core2_pebs_event_constraints[]; From 0af83c6f2a227266d752ee9971fc34ae9ee572ab Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:37 -0700 Subject: [PATCH 15/89] perf/x86/intel: Support new data source for Lunar Lake commit 608f6976c309793ceea37292c54b057dab091944 upstream. A new PEBS data source format is introduced for the p-core of Lunar Lake. The data source field is extended to 8 bits with new encodings. A new layout is introduced into the union intel_x86_pebs_dse. Introduce the lnl_latency_data() to parse the new format. Enlarge the pebs_data_source[] accordingly to include new encodings. Only the mem load and the mem store events can generate the data source. Introduce INTEL_HYBRID_LDLAT_CONSTRAINT and INTEL_HYBRID_STLAT_CONSTRAINT to mark them. Add two new bits for the new cache-related data src, L2_MHB and MSC. The L2_MHB is short for L2 Miss Handling Buffer, which is similar to LFB (Line Fill Buffer), but to track the L2 Cache misses. The MSC stands for the memory-side cache. Intel-SIG: commit 608f6976c309 perf/x86/intel: Support new data source for Lunar Lake Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-6-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 2 + arch/x86/events/intel/ds.c | 94 ++++++++++++++++++++++++++++++++- arch/x86/events/perf_event.h | 16 +++++- include/uapi/linux/perf_event.h | 6 ++- 4 files changed, 113 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e3d4daa5b6e3d..258e755240420 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -7098,6 +7098,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ARROWLAKE: intel_pmu_init_hybrid(hybrid_big_small); + x86_pmu.pebs_latency_data = lnl_latency_data; x86_pmu.get_event_constraints = mtl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; @@ -7115,6 +7116,7 @@ __init int intel_pmu_init(void) pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; intel_pmu_init_skt(&pmu->pmu); + intel_pmu_pebs_data_source_lnl(); pr_cont("Lunarlake Hybrid events, "); name = "lunarlake_hybrid"; break; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index fe4fac03e562e..9e679033e96ec 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -62,6 +62,15 @@ union intel_x86_pebs_dse { unsigned int mtl_fwd_blk:1; unsigned int ld_reserved4:24; }; + struct { + unsigned int lnc_dse:8; + unsigned int ld_reserved5:2; + unsigned int lnc_stlb_miss:1; + unsigned int lnc_locked:1; + unsigned int lnc_data_blk:1; + unsigned int lnc_addr_blk:1; + unsigned int ld_reserved6:18; + }; }; @@ -76,7 +85,7 @@ union intel_x86_pebs_dse { #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) /* Version for Sandy Bridge and later */ -static u64 pebs_data_source[] = { +static u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = { P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ @@ -172,6 +181,40 @@ void __init intel_pmu_pebs_data_source_cmt(void) __intel_pmu_pebs_data_source_cmt(pebs_data_source); } +/* Version for Lion Cove and later */ +static u64 lnc_pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA), /* 0x00: ukn L3 */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 hit */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: LFB/L1 Miss Handling Buffer hit */ + 0, /* 0x04: Reserved */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x05: L2 Hit */ + OP_LH | LEVEL(L2_MHB) | P(SNOOP, NONE), /* 0x06: L2 Miss Handling Buffer Hit */ + 0, /* 0x07: Reserved */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x08: L3 Hit */ + 0, /* 0x09: Reserved */ + 0, /* 0x0a: Reserved */ + 0, /* 0x0b: Reserved */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD), /* 0x0c: L3 Hit Snoop Fwd */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0d: L3 Hit Snoop HitM */ + 0, /* 0x0e: Reserved */ + P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0f: L3 Miss Snoop HitM */ + OP_LH | LEVEL(MSC) | P(SNOOP, NONE), /* 0x10: Memory-side Cache Hit */ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, NONE), /* 0x11: Local Memory Hit */ +}; + +void __init intel_pmu_pebs_data_source_lnl(void) +{ + u64 *data_source; + + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source; + memcpy(data_source, lnc_pebs_data_source, sizeof(lnc_pebs_data_source)); + + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source; + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); + __intel_pmu_pebs_data_source_cmt(data_source); +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; @@ -263,7 +306,7 @@ static u64 __grt_latency_data(struct perf_event *event, u64 status, WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big); - dse &= PERF_PEBS_DATA_SOURCE_MASK; + dse &= PERF_PEBS_DATA_SOURCE_GRT_MASK; val = hybrid_var(event->pmu, pebs_data_source)[dse]; pebs_set_tlb_lock(&val, tlb, lock); @@ -299,6 +342,51 @@ u64 cmt_latency_data(struct perf_event *event, u64 status) dse.mtl_fwd_blk); } +static u64 lnc_latency_data(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + union perf_mem_data_src src; + u64 val; + + dse.val = status; + + /* LNC core latency data */ + val = hybrid_var(event->pmu, pebs_data_source)[status & PERF_PEBS_DATA_SOURCE_MASK]; + if (!val) + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA); + + if (dse.lnc_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + if (dse.lnc_locked) + val |= P(LOCK, LOCKED); + + if (dse.lnc_data_blk) + val |= P(BLK, DATA); + if (dse.lnc_addr_blk) + val |= P(BLK, ADDR); + if (!dse.lnc_data_blk && !dse.lnc_addr_blk) + val |= P(BLK, NA); + + src.val = val; + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + src.mem_op = P(OP, STORE); + + return src.val; +} + +u64 lnl_latency_data(struct perf_event *event, u64 status) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->pmu_type == hybrid_small) + return cmt_latency_data(event, status); + + return lnc_latency_data(event, status); +} + static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; @@ -1089,6 +1177,8 @@ struct event_constraint intel_lnc_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3ff), + INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7298a61d8538f..4a7924c851519 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -476,6 +476,14 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID) +#define INTEL_HYBRID_LDLAT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID|PERF_X86_EVENT_PEBS_LD_HSW) + +#define INTEL_HYBRID_STLAT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID|PERF_X86_EVENT_PEBS_ST_HSW) + /* Event constraint, but match on all event flags too. */ #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS) @@ -655,8 +663,10 @@ enum { x86_lbr_exclusive_max, }; -#define PERF_PEBS_DATA_SOURCE_MAX 0x10 +#define PERF_PEBS_DATA_SOURCE_MAX 0x100 #define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1) +#define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10 +#define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1) enum hybrid_cpu_type { HYBRID_INTEL_NONE, @@ -1540,6 +1550,8 @@ u64 grt_latency_data(struct perf_event *event, u64 status); u64 cmt_latency_data(struct perf_event *event, u64 status); +u64 lnl_latency_data(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; @@ -1661,6 +1673,8 @@ void intel_pmu_pebs_data_source_mtl(void); void intel_pmu_pebs_data_source_cmt(void); +void intel_pmu_pebs_data_source_lnl(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3a64499b0f5d6..4842c36fdf801 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1349,12 +1349,14 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ +/* 0x7 available */ #define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ #define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ #define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ From 32378de50d3f36e15b4351c2c08ac4dd4c064611 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:38 -0700 Subject: [PATCH 16/89] perf/x86: Add config_mask to represent EVENTSEL bitmask commit e8fb5d6e765838e913253ef7c9b6fd8ec76c8d53 upstream. Different vendors may support different fields in EVENTSEL MSR, such as Intel would introduce new fields umask2 and eq bits in EVENTSEL MSR since Perfmon version 6. However, a fixed mask X86_RAW_EVENT_MASK is used to filter the attr.config. Introduce a new config_mask to record the real supported EVENTSEL bitmask. Only apply it to the existing code now. No functional change. Intel-SIG: commit e8fb5d6e7658 perf/x86: Add config_mask to represent EVENTSEL bitmask Backport CWF PMU support and dependency Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-7-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 5 ++++- arch/x86/events/intel/core.c | 1 + arch/x86/events/perf_event.h | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 59aaa8965ba1d..bf788e8761346 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -626,7 +626,7 @@ int x86_pmu_hw_config(struct perf_event *event) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; if (event->attr.type == event->pmu->type) - event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + event->hw.config |= x86_pmu_get_event_config(event); if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; @@ -2100,6 +2100,9 @@ static int __init init_hw_perf_events(void) if (!x86_pmu.intel_ctrl) x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; + if (!x86_pmu.config_mask) + x86_pmu.config_mask = X86_RAW_EVENT_MASK; + perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 258e755240420..30334a2910932 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6287,6 +6287,7 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) pmu->cntr_mask64 = x86_pmu.cntr_mask64; pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64; pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64); + pmu->config_mask = X86_RAW_EVENT_MASK; pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, pmu->cntr_mask64, 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4a7924c851519..6a1303ae58714 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -695,6 +695,7 @@ struct x86_hybrid_pmu { union perf_capabilities intel_cap; u64 intel_ctrl; u64 pebs_events_mask; + u64 config_mask; union { u64 cntr_mask64; unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; @@ -790,6 +791,7 @@ struct x86_pmu { int (*rdpmc_index)(int index); u64 (*event_map)(int); int max_events; + u64 config_mask; union { u64 cntr_mask64; unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; @@ -1242,6 +1244,11 @@ static inline int x86_pmu_max_num_counters_fixed(struct pmu *pmu) return fls64(hybrid(pmu, fixed_cntr_mask64)); } +static inline u64 x86_pmu_get_event_config(struct perf_event *event) +{ + return event->attr.config & hybrid(event->pmu, config_mask); +} + extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; From c578aa1ba7ba36e4b808c30036b2b328248e89f2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:39 -0700 Subject: [PATCH 17/89] perf/x86/intel: Support PERFEVTSEL extension commit dce0c74d2d180ce21d074b4f977821a567ab0020 upstream. Two new fields (the unit mask2, and the equal flag) are added in the IA32_PERFEVTSELx MSRs. They can be enumerated by the CPUID.23H.0.EBX. Update the config_mask in x86_pmu and x86_hybrid_pmu for the true layout of the PERFEVTSEL. Expose the new formats into sysfs if they are available. The umask extension reuses the same format attr name "umask" as the previous umask. Add umask2_show to determine/display the correct format for the current machine. Intel-SIG: commit dce0c74d2d18 perf/x86/intel: Support PERFEVTSEL extension Backport CWF PMU support and dependency Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-8-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi [jz: due to following commit 47a973fd75639 ("perf/x86/intel: Fix ARCH_PERFMON_NUM_COUNTER_LEAF") has already been backported by stable branch, - resolve context conflict in update_pmu_cap(). - definition for ARCH_PERFMON_EXT_UMASK2 and ARCH_PERFMON_EXT_EQ are not added because upstream commit 47a973fd75639 have removed them] Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 64 ++++++++++++++++++++++++++++++- arch/x86/include/asm/perf_event.h | 2 + 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 30334a2910932..17fbc24db5d3b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4730,8 +4730,55 @@ PMU_FORMAT_ATTR(pc, "config:19" ); PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ PMU_FORMAT_ATTR(inv, "config:23" ); PMU_FORMAT_ATTR(cmask, "config:24-31" ); -PMU_FORMAT_ATTR(in_tx, "config:32"); -PMU_FORMAT_ATTR(in_tx_cp, "config:33"); +PMU_FORMAT_ATTR(in_tx, "config:32" ); +PMU_FORMAT_ATTR(in_tx_cp, "config:33" ); +PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */ + +static ssize_t umask2_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + u64 mask = hybrid(dev_get_drvdata(dev), config_mask) & ARCH_PERFMON_EVENTSEL_UMASK2; + + if (mask == ARCH_PERFMON_EVENTSEL_UMASK2) + return sprintf(page, "config:8-15,40-47\n"); + + /* Roll back to the old format if umask2 is not supported. */ + return sprintf(page, "config:8-15\n"); +} + +static struct device_attribute format_attr_umask2 = + __ATTR(umask, 0444, umask2_show, NULL); + +static struct attribute *format_evtsel_ext_attrs[] = { + &format_attr_umask2.attr, + &format_attr_eq.attr, + NULL +}; + +static umode_t +evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + u64 mask; + + /* + * The umask and umask2 have different formats but share the + * same attr name. In update mode, the previous value of the + * umask is unconditionally removed before is_visible. If + * umask2 format is not enumerated, it's impossible to roll + * back to the old format. + * Does the check in umask2_show rather than is_visible. + */ + if (i == 0) + return attr->mode; + + mask = hybrid(dev_get_drvdata(dev), config_mask); + if (i == 1) + return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0; + + return 0; +} static struct attribute *intel_arch_formats_attr[] = { &format_attr_event.attr, @@ -4890,6 +4937,11 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); + if (ebx.split.umask2) + pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; + if (ebx.split.eq) + pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; + if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, &cntr, &fixed_cntr, &ecx, &edx); @@ -5935,6 +5987,12 @@ static struct attribute_group group_format_extra_skl = { .is_visible = exra_is_visible, }; +static struct attribute_group group_format_evtsel_ext = { + .name = "format", + .attrs = format_evtsel_ext_attrs, + .is_visible = evtsel_ext_is_visible, +}; + static struct attribute_group group_default = { .attrs = intel_pmu_attrs, .is_visible = default_is_visible, @@ -5948,6 +6006,7 @@ static const struct attribute_group *attr_update[] = { &group_caps_lbr, &group_format_extra, &group_format_extra_skl, + &group_format_evtsel_ext, &group_default, NULL, }; @@ -6185,6 +6244,7 @@ static const struct attribute_group *hybrid_attr_update[] = { &group_caps_gen, &group_caps_lbr, &hybrid_group_format_extra, + &group_format_evtsel_ext, &group_default, &hybrid_group_cpus, NULL, diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 30ee476708b29..ccc5dc13c9f42 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -32,6 +32,8 @@ #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL #define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35) +#define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) +#define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) #define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 From 8022edd44464988c449a5d5a5d40dddfaf64faf4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:40 -0700 Subject: [PATCH 18/89] perf/x86/intel: Support Perfmon MSRs aliasing commit 149fd4712bcd492a031945f92e5ce19879f62311 upstream. The architectural performance monitoring V6 supports a new range of counters' MSRs in the 19xxH address range. They include all the GP counter MSRs, the GP control MSRs, and the fixed counter MSRs. The step between each sibling counter is 4. Add intel_pmu_addr_offset() to calculate the correct offset. Add fixedctr in struct x86_pmu to store the address of the fixed counter 0. It can be used to calculate the rest of the fixed counters. The MSR address of the fixed counter control is not changed. Intel-SIG: commit 149fd4712bcd perf/x86/intel: Support Perfmon MSRs aliasing Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-9-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 7 +++---- arch/x86/events/intel/core.c | 17 ++++++++++++++++- arch/x86/events/perf_event.h | 7 +++++++ arch/x86/include/asm/msr-index.h | 6 ++++++ 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index bf788e8761346..52e309028cb7f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1238,8 +1238,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, fallthrough; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + - (idx - INTEL_PMC_IDX_FIXED); + hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED); hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | INTEL_PMC_FIXED_RDPMC_BASE; break; @@ -1575,7 +1574,7 @@ void perf_event_print_debug(void) for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + rdmsrl(x86_pmu_fixed_ctr_addr(idx), pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); @@ -2485,7 +2484,7 @@ void perf_clear_dirty_counters(void) if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask))) continue; - wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0); + wrmsrl(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0); } else { wrmsrl(x86_pmu_event_addr(i), 0); } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 17fbc24db5d3b..eaa85f55787b8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2948,7 +2948,7 @@ static void intel_pmu_reset(void) for_each_set_bit(idx, fixed_cntr_mask, INTEL_PMC_MAX_FIXED) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; - wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + wrmsrl_safe(x86_pmu_fixed_ctr_addr(idx), 0ull); } if (ds) @@ -5291,6 +5291,7 @@ static __initconst const struct x86_pmu core_pmu = { .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .fixedctr = MSR_ARCH_PERFMON_FIXED_CTR0, .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, @@ -5344,6 +5345,7 @@ static __initconst const struct x86_pmu intel_pmu = { .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .fixedctr = MSR_ARCH_PERFMON_FIXED_CTR0, .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, @@ -6318,6 +6320,11 @@ static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs) } } +static inline int intel_pmu_v6_addr_offset(int index, bool eventsel) +{ + return MSR_IA32_PMC_V6_STEP * index; +} + static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = { { hybrid_small, "cpu_atom" }, { hybrid_big, "cpu_core" }, @@ -7287,6 +7294,14 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } + /* Support V6+ MSR Aliasing */ + if (x86_pmu.version >= 6) { + x86_pmu.perfctr = MSR_IA32_PMC_V6_GP0_CTR; + x86_pmu.eventsel = MSR_IA32_PMC_V6_GP0_CFG_A; + x86_pmu.fixedctr = MSR_IA32_PMC_V6_FX0_CTR; + x86_pmu.addr_offset = intel_pmu_v6_addr_offset; + } + if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 6a1303ae58714..c6f773fb18a3f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -787,6 +787,7 @@ struct x86_pmu { int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; unsigned perfctr; + unsigned fixedctr; int (*addr_offset)(int index, bool eventsel); int (*rdpmc_index)(int index); u64 (*event_map)(int); @@ -1145,6 +1146,12 @@ static inline unsigned int x86_pmu_event_addr(int index) x86_pmu.addr_offset(index, false) : index); } +static inline unsigned int x86_pmu_fixed_ctr_addr(int index) +{ + return x86_pmu.fixedctr + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, false) : index); +} + static inline int x86_pmu_rdpmc_index(int index) { return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index fdf5663ea492e..c1e6214372ce7 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -567,6 +567,12 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* V6 PMON MSR range */ +#define MSR_IA32_PMC_V6_GP0_CTR 0x1900 +#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_STEP 4 + /* KeyID partitioning between MKTME and TDX */ #define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087 From b2bd7e9331171e55c1f8a1d08b944dac0fc9438a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 Nov 2024 05:55:02 -0800 Subject: [PATCH 19/89] perf/x86/intel/ds: Clarify adaptive PEBS processing commit 7087bfb0adc9a12ec3b463b1d38072c5efce5d6c upstream. Modify the pebs_basic and pebs_meminfo structs to make the bitfields more explicit to ease readability of the code. Intel-SIG: commit 7087bfb0adc9 perf/x86/intel/ds: Clarify adaptive PEBS processing Backport CWF PMU support and dependency Co-developed-by: Stephane Eranian Signed-off-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241119135504.1463839-3-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 43 ++++++++++++++----------------- arch/x86/include/asm/perf_event.h | 16 ++++++++++-- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9e679033e96ec..289b8f7554934 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1897,8 +1897,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, } #define PEBS_LATENCY_MASK 0xffff -#define PEBS_CACHE_LATENCY_OFFSET 32 -#define PEBS_RETIRE_LATENCY_OFFSET 32 /* * With adaptive PEBS the layout depends on what fields are configured. @@ -1912,8 +1910,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_basic *basic = __pebs; void *next_record = basic + 1; - u64 sample_type; - u64 format_size; + u64 sample_type, format_group; struct pebs_meminfo *meminfo = NULL; struct pebs_gprs *gprs = NULL; struct x86_perf_regs *perf_regs; @@ -1925,7 +1922,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_regs->xmm_regs = NULL; sample_type = event->attr.sample_type; - format_size = basic->format_size; + format_group = basic->format_group; perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; @@ -1947,7 +1944,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) - data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK; + data->weight.var3_w = basic->retire_latency; else data->weight.var3_w = 0; } @@ -1957,12 +1954,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * But PERF_SAMPLE_TRANSACTION needs gprs->ax. * Save the pointer here but process later. */ - if (format_size & PEBS_DATACFG_MEMINFO) { + if (format_group & PEBS_DATACFG_MEMINFO) { meminfo = next_record; next_record = meminfo + 1; } - if (format_size & PEBS_DATACFG_GP) { + if (format_group & PEBS_DATACFG_GP) { gprs = next_record; next_record = gprs + 1; @@ -1975,14 +1972,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, adaptive_pebs_save_regs(regs, gprs); } - if (format_size & PEBS_DATACFG_MEMINFO) { + if (format_group & PEBS_DATACFG_MEMINFO) { if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { - u64 weight = meminfo->latency; + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->cache_latency : meminfo->mem_latency; - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { - data->weight.var2_w = weight & PEBS_LATENCY_MASK; - weight >>= PEBS_CACHE_LATENCY_OFFSET; - } + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) + data->weight.var2_w = meminfo->instr_latency; /* * Although meminfo::latency is defined as a u64, @@ -1990,12 +1986,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * in practice on Ice Lake and earlier platforms. */ if (sample_type & PERF_SAMPLE_WEIGHT) { - data->weight.full = weight ?: + data->weight.full = latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); } else { - data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: + data->weight.var1_dw = (u32)latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } @@ -2016,16 +2013,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } - if (format_size & PEBS_DATACFG_XMMS) { + if (format_group & PEBS_DATACFG_XMMS) { struct pebs_xmm *xmm = next_record; next_record = xmm + 1; perf_regs->xmm_regs = xmm->xmm; } - if (format_size & PEBS_DATACFG_LBRS) { + if (format_group & PEBS_DATACFG_LBRS) { struct lbr_entry *lbr = next_record; - int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) + int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; next_record = next_record + num_lbr * sizeof(struct lbr_entry); @@ -2035,11 +2032,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } - WARN_ONCE(next_record != __pebs + (format_size >> 48), - "PEBS record size %llu, expected %llu, config %llx\n", - format_size >> 48, + WARN_ONCE(next_record != __pebs + basic->format_size, + "PEBS record size %u, expected %llu, config %llx\n", + basic->format_size, (u64)(next_record - __pebs), - basic->format_size); + format_group); } static inline void * diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ccc5dc13c9f42..2b09f718e31f4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -444,7 +444,9 @@ static inline bool is_topdown_idx(int idx) */ struct pebs_basic { - u64 format_size; + u64 format_group:32, + retire_latency:16, + format_size:16; u64 ip; u64 applicable_counters; u64 tsc; @@ -453,7 +455,17 @@ struct pebs_basic { struct pebs_meminfo { u64 address; u64 aux; - u64 latency; + union { + /* pre Alder Lake */ + u64 mem_latency; + /* Alder Lake and later */ + struct { + u64 instr_latency:16; + u64 pad2:16; + u64 cache_latency:16; + u64 pad3:16; + }; + }; u64 tsx_tuning; }; From a50324824ca1bb5d69a584cd0208657a4fae8338 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 Nov 2024 05:55:03 -0800 Subject: [PATCH 20/89] perf/x86/intel/ds: Factor out functions for PEBS records processing commit 3c00ed344cef4dbb57d8769b961af414132a173a upstream. Factor out functions to process normal and the last PEBS records, which can be shared with the later patch. Move the event updating related codes (intel_pmu_save_and_restart()) to the end, where all samples have been processed. For the current usage, it doesn't matter when perf updates event counts and reset the counter. Because all counters are stopped when the PEBS buffer is drained. Drop the return of the !intel_pmu_save_and_restart(event) check. Because it never happen. The intel_pmu_save_and_restart(event) only returns 0, when !hwc->event_base or the period_left > 0. - The !hwc->event_base is impossible for the PEBS event, since the PEBS event is only available on GP and fixed counters, which always have a valid hwc->event_base. - The check only happens for the case of non-AUTO_RELOAD and single PEBS, which implies that the event must be overflowed. The period_left must be always <= 0 for an overflowed event after the x86_pmu_update(). Intel-SIG: commit 3c00ed344cef perf/x86/intel/ds: Factor out functions for PEBS records processing Backport CWF PMU support and dependency Co-developed-by: "Peter Zijlstra (Intel)" Signed-off-by: "Peter Zijlstra (Intel)" Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241119135504.1463839-4-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 109 +++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 42 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 289b8f7554934..e0ae49c52e6ea 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2138,46 +2138,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) return 0; } +typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *, + struct perf_sample_data *, struct pt_regs *); + +static struct pt_regs dummy_iregs; + static __always_inline void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, + struct pt_regs *regs, struct perf_sample_data *data, - void *base, void *top, - int bit, int count, - void (*setup_sample)(struct perf_event *, - struct pt_regs *, - void *, - struct perf_sample_data *, - struct pt_regs *)) + void *at, + setup_fn setup_sample) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - struct x86_perf_regs perf_regs; - struct pt_regs *regs = &perf_regs.regs; - void *at = get_next_pebs_record_by_bit(base, top, bit); - static struct pt_regs dummy_iregs; - - if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - /* - * Now, auto-reload is only enabled in fixed period mode. - * The reload value is always hwc->sample_period. - * May need to change it, if auto-reload is enabled in - * freq mode later. - */ - intel_pmu_save_and_restart_reload(event, count); - } else if (!intel_pmu_save_and_restart(event)) - return; - - if (!iregs) - iregs = &dummy_iregs; + setup_sample(event, iregs, at, data, regs); + perf_event_output(event, data, regs); +} - while (count > 1) { - setup_sample(event, iregs, at, data, regs); - perf_event_output(event, data, regs); - at += cpuc->pebs_record_size; - at = get_next_pebs_record_by_bit(at, top, bit); - count--; - } +static __always_inline void +__intel_pmu_pebs_last_event(struct perf_event *event, + struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + void *at, + int count, + setup_fn setup_sample) +{ + struct hw_perf_event *hwc = &event->hw; setup_sample(event, iregs, at, data, regs); if (iregs == &dummy_iregs) { @@ -2196,6 +2183,44 @@ __intel_pmu_pebs_event(struct perf_event *event, if (perf_event_overflow(event, data, regs)) x86_pmu_stop(event, 0); } + + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. + */ + intel_pmu_save_and_restart_reload(event, count); + } else + intel_pmu_save_and_restart(event); +} + +static __always_inline void +__intel_pmu_pebs_events(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data, + void *base, void *top, + int bit, int count, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + void *at = get_next_pebs_record_by_bit(base, top, bit); + int cnt = count; + + if (!iregs) + iregs = &dummy_iregs; + + while (cnt > 1) { + __intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample); + at += cpuc->pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + cnt--; + } + + __intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample); } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data) @@ -2232,8 +2257,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_ return; } - __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n, - setup_pebs_fixed_sample_data); + __intel_pmu_pebs_events(event, iregs, data, at, top, 0, n, + setup_pebs_fixed_sample_data); } static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) @@ -2364,9 +2389,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } if (counts[bit]) { - __intel_pmu_pebs_event(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_fixed_sample_data); + __intel_pmu_pebs_events(event, iregs, data, base, + top, bit, counts[bit], + setup_pebs_fixed_sample_data); } } } @@ -2418,9 +2443,9 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d if (WARN_ON_ONCE(!event->attr.precise_ip)) continue; - __intel_pmu_pebs_event(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_adaptive_sample_data); + __intel_pmu_pebs_events(event, iregs, data, base, + top, bit, counts[bit], + setup_pebs_adaptive_sample_data); } } From ae83a658c2aacb9e65f1253d1b1a44da83ac1bee Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 Nov 2024 05:55:04 -0800 Subject: [PATCH 21/89] perf/x86/intel/ds: Simplify the PEBS records processing for adaptive PEBS commit ae55e308bde2267df79c4475daa85e174b7ab4c8 upstream. The current code may iterate all the PEBS records in the DS area several times. The first loop is to find all active events and calculate the available records for each event. Then iterate the whole buffer again and again to process available records until all active events are processed. The algorithm is inherited from the old generations. The old PEBS hardware does not deal well with the situation when events happen near each other. SW has to drop the error records. Multiple iterations are required. The hardware limit has been addressed on newer platforms with adaptive PEBS. A simple one-iteration algorithm is introduced. The samples are output by record order with the patch, rather than the event order. It doesn't impact the post-processing. The perf tool always sorts the records by time before presenting them to the end user. In an NMI, the last record has to be specially handled. Add a last[] variable to track the last unprocessed record of each event. Test: 11 PEBS events are used in the perf test. Only the basic information is collected. perf record -e instructions:up,...,instructions:up -c 2000003 benchmark The ftrace is used to record the duration of the intel_pmu_drain_pebs_icl(). The average duration reduced from 62.04us to 57.94us. A small improvement can be observed with the new algorithm. Also, the implementation becomes simpler and more straightforward. Intel-SIG: commit ae55e308bde2 perf/x86/intel/ds: Simplify the PEBS records processing for adaptive PEBS Backport CWF PMU support and dependency Suggested-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20241119135504.1463839-5-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 43 +++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index e0ae49c52e6ea..66924822063a4 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2399,8 +2399,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + struct pebs_basic *basic; struct perf_event *event; void *base, *at, *top; int bit; @@ -2422,30 +2426,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d return; } - for (at = base; at < top; at += cpuc->pebs_record_size) { + if (!iregs) + iregs = &dummy_iregs; + + /* Process all but the last event for each counter. */ + for (at = base; at < top; at += basic->format_size) { u64 pebs_status; - pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; - pebs_status &= mask; + basic = at; + if (basic->format_size != cpuc->pebs_record_size) + continue; + + pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { + event = cpuc->events[bit]; - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) - counts[bit]++; + if (WARN_ON_ONCE(!event) || + WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + if (counts[bit]++) { + __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], + setup_pebs_adaptive_sample_data); + } + last[bit] = at; + } } for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { - if (counts[bit] == 0) + if (!counts[bit]) continue; event = cpuc->events[bit]; - if (WARN_ON_ONCE(!event)) - continue; - - if (WARN_ON_ONCE(!event->attr.precise_ip)) - continue; - __intel_pmu_pebs_events(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_adaptive_sample_data); + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], + counts[bit], setup_pebs_adaptive_sample_data); } } From f787c13ec74939050fd4510de492d27856d88bab Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 11 Dec 2024 08:03:17 -0800 Subject: [PATCH 22/89] perf/x86/intel: Support RDPMC metrics clear mode commit 0e45818ec1896c2b4aee0ec6721022ad625ea531 upstream. The new RDPMC enhancement, metrics clear mode, is to clear the PERF_METRICS-related resources as well as the fixed-function performance monitoring counter 3 after the read is performed. It is available for ring 3. The feature is enumerated by the IA32_PERF_CAPABILITIES.RDPMC_CLEAR_METRICS[bit 19]. To enable the feature, the IA32_FIXED_CTR_CTRL.METRICS_CLEAR_EN[bit 14] must be set. Two ways were considered to enable the feature. - Expose a knob in the sysfs globally. One user may affect the measurement of other users when changing the knob. The solution is dropped. - Introduce a new event format, metrics_clear, for the slots event to disable/enable the feature only for the current process. Users can utilize the feature as needed. The latter solution is implemented in the patch. The current KVM doesn't support the perf metrics yet. For virtualization, the feature can be enabled later separately. Intel-SIG: commit 0e45818ec189 perf/x86/intel: Support RDPMC metrics clear mode Backport CWF PMU support and dependency Suggested-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20241211160318.235056-1-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 20 +++++++++++++++++++- arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 4 ++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index eaa85f55787b8..85e6b041a242d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2811,6 +2811,9 @@ static void intel_pmu_enable_fixed(struct perf_event *event) return; idx = INTEL_PMC_IDX_FIXED_SLOTS; + + if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR) + bits |= INTEL_FIXED_3_METRICS_CLEAR; } intel_set_masks(event, idx); @@ -4147,7 +4150,12 @@ static int intel_pmu_hw_config(struct perf_event *event) * is used in a metrics group, it too cannot support sampling. */ if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) { - if (event->attr.config1 || event->attr.config2) + /* The metrics_clear can only be set for the slots event */ + if (event->attr.config1 && + (!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR))) + return -EINVAL; + + if (event->attr.config2) return -EINVAL; /* @@ -4734,6 +4742,8 @@ PMU_FORMAT_ATTR(in_tx, "config:32" ); PMU_FORMAT_ATTR(in_tx_cp, "config:33" ); PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */ +PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ + static ssize_t umask2_show(struct device *dev, struct device_attribute *attr, char *page) @@ -4753,6 +4763,7 @@ static struct device_attribute format_attr_umask2 = static struct attribute *format_evtsel_ext_attrs[] = { &format_attr_umask2.attr, &format_attr_eq.attr, + &format_attr_metrics_clear.attr, NULL }; @@ -4777,6 +4788,13 @@ evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i) if (i == 1) return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0; + /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ + if (i == 2) { + union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap); + + return intel_cap.rdpmc_metrics_clear ? attr->mode : 0; + } + return 0; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index c6f773fb18a3f..658d8f0360a5d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -624,6 +624,7 @@ union perf_capabilities { u64 pebs_output_pt_available:1; u64 pebs_timing_info:1; u64 anythread_deprecated:1; + u64 rdpmc_metrics_clear:1; }; u64 capabilities; }; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 2b09f718e31f4..8e5ac206ca344 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -41,6 +41,7 @@ #define INTEL_FIXED_0_USER (1ULL << 1) #define INTEL_FIXED_0_ANYTHREAD (1ULL << 2) #define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3) +#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2) #define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) @@ -394,6 +395,9 @@ static inline bool use_fixed_pseudo_encoding(u64 code) #define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND #define INTEL_TD_METRIC_NUM 8 +#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0 +#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT) + static inline bool is_metric_idx(int idx) { return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM; From 02b88e2aafa24ae57443c2bb57c010410433a2ce Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 11 Dec 2024 08:11:46 -0800 Subject: [PATCH 23/89] perf/x86/intel/uncore: Add Clearwater Forest support commit b6ccddd6fe1fd49c7a82b6fbed01cccad21a29c7 upstream. From the perspective of the uncore PMU, the Clearwater Forest is the same as the previous Sierra Forest. The only difference is the event list, which will be supported in the perf tool later. Intel-SIG: commit b6ccddd6fe1f perf/x86/intel/uncore: Add Clearwater Forest support Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241211161146.235253-1-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 4d856b51307f3..e6e8d5ca21d60 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1900,6 +1900,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_DARKMONT_X, &gnr_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); From 0125f661a9597d697008cdbeb3212d8c022e334b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 16 Dec 2024 12:45:02 -0800 Subject: [PATCH 24/89] perf/x86/intel/ds: Add PEBS format 6 commit b8c3a2502a205321fe66c356f4b70cabd8e1a5fc upstream. The only difference between 5 and 6 is the new counters snapshotting group, without the following counters snapshotting enabling patches, it's impossible to utilize the feature in a PEBS record. It's safe to share the same code path with format 5. Add format 6, so the end user can at least utilize the legacy PEBS features. Intel-SIG: commit b8c3a2502a20 perf/x86/intel/ds: Add PEBS format 6 Backport CWF PMU support and dependency Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241216204505.748363-1-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 66924822063a4..15f1e352453b3 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2525,6 +2525,7 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; + case 6: case 5: x86_pmu.pebs_ept = 1; fallthrough; From 4be2e3aaf03fcddb0466a8fcd1304a963a510f8e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 8 Jan 2025 06:30:16 -0800 Subject: [PATCH 25/89] perf/x86/intel/uncore: Clean up func_id commit 3f710be02ea648001ba18fb2c9fa7765e743dec2 upstream. The below warning may be triggered on GNR when the PCIE uncore units are exposed. WARNING: CPU: 4 PID: 1 at arch/x86/events/intel/uncore.c:1169 uncore_pci_pmu_register+0x158/0x190 The current uncore driver assumes that all the devices in the same PMU have the exact same devfn. It's true for the previous platforms. But it doesn't work for the new PCIE uncore units on GNR. The assumption doesn't make sense. There is no reason to limit the devices from the same PMU to the same devfn. Also, the current code just throws the warning, but still registers the device. The WARN_ON_ONCE() should be removed. The func_id is used by the later event_init() to check if a event->pmu has valid devices. For cpu and mmio uncore PMUs, they are always valid. For pci uncore PMUs, it's set when the PMU is registered. It can be replaced by the pmu->registered. Clean up the func_id. Intel-SIG: commit 3f710be02ea6 perf/x86/intel/uncore: Clean up func_id Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Eric Hu Link: https://lkml.kernel.org/r/20250108143017.1793781-1-kan.liang@linux.intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore.c | 20 +++++++------------- arch/x86/events/intel/uncore.h | 1 - arch/x86/events/intel/uncore_snb.c | 2 +- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index e6e8d5ca21d60..84d6508e807b5 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -744,7 +744,7 @@ static int uncore_pmu_event_init(struct perf_event *event) pmu = uncore_event_to_pmu(event); /* no device found for this pmu */ - if (pmu->func_id < 0) + if (!pmu->registered) return -ENOENT; /* Sampling not supported yet */ @@ -991,7 +991,7 @@ static void uncore_types_exit(struct intel_uncore_type **types) uncore_type_exit(*types); } -static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) +static int __init uncore_type_init(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmus; size_t size; @@ -1004,7 +1004,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = setid ? i : -1; pmus[i].pmu_idx = i; pmus[i].type = type; pmus[i].boxes = kzalloc(size, GFP_KERNEL); @@ -1054,12 +1053,12 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) } static int __init -uncore_types_init(struct intel_uncore_type **types, bool setid) +uncore_types_init(struct intel_uncore_type **types) { int ret; for (; *types; types++) { - ret = uncore_type_init(*types, setid); + ret = uncore_type_init(*types); if (ret) return ret; } @@ -1159,11 +1158,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev, if (!box) return -ENOMEM; - if (pmu->func_id < 0) - pmu->func_id = pdev->devfn; - else - WARN_ON_ONCE(pmu->func_id != pdev->devfn); - atomic_inc(&box->refcnt); box->dieid = die; box->pci_dev = pdev; @@ -1409,7 +1403,7 @@ static int __init uncore_pci_init(void) goto err; } - ret = uncore_types_init(uncore_pci_uncores, false); + ret = uncore_types_init(uncore_pci_uncores); if (ret) goto errtype; @@ -1677,7 +1671,7 @@ static int __init uncore_cpu_init(void) { int ret; - ret = uncore_types_init(uncore_msr_uncores, true); + ret = uncore_types_init(uncore_msr_uncores); if (ret) goto err; @@ -1696,7 +1690,7 @@ static int __init uncore_mmio_init(void) struct intel_uncore_type **types = uncore_mmio_uncores; int ret; - ret = uncore_types_init(types, true); + ret = uncore_types_init(types); if (ret) goto err; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 027ef292c6022..9ba2305c0d9ae 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -125,7 +125,6 @@ struct intel_uncore_pmu { struct pmu pmu; char name[UNCORE_PMU_NAME_LEN]; int pmu_idx; - int func_id; bool registered; atomic_t activeboxes; cpumask_t cpu_mask; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 7fd4334e12a17..3725bb6571c48 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -881,7 +881,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event) pmu = uncore_event_to_pmu(event); /* no device found for this pmu */ - if (pmu->func_id < 0) + if (!pmu->registered) return -ENOENT; /* Sampling not supported yet */ From 39bc0887bccfa18949b27c3e58f069ac94dc96bf Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 8 Jan 2025 06:30:17 -0800 Subject: [PATCH 26/89] perf/x86/intel/uncore: Support more units on Granite Rapids commit 6d642735cdb6cdb814d2b6c81652caa53ce04842 upstream. The same CXL PMONs support is also avaiable on GNR. Apply spr_uncore_cxlcm and spr_uncore_cxldp to GNR as well. The other units were broken on early HW samples, so they were ignored in the early enabling patch. The issue has been fixed and verified on the later production HW. Add UPI, B2UPI, B2HOT, PCIEX16 and PCIEX8 for GNR. Intel-SIG: commit 6d642735cdb6 perf/x86/intel/uncore: Support more units on Granite Rapids Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Eric Hu Link: https://lkml.kernel.org/r/20250108143017.1793781-2-kan.liang@linux.intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore_snbep.c | 48 ++++++++++++++++++---------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index abb0325b1eb1e..8591d1183ffca 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6593,17 +6593,8 @@ void spr_uncore_mmio_init(void) /* GNR uncore support */ #define UNCORE_GNR_NUM_UNCORE_TYPES 23 -#define UNCORE_GNR_TYPE_15 15 -#define UNCORE_GNR_B2UPI 18 -#define UNCORE_GNR_TYPE_21 21 -#define UNCORE_GNR_TYPE_22 22 int gnr_uncore_units_ignore[] = { - UNCORE_SPR_UPI, - UNCORE_GNR_TYPE_15, - UNCORE_GNR_B2UPI, - UNCORE_GNR_TYPE_21, - UNCORE_GNR_TYPE_22, UNCORE_IGNORE_END }; @@ -6612,6 +6603,31 @@ static struct intel_uncore_type gnr_uncore_ubox = { .attr_update = uncore_alias_groups, }; +static struct intel_uncore_type gnr_uncore_pciex8 = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "pciex8", +}; + +static struct intel_uncore_type gnr_uncore_pciex16 = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "pciex16", +}; + +static struct intel_uncore_type gnr_uncore_upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "upi", +}; + +static struct intel_uncore_type gnr_uncore_b2upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "b2upi", +}; + +static struct intel_uncore_type gnr_uncore_b2hot = { + .name = "b2hot", + .attr_update = uncore_alias_groups, +}; + static struct intel_uncore_type gnr_uncore_b2cmi = { SPR_UNCORE_PCI_COMMON_FORMAT(), .name = "b2cmi", @@ -6636,21 +6652,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { &gnr_uncore_ubox, &spr_uncore_imc, NULL, + &gnr_uncore_upi, NULL, NULL, NULL, + &spr_uncore_cxlcm, + &spr_uncore_cxldp, NULL, - NULL, - NULL, - NULL, - NULL, + &gnr_uncore_b2hot, &gnr_uncore_b2cmi, &gnr_uncore_b2cxl, - NULL, + &gnr_uncore_b2upi, NULL, &gnr_uncore_mdf_sbo, - NULL, - NULL, + &gnr_uncore_pciex16, + &gnr_uncore_pciex8, }; static struct freerunning_counters gnr_iio_freerunning[] = { From 7fe7d1a22b131dc1d7fb0b363e698aadee049cfb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 21 Jan 2025 07:23:03 -0800 Subject: [PATCH 27/89] perf/x86/intel: Support PEBS counters snapshotting commit e02e9b0374c378aab016ae8ace60d9d98ab8caa6 upstream. The counters snapshotting is a new adaptive PEBS extension, which can capture programmable counters, fixed-function counters, and performance metrics in a PEBS record. The feature is available in the PEBS format V6. The target counters can be configured in the new fields of MSR_PEBS_CFG. Then the PEBS HW will generate the bit mask of counters (Counters Group Header) followed by the content of all the requested counters into a PEBS record. The current Linux perf sample read feature can read all events in the group when any event in the group is overflowed. But the rdpmc in the NMI/overflow handler has a small gap from overflow. Also, there is some overhead for each rdpmc read. The counters snapshotting feature can be used as an accurate and low-overhead replacement. Extend intel_update_topdown_event() to accept the value from PEBS records. Add a new PEBS_CNTR flag to indicate a sample read group that utilizes the counters snapshotting feature. When the group is scheduled, the PEBS configure can be updated accordingly. To prevent the case that a PEBS record value might be in the past relative to what is already in the event, perf always stops the PMU and drains the PEBS buffer before updating the corresponding event->count. Intel-SIG: commit e02e9b0374c3 perf/x86/intel: Support PEBS counters snapshotting Backport CWF PMU support and dependency Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250121152303.3128733-4-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 13 ++ arch/x86/events/intel/core.c | 75 ++++++++--- arch/x86/events/intel/ds.c | 191 +++++++++++++++++++++++++++-- arch/x86/events/perf_event.h | 13 ++ arch/x86/events/perf_event_flags.h | 2 +- arch/x86/include/asm/perf_event.h | 15 +++ 6 files changed, 284 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 52e309028cb7f..c1fca3b914027 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -94,6 +94,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); +DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -1298,6 +1300,15 @@ static void x86_pmu_enable(struct pmu *pmu) if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; + + /* + * The late setup (after counters are scheduled) + * is required for some cases, e.g., PEBS counters + * snapshotting. Because an accurate counter index + * is needed. + */ + static_call_cond(x86_pmu_late_setup)(); + /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() @@ -2034,6 +2045,8 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); + + static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); } static void _x86_pmu_read(struct perf_event *event) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 85e6b041a242d..18074f85fe85e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2700,7 +2700,7 @@ static void update_saved_topdown_regs(struct perf_event *event, u64 slots, * modify by a NMI. PMU has to be disabled before calling this function. */ -static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, u64 *val) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2708,13 +2708,24 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) bool reset = true; int idx; - /* read Fixed counter 3 */ - rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); - if (!slots) - return 0; + if (!val) { + /* read Fixed counter 3 */ + rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); + if (!slots) + return 0; - /* read PERF_METRICS */ - rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + /* read PERF_METRICS */ + rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + } else { + slots = val[0]; + metrics = val[1]; + /* + * Don't reset the PERF_METRICS and Fixed counter 3 + * for each PEBS record read. Utilize the RDPMC metrics + * clear mode. + */ + reset = false; + } for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) @@ -2757,17 +2768,19 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) return slots; } -static u64 icl_update_topdown_event(struct perf_event *event) +static u64 icl_update_topdown_event(struct perf_event *event, u64 *val) { return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + - x86_pmu.num_topdown_events - 1); + x86_pmu.num_topdown_events - 1, + val); } -DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); static void intel_pmu_read_event(struct perf_event *event) { - if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN)) { + if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN) || + is_pebs_counter_event_group(event)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); bool pmu_enabled = cpuc->enabled; @@ -2779,8 +2792,12 @@ static void intel_pmu_read_event(struct perf_event *event) if (pmu_enabled) intel_pmu_disable_all(); - if (is_topdown_event(event)) - static_call(intel_pmu_update_topdown_event)(event); + /* + * If the PEBS counters snapshotting is enabled, + * the topdown event is available in PEBS records. + */ + if (is_topdown_event(event) && !is_pebs_counter_event_group(event)) + static_call(intel_pmu_update_topdown_event)(event, NULL); else intel_pmu_drain_pebs_buffer(); @@ -2923,7 +2940,7 @@ static int intel_pmu_set_period(struct perf_event *event) static u64 intel_pmu_update(struct perf_event *event) { if (unlikely(is_topdown_count(event))) - return static_call(intel_pmu_update_topdown_event)(event); + return static_call(intel_pmu_update_topdown_event)(event, NULL); return x86_perf_event_update(event); } @@ -3089,7 +3106,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; - static_call(intel_pmu_update_topdown_event)(NULL); + static_call(intel_pmu_update_topdown_event)(NULL, NULL); } /* @@ -3107,6 +3124,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) if (!test_bit(bit, cpuc->active_mask)) continue; + /* + * There may be unprocessed PEBS records in the PEBS buffer, + * which still stores the previous values. + * Process those records first before handling the latest value. + * For example, + * A is a regular counter + * B is a PEBS event which reads A + * C is a PEBS event + * + * The following can happen: + * B-assist A=1 + * C A=2 + * B-assist A=3 + * A-overflow-PMI A=4 + * C-assist-PMI (PEBS buffer) A=5 + * + * The PEBS buffer has to be drained before handling the A-PMI + */ + if (is_pebs_counter_event_group(event)) + x86_pmu.drain_pebs(regs, &data); + if (!intel_pmu_save_and_restart(event)) continue; @@ -4135,6 +4173,13 @@ static int intel_pmu_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT; } + if ((event->attr.sample_type & PERF_SAMPLE_READ) && + (x86_pmu.intel_cap.pebs_format >= 6) && + x86_pmu.intel_cap.pebs_baseline && + is_sampling_event(event) && + event->attr.precise_ip) + event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 15f1e352453b3..56223574b68c5 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1272,6 +1272,19 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) ds->pebs_interrupt_threshold = threshold; } +#define PEBS_DATACFG_CNTRS(x) \ + ((x >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK) + +#define PEBS_DATACFG_CNTR_BIT(x) \ + (((1ULL << x) & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT) + +#define PEBS_DATACFG_FIX(x) \ + ((x >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK) + +#define PEBS_DATACFG_FIX_BIT(x) \ + (((1ULL << (x)) & PEBS_DATACFG_FIX_MASK) \ + << PEBS_DATACFG_FIX_SHIFT) + static void adaptive_pebs_record_size_update(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1286,10 +1299,58 @@ static void adaptive_pebs_record_size_update(void) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); + if (pebs_data_cfg & (PEBS_DATACFG_METRICS | PEBS_DATACFG_CNTR)) { + sz += sizeof(struct pebs_cntr_header); + + /* Metrics base and Metrics Data */ + if (pebs_data_cfg & PEBS_DATACFG_METRICS) + sz += 2 * sizeof(u64); + + if (pebs_data_cfg & PEBS_DATACFG_CNTR) { + sz += (hweight64(PEBS_DATACFG_CNTRS(pebs_data_cfg)) + + hweight64(PEBS_DATACFG_FIX(pebs_data_cfg))) * + sizeof(u64); + } + } cpuc->pebs_record_size = sz; } +static void __intel_pmu_pebs_update_cfg(struct perf_event *event, + int idx, u64 *pebs_data_cfg) +{ + if (is_metric_event(event)) { + *pebs_data_cfg |= PEBS_DATACFG_METRICS; + return; + } + + *pebs_data_cfg |= PEBS_DATACFG_CNTR; + + if (idx >= INTEL_PMC_IDX_FIXED) + *pebs_data_cfg |= PEBS_DATACFG_FIX_BIT(idx - INTEL_PMC_IDX_FIXED); + else + *pebs_data_cfg |= PEBS_DATACFG_CNTR_BIT(idx); +} + + +static void intel_pmu_late_setup(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + u64 pebs_data_cfg = 0; + int i; + + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + if (!is_pebs_counter_event_group(event)) + continue; + __intel_pmu_pebs_update_cfg(event, cpuc->assign[i], &pebs_data_cfg); + } + + if (pebs_data_cfg & ~cpuc->pebs_data_cfg) + cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW; +} + #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_WEIGHT_TYPE | \ @@ -1896,12 +1957,89 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +static void intel_perf_event_update_pmc(struct perf_event *event, u64 pmc) +{ + int shift = 64 - x86_pmu.cntval_bits; + struct hw_perf_event *hwc; + u64 delta, prev_pmc; + + /* + * A recorded counter may not have an assigned event in the + * following cases. The value should be dropped. + * - An event is deleted. There is still an active PEBS event. + * The PEBS record doesn't shrink on pmu::del(). + * If the counter of the deleted event once occurred in a PEBS + * record, PEBS still records the counter until the counter is + * reassigned. + * - An event is stopped for some reason, e.g., throttled. + * During this period, another event is added and takes the + * counter of the stopped event. The stopped event is assigned + * to another new and uninitialized counter, since the + * x86_pmu_start(RELOAD) is not invoked for a stopped event. + * The PEBS__DATA_CFG is updated regardless of the event state. + * The uninitialized counter can be recorded in a PEBS record. + * But the cpuc->events[uninitialized_counter] is always NULL, + * because the event is stopped. The uninitialized value is + * safely dropped. + */ + if (!event) + return; + + hwc = &event->hw; + prev_pmc = local64_read(&hwc->prev_count); + + /* Only update the count when the PMU is disabled */ + WARN_ON(this_cpu_read(cpu_hw_events.enabled)); + local64_set(&hwc->prev_count, pmc); + + delta = (pmc << shift) - (prev_pmc << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); +} + +static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct pebs_cntr_header *cntr, + void *next_record) +{ + int bit; + + for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) { + intel_perf_event_update_pmc(cpuc->events[bit], *(u64 *)next_record); + next_record += sizeof(u64); + } + + for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) { + /* The slots event will be handled with perf_metric later */ + if ((cntr->metrics == INTEL_CNTR_METRICS) && + (bit + INTEL_PMC_IDX_FIXED == INTEL_PMC_IDX_FIXED_SLOTS)) { + next_record += sizeof(u64); + continue; + } + intel_perf_event_update_pmc(cpuc->events[bit + INTEL_PMC_IDX_FIXED], + *(u64 *)next_record); + next_record += sizeof(u64); + } + + /* HW will reload the value right after the overflow. */ + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period); + + if (cntr->metrics == INTEL_CNTR_METRICS) { + static_call(intel_pmu_update_topdown_event) + (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS], + (u64 *)next_record); + next_record += 2 * sizeof(u64); + } +} + #define PEBS_LATENCY_MASK 0xffff /* * With adaptive PEBS the layout depends on what fields are configured. */ - static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -2032,6 +2170,28 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } + if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) { + struct pebs_cntr_header *cntr = next_record; + unsigned int nr; + + next_record += sizeof(struct pebs_cntr_header); + /* + * The PEBS_DATA_CFG is a global register, which is the + * superset configuration for all PEBS events. + * For the PEBS record of non-sample-read group, ignore + * the counter snapshot fields. + */ + if (is_pebs_counter_event_group(event)) { + __setup_pebs_counter_group(cpuc, event, cntr, next_record); + data->sample_flags |= PERF_SAMPLE_READ; + } + + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); + if (cntr->metrics == INTEL_CNTR_METRICS) + nr += 2; + next_record += nr * sizeof(u64); + } + WARN_ONCE(next_record != __pebs + basic->format_size, "PEBS record size %u, expected %llu, config %llx\n", basic->format_size, @@ -2185,13 +2345,21 @@ __intel_pmu_pebs_last_event(struct perf_event *event, } if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - /* - * Now, auto-reload is only enabled in fixed period mode. - * The reload value is always hwc->sample_period. - * May need to change it, if auto-reload is enabled in - * freq mode later. - */ - intel_pmu_save_and_restart_reload(event, count); + if ((is_pebs_counter_event_group(event))) { + /* + * The value of each sample has been updated when setup + * the corresponding sample data. + */ + perf_event_update_userpage(event); + } else { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. + */ + intel_pmu_save_and_restart_reload(event, count); + } } else intel_pmu_save_and_restart(event); } @@ -2526,6 +2694,11 @@ void __init intel_ds_init(void) break; case 6: + if (x86_pmu.intel_cap.pebs_baseline) { + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; + x86_pmu.late_setup = intel_pmu_late_setup; + } + fallthrough; case 5: x86_pmu.pebs_ept = 1; fallthrough; @@ -2550,7 +2723,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); } - pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + pr_cont("PEBS fmt%d%c%s, ", format, pebs_type, pebs_qual); if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) { pr_cont("PEBS-via-PT, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 658d8f0360a5d..c4487ab5eebad 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -115,6 +115,11 @@ static inline bool is_branch_counters_group(struct perf_event *event) return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; } +static inline bool is_pebs_counter_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -786,6 +791,7 @@ struct x86_pmu { u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + void (*late_setup)(void); unsigned eventsel; unsigned perfctr; unsigned fixedctr; @@ -1094,6 +1100,7 @@ extern struct x86_pmu x86_pmu __read_mostly; DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); +DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup); static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { @@ -1135,6 +1142,12 @@ extern u64 __read_mostly hw_cache_extra_regs u64 x86_perf_event_update(struct perf_event *event); +static inline u64 intel_pmu_topdown_event_update(struct perf_event *event, u64 *val) +{ + return x86_perf_event_update(event); +} +DECLARE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); + static inline unsigned int x86_pmu_config_addr(int index) { return x86_pmu.eventsel + (x86_pmu.addr_offset ? diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 6c977c19f2cd7..1d9e385649b57 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -9,7 +9,7 @@ PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ - /* 0x00080 */ +PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8e5ac206ca344..85a60a0a18f29 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -141,6 +141,12 @@ #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) #define PEBS_DATACFG_LBR_SHIFT 24 +#define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_CNTR_SHIFT 32 +#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) +#define PEBS_DATACFG_FIX_SHIFT 48 +#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) +#define PEBS_DATACFG_METRICS BIT_ULL(5) /* Steal the highest bit of pebs_data_cfg for SW usage */ #define PEBS_UPDATE_DS_SW BIT_ULL(63) @@ -482,6 +488,15 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +struct pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + +#define INTEL_CNTR_METRICS 0x3 + /* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ From e717f9a431843bdcce96c81e84ef170869d0a0ba Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 11 Feb 2025 13:30:15 -0800 Subject: [PATCH 28/89] perf vendor events: Add Clearwaterforest events commit e415c1493fa1e93afaec697385b8952d932c41bc upstream. Add events v1.00. Bring in the events from: https://github.com/intel/perfmon/tree/main/CWF/events Intel-SIG: commit e415c1493fa1 perf vendor events: Add Clearwaterforest events Backport CWF PMU support and dependency Co-developed-by: Caleb Biggers Signed-off-by: Caleb Biggers Acked-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Link: https://lore.kernel.org/r/20250211213031.114209-9-irogers@google.com Signed-off-by: Namhyung Kim [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- .../arch/x86/clearwaterforest/cache.json | 144 ++++++++++++++++++ .../arch/x86/clearwaterforest/counter.json | 7 + .../arch/x86/clearwaterforest/frontend.json | 18 +++ .../arch/x86/clearwaterforest/memory.json | 22 +++ .../arch/x86/clearwaterforest/other.json | 22 +++ .../arch/x86/clearwaterforest/pipeline.json | 113 ++++++++++++++ .../x86/clearwaterforest/virtual-memory.json | 29 ++++ tools/perf/pmu-events/arch/x86/mapfile.csv | 1 + 8 files changed, 356 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/cache.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/counter.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/frontend.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/memory.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/other.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/pipeline.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/virtual-memory.json diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/cache.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/cache.json new file mode 100644 index 0000000000000..875361b30f1df --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/cache.json @@ -0,0 +1,144 @@ +[ + { + "BriefDescription": "Counts the number of cacheable memory requests that miss in the LLC. Counts on a per core basis.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x2e", + "EventName": "LONGEST_LAT_CACHE.MISS", + "PublicDescription": "Counts the number of cacheable memory requests that miss in the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the core has access to an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.", + "SampleAfterValue": "1000003", + "UMask": "0x41" + }, + { + "BriefDescription": "Counts the number of cacheable memory requests that access the LLC. Counts on a per core basis.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x2e", + "EventName": "LONGEST_LAT_CACHE.REFERENCE", + "PublicDescription": "Counts the number of cacheable memory requests that access the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the core has access to an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.", + "SampleAfterValue": "1000003", + "UMask": "0x4f" + }, + { + "BriefDescription": "Counts the number of load ops retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.ALL_LOADS", + "SampleAfterValue": "1000003", + "UMask": "0x81" + }, + { + "BriefDescription": "Counts the number of store ops retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.ALL_STORES", + "SampleAfterValue": "1000003", + "UMask": "0x82" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024", + "MSRIndex": "0x3F6", + "MSRValue": "0x400", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128", + "MSRIndex": "0x3F6", + "MSRValue": "0x80", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16", + "MSRIndex": "0x3F6", + "MSRValue": "0x10", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048", + "MSRIndex": "0x3F6", + "MSRValue": "0x800", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256", + "MSRIndex": "0x3F6", + "MSRValue": "0x100", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32", + "MSRIndex": "0x3F6", + "MSRValue": "0x20", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4", + "MSRIndex": "0x3F6", + "MSRValue": "0x4", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512", + "MSRIndex": "0x3F6", + "MSRValue": "0x200", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64", + "MSRIndex": "0x3F6", + "MSRValue": "0x40", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8", + "MSRIndex": "0x3F6", + "MSRValue": "0x8", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY", + "SampleAfterValue": "1000003", + "UMask": "0x6" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/counter.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/counter.json new file mode 100644 index 0000000000000..a0eaf5b6f2bc8 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/counter.json @@ -0,0 +1,7 @@ +[ + { + "Unit": "core", + "CountersNumFixed": "3", + "CountersNumGeneric": "39" + } +] \ No newline at end of file diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/frontend.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/frontend.json new file mode 100644 index 0000000000000..7a9250e5c8f29 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/frontend.json @@ -0,0 +1,18 @@ +[ + { + "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x80", + "EventName": "ICACHE.ACCESSES", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { + "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump and the instruction cache registers bytes are not present. -", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x80", + "EventName": "ICACHE.MISSES", + "SampleAfterValue": "1000003", + "UMask": "0x2" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/memory.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/memory.json new file mode 100644 index 0000000000000..f5007e56f39bb --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/memory.json @@ -0,0 +1,22 @@ +[ + { + "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x33FBFC00001", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.L3_MISS", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x33FBFC00002", + "SampleAfterValue": "100003", + "UMask": "0x1" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/other.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/other.json new file mode 100644 index 0000000000000..80454e497f835 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/other.json @@ -0,0 +1,22 @@ +[ + { + "BriefDescription": "Counts demand data reads that have any type of response.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10001", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10002", + "SampleAfterValue": "100003", + "UMask": "0x1" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/pipeline.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/pipeline.json new file mode 100644 index 0000000000000..6a5faa704b852 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/pipeline.json @@ -0,0 +1,113 @@ +[ + { + "BriefDescription": "Counts the total number of branch instructions retired for all branch types.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.ALL_BRANCHES", + "PublicDescription": "Counts the total number of instructions in which the instruction pointer (IP) of the processor is resteered due to a branch instruction and the branch instruction successfully retires. All branch type instructions are accounted for.", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the total number of mispredicted branch instructions retired for all branch types.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.ALL_BRANCHES", + "PublicDescription": "Counts the total number of mispredicted branch instructions retired. All branch type instructions are accounted for. Prediction of the branch target address enables the processor to begin executing instructions before the non-speculative execution path is known. The branch prediction unit (BPU) predicts the target address based on the instruction pointer (IP) of the branch and on the execution path through which execution reached this IP. A branch misprediction occurs when the prediction is wrong, and results in discarding all instructions executed in the speculative path and re-fetching from the correct path.", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.", + "Counter": "Fixed counter 1", + "EventName": "CPU_CLK_UNHALTED.CORE", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD_P]", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.CORE_P", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles.", + "Counter": "Fixed counter 2", + "EventName": "CPU_CLK_UNHALTED.REF_TSC", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { + "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", + "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses a programmable general purpose performance counter.", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.", + "Counter": "Fixed counter 1", + "EventName": "CPU_CLK_UNHALTED.THREAD", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.CORE_P]", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.THREAD_P", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of instructions retired.", + "Counter": "Fixed counter 0", + "EventName": "INST_RETIRED.ANY", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of instructions retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc0", + "EventName": "INST_RETIRED.ANY_P", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "Counter": "36", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls. [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xa4", + "EventName": "TOPDOWN_BE_BOUND.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls. [This event is alias to TOPDOWN_BE_BOUND.ALL]", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xa4", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of retirement slots not consumed due to front end stalls.", + "Counter": "37", + "EventName": "TOPDOWN_FE_BOUND.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x6" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of consumed retirement slots.", + "Counter": "38", + "EventName": "TOPDOWN_RETIRING.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x7" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/virtual-memory.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/virtual-memory.json new file mode 100644 index 0000000000000..78f2b835c1fa4 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/virtual-memory.json @@ -0,0 +1,29 @@ +[ + { + "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to any page size.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED", + "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.", + "SampleAfterValue": "1000003", + "UMask": "0xe" + }, + { + "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to any page size.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED", + "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.", + "SampleAfterValue": "1000003", + "UMask": "0xe" + }, + { + "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_COMPLETED", + "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.", + "SampleAfterValue": "1000003", + "UMask": "0xe" + } +] diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 33767dd281223..da3cd1763d8e0 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -6,6 +6,7 @@ GenuineIntel-6-(3D|47),v28,broadwell,core GenuineIntel-6-56,v10,broadwellde,core GenuineIntel-6-4F,v21,broadwellx,core GenuineIntel-6-55-[56789ABCDEF],v1.19,cascadelakex,core +GenuineIntel-6-DD,v1.00,clearwaterforest,core GenuineIntel-6-9[6C],v1.04,elkhartlake,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core From 2731067529750a9f03a6670a0b66381854ba49ec Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 10 Mar 2025 11:15:36 -0700 Subject: [PATCH 29/89] perf: Extend per event callchain limit to branch stack commit c53e14f1ea4a8f8ddd9b2cd850fcbc0d934b79f5 upstream. The commit 97c79a38cd45 ("perf core: Per event callchain limit") introduced a per-event term to allow finer tuning of the depth of callchains to save space. It should be applied to the branch stack as well. For example, autoFDO collections require maximum LBR entries. In the meantime, other system-wide LBR users may only be interested in the latest a few number of LBRs. A per-event LBR depth would save the perf output buffer. The patch simply drops the uninterested branches, but HW still collects the maximum branches. There may be a model-specific optimization that can reduce the HW depth for some cases to reduce the overhead further. But it isn't included in the patch set. Because it's not useful for all cases. For example, ARCH LBR can utilize the PEBS and XSAVE to collect LBRs. The depth should have less impact on the collecting overhead. The model-specific optimization may be implemented later separately. Intel-SIG: commit c53e14f1ea4a perf: Extend per event callchain limit to branch stack Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310181536.3645382-1-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- include/linux/perf_event.h | 3 +++ include/uapi/linux/perf_event.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 091b07ea9fe36..4d5433176eda6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1283,6 +1283,9 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, if (branch_sample_hw_index(event)) size += sizeof(u64); + + brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr); + size += brs->nr * sizeof(struct perf_branch_entry); /* diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 4842c36fdf801..008403fe0b0b3 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -385,6 +385,8 @@ enum perf_event_read_format { * * @sample_max_stack: Max number of frame pointers in a callchain, * should be < /proc/sys/kernel/perf_event_max_stack + * Max number of entries of branch stack + * should be < hardware limit */ struct perf_event_attr { From 2eba5115b752221c8fb0caa3c9ea8b62d723f9a5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:13 -0700 Subject: [PATCH 30/89] perf/x86: Add dynamic constraint commit 4dfe3232cc04325a09e96f6c7f9546ba6c0b132b upstream. More and more features require a dynamic event constraint, e.g., branch counter logging, auto counter reload, Arch PEBS, etc. Add a generic flag, PMU_FL_DYN_CONSTRAINT, to indicate the case. It avoids keeping adding the individual flag in intel_cpuc_prepare(). Add a variable dyn_constraint in the struct hw_perf_event to track the dynamic constraint of the event. Apply it if it's updated. Apply the generic dynamic constraint for branch counter logging. Many features on and after V6 require dynamic constraint. So unconditionally set the flag for V6+. Intel-SIG: commit 4dfe3232cc04 perf/x86: Add dynamic constraint Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-2-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 1 + arch/x86/events/intel/core.c | 21 +++++++++++++++------ arch/x86/events/intel/lbr.c | 2 +- arch/x86/events/perf_event.h | 1 + include/linux/perf_event.h | 1 + 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index c1fca3b914027..75e9bb5a264bf 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -675,6 +675,7 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + event->hw.dyn_constraint = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 18074f85fe85e..6c93869b01185 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3716,10 +3716,9 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (cpuc->excl_cntrs) return intel_get_excl_constraints(cpuc, event, idx, c2); - /* Not all counters support the branch counter feature. */ - if (branch_sample_counters(event)) { + if (event->hw.dyn_constraint != ~0ULL) { c2 = dyn_constraint(cpuc, c2, idx); - c2->idxmsk64 &= x86_pmu.lbr_counters; + c2->idxmsk64 &= event->hw.dyn_constraint; c2->weight = hweight64(c2->idxmsk64); } @@ -4117,15 +4116,19 @@ static int intel_pmu_hw_config(struct perf_event *event) leader = event->group_leader; if (branch_sample_call_stack(leader)) return -EINVAL; - if (branch_sample_counters(leader)) + if (branch_sample_counters(leader)) { num++; + leader->hw.dyn_constraint &= x86_pmu.lbr_counters; + } leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS; for_each_sibling_event(sibling, leader) { if (branch_sample_call_stack(sibling)) return -EINVAL; - if (branch_sample_counters(sibling)) + if (branch_sample_counters(sibling)) { num++; + sibling->hw.dyn_constraint &= x86_pmu.lbr_counters; + } } if (num > fls(x86_pmu.lbr_counters)) @@ -4903,7 +4906,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) goto err; } - if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_DYN_CONSTRAINT)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); @@ -6585,6 +6588,12 @@ __init int intel_pmu_init(void) pr_cont(" AnyThread deprecated, "); } + /* + * Many features on and after V6 require dynamic constraint, + * e.g., Arch PEBS, ACR. + */ + if (version >= 6) + x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; /* * Install the hw-cache-events table: */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 78cd5084104e9..5f677447b8128 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1608,7 +1608,7 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_nr = lbr_nr; if (!!x86_pmu.lbr_counters) - x86_pmu.flags |= PMU_FL_BR_CNTR; + x86_pmu.flags |= PMU_FL_BR_CNTR | PMU_FL_DYN_CONSTRAINT; if (x86_pmu.lbr_mispred) static_branch_enable(&x86_lbr_mispred); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index c4487ab5eebad..db637e7620bc9 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1052,6 +1052,7 @@ do { \ #define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ #define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ +#define PMU_FL_DYN_CONSTRAINT 0x800 /* Needs dynamic constraint */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4d5433176eda6..5263f138ad16e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -158,6 +158,7 @@ struct hw_perf_event { struct { /* hardware */ u64 config; u64 last_tag; + u64 dyn_constraint; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; From bd4718d2715c89a8ebb7c2f1a05894de8ef2702c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:14 -0700 Subject: [PATCH 31/89] perf/x86/intel: Track the num of events needs late setup commit 0a6557938d8f189024a03aca77e58763930840ee upstream. When a machine supports PEBS v6, perf unconditionally searches the cpuc->event_list[] for every event and check if the late setup is required, which is unnecessary. The late setup is only required for special events, e.g., events support counters snapshotting feature. Add n_late_setup to track the num of events that needs the late setup. Other features, e.g., auto counter reload feature, require the late setup as well. Add a wrapper, intel_pmu_pebs_late_setup, for the events that support counters snapshotting feature. Intel-SIG: commit 0a6557938d8f perf/x86/intel: Track the num of events needs late setup Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-3-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 14 ++++++++++++++ arch/x86/events/intel/ds.c | 3 +-- arch/x86/events/perf_event.h | 5 +++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 6c93869b01185..c1d25f1563e2e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2589,6 +2589,8 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); + if (is_pebs_counter_event_group(event)) + this_cpu_ptr(&cpu_hw_events)->n_late_setup--; } static int icl_set_topdown_event_period(struct perf_event *event) @@ -2900,12 +2902,24 @@ static void intel_pmu_enable_event(struct perf_event *event) } } +void intel_pmu_late_setup(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!cpuc->n_late_setup) + return; + + intel_pmu_pebs_late_setup(cpuc); +} + static void intel_pmu_add_event(struct perf_event *event) { if (event->attr.precise_ip) intel_pmu_pebs_add(event); if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); + if (is_pebs_counter_event_group(event)) + this_cpu_ptr(&cpu_hw_events)->n_late_setup++; } /* diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 56223574b68c5..cd8634165bdee 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1333,9 +1333,8 @@ static void __intel_pmu_pebs_update_cfg(struct perf_event *event, } -static void intel_pmu_late_setup(void) +void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; u64 pebs_data_cfg = 0; int i; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index db637e7620bc9..0e6cc1e9910dc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -261,6 +261,7 @@ struct cpu_hw_events { struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; int n_excl; /* the number of exclusive events */ + int n_late_setup; /* the num of events needs late setup */ unsigned int txn_flags; int is_fake; @@ -1575,6 +1576,8 @@ void intel_pmu_disable_bts(void); int intel_pmu_drain_bts_buffer(void); +void intel_pmu_late_setup(void); + u64 grt_latency_data(struct perf_event *event, u64 status); u64 cmt_latency_data(struct perf_event *event, u64 status); @@ -1629,6 +1632,8 @@ void intel_pmu_pebs_disable_all(void); void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); +void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc); + void intel_pmu_drain_pebs_buffer(void); void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); From 4d44536583b1955c285ceb3cb22cfdda9347caf6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:15 -0700 Subject: [PATCH 32/89] perf: Extend the bit width of the arch-specific flag commit c9449c8506a5df5052ef4d17867699517b10b55a upstream. The auto counter reload feature requires an event flag to indicate an auto counter reload group, which can only be scheduled on specific counters that enumerated in CPUID. However, the hw_perf_event.flags has run out on X86. Two solutions were considered to address the issue. - Currently, 20 bits are reserved for the architecture-specific flags. Only the bit 31 is used for the generic flag. There is still plenty of space left. Reserve 8 more bits for the arch-specific flags. - Add a new X86 specific hw_perf_event.flags1 to support more flags. The former is implemented. Enough room is still left in the global generic flag. Intel-SIG: commit c9449c8506a5 perf: Extend the bit width of the arch-specific flag Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-4-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/perf_event_flags.h | 41 +++++++++++++++--------------- include/linux/perf_event.h | 2 +- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 1d9e385649b57..70078334e4a33 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -2,23 +2,24 @@ /* * struct hw_perf_event.flags flags */ -PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ -PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ -PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ -PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ -PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ -PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ -PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ -PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ -PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ -PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ -PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ -PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ -PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ -PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ -PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ -PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ -PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ -PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ -PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ -PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(PEBS_LDLAT, 0x0000001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x0000002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x0000004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x0000008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x0000010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x0000020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x0000040) /* dynamic alloc'd constraint */ +PERF_ARCH(PEBS_CNTR, 0x0000080) /* PEBS counters snapshot */ +PERF_ARCH(EXCL_ACCT, 0x0000100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x0000200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x0000400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x0000800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x0001000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x0002000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x0004000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x0008000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x0010000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x0020000) /* ld and st lat for hybrid */ +PERF_ARCH(NEEDS_BRANCH_STACK, 0x0040000) /* require branch stack setup */ +PERF_ARCH(BRANCH_COUNTERS, 0x0080000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(ACR, 0x0100000) /* Auto counter reload */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5263f138ad16e..b6636a74085c5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -144,7 +144,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x000fffff +#define PERF_EVENT_FLAG_ARCH 0x0fffffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); From f597246d8738f1476088b3a38f3430f8c6ac47f6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:16 -0700 Subject: [PATCH 33/89] perf/x86/intel: Add CPUID enumeration for the auto counter reload commit 1856c6c2f8416b1340652cccfa1fc302ac8d5ecd upstream. The counters that support the auto counter reload feature can be enumerated in the CPUID Leaf 0x23 sub-leaf 0x2. Add acr_cntr_mask to store the mask of counters which are reloadable. Add acr_cause_mask to store the mask of counters which can cause reload. Since the e-core and p-core may have different numbers of counters, track the masks in the struct x86_hybrid_pmu as well. Intel-SIG: commit 1856c6c2f841 perf/x86/intel: Add CPUID enumeration for the auto counter reload Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-5-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 10 ++++++++++ arch/x86/events/perf_event.h | 17 +++++++++++++++++ arch/x86/include/asm/perf_event.h | 1 + 3 files changed, 28 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c1d25f1563e2e..b20d010eebb61 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5029,6 +5029,16 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) pmu->fixed_cntr_mask64 = fixed_cntr; } + if (eax.split.acr_subleaf) { + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, + &cntr, &fixed_cntr, &ecx, &edx); + /* The mask of the counters which can be reloaded */ + pmu->acr_cntr_mask64 = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); + + /* The mask of the counters which can cause a reload of reloadable counters */ + pmu->acr_cause_mask64 = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); + } + if (!intel_pmu_broken_perf_cap()) { /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 0e6cc1e9910dc..856374d4f9812 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -711,6 +711,15 @@ struct x86_hybrid_pmu { u64 fixed_cntr_mask64; unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; + + union { + u64 acr_cntr_mask64; + unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 acr_cause_mask64; + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; struct event_constraint unconstrained; u64 hw_cache_event_ids @@ -809,6 +818,14 @@ struct x86_pmu { u64 fixed_cntr_mask64; unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; + union { + u64 acr_cntr_mask64; + unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 acr_cause_mask64; + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; int cntval_bits; u64 cntval_mask; union { diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 85a60a0a18f29..2a7d0742403bd 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -195,6 +195,7 @@ union cpuid10_edx { */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 +#define ARCH_PERFMON_ACR_LEAF 0x2 union cpuid35_eax { struct { From 7126c1007172053cd5c780dba90fdf87be86bebf Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:17 -0700 Subject: [PATCH 34/89] perf/x86/intel: Support auto counter reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ec980e4facef8110f6fce27e5b6344660117f01f upstream. The relative rates among two or more events are useful for performance analysis, e.g., a high branch miss rate may indicate a performance issue. Usually, the samples with a relative rate that exceeds some threshold are more useful. However, the traditional sampling takes samples of events separately. To get the relative rates among two or more events, a high sample rate is required, which can bring high overhead. Many samples taken in the non-hotspot area are also dropped (useless) in the post-process. The auto counter reload (ACR) feature takes samples when the relative rate of two or more events exceeds some threshold, which provides the fine-grained information at a low cost. To support the feature, two sets of MSRs are introduced. For a given counter IA32_PMC_GPn_CTR/IA32_PMC_FXm_CTR, bit fields in the IA32_PMC_GPn_CFG_B/IA32_PMC_FXm_CFG_B MSR indicate which counter(s) can cause a reload of that counter. The reload value is stored in the IA32_PMC_GPn_CFG_C/IA32_PMC_FXm_CFG_C. The details can be found at Intel SDM (085), Volume 3, 21.9.11 Auto Counter Reload. In the hw_config(), an ACR event is specially configured, because the cause/reloadable counter mask has to be applied to the dyn_constraint. Besides the HW limit, e.g., not support perf metrics, PDist and etc, a SW limit is applied as well. ACR events in a group must be contiguous. It facilitates the later conversion from the event idx to the counter idx. Otherwise, the intel_pmu_acr_late_setup() has to traverse the whole event list again to find the "cause" event. Also, add a new flag PERF_X86_EVENT_ACR to indicate an ACR group, which is set to the group leader. The late setup() is also required for an ACR group. It's to convert the event idx to the counter idx, and saved it in hw.config1. The ACR configuration MSRs are only updated in the enable_event(). The disable_event() doesn't clear the ACR CFG register. Add acr_cfg_b/acr_cfg_c in the struct cpu_hw_events to cache the MSR values. It can avoid a MSR write if the value is not changed. Expose an acr_mask to the sysfs. The perf tool can utilize the new format to configure the relation of events in the group. The bit sequence of the acr_mask follows the events enabled order of the group. Example: Here is the snippet of the mispredict.c. Since the array has a random numbers, jumps are random and often mispredicted. The mispredicted rate depends on the compared value. For the Loop1, ~11% of all branches are mispredicted. For the Loop2, ~21% of all branches are mispredicted. main() { ... for (i = 0; i < N; i++) data[i] = rand() % 256; ... /* Loop 1 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 64) sum += data[i]; ... ... /* Loop 2 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 128) sum += data[i]; ... } Usually, a code with a high branch miss rate means a bad performance. To understand the branch miss rate of the codes, the traditional method usually samples both branches and branch-misses events. E.g., perf record -e "{cpu_atom/branch-misses/ppu, cpu_atom/branch-instructions/u}" -c 1000000 -- ./mispredict [ perf record: Woken up 4 times to write data ] [ perf record: Captured and wrote 0.925 MB perf.data (5106 samples) ] The 5106 samples are from both events and spread in both Loops. In the post-process stage, a user can know that the Loop 2 has a 21% branch miss rate. Then they can focus on the samples of branch-misses events for the Loop 2. With this patch, the user can generate the samples only when the branch miss rate > 20%. For example, perf record -e "{cpu_atom/branch-misses,period=200000,acr_mask=0x2/ppu, cpu_atom/branch-instructions,period=1000000,acr_mask=0x3/u}" -- ./mispredict (Two different periods are applied to branch-misses and branch-instructions. The ratio is set to 20%. If the branch-instructions is overflowed first, the branch-miss rate < 20%. No samples should be generated. All counters should be automatically reloaded. If the branch-misses is overflowed first, the branch-miss rate > 20%. A sample triggered by the branch-misses event should be generated. Just the counter of the branch-instructions should be automatically reloaded. The branch-misses event should only be automatically reloaded when the branch-instructions is overflowed. So the "cause" event is the branch-instructions event. The acr_mask is set to 0x2, since the event index in the group of branch-instructions is 1. The branch-instructions event is automatically reloaded no matter which events are overflowed. So the "cause" events are the branch-misses and the branch-instructions event. The acr_mask should be set to 0x3.) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.098 MB perf.data (2498 samples) ] $perf report Percent │154: movl $0x0,-0x14(%rbp) │ ↓ jmp 1af │ for (i = j; i < N; i++) │15d: mov -0x10(%rbp),%eax │ mov %eax,-0x18(%rbp) │ ↓ jmp 1a2 │ if (data[i] >= 128) │165: mov -0x18(%rbp),%eax │ cltq │ lea 0x0(,%rax,4),%rdx │ mov -0x8(%rbp),%rax │ add %rdx,%rax │ mov (%rax),%eax │ ┌──cmp $0x7f,%eax 100.00 0.00 │ ├──jle 19e │ │sum += data[i]; The 2498 samples are all from the branch-misses events for the Loop 2. The number of samples and overhead is significantly reduced without losing any information. Intel-SIG: commit ec980e4facef perf/x86/intel: Support auto counter reload Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-6-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 2 +- arch/x86/events/intel/core.c | 226 ++++++++++++++++++++++++++++++- arch/x86/events/perf_event.h | 10 ++ arch/x86/include/asm/msr-index.h | 4 + include/linux/perf_event.h | 1 + 5 files changed, 240 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 75e9bb5a264bf..444b8d07121ad 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -756,7 +756,7 @@ void x86_pmu_enable_all(int added) } } -static inline int is_x86_event(struct perf_event *event) +int is_x86_event(struct perf_event *event) { int i; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index b20d010eebb61..5a79e8ee35b38 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2589,7 +2589,8 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); - if (is_pebs_counter_event_group(event)) + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) this_cpu_ptr(&cpu_hw_events)->n_late_setup--; } @@ -2868,6 +2869,52 @@ static void intel_pmu_enable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val |= bits; } +static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int msr_b, msr_c; + + if (!mask && !cpuc->acr_cfg_b[idx]) + return; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; + msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + } else { + msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; + msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; + idx -= INTEL_PMC_IDX_FIXED; + } + + if (cpuc->acr_cfg_b[idx] != mask) { + wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + cpuc->acr_cfg_b[idx] = mask; + } + /* Only need to update the reload value when there is a valid config value. */ + if (mask && cpuc->acr_cfg_c[idx] != reload) { + wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + cpuc->acr_cfg_c[idx] = reload; + } +} + +static void intel_pmu_enable_acr(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_acr_event_group(event) || !event->attr.config2) { + /* + * The disable doesn't clear the ACR CFG register. + * Check and clear the ACR CFG register. + */ + intel_pmu_config_acr(hwc->idx, 0, 0); + return; + } + + intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2882,9 +2929,12 @@ static void intel_pmu_enable_event(struct perf_event *event) if (branch_sample_counters(event)) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); + static_call_cond(intel_pmu_enable_acr_event)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + static_call_cond(intel_pmu_enable_acr_event)(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); break; @@ -2902,6 +2952,31 @@ static void intel_pmu_enable_event(struct perf_event *event) } } +static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) +{ + struct perf_event *event, *leader; + int i, j, idx; + + for (i = 0; i < cpuc->n_events; i++) { + leader = cpuc->event_list[i]; + if (!is_acr_event_group(leader)) + continue; + + /* The ACR events must be contiguous. */ + for (j = i; j < cpuc->n_events; j++) { + event = cpuc->event_list[j]; + if (event->group_leader != leader->group_leader) + break; + for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { + if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + return; + __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); + } + } + i = j - 1; + } +} + void intel_pmu_late_setup(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2910,6 +2985,7 @@ void intel_pmu_late_setup(void) return; intel_pmu_pebs_late_setup(cpuc); + intel_pmu_acr_late_setup(cpuc); } static void intel_pmu_add_event(struct perf_event *event) @@ -2918,7 +2994,8 @@ static void intel_pmu_add_event(struct perf_event *event) intel_pmu_pebs_add(event); if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); - if (is_pebs_counter_event_group(event)) + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) this_cpu_ptr(&cpu_hw_events)->n_late_setup++; } @@ -4073,6 +4150,39 @@ static u64 intel_pmu_freq_start_period(struct perf_event *event) return start; } +static inline bool intel_pmu_has_acr(struct pmu *pmu) +{ + return !!hybrid(pmu, acr_cause_mask64); +} + +static bool intel_pmu_is_acr_group(struct perf_event *event) +{ + /* The group leader has the ACR flag set */ + if (is_acr_event_group(event)) + return true; + + /* The acr_mask is set */ + if (event->attr.config2) + return true; + + return false; +} + +static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, + u64 *cause_mask, int *num) +{ + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64); + *cause_mask |= event->attr.config2; + *num += 1; +} + +static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event, + int idx, u64 cause_mask) +{ + if (test_bit(idx, (unsigned long *)&cause_mask)) + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cause_mask64); +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -4197,6 +4307,94 @@ static int intel_pmu_hw_config(struct perf_event *event) event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if (intel_pmu_has_acr(event->pmu) && intel_pmu_is_acr_group(event)) { + struct perf_event *sibling, *leader = event->group_leader; + struct pmu *pmu = event->pmu; + bool has_sw_event = false; + int num = 0, idx = 0; + u64 cause_mask = 0; + + /* Not support perf metrics */ + if (is_metric_event(event)) + return -EINVAL; + + /* Not support freq mode */ + if (event->attr.freq) + return -EINVAL; + + /* PDist is not supported */ + if (event->attr.config2 && event->attr.precise_ip > 2) + return -EINVAL; + + /* The reload value cannot exceeds the max period */ + if (event->attr.sample_period > x86_pmu.max_period) + return -EINVAL; + /* + * The counter-constraints of each event cannot be finalized + * unless the whole group is scanned. However, it's hard + * to know whether the event is the last one of the group. + * Recalculate the counter-constraints for each event when + * adding a new event. + * + * The group is traversed twice, which may be optimized later. + * In the first round, + * - Find all events which do reload when other events + * overflow and set the corresponding counter-constraints + * - Add all events, which can cause other events reload, + * in the cause_mask + * - Error out if the number of events exceeds the HW limit + * - The ACR events must be contiguous. + * Error out if there are non-X86 events between ACR events. + * This is not a HW limit, but a SW limit. + * With the assumption, the intel_pmu_acr_late_setup() can + * easily convert the event idx to counter idx without + * traversing the whole event list. + */ + if (!is_x86_event(leader)) + return -EINVAL; + + if (leader->attr.config2) + intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) { + if (!is_x86_event(sibling)) { + has_sw_event = true; + continue; + } + if (!sibling->attr.config2) + continue; + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num); + } + } + if (leader != event && event->attr.config2) { + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num); + } + + if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) || + num > hweight64(hybrid(event->pmu, acr_cntr_mask64))) + return -EINVAL; + /* + * In the second round, apply the counter-constraints for + * the events which can cause other events reload. + */ + intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) + intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask); + } + + if (leader != event) + intel_pmu_set_acr_caused_constr(event, idx, cause_mask); + + leader->hw.flags |= PERF_X86_EVENT_ACR; + } + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; @@ -6043,6 +6241,21 @@ td_is_visible(struct kobject *kobj, struct attribute *attr, int i) return attr->mode; } +PMU_FORMAT_ATTR(acr_mask, "config2:0-63"); + +static struct attribute *format_acr_attrs[] = { + &format_attr_acr_mask.attr, + NULL +}; + +static umode_t +acr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + + return intel_pmu_has_acr(dev_get_drvdata(dev)) ? attr->mode : 0; +} + static struct attribute_group group_events_td = { .name = "events", .is_visible = td_is_visible, @@ -6085,6 +6298,12 @@ static struct attribute_group group_format_evtsel_ext = { .is_visible = evtsel_ext_is_visible, }; +static struct attribute_group group_format_acr = { + .name = "format", + .attrs = format_acr_attrs, + .is_visible = acr_is_visible, +}; + static struct attribute_group group_default = { .attrs = intel_pmu_attrs, .is_visible = default_is_visible, @@ -6099,6 +6318,7 @@ static const struct attribute_group *attr_update[] = { &group_format_extra, &group_format_extra_skl, &group_format_evtsel_ext, + &group_format_acr, &group_default, NULL, }; @@ -6337,6 +6557,7 @@ static const struct attribute_group *hybrid_attr_update[] = { &group_caps_lbr, &hybrid_group_format_extra, &group_format_evtsel_ext, + &group_format_acr, &group_default, &hybrid_group_cpus, NULL, @@ -6520,6 +6741,7 @@ static __always_inline void intel_pmu_init_skt(struct pmu *pmu) intel_pmu_init_grt(pmu); hybrid(pmu, event_constraints) = intel_skt_event_constraints; hybrid(pmu, extra_regs) = intel_cmt_extra_regs; + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); } __init int intel_pmu_init(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 856374d4f9812..46467b932824e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -120,6 +120,11 @@ static inline bool is_pebs_counter_event_group(struct perf_event *event) return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; } +static inline bool is_acr_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_ACR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -287,6 +292,10 @@ struct cpu_hw_events { u64 fixed_ctrl_val; u64 active_fixed_ctrl_val; + /* Intel ACR configuration */ + u64 acr_cfg_b[X86_PMC_IDX_MAX]; + u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* * Intel LBR bits */ @@ -1113,6 +1122,7 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } +int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c1e6214372ce7..f78b1b7d8c1ee 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -570,7 +570,11 @@ /* V6 PMON MSR range */ #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 #define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 #define MSR_IA32_PMC_V6_STEP 4 /* KeyID partitioning between MKTME and TDX */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b6636a74085c5..1818251a2105d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -157,6 +157,7 @@ struct hw_perf_event { union { struct { /* hardware */ u64 config; + u64 config1; u64 last_tag; u64 dyn_constraint; unsigned long config_base; From d5fa424cdd1793cffc7b5a26d12e880c3d743c4d Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 15 Apr 2025 10:41:34 +0000 Subject: [PATCH 35/89] perf/x86/intel: Don't clear perf metrics overflow bit unconditionally commit a5f5e1238f4ff919816f69e77d2537a48911767b upstream. The below code would always unconditionally clear other status bits like perf metrics overflow bit once PEBS buffer overflows: status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; This is incorrect. Perf metrics overflow bit should be cleared only when fixed counter 3 in PEBS counter group. Otherwise perf metrics overflow could be missed to handle. Intel-SIG: commit a5f5e1238f4f perf/x86/intel: Don't clear perf metrics overflow bit unconditionally Backport CWF PMU support and dependency Closes: https://lore.kernel.org/all/20250225110012.GK31462@noisy.programming.kicks-ass.net/ Fixes: 7b2c05a15d29 ("perf/x86/intel: Generic support for hardware TopDown metrics") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250415104135.318169-1-dapeng1.mi@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5a79e8ee35b38..62398db62ebbc 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3126,7 +3126,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int bit; int handled = 0; - u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); inc_irq_stat(apic_perf_irqs); @@ -3170,7 +3169,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) handled++; x86_pmu_handle_guest_pebs(regs, &data); static_call(x86_pmu_drain_pebs)(regs, &data); - status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; /* * PMI throttle may be triggered, which stops the PEBS event. @@ -3181,6 +3179,15 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (pebs_enabled != cpuc->pebs_enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + + /* + * Above PEBS handler (PEBS counters snapshotting) has updated fixed + * counter 3 and perf metrics counts if they are in counter group, + * unnecessary to update again. + */ + if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] && + is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS])) + status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT; } /* @@ -3200,6 +3207,8 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) static_call(intel_pmu_update_topdown_event)(NULL, NULL); } + status &= hybrid(cpuc->pmu, intel_ctrl); + /* * Checkpointed counters can lead to 'spurious' PMIs because the * rollback caused by the PMI will have cleared the overflow status From c63693cfc400411b9f9cc40be23ef94cde4713b9 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 15 Apr 2025 11:44:08 +0000 Subject: [PATCH 36/89] perf/x86/intel: Add PMU support for Clearwater Forest commit 48d66c89dce1e3687174608a5f5c31d5961a9916 upstream. From the PMU's perspective, Clearwater Forest is similar to the previous generation Sierra Forest. The key differences are the ARCH PEBS feature and the new added 3 fixed counters for topdown L1 metrics events. The ARCH PEBS is supported in the following patches. This patch provides support for basic perfmon features and 3 new added fixed counters. Intel-SIG: commit 48d66c89dce1 perf/x86/intel: Add PMU support for Clearwater Forest Backport CWF PMU support and dependency Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20250415114428.341182-3-dapeng1.mi@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 62398db62ebbc..f4a0830a17925 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2210,6 +2210,18 @@ static struct extra_reg intel_cmt_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_skt, "event=0x9c,umask=0x01"); +EVENT_ATTR_STR(topdown-retiring, td_retiring_skt, "event=0xc2,umask=0x02"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound_skt, "event=0xa4,umask=0x02"); + +static struct attribute *skt_events_attrs[] = { + EVENT_PTR(td_fe_bound_skt), + EVENT_PTR(td_retiring_skt), + EVENT_PTR(td_bad_spec_cmt), + EVENT_PTR(td_be_bound_skt), + NULL, +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -7060,6 +7072,18 @@ __init int intel_pmu_init(void) name = "crestmont"; break; + case INTEL_FAM6_ATOM_DARKMONT_X: + intel_pmu_init_skt(NULL); + intel_pmu_pebs_data_source_cmt(); + x86_pmu.pebs_latency_data = cmt_latency_data; + x86_pmu.get_event_constraints = cmt_get_event_constraints; + td_attr = skt_events_attrs; + mem_attr = grt_mem_attrs; + extra_attr = cmt_format_attr; + pr_cont("Darkmont events, "); + name = "darkmont"; + break; + case INTEL_FAM6_WESTMERE: case INTEL_FAM6_WESTMERE_EP: case INTEL_FAM6_WESTMERE_EX: From 875ff5958425fcea88c70ffe78698a7817f9a1ba Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 15 Apr 2025 11:44:09 +0000 Subject: [PATCH 37/89] perf/x86/intel: Parse CPUID archPerfmonExt leaves for non-hybrid CPUs commit 25c623f41438fafc6f63c45e2e141d7bcff78299 upstream. CPUID archPerfmonExt (0x23) leaves are supported to enumerate CPU level's PMU capabilities on non-hybrid processors as well. This patch supports to parse archPerfmonExt leaves on non-hybrid processors. Architectural PEBS leverages archPerfmonExt sub-leaves 0x4 and 0x5 to enumerate the PEBS capabilities as well. This patch is a precursor of the subsequent arch-PEBS enabling patches. Intel-SIG: commit 25c623f41438 perf/x86/intel: Parse CPUID archPerfmonExt leaves for non-hybrid CPUs Backport CWF PMU support and dependency Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20250415114428.341182-4-dapeng1.mi@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f4a0830a17925..498143b312fa2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5228,7 +5228,7 @@ static inline bool intel_pmu_broken_perf_cap(void) return false; } -static void update_pmu_cap(struct x86_hybrid_pmu *pmu) +static void update_pmu_cap(struct pmu *pmu) { unsigned int cntr, fixed_cntr, ecx, edx; union cpuid35_eax eax; @@ -5237,30 +5237,30 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); if (ebx.split.umask2) - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; if (ebx.split.eq) - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, &cntr, &fixed_cntr, &ecx, &edx); - pmu->cntr_mask64 = cntr; - pmu->fixed_cntr_mask64 = fixed_cntr; + hybrid(pmu, cntr_mask64) = cntr; + hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; } if (eax.split.acr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, &cntr, &fixed_cntr, &ecx, &edx); /* The mask of the counters which can be reloaded */ - pmu->acr_cntr_mask64 = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); + hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); /* The mask of the counters which can cause a reload of reloadable counters */ - pmu->acr_cause_mask64 = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); + hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); } if (!intel_pmu_broken_perf_cap()) { /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ - rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities); } } @@ -5342,7 +5342,7 @@ static bool init_hybrid_pmu(int cpu) goto end; if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) - update_pmu_cap(pmu); + update_pmu_cap(&pmu->pmu); intel_pmu_check_hybrid_pmus(pmu); @@ -6817,6 +6817,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(x86_pmu.cntr_mask64); x86_pmu.pebs_capable = PEBS_COUNTER_MASK; + x86_pmu.config_mask = X86_RAW_EVENT_MASK; /* * Quirk: v2 perfmon does not report fixed-purpose events, so @@ -7594,6 +7595,18 @@ __init int intel_pmu_init(void) x86_pmu.attr_update = hybrid_attr_update; } + /* + * The archPerfmonExt (0x23) includes an enhanced enumeration of + * PMU architectural features with a per-core view. For non-hybrid, + * each core has the same PMU capabilities. It's good enough to + * update the x86_pmu from the booting CPU. For hybrid, the x86_pmu + * is used to keep the common capabilities. Still keep the values + * from the leaf 0xa. The core specific update will be done later + * when a new type is online. + */ + if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) + update_pmu_cap(NULL); + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, &x86_pmu.fixed_cntr_mask64, &x86_pmu.intel_ctrl); From c82f0bcd91522996476348608516c2cc2142bbfb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:14 -0700 Subject: [PATCH 38/89] perf/x86/intel: Only check the group flag for X86 leader commit 75aea4b0656ead0facd13d2aae4cb77326e53d2f upstream. A warning in intel_pmu_lbr_counters_reorder() may be triggered by below perf command. perf record -e "{cpu-clock,cycles/call-graph="lbr"/}" -- sleep 1 It's because the group is mistakenly treated as a branch counter group. The hw.flags of the leader are used to determine whether a group is a branch counters group. However, the hw.flags is only available for a hardware event. The field to store the flags is a union type. For a software event, it's a hrtimer. The corresponding bit may be set if the leader is a software event. For a branch counter group and other groups that have a group flag (e.g., topdown, PEBS counters snapshotting, and ACR), the leader must be a X86 event. Check the X86 event before checking the flag. The patch only fixes the issue for the branch counter group. The following patch will fix the other groups. There may be an alternative way to fix the issue by moving the hw.flags out of the union type. It should work for now. But it's still possible that the flags will be used by other types of events later. As long as that type of event is used as a leader, a similar issue will be triggered. So the alternative way is dropped. Intel-SIG: commit 75aea4b0656e perf/x86/intel: Only check the group flag for X86 leader Backport CWF PMU support and dependency Fixes: 33744916196b ("perf/x86/intel: Support branch counters logging") Closes: https://lore.kernel.org/lkml/20250412091423.1839809-1-luogengkun@huaweicloud.com/ Reported-by: Luo Gengkun Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250424134718.311934-2-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi [jz: the prototype of is_x86_event() already changed in upstream commit ec980e4facef ("perf/x86/intel: Support auto counter reload") which is already merged] Signed-off-by: Jason Zeng --- arch/x86/events/perf_event.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 46467b932824e..2bacdd34b6717 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -110,9 +110,16 @@ static inline bool is_topdown_event(struct perf_event *event) return is_metric_event(event) || is_slots_event(event); } +int is_x86_event(struct perf_event *event); + +static inline bool check_leader_group(struct perf_event *leader, int flags) +{ + return is_x86_event(leader) ? !!(leader->hw.flags & flags) : false; +} + static inline bool is_branch_counters_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; + return check_leader_group(event->group_leader, PERF_X86_EVENT_BRANCH_COUNTERS); } static inline bool is_pebs_counter_event_group(struct perf_event *event) From 2499d55c4f14e0c3fed53ffa3053e20bbead1225 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:15 -0700 Subject: [PATCH 39/89] perf/x86/intel: Check the X86 leader for pebs_counter_event_group commit e9988ad7b1744991118ac348a804f9395368a284 upstream. The PEBS counters snapshotting group also requires a group flag in the leader. The leader must be a X86 event. Intel-SIG: commit e9988ad7b174 perf/x86/intel: Check the X86 leader for pebs_counter_event_group Backport CWF PMU support and dependency Fixes: e02e9b0374c3 ("perf/x86/intel: Support PEBS counters snapshotting") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-3-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2bacdd34b6717..037aed44c412e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -124,7 +124,7 @@ static inline bool is_branch_counters_group(struct perf_event *event) static inline bool is_pebs_counter_event_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; + return check_leader_group(event->group_leader, PERF_X86_EVENT_PEBS_CNTR); } static inline bool is_acr_event_group(struct perf_event *event) From 9bf3958597209d9f371e0b860c84e8040a60b8da Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:16 -0700 Subject: [PATCH 40/89] perf/x86/intel: Check the X86 leader for ACR group commit efd448540e6243dbdaf0a7e1bcf49734e73f3c93 upstream. The auto counter reload group also requires a group flag in the leader. The leader must be a X86 event. Intel-SIG: commit efd448540e62 perf/x86/intel: Check the X86 leader for ACR group Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-4-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 037aed44c412e..e5237c126e4bc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -129,7 +129,7 @@ static inline bool is_pebs_counter_event_group(struct perf_event *event) static inline bool is_acr_event_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_ACR; + return check_leader_group(event->group_leader, PERF_X86_EVENT_ACR); } struct amd_nb { From 16077a3c950100238c220cb467d331b5e808229f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:17 -0700 Subject: [PATCH 41/89] perf/x86: Optimize the is_x86_event commit 3e830f657f69ab6a4822d72ec2f364c6d51beef8 upstream. The current is_x86_event has to go through the hybrid_pmus list to find the matched pmu, then check if it's a X86 PMU and a X86 event. It's not necessary. The X86 PMU has a unique type ID on a non-hybrid machine, and a unique capability type. They are good enough to do the check. Intel-SIG: commit 3e830f657f69 perf/x86: Optimize the is_x86_event Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-5-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 444b8d07121ad..84f8c0bc15a09 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -758,15 +758,16 @@ void x86_pmu_enable_all(int added) int is_x86_event(struct perf_event *event) { - int i; - - if (!is_hybrid()) - return event->pmu == &pmu; - - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) - return true; - } + /* + * For a non-hybrid platforms, the type of X86 pmu is + * always PERF_TYPE_RAW. + * For a hybrid platform, the PERF_PMU_CAP_EXTENDED_HW_TYPE + * is a unique capability for the X86 PMU. + * Use them to detect a X86 event. + */ + if (event->pmu->type == PERF_TYPE_RAW || + event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE) + return true; return false; } From 9f600c299f12d8b7bfd2eb6d12e80451fc20d44e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 42/89] tools/include: Sync uapi/linux/perf.h with the kernel sources commit 8ec9497d3ef34fab216e277eca5035811f06b421 upstream. To pick up changes from: 608f6976c309 perf/x86/intel: Support new data source for Lunar Lake This should be used to beautify perf syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Intel-SIG: commit 8ec9497d3ef3 tools/include: Sync uapi/linux/perf.h with the kernel sources Backport CWF PMU support and dependency Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Ian Rogers Cc: Adrian Hunter Cc: "Liang, Kan" Cc: linux-perf-users@vger.kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Jason Zeng --- tools/include/uapi/linux/perf_event.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 3a64499b0f5d6..4842c36fdf801 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1349,12 +1349,14 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ +/* 0x7 available */ #define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ #define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ #define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ From 38fc52ebbf7ba446bf5d79c988af9e86db9bd67f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Apr 2025 17:11:18 -0700 Subject: [PATCH 43/89] tools headers: Update the uapi/linux/perf_event.h copy with the kernel sources commit ae62977331fcbf5c9a4260c88d9f94450db2d99a upstream. To pick up the changes in: c53e14f1ea4a8f8d perf: Extend per event callchain limit to branch stack Addressing this perf tools build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Please see tools/include/uapi/README for further details. Intel-SIG: commit ae62977331fc tools headers: Update the uapi/linux/perf_event.h copy with the kernel sources Backport CWF PMU support and dependency Acked-by: Ingo Molnar Tested-by: Venkat Rao Bagalkote Link: https://lore.kernel.org/r/20250410001125.391820-4-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Jason Zeng --- tools/include/uapi/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 4842c36fdf801..008403fe0b0b3 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -385,6 +385,8 @@ enum perf_event_read_format { * * @sample_max_stack: Max number of frame pointers in a callchain, * should be < /proc/sys/kernel/perf_event_max_stack + * Max number of entries of branch stack + * should be < hardware limit */ struct perf_event_attr { From 20ef49253218eeeb32914e1bc1802df7623e09b0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 22 May 2025 09:51:22 +0200 Subject: [PATCH 44/89] perf/uapi: Clean up a bit commit 44889ff67cee7b9ee2d305690ce7a5488b137a66 upstream. When applying a recent commit to the header I noticed that we have accumulated quite a bit of historic noise in this header, so do a bit of spring cleaning: - Define bitfields in a vertically aligned fashion, like perf_event_mmap_page::capabilities already does. This makes it easier to see the distribution and sizing of bits within a word, at a glance. The following is much more readable: __u64 cap_bit0 : 1, cap_bit0_is_deprecated : 1, cap_user_rdpmc : 1, cap_user_time : 1, cap_user_time_zero : 1, cap_user_time_short : 1, cap_____res : 58; Than: __u64 cap_bit0:1, cap_bit0_is_deprecated:1, cap_user_rdpmc:1, cap_user_time:1, cap_user_time_zero:1, cap_user_time_short:1, cap_____res:58; So convert all bitfield definitions from the latter style to the former style. - Fix typos and grammar - Fix capitalization - Remove whitespace noise - Harmonize the definitions of various generations and groups of PERF_MEM_ ABI values. - Vertically align all definitions and assignments to the same column (48), as the first definition (enum perf_type_id), throughout the entire header. - And in general make the code and comments to be more in sync with each other and to be more readable overall. No change in functionality. Copy the changes over to tools/include/uapi/linux/perf_event.h. Intel-SIG: commit 44889ff67cee perf/uapi: Clean up a bit Backport CWF PMU support and dependency Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Cc: Ian Rogers Link: https://lore.kernel.org/r/20250521221529.2547099-1-irogers@google.com Signed-off-by: Jason Zeng --- include/uapi/linux/perf_event.h | 652 +++++++++++++------------- tools/include/uapi/linux/perf_event.h | 652 +++++++++++++------------- 2 files changed, 662 insertions(+), 642 deletions(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 008403fe0b0b3..365184d931ada 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -39,18 +39,21 @@ enum perf_type_id { /* * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA * AA: hardware event ID * EEEEEEEE: PMU type ID + * * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB * BB: hardware cache ID * CC: hardware cache op ID * DD: hardware cache op result ID * EEEEEEEE: PMU type ID - * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + * + * If the PMU type ID is 0, PERF_TYPE_RAW will be applied. */ -#define PERF_PMU_TYPE_SHIFT 32 -#define PERF_HW_EVENT_MASK 0xffffffff +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff /* * Generalized performance event event_id types, used by the @@ -112,7 +115,7 @@ enum perf_hw_cache_op_result_id { /* * Special "software" events provided by the kernel, even if the hardware * does not support performance events. These events measure various - * physical and sw events of the kernel (and allow the profiling of them as + * physical and SW events of the kernel (and allow the profiling of them as * well): */ enum perf_sw_ids { @@ -167,8 +170,9 @@ enum perf_event_sample_format { }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) + /* - * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * Values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set. * * If the user does not pass priv level information via branch_sample_type, * the kernel uses the event's priv level. Branch and event priv levels do @@ -178,20 +182,20 @@ enum perf_event_sample_format { * of branches and therefore it supersedes all the other types. */ enum perf_branch_sample_type_shift { - PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ - PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ - PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ - - PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ - PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ - PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ - PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ - PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ - PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ - PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ - PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* CALL/RET stack */ PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ @@ -210,96 +214,95 @@ enum perf_branch_sample_type_shift { }; enum perf_branch_sample_type { - PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, - PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, - PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, - PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, - PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, - PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, - PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, - PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, - PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, - PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, - PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, + PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, - PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, - PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, - PERF_SAMPLE_BRANCH_TYPE_SAVE = - 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, - PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, - PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, - PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, - PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; /* - * Common flow change classification + * Common control flow change classifications: */ enum { - PERF_BR_UNKNOWN = 0, /* unknown */ - PERF_BR_COND = 1, /* conditional */ - PERF_BR_UNCOND = 2, /* unconditional */ - PERF_BR_IND = 3, /* indirect */ - PERF_BR_CALL = 4, /* function call */ - PERF_BR_IND_CALL = 5, /* indirect function call */ - PERF_BR_RET = 6, /* function return */ - PERF_BR_SYSCALL = 7, /* syscall */ - PERF_BR_SYSRET = 8, /* syscall return */ - PERF_BR_COND_CALL = 9, /* conditional function call */ - PERF_BR_COND_RET = 10, /* conditional function return */ - PERF_BR_ERET = 11, /* exception return */ - PERF_BR_IRQ = 12, /* irq */ - PERF_BR_SERROR = 13, /* system error */ - PERF_BR_NO_TX = 14, /* not in transaction */ - PERF_BR_EXTEND_ABI = 15, /* extend ABI */ + PERF_BR_UNKNOWN = 0, /* Unknown */ + PERF_BR_COND = 1, /* Conditional */ + PERF_BR_UNCOND = 2, /* Unconditional */ + PERF_BR_IND = 3, /* Indirect */ + PERF_BR_CALL = 4, /* Function call */ + PERF_BR_IND_CALL = 5, /* Indirect function call */ + PERF_BR_RET = 6, /* Function return */ + PERF_BR_SYSCALL = 7, /* Syscall */ + PERF_BR_SYSRET = 8, /* Syscall return */ + PERF_BR_COND_CALL = 9, /* Conditional function call */ + PERF_BR_COND_RET = 10, /* Conditional function return */ + PERF_BR_ERET = 11, /* Exception return */ + PERF_BR_IRQ = 12, /* IRQ */ + PERF_BR_SERROR = 13, /* System error */ + PERF_BR_NO_TX = 14, /* Not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* Extend ABI */ PERF_BR_MAX, }; /* - * Common branch speculation outcome classification + * Common branch speculation outcome classifications: */ enum { - PERF_BR_SPEC_NA = 0, /* Not available */ - PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ - PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ - PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ PERF_BR_SPEC_MAX, }; enum { - PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ - PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ - PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ - PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ - PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ - PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ - PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ - PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ PERF_BR_NEW_MAX, }; enum { - PERF_BR_PRIV_UNKNOWN = 0, - PERF_BR_PRIV_USER = 1, - PERF_BR_PRIV_KERNEL = 2, - PERF_BR_PRIV_HV = 3, + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, }; -#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 -#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 -#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 -#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 -#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ @@ -310,9 +313,9 @@ enum { * Values to determine ABI of the registers dump. */ enum perf_sample_regs_abi { - PERF_SAMPLE_REGS_ABI_NONE = 0, - PERF_SAMPLE_REGS_ABI_32 = 1, - PERF_SAMPLE_REGS_ABI_64 = 2, + PERF_SAMPLE_REGS_ABI_NONE = 0, + PERF_SAMPLE_REGS_ABI_32 = 1, + PERF_SAMPLE_REGS_ABI_64 = 2, }; /* @@ -320,21 +323,21 @@ enum perf_sample_regs_abi { * abort events. Multiple bits can be set. */ enum { - PERF_TXN_ELISION = (1 << 0), /* From elision */ - PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ - PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ - PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ - PERF_TXN_RETRY = (1 << 4), /* Retry possible */ - PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ - PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ - PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ + PERF_TXN_ELISION = (1 << 0), /* From elision */ + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ + PERF_TXN_ASYNC = (1 << 3), /* Instruction is not related */ + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ - PERF_TXN_MAX = (1 << 8), /* non-ABI */ + PERF_TXN_MAX = (1 << 8), /* non-ABI */ - /* bits 32..63 are reserved for the abort code */ + /* Bits 32..63 are reserved for the abort code */ - PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), - PERF_TXN_ABORT_SHIFT = 32, + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), + PERF_TXN_ABORT_SHIFT = 32, }; /* @@ -369,24 +372,22 @@ enum perf_event_read_format { PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ }; -#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ -#define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ -#define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ -#define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ - /* add: sample_stack_user */ -#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ -#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ -#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ -#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ -#define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ +#define PERF_ATTR_SIZE_VER0 64 /* Size of first published 'struct perf_event_attr' */ +#define PERF_ATTR_SIZE_VER1 72 /* Add: config2 */ +#define PERF_ATTR_SIZE_VER2 80 /* Add: branch_sample_type */ +#define PERF_ATTR_SIZE_VER3 96 /* Add: sample_regs_user */ + /* Add: sample_stack_user */ +#define PERF_ATTR_SIZE_VER4 104 /* Add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* Add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */ +#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */ /* - * Hardware event_id to monitor via a performance monitoring event: - * - * @sample_max_stack: Max number of frame pointers in a callchain, - * should be < /proc/sys/kernel/perf_event_max_stack - * Max number of entries of branch stack - * should be < hardware limit + * 'struct perf_event_attr' contains various attributes that define + * a performance event - most of them hardware related configuration + * details, but also a lot of behavioral switches and values implemented + * by the kernel. */ struct perf_event_attr { @@ -396,7 +397,7 @@ struct perf_event_attr { __u32 type; /* - * Size of the attr structure, for fwd/bwd compat. + * Size of the attr structure, for forward/backwards compatibility. */ __u32 size; @@ -451,21 +452,21 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid : 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - write_backward : 1, /* Write ring buffer from end to beginning */ + write_backward : 1, /* write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ ksymbol : 1, /* include ksymbol events */ - bpf_event : 1, /* include bpf events */ + bpf_event : 1, /* include BPF events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ - build_id : 1, /* use build id in mmap2 events */ + build_id : 1, /* use build ID in mmap2 events */ inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ __reserved_1 : 26; union { - __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_events; /* wake up every n events */ __u32 wakeup_watermark; /* bytes before wakeup */ }; @@ -474,13 +475,13 @@ struct perf_event_attr { __u64 bp_addr; __u64 kprobe_func; /* for perf_kprobe */ __u64 uprobe_path; /* for perf_uprobe */ - __u64 config1; /* extension of config */ + __u64 config1; /* extension of config */ }; union { __u64 bp_len; - __u64 kprobe_addr; /* when kprobe_func == NULL */ + __u64 kprobe_addr; /* when kprobe_func == NULL */ __u64 probe_offset; /* for perf_[k,u]probe */ - __u64 config2; /* extension of config1 */ + __u64 config2; /* extension of config1 */ }; __u64 branch_sample_type; /* enum perf_branch_sample_type */ @@ -510,7 +511,16 @@ struct perf_event_attr { * Wakeup watermark for AUX area */ __u32 aux_watermark; + + /* + * Max number of frame pointers in a callchain, should be + * lower than /proc/sys/kernel/perf_event_max_stack. + * + * Max number of entries of branch stack should be lower + * than the hardware limit. + */ __u16 sample_max_stack; + __u16 __reserved_2; __u32 aux_sample_size; __u32 __reserved_3; @@ -528,7 +538,7 @@ struct perf_event_attr { /* * Structure used by below PERF_EVENT_IOC_QUERY_BPF command - * to query bpf programs attached to the same perf tracepoint + * to query BPF programs attached to the same perf tracepoint * as the given perf event. */ struct perf_event_query_bpf { @@ -550,21 +560,21 @@ struct perf_event_query_bpf { /* * Ioctls that can be done on a perf event fd: */ -#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) -#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) -#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) -#define PERF_EVENT_IOC_RESET _IO ('$', 3) -#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) -#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) -#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) -#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) -#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) -#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW ('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW ('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR ('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW ('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW ('$', 9, __u32) #define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) -#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW ('$', 11, struct perf_event_attr *) enum perf_event_ioc_flags { - PERF_IOC_FLAG_GROUP = 1U << 0, + PERF_IOC_FLAG_GROUP = 1U << 0, }; /* @@ -575,7 +585,7 @@ struct perf_event_mmap_page { __u32 compat_version; /* lowest version this is compat with */ /* - * Bits needed to read the hw events in user-space. + * Bits needed to read the HW events in user-space. * * u32 seq, time_mult, time_shift, index, width; * u64 count, enabled, running; @@ -613,7 +623,7 @@ struct perf_event_mmap_page { __u32 index; /* hardware event identifier */ __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ - __u64 time_running; /* time event on cpu */ + __u64 time_running; /* time event on CPU */ union { __u64 capabilities; struct { @@ -641,7 +651,7 @@ struct perf_event_mmap_page { /* * If cap_usr_time the below fields can be used to compute the time - * delta since time_enabled (in ns) using rdtsc or similar. + * delta since time_enabled (in ns) using RDTSC or similar. * * u64 quot, rem; * u64 delta; @@ -714,7 +724,7 @@ struct perf_event_mmap_page { * after reading this value. * * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data, after issueing + * written by user-space to reflect the last read data, after issuing * an smp_mb() to separate the data read from the ->data_tail store. * In this case the kernel will not over-write unread data. * @@ -730,7 +740,7 @@ struct perf_event_mmap_page { /* * AUX area is defined by aux_{offset,size} fields that should be set - * by the userspace, so that + * by the user-space, so that * * aux_offset >= data_offset + data_size * @@ -804,7 +814,7 @@ struct perf_event_mmap_page { * Indicates that thread was preempted in TASK_RUNNING state. * * PERF_RECORD_MISC_MMAP_BUILD_ID: - * Indicates that mmap2 event carries build id data. + * Indicates that mmap2 event carries build ID data. */ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) @@ -815,26 +825,26 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) struct perf_event_header { - __u32 type; - __u16 misc; - __u16 size; + __u32 type; + __u16 misc; + __u16 size; }; struct perf_ns_link_info { - __u64 dev; - __u64 ino; + __u64 dev; + __u64 ino; }; enum { - NET_NS_INDEX = 0, - UTS_NS_INDEX = 1, - IPC_NS_INDEX = 2, - PID_NS_INDEX = 3, - USER_NS_INDEX = 4, - MNT_NS_INDEX = 5, - CGROUP_NS_INDEX = 6, - - NR_NAMESPACES, /* number of available namespaces */ + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ }; enum perf_event_type { @@ -850,11 +860,11 @@ enum perf_event_type { * optional fields being ignored. * * struct sample_id { - * { u32 pid, tid; } && PERF_SAMPLE_TID - * { u64 time; } && PERF_SAMPLE_TIME - * { u64 id; } && PERF_SAMPLE_ID - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID - * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 id; } && PERF_SAMPLE_IDENTIFIER * } && perf_event_attr::sample_id_all * @@ -865,7 +875,7 @@ enum perf_event_type { /* * The MMAP events record the PROT_EXEC mappings so that we can - * correlate userspace IPs to code. They have the following structure: + * correlate user-space IPs to code. They have the following structure: * * struct { * struct perf_event_header header; @@ -875,7 +885,7 @@ enum perf_event_type { * u64 len; * u64 pgoff; * char filename[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP = 1, @@ -885,7 +895,7 @@ enum perf_event_type { * struct perf_event_header header; * u64 id; * u64 lost; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_LOST = 2, @@ -896,7 +906,7 @@ enum perf_event_type { * * u32 pid, tid; * char comm[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_COMM = 3, @@ -907,7 +917,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_EXIT = 4, @@ -918,7 +928,7 @@ enum perf_event_type { * u64 time; * u64 id; * u64 stream_id; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_THROTTLE = 5, @@ -930,7 +940,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_FORK = 7, @@ -941,7 +951,7 @@ enum perf_event_type { * u32 pid, tid; * * struct read_format values; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_READ = 8, @@ -996,12 +1006,12 @@ enum perf_event_type { * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS * } && PERF_SAMPLE_BRANCH_STACK * - * { u64 abi; # enum perf_sample_regs_abi - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER * - * { u64 size; - * char data[size]; - * u64 dyn_size; } && PERF_SAMPLE_STACK_USER + * { u64 size; + * char data[size]; + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * * { union perf_sample_weight * { @@ -1061,7 +1071,7 @@ enum perf_event_type { * }; * u32 prot, flags; * char filename[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP2 = 10, @@ -1070,12 +1080,12 @@ enum perf_event_type { * Records that new data landed in the AUX buffer part. * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u64 aux_offset; - * u64 aux_size; + * u64 aux_offset; + * u64 aux_size; * u64 flags; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_AUX = 11, @@ -1158,7 +1168,7 @@ enum perf_event_type { PERF_RECORD_KSYMBOL = 17, /* - * Record bpf events: + * Record BPF events: * enum perf_bpf_event_type { * PERF_BPF_EVENT_UNKNOWN = 0, * PERF_BPF_EVENT_PROG_LOAD = 1, @@ -1236,181 +1246,181 @@ enum perf_record_ksymbol_type { #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) enum perf_bpf_event_type { - PERF_BPF_EVENT_UNKNOWN = 0, - PERF_BPF_EVENT_PROG_LOAD = 1, - PERF_BPF_EVENT_PROG_UNLOAD = 2, - PERF_BPF_EVENT_MAX, /* non-ABI */ + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ }; -#define PERF_MAX_STACK_DEPTH 127 -#define PERF_MAX_CONTEXTS_PER_STACK 8 +#define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 enum perf_callchain_context { - PERF_CONTEXT_HV = (__u64)-32, - PERF_CONTEXT_KERNEL = (__u64)-128, - PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, - PERF_CONTEXT_GUEST = (__u64)-2048, - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, - PERF_CONTEXT_GUEST_USER = (__u64)-2560, + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, - PERF_CONTEXT_MAX = (__u64)-4095, + PERF_CONTEXT_MAX = (__u64)-4095, }; /** * PERF_RECORD_AUX::flags bits */ -#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ -#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ -#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ -#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_TRUNCATED 0x0001 /* Record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x0002 /* Snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x0004 /* Record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x0008 /* Sample collided with another */ #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ /* CoreSight PMU AUX buffer formats */ -#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ -#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ -#define PERF_FLAG_FD_NO_GROUP (1UL << 0) -#define PERF_FLAG_FD_OUTPUT (1UL << 1) -#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ -#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ +#define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#define PERF_FLAG_FD_OUTPUT (1UL << 1) +#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup ID, per-CPU mode only */ +#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ #if defined(__LITTLE_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_op:5, /* type of opcode */ - mem_lvl:14, /* memory hierarchy level */ - mem_snoop:5, /* snoop mode */ - mem_lock:2, /* lock instr */ - mem_dtlb:7, /* tlb access */ - mem_lvl_num:4, /* memory hierarchy level number */ - mem_remote:1, /* remote */ - mem_snoopx:2, /* snoop mode, ext */ - mem_blk:3, /* access blocked */ - mem_hops:3, /* hop level */ - mem_rsvd:18; + __u64 mem_op : 5, /* Type of opcode */ + mem_lvl : 14, /* Memory hierarchy level */ + mem_snoop : 5, /* Snoop mode */ + mem_lock : 2, /* Lock instr */ + mem_dtlb : 7, /* TLB access */ + mem_lvl_num : 4, /* Memory hierarchy level number */ + mem_remote : 1, /* Remote */ + mem_snoopx : 2, /* Snoop mode, ext */ + mem_blk : 3, /* Access blocked */ + mem_hops : 3, /* Hop level */ + mem_rsvd : 18; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:18, - mem_hops:3, /* hop level */ - mem_blk:3, /* access blocked */ - mem_snoopx:2, /* snoop mode, ext */ - mem_remote:1, /* remote */ - mem_lvl_num:4, /* memory hierarchy level number */ - mem_dtlb:7, /* tlb access */ - mem_lock:2, /* lock instr */ - mem_snoop:5, /* snoop mode */ - mem_lvl:14, /* memory hierarchy level */ - mem_op:5; /* type of opcode */ + __u64 mem_rsvd : 18, + mem_hops : 3, /* Hop level */ + mem_blk : 3, /* Access blocked */ + mem_snoopx : 2, /* Snoop mode, ext */ + mem_remote : 1, /* Remote */ + mem_lvl_num : 4, /* Memory hierarchy level number */ + mem_dtlb : 7, /* TLB access */ + mem_lock : 2, /* Lock instr */ + mem_snoop : 5, /* Snoop mode */ + mem_lvl : 14, /* Memory hierarchy level */ + mem_op : 5; /* Type of opcode */ }; }; #else -#error "Unknown endianness" +# error "Unknown endianness" #endif -/* type of opcode (load/store/prefetch,code) */ -#define PERF_MEM_OP_NA 0x01 /* not available */ -#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ -#define PERF_MEM_OP_STORE 0x04 /* store instruction */ -#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ -#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ -#define PERF_MEM_OP_SHIFT 0 +/* Type of memory opcode: */ +#define PERF_MEM_OP_NA 0x0001 /* Not available */ +#define PERF_MEM_OP_LOAD 0x0002 /* Load instruction */ +#define PERF_MEM_OP_STORE 0x0004 /* Store instruction */ +#define PERF_MEM_OP_PFETCH 0x0008 /* Prefetch */ +#define PERF_MEM_OP_EXEC 0x0010 /* Code (execution) */ +#define PERF_MEM_OP_SHIFT 0 /* - * PERF_MEM_LVL_* namespace being depricated to some extent in the + * The PERF_MEM_LVL_* namespace is being deprecated to some extent in * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. - * Supporting this namespace inorder to not break defined ABIs. + * We support this namespace in order to not break defined ABIs. * - * memory hierarchy (memory level, hit or miss) + * Memory hierarchy (memory level, hit or miss) */ -#define PERF_MEM_LVL_NA 0x01 /* not available */ -#define PERF_MEM_LVL_HIT 0x02 /* hit level */ -#define PERF_MEM_LVL_MISS 0x04 /* miss level */ -#define PERF_MEM_LVL_L1 0x08 /* L1 */ -#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ -#define PERF_MEM_LVL_L2 0x20 /* L2 */ -#define PERF_MEM_LVL_L3 0x40 /* L3 */ -#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ -#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ -#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ -#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ -#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ -#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ -#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ -#define PERF_MEM_LVL_SHIFT 5 - -#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ -#define PERF_MEM_REMOTE_SHIFT 37 - -#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ -#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ -#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ -#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ -#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ -/* 0x7 available */ -#define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ -#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ -#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ -#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ -#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ -#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ -#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ - -#define PERF_MEM_LVLNUM_SHIFT 33 - -/* snoop mode */ -#define PERF_MEM_SNOOP_NA 0x01 /* not available */ -#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ -#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ -#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ -#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ -#define PERF_MEM_SNOOP_SHIFT 19 - -#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ -#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ -#define PERF_MEM_SNOOPX_SHIFT 38 - -/* locked instruction */ -#define PERF_MEM_LOCK_NA 0x01 /* not available */ -#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ -#define PERF_MEM_LOCK_SHIFT 24 +#define PERF_MEM_LVL_NA 0x0001 /* Not available */ +#define PERF_MEM_LVL_HIT 0x0002 /* Hit level */ +#define PERF_MEM_LVL_MISS 0x0004 /* Miss level */ +#define PERF_MEM_LVL_L1 0x0008 /* L1 */ +#define PERF_MEM_LVL_LFB 0x0010 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x0020 /* L2 */ +#define PERF_MEM_LVL_L3 0x0040 /* L3 */ +#define PERF_MEM_LVL_LOC_RAM 0x0080 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x0100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x0200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x0400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x0800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +#define PERF_MEM_REMOTE_REMOTE 0x0001 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x0001 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x0002 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x0003 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ +#define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ +/* 0x007 available */ +#define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ +#define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ +#define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x000b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x000c /* LFB / L1 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_RAM 0x000d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x000e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x000f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + +/* Snoop mode */ +#define PERF_MEM_SNOOP_NA 0x0001 /* Not available */ +#define PERF_MEM_SNOOP_NONE 0x0002 /* No snoop */ +#define PERF_MEM_SNOOP_HIT 0x0004 /* Snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x0008 /* Snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x0010 /* Snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +#define PERF_MEM_SNOOPX_FWD 0x0001 /* Forward */ +#define PERF_MEM_SNOOPX_PEER 0x0002 /* Transfer from peer */ +#define PERF_MEM_SNOOPX_SHIFT 38 + +/* Locked instruction */ +#define PERF_MEM_LOCK_NA 0x0001 /* Not available */ +#define PERF_MEM_LOCK_LOCKED 0x0002 /* Locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 /* TLB access */ -#define PERF_MEM_TLB_NA 0x01 /* not available */ -#define PERF_MEM_TLB_HIT 0x02 /* hit level */ -#define PERF_MEM_TLB_MISS 0x04 /* miss level */ -#define PERF_MEM_TLB_L1 0x08 /* L1 */ -#define PERF_MEM_TLB_L2 0x10 /* L2 */ -#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ -#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ -#define PERF_MEM_TLB_SHIFT 26 +#define PERF_MEM_TLB_NA 0x0001 /* Not available */ +#define PERF_MEM_TLB_HIT 0x0002 /* Hit level */ +#define PERF_MEM_TLB_MISS 0x0004 /* Miss level */ +#define PERF_MEM_TLB_L1 0x0008 /* L1 */ +#define PERF_MEM_TLB_L2 0x0010 /* L2 */ +#define PERF_MEM_TLB_WK 0x0020 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x0040 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 /* Access blocked */ -#define PERF_MEM_BLK_NA 0x01 /* not available */ -#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ -#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ -#define PERF_MEM_BLK_SHIFT 40 - -/* hop level */ -#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ -#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ -#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ -#define PERF_MEM_HOPS_3 0x04 /* remote board */ +#define PERF_MEM_BLK_NA 0x0001 /* Not available */ +#define PERF_MEM_BLK_DATA 0x0002 /* Data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x0004 /* Address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + +/* Hop level */ +#define PERF_MEM_HOPS_0 0x0001 /* Remote core, same node */ +#define PERF_MEM_HOPS_1 0x0002 /* Remote node, same socket */ +#define PERF_MEM_HOPS_2 0x0003 /* Remote socket, same board */ +#define PERF_MEM_HOPS_3 0x0004 /* Remote board */ /* 5-7 available */ -#define PERF_MEM_HOPS_SHIFT 43 +#define PERF_MEM_HOPS_SHIFT 43 #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) /* - * single taken branch record layout: + * Layout of single taken branch records: * * from: source instruction (may not always be a branch insn) * to: branch target @@ -1429,37 +1439,37 @@ union perf_mem_data_src { struct perf_branch_entry { __u64 from; __u64 to; - __u64 mispred:1, /* target mispredicted */ - predicted:1,/* target predicted */ - in_tx:1, /* in transaction */ - abort:1, /* transaction abort */ - cycles:16, /* cycle count to last branch */ - type:4, /* branch type */ - spec:2, /* branch speculation info */ - new_type:4, /* additional branch type */ - priv:3, /* privilege level */ - reserved:31; + __u64 mispred : 1, /* target mispredicted */ + predicted : 1, /* target predicted */ + in_tx : 1, /* in transaction */ + abort : 1, /* transaction abort */ + cycles : 16, /* cycle count to last branch */ + type : 4, /* branch type */ + spec : 2, /* branch speculation info */ + new_type : 4, /* additional branch type */ + priv : 3, /* privilege level */ + reserved : 31; }; /* Size of used info bits in struct perf_branch_entry */ #define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 union perf_sample_weight { - __u64 full; + __u64 full; #if defined(__LITTLE_ENDIAN_BITFIELD) struct { - __u32 var1_dw; - __u16 var2_w; - __u16 var3_w; + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; }; #elif defined(__BIG_ENDIAN_BITFIELD) struct { - __u16 var3_w; - __u16 var2_w; - __u32 var1_dw; + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; }; #else -#error "Unknown endianness" +# error "Unknown endianness" #endif }; diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 008403fe0b0b3..365184d931ada 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -39,18 +39,21 @@ enum perf_type_id { /* * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA * AA: hardware event ID * EEEEEEEE: PMU type ID + * * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB * BB: hardware cache ID * CC: hardware cache op ID * DD: hardware cache op result ID * EEEEEEEE: PMU type ID - * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + * + * If the PMU type ID is 0, PERF_TYPE_RAW will be applied. */ -#define PERF_PMU_TYPE_SHIFT 32 -#define PERF_HW_EVENT_MASK 0xffffffff +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff /* * Generalized performance event event_id types, used by the @@ -112,7 +115,7 @@ enum perf_hw_cache_op_result_id { /* * Special "software" events provided by the kernel, even if the hardware * does not support performance events. These events measure various - * physical and sw events of the kernel (and allow the profiling of them as + * physical and SW events of the kernel (and allow the profiling of them as * well): */ enum perf_sw_ids { @@ -167,8 +170,9 @@ enum perf_event_sample_format { }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) + /* - * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * Values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set. * * If the user does not pass priv level information via branch_sample_type, * the kernel uses the event's priv level. Branch and event priv levels do @@ -178,20 +182,20 @@ enum perf_event_sample_format { * of branches and therefore it supersedes all the other types. */ enum perf_branch_sample_type_shift { - PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ - PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ - PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ - - PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ - PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ - PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ - PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ - PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ - PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ - PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ - PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* CALL/RET stack */ PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ @@ -210,96 +214,95 @@ enum perf_branch_sample_type_shift { }; enum perf_branch_sample_type { - PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, - PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, - PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, - PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, - PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, - PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, - PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, - PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, - PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, - PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, - PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, + PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, - PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, - PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, - PERF_SAMPLE_BRANCH_TYPE_SAVE = - 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, - PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, - PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, - PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, - PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; /* - * Common flow change classification + * Common control flow change classifications: */ enum { - PERF_BR_UNKNOWN = 0, /* unknown */ - PERF_BR_COND = 1, /* conditional */ - PERF_BR_UNCOND = 2, /* unconditional */ - PERF_BR_IND = 3, /* indirect */ - PERF_BR_CALL = 4, /* function call */ - PERF_BR_IND_CALL = 5, /* indirect function call */ - PERF_BR_RET = 6, /* function return */ - PERF_BR_SYSCALL = 7, /* syscall */ - PERF_BR_SYSRET = 8, /* syscall return */ - PERF_BR_COND_CALL = 9, /* conditional function call */ - PERF_BR_COND_RET = 10, /* conditional function return */ - PERF_BR_ERET = 11, /* exception return */ - PERF_BR_IRQ = 12, /* irq */ - PERF_BR_SERROR = 13, /* system error */ - PERF_BR_NO_TX = 14, /* not in transaction */ - PERF_BR_EXTEND_ABI = 15, /* extend ABI */ + PERF_BR_UNKNOWN = 0, /* Unknown */ + PERF_BR_COND = 1, /* Conditional */ + PERF_BR_UNCOND = 2, /* Unconditional */ + PERF_BR_IND = 3, /* Indirect */ + PERF_BR_CALL = 4, /* Function call */ + PERF_BR_IND_CALL = 5, /* Indirect function call */ + PERF_BR_RET = 6, /* Function return */ + PERF_BR_SYSCALL = 7, /* Syscall */ + PERF_BR_SYSRET = 8, /* Syscall return */ + PERF_BR_COND_CALL = 9, /* Conditional function call */ + PERF_BR_COND_RET = 10, /* Conditional function return */ + PERF_BR_ERET = 11, /* Exception return */ + PERF_BR_IRQ = 12, /* IRQ */ + PERF_BR_SERROR = 13, /* System error */ + PERF_BR_NO_TX = 14, /* Not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* Extend ABI */ PERF_BR_MAX, }; /* - * Common branch speculation outcome classification + * Common branch speculation outcome classifications: */ enum { - PERF_BR_SPEC_NA = 0, /* Not available */ - PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ - PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ - PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ PERF_BR_SPEC_MAX, }; enum { - PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ - PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ - PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ - PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ - PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ - PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ - PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ - PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ PERF_BR_NEW_MAX, }; enum { - PERF_BR_PRIV_UNKNOWN = 0, - PERF_BR_PRIV_USER = 1, - PERF_BR_PRIV_KERNEL = 2, - PERF_BR_PRIV_HV = 3, + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, }; -#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 -#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 -#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 -#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 -#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ @@ -310,9 +313,9 @@ enum { * Values to determine ABI of the registers dump. */ enum perf_sample_regs_abi { - PERF_SAMPLE_REGS_ABI_NONE = 0, - PERF_SAMPLE_REGS_ABI_32 = 1, - PERF_SAMPLE_REGS_ABI_64 = 2, + PERF_SAMPLE_REGS_ABI_NONE = 0, + PERF_SAMPLE_REGS_ABI_32 = 1, + PERF_SAMPLE_REGS_ABI_64 = 2, }; /* @@ -320,21 +323,21 @@ enum perf_sample_regs_abi { * abort events. Multiple bits can be set. */ enum { - PERF_TXN_ELISION = (1 << 0), /* From elision */ - PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ - PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ - PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ - PERF_TXN_RETRY = (1 << 4), /* Retry possible */ - PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ - PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ - PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ + PERF_TXN_ELISION = (1 << 0), /* From elision */ + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ + PERF_TXN_ASYNC = (1 << 3), /* Instruction is not related */ + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ - PERF_TXN_MAX = (1 << 8), /* non-ABI */ + PERF_TXN_MAX = (1 << 8), /* non-ABI */ - /* bits 32..63 are reserved for the abort code */ + /* Bits 32..63 are reserved for the abort code */ - PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), - PERF_TXN_ABORT_SHIFT = 32, + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), + PERF_TXN_ABORT_SHIFT = 32, }; /* @@ -369,24 +372,22 @@ enum perf_event_read_format { PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ }; -#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ -#define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ -#define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ -#define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ - /* add: sample_stack_user */ -#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ -#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ -#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ -#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ -#define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ +#define PERF_ATTR_SIZE_VER0 64 /* Size of first published 'struct perf_event_attr' */ +#define PERF_ATTR_SIZE_VER1 72 /* Add: config2 */ +#define PERF_ATTR_SIZE_VER2 80 /* Add: branch_sample_type */ +#define PERF_ATTR_SIZE_VER3 96 /* Add: sample_regs_user */ + /* Add: sample_stack_user */ +#define PERF_ATTR_SIZE_VER4 104 /* Add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* Add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */ +#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */ /* - * Hardware event_id to monitor via a performance monitoring event: - * - * @sample_max_stack: Max number of frame pointers in a callchain, - * should be < /proc/sys/kernel/perf_event_max_stack - * Max number of entries of branch stack - * should be < hardware limit + * 'struct perf_event_attr' contains various attributes that define + * a performance event - most of them hardware related configuration + * details, but also a lot of behavioral switches and values implemented + * by the kernel. */ struct perf_event_attr { @@ -396,7 +397,7 @@ struct perf_event_attr { __u32 type; /* - * Size of the attr structure, for fwd/bwd compat. + * Size of the attr structure, for forward/backwards compatibility. */ __u32 size; @@ -451,21 +452,21 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid : 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - write_backward : 1, /* Write ring buffer from end to beginning */ + write_backward : 1, /* write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ ksymbol : 1, /* include ksymbol events */ - bpf_event : 1, /* include bpf events */ + bpf_event : 1, /* include BPF events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ - build_id : 1, /* use build id in mmap2 events */ + build_id : 1, /* use build ID in mmap2 events */ inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ __reserved_1 : 26; union { - __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_events; /* wake up every n events */ __u32 wakeup_watermark; /* bytes before wakeup */ }; @@ -474,13 +475,13 @@ struct perf_event_attr { __u64 bp_addr; __u64 kprobe_func; /* for perf_kprobe */ __u64 uprobe_path; /* for perf_uprobe */ - __u64 config1; /* extension of config */ + __u64 config1; /* extension of config */ }; union { __u64 bp_len; - __u64 kprobe_addr; /* when kprobe_func == NULL */ + __u64 kprobe_addr; /* when kprobe_func == NULL */ __u64 probe_offset; /* for perf_[k,u]probe */ - __u64 config2; /* extension of config1 */ + __u64 config2; /* extension of config1 */ }; __u64 branch_sample_type; /* enum perf_branch_sample_type */ @@ -510,7 +511,16 @@ struct perf_event_attr { * Wakeup watermark for AUX area */ __u32 aux_watermark; + + /* + * Max number of frame pointers in a callchain, should be + * lower than /proc/sys/kernel/perf_event_max_stack. + * + * Max number of entries of branch stack should be lower + * than the hardware limit. + */ __u16 sample_max_stack; + __u16 __reserved_2; __u32 aux_sample_size; __u32 __reserved_3; @@ -528,7 +538,7 @@ struct perf_event_attr { /* * Structure used by below PERF_EVENT_IOC_QUERY_BPF command - * to query bpf programs attached to the same perf tracepoint + * to query BPF programs attached to the same perf tracepoint * as the given perf event. */ struct perf_event_query_bpf { @@ -550,21 +560,21 @@ struct perf_event_query_bpf { /* * Ioctls that can be done on a perf event fd: */ -#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) -#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) -#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) -#define PERF_EVENT_IOC_RESET _IO ('$', 3) -#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) -#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) -#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) -#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) -#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) -#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW ('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW ('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR ('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW ('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW ('$', 9, __u32) #define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) -#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW ('$', 11, struct perf_event_attr *) enum perf_event_ioc_flags { - PERF_IOC_FLAG_GROUP = 1U << 0, + PERF_IOC_FLAG_GROUP = 1U << 0, }; /* @@ -575,7 +585,7 @@ struct perf_event_mmap_page { __u32 compat_version; /* lowest version this is compat with */ /* - * Bits needed to read the hw events in user-space. + * Bits needed to read the HW events in user-space. * * u32 seq, time_mult, time_shift, index, width; * u64 count, enabled, running; @@ -613,7 +623,7 @@ struct perf_event_mmap_page { __u32 index; /* hardware event identifier */ __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ - __u64 time_running; /* time event on cpu */ + __u64 time_running; /* time event on CPU */ union { __u64 capabilities; struct { @@ -641,7 +651,7 @@ struct perf_event_mmap_page { /* * If cap_usr_time the below fields can be used to compute the time - * delta since time_enabled (in ns) using rdtsc or similar. + * delta since time_enabled (in ns) using RDTSC or similar. * * u64 quot, rem; * u64 delta; @@ -714,7 +724,7 @@ struct perf_event_mmap_page { * after reading this value. * * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data, after issueing + * written by user-space to reflect the last read data, after issuing * an smp_mb() to separate the data read from the ->data_tail store. * In this case the kernel will not over-write unread data. * @@ -730,7 +740,7 @@ struct perf_event_mmap_page { /* * AUX area is defined by aux_{offset,size} fields that should be set - * by the userspace, so that + * by the user-space, so that * * aux_offset >= data_offset + data_size * @@ -804,7 +814,7 @@ struct perf_event_mmap_page { * Indicates that thread was preempted in TASK_RUNNING state. * * PERF_RECORD_MISC_MMAP_BUILD_ID: - * Indicates that mmap2 event carries build id data. + * Indicates that mmap2 event carries build ID data. */ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) @@ -815,26 +825,26 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) struct perf_event_header { - __u32 type; - __u16 misc; - __u16 size; + __u32 type; + __u16 misc; + __u16 size; }; struct perf_ns_link_info { - __u64 dev; - __u64 ino; + __u64 dev; + __u64 ino; }; enum { - NET_NS_INDEX = 0, - UTS_NS_INDEX = 1, - IPC_NS_INDEX = 2, - PID_NS_INDEX = 3, - USER_NS_INDEX = 4, - MNT_NS_INDEX = 5, - CGROUP_NS_INDEX = 6, - - NR_NAMESPACES, /* number of available namespaces */ + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ }; enum perf_event_type { @@ -850,11 +860,11 @@ enum perf_event_type { * optional fields being ignored. * * struct sample_id { - * { u32 pid, tid; } && PERF_SAMPLE_TID - * { u64 time; } && PERF_SAMPLE_TIME - * { u64 id; } && PERF_SAMPLE_ID - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID - * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 id; } && PERF_SAMPLE_IDENTIFIER * } && perf_event_attr::sample_id_all * @@ -865,7 +875,7 @@ enum perf_event_type { /* * The MMAP events record the PROT_EXEC mappings so that we can - * correlate userspace IPs to code. They have the following structure: + * correlate user-space IPs to code. They have the following structure: * * struct { * struct perf_event_header header; @@ -875,7 +885,7 @@ enum perf_event_type { * u64 len; * u64 pgoff; * char filename[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP = 1, @@ -885,7 +895,7 @@ enum perf_event_type { * struct perf_event_header header; * u64 id; * u64 lost; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_LOST = 2, @@ -896,7 +906,7 @@ enum perf_event_type { * * u32 pid, tid; * char comm[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_COMM = 3, @@ -907,7 +917,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_EXIT = 4, @@ -918,7 +928,7 @@ enum perf_event_type { * u64 time; * u64 id; * u64 stream_id; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_THROTTLE = 5, @@ -930,7 +940,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_FORK = 7, @@ -941,7 +951,7 @@ enum perf_event_type { * u32 pid, tid; * * struct read_format values; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_READ = 8, @@ -996,12 +1006,12 @@ enum perf_event_type { * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS * } && PERF_SAMPLE_BRANCH_STACK * - * { u64 abi; # enum perf_sample_regs_abi - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER * - * { u64 size; - * char data[size]; - * u64 dyn_size; } && PERF_SAMPLE_STACK_USER + * { u64 size; + * char data[size]; + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * * { union perf_sample_weight * { @@ -1061,7 +1071,7 @@ enum perf_event_type { * }; * u32 prot, flags; * char filename[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP2 = 10, @@ -1070,12 +1080,12 @@ enum perf_event_type { * Records that new data landed in the AUX buffer part. * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u64 aux_offset; - * u64 aux_size; + * u64 aux_offset; + * u64 aux_size; * u64 flags; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_AUX = 11, @@ -1158,7 +1168,7 @@ enum perf_event_type { PERF_RECORD_KSYMBOL = 17, /* - * Record bpf events: + * Record BPF events: * enum perf_bpf_event_type { * PERF_BPF_EVENT_UNKNOWN = 0, * PERF_BPF_EVENT_PROG_LOAD = 1, @@ -1236,181 +1246,181 @@ enum perf_record_ksymbol_type { #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) enum perf_bpf_event_type { - PERF_BPF_EVENT_UNKNOWN = 0, - PERF_BPF_EVENT_PROG_LOAD = 1, - PERF_BPF_EVENT_PROG_UNLOAD = 2, - PERF_BPF_EVENT_MAX, /* non-ABI */ + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ }; -#define PERF_MAX_STACK_DEPTH 127 -#define PERF_MAX_CONTEXTS_PER_STACK 8 +#define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 enum perf_callchain_context { - PERF_CONTEXT_HV = (__u64)-32, - PERF_CONTEXT_KERNEL = (__u64)-128, - PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, - PERF_CONTEXT_GUEST = (__u64)-2048, - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, - PERF_CONTEXT_GUEST_USER = (__u64)-2560, + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, - PERF_CONTEXT_MAX = (__u64)-4095, + PERF_CONTEXT_MAX = (__u64)-4095, }; /** * PERF_RECORD_AUX::flags bits */ -#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ -#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ -#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ -#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_TRUNCATED 0x0001 /* Record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x0002 /* Snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x0004 /* Record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x0008 /* Sample collided with another */ #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ /* CoreSight PMU AUX buffer formats */ -#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ -#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ -#define PERF_FLAG_FD_NO_GROUP (1UL << 0) -#define PERF_FLAG_FD_OUTPUT (1UL << 1) -#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ -#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ +#define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#define PERF_FLAG_FD_OUTPUT (1UL << 1) +#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup ID, per-CPU mode only */ +#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ #if defined(__LITTLE_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_op:5, /* type of opcode */ - mem_lvl:14, /* memory hierarchy level */ - mem_snoop:5, /* snoop mode */ - mem_lock:2, /* lock instr */ - mem_dtlb:7, /* tlb access */ - mem_lvl_num:4, /* memory hierarchy level number */ - mem_remote:1, /* remote */ - mem_snoopx:2, /* snoop mode, ext */ - mem_blk:3, /* access blocked */ - mem_hops:3, /* hop level */ - mem_rsvd:18; + __u64 mem_op : 5, /* Type of opcode */ + mem_lvl : 14, /* Memory hierarchy level */ + mem_snoop : 5, /* Snoop mode */ + mem_lock : 2, /* Lock instr */ + mem_dtlb : 7, /* TLB access */ + mem_lvl_num : 4, /* Memory hierarchy level number */ + mem_remote : 1, /* Remote */ + mem_snoopx : 2, /* Snoop mode, ext */ + mem_blk : 3, /* Access blocked */ + mem_hops : 3, /* Hop level */ + mem_rsvd : 18; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:18, - mem_hops:3, /* hop level */ - mem_blk:3, /* access blocked */ - mem_snoopx:2, /* snoop mode, ext */ - mem_remote:1, /* remote */ - mem_lvl_num:4, /* memory hierarchy level number */ - mem_dtlb:7, /* tlb access */ - mem_lock:2, /* lock instr */ - mem_snoop:5, /* snoop mode */ - mem_lvl:14, /* memory hierarchy level */ - mem_op:5; /* type of opcode */ + __u64 mem_rsvd : 18, + mem_hops : 3, /* Hop level */ + mem_blk : 3, /* Access blocked */ + mem_snoopx : 2, /* Snoop mode, ext */ + mem_remote : 1, /* Remote */ + mem_lvl_num : 4, /* Memory hierarchy level number */ + mem_dtlb : 7, /* TLB access */ + mem_lock : 2, /* Lock instr */ + mem_snoop : 5, /* Snoop mode */ + mem_lvl : 14, /* Memory hierarchy level */ + mem_op : 5; /* Type of opcode */ }; }; #else -#error "Unknown endianness" +# error "Unknown endianness" #endif -/* type of opcode (load/store/prefetch,code) */ -#define PERF_MEM_OP_NA 0x01 /* not available */ -#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ -#define PERF_MEM_OP_STORE 0x04 /* store instruction */ -#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ -#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ -#define PERF_MEM_OP_SHIFT 0 +/* Type of memory opcode: */ +#define PERF_MEM_OP_NA 0x0001 /* Not available */ +#define PERF_MEM_OP_LOAD 0x0002 /* Load instruction */ +#define PERF_MEM_OP_STORE 0x0004 /* Store instruction */ +#define PERF_MEM_OP_PFETCH 0x0008 /* Prefetch */ +#define PERF_MEM_OP_EXEC 0x0010 /* Code (execution) */ +#define PERF_MEM_OP_SHIFT 0 /* - * PERF_MEM_LVL_* namespace being depricated to some extent in the + * The PERF_MEM_LVL_* namespace is being deprecated to some extent in * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. - * Supporting this namespace inorder to not break defined ABIs. + * We support this namespace in order to not break defined ABIs. * - * memory hierarchy (memory level, hit or miss) + * Memory hierarchy (memory level, hit or miss) */ -#define PERF_MEM_LVL_NA 0x01 /* not available */ -#define PERF_MEM_LVL_HIT 0x02 /* hit level */ -#define PERF_MEM_LVL_MISS 0x04 /* miss level */ -#define PERF_MEM_LVL_L1 0x08 /* L1 */ -#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ -#define PERF_MEM_LVL_L2 0x20 /* L2 */ -#define PERF_MEM_LVL_L3 0x40 /* L3 */ -#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ -#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ -#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ -#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ -#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ -#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ -#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ -#define PERF_MEM_LVL_SHIFT 5 - -#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ -#define PERF_MEM_REMOTE_SHIFT 37 - -#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ -#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ -#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ -#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ -#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ -/* 0x7 available */ -#define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ -#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ -#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ -#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ -#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ -#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ -#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ - -#define PERF_MEM_LVLNUM_SHIFT 33 - -/* snoop mode */ -#define PERF_MEM_SNOOP_NA 0x01 /* not available */ -#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ -#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ -#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ -#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ -#define PERF_MEM_SNOOP_SHIFT 19 - -#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ -#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ -#define PERF_MEM_SNOOPX_SHIFT 38 - -/* locked instruction */ -#define PERF_MEM_LOCK_NA 0x01 /* not available */ -#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ -#define PERF_MEM_LOCK_SHIFT 24 +#define PERF_MEM_LVL_NA 0x0001 /* Not available */ +#define PERF_MEM_LVL_HIT 0x0002 /* Hit level */ +#define PERF_MEM_LVL_MISS 0x0004 /* Miss level */ +#define PERF_MEM_LVL_L1 0x0008 /* L1 */ +#define PERF_MEM_LVL_LFB 0x0010 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x0020 /* L2 */ +#define PERF_MEM_LVL_L3 0x0040 /* L3 */ +#define PERF_MEM_LVL_LOC_RAM 0x0080 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x0100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x0200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x0400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x0800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +#define PERF_MEM_REMOTE_REMOTE 0x0001 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x0001 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x0002 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x0003 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ +#define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ +/* 0x007 available */ +#define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ +#define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ +#define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x000b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x000c /* LFB / L1 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_RAM 0x000d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x000e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x000f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + +/* Snoop mode */ +#define PERF_MEM_SNOOP_NA 0x0001 /* Not available */ +#define PERF_MEM_SNOOP_NONE 0x0002 /* No snoop */ +#define PERF_MEM_SNOOP_HIT 0x0004 /* Snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x0008 /* Snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x0010 /* Snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +#define PERF_MEM_SNOOPX_FWD 0x0001 /* Forward */ +#define PERF_MEM_SNOOPX_PEER 0x0002 /* Transfer from peer */ +#define PERF_MEM_SNOOPX_SHIFT 38 + +/* Locked instruction */ +#define PERF_MEM_LOCK_NA 0x0001 /* Not available */ +#define PERF_MEM_LOCK_LOCKED 0x0002 /* Locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 /* TLB access */ -#define PERF_MEM_TLB_NA 0x01 /* not available */ -#define PERF_MEM_TLB_HIT 0x02 /* hit level */ -#define PERF_MEM_TLB_MISS 0x04 /* miss level */ -#define PERF_MEM_TLB_L1 0x08 /* L1 */ -#define PERF_MEM_TLB_L2 0x10 /* L2 */ -#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ -#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ -#define PERF_MEM_TLB_SHIFT 26 +#define PERF_MEM_TLB_NA 0x0001 /* Not available */ +#define PERF_MEM_TLB_HIT 0x0002 /* Hit level */ +#define PERF_MEM_TLB_MISS 0x0004 /* Miss level */ +#define PERF_MEM_TLB_L1 0x0008 /* L1 */ +#define PERF_MEM_TLB_L2 0x0010 /* L2 */ +#define PERF_MEM_TLB_WK 0x0020 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x0040 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 /* Access blocked */ -#define PERF_MEM_BLK_NA 0x01 /* not available */ -#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ -#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ -#define PERF_MEM_BLK_SHIFT 40 - -/* hop level */ -#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ -#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ -#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ -#define PERF_MEM_HOPS_3 0x04 /* remote board */ +#define PERF_MEM_BLK_NA 0x0001 /* Not available */ +#define PERF_MEM_BLK_DATA 0x0002 /* Data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x0004 /* Address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + +/* Hop level */ +#define PERF_MEM_HOPS_0 0x0001 /* Remote core, same node */ +#define PERF_MEM_HOPS_1 0x0002 /* Remote node, same socket */ +#define PERF_MEM_HOPS_2 0x0003 /* Remote socket, same board */ +#define PERF_MEM_HOPS_3 0x0004 /* Remote board */ /* 5-7 available */ -#define PERF_MEM_HOPS_SHIFT 43 +#define PERF_MEM_HOPS_SHIFT 43 #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) /* - * single taken branch record layout: + * Layout of single taken branch records: * * from: source instruction (may not always be a branch insn) * to: branch target @@ -1429,37 +1439,37 @@ union perf_mem_data_src { struct perf_branch_entry { __u64 from; __u64 to; - __u64 mispred:1, /* target mispredicted */ - predicted:1,/* target predicted */ - in_tx:1, /* in transaction */ - abort:1, /* transaction abort */ - cycles:16, /* cycle count to last branch */ - type:4, /* branch type */ - spec:2, /* branch speculation info */ - new_type:4, /* additional branch type */ - priv:3, /* privilege level */ - reserved:31; + __u64 mispred : 1, /* target mispredicted */ + predicted : 1, /* target predicted */ + in_tx : 1, /* in transaction */ + abort : 1, /* transaction abort */ + cycles : 16, /* cycle count to last branch */ + type : 4, /* branch type */ + spec : 2, /* branch speculation info */ + new_type : 4, /* additional branch type */ + priv : 3, /* privilege level */ + reserved : 31; }; /* Size of used info bits in struct perf_branch_entry */ #define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 union perf_sample_weight { - __u64 full; + __u64 full; #if defined(__LITTLE_ENDIAN_BITFIELD) struct { - __u32 var1_dw; - __u16 var2_w; - __u16 var3_w; + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; }; #elif defined(__BIG_ENDIAN_BITFIELD) struct { - __u16 var3_w; - __u16 var2_w; - __u32 var1_dw; + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; }; #else -#error "Unknown endianness" +# error "Unknown endianness" #endif }; From 6b7dd50210bb71291f63e16cea137a1d641940a3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 7 Jul 2025 13:17:47 -0700 Subject: [PATCH 45/89] perf/x86/intel/uncore: Support MSR portal for discovery tables commit cf002dafedd06241175e4dbce39ba90a4b75822c upstream. Starting from the Panther Lake, the discovery table mechanism is also supported in client platforms. The difference is that the portal of the global discovery table is retrieved from an MSR. The layout of discovery tables are the same as the server platforms. Factor out __parse_discovery_table() to parse discover tables. The uncore PMON is Die scope. Need to parse the discovery tables for each die. Intel-SIG: commit cf002dafedd0 perf/x86/intel/uncore: Support MSR portal for discovery tables Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250707201750.616527-2-kan.liang@linux.intel.com [Dapeng Mi: Replace rdmsrq_safe_on_cpu() with rdmsrl_safe_on_cpu()] Signed-off-by: Dapeng Mi [jz: use rdmsrl_safe_on_cpu() since rdmsrq_safe_on_cpu() not available yet] Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore_discovery.c | 87 ++++++++++++++++++------ arch/x86/events/intel/uncore_discovery.h | 3 + 2 files changed, 70 insertions(+), 20 deletions(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 571e44b496910..79656d4d46df2 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -273,32 +273,15 @@ uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore) return false; } -static int parse_discovery_table(struct pci_dev *dev, int die, - u32 bar_offset, bool *parsed, - int *ignore) +static int __parse_discovery_table(resource_size_t addr, int die, + bool *parsed, int *ignore) { struct uncore_global_discovery global; struct uncore_unit_discovery unit; void __iomem *io_addr; - resource_size_t addr; unsigned long size; - u32 val; int i; - pci_read_config_dword(dev, bar_offset, &val); - - if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64) - return -EINVAL; - - addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK); -#ifdef CONFIG_PHYS_ADDR_T_64BIT - if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { - u32 val2; - - pci_read_config_dword(dev, bar_offset + 4, &val2); - addr |= ((resource_size_t)val2) << 32; - } -#endif size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE; io_addr = ioremap(addr, size); if (!io_addr) @@ -341,7 +324,32 @@ static int parse_discovery_table(struct pci_dev *dev, int die, return 0; } -bool intel_uncore_has_discovery_tables(int *ignore) +static int parse_discovery_table(struct pci_dev *dev, int die, + u32 bar_offset, bool *parsed, + int *ignore) +{ + resource_size_t addr; + u32 val; + + pci_read_config_dword(dev, bar_offset, &val); + + if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64) + return -EINVAL; + + addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK); +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { + u32 val2; + + pci_read_config_dword(dev, bar_offset + 4, &val2); + addr |= ((resource_size_t)val2) << 32; + } +#endif + + return __parse_discovery_table(addr, die, parsed, ignore); +} + +static bool intel_uncore_has_discovery_tables_pci(int *ignore) { u32 device, val, entry_id, bar_offset; int die, dvsec = 0, ret = true; @@ -390,6 +398,45 @@ bool intel_uncore_has_discovery_tables(int *ignore) return ret; } +static bool intel_uncore_has_discovery_tables_msr(int *ignore) +{ + unsigned long *die_mask; + bool parsed = false; + int cpu, die; + u64 base; + + die_mask = kcalloc(BITS_TO_LONGS(uncore_max_dies()), + sizeof(unsigned long), GFP_KERNEL); + if (!die_mask) + return false; + + cpus_read_lock(); + for_each_online_cpu(cpu) { + die = topology_logical_die_id(cpu); + if (__test_and_set_bit(die, die_mask)) + continue; + + if (rdmsrl_safe_on_cpu(cpu, UNCORE_DISCOVERY_MSR, &base)) + continue; + + if (!base) + continue; + + __parse_discovery_table(base, die, &parsed, ignore); + } + + cpus_read_unlock(); + + kfree(die_mask); + return parsed; +} + +bool intel_uncore_has_discovery_tables(int *ignore) +{ + return intel_uncore_has_discovery_tables_msr(ignore) || + intel_uncore_has_discovery_tables_pci(ignore); +} + void intel_uncore_clear_discovery_tables(void) { struct intel_uncore_discovery_type *type, *next; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 0e94aa7db8e7d..690f737e6837b 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +/* Store the full address of the global discovery table */ +#define UNCORE_DISCOVERY_MSR 0x201e + /* Generic device ID of a discovery table device */ #define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 /* Capability ID for a discovery table device */ From 00ae609006287d1e1e6809293a76eafd7546e164 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 7 Jul 2025 13:17:48 -0700 Subject: [PATCH 46/89] perf/x86/intel/uncore: Support customized MMIO map size commit fca24bf2b6b619770d7f1222c0284791d7766239 upstream. For a server platform, the MMIO map size is always 0x4000. However, a client platform may have a smaller map size. Make the map size customizable. Intel-SIG: commit fca24bf2b6b6 perf/x86/intel/uncore: Support customized MMIO map size Backport CWF PMU support and dependency Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250707201750.616527-3-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore_discovery.c | 2 +- arch/x86/events/intel/uncore_snbep.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 79656d4d46df2..a543a54fd7140 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -650,7 +650,7 @@ void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) } addr = unit->addr; - box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) { pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n", type->type_id, unit->id, (unsigned long long)addr); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 8591d1183ffca..45991b6020664 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6404,6 +6404,8 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type, to_type->get_topology = from_type->get_topology; if (from_type->cleanup_mapping) to_type->cleanup_mapping = from_type->cleanup_mapping; + if (from_type->mmio_map_size) + to_type->mmio_map_size = from_type->mmio_map_size; } static struct intel_uncore_type ** From 41cd0c41b1c59b3705dcb3c17c9bcaeb3661d00b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Jul 2025 00:06:12 -0400 Subject: [PATCH 47/89] perf/x86/intel: Fix crash in icl_update_topdown_event() commit b0823d5fbacb1c551d793cbfe7af24e0d1fa45ed upstream. [ Upstream commit b0823d5fbacb1c551d793cbfe7af24e0d1fa45ed ] The perf_fuzzer found a hard-lockup crash on a RaptorLake machine: Oops: general protection fault, maybe for address 0xffff89aeceab400: 0000 CPU: 23 UID: 0 PID: 0 Comm: swapper/23 Tainted: [W]=WARN Hardware name: Dell Inc. Precision 9660/0VJ762 RIP: 0010:native_read_pmc+0x7/0x40 Code: cc e8 8d a9 01 00 48 89 03 5b cd cc cc cc cc 0f 1f ... RSP: 000:fffb03100273de8 EFLAGS: 00010046 .... Call Trace: icl_update_topdown_event+0x165/0x190 ? ktime_get+0x38/0xd0 intel_pmu_read_event+0xf9/0x210 __perf_event_read+0xf9/0x210 CPUs 16-23 are E-core CPUs that don't support the perf metrics feature. The icl_update_topdown_event() should not be invoked on these CPUs. It's a regression of commit: f9bdf1f95339 ("perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample read") The bug introduced by that commit is that the is_topdown_event() function is mistakenly used to replace the is_topdown_count() call to check if the topdown functions for the perf metrics feature should be invoked. Fix it. Intel-SIG: commit b0823d5fbacb perf/x86/intel: Fix crash in icl_update_topdown_event() Backport CWF PMU support and dependency Fixes: f9bdf1f95339 ("perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample read") Closes: https://lore.kernel.org/lkml/352f0709-f026-cd45-e60c-60dfd97f73f3@maine.edu/ Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Vince Weaver Cc: stable@vger.kernel.org # v6.15+ Link: https://lore.kernel.org/r/20250612143818.2889040-1-kan.liang@linux.intel.com [ omitted PEBS check ] Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 498143b312fa2..53da2672ce4bb 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2811,7 +2811,7 @@ static void intel_pmu_read_event(struct perf_event *event) * If the PEBS counters snapshotting is enabled, * the topdown event is available in PEBS records. */ - if (is_topdown_event(event) && !is_pebs_counter_event_group(event)) + if (is_topdown_count(event) && !is_pebs_counter_event_group(event)) static_call(intel_pmu_update_topdown_event)(event, NULL); else intel_pmu_drain_pebs_buffer(); From e4a76125c67310f58226558dc6908761fcc0705c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 8 May 2025 16:44:52 +0300 Subject: [PATCH 48/89] perf/x86/intel: Fix segfault with PEBS-via-PT with sample_freq commit 99bcd91fabada0dbb1d5f0de44532d8008db93c6 upstream. Currently, using PEBS-via-PT with a sample frequency instead of a sample period, causes a segfault. For example: BUG: kernel NULL pointer dereference, address: 0000000000000195 ? __die_body.cold+0x19/0x27 ? page_fault_oops+0xca/0x290 ? exc_page_fault+0x7e/0x1b0 ? asm_exc_page_fault+0x26/0x30 ? intel_pmu_pebs_event_update_no_drain+0x40/0x60 ? intel_pmu_pebs_event_update_no_drain+0x32/0x60 intel_pmu_drain_pebs_icl+0x333/0x350 handle_pmi_common+0x272/0x3c0 intel_pmu_handle_irq+0x10a/0x2e0 perf_event_nmi_handler+0x2a/0x50 That happens because intel_pmu_pebs_event_update_no_drain() assumes all the pebs_enabled bits represent counter indexes, which is not always the case. In this particular case, bits 60 and 61 are set for PEBS-via-PT purposes. The behaviour of PEBS-via-PT with sample frequency is questionable because although a PMI is generated (PEBS_PMI_AFTER_EACH_RECORD), the period is not adjusted anyway. Putting that aside, fix intel_pmu_pebs_event_update_no_drain() by passing the mask of counter bits instead of 'size'. Note, prior to the Fixes commit, 'size' would be limited to the maximum counter index, so the issue was not hit. Intel-SIG: commit 99bcd91fabad perf/x86/intel: Fix segfault with PEBS-via-PT with sample_freq Backport CWF PMU support and dependency Fixes: 722e42e45c2f1 ("perf/x86: Support counter mask") Signed-off-by: Adrian Hunter Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Ian Rogers Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20250508134452.73960-1-adrian.hunter@intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index cd8634165bdee..332b5fc9dd22d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2428,8 +2428,9 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_ setup_pebs_fixed_sample_data); } -static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) +static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, u64 mask) { + u64 pebs_enabled = cpuc->pebs_enabled & mask; struct perf_event *event; int bit; @@ -2440,7 +2441,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int * It needs to call intel_pmu_save_and_restart_reload() to * update the event->count for this case. */ - for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) { + for_each_set_bit(bit, (unsigned long *)&pebs_enabled, X86_PMC_IDX_MAX) { event = cpuc->events[bit]; if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_save_and_restart_reload(event, 0); @@ -2475,7 +2476,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, size); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } @@ -2589,7 +2590,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } From 1178697fe23e76af716e4f3cc3d8709ce109332b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:18 -0700 Subject: [PATCH 49/89] perf/x86/intel/ds: Fix counter backwards of non-precise events counters-snapshotting commit 7da9960b59fb7e590eb8538c9428db55a4ea2d23 upstream. The counter backwards may be observed in the PMI handler when counters-snapshotting some non-precise events in the freq mode. For the non-precise events, it's possible the counters-snapshotting records a positive value for an overflowed PEBS event. Then the HW auto-reload mechanism reset the counter to 0 immediately. Because the pebs_event_reset is cleared in the freq mode, which doesn't set the PERF_X86_EVENT_AUTO_RELOAD. In the PMI handler, 0 will be read rather than the positive value recorded in the counters-snapshotting record. The counters-snapshotting case has to be specially handled. Since the event value has been updated when processing the counters-snapshotting record, only needs to set the new period for the counter via x86_pmu_set_period(). Intel-SIG: commit 7da9960b59fb perf/x86/intel/ds: Fix counter backwards of non-precise events counters-snapshotting Backport CWF PMU support and dependency Fixes: e02e9b0374c3 ("perf/x86/intel: Support PEBS counters snapshotting") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-6-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 332b5fc9dd22d..36b2a9dd8399e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2359,8 +2359,25 @@ __intel_pmu_pebs_last_event(struct perf_event *event, */ intel_pmu_save_and_restart_reload(event, count); } - } else - intel_pmu_save_and_restart(event); + } else { + /* + * For a non-precise event, it's possible the + * counters-snapshotting records a positive value for the + * overflowed event. Then the HW auto-reload mechanism + * reset the counter to 0 immediately, because the + * pebs_event_reset is cleared if the PERF_X86_EVENT_AUTO_RELOAD + * is not set. The counter backwards may be observed in a + * PMI handler. + * + * Since the event value has been updated when processing the + * counters-snapshotting record, only needs to set the new + * period for the counter. + */ + if (is_pebs_counter_event_group(event)) + static_call(x86_pmu_set_period)(event); + else + intel_pmu_save_and_restart(event); + } } static __always_inline void From fb394b66d58b11fb9120f6ccb1108e70b18b8d90 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 19 Feb 2025 06:10:05 -0800 Subject: [PATCH 50/89] perf/x86/intel: Fix event constraints for LNC commit 782cffeec9ad96daa64ffb2d527b2a052fb02552 upstream. According to the latest event list, update the event constraint tables for Lion Cove core. The general rule (the event codes < 0x90 are restricted to counters 0-3.) has been removed. There is no restriction for most of the performance monitoring events. Intel-SIG: commit 782cffeec9ad perf/x86/intel: Fix event constraints for LNC Backport CWF PMU support and dependency Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Reported-by: Amiri Khalil Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250219141005.2446823-1-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 20 +++++++------------- arch/x86/events/intel/ds.c | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 53da2672ce4bb..78bbc648e1eee 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -387,34 +387,28 @@ static struct event_constraint intel_lnc_event_constraints[] = { METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + INTEL_EVENT_CONSTRAINT(0x20, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x012a, 0xf), + INTEL_UEVENT_CONSTRAINT(0x012b, 0xf), INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), INTEL_UEVENT_CONSTRAINT(0x0175, 0x4), INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff), INTEL_EVENT_CONSTRAINT(0x3c, 0x3ff), - /* - * Generally event codes < 0x90 are restricted to counters 0-3. - * The 0x2E and 0x3C are exception, which has no restriction. - */ - INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), - INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), - INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x10a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8), + INTEL_UEVENT_CONSTRAINT(0x01cd, 0x3fc), INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3), - INTEL_EVENT_CONSTRAINT(0xce, 0x1), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), - /* - * Generally event codes >= 0x90 are likely to have no restrictions. - * The exception are defined as above. - */ - INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0x3ff), + + INTEL_UEVENT_CONSTRAINT(0x00e0, 0xf), EVENT_CONSTRAINT_END }; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 36b2a9dd8399e..3010a90a339a4 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1177,7 +1177,7 @@ struct event_constraint intel_lnc_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), - INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3ff), + INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3fc), INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ From 892ce0a105468a0bdcfbe01a3f8e94717142b2b3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 16 Dec 2024 08:02:52 -0800 Subject: [PATCH 51/89] perf/x86/intel: Fix bitmask of OCR and FRONTEND events for LNC commit aa5d2ca7c179c40669edb5e96d931bf9828dea3d upstream. The released OCR and FRONTEND events utilized more bits on Lunar Lake p-core. The corresponding mask in the extra_regs has to be extended to unblock the extra bits. Add a dedicated intel_lnc_extra_regs. Intel-SIG: commit aa5d2ca7c179 perf/x86/intel: Fix bitmask of OCR and FRONTEND events for LNC Backport CWF PMU support and dependency Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Reported-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20241216160252.430858-1-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 78bbc648e1eee..0a54896b62420 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -413,6 +413,16 @@ static struct event_constraint intel_lnc_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_lnc_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0xfffffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0xfffffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0xf, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); @@ -6748,7 +6758,7 @@ static __always_inline void intel_pmu_init_lnc(struct pmu *pmu) intel_pmu_init_glc(pmu); hybrid(pmu, event_constraints) = intel_lnc_event_constraints; hybrid(pmu, pebs_constraints) = intel_lnc_pebs_event_constraints; - hybrid(pmu, extra_regs) = intel_rwc_extra_regs; + hybrid(pmu, extra_regs) = intel_lnc_extra_regs; } static __always_inline void intel_pmu_init_skt(struct pmu *pmu) From 5ed5827e45f4c6f1ec052fce4ef5da346ccbbe05 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Thu, 29 May 2025 08:02:36 +0000 Subject: [PATCH 52/89] perf/x86/intel: Fix incorrect MSR index calculations in intel_pmu_config_acr() commit 86aa94cd50b138be0dd872b0779fa3036e641881 upstream. The MSR offset calculations in intel_pmu_config_acr() are buggy. To calculate fixed counter MSR addresses in intel_pmu_config_acr(), the HW counter index "idx" is subtracted by INTEL_PMC_IDX_FIXED. This leads to the ACR mask value of fixed counters to be incorrectly saved to the positions of GP counters in acr_cfg_b[], e.g. For fixed counter 0, its ACR counter mask should be saved to acr_cfg_b[32], but it's saved to acr_cfg_b[0] incorrectly. Fix this issue. [ mingo: Clarified & improved the changelog. ] Intel-SIG: commit 86aa94cd50b1 perf/x86/intel: Fix incorrect MSR index calculations in intel_pmu_config_acr() Backport CWF PMU support and dependency Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Signed-off-by: Dapeng Mi Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250529080236.2552247-2-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0a54896b62420..8d4dd547de90a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2889,6 +2889,7 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int msr_b, msr_c; + int msr_offset; if (!mask && !cpuc->acr_cfg_b[idx]) return; @@ -2896,19 +2897,20 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) if (idx < INTEL_PMC_IDX_FIXED) { msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + msr_offset = x86_pmu.addr_offset(idx, false); } else { msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; - idx -= INTEL_PMC_IDX_FIXED; + msr_offset = x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false); } if (cpuc->acr_cfg_b[idx] != mask) { - wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + wrmsrl(msr_b + msr_offset, mask); cpuc->acr_cfg_b[idx] = mask; } /* Only need to update the reload value when there is a valid config value. */ if (mask && cpuc->acr_cfg_c[idx] != reload) { - wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + wrmsrl(msr_c + msr_offset, reload); cpuc->acr_cfg_c[idx] = reload; } } From e8fc242be577bbe66734c510f316ded5ef54b77c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 53/89] tools/include: Sync x86 headers with the kernel sources commit f6d9883f8e680460be4714d4d35c7acac1dffeaf upstream. To pick up changes from: 149fd4712bcd perf/x86/intel: Support Perfmon MSRs aliasing 21b362cc762a x86/resctrl: Enable shared RMID mode on Sub-NUMA Cluster (SNC) systems 4f460bff7b6a cpufreq: acpi: move MSR_K7_HWCR_CPB_DIS_BIT into msr-index.h 7ea81936b853 x86/cpufeatures: Add HWP highest perf change feature flag 78ce84b9e0a5 x86/cpufeatures: Flip the /proc/cpuinfo appearance logic 1beb348d5c7f x86/sev: Provide SVSM discovery support This should be used to beautify x86 syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Intel-SIG: commit f6d9883f8e68 tools/include: Sync x86 headers with the kernel sources Backport CWF PMU support and dependency Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Conflicts: tools/arch/x86/include/asm/cpufeatures.h tools/arch/x86/include/asm/msr-index.h [jz: only adapt changes in following 3 commits: 149fd4712bcd ("perf/x86/intel: Support Perfmon MSRs aliasing") 21b362cc762a ("x86/resctrl: Enable shared RMID mode on Sub-NUMA Cluster (SNC) systems") 7ea81936b853 ("x86/cpufeatures: Add HWP highest perf change feature flag") because other commits are not backported in velinux] Signed-off-by: Jason Zeng --- tools/arch/x86/include/asm/cpufeatures.h | 1 + tools/arch/x86/include/asm/msr-index.h | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 8f52bdaf99e92..715034e7dce00 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -354,6 +354,7 @@ #define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ +#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* "" HWP Highest perf change */ #define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 1d111350197f3..2e690cd8c948f 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -535,6 +535,12 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* V6 PMON MSR range */ +#define MSR_IA32_PMC_V6_GP0_CTR 0x1900 +#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_STEP 4 + /* * AMD64 MSRs. Not complete. See the architecture manual for a more * complete list. @@ -1100,6 +1106,7 @@ #define MSR_IA32_QM_CTR 0xc8e #define MSR_IA32_PQR_ASSOC 0xc8f #define MSR_IA32_L3_CBM_BASE 0xc90 +#define MSR_RMID_SNC_CONFIG 0xca0 #define MSR_IA32_L2_CBM_BASE 0xd10 #define MSR_IA32_MBA_THRTL_BASE 0xd50 From d0b461ca4496dc15734d26cafd845b9bfabb0932 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 22:28:49 +0200 Subject: [PATCH 54/89] x86/msr: Standardize on u64 in commit f4138de5e41fae1a0b406f0d354a3028dc46bf1f upstream. Also fix some nearby whitespace damage while at it. Intel-SIG: commit f4138de5e41f x86/msr: Standardize on u64 in Backport CWF PMU support and dependency Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: Juergen Gross Cc: H. Peter Anvin Cc: Dave Hansen Cc: Xin Li Cc: Linus Torvalds Signed-off-by: Jason Zeng --- arch/x86/include/asm/msr-index.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f78b1b7d8c1ee..698f06f572c30 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -501,7 +501,7 @@ #define MSR_HWP_CAPABILITIES 0x00000771 #define MSR_HWP_REQUEST_PKG 0x00000772 #define MSR_HWP_INTERRUPT 0x00000773 -#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_REQUEST 0x00000774 #define MSR_HWP_STATUS 0x00000777 /* CPUID.6.EAX */ @@ -518,16 +518,16 @@ #define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) /* IA32_HWP_REQUEST */ -#define HWP_MIN_PERF(x) (x & 0xff) -#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) #define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) -#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24) +#define HWP_ENERGY_PERF_PREFERENCE(x) (((u64)x & 0xff) << 24) #define HWP_EPP_PERFORMANCE 0x00 #define HWP_EPP_BALANCE_PERFORMANCE 0x80 #define HWP_EPP_BALANCE_POWERSAVE 0xC0 #define HWP_EPP_POWERSAVE 0xFF -#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32) -#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42) +#define HWP_ACTIVITY_WINDOW(x) ((u64)(x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((u64)(x & 0x1) << 42) /* IA32_HWP_STATUS */ #define HWP_GUARANTEED_CHANGE(x) (x & 0x1) From b94badb9aa8ded6e07ae8eed743cb30fcbfb7669 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 17:58:05 -0300 Subject: [PATCH 55/89] tools arch x86: Sync the msr-index.h copy with the kernel sources commit 6143374c6dc874d301dc16612aed144bf4544ba upstream. To pick up the changes from these csets: 159013a7ca18c271 ("x86/its: Enumerate Indirect Target Selection (ITS) bug") f4138de5e41fae1a ("x86/msr: Standardize on u64 in ") ec980e4facef8110 ("perf/x86/intel: Support auto counter reload") That cause no changes to tooling as it doesn't include a new MSR to be captured by the tools/perf/trace/beauty/tracepoints/x86_msr.sh script. Just silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Intel-SIG: commit 6143374c6dc8 tools arch x86: Sync the msr-index.h copy with the kernel sources Backport CWF PMU support and dependency Cc: Adrian Hunter Cc: Dave Hansen Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Pawan Gupta Cc: Peter Zijlstra Link: https://lore.kernel.org/r/aEtAUg83OQGx8Kay@x1 Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Jason Zeng --- tools/arch/x86/include/asm/msr-index.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 2e690cd8c948f..f7c5d52aaf537 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -469,7 +469,7 @@ #define MSR_HWP_CAPABILITIES 0x00000771 #define MSR_HWP_REQUEST_PKG 0x00000772 #define MSR_HWP_INTERRUPT 0x00000773 -#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_REQUEST 0x00000774 #define MSR_HWP_STATUS 0x00000777 /* CPUID.6.EAX */ @@ -486,16 +486,16 @@ #define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) /* IA32_HWP_REQUEST */ -#define HWP_MIN_PERF(x) (x & 0xff) -#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) #define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) -#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24) +#define HWP_ENERGY_PERF_PREFERENCE(x) (((u64)x & 0xff) << 24) #define HWP_EPP_PERFORMANCE 0x00 #define HWP_EPP_BALANCE_PERFORMANCE 0x80 #define HWP_EPP_BALANCE_POWERSAVE 0xC0 #define HWP_EPP_POWERSAVE 0xFF -#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32) -#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42) +#define HWP_ACTIVITY_WINDOW(x) ((u64)(x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((u64)(x & 0x1) << 42) /* IA32_HWP_STATUS */ #define HWP_GUARANTEED_CHANGE(x) (x & 0x1) @@ -538,7 +538,11 @@ /* V6 PMON MSR range */ #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 #define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 #define MSR_IA32_PMC_V6_STEP 4 /* From f342d34c4c4e6cec733d2fa47ea2b92c8ab29de9 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:27 +0800 Subject: [PATCH 56/89] perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error commit 43796f30507802d93ead2dc44fc9637f34671a89 upstream. When running perf_fuzzer on PTL, sometimes the below "unchecked MSR access error" is seen when accessing IA32_PMC_x_CFG_B MSRs. [ 55.611268] unchecked MSR access error: WRMSR to 0x1986 (tried to write 0x0000000200000001) at rIP: 0xffffffffac564b28 (native_write_msr+0x8/0x30) [ 55.611280] Call Trace: [ 55.611282] [ 55.611284] ? intel_pmu_config_acr+0x87/0x160 [ 55.611289] intel_pmu_enable_acr+0x6d/0x80 [ 55.611291] intel_pmu_enable_event+0xce/0x460 [ 55.611293] x86_pmu_start+0x78/0xb0 [ 55.611297] x86_pmu_enable+0x218/0x3a0 [ 55.611300] ? x86_pmu_enable+0x121/0x3a0 [ 55.611302] perf_pmu_enable+0x40/0x50 [ 55.611307] ctx_resched+0x19d/0x220 [ 55.611309] __perf_install_in_context+0x284/0x2f0 [ 55.611311] ? __pfx_remote_function+0x10/0x10 [ 55.611314] remote_function+0x52/0x70 [ 55.611317] ? __pfx_remote_function+0x10/0x10 [ 55.611319] generic_exec_single+0x84/0x150 [ 55.611323] smp_call_function_single+0xc5/0x1a0 [ 55.611326] ? __pfx_remote_function+0x10/0x10 [ 55.611329] perf_install_in_context+0xd1/0x1e0 [ 55.611331] ? __pfx___perf_install_in_context+0x10/0x10 [ 55.611333] __do_sys_perf_event_open+0xa76/0x1040 [ 55.611336] __x64_sys_perf_event_open+0x26/0x30 [ 55.611337] x64_sys_call+0x1d8e/0x20c0 [ 55.611339] do_syscall_64+0x4f/0x120 [ 55.611343] entry_SYSCALL_64_after_hwframe+0x76/0x7e On PTL, GP counter 0 and 1 doesn't support auto counter reload feature, thus it would trigger a #GP when trying to write 1 on bit 0 of CFG_B MSR which requires to enable auto counter reload on GP counter 0. The root cause of causing this issue is the check for auto counter reload (ACR) counter mask from user space is incorrect in intel_pmu_acr_late_setup() helper. It leads to an invalid ACR counter mask from user space could be set into hw.config1 and then written into CFG_B MSRs and trigger the MSR access warning. e.g., User may create a perf event with ACR counter mask (config2=0xcb), and there is only 1 event created, so "cpuc->n_events" is 1. The correct check condition should be "i + idx >= cpuc->n_events" instead of "i + idx > cpuc->n_events" (it looks a typo). Otherwise, the counter mask would traverse twice and an invalid "cpuc->assign[1]" bit (bit 0) is set into hw.config1 and cause MSR accessing error. Besides, also check if the ACR counter mask corresponding events are ACR events. If not, filter out these counter mask. If a event is not a ACR event, it could be scheduled to an HW counter which doesn't support ACR. It's invalid to add their counter index in ACR counter mask. Furthermore, remove the WARN_ON_ONCE() since it's easily triggered as user could set any invalid ACR counter mask and the warning message could mislead users. Intel-SIG: commit 43796f305078 perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error Backport CWF PMU support and dependency Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250820023032.17128-3-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8d4dd547de90a..a1c960038a58c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2986,7 +2986,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) if (event->group_leader != leader->group_leader) break; for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { - if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + if (i + idx >= cpuc->n_events || + !is_acr_event_group(cpuc->event_list[i + idx])) return; __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); } From 107aaf576db31718a9ba877f5b6cd4fc2f3dbfd1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Nov 2023 18:28:49 -0800 Subject: [PATCH 57/89] KVM: x86/pmu: Add common define to capture fixed counters offset commit be6b067dae1573cf4d53c8b08175d8872d82f030 upstream. Add a common define to "officially" solidify KVM's split of counters, i.e. to commit to using bits 31:0 to track general purpose counters and bits 63:32 to track fixed counters (which only Intel supports). KVM already bleeds this behavior all over common PMU code, and adding a KVM- defined macro allows clarifying that the value is a _base_, as oppposed to the _flag_ that is used to access fixed PMCs via RDPMC (which perf confusingly calls INTEL_PMC_FIXED_RDPMC_BASE). No functional change intended. Intel-SIG: commit be6b067dae15 KVM: x86/pmu: Add common define to capture fixed counters offset Backport CWF PMU support and dependency Link: https://lore.kernel.org/r/20231110022857.1273836-3-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Jason Zeng --- arch/x86/kvm/pmu.c | 8 ++++---- arch/x86/kvm/pmu.h | 4 +++- arch/x86/kvm/vmx/pmu_intel.c | 12 ++++++------ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index da2d82e3a8735..60006554d8468 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -67,7 +67,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { * all perf counters (both gp and fixed). The mapping relationship * between pmc and perf counters is as the following: * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters - * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed + * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters */ @@ -362,7 +362,7 @@ static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f, static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, int idx) { - int fixed_idx = idx - INTEL_PMC_IDX_FIXED; + int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX; if (filter->action == KVM_PMU_EVENT_DENY && test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) @@ -416,7 +416,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) { fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED); + pmc->idx - KVM_FIXED_PMC_BASE_IDX); if (fixed_ctr_ctrl & 0x1) eventsel |= ARCH_PERFMON_EVENTSEL_OS; if (fixed_ctr_ctrl & 0x2) @@ -798,7 +798,7 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user = config & ARCH_PERFMON_EVENTSEL_USR; } else { config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED); + pmc->idx - KVM_FIXED_PMC_BASE_IDX); select_os = config & 0x1; select_user = config & 0x2; } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index a46aa9b25150f..9da30eb413304 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -18,6 +18,8 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 +#define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED + struct kvm_pmu_ops { bool (*hw_event_available)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); @@ -153,7 +155,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3; return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 48a2f77f62ef3..6bd8b515d922e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -84,18 +84,18 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); - __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); + __set_bit(KVM_FIXED_PMC_BASE_IDX + i, pmu->pmc_in_use); kvm_pmu_request_counter_reprogram(pmc); } } static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { - if (pmc_idx < INTEL_PMC_IDX_FIXED) { + if (pmc_idx < KVM_FIXED_PMC_BASE_IDX) { return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, MSR_P6_EVNTSEL0); } else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + u32 idx = pmc_idx - KVM_FIXED_PMC_BASE_IDX; return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); } @@ -539,7 +539,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) for (i = 0; i < pmu->nr_arch_fixed_counters; i++) pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); pmu->global_ctrl_mask = counter_mask; /* @@ -583,7 +583,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { pmu->fixed_ctr_ctrl_mask &= - ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); } pmu->pebs_data_cfg_mask = ~0xff00000full; } else { @@ -609,7 +609,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; pmu->fixed_counters[i].current_config = 0; } From a315112de5419175799babf99a2261063777c3c8 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 30 Apr 2024 08:52:38 +0800 Subject: [PATCH 58/89] KVM: x86/pmu: Change ambiguous _mask suffix to _rsvd in kvm_pmu commit 0e102ce3d4133194a26060fe987315133736c37b upstream. Several '_mask' suffixed variables such as, global_ctrl_mask, are defined in kvm_pmu structure. However the _mask suffix is ambiguous and misleading since it's not a real mask with positive logic. On the contrary it represents the reserved bits of corresponding MSRs and these bits should not be accessed. Intel-SIG: commit 0e102ce3d413 KVM: x86/pmu: Change ambiguous _mask suffix to _rsvd in kvm_pmu Backport CWF PMU support and dependency Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Link: https://lore.kernel.org/r/20240430005239.13527-2-dapeng1.mi@linux.intel.com Signed-off-by: Sean Christopherson Signed-off-by: Jason Zeng --- arch/x86/include/asm/kvm_host.h | 10 +++++----- arch/x86/kvm/pmu.c | 16 ++++++++-------- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/svm/pmu.c | 4 ++-- arch/x86/kvm/vmx/pmu_intel.c | 26 +++++++++++++------------- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5dfb8cc9616e5..cf0446916d2a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -518,12 +518,12 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; - u64 fixed_ctr_ctrl_mask; + u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; - u64 global_ctrl_mask; - u64 global_status_mask; + u64 global_ctrl_rsvd; + u64 global_status_rsvd; u64 reserved_bits; u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; @@ -543,9 +543,9 @@ struct kvm_pmu { u64 ds_area; u64 pebs_enable; - u64 pebs_enable_mask; + u64 pebs_enable_rsvd; u64 pebs_data_cfg; - u64 pebs_data_cfg_mask; + u64 pebs_data_cfg_rsvd; /* * If a guest counter is cross-mapped to host counter with different diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 60006554d8468..17a96a2ba1a9c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -619,13 +619,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) break; - if (data & pmu->global_status_mask) + if (data & pmu->global_status_rsvd) return 1; pmu->global_status = data; break; case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: - data &= ~pmu->global_ctrl_mask; + data &= ~pmu->global_ctrl_rsvd; fallthrough; case MSR_CORE_PERF_GLOBAL_CTRL: if (!kvm_valid_perf_global_ctrl(pmu, data)) @@ -642,7 +642,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in * GLOBAL_STATUS, and so the set of reserved bits is the same. */ - if (data & pmu->global_status_mask) + if (data & pmu->global_status_rsvd) return 1; fallthrough; case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: @@ -709,11 +709,11 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; - pmu->global_ctrl_mask = ~0ull; - pmu->global_status_mask = ~0ull; - pmu->fixed_ctr_ctrl_mask = ~0ull; - pmu->pebs_enable_mask = ~0ull; - pmu->pebs_data_cfg_mask = ~0ull; + pmu->global_ctrl_rsvd = ~0ull; + pmu->global_status_rsvd = ~0ull; + pmu->fixed_ctr_ctrl_rsvd = ~0ull; + pmu->pebs_enable_rsvd = ~0ull; + pmu->pebs_data_cfg_rsvd = ~0ull; bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); if (!vcpu->kvm->arch.enable_pmu) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 9da30eb413304..65d659ddea88f 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -95,7 +95,7 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc) static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, u64 data) { - return !(pmu->global_ctrl_mask & data); + return !(pmu->global_ctrl_rsvd & data); } /* returns general purpose PMC with the specified MSR. Note that it can be diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 3fd47de14b38a..0ec0e95dabe7b 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -204,8 +204,8 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); if (pmu->version > 1) { - pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); - pmu->global_status_mask = pmu->global_ctrl_mask; + pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_rsvd = pmu->global_ctrl_rsvd; } pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 6bd8b515d922e..bc32926664a36 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -398,14 +398,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (data & pmu->fixed_ctr_ctrl_mask) + if (data & pmu->fixed_ctr_ctrl_rsvd) return 1; if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); break; case MSR_IA32_PEBS_ENABLE: - if (data & pmu->pebs_enable_mask) + if (data & pmu->pebs_enable_rsvd) return 1; if (pmu->pebs_enable != data) { @@ -421,7 +421,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmu->ds_area = data; break; case MSR_PEBS_DATA_CFG: - if (data & pmu->pebs_data_cfg_mask) + if (data & pmu->pebs_data_cfg_rsvd) return 1; pmu->pebs_data_cfg = data; @@ -490,7 +490,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) union cpuid10_eax eax; union cpuid10_edx edx; u64 perf_capabilities; - u64 counter_mask; + u64 counter_rsvd; int i; memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); @@ -537,21 +537,21 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) - pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); - counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + pmu->fixed_ctr_ctrl_rsvd &= ~(0xbull << (i * 4)); + counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); - pmu->global_ctrl_mask = counter_mask; + pmu->global_ctrl_rsvd = counter_rsvd; /* * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) * share reserved bit definitions. The kernel just happens to use * OVF_CTRL for the names. */ - pmu->global_status_mask = pmu->global_ctrl_mask + pmu->global_status_rsvd = pmu->global_ctrl_rsvd & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_status_mask &= + pmu->global_status_rsvd &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); @@ -579,15 +579,15 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { - pmu->pebs_enable_mask = counter_mask; + pmu->pebs_enable_rsvd = counter_rsvd; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - pmu->fixed_ctr_ctrl_mask &= + pmu->fixed_ctr_ctrl_rsvd &= ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); } - pmu->pebs_data_cfg_mask = ~0xff00000full; + pmu->pebs_data_cfg_rsvd = ~0xff00000full; } else { - pmu->pebs_enable_mask = + pmu->pebs_enable_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); } } From 8f60dcc30f8d52e479ed9e74c1cbcce69a217273 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 30 Apr 2024 08:52:39 +0800 Subject: [PATCH 59/89] KVM: x86/pmu: Manipulate FIXED_CTR_CTRL MSR with macros commit 75430c412a3139c29404459ab1216a07d1280428 upstream. Magic numbers are used to manipulate the bit fields of FIXED_CTR_CTRL MSR. This makes reading code become difficult, so use pre-defined macros to replace these magic numbers. Intel-SIG: commit 75430c412a31 KVM: x86/pmu: Manipulate FIXED_CTR_CTRL MSR with macros Backport CWF PMU support and dependency Signed-off-by: Dapeng Mi Link: https://lore.kernel.org/r/20240430005239.13527-3-dapeng1.mi@linux.intel.com [sean: drop unnecessary curly braces] Signed-off-by: Sean Christopherson Signed-off-by: Jason Zeng --- arch/x86/kvm/pmu.c | 10 +++++----- arch/x86/kvm/pmu.h | 6 ++++-- arch/x86/kvm/vmx/pmu_intel.c | 12 ++++++++---- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 17a96a2ba1a9c..a0a83112d6866 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -417,11 +417,11 @@ static void reprogram_counter(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) { fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, pmc->idx - KVM_FIXED_PMC_BASE_IDX); - if (fixed_ctr_ctrl & 0x1) + if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL) eventsel |= ARCH_PERFMON_EVENTSEL_OS; - if (fixed_ctr_ctrl & 0x2) + if (fixed_ctr_ctrl & INTEL_FIXED_0_USER) eventsel |= ARCH_PERFMON_EVENTSEL_USR; - if (fixed_ctr_ctrl & 0x8) + if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI) eventsel |= ARCH_PERFMON_EVENTSEL_INT; new_config = (u64)fixed_ctr_ctrl; } @@ -799,8 +799,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) } else { config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, pmc->idx - KVM_FIXED_PMC_BASE_IDX); - select_os = config & 0x1; - select_user = config & 0x2; + select_os = config & INTEL_FIXED_0_KERNEL; + select_user = config & INTEL_FIXED_0_USER; } return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 65d659ddea88f..aca521bd9f492 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -12,7 +12,8 @@ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) +#define fixed_ctrl_field(ctrl_reg, idx) \ + (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) #define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000 #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 @@ -155,7 +156,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3; + pmc->idx - KVM_FIXED_PMC_BASE_IDX) & + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER); return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index bc32926664a36..841476da9bcf4 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -537,7 +537,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) - pmu->fixed_ctr_ctrl_rsvd &= ~(0xbull << (i * 4)); + pmu->fixed_ctr_ctrl_rsvd &= + ~intel_fixed_bits_by_idx(i, + INTEL_FIXED_0_KERNEL | + INTEL_FIXED_0_USER | + INTEL_FIXED_0_ENABLE_PMI); + counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); pmu->global_ctrl_rsvd = counter_rsvd; @@ -581,10 +586,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { pmu->pebs_enable_rsvd = counter_rsvd; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) pmu->fixed_ctr_ctrl_rsvd &= - ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); - } + ~intel_fixed_bits_by_idx(i, ICL_FIXED_0_ADAPTIVE); pmu->pebs_data_cfg_rsvd = ~0xff00000full; } else { pmu->pebs_enable_rsvd = From dec6610ea9f6dc3be9f42141d506bf65ba90046d Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:31 +0800 Subject: [PATCH 60/89] perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK commit 2676dbf9f4fb7f6739d1207c0f1deaf63124642a ICL_FIXED_0_ADAPTIVE is missed to be added into INTEL_FIXED_BITS_MASK, add it. With help of this new INTEL_FIXED_BITS_MASK, intel_pmu_enable_fixed() can be optimized. The old fixed counter control bits can be unconditionally cleared with INTEL_FIXED_BITS_MASK and then set new control bits base on new configuration. Intel-SIG: commit 2676dbf9f4fb perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK Backport CWF PMU support and dependency Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Tested-by: Yi Lai Link: https://lore.kernel.org/r/20250820023032.17128-7-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 10 +++------- arch/x86/include/asm/perf_event.h | 6 +++++- arch/x86/kvm/pmu.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a1c960038a58c..17bb6087766a5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2834,8 +2834,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 mask, bits = 0; int idx = hwc->idx; + u64 bits = 0; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2874,14 +2874,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event) idx -= INTEL_PMC_IDX_FIXED; bits = intel_fixed_bits_by_idx(idx, bits); - mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); - - if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - } - cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); cpuc->fixed_ctrl_val |= bits; } diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 2a7d0742403bd..87b9cc9e47ca8 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -35,7 +35,6 @@ #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) -#define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 #define INTEL_FIXED_0_KERNEL (1ULL << 0) #define INTEL_FIXED_0_USER (1ULL << 1) @@ -48,6 +47,11 @@ #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) +#define INTEL_FIXED_BITS_MASK \ + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ + INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ + ICL_FIXED_0_ADAPTIVE) + #define intel_fixed_bits_by_idx(_idx, _bits) \ ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index aca521bd9f492..004121dfddd64 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -11,7 +11,7 @@ #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) -/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) \ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) From 86693936d815f8283da011661cc9e1c942638431 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:32 +0800 Subject: [PATCH 61/89] perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap() commit f49e1be19542487921e82b29004908966cb99d7c upstream. Along with the introduction Perfmon v6, pmu counters could be incontinuous, like fixed counters on CWF, only fixed counters 0-3 and 5-7 are supported, there is no fixed counter 4 on CWF. To accommodate this change, archPerfmonExt CPUID (0x23) leaves are introduced to enumerate the true-view of counters bitmap. Current perf code already supports archPerfmonExt CPUID and uses counters-bitmap to enumerate HW really supported counters, but x86_pmu_show_pmu_cap() still only dumps the absolute counter number instead of true-view bitmap, it's out-dated and may mislead readers. So dump counters true-view bitmap in x86_pmu_show_pmu_cap() and opportunistically change the dump sequence and words. Intel-SIG: commit f49e1be19542 perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap() Backport CWF PMU support and dependency Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250820023032.17128-8-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 84f8c0bc15a09..14663df555268 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2058,13 +2058,15 @@ static void _x86_pmu_read(struct perf_event *event) void x86_pmu_show_pmu_cap(struct pmu *pmu) { - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); - pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu)); + pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64)); + pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu)); + pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64)); + pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016llx\n", x86_pmu.max_period); + pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) From 011ff3c5b0d0817612ce668778d3493c563992b3 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Thu, 26 Sep 2024 09:40:40 -0500 Subject: [PATCH 62/89] perf mem: Fix printing PERF_MEM_LVLNUM_{L2_MHB|MSC} commit 4f23fc34cc68812c68c3a3dec15e26e87565f430 upstream. With commit 8ec9497d3ef34 ("tools/include: Sync uapi/linux/perf.h with the kernel sources"), 'perf mem report' gives an incorrect memory access string. ... 0.02% 1 3644 L5 hit [.] 0x0000000000009b0e mlc [.] 0x00007fce43f59480 ... This occurs because, if no entry exists in mem_lvlnum, perf_mem__lvl_scnprintf will default to 'L%d, lvl', which in this case for PERF_MEM_LVLNUM_L2_MHB is 0x05. Add entries for PERF_MEM_LVLNUM_L2_MHB and PERF_MEM_LVLNUM_MSC to mem_lvlnum, so that the correct strings are printed. ... 0.02% 1 3644 L2 MHB hit [.] 0x0000000000009b0e mlc [.] 0x00007fce43f59480 ... Intel-SIG: commit 4f23fc34cc68 perf mem: Fix printing PERF_MEM_LVLNUM_{L2_MHB|MSC} Backport CWF PMU support and dependency Fixes: 8ec9497d3ef34 ("tools/include: Sync uapi/linux/perf.h with the kernel sources") Suggested-by: Kan Liang Signed-off-by: Thomas Falcon Reviewed-by: Leo Yan Link: https://lore.kernel.org/r/20240926144040.77897-1-thomas.falcon@intel.com Signed-off-by: Namhyung Kim Signed-off-by: Jason Zeng --- tools/perf/util/mem-events.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 3a2e3687878c1..48acf9ec30aab 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -297,6 +297,12 @@ static const char * const mem_lvl[] = { }; static const char * const mem_lvlnum[] = { + [PERF_MEM_LVLNUM_L1] = "L1", + [PERF_MEM_LVLNUM_L2] = "L2", + [PERF_MEM_LVLNUM_L3] = "L3", + [PERF_MEM_LVLNUM_L4] = "L4", + [PERF_MEM_LVLNUM_L2_MHB] = "L2 MHB", + [PERF_MEM_LVLNUM_MSC] = "Memory-side Cache", [PERF_MEM_LVLNUM_UNC] = "Uncached", [PERF_MEM_LVLNUM_CXL] = "CXL", [PERF_MEM_LVLNUM_IO] = "I/O", @@ -379,7 +385,7 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) if (mem_lvlnum[lvl]) l += scnprintf(out + l, sz - l, mem_lvlnum[lvl]); else - l += scnprintf(out + l, sz - l, "L%d", lvl); + l += scnprintf(out + l, sz - l, "Unknown level %d", lvl); l += scnprintf(out + l, sz - l, " %s", hit_miss); return l; From 1077fb26dc4135652e41c673cfd5c5b83f8f5a89 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 15 Apr 2025 11:44:10 +0000 Subject: [PATCH 63/89] perf/x86/intel: Decouple BTS initialization from PEBS initialization commit d971342d38bf228ea4c137249501eb5be38ee958 upstream. Move x86_pmu.bts flag initialization into bts_init() from intel_ds_init() and rename intel_ds_init() to intel_pebs_init() since it fully initializes PEBS now after removing the x86_pmu.bts initialization. It's safe to move x86_pmu.bts into bts_init() since all x86_pmu.bts flag are called after bts_init() execution. Intel-SIG: commit d971342d38bf perf/x86/intel: Decouple BTS initialization from PEBS initialization Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20250415114428.341182-5-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/bts.c | 6 +++++- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/ds.c | 5 ++--- arch/x86/events/perf_event.h | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 974e917e65b24..d3a4b6630368c 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -587,7 +587,11 @@ static void bts_event_read(struct perf_event *event) static __init int bts_init(void) { - if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) + if (!boot_cpu_has(X86_FEATURE_DTES64)) + return -ENODEV; + + x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); + if (!x86_pmu.bts) return -ENODEV; if (boot_cpu_has(X86_FEATURE_PTI)) { diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 17bb6087766a5..5869b777fac1c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6849,7 +6849,7 @@ __init int intel_pmu_init(void) if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) intel_pmu_arch_lbr_init(); - intel_ds_init(); + intel_pebs_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 3010a90a339a4..b3c191fd98da5 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2650,10 +2650,10 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d } /* - * BTS, PEBS probe and setup + * PEBS probe and setup */ -void __init intel_ds_init(void) +void __init intel_pebs_init(void) { /* * No support for 32bit formats @@ -2661,7 +2661,6 @@ void __init intel_ds_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64)) return; - x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; if (x86_pmu.version <= 4) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e5237c126e4bc..320595fe0c48f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1672,7 +1672,7 @@ void intel_pmu_drain_pebs_buffer(void); void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); -void intel_ds_init(void); +void intel_pebs_init(void); void intel_pmu_lbr_save_brstack(struct perf_sample_data *data, struct cpu_hw_events *cpuc, From ba380fa4391da5fc7fed90d06a85421e9dbc1901 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 15 Apr 2025 11:44:11 +0000 Subject: [PATCH 64/89] perf/x86/intel: Rename x86_pmu.pebs to x86_pmu.ds_pebs commit acb727e0956a2424f22e5ab8c1ff9a39d1acb150 upstream. Since architectural PEBS would be introduced in subsequent patches, rename x86_pmu.pebs to x86_pmu.ds_pebs for distinguishing with the upcoming architectural PEBS. Besides restrict reserve_ds_buffers() helper to work only for the legacy DS based PEBS and avoid it to corrupt the pebs_active flag and release PEBS buffer incorrectly for arch-PEBS since the later patch would reuse these flags and alloc/release_pebs_buffer() helpers for arch-PEBS. Intel-SIG: commit acb727e0956a perf/x86/intel: Rename x86_pmu.pebs to x86_pmu.ds_pebs Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20250415114428.341182-6-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 6 +++--- arch/x86/events/intel/ds.c | 32 ++++++++++++++++++-------------- arch/x86/events/perf_event.h | 2 +- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5869b777fac1c..cba703a6d5853 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4566,7 +4566,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) .guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask, }; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) return arr; /* @@ -5729,7 +5729,7 @@ static __init void intel_clovertown_quirk(void) * these chips. */ pr_warn("PEBS disabled due to CPU errata\n"); - x86_pmu.pebs = 0; + x86_pmu.ds_pebs = 0; x86_pmu.pebs_constraints = NULL; } @@ -6234,7 +6234,7 @@ tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) static umode_t pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - return x86_pmu.pebs ? attr->mode : 0; + return x86_pmu.ds_pebs ? attr->mode : 0; } static umode_t diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index b3c191fd98da5..3891d1324000d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -602,7 +602,7 @@ static int alloc_pebs_buffer(int cpu) int max, node = cpu_to_node(cpu); void *buffer, *insn_buff, *cea; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) return 0; buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); @@ -637,7 +637,7 @@ static void release_pebs_buffer(int cpu) struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); void *cea; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) return; kfree(per_cpu(insn_buffer, cpu)); @@ -712,7 +712,7 @@ void release_ds_buffers(void) { int cpu; - if (!x86_pmu.bts && !x86_pmu.pebs) + if (!x86_pmu.bts && !x86_pmu.ds_pebs) return; for_each_possible_cpu(cpu) @@ -728,7 +728,8 @@ void release_ds_buffers(void) } for_each_possible_cpu(cpu) { - release_pebs_buffer(cpu); + if (x86_pmu.ds_pebs) + release_pebs_buffer(cpu); release_bts_buffer(cpu); } } @@ -739,15 +740,17 @@ void reserve_ds_buffers(void) int cpu; x86_pmu.bts_active = 0; - x86_pmu.pebs_active = 0; - if (!x86_pmu.bts && !x86_pmu.pebs) + if (x86_pmu.ds_pebs) + x86_pmu.pebs_active = 0; + + if (!x86_pmu.bts && !x86_pmu.ds_pebs) return; if (!x86_pmu.bts) bts_err = 1; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) pebs_err = 1; for_each_possible_cpu(cpu) { @@ -759,7 +762,8 @@ void reserve_ds_buffers(void) if (!bts_err && alloc_bts_buffer(cpu)) bts_err = 1; - if (!pebs_err && alloc_pebs_buffer(cpu)) + if (x86_pmu.ds_pebs && !pebs_err && + alloc_pebs_buffer(cpu)) pebs_err = 1; if (bts_err && pebs_err) @@ -771,7 +775,7 @@ void reserve_ds_buffers(void) release_bts_buffer(cpu); } - if (pebs_err) { + if (x86_pmu.ds_pebs && pebs_err) { for_each_possible_cpu(cpu) release_pebs_buffer(cpu); } @@ -783,7 +787,7 @@ void reserve_ds_buffers(void) if (x86_pmu.bts && !bts_err) x86_pmu.bts_active = 1; - if (x86_pmu.pebs && !pebs_err) + if (x86_pmu.ds_pebs && !pebs_err) x86_pmu.pebs_active = 1; for_each_possible_cpu(cpu) { @@ -2661,12 +2665,12 @@ void __init intel_pebs_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64)) return; - x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + x86_pmu.ds_pebs = boot_cpu_has(X86_FEATURE_PEBS); x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; if (x86_pmu.version <= 4) x86_pmu.pebs_no_isolation = 1; - if (x86_pmu.pebs) { + if (x86_pmu.ds_pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; char *pebs_qual = ""; int format = x86_pmu.intel_cap.pebs_format; @@ -2750,7 +2754,7 @@ void __init intel_pebs_init(void) default: pr_cont("no PEBS fmt%d%c, ", format, pebs_type); - x86_pmu.pebs = 0; + x86_pmu.ds_pebs = 0; } } } @@ -2759,7 +2763,7 @@ void perf_restore_debug_store(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); - if (!x86_pmu.bts && !x86_pmu.pebs) + if (!x86_pmu.bts && !x86_pmu.ds_pebs) return; wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 320595fe0c48f..00c7c34273b9d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -908,7 +908,7 @@ struct x86_pmu { */ unsigned int bts :1, bts_active :1, - pebs :1, + ds_pebs :1, pebs_active :1, pebs_broken :1, pebs_prec_dist :1, From 014d65bd78edd4aa72cbd05c47b42b47cf3db70a Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 15 Apr 2025 11:44:12 +0000 Subject: [PATCH 65/89] perf/x86/intel: Introduce pairs of PEBS static calls commit 4a3fd13054a98c43dfcfcbdb93deb43c7b1b9c34 upstream. Arch-PEBS retires IA32_PEBS_ENABLE and MSR_PEBS_DATA_CFG MSRs, so intel_pmu_pebs_enable/disable() and intel_pmu_pebs_enable/disable_all() are not needed to call for ach-PEBS. To make the code cleaner, introduce static calls x86_pmu_pebs_enable/disable() and x86_pmu_pebs_enable/disable_all() instead of adding "x86_pmu.arch_pebs" check directly in these helpers. Intel-SIG: commit 4a3fd13054a9 perf/x86/intel: Introduce pairs of PEBS static calls Suggested-by: Peter Zijlstra Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20250415114428.341182-7-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 10 ++++++++++ arch/x86/events/intel/core.c | 8 ++++---- arch/x86/events/intel/ds.c | 5 +++++ arch/x86/events/perf_event.h | 8 ++++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 14663df555268..4fe1a49ec1851 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -96,6 +96,11 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -2049,6 +2054,11 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_filter, x86_pmu.filter); static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); + + static_call_update(x86_pmu_pebs_enable, x86_pmu.pebs_enable); + static_call_update(x86_pmu_pebs_disable, x86_pmu.pebs_disable); + static_call_update(x86_pmu_pebs_enable_all, x86_pmu.pebs_enable_all); + static_call_update(x86_pmu_pebs_disable_all, x86_pmu.pebs_disable_all); } static void _x86_pmu_read(struct perf_event *event) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cba703a6d5853..32c40950e4f91 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2296,7 +2296,7 @@ static __always_inline void __intel_pmu_disable_all(bool bts) static __always_inline void intel_pmu_disable_all(void) { __intel_pmu_disable_all(true); - intel_pmu_pebs_disable_all(); + static_call_cond(x86_pmu_pebs_disable_all)(); intel_pmu_lbr_disable_all(); } @@ -2328,7 +2328,7 @@ static void __intel_pmu_enable_all(int added, bool pmi) static void intel_pmu_enable_all(int added) { - intel_pmu_pebs_enable_all(); + static_call_cond(x86_pmu_pebs_enable_all)(); __intel_pmu_enable_all(added, false); } @@ -2585,7 +2585,7 @@ static void intel_pmu_disable_event(struct perf_event *event) * so we don't trigger the event without PEBS bit set. */ if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); + static_call(x86_pmu_pebs_disable)(event); } static void intel_pmu_assign_event(struct perf_event *event, int idx) @@ -2936,7 +2936,7 @@ static void intel_pmu_enable_event(struct perf_event *event) int idx = hwc->idx; if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_enable(event); + static_call(x86_pmu_pebs_enable)(event); switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 3891d1324000d..237bdb01fbaa7 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2678,6 +2678,11 @@ void __init intel_pebs_init(void) if (format < 4) x86_pmu.intel_cap.pebs_baseline = 0; + x86_pmu.pebs_enable = intel_pmu_pebs_enable; + x86_pmu.pebs_disable = intel_pmu_pebs_disable; + x86_pmu.pebs_enable_all = intel_pmu_pebs_enable_all; + x86_pmu.pebs_disable_all = intel_pmu_pebs_disable_all; + switch (format) { case 0: pr_cont("PEBS fmt0%c, ", pebs_type); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 00c7c34273b9d..9c99c4ff48928 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -818,6 +818,10 @@ struct x86_pmu { int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); void (*late_setup)(void); + void (*pebs_enable)(struct perf_event *event); + void (*pebs_disable)(struct perf_event *event); + void (*pebs_enable_all)(void); + void (*pebs_disable_all)(void); unsigned eventsel; unsigned perfctr; unsigned fixedctr; @@ -1137,6 +1141,10 @@ DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup); +DECLARE_STATIC_CALL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); +DECLARE_STATIC_CALL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); +DECLARE_STATIC_CALL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); +DECLARE_STATIC_CALL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { From 6f051d8b350c855e4ce5d08bd45364e66f1b3e9c Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:26 +0800 Subject: [PATCH 66/89] perf/x86/intel: Use early_initcall() to hook bts_init() commit d9cf9c6884d21e01483c4e17479d27636ea4bb50 upstream. After the commit 'd971342d38bf ("perf/x86/intel: Decouple BTS initialization from PEBS initialization")' is introduced, x86_pmu.bts would initialized in bts_init() which is hooked by arch_initcall(). Whereas init_hw_perf_events() is hooked by early_initcall(). Once the core PMU is initialized, nmi watchdog initialization is called immediately before bts_init() is called. It leads to the BTS buffer is not really initialized since bts_init() is not called and x86_pmu.bts is still false at that time. Worse, BTS buffer would never be initialized then unless all core PMU events are freed and reserve_ds_buffers() is called again. Thus aligning with init_hw_perf_events(), use early_initcall() to hook bts_init() to ensure x86_pmu.bts is initialized before nmi watchdog initialization. Intel-SIG: commit d9cf9c6884d2 perf/x86/intel: Use early_initcall() to hook bts_init() Fixes: d971342d38bf ("perf/x86/intel: Decouple BTS initialization from PEBS initialization") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250820023032.17128-2-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/bts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index d3a4b6630368c..94f81ead16b4f 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -626,4 +626,4 @@ static __init int bts_init(void) return perf_pmu_register(&bts_pmu, "intel_bts", -1); } -arch_initcall(bts_init); +early_initcall(bts_init); From 459b6f715fae7cc8b0a1995bf8babaedfce64ecd Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:29 +0800 Subject: [PATCH 67/89] perf/x86: Add PERF_CAP_PEBS_TIMING_INFO flag commit 0c5caea762de31a85cbcce65d978cec83449f699 upstream. IA32_PERF_CAPABILITIES.PEBS_TIMING_INFO[bit 17] is introduced to indicate whether timed PEBS is supported. Timed PEBS adds a new "retired latency" field in basic info group to show the timing info. Please find detailed information about timed PEBS in section 8.4.1 "Timed Processor Event Based Sampling" of "Intel Architecture Instruction Set Extensions and Future Features". This patch adds PERF_CAP_PEBS_TIMING_INFO flag and KVM module leverages this flag to expose timed PEBS feature to guest. Moreover, opportunistically refine the indents and make the macros share consistent indents. Intel-SIG: commit 0c5caea762de perf/x86: Add PERF_CAP_PEBS_TIMING_INFO flag Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Tested-by: Yi Lai Link: https://lore.kernel.org/r/20250820023032.17128-5-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/include/asm/msr-index.h | 14 ++++++++------ tools/arch/x86/include/asm/msr-index.h | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 698f06f572c30..c0321a755bb3d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -289,12 +289,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index f7c5d52aaf537..27e7c7ffc565e 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -258,12 +258,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) From 608dfd21d4dd3979b2e733f270aeca33f9440be6 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 6 May 2025 17:49:07 +0800 Subject: [PATCH 68/89] perf/x86/intel/ds: Remove redundant assignments to sample.period commit 75a9001bab36f0456f6aae1ab0aa487db456464a upstream. The perf_sample_data_init() has already set the period of sample, so no need to do it again. Intel-SIG: commit 75a9001bab36 perf/x86/intel/ds: Remove redundant assignments to sample.period Signed-off-by: Changbin Du Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250506094907.2724-1-changbin.du@huawei.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 237bdb01fbaa7..6435543a7e6a7 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1809,8 +1809,6 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, perf_sample_data_init(data, 0, event->hw.last_period); - data->period = event->hw.last_period; - /* * Use latency for weight (only avail with PEBS-LL) */ @@ -2065,7 +2063,6 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, sample_type = event->attr.sample_type; format_group = basic->format_group; perf_sample_data_init(data, 0, event->hw.last_period); - data->period = event->hw.last_period; setup_pebs_time(event, data, basic->tsc); From 1b463b309c6ac93b06c2403d6757a526b86b65e2 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:08 -0700 Subject: [PATCH 69/89] perf/core: Check sample_type in perf_sample_save_callchain commit f226805bc5f60adf03783d8e4cbfe303ccecd64e upstream. Check sample_type in perf_sample_save_callchain() to prevent saving callchain data when it isn't required. Intel-SIG: commit f226805bc5f6 perf/core: Check sample_type in perf_sample_save_callchain Suggested-by: Namhyung Kim Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-3-yabinc@google.com Signed-off-by: Jason Zeng --- arch/x86/events/amd/ibs.c | 3 +-- arch/x86/events/intel/ds.c | 6 ++---- include/linux/perf_event.h | 5 +++++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index d34ee6f04f18f..1022d1dcbe447 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1138,8 +1138,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) * recorded as part of interrupt regs. Thus we need to use rip from * interrupt regs while unwinding call stack. */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - perf_sample_save_callchain(&data, event, iregs); + perf_sample_save_callchain(&data, event, iregs); throttle = perf_event_overflow(event, &data, ®s); out: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 6435543a7e6a7..24ed189e1f250 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1831,8 +1831,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) - perf_sample_save_callchain(data, event, iregs); + perf_sample_save_callchain(data, event, iregs); /* * We use the interrupt regs as a base because the PEBS record does not @@ -2072,8 +2071,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) - perf_sample_save_callchain(data, event, iregs); + perf_sample_save_callchain(data, event, iregs); *regs = *iregs; /* The ip in basic is EventingIP */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1818251a2105d..569a4f44fe6e5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1240,6 +1240,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, { int size = 1; + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) + return; + data->callchain = perf_callchain(event, regs); size += data->callchain->nr; From a99a5a9d5ac50e38ea8322e31f7f35754516595f Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:09 -0700 Subject: [PATCH 70/89] perf/core: Check sample_type in perf_sample_save_brstack commit faac6f105ef169e2e5678c14e1ffebf2a7d780b6 upstream. Check sample_type in perf_sample_save_brstack() to prevent saving branch stack data when it isn't required. Intel-SIG: commit faac6f105ef1 perf/core: Check sample_type in perf_sample_save_brstack Suggested-by: Namhyung Kim Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-4-yabinc@google.com Conflicts: include/linux/perf_event.h [jz: resolve simple context conflict] Signed-off-by: Jason Zeng --- arch/x86/events/amd/core.c | 3 +-- arch/x86/events/core.c | 3 +-- arch/x86/events/intel/ds.c | 3 +-- include/linux/perf_event.h | 15 ++++++++++----- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 29104ba0a2f1a..beec1a3f4ee66 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -968,8 +968,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 4fe1a49ec1851..7c00cf7edd18c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1723,8 +1723,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 24ed189e1f250..765af942fb909 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1930,8 +1930,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 3) setup_pebs_time(event, data, pebs->tsc); - if (has_branch_stack(event)) - perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); } static void adaptive_pebs_save_regs(struct pt_regs *regs, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 569a4f44fe6e5..6a1386279b5c9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1281,6 +1281,11 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data, data->sample_flags |= PERF_SAMPLE_RAW; } +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, @@ -1288,6 +1293,11 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, { int size = sizeof(u64); /* nr */ + if (!has_branch_stack(event)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) + return; + if (branch_sample_hw_index(event)) size += sizeof(u64); @@ -1674,11 +1684,6 @@ extern void perf_bp_event(struct perf_event *event, void *data); # define perf_arch_bpf_user_pt_regs(regs) regs #endif -static inline bool has_branch_stack(struct perf_event *event) -{ - return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; -} - static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; From 108cf76acd3ac8f10617bf35643c4d2d98d9b94f Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 28 Oct 2025 14:42:14 +0800 Subject: [PATCH 71/89] perf/x86/intel: Fix KASAN global-out-of-bounds warning commit 0ba6502ce167fc3d598c08c2cc3b4ed7ca5aa251 upstream. When running "perf mem record" command on CWF, the below KASAN global-out-of-bounds warning is seen. ================================================================== BUG: KASAN: global-out-of-bounds in cmt_latency_data+0x176/0x1b0 Read of size 4 at addr ffffffffb721d000 by task dtlb/9850 Call Trace: kasan_report+0xb8/0xf0 cmt_latency_data+0x176/0x1b0 setup_arch_pebs_sample_data+0xf49/0x2560 intel_pmu_drain_arch_pebs+0x577/0xb00 handle_pmi_common+0x6c4/0xc80 The issue is caused by below code in __grt_latency_data(). The code tries to access x86_hybrid_pmu structure which doesn't exist on non-hybrid platform like CWF. WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big) So add is_hybrid() check before calling this WARN_ON_ONCE to fix the global-out-of-bounds access issue. Intel-SIG: commit 0ba6502ce167 perf/x86/intel: Fix KASAN global-out-of-bounds warning Fixes: 090262439f66 ("perf/x86/intel: Rename model-specific pebs_latency_data functions") Reported-by: Xudong Hao Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Zide Chen Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20251028064214.1451968-1-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 765af942fb909..da55bf957b008 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -304,7 +304,8 @@ static u64 __grt_latency_data(struct perf_event *event, u64 status, { u64 val; - WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big); + WARN_ON_ONCE(is_hybrid() && + hybrid_pmu(event->pmu)->pmu_type == hybrid_big); dse &= PERF_PEBS_DATA_SOURCE_GRT_MASK; val = hybrid_var(event->pmu, pebs_data_source)[dse]; From ef681086fd11e6e9460f2b68dded6553bceb1d3a Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:25 +0800 Subject: [PATCH 72/89] perf/x86: Remove redundant is_x86_event() prototype commit c7f69dc073e51f1c448713320ccd2e2be63fb1f6 upstream. 2 is_x86_event() prototypes are defined in perf_event.h. Remove the redundant one. Intel-SIG: commit c7f69dc073e5 perf/x86: Remove redundant is_x86_event() prototype Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-2-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/perf_event.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9c99c4ff48928..ae5fcd2ee6843 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1133,7 +1133,6 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } -int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; From 738201da654c6551a8ab83d357242912c9ce2054 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:26 +0800 Subject: [PATCH 73/89] perf/x86: Fix NULL event access and potential PEBS record loss commit 7e772a93eb61cb6265bdd1c5bde17d0f2718b452 upstream. When intel_pmu_drain_pebs_icl() is called to drain PEBS records, the perf_event_overflow() could be called to process the last PEBS record. While perf_event_overflow() could trigger the interrupt throttle and stop all events of the group, like what the below call-chain shows. perf_event_overflow() -> __perf_event_overflow() ->__perf_event_account_interrupt() -> perf_event_throttle_group() -> perf_event_throttle() -> event->pmu->stop() -> x86_pmu_stop() The side effect of stopping the events is that all corresponding event pointers in cpuc->events[] array are cleared to NULL. Assume there are two PEBS events (event a and event b) in a group. When intel_pmu_drain_pebs_icl() calls perf_event_overflow() to process the last PEBS record of PEBS event a, interrupt throttle is triggered and all pointers of event a and event b are cleared to NULL. Then intel_pmu_drain_pebs_icl() tries to process the last PEBS record of event b and encounters NULL pointer access. To avoid this issue, move cpuc->events[] clearing from x86_pmu_stop() to x86_pmu_del(). It's safe since cpuc->active_mask or cpuc->pebs_enabled is always checked before access the event pointer from cpuc->events[]. Intel-SIG: commit 7e772a93eb61 perf/x86: Fix NULL event access and potential PEBS record loss Closes: https://lore.kernel.org/oe-lkp/202507042103.a15d2923-lkp@intel.com Fixes: 9734e25fbf5a ("perf: Fix the throttle logic for a group") Reported-by: kernel test robot Suggested-by: Peter Zijlstra Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-3-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7c00cf7edd18c..375ce24154c9a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1344,6 +1344,7 @@ static void x86_pmu_enable(struct pmu *pmu) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[hwc->idx] = NULL; } /* @@ -1365,6 +1366,7 @@ static void x86_pmu_enable(struct pmu *pmu) * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ + cpuc->events[hwc->idx] = event; x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1531,7 +1533,6 @@ static void x86_pmu_start(struct perf_event *event, int flags) event->hw.state = 0; - cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); @@ -1608,7 +1609,6 @@ void x86_pmu_stop(struct perf_event *event, int flags) if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); - cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } @@ -1646,6 +1646,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * Not a TXN, therefore cleanup properly. */ x86_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[event->hw.idx] = NULL; for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) From 3424e47099ea66b998d4cd720b151f2474ae30de Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:27 +0800 Subject: [PATCH 74/89] perf/x86/intel: Replace x86_pmu.drain_pebs calling with static call commit ee98b8bfc7c4baca69a6852c4ecc399794f7e53b upstream. Use x86_pmu_drain_pebs static call to replace calling x86_pmu.drain_pebs function pointer. Intel-SIG: commit ee98b8bfc7c4 perf/x86/intel: Replace x86_pmu.drain_pebs calling with static call Suggested-by: Peter Zijlstra Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-4-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 32c40950e4f91..1602118e3dc39 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3258,7 +3258,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * The PEBS buffer has to be drained before handling the A-PMI */ if (is_pebs_counter_event_group(event)) - x86_pmu.drain_pebs(regs, &data); + static_call(x86_pmu_drain_pebs)(regs, &data); if (!intel_pmu_save_and_restart(event)) continue; From 68e8c2722a7e69ff80416f1a1376432928d63b69 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:28 +0800 Subject: [PATCH 75/89] perf/x86/intel: Correct large PEBS flag check commit 5e4e355ae7cdeb0fef5dbe908866e1f895abfacc upstream. current large PEBS flag check only checks if sample_regs_user contains unsupported GPRs but doesn't check if sample_regs_intr contains unsupported GPRs. Of course, currently PEBS HW supports to sample all perf supported GPRs, the missed check doesn't cause real issue. But it won't be true any more after the subsequent patches support to sample SSP register. SSP sampling is not supported by adaptive PEBS HW and it would be supported until arch-PEBS HW. So correct this issue. Intel-SIG: commit 5e4e355ae7cd perf/x86/intel: Correct large PEBS flag check Fixes: a47ba4d77e12 ("perf/x86: Enable free running PEBS for REGS_USER/INTR") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-5-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1602118e3dc39..908b5d7138f4d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4017,7 +4017,9 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; if (event->attr.sample_regs_user & ~PEBS_GP_REGS) - flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); + flags &= ~PERF_SAMPLE_REGS_USER; + if (event->attr.sample_regs_intr & ~PEBS_GP_REGS) + flags &= ~PERF_SAMPLE_REGS_INTR; return flags; } From f0dc3c53fe037bd481d287c99547ce8013fa1b88 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:29 +0800 Subject: [PATCH 76/89] perf/x86/intel: Initialize architectural PEBS commit d243d0bb64af1e90ec18ac2fa6e7cadfe8895913 upstream. arch-PEBS leverages CPUID.23H.4/5 sub-leaves enumerate arch-PEBS supported capabilities and counters bitmap. This patch parses these 2 sub-leaves and initializes arch-PEBS capabilities and corresponding structures. Since IA32_PEBS_ENABLE and MSR_PEBS_DATA_CFG MSRs are no longer existed for arch-PEBS, arch-PEBS doesn't need to manipulate these MSRs. Thus add a simple pair of __intel_pmu_pebs_enable/disable() callbacks for arch-PEBS. Intel-SIG: commit d243d0bb64af perf/x86/intel: Initialize architectural PEBS Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-6-dapeng1.mi@linux.intel.com Conflicts: arch/x86/events/intel/ds.c arch/x86/events/perf_event.h [jz: resolve context conflict, use wrmsrl(), instead of wrmsrq()] Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 21 ++++++++--- arch/x86/events/intel/core.c | 60 ++++++++++++++++++++++--------- arch/x86/events/intel/ds.c | 52 ++++++++++++++++++++++----- arch/x86/events/perf_event.h | 25 +++++++++++-- arch/x86/include/asm/perf_event.h | 7 +++- 5 files changed, 132 insertions(+), 33 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 375ce24154c9a..addaff4238dd1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -554,14 +554,22 @@ static inline int precise_br_compat(struct perf_event *event) return m == b; } -int x86_pmu_max_precise(void) +int x86_pmu_max_precise(struct pmu *pmu) { int precise = 0; - /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { - precise++; + /* arch PEBS */ + if (x86_pmu.arch_pebs) { + precise = 2; + if (hybrid(pmu, arch_pebs_cap).pdists) + precise++; + + return precise; + } + /* legacy PEBS - support for constant skid */ + precise++; /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; @@ -569,13 +577,14 @@ int x86_pmu_max_precise(void) if (x86_pmu.pebs_prec_dist) precise++; } + return precise; } int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { - int precise = x86_pmu_max_precise(); + int precise = x86_pmu_max_precise(event->pmu); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -2628,7 +2637,9 @@ static ssize_t max_precise_show(struct device *cdev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); + struct pmu *pmu = dev_get_drvdata(cdev); + + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu)); } static DEVICE_ATTR_RO(max_precise); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 908b5d7138f4d..453da07c14628 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5233,34 +5233,59 @@ static inline bool intel_pmu_broken_perf_cap(void) return false; } +#define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED)) + static void update_pmu_cap(struct pmu *pmu) { - unsigned int cntr, fixed_cntr, ecx, edx; - union cpuid35_eax eax; - union cpuid35_ebx ebx; + unsigned int eax, ebx, ecx, edx; + union cpuid35_eax eax_0; + union cpuid35_ebx ebx_0; + u64 cntrs_mask = 0; + u64 pebs_mask = 0; + u64 pdists_mask = 0; - cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); + cpuid(ARCH_PERFMON_EXT_LEAF, &eax_0.full, &ebx_0.full, &ecx, &edx); - if (ebx.split.umask2) + if (ebx_0.split.umask2) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; - if (ebx.split.eq) + if (ebx_0.split.eq) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; - if (eax.split.cntr_subleaf) { + if (eax_0.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, - &cntr, &fixed_cntr, &ecx, &edx); - hybrid(pmu, cntr_mask64) = cntr; - hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; + &eax, &ebx, &ecx, &edx); + hybrid(pmu, cntr_mask64) = eax; + hybrid(pmu, fixed_cntr_mask64) = ebx; + cntrs_mask = counter_mask(eax, ebx); } - if (eax.split.acr_subleaf) { + if (eax_0.split.acr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, - &cntr, &fixed_cntr, &ecx, &edx); + &eax, &ebx, &ecx, &edx); /* The mask of the counters which can be reloaded */ - hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); - + hybrid(pmu, acr_cntr_mask64) = counter_mask(eax, ebx); /* The mask of the counters which can cause a reload of reloadable counters */ - hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); + hybrid(pmu, acr_cause_mask64) = counter_mask(ecx, edx); + } + + /* Bits[5:4] should be set simultaneously if arch-PEBS is supported */ + if (eax_0.split.pebs_caps_subleaf && eax_0.split.pebs_cnts_subleaf) { + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_CAP_LEAF, + &eax, &ebx, &ecx, &edx); + hybrid(pmu, arch_pebs_cap).caps = (u64)ebx << 32; + + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_COUNTER_LEAF, + &eax, &ebx, &ecx, &edx); + pebs_mask = counter_mask(eax, ecx); + pdists_mask = counter_mask(ebx, edx); + hybrid(pmu, arch_pebs_cap).counters = pebs_mask; + hybrid(pmu, arch_pebs_cap).pdists = pdists_mask; + + if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) + x86_pmu.arch_pebs = 0; + } else { + WARN_ON(x86_pmu.arch_pebs == 1); + x86_pmu.arch_pebs = 0; } if (!intel_pmu_broken_perf_cap()) { @@ -6236,7 +6261,7 @@ tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) static umode_t pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - return x86_pmu.ds_pebs ? attr->mode : 0; + return intel_pmu_has_pebs() ? attr->mode : 0; } static umode_t @@ -7612,6 +7637,9 @@ __init int intel_pmu_init(void) if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) update_pmu_cap(NULL); + if (x86_pmu.arch_pebs) + pr_cont("Architectural PEBS, "); + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, &x86_pmu.fixed_cntr_mask64, &x86_pmu.intel_ctrl); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index da55bf957b008..b0d1692cd925e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1509,6 +1509,15 @@ static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc) intel_pmu_drain_pebs_buffer(); } +static void __intel_pmu_pebs_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + cpuc->pebs_enabled |= 1ULL << hwc->idx; +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1517,9 +1526,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) struct debug_store *ds = cpuc->ds; unsigned int idx = hwc->idx; - hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; - - cpuc->pebs_enabled |= 1ULL << hwc->idx; + __intel_pmu_pebs_enable(event); if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); @@ -1581,14 +1588,22 @@ void intel_pmu_pebs_del(struct perf_event *event) pebs_update_state(needed_cb, cpuc, event, false); } -void intel_pmu_pebs_disable(struct perf_event *event) +static void __intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; intel_pmu_drain_large_pebs(cpuc); - cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; +} + +void intel_pmu_pebs_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + __intel_pmu_pebs_disable(event); if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) @@ -1600,8 +1615,6 @@ void intel_pmu_pebs_disable(struct perf_event *event) if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); - - hwc->config |= ARCH_PERFMON_EVENTSEL_INT; } void intel_pmu_pebs_enable_all(void) @@ -2648,11 +2661,26 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d } } +static void __init intel_arch_pebs_init(void) +{ + /* + * Current hybrid platforms always both support arch-PEBS or not + * on all kinds of cores. So directly set x86_pmu.arch_pebs flag + * if boot cpu supports arch-PEBS. + */ + x86_pmu.arch_pebs = 1; + x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; + x86_pmu.pebs_capable = ~0ULL; + + x86_pmu.pebs_enable = __intel_pmu_pebs_enable; + x86_pmu.pebs_disable = __intel_pmu_pebs_disable; +} + /* * PEBS probe and setup */ -void __init intel_pebs_init(void) +static void __init intel_ds_pebs_init(void) { /* * No support for 32bit formats @@ -2759,6 +2787,14 @@ void __init intel_pebs_init(void) } } +void __init intel_pebs_init(void) +{ + if (x86_pmu.intel_cap.pebs_format == 0xf) + intel_arch_pebs_init(); + else + intel_ds_pebs_init(); +} + void perf_restore_debug_store(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ae5fcd2ee6843..32c91b5b9f470 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -710,6 +710,12 @@ enum hybrid_pmu_type { #define X86_HYBRID_NUM_PMUS 2 +struct arch_pebs_cap { + u64 caps; + u64 counters; + u64 pdists; +}; + struct x86_hybrid_pmu { struct pmu pmu; const char *name; @@ -754,6 +760,8 @@ struct x86_hybrid_pmu { mid_ack :1, enabled_ack :1; + struct arch_pebs_cap arch_pebs_cap; + u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX]; }; @@ -908,7 +916,7 @@ struct x86_pmu { union perf_capabilities intel_cap; /* - * Intel DebugStore bits + * Intel DebugStore and PEBS bits */ unsigned int bts :1, bts_active :1, @@ -919,7 +927,8 @@ struct x86_pmu { pebs_no_tlb :1, pebs_no_isolation :1, pebs_block :1, - pebs_ept :1; + pebs_ept :1, + arch_pebs :1; int pebs_record_size; int pebs_buffer_size; u64 pebs_events_mask; @@ -931,6 +940,11 @@ struct x86_pmu { u64 rtm_abort_event; u64 pebs_capable; + /* + * Intel Architectural PEBS + */ + struct arch_pebs_cap arch_pebs_cap; + /* * Intel LBR */ @@ -1225,7 +1239,7 @@ int x86_reserve_hardware(void); void x86_release_hardware(void); -int x86_pmu_max_precise(void); +int x86_pmu_max_precise(struct pmu *pmu); void hw_perf_lbr_event_destroy(struct perf_event *event); @@ -1782,6 +1796,11 @@ static inline int intel_pmu_max_num_pebs(struct pmu *pmu) return fls((u32)hybrid(pmu, pebs_events_mask)); } +static inline bool intel_pmu_has_pebs(void) +{ + return x86_pmu.ds_pebs || x86_pmu.arch_pebs; +} + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 87b9cc9e47ca8..b7220d4c4f49a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -200,6 +200,8 @@ union cpuid10_edx { #define ARCH_PERFMON_EXT_LEAF 0x00000023 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 #define ARCH_PERFMON_ACR_LEAF 0x2 +#define ARCH_PERFMON_PEBS_CAP_LEAF 0x4 +#define ARCH_PERFMON_PEBS_COUNTER_LEAF 0x5 union cpuid35_eax { struct { @@ -210,7 +212,10 @@ union cpuid35_eax { unsigned int acr_subleaf:1; /* Events Sub-Leaf */ unsigned int events_subleaf:1; - unsigned int reserved:28; + /* arch-PEBS Sub-Leaves */ + unsigned int pebs_caps_subleaf:1; + unsigned int pebs_cnts_subleaf:1; + unsigned int reserved:26; } split; unsigned int full; }; From 614cfbb2a5b4362e2458b46ae6de26781534bfd7 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:30 +0800 Subject: [PATCH 77/89] perf/x86/intel/ds: Factor out PEBS record processing code to functions commit 8807d922705f0a137d8de5f636b50e7b4fbef155 upstream. Beside some PEBS record layout difference, arch-PEBS can share most of PEBS record processing code with adaptive PEBS. Thus, factor out these common processing code to independent inline functions, so they can be reused by subsequent arch-PEBS handler. Intel-SIG: commit 8807d922705f perf/x86/intel/ds: Factor out PEBS record processing code to functions Suggested-by: Kan Liang Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-7-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 83 ++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index b0d1692cd925e..ba386b15181b2 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2593,6 +2593,57 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } } +static __always_inline void +__intel_pmu_handle_pebs_record(struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + void *at, u64 pebs_status, + short *counts, void **last, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { + event = cpuc->events[bit]; + + if (WARN_ON_ONCE(!event) || + WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + if (counts[bit]++) { + __intel_pmu_pebs_event(event, iregs, regs, data, + last[bit], setup_sample); + } + + last[bit] = at; + } +} + +static __always_inline void +__intel_pmu_handle_last_pebs_record(struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + u64 mask, short *counts, void **last, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + int bit; + + for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { + if (!counts[bit]) + continue; + + event = cpuc->events[bit]; + + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], + counts[bit], setup_sample); + } + +} + static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; @@ -2602,9 +2653,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; struct pebs_basic *basic; - struct perf_event *event; void *base, *at, *top; - int bit; u64 mask; if (!x86_pmu.pebs_active) @@ -2617,6 +2666,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d mask = hybrid(cpuc->pmu, pebs_events_mask) | (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); + mask &= cpuc->pebs_enabled; if (unlikely(base >= top)) { intel_pmu_pebs_event_update_no_drain(cpuc, mask); @@ -2634,31 +2684,14 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d if (basic->format_size != cpuc->pebs_record_size) continue; - pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { - event = cpuc->events[bit]; - - if (WARN_ON_ONCE(!event) || - WARN_ON_ONCE(!event->attr.precise_ip)) - continue; - - if (counts[bit]++) { - __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], - setup_pebs_adaptive_sample_data); - } - last[bit] = at; - } + pebs_status = mask & basic->applicable_counters; + __intel_pmu_handle_pebs_record(iregs, regs, data, at, + pebs_status, counts, last, + setup_pebs_adaptive_sample_data); } - for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { - if (!counts[bit]) - continue; - - event = cpuc->events[bit]; - - __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], - counts[bit], setup_pebs_adaptive_sample_data); - } + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, counts, last, + setup_pebs_adaptive_sample_data); } static void __init intel_arch_pebs_init(void) From 95c81223dbbaa110a1d7753d1add094556b1ec4a Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:31 +0800 Subject: [PATCH 78/89] perf/x86/intel/ds: Factor out PEBS group processing code to functions commit 167cde7dc9b36b7a88f3c29d836fabce13023327 upstream. Adaptive PEBS and arch-PEBS share lots of same code to process these PEBS groups, like basic, GPR and meminfo groups. Extract these shared code to generic functions to avoid duplicated code. Intel-SIG: commit 167cde7dc9b3 perf/x86/intel/ds: Factor out PEBS group processing code to functions Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-8-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 170 +++++++++++++++++++++++-------------- 1 file changed, 104 insertions(+), 66 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba386b15181b2..996d212225131 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2050,6 +2050,90 @@ static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc, #define PEBS_LATENCY_MASK 0xffff +static inline void __setup_perf_sample_data(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data) +{ + perf_sample_data_init(data, 0, event->hw.last_period); + + /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happened between the record and + * PMI. + */ + perf_sample_save_callchain(data, event, iregs); +} + +static inline void __setup_pebs_basic_group(struct perf_event *event, + struct pt_regs *regs, + struct perf_sample_data *data, + u64 sample_type, u64 ip, + u64 tsc, u16 retire) +{ + /* The ip in basic is EventingIP */ + set_linear_ip(regs, ip); + regs->flags = PERF_EFLAGS_EXACT; + setup_pebs_time(event, data, tsc); + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + data->weight.var3_w = retire; +} + +static inline void __setup_pebs_gpr_group(struct perf_event *event, + struct pt_regs *regs, + struct pebs_gprs *gprs, + u64 sample_type) +{ + if (event->attr.precise_ip < 2) { + set_linear_ip(regs, gprs->ip); + regs->flags &= ~PERF_EFLAGS_EXACT; + } + + if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) + adaptive_pebs_save_regs(regs, gprs); +} + +static inline void __setup_pebs_meminfo_group(struct perf_event *event, + struct perf_sample_data *data, + u64 sample_type, u64 latency, + u16 instr_latency, u64 address, + u64 aux, u64 tsx_tuning, u64 ax) +{ + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + u64 tsx_latency = intel_get_tsx_weight(tsx_tuning); + + data->weight.var2_w = instr_latency; + + /* + * Although meminfo::latency is defined as a u64, + * only the lower 32 bits include the valid data + * in practice on Ice Lake and earlier platforms. + */ + if (sample_type & PERF_SAMPLE_WEIGHT) + data->weight.full = latency ?: tsx_latency; + else + data->weight.var1_dw = (u32)latency ?: tsx_latency; + + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + data->data_src.val = get_data_src(event, aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } + + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { + data->addr = address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } + + if (sample_type & PERF_SAMPLE_TRANSACTION) { + data->txn = intel_get_tsx_transaction(tsx_tuning, ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } +} + /* * With adaptive PEBS the layout depends on what fields are configured. */ @@ -2059,12 +2143,14 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 sample_type = event->attr.sample_type; struct pebs_basic *basic = __pebs; void *next_record = basic + 1; - u64 sample_type, format_group; struct pebs_meminfo *meminfo = NULL; struct pebs_gprs *gprs = NULL; struct x86_perf_regs *perf_regs; + u64 format_group; + u16 retire; if (basic == NULL) return; @@ -2072,31 +2158,17 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_regs = container_of(regs, struct x86_perf_regs, regs); perf_regs->xmm_regs = NULL; - sample_type = event->attr.sample_type; format_group = basic->format_group; - perf_sample_data_init(data, 0, event->hw.last_period); - setup_pebs_time(event, data, basic->tsc); - - /* - * We must however always use iregs for the unwinder to stay sane; the - * record BP,SP,IP can point into thin air when the record is from a - * previous PMI context or an (I)RET happened between the record and - * PMI. - */ - perf_sample_save_callchain(data, event, iregs); + __setup_perf_sample_data(event, iregs, data); *regs = *iregs; - /* The ip in basic is EventingIP */ - set_linear_ip(regs, basic->ip); - regs->flags = PERF_EFLAGS_EXACT; - if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { - if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) - data->weight.var3_w = basic->retire_latency; - else - data->weight.var3_w = 0; - } + /* basic group */ + retire = x86_pmu.flags & PMU_FL_RETIRE_LATENCY ? + basic->retire_latency : 0; + __setup_pebs_basic_group(event, regs, data, sample_type, + basic->ip, basic->tsc, retire); /* * The record for MEMINFO is in front of GP @@ -2112,54 +2184,20 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, gprs = next_record; next_record = gprs + 1; - if (event->attr.precise_ip < 2) { - set_linear_ip(regs, gprs->ip); - regs->flags &= ~PERF_EFLAGS_EXACT; - } - - if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) - adaptive_pebs_save_regs(regs, gprs); + __setup_pebs_gpr_group(event, regs, gprs, sample_type); } if (format_group & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { - u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? - meminfo->cache_latency : meminfo->mem_latency; - - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) - data->weight.var2_w = meminfo->instr_latency; - - /* - * Although meminfo::latency is defined as a u64, - * only the lower 32 bits include the valid data - * in practice on Ice Lake and earlier platforms. - */ - if (sample_type & PERF_SAMPLE_WEIGHT) { - data->weight.full = latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); - } else { - data->weight.var1_dw = (u32)latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); - } - - data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; - } - - if (sample_type & PERF_SAMPLE_DATA_SRC) { - data->data_src.val = get_data_src(event, meminfo->aux); - data->sample_flags |= PERF_SAMPLE_DATA_SRC; - } - - if (sample_type & PERF_SAMPLE_ADDR_TYPE) { - data->addr = meminfo->address; - data->sample_flags |= PERF_SAMPLE_ADDR; - } - - if (sample_type & PERF_SAMPLE_TRANSACTION) { - data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, - gprs ? gprs->ax : 0); - data->sample_flags |= PERF_SAMPLE_TRANSACTION; - } + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->cache_latency : meminfo->mem_latency; + u64 instr_latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->instr_latency : 0; + u64 ax = gprs ? gprs->ax : 0; + + __setup_pebs_meminfo_group(event, data, sample_type, latency, + instr_latency, meminfo->address, + meminfo->aux, meminfo->tsx_tuning, + ax); } if (format_group & PEBS_DATACFG_XMMS) { From 3ac61c4155079f2c64f959ac98be41df4da35bcd Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:32 +0800 Subject: [PATCH 79/89] perf/x86/intel: Process arch-PEBS records or record fragments commit d21954c8a0ffbc94ffdd65106fb6da5b59042e0a upstream. A significant difference with adaptive PEBS is that arch-PEBS record supports fragments which means an arch-PEBS record could be split into several independent fragments which have its own arch-PEBS header in each fragment. This patch defines architectural PEBS record layout structures and add helpers to process arch-PEBS records or fragments. Only legacy PEBS groups like basic, GPR, XMM and LBR groups are supported in this patch, the new added YMM/ZMM/OPMASK vector registers capturing would be supported in the future. Intel-SIG: commit d21954c8a0ff perf/x86/intel: Process arch-PEBS records or record fragments Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-9-dapeng1.mi@linux.intel.com [jz: use rdmsrl/wrmsrl instead of rdmsrq/wrmsrq] Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 13 +++ arch/x86/events/intel/ds.c | 184 ++++++++++++++++++++++++++++++ arch/x86/include/asm/msr-index.h | 6 + arch/x86/include/asm/perf_event.h | 96 ++++++++++++++++ 4 files changed, 299 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 453da07c14628..6cf08d0f3d0d5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3205,6 +3205,19 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT; } + /* + * Arch PEBS sets bit 54 in the global status register + */ + if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT, + (unsigned long *)&status)) { + handled++; + static_call(x86_pmu_drain_pebs)(regs, &data); + + if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] && + is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS])) + status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT; + } + /* * Intel PT */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 996d212225131..43f9429da4e72 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2248,6 +2248,117 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, format_group); } +static inline bool arch_pebs_record_continued(struct arch_pebs_header *header) +{ + /* Continue bit or null PEBS record indicates fragment follows. */ + return header->cont || !(header->format & GENMASK_ULL(63, 16)); +} + +static void setup_arch_pebs_sample_data(struct perf_event *event, + struct pt_regs *iregs, + void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 sample_type = event->attr.sample_type; + struct arch_pebs_header *header = NULL; + struct arch_pebs_aux *meminfo = NULL; + struct arch_pebs_gprs *gprs = NULL; + struct x86_perf_regs *perf_regs; + void *next_record; + void *at = __pebs; + + if (at == NULL) + return; + + perf_regs = container_of(regs, struct x86_perf_regs, regs); + perf_regs->xmm_regs = NULL; + + __setup_perf_sample_data(event, iregs, data); + + *regs = *iregs; + +again: + header = at; + next_record = at + sizeof(struct arch_pebs_header); + if (header->basic) { + struct arch_pebs_basic *basic = next_record; + u16 retire = 0; + + next_record = basic + 1; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + retire = basic->valid ? basic->retire : 0; + __setup_pebs_basic_group(event, regs, data, sample_type, + basic->ip, basic->tsc, retire); + } + + /* + * The record for MEMINFO is in front of GP + * But PERF_SAMPLE_TRANSACTION needs gprs->ax. + * Save the pointer here but process later. + */ + if (header->aux) { + meminfo = next_record; + next_record = meminfo + 1; + } + + if (header->gpr) { + gprs = next_record; + next_record = gprs + 1; + + __setup_pebs_gpr_group(event, regs, + (struct pebs_gprs *)gprs, + sample_type); + } + + if (header->aux) { + u64 ax = gprs ? gprs->ax : 0; + + __setup_pebs_meminfo_group(event, data, sample_type, + meminfo->cache_latency, + meminfo->instr_latency, + meminfo->address, meminfo->aux, + meminfo->tsx_tuning, ax); + } + + if (header->xmm) { + struct pebs_xmm *xmm; + + next_record += sizeof(struct arch_pebs_xer_header); + + xmm = next_record; + perf_regs->xmm_regs = xmm->xmm; + next_record = xmm + 1; + } + + if (header->lbr) { + struct arch_pebs_lbr_header *lbr_header = next_record; + struct lbr_entry *lbr; + int num_lbr; + + next_record = lbr_header + 1; + lbr = next_record; + + num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ? + lbr_header->depth : + header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES; + next_record += num_lbr * sizeof(struct lbr_entry); + + if (has_branch_stack(event)) { + intel_pmu_store_pebs_lbrs(lbr); + intel_pmu_lbr_save_brstack(data, cpuc, event); + } + } + + /* Parse followed fragments if there are. */ + if (arch_pebs_record_continued(header)) { + at = at + header->size; + goto again; + } +} + static inline void * get_next_pebs_record_by_bit(void *base, void *top, int bit) { @@ -2732,6 +2843,78 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d setup_pebs_adaptive_sample_data); } +static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs, + struct perf_sample_data *data) +{ + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union arch_pebs_index index; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + void *base, *at, *top; + u64 mask; + + rdmsrl(MSR_IA32_PEBS_INDEX, index.whole); + + if (unlikely(!index.wr)) { + intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); + return; + } + + base = cpuc->ds_pebs_vaddr; + top = (void *)((u64)cpuc->ds_pebs_vaddr + + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT)); + + index.wr = 0; + index.full = 0; + wrmsrl(MSR_IA32_PEBS_INDEX, index.whole); + + mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled; + + if (!iregs) + iregs = &dummy_iregs; + + /* Process all but the last event for each counter. */ + for (at = base; at < top;) { + struct arch_pebs_header *header; + struct arch_pebs_basic *basic; + u64 pebs_status; + + header = at; + + if (WARN_ON_ONCE(!header->size)) + break; + + /* 1st fragment or single record must have basic group */ + if (!header->basic) { + at += header->size; + continue; + } + + basic = at + sizeof(struct arch_pebs_header); + pebs_status = mask & basic->applicable_counters; + __intel_pmu_handle_pebs_record(iregs, regs, data, at, + pebs_status, counts, last, + setup_arch_pebs_sample_data); + + /* Skip non-last fragments */ + while (arch_pebs_record_continued(header)) { + if (!header->size) + break; + at += header->size; + header = at; + } + + /* Skip last fragment or the single record */ + at += header->size; + } + + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, + counts, last, + setup_arch_pebs_sample_data); +} + static void __init intel_arch_pebs_init(void) { /* @@ -2741,6 +2924,7 @@ static void __init intel_arch_pebs_init(void) */ x86_pmu.arch_pebs = 1; x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; + x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs; x86_pmu.pebs_capable = ~0ULL; x86_pmu.pebs_enable = __intel_pmu_pebs_enable; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c0321a755bb3d..1bfdd6ea8419f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -298,6 +298,12 @@ PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ PERF_CAP_PEBS_TIMING_INFO) +/* Arch PEBS */ +#define MSR_IA32_PEBS_BASE 0x000003f4 +#define MSR_IA32_PEBS_INDEX 0x000003f5 +#define ARCH_PEBS_OFFSET_MASK 0x7fffff +#define ARCH_PEBS_INDEX_WR_SHIFT 4 + #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) #define RTIT_CTL_CYCLEACC BIT(1) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index b7220d4c4f49a..b9adb35c80db5 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -437,6 +437,8 @@ static inline bool is_topdown_idx(int idx) #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) #define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55 #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54 +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT) #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 #define GLOBAL_CTRL_EN_PERF_METRICS 48 @@ -507,6 +509,100 @@ struct pebs_cntr_header { #define INTEL_CNTR_METRICS 0x3 +/* + * Arch PEBS + */ +union arch_pebs_index { + struct { + u64 rsvd:4, + wr:23, + rsvd2:4, + full:1, + en:1, + rsvd3:3, + thresh:23, + rsvd4:5; + }; + u64 whole; +}; + +struct arch_pebs_header { + union { + u64 format; + struct { + u64 size:16, /* Record size */ + rsvd:14, + mode:1, /* 64BIT_MODE */ + cont:1, + rsvd2:3, + cntr:5, + lbr:2, + rsvd3:7, + xmm:1, + ymmh:1, + rsvd4:2, + opmask:1, + zmmh:1, + h16zmm:1, + rsvd5:5, + gpr:1, + aux:1, + basic:1; + }; + }; + u64 rsvd6; +}; + +struct arch_pebs_basic { + u64 ip; + u64 applicable_counters; + u64 tsc; + u64 retire :16, /* Retire Latency */ + valid :1, + rsvd :47; + u64 rsvd2; + u64 rsvd3; +}; + +struct arch_pebs_aux { + u64 address; + u64 rsvd; + u64 rsvd2; + u64 rsvd3; + u64 rsvd4; + u64 aux; + u64 instr_latency :16, + pad2 :16, + cache_latency :16, + pad3 :16; + u64 tsx_tuning; +}; + +struct arch_pebs_gprs { + u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di; + u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp; + u64 rsvd; +}; + +struct arch_pebs_xer_header { + u64 xstate; + u64 rsvd; +}; + +#define ARCH_PEBS_LBR_NAN 0x0 +#define ARCH_PEBS_LBR_NUM_8 0x1 +#define ARCH_PEBS_LBR_NUM_16 0x2 +#define ARCH_PEBS_LBR_NUM_VAR 0x3 +#define ARCH_PEBS_BASE_LBR_ENTRIES 8 +struct arch_pebs_lbr_header { + u64 rsvd; + u64 ctl; + u64 depth; + u64 ler_from; + u64 ler_to; + u64 ler_info; +}; + /* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ From 2f907e688361f64dc26e881fd20c9ce50a0017dd Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:33 +0800 Subject: [PATCH 80/89] perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR commit 2721e8da2de7271533ac36285332219f700d16ca upstream. Arch-PEBS introduces a new MSR IA32_PEBS_BASE to store the arch-PEBS buffer physical address. This patch allocates arch-PEBS buffer and then initialize IA32_PEBS_BASE MSR with the buffer physical address. Intel-SIG: commit 2721e8da2de7 perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR Co-developed-by: Kan Liang Signed-off-by: Kan Liang Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-10-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 11 ++++- arch/x86/events/intel/ds.c | 82 ++++++++++++++++++++++++++++----- arch/x86/events/perf_event.h | 11 ++++- arch/x86/include/asm/intel_ds.h | 3 +- 4 files changed, 92 insertions(+), 15 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 6cf08d0f3d0d5..dfa0fc73f9317 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5189,7 +5189,13 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) static int intel_pmu_cpu_prepare(int cpu) { - return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); + int ret; + + ret = intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); + if (ret) + return ret; + + return alloc_arch_pebs_buf_on_cpu(cpu); } static void flip_smm_bit(void *data) @@ -5418,6 +5424,7 @@ static void intel_pmu_cpu_starting(int cpu) return; init_debug_store_on_cpu(cpu); + init_arch_pebs_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up, and that may * even boot with LBRs enabled. @@ -5515,6 +5522,7 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dying(int cpu) { fini_debug_store_on_cpu(cpu); + fini_arch_pebs_on_cpu(cpu); } void intel_cpuc_finish(struct cpu_hw_events *cpuc) @@ -5535,6 +5543,7 @@ static void intel_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + release_arch_pebs_buf_on_cpu(cpu); intel_cpuc_finish(cpuc); if (is_hybrid() && cpuc->pmu) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 43f9429da4e72..4f7a5bc733a07 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -603,13 +603,18 @@ static int alloc_pebs_buffer(int cpu) int max, node = cpu_to_node(cpu); void *buffer, *insn_buff, *cea; - if (!x86_pmu.ds_pebs) + if (!intel_pmu_has_pebs()) return 0; buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); if (unlikely(!buffer)) return -ENOMEM; + if (x86_pmu.arch_pebs) { + hwev->pebs_vaddr = buffer; + return 0; + } + /* * HSW+ already provides us the eventing ip; no need to allocate this * buffer then. @@ -622,7 +627,7 @@ static int alloc_pebs_buffer(int cpu) } per_cpu(insn_buffer, cpu) = insn_buff; } - hwev->ds_pebs_vaddr = buffer; + hwev->pebs_vaddr = buffer; /* Update the cpu entry area mapping */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; ds->pebs_buffer_base = (unsigned long) cea; @@ -638,17 +643,20 @@ static void release_pebs_buffer(int cpu) struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); void *cea; - if (!x86_pmu.ds_pebs) + if (!intel_pmu_has_pebs()) return; - kfree(per_cpu(insn_buffer, cpu)); - per_cpu(insn_buffer, cpu) = NULL; + if (x86_pmu.ds_pebs) { + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; - /* Clear the fixmap */ - cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; - ds_clear_cea(cea, x86_pmu.pebs_buffer_size); - dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); - hwev->ds_pebs_vaddr = NULL; + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); + } + + dsfree_pages(hwev->pebs_vaddr, x86_pmu.pebs_buffer_size); + hwev->pebs_vaddr = NULL; } static int alloc_bts_buffer(int cpu) @@ -801,6 +809,56 @@ void reserve_ds_buffers(void) } } +inline int alloc_arch_pebs_buf_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return 0; + + return alloc_pebs_buffer(cpu); +} + +inline void release_arch_pebs_buf_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return; + + release_pebs_buffer(cpu); +} + +void init_arch_pebs_on_cpu(int cpu) +{ + struct cpu_hw_events *cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + u64 arch_pebs_base; + + if (!x86_pmu.arch_pebs) + return; + + if (!cpuc->pebs_vaddr) { + WARN(1, "Fail to allocate PEBS buffer on CPU %d\n", cpu); + x86_pmu.pebs_active = 0; + return; + } + + /* + * 4KB-aligned pointer of the output buffer + * (__alloc_pages_node() return page aligned address) + * Buffer Size = 4KB * 2^SIZE + * contiguous physical buffer (__alloc_pages_node() with order) + */ + arch_pebs_base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT; + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)arch_pebs_base, + (u32)(arch_pebs_base >> 32)); + x86_pmu.pebs_active = 1; +} + +inline void fini_arch_pebs_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0); +} + /* * BTS */ @@ -2862,8 +2920,8 @@ static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs, return; } - base = cpuc->ds_pebs_vaddr; - top = (void *)((u64)cpuc->ds_pebs_vaddr + + base = cpuc->pebs_vaddr; + top = (void *)((u64)cpuc->pebs_vaddr + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT)); index.wr = 0; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 32c91b5b9f470..d76d30fa37e8d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -282,8 +282,9 @@ struct cpu_hw_events { * Intel DebugStore bits */ struct debug_store *ds; - void *ds_pebs_vaddr; void *ds_bts_vaddr; + /* DS based PEBS or arch-PEBS buffer address */ + void *pebs_vaddr; u64 pebs_enabled; int n_pebs; int n_large_pebs; @@ -1610,6 +1611,14 @@ extern void intel_cpuc_finish(struct cpu_hw_events *cpuc); int intel_pmu_init(void); +int alloc_arch_pebs_buf_on_cpu(int cpu); + +void release_arch_pebs_buf_on_cpu(int cpu); + +void init_arch_pebs_on_cpu(int cpu); + +void fini_arch_pebs_on_cpu(int cpu); + void init_debug_store_on_cpu(int cpu); void fini_debug_store_on_cpu(int cpu); diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 5dbeac48a5b93..023c2883f9f3e 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -4,7 +4,8 @@ #include #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SHIFT 4 +#define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT) /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS_FMT4 8 From 992a5a57bdf27d476212610d22fc0ba2e4221494 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:34 +0800 Subject: [PATCH 81/89] perf/x86/intel: Update dyn_constraint base on PEBS event precise level commit e89c5d1f290e8915e0aad10014f2241086ea95e4 upstream. arch-PEBS provides CPUIDs to enumerate which counters support PEBS sampling and precise distribution PEBS sampling. Thus PEBS constraints should be dynamically configured base on these counter and precise distribution bitmap instead of defining them statically. Update event dyn_constraint base on PEBS event precise level. Intel-SIG: commit e89c5d1f290e perf/x86/intel: Update dyn_constraint base on PEBS event precise level Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-11-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 11 +++++++++++ arch/x86/events/intel/ds.c | 1 + 2 files changed, 12 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dfa0fc73f9317..13f498849b874 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4240,6 +4240,8 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (event->attr.precise_ip) { + struct arch_pebs_cap pebs_cap = hybrid(event->pmu, arch_pebs_cap); + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) return -EINVAL; @@ -4253,6 +4255,15 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); + + if (x86_pmu.arch_pebs) { + u64 cntr_mask = hybrid(event->pmu, intel_ctrl) & + ~GLOBAL_CTRL_EN_PERF_METRICS; + u64 pebs_mask = event->attr.precise_ip >= 3 ? + pebs_cap.pdists : pebs_cap.counters; + if (cntr_mask != pebs_mask) + event->hw.dyn_constraint &= pebs_mask; + } } if (needs_branch_stack(event) && is_sampling_event(event)) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4f7a5bc733a07..d7897ebec56b2 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2984,6 +2984,7 @@ static void __init intel_arch_pebs_init(void) x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs; x86_pmu.pebs_capable = ~0ULL; + x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.pebs_enable = __intel_pmu_pebs_enable; x86_pmu.pebs_disable = __intel_pmu_pebs_disable; From caca09400ead611f6c4d771ea0bbe0072c305ac0 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:35 +0800 Subject: [PATCH 82/89] perf/x86/intel: Setup PEBS data configuration and enable legacy groups commit 52448a0a739002eca3d051a6ec314a0b178949a1 upstream. Different with legacy PEBS, arch-PEBS provides per-counter PEBS data configuration by programing MSR IA32_PMC_GPx/FXx_CFG_C MSRs. This patch obtains PEBS data configuration from event attribute and then writes the PEBS data configuration to MSR IA32_PMC_GPx/FXx_CFG_C and enable corresponding PEBS groups. Please notice this patch only enables XMM SIMD regs sampling for arch-PEBS, the other SIMD regs (OPMASK/YMM/ZMM) sampling on arch-PEBS would be supported after PMI based SIMD regs (OPMASK/YMM/ZMM) sampling is supported. Intel-SIG: commit 52448a0a7390 perf/x86/intel: Setup PEBS data configuration and enable legacy groups Co-developed-by: Kan Liang Signed-off-by: Kan Liang Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-12-dapeng1.mi@linux.intel.com [jz: use rdmsrl/wrmsrl instead of rdmsrq/wrmsrq] Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 136 ++++++++++++++++++++++++++++++- arch/x86/events/intel/ds.c | 17 ++++ arch/x86/events/perf_event.h | 4 + arch/x86/include/asm/intel_ds.h | 7 ++ arch/x86/include/asm/msr-index.h | 8 ++ 5 files changed, 171 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 13f498849b874..be7d47b7c9ef5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2552,6 +2552,45 @@ static void intel_pmu_disable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val &= ~mask; } +static inline void __intel_pmu_update_event_ext(int idx, u64 ext) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u32 msr; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr = MSR_IA32_PMC_V6_GP0_CFG_C + + x86_pmu.addr_offset(idx, false); + } else { + msr = MSR_IA32_PMC_V6_FX0_CFG_C + + x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false); + } + + cpuc->cfg_c_val[idx] = ext; + wrmsrl(msr, ext); +} + +static void intel_pmu_disable_event_ext(struct perf_event *event) +{ + if (!x86_pmu.arch_pebs) + return; + + /* + * Only clear CFG_C MSR for PEBS counter group events, + * it avoids the HW counter's value to be added into + * other PEBS records incorrectly after PEBS counter + * group events are disabled. + * + * For other events, it's unnecessary to clear CFG_C MSRs + * since CFG_C doesn't take effect if counter is in + * disabled state. That helps to reduce the WRMSR overhead + * in context switches. + */ + if (!is_pebs_counter_event_group(event)) + return; + + __intel_pmu_update_event_ext(event->hw.idx, 0); +} + static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -2560,9 +2599,12 @@ static void intel_pmu_disable_event(struct perf_event *event) switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_clear_masks(event, idx); + intel_pmu_disable_event_ext(event); x86_pmu_disable_event(event); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + intel_pmu_disable_event_ext(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_disable_fixed(event); break; @@ -2929,6 +2971,66 @@ static void intel_pmu_enable_acr(struct perf_event *event) DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); +static void intel_pmu_enable_event_ext(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + union arch_pebs_index old, new; + struct arch_pebs_cap cap; + u64 ext = 0; + + if (!x86_pmu.arch_pebs) + return; + + cap = hybrid(cpuc->pmu, arch_pebs_cap); + + if (event->attr.precise_ip) { + u64 pebs_data_cfg = intel_get_arch_pebs_data_config(event); + + ext |= ARCH_PEBS_EN; + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) + ext |= (-hwc->sample_period) & ARCH_PEBS_RELOAD; + + if (pebs_data_cfg && cap.caps) { + if (pebs_data_cfg & PEBS_DATACFG_MEMINFO) + ext |= ARCH_PEBS_AUX & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_GP) + ext |= ARCH_PEBS_GPR & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_XMMS) + ext |= ARCH_PEBS_VECR_XMM & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_LBRS) + ext |= ARCH_PEBS_LBR & cap.caps; + } + + if (cpuc->n_pebs == cpuc->n_large_pebs) + new.thresh = ARCH_PEBS_THRESH_MULTI; + else + new.thresh = ARCH_PEBS_THRESH_SINGLE; + + rdmsrl(MSR_IA32_PEBS_INDEX, old.whole); + if (new.thresh != old.thresh || !old.en) { + if (old.thresh == ARCH_PEBS_THRESH_MULTI && old.wr > 0) { + /* + * Large PEBS was enabled. + * Drain PEBS buffer before applying the single PEBS. + */ + intel_pmu_drain_pebs_buffer(); + } else { + new.wr = 0; + new.full = 0; + new.en = 1; + wrmsrl(MSR_IA32_PEBS_INDEX, new.whole); + } + } + } + + if (cpuc->cfg_c_val[hwc->idx] != ext) + __intel_pmu_update_event_ext(hwc->idx, ext); +} + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2944,10 +3046,12 @@ static void intel_pmu_enable_event(struct perf_event *event) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); static_call_cond(intel_pmu_enable_acr_event)(event); + intel_pmu_enable_event_ext(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: static_call_cond(intel_pmu_enable_acr_event)(event); + intel_pmu_enable_event_ext(event); fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); @@ -5263,6 +5367,30 @@ static inline bool intel_pmu_broken_perf_cap(void) return false; } +static inline void __intel_update_pmu_caps(struct pmu *pmu) +{ + struct pmu *dest_pmu = pmu ? pmu : x86_get_pmu(smp_processor_id()); + + if (hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_XMM) + dest_pmu->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; +} + +static inline void __intel_update_large_pebs_flags(struct pmu *pmu) +{ + u64 caps = hybrid(pmu, arch_pebs_cap).caps; + + x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; + if (caps & ARCH_PEBS_LBR) + x86_pmu.large_pebs_flags |= PERF_SAMPLE_BRANCH_STACK; + + if (!(caps & ARCH_PEBS_AUX)) + x86_pmu.large_pebs_flags &= ~PERF_SAMPLE_DATA_SRC; + if (!(caps & ARCH_PEBS_GPR)) { + x86_pmu.large_pebs_flags &= + ~(PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER); + } +} + #define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED)) static void update_pmu_cap(struct pmu *pmu) @@ -5311,8 +5439,12 @@ static void update_pmu_cap(struct pmu *pmu) hybrid(pmu, arch_pebs_cap).counters = pebs_mask; hybrid(pmu, arch_pebs_cap).pdists = pdists_mask; - if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) + if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) { x86_pmu.arch_pebs = 0; + } else { + __intel_update_pmu_caps(pmu); + __intel_update_large_pebs_flags(pmu); + } } else { WARN_ON(x86_pmu.arch_pebs == 1); x86_pmu.arch_pebs = 0; @@ -5474,6 +5606,8 @@ static void intel_pmu_cpu_starting(int cpu) } } + __intel_update_pmu_caps(cpuc->pmu); + if (!cpuc->shared_regs) return; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index d7897ebec56b2..dae34a22aac20 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1506,6 +1506,18 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, } } +u64 intel_get_arch_pebs_data_config(struct perf_event *event) +{ + u64 pebs_data_cfg = 0; + + if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX)) + return 0; + + pebs_data_cfg |= pebs_update_adaptive_cfg(event); + + return pebs_data_cfg; +} + void intel_pmu_pebs_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2926,6 +2938,11 @@ static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs, index.wr = 0; index.full = 0; + index.en = 1; + if (cpuc->n_pebs == cpuc->n_large_pebs) + index.thresh = ARCH_PEBS_THRESH_MULTI; + else + index.thresh = ARCH_PEBS_THRESH_SINGLE; wrmsrl(MSR_IA32_PEBS_INDEX, index.whole); mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index d76d30fa37e8d..569b9295571e9 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -303,6 +303,8 @@ struct cpu_hw_events { /* Intel ACR configuration */ u64 acr_cfg_b[X86_PMC_IDX_MAX]; u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* Cached CFG_C values */ + u64 cfg_c_val[X86_PMC_IDX_MAX]; /* * Intel LBR bits @@ -1773,6 +1775,8 @@ void intel_pmu_pebs_data_source_cmt(void); void intel_pmu_pebs_data_source_lnl(void); +u64 intel_get_arch_pebs_data_config(struct perf_event *event); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 023c2883f9f3e..695f87efbeb89 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -7,6 +7,13 @@ #define PEBS_BUFFER_SHIFT 4 #define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT) +/* + * The largest PEBS record could consume a page, ensure + * a record at least can be written after triggering PMI. + */ +#define ARCH_PEBS_THRESH_MULTI ((PEBS_BUFFER_SIZE - PAGE_SIZE) >> PEBS_BUFFER_SHIFT) +#define ARCH_PEBS_THRESH_SINGLE 1 + /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS_FMT4 8 #define MAX_PEBS_EVENTS 32 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1bfdd6ea8419f..f9caf35e0c002 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -304,6 +304,14 @@ #define ARCH_PEBS_OFFSET_MASK 0x7fffff #define ARCH_PEBS_INDEX_WR_SHIFT 4 +#define ARCH_PEBS_RELOAD 0xffffffff +#define ARCH_PEBS_LBR_SHIFT 40 +#define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT) +#define ARCH_PEBS_VECR_XMM BIT_ULL(49) +#define ARCH_PEBS_GPR BIT_ULL(61) +#define ARCH_PEBS_AUX BIT_ULL(62) +#define ARCH_PEBS_EN BIT_ULL(63) + #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) #define RTIT_CTL_CYCLEACC BIT(1) From 39baca84e1733abe18723c938ce91412a09c9d17 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 29 Oct 2025 18:21:36 +0800 Subject: [PATCH 83/89] perf/x86/intel: Add counter group support for arch-PEBS commit bb5f13df3c455110c4468a31a5b21954268108c9 upstream. Base on previous adaptive PEBS counter snapshot support, add counter group support for architectural PEBS. Since arch-PEBS shares same counter group layout with adaptive PEBS, directly reuse __setup_pebs_counter_group() helper to process arch-PEBS counter group. Intel-SIG: commit bb5f13df3c45 perf/x86/intel: Add counter group support for arch-PEBS Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251029102136.61364-13-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 38 ++++++++++++++++++++++++++++--- arch/x86/events/intel/ds.c | 29 ++++++++++++++++++++--- arch/x86/include/asm/msr-index.h | 6 +++++ arch/x86/include/asm/perf_event.h | 13 ++++++++--- 4 files changed, 77 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index be7d47b7c9ef5..be25e92466845 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3003,6 +3003,17 @@ static void intel_pmu_enable_event_ext(struct perf_event *event) if (pebs_data_cfg & PEBS_DATACFG_LBRS) ext |= ARCH_PEBS_LBR & cap.caps; + + if (pebs_data_cfg & + (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT)) + ext |= ARCH_PEBS_CNTR_GP & cap.caps; + + if (pebs_data_cfg & + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT)) + ext |= ARCH_PEBS_CNTR_FIXED & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_METRICS) + ext |= ARCH_PEBS_CNTR_METRICS & cap.caps; } if (cpuc->n_pebs == cpuc->n_large_pebs) @@ -3027,6 +3038,9 @@ static void intel_pmu_enable_event_ext(struct perf_event *event) } } + if (is_pebs_counter_event_group(event)) + ext |= ARCH_PEBS_CNTR_ALLOW; + if (cpuc->cfg_c_val[hwc->idx] != ext) __intel_pmu_update_event_ext(hwc->idx, ext); } @@ -4311,6 +4325,20 @@ static bool intel_pmu_is_acr_group(struct perf_event *event) return false; } +static inline bool intel_pmu_has_pebs_counter_group(struct pmu *pmu) +{ + u64 caps; + + if (x86_pmu.intel_cap.pebs_format >= 6 && x86_pmu.intel_cap.pebs_baseline) + return true; + + caps = hybrid(pmu, arch_pebs_cap).caps; + if (x86_pmu.arch_pebs && (caps & ARCH_PEBS_CNTR_MASK)) + return true; + + return false; +} + static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, u64 *cause_mask, int *num) { @@ -4455,8 +4483,7 @@ static int intel_pmu_hw_config(struct perf_event *event) } if ((event->attr.sample_type & PERF_SAMPLE_READ) && - (x86_pmu.intel_cap.pebs_format >= 6) && - x86_pmu.intel_cap.pebs_baseline && + intel_pmu_has_pebs_counter_group(event->pmu) && is_sampling_event(event) && event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; @@ -5382,6 +5409,8 @@ static inline void __intel_update_large_pebs_flags(struct pmu *pmu) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; if (caps & ARCH_PEBS_LBR) x86_pmu.large_pebs_flags |= PERF_SAMPLE_BRANCH_STACK; + if (caps & ARCH_PEBS_CNTR_MASK) + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; if (!(caps & ARCH_PEBS_AUX)) x86_pmu.large_pebs_flags &= ~PERF_SAMPLE_DATA_SRC; @@ -7057,8 +7086,11 @@ __init int intel_pmu_init(void) * Many features on and after V6 require dynamic constraint, * e.g., Arch PEBS, ACR. */ - if (version >= 6) + if (version >= 6) { x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; + x86_pmu.late_setup = intel_pmu_late_setup; + } + /* * Install the hw-cache-events table: */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index dae34a22aac20..231c3bda375fd 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1508,13 +1508,20 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, u64 intel_get_arch_pebs_data_config(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); u64 pebs_data_cfg = 0; + u64 cntr_mask; if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX)) return 0; pebs_data_cfg |= pebs_update_adaptive_cfg(event); + cntr_mask = (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT) | + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT) | + PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS; + pebs_data_cfg |= cpuc->pebs_data_cfg & cntr_mask; + return pebs_data_cfg; } @@ -2422,6 +2429,24 @@ static void setup_arch_pebs_sample_data(struct perf_event *event, } } + if (header->cntr) { + struct arch_pebs_cntr_header *cntr = next_record; + unsigned int nr; + + next_record += sizeof(struct arch_pebs_cntr_header); + + if (is_pebs_counter_event_group(event)) { + __setup_pebs_counter_group(cpuc, event, + (struct pebs_cntr_header *)cntr, next_record); + data->sample_flags |= PERF_SAMPLE_READ; + } + + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); + if (cntr->metrics == INTEL_CNTR_METRICS) + nr += 2; + next_record += nr * sizeof(u64); + } + /* Parse followed fragments if there are. */ if (arch_pebs_record_continued(header)) { at = at + header->size; @@ -3073,10 +3098,8 @@ static void __init intel_ds_pebs_init(void) break; case 6: - if (x86_pmu.intel_cap.pebs_baseline) { + if (x86_pmu.intel_cap.pebs_baseline) x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; - x86_pmu.late_setup = intel_pmu_late_setup; - } fallthrough; case 5: x86_pmu.pebs_ept = 1; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f9caf35e0c002..bdaab382d392a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -305,12 +305,18 @@ #define ARCH_PEBS_INDEX_WR_SHIFT 4 #define ARCH_PEBS_RELOAD 0xffffffff +#define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35) +#define ARCH_PEBS_CNTR_GP BIT_ULL(36) +#define ARCH_PEBS_CNTR_FIXED BIT_ULL(37) +#define ARCH_PEBS_CNTR_METRICS BIT_ULL(38) #define ARCH_PEBS_LBR_SHIFT 40 #define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT) #define ARCH_PEBS_VECR_XMM BIT_ULL(49) #define ARCH_PEBS_GPR BIT_ULL(61) #define ARCH_PEBS_AUX BIT_ULL(62) #define ARCH_PEBS_EN BIT_ULL(63) +#define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \ + ARCH_PEBS_CNTR_METRICS) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index b9adb35c80db5..6404acdea23af 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -141,16 +141,16 @@ #define ARCH_PERFMON_EVENTS_COUNT 7 #define PEBS_DATACFG_MEMINFO BIT_ULL(0) -#define PEBS_DATACFG_GP BIT_ULL(1) +#define PEBS_DATACFG_GP BIT_ULL(1) #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) -#define PEBS_DATACFG_LBR_SHIFT 24 #define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_METRICS BIT_ULL(5) +#define PEBS_DATACFG_LBR_SHIFT 24 #define PEBS_DATACFG_CNTR_SHIFT 32 #define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) #define PEBS_DATACFG_FIX_SHIFT 48 #define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) -#define PEBS_DATACFG_METRICS BIT_ULL(5) /* Steal the highest bit of pebs_data_cfg for SW usage */ #define PEBS_UPDATE_DS_SW BIT_ULL(63) @@ -603,6 +603,13 @@ struct arch_pebs_lbr_header { u64 ler_info; }; +struct arch_pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + /* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ From d42f315a7d41fb635ee9cc24de9a8d7cd69a24d2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 12 May 2025 10:55:42 -0700 Subject: [PATCH 84/89] perf/x86/intel: Add a check for dynamic constraints commit bd24f9beed591422f45fa6d8d0d3bd3a755b8a48 upstream. The current event scheduler has a limit. If the counter constraint of an event is not a subset of any other counter constraint with an equal or higher weight. The counters may not be fully utilized. To workaround it, the commit bc1738f6ee83 ("perf, x86: Fix event scheduler for constraints with overlapping counters") introduced an overlap flag, which is hardcoded to the event constraint that may trigger the limit. It only works for static constraints. Many features on and after Intel PMON v6 require dynamic constraints. An event constraint is decided by both static and dynamic constraints at runtime. See commit 4dfe3232cc04 ("perf/x86: Add dynamic constraint"). The dynamic constraints are from CPUID enumeration. It's impossible to hardcode it in advance. It's not practical to set the overlap flag to all events. It's harmful to the scheduler. For the existing Intel platforms, the dynamic constraints don't trigger the limit. A real fix is not required. However, for virtualization, VMM may give a weird CPUID enumeration to a guest. It's impossible to indicate what the weird enumeration is. A check is introduced, which can list the possible breaks if a weird enumeration is used. Check the dynamic constraints enumerated for normal, branch counters logging, and auto-counter reload. Check both PEBS and non-PEBS constratins. Intel-SIG: commit bd24f9beed59 perf/x86/intel: Add a check for dynamic constraints Closes: https://lore.kernel.org/lkml/20250416195610.GC38216@noisy.programming.kicks-ass.net/ Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20250512175542.2000708-1-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 156 +++++++++++++++++++++++++++++++++-- 1 file changed, 148 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index be25e92466845..3d46b6bb38225 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5382,6 +5382,151 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con u64 fixed_cntr_mask, u64 intel_ctrl); +enum dyn_constr_type { + DYN_CONSTR_NONE, + DYN_CONSTR_BR_CNTR, + DYN_CONSTR_ACR_CNTR, + DYN_CONSTR_ACR_CAUSE, + + DYN_CONSTR_MAX, +}; + +static const char * const dyn_constr_type_name[] = { + [DYN_CONSTR_NONE] = "a normal event", + [DYN_CONSTR_BR_CNTR] = "a branch counter logging event", + [DYN_CONSTR_ACR_CNTR] = "an auto-counter reload event", + [DYN_CONSTR_ACR_CAUSE] = "an auto-counter reload cause event", +}; + +static void __intel_pmu_check_dyn_constr(struct event_constraint *constr, + enum dyn_constr_type type, u64 mask) +{ + struct event_constraint *c1, *c2; + int new_weight, check_weight; + u64 new_mask, check_mask; + + for_each_event_constraint(c1, constr) { + new_mask = c1->idxmsk64 & mask; + new_weight = hweight64(new_mask); + + /* ignore topdown perf metrics event */ + if (c1->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) + continue; + + if (!new_weight && fls64(c1->idxmsk64) < INTEL_PMC_IDX_FIXED) { + pr_info("The event 0x%llx is not supported as %s.\n", + c1->code, dyn_constr_type_name[type]); + } + + if (new_weight <= 1) + continue; + + for_each_event_constraint(c2, c1 + 1) { + bool check_fail = false; + + check_mask = c2->idxmsk64 & mask; + check_weight = hweight64(check_mask); + + if (c2->idxmsk64 & INTEL_PMC_MSK_TOPDOWN || + !check_weight) + continue; + + /* The same constraints or no overlap */ + if (new_mask == check_mask || + (new_mask ^ check_mask) == (new_mask | check_mask)) + continue; + + /* + * A scheduler issue may be triggered in the following cases. + * - Two overlap constraints have the same weight. + * E.g., A constraints: 0x3, B constraints: 0x6 + * event counter failure case + * B PMC[2:1] 1 + * A PMC[1:0] 0 + * A PMC[1:0] FAIL + * - Two overlap constraints have different weight. + * The constraint has a low weight, but has high last bit. + * E.g., A constraints: 0x7, B constraints: 0xC + * event counter failure case + * B PMC[3:2] 2 + * A PMC[2:0] 0 + * A PMC[2:0] 1 + * A PMC[2:0] FAIL + */ + if (new_weight == check_weight) { + check_fail = true; + } else if (new_weight < check_weight) { + if ((new_mask | check_mask) != check_mask && + fls64(new_mask) > fls64(check_mask)) + check_fail = true; + } else { + if ((new_mask | check_mask) != new_mask && + fls64(new_mask) < fls64(check_mask)) + check_fail = true; + } + + if (check_fail) { + pr_info("The two events 0x%llx and 0x%llx may not be " + "fully scheduled under some circumstances as " + "%s.\n", + c1->code, c2->code, dyn_constr_type_name[type]); + } + } + } +} + +static void intel_pmu_check_dyn_constr(struct pmu *pmu, + struct event_constraint *constr, + u64 cntr_mask) +{ + enum dyn_constr_type i; + u64 mask; + + for (i = DYN_CONSTR_NONE; i < DYN_CONSTR_MAX; i++) { + mask = 0; + switch (i) { + case DYN_CONSTR_NONE: + mask = cntr_mask; + break; + case DYN_CONSTR_BR_CNTR: + if (x86_pmu.flags & PMU_FL_BR_CNTR) + mask = x86_pmu.lbr_counters; + break; + case DYN_CONSTR_ACR_CNTR: + mask = hybrid(pmu, acr_cntr_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + break; + case DYN_CONSTR_ACR_CAUSE: + if (hybrid(pmu, acr_cntr_mask64) == hybrid(pmu, acr_cause_mask64)) + continue; + mask = hybrid(pmu, acr_cause_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + break; + default: + pr_warn("Unsupported dynamic constraint type %d\n", i); + } + + if (mask) + __intel_pmu_check_dyn_constr(constr, i, mask); + } +} + +static void intel_pmu_check_event_constraints_all(struct pmu *pmu) +{ + struct event_constraint *event_constraints = hybrid(pmu, event_constraints); + struct event_constraint *pebs_constraints = hybrid(pmu, pebs_constraints); + u64 cntr_mask = hybrid(pmu, cntr_mask64); + u64 fixed_cntr_mask = hybrid(pmu, fixed_cntr_mask64); + u64 intel_ctrl = hybrid(pmu, intel_ctrl); + + intel_pmu_check_event_constraints(event_constraints, cntr_mask, + fixed_cntr_mask, intel_ctrl); + + if (event_constraints) + intel_pmu_check_dyn_constr(pmu, event_constraints, cntr_mask); + + if (pebs_constraints) + intel_pmu_check_dyn_constr(pmu, pebs_constraints, cntr_mask); +} + static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs); static inline bool intel_pmu_broken_perf_cap(void) @@ -5504,10 +5649,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT; - intel_pmu_check_event_constraints(pmu->event_constraints, - pmu->cntr_mask64, - pmu->fixed_cntr_mask64, - pmu->intel_ctrl); + intel_pmu_check_event_constraints_all(&pmu->pmu); intel_pmu_check_extra_regs(pmu->extra_regs); } @@ -7847,10 +7989,8 @@ __init int intel_pmu_init(void) if (x86_pmu.intel_cap.anythread_deprecated) x86_pmu.format_attrs = intel_arch_formats_attr; - intel_pmu_check_event_constraints(x86_pmu.event_constraints, - x86_pmu.cntr_mask64, - x86_pmu.fixed_cntr_mask64, - x86_pmu.intel_ctrl); + intel_pmu_check_event_constraints_all(NULL); + /* * Access LBR MSR may cause #GP under certain circumstances. * Check all LBR MSR here. From de5ea71204211628279490b39979cc38d53840c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Nov 2025 14:50:16 +0100 Subject: [PATCH 85/89] perf/x86/intel: Check PEBS dyn_constraints commit 02da693f6658b9f73b97fce3695358ef3f13d0d1 upstream. Handle the interaction between ("perf/x86/intel: Update dyn_constraint base on PEBS event precise level") and ("perf/x86/intel: Add a check for dynamic constraints"). Intel-SIG: commit 02da693f6658 perf/x86/intel: Check PEBS dyn_constraints Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3d46b6bb38225..0723301bd0e6c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5387,6 +5387,8 @@ enum dyn_constr_type { DYN_CONSTR_BR_CNTR, DYN_CONSTR_ACR_CNTR, DYN_CONSTR_ACR_CAUSE, + DYN_CONSTR_PEBS, + DYN_CONSTR_PDIST, DYN_CONSTR_MAX, }; @@ -5396,6 +5398,8 @@ static const char * const dyn_constr_type_name[] = { [DYN_CONSTR_BR_CNTR] = "a branch counter logging event", [DYN_CONSTR_ACR_CNTR] = "an auto-counter reload event", [DYN_CONSTR_ACR_CAUSE] = "an auto-counter reload cause event", + [DYN_CONSTR_PEBS] = "a PEBS event", + [DYN_CONSTR_PDIST] = "a PEBS PDIST event", }; static void __intel_pmu_check_dyn_constr(struct event_constraint *constr, @@ -5500,6 +5504,14 @@ static void intel_pmu_check_dyn_constr(struct pmu *pmu, continue; mask = hybrid(pmu, acr_cause_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); break; + case DYN_CONSTR_PEBS: + if (x86_pmu.arch_pebs) + mask = hybrid(pmu, arch_pebs_cap).counters; + break; + case DYN_CONSTR_PDIST: + if (x86_pmu.arch_pebs) + mask = hybrid(pmu, arch_pebs_cap).pdists; + break; default: pr_warn("Unsupported dynamic constraint type %d\n", i); } From 8c936a2767b82b520d599e05b6a9ca06ac7bda51 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Nov 2025 14:50:20 +0100 Subject: [PATCH 86/89] perf/x86/intel: Optimize PEBS extended config commit 2093d8cf80fa5552d1025a78a8f3a10bf3b6466e upstream. Similar to enable_acr_event, avoid the branch. Intel-SIG: commit 2093d8cf80fa perf/x86/intel: Optimize PEBS extended config Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0723301bd0e6c..9fd6bf8c7e9b0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2571,9 +2571,6 @@ static inline void __intel_pmu_update_event_ext(int idx, u64 ext) static void intel_pmu_disable_event_ext(struct perf_event *event) { - if (!x86_pmu.arch_pebs) - return; - /* * Only clear CFG_C MSR for PEBS counter group events, * it avoids the HW counter's value to be added into @@ -2591,6 +2588,8 @@ static void intel_pmu_disable_event_ext(struct perf_event *event) __intel_pmu_update_event_ext(event->hw.idx, 0); } +DEFINE_STATIC_CALL_NULL(intel_pmu_disable_event_ext, intel_pmu_disable_event_ext); + static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -2599,11 +2598,11 @@ static void intel_pmu_disable_event(struct perf_event *event) switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_clear_masks(event, idx); - intel_pmu_disable_event_ext(event); + static_call_cond(intel_pmu_disable_event_ext)(event); x86_pmu_disable_event(event); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: - intel_pmu_disable_event_ext(event); + static_call_cond(intel_pmu_disable_event_ext)(event); fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_disable_fixed(event); @@ -2979,9 +2978,6 @@ static void intel_pmu_enable_event_ext(struct perf_event *event) struct arch_pebs_cap cap; u64 ext = 0; - if (!x86_pmu.arch_pebs) - return; - cap = hybrid(cpuc->pmu, arch_pebs_cap); if (event->attr.precise_ip) { @@ -3045,6 +3041,8 @@ static void intel_pmu_enable_event_ext(struct perf_event *event) __intel_pmu_update_event_ext(hwc->idx, ext); } +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_event_ext, intel_pmu_enable_event_ext); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -3060,12 +3058,12 @@ static void intel_pmu_enable_event(struct perf_event *event) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); static_call_cond(intel_pmu_enable_acr_event)(event); - intel_pmu_enable_event_ext(event); + static_call_cond(intel_pmu_enable_event_ext)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: static_call_cond(intel_pmu_enable_acr_event)(event); - intel_pmu_enable_event_ext(event); + static_call_cond(intel_pmu_enable_event_ext)(event); fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); @@ -7990,8 +7988,13 @@ __init int intel_pmu_init(void) if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) update_pmu_cap(NULL); - if (x86_pmu.arch_pebs) + if (x86_pmu.arch_pebs) { + static_call_update(intel_pmu_disable_event_ext, + intel_pmu_disable_event_ext); + static_call_update(intel_pmu_enable_event_ext, + intel_pmu_enable_event_ext); pr_cont("Architectural PEBS, "); + } intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, &x86_pmu.fixed_cntr_mask64, From 147220db2e768730d4640709c3a5e46e070bb29a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2025 10:40:26 +0100 Subject: [PATCH 87/89] perf/x86/intel: Fix and clean up intel_pmu_drain_arch_pebs() type use commit 9929dffce5ed7e2988e0274f4db98035508b16d9 upstream. The following commit introduced a build failure on x86-32: 21954c8a0ff ("perf/x86/intel: Process arch-PEBS records or record fragments") ... arch/x86/events/intel/ds.c:2983:24: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] The forced type conversion to 'u64' and 'void *' are not 32-bit clean, but they are also entirely unnecessary: ->pebs_vaddr is 'void *' already, and integer-compatible pointer arithmetics will work just fine on it. Fix & simplify the code. Intel-SIG: commit 9929dffce5ed perf/x86/intel: Fix and clean up intel_pmu_drain_arch_pebs() type use Reported-by: Stephen Rothwell Fixes: d21954c8a0ff ("perf/x86/intel: Process arch-PEBS records or record fragments") Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Acked-by: Dapeng Mi Cc: Kan Liang Link: https://patch.msgid.link/20251029102136.61364-10-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 231c3bda375fd..c26dd42751114 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2958,8 +2958,7 @@ static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs, } base = cpuc->pebs_vaddr; - top = (void *)((u64)cpuc->pebs_vaddr + - (index.wr << ARCH_PEBS_INDEX_WR_SHIFT)); + top = cpuc->pebs_vaddr + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT); index.wr = 0; index.full = 0; From ded4acf34f1557ddd89a505d0b5ea8a72a1f9fe2 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Fri, 12 Dec 2025 16:49:43 +0800 Subject: [PATCH 88/89] perf/x86/intel: Fix NULL event dereference crash in handle_pmi_common() commit 9415f749d34b926b9e4853da1462f4d941f89a0d upstream. handle_pmi_common() may observe an active bit set in cpuc->active_mask while the corresponding cpuc->events[] entry has already been cleared, which leads to a NULL pointer dereference. This can happen when interrupt throttling stops all events in a group while PEBS processing is still in progress. perf_event_overflow() can trigger perf_event_throttle_group(), which stops the group and clears the cpuc->events[] entry, but the active bit may still be set when handle_pmi_common() iterates over the events. The following recent fix: 7e772a93eb61 ("perf/x86: Fix NULL event access and potential PEBS record loss") moved the cpuc->events[] clearing from x86_pmu_stop() to x86_pmu_del() and relied on cpuc->active_mask/pebs_enabled checks. However, handle_pmi_common() can still encounter a NULL cpuc->events[] entry despite the active bit being set. Add an explicit NULL check on the event pointer before using it, to cover this legitimate scenario and avoid the NULL dereference crash. Intel-SIG: commit 9415f749d34b perf/x86/intel: Fix NULL event dereference crash in handle_pmi_common() Fixes: 7e772a93eb61 ("perf/x86: Fix NULL event access and potential PEBS record loss") Reported-by: kitta Co-developed-by: kitta Signed-off-by: Evan Li Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251212084943.2124787-1-evan.li@linux.alibaba.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220855 Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9fd6bf8c7e9b0..d143928c5c4ef 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3367,6 +3367,9 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) if (!test_bit(bit, cpuc->active_mask)) continue; + /* Event may have already been cleared: */ + if (!event) + continue; /* * There may be unprocessed PEBS records in the PEBS buffer, From ea6ad52f0c52e3fd647f74bd8b9bf7c926a6fe48 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 22 Dec 2025 14:57:12 -0800 Subject: [PATCH 89/89] tools headers: Sync x86 headers with kernel sources commit 369e91bd201d15a711f952ee9ac253a8b91628a3 upstream. To pick up changes from: 54de197c9a5e8f52 ("Merge tag 'x86_sgx_for_6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip") 679fcce0028bf101 ("Merge tag 'kvm-x86-svm-6.19' of https://github.com/kvm-x86/linux into HEAD") 3767def18f4cc394 ("x86/cpufeatures: Add support for L3 Smart Data Cache Injection Allocation Enforcement") f6106d41ec84e552 ("x86/bugs: Use an x86 feature to track the MMIO Stale Data mitigation") 7baadd463e147fdc ("x86/cpufeatures: Enumerate the LASS feature bits") 47955b58cf9b97fe ("x86/cpufeatures: Correct LKGS feature flag description") 5d0316e25defee47 ("x86/cpufeatures: Add X86_FEATURE_X2AVIC_EXT") 6ffdb49101f02313 ("x86/cpufeatures: Add X86_FEATURE_SGX_EUPDATESVN feature flag") 4793f990ea152330 ("KVM: x86: Advertise EferLmsleUnsupported to userspace") bb5f13df3c455110 ("perf/x86/intel: Add counter group support for arch-PEBS") 52448a0a739002ec ("perf/x86/intel: Setup PEBS data configuration and enable legacy groups") d21954c8a0ffbc94 ("perf/x86/intel: Process arch-PEBS records or record fragments") bffeb2fd0b9c99d8 ("x86/microcode/intel: Enable staging when available") 740144bc6bde9d44 ("x86/microcode/intel: Establish staging control logic") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Please see tools/include/uapi/README. Intel-SIG: commit 369e91bd201d tools headers: Sync x86 headers with kernel sources Cc: x86@kernel.org Signed-off-by: Namhyung Kim [jz: only following 3 relevant commits are sync'ed: bb5f13df3c455110 ("perf/x86/intel: Add counter group support for arch-PEBS") 52448a0a739002ec ("perf/x86/intel: Setup PEBS data configuration and enable legacy groups") d21954c8a0ffbc94 ("perf/x86/intel: Process arch-PEBS records or record fragments")] Signed-off-by: Jason Zeng --- tools/arch/x86/include/asm/msr-index.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 27e7c7ffc565e..da9b580261e84 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -267,6 +267,26 @@ PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ PERF_CAP_PEBS_TIMING_INFO) +/* Arch PEBS */ +#define MSR_IA32_PEBS_BASE 0x000003f4 +#define MSR_IA32_PEBS_INDEX 0x000003f5 +#define ARCH_PEBS_OFFSET_MASK 0x7fffff +#define ARCH_PEBS_INDEX_WR_SHIFT 4 + +#define ARCH_PEBS_RELOAD 0xffffffff +#define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35) +#define ARCH_PEBS_CNTR_GP BIT_ULL(36) +#define ARCH_PEBS_CNTR_FIXED BIT_ULL(37) +#define ARCH_PEBS_CNTR_METRICS BIT_ULL(38) +#define ARCH_PEBS_LBR_SHIFT 40 +#define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT) +#define ARCH_PEBS_VECR_XMM BIT_ULL(49) +#define ARCH_PEBS_GPR BIT_ULL(61) +#define ARCH_PEBS_AUX BIT_ULL(62) +#define ARCH_PEBS_EN BIT_ULL(63) +#define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \ + ARCH_PEBS_CNTR_METRICS) + #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) #define RTIT_CTL_CYCLEACC BIT(1)