From 620a60323b08e99b67add42f91a9841af944dc04 Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Wed, 11 Oct 2023 08:48:42 +0200 Subject: [PATCH 01/72] x86/resctrl: Fix remaining kernel-doc warnings commit f05fd4ce99635975caa3e6a0eeb02118637f72a3 upstream. The kernel test robot reported kernel-doc warnings here: arch/x86/kernel/cpu/resctrl/rdtgroup.c:915: warning: Function parameter or member 'of' not described in 'rdt_bit_usage_show' arch/x86/kernel/cpu/resctrl/rdtgroup.c:915: warning: Function parameter or member 'seq' not described in 'rdt_bit_usage_show' arch/x86/kernel/cpu/resctrl/rdtgroup.c:915: warning: Function parameter or member 'v' not described in 'rdt_bit_usage_show' arch/x86/kernel/cpu/resctrl/rdtgroup.c:1144: warning: Function parameter or member 'type' not described in '__rdtgroup_cbm_overlaps' arch/x86/kernel/cpu/resctrl/rdtgroup.c:1224: warning: Function parameter or member 'rdtgrp' not described in 'rdtgroup_mode_test_exclusive' arch/x86/kernel/cpu/resctrl/rdtgroup.c:1261: warning: Function parameter or member 'of' not described in 'rdtgroup_mode_write' arch/x86/kernel/cpu/resctrl/rdtgroup.c:1261: warning: Function parameter or member 'buf' not described in 'rdtgroup_mode_write' arch/x86/kernel/cpu/resctrl/rdtgroup.c:1261: warning: Function parameter or member 'nbytes' not described in 'rdtgroup_mode_write' arch/x86/kernel/cpu/resctrl/rdtgroup.c:1261: warning: Function parameter or member 'off' not described in 'rdtgroup_mode_write' arch/x86/kernel/cpu/resctrl/rdtgroup.c:1370: warning: Function parameter or member 'of' not described in 'rdtgroup_size_show' arch/x86/kernel/cpu/resctrl/rdtgroup.c:1370: warning: Function parameter or member 's' not described in 'rdtgroup_size_show' arch/x86/kernel/cpu/resctrl/rdtgroup.c:1370: warning: Function parameter or member 'v' not described in 'rdtgroup_size_show' Two of these functions (__rdtgroup_cbm_overlaps() and rdtgroup_mode_test_exclusive()) are missing an argument description, while the other three are file callbacks and don't require a kernel-doc 
comment. Intel-SIG: commit f05fd4ce9963 x86/resctrl: Fix remaining kernel-doc warnings. Incremental backporting patches for Intel RDT on Intel Xeon platform. Closes: https://lore.kernel.org/oe-kbuild-all/202310070434.mD8eRNAz-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Maciej Wieczor-Retman Signed-off-by: Ingo Molnar Cc: Peter Newman Cc: Borislav Petkov Cc: Reinette Chatre Link: https://lore.kernel.org/r/20231011064843.246592-1-maciej.wieczor-retman@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 1c0f00cd212da..f4e25fbdb1ab1 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -895,7 +895,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, return 0; } -/** +/* * rdt_bit_usage_show - Display current usage of resources * * A domain is a shared resource that can now be allocated differently. Here @@ -1134,6 +1134,7 @@ static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. * @closid: Intended closid for @cbm. + * @type: CDP type of @r. * @exclusive: Only check if overlaps with exclusive resource groups * * Checks if provided @cbm intended to be used for @closid on domain @@ -1220,6 +1221,7 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, /** * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * @rdtgrp: Resource group identified through its closid. * * An exclusive resource group implies that there should be no sharing of * its allocated resources. 
At the time this group is considered to be @@ -1262,9 +1264,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) return true; } -/** +/* * rdtgroup_mode_write - Modify the resource group's mode - * */ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) @@ -1368,12 +1369,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, return size; } -/** +/* * rdtgroup_size_show - Display size in bytes of allocated regions * * The "size" file mirrors the layout of the "schemata" file, printing the * size in bytes of each region instead of the capacity bitmask. - * */ static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) From bf51bba3a58f1f8a4633a5d25afda78cbd58ed35 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 16 Oct 2023 19:23:00 -0500 Subject: [PATCH 02/72] x86/resctrl: Add multiple tasks to the resctrl group at once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit fe2a20ea0b0953189e57740debc7dcc789d1ea55 upstream. The resctrl task assignment for monitor or control group needs to be done one at a time. For example: $mount -t resctrl resctrl /sys/fs/resctrl/ $mkdir /sys/fs/resctrl/ctrl_grp1 $echo 123 > /sys/fs/resctrl/ctrl_grp1/tasks $echo 456 > /sys/fs/resctrl/ctrl_grp1/tasks $echo 789 > /sys/fs/resctrl/ctrl_grp1/tasks This is not user-friendly when dealing with hundreds of tasks. Support multiple task assignment in one command with tasks ids separated by commas. For example: $echo 123,456,789 > /sys/fs/resctrl/ctrl_grp1/tasks Intel-SIG: commit fe2a20ea0b09 x86/resctrl: Add multiple tasks to the resctrl group at once. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Peter Newman Reviewed-by: Tan Shaopeng Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Ilpo Järvinen Tested-by: Peter Newman Tested-by: Tan Shaopeng Link: https://lore.kernel.org/r/20231017002308.134480-2-babu.moger@amd.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- Documentation/arch/x86/resctrl.rst | 9 ++++++++- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 25 ++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index 4c6421e2aa31c..178ab1d8f7475 100644 --- a/Documentation/arch/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst @@ -306,7 +306,14 @@ All groups contain the following files: "tasks": Reading this file shows the list of all tasks that belong to this group. Writing a task id to the file will add a task to the - group. If the group is a CTRL_MON group the task is removed from + group. Multiple tasks can be added by separating the task ids + with commas. Tasks will be assigned sequentially. Multiple + failures are not supported. A single failure encountered while + attempting to assign a task will cause the operation to abort and + already added tasks before the failure will remain in the group. + Failures will be logged to /sys/fs/resctrl/info/last_cmd_status. + + If the group is a CTRL_MON group the task is removed from whichever previous CTRL_MON group owned the task and also from any MON group that owned the task. 
If the group is a MON group, then the task must already belong to the CTRL_MON parent of this diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f4e25fbdb1ab1..161b63c86328f 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -696,11 +696,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + char *pid_str; int ret = 0; pid_t pid; - if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); @@ -715,7 +714,27 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, goto unlock; } - ret = rdtgroup_move_task(pid, rdtgrp, of); + while (buf && buf[0] != '\0' && buf[0] != '\n') { + pid_str = strim(strsep(&buf, ",")); + + if (kstrtoint(pid_str, 0, &pid)) { + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); + ret = -EINVAL; + break; + } + + if (pid < 0) { + rdt_last_cmd_printf("Invalid pid %d\n", pid); + ret = -EINVAL; + break; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + if (ret) { + rdt_last_cmd_printf("Error while processing task %d\n", pid); + break; + } + } unlock: rdtgroup_kn_unlock(of->kn); From 298d1e6e5df92f070941679b3c415dda82aafdb8 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Tue, 3 Oct 2023 18:54:22 -0500 Subject: [PATCH 03/72] x86/resctrl: Simplify rftype flag definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6846dc1a31d1894a7acf52d8442fe73b34091022 upstream. The rftype flags are bitmaps used for adding files under the resctrl filesystem. Some of these bitmap defines have one extra level of indirection which is not necessary. Drop the RF_* defines and simplify the macros. [ bp: Massage commit message. ] Intel-SIG: commit 6846dc1a31d1 x86/resctrl: Simplify rftype flag definitions. 
Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Peter Newman Reviewed-by: Tan Shaopeng Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Ilpo Järvinen Tested-by: Peter Newman Tested-by: Tan Shaopeng Link: https://lore.kernel.org/r/20231017002308.134480-3-babu.moger@amd.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/internal.h | 9 +++------ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 6 +++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index ca86a96e80c27..5959026075c99 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -242,12 +242,9 @@ struct rdtgroup { */ #define RFTYPE_INFO BIT(0) #define RFTYPE_BASE BIT(1) -#define RF_CTRLSHIFT 4 -#define RF_MONSHIFT 5 -#define RF_TOPSHIFT 6 -#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) -#define RFTYPE_MON BIT(RF_MONSHIFT) -#define RFTYPE_TOP BIT(RF_TOPSHIFT) +#define RFTYPE_CTRL BIT(4) +#define RFTYPE_MON BIT(5) +#define RFTYPE_TOP BIT(6) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) #define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 161b63c86328f..784d37c90c235 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3262,7 +3262,11 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + if (rtype == RDTCTRL_GROUP) + files = RFTYPE_BASE | RFTYPE_CTRL; + else + files = RFTYPE_BASE | RFTYPE_MON; + ret = rdtgroup_add_files(kn, files); if (ret) { rdt_last_cmd_puts("kernfs fill error\n"); From d649b84bc9b62e72fa2882f3bc0035799d4f5695 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 16 Oct 2023 
19:23:02 -0500 Subject: [PATCH 04/72] x86/resctrl: Rename rftype flags for consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d41592435cde9a658a1bd3b3fdfeb8db7b330d78 upstream. resctrl associates rftype flags with its files so that files can be chosen based on the resource, whether it is info or base, and if it is control or monitor type file. These flags use the RF_ as well as RFTYPE_ prefixes. Change the prefix to RFTYPE_ for all these flags to be consistent. Intel-SIG: commit d41592435cde x86/resctrl: Rename rftype flags for consistency. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Peter Newman Reviewed-by: Tan Shaopeng Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Ilpo Järvinen Tested-by: Peter Newman Tested-by: Tan Shaopeng Link: https://lore.kernel.org/r/20231017002308.134480-4-babu.moger@amd.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/internal.h | 10 +++--- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 44 +++++++++++++------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5959026075c99..2c310fe7f1d6d 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -247,10 +247,10 @@ struct rdtgroup { #define RFTYPE_TOP BIT(6) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) -#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) +#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) /* 
List of all resource groups */ extern struct list_head rdt_all_groups; @@ -266,7 +266,7 @@ void __exit rdtgroup_exit(void); * @mode: Access mode * @kf_ops: File operations * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RF_* or RFTYPE_* flags + * @fflags: File specific RFTYPE_* flags * @seq_show: Show content of the file * @write: Write to the file */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 784d37c90c235..933b1b13eb179 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1718,77 +1718,77 @@ static struct rftype res_common_files[] = { .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_last_cmd_status_show, - .fflags = RF_TOP_INFO, + .fflags = RFTYPE_TOP_INFO, }, { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, - .fflags = RF_CTRL_INFO, + .fflags = RFTYPE_CTRL_INFO, }, { .name = "mon_features", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_mon_features_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "num_rmids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_rmids_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_shareable_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "bit_usage", .mode = 0444, .kf_ops = 
&rdtgroup_kf_single_ops, .seq_show = rdt_bit_usage_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, /* * Platform specific which (if any) capabilities are provided by @@ -1807,7 +1807,7 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = max_threshold_occ_write, .seq_show = max_threshold_occ_show, - .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, }, { .name = "mbm_total_bytes_config", @@ -1854,7 +1854,7 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_schemata_write, .seq_show = rdtgroup_schemata_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "mode", @@ -1862,21 +1862,21 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_mode_write, .seq_show = rdtgroup_mode_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "size", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdtgroup_size_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "sparse_masks", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_has_sparse_bitmasks_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, }; @@ -1933,7 +1933,7 @@ void __init 
thread_throttle_mode_init(void) if (!rft) return; - rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; + rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB; } void __init mbm_config_rftype_init(const char *config) @@ -1942,7 +1942,7 @@ void __init mbm_config_rftype_init(const char *config) rft = rdtgroup_get_rftype_by_name(config); if (rft) - rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE; + rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE; } /** @@ -2077,21 +2077,21 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (IS_ERR(kn_info)) return PTR_ERR(kn_info); - ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); if (ret) goto out_destroy; /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - fflags = r->fflags | RF_CTRL_INFO; + fflags = r->fflags | RFTYPE_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; } for_each_mon_capable_rdt_resource(r) { - fflags = r->fflags | RF_MON_INFO; + fflags = r->fflags | RFTYPE_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) @@ -3729,7 +3729,7 @@ static int __init rdtgroup_setup_root(void) list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE); + ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RFTYPE_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; From 258532cbf232471837e845f8edc7090d7c4f2d2a Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 16 Oct 2023 19:23:03 -0500 Subject: [PATCH 05/72] x86/resctrl: Unwind properly from rdt_enable_ctx() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit df5f3a1dd8a6d3ddb1f07a10817f735194717422 upstream. rdt_enable_ctx() enables the features provided during resctrl mount. 
Additions to rdt_enable_ctx() are required to also modify error paths of rdt_enable_ctx() callers to ensure correct unwinding if errors are encountered after calling rdt_enable_ctx(). This is error prone. Introduce rdt_disable_ctx() to refactor the error unwinding of rdt_enable_ctx() to simplify future additions. This also simplifies cleanup in rdt_kill_sb(). Intel-SIG: commit df5f3a1dd8a6 x86/resctrl: Unwind properly from rdt_enable_ctx(). Incremental backporting patches for Intel RDT on Intel Xeon platform. Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Peter Newman Reviewed-by: Tan Shaopeng Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Ilpo Järvinen Tested-by: Peter Newman Tested-by: Tan Shaopeng Link: https://lore.kernel.org/r/20231017002308.134480-5-babu.moger@amd.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 53 ++++++++++++++++---------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 933b1b13eb179..6fc9739658ed5 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2310,14 +2310,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -static void cdp_disable_all(void) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -} - /* * We don't allow rdtgroup directories to be created anywhere * except the root directory. 
Thus when looking for the rdtgroup @@ -2397,19 +2389,42 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, struct kernfs_node **mon_data_kn); +static void rdt_disable_ctx(void) +{ + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); + set_mba_sc(false); +} + static int rdt_enable_ctx(struct rdt_fs_context *ctx) { int ret = 0; - if (ctx->enable_cdpl2) + if (ctx->enable_cdpl2) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); + if (ret) + goto out_done; + } - if (!ret && ctx->enable_cdpl3) + if (ctx->enable_cdpl3) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); + if (ret) + goto out_cdpl2; + } - if (!ret && ctx->enable_mba_mbps) + if (ctx->enable_mba_mbps) { ret = set_mba_sc(true); + if (ret) + goto out_cdpl3; + } + + return 0; +out_cdpl3: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); +out_cdpl2: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); +out_done: return ret; } @@ -2517,13 +2532,13 @@ static int rdt_get_tree(struct fs_context *fc) } ret = rdt_enable_ctx(ctx); - if (ret < 0) - goto out_cdp; + if (ret) + goto out; ret = schemata_list_create(); if (ret) { schemata_list_destroy(); - goto out_mba; + goto out_ctx; } closid_init(); @@ -2582,11 +2597,8 @@ static int rdt_get_tree(struct fs_context *fc) kernfs_remove(kn_info); out_schemata_free: schemata_list_destroy(); -out_mba: - if (ctx->enable_mba_mbps) - set_mba_sc(false); -out_cdp: - cdp_disable_all(); +out_ctx: + rdt_disable_ctx(); out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); @@ -2818,12 +2830,11 @@ static void rdt_kill_sb(struct super_block *sb) cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - set_mba_sc(false); + rdt_disable_ctx(); /*Put everything back to default values. 
*/ for_each_alloc_capable_rdt_resource(r) reset_all_ctrls(r); - cdp_disable_all(); rmdir_all_sub(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; From ece9c5ac4cc90bd12cb731c5e17d126616a557a5 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 16 Oct 2023 19:23:04 -0500 Subject: [PATCH 06/72] x86/resctrl: Move default group file creation to mount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d27567a0eb54be457b25e240593fdbd1c35c8618 upstream. The default resource group and its files are created during kernel init time. Upcoming changes will make some resctrl files optional based on a mount parameter. If optional files are to be added to the default group based on the mount option, then each new file needs to be created separately, followed by another call to kernfs_activate(). Create all files of the default resource group during resctrl mount, and destroy them during unmount, to avoid scattering resctrl file addition across two separate code flows. Intel-SIG: commit d27567a0eb54 x86/resctrl: Move default group file creation to mount. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Peter Newman Reviewed-by: Tan Shaopeng Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Ilpo Järvinen Tested-by: Peter Newman Tested-by: Tan Shaopeng Link: https://lore.kernel.org/r/20231017002308.134480-6-babu.moger@amd.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 58 +++++++++++++++----------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 6fc9739658ed5..d04dd495d7cf2 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -54,6 +54,9 @@ static struct kernfs_node *kn_mondata; static struct seq_buf last_cmd_status; static char last_cmd_status_buf[512]; +static int rdtgroup_setup_root(struct rdt_fs_context *ctx); +static void rdtgroup_destroy_root(void); + struct dentry *debugfs_resctrl; void rdt_last_cmd_clear(void) @@ -2531,10 +2534,14 @@ static int rdt_get_tree(struct fs_context *fc) goto out; } - ret = rdt_enable_ctx(ctx); + ret = rdtgroup_setup_root(ctx); if (ret) goto out; + ret = rdt_enable_ctx(ctx); + if (ret) + goto out_root; + ret = schemata_list_create(); if (ret) { schemata_list_destroy(); @@ -2543,6 +2550,12 @@ static int rdt_get_tree(struct fs_context *fc) closid_init(); + ret = rdtgroup_add_files(rdtgroup_default.kn, RFTYPE_CTRL_BASE); + if (ret) + goto out_schemata_free; + + kernfs_activate(rdtgroup_default.kn); + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); if (ret < 0) goto out_schemata_free; @@ -2599,6 +2612,8 @@ static int rdt_get_tree(struct fs_context *fc) schemata_list_destroy(); out_ctx: rdt_disable_ctx(); +out_root: + rdtgroup_destroy_root(); out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); @@ -2669,7 +2684,6 @@ static int rdt_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - ctx->kfc.root = rdt_root; 
ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; fc->fs_private = &ctx->kfc; fc->ops = &rdt_fs_context_ops; @@ -2839,6 +2853,7 @@ static void rdt_kill_sb(struct super_block *sb) rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); + rdtgroup_destroy_root(); static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); @@ -3720,10 +3735,8 @@ static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { .show_options = rdtgroup_show_options, }; -static int __init rdtgroup_setup_root(void) +static int rdtgroup_setup_root(struct rdt_fs_context *ctx) { - int ret; - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, @@ -3731,6 +3744,20 @@ static int __init rdtgroup_setup_root(void) if (IS_ERR(rdt_root)) return PTR_ERR(rdt_root); + ctx->kfc.root = rdt_root; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); + + return 0; +} + +static void rdtgroup_destroy_root(void) +{ + kernfs_destroy_root(rdt_root); + rdtgroup_default.kn = NULL; +} + +static void __init rdtgroup_setup_default(void) +{ mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; @@ -3740,19 +3767,7 @@ static int __init rdtgroup_setup_root(void) list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RFTYPE_CTRL_BASE); - if (ret) { - kernfs_destroy_root(rdt_root); - goto out; - } - - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - kernfs_activate(rdtgroup_default.kn); - -out: mutex_unlock(&rdtgroup_mutex); - - return ret; } static void domain_destroy_mon_state(struct rdt_domain *d) @@ -3874,13 +3889,11 @@ int __init rdtgroup_init(void) seq_buf_init(&last_cmd_status, last_cmd_status_buf, sizeof(last_cmd_status_buf)); - ret = rdtgroup_setup_root(); - if (ret) - return ret; + rdtgroup_setup_default(); ret = 
sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) - goto cleanup_root; + return ret; ret = register_filesystem(&rdt_fs_type); if (ret) @@ -3913,8 +3926,6 @@ int __init rdtgroup_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); -cleanup_root: - kernfs_destroy_root(rdt_root); return ret; } @@ -3924,5 +3935,4 @@ void __exit rdtgroup_exit(void) debugfs_remove_recursive(debugfs_resctrl); unregister_filesystem(&rdt_fs_type); sysfs_remove_mount_point(fs_kobj, "resctrl"); - kernfs_destroy_root(rdt_root); } From 4a0239b5e27552e8109c3cb6d56365f2292e8e36 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 16 Oct 2023 19:23:05 -0500 Subject: [PATCH 07/72] x86/resctrl: Introduce "-o debug" mount option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit cb07d71f01017b7c2885ed629da9b973cb56b1d2 upstream. Add "-o debug" option to mount resctrl filesystem in debug mode. When in debug mode resctrl displays files that have the new RFTYPE_DEBUG flag to help resctrl debugging. Intel-SIG: commit cb07d71f0101 x86/resctrl: Introduce "-o debug" mount option. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Peter Newman Reviewed-by: Tan Shaopeng Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Ilpo Järvinen Tested-by: Peter Newman Tested-by: Tan Shaopeng Link: https://lore.kernel.org/r/20231017002308.134480-7-babu.moger@amd.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- Documentation/arch/x86/resctrl.rst | 5 ++++- arch/x86/kernel/cpu/resctrl/internal.h | 2 ++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index 178ab1d8f7475..68f11611f3416 100644 --- a/Documentation/arch/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst @@ -35,7 +35,7 @@ about the feature from resctrl's info directory. To use the feature mount the file system:: - # mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps]] /sys/fs/resctrl + # mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps][,debug]] /sys/fs/resctrl mount options are: @@ -46,6 +46,9 @@ mount options are: "mba_MBps": Enable the MBA Software Controller(mba_sc) to specify MBA bandwidth in MBps +"debug": + Make debug files accessible. Available debug files are annotated with + "Available only with debug option". L2 and L3 CDP are controlled separately. 
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 2c310fe7f1d6d..77cfbbe251d57 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -58,6 +58,7 @@ struct rdt_fs_context { bool enable_cdpl2; bool enable_cdpl3; bool enable_mba_mbps; + bool enable_debug; }; static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) @@ -247,6 +248,7 @@ struct rdtgroup { #define RFTYPE_TOP BIT(6) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) +#define RFTYPE_DEBUG BIT(10) #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) #define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d04dd495d7cf2..c419603dba16d 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -59,6 +59,8 @@ static void rdtgroup_destroy_root(void); struct dentry *debugfs_resctrl; +static bool resctrl_debug; + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -1894,6 +1896,9 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); + if (resctrl_debug) + fflags |= RFTYPE_DEBUG; + for (rft = rfts; rft < rfts + len; rft++) { if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); @@ -2397,6 +2402,8 @@ static void rdt_disable_ctx(void) resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); set_mba_sc(false); + + resctrl_debug = false; } static int rdt_enable_ctx(struct rdt_fs_context *ctx) @@ -2421,6 +2428,9 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) goto out_cdpl3; } + if (ctx->enable_debug) + resctrl_debug = true; + return 0; out_cdpl3: @@ -2625,6 +2635,7 @@ enum rdt_param { Opt_cdp, Opt_cdpl2, Opt_mba_mbps, + Opt_debug, 
nr__rdt_params }; @@ -2632,6 +2643,7 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), {} }; @@ -2657,6 +2669,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; ctx->enable_mba_mbps = true; return 0; + case Opt_debug: + ctx->enable_debug = true; + return 0; } return -EINVAL; @@ -3725,6 +3740,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) seq_puts(seq, ",mba_MBps"); + if (resctrl_debug) + seq_puts(seq, ",debug"); + return 0; } From f0d40ddf0dd5ce6758e55ef53b2cbd6bde46238c Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 16 Oct 2023 19:23:06 -0500 Subject: [PATCH 08/72] x86/resctrl: Display CLOSID for resource group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ca8dad225e237493f19b1c5d4a8531f13a9b078f upstream. In x86, hardware uses CLOSID to identify a control group. When a user creates a control group this information is not visible to the user. It can help resctrl debugging. Add CLOSID(ctrl_hw_id) to the control groups display in the resctrl interface. Users can see this detail when resctrl is mounted with the "-o debug" option. Other architectures do not use "CLOSID". Use the name ctrl_hw_id to refer to "CLOSID" in an effort to keep the naming generic. For example: $cat /sys/fs/resctrl/ctrl_grp1/ctrl_hw_id 1 Intel-SIG: commit ca8dad225e23 x86/resctrl: Display CLOSID for resource group. Incremental backporting patches for Intel RDT on Intel Xeon platform.
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Peter Newman Reviewed-by: Tan Shaopeng Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Ilpo Järvinen Tested-by: Peter Newman Tested-by: Tan Shaopeng Link: https://lore.kernel.org/r/20231017002308.134480-8-babu.moger@amd.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- Documentation/arch/x86/resctrl.rst | 4 ++++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index 68f11611f3416..7412252f95a7d 100644 --- a/Documentation/arch/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst @@ -359,6 +359,10 @@ When control is enabled all CTRL_MON groups will also contain: file. On successful pseudo-locked region creation the mode will automatically change to "pseudo-locked". +"ctrl_hw_id": + Available only with debug option. The identifier used by hardware + for the control group. On x86 this is the CLOSID. 
+ When monitoring is enabled all MON groups will also contain: "mon_data": diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index c419603dba16d..069a9c395f6c1 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -779,6 +779,22 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +static int rdtgroup_closid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->closid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + #ifdef CONFIG_PROC_CPU_RESCTRL /* @@ -1883,6 +1899,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_has_sparse_bitmasks_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, + { + .name = "ctrl_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_closid_show, + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, + }, }; From 346cc2925a61b2aa5b124ed69ebfedfd48adf7c2 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 16 Oct 2023 19:23:07 -0500 Subject: [PATCH 09/72] x86/resctrl: Add support for the files of MON groups only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 918f211b5e4e709e91acf856967a850569c96b71 upstream. Files unique to monitoring groups have the RFTYPE_MON flag. When a new monitoring group is created the resctrl files with flags RFTYPE_BASE (files common to all resource groups) and RFTYPE_MON (files unique to monitoring groups) are created to support interacting with the new monitoring group. A resource group can support both monitoring and control, also termed a CTRL_MON resource group. CTRL_MON groups should get both monitoring and control resctrl files but that is not the case. Only the RFTYPE_BASE and RFTYPE_CTRL files are created for CTRL_MON groups. 
Ensure that files with the RFTYPE_MON flag are created for CTRL_MON groups. Intel-SIG: commit 918f211b5e4e x86/resctrl: Add support for the files of MON groups only. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Peter Newman Reviewed-by: Tan Shaopeng Reviewed-by: Fenghua Yu Reviewed-by: Ilpo Järvinen Reviewed-by: Reinette Chatre Tested-by: Peter Newman Tested-by: Tan Shaopeng Link: https://lore.kernel.org/r/20231017002308.134480-9-babu.moger@amd.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 069a9c395f6c1..910db69f9fa53 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2553,6 +2553,7 @@ static void schemata_list_destroy(void) static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); + unsigned long flags = RFTYPE_CTRL_BASE; struct rdt_domain *dom; struct rdt_resource *r; int ret; @@ -2583,7 +2584,10 @@ static int rdt_get_tree(struct fs_context *fc) closid_init(); - ret = rdtgroup_add_files(rdtgroup_default.kn, RFTYPE_CTRL_BASE); + if (rdt_mon_capable) + flags |= RFTYPE_MON; + + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); if (ret) goto out_schemata_free; @@ -3273,8 +3277,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, enum rdt_group_type rtype, struct rdtgroup **r) { struct rdtgroup *prdtgrp, *rdtgrp; + unsigned long files = 0; struct kernfs_node *kn; - uint files = 0; int ret; prdtgrp = rdtgroup_kn_lock_live(parent_kn); @@ -3326,10 +3330,13 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - if (rtype == RDTCTRL_GROUP) + if (rtype == RDTCTRL_GROUP) { files = RFTYPE_BASE | RFTYPE_CTRL; - 
else + if (rdt_mon_capable) + files |= RFTYPE_MON; + } else { files = RFTYPE_BASE | RFTYPE_MON; + } ret = rdtgroup_add_files(kn, files); if (ret) { From 56b23220c4626cde54a0657326bb41ac375fd4e1 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 16 Oct 2023 19:23:08 -0500 Subject: [PATCH 10/72] x86/resctrl: Display RMID of resource group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4cee14bcb14881aae81d60f106a335c68553ac1f upstream. In x86, hardware uses RMID to identify a monitoring group. When a user creates a monitor group these details are not visible. These details can help resctrl debugging. Add RMID(mon_hw_id) to the monitor groups display in the resctrl interface. Users can see these details when resctrl is mounted with "-o debug" option. Add RFTYPE_MON_BASE that complements existing RFTYPE_CTRL_BASE and represents files belonging to monitoring groups. Other architectures do not use "RMID". Use the name mon_hw_id to refer to "RMID" in an effort to keep the naming generic. For example: $cat /sys/fs/resctrl/mon_groups/mon_grp1/mon_hw_id 3 Intel-SIG: commit 4cee14bcb148 x86/resctrl: Display RMID of resource group. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Peter Newman Reviewed-by: Tan Shaopeng Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Ilpo Järvinen Tested-by: Peter Newman Tested-by: Tan Shaopeng Link: https://lore.kernel.org/r/20231017002308.134480-10-babu.moger@amd.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- Documentation/arch/x86/resctrl.rst | 4 ++++ arch/x86/kernel/cpu/resctrl/internal.h | 1 + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index 7412252f95a7d..a6279df64a9db 100644 --- a/Documentation/arch/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst @@ -376,6 +376,10 @@ When monitoring is enabled all MON groups will also contain: the sum for all tasks in the CTRL_MON group and all tasks in MON groups. Please see example section for more details on usage. +"mon_hw_id": + Available only with debug option. The identifier used by hardware + for the monitor group. On x86 this is the RMID. 
+ Resource allocation rules ------------------------- diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 77cfbbe251d57..52e7e7deee106 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -253,6 +253,7 @@ struct rdtgroup { #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) #define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) #define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) /* List of all resource groups */ extern struct list_head rdt_all_groups; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 910db69f9fa53..2b69e560b05f1 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -795,6 +795,22 @@ static int rdtgroup_closid_show(struct kernfs_open_file *of, return ret; } +static int rdtgroup_rmid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->mon.rmid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + #ifdef CONFIG_PROC_CPU_RESCTRL /* @@ -1869,6 +1885,13 @@ static struct rftype res_common_files[] = { .seq_show = rdtgroup_tasks_show, .fflags = RFTYPE_BASE, }, + { + .name = "mon_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_rmid_show, + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, + }, { .name = "schemata", .mode = 0644, From b7de2a9f4a4e7883837952b9adb371a7cbcb07b6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 1 Nov 2023 14:26:15 -0700 Subject: [PATCH 11/72] x86/resctrl: Fix unused variable warning in cache_alloc_hsw_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1b908debf53ff3cf0e43e0fa51e7319a23518e6c upstream. 
In a "W=1" build gcc throws a warning: arch/x86/kernel/cpu/resctrl/core.c: In function ‘cache_alloc_hsw_probe’: arch/x86/kernel/cpu/resctrl/core.c:139:16: warning: variable ‘h’ set but not used Switch from wrmsr_safe() to wrmsrl_safe(), and from rdmsr() to rdmsrl() using a single u64 argument for the MSR value instead of the pair of u32 for the high and low halves. Intel-SIG: commit 1b908debf53f x86/resctrl: Fix unused variable warning in cache_alloc_hsw_probe(). Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Babu Moger Acked-by: Reinette Chatre Link: https://lore.kernel.org/r/ZULCd/TGJL9Dmncf@agluck-desk3 [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d04371e851b4c..aa9810a64258e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -136,15 +136,15 @@ static inline void cache_alloc_hsw_probe(void) { struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; struct rdt_resource *r = &hw_res->r_resctrl; - u32 l, h, max_cbm = BIT_MASK(20) - 1; + u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) + if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsr(MSR_IA32_L3_CBM_BASE, l, h); + rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ - if (l != max_cbm) + if (l3_cbm_0 != max_cbm) return; hw_res->num_closid = 4; From 87c4545eff6d87c21b69966d863c227b91d62903 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 24 Jan 2024 11:52:56 -0600 Subject: [PATCH 12/72] x86/resctrl: Remove redundant variable in mbm_config_write_domain() commit fc747eebef734563cf68a512f57937c8f231834a upstream. 
The kernel test robot reported the following warning after commit 54e35eb8611c ("x86/resctrl: Read supported bandwidth sources from CPUID"), even though the issue is already present in the original commit 92bd5a139033 ("x86/resctrl: Add interface to write mbm_total_bytes_config") which added this function. The reported warning is: $ make C=1 CHECK=scripts/coccicheck arch/x86/kernel/cpu/resctrl/rdtgroup.o ... arch/x86/kernel/cpu/resctrl/rdtgroup.c:1621:5-8: Unneeded variable: "ret". Return "0" on line 1655 Remove the local variable 'ret'. [ bp: Massage commit message, make mbm_config_write_domain() void. ] Intel-SIG: commit fc747eebef73 x86/resctrl: Remove redundant variable in mbm_config_write_domain(). Incremental backporting patches for Intel RDT on Intel Xeon platform. Fixes: 92bd5a139033 ("x86/resctrl: Add interface to write mbm_total_bytes_config") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401241810.jbd8Ipa1-lkp@intel.com/ Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Acked-by: Reinette Chatre Link: https://lore.kernel.org/r/202401241810.jbd8Ipa1-lkp@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 2b69e560b05f1..aa24343f1d237 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1614,11 +1614,10 @@ static void mon_event_config_write(void *info) wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); } -static int mbm_config_write_domain(struct rdt_resource *r, - struct rdt_domain *d, u32 evtid, u32 val) +static void mbm_config_write_domain(struct rdt_resource *r, + struct rdt_domain *d, u32 evtid, u32 val) { struct mon_config_info mon_info = {0}; - int ret = 0; /* * Read the current config value first.
If both are the same then @@ -1627,7 +1626,7 @@ static int mbm_config_write_domain(struct rdt_resource *r, mon_info.evtid = evtid; mondata_config_read(d, &mon_info); if (mon_info.mon_config == val) - goto out; + return; mon_info.mon_config = val; @@ -1650,9 +1649,6 @@ static int mbm_config_write_domain(struct rdt_resource *r, * mbm_local and mbm_total counts for all the RMIDs. */ resctrl_arch_reset_rmid_all(r, d); - -out: - return ret; } static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) @@ -1661,7 +1657,6 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) char *dom_str = NULL, *id_str; unsigned long dom_id, val; struct rdt_domain *d; - int ret = 0; next: if (!tok || tok[0] == '\0') @@ -1690,9 +1685,7 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { - ret = mbm_config_write_domain(r, d, evtid, val); - if (ret) - return -EINVAL; + mbm_config_write_domain(r, d, evtid, val); goto next; } } From 05d040ee7984cc73edc89f25bec09298e4e21a76 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:15 +0000 Subject: [PATCH 13/72] tick/nohz: Move tick_nohz_full_mask declaration outside the #ifdef commit 31a5c0b7c674977889ce721d69101bc35f25e041 upstream. tick_nohz_full_mask lists the CPUs that are nohz_full. This is only needed when CONFIG_NO_HZ_FULL is defined. tick_nohz_full_cpu() allows a specific CPU to be tested against the mask, and evaluates to false when CONFIG_NO_HZ_FULL is not defined. The resctrl code needs to pick a CPU to run some work on, a new helper prefers housekeeping CPUs by examining the tick_nohz_full_mask. Hiding the declaration behind #ifdef CONFIG_NO_HZ_FULL forces all the users to be behind an #ifdef too. Move the tick_nohz_full_mask declaration, this lets callers drop the #ifdef, and guard access to tick_nohz_full_mask with IS_ENABLED() or something like tick_nohz_full_cpu(). 
The definition does not need to be moved as any callers should be removed at compile time unless CONFIG_NO_HZ_FULL is defined. Intel-SIG: commit 31a5c0b7c674 tick/nohz: Move tick_nohz_full_mask declaration outside the #ifdef. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Thomas Gleixner Acked-by: Reinette Chatre # for resctrl dependency Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-2-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- include/linux/tick.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index 9459fef5b8573..65af90ca409ae 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -174,9 +174,16 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } static inline void tick_nohz_idle_stop_tick_protected(void) { } #endif /* !CONFIG_NO_HZ_COMMON */ +/* + * Mask of CPUs that are nohz_full. + * + * Users should be guarded by CONFIG_NO_HZ_FULL or a tick_nohz_full_cpu() + * check. + */ +extern cpumask_var_t tick_nohz_full_mask; + #ifdef CONFIG_NO_HZ_FULL extern bool tick_nohz_full_running; -extern cpumask_var_t tick_nohz_full_mask; static inline bool tick_nohz_full_enabled(void) { From bc09a623fc69d20ba42272510dc8f12369065bfc Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:16 +0000 Subject: [PATCH 14/72] x86/resctrl: Free rmid_ptrs from resctrl_exit() commit 3f7b07380d58cfbb6a2d3aa672dcc76c0f4b0745 upstream. rmid_ptrs[] is allocated from dom_data_init() but never free()d. While the exit text ends up in the linker script's DISCARD section, the direction of travel is for resctrl to be/have loadable modules. 
Add rdt_put_mon_l3_config() to clean up any memory allocated by rdt_get_mon_l3_config(). There is no reason to backport this to a stable kernel. Intel-SIG: commit 3f7b07380d58 x86/resctrl: Free rmid_ptrs from resctrl_exit(). Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Babu Moger Reviewed-by: Reinette Chatre Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-3-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 6 ++++++ arch/x86/kernel/cpu/resctrl/internal.h | 1 + arch/x86/kernel/cpu/resctrl/monitor.c | 15 +++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index aa9810a64258e..9641c42d0f856 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -990,8 +990,14 @@ late_initcall(resctrl_late_init); static void __exit resctrl_exit(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + cpuhp_remove_state(rdt_online); + rdtgroup_exit(); + + if (r->mon_capable) + rdt_put_mon_l3_config(); } __exitcall(resctrl_exit); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 52e7e7deee106..61c763604fc98 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -544,6 +544,7 @@ void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); +void __exit rdt_put_mon_l3_config(void); bool __init rdt_cpu_has(int flag); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index
3a6c069614eb8..3a73db0579d85 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -719,6 +719,16 @@ static int dom_data_init(struct rdt_resource *r) return 0; } +static void __exit dom_data_exit(void) +{ + mutex_lock(&rdtgroup_mutex); + + kfree(rmid_ptrs); + rmid_ptrs = NULL; + + mutex_unlock(&rdtgroup_mutex); +} + static struct mon_evt llc_occupancy_event = { .name = "llc_occupancy", .evtid = QOS_L3_OCCUP_EVENT_ID, @@ -814,6 +824,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } +void __exit rdt_put_mon_l3_config(void) +{ + dom_data_exit(); +} + void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; From 25c6b7b4559e640058eeb2e27a87a0953aea6ffe Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:17 +0000 Subject: [PATCH 15/72] x86/resctrl: Create helper for RMID allocation and mondata dir creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b1de313979af99dc0f999656fc99bbcb52559a38 upstream. When monitoring is supported, each monitor and control group is allocated an RMID. For control groups, rdtgroup_mkdir_ctrl_mon() later goes on to allocate the CLOSID. MPAM's equivalent of RMID is not an independent number, so can't be allocated until the CLOSID is known. An RMID allocation for one CLOSID may fail, whereas another may succeed depending on how many monitor groups a control group has. The RMID allocation needs to move to be after the CLOSID has been allocated. Move the RMID allocation and mondata dir creation to a helper. Intel-SIG: commit b1de313979af x86/resctrl: Create helper for RMID allocation and mondata dir creation. Incremental backporting patches for Intel RDT on Intel Xeon platform.
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Ilpo Järvinen Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Peter Newman Tested-by: Shaopeng Tan Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-4-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 42 +++++++++++++++++--------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index aa24343f1d237..4ea5a871be498 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3288,6 +3288,30 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) return ret; } +static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) +{ + int ret; + + if (!rdt_mon_capable) + return 0; + + ret = alloc_rmid(); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + free_rmid(rdtgrp->mon.rmid); + return ret; + } + + return 0; +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) @@ -3360,20 +3384,10 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - if (rdt_mon_capable) { - ret = alloc_rmid(); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - goto out_destroy; - } - rdtgrp->mon.rmid = ret; + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) + goto out_destroy; - ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_idfree; - } - } kernfs_activate(kn); /* @@ -3381,8 
+3395,6 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, */ return 0; -out_idfree: - free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_put(rdtgrp->kn); kernfs_remove(rdtgrp->kn); From 18a13cd9891a2e972801379b6dee27d92d803fbd Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:18 +0000 Subject: [PATCH 16/72] x86/resctrl: Move RMID allocation out of mkdir_rdt_prepare() commit 311639e9512bb3af2abae32be9322b8a9b30eaa1 upstream. RMIDs are allocated for each monitor or control group directory, because each of these needs its own RMID. For control groups, rdtgroup_mkdir_ctrl_mon() later goes on to allocate the CLOSID. MPAM's equivalent of RMID is not an independent number, so can't be allocated until the CLOSID is known. An RMID allocation for one CLOSID may fail, whereas another may succeed depending on how many monitor groups a control group has. The RMID allocation needs to move to be after the CLOSID has been allocated. Move the RMID allocation out of mkdir_rdt_prepare() to occur in its caller, after the mkdir_rdt_prepare() call. This allows the RMID allocator to know the CLOSID. Intel-SIG: commit 311639e9512b x86/resctrl: Move RMID allocation out of mkdir_rdt_prepare(). Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-5-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 35 +++++++++++++++++++------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 4ea5a871be498..f455a10b74ab1 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3312,6 +3312,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) return 0; } +static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) +{ + if (rdt_mon_capable) + free_rmid(rgrp->mon.rmid); +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) @@ -3384,12 +3390,6 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) - goto out_destroy; - - kernfs_activate(kn); - /* * The caller unlocks the parent_kn upon success. */ @@ -3408,7 +3408,6 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) { kernfs_remove(rgrp->kn); - free_rmid(rgrp->mon.rmid); rdtgroup_remove(rgrp); } @@ -3430,12 +3429,21 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, prgrp = rdtgrp->mon.parent; rdtgrp->closid = prgrp->closid; + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) { + mkdir_rdt_prepare_clean(rdtgrp); + goto out_unlock; + } + + kernfs_activate(rdtgrp->kn); + /* * Add the rdtgrp to the list of rdtgrps the parent * ctrl_mon group has to track. 
*/ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); +out_unlock: rdtgroup_kn_unlock(parent_kn); return ret; } @@ -3466,9 +3474,16 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, ret = 0; rdtgrp->closid = closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) + goto out_closid_free; + + kernfs_activate(rdtgrp->kn); + ret = rdtgroup_init_alloc(rdtgrp); if (ret < 0) - goto out_id_free; + goto out_rmid_free; list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); @@ -3488,7 +3503,9 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, out_del_list: list_del(&rdtgrp->rdtgroup_list); -out_id_free: +out_rmid_free: + mkdir_rdt_prepare_rmid_free(rdtgrp); +out_closid_free: closid_free(closid); out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); From 7158e4f981213d58109f30c5d635a14212297abd Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:19 +0000 Subject: [PATCH 17/72] x86/resctrl: Track the closid with the rmid commit 40fc735b78f0c81cea7d1c511cfd83892cb4d679 upstream. x86's RMID are independent of the CLOSID. An RMID can be allocated, used and freed without considering the CLOSID. MPAM's equivalent feature is PMG, which is not an independent number, it extends the CLOSID/PARTID space. For MPAM, only PMG-bits worth of 'RMID' can be allocated for a single CLOSID. i.e. if there is 1 bit of PMG space, then each CLOSID can have two monitor groups. To allow resctrl to disambiguate RMID values for different CLOSID, everything in resctrl that keeps an RMID value needs to know the CLOSID too. This will always be ignored on x86. Intel-SIG: commit 40fc735b78f0 x86/resctrl: Track the closid with the rmid. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Xin Hao Reviewed-by: Reinette Chatre Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-6-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/include/asm/resctrl.h | 7 +++ arch/x86/kernel/cpu/resctrl/internal.h | 2 +- arch/x86/kernel/cpu/resctrl/monitor.c | 73 +++++++++++++++-------- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 12 ++-- include/linux/resctrl.h | 16 ++++- 6 files changed, 77 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 255a78d9d9067..cc6e1bce7b1ad 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -7,6 +7,13 @@ #include #include +/* + * This value can never be a valid CLOSID, and is used when mapping a + * (closid, rmid) pair to an index and back. On x86 only the RMID is + * needed. The index is a software defined value. 
+ */ +#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0) + /** * struct resctrl_pqr_state - State cache for the PQR MSR * @cur_rmid: The cached Resource Monitoring ID diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 61c763604fc98..ae0e3338abc46 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -542,7 +542,7 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); -void free_rmid(u32 rmid); +void free_rmid(u32 closid, u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); void __exit rdt_put_mon_l3_config(void); bool __init rdt_cpu_has(int flag); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 3a73db0579d85..3dad4134d2c9b 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -24,7 +24,20 @@ #include "internal.h" +/** + * struct rmid_entry - dirty tracking for all RMID. + * @closid: The CLOSID for this entry. + * @rmid: The RMID for this entry. + * @busy: The number of domains with cached data using this RMID. + * @list: Member of the rmid_free_lru list when busy == 0. + * + * Depending on the architecture the correct monitor is accessed using + * both @closid and @rmid, or @rmid only. + * + * Take the rdtgroup_mutex when accessing. 
+ */ struct rmid_entry { + u32 closid; u32 rmid; int busy; struct list_head list; @@ -136,7 +149,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) return val; } -static inline struct rmid_entry *__rmid_entry(u32 rmid) +static inline struct rmid_entry *__rmid_entry(u32 closid, u32 rmid) { struct rmid_entry *entry; @@ -190,7 +203,8 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid) + u32 unused, u32 rmid, + enum resctrl_event_id eventid) { struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct arch_mbm_state *am; @@ -230,7 +244,8 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) } int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid, u64 *val) + u32 unused, u32 rmid, enum resctrl_event_id eventid, + u64 *val) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); @@ -285,9 +300,9 @@ void __check_limbo(struct rdt_domain *d, bool force_free) if (nrmid >= r->num_rmid) break; - entry = __rmid_entry(nrmid); + entry = __rmid_entry(X86_RESCTRL_EMPTY_CLOSID, nrmid);// temporary - if (resctrl_arch_rmid_read(r, d, entry->rmid, + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, QOS_L3_OCCUP_EVENT_ID, &val)) { rmid_dirty = true; } else { @@ -342,7 +357,8 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - err = resctrl_arch_rmid_read(r, d, entry->rmid, + err = resctrl_arch_rmid_read(r, d, entry->closid, + entry->rmid, QOS_L3_OCCUP_EVENT_ID, &val); if (err || val <= resctrl_rmid_realloc_threshold) @@ -366,7 +382,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) list_add_tail(&entry->list, &rmid_free_lru); } -void 
free_rmid(u32 rmid) +void free_rmid(u32 closid, u32 rmid) { struct rmid_entry *entry; @@ -375,7 +391,7 @@ void free_rmid(u32 rmid) lockdep_assert_held(&rdtgroup_mutex); - entry = __rmid_entry(rmid); + entry = __rmid_entry(closid, rmid); if (is_llc_occupancy_enabled()) add_rmid_to_limbo(entry); @@ -383,8 +399,8 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid, - enum resctrl_event_id evtid) +static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid, + u32 rmid, enum resctrl_event_id evtid) { switch (evtid) { case QOS_L3_MBM_TOTAL_EVENT_ID: @@ -396,20 +412,21 @@ static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid, } } -static int __mon_event_count(u32 rmid, struct rmid_read *rr) +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { struct mbm_state *m; u64 tval = 0; if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); - m = get_mbm_state(rr->d, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid, + &tval); if (rr->err) return rr->err; @@ -421,6 +438,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). + * @closid: The closid used to identify the cached mbm_state. * @rmid: The rmid used to identify the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * @@ -429,7 +447,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. 
This must be called once per second to maintain values in MBps. */ -static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) { struct mbm_state *m = &rr->d->mbm_local[rmid]; u64 cur_bw, bytes, cur_bytes; @@ -456,7 +474,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); /* * For Ctrl groups read data from child monitor groups and @@ -467,7 +485,8 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->mon.rmid, rr) == 0) + if (__mon_event_count(entry->closid, entry->mon.rmid, + rr) == 0) ret = 0; } } @@ -578,7 +597,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); } -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, u32 rmid) { struct rmid_read rr; @@ -593,12 +613,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; rr.val = 0; - __mon_event_count(rmid, &rr); + __mon_event_count(closid, rmid, &rr); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; rr.val = 0; - __mon_event_count(rmid, &rr); + __mon_event_count(closid, rmid, &rr); /* * Call the MBA software controller only for the @@ -606,7 +626,7 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) * the software controller explicitly. 
*/ if (is_mba_sc(NULL)) - mbm_bw_count(rmid, &rr); + mbm_bw_count(closid, rmid, &rr); } } @@ -663,11 +683,11 @@ void mbm_handle_overflow(struct work_struct *work) d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -710,10 +730,11 @@ static int dom_data_init(struct rdt_resource *r) } /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and + * are always allocated. These are used for the rdtgroup_default + * control group, which will be setup later in rdtgroup_init(). */ - entry = __rmid_entry(0); + entry = __rmid_entry(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); list_del(&entry->list); return 0; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 8f559eeae08ed..65bee6f11015e 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -752,7 +752,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * anymore when this group would be used for pseudo-locking. This * is safe to call on platforms not capable of monitoring. 
*/ - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); ret = 0; goto out; @@ -787,7 +787,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) ret = rdtgroup_locksetup_user_restore(rdtgrp); if (ret) { - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f455a10b74ab1..ad7da7254f4dc 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2837,7 +2837,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->mon.rmid); + free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); if (atomic_read(&sentry->waitcount) != 0) @@ -2877,7 +2877,7 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); @@ -3305,7 +3305,7 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -3315,7 +3315,7 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) { if (rdt_mon_capable) - free_rmid(rgrp->mon.rmid); + free_rmid(rgrp->closid, rgrp->mon.rmid); } static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, @@ -3574,7 +3574,7 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, 
rdtgrp->mon.rmid); /* * Remove the rdtgrp from the parent ctrl_mon group's list @@ -3620,8 +3620,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); - free_rmid(rdtgrp->mon.rmid); rdtgroup_ctrl_remove(rdtgrp); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 66942d7fba7fc..bd4ec22b5a961 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -6,6 +6,10 @@ #include #include +/* CLOSID, RMID value used by the default control group */ +#define RESCTRL_RESERVED_CLOSID 0 +#define RESCTRL_RESERVED_RMID 0 + #ifdef CONFIG_PROC_CPU_RESCTRL int proc_resctrl_show(struct seq_file *m, @@ -225,6 +229,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * for this resource and domain. * @r: resource that the counter should be read from. * @d: domain that the counter should be read from. + * @closid: closid that matches the rmid. Depending on the architecture, the + * counter may match traffic of both @closid and @rmid, or @rmid + * only. * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. * @val: result of the counter read in bytes. @@ -235,20 +242,25 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * 0 on success, or -EIO, -EINVAL etc on error. */ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid, u64 *val); + u32 closid, u32 rmid, enum resctrl_event_id eventid, + u64 *val); + /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid * and eventid. * @r: The domain's resource. * @d: The rmid's domain. + * @closid: closid that matches the rmid. Depending on the architecture, the + * counter may match traffic of both @closid and @rmid, or @rmid only. 
* @rmid: The rmid whose counter values should be reset. * @eventid: The eventid whose counter values should be reset. * * This can be called from any CPU. */ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid); + u32 closid, u32 rmid, + enum resctrl_event_id eventid); /** * resctrl_arch_reset_rmid_all() - Reset all private state associated with From d12bf2293195e47029ffc29fc262421c6d41e556 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:20 +0000 Subject: [PATCH 18/72] x86/resctrl: Access per-rmid structures by index commit 6791e0ea30711b937d5cb6e2b17f59a2a2af5386 upstream. x86 systems identify traffic using the CLOSID and RMID. The CLOSID is used to lookup the control policy, the RMID is used for monitoring. For x86 these are independent numbers. Arm's MPAM has equivalent features PARTID and PMG, where the PARTID is used to lookup the control policy. The PMG in contrast is a small number of bits that are used to subdivide PARTID when monitoring. The cache-occupancy monitors require the PARTID to be specified when monitoring. This means MPAM's PMG field is not unique. There are multiple PMG-0, one per allocated CLOSID/PARTID. If PMG is treated as equivalent to RMID, it cannot be allocated as an independent number. Bitmaps like rmid_busy_llc need to be sized by the number of unique entries for this resource. Treat the combined CLOSID and RMID as an index, and provide architecture helpers to pack and unpack an index. This makes the MPAM values unique. The domain's rmid_busy_llc and rmid_ptrs[] are then sized by index, as are domain mbm_local[] and mbm_total[]. x86 can ignore the CLOSID field when packing and unpacking an index, and report as many indexes as RMID. Intel-SIG: commit 6791e0ea3071 x86/resctrl: Access per-rmid structures by index. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Babu Moger Reviewed-by: Reinette Chatre Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-7-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/include/asm/resctrl.h | 17 +++++ arch/x86/kernel/cpu/resctrl/core.c | 13 ++-- arch/x86/kernel/cpu/resctrl/internal.h | 4 +- arch/x86/kernel/cpu/resctrl/monitor.c | 98 +++++++++++++++++--------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 13 ++-- 5 files changed, 100 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index cc6e1bce7b1ad..db4c84dde2d54 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -101,6 +101,23 @@ static inline void resctrl_sched_in(struct task_struct *tsk) __resctrl_sched_in(tsk); } +static inline u32 resctrl_arch_system_num_rmid_idx(void) +{ + /* RMID are independent numbers for x86. 
num_rmid_idx == num_rmid */ + return boot_cpu_data.x86_cache_max_rmid + 1; +} + +static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *rmid = idx; + *closid = X86_RESCTRL_EMPTY_CLOSID; +} + +static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) +{ + return rmid; +} + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 9641c42d0f856..d1dc80a21ea96 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -587,7 +587,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) mbm_setup_overflow_handler(d, 0); } if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && - has_busy_rmid(r, d)) { + has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0); } @@ -598,11 +598,12 @@ static void clear_closid_rmid(int cpu) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); - state->default_closid = 0; - state->default_rmid = 0; - state->cur_closid = 0; - state->cur_rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); + state->default_closid = RESCTRL_RESERVED_CLOSID; + state->default_rmid = RESCTRL_RESERVED_RMID; + state->cur_closid = RESCTRL_RESERVED_CLOSID; + state->cur_rmid = RESCTRL_RESERVED_RMID; + wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID, + RESCTRL_RESERVED_CLOSID); } static int resctrl_online_cpu(unsigned int cpu) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index ae0e3338abc46..cbba782acd0c8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -8,6 +8,8 @@ #include #include +#include + #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL @@ -558,7 +560,7 @@ void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void 
cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +bool has_busy_rmid(struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init thread_throttle_mode_init(void); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 3dad4134d2c9b..bc5ceef143ab4 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -149,12 +149,29 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) return val; } -static inline struct rmid_entry *__rmid_entry(u32 closid, u32 rmid) +/* + * x86 and arm64 differ in their handling of monitoring. + * x86's RMID are independent numbers, there is only one source of traffic + * with an RMID value of '1'. + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID + * value is no longer unique. + * To account for this, resctrl uses an index. On x86 this is just the RMID, + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code + * must accept an attempt to read every index. 
+ */ +static inline struct rmid_entry *__rmid_entry(u32 idx) { struct rmid_entry *entry; + u32 closid, rmid; + + entry = &rmid_ptrs[idx]; + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - entry = &rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); + WARN_ON_ONCE(entry->closid != closid); + WARN_ON_ONCE(entry->rmid != rmid); return entry; } @@ -284,8 +301,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, void __check_limbo(struct rdt_domain *d, bool force_free) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; - u32 crmid = 1, nrmid; + u32 idx, cur_idx = 1; bool rmid_dirty; u64 val = 0; @@ -296,12 +314,11 @@ void __check_limbo(struct rdt_domain *d, bool force_free) * RMID and move it to the free list when the counter reaches 0. */ for (;;) { - nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); - if (nrmid >= r->num_rmid) + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); + if (idx >= idx_limit) break; - entry = __rmid_entry(X86_RESCTRL_EMPTY_CLOSID, nrmid);// temporary - + entry = __rmid_entry(idx); if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, QOS_L3_OCCUP_EVENT_ID, &val)) { rmid_dirty = true; @@ -310,19 +327,21 @@ void __check_limbo(struct rdt_domain *d, bool force_free) } if (force_free || !rmid_dirty) { - clear_bit(entry->rmid, d->rmid_busy_llc); + clear_bit(idx, d->rmid_busy_llc); if (!--entry->busy) { rmid_limbo_count--; list_add_tail(&entry->list, &rmid_free_lru); } } - crmid = nrmid + 1; + cur_idx = idx + 1; } } -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +bool has_busy_rmid(struct rdt_domain *d) { - return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; } /* @@ -352,6 +371,9 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) 
struct rdt_domain *d; int cpu, err; u64 val = 0; + u32 idx; + + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; cpu = get_cpu(); @@ -369,9 +391,9 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) * For the first limbo RMID in the domain, * setup up the limbo worker. */ - if (!has_busy_rmid(r, d)) + if (!has_busy_rmid(d)) cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); - set_bit(entry->rmid, d->rmid_busy_llc); + set_bit(idx, d->rmid_busy_llc); entry->busy++; } put_cpu(); @@ -384,14 +406,21 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) void free_rmid(u32 closid, u32 rmid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); struct rmid_entry *entry; - if (!rmid) - return; - lockdep_assert_held(&rdtgroup_mutex); - entry = __rmid_entry(closid, rmid); + /* + * Do not allow the default rmid to be free'd. Comparing by index + * allows architectures that ignore the closid parameter to avoid an + * unnecessary check. + */ + if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID)) + return; + + entry = __rmid_entry(idx); if (is_llc_occupancy_enabled()) add_rmid_to_limbo(entry); @@ -402,11 +431,13 @@ void free_rmid(u32 closid, u32 rmid) static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + switch (evtid) { case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[rmid]; + return &d->mbm_total[idx]; case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[rmid]; + return &d->mbm_local[idx]; default: return NULL; } @@ -449,7 +480,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) { - struct mbm_state *m = &rr->d->mbm_local[rmid]; + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *m = &rr->d->mbm_local[idx]; u64 cur_bw, bytes, cur_bytes; cur_bytes = 
rr->val; @@ -538,9 +570,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) struct mbm_state *pmbm_data, *cmbm_data; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; + u32 cur_bw, user_bw, idx; struct list_head *head; struct rdtgroup *entry; - u32 cur_bw, user_bw; if (!is_mbm_local_enabled()) return; @@ -549,7 +581,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) closid = rgrp->closid; rmid = rgrp->mon.rmid; - pmbm_data = &dom_mbm->mbm_local[rmid]; + idx = resctrl_arch_rmid_idx_encode(closid, rmid); + pmbm_data = &dom_mbm->mbm_local[idx]; dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); if (!dom_mba) { @@ -638,17 +671,15 @@ void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); int cpu = smp_processor_id(); - struct rdt_resource *r; struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); - if (has_busy_rmid(r, d)) + if (has_busy_rmid(d)) schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); mutex_unlock(&rdtgroup_mutex); @@ -713,19 +744,20 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) static int dom_data_init(struct rdt_resource *r) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry = NULL; - int i, nr_rmids; + u32 idx; + int i; - nr_rmids = r->num_rmid; - rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); if (!rmid_ptrs) return -ENOMEM; - for (i = 0; i < nr_rmids; i++) { + for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; INIT_LIST_HEAD(&entry->list); - entry->rmid = i; + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); list_add_tail(&entry->list, &rmid_free_lru); } @@ -734,7 +766,9 @@ static int dom_data_init(struct 
rdt_resource *r) * are always allocated. These are used for the rdtgroup_default * control group, which will be setup later in rdtgroup_init(). */ - entry = __rmid_entry(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + entry = __rmid_entry(idx); list_del(&entry->list); return 0; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index ad7da7254f4dc..a7dbc0e7e5595 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3853,8 +3853,8 @@ static void __init rdtgroup_setup_default(void) { mutex_lock(&rdtgroup_mutex); - rdtgroup_default.closid = 0; - rdtgroup_default.mon.rmid = 0; + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; rdtgroup_default.type = RDTCTRL_GROUP; INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); @@ -3889,7 +3889,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + if (is_llc_occupancy_enabled() && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -3907,16 +3907,17 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } if (is_mbm_total_enabled()) { tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { bitmap_free(d->rmid_busy_llc); return -ENOMEM; @@ -3924,7 +3925,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) } if (is_mbm_local_enabled()) { tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); From de8bf8b1e18e210a45c2394f64503c45dd660085 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:21 +0000 Subject: [PATCH 19/72] x86/resctrl: Allow RMID allocation to be scoped by CLOSID commit c4c0376eefe185b790d89ca8016b7f837ebf25da upstream. MPAMs RMID values are not unique unless the CLOSID is considered as well. alloc_rmid() expects the RMID to be an independent number. Pass the CLOSID in to alloc_rmid(). Use this to compare indexes when allocating. If the CLOSID is not relevant to the index, this ends up comparing the free RMID with itself, and the first free entry will be used. With MPAM the CLOSID is included in the index, so this becomes a walk of the free RMID entries, until one that matches the supplied CLOSID is found. Intel-SIG: commit c4c0376eefe1 x86/resctrl: Allow RMID allocation to be scoped by CLOSID. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-8-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/internal.h | 2 +- arch/x86/kernel/cpu/resctrl/monitor.c | 43 ++++++++++++++++++----- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index cbba782acd0c8..872ba1a341039 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -543,7 +543,7 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int closids_supported(void); void closid_free(int closid); -int alloc_rmid(void); +int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); void __exit rdt_put_mon_l3_config(void); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index bc5ceef143ab4..c49f2e89ef29d 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -344,24 +344,49 @@ bool has_busy_rmid(struct rdt_domain *d) return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; } +static struct rmid_entry *resctrl_find_free_rmid(u32 closid) +{ + struct rmid_entry *itr; + u32 itr_idx, cmp_idx; + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? 
ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); + + list_for_each_entry(itr, &rmid_free_lru, list) { + /* + * Get the index of this free RMID, and the index it would need + * to be if it were used with this CLOSID. + * If the CLOSID is irrelevant on this architecture, the two + * index values are always the same on every entry and thus the + * very first entry will be returned. + */ + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); + + if (itr_idx == cmp_idx) + return itr; + } + + return ERR_PTR(-ENOSPC); +} + /* - * As of now the RMIDs allocation is global. - * However we keep track of which packages the RMIDs - * are used to optimize the limbo list management. + * For MPAM the RMID value is not unique, and has to be considered with + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which + * allows all domains to be managed by a single free list. + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. */ -int alloc_rmid(void) +int alloc_rmid(u32 closid) { struct rmid_entry *entry; lockdep_assert_held(&rdtgroup_mutex); - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? 
-EBUSY : -ENOSPC; + entry = resctrl_find_free_rmid(closid); + if (IS_ERR(entry)) + return PTR_ERR(entry); - entry = list_first_entry(&rmid_free_lru, - struct rmid_entry, list); list_del(&entry->list); - return entry->rmid; } diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 65bee6f11015e..d8f44113ed1f1 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -777,7 +777,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) int ret; if (rdt_mon_capable) { - ret = alloc_rmid(); + ret = alloc_rmid(rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Out of RMIDs\n"); return ret; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a7dbc0e7e5595..dcffd1c4a476f 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3295,7 +3295,7 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) if (!rdt_mon_capable) return 0; - ret = alloc_rmid(); + ret = alloc_rmid(rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Out of RMIDs\n"); return ret; From 5cdc06155bc16219e6f5b1d55d9ad46dbda604c5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:22 +0000 Subject: [PATCH 20/72] x86/resctrl: Track the number of dirty RMID a CLOSID has commit b30a55df60c35df09b9ef08dfb0a0cbb543abe81 upstream. MPAM's PMG bits extend its PARTID space, meaning the same PMG value can be used for different control groups. This means once a CLOSID is allocated, all its monitoring ids may still be dirty, and held in limbo. Keep track of the number of RMID held in limbo each CLOSID has. This will allow a future helper to find the 'cleanest' CLOSID when allocating. The array is only needed when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. This will never be the case on x86. Intel-SIG: commit b30a55df60c3 x86/resctrl: Track the number of dirty RMID a CLOSID has. 
Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-9-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/monitor.c | 75 +++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c49f2e89ef29d..13b0c8d14f3de 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -50,6 +50,13 @@ struct rmid_entry { */ static LIST_HEAD(rmid_free_lru); +/* + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. + * Indexed by CLOSID. Protected by rdtgroup_mutex. + */ +static u32 *closid_num_dirty_rmid; + /* * @rmid_limbo_count - count of currently unused but (potentially) * dirty RMIDs. @@ -292,6 +299,17 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, return 0; } +static void limbo_release_entry(struct rmid_entry *entry) +{ + lockdep_assert_held(&rdtgroup_mutex); + + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]--; +} + /* * Check the RMIDs that are marked as busy for this domain. 
If the * reported LLC occupancy is below the threshold clear the busy bit and @@ -328,10 +346,8 @@ void __check_limbo(struct rdt_domain *d, bool force_free) if (force_free || !rmid_dirty) { clear_bit(idx, d->rmid_busy_llc); - if (!--entry->busy) { - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - } + if (!--entry->busy) + limbo_release_entry(entry); } cur_idx = idx + 1; } @@ -398,6 +414,8 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) u64 val = 0; u32 idx; + lockdep_assert_held(&rdtgroup_mutex); + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; @@ -423,10 +441,13 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) } put_cpu(); - if (entry->busy) + if (entry->busy) { rmid_limbo_count++; - else + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; + } else { list_add_tail(&entry->list, &rmid_free_lru); + } } void free_rmid(u32 closid, u32 rmid) @@ -770,13 +791,39 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) static int dom_data_init(struct rdt_resource *r) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; + int err = 0, i; u32 idx; - int i; + + mutex_lock(&rdtgroup_mutex); + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 *tmp; + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. 
+ */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto out_unlock; + } + + closid_num_dirty_rmid = tmp; + } rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) - return -ENOMEM; + if (!rmid_ptrs) { + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + err = -ENOMEM; + goto out_unlock; + } for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; @@ -796,13 +843,21 @@ static int dom_data_init(struct rdt_resource *r) entry = __rmid_entry(idx); list_del(&entry->list); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; } static void __exit dom_data_exit(void) { mutex_lock(&rdtgroup_mutex); + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + kfree(rmid_ptrs); rmid_ptrs = NULL; From c7202f7a695b7351425811eeac4e974759f69ef4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:23 +0000 Subject: [PATCH 21/72] x86/resctrl: Use __set_bit()/__clear_bit() instead of open coding commit 5d920b6881f2249be3a028ce0a7f31c5cc61b1ee upstream. The resctrl CLOSID allocator uses a single 32bit word to track which CLOSID are free. The setting and clearing of bits is open coded. Convert the existing open coded bit manipulations of closid_free_map to use __set_bit() and friends. These don't need to be atomic as this list is protected by the mutex. Intel-SIG: commit 5d920b6881f2 x86/resctrl: Use __set_bit()/__clear_bit() instead of open coding. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-10-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index dcffd1c4a476f..bc6e0f83c8472 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -111,7 +111,7 @@ void rdt_staged_configs_clear(void) * - Our choices on how to configure each resource become progressively more * limited as the number of resources grows. */ -static int closid_free_map; +static unsigned long closid_free_map; static int closid_free_map_len; int closids_supported(void) @@ -130,8 +130,8 @@ static void closid_init(void) closid_free_map = BIT_MASK(rdt_min_closid) - 1; - /* CLOSID 0 is always reserved for the default group */ - closid_free_map &= ~1; + /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ + __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); closid_free_map_len = rdt_min_closid; } @@ -139,17 +139,21 @@ static int closid_alloc(void) { u32 closid = ffs(closid_free_map); + lockdep_assert_held(&rdtgroup_mutex); + if (closid == 0) return -ENOSPC; closid--; - closid_free_map &= ~(1 << closid); + __clear_bit(closid, &closid_free_map); return closid; } void closid_free(int closid) { - closid_free_map |= 1 << closid; + lockdep_assert_held(&rdtgroup_mutex); + + __set_bit(closid, &closid_free_map); } /** @@ -161,7 +165,9 @@ void closid_free(int closid) */ static bool closid_allocated(unsigned int closid) { - return (closid_free_map & (1 << closid)) == 0; + 
lockdep_assert_held(&rdtgroup_mutex); + + return !test_bit(closid, &closid_free_map); } /** From add9a881e961182105012825c4f9780af25e3823 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:24 +0000 Subject: [PATCH 22/72] x86/resctrl: Allocate the cleanest CLOSID by searching closid_num_dirty_rmid commit 6eac36bb9eb0349c983313c71692c19d50b56878 upstream. MPAM's PMG bits extend its PARTID space, meaning the same PMG value can be used for different control groups. This means once a CLOSID is allocated, all its monitoring ids may still be dirty, and held in limbo. Instead of allocating the first free CLOSID, on architectures where CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is enabled, search closid_num_dirty_rmid[] to find the cleanest CLOSID. The CLOSID found is returned to closid_alloc() for the free list to be updated. Intel-SIG: commit 6eac36bb9eb0 x86/resctrl: Allocate the cleanest CLOSID by searching closid_num_dirty_rmid. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-11-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/internal.h | 2 ++ arch/x86/kernel/cpu/resctrl/monitor.c | 45 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 19 ++++++++--- 3 files changed, 61 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 872ba1a341039..b7b9d9230bef0 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -566,5 +566,7 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init thread_throttle_mode_init(void); void __init mbm_config_rftype_init(const char *config); void rdt_staged_configs_clear(void); +bool closid_allocated(unsigned int closid); +int resctrl_find_cleanest_closid(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 13b0c8d14f3de..101f1b112d170 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -386,6 +386,51 @@ static struct rmid_entry *resctrl_find_free_rmid(u32 closid) return ERR_PTR(-ENOSPC); } +/** + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated + * RMID are clean, or the CLOSID that has + * the most clean RMID. + * + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID + * may not be able to allocate clean RMID. To avoid this the allocator will + * choose the CLOSID with the most clean RMID. 
+ * + * When the CLOSID and RMID are independent numbers, the first free CLOSID will + * be returned. + */ +int resctrl_find_cleanest_closid(void) +{ + u32 cleanest_closid = ~0; + int i = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return -EIO; + + for (i = 0; i < closids_supported(); i++) { + int num_dirty; + + if (closid_allocated(i)) + continue; + + num_dirty = closid_num_dirty_rmid[i]; + if (num_dirty == 0) + return i; + + if (cleanest_closid == ~0) + cleanest_closid = i; + + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) + cleanest_closid = i; + } + + if (cleanest_closid == ~0) + return -ENOSPC; + + return cleanest_closid; +} + /* * For MPAM the RMID value is not unique, and has to be considered with * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index bc6e0f83c8472..8fc46204a6ccd 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -137,13 +137,22 @@ static void closid_init(void) static int closid_alloc(void) { - u32 closid = ffs(closid_free_map); + int cleanest_closid; + u32 closid; lockdep_assert_held(&rdtgroup_mutex); - if (closid == 0) - return -ENOSPC; - closid--; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + cleanest_closid = resctrl_find_cleanest_closid(); + if (cleanest_closid < 0) + return cleanest_closid; + closid = cleanest_closid; + } else { + closid = ffs(closid_free_map); + if (closid == 0) + return -ENOSPC; + closid--; + } __clear_bit(closid, &closid_free_map); return closid; @@ -163,7 +172,7 @@ void closid_free(int closid) * Return: true if @closid is currently associated with a resource group, * false if @closid is free */ -static bool closid_allocated(unsigned int closid) +bool closid_allocated(unsigned int closid) { lockdep_assert_held(&rdtgroup_mutex); From 
ebac54ad50584265e94c1ca84a6d6a248afed4d5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:25 +0000 Subject: [PATCH 23/72] x86/resctrl: Move CLOSID/RMID matching and setting to use helpers commit 6eca639d8340b569ff78ffd753796e83ef7075ae upstream. When switching tasks, the CLOSID and RMID that the new task should use are stored in struct task_struct. For x86 the CLOSID known by resctrl, the value in task_struct, and the value written to the CPU register are all the same thing. MPAM's CPU interface has two different PARTIDs - one for data accesses the other for instruction fetch. Storing resctrl's CLOSID value in struct task_struct implies the arch code knows whether resctrl is using CDP. Move the matching and setting of the struct task_struct properties to use helpers. This allows arm64 to store the hardware format of the register, instead of having to convert it each time. __rdtgroup_move_task()s use of READ_ONCE()/WRITE_ONCE() ensures torn values aren't seen as another CPU may schedule the task being moved while the value is being changed. MPAM has an additional corner-case here as the PMG bits extend the PARTID space. If the scheduler sees a new-CLOSID but old-RMID, the task will dirty an RMID that the limbo code is not watching causing an inaccurate count. x86's RMID are independent values, so the limbo code will still be watching the old-RMID in this circumstance. To avoid this, arm64 needs both the CLOSID/RMID WRITE_ONCE()d together. Both values must be provided together. Because MPAM's RMID values are not unique, the CLOSID must be provided when matching the RMID. Intel-SIG: commit 6eca639d8340 x86/resctrl: Move CLOSID/RMID matching and setting to use helpers. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-12-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/include/asm/resctrl.h | 18 ++++++++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 62 ++++++++++++++++---------- 2 files changed, 56 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index db4c84dde2d54..1d274dbabc444 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -95,6 +95,24 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) return val * scale; } +static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk, + u32 closid, u32 rmid) +{ + WRITE_ONCE(tsk->closid, closid); + WRITE_ONCE(tsk->rmid, rmid); +} + +static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + return READ_ONCE(tsk->closid) == closid; +} + +static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored, + u32 rmid) +{ + return READ_ONCE(tsk->rmid) == rmid; +} + static inline void resctrl_sched_in(struct task_struct *tsk) { if (static_branch_likely(&rdt_enable_key)) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 8fc46204a6ccd..e42cbdf8f6a3c 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -102,7 +102,7 @@ void rdt_staged_configs_clear(void) * * Using a global CLOSID across all resources has some advantages and * some drawbacks: - * + We can simply set "current->closid" to assign a task to a resource + * + We can simply set current's closid to assign a task to a resource * group. 
* + Context switch code can avoid extra memory references deciding which * CLOSID to load into the PQR_ASSOC MSR @@ -574,14 +574,26 @@ static void update_task_closid_rmid(struct task_struct *t) _update_task_closid_rmid(t); } +static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) +{ + u32 closid, rmid = rdtgrp->mon.rmid; + + if (rdtgrp->type == RDTCTRL_GROUP) + closid = rdtgrp->closid; + else if (rdtgrp->type == RDTMON_GROUP) + closid = rdtgrp->mon.parent->closid; + else + return false; + + return resctrl_arch_match_closid(tsk, closid) && + resctrl_arch_match_rmid(tsk, closid, rmid); +} + static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { /* If the task is already in rdtgrp, no need to move the task. */ - if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && - tsk->rmid == rdtgrp->mon.rmid) || - (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && - tsk->closid == rdtgrp->mon.parent->closid)) + if (task_in_rdtgroup(tsk, rdtgrp)) return 0; /* @@ -592,19 +604,19 @@ static int __rdtgroup_move_task(struct task_struct *tsk, * For monitor groups, can move the tasks only from * their parent CTRL group. 
*/ - - if (rdtgrp->type == RDTCTRL_GROUP) { - WRITE_ONCE(tsk->closid, rdtgrp->closid); - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) { - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; } + if (rdtgrp->type == RDTMON_GROUP) + resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, + rdtgrp->mon.rmid); + else + resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, + rdtgrp->mon.rmid); + /* * Ensure the task's closid and rmid are written before determining if * the task is current that will decide if it will be interrupted. @@ -626,14 +638,15 @@ static int __rdtgroup_move_task(struct task_struct *tsk, static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); + return (rdt_alloc_capable && (r->type == RDTCTRL_GROUP) && + resctrl_arch_match_closid(t, r->closid)); } static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); + return (rdt_mon_capable && (r->type == RDTMON_GROUP) && + resctrl_arch_match_rmid(t, r->mon.parent->closid, + r->mon.rmid)); } /** @@ -884,7 +897,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, rdtg->mode != RDT_MODE_EXCLUSIVE) continue; - if (rdtg->closid != tsk->closid) + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) continue; seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? 
"/" : "", @@ -892,7 +905,8 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, seq_puts(s, "mon:"); list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, mon.crdtgrp_list) { - if (tsk->rmid != crg->mon.rmid) + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, + crg->mon.rmid)) continue; seq_printf(s, "%s", crg->kn->name); break; @@ -2820,8 +2834,8 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - WRITE_ONCE(t->closid, to->closid); - WRITE_ONCE(t->rmid, to->mon.rmid); + resctrl_arch_set_closid_rmid(t, to->closid, + to->mon.rmid); /* * Order the closid/rmid stores above before the loads From 903bec714a0259379752fa11fcfd21de2d3b202d Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:26 +0000 Subject: [PATCH 24/72] x86/resctrl: Add cpumask_any_housekeeping() for limbo/overflow commit a4846aaf39455fe69fce3522b385319383666eef upstream. The limbo and overflow code picks a CPU to use from the domain's list of online CPUs. Work is then scheduled on these CPUs to maintain the limbo list and any counters that may overflow. cpumask_any() may pick a CPU that is marked nohz_full, which will either penalise the work that CPU was dedicated to, or delay the processing of limbo list or counters that may overflow. Perhaps indefinitely. Delaying the overflow handling will skew the bandwidth values calculated by mba_sc, which expects to be called once a second. Add cpumask_any_housekeeping() as a replacement for cpumask_any() that prefers housekeeping CPUs. This helper will still return a nohz_full CPU if that is the only option. The CPU to use is re-evaluated each time the limbo/overflow work runs. This ensures the work will move off a nohz_full CPU once a housekeeping CPU is available. Intel-SIG: commit a4846aaf3945 x86/resctrl: Add cpumask_any_housekeeping() for limbo/overflow. 
Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-13-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/internal.h | 24 ++++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/monitor.c | 20 +++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index b7b9d9230bef0..81f5de916db8c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -55,6 +56,29 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) +/** + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that + * aren't marked nohz_full + * @mask: The mask to pick a CPU from. + * + * Returns a CPU in @mask. If there are housekeeping CPUs that don't use + * nohz_full, these are preferred. 
+ */ +static inline unsigned int cpumask_any_housekeeping(const struct cpumask *mask) +{ + unsigned int cpu, hk_cpu; + + cpu = cpumask_any(mask); + if (!tick_nohz_full_cpu(cpu)) + return cpu; + + hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); + if (hk_cpu < nr_cpu_ids) + cpu = hk_cpu; + + return cpu; +} + struct rdt_fs_context { struct kernfs_fs_context kfc; bool enable_cdpl2; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 101f1b112d170..38f85e53ca931 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -761,7 +761,6 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - int cpu = smp_processor_id(); struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); @@ -770,8 +769,11 @@ void cqm_handle_limbo(struct work_struct *work) __check_limbo(d, false); - if (has_busy_rmid(d)) - schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + if (has_busy_rmid(d)) { + d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask); + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, + delay); + } mutex_unlock(&rdtgroup_mutex); } @@ -781,7 +783,7 @@ void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask); dom->cqm_work_cpu = cpu; schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); @@ -791,7 +793,6 @@ void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - int cpu = smp_processor_id(); struct list_head *head; struct rdt_resource *r; struct rdt_domain *d; @@ -815,7 +816,12 @@ void mbm_handle_overflow(struct work_struct *work) update_mba_bw(prgrp, d); } - schedule_delayed_work_on(cpu, 
&d->mbm_over, delay); + /* + * Re-check for housekeeping CPUs. This allows the overflow handler to + * move off a nohz_full CPU quickly. + */ + d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask); + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); out_unlock: mutex_unlock(&rdtgroup_mutex); @@ -828,7 +834,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) if (!static_branch_likely(&rdt_mon_enable_key)) return; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask); dom->mbm_work_cpu = cpu; schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } From 484887a6c223db508df6bf5d9c93d9c54990b43a Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:27 +0000 Subject: [PATCH 25/72] x86/resctrl: Queue mon_event_read() instead of sending an IPI commit 09909e098113bed99c9f63e1df89073e92c69891 upstream. Intel is blessed with an abundance of monitors, one per RMID, that can be read from any CPU in the domain. MPAMs monitors reside in the MMIO MSC, the number implemented is up to the manufacturer. This means when there are fewer monitors than needed, they need to be allocated and freed. MPAM's CSU monitors are used to back the 'llc_occupancy' monitor file. The CSU counter is allowed to return 'not ready' for a small number of micro-seconds after programming. To allow one CSU hardware monitor to be used for multiple control or monitor groups, the CPU accessing the monitor needs to be able to block when configuring and reading the counter. Worse, the domain may be broken up into slices, and the MMIO accesses for each slice may need performing from different CPUs. These two details mean MPAMs monitor code needs to be able to sleep, and IPI another CPU in the domain to read from a resource that has been sliced. mon_event_read() already invokes mon_event_count() via IPI, which means this isn't possible. 
On systems using nohz-full, some CPUs need to be interrupted to run kernel work as they otherwise stay in user-space running realtime workloads. Interrupting these CPUs should be avoided, and scheduling work on them may never complete. Change mon_event_read() to pick a housekeeping CPU, (one that is not using nohz_full) and schedule mon_event_count() and wait. If all the CPUs in a domain are using nohz-full, then an IPI is used as the fallback. This function is only used in response to a user-space filesystem request (not the timing sensitive overflow code). This allows MPAM to hide the slice behaviour from resctrl, and to keep the monitor-allocation in monitor.c. When the IPI fallback is used on machines where MPAM needs to make an access on multiple CPUs, the counter read will always fail. Intel-SIG: commit 09909e098113 x86/resctrl: Queue mon_event_read() instead of sending an IPI. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Peter Newman Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-14-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 26 +++++++++++++++++++++-- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index beccb0e87ba74..e933e1cdb1c92 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -19,6 +19,8 @@ #include #include #include +#include + #include "internal.h" /* @@ -522,12 +524,21 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } 
+static int smp_mon_event_count(void *arg) +{ + mon_event_count(arg); + + return 0; +} + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first) { + int cpu; + /* - * setup the parameters to send to the IPI to read the data. + * Setup the parameters to pass to mon_event_count() to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; @@ -536,7 +547,18 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->val = 0; rr->first = first; - smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + cpu = cpumask_any_housekeeping(&d->cpu_mask); + + /* + * cpumask_any_housekeeping() prefers housekeeping CPUs, but + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. + * MPAM's resctrl_arch_rmid_read() is unable to read the + * counters on some platforms if its called in IRQ context. + */ + if (tick_nohz_full_cpu(cpu)) + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + else + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 38f85e53ca931..fd060ef86f386 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -585,7 +585,7 @@ static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* - * This is called via IPI to read the CQM/MBM counters + * This is scheduled by mon_event_read() to read the CQM/MBM counters * on a domain. */ void mon_event_count(void *info) From e1e636d38f6cc075a735d029c41e2556549b23bc Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:28 +0000 Subject: [PATCH 26/72] x86/resctrl: Allow resctrl_arch_rmid_read() to sleep commit 6fde1424f29b151b9dc8c660eecf4d1645facea5 upstream. MPAM's cache occupancy counters can take a little while to settle once the monitor has been configured. 
The maximum settling time is described to the driver via a firmware table. The value could be large enough that it makes sense to sleep. To avoid exposing this to resctrl, it should be hidden behind MPAM's resctrl_arch_rmid_read(). resctrl_arch_rmid_read() may be called via IPI, meaning it is unable to sleep. In this case, it should return an error if it needs to sleep. This will only affect MPAM platforms where the cache occupancy counter isn't available immediately, nohz_full is in use, and there are no housekeeping CPUs in the necessary domain. There are three callers of resctrl_arch_rmid_read(): __mon_event_count() and __check_limbo() are both called from a non-migrateable context. mon_event_read() invokes __mon_event_count() using smp_call_on_cpu(), which adds work to the target CPU's workqueue. rdtgroup_mutex is held, meaning this cannot race with the resctrl cpuhp callback. __check_limbo() is invoked via schedule_delayed_work_on(), which also adds work to a per-cpu workqueue. The remaining call is add_rmid_to_limbo(), which is called in response to a user-space syscall that frees an RMID. This opportunistically reads the LLC occupancy counter on the current domain to see if the RMID is over the dirty threshold. This has to disable preemption to avoid reading the wrong domain's value. Disabling preemption here prevents resctrl_arch_rmid_read() from sleeping. add_rmid_to_limbo() walks each domain, but only reads the counter on one domain. If the system has more than one domain, the RMID will always be added to the limbo list. If the RMID's usage was not over the threshold, it will be removed from the list when __check_limbo() runs. Make this the default behaviour. Free RMIDs are always added to the limbo list for each domain. The user-visible effect of this is that a clean RMID is not available for re-allocation immediately after 'rmdir()' completes. This behaviour was never portable as it never happened on a machine with multiple domains.
Removing this path allows resctrl_arch_rmid_read() to sleep if its called with interrupts unmasked. Document this is the expected behaviour, and add a might_sleep() annotation to catch changes that won't work on arm64. Intel-SIG: commit 6fde1424f29b x86/resctrl: Allow resctrl_arch_rmid_read() to sleep. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-15-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/monitor.c | 25 +++++-------------------- include/linux/resctrl.h | 23 ++++++++++++++++++++++- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index fd060ef86f386..e8aeff6673ea3 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -277,6 +277,8 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u64 msr_val, chunks; int ret; + resctrl_arch_rmid_read_context_check(); + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) return -EINVAL; @@ -455,8 +457,6 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_domain *d; - int cpu, err; - u64 val = 0; u32 idx; lockdep_assert_held(&rdtgroup_mutex); @@ -464,17 +464,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; - cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - err = resctrl_arch_rmid_read(r, d, 
entry->closid, - entry->rmid, - QOS_L3_OCCUP_EVENT_ID, - &val); - if (err || val <= resctrl_rmid_realloc_threshold) - continue; - } - /* * For the first limbo RMID in the domain, * setup up the limbo worker. @@ -484,15 +474,10 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) set_bit(idx, d->rmid_busy_llc); entry->busy++; } - put_cpu(); - if (entry->busy) { - rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]++; - } else { - list_add_tail(&entry->list, &rmid_free_lru); - } + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; } void free_rmid(u32 closid, u32 rmid) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index bd4ec22b5a961..8649fc84aac25 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -236,7 +236,12 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * @eventid: eventid to read, e.g. L3 occupancy. * @val: result of the counter read in bytes. * - * Call from process context on a CPU that belongs to domain @d. + * Some architectures need to sleep when first programming some of the counters. + * (specifically: arm64's MPAM cache occupancy counters can return 'not ready' + * for a short period of time). Call from a non-migrateable process context on + * a CPU that belongs to domain @d. e.g. use smp_call_on_cpu() or + * schedule_work_on(). This function can be called with interrupts masked, + * e.g. using smp_call_function_any(), but may consistently return an error. * * Return: * 0 on success, or -EIO, -EINVAL etc on error. 
@@ -245,6 +250,22 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, u64 *val); +/** + * resctrl_arch_rmid_read_context_check() - warn about invalid contexts + * + * When built with CONFIG_DEBUG_ATOMIC_SLEEP generate a warning when + * resctrl_arch_rmid_read() is called with preemption disabled. + * + * The contract with resctrl_arch_rmid_read() is that if interrupts + * are unmasked, it can sleep. This allows NOHZ_FULL systems to use an + * IPI, (and fail if the call needed to sleep), while most of the time + * the work is scheduled, allowing the call to sleep. + */ +static inline void resctrl_arch_rmid_read_context_check(void) +{ + if (!irqs_disabled()) + might_sleep(); +} /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid From 51d9a3f0752b4a84f84f4cbcb2be67367c48e7f8 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:29 +0000 Subject: [PATCH 27/72] x86/resctrl: Allow arch to allocate memory needed in resctrl_arch_rmid_read() commit e557999f80a5ee4ec812f594ab42bb76c3ec4eb2 upstream. Depending on the number of monitors available, Arm's MPAM may need to allocate a monitor prior to reading the counter value. Allocating a contended resource may involve sleeping. __check_limbo() and mon_event_count() each make multiple calls to resctrl_arch_rmid_read(). To avoid extra work on contended systems, the allocation should be valid for multiple invocations of resctrl_arch_rmid_read(). The memory or hardware allocated is not specific to a domain. Add arch hooks for this allocation, which need calling before resctrl_arch_rmid_read(). The allocated monitor is passed to resctrl_arch_rmid_read(), then freed again afterwards. The helper can be called on any CPU, and can sleep. Intel-SIG: commit e557999f80a5 x86/resctrl: Allow arch to allocate memory needed in resctrl_arch_rmid_read(). Incremental backporting patches for Intel RDT on Intel Xeon platform.
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-16-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/include/asm/resctrl.h | 11 +++++++ arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 7 +++++ arch/x86/kernel/cpu/resctrl/internal.h | 1 + arch/x86/kernel/cpu/resctrl/monitor.c | 35 +++++++++++++++++++++-- include/linux/resctrl.h | 5 +++- 5 files changed, 55 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 1d274dbabc444..29c4cc3437871 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -136,6 +136,17 @@ static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) return rmid; } +/* x86 can always read an rmid, nothing needs allocating */ +struct rdt_resource; +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) +{ + might_sleep(); + return NULL; +}; + +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, + void *ctx) { }; + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index e933e1cdb1c92..52fa0e14cb86c 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -546,6 +546,11 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->d = d; rr->val = 0; rr->first = first; + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } cpu = cpumask_any_housekeeping(&d->cpu_mask); @@ -559,6 +564,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource 
*r, smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); + + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 81f5de916db8c..e089d1a1a0555 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -137,6 +137,7 @@ struct rmid_read { bool first; int err; u64 val; + void *arch_mon_ctx; }; extern bool rdt_alloc_capable; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index e8aeff6673ea3..9b503e6ac490f 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -269,7 +269,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid, - u64 *val) + u64 *val, void *ignored) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); @@ -324,9 +324,17 @@ void __check_limbo(struct rdt_domain *d, bool force_free) u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; u32 idx, cur_idx = 1; + void *arch_mon_ctx; bool rmid_dirty; u64 val = 0; + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(arch_mon_ctx)); + return; + } + /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that * are marked as busy for occupancy < threshold. 
If the occupancy @@ -340,7 +348,8 @@ void __check_limbo(struct rdt_domain *d, bool force_free) entry = __rmid_entry(idx); if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val)) { + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { rmid_dirty = true; } else { rmid_dirty = (val >= resctrl_rmid_realloc_threshold); @@ -353,6 +362,8 @@ void __check_limbo(struct rdt_domain *d, bool force_free) } cur_idx = idx + 1; } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } bool has_busy_rmid(struct rdt_domain *d) @@ -533,7 +544,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid, - &tval); + &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -722,11 +733,27 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; rr.val = 0; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + __mon_event_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; rr.val = 0; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + __mon_event_count(closid, rmid, &rr); /* @@ -736,6 +763,8 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, */ if (is_mba_sc(NULL)) mbm_bw_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 8649fc84aac25..bf460c912bf52 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -235,6 
+235,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. * @val: result of the counter read in bytes. + * @arch_mon_ctx: An architecture specific value from + * resctrl_arch_mon_ctx_alloc(), for MPAM this identifies + * the hardware monitor allocated for this read request. * * Some architectures need to sleep when first programming some of the counters. * (specifically: arm64's MPAM cache occupancy counters can return 'not ready' @@ -248,7 +251,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); */ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, - u64 *val); + u64 *val, void *arch_mon_ctx); /** * resctrl_arch_rmid_read_context_check() - warn about invalid contexts From 337a1fdc8a70d16eb588daa7a6969b7a5ac14bd7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:30 +0000 Subject: [PATCH 28/72] x86/resctrl: Make resctrl_mounted checks explicit commit 13e5769debf09588543db83836c524148873929f upstream. The rdt_enable_key is switched when resctrl is mounted, and used to prevent a second mount of the filesystem. It also enables the architecture's context switch code. This requires another architecture to have the same set of static keys, as resctrl depends on them too. The existing users of these static keys are implicitly also checking if the filesystem is mounted. Make the resctrl_mounted checks explicit: resctrl can keep track of whether it has been mounted once. This doesn't need to be combined with whether the arch code is context switching the CLOSID. rdt_mon_enable_key is never used just to test that resctrl is mounted, but does also have this implication. Add a resctrl_mounted to all uses of rdt_mon_enable_key. This will allow the static key changing to be moved behind resctrl_arch_ calls. 
Intel-SIG: commit 13e5769debf0 x86/resctrl: Make resctrl_mounted checks explicit. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-17-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/internal.h | 1 + arch/x86/kernel/cpu/resctrl/monitor.c | 12 ++++++++++-- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 23 +++++++++++++++++------ 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index e089d1a1a0555..9bfda69637943 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -144,6 +144,7 @@ extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; extern struct list_head resctrl_schema_all; +extern bool resctrl_mounted; enum rdt_group_type { RDTCTRL_GROUP = 0, diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 9b503e6ac490f..d5d8a58d96f2a 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -813,7 +813,11 @@ void mbm_handle_overflow(struct work_struct *work) mutex_lock(&rdtgroup_mutex); - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * If the filesystem has been unmounted this work no longer needs to + * run. 
+ */ + if (!resctrl_mounted || !static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; @@ -846,7 +850,11 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * When a domain comes online there is no guarantee the filesystem is + * mounted. If not, there is no need to catch counter overflow. + */ + if (!resctrl_mounted || !static_branch_likely(&rdt_mon_enable_key)) return; cpu = cpumask_any_housekeeping(&dom->cpu_mask); dom->mbm_work_cpu = cpu; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index e42cbdf8f6a3c..857fbbc3c8390 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -42,6 +42,9 @@ LIST_HEAD(rdt_all_groups); /* list of entries for the schemata file */ LIST_HEAD(resctrl_schema_all); +/* The filesystem can only be mounted once. */ +bool resctrl_mounted; + /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; @@ -881,7 +884,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, mutex_lock(&rdtgroup_mutex); /* Return empty if resctrl has not been mounted. */ - if (!static_branch_unlikely(&rdt_enable_key)) { + if (!resctrl_mounted) { seq_puts(s, "res:\nmon:\n"); goto unlock; } @@ -2608,7 +2611,7 @@ static int rdt_get_tree(struct fs_context *fc) /* * resctrl file system can only be mounted once. 
*/ - if (static_branch_unlikely(&rdt_enable_key)) { + if (resctrl_mounted) { ret = -EBUSY; goto out; } @@ -2669,8 +2672,10 @@ static int rdt_get_tree(struct fs_context *fc) if (rdt_mon_capable) static_branch_enable_cpuslocked(&rdt_mon_enable_key); - if (rdt_alloc_capable || rdt_mon_capable) + if (rdt_alloc_capable || rdt_mon_capable) { static_branch_enable_cpuslocked(&rdt_enable_key); + resctrl_mounted = true; + } if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; @@ -2944,6 +2949,7 @@ static void rdt_kill_sb(struct super_block *sb) static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); + resctrl_mounted = false; kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); @@ -3913,7 +3919,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) * If resctrl is mounted, remove all the * per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) + if (resctrl_mounted && static_branch_unlikely(&rdt_mon_enable_key)) rmdir_mondata_subdir_allrdtgrp(r, d->id); if (is_mbm_enabled()) @@ -3990,8 +3996,13 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) if (is_llc_occupancy_enabled()) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - /* If resctrl is mounted, add per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) + /* + * If the filesystem is not mounted then only the default resource group + * exists. Creation of its directories is deferred until mount time + * by rdt_get_tree() calling mkdir_mondata_all(). + * If resctrl is mounted, add per domain monitor data directories. 
+ */ + if (resctrl_mounted && static_branch_unlikely(&rdt_mon_enable_key)) mkdir_mondata_subdir_allrdtgrp(r, d); return 0; From de92da262b5c7a0a555d68c9af2b2742f791d394 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:31 +0000 Subject: [PATCH 29/72] x86/resctrl: Move alloc/mon static keys into helpers commit 5db6a4a75c95f6967d57906ba7b82756d1985d63 upstream. resctrl enables three static keys depending on the features it has enabled. Another architecture's context switch code may look different, any static keys that control it should be buried behind helpers. Move the alloc/mon logic into arch-specific helpers as a preparatory step for making the rdt_enable_key's status something the arch code decides. This means other architectures don't have to mirror the static keys. Intel-SIG: commit 5db6a4a75c95 x86/resctrl: Move alloc/mon static keys into helpers. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-18-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/include/asm/resctrl.h | 20 ++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 5 ----- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 8 ++++---- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 29c4cc3437871..3c9137b6ad4f2 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -42,6 +42,26 @@ DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); +static inline void resctrl_arch_enable_alloc(void) +{ + 
static_branch_enable_cpuslocked(&rdt_alloc_enable_key); +} + +static inline void resctrl_arch_disable_alloc(void) +{ + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); +} + +static inline void resctrl_arch_enable_mon(void) +{ + static_branch_enable_cpuslocked(&rdt_mon_enable_key); +} + +static inline void resctrl_arch_disable_mon(void) +{ + static_branch_disable_cpuslocked(&rdt_mon_enable_key); +} + /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 9bfda69637943..78580855139dd 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -94,9 +94,6 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) return container_of(kfc, struct rdt_fs_context, kfc); } -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); -DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); - /** * struct mon_evt - Entry in the event list of a resource * @evtid: event id @@ -452,8 +449,6 @@ extern struct mutex rdtgroup_mutex; extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - extern struct dentry *debugfs_resctrl; enum resctrl_res_level { diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 857fbbc3c8390..231207f09e040 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2668,9 +2668,9 @@ static int rdt_get_tree(struct fs_context *fc) goto out_psl; if (rdt_alloc_capable) - static_branch_enable_cpuslocked(&rdt_alloc_enable_key); + resctrl_arch_enable_alloc(); if (rdt_mon_capable) - static_branch_enable_cpuslocked(&rdt_mon_enable_key); + resctrl_arch_enable_mon(); if (rdt_alloc_capable || rdt_mon_capable) { static_branch_enable_cpuslocked(&rdt_enable_key); @@ -2946,8 +2946,8 @@ static void rdt_kill_sb(struct super_block *sb) 
rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); rdtgroup_destroy_root(); - static_branch_disable_cpuslocked(&rdt_alloc_enable_key); - static_branch_disable_cpuslocked(&rdt_mon_enable_key); + resctrl_arch_disable_alloc(); + resctrl_arch_disable_mon(); static_branch_disable_cpuslocked(&rdt_enable_key); resctrl_mounted = false; kernfs_kill_sb(sb); From 9f43c91cafc017b9dda162e66ca04648c05cd737 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:32 +0000 Subject: [PATCH 30/72] x86/resctrl: Make rdt_enable_key the arch's decision to switch commit 0a2f4d9b548c5b1e2e3fcfa966f5d47b1cacff01 upstream. rdt_enable_key is switched when resctrl is mounted. It was also previously used to prevent a second mount of the filesystem. Any other architecture that wants to support resctrl has to provide identical static keys. Now that there are helpers for enabling and disabling the alloc/mon keys, resctrl doesn't need to switch this extra key, it can be done by the arch code. Use the static-key increment and decrement helpers, and change resctrl to ensure the calls are balanced. Intel-SIG: commit 0a2f4d9b548c x86/resctrl: Make rdt_enable_key the arch's decision to switch. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-19-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/include/asm/resctrl.h | 4 ++++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 11 +++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 3c9137b6ad4f2..b74aa34dc9e8f 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -45,21 +45,25 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); static inline void resctrl_arch_enable_alloc(void) { static_branch_enable_cpuslocked(&rdt_alloc_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); } static inline void resctrl_arch_disable_alloc(void) { static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); } static inline void resctrl_arch_enable_mon(void) { static_branch_enable_cpuslocked(&rdt_mon_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); } static inline void resctrl_arch_disable_mon(void) { static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); } /* diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 231207f09e040..7e57ac9d81f7b 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2672,10 +2672,8 @@ static int rdt_get_tree(struct fs_context *fc) if (rdt_mon_capable) resctrl_arch_enable_mon(); - if (rdt_alloc_capable || rdt_mon_capable) { - static_branch_enable_cpuslocked(&rdt_enable_key); + if (rdt_alloc_capable || rdt_mon_capable) resctrl_mounted = true; - } if 
(is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; @@ -2946,9 +2944,10 @@ static void rdt_kill_sb(struct super_block *sb) rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); rdtgroup_destroy_root(); - resctrl_arch_disable_alloc(); - resctrl_arch_disable_mon(); - static_branch_disable_cpuslocked(&rdt_enable_key); + if (rdt_alloc_capable) + resctrl_arch_disable_alloc(); + if (rdt_mon_capable) + resctrl_arch_disable_mon(); resctrl_mounted = false; kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); From 0434cd0dd2215de3d177c39bf2d21bd3aae5e25c Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:33 +0000 Subject: [PATCH 31/72] x86/resctrl: Add helpers for system wide mon/alloc capable commit 30017b60706c2ba72a0a4da7d5ef8f5fa95a2f01 upstream. resctrl reads rdt_alloc_capable or rdt_mon_capable to determine whether any of the resources support the corresponding features. resctrl also uses the static keys that affect the architecture's context-switch code to determine the same thing. This forces another architecture to have the same static keys. As the static key is enabled based on the capable flag, and none of the filesystem uses of these are in the scheduler path, move the capable flags behind helpers, and use these in the filesystem code instead of the static key. After this change, only the architecture code manages and uses the static keys to ensure __resctrl_sched_in() does not need runtime checks. This avoids multiple architectures having to define the same static keys. Cases where the static key implicitly tested if the resctrl filesystem was mounted all have an explicit check now. Intel-SIG: commit 30017b60706c x86/resctrl: Add helpers for system wide mon/alloc capable. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-20-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/include/asm/resctrl.h | 13 ++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 2 -- arch/x86/kernel/cpu/resctrl/monitor.c | 4 +-- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 6 ++-- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 38 +++++++++++------------ 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index b74aa34dc9e8f..12dbd2588ca7c 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -38,10 +38,18 @@ struct resctrl_pqr_state { DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; + DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); +static inline bool resctrl_arch_alloc_capable(void) +{ + return rdt_alloc_capable; +} + static inline void resctrl_arch_enable_alloc(void) { static_branch_enable_cpuslocked(&rdt_alloc_enable_key); @@ -54,6 +62,11 @@ static inline void resctrl_arch_disable_alloc(void) static_branch_dec_cpuslocked(&rdt_enable_key); } +static inline bool resctrl_arch_mon_capable(void) +{ + return rdt_mon_capable; +} + static inline void resctrl_arch_enable_mon(void) { static_branch_enable_cpuslocked(&rdt_mon_enable_key); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 78580855139dd..3ee855c374471 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -137,8 +137,6 @@ struct rmid_read { 
void *arch_mon_ctx; }; -extern bool rdt_alloc_capable; -extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; extern struct list_head resctrl_schema_all; extern bool resctrl_mounted; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index d5d8a58d96f2a..92d7ba674003b 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -817,7 +817,7 @@ void mbm_handle_overflow(struct work_struct *work) * If the filesystem has been unmounted this work no longer needs to * run. */ - if (!resctrl_mounted || !static_branch_likely(&rdt_mon_enable_key)) + if (!resctrl_mounted || !resctrl_arch_mon_capable()) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; @@ -854,7 +854,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) * When a domain comes online there is no guarantee the filesystem is * mounted. If not, there is no need to catch counter overflow. */ - if (!resctrl_mounted || !static_branch_likely(&rdt_mon_enable_key)) + if (!resctrl_mounted || !resctrl_arch_mon_capable()) return; cpu = cpumask_any_housekeeping(&dom->cpu_mask); dom->mbm_work_cpu = cpu; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d8f44113ed1f1..8056bed033cc7 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -581,7 +581,7 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); if (ret) goto err_cpus_list; @@ -628,7 +628,7 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); if (ret) goto err_cpus_list; @@ -776,7 +776,7 @@ int 
rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) { int ret; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = alloc_rmid(rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Out of RMIDs\n"); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 7e57ac9d81f7b..ed5fc677a99dd 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -641,13 +641,13 @@ static int __rdtgroup_move_task(struct task_struct *tsk, static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_alloc_capable && (r->type == RDTCTRL_GROUP) && + return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && resctrl_arch_match_closid(t, r->closid)); } static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_mon_capable && (r->type == RDTMON_GROUP) && + return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && resctrl_arch_match_rmid(t, r->mon.parent->closid, r->mon.rmid)); } @@ -2632,7 +2632,7 @@ static int rdt_get_tree(struct fs_context *fc) closid_init(); - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) flags |= RFTYPE_MON; ret = rdtgroup_add_files(rdtgroup_default.kn, flags); @@ -2645,7 +2645,7 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_schemata_free; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = mongroup_create_dir(rdtgroup_default.kn, &rdtgroup_default, "mon_groups", &kn_mongrp); @@ -2667,12 +2667,12 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_psl; - if (rdt_alloc_capable) + if (resctrl_arch_alloc_capable()) resctrl_arch_enable_alloc(); - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) resctrl_arch_enable_mon(); - if (rdt_alloc_capable || rdt_mon_capable) + if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) resctrl_mounted = true; if (is_mbm_enabled()) { @@ -2686,10 +2686,10 @@ static int rdt_get_tree(struct fs_context 
*fc) out_psl: rdt_pseudo_lock_release(); out_mondata: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mongrp); out_info: kernfs_remove(kn_info); @@ -2944,9 +2944,9 @@ static void rdt_kill_sb(struct super_block *sb) rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); rdtgroup_destroy_root(); - if (rdt_alloc_capable) + if (resctrl_arch_alloc_capable()) resctrl_arch_disable_alloc(); - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) resctrl_arch_disable_mon(); resctrl_mounted = false; kernfs_kill_sb(sb); @@ -3326,7 +3326,7 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) { int ret; - if (!rdt_mon_capable) + if (!resctrl_arch_mon_capable()) return 0; ret = alloc_rmid(rdtgrp->closid); @@ -3348,7 +3348,7 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) { - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) free_rmid(rgrp->closid, rgrp->mon.rmid); } @@ -3412,7 +3412,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, if (rtype == RDTCTRL_GROUP) { files = RFTYPE_BASE | RFTYPE_CTRL; - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) files |= RFTYPE_MON; } else { files = RFTYPE_BASE | RFTYPE_MON; @@ -3521,7 +3521,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { /* * Create an empty mon_groups directory to hold the subset * of tasks and cpus to monitor. 
@@ -3576,14 +3576,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * allocation is supported, add a control and monitoring * subdirectory */ - if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); /* * If RDT monitoring is supported and the parent directory is a valid * "mon_groups" directory, add a monitoring subdirectory. */ - if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; @@ -3918,7 +3918,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) * If resctrl is mounted, remove all the * per domain monitor data directories. */ - if (resctrl_mounted && static_branch_unlikely(&rdt_mon_enable_key)) + if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, d->id); if (is_mbm_enabled()) @@ -4001,7 +4001,7 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) * by rdt_get_tree() calling mkdir_mondata_all(). * If resctrl is mounted, add per domain monitor data directories. */ - if (resctrl_mounted && static_branch_unlikely(&rdt_mon_enable_key)) + if (resctrl_mounted && resctrl_arch_mon_capable()) mkdir_mondata_subdir_allrdtgrp(r, d); return 0; From 9148de14d0d48644eeae1e93e1c6510328037bf2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:34 +0000 Subject: [PATCH 32/72] x86/resctrl: Add CPU online callback for resctrl work commit 1b3e50ce7f5001f1e0edaf7d6abea43b264db7ee upstream. The resctrl architecture specific code may need to create a domain when a CPU comes online, it also needs to reset the CPUs PQR_ASSOC register. The resctrl filesystem code needs to update the rdtgroup_default CPU mask when CPUs are brought online. 
Currently, this is all done in one function, resctrl_online_cpu(). It will need to be split into architecture and filesystem parts before resctrl can be moved to /fs/. Pull the rdtgroup_default update work out as a filesystem specific cpu_online helper. resctrl_online_cpu() is the obvious name for this, which means the version in core.c needs renaming. resctrl_online_cpu() is called by the arch code once it has done the work to add the new CPU to any domains. In future patches, resctrl_online_cpu() will take the rdtgroup_mutex itself. Intel-SIG: commit 1b3e50ce7f50 x86/resctrl: Add CPU online callback for resctrl work. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-21-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 8 ++++---- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 8 ++++++++ include/linux/resctrl.h | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d1dc80a21ea96..4627d447bc3dc 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -606,16 +606,16 @@ static void clear_closid_rmid(int cpu) RESCTRL_RESERVED_CLOSID); } -static int resctrl_online_cpu(unsigned int cpu) +static int resctrl_arch_online_cpu(unsigned int cpu) { struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); - /* The cpu is set in default rdtgroup after online. 
*/ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); clear_closid_rmid(cpu); + + resctrl_online_cpu(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -967,7 +967,7 @@ static int __init resctrl_late_init(void) state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", - resctrl_online_cpu, resctrl_offline_cpu); + resctrl_arch_online_cpu, resctrl_offline_cpu); if (state < 0) return state; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index ed5fc677a99dd..38d3b19a3aca1 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -4007,6 +4007,14 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) return 0; } +void resctrl_online_cpu(unsigned int cpu) +{ + lockdep_assert_held(&rdtgroup_mutex); + + /* The CPU is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); +} + /* * rdtgroup_init - rdtgroup initialization * diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index bf460c912bf52..4c4bad3c34e48 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -223,6 +223,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_online_cpu(unsigned int cpu); /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid From 6bda055dd81c1adcd1bcc87adc3ccef0b5ee6254 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:35 +0000 Subject: [PATCH 33/72] x86/resctrl: Allow overflow/limbo handlers to be scheduled on any-but CPU commit 978fcca954cb52249babbc14e53de53c88dd6433 upstream. When a CPU is taken offline resctrl may need to move the overflow or limbo handlers to run on a different CPU. 
Once the offline callbacks have been split, cqm_setup_limbo_handler() will be called while the CPU that is going offline is still present in the CPU mask. Pass the CPU to exclude to cqm_setup_limbo_handler() and mbm_setup_overflow_handler(). These functions can use a variant of cpumask_any_but() when selecting the CPU. -1 is used to indicate no CPUs need excluding. Intel-SIG: commit 978fcca954cb x86/resctrl: Allow overflow/limbo handlers to be scheduled on any-but CPU. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Babu Moger Reviewed-by: Reinette Chatre Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-22-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 8 +++-- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +- arch/x86/kernel/cpu/resctrl/internal.h | 33 ++++++++++++++---- arch/x86/kernel/cpu/resctrl/monitor.c | 42 ++++++++++++++++++----- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 6 ++-- include/linux/resctrl.h | 2 ++ 6 files changed, 72 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4627d447bc3dc..55322ba629da3 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -584,12 +584,16 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0); + /* + * temporary: exclude_cpu=-1 as this CPU has already + * been removed by cpumask_clear_cpu()d + */ + mbm_setup_overflow_handler(d, 0, RESCTRL_PICK_ANY_CPU); } if (is_llc_occupancy_enabled() && cpu 
== d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0); + cqm_setup_limbo_handler(d, 0, RESCTRL_PICK_ANY_CPU); } } } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 52fa0e14cb86c..20b02d6f02c19 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -552,7 +552,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, return; } - cpu = cpumask_any_housekeeping(&d->cpu_mask); + cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU); /* * cpumask_any_housekeeping() prefers housekeeping CPUs, but diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 3ee855c374471..c99f26ebe7a65 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -60,19 +60,36 @@ * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that * aren't marked nohz_full * @mask: The mask to pick a CPU from. + * @exclude_cpu:The CPU to avoid picking. * - * Returns a CPU in @mask. If there are housekeeping CPUs that don't use - * nohz_full, these are preferred. + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping + * CPUs that don't use nohz_full, these are preferred. Pass + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. + * + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. 
*/ -static inline unsigned int cpumask_any_housekeeping(const struct cpumask *mask) +static inline unsigned int +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) { unsigned int cpu, hk_cpu; - cpu = cpumask_any(mask); - if (!tick_nohz_full_cpu(cpu)) + if (exclude_cpu == RESCTRL_PICK_ANY_CPU) + cpu = cpumask_any(mask); + else + cpu = cpumask_any_but(mask, exclude_cpu); + + if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) return cpu; + /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ + if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) + return cpu; + + /* Try to find a CPU that isn't nohz_full to use in preference */ hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); + if (hk_cpu == exclude_cpu) + hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); + if (hk_cpu < nr_cpu_ids) cpu = hk_cpu; @@ -573,11 +590,13 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, - unsigned long delay_ms); + unsigned long delay_ms, + int exclude_cpu); void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 92d7ba674003b..67edd4c440f06 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -481,7 +481,8 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) * setup up the limbo worker. 
*/ if (!has_busy_rmid(d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); set_bit(idx, d->rmid_busy_llc); entry->busy++; } @@ -784,7 +785,8 @@ void cqm_handle_limbo(struct work_struct *work) __check_limbo(d, false); if (has_busy_rmid(d)) { - d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask); + d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, delay); } @@ -792,15 +794,25 @@ void cqm_handle_limbo(struct work_struct *work) mutex_unlock(&rdtgroup_mutex); } -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this + * domain. + * @dom: The domain the limbo handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - cpu = cpumask_any_housekeeping(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->cqm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); } void mbm_handle_overflow(struct work_struct *work) @@ -838,14 +850,24 @@ void mbm_handle_overflow(struct work_struct *work) * Re-check for housekeeping CPUs. This allows the overflow handler to * move off a nohz_full CPU quickly. 
*/ - d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask); + d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); out_unlock: mutex_unlock(&rdtgroup_mutex); } -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this + * domain. + * @dom: The domain the overflow handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; @@ -856,9 +878,11 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) */ if (!resctrl_mounted || !resctrl_arch_mon_capable()) return; - cpu = cpumask_any_housekeeping(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } static int dom_data_init(struct rdt_resource *r) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 38d3b19a3aca1..f5688c79d94f4 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2678,7 +2678,8 @@ static int rdt_get_tree(struct fs_context *fc) if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; list_for_each_entry(dom, &r->domains, list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } goto out; @@ -3989,7 +3990,8 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) if (is_mbm_enabled()) { 
INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } if (is_llc_occupancy_enabled()) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 4c4bad3c34e48..ccbbbe5d18d34 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -10,6 +10,8 @@ #define RESCTRL_RESERVED_CLOSID 0 #define RESCTRL_RESERVED_RMID 0 +#define RESCTRL_PICK_ANY_CPU -1 + #ifdef CONFIG_PROC_CPU_RESCTRL int proc_resctrl_show(struct seq_file *m, From f2dfd2200caaabaf8c885b5b1d1bb03d30c17cee Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:36 +0000 Subject: [PATCH 34/72] x86/resctrl: Add CPU offline callback for resctrl work commit 258c91e84fedc789353a35ad91d827a9111d3cbd upstream. The resctrl architecture specific code may need to free a domain when a CPU goes offline, it also needs to reset the CPUs PQR_ASSOC register. Amongst other things, the resctrl filesystem code needs to clear this CPU from the cpu_mask of any control and monitor groups. Currently, this is all done in core.c and called from resctrl_offline_cpu(), making the split between architecture and filesystem code unclear. Move the filesystem work to remove the CPU from the control and monitor groups into a filesystem helper called resctrl_offline_cpu(), and rename the one in core.c resctrl_arch_offline_cpu(). Intel-SIG: commit 258c91e84fed x86/resctrl: Add CPU offline callback for resctrl work. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-23-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 25 +++++-------------------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 24 ++++++++++++++++++++++++ include/linux/resctrl.h | 1 + 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 55322ba629da3..4aedefa22f611 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -625,31 +625,15 @@ static int resctrl_arch_online_cpu(unsigned int cpu) return 0; } -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +static int resctrl_arch_offline_cpu(unsigned int cpu) { - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { - break; - } - } -} - -static int resctrl_offline_cpu(unsigned int cpu) -{ - struct rdtgroup *rdtgrp; struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); + resctrl_offline_cpu(cpu); + for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); @@ -971,7 +955,8 @@ static int __init resctrl_late_init(void) state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", - resctrl_arch_online_cpu, resctrl_offline_cpu); + resctrl_arch_online_cpu, + resctrl_arch_offline_cpu); if (state < 0) return state; diff --git 
a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f5688c79d94f4..5bd3d8fb3f67d 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -4017,6 +4017,30 @@ void resctrl_online_cpu(unsigned int cpu) cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) + break; + } +} + +void resctrl_offline_cpu(unsigned int cpu) +{ + struct rdtgroup *rdtgrp; + + lockdep_assert_held(&rdtgroup_mutex); + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } +} + /* * rdtgroup_init - rdtgroup initialization * diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index ccbbbe5d18d34..270ff1d5c051c 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -226,6 +226,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_online_cpu(unsigned int cpu); +void resctrl_offline_cpu(unsigned int cpu); /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid From 90964f8bb889dbc2d6d3eb56395ca1dc38108220 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:37 +0000 Subject: [PATCH 35/72] x86/resctrl: Move domain helper migration into resctrl_offline_cpu() commit eeff1d4f118bdf0870227fee5a770f03056e3adc upstream. When a CPU is taken offline the resctrl filesystem code needs to check if it was the CPU nominated to perform the periodic overflow and limbo work. If so, another CPU needs to be chosen to do this work. 
This is currently done in core.c, mixed in with the code that removes the CPU from the domain's mask, and potentially free()s the domain. Move the migration of the overflow and limbo helpers into the filesystem code, into resctrl_offline_cpu(). As resctrl_offline_cpu() runs before the architecture code has removed the CPU from the domain mask, the callers need to be told which CPU is being removed, to avoid picking it as the new CPU. This uses the exclude_cpu feature previously added. Intel-SIG: commit eeff1d4f118b x86/resctrl: Move domain helper migration into resctrl_offline_cpu(). Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-24-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 16 ---------------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4aedefa22f611..b03a6c658ae5e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -580,22 +580,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) return; } - - if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { - if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - /* - * temporary: exclude_cpu=-1 as this CPU has already - * been removed by cpumask_clear_cpu()d - */ - mbm_setup_overflow_handler(d, 0, RESCTRL_PICK_ANY_CPU); - } - if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && - has_busy_rmid(d)) { - cancel_delayed_work(&d->cqm_limbo); 
- cqm_setup_limbo_handler(d, 0, RESCTRL_PICK_ANY_CPU); - } - } } static void clear_closid_rmid(int cpu) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 5bd3d8fb3f67d..777e9f6803325 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -4029,7 +4029,9 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) void resctrl_offline_cpu(unsigned int cpu) { + struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdtgroup *rdtgrp; + struct rdt_domain *d; lockdep_assert_held(&rdtgroup_mutex); @@ -4039,6 +4041,22 @@ void resctrl_offline_cpu(unsigned int cpu) break; } } + + if (!l3->mon_capable) + return; + + d = get_domain_from_cpu(cpu, l3); + if (d) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0, cpu); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0, cpu); + } + } } /* From af5ad16f19ceecaf747f1151d008ecbc78505aae Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:38 +0000 Subject: [PATCH 36/72] x86/resctrl: Separate arch and fs resctrl locks commit fb700810d30b9eb333a7bf447012e1158e35c62f upstream. resctrl has one mutex that is taken by the architecture-specific code, and the filesystem parts. The two interact via cpuhp, where the architecture code updates the domain list. Filesystem handlers that walk the domains list should not run concurrently with the cpuhp callback modifying the list. Exposing a lock from the filesystem code means the interface is not cleanly defined, and creates the possibility of cross-architecture lock ordering headaches. The interaction only exists so that certain filesystem paths are serialised against CPU hotplug. The CPU hotplug code already has a mechanism to do this using cpus_read_lock(). 
MPAM's monitors have an overflow interrupt, so it needs to be possible to walk the domains list in irq context. RCU is ideal for this, but some paths need to be able to sleep to allocate memory. Because resctrl_{on,off}line_cpu() take the rdtgroup_mutex as part of a cpuhp callback, cpus_read_lock() must always be taken first. rdtgroup_schemata_write() already does this. Most of the filesystem code's domain list walkers are currently protected by the rdtgroup_mutex taken in rdtgroup_kn_lock_live(). The exceptions are rdt_bit_usage_show() and the mon_config helpers, which take the lock directly. Make the domain list protected by RCU. An architecture-specific lock prevents concurrent writers. rdt_bit_usage_show() could walk the domain list using RCU, but to keep all the filesystem operations the same, this is changed to call cpus_read_lock(). The mon_config helpers send multiple IPIs, so take the cpus_read_lock() in these cases. The other filesystem list walkers need to be able to sleep. Add cpus_read_lock() to rdtgroup_kn_lock_live() so that the cpuhp callbacks can't be invoked when file system operations are occurring. Add lockdep_assert_cpus_held() in the cases where the rdtgroup_kn_lock_live() call isn't obvious. Resctrl's domain online/offline calls now need to take the rdtgroup_mutex themselves. [ bp: Fold in a build fix: https://lore.kernel.org/r/87zfvwieli.ffs@tglx ] Intel-SIG: commit fb700810d30b x86/resctrl: Separate arch and fs resctrl locks. Incremental backporting patches for Intel RDT on Intel Xeon platform.
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-25-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 44 +++++++++++---- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 15 ++++- arch/x86/kernel/cpu/resctrl/monitor.c | 8 +++ arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 3 + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 68 ++++++++++++++++++----- include/linux/resctrl.h | 2 +- 6 files changed, 112 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b03a6c658ae5e..8a4ef4f5bddc8 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -16,6 +16,7 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include #include #include #include @@ -25,8 +26,15 @@ #include #include "internal.h" -/* Mutex to protect rdtgroup access. */ -DEFINE_MUTEX(rdtgroup_mutex); +/* + * rdt_domain structures are kfree()d when their last CPU goes offline, + * and allocated when the first CPU in a new domain comes online. + * The rdt_resource's domain list is updated when this happens. Readers of + * the domain list must either take cpus_read_lock(), or rely on an RCU + * read-side critical section, to avoid observing concurrent modification. + * All writers take this mutex: + */ +static DEFINE_MUTEX(domain_list_lock); /* * The cached resctrl_pqr_state is strictly per CPU and can never be @@ -354,6 +362,15 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) { struct rdt_domain *d; + /* + * Walking r->domains, ensure it can't race with cpuhp. 
+ * Because this is called via IPI by rdt_ctrl_update(), assertions + * about locks this thread holds will lead to false positives. Check + * someone is holding the CPUs lock. + */ + if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON_ONCE(!lockdep_is_cpus_held()); + list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ if (cpumask_test_cpu(cpu, &d->cpu_mask)) @@ -510,6 +527,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) struct rdt_domain *d; int err; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -543,11 +562,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; } - list_add_tail(&d->list, add_pos); + list_add_tail_rcu(&d->list, add_pos); err = resctrl_online_domain(r, d); if (err) { - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); domain_free(hw_dom); } } @@ -558,6 +578,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) struct rdt_hw_domain *hw_dom; struct rdt_domain *d; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -568,7 +590,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { resctrl_offline_domain(r, d); - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); /* * rdt_domain "d" is going to be freed below, so clear @@ -598,13 +621,13 @@ static int resctrl_arch_online_cpu(unsigned int cpu) { struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); - clear_closid_rmid(cpu); + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); resctrl_online_cpu(cpu); - mutex_unlock(&rdtgroup_mutex); return 0; } @@ -613,13 +636,14 @@ 
static int resctrl_arch_offline_cpu(unsigned int cpu) { struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); resctrl_offline_cpu(cpu); + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 20b02d6f02c19..7997b47743a21 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -212,6 +212,9 @@ static int parse_line(char *line, struct resctrl_schema *s, struct rdt_domain *d; unsigned long dom_id; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); @@ -316,6 +319,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) struct rdt_domain *d; u32 idx; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -381,11 +387,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return -EINVAL; buf[nbytes - 1] = '\0'; - cpus_read_lock(); rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return -ENOENT; } rdt_last_cmd_clear(); @@ -447,7 +451,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdt_staged_configs_clear(); rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return ret ?: nbytes; } @@ -467,6 +470,9 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo bool sep = false; u32 ctrl_val; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + seq_printf(s, "%*s:", max_name_width, schema->name); 
list_for_each_entry(dom, &r->domains, list) { if (sep) @@ -537,6 +543,9 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, { int cpu; + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + /* * Setup the parameters to pass to mon_event_count() to read the data. */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 67edd4c440f06..c34a35ec0f031 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -15,6 +15,7 @@ * Software Developer Manual June 2016, volume 3, section 17.17. */ +#include #include #include #include @@ -472,6 +473,9 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) lockdep_assert_held(&rdtgroup_mutex); + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; @@ -778,6 +782,7 @@ void cqm_handle_limbo(struct work_struct *work) unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); d = container_of(work, struct rdt_domain, cqm_limbo.work); @@ -792,6 +797,7 @@ void cqm_handle_limbo(struct work_struct *work) } mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } /** @@ -823,6 +829,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdt_resource *r; struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* @@ -856,6 +863,7 @@ void mbm_handle_overflow(struct work_struct *work) out_unlock: mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } /** diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 8056bed033cc7..884b88e251413 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -844,6 +844,9 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) struct 
rdt_domain *d_i; bool ret = false; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) return true; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 777e9f6803325..011e17efb1a66 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -35,6 +35,10 @@ DEFINE_STATIC_KEY_FALSE(rdt_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); @@ -1014,6 +1018,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, bool sep = false; u32 ctrl_val; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->domains, list) { @@ -1074,6 +1079,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } seq_putc(seq, '\n'); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1329,6 +1335,9 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) struct rdt_domain *d; u32 ctrl; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) @@ -1593,6 +1602,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid struct rdt_domain *dom; bool sep = false; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); list_for_each_entry(dom, &r->domains, list) { @@ -1609,6 +1619,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid seq_puts(s, "\n"); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1690,6 +1701,9 @@ static int mon_config_write(struct rdt_resource *r, 
char *tok, u32 evtid) unsigned long dom_id, val; struct rdt_domain *d; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + next: if (!tok || tok[0] == '\0') return 0; @@ -1736,6 +1750,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1745,6 +1760,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -1760,6 +1776,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1769,6 +1786,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -2245,6 +2263,9 @@ static int set_cache_qos_cfg(int level, bool enable) struct rdt_domain *d; int cpu; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (level == RDT_RESOURCE_L3) update = l3_qos_cfg_update; else if (level == RDT_RESOURCE_L2) @@ -2444,6 +2465,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) rdtgroup_kn_get(rdtgrp, kn); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* Was this group deleted while we waited? 
*/ @@ -2461,6 +2483,8 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + rdtgroup_kn_put(rdtgrp, kn); } @@ -2793,6 +2817,9 @@ static int reset_all_ctrls(struct rdt_resource *r) struct rdt_domain *d; int i; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -3077,6 +3104,9 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_domain *dom; int ret; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(dom, &r->domains, list) { ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); if (ret) @@ -3907,13 +3937,13 @@ static void domain_destroy_mon_state(struct rdt_domain *d) void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) { - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) mba_sc_domain_destroy(r, d); if (!r->mon_capable) - return; + goto out_unlock; /* * If resctrl is mounted, remove all the @@ -3938,6 +3968,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) } domain_destroy_mon_state(d); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) @@ -3973,20 +4006,22 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) { - int err; + int err = 0; - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { /* RDT_RESOURCE_MBA is never mon_capable */ - return mba_sc_domain_allocate(r, d); + err = mba_sc_domain_allocate(r, d); + goto out_unlock; + } if (!r->mon_capable) - return 0; + goto 
out_unlock; err = domain_setup_mon_state(r, d); if (err) - return err; + goto out_unlock; if (is_mbm_enabled()) { INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); @@ -4006,15 +4041,18 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) if (resctrl_mounted && resctrl_arch_mon_capable()) mkdir_mondata_subdir_allrdtgrp(r, d); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; } void resctrl_online_cpu(unsigned int cpu) { - lockdep_assert_held(&rdtgroup_mutex); - + mutex_lock(&rdtgroup_mutex); /* The CPU is set in default rdtgroup after online. */ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&rdtgroup_mutex); } static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) @@ -4033,8 +4071,7 @@ void resctrl_offline_cpu(unsigned int cpu) struct rdtgroup *rdtgrp; struct rdt_domain *d; - lockdep_assert_held(&rdtgroup_mutex); - + mutex_lock(&rdtgroup_mutex); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { clear_childcpus(rdtgrp, cpu); @@ -4043,7 +4080,7 @@ void resctrl_offline_cpu(unsigned int cpu) } if (!l3->mon_capable) - return; + goto out_unlock; d = get_domain_from_cpu(cpu, l3); if (d) { @@ -4057,6 +4094,9 @@ void resctrl_offline_cpu(unsigned int cpu) cqm_setup_limbo_handler(d, 0, cpu); } } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } /* diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 270ff1d5c051c..a365f67131ece 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -159,7 +159,7 @@ struct resctrl_schema; * @cache_level: Which cache level defines scope of this resource * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. - * @domains: All domains for this resource + * @domains: RCU list of all domains for this resource * @name: Name to use in "schemata" file. 
* @data_width: Character width of data when displaying * @default_ctrl: Specifies default cache cbm or memory B/W percent. From 4347095c4bc3a48cd78d263f5d2b841c5bd4c3ce Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 21 Feb 2024 12:23:06 +0000 Subject: [PATCH 37/72] x86/resctrl: Remove lockdep annotation that triggers false positive commit c0d848fcb09d80a5f48b99f85e448185125ef59f upstream. get_domain_from_cpu() walks a list of domains to find the one that contains the specified CPU. This needs to be protected against races with CPU hotplug when the list is modified. It has recently gained a lockdep annotation to check this. The lockdep annotation causes false positives when called via IPI as the lock is held, but by another process. Remove it. [ bp: Refresh it on top of x86/cache. ] Intel-SIG: commit c0d848fcb09d x86/resctrl: Remove lockdep annotation that triggers false positive. Incremental backporting patches for Intel RDT on Intel Xeon platform. Fixes: fb700810d30b ("x86/resctrl: Separate arch and fs resctrl locks") Reported-by: Tony Luck Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/all/ZdUSwOM9UUNpw84Y@agluck-desk3 [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 8a4ef4f5bddc8..83e40341583e6 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -362,15 +362,6 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) { struct rdt_domain *d; - /* - * Walking r->domains, ensure it can't race with cpuhp. - * Because this is called via IPI by rdt_ctrl_update(), assertions - * about locks this thread holds will lead to false positives. Check - * someone is holding the CPUs lock.
- */ - if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && IS_ENABLED(CONFIG_LOCKDEP)) - WARN_ON_ONCE(!lockdep_is_cpus_held()); - list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ if (cpumask_test_cpu(cpu, &d->cpu_mask)) From cffda4a032da1eee2ca83ced6b4e8b35af2d5cc2 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 22 Mar 2024 11:20:15 -0700 Subject: [PATCH 38/72] Documentation/x86: Document that resctrl bandwidth control units are MiB commit a8ed59a3a8de2648e69dd5936f5771ac4c92d085 upstream. The memory bandwidth software controller uses 2^20 units rather than 10^6. See mbm_bw_count() which computes bandwidth using the "SZ_1M" Linux define for 0x00100000. Update the documentation to use MiB when describing this feature. It's too late to fix the mount option "mba_MBps" as that is now an established user interface. Intel-SIG: commit a8ed59a3a8de Documentation/x86: Document that resctrl bandwidth control units are MiB. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240322182016.196544-1-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- Documentation/arch/x86/resctrl.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index a6279df64a9db..3712d81cb50c6 100644 --- a/Documentation/arch/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst @@ -45,7 +45,7 @@ mount options are: Enable code/data prioritization in L2 cache allocations. "mba_MBps": Enable the MBA Software Controller(mba_sc) to specify MBA - bandwidth in MBps + bandwidth in MiBps "debug": Make debug files accessible. Available debug files are annotated with "Available only with debug option". 
@@ -526,7 +526,7 @@ threads start using more cores in an rdtgroup, the actual bandwidth may increase or vary although user specified bandwidth percentage is same. In order to mitigate this and make the interface more user friendly, -resctrl added support for specifying the bandwidth in MBps as well. The +resctrl added support for specifying the bandwidth in MiBps as well. The kernel underneath would use a software feedback mechanism or a "Software Controller(mba_sc)" which reads the actual bandwidth using MBM counters and adjust the memory bandwidth percentages to ensure:: @@ -573,13 +573,13 @@ Memory b/w domain is L3 cache. MB:=bandwidth0;=bandwidth1;... -Memory bandwidth Allocation specified in MBps +Memory bandwidth Allocation specified in MiBps --------------------------------------------- Memory bandwidth domain is L3 cache. :: - MB:=bw_MBps0;=bw_MBps1;... + MB:=bw_MiBps0;=bw_MiBps1;... Slow Memory Bandwidth Allocation (SMBA) --------------------------------------- From aad43a82bcdc634595550a2843e1eec20165b7da Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 1 Apr 2024 11:16:39 -0700 Subject: [PATCH 39/72] x86/resctrl: Fix uninitialized memory read when last CPU of domain goes offline commit c3eeb1ffc6a88af9b002e22be0f70851759be03a upstream. Tony encountered this OOPS when the last CPU of a domain goes offline while running a kernel built with CONFIG_NO_HZ_FULL: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI ... RIP: 0010:__find_nth_andnot_bit+0x66/0x110 ... Call Trace: ? __die() ? page_fault_oops() ? exc_page_fault() ? 
asm_exc_page_fault() cpumask_any_housekeeping() mbm_setup_overflow_handler() resctrl_offline_cpu() resctrl_arch_offline_cpu() cpuhp_invoke_callback() cpuhp_thread_fun() smpboot_thread_fn() kthread() ret_from_fork() ret_from_fork_asm() The NULL pointer dereference is encountered while searching for another online CPU in the domain (of which there are none) that can be used to run the MBM overflow handler. Because the kernel is configured with CONFIG_NO_HZ_FULL the search for another CPU (in its effort to prefer those CPUs that aren't marked nohz_full) consults the mask representing the nohz_full CPUs, tick_nohz_full_mask. On a kernel with CONFIG_CPUMASK_OFFSTACK=y tick_nohz_full_mask is not allocated unless the kernel is booted with the "nohz_full=" parameter and because of that any access to tick_nohz_full_mask needs to be guarded with tick_nohz_full_enabled(). Replace the IS_ENABLED(CONFIG_NO_HZ_FULL) with tick_nohz_full_enabled(). The latter ensures tick_nohz_full_mask can be accessed safely and can be used whether kernel is built with CONFIG_NO_HZ_FULL enabled or not. [ Use Ingo's suggestion that combines the two NO_HZ checks into one. ] Intel-SIG: commit c3eeb1ffc6a8 x86/resctrl: Fix uninitialized memory read when last CPU of domain goes offline. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Fixes: a4846aaf3945 ("x86/resctrl: Add cpumask_any_housekeeping() for limbo/overflow") Reported-by: Tony Luck Signed-off-by: Reinette Chatre Signed-off-by: Ingo Molnar Reviewed-by: Babu Moger Link: https://lore.kernel.org/r/ff8dfc8d3dcb04b236d523d1e0de13d2ef585223.1711993956.git.reinette.chatre@intel.com Closes: https://lore.kernel.org/lkml/ZgIFT5gZgIQ9A9G7@agluck-desk3/ [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index c99f26ebe7a65..1a8687f8073a8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -78,7 +78,8 @@ cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) else cpu = cpumask_any_but(mask, exclude_cpu); - if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) + /* Only continue if tick_nohz_full_mask has been initialized. */ + if (!tick_nohz_full_enabled()) return cpu; /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ From 62a3d918a6bdde0d0235f11c772f7d31e4c77b8f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 8 Mar 2024 13:38:45 -0800 Subject: [PATCH 40/72] x86/resctrl: Pass domain to target CPU commit e3ca96e479c91d6ee657d3caa5092a6a3a620f9f upstream. reset_all_ctrls() and resctrl_arch_update_domains() use on_each_cpu_mask() to call rdt_ctrl_update() on potentially one CPU from each domain. But this means rdt_ctrl_update() needs to figure out which domain to apply changes to. Doing so requires a search of all domains in a resource, which can only be done safely if cpus_lock is held. Both callers do hold this lock, but there isn't a way for a function called on another CPU via IPI to verify this. Commit c0d848fcb09d ("x86/resctrl: Remove lockdep annotation that triggers false positive") removed the incorrect assertions. 
Add the target domain to the msr_param structure and call rdt_ctrl_update() for each domain separately using smp_call_function_single(). This means that rdt_ctrl_update() doesn't need to search for the domain and get_domain_from_cpu() can safely assert that the cpus_lock is held since the remaining callers do not use IPI. Intel-SIG: commit e3ca96e479c9 x86/resctrl: Pass domain to target CPU. Incremental backporting patches for Intel RDT on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Reviewed-by: James Morse Reviewed-by: Babu Moger Tested-by: Maciej Wieczor-Retman Link: https://lore.kernel.org/r/20240308213846.77075-2-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 17 ++++------ arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 38 +++++------------------ arch/x86/kernel/cpu/resctrl/internal.h | 2 ++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 12 ++----- 4 files changed, 17 insertions(+), 52 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 83e40341583e6..acf52aa185e09 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -362,6 +362,8 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) { struct rdt_domain *d; + lockdep_assert_cpus_held(); + list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ if (cpumask_test_cpu(cpu, &d->cpu_mask)) @@ -378,19 +380,11 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r) void rdt_ctrl_update(void *arg) { + struct rdt_hw_resource *hw_res; struct msr_param *m = arg; - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); - struct rdt_resource *r = m->res; - int cpu = smp_processor_id(); - struct rdt_domain *d; - d = get_domain_from_cpu(cpu, r); - if (d) { - hw_res->msr_update(d, m, r); - return; - } - pr_warn_once("cpu %d not found in any 
domain for resource %s\n", - cpu, r->name); + hw_res = resctrl_to_arch_res(m->res); + hw_res->msr_update(m->dom, m, m->res); } /* @@ -463,6 +457,7 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) hw_dom->ctrl_val = dc; setup_default_ctrlval(r, dc); + m.dom = d; m.low = 0; m.high = hw_res->num_closid; hw_res->msr_update(d, &m, r); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 7997b47743a21..165d8d453c041 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -272,22 +272,6 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type) } } -static bool apply_config(struct rdt_hw_domain *hw_dom, - struct resctrl_staged_config *cfg, u32 idx, - cpumask_var_t cpu_mask) -{ - struct rdt_domain *dom = &hw_dom->d_resctrl; - - if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) { - cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); - hw_dom->ctrl_val[idx] = cfg->new_ctrl; - - return true; - } - - return false; -} - int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { @@ -302,6 +286,7 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, hw_dom->ctrl_val[idx] = cfg_val; msr_param.res = r; + msr_param.dom = d; msr_param.low = idx; msr_param.high = idx + 1; hw_res->msr_update(d, &msr_param, r); @@ -315,48 +300,39 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) struct rdt_hw_domain *hw_dom; struct msr_param msr_param; enum resctrl_conf_type t; - cpumask_var_t cpu_mask; struct rdt_domain *d; u32 idx; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - msr_param.res = NULL; list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); + msr_param.res = NULL; for (t = 0; t < CDP_NUM_TYPES; t++) { cfg 
= &hw_dom->d_resctrl.staged_config[t]; if (!cfg->have_new_ctrl) continue; idx = get_config_index(closid, t); - if (!apply_config(hw_dom, cfg, idx, cpu_mask)) + if (cfg->new_ctrl == hw_dom->ctrl_val[idx]) continue; + hw_dom->ctrl_val[idx] = cfg->new_ctrl; if (!msr_param.res) { msr_param.low = idx; msr_param.high = msr_param.low + 1; msr_param.res = r; + msr_param.dom = d; } else { msr_param.low = min(msr_param.low, idx); msr_param.high = max(msr_param.high, idx + 1); } } + if (msr_param.res) + smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1); } - if (cpumask_empty(cpu_mask)) - goto done; - - /* Update resource control msr on all the CPUs. */ - on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); - -done: - free_cpumask_var(cpu_mask); - return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 1a8687f8073a8..ab2d315f7a2e8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -379,11 +379,13 @@ static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use + * @dom: The domain to update * @low: Beginning index from base MSR * @high: End index */ struct msr_param { struct rdt_resource *res; + struct rdt_domain *dom; u32 low; u32 high; }; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 011e17efb1a66..02f213f1c51c5 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2813,16 +2813,12 @@ static int reset_all_ctrls(struct rdt_resource *r) struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom; struct msr_param msr_param; - cpumask_var_t cpu_mask; struct rdt_domain *d; int i; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return 
-ENOMEM; - msr_param.res = r; msr_param.low = 0; msr_param.high = hw_res->num_closid; @@ -2834,17 +2830,13 @@ static int reset_all_ctrls(struct rdt_resource *r) */ list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); for (i = 0; i < hw_res->num_closid; i++) hw_dom->ctrl_val[i] = r->default_ctrl; + msr_param.dom = d; + smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1); } - /* Update CBM on all the CPUs in cpu_mask */ - on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); - - free_cpumask_var(cpu_mask); - return 0; } From 493342122a338cb6a1c22cf213b281eff03bf204 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 8 Mar 2024 13:38:46 -0800 Subject: [PATCH 41/72] x86/resctrl: Simplify call convention for MSR update functions commit bd4955d4bc2182ccb660c9c30a4dd7f36feaf943 upstream. The per-resource MSR update functions cat_wrmsr(), mba_wrmsr_intel(), and mba_wrmsr_amd() all take three arguments: (struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) struct msr_param contains pointers to both struct rdt_resource and struct rdt_domain, thus only struct msr_param is necessary. Pass struct msr_param as a single parameter. Clean up formatting and fix some reverse fir tree declaration ordering. No functional change. Intel-SIG: commit bd4955d4bc21 x86/resctrl: Simplify call convention for MSR update functions. Incremental backporting patches for Intel RDT on Intel Xeon platform.
Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Maciej Wieczor-Retman Link: https://lore.kernel.org/r/20240308213846.77075-3-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 40 +++++++++-------------- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +- arch/x86/kernel/cpu/resctrl/internal.h | 3 +- 3 files changed, 18 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index acf52aa185e09..7751eea19fd24 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -56,14 +56,9 @@ int max_name_width, max_data_width; */ bool rdt_alloc_capable; -static void -mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); -static void -cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); -static void -mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); +static void mba_wrmsr_intel(struct msr_param *m); +static void cat_wrmsr(struct msr_param *m); +static void mba_wrmsr_amd(struct msr_param *m); #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains) @@ -309,12 +304,11 @@ static void rdt_get_cdp_l2_config(void) rdt_get_cdp_config(RDT_RESOURCE_L2); } -static void -mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +static void mba_wrmsr_amd(struct msr_param *m) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); @@ -334,25 +328,22 @@ static u32 delay_bw_map(unsigned long bw, struct 
rdt_resource *r) return r->default_ctrl; } -static void -mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r) +static void mba_wrmsr_intel(struct msr_param *m) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); /* Write the delay values for mba. */ for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r)); + wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); } -static void -cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +static void cat_wrmsr(struct msr_param *m) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); @@ -384,7 +375,7 @@ void rdt_ctrl_update(void *arg) struct msr_param *m = arg; hw_res = resctrl_to_arch_res(m->res); - hw_res->msr_update(m->dom, m, m->res); + hw_res->msr_update(m); } /* @@ -457,10 +448,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) hw_dom->ctrl_val = dc; setup_default_ctrlval(r, dc); + m.res = r; m.dom = d; m.low = 0; m.high = hw_res->num_closid; - hw_res->msr_update(d, &m, r); + hw_res->msr_update(&m); return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 165d8d453c041..b7291f60399c0 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -289,7 +289,7 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, msr_param.dom = d; msr_param.low = idx; 
msr_param.high = idx + 1; - hw_res->msr_update(d, &msr_param, r); + hw_res->msr_update(&msr_param); return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index ab2d315f7a2e8..f1d926832ec8a 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -445,8 +445,7 @@ struct rdt_hw_resource { struct rdt_resource r_resctrl; u32 num_closid; unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); + void (*msr_update)(struct msr_param *m); unsigned int mon_scale; unsigned int mbm_width; unsigned int mbm_cfg_mask; From c652648586fcefe53f990340e75c3958a847e31b Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 8 Apr 2024 17:23:02 +0800 Subject: [PATCH 42/72] x86/resctrl: Rename pseudo_lock_event.h to trace.h commit 87739229485ac724849178eb6c35e38c6161eb77 upstream. Now only the pseudo-locking part uses tracepoints to do event tracking, but other parts of resctrl may need new tracepoints. It is unnecessary to create separate header files and define CREATE_TRACE_POINTS in different c files which fragments the resctrl tracing. Therefore, give the resctrl tracepoint header file a generic name to support its use for tracepoints that are not specific to pseudo-locking. No functional change. Intel-SIG: commit 87739229485a x86/resctrl: Rename pseudo_lock_event.h to trace.h. Incremental backporting patches for Intel RDT on Intel Xeon platform. 
Suggested-by: Reinette Chatre Signed-off-by: Haifeng Xu Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240408092303.26413-2-haifeng.xu@shopee.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- .../kernel/cpu/resctrl/{pseudo_lock_event.h => trace.h} | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) rename arch/x86/kernel/cpu/resctrl/{pseudo_lock_event.h => trace.h} (86%) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 884b88e251413..492c8e28c4ce4 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -31,7 +31,7 @@ #include "internal.h" #define CREATE_TRACE_POINTS -#include "pseudo_lock_event.h" +#include "trace.h" /* * The bits needed to disable hardware prefetching varies based on the diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/trace.h similarity index 86% rename from arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h rename to arch/x86/kernel/cpu/resctrl/trace.h index 428ebbd4270b9..495fb90c85722 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h +++ b/arch/x86/kernel/cpu/resctrl/trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM resctrl -#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PSEUDO_LOCK_H +#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RESCTRL_H #include @@ -35,9 +35,9 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); -#endif /* _TRACE_PSEUDO_LOCK_H */ +#endif /* _TRACE_RESCTRL_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . 
-#define TRACE_INCLUDE_FILE pseudo_lock_event +#define TRACE_INCLUDE_FILE trace #include From 6b4b107accea2434163a05a0ecae34c42419ec7f Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 8 Apr 2024 17:23:03 +0800 Subject: [PATCH 43/72] x86/resctrl: Add tracepoint for llc_occupancy tracking commit 931be446c6cbc15691dd499957e961f4e1d56afb upstream. In our production environment, after removing monitor groups, those unused RMIDs get stuck in the limbo list forever because their llc_occupancy is always larger than the threshold. But the unused RMIDs can be successfully freed by turning up the threshold. In order to know how much the threshold should be, perf can be used to acquire the llc_occupancy of RMIDs in each rdt domain. Instead of using perf tool to track llc_occupancy and filter the log manually, it is more convenient for users to use tracepoint to do this work. So add a new tracepoint that shows the llc_occupancy of busy RMIDs when scanning the limbo list. Intel-SIG: commit 931be446c6cb x86/resctrl: Add tracepoint for llc_occupancy tracking. Incremental backporting patches for Intel RDT on Intel Xeon platform. Suggested-by: Reinette Chatre Suggested-by: James Morse Signed-off-by: Haifeng Xu Signed-off-by: Borislav Petkov (AMD) Reviewed-by: James Morse Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240408092303.26413-3-haifeng.xu@shopee.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- Documentation/arch/x86/resctrl.rst | 6 ++++++ arch/x86/kernel/cpu/resctrl/monitor.c | 11 +++++++++++ arch/x86/kernel/cpu/resctrl/trace.h | 16 ++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index 3712d81cb50c6..5f26c6267ac78 100644 --- a/Documentation/arch/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst @@ -446,6 +446,12 @@ during mkdir. 
max_threshold_occupancy is a user configurable value to determine the occupancy at which an RMID can be freed. +The mon_llc_occupancy_limbo tracepoint gives the precise occupancy in bytes +for a subset of RMID that are not immediately available for allocation. +This can't be relied on to produce output every second, it may be necessary +to attempt to create an empty monitor group to force an update. Output may +only be produced if creation of a control or monitor group fails. + Schemata files - general concepts --------------------------------- Each line in the file describes one resource. The line starts with diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c34a35ec0f031..2345e6836593f 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -24,6 +24,7 @@ #include #include "internal.h" +#include "trace.h" /** * struct rmid_entry - dirty tracking for all RMID. @@ -354,6 +355,16 @@ void __check_limbo(struct rdt_domain *d, bool force_free) rmid_dirty = true; } else { rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + + /* + * x86's CLOSID and RMID are independent numbers, so the entry's + * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the + * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't + * used to select the configuration. It is thus necessary to track both + * CLOSID and RMID because there may be dependencies between them + * on some architectures. 
+ */ + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->id, val); } if (force_free || !rmid_dirty) { diff --git a/arch/x86/kernel/cpu/resctrl/trace.h b/arch/x86/kernel/cpu/resctrl/trace.h index 495fb90c85722..2a506316b3034 100644 --- a/arch/x86/kernel/cpu/resctrl/trace.h +++ b/arch/x86/kernel/cpu/resctrl/trace.h @@ -35,6 +35,22 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); +TRACE_EVENT(mon_llc_occupancy_limbo, + TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), + TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), + TP_STRUCT__entry(__field(u32, ctrl_hw_id) + __field(u32, mon_hw_id) + __field(int, domain_id) + __field(u64, llc_occupancy_bytes)), + TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; + __entry->mon_hw_id = mon_hw_id; + __entry->domain_id = domain_id; + __entry->llc_occupancy_bytes = llc_occupancy_bytes;), + TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", + __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, + __entry->llc_occupancy_bytes) + ); + #endif /* _TRACE_RESCTRL_H */ #undef TRACE_INCLUDE_PATH From a543ce6a4e249f45091e137c45db642a1ca959dc Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 18 Jun 2024 15:01:52 +0100 Subject: [PATCH 44/72] x86/resctrl: Don't try to free nonexistent RMIDs commit 739c9765793e5794578a64aab293c58607f1826a upstream. Commit 6791e0ea3071 ("x86/resctrl: Access per-rmid structures by index") adds logic to map individual monitoring groups into a global index space used for tracking allocated RMIDs. Attempts to free the default RMID are ignored in free_rmid(), and this works fine on x86. With arm64 MPAM, there is a latent bug here however: on platforms with no monitors exposed through resctrl, each control group still gets a different monitoring group ID as seen by the hardware, since the CLOSID always forms part of the monitoring group ID. 
This means that when removing a control group, the code may try to free this group's default monitoring group RMID for real. If there are no monitors however, the RMID tracking table rmid_ptrs[] would be a waste of memory and is never allocated, leading to a splat when free_rmid() tries to dereference the table. One option would be to treat RMID 0 as special for every CLOSID, but this would be ugly since bookkeeping still needs to be done for these monitoring group IDs when there are monitors present in the hardware. Instead, add a gating check of resctrl_arch_mon_capable() in free_rmid(), and just do nothing if the hardware doesn't have monitors. This fix mirrors the gating checks already present in mkdir_rdt_prepare_rmid_alloc() and elsewhere. No functional change on x86. [ bp: Massage commit message. ] Intel-SIG: commit 739c9765793e x86/resctrl: Don't try to free nonexistent RMIDs. Incremental backporting patches for Intel RDT on Intel Xeon platform. Fixes: 6791e0ea3071 ("x86/resctrl: Access per-rmid structures by index") Signed-off-by: Dave Martin Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Reinette Chatre Link: https://lore.kernel.org/r/20240618140152.83154-1-Dave.Martin@arm.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/monitor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2345e6836593f..366f496ca3ce2 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -519,7 +519,8 @@ void free_rmid(u32 closid, u32 rmid) * allows architectures that ignore the closid parameter to avoid an * unnecessary check. 
*/ - if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID)) return; From 51f5d0f8b6b2cb6e4b28e61abb61e43f8b5e6b43 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Sun, 9 Jun 2024 17:39:24 -0700 Subject: [PATCH 45/72] cpu: Move CPU hotplug function declarations into their own header commit 195fb517ee25bfefde9c74ecd86348eccbd6d2e4 upstream. Avoid upcoming #include hell when <linux/cacheinfo.h> wants to use lockdep_assert_cpus_held() and creates a #include loop that would break the build for arch/riscv. [ bp: s/cpu/CPU/g ] Intel-SIG: commit 195fb517ee25 cpu: Move CPU hotplug function declarations into their own header. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240610003927.341707-2-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- include/linux/cpu.h | 33 +-------------------------- include/linux/cpuhplock.h | 47 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 32 deletions(-) create mode 100644 include/linux/cpuhplock.h diff --git a/include/linux/cpu.h b/include/linux/cpu.h index a7d91a167a8b6..152f6cf8bc318 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -18,6 +18,7 @@ #include #include #include +#include #include struct device; @@ -127,38 +128,6 @@ static inline int add_cpu(unsigned int cpu) { return 0;} #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; -extern int lockdep_is_cpus_held(void); - -#ifdef CONFIG_HOTPLUG_CPU -extern void cpus_write_lock(void); -extern void cpus_write_unlock(void); -extern void cpus_read_lock(void); -extern void cpus_read_unlock(void); -extern int cpus_read_trylock(void); -extern void lockdep_assert_cpus_held(void); -extern void cpu_hotplug_disable(void); -extern void cpu_hotplug_enable(void); -void
clear_tasks_mm_cpumask(int cpu); -int remove_cpu(unsigned int cpu); -int cpu_device_down(struct device *dev); -extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu); - -#else /* CONFIG_HOTPLUG_CPU */ - -static inline void cpus_write_lock(void) { } -static inline void cpus_write_unlock(void) { } -static inline void cpus_read_lock(void) { } -static inline void cpus_read_unlock(void) { } -static inline int cpus_read_trylock(void) { return true; } -static inline void lockdep_assert_cpus_held(void) { } -static inline void cpu_hotplug_disable(void) { } -static inline void cpu_hotplug_enable(void) { } -static inline int remove_cpu(unsigned int cpu) { return -EPERM; } -static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { } -#endif /* !CONFIG_HOTPLUG_CPU */ - -DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock()) - #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); extern void thaw_secondary_cpus(void); diff --git a/include/linux/cpuhplock.h b/include/linux/cpuhplock.h new file mode 100644 index 0000000000000..386abc4822641 --- /dev/null +++ b/include/linux/cpuhplock.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/cpuhplock.h - CPU hotplug locking + * + * Locking functions for CPU hotplug. 
+ */ +#ifndef _LINUX_CPUHPLOCK_H_ +#define _LINUX_CPUHPLOCK_H_ + +#include +#include + +struct device; + +extern int lockdep_is_cpus_held(void); + +#ifdef CONFIG_HOTPLUG_CPU +extern void cpus_write_lock(void); +extern void cpus_write_unlock(void); +extern void cpus_read_lock(void); +extern void cpus_read_unlock(void); +extern int cpus_read_trylock(void); +extern void lockdep_assert_cpus_held(void); +extern void cpu_hotplug_disable(void); +extern void cpu_hotplug_enable(void); +void clear_tasks_mm_cpumask(int cpu); +int remove_cpu(unsigned int cpu); +int cpu_device_down(struct device *dev); +extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu); + +#else /* CONFIG_HOTPLUG_CPU */ + +static inline void cpus_write_lock(void) { } +static inline void cpus_write_unlock(void) { } +static inline void cpus_read_lock(void) { } +static inline void cpus_read_unlock(void) { } +static inline int cpus_read_trylock(void) { return true; } +static inline void lockdep_assert_cpus_held(void) { } +static inline void cpu_hotplug_disable(void) { } +static inline void cpu_hotplug_enable(void) { } +static inline int remove_cpu(unsigned int cpu) { return -EPERM; } +static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { } +#endif /* !CONFIG_HOTPLUG_CPU */ + +DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock()) + +#endif /* _LINUX_CPUHPLOCK_H_ */ From 256b1daf67828780d4d2537b1146cda2a771d34c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Sun, 9 Jun 2024 17:39:25 -0700 Subject: [PATCH 46/72] cpu: Drop "extern" from function declarations in cpuhplock.h commit ddefcfdeb5a2238cbcb07b80dda9ac3136735b1e upstream. This file was created with a direct cut and paste from cpu.h so kept the legacy declaration style. But the Linux coding standard for function declarations in header files is to avoid use of "extern". Drop "extern" from all function declarations. Intel-SIG: commit ddefcfdeb5a2 cpu: Drop "extern" from function declarations in cpuhplock.h. 
Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240610003927.341707-3-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- include/linux/cpuhplock.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/cpuhplock.h b/include/linux/cpuhplock.h index 386abc4822641..431560bbd0453 100644 --- a/include/linux/cpuhplock.h +++ b/include/linux/cpuhplock.h @@ -15,18 +15,18 @@ struct device; extern int lockdep_is_cpus_held(void); #ifdef CONFIG_HOTPLUG_CPU -extern void cpus_write_lock(void); -extern void cpus_write_unlock(void); -extern void cpus_read_lock(void); -extern void cpus_read_unlock(void); -extern int cpus_read_trylock(void); -extern void lockdep_assert_cpus_held(void); -extern void cpu_hotplug_disable(void); -extern void cpu_hotplug_enable(void); +void cpus_write_lock(void); +void cpus_write_unlock(void); +void cpus_read_lock(void); +void cpus_read_unlock(void); +int cpus_read_trylock(void); +void lockdep_assert_cpus_held(void); +void cpu_hotplug_disable(void); +void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int remove_cpu(unsigned int cpu); int cpu_device_down(struct device *dev); -extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu); +void smp_shutdown_nonboot_cpus(unsigned int primary_cpu); #else /* CONFIG_HOTPLUG_CPU */ From 0ab64228cbfa2bbbbd8f61261800e8aab6154a36 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Sun, 9 Jun 2024 17:39:26 -0700 Subject: [PATCH 47/72] cacheinfo: Add function to get cacheinfo for a given CPU and cache level commit 685cb1674060c2cb1b9da051a12933c082b8e874 upstream. Resctrl open codes a search for information about a given cache level in a couple of places (and more are on the way). 
Provide a new inline function get_cpu_cacheinfo_level() in <linux/cacheinfo.h> to do the search and return a pointer to the cacheinfo structure. Add lockdep_assert_cpus_held() to enforce the comment that cpuhp lock must be held. Simplify the existing get_cpu_cacheinfo_id() by using this new function to do the search. Intel-SIG: commit 685cb1674060 cacheinfo: Add function to get cacheinfo for a given CPU and cache level. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240610003927.341707-4-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- include/linux/cacheinfo.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index a5cfd44fab45b..bd7289141eb97 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -3,6 +3,7 @@ #define _LINUX_CACHEINFO_H #include +#include #include #include @@ -112,23 +113,37 @@ int acpi_get_cache_info(unsigned int cpu, const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); /* - * Get the id of the cache associated with @cpu at level @level. + * Get the cacheinfo structure for the cache associated with @cpu at + * level @level. * cpuhp lock must be held. */ -static inline int get_cpu_cacheinfo_id(int cpu, int level) +static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); int i; + lockdep_assert_cpus_held(); + for (i = 0; i < ci->num_leaves; i++) { if (ci->info_list[i].level == level) { if (ci->info_list[i].attributes & CACHE_ID) - return ci->info_list[i].id; - return -1; + return &ci->info_list[i]; + return NULL; } } - return -1; + return NULL; +} + +/* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held.
+ */ +static inline int get_cpu_cacheinfo_id(int cpu, int level) +{ + struct cacheinfo *ci = get_cpu_cacheinfo_level(cpu, level); + + return ci ? ci->id : -1; } #ifdef CONFIG_ARM64 From 1047bda6c1cb45aa1e06eae7ff35ac72d4ab6cc1 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Sun, 9 Jun 2024 17:39:27 -0700 Subject: [PATCH 48/72] x86/resctrl: Replace open coded cacheinfo searches commit f385f024639431bec3e70c33cdbc9563894b3ee5 upstream. pseudo_lock_region_init() and rdtgroup_cbm_to_size() open code a search for details of a particular cache level. Replace with get_cpu_cacheinfo_level(). Intel-SIG: commit f385f0246394 x86/resctrl: Replace open coded cacheinfo searches. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240610003927.341707-5-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 17 ++++++----------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 14 +++++--------- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 492c8e28c4ce4..1e3f9d28a4b54 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -292,9 +292,8 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) */ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) { - struct cpu_cacheinfo *ci; + struct cacheinfo *ci; int ret; - int i; /* Pick the first cpu we find that is associated with the cache. 
*/ plr->cpu = cpumask_first(&plr->d->cpu_mask); @@ -306,15 +305,11 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) goto out_region; } - ci = get_cpu_cacheinfo(plr->cpu); - - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); - - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == plr->s->res->cache_level) { - plr->line_size = ci->info_list[i].coherency_line_size; - return 0; - } + ci = get_cpu_cacheinfo_level(plr->cpu, plr->s->res->cache_level); + if (ci) { + plr->line_size = ci->coherency_line_size; + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); + return 0; } ret = -1; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 02f213f1c51c5..cb68a121dabb6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1450,18 +1450,14 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, unsigned long cbm) { - struct cpu_cacheinfo *ci; unsigned int size = 0; - int num_b, i; + struct cacheinfo *ci; + int num_b; num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == r->cache_level) { - size = ci->info_list[i].size / r->cache.cbm_len * num_b; - break; - } - } + ci = get_cpu_cacheinfo_level(cpumask_any(&d->cpu_mask), r->cache_level); + if (ci) + size = ci->size / r->cache.cbm_len * num_b; return size; } From 80248a28b0d5ac1372ebd9d1da3b5851ef8d47eb Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:01 -0700 Subject: [PATCH 49/72] x86/resctrl: Prepare for new domain scope commit f436cb6913a57bf3e1e66d18bc663e6c20751929 upstream. Resctrl resources operate on subsets of CPUs in the system with the defining attribute of each subset being an instance of a particular level of cache. E.g. 
all CPUs sharing an L3 cache would be part of the same domain. In preparation for features that are scoped at the NUMA node level, change the code from explicit references to "cache_level" to a more generic scope. At this point the only options for this scope are groups of CPUs that share an L2 cache or L3 cache. Clean up the error handling when looking up domains. Report invalid ids before calling rdt_find_domain() in preparation for better messages when scope can be other than cache scope. This means that rdt_find_domain() will never return an error. So remove checks for error from the call sites. Intel-SIG: commit f436cb6913a5 x86/resctrl: Prepare for new domain scope. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-2-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 46 ++++++++++++++++------- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 6 ++- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 5 ++- include/linux/resctrl.h | 9 ++++- 5 files changed, 49 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7751eea19fd24..4c5e985e13880 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -68,7 +68,7 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_L3, .name = "L3", - .cache_level = 3, + .scope = RESCTRL_L3_CACHE, .domains = domain_init(RDT_RESOURCE_L3), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -82,7 +82,7 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_L2, .name = "L2", - .cache_level = 2, + .scope = RESCTRL_L2_CACHE, .domains = domain_init(RDT_RESOURCE_L2), 
.parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -96,7 +96,7 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_MBA, .name = "MB", - .cache_level = 3, + .scope = RESCTRL_L3_CACHE, .domains = domain_init(RDT_RESOURCE_MBA), .parse_ctrlval = parse_bw, .format_str = "%d=%*u", @@ -108,7 +108,7 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_SMBA, .name = "SMBA", - .cache_level = 3, + .scope = RESCTRL_L3_CACHE, .domains = domain_init(RDT_RESOURCE_SMBA), .parse_ctrlval = parse_bw, .format_str = "%d=%*u", @@ -392,9 +392,6 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, struct rdt_domain *d; struct list_head *l; - if (id < 0) - return ERR_PTR(-ENODEV); - list_for_each(l, &r->domains) { d = list_entry(l, struct rdt_domain, list); /* When id is found, return its domain. */ @@ -484,6 +481,19 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) return 0; } +static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) +{ + switch (scope) { + case RESCTRL_L2_CACHE: + case RESCTRL_L3_CACHE: + return get_cpu_cacheinfo_id(cpu, scope); + default: + break; + } + + return -EINVAL; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. 
* @@ -499,7 +509,7 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + int id = get_domain_id_from_scope(cpu, r->scope); struct list_head *add_pos = NULL; struct rdt_hw_domain *hw_dom; struct rdt_domain *d; @@ -507,12 +517,14 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) lockdep_assert_held(&domain_list_lock); - d = rdt_find_domain(r, id, &add_pos); - if (IS_ERR(d)) { - pr_warn("Couldn't find cache id for CPU %d\n", cpu); + if (id < 0) { + pr_warn_once("Can't find domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->scope, r->name); return; } + d = rdt_find_domain(r, id, &add_pos); + if (d) { cpumask_set_cpu(cpu, &d->cpu_mask); if (r->cache.arch_has_per_cpu_cfg) @@ -552,15 +564,21 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) static void domain_remove_cpu(int cpu, struct rdt_resource *r) { - int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + int id = get_domain_id_from_scope(cpu, r->scope); struct rdt_hw_domain *hw_dom; struct rdt_domain *d; lockdep_assert_held(&domain_list_lock); + if (id < 0) { + pr_warn_once("Can't find domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->scope, r->name); + return; + } + d = rdt_find_domain(r, id, NULL); - if (IS_ERR_OR_NULL(d)) { - pr_warn("Couldn't find cache id for CPU %d\n", cpu); + if (!d) { + pr_warn("Couldn't find domain with id=%d for CPU %d\n", id, cpu); return; } hw_dom = resctrl_to_arch_dom(d); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index b7291f60399c0..2bf021d425007 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -577,7 +577,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) r = &rdt_resources_all[resid].r_resctrl; d = rdt_find_domain(r, domid, NULL); - if (IS_ERR_OR_NULL(d)) { + if (!d) { ret 
= -ENOENT; goto out; } diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 1e3f9d28a4b54..1041c6401e9e7 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -292,9 +292,13 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) */ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) { + enum resctrl_scope scope = plr->s->res->scope; struct cacheinfo *ci; int ret; + if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) + return -ENODEV; + /* Pick the first cpu we find that is associated with the cache. */ plr->cpu = cpumask_first(&plr->d->cpu_mask); @@ -305,7 +309,7 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) goto out_region; } - ci = get_cpu_cacheinfo_level(plr->cpu, plr->s->res->cache_level); + ci = get_cpu_cacheinfo_level(plr->cpu, scope); if (ci) { plr->line_size = ci->coherency_line_size; plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index cb68a121dabb6..50f5876a3020e 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1454,8 +1454,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct cacheinfo *ci; int num_b; + if (WARN_ON_ONCE(r->scope != RESCTRL_L2_CACHE && r->scope != RESCTRL_L3_CACHE)) + return size; + num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo_level(cpumask_any(&d->cpu_mask), r->cache_level); + ci = get_cpu_cacheinfo_level(cpumask_any(&d->cpu_mask), r->scope); if (ci) size = ci->size / r->cache.cbm_len * num_b; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a365f67131ece..ed693bfe474d5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -150,13 +150,18 @@ struct resctrl_membw { struct rdt_parse_data; struct resctrl_schema; +enum 
resctrl_scope { + RESCTRL_L2_CACHE = 2, + RESCTRL_L3_CACHE = 3, +}; + /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine * @num_rmid: Number of RMIDs available - * @cache_level: Which cache level defines scope of this resource + * @scope: Scope of this resource * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. * @domains: RCU list of all domains for this resource @@ -174,7 +179,7 @@ struct rdt_resource { bool alloc_capable; bool mon_capable; int num_rmid; - int cache_level; + enum resctrl_scope scope; struct resctrl_cache cache; struct resctrl_membw membw; struct list_head domains; From 2fd9dcb17c6e8c2e9df99bc8264f6c14c9447545 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:02 -0700 Subject: [PATCH 50/72] x86/resctrl: Prepare to split rdt_domain structure commit c103d4d48e1599a88001fa6215be27d55f3c025b upstream. The rdt_domain structure is used for both control and monitor features. It is about to be split into separate structures for these two usages because the scope for control and monitoring features for a resource will be different for future resources. To allow for common code that scans a list of domains looking for a specific domain id, move all the common fields ("list", "id", "cpu_mask") into their own structure within the rdt_domain structure. Intel-SIG: commit c103d4d48e15 x86/resctrl: Prepare to split rdt_domain structure. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-3-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 26 +++++----- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 24 ++++----- arch/x86/kernel/cpu/resctrl/monitor.c | 14 +++--- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 14 +++--- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 60 +++++++++++------------ include/linux/resctrl.h | 16 ++++-- 6 files changed, 81 insertions(+), 73 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4c5e985e13880..7c15959c2768d 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -355,9 +355,9 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) lockdep_assert_cpus_held(); - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->domains, hdr.list) { /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) return d; } @@ -393,12 +393,12 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, struct list_head *l; list_for_each(l, &r->domains) { - d = list_entry(l, struct rdt_domain, list); + d = list_entry(l, struct rdt_domain, hdr.list); /* When id is found, return its domain. */ - if (id == d->id) + if (id == d->hdr.id) return d; /* Stop searching when finding id's position in sorted list. 
*/ - if (id < d->id) + if (id < d->hdr.id) break; } @@ -526,7 +526,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d = rdt_find_domain(r, id, &add_pos); if (d) { - cpumask_set_cpu(cpu, &d->cpu_mask); + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); if (r->cache.arch_has_per_cpu_cfg) rdt_domain_reconfigure_cdp(r); return; @@ -537,8 +537,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d = &hw_dom->d_resctrl; - d->id = id; - cpumask_set_cpu(cpu, &d->cpu_mask); + d->hdr.id = id; + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); rdt_domain_reconfigure_cdp(r); @@ -552,11 +552,11 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; } - list_add_tail_rcu(&d->list, add_pos); + list_add_tail_rcu(&d->hdr.list, add_pos); err = resctrl_online_domain(r, d); if (err) { - list_del_rcu(&d->list); + list_del_rcu(&d->hdr.list); synchronize_rcu(); domain_free(hw_dom); } @@ -583,10 +583,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) } hw_dom = resctrl_to_arch_dom(d); - cpumask_clear_cpu(cpu, &d->cpu_mask); - if (cpumask_empty(&d->cpu_mask)) { + cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); + if (cpumask_empty(&d->hdr.cpu_mask)) { resctrl_offline_domain(r, d); - list_del_rcu(&d->list); + list_del_rcu(&d->hdr.list); synchronize_rcu(); /* diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 2bf021d425007..6246f48b04490 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -69,7 +69,7 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, cfg = &d->staged_config[s->conf_type]; if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); return -EINVAL; } @@ -148,7 +148,7 @@ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, cfg = &d->staged_config[s->conf_type]; if (cfg->have_new_ctrl) { - 
rdt_last_cmd_printf("Duplicate domain %d\n", d->id); + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); return -EINVAL; } @@ -231,8 +231,8 @@ static int parse_line(char *line, struct resctrl_schema *s, return -EINVAL; } dom = strim(dom); - list_for_each_entry(d, &r->domains, list) { - if (d->id == dom_id) { + list_for_each_entry(d, &r->domains, hdr.list) { + if (d->hdr.id == dom_id) { data.buf = dom; data.rdtgrp = rdtgrp; if (r->parse_ctrlval(&data, s, d)) @@ -280,7 +280,7 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, u32 idx = get_config_index(closid, t); struct msr_param msr_param; - if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask)) return -EINVAL; hw_dom->ctrl_val[idx] = cfg_val; @@ -306,7 +306,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->domains, hdr.list) { hw_dom = resctrl_to_arch_dom(d); msr_param.res = NULL; for (t = 0; t < CDP_NUM_TYPES; t++) { @@ -330,7 +330,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) } } if (msr_param.res) - smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1); + smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } return 0; @@ -450,7 +450,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo lockdep_assert_cpus_held(); seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(dom, &r->domains, list) { + list_for_each_entry(dom, &r->domains, hdr.list) { if (sep) seq_puts(s, ";"); @@ -460,7 +460,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo ctrl_val = resctrl_arch_get_config(r, dom, closid, schema->conf_type); - seq_printf(s, r->format_str, dom->id, max_data_width, + seq_printf(s, 
r->format_str, dom->hdr.id, max_data_width, ctrl_val); sep = true; } @@ -489,7 +489,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, } else { seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->s->res->name, - rdtgrp->plr->d->id, + rdtgrp->plr->d->hdr.id, rdtgrp->plr->cbm); } } else { @@ -537,7 +537,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, return; } - cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU); + cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, RESCTRL_PICK_ANY_CPU); /* * cpumask_any_housekeeping() prefers housekeeping CPUs, but @@ -546,7 +546,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, * counters on some platforms if its called in IRQ context. */ if (tick_nohz_full_cpu(cpu)) - smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + smp_call_function_any(&d->hdr.cpu_mask, mon_event_count, rr, 1); else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 366f496ca3ce2..4d7c596f9276e 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -281,7 +281,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, resctrl_arch_rmid_read_context_check(); - if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask)) return -EINVAL; ret = __rmid_read(rmid, eventid, &msr_val); @@ -364,7 +364,7 @@ void __check_limbo(struct rdt_domain *d, bool force_free) * CLOSID and RMID because there may be dependencies between them * on some architectures. 
*/ - trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->id, val); + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); } if (force_free || !rmid_dirty) { @@ -490,7 +490,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->domains, hdr.list) { /* * For the first limbo RMID in the domain, * setup up the limbo worker. @@ -802,7 +802,7 @@ void cqm_handle_limbo(struct work_struct *work) __check_limbo(d, false); if (has_busy_rmid(d)) { - d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, RESCTRL_PICK_ANY_CPU); schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, delay); @@ -826,7 +826,7 @@ void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); dom->cqm_work_cpu = cpu; if (cpu < nr_cpu_ids) @@ -869,7 +869,7 @@ void mbm_handle_overflow(struct work_struct *work) * Re-check for housekeeping CPUs. This allows the overflow handler to * move off a nohz_full CPU quickly. 
*/ - d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, RESCTRL_PICK_ANY_CPU); schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); @@ -898,7 +898,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms, */ if (!resctrl_mounted || !resctrl_arch_mon_capable()) return; - cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; if (cpu < nr_cpu_ids) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 1041c6401e9e7..86436464959c1 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -221,7 +221,7 @@ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) int cpu; int ret; - for_each_cpu(cpu, &plr->d->cpu_mask) { + for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); if (!pm_req) { rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); @@ -300,7 +300,7 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) return -ENODEV; /* Pick the first cpu we find that is associated with the cache. */ - plr->cpu = cpumask_first(&plr->d->cpu_mask); + plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); if (!cpu_online(plr->cpu)) { rdt_last_cmd_printf("CPU %u associated with cache not online\n", @@ -854,10 +854,10 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * associated with them. */ for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(d_i, &r->domains, list) { + list_for_each_entry(d_i, &r->domains, hdr.list) { if (d_i->plr) cpumask_or(cpu_with_psl, cpu_with_psl, - &d_i->cpu_mask); + &d_i->hdr.cpu_mask); } } @@ -865,7 +865,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * Next test if new pseudo-locked region would intersect with * existing region. 
*/ - if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) + if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) ret = true; free_cpumask_var(cpu_with_psl); @@ -1197,7 +1197,7 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) } plr->thread_done = 0; - cpu = cpumask_first(&plr->d->cpu_mask); + cpu = cpumask_first(&plr->d->hdr.cpu_mask); if (!cpu_online(cpu)) { ret = -ENODEV; goto out; @@ -1527,7 +1527,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) * may be scheduled elsewhere and invalidate entries in the * pseudo-locked region. */ - if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { + if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 50f5876a3020e..b6ba77cdf0e89 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -98,7 +98,7 @@ void rdt_staged_configs_clear(void) lockdep_assert_held(&rdtgroup_mutex); for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(dom, &r->domains, list) + list_for_each_entry(dom, &r->domains, hdr.list) memset(dom->staged_config, 0, sizeof(dom->staged_config)); } } @@ -317,7 +317,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, rdt_last_cmd_puts("Cache domain offline\n"); ret = -ENODEV; } else { - mask = &rdtgrp->plr->d->cpu_mask; + mask = &rdtgrp->plr->d->hdr.cpu_mask; seq_printf(s, is_cpu_list(of) ? 
"%*pbl\n" : "%*pb\n", cpumask_pr_args(mask)); @@ -1021,12 +1021,12 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, cpus_read_lock(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; - list_for_each_entry(dom, &r->domains, list) { + list_for_each_entry(dom, &r->domains, hdr.list) { if (sep) seq_putc(seq, ';'); sw_shareable = 0; exclusive = 0; - seq_printf(seq, "%d=", dom->id); + seq_printf(seq, "%d=", dom->hdr.id); for (i = 0; i < closids_supported(); i++) { if (!closid_allocated(i)) continue; @@ -1343,7 +1343,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) continue; has_cache = true; - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->domains, hdr.list) { ctrl = resctrl_arch_get_config(r, d, closid, s->conf_type); if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { @@ -1458,7 +1458,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, return size; num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo_level(cpumask_any(&d->cpu_mask), r->scope); + ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->scope); if (ci) size = ci->size / r->cache.cbm_len * num_b; @@ -1502,7 +1502,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, rdtgrp->plr->d, rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); } goto out; } @@ -1514,7 +1514,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, type = schema->conf_type; sep = false; seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->domains, hdr.list) { if (sep) seq_putc(s, ';'); if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { @@ -1532,7 +1532,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, else size = 
rdtgroup_cbm_to_size(r, d, ctrl); } - seq_printf(s, "%d=%u", d->id, size); + seq_printf(s, "%d=%u", d->hdr.id, size); sep = true; } seq_putc(s, '\n'); @@ -1592,7 +1592,7 @@ static void mon_event_config_read(void *info) static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info) { - smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1); + smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1); } static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) @@ -1604,7 +1604,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - list_for_each_entry(dom, &r->domains, list) { + list_for_each_entry(dom, &r->domains, hdr.list) { if (sep) seq_puts(s, ";"); @@ -1612,7 +1612,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid mon_info.evtid = evtid; mondata_config_read(dom, &mon_info); - seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config); + seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); sep = true; } seq_puts(s, "\n"); @@ -1678,7 +1678,7 @@ static void mbm_config_write_domain(struct rdt_resource *r, * are scoped at the domain level. Writing any of these MSRs * on one CPU is observed by all the CPUs in the domain. 
*/ - smp_call_function_any(&d->cpu_mask, mon_event_config_write, + smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write, &mon_info, 1); /* @@ -1728,8 +1728,8 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) return -EINVAL; } - list_for_each_entry(d, &r->domains, list) { - if (d->id == dom_id) { + list_for_each_entry(d, &r->domains, hdr.list) { + if (d->hdr.id == dom_id) { mbm_config_write_domain(r, d, evtid, val); goto next; } @@ -2276,14 +2276,14 @@ static int set_cache_qos_cfg(int level, bool enable) return -ENOMEM; r_l = &rdt_resources_all[level].r_resctrl; - list_for_each_entry(d, &r_l->domains, list) { + list_for_each_entry(d, &r_l->domains, hdr.list) { if (r_l->cache.arch_has_per_cpu_cfg) /* Pick all the CPUs in the domain instance */ - for_each_cpu(cpu, &d->cpu_mask) + for_each_cpu(cpu, &d->hdr.cpu_mask) cpumask_set_cpu(cpu, cpu_mask); else /* Pick one CPU from each domain instance to update MSR */ - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask); } /* Update QOS_CFG MSR on all the CPUs in cpu_mask */ @@ -2312,7 +2312,7 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) { u32 num_closid = resctrl_arch_get_num_closid(r); - int cpu = cpumask_any(&d->cpu_mask); + int cpu = cpumask_any(&d->hdr.cpu_mask); int i; d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), @@ -2361,7 +2361,7 @@ static int set_mba_sc(bool mba_sc) r->membw.mba_sc = mba_sc; - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->domains, hdr.list) { for (i = 0; i < num_closid; i++) d->mbps_val[i] = MBA_MAX_MBPS; } @@ -2700,7 +2700,7 @@ static int rdt_get_tree(struct fs_context *fc) if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - list_for_each_entry(dom, &r->domains, list) + list_for_each_entry(dom, &r->domains, hdr.list) 
mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, RESCTRL_PICK_ANY_CPU); } @@ -2827,13 +2827,13 @@ static int reset_all_ctrls(struct rdt_resource *r) * CBMs in all domains to the maximum mask value. Pick one CPU * from each domain to update the MSRs below. */ - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->domains, hdr.list) { hw_dom = resctrl_to_arch_dom(d); for (i = 0; i < hw_res->num_closid; i++) hw_dom->ctrl_val[i] = r->default_ctrl; msr_param.dom = d; - smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1); + smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } return 0; @@ -3031,7 +3031,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, char name[32]; int ret; - sprintf(name, "mon_%s_%02d", r->name, d->id); + sprintf(name, "mon_%s_%02d", r->name, d->hdr.id); /* create the directory */ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); if (IS_ERR(kn)) @@ -3047,7 +3047,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, } priv.u.rid = r->rid; - priv.u.domid = d->id; + priv.u.domid = d->hdr.id; list_for_each_entry(mevt, &r->evt_list, list) { priv.u.evtid = mevt->evtid; ret = mon_addfile(kn, mevt->name, priv.priv); @@ -3098,7 +3098,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - list_for_each_entry(dom, &r->domains, list) { + list_for_each_entry(dom, &r->domains, hdr.list) { ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); if (ret) return ret; @@ -3257,7 +3257,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, */ tmp_cbm = cfg->new_ctrl; if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id); + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); return -ENOSPC; } cfg->have_new_ctrl = true; @@ -3280,7 +3280,7 
@@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) struct rdt_domain *d; int ret; - list_for_each_entry(d, &s->res->domains, list) { + list_for_each_entry(d, &s->res->domains, hdr.list) { ret = __init_one_rdt_domain(d, s, closid); if (ret < 0) return ret; @@ -3295,7 +3295,7 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) struct resctrl_staged_config *cfg; struct rdt_domain *d; - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->domains, hdr.list) { if (is_mba_sc(r)) { d->mbps_val[closid] = MBA_MAX_MBPS; continue; @@ -3941,7 +3941,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) * per domain monitor data directories. */ if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d->id); + rmdir_mondata_subdir_allrdtgrp(r, d->hdr.id); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index ed693bfe474d5..f63fcf17a3bc1 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -59,10 +59,20 @@ struct resctrl_staged_config { }; /** - * struct rdt_domain - group of CPUs sharing a resctrl resource + * struct rdt_domain_hdr - common header for different domain types * @list: all instances of this resource * @id: unique id for this instance * @cpu_mask: which CPUs share this resource + */ +struct rdt_domain_hdr { + struct list_head list; + int id; + struct cpumask cpu_mask; +}; + +/** + * struct rdt_domain - group of CPUs sharing a resctrl resource + * @hdr: common header for different domain types * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth * @mbm_local: saved state for MBM local bandwidth @@ -77,9 +87,7 @@ struct resctrl_staged_config { * by closid */ struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; + struct rdt_domain_hdr hdr; unsigned long *rmid_busy_llc; struct 
mbm_state *mbm_total; struct mbm_state *mbm_local; From c56be7dbae17592be8c87f74a1f96eed798e9398 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:03 -0700 Subject: [PATCH 51/72] x86/resctrl: Prepare for different scope for control/monitor operations commit cd84f72b6a5c10f79f19fab67b0edfbc4fdbc5b1 upstream. Resctrl assumes that control and monitor operations on a resource are performed at the same scope. Prepare for systems that use different scope (specifically Intel needs to split the RDT_RESOURCE_L3 resource to use L3 scope for cache control and NODE scope for cache occupancy and memory bandwidth monitoring). Create separate domain lists for control and monitor operations. Note that errors during initialization of either control or monitor functions on a domain would previously result in that domain being excluded from both control and monitor operations. Now the domains are allocated independently it is no longer required to disable both control and monitor operations if either fail. Intel-SIG: commit cd84f72b6a5c x86/resctrl: Prepare for different scope for control/monitor operations. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-4-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 224 +++++++++++++++++----- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 12 +- arch/x86/kernel/cpu/resctrl/internal.h | 7 +- arch/x86/kernel/cpu/resctrl/monitor.c | 4 +- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 60 +++--- include/linux/resctrl.h | 25 ++- 7 files changed, 240 insertions(+), 96 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7c15959c2768d..66a5a270d66f5 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -60,7 +60,8 @@ static void mba_wrmsr_intel(struct msr_param *m); static void cat_wrmsr(struct msr_param *m); static void mba_wrmsr_amd(struct msr_param *m); -#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains) +#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains) +#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains) struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_L3] = @@ -68,8 +69,10 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_L3, .name = "L3", - .scope = RESCTRL_L3_CACHE, - .domains = domain_init(RDT_RESOURCE_L3), + .ctrl_scope = RESCTRL_L3_CACHE, + .mon_scope = RESCTRL_L3_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3), + .mon_domains = mon_domain_init(RDT_RESOURCE_L3), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", .fflags = RFTYPE_RES_CACHE, @@ -82,8 +85,8 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_L2, .name = "L2", - .scope = RESCTRL_L2_CACHE, - .domains = domain_init(RDT_RESOURCE_L2), + .ctrl_scope = 
RESCTRL_L2_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", .fflags = RFTYPE_RES_CACHE, @@ -96,8 +99,8 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_MBA, .name = "MB", - .scope = RESCTRL_L3_CACHE, - .domains = domain_init(RDT_RESOURCE_MBA), + .ctrl_scope = RESCTRL_L3_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), .parse_ctrlval = parse_bw, .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, @@ -108,8 +111,8 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_SMBA, .name = "SMBA", - .scope = RESCTRL_L3_CACHE, - .domains = domain_init(RDT_RESOURCE_SMBA), + .ctrl_scope = RESCTRL_L3_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), .parse_ctrlval = parse_bw, .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, @@ -349,13 +352,28 @@ static void cat_wrmsr(struct msr_param *m) wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } -struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +struct rdt_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r) { struct rdt_domain *d; lockdep_assert_cpus_held(); - list_for_each_entry(d, &r->domains, hdr.list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + +struct rdt_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { /* Find the domain that contains this CPU */ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) return d; @@ -379,26 +397,26 @@ void rdt_ctrl_update(void *arg) } /* - * rdt_find_domain - Find a domain in a resource that matches input resource id + * rdt_find_domain - Search for a domain id in a resource domain list. * - * Search resource r's domain list to find the resource id. 
If the resource - * id is found in a domain, return the domain. Otherwise, if requested by - * caller, return the first domain whose id is bigger than the input id. - * The domain list is sorted by id in ascending order. + * Search the domain list to find the domain id. If the domain id is + * found, return the domain. NULL otherwise. If the domain id is not + * found (and NULL returned) then the first domain with id bigger than + * the input id can be returned to the caller via @pos. */ -struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id, + struct list_head **pos) { - struct rdt_domain *d; + struct rdt_domain_hdr *d; struct list_head *l; - list_for_each(l, &r->domains) { - d = list_entry(l, struct rdt_domain, hdr.list); + list_for_each(l, h) { + d = list_entry(l, struct rdt_domain_hdr, list); /* When id is found, return its domain. */ - if (id == d->hdr.id) + if (id == d->id) return d; /* Stop searching when finding id's position in sorted list. */ - if (id < d->hdr.id) + if (id < d->id) break; } @@ -494,38 +512,29 @@ static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) return -EINVAL; } -/* - * domain_add_cpu - Add a cpu to a resource's domain list. - * - * If an existing domain in the resource r's domain list matches the cpu's - * resource id, add the cpu in the domain. - * - * Otherwise, a new domain is allocated and inserted into the right position - * in the domain list sorted by id in ascending order. - * - * The order in the domain list is visible to users when we print entries - * in the schemata file and schemata input is validated to have the same order - * as this list. 
- */ -static void domain_add_cpu(int cpu, struct rdt_resource *r) +static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) { - int id = get_domain_id_from_scope(cpu, r->scope); + int id = get_domain_id_from_scope(cpu, r->ctrl_scope); struct list_head *add_pos = NULL; struct rdt_hw_domain *hw_dom; + struct rdt_domain_hdr *hdr; struct rdt_domain *d; int err; lockdep_assert_held(&domain_list_lock); if (id < 0) { - pr_warn_once("Can't find domain id for CPU:%d scope:%d for resource %s\n", - cpu, r->scope, r->name); + pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->ctrl_scope, r->name); return; } - d = rdt_find_domain(r, id, &add_pos); + hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos); + if (hdr) { + if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + return; + d = container_of(hdr, struct rdt_domain, hdr); - if (d) { cpumask_set_cpu(cpu, &d->hdr.cpu_mask); if (r->cache.arch_has_per_cpu_cfg) rdt_domain_reconfigure_cdp(r); @@ -538,23 +547,70 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; + d->hdr.type = RESCTRL_CTRL_DOMAIN; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); rdt_domain_reconfigure_cdp(r); - if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + if (domain_setup_ctrlval(r, d)) { domain_free(hw_dom); return; } - if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + list_add_tail_rcu(&d->hdr.list, add_pos); + + err = resctrl_online_ctrl_domain(r, d); + if (err) { + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + domain_free(hw_dom); + } +} + +static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct list_head *add_pos = NULL; + struct rdt_hw_domain *hw_dom; + struct rdt_domain_hdr *hdr; + struct rdt_domain *d; + int err; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource 
%s\n", + cpu, r->mon_scope, r->name); + return; + } + + hdr = rdt_find_domain(&r->mon_domains, id, &add_pos); + if (hdr) { + if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + return; + d = container_of(hdr, struct rdt_domain, hdr); + + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + return; + } + + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!hw_dom) + return; + + d = &hw_dom->d_resctrl; + d->hdr.id = id; + d->hdr.type = RESCTRL_MON_DOMAIN; + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + + if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { domain_free(hw_dom); return; } list_add_tail_rcu(&d->hdr.list, add_pos); - err = resctrl_online_domain(r, d); + err = resctrl_online_mon_domain(r, d); if (err) { list_del_rcu(&d->hdr.list); synchronize_rcu(); @@ -562,30 +618,45 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) } } -static void domain_remove_cpu(int cpu, struct rdt_resource *r) +static void domain_add_cpu(int cpu, struct rdt_resource *r) +{ + if (r->alloc_capable) + domain_add_cpu_ctrl(cpu, r); + if (r->mon_capable) + domain_add_cpu_mon(cpu, r); +} + +static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) { - int id = get_domain_id_from_scope(cpu, r->scope); + int id = get_domain_id_from_scope(cpu, r->ctrl_scope); struct rdt_hw_domain *hw_dom; + struct rdt_domain_hdr *hdr; struct rdt_domain *d; lockdep_assert_held(&domain_list_lock); if (id < 0) { - pr_warn_once("Can't find domain id for CPU:%d scope:%d for resource %s\n", - cpu, r->scope, r->name); + pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->ctrl_scope, r->name); return; } - d = rdt_find_domain(r, id, NULL); - if (!d) { - pr_warn("Couldn't find domain with id=%d for CPU %d\n", id, cpu); + hdr = rdt_find_domain(&r->ctrl_domains, id, NULL); + if (!hdr) { + pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n", + id, cpu, r->name); return; } + + if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) 
+ return; + + d = container_of(hdr, struct rdt_domain, hdr); hw_dom = resctrl_to_arch_dom(d); cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); if (cpumask_empty(&d->hdr.cpu_mask)) { - resctrl_offline_domain(r, d); + resctrl_offline_ctrl_domain(r, d); list_del_rcu(&d->hdr.list); synchronize_rcu(); @@ -601,6 +672,53 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) } } +static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct rdt_hw_domain *hw_dom; + struct rdt_domain_hdr *hdr; + struct rdt_domain *d; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->mon_scope, r->name); + return; + } + + hdr = rdt_find_domain(&r->mon_domains, id, NULL); + if (!hdr) { + pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n", + id, cpu, r->name); + return; + } + + if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + return; + + d = container_of(hdr, struct rdt_domain, hdr); + hw_dom = resctrl_to_arch_dom(d); + + cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); + if (cpumask_empty(&d->hdr.cpu_mask)) { + resctrl_offline_mon_domain(r, d); + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + domain_free(hw_dom); + + return; + } +} + +static void domain_remove_cpu(int cpu, struct rdt_resource *r) +{ + if (r->alloc_capable) + domain_remove_cpu_ctrl(cpu, r); + if (r->mon_capable) + domain_remove_cpu_mon(cpu, r); +} + static void clear_closid_rmid(int cpu) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 6246f48b04490..8cc36723f0770 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -231,7 +231,7 @@ static int parse_line(char *line, struct resctrl_schema *s, return -EINVAL; } dom = strim(dom); - list_for_each_entry(d, 
&r->domains, hdr.list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (d->hdr.id == dom_id) { data.buf = dom; data.rdtgrp = rdtgrp; @@ -306,7 +306,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - list_for_each_entry(d, &r->domains, hdr.list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { hw_dom = resctrl_to_arch_dom(d); msr_param.res = NULL; for (t = 0; t < CDP_NUM_TYPES; t++) { @@ -450,7 +450,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo lockdep_assert_cpus_held(); seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(dom, &r->domains, hdr.list) { + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_puts(s, ";"); @@ -556,6 +556,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; + struct rdt_domain_hdr *hdr; u32 resid, evtid, domid; struct rdtgroup *rdtgrp; struct rdt_resource *r; @@ -576,11 +577,12 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) evtid = md.u.evtid; r = &rdt_resources_all[resid].r_resctrl; - d = rdt_find_domain(r, domid, NULL); - if (!d) { + hdr = rdt_find_domain(&r->mon_domains, domid, NULL); + if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { ret = -ENOENT; goto out; } + d = container_of(hdr, struct rdt_domain, hdr); mon_event_read(&rr, r, d, rdtgrp, evtid, false); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index f1d926832ec8a..377679b799194 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -558,8 +558,8 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn); int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, umode_t 
mask); -struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos); +struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id, + struct list_head **pos); ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, @@ -578,7 +578,8 @@ int rdt_pseudo_lock_init(void); void rdt_pseudo_lock_release(void); int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +struct rdt_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r); +struct rdt_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r); int closids_supported(void); void closid_free(int closid); int alloc_rmid(u32 closid); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 4d7c596f9276e..18ed3a6b08181 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -490,7 +490,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; - list_for_each_entry(d, &r->domains, hdr.list) { + list_for_each_entry(d, &r->mon_domains, hdr.list) { /* * For the first limbo RMID in the domain, * setup up the limbo worker. 
@@ -688,7 +688,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) idx = resctrl_arch_rmid_idx_encode(closid, rmid); pmbm_data = &dom_mbm->mbm_local[idx]; - dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); + dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); if (!dom_mba) { pr_warn_once("Failure to get domain for MBA update\n"); return; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 86436464959c1..63941dab11b21 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -292,7 +292,7 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) */ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) { - enum resctrl_scope scope = plr->s->res->scope; + enum resctrl_scope scope = plr->s->res->ctrl_scope; struct cacheinfo *ci; int ret; @@ -854,7 +854,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * associated with them. 
*/ for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(d_i, &r->domains, hdr.list) { + list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { if (d_i->plr) cpumask_or(cpu_with_psl, cpu_with_psl, &d_i->hdr.cpu_mask); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index b6ba77cdf0e89..17d4610eecf58 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -98,7 +98,7 @@ void rdt_staged_configs_clear(void) lockdep_assert_held(&rdtgroup_mutex); for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(dom, &r->domains, hdr.list) + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) memset(dom->staged_config, 0, sizeof(dom->staged_config)); } } @@ -1021,7 +1021,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, cpus_read_lock(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; - list_for_each_entry(dom, &r->domains, hdr.list) { + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_putc(seq, ';'); sw_shareable = 0; @@ -1343,7 +1343,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) continue; has_cache = true; - list_for_each_entry(d, &r->domains, hdr.list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { ctrl = resctrl_arch_get_config(r, d, closid, s->conf_type); if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { @@ -1454,11 +1454,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct cacheinfo *ci; int num_b; - if (WARN_ON_ONCE(r->scope != RESCTRL_L2_CACHE && r->scope != RESCTRL_L3_CACHE)) + if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) return size; num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->scope); + ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); if (ci) size = 
ci->size / r->cache.cbm_len * num_b; @@ -1514,7 +1514,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, type = schema->conf_type; sep = false; seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(d, &r->domains, hdr.list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (sep) seq_putc(s, ';'); if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { @@ -1604,7 +1604,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - list_for_each_entry(dom, &r->domains, hdr.list) { + list_for_each_entry(dom, &r->mon_domains, hdr.list) { if (sep) seq_puts(s, ";"); @@ -1728,7 +1728,7 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) return -EINVAL; } - list_for_each_entry(d, &r->domains, hdr.list) { + list_for_each_entry(d, &r->mon_domains, hdr.list) { if (d->hdr.id == dom_id) { mbm_config_write_domain(r, d, evtid, val); goto next; @@ -2276,7 +2276,7 @@ static int set_cache_qos_cfg(int level, bool enable) return -ENOMEM; r_l = &rdt_resources_all[level].r_resctrl; - list_for_each_entry(d, &r_l->domains, hdr.list) { + list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) { if (r_l->cache.arch_has_per_cpu_cfg) /* Pick all the CPUs in the domain instance */ for_each_cpu(cpu, &d->hdr.cpu_mask) @@ -2361,7 +2361,7 @@ static int set_mba_sc(bool mba_sc) r->membw.mba_sc = mba_sc; - list_for_each_entry(d, &r->domains, hdr.list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { for (i = 0; i < num_closid; i++) d->mbps_val[i] = MBA_MAX_MBPS; } @@ -2700,7 +2700,7 @@ static int rdt_get_tree(struct fs_context *fc) if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - list_for_each_entry(dom, &r->domains, hdr.list) + list_for_each_entry(dom, &r->mon_domains, hdr.list) mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, RESCTRL_PICK_ANY_CPU); } @@ -2824,10 +2824,10 @@ static int reset_all_ctrls(struct rdt_resource *r) /* * 
Disable resource control for this resource by setting all - * CBMs in all domains to the maximum mask value. Pick one CPU + * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU * from each domain to update the MSRs below. */ - list_for_each_entry(d, &r->domains, hdr.list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { hw_dom = resctrl_to_arch_dom(d); for (i = 0; i < hw_res->num_closid; i++) @@ -3098,7 +3098,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - list_for_each_entry(dom, &r->domains, hdr.list) { + list_for_each_entry(dom, &r->mon_domains, hdr.list) { ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); if (ret) return ret; @@ -3280,7 +3280,7 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) struct rdt_domain *d; int ret; - list_for_each_entry(d, &s->res->domains, hdr.list) { + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { ret = __init_one_rdt_domain(d, s, closid); if (ret < 0) return ret; @@ -3295,7 +3295,7 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) struct resctrl_staged_config *cfg; struct rdt_domain *d; - list_for_each_entry(d, &r->domains, hdr.list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (is_mba_sc(r)) { d->mbps_val[closid] = MBA_MAX_MBPS; continue; @@ -3926,15 +3926,19 @@ static void domain_destroy_mon_state(struct rdt_domain *d) kfree(d->mbm_local); } -void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d) { mutex_lock(&rdtgroup_mutex); if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) mba_sc_domain_destroy(r, d); - if (!r->mon_capable) - goto out_unlock; + mutex_unlock(&rdtgroup_mutex); +} + +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + mutex_lock(&rdtgroup_mutex); /* * If resctrl is mounted, 
remove all the @@ -3960,7 +3964,6 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) domain_destroy_mon_state(d); -out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -3995,7 +3998,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) return 0; } -int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d) { int err = 0; @@ -4004,11 +4007,18 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { /* RDT_RESOURCE_MBA is never mon_capable */ err = mba_sc_domain_allocate(r, d); - goto out_unlock; } - if (!r->mon_capable) - goto out_unlock; + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + int err; + + mutex_lock(&rdtgroup_mutex); err = domain_setup_mon_state(r, d); if (err) @@ -4073,7 +4083,7 @@ void resctrl_offline_cpu(unsigned int cpu) if (!l3->mon_capable) goto out_unlock; - d = get_domain_from_cpu(cpu, l3); + d = get_mon_domain_from_cpu(cpu, l3); if (d) { if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { cancel_delayed_work(&d->mbm_over); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index f63fcf17a3bc1..96ddf9ff31837 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -58,15 +58,22 @@ struct resctrl_staged_config { bool have_new_ctrl; }; +enum resctrl_domain_type { + RESCTRL_CTRL_DOMAIN, + RESCTRL_MON_DOMAIN, +}; + /** * struct rdt_domain_hdr - common header for different domain types * @list: all instances of this resource * @id: unique id for this instance + * @type: type of this instance * @cpu_mask: which CPUs share this resource */ struct rdt_domain_hdr { struct list_head list; int id; + enum resctrl_domain_type type; struct cpumask cpu_mask; }; @@ -169,10 +176,12 @@ enum resctrl_scope { * @alloc_capable: Is allocation 
available on this machine * @mon_capable: Is monitor feature available on this machine * @num_rmid: Number of RMIDs available - * @scope: Scope of this resource + * @ctrl_scope: Scope of this resource for control functions + * @mon_scope: Scope of this resource for monitor functions * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. - * @domains: RCU list of all domains for this resource + * @ctrl_domains: RCU list of all control domains for this resource + * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @data_width: Character width of data when displaying * @default_ctrl: Specifies default cache cbm or memory B/W percent. @@ -187,10 +196,12 @@ struct rdt_resource { bool alloc_capable; bool mon_capable; int num_rmid; - enum resctrl_scope scope; + enum resctrl_scope ctrl_scope; + enum resctrl_scope mon_scope; struct resctrl_cache cache; struct resctrl_membw membw; - struct list_head domains; + struct list_head ctrl_domains; + struct list_head mon_domains; char *name; int data_width; u32 default_ctrl; @@ -236,8 +247,10 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); -int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); -void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d); +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); From ae688d67dc306878d90487af6d885cb8e7a8da17 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 
28 Jun 2024 14:56:04 -0700 Subject: [PATCH 52/72] x86/resctrl: Split the rdt_domain and rdt_hw_domain structures commit cae2bcb6a2c691ef7b537ad07e9819a5ed645bcc upstream. The same rdt_domain structure is used for both control and monitor functions. But this results in wasted memory as some of the fields are only used by control functions, while most are only used for monitor functions. Split into separate rdt_ctrl_domain and rdt_mon_domain structures with just the fields required for control and monitoring respectively. Similar split of the rdt_hw_domain structure into rdt_hw_ctrl_domain and rdt_hw_mon_domain. Intel-SIG: commit cae2bcb6a2c6 x86/resctrl: Split the rdt_domain and rdt_hw_domain structures. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-5-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 71 ++++++++++++----------- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 28 ++++----- arch/x86/kernel/cpu/resctrl/internal.h | 62 ++++++++++++-------- arch/x86/kernel/cpu/resctrl/monitor.c | 40 ++++++------- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 6 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 64 ++++++++++---------- include/linux/resctrl.h | 48 ++++++++------- 7 files changed, 174 insertions(+), 145 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 66a5a270d66f5..cd58c9d4710fb 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -309,8 +309,8 @@ static void rdt_get_cdp_l2_config(void) static void mba_wrmsr_amd(struct msr_param *m) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); - struct rdt_hw_domain *hw_dom = 
resctrl_to_arch_dom(m->dom); unsigned int i; for (i = m->low; i < m->high; i++) @@ -333,8 +333,8 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) static void mba_wrmsr_intel(struct msr_param *m) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom); unsigned int i; /* Write the delay values for mba. */ @@ -344,17 +344,17 @@ static void mba_wrmsr_intel(struct msr_param *m) static void cat_wrmsr(struct msr_param *m) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom); unsigned int i; for (i = m->low; i < m->high; i++) wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } -struct rdt_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r) +struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r) { - struct rdt_domain *d; + struct rdt_ctrl_domain *d; lockdep_assert_cpus_held(); @@ -367,9 +367,9 @@ struct rdt_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r) return NULL; } -struct rdt_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r) +struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r) { - struct rdt_domain *d; + struct rdt_mon_domain *d; lockdep_assert_cpus_held(); @@ -440,18 +440,23 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) *dc = r->default_ctrl; } -static void domain_free(struct rdt_hw_domain *hw_dom) +static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) +{ + kfree(hw_dom->ctrl_val); + kfree(hw_dom); +} + +static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { kfree(hw_dom->arch_mbm_total); kfree(hw_dom->arch_mbm_local); - kfree(hw_dom->ctrl_val); kfree(hw_dom); } -static int domain_setup_ctrlval(struct rdt_resource *r, struct 
rdt_domain *d) +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct msr_param m; u32 *dc; @@ -476,7 +481,7 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) * @num_rmid: The size of the MBM counter array * @hw_dom: The domain that owns the allocated arrays */ -static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { size_t tsize; @@ -515,10 +520,10 @@ static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) { int id = get_domain_id_from_scope(cpu, r->ctrl_scope); + struct rdt_hw_ctrl_domain *hw_dom; struct list_head *add_pos = NULL; - struct rdt_hw_domain *hw_dom; struct rdt_domain_hdr *hdr; - struct rdt_domain *d; + struct rdt_ctrl_domain *d; int err; lockdep_assert_held(&domain_list_lock); @@ -533,7 +538,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) if (hdr) { if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) return; - d = container_of(hdr, struct rdt_domain, hdr); + d = container_of(hdr, struct rdt_ctrl_domain, hdr); cpumask_set_cpu(cpu, &d->hdr.cpu_mask); if (r->cache.arch_has_per_cpu_cfg) @@ -553,7 +558,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (domain_setup_ctrlval(r, d)) { - domain_free(hw_dom); + ctrl_domain_free(hw_dom); return; } @@ -563,7 +568,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) if (err) { list_del_rcu(&d->hdr.list); synchronize_rcu(); - domain_free(hw_dom); + ctrl_domain_free(hw_dom); } } @@ -571,9 +576,9 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) { int id = 
get_domain_id_from_scope(cpu, r->mon_scope); struct list_head *add_pos = NULL; - struct rdt_hw_domain *hw_dom; + struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; - struct rdt_domain *d; + struct rdt_mon_domain *d; int err; lockdep_assert_held(&domain_list_lock); @@ -588,7 +593,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) if (hdr) { if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) return; - d = container_of(hdr, struct rdt_domain, hdr); + d = container_of(hdr, struct rdt_mon_domain, hdr); cpumask_set_cpu(cpu, &d->hdr.cpu_mask); return; @@ -604,7 +609,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) cpumask_set_cpu(cpu, &d->hdr.cpu_mask); if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { - domain_free(hw_dom); + mon_domain_free(hw_dom); return; } @@ -614,7 +619,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) if (err) { list_del_rcu(&d->hdr.list); synchronize_rcu(); - domain_free(hw_dom); + mon_domain_free(hw_dom); } } @@ -629,9 +634,9 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) { int id = get_domain_id_from_scope(cpu, r->ctrl_scope); - struct rdt_hw_domain *hw_dom; + struct rdt_hw_ctrl_domain *hw_dom; struct rdt_domain_hdr *hdr; - struct rdt_domain *d; + struct rdt_ctrl_domain *d; lockdep_assert_held(&domain_list_lock); @@ -651,8 +656,8 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) return; - d = container_of(hdr, struct rdt_domain, hdr); - hw_dom = resctrl_to_arch_dom(d); + d = container_of(hdr, struct rdt_ctrl_domain, hdr); + hw_dom = resctrl_to_arch_ctrl_dom(d); cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); if (cpumask_empty(&d->hdr.cpu_mask)) { @@ -661,12 +666,12 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) synchronize_rcu(); /* - * rdt_domain "d" is going to be freed below, so clear + * rdt_ctrl_domain 
"d" is going to be freed below, so clear * its pointer from pseudo_lock_region struct. */ if (d->plr) d->plr->d = NULL; - domain_free(hw_dom); + ctrl_domain_free(hw_dom); return; } @@ -675,9 +680,9 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) { int id = get_domain_id_from_scope(cpu, r->mon_scope); - struct rdt_hw_domain *hw_dom; + struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; - struct rdt_domain *d; + struct rdt_mon_domain *d; lockdep_assert_held(&domain_list_lock); @@ -697,15 +702,15 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) return; - d = container_of(hdr, struct rdt_domain, hdr); - hw_dom = resctrl_to_arch_dom(d); + d = container_of(hdr, struct rdt_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); if (cpumask_empty(&d->hdr.cpu_mask)) { resctrl_offline_mon_domain(r, d); list_del_rcu(&d->hdr.list); synchronize_rcu(); - domain_free(hw_dom); + mon_domain_free(hw_dom); return; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 8cc36723f0770..3b9383612c359 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -60,7 +60,7 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) } int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d) + struct rdt_ctrl_domain *d) { struct resctrl_staged_config *cfg; u32 closid = data->rdtgrp->closid; @@ -139,7 +139,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) * resource type. 
*/ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d) + struct rdt_ctrl_domain *d) { struct rdtgroup *rdtgrp = data->rdtgrp; struct resctrl_staged_config *cfg; @@ -208,8 +208,8 @@ static int parse_line(char *line, struct resctrl_schema *s, struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; struct rdt_parse_data data; + struct rdt_ctrl_domain *d; char *dom = NULL, *id; - struct rdt_domain *d; unsigned long dom_id; /* Walking r->domains, ensure it can't race with cpuhp */ @@ -272,11 +272,11 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type) } } -int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); u32 idx = get_config_index(closid, t); struct msr_param msr_param; @@ -297,17 +297,17 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; - struct rdt_hw_domain *hw_dom; + struct rdt_hw_ctrl_domain *hw_dom; struct msr_param msr_param; + struct rdt_ctrl_domain *d; enum resctrl_conf_type t; - struct rdt_domain *d; u32 idx; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - hw_dom = resctrl_to_arch_dom(d); + hw_dom = resctrl_to_arch_ctrl_dom(d); msr_param.res = NULL; for (t = 0; t < CDP_NUM_TYPES; t++) { cfg = &hw_dom->d_resctrl.staged_config[t]; @@ -430,10 +430,10 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return ret ?: nbytes; } -u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, +u32 
resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); u32 idx = get_config_index(closid, type); return hw_dom->ctrl_val[idx]; @@ -442,7 +442,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) { struct rdt_resource *r = schema->res; - struct rdt_domain *dom; + struct rdt_ctrl_domain *dom; bool sep = false; u32 ctrl_val; @@ -514,7 +514,7 @@ static int smp_mon_event_count(void *arg) } void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_domain *d, struct rdtgroup *rdtgrp, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, int evtid, int first) { int cpu; @@ -557,11 +557,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; struct rdt_domain_hdr *hdr; + struct rdt_mon_domain *d; u32 resid, evtid, domid; struct rdtgroup *rdtgrp; struct rdt_resource *r; union mon_data_bits md; - struct rdt_domain *d; struct rmid_read rr; int ret = 0; @@ -582,7 +582,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) ret = -ENOENT; goto out; } - d = container_of(hdr, struct rdt_domain, hdr); + d = container_of(hdr, struct rdt_mon_domain, hdr); mon_event_read(&rr, r, d, rdtgrp, evtid, false); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 377679b799194..135190e0711c0 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -147,7 +147,7 @@ union mon_data_bits { struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; - struct rdt_domain *d; + struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; int err; @@ -232,7 +232,7 @@ struct mongroup { */ struct pseudo_lock_region { struct 
resctrl_schema *s; - struct rdt_domain *d; + struct rdt_ctrl_domain *d; u32 cbm; wait_queue_head_t lock_thread_wq; int thread_done; @@ -355,25 +355,41 @@ struct arch_mbm_state { }; /** - * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share - * a resource + * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share + * a resource for a control function * @d_resctrl: Properties exposed to the resctrl file system * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * + * Members of this structure are accessed via helpers that provide abstraction. + */ +struct rdt_hw_ctrl_domain { + struct rdt_ctrl_domain d_resctrl; + u32 *ctrl_val; +}; + +/** + * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share + * a resource for a monitor function + * @d_resctrl: Properties exposed to the resctrl file system * @arch_mbm_total: arch private state for MBM total bandwidth * @arch_mbm_local: arch private state for MBM local bandwidth * * Members of this structure are accessed via helpers that provide abstraction. 
*/ -struct rdt_hw_domain { - struct rdt_domain d_resctrl; - u32 *ctrl_val; +struct rdt_hw_mon_domain { + struct rdt_mon_domain d_resctrl; struct arch_mbm_state *arch_mbm_total; struct arch_mbm_state *arch_mbm_local; }; -static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) +static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) +{ + return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl); +} + +static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r) { - return container_of(r, struct rdt_hw_domain, d_resctrl); + return container_of(r, struct rdt_hw_mon_domain, d_resctrl); } /** @@ -385,7 +401,7 @@ static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) */ struct msr_param { struct rdt_resource *res; - struct rdt_domain *dom; + struct rdt_ctrl_domain *dom; u32 low; u32 high; }; @@ -458,9 +474,9 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r } int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d); + struct rdt_ctrl_domain *d); int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d); + struct rdt_ctrl_domain *d); extern struct mutex rdtgroup_mutex; @@ -564,22 +580,22 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, unsigned long cbm, int closid, bool exclusive); -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, unsigned long cbm); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); int rdtgroup_tasks_assigned(struct rdtgroup *r); 
int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm); -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); int rdt_pseudo_lock_init(void); void rdt_pseudo_lock_release(void); int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -struct rdt_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r); -struct rdt_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r); +struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r); +struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r); int closids_supported(void); void closid_free(int closid); int alloc_rmid(u32 closid); @@ -590,19 +606,19 @@ bool __init rdt_cpu_has(int flag); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_domain *d, struct rdtgroup *rdtgrp, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); -void mbm_setup_overflow_handler(struct rdt_domain *dom, +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_domain *d); -void __check_limbo(struct rdt_domain *d, bool force_free); +bool 
has_busy_rmid(struct rdt_mon_domain *d); +void __check_limbo(struct rdt_mon_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init thread_throttle_mode_init(void); void __init mbm_config_rftype_init(const char *config); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 18ed3a6b08181..2f5d35f0716fb 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -209,7 +209,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) return 0; } -static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom, u32 rmid, enum resctrl_event_id eventid) { @@ -228,11 +228,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, return NULL; } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct arch_mbm_state *am; am = get_arch_mbm_state(hw_dom, rmid, eventid); @@ -248,9 +248,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, * Assumes that hardware counters are also reset and thus that there is * no need to record initial non-zero counts. 
*/ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d) +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); if (is_mbm_total_enabled()) memset(hw_dom->arch_mbm_total, 0, @@ -269,12 +269,12 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct arch_mbm_state *am; u64 msr_val, chunks; int ret; @@ -320,7 +320,7 @@ static void limbo_release_entry(struct rmid_entry *entry) * decrement the count. 
If the busy count gets to zero on an RMID, we * free the RMID */ -void __check_limbo(struct rdt_domain *d, bool force_free) +void __check_limbo(struct rdt_mon_domain *d, bool force_free) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; u32 idx_limit = resctrl_arch_system_num_rmid_idx(); @@ -378,7 +378,7 @@ void __check_limbo(struct rdt_domain *d, bool force_free) resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } -bool has_busy_rmid(struct rdt_domain *d) +bool has_busy_rmid(struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); @@ -479,7 +479,7 @@ int alloc_rmid(u32 closid) static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - struct rdt_domain *d; + struct rdt_mon_domain *d; u32 idx; lockdep_assert_held(&rdtgroup_mutex); @@ -532,7 +532,7 @@ void free_rmid(u32 closid, u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid, +static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); @@ -668,12 +668,12 @@ void mon_event_count(void *info) * throttle MSRs already have low percentage values. To avoid * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 
*/ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; + struct rdt_ctrl_domain *dom_mba; struct rdt_resource *r_mba; - struct rdt_domain *dom_mba; u32 cur_bw, user_bw, idx; struct list_head *head; struct rdtgroup *entry; @@ -734,7 +734,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); } -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid) { struct rmid_read rr; @@ -792,12 +792,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_domain *d; + struct rdt_mon_domain *d; cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - d = container_of(work, struct rdt_domain, cqm_limbo.work); + d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); __check_limbo(d, false); @@ -820,7 +820,7 @@ void cqm_handle_limbo(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. 
*/ -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); @@ -837,9 +837,9 @@ void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; + struct rdt_mon_domain *d; struct list_head *head; struct rdt_resource *r; - struct rdt_domain *d; cpus_read_lock(); mutex_lock(&rdtgroup_mutex); @@ -852,7 +852,7 @@ void mbm_handle_overflow(struct work_struct *work) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - d = container_of(work, struct rdt_domain, mbm_over.work); + d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); @@ -886,7 +886,7 @@ void mbm_handle_overflow(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. */ -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms, +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 63941dab11b21..3e7e405ea715d 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -809,7 +809,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) * Return: true if @cbm overlaps with pseudo-locked region on @d, false * otherwise. 
*/ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm) +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) { unsigned int cbm_len; unsigned long cbm_b; @@ -836,11 +836,11 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm * if it is not possible to test due to memory allocation issue, * false otherwise. */ -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) { + struct rdt_ctrl_domain *d_i; cpumask_var_t cpu_with_psl; struct rdt_resource *r; - struct rdt_domain *d_i; bool ret = false; /* Walking r->domains, ensure it can't race with cpuhp */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 17d4610eecf58..eb3bbfa96d5ab 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -92,8 +92,8 @@ void rdt_last_cmd_printf(const char *fmt, ...) void rdt_staged_configs_clear(void) { + struct rdt_ctrl_domain *dom; struct rdt_resource *r; - struct rdt_domain *dom; lockdep_assert_held(&rdtgroup_mutex); @@ -1012,7 +1012,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, unsigned long sw_shareable = 0, hw_shareable = 0; unsigned long exclusive = 0, pseudo_locked = 0; struct rdt_resource *r = s->res; - struct rdt_domain *dom; + struct rdt_ctrl_domain *dom; int i, hwb, swb, excl, psl; enum rdtgrp_mode mode; bool sep = false; @@ -1243,7 +1243,7 @@ static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, * * Return: false if CBM does not overlap, true if it does. 
*/ -static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, unsigned long cbm, int closid, enum resctrl_conf_type type, bool exclusive) { @@ -1298,7 +1298,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d * * Return: true if CBM overlap detected, false if there is no overlap */ -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, unsigned long cbm, int closid, bool exclusive) { enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); @@ -1329,10 +1329,10 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; + struct rdt_ctrl_domain *d; struct resctrl_schema *s; struct rdt_resource *r; bool has_cache = false; - struct rdt_domain *d; u32 ctrl; /* Walking r->domains, ensure it can't race with cpuhp */ @@ -1448,7 +1448,7 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, * bitmap functions work correctly. 
*/ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_domain *d, unsigned long cbm) + struct rdt_ctrl_domain *d, unsigned long cbm) { unsigned int size = 0; struct cacheinfo *ci; @@ -1476,9 +1476,9 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, { struct resctrl_schema *schema; enum resctrl_conf_type type; + struct rdt_ctrl_domain *d; struct rdtgroup *rdtgrp; struct rdt_resource *r; - struct rdt_domain *d; unsigned int size; int ret = 0; u32 closid; @@ -1590,7 +1590,7 @@ static void mon_event_config_read(void *info) mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; } -static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info) +static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info) { smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1); } @@ -1598,7 +1598,7 @@ static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mo static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) { struct mon_config_info mon_info = {0}; - struct rdt_domain *dom; + struct rdt_mon_domain *dom; bool sep = false; cpus_read_lock(); @@ -1657,7 +1657,7 @@ static void mon_event_config_write(void *info) } static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_domain *d, u32 evtid, u32 val) + struct rdt_mon_domain *d, u32 evtid, u32 val) { struct mon_config_info mon_info = {0}; @@ -1698,7 +1698,7 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); char *dom_str = NULL, *id_str; unsigned long dom_id, val; - struct rdt_domain *d; + struct rdt_mon_domain *d; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); @@ -2257,9 +2257,9 @@ static inline bool is_mba_linear(void) static int set_cache_qos_cfg(int level, bool enable) { void (*update)(void *arg); + struct rdt_ctrl_domain *d; struct rdt_resource 
*r_l; cpumask_var_t cpu_mask; - struct rdt_domain *d; int cpu; /* Walking r->domains, ensure it can't race with cpuhp */ @@ -2309,7 +2309,7 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } -static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) { u32 num_closid = resctrl_arch_get_num_closid(r); int cpu = cpumask_any(&d->hdr.cpu_mask); @@ -2327,7 +2327,7 @@ static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) } static void mba_sc_domain_destroy(struct rdt_resource *r, - struct rdt_domain *d) + struct rdt_ctrl_domain *d) { kfree(d->mbps_val); d->mbps_val = NULL; @@ -2353,7 +2353,7 @@ static int set_mba_sc(bool mba_sc) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; u32 num_closid = resctrl_arch_get_num_closid(r); - struct rdt_domain *d; + struct rdt_ctrl_domain *d; int i; if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) @@ -2625,7 +2625,7 @@ static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_domain *dom; + struct rdt_mon_domain *dom; struct rdt_resource *r; int ret; @@ -2810,9 +2810,9 @@ static int rdt_init_fs_context(struct fs_context *fc) static int reset_all_ctrls(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom; + struct rdt_hw_ctrl_domain *hw_dom; struct msr_param msr_param; - struct rdt_domain *d; + struct rdt_ctrl_domain *d; int i; /* Walking r->domains, ensure it can't race with cpuhp */ @@ -2828,7 +2828,7 @@ static int reset_all_ctrls(struct rdt_resource *r) * from each domain to update the MSRs below. 
*/ list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - hw_dom = resctrl_to_arch_dom(d); + hw_dom = resctrl_to_arch_ctrl_dom(d); for (i = 0; i < hw_res->num_closid; i++) hw_dom->ctrl_val[i] = r->default_ctrl; @@ -3021,7 +3021,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_domain *d, + struct rdt_mon_domain *d, struct rdt_resource *r, struct rdtgroup *prgrp) { union mon_data_bits priv; @@ -3070,7 +3070,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, * and "monitor" groups with given domain id. */ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d) + struct rdt_mon_domain *d) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; @@ -3092,7 +3092,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_resource *r, struct rdtgroup *prgrp) { - struct rdt_domain *dom; + struct rdt_mon_domain *dom; int ret; /* Walking r->domains, ensure it can't race with cpuhp */ @@ -3197,7 +3197,7 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) * Set the RDT domain up to start off with all usable allocations. That is, * all shareable and unused bits. All-zero CBM is invalid. 
*/ -static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, +static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, u32 closid) { enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); @@ -3277,7 +3277,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, */ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) { - struct rdt_domain *d; + struct rdt_ctrl_domain *d; int ret; list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { @@ -3293,7 +3293,7 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; - struct rdt_domain *d; + struct rdt_ctrl_domain *d; list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (is_mba_sc(r)) { @@ -3919,14 +3919,14 @@ static void __init rdtgroup_setup_default(void) mutex_unlock(&rdtgroup_mutex); } -static void domain_destroy_mon_state(struct rdt_domain *d) +static void domain_destroy_mon_state(struct rdt_mon_domain *d) { bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); } -void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d) +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) { mutex_lock(&rdtgroup_mutex); @@ -3936,7 +3936,7 @@ void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d) mutex_unlock(&rdtgroup_mutex); } -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain *d) +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) { mutex_lock(&rdtgroup_mutex); @@ -3967,7 +3967,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain *d) mutex_unlock(&rdtgroup_mutex); } -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { 
u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; @@ -3998,7 +3998,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) return 0; } -int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d) +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) { int err = 0; @@ -4014,7 +4014,7 @@ int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d) return err; } -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain *d) +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) { int err; @@ -4069,8 +4069,8 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) void resctrl_offline_cpu(unsigned int cpu) { struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; - struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 96ddf9ff31837..aa2c22a8e37b3 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -78,7 +78,23 @@ struct rdt_domain_hdr { }; /** - * struct rdt_domain - group of CPUs sharing a resctrl resource + * struct rdt_ctrl_domain - group of CPUs sharing a resctrl control resource + * @hdr: common header for different domain types + * @plr: pseudo-locked region (if any) associated with domain + * @staged_config: parsed configuration to be applied + * @mbps_val: When mba_sc is enabled, this holds the array of user + * specified control values for mba_sc in MBps, indexed + * by closid + */ +struct rdt_ctrl_domain { + struct rdt_domain_hdr hdr; + struct pseudo_lock_region *plr; + struct resctrl_staged_config staged_config[CDP_NUM_TYPES]; + u32 *mbps_val; +}; + +/** + * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types * 
@rmid_busy_llc: bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth @@ -87,13 +103,8 @@ struct rdt_domain_hdr { * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters - * @plr: pseudo-locked region (if any) associated with domain - * @staged_config: parsed configuration to be applied - * @mbps_val: When mba_sc is enabled, this holds the array of user - * specified control values for mba_sc in MBps, indexed - * by closid */ -struct rdt_domain { +struct rdt_mon_domain { struct rdt_domain_hdr hdr; unsigned long *rmid_busy_llc; struct mbm_state *mbm_total; @@ -102,9 +113,6 @@ struct rdt_domain { struct delayed_work cqm_limbo; int mbm_work_cpu; int cqm_work_cpu; - struct pseudo_lock_region *plr; - struct resctrl_staged_config staged_config[CDP_NUM_TYPES]; - u32 *mbps_val; }; /** @@ -208,7 +216,7 @@ struct rdt_resource { const char *format_str; int (*parse_ctrlval)(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d); + struct rdt_ctrl_domain *d); struct list_head evt_list; unsigned long fflags; bool cdp_capable; @@ -242,15 +250,15 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. 
*/ -int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val); -u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type); -int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d); -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain *d); -void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d); -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain *d); +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); @@ -279,7 +287,7 @@ void resctrl_offline_cpu(unsigned int cpu); * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *arch_mon_ctx); @@ -312,7 +320,7 @@ static inline void resctrl_arch_rmid_read_context_check(void) * * This can be called from any CPU. */ -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid); @@ -325,7 +333,7 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, * * This can be called from any CPU. 
*/ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d); extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; From 8bc2443123999eaea311d3996669e0b599d1af6d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:05 -0700 Subject: [PATCH 53/72] x86/resctrl: Add node-scope to the options for feature scope commit 1a171608ee8d40d22d604303e42f033c69151123 upstream. Currently supported resctrl features are all domain scoped the same as the scope of the L2 or L3 caches. Add RESCTRL_L3_NODE as a new option for features that are scoped at the same granularity as NUMA nodes. This is needed for Intel's Sub-NUMA Cluster (SNC) feature where monitoring features are divided between nodes that share an L3 cache. Intel-SIG: commit 1a171608ee8d x86/resctrl: Add node-scope to the options for feature scope. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-6-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 2 ++ include/linux/resctrl.h | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index cd58c9d4710fb..44af35a28d5f1 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -510,6 +510,8 @@ static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) case RESCTRL_L2_CACHE: case RESCTRL_L3_CACHE: return get_cpu_cacheinfo_id(cpu, scope); + case RESCTRL_L3_NODE: + return cpu_to_node(cpu); default: break; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index aa2c22a8e37b3..64b6ad1b22a14 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -176,6 +176,7 @@ struct resctrl_schema; enum resctrl_scope { RESCTRL_L2_CACHE = 2, RESCTRL_L3_CACHE = 3, + RESCTRL_L3_NODE, }; /** From 3ecafe3be57863ecbd6d2f19c7776359eb1c1fc7 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:06 -0700 Subject: [PATCH 54/72] x86/resctrl: Introduce snc_nodes_per_l3_cache commit e13db55b5a0d447dea63cde772c1078405bbbf96 upstream. Intel Sub-NUMA Cluster (SNC) is a feature that subdivides the CPU cores and memory controllers on a socket into two or more groups. These are presented to the operating system as NUMA nodes. This may enable some workloads to have slightly lower latency to memory as the memory controller(s) in an SNC node are electrically closer to the CPU cores on that SNC node. This cost may be offset by lower bandwidth since the memory accesses for each core can only be interleaved between the memory controllers on the same SNC node. 
Resctrl monitoring on an Intel system depends upon attaching RMIDs to tasks to track L3 cache occupancy and memory bandwidth. There is an MSR that controls how the RMIDs are shared between SNC nodes. The default mode divides them numerically. E.g. when there are two SNC nodes on a socket the lower number half of the RMIDs are given to the first node, the remainder to the second node. This would be difficult to use with the Linux resctrl interface as specific RMID values assigned to resctrl groups are not visible to users. RMID sharing mode divides the physical RMIDs evenly between SNC nodes but uses a logical RMID in the IA32_PQR_ASSOC MSR. For example a system with 200 physical RMIDs (as enumerated by CPUID leaf 0xF) that has two SNC nodes per L3 cache instance would have 100 logical RMIDs available for Linux to use. A task running on SNC node 0 with RMID 5 would accumulate LLC occupancy and MBM bandwidth data in physical RMID 5. Another task using RMID 5, but running on SNC node 1 would accumulate data in physical RMID 105. Even with this renumbering SNC mode requires several changes in resctrl behavior for correct operation. Add a static global to arch/x86/kernel/cpu/resctrl/monitor.c to indicate how many SNC domains share an L3 cache instance. Initialize this to "1". Runtime detection of SNC mode will adjust this value. Update all places to take appropriate action when SNC mode is enabled: 1) The number of logical RMIDs per L3 cache available for use is the number of physical RMIDs divided by the number of SNC nodes. 2) Likewise the "mon_scale" value must be divided by the number of SNC nodes. 3) Add a function to convert from logical RMID values (assigned to tasks and loaded into the IA32_PQR_ASSOC MSR on context switch) to physical RMID values to load into IA32_QM_EVTSEL MSR when reading counters on each SNC node. Intel-SIG: commit e13db55b5a0d x86/resctrl: Introduce snc_nodes_per_l3_cache. 
Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-7-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/monitor.c | 56 ++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2f5d35f0716fb..b6220089c68cb 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -97,6 +97,8 @@ unsigned int resctrl_rmid_realloc_limit; #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) +static int snc_nodes_per_l3_cache = 1; + /* * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied @@ -185,7 +187,43 @@ static inline struct rmid_entry *__rmid_entry(u32 idx) return entry; } -static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) +/* + * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by + * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is + * needed. The physical RMID is the same as the logical RMID. + * + * On a platform with SNC mode enabled, Linux enables RMID sharing mode + * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel + * Resource Director Technology Architecture Specification" for a full + * description of RMID sharing mode). + * + * In RMID sharing mode there are fewer "logical RMID" values available + * to accumulate data ("physical RMIDs" are divided evenly between SNC + * nodes that share an L3 cache). Linux creates an rdt_mon_domain for + * each SNC node. + * + * The value loaded into IA32_PQR_ASSOC is the "logical RMID". 
+ * + * Data is collected independently on each SNC node and can be retrieved + * using the "physical RMID" value computed by this function and loaded + * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node. + * + * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3 + * cache. So a "physical RMID" may be read from any CPU that shares + * the L3 cache with the desired SNC node, not just from a CPU in + * the specific SNC node. + */ +static int logical_rmid_to_physical_rmid(int cpu, int lrmid) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + if (snc_nodes_per_l3_cache == 1) + return lrmid; + + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; +} + +static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) { u64 msr_val; @@ -197,7 +235,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) * are error bits. */ - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); rdmsrl(MSR_IA32_QM_CTR, msr_val); if (msr_val & RMID_VAL_ERROR) @@ -233,14 +271,17 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, enum resctrl_event_id eventid) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; + u32 prmid; am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { memset(am, 0, sizeof(*am)); + prmid = logical_rmid_to_physical_rmid(cpu, rmid); /* Record any initial, non-zero count value. 
*/ - __rmid_read(rmid, eventid, &am->prev_msr); + __rmid_read_phys(prmid, eventid, &am->prev_msr); } } @@ -275,8 +316,10 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; u64 msr_val, chunks; + u32 prmid; int ret; resctrl_arch_rmid_read_context_check(); @@ -284,7 +327,8 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask)) return -EINVAL; - ret = __rmid_read(rmid, eventid, &msr_val); + prmid = logical_rmid_to_physical_rmid(cpu, rmid); + ret = __rmid_read_phys(prmid, eventid, &msr_val); if (ret) return ret; @@ -1023,8 +1067,8 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) int ret; resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; - hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; - r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; + r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) From 4db2f374bb339203d77d1c23dfaf1894f09927ea Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:07 -0700 Subject: [PATCH 55/72] x86/resctrl: Block use of mba_MBps mount option on Sub-NUMA Cluster (SNC) systems commit ac20aa423052553c005089b00f1e3caf79d3c1d3 upstream. When SNC is enabled there is a mismatch between the MBA control function which operates at L3 cache scope and the MBM monitor functions which measure memory bandwidth on each SNC node. Block use of the mba_MBps when scopes for MBA/MBM do not match. Improve user diagnostics by adding invalfc() message when mba_MBps is not supported. 
Intel-SIG: commit ac20aa423052 x86/resctrl: Block use of mba_MBps mount option on Sub-NUMA Cluster (SNC) systems. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-8-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index eb3bbfa96d5ab..d3b0fa9582669 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2335,14 +2335,18 @@ static void mba_sc_domain_destroy(struct rdt_resource *r, /* * MBA software controller is supported only if - * MBM is supported and MBA is in linear scale. + * MBM is supported and MBA is in linear scale, + * and the MBM monitor scope is the same as MBA + * control scope. 
*/ static bool supports_mba_mbps(void) { + struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; return (is_mbm_local_enabled() && - r->alloc_capable && is_mba_linear()); + r->alloc_capable && is_mba_linear() && + r->ctrl_scope == rmbm->mon_scope); } /* @@ -2750,6 +2754,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct rdt_fs_context *ctx = rdt_fc2context(fc); struct fs_parse_result result; + const char *msg; int opt; opt = fs_parse(fc, rdt_fs_parameters, param, &result); @@ -2764,8 +2769,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->enable_cdpl2 = true; return 0; case Opt_mba_mbps: + msg = "mba_MBps requires local MBM and linear scale MBA at L3 scope"; if (!supports_mba_mbps()) - return -EINVAL; + return invalfc(fc, msg); ctx->enable_mba_mbps = true; return 0; case Opt_debug: From 82140d84149ef3c30d094119a73725f6e14aa61a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:08 -0700 Subject: [PATCH 56/72] x86/resctrl: Prepare for new Sub-NUMA Cluster (SNC) monitor files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 328ea688746420e12ced6cfbc5064413180244cc upstream. When SNC is enabled, monitoring data is collected at the SNC node granularity, but must be reported at L3-cache granularity for backwards compatibility in addition to reporting at the node level. Add a "ci" field to the rdt_mon_domain structure to save the cache information about the enclosing L3 cache for the domain. This provides: 1) The cache id which is needed to compose the name of the legacy monitoring directory, and to determine which domains should be summed to provide L3-scoped data. 2) The shared_cpu_map which is needed to determine which CPUs can be used to read the RMID counters with the MSR interface. 
This is the first step to an eventual goal of monitor reporting files like this (for a system with two SNC nodes per L3): $ cd /sys/fs/resctrl/mon_data $ tree mon_L3_00 mon_L3_00 <- 00 here is L3 cache id ├── llc_occupancy \ These files provide legacy support ├── mbm_local_bytes > for non-SNC aware monitor apps ├── mbm_total_bytes / that expect data at L3 cache level ├── mon_sub_L3_00 <- 00 here is SNC node id │   ├── llc_occupancy \ These files are finer grained │   ├── mbm_local_bytes > data from each SNC node │   └── mbm_total_bytes / └── mon_sub_L3_01 ├── llc_occupancy \ ├── mbm_local_bytes > As above, but for node 1. └── mbm_total_bytes / [ bp: Massage commit message. ] Intel-SIG: commit 328ea6887464 x86/resctrl: Prepare for new Sub-NUMA Cluster (SNC) monitor files. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-9-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/core.c | 7 ++++++- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 1 - arch/x86/kernel/cpu/resctrl/rdtgroup.c | 1 - include/linux/resctrl.h | 3 +++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 44af35a28d5f1..8205b6a56e85a 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -608,6 +607,12 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; + d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!d->ci) { + pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); + mon_domain_free(hw_dom); + return; + } cpumask_set_cpu(cpu, 
&d->hdr.cpu_mask); if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 3e7e405ea715d..180bcacddf75b 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -11,7 +11,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d3b0fa9582669..70d41a8fd7886 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -12,7 +12,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 64b6ad1b22a14..b0875b99e8111 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -2,6 +2,7 @@ #ifndef _RESCTRL_H #define _RESCTRL_H +#include #include #include #include @@ -96,6 +97,7 @@ struct rdt_ctrl_domain { /** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types + * @ci: cache info for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth * @mbm_local: saved state for MBM local bandwidth @@ -106,6 +108,7 @@ struct rdt_ctrl_domain { */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; + struct cacheinfo *ci; unsigned long *rmid_busy_llc; struct mbm_state *mbm_total; struct mbm_state *mbm_local; From f255cb833d9ec6672c0ce724584a5e915b0229a8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:09 -0700 Subject: [PATCH 57/72] x86/resctrl: Add a new field to struct rmid_read for summation of domains commit fb1f51f677585f1b1ba17d2390963bbebe7a8cfa upstream. 
When a user reads a monitor file rdtgroup_mondata_show() calls mon_event_read() to package up all the required details into an rmid_read structure which is passed across the smp_call*() infrastructure to code that will read data from hardware and return the value (or error status) in the rmid_read structure. Sub-NUMA Cluster (SNC) mode adds files with new semantics. These require the smp_call-ed code to sum event data from all domains that share an L3 cache. Add a pointer to the L3 "cacheinfo" structure to struct rmid_read for the data collection routines to use to pick the domains to be summed. [ Reinette: the rmid_read structure has become complex enough so document each of its fields and provide the kerneldoc documentation for struct rmid_read. ] Intel-SIG: commit fb1f51f67758 x86/resctrl: Add a new field to struct rmid_read for summation of domains. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Co-developed-by: Reinette Chatre Signed-off-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-10-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/internal.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 135190e0711c0..681b5bdcd2f95 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -144,12 +144,31 @@ union mon_data_bits { } u; }; +/** + * struct rmid_read - Data passed across smp_call*() to read event count. + * @rgrp: Resource group for which the counter is being read. If it is a parent + * resource group then its event count is summed with the count from all + * its child resource groups. + * @r: Resource describing the properties of the event being read. + * @d: Domain that the counter should be read from. 
If NULL then sum all + * domains in @r sharing L3 @ci.id + * @evtid: Which monitor event to read. + * @first: Initialize MBM counter when true. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @err: Error encountered when reading counter. + * @val: Returned value of event counter. If @rgrp is a parent resource group, + * @val includes the sum of event counts from its child resource groups. + * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, + * (summed across child resource groups if @rgrp is a parent resource group). + * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). + */ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; + struct cacheinfo *ci; int err; u64 val; void *arch_mon_ctx; From 54fcbe76da1fca5e0349582f088f9cb2a76172e4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:10 -0700 Subject: [PATCH 58/72] x86/resctrl: Initialize on-stack struct rmid_read instances commit 587edd7069b9e7dc7993d2df9371e7c37a4d2133 upstream. New semantics rely on some struct rmid_read members having NULL values to distinguish between the SNC and non-SNC scenarios. resctrl can thus no longer rely on this struct not being initialized properly. Initialize all on-stack declarations of struct rmid_read: rdtgroup_mondata_show() mbm_update() mkdir_mondata_subdir() to ensure that garbage values from the stack are not passed down to other functions. [ bp: Massage commit message. ] Intel-SIG: commit 587edd7069b9 x86/resctrl: Initialize on-stack struct rmid_read instances. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-11-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 3 +-- arch/x86/kernel/cpu/resctrl/monitor.c | 3 +-- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 3b9383612c359..4d76ff31a9e0d 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -529,7 +529,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->evtid = evtid; rr->r = r; rr->d = d; - rr->val = 0; rr->first = first; rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); if (IS_ERR(rr->arch_mon_ctx)) { @@ -557,12 +556,12 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; struct rdt_domain_hdr *hdr; + struct rmid_read rr = {0}; struct rdt_mon_domain *d; u32 resid, evtid, domid; struct rdtgroup *rdtgrp; struct rdt_resource *r; union mon_data_bits md; - struct rmid_read rr; int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index b6220089c68cb..9bba9d59d5f6e 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -781,9 +781,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid) { - struct rmid_read rr; + struct rmid_read rr = {0}; - rr.first = false; rr.r = r; rr.d = d; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 70d41a8fd7886..d0443589cd861 100644 --- 
a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3029,10 +3029,10 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, struct rdt_mon_domain *d, struct rdt_resource *r, struct rdtgroup *prgrp) { + struct rmid_read rr = {0}; union mon_data_bits priv; struct kernfs_node *kn; struct mon_evt *mevt; - struct rmid_read rr; char name[32]; int ret; From 0788d7b22c230cc11471e63fcae37a7c61990435 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:11 -0700 Subject: [PATCH 59/72] x86/resctrl: Refactor mkdir_mondata_subdir() with a helper function commit 603cf1e28838a01e4f140c3054ce147f8b087d08 upstream. In Sub-NUMA Cluster (SNC) mode Linux must create the monitor files in the original "mon_L3_XX" directories and also in each of the "mon_sub_L3_YY" directories. Refactor mkdir_mondata_subdir() to move the creation of monitoring files into a helper function to avoid the need to duplicate code later. No functional change. Intel-SIG: commit 603cf1e28838 x86/resctrl: Refactor mkdir_mondata_subdir() with a helper function. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-12-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 45 ++++++++++++++++---------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d0443589cd861..9c38ddcfe1509 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3025,14 +3025,37 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } } +static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + struct rmid_read rr = {0}; + union mon_data_bits priv; + struct mon_evt *mevt; + int ret; + + if (WARN_ON(list_empty(&r->evt_list))) + return -EPERM; + + priv.u.rid = r->rid; + priv.u.domid = d->hdr.id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + return ret; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); + } + + return 0; +} + static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, struct rdt_mon_domain *d, struct rdt_resource *r, struct rdtgroup *prgrp) { - struct rmid_read rr = {0}; - union mon_data_bits priv; struct kernfs_node *kn; - struct mon_evt *mevt; char name[32]; int ret; @@ -3046,22 +3069,10 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, if (ret) goto out_destroy; - if (WARN_ON(list_empty(&r->evt_list))) { - ret = -EPERM; + ret = mon_add_all_files(kn, d, r, prgrp); + if (ret) goto out_destroy; - } - priv.u.rid = r->rid; - priv.u.domid = d->hdr.id; - list_for_each_entry(mevt, &r->evt_list, list) { - priv.u.evtid = mevt->evtid; - ret = mon_addfile(kn, mevt->name, priv.priv); 
- if (ret) - goto out_destroy; - - if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); - } kernfs_activate(kn); return 0; From cf740c56cfa246abfd401b2903eb9533b5436d28 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:12 -0700 Subject: [PATCH 60/72] x86/resctrl: Allocate a new field in union mon_data_bits commit 92b5d0b1189ea9e9f00ae493fc99102fe7f2442f upstream. When Sub-NUMA Cluster (SNC) mode is enabled, the legacy monitor reporting files must report the sum of the data from all of the SNC nodes that share the L3 cache that is referenced by the monitor file. Resctrl squeezes all the attributes of these files into 32 bits so they can be stored in the "priv" field of struct kernfs_node. Currently, only three monitor events are defined by enum resctrl_event_id so reducing it from 8 bits to 7 bits still provides more than enough space to represent all the known event types. But note that this choice was arbitrary. The "rid" field is also far wider than needed for the current number of resource id types. This structure is purely internal to resctrl, no ABI issues with modifying it. Subsequent changes may rearrange the allocation of bits between each of the fields as needed. Give the bit to a new "sum" field that indicates that reading this file must sum across SNC nodes. This bit also indicates that the domid field is the id of an L3 cache (instead of a domain id) to find which domains must be summed. Fix up other issues in the kerneldoc description for mon_data_bits. Intel-SIG: commit 92b5d0b1189e x86/resctrl: Allocate a new field in union mon_data_bits. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-13-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/internal.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 681b5bdcd2f95..13d862221f9c4 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -127,19 +127,25 @@ struct mon_evt { }; /** - * union mon_data_bits - Monitoring details for each event file + * union mon_data_bits - Monitoring details for each event file. * @priv: Used to store monitoring event data in @u - * as kernfs private data - * @rid: Resource id associated with the event file - * @evtid: Event id associated with the event file - * @domid: The domain to which the event file belongs - * @u: Name of the bit fields struct + * as kernfs private data. + * @u.rid: Resource id associated with the event file. + * @u.evtid: Event id associated with the event file. + * @u.sum: Set when event must be summed across multiple + * domains. + * @u.domid: When @u.sum is zero this is the domain to which + * the event file belongs. When @u.sum is one this + * is the id of the L3 cache that all domains to be + * summed share. + * @u: Name of the bit fields struct. */ union mon_data_bits { void *priv; struct { unsigned int rid : 10; - enum resctrl_event_id evtid : 8; + enum resctrl_event_id evtid : 7; + unsigned int sum : 1; unsigned int domid : 14; } u; }; From e2eb6decd80aece43adc2f2920f2b7f2e0140faf Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:13 -0700 Subject: [PATCH 61/72] x86/resctrl: Create Sub-NUMA Cluster (SNC) monitor files commit 0158ed6a1335ff37f0336a986d7b99d6e97d46e9 upstream.
When SNC mode is enabled, create subdirectories and files to monitor at the SNC node granularity. Legacy behavior is preserved by tagging the monitor files at the L3 granularity with the "sum" attribute. When the user reads these files the kernel will read monitor data from all SNC nodes that share the same L3 cache instance and return the aggregated value to the user. Note that the "domid" field for files that must sum across SNC domains has the L3 cache instance id, while non-summing files use the domain id. The "sum" files do not need to make a call to mon_event_read() to initialize the MBM counters. This will be handled by initializing the individual SNC nodes that share the L3. Intel-SIG: commit 0158ed6a1335 x86/resctrl: Create Sub-NUMA Cluster (SNC) monitor files. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-14-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 62 +++++++++++++++++++------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 9c38ddcfe1509..8502385e389fa 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3026,7 +3026,8 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) + struct rdt_resource *r, struct rdtgroup *prgrp, + bool do_sum) { struct rmid_read rr = {0}; union mon_data_bits priv; @@ -3037,14 +3038,15 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, return -EPERM; priv.u.rid = r->rid; - priv.u.domid = d->hdr.id; + priv.u.domid = do_sum ? 
d->ci->id : d->hdr.id; + priv.u.sum = do_sum; list_for_each_entry(mevt, &r->evt_list, list) { priv.u.evtid = mevt->evtid; ret = mon_addfile(kn, mevt->name, priv.priv); if (ret) return ret; - if (is_mbm_event(mevt->evtid)) + if (!do_sum && is_mbm_event(mevt->evtid)) mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); } @@ -3055,23 +3057,51 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, struct rdt_mon_domain *d, struct rdt_resource *r, struct rdtgroup *prgrp) { - struct kernfs_node *kn; + struct kernfs_node *kn, *ckn; char name[32]; - int ret; + bool snc_mode; + int ret = 0; - sprintf(name, "mon_%s_%02d", r->name, d->hdr.id); - /* create the directory */ - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); + lockdep_assert_held(&rdtgroup_mutex); - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + kn = kernfs_find_and_get(parent_kn, name); + if (kn) { + /* + * rdtgroup_mutex will prevent this directory from being + * removed. No need to keep this hold. 
+ */ + kernfs_put(kn); + } else { + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); - ret = mon_add_all_files(kn, d, r, prgrp); - if (ret) - goto out_destroy; + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); + if (ret) + goto out_destroy; + } + + if (snc_mode) { + sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); + ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); + if (IS_ERR(ckn)) { + ret = -EINVAL; + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(ckn); + if (ret) + goto out_destroy; + + ret = mon_add_all_files(ckn, d, r, prgrp, false); + if (ret) + goto out_destroy; + } kernfs_activate(kn); return 0; From 9dd2b9e8c100531e2989842f9799e2a8551d210c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 2 Jul 2024 10:38:19 -0700 Subject: [PATCH 62/72] x86/resctrl: Handle removing directories in Sub-NUMA Cluster (SNC) mode commit 6b48b80b08e6f08eea8eaf7e44555ada191b6bee upstream. In SNC mode, there are multiple subdirectories in each L3 level monitor directory (one for each SNC node). If all the CPUs in an SNC node are taken offline, just remove the SNC directory for that node. In non-SNC mode, or when the last SNC node directory is removed, remove the L3 monitor directory. Add a helper function to avoid duplicated code. Intel-SIG: commit 6b48b80b08e6 x86/resctrl: Handle removing directories in Sub-NUMA Cluster (SNC) mode. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240702173820.90368-2-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 35 +++++++++++++++++++++----- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 8502385e389fa..58e53f1f52a05 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3006,22 +3006,45 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, return ret; } +static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get(pkn, name); + if (!kn) + return; + kernfs_put(kn); + + if (kn->dir.subdirs <= 1) + kernfs_remove(kn); + else + kernfs_remove_by_name(kn, subname); +} + /* * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups with given domain id. + * and monitor groups for the given domain. + * Remove files and directories containing "sum" of domain data + * when last domain being summed is removed. */ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - unsigned int dom_id) + struct rdt_mon_domain *d) { struct rdtgroup *prgrp, *crgrp; + char subname[32]; + bool snc_mode; char name[32]; + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci->id : d->hdr.id); + if (snc_mode) + sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - sprintf(name, "mon_%s_%02d", r->name, dom_id); - kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) - kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); + mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); } } @@ -3991,7 +4014,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d * per domain monitor data directories. */ if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d->hdr.id); + rmdir_mondata_subdir_allrdtgrp(r, d); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); From 8dfac274fcd6bf1a3ff19af844ab6fcf17822fb8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:15 -0700 Subject: [PATCH 63/72] x86/resctrl: Fill out rmid_read structure for smp_call*() to read a counter commit c8c7d3d904b76c45fe2b5dc982fb5090d12a63af upstream. mon_event_read() fills out most fields of the struct rmid_read that is passed via an smp_call*() function to a CPU that is part of the correct domain to read the monitor counters. With Sub-NUMA Cluster (SNC) mode there are now two cases to handle: 1) Reading a file that returns a value for a single domain. + Choose the CPU to execute from the domain cpu_mask 2) Reading a file that must sum across domains sharing an L3 cache instance. + Indicate to called code that a sum is needed by passing a NULL rdt_mon_domain pointer. + Choose the CPU from the L3 shared_cpu_map. Intel-SIG: commit c8c7d3d904b7 x86/resctrl: Fill out rmid_read structure for smp_call*() to read a counter. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-16-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 40 ++++++++++++++++++----- arch/x86/kernel/cpu/resctrl/internal.h | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 4d76ff31a9e0d..50fa1fe9a073f 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -515,7 +515,7 @@ static int smp_mon_event_count(void *arg) void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - int evtid, int first) + cpumask_t *cpumask, int evtid, int first) { int cpu; @@ -536,7 +536,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, return; } - cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, RESCTRL_PICK_ANY_CPU); + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); /* * cpumask_any_housekeeping() prefers housekeeping CPUs, but @@ -545,7 +545,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, * counters on some platforms if its called in IRQ context. 
*/ if (tick_nohz_full_cpu(cpu)) - smp_call_function_any(&d->hdr.cpu_mask, mon_event_count, rr, 1); + smp_call_function_any(cpumask, mon_event_count, rr, 1); else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); @@ -574,16 +574,40 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) resid = md.u.rid; domid = md.u.domid; evtid = md.u.evtid; - r = &rdt_resources_all[resid].r_resctrl; - hdr = rdt_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + + if (md.u.sum) { + /* + * This file requires summing across all domains that share + * the L3 cache id that was provided in the "domid" field of the + * mon_data_bits union. Search all domains in the resource for + * one that matches this cache id. + */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->ci->id == domid) { + rr.ci = d->ci; + mon_event_read(&rr, r, NULL, rdtgrp, + &d->ci->shared_cpu_map, evtid, false); + goto checkresult; + } + } ret = -ENOENT; goto out; + } else { + /* + * This file provides data from a single domain. Search + * the resource to find the domain with "domid". 
+ */ + hdr = rdt_find_domain(&r->mon_domains, domid, NULL); + if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + ret = -ENOENT; + goto out; + } + d = container_of(hdr, struct rdt_mon_domain, hdr); + mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, evtid, false); +checkresult: if (rr.err == -EIO) seq_puts(m, "Error\n"); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 13d862221f9c4..16982d1baf992 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -632,7 +632,7 @@ void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - int evtid, int first); + cpumask_t *cpumask, int evtid, int first); void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 58e53f1f52a05..d7163b764c626 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3070,7 +3070,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, return ret; if (!do_sum && is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); + mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); } return 0; From b3de5caef374f0a9f391c219b1f2a75106eb7cb6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:16 -0700 Subject: [PATCH 64/72] x86/resctrl: Make __mon_event_count() handle sum domains commit 9fbb303ec949a376f3cbdf6a2b66ad2212c24ebc upstream. Legacy resctrl monitor files must provide the sum of event values across all Sub-NUMA Cluster (SNC) domains that share an L3 cache instance. 
There are now two cases: 1) A specific domain is provided in struct rmid_read This is either a non-SNC system, or the request is to read data from just one SNC node. 2) Domain pointer is NULL. In this case the cacheinfo field in struct rmid_read indicates that all SNC nodes that share that L3 cache instance should have the event read and return the sum of all values. Update the CPU sanity check. The existing check that an event is read from a CPU in the requested domain still applies when reading a single domain. But when summing across domains a more relaxed check that the current CPU is in the scope of the L3 cache instance is appropriate since the MSRs to read events are scoped at L3 cache level. Intel-SIG: commit 9fbb303ec949 x86/resctrl: Make __mon_event_count() handle sum domains. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-17-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/monitor.c | 51 ++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 9bba9d59d5f6e..7ee4e0c901590 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -324,9 +324,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, resctrl_arch_rmid_read_context_check(); - if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask)) - return -EINVAL; - prmid = logical_rmid_to_physical_rmid(cpu, rmid); ret = __rmid_read_phys(prmid, eventid, &msr_val); if (ret) @@ -593,7 +590,10 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { + int cpu = 
smp_processor_id(); + struct rdt_mon_domain *d; struct mbm_state *m; + int err, ret; u64 tval = 0; if (rr->first) { @@ -604,14 +604,47 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) return 0; } - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid, - &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; + if (rr->d) { + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) + return -EINVAL; + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; - rr->val += tval; + rr->val += tval; - return 0; + return 0; + } + + /* Summing domains that share a cache, must be on a CPU for that cache. */ + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + return -EINVAL; + + /* + * Legacy files must report the sum of an event across all + * domains that share the same L3 cache instance. + * Report success if a read from any domain succeeds, -EINVAL + * (translated to "Unavailable" for user space) if reading from + * all domains fail for any reason. + */ + ret = -EINVAL; + list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { + if (d->ci->id != rr->ci->id) + continue; + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (!err) { + rr->val += tval; + ret = 0; + } + } + + if (ret) + rr->err = ret; + + return ret; } /* From 5fdb9b07403fb7b7b55153cf6a3a97d25f3be90a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 2 Jul 2024 10:38:20 -0700 Subject: [PATCH 65/72] x86/resctrl: Enable shared RMID mode on Sub-NUMA Cluster (SNC) systems commit 21b362cc762aabb3e8496d33d7b4538154c95a0b upstream. Hardware has two RMID configuration options for SNC systems. The default mode divides RMID counters between SNC nodes. E.g. with 200 RMIDs and two SNC nodes per L3 cache RMIDs 0..99 are used on node 0, and 100..199 on node 1. 
This isn't compatible with Linux resctrl usage. On this example system a process using RMID 5 would only update monitor counters while running on SNC node 0. The other mode is "RMID Sharing Mode". This is enabled by clearing bit 0 of the RMID_SNC_CONFIG (0xCA0) model specific register. In this mode the number of logical RMIDs is the number of physical RMIDs (from CPUID leaf 0xF) divided by the number of SNC nodes per L3 cache instance. A process can use the same RMID across different SNC nodes. See the "Intel Resource Director Technology Architecture Specification" for additional details. When SNC is enabled, update the MSR when a monitor domain is marked online. Technically this is overkill. It only needs to be done once per L3 cache instance rather than per SNC domain. But there is no harm in doing it more than once, and this is not in a critical path. Intel-SIG: commit 21b362cc762a x86/resctrl: Enable shared RMID mode on Sub-NUMA Cluster (SNC) systems. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240702173820.90368-3-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/resctrl/core.c | 2 ++ arch/x86/kernel/cpu/resctrl/internal.h | 2 ++ arch/x86/kernel/cpu/resctrl/monitor.c | 20 ++++++++++++++++++++ 4 files changed, 25 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8a07cfa720eb3..75dfc36b19cfd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1127,6 +1127,7 @@ #define MSR_IA32_QM_CTR 0xc8e #define MSR_IA32_PQR_ASSOC 0xc8f #define MSR_IA32_L3_CBM_BASE 0xc90 +#define MSR_RMID_SNC_CONFIG 0xca0 #define MSR_IA32_L2_CBM_BASE 0xd10 #define MSR_IA32_MBA_THRTL_BASE 0xd50 diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 8205b6a56e85a..ac2592f19c499 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -615,6 +615,8 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) } cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + arch_mon_domain_online(r, d); + if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 16982d1baf992..955999aecfca9 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -534,6 +534,8 @@ static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); + /* * To return the common struct rdt_resource, which is contained in struct * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource. 
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 7ee4e0c901590..a62c4dc91161d 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -1091,6 +1091,26 @@ static void l3_mon_evt_init(struct rdt_resource *r) list_add_tail(&mbm_local_event.list, &r->evt_list); } +/* + * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 + * which indicates that RMIDs are configured in legacy mode. + * This mode is incompatible with Linux resctrl semantics + * as RMIDs are partitioned between SNC nodes, which requires + * a user to know which RMID is allocated to a task. + * Clearing bit 0 reconfigures the RMID counters for use + * in RMID sharing mode. This mode is better for Linux. + * The RMID space is divided between all SNC nodes with the + * RMIDs renumbered to start from zero in each node when + * counting operations from tasks. Code to read the counters + * must adjust RMID counter numbers based on SNC node. See + * logical_rmid_to_physical_rmid() for code that does this. + */ +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + if (snc_nodes_per_l3_cache > 1) + msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); +} + int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; From 1be4215fabe82686e0b2e3a9ef4a8bdcc5b1425f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:18 -0700 Subject: [PATCH 66/72] x86/resctrl: Detect Sub-NUMA Cluster (SNC) mode commit 13488150f5e2a9b84a335ae18bee33a918ead85d upstream. There isn't a simple hardware bit that indicates whether a CPU is running in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in the same NUMA node as CPU0. Add the missing definition of pr_fmt() to monitor.c. This wasn't noticed before as there are only "can't happen" console messages from this file. 
[ bp: Massage commit message. ] Intel-SIG: commit 13488150f5e2 x86/resctrl: Detect Sub-NUMA Cluster (SNC) mode. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-19-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/kernel/cpu/resctrl/monitor.c | 66 +++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index a62c4dc91161d..cc0f1c48d7b25 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -15,6 +15,8 @@ * Software Developer Manual June 2016, volume 3, section 17.17. */ +#define pr_fmt(fmt) "resctrl: " fmt + #include #include #include @@ -1111,6 +1113,68 @@ void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); } +/* CPU models that support MSR_RMID_SNC_CONFIG */ +static const struct x86_cpu_id snc_cpu_ids[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, 0), + {} +}; + +/* + * There isn't a simple hardware bit that indicates whether a CPU is running + * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the + * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in + * the same NUMA node as CPU0. + * It is not possible to accurately determine SNC state if the system is + * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes + * to L3 caches. It will be OK if system is booted with hyperthreading + * disabled (since this doesn't affect the ratio). 
+ */ +static __init int snc_get_config(void) +{ + struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE); + const cpumask_t *node0_cpumask; + int cpus_per_node, cpus_per_l3; + int ret; + + if (!x86_match_cpu(snc_cpu_ids) || !ci) + return 1; + + cpus_read_lock(); + if (num_online_cpus() != num_present_cpus()) + pr_warn("Some CPUs offline, SNC detection may be incorrect\n"); + cpus_read_unlock(); + + node0_cpumask = cpumask_of_node(cpu_to_node(0)); + + cpus_per_node = cpumask_weight(node0_cpumask); + cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map); + + if (!cpus_per_node || !cpus_per_l3) + return 1; + + ret = cpus_per_l3 / cpus_per_node; + + /* sanity check: Only valid results are 1, 2, 3, 4 */ + switch (ret) { + case 1: + break; + case 2 ... 4: + pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret); + rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE; + break; + default: + pr_warn("Ignore improbable SNC node count %d\n", ret); + ret = 1; + break; + } + + return ret; +} + int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; @@ -1118,6 +1182,8 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int threshold; int ret; + snc_nodes_per_l3_cache = snc_get_config(); + resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; From 7d771c32f9b871eb1e9c02c741b1c89f8c7daa1a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:19 -0700 Subject: [PATCH 67/72] x86/resctrl: Update documentation with Sub-NUMA cluster changes commit ea34999f41873c96ac89e861e5fdfc7d0403f9e3 upstream. With Sub-NUMA Cluster (SNC) mode enabled, the scope of monitoring resources is per-NODE instead of per-L3 cache. 
Backwards compatibility is maintained by providing files in the mon_L3_XX directories that sum event counts for all SNC nodes sharing an L3 cache. New files provide per-SNC node event counts. Users should be aware that SNC mode also affects the amount of L3 cache available for allocation within each SNC node. Intel-SIG: commit ea34999f4187 x86/resctrl: Update documentation with Sub-NUMA cluster changes. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-20-tony.luck@intel.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- Documentation/arch/x86/resctrl.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index 5f26c6267ac78..6f07af4a885ec 100644 --- a/Documentation/arch/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst @@ -375,6 +375,10 @@ When monitoring is enabled all MON groups will also contain: all tasks in the group. In CTRL_MON groups these files provide the sum for all tasks in the CTRL_MON group and all tasks in MON groups. Please see example section for more details on usage. + On systems with Sub-NUMA Cluster (SNC) enabled there are extra + directories for each node (located within the "mon_L3_XX" directory + for the L3 cache they occupy). These are named "mon_sub_L3_YY" + where "YY" is the node number. "mon_hw_id": Available only with debug option. The identifier used by hardware @@ -484,6 +488,29 @@ if non-contiguous 1s value is supported. On a system with a 20-bit mask each bit represents 5% of the capacity of the cache. You could partition the cache into four equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000. 
+Notes on Sub-NUMA Cluster mode +============================== +When SNC mode is enabled, Linux may load balance tasks between Sub-NUMA +nodes much more readily than between regular NUMA nodes since the CPUs +on Sub-NUMA nodes share the same L3 cache and the system may report +the NUMA distance between Sub-NUMA nodes with a lower value than used +for regular NUMA nodes. + +The top-level monitoring files in each "mon_L3_XX" directory provide +the sum of data across all SNC nodes sharing an L3 cache instance. +Users who bind tasks to the CPUs of a specific Sub-NUMA node can read +the "llc_occupancy", "mbm_total_bytes", and "mbm_local_bytes" in the +"mon_sub_L3_YY" directories to get node local data. + +Memory bandwidth allocation is still performed at the L3 cache +level. I.e. throttling controls are applied to all SNC nodes. + +L3 cache allocation bitmaps also apply to all SNC nodes. But note that +the amount of L3 cache represented by each bit is divided by the number +of SNC nodes per L3 cache. E.g. with a 100MB cache on a system with 10-bit +allocation masks each bit normally represents 10MB. With SNC mode enabled +with two SNC nodes per L3 cache, each bit only represents 5MB. + Memory bandwidth Allocation and monitoring ========================================== From 9daf82adedaaec389a0c4d006349d39c96b4bff8 Mon Sep 17 00:00:00 2001 From: Peter Newman Date: Thu, 22 Aug 2024 12:02:11 -0700 Subject: [PATCH 68/72] x86/resctrl: Fix arch_mbm_* array overrun on SNC commit a547a5880cba6f287179135381f1b484b251be31 upstream. When using resctrl on systems with Sub-NUMA Clustering enabled, monitoring groups may be allocated RMID values which would overrun the arch_mbm_{local,total} arrays. This is due to inconsistencies in whether the SNC-adjusted num_rmid value or the unadjusted value in resctrl_arch_system_num_rmid_idx() is used. 
The num_rmid value for the L3 resource is currently: resctrl_arch_system_num_rmid_idx() / snc_nodes_per_l3_cache As a simple fix, make resctrl_arch_system_num_rmid_idx() return the SNC-adjusted, L3 num_rmid value on x86. Intel-SIG: commit a547a5880cba x86/resctrl: Fix arch_mbm_* array overrun on SNC. Backporting patches for Intel RDT monitoring with SNC on Intel Xeon platform. Fixes: e13db55b5a0d ("x86/resctrl: Introduce snc_nodes_per_l3_cache") Signed-off-by: Peter Newman Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240822190212.1848788-1-peternewman@google.com [ Xiaochen Shen: amend commit log ] Signed-off-by: Xiaochen Shen --- arch/x86/include/asm/resctrl.h | 6 ------ arch/x86/kernel/cpu/resctrl/core.c | 8 ++++++++ include/linux/resctrl.h | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 12dbd2588ca7c..8b1b6ce1e51b2 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -156,12 +156,6 @@ static inline void resctrl_sched_in(struct task_struct *tsk) __resctrl_sched_in(tsk); } -static inline u32 resctrl_arch_system_num_rmid_idx(void) -{ - /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return boot_cpu_data.x86_cache_max_rmid + 1; -} - static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) { *rmid = idx; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index ac2592f19c499..ee2eaebe9daa1 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -119,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = { }, }; +u32 resctrl_arch_system_num_rmid_idx(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + /* RMID are independent numbers for x86. 
num_rmid_idx == num_rmid */ + return r->num_rmid; +} + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index b0875b99e8111..d94abba1c716e 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -248,6 +248,7 @@ struct resctrl_schema { /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); +u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); /* From 46c8e5a3401d18647d0ef868830bf9b524baec19 Mon Sep 17 00:00:00 2001 From: zengxianjun Date: Fri, 29 Nov 2024 15:52:06 +0800 Subject: [PATCH 69/72] update workflow repo --- .github/workflows/pull-request.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 13445f3661df1..bca7b1d0b852d 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -11,10 +11,10 @@ on: jobs: StaticCheck: - uses: bytedance/kernel/.github/workflows/static.yml@5.4.143-velinux + uses: mistachio/kernel/.github/workflows/static.yml@5.4.143-velinux secrets: ssh_key: ${{ secrets.SSH_KEY }} BuildAndTestKernel: - uses: bytedance/kernel/.github/workflows/build-and-test.yml@5.4.143-velinux + uses: mistachio/kernel/.github/workflows/build-and-test.yml@5.4.143-velinux secrets: ssh_key: ${{ secrets.SSH_KEY }} From 54b6c4bd1233894ca8404cef984686d11ecdc225 Mon Sep 17 00:00:00 2001 From: zengxianjun Date: Fri, 29 Nov 2024 16:02:38 +0800 Subject: [PATCH 70/72] skip check format --- .github/workflows/pull-request.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index bca7b1d0b852d..298781093ac72 100644 --- a/.github/workflows/pull-request.yml +++ 
b/.github/workflows/pull-request.yml @@ -10,10 +10,6 @@ on: - 6.6-velinux jobs: - StaticCheck: - uses: mistachio/kernel/.github/workflows/static.yml@5.4.143-velinux - secrets: - ssh_key: ${{ secrets.SSH_KEY }} BuildAndTestKernel: uses: mistachio/kernel/.github/workflows/build-and-test.yml@5.4.143-velinux secrets: From 421be2eef6f8953b0409e92865d19b18c5a9ce2e Mon Sep 17 00:00:00 2001 From: zengxianjun Date: Fri, 29 Nov 2024 17:43:23 +0800 Subject: [PATCH 71/72] retrigger --- .github/workflows/pull-request.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 298781093ac72..7d47a6df685b0 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -13,4 +13,4 @@ jobs: BuildAndTestKernel: uses: mistachio/kernel/.github/workflows/build-and-test.yml@5.4.143-velinux secrets: - ssh_key: ${{ secrets.SSH_KEY }} + ssh_key: ${{ secrets.SSH_KEY }} From 1247f228917d15d23d4acff163ce59effeb3dae7 Mon Sep 17 00:00:00 2001 From: zengxianjun Date: Fri, 29 Nov 2024 17:46:31 +0800 Subject: [PATCH 72/72] trgger again --- .github/workflows/pull-request.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 7d47a6df685b0..798059f226956 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -10,6 +10,10 @@ on: - 6.6-velinux jobs: + StaticCheck: + uses: mistachio/kernel/.github/workflows/static.yml@5.4.143-velinux + secrets: + ssh_key: ${{ secrets.SSH_KEY }} BuildAndTestKernel: uses: mistachio/kernel/.github/workflows/build-and-test.yml@5.4.143-velinux secrets: