Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
620a603
x86/resctrl: Fix remaining kernel-doc warnings
maciejwieczorretman Oct 11, 2023
bf51bba
x86/resctrl: Add multiple tasks to the resctrl group at once
babumoger Oct 17, 2023
298d1e6
x86/resctrl: Simplify rftype flag definitions
babumoger Oct 3, 2023
d649b84
x86/resctrl: Rename rftype flags for consistency
babumoger Oct 17, 2023
258532c
x86/resctrl: Unwind properly from rdt_enable_ctx()
babumoger Oct 17, 2023
ece9c5a
x86/resctrl: Move default group file creation to mount
babumoger Oct 17, 2023
4a0239b
x86/resctrl: Introduce "-o debug" mount option
babumoger Oct 17, 2023
f0d40dd
x86/resctrl: Display CLOSID for resource group
babumoger Oct 17, 2023
346cc29
x86/resctrl: Add support for the files of MON groups only
babumoger Oct 17, 2023
56b2322
x86/resctrl: Display RMID of resource group
babumoger Oct 17, 2023
b7de2a9
x86/resctrl: Fix unused variable warning in cache_alloc_hsw_probe()
aegl Nov 1, 2023
87c4545
x86/resctrl: Remove redundant variable in mbm_config_write_domain()
babumoger Jan 24, 2024
05d040e
tick/nohz: Move tick_nohz_full_mask declaration outside the #ifdef
Feb 13, 2024
bc09a62
x86/resctrl: Free rmid_ptrs from resctrl_exit()
Feb 13, 2024
25c6b7b
x86/resctrl: Create helper for RMID allocation and mondata dir creation
Feb 13, 2024
18a13cd
x86/resctrl: Move RMID allocation out of mkdir_rdt_prepare()
Feb 13, 2024
7158e4f
x86/resctrl: Track the closid with the rmid
Feb 13, 2024
d12bf22
x86/resctrl: Access per-rmid structures by index
Feb 13, 2024
de8bf8b
x86/resctrl: Allow RMID allocation to be scoped by CLOSID
Feb 13, 2024
5cdc061
x86/resctrl: Track the number of dirty RMID a CLOSID has
Feb 13, 2024
c7202f7
x86/resctrl: Use __set_bit()/__clear_bit() instead of open coding
Feb 13, 2024
add9a88
x86/resctrl: Allocate the cleanest CLOSID by searching closid_num_dir…
Feb 13, 2024
ebac54a
x86/resctrl: Move CLOSID/RMID matching and setting to use helpers
Feb 13, 2024
903bec7
x86/resctrl: Add cpumask_any_housekeeping() for limbo/overflow
Feb 13, 2024
484887a
x86/resctrl: Queue mon_event_read() instead of sending an IPI
Feb 13, 2024
e1e636d
x86/resctrl: Allow resctrl_arch_rmid_read() to sleep
Feb 13, 2024
51d9a3f
x86/resctrl: Allow arch to allocate memory needed in resctrl_arch_rmi…
Feb 13, 2024
337a1fd
x86/resctrl: Make resctrl_mounted checks explicit
Feb 13, 2024
de92da2
x86/resctrl: Move alloc/mon static keys into helpers
Feb 13, 2024
9f43c91
x86/resctrl: Make rdt_enable_key the arch's decision to switch
Feb 13, 2024
0434cd0
x86/resctrl: Add helpers for system wide mon/alloc capable
Feb 13, 2024
9148de1
x86/resctrl: Add CPU online callback for resctrl work
Feb 13, 2024
6bda055
x86/resctrl: Allow overflow/limbo handlers to be scheduled on any-but…
Feb 13, 2024
f2dfd22
x86/resctrl: Add CPU offline callback for resctrl work
Feb 13, 2024
90964f8
x86/resctrl: Move domain helper migration into resctrl_offline_cpu()
Feb 13, 2024
af5ad16
x86/resctrl: Separate arch and fs resctrl locks
Feb 13, 2024
4347095
x86/resctrl: Remove lockdep annotation that triggers false positive
Feb 21, 2024
cffda4a
Documentation/x86: Document that resctrl bandwidth control units are MiB
aegl Mar 22, 2024
aad43a8
x86/resctrl: Fix uninitialized memory read when last CPU of domain go…
rchatre Apr 1, 2024
62a3d91
x86/resctrl: Pass domain to target CPU
aegl Mar 8, 2024
4933421
x86/resctrl: Simplify call convention for MSR update functions
aegl Mar 8, 2024
c652648
x86/resctrl: Rename pseudo_lock_event.h to trace.h
hfxsp Apr 8, 2024
6b4b107
x86/resctrl: Add tracepoint for llc_occupancy tracking
hfxsp Apr 8, 2024
a543ce6
x86/resctrl: Don't try to free nonexistent RMIDs
Jun 18, 2024
51f5d0f
cpu: Move CPU hotplug function declarations into their own header
aegl Jun 10, 2024
256b1da
cpu: Drop "extern" from function declarations in cpuhplock.h
aegl Jun 10, 2024
0ab6422
cacheinfo: Add function to get cacheinfo for a given CPU and cache level
aegl Jun 10, 2024
1047bda
x86/resctrl: Replace open coded cacheinfo searches
aegl Jun 10, 2024
80248a2
x86/resctrl: Prepare for new domain scope
aegl Jun 28, 2024
2fd9dcb
x86/resctrl: Prepare to split rdt_domain structure
aegl Jun 28, 2024
c56be7d
x86/resctrl: Prepare for different scope for control/monitor operations
aegl Jun 28, 2024
ae688d6
x86/resctrl: Split the rdt_domain and rdt_hw_domain structures
aegl Jun 28, 2024
8bc2443
x86/resctrl: Add node-scope to the options for feature scope
aegl Jun 28, 2024
3ecafe3
x86/resctrl: Introduce snc_nodes_per_l3_cache
aegl Jun 28, 2024
4db2f37
x86/resctrl: Block use of mba_MBps mount option on Sub-NUMA Cluster (…
aegl Jun 28, 2024
82140d8
x86/resctrl: Prepare for new Sub-NUMA Cluster (SNC) monitor files
aegl Jun 28, 2024
f255cb8
x86/resctrl: Add a new field to struct rmid_read for summation of dom…
aegl Jun 28, 2024
54fcbe7
x86/resctrl: Initialize on-stack struct rmid_read instances
aegl Jun 28, 2024
0788d7b
x86/resctrl: Refactor mkdir_mondata_subdir() with a helper function
aegl Jun 28, 2024
cf740c5
x86/resctrl: Allocate a new field in union mon_data_bits
aegl Jun 28, 2024
e2eb6de
x86/resctrl: Create Sub-NUMA Cluster (SNC) monitor files
aegl Jun 28, 2024
9dd2b9e
x86/resctrl: Handle removing directories in Sub-NUMA Cluster (SNC) mode
aegl Jul 2, 2024
8dfac27
x86/resctrl: Fill out rmid_read structure for smp_call*() to read a c…
aegl Jun 28, 2024
b3de5ca
x86/resctrl: Make __mon_event_count() handle sum domains
aegl Jun 28, 2024
5fdb9b0
x86/resctrl: Enable shared RMID mode on Sub-NUMA Cluster (SNC) systems
aegl Jul 2, 2024
1be4215
x86/resctrl: Detect Sub-NUMA Cluster (SNC) mode
aegl Jun 28, 2024
7d771c3
x86/resctrl: Update documentation with Sub-NUMA cluster changes
aegl Jun 28, 2024
9daf82a
x86/resctrl: Fix arch_mbm_* array overrun on SNC
Aug 22, 2024
46c8e5a
update workflow repo
mistachio Nov 29, 2024
54b6c4b
skip check format
mistachio Nov 29, 2024
421be2e
retrigger
mistachio Nov 29, 2024
1247f22
trgger again
mistachio Nov 29, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ on:

jobs:
StaticCheck:
uses: bytedance/kernel/.github/workflows/static.yml@5.4.143-velinux
uses: mistachio/kernel/.github/workflows/static.yml@5.4.143-velinux
secrets:
ssh_key: ${{ secrets.SSH_KEY }}
BuildAndTestKernel:
uses: bytedance/kernel/.github/workflows/build-and-test.yml@5.4.143-velinux
uses: mistachio/kernel/.github/workflows/build-and-test.yml@5.4.143-velinux
secrets:
ssh_key: ${{ secrets.SSH_KEY }}
ssh_key: ${{ secrets.SSH_KEY }}
63 changes: 57 additions & 6 deletions Documentation/arch/x86/resctrl.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ about the feature from resctrl's info directory.

To use the feature mount the file system::

# mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps]] /sys/fs/resctrl
# mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps][,debug]] /sys/fs/resctrl

mount options are:

Expand All @@ -45,7 +45,10 @@ mount options are:
Enable code/data prioritization in L2 cache allocations.
"mba_MBps":
Enable the MBA Software Controller(mba_sc) to specify MBA
bandwidth in MBps
bandwidth in MiBps
"debug":
Make debug files accessible. Available debug files are annotated with
"Available only with debug option".

L2 and L3 CDP are controlled separately.

Expand Down Expand Up @@ -306,7 +309,14 @@ All groups contain the following files:
"tasks":
Reading this file shows the list of all tasks that belong to
this group. Writing a task id to the file will add a task to the
group. If the group is a CTRL_MON group the task is removed from
group. Multiple tasks can be added by separating the task ids
with commas. Tasks will be assigned sequentially. Multiple
failures are not supported. A single failure encountered while
attempting to assign a task will cause the operation to abort, and
tasks added before the failure will remain in the group.
Failures will be logged to /sys/fs/resctrl/info/last_cmd_status.

If the group is a CTRL_MON group the task is removed from
whichever previous CTRL_MON group owned the task and also from
any MON group that owned the task. If the group is a MON group,
then the task must already belong to the CTRL_MON parent of this
Expand Down Expand Up @@ -349,6 +359,10 @@ When control is enabled all CTRL_MON groups will also contain:
file. On successful pseudo-locked region creation the mode will
automatically change to "pseudo-locked".

"ctrl_hw_id":
Available only with debug option. The identifier used by hardware
for the control group. On x86 this is the CLOSID.

When monitoring is enabled all MON groups will also contain:

"mon_data":
Expand All @@ -361,6 +375,14 @@ When monitoring is enabled all MON groups will also contain:
all tasks in the group. In CTRL_MON groups these files provide
the sum for all tasks in the CTRL_MON group and all tasks in
MON groups. Please see example section for more details on usage.
On systems with Sub-NUMA Cluster (SNC) enabled there are extra
directories for each node (located within the "mon_L3_XX" directory
for the L3 cache they occupy). These are named "mon_sub_L3_YY"
where "YY" is the node number.

"mon_hw_id":
Available only with debug option. The identifier used by hardware
for the monitor group. On x86 this is the RMID.

Resource allocation rules
-------------------------
Expand Down Expand Up @@ -428,6 +450,12 @@ during mkdir.
max_threshold_occupancy is a user configurable value to determine the
occupancy at which an RMID can be freed.

The mon_llc_occupancy_limbo tracepoint gives the precise occupancy in bytes
for a subset of RMIDs that are not immediately available for allocation.
This can't be relied on to produce output every second; it may be necessary
to attempt to create an empty monitor group to force an update. Output may
only be produced if creation of a control or monitor group fails.

Schemata files - general concepts
---------------------------------
Each line in the file describes one resource. The line starts with
Expand Down Expand Up @@ -460,6 +488,29 @@ if non-contiguous 1s value is supported. On a system with a 20-bit mask
each bit represents 5% of the capacity of the cache. You could partition
the cache into four equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000.

Notes on Sub-NUMA Cluster mode
==============================
When SNC mode is enabled, Linux may load balance tasks between Sub-NUMA
nodes much more readily than between regular NUMA nodes since the CPUs
on Sub-NUMA nodes share the same L3 cache and the system may report
the NUMA distance between Sub-NUMA nodes with a lower value than used
for regular NUMA nodes.

The top-level monitoring files in each "mon_L3_XX" directory provide
the sum of data across all SNC nodes sharing an L3 cache instance.
Users who bind tasks to the CPUs of a specific Sub-NUMA node can read
the "llc_occupancy", "mbm_total_bytes", and "mbm_local_bytes" in the
"mon_sub_L3_YY" directories to get node local data.

Memory bandwidth allocation is still performed at the L3 cache
level. I.e. throttling controls are applied to all SNC nodes.

L3 cache allocation bitmaps also apply to all SNC nodes. But note that
the amount of L3 cache represented by each bit is divided by the number
of SNC nodes per L3 cache. E.g. with a 100MB cache on a system with 10-bit
allocation masks each bit normally represents 10MB. With SNC mode enabled
with two SNC nodes per L3 cache, each bit only represents 5MB.

Memory bandwidth Allocation and monitoring
==========================================

Expand Down Expand Up @@ -508,7 +559,7 @@ threads start using more cores in an rdtgroup, the actual bandwidth may
increase or vary although the user-specified bandwidth percentage is the same.

In order to mitigate this and make the interface more user friendly,
resctrl added support for specifying the bandwidth in MBps as well. The
resctrl added support for specifying the bandwidth in MiBps as well. The
kernel underneath would use a software feedback mechanism or a "Software
Controller(mba_sc)" which reads the actual bandwidth using MBM counters
and adjusts the memory bandwidth percentages to ensure::
Expand Down Expand Up @@ -555,13 +606,13 @@ Memory b/w domain is L3 cache.

MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;...

Memory bandwidth Allocation specified in MBps
Memory bandwidth Allocation specified in MiBps
---------------------------------------------

Memory bandwidth domain is L3 cache.
::

MB:<cache_id0>=bw_MBps0;<cache_id1>=bw_MBps1;...
MB:<cache_id0>=bw_MiBps0;<cache_id1>=bw_MiBps1;...

Slow Memory Bandwidth Allocation (SMBA)
---------------------------------------
Expand Down
1 change: 1 addition & 0 deletions arch/x86/include/asm/msr-index.h
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,7 @@
#define MSR_IA32_QM_CTR 0xc8e
#define MSR_IA32_PQR_ASSOC 0xc8f
#define MSR_IA32_L3_CBM_BASE 0xc90
#define MSR_RMID_SNC_CONFIG 0xca0
#define MSR_IA32_L2_CBM_BASE 0xd10
#define MSR_IA32_MBA_THRTL_BASE 0xd50

Expand Down
84 changes: 84 additions & 0 deletions arch/x86/include/asm/resctrl.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
#include <linux/sched.h>
#include <linux/jump_label.h>

/*
 * Sentinel CLOSID. This value can never be a valid CLOSID, and is used when
 * mapping a (closid, rmid) pair to an index and back. On x86 only the RMID
 * is needed, so resctrl_arch_rmid_idx_decode() hands this value back as the
 * CLOSID. The index is a software defined value.
 *
 * (u32)~0 converts the all-ones int to u32 (assuming u32 is an unsigned
 * type, as its name suggests).
 */
#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0)

/**
* struct resctrl_pqr_state - State cache for the PQR MSR
* @cur_rmid: The cached Resource Monitoring ID
Expand All @@ -31,10 +38,47 @@ struct resctrl_pqr_state {

DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);

/* Capability flags returned by resctrl_arch_{alloc,mon}_capable(). */
extern bool rdt_alloc_capable;
extern bool rdt_mon_capable;

/*
 * rdt_enable_key is incremented/decremented by every
 * resctrl_arch_{enable,disable}_{alloc,mon}() call and is the key tested by
 * resctrl_sched_in(). The alloc/mon keys are switched on or off by their
 * matching enable/disable helper.
 */
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);

/**
 * resctrl_arch_alloc_capable() - Report the allocation capability flag
 *
 * Return: the current value of rdt_alloc_capable.
 */
static inline bool resctrl_arch_alloc_capable(void)
{
	return rdt_alloc_capable;
}

/**
 * resctrl_arch_enable_alloc() - Switch on the allocation static keys
 *
 * Enables rdt_alloc_enable_key, then takes a count on the shared
 * rdt_enable_key. Both calls use the _cpuslocked variants.
 * NOTE(review): presumably the caller already holds the CPU hotplug
 * lock - confirm against the call sites.
 */
static inline void resctrl_arch_enable_alloc(void)
{
	static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
	static_branch_inc_cpuslocked(&rdt_enable_key);
}

/**
 * resctrl_arch_disable_alloc() - Switch off the allocation static keys
 *
 * Disables rdt_alloc_enable_key, then drops one count on the shared
 * rdt_enable_key. Both calls use the _cpuslocked variants.
 * NOTE(review): presumably the caller already holds the CPU hotplug
 * lock - confirm against the call sites.
 */
static inline void resctrl_arch_disable_alloc(void)
{
	static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
	static_branch_dec_cpuslocked(&rdt_enable_key);
}

/**
 * resctrl_arch_mon_capable() - Report the monitoring capability flag
 *
 * Return: the current value of rdt_mon_capable.
 */
static inline bool resctrl_arch_mon_capable(void)
{
	return rdt_mon_capable;
}

/**
 * resctrl_arch_enable_mon() - Switch on the monitoring static keys
 *
 * Enables rdt_mon_enable_key, then takes a count on the shared
 * rdt_enable_key. Both calls use the _cpuslocked variants.
 * NOTE(review): presumably the caller already holds the CPU hotplug
 * lock - confirm against the call sites.
 */
static inline void resctrl_arch_enable_mon(void)
{
	static_branch_enable_cpuslocked(&rdt_mon_enable_key);
	static_branch_inc_cpuslocked(&rdt_enable_key);
}

/**
 * resctrl_arch_disable_mon() - Switch off the monitoring static keys
 *
 * Disables rdt_mon_enable_key, then drops one count on the shared
 * rdt_enable_key. Both calls use the _cpuslocked variants.
 * NOTE(review): presumably the caller already holds the CPU hotplug
 * lock - confirm against the call sites.
 */
static inline void resctrl_arch_disable_mon(void)
{
	static_branch_disable_cpuslocked(&rdt_mon_enable_key);
	static_branch_dec_cpuslocked(&rdt_enable_key);
}

/*
* __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
*
Expand Down Expand Up @@ -88,12 +132,52 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
return val * scale;
}

/**
 * resctrl_arch_set_closid_rmid() - Store a CLOSID/RMID pair in a task
 * @tsk:	Task to update
 * @closid:	Value stored in @tsk->closid
 * @rmid:	Value stored in @tsk->rmid
 *
 * Each field is written with WRITE_ONCE(), the CLOSID first and the RMID
 * second.
 */
static inline void
resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
{
	WRITE_ONCE(tsk->closid, closid);
	WRITE_ONCE(tsk->rmid, rmid);
}

/**
 * resctrl_arch_match_closid() - Test whether a task carries a given CLOSID
 * @tsk:	Task to inspect
 * @closid:	CLOSID to compare against
 *
 * Return: true when a READ_ONCE() of @tsk->closid equals @closid.
 */
static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
{
	return closid == READ_ONCE(tsk->closid);
}

/**
 * resctrl_arch_match_rmid() - Test whether a task carries a given RMID
 * @tsk:	Task to inspect
 * @ignored:	Not used by this implementation
 * @rmid:	RMID to compare against
 *
 * Return: true when a READ_ONCE() of @tsk->rmid equals @rmid.
 */
static inline bool
resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored, u32 rmid)
{
	return rmid == READ_ONCE(tsk->rmid);
}

/**
 * resctrl_sched_in() - Call __resctrl_sched_in() when rdt_enable_key is on
 * @tsk:	Task handed through to __resctrl_sched_in()
 *
 * Nothing is done while the rdt_enable_key static branch is disabled.
 */
static inline void resctrl_sched_in(struct task_struct *tsk)
{
	if (static_branch_likely(&rdt_enable_key))
		__resctrl_sched_in(tsk);
}

/**
 * resctrl_arch_rmid_idx_decode() - Split an index into a (closid, rmid) pair
 * @idx:	Index to decode
 * @closid:	Output; always set to X86_RESCTRL_EMPTY_CLOSID
 * @rmid:	Output; set to @idx unchanged
 *
 * The RMID is stored first and the CLOSID second.
 */
static inline void
resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
{
	*rmid = idx;
	*closid = X86_RESCTRL_EMPTY_CLOSID;
}

/**
 * resctrl_arch_rmid_idx_encode() - Build an index from a (closid, rmid) pair
 * @ignored:	CLOSID slot; not used by this implementation
 * @rmid:	RMID to encode
 *
 * Return: @rmid unchanged.
 */
static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid)
{
	return rmid;
}

/* x86 can always read an rmid, nothing needs allocating */
struct rdt_resource;

/**
 * resctrl_arch_mon_ctx_alloc() - Allocate a monitor context (none on x86)
 * @r:		Resource the context would belong to; unused here
 * @evtid:	Event id the context would be for; unused here
 *
 * Nothing is allocated. might_sleep() is still called.
 * NOTE(review): presumably this keeps callers checked as if an allocating
 * implementation could sleep - confirm.
 *
 * Return: always NULL.
 */
static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid)
{
	might_sleep();
	return NULL;
}

/**
 * resctrl_arch_mon_ctx_free() - Release a monitor context (no-op on x86)
 * @r:		Resource the context belongs to; unused here
 * @evtid:	Event id the context was for; unused here
 * @ctx:	Context to release; not touched
 *
 * Intentionally empty: the body performs no work.
 */
static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid,
					     void *ctx)
{
}

void resctrl_cpu_detect(struct cpuinfo_x86 *c);

#else
Expand Down
Loading