diff --git a/README.md b/README.md index 6fc2abb1e2a2..bbd2b07645a4 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,7 @@ pair of .c and .py files, and some are directories of files. - tools/[mdflush](tools/mdflush.py): Trace md flush events. [Examples](tools/mdflush_example.txt). - tools/[mountsnoop](tools/mountsnoop.py): Trace mount and umount syscalls system-wide. [Examples](tools/mountsnoop_example.txt). - tools/[virtiostat](tools/virtiostat.py): Show VIRTIO device IO statistics. [Examples](tools/virtiostat_example.txt). +- tools/[filewriteback](tools/filewriteback.py): Summarize write IOPs/Bandwidth/times of cached files to disk as folded stack format. [Examples](tools/filewriteback_example.txt). ###### Filesystems Tools diff --git a/man/man8/filewriteback.8 b/man/man8/filewriteback.8 new file mode 100644 index 000000000000..bf1c790448b1 --- /dev/null +++ b/man/man8/filewriteback.8 @@ -0,0 +1,49 @@ +.TH filewriteback 8 "2025-09-01" "USER COMMANDS" +.SH NAME +filewriteback \- Summarize write IOPs/Bandwidth/times of cached files to disk +in folded stack format. +.SH SYNOPSIS +.B filewriteback [\-h] [duration] +.SH DESCRIPTION +This tool traces block device I/O (disk I/O) from cached files, and summarizes +the write IOPs/Bandwidth/times in folded stack format. For each cached file, we +restore the full path by the inode information. Then, the full file path with +the write IOPs/Bandwidth/times is stored in files. + +The write information counts the IOs from the block function +"blk_mq_start_request". The full path is restored by inode and the dentry in +page cache. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +duration +Trace ends when the running time equals the duration value. +.SH EXAMPLES +.TP +Summarize write IOPs/Bandwidth/times of cached files to disk in folded stack format. +# +.B filewriteback +.TP +.SH OVERHEAD +The overhead for this tool is expected to be negligible if the IOPs of the +system is relatively low (e.g., < 10K/s). For high IOPS storage systems, test +and quantify before use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Jie Xu diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py index 6fab54b78d0e..150c6cb26855 100755 --- a/tests/python/test_tools_smoke.py +++ b/tests/python/test_tools_smoke.py @@ -454,5 +454,9 @@ def test_zfsslower(self): # Fails to attach the probe if zfs is not installed. pass + @skipUnless(kernel_version_ge(4,19), "requires kernel >= 4.19") + def test_filewriteback(self): + self.run_with_int("filewriteback.py") + if __name__ == "__main__": main() diff --git a/tools/filewriteback.py b/tools/filewriteback.py new file mode 100644 index 000000000000..bea8398d43bf --- /dev/null +++ b/tools/filewriteback.py @@ -0,0 +1,322 @@ +#! /usr/bin/python3 +from time import sleep, strftime +import argparse +import os +import subprocess +import json +from requests.exceptions import RequestException +from bcc import BPF + +OUTPUT_FILES = [ + "IOflame.write.iops", + "IOflame.write.bandwidth", + "IOflame.write.times" +] +OUTPUT_DIR = "IOflamefiles" +REQ_OP_BITS = 8 +REQ_OP_MASK = ((1 << REQ_OP_BITS) - 1) + +examples = """examples: + ./filewriteback.py # get write IOPs/bandwidth/times and store in + # IOflamefiles directory + ./filewriteback.py 10 # 10 sec's duration +""" +parser = argparse.ArgumentParser( + description="""Summarize write IOPs/bandwidth/times in folded stack format. + The info will be stored in three files which are IOflame.write.iops, + IOflame.write.bandwidth, IOflame.write.times in the directory IOflamefiles. + """, + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) + +parser.add_argument("duration", nargs="?", default=60, type=int, + help="data collection duration, in seconds") +args = parser.parse_args() + +bpf_text = """ +#include +#include +#include +#include + +#ifndef containerof +#define containerof(ptr, type, member) ({ \ + const typeof(((type *)0)->member)*__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +#define FILE_NUM 40960 +#define REQUEST_NUM 40960 +#define PATH_NUM 102400 +#define MAX_DENTRY 18 + +typedef struct file_key { + dev_t dev; + unsigned long i_ino; + signed long tv_sec; + long tv_nsec; +} file_key_t; + +typedef struct file_req_key { + file_key_t req_key; + unsigned long cmd_flags; +} file_req_key_t; + +typedef struct file_path_key { + file_key_t path_key; + unsigned long depth; +} file_path_key_t; + +//If each path_part contains only one dentry, the file_full_path may be full +//quickly. We use three dentries in one path_part. +typedef struct path_part { + char path_name0[DNAME_INLINE_LEN]; + char path_name1[DNAME_INLINE_LEN]; + char path_name2[DNAME_INLINE_LEN]; +} path_part_t; + +BPF_HASH(file_exist, struct file_key, bool, FILE_NUM); +BPF_HASH(file_req, struct file_req_key, unsigned int, REQUEST_NUM); +BPF_HASH(file_full_path, struct file_path_key, struct path_part, PATH_NUM); +BPF_HASH(file_bandwidth, struct file_req_key, unsigned int, REQUEST_NUM); + +int trace_path(struct pt_regs *ctx, struct request *req) +{ + struct inode *inode_addr = req->bio->bi_io_vec->bv_page->mapping->host; + struct hlist_head identry; + bpf_probe_read_kernel(&identry, sizeof(identry), &inode_addr->i_dentry); + struct dentry *dentry_addr = containerof(identry.first, + struct dentry, + d_u.d_alias); + + //From the struct of ext4_inode_info, we find that btime address equals + //inode address plus sizes of struct inode, struct jbd2_inode *, spinlock_t + struct timespec64 *btime = (struct timespec64 *)((char *)inode_addr + + sizeof(struct inode) + 0x10); + unsigned long tv_sec = btime->tv_sec; + long tv_nsec = btime->tv_nsec; + struct file_key key = { + .dev = __DEV__, + .i_ino = inode_addr->i_ino, + .tv_sec = (inode_addr->i_ino > 0) ? tv_sec : 0, + .tv_nsec = (inode_addr->i_ino > 0) ? tv_nsec : 0 + }; + + struct file_req_key req_ins = { + .req_key = key, + .cmd_flags = req->cmd_flags + }; + + file_req.atomic_increment(req_ins); + file_bandwidth.atomic_increment(req_ins, req->__data_len/1024); + + if (!file_exist.lookup(&key)) + { + bool exist = true; + + file_exist.update(&key, &exist); + int loop = 0; + struct path_part path_ins; + struct qstr d_name; + struct dentry *de0; + struct dentry *de1; + struct dentry *de2; + __builtin_memset(&path_ins, 0, sizeof(path_ins)); + __builtin_memset(&d_name, 0, sizeof(d_name)); + + bpf_probe_read_kernel(&de0, sizeof(de0), &dentry_addr); + + #pragma clang loop unroll(full) + while (loop < MAX_DENTRY / 3) + { + bpf_probe_read_kernel(&d_name, sizeof(d_name), &de0->d_name); + bpf_probe_read_kernel(&path_ins.path_name0, + sizeof(path_ins.path_name0), + d_name.name); + + bpf_probe_read_kernel(&de1, sizeof(de1), &de0->d_parent); + bpf_probe_read_kernel(&d_name, sizeof(d_name), &de1->d_name); + bpf_probe_read_kernel(&path_ins.path_name1, + sizeof(path_ins.path_name1), + d_name.name); + + bpf_probe_read_kernel(&de2, sizeof(de2), &de1->d_parent); + bpf_probe_read_kernel(&d_name, sizeof(d_name), &de2->d_name); + bpf_probe_read_kernel(&path_ins.path_name2, + sizeof(path_ins.path_name2), + d_name.name); + struct file_path_key path_key_ins = { + .path_key = key, + .depth = loop + }; + file_full_path.update(&path_key_ins, &path_ins); + bpf_probe_read_kernel(&de0, sizeof(de0), &de2->d_parent); + if (de0 == de2) + { + break; + } + loop++; + } + } + return 0; +} + +""" + +def run_cmd(cmd): + try: + res = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + check=False + ) + if res.returncode == 0: + return True, res.stdout + else: + print("error:", res.stderr) + exit() + except subprocess.CalledProcessError as e: + print("command failed: {}".format(e)) + exit() + +def get_mount_info(): + lsblk = ["lsblk", "-o", "NAME,MAJ:MIN,MOUNTPOINT", "-J"] + success, mount = run_cmd(lsblk) + if success is True: + print("get mount info success") + return json.loads(mount)['blockdevices'] + + +mount_info = get_mount_info() + +if BPF.kernel_struct_has_field(b'block_device', b'bd_part') == 0: + bpf_text = bpf_text.replace('__DEV__', 'req->part->bd_dev') +else: + bpf_text = bpf_text.replace('__DEV__', 'req->part->__dev.devt') + +b = BPF(text=bpf_text) +if BPF.get_kprobe_functions(b'blk_mq_start_request'): + b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_path") + +partition = {} +def get_mount_entry(entry): + major, minor = entry['maj:min'].split(':') + dev = (int(major) << 20 | int(minor)) + if entry['mountpoint'] is not None: + partition[dev] = entry['name'], entry['mountpoint'] + + +for entry in mount_info: + get_mount_entry(entry) + if 'children' in entry: + for child_entry in entry['children']: + get_mount_entry(child_entry) + +# The full_path contains path components, we merge the components of the same +# inode into one flattened full path +def merge_path(full_path): + for key in full_path: + path = full_path[key] + tmpfp = list() + # To ensure the path in the right order, need to sort by the path depth + for i in sorted(path): + tmpfp = tmpfp + list(path[i]) + fp_flattened = list() + for i in range(len(tmpfp)): + if i == len(tmpfp) - 1: + fp_flattened.append(tmpfp[i]) + break + if (tmpfp[i] == "/") and (tmpfp[i + 1] == "/"): + fp_flattened.append(tmpfp[i]) + break + fp_flattened.append(tmpfp[i]) + full_path[key] = fp_flattened + + full_path_merge = {} + for key in full_path: + path = full_path[key] + s = "" + for i in range(len(path)): + if i == len(path) - 1: + if key[0] in partition: + s = partition[key[0]][1] + s + break + s = ";" + path[i] + s + full_path_merge[key] = s + return full_path_merge + +def main(): + print("Tracing file writeback... Hit Ctrl-C to end") + try: + sleep(int(args.duration)) + except KeyboardInterrupt: + print('\nkeyboardInterrupt') + b.detach_kprobe(event="blk_mq_start_request", fn_name="trace_path") + + b['file_exist'].clear() + + full_path = {} + for k, v in b.get_table("file_full_path").items(): + key = k.path_key.dev, k.path_key.i_ino,\ + k.path_key.tv_sec, k.path_key.tv_nsec + value = {} + if key in full_path: + value = full_path[key] + value[k.depth] = v.path_name0.decode('utf-8'),\ + v.path_name1.decode('utf-8'),\ + v.path_name2.decode('utf-8') + full_path[key] = value + full_path_merge = merge_path(full_path) + + if not os.path.exists(OUTPUT_DIR): + os.makedirs(OUTPUT_DIR) + os.chdir(OUTPUT_DIR) + + path_write = {} + f = open(OUTPUT_FILES[0], 'w') + for k, v in b.get_table("file_req").items(): + key = k.req_key.dev, k.req_key.i_ino,\ + k.req_key.tv_sec, k.req_key.tv_nsec + if (k.cmd_flags & REQ_OP_MASK) == 1 and key in full_path_merge: + line = full_path_merge[key] + " " + str(v.value) + path_write[key] = full_path_merge[key] + f.write(line + "\n") + f.close() + + f = open(OUTPUT_FILES[1], 'w') + for k, v in b.get_table('file_bandwidth').items(): + key = k.req_key.dev, k.req_key.i_ino,\ + k.req_key.tv_sec, k.req_key.tv_nsec + if (k.cmd_flags & REQ_OP_MASK) == 1 and key in full_path_merge: + line = full_path_merge[key] + " " + str(v.value) + f.write(line + "\n") + f.close() + + path_write_times = {} + for k, v in path_write.items(): + if k[1] <= 0: + continue + if v in path_write_times: + path_write_times[v] += 1 + else: + path_write_times[v] = 1 + + f = open(OUTPUT_FILES[2], 'w') + for k, v in path_write_times.items(): + line = k + " " + str(v) + f.write(line + "\n") + f.close() + + b['file_full_path'].clear() + b['file_req'].clear() + b['file_bandwidth'].clear() + + print("Trace ends, please analyze with the files stored in IOflamefiles.") + + exit() + + +if __name__ == '__main__': + main() diff --git a/tools/filewriteback_example.txt b/tools/filewriteback_example.txt new file mode 100644 index 000000000000..75f7c5a60da5 --- /dev/null +++ b/tools/filewriteback_example.txt @@ -0,0 +1,34 @@ +Demonstrations of filewriteback, the Linux eBPF/bcc version. + +This program traces the writeback of cached files to disk. +The data are in folded stack format and can be used to generate the flamegraph. + +How to run the script +Run ./filewriteback.py with root permissions. +The traced information will be stored in the "IOflamefiles" directory, +which contains three files, i.e., "IOflame.write.iops", +"IOflame.write.bandwidth", "IOflame.write.times". + +How to interpret the result +Each line in the file contains the write information of the file path. +E.g., the information stored in "IOflame.write.iops" of the directory +"IOflamefiles" may be like this +""" +/;opt;log;tiger;tiger.log. 1 +/;etc;system;agent;log;log 1 +/;opt;log;tiger;dird 1 +/;opt;log;tiger;dirc 1 +/;opt;tiger;log;app;dira;dirb 1 +""" +The first line means that the file path "/;opt;log;tiger;tiger.log." has "1" +iops. + +How to generate the flamegraph +It's recommended to generate the flamegraph to visualize the result. +Using the command "./flamegraph.pl IOflame.write.iops > write_iops.svg", +we can get the flamegraph. + +The meaning of the three files +(1)"IOflame.write.iops" is the write iops of the cached files to disk. +(2)"IOflame.write.bandwidth" is write bandwidth of the cached files to disk. +(3)"IOflame.write.times" is the write times of the cached files to disk.