diff --git a/0000-RFC-kernel-add-a-netlink-interface-to-get-informatio.patch b/0000-RFC-kernel-add-a-netlink-interface-to-get-informatio.patch new file mode 100644 index 00000000000000..833c1afd041faf --- /dev/null +++ b/0000-RFC-kernel-add-a-netlink-interface-to-get-informatio.patch @@ -0,0 +1,265 @@ +From 29e6df3db77234a44a680344a61eb5bd735f6d8e Mon Sep 17 00:00:00 2001 +From: Andrey Vagin +Date: Mon, 16 Feb 2015 19:20:52 +0300 +Subject: [PATCH 0/15] task_diag: add a new interface to get information + about processes (v3) + +The current interface is a bunch of files in /proc/PID. While this appears to +be simple, there are a number of problems with it. + +* Lots of syscalls + + At least three syscalls per PID are required — open(), read(), and + close() + +* Variety of formats + + There are many different formats used by files in the /proc/PID/ hierarchy. + Therefore, there is a need to write a parser for each such format. + +* Non-extendable formats + + Some formats in /proc/PID are non-extendable. For example, the /proc/PID/maps + last column (file name) is optional, therefore there is no way to add more + columns without breaking the format. + +* Slow read due to extra info + Sometimes getting information is slow due to extra attributes that are not + always needed. For example, /proc/PID/smaps contains the VmFlags field (which + can't be added to /proc/PID/maps, see previous item), but it also contains + page stats that take a long time to generate.
+ + $ time cat /proc/*/maps > /dev/null + real 0m0.061s + user 0m0.002s + sys 0m0.059s + + + $ time cat /proc/*/smaps > /dev/null + real 0m0.253s + user 0m0.004s + sys 0m0.247s + +Proposed solution +----------------- + +The proposed solution is the /proc/task_diag file, which operates based on the +following principles: + +* Transactional: write request, read response +* Netlink message format (same as used by sock_diag; binary and extendable) +* Ability to specify a set of processes to get info about +* Optimal grouping of attributes + Any attribute in a group can't affect the response time + +The user-kernel interface is encapsulated in include/uapi/linux/task_diag.h + +A request is described by the task_diag_pid structure: + +struct task_diag_pid { + __u64 show_flags; /* specify which information is required */ + __u64 dump_strategy; /* specify a group of processes */ + + __u32 pid; +}; + +dump_strategy specifies a group of processes: +/* system wide strategies (the pid field is ignored) */ +TASK_DIAG_DUMP_ALL - all processes +TASK_DIAG_DUMP_ALL_THREAD - all threads +/* per-process strategies */ +TASK_DIAG_DUMP_CHILDREN - all children +TASK_DIAG_DUMP_THREAD - all threads +TASK_DIAG_DUMP_ONE - one process + +show_flags specifies which information is required. If we set the +TASK_DIAG_SHOW_BASE flag, the response message will contain the TASK_DIAG_BASE +attribute which is described by the task_diag_base structure. + +struct task_diag_base { + __u32 tgid; + __u32 pid; + __u32 ppid; + __u32 tpid; + __u32 sid; + __u32 pgid; + __u8 state; + char comm[TASK_DIAG_COMM_LEN]; +}; + +In the future, it can be extended by optional attributes. The request describes +which task properties are required and for which processes they are +required. + +A response can be divided into a few netlink packets. Each task is described +by a netlink message.
If all information about a process doesn't fit into a +message, the TASK_DIAG_FLAG_CONT flag will be set and the next message will +continue describing the same process. + +The task diag is much faster than the proc file system. We don't need to create +a new file descriptor for each task. We need to send a request and get a +response. It allows to get information for a few tasks for one request-response +iteration. + +As for security, task_diag always works as procfs with hidepid = 2 (highest +level of security). + +I have compared performance of procfs and task-diag for the +"ps ax -o pid,ppid" command. + +ps uses /proc/PID/* files: +$ time ./ps/pscommand ax | wc -l +50089 + +real 0m1.596s +user 0m0.475s +sys 0m1.126s + +ps uses the task_diag interface +$ time ./ps/pscommand ax | wc -l +50089 + +real 0m0.148s +user 0m0.069s +sys 0m0.086s + +Read /proc/PID/stat for 30K tasks: +$ time ./task_proc_all > /dev/null + +real 0m0.258s +user 0m0.019s +sys 0m0.232s + +Get the same information via task_diag: +$ time ./task_diag_all > /dev/null + +real 0m0.052s +user 0m0.013s +sys 0m0.036s + +And here are statistics on syscalls which were called by each +command. 
+ +$ perf trace -s -o log -- ./task_proc_all > /dev/null + + Summary of events: + + task_proc_all (30781), 180785 events, 100.0%, 0.000 msec + + syscall calls min avg max stddev + (msec) (msec) (msec) (%) + --------------- -------- --------- --------- --------- ------ + read 30111 0.000 0.013 0.107 0.21% + write 1 0.008 0.008 0.008 0.00% + open 30111 0.007 0.012 0.145 0.24% + close 30112 0.004 0.011 0.110 0.20% + fstat 3 0.009 0.013 0.016 16.15% + mmap 8 0.011 0.020 0.027 11.24% + mprotect 4 0.019 0.023 0.028 8.33% + munmap 1 0.026 0.026 0.026 0.00% + brk 8 0.007 0.015 0.024 11.94% + ioctl 1 0.007 0.007 0.007 0.00% + access 1 0.019 0.019 0.019 0.00% + execve 1 0.000 0.000 0.000 0.00% + getdents 29 0.008 1.010 2.215 8.88% + arch_prctl 1 0.016 0.016 0.016 0.00% + openat 1 0.021 0.021 0.021 0.00% + + +$ perf trace -s -o log -- ./task_diag_all > /dev/null + Summary of events: + + task_diag_all (30762), 717 events, 98.9%, 0.000 msec + + syscall calls min avg max stddev + (msec) (msec) (msec) (%) + --------------- -------- --------- --------- --------- ------ + read 2 0.000 0.008 0.016 100.00% + write 197 0.008 0.019 0.041 3.00% + open 2 0.023 0.029 0.036 22.45% + close 3 0.010 0.012 0.014 11.34% + fstat 3 0.012 0.044 0.106 70.52% + mmap 8 0.014 0.031 0.054 18.88% + mprotect 4 0.016 0.023 0.027 10.93% + munmap 1 0.022 0.022 0.022 0.00% + brk 1 0.040 0.040 0.040 0.00% + ioctl 1 0.011 0.011 0.011 0.00% + access 1 0.032 0.032 0.032 0.00% + getpid 1 0.012 0.012 0.012 0.00% + socket 1 0.032 0.032 0.032 0.00% + sendto 2 0.032 0.095 0.157 65.77% + recvfrom 129 0.009 0.235 0.418 2.45% + bind 1 0.018 0.018 0.018 0.00% + execve 1 0.000 0.000 0.000 0.00% + arch_prctl 1 0.012 0.012 0.012 0.00% + +You can find the test programs from this experiment in tools/test/selftest/task_diag. + +The idea of this functionality was suggested by Pavel Emelyanov (xemul@), +when he found that operations with /proc forms a significant part +of a checkpointing time. 
+ +Ten years ago there was an attempt to add a netlink interface to access /proc +information: +http://lwn.net/Articles/99600/ + +Links +----- + +kernel: https://github.com/avagin/linux-task-diag +procps: https://github.com/avagin/procps-task-diag +wiki: https://criu.org/Task-diag + +Changes from the first version: +------------------------------- + +David Ahern implemented all required functionality to use task_diag in +perf. + +Below you can find his results on how it affects performance. +> Using the fork test command: +> 10,000 processes; 10k proc with 5 threads = 50,000 tasks +> reading /proc: 11.3 sec +> task_diag: 2.2 sec +> +> @7,440 tasks, reading /proc is at 0.77 sec and task_diag at 0.096 +> +> 128 instances of sepcjbb, 80,000+ tasks: +> reading /proc: 32.1 sec +> task_diag: 3.9 sec +> +> So overall much snappier startup times. + +Many thanks to David Ahern for the help with improving task_diag. + +Changes from the second version: +-------------------------------- + +Use a proc transaction file instead of the netlink interface. +Andy Lutomirski pointed out security problems related to netlink sockets: + +> Slightly off-topic, but this netlink is really rather bad as an +> example of how fds can be used as capabilities (in the real capability +> sense, not the Linux capabilities sense). You call socket and get a +> socket. That socket captures f_cred. Then you drop privs, and you +> assume that the socket you're holding on to retains the right to do +> certain things. +> +> This breaks pretty badly when, through things such as this patch set, +> existing code that creates netlink sockets suddenly starts capturing +> brand-new rights that didn't exist as part of a netlink socket before.
+ +Cc: Oleg Nesterov +Cc: Andrew Morton +Cc: Cyrill Gorcunov +Cc: Pavel Emelyanov +Cc: Roger Luethi +Cc: Arnd Bergmann +Cc: Arnaldo Carvalho de Melo +Cc: David Ahern +Cc: Andy Lutomirski +Cc: Pavel Odintsov +Signed-off-by: Andrey Vagin +-- +2.1.0 + diff --git a/Documentation/accounting/task_diag.txt b/Documentation/accounting/task_diag.txt new file mode 100644 index 00000000000000..ff486b91dc382a --- /dev/null +++ b/Documentation/accounting/task_diag.txt @@ -0,0 +1,57 @@ +The task-diag interface allows getting information about running processes +(roughly the same info that is now available from /proc/PID/* files). Compared to +/proc/PID/* files, it is faster, more flexible and provides data in a binary +format. Task-diag was created using the basic idea of sock_diag. + +Interface +--------- + +The interface is the /proc/task-diag file, which operates based on the following +principles: + +* Transactional: write request, read response +* Netlink message format (same as used by sock_diag; binary and extendable) + +The user-kernel interface is encapsulated in include/uapi/linux/task_diag.h + +Request +------- + +A request is described by the task_diag_pid structure. + +struct task_diag_pid { + __u64 show_flags; /* TASK_DIAG_SHOW_* */ + __u64 dump_strategy; /* TASK_DIAG_DUMP_* */ + + __u32 pid; +}; + +dump_strategy specifies a group of processes: +/* per-process strategies */ +TASK_DIAG_DUMP_CHILDREN - all children +TASK_DIAG_DUMP_THREAD - all threads +TASK_DIAG_DUMP_ONE - one process +/* system wide strategies (the pid field is ignored) */ +TASK_DIAG_DUMP_ALL - all processes +TASK_DIAG_DUMP_ALL_THREAD - all threads + +show_flags specifies which information is required. If we set the +TASK_DIAG_SHOW_BASE flag, the response message will contain the TASK_DIAG_BASE +attribute which is described by the task_diag_base structure. + +In the future, it can be extended by optional attributes.
The request describes +which task properties are required and for which processes they are required +for. + +Response +-------- + +A response can be divided into a few packets. Each task is described by a +netlink message. If all information about a process doesn't fit into a message, +the TASK_DIAG_FLAG_CONT flag will be set and the next message will continue +describing the same process. + +Examples +-------- + +A few examples can be found in tools/testing/selftests/task_diag/ diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 817c02b13b1d54..a5ece4874607ad 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -97,3 +97,16 @@ config PROC_CHILDREN Say Y if you are running any user-space software which takes benefit from this interface. For example, rkt is such a piece of software. + +config TASK_DIAG + bool "Task-diag support (/proc/task-diag)" + depends on NET + default n + help + Export selected properties for tasks/processes through the /proc/task-diag + transaction file. Unlike the proc file system, task_diag returns + information in a binary format (netlink) and allows to specify which + properties are required. + + Say N if unsure. 
+ diff --git a/fs/proc/Makefile b/fs/proc/Makefile index ead487e8051087..d8ecc1f65a3519 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -33,3 +33,6 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o + +obj-$(CONFIG_TASK_DIAG) += task_diag.o + diff --git a/fs/proc/array.c b/fs/proc/array.c index 0ceb3b6b37e731..9e4390866221ca 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -651,31 +651,25 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, } #ifdef CONFIG_PROC_CHILDREN -static struct pid * -get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) +struct task_struct *task_next_child(struct task_struct *parent, + struct task_struct *prev, loff_t pos) { - struct task_struct *start, *task; - struct pid *pid = NULL; - - read_lock(&tasklist_lock); - - start = pid_task(proc_pid(inode), PIDTYPE_PID); - if (!start) - goto out; + struct task_struct *task; /* * Lets try to continue searching first, this gives * us significant speedup on children-rich processes. */ - if (pid_prev) { - task = pid_task(pid_prev, PIDTYPE_PID); - if (task && task->real_parent == start && + if (prev) { + task = prev; + if (task && task->real_parent == parent && !(list_empty(&task->sibling))) { - if (list_is_last(&task->sibling, &start->children)) + if (list_is_last(&task->sibling, &parent->children)) { + task = NULL; goto out; + } task = list_first_entry(&task->sibling, struct task_struct, sibling); - pid = get_pid(task_pid(task)); goto out; } } @@ -695,12 +689,31 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) * So one need to stop or freeze the leader and all * its children to get a precise result. 
*/ - list_for_each_entry(task, &start->children, sibling) { - if (pos-- == 0) { - pid = get_pid(task_pid(task)); - break; - } + list_for_each_entry(task, &parent->children, sibling) { + if (pos-- == 0) + goto out; } + task = NULL; +out: + return task; +} + +static struct pid * +get_children_pid(struct inode *inode, struct pid *prev_pid, loff_t pos) +{ + struct task_struct *start, *task, *prev; + struct pid *pid = NULL; + + read_lock(&tasklist_lock); + start = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!start) + goto out; + + prev = prev_pid ? pid_task(prev_pid, PIDTYPE_PID) : NULL; + + task = task_next_child(start, prev, pos); + if (task) + pid = get_pid(task_pid(task)); out: read_unlock(&tasklist_lock); diff --git a/fs/proc/base.c b/fs/proc/base.c index 7e9f07bf260d20..8278e93ca2b45d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3169,11 +3169,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign * Find the first task with tgid >= tgid * */ -struct tgid_iter { - unsigned int tgid; - struct task_struct *task; -}; -static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) +struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) { struct pid *pid; @@ -3476,7 +3472,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry * In the case of a seek we start with the leader and walk nr * threads past it. */ -static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, +struct task_struct *task_first_tid(struct pid *pid, int tid, loff_t f_pos, struct pid_namespace *ns) { struct task_struct *pos, *task; @@ -3525,7 +3521,7 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, * * The reference to the input task_struct is released. 
*/ -static struct task_struct *next_tid(struct task_struct *start) +struct task_struct *task_next_tid(struct task_struct *start) { struct task_struct *pos = NULL; rcu_read_lock(); @@ -3561,9 +3557,9 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) ns = proc_pid_ns(inode); tid = (int)file->f_version; file->f_version = 0; - for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); + for (task = task_first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; - task = next_tid(task), ctx->pos++) { + task = task_next_tid(task), ctx->pos++) { char name[10 + 1]; unsigned int len; tid = task_pid_nr_ns(task, ns); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5185d7f6a51ee8..9851bea31ece3b 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -304,3 +304,40 @@ extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); + +struct tgid_iter { + unsigned int tgid; + struct task_struct *task; +}; +struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter); + +struct task_struct *task_next_child(struct task_struct *parent, + struct task_struct *prev, loff_t pos); +struct task_struct *task_first_tid(struct pid *pid, int tid, loff_t f_pos, + struct pid_namespace *ns); +struct task_struct *task_next_tid(struct task_struct *start); + +struct mem_size_stats { + bool first; + unsigned long resident; + unsigned long shared_clean; + unsigned long shared_dirty; + unsigned long private_clean; + unsigned long private_dirty; + unsigned long referenced; + unsigned long anonymous; + unsigned long lazyfree; + unsigned long anonymous_thp; + unsigned long shmem_thp; + unsigned long swap; + unsigned long shared_hugetlb; + unsigned long private_hugetlb; + u64 pss; + u64 pss_locked; + u64 swap_pss; + bool check_shmem_swap; +}; + +struct mm_walk; +int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long 
end,
+		    struct mm_walk *walk);
diff --git a/fs/proc/task_diag.c b/fs/proc/task_diag.c
new file mode 100644
index 00000000000000..5269906a447cf7
--- /dev/null
+++ b/fs/proc/task_diag.c
@@ -0,0 +1,1062 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "internal.h"
+
+struct task_diag_cb {
+	struct sk_buff *req;
+	struct sk_buff *resp;
+	const struct nlmsghdr *nlh;
+	loff_t pos;
+	pid_t pid;
+	int attr;
+	union { /* per-attribute */
+		struct {
+			unsigned long mark;
+		} vma;
+	};
+};
+
+/*
+ * The task state array is a strange "bitmap" of
+ * reasons to sleep. Thus "running" is zero, and
+ * you can test for combinations of others with
+ * simple bit tests.
+ */
+static const __u8 task_state_array[] = {
+	TASK_DIAG_RUNNING,
+	TASK_DIAG_INTERRUPTIBLE,
+	TASK_DIAG_UNINTERRUPTIBLE,
+	TASK_DIAG_STOPPED,
+	TASK_DIAG_TRACE_STOP,
+	TASK_DIAG_DEAD,
+	TASK_DIAG_ZOMBIE,
+	TASK_DIAG_PARKED,
+	TASK_DIAG_IDLE,
+};
+
+/*
+ * Map the state of @tsk to its TASK_DIAG_* code through task_state_array.
+ * The value is returned by copy, so the return type carries no qualifier.
+ */
+static inline __u8 get_task_state(struct task_struct *tsk)
+{
+	BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
+
+	return task_state_array[task_state_index(tsk)];
+}
+
+/*
+ * Append a TASK_DIAG_BASE attribute describing @p to @skb; the ids are
+ * translated into @ns. Returns 0, or -EMSGSIZE if @skb has no room left.
+ */
+static int fill_task_base(struct task_struct *p,
+			  struct sk_buff *skb, struct pid_namespace *ns)
+{
+	struct task_diag_base *base;
+	struct nlattr *attr;
+	char tcomm[sizeof(p->comm)];
+	struct task_struct *tracer;
+
+	attr = nla_reserve(skb, TASK_DIAG_BASE, sizeof(struct task_diag_base));
+	if (!attr)
+		return -EMSGSIZE;
+
+	base = nla_data(attr);
+
+	/*
+	 * The reserved payload is not cleared for us. Zero all of it so that
+	 * no stale bytes (e.g. padding between members) are exported, and so
+	 * that tpid and the tail of comm default to 0.
+	 */
+	memset(base, 0, sizeof(*base));
+
+	rcu_read_lock();
+	base->ppid = pid_alive(p) ?
+		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+
+	tracer = ptrace_parent(p);
+	if (tracer)
+		base->tpid = task_pid_nr_ns(tracer, ns);
+
+	base->tgid = task_tgid_nr_ns(p, ns);
+	base->pid = task_pid_nr_ns(p, ns);
+	base->sid = task_session_nr_ns(p, ns);
+	base->pgid = task_pgrp_nr_ns(p, ns);
+
+	rcu_read_unlock();
+
+	get_task_comm(tcomm, p);
+	strncpy(base->comm, tcomm, TASK_DIAG_COMM_LEN);
+
+	base->state = get_task_state(p);
+
+	return 0;
+}
+
+static inline void caps2diag(struct task_diag_caps *diag, const kernel_cap_t *cap)
+{
+	int i;
+
+	for (i = 0; i < _LINUX_CAPABILITY_U32S_3; i++)
+		diag->cap[i] = cap->cap[i];
+}
+
+/*
+ * Append a TASK_DIAG_CRED attribute with the capability sets and the
+ * uids/gids of @p as seen from @user_ns. Returns 0, or -EMSGSIZE if @skb
+ * has no room left.
+ */
+static int fill_creds(struct task_struct *p, struct sk_buff *skb,
+		      struct user_namespace *user_ns)
+{
+	struct task_diag_creds *diag_cred;
+	const struct cred *cred;
+	struct nlattr *attr;
+
+	attr = nla_reserve(skb, TASK_DIAG_CRED, sizeof(struct task_diag_creds));
+	if (!attr)
+		return -EMSGSIZE;
+
+	diag_cred = nla_data(attr);
+
+	/*
+	 * Same precaution as in fill_task_base(): clear the payload so that
+	 * anything not assigned below is exported as 0.
+	 * NOTE(review): the layout of struct task_diag_creds is not visible
+	 * here; confirm whether it has padding or further members.
+	 */
+	memset(diag_cred, 0, sizeof(*diag_cred));
+
+	cred = get_task_cred(p);
+
+	caps2diag(&diag_cred->cap_inheritable, &cred->cap_inheritable);
+	caps2diag(&diag_cred->cap_permitted, &cred->cap_permitted);
+	caps2diag(&diag_cred->cap_effective, &cred->cap_effective);
+	caps2diag(&diag_cred->cap_bset, &cred->cap_bset);
+
+	diag_cred->uid = from_kuid_munged(user_ns, cred->uid);
+	diag_cred->euid = from_kuid_munged(user_ns, cred->euid);
+	diag_cred->suid = from_kuid_munged(user_ns, cred->suid);
+	diag_cred->fsuid = from_kuid_munged(user_ns, cred->fsuid);
+	diag_cred->gid = from_kgid_munged(user_ns, cred->gid);
+	diag_cred->egid = from_kgid_munged(user_ns, cred->egid);
+	diag_cred->sgid = from_kgid_munged(user_ns, cred->sgid);
+	diag_cred->fsgid = from_kgid_munged(user_ns, cred->fsgid);
+
+	put_cred(cred);
+
+	return 0;
+}
+
+static u64 get_vma_flags(struct vm_area_struct *vma)
+{
+	u64 flags = 0;
+
+	static const u64 mnemonics[BITS_PER_LONG] = {
+		/*
+		 * In case if we meet a flag we don't
know about. + */ + [0 ... (BITS_PER_LONG-1)] = 0, + + [ilog2(VM_READ)] = TASK_DIAG_VMA_F_READ, + [ilog2(VM_WRITE)] = TASK_DIAG_VMA_F_WRITE, + [ilog2(VM_EXEC)] = TASK_DIAG_VMA_F_EXEC, + [ilog2(VM_SHARED)] = TASK_DIAG_VMA_F_SHARED, + [ilog2(VM_MAYREAD)] = TASK_DIAG_VMA_F_MAYREAD, + [ilog2(VM_MAYWRITE)] = TASK_DIAG_VMA_F_MAYWRITE, + [ilog2(VM_MAYEXEC)] = TASK_DIAG_VMA_F_MAYEXEC, + [ilog2(VM_MAYSHARE)] = TASK_DIAG_VMA_F_MAYSHARE, + [ilog2(VM_GROWSDOWN)] = TASK_DIAG_VMA_F_GROWSDOWN, + [ilog2(VM_PFNMAP)] = TASK_DIAG_VMA_F_PFNMAP, + [ilog2(VM_DENYWRITE)] = TASK_DIAG_VMA_F_DENYWRITE, +#ifdef CONFIG_X86_INTEL_MPX + [ilog2(VM_MPX)] = TASK_DIAG_VMA_F_MPX, +#endif + [ilog2(VM_LOCKED)] = TASK_DIAG_VMA_F_LOCKED, + [ilog2(VM_IO)] = TASK_DIAG_VMA_F_IO, + [ilog2(VM_SEQ_READ)] = TASK_DIAG_VMA_F_SEQ_READ, + [ilog2(VM_RAND_READ)] = TASK_DIAG_VMA_F_RAND_READ, + [ilog2(VM_DONTCOPY)] = TASK_DIAG_VMA_F_DONTCOPY, + [ilog2(VM_DONTEXPAND)] = TASK_DIAG_VMA_F_DONTEXPAND, + [ilog2(VM_ACCOUNT)] = TASK_DIAG_VMA_F_ACCOUNT, + [ilog2(VM_NORESERVE)] = TASK_DIAG_VMA_F_NORESERVE, + [ilog2(VM_HUGETLB)] = TASK_DIAG_VMA_F_HUGETLB, + [ilog2(VM_ARCH_1)] = TASK_DIAG_VMA_F_ARCH_1, + [ilog2(VM_DONTDUMP)] = TASK_DIAG_VMA_F_DONTDUMP, +#ifdef CONFIG_MEM_SOFT_DIRTY + [ilog2(VM_SOFTDIRTY)] = TASK_DIAG_VMA_F_SOFTDIRTY, +#endif + [ilog2(VM_MIXEDMAP)] = TASK_DIAG_VMA_F_MIXEDMAP, + [ilog2(VM_HUGEPAGE)] = TASK_DIAG_VMA_F_HUGEPAGE, + [ilog2(VM_NOHUGEPAGE)] = TASK_DIAG_VMA_F_NOHUGEPAGE, + [ilog2(VM_MERGEABLE)] = TASK_DIAG_VMA_F_MERGEABLE, + }; + size_t i; + + for (i = 0; i < BITS_PER_LONG; i++) { + if (vma->vm_flags & (1UL << i)) + flags |= mnemonics[i]; + } + + return flags; +} + +/* + * use a tmp variable and copy to input arg to deal with + * alignment issues. 
diag_vma contains u64 elements which
+ * means extended load operations can be used and those can
+ * require 8-byte alignment (e.g., sparc)
+ *
+ * tmp is zeroed before it is filled in: the final memcpy() copies
+ * sizeof(*diag_vma) bytes, so any member or padding byte that is not
+ * assigned here would otherwise carry stale kernel stack contents into
+ * the netlink buffer. The zeroing also supplies the 0 values that
+ * mappings without a backing file report for major, minor, inode,
+ * generation and pgoff.
+ */
+static void fill_diag_vma(struct vm_area_struct *vma,
+			  struct task_diag_vma *diag_vma)
+{
+	struct task_diag_vma tmp;
+
+	memset(&tmp, 0, sizeof(tmp));
+
+	/* We don't show the stack guard page in /proc/maps */
+	tmp.start = vma->vm_start;
+	tmp.end = vma->vm_end;
+	tmp.vm_flags = get_vma_flags(vma);
+
+	if (vma->vm_file) {
+		struct inode *inode = file_inode(vma->vm_file);
+		dev_t dev;
+
+		dev = inode->i_sb->s_dev;
+		tmp.major = MAJOR(dev);
+		tmp.minor = MINOR(dev);
+		tmp.inode = inode->i_ino;
+		tmp.generation = inode->i_generation;
+		tmp.pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
+	}
+
+	memcpy(diag_vma, &tmp, sizeof(*diag_vma));
+}
+
+static const char *get_vma_name(struct vm_area_struct *vma, char *page)
+{
+	const char *name = NULL;
+
+	if (vma->vm_file) {
+		name = d_path(&vma->vm_file->f_path, page, PAGE_SIZE);
+		goto out;
+	}
+
+	if (vma->vm_ops && vma->vm_ops->name) {
+		name = vma->vm_ops->name(vma);
+		if (name)
+			goto out;
+	}
+
+	name = arch_vma_name(vma);
+
+out:
+	return name;
+}
+
+static void fill_diag_vma_stat(struct vm_area_struct *vma,
+				struct task_diag_vma_stat *stat)
+{
+	struct task_diag_vma_stat tmp;
+	struct mem_size_stats mss;
+	struct mm_walk smaps_walk = {
+		.pmd_entry = smaps_pte_range,
+		.mm = vma->vm_mm,
+		.private = &mss,
+	};
+
+	memset(&mss, 0, sizeof(mss));
+	memset(&tmp, 0, sizeof(tmp));
+
+	/* mmap_sem is held in m_start */
+	walk_page_vma(vma, &smaps_walk);
+
+	tmp.resident = mss.resident;
+	tmp.pss = mss.pss;
+	tmp.shared_clean = mss.shared_clean;
+	tmp.private_clean = mss.private_clean;
+	tmp.private_dirty = mss.private_dirty;
+	tmp.referenced = mss.referenced;
+	tmp.anonymous = mss.anonymous;
+	tmp.anonymous_thp = mss.anonymous_thp;
+	tmp.swap = mss.swap;
+
+	memcpy(stat, &tmp, sizeof(*stat));
+}
+
+static int
fill_vma(struct task_struct *p, struct sk_buff *skb,
+		    struct task_diag_cb *cb, bool *progress, u64 show_flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct nlattr *attr = NULL;
+	struct task_diag_vma *diag_vma;
+	unsigned long mark = 0;
+	char *page;
+	int rc = -EMSGSIZE, size;
+
+	/*
+	 * Pack one record per VMA of @p into a single TASK_DIAG_VMA
+	 * attribute. cb->vma.mark holds the start address of the last VMA
+	 * emitted by an earlier call; VMAs at or below it are skipped.
+	 */
+	if (cb)
+		mark = cb->vma.mark;
+
+	/*
+	 * Take the mm reference through get_task_mm(), as fill_task_statm()
+	 * and fill_task_cmdline() do, instead of reading p->mm without
+	 * task_lock(): the task can drop its mm concurrently.
+	 */
+	mm = get_task_mm(p);
+	if (!mm)
+		return 0;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page) {
+		mmput(mm);
+		return -ENOMEM;
+	}
+
+	size = NLA_ALIGN(sizeof(struct task_diag_vma));
+	if (show_flags & TASK_DIAG_SHOW_VMA_STAT)
+		size += NLA_ALIGN(sizeof(struct task_diag_vma_stat));
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		/* tail position to trim back to if this record does not fit */
+		unsigned char *b = skb_tail_pointer(skb);
+		const char *name;
+		void *pfile;
+
+		/* already emitted by a previous call */
+		if (mark >= vma->vm_start)
+			continue;
+
+		/* setup pointer for next map */
+		if (attr == NULL) {
+			attr = nla_reserve(skb, TASK_DIAG_VMA, size);
+			if (!attr)
+				goto err;
+
+			diag_vma = nla_data(attr);
+		} else {
+			diag_vma = nla_reserve_nohdr(skb, size);
+
+			if (diag_vma == NULL) {
+				nlmsg_trim(skb, b);
+				goto out;
+			}
+		}
+
+		fill_diag_vma(vma, diag_vma);
+
+		if (show_flags & TASK_DIAG_SHOW_VMA_STAT) {
+			struct task_diag_vma_stat *stat;
+
+			stat = (void *) diag_vma + NLA_ALIGN(sizeof(*diag_vma));
+
+			fill_diag_vma_stat(vma, stat);
+			diag_vma->stat_len = sizeof(struct task_diag_vma_stat);
+			diag_vma->stat_off = (void *) stat - (void *)diag_vma;
+		} else {
+			diag_vma->stat_len = 0;
+			diag_vma->stat_off = 0;
+		}
+
+		name = get_vma_name(vma, page);
+		if (IS_ERR(name)) {
+			nlmsg_trim(skb, b);
+			rc = PTR_ERR(name);
+			goto out;
+		}
+
+		if (name) {
+			diag_vma->name_len = strlen(name) + 1;
+
+			/* reserves NLA_ALIGN(len) */
+			pfile = nla_reserve_nohdr(skb, diag_vma->name_len);
+			if (pfile == NULL) {
+				nlmsg_trim(skb, b);
+				goto out;
+			}
+			diag_vma->name_off = pfile - (void *) diag_vma;
+			memcpy(pfile, name, diag_vma->name_len);
+		} else {
+
diag_vma->name_len = 0; + diag_vma->name_off = 0; + } + + mark = vma->vm_start; + + diag_vma->vma_len = skb_tail_pointer(skb) - (unsigned char *) diag_vma; + + *progress = true; + } + + rc = 0; + mark = 0; +out: + if (*progress) + attr->nla_len = skb_tail_pointer(skb) - (unsigned char *) attr; + +err: + up_read(&mm->mmap_sem); + mmput(mm); + free_page((unsigned long) page); + if (cb) + cb->vma.mark = mark; + + return rc; +} + +static int fill_task_stat(struct task_struct *task, struct sk_buff *skb, int whole) +{ + struct task_diag_stat *st; + struct nlattr *attr; + + int num_threads = 0; + unsigned long cmin_flt = 0, cmaj_flt = 0; + unsigned long min_flt = 0, maj_flt = 0; + u64 cutime, cstime, utime, stime; + u64 cgtime, gtime; + unsigned long flags; + + attr = nla_reserve(skb, TASK_DIAG_STAT, sizeof(struct task_diag_stat)); + if (!attr) + return -EMSGSIZE; + + st = nla_data(attr); + + cutime = cstime = utime = stime = 0; + cgtime = gtime = 0; + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + num_threads = get_nr_threads(task); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + + /* add up live thread stats at the group level */ + if (whole) { + struct task_struct *t = task; + + do { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += task_gtime(t); + } while_each_thread(task, t); + + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; + thread_group_cputime_adjusted(task, &utime, &stime); + gtime += sig->gtime; + } + + unlock_task_sighand(task, &flags); + } + + if (!whole) { + min_flt = task->min_flt; + maj_flt = task->maj_flt; + task_cputime_adjusted(task, &utime, &stime); + gtime = task_gtime(task); + } + + st->minflt = min_flt; + st->cminflt = cmin_flt; + st->majflt = maj_flt; + st->cmajflt = cmaj_flt; + st->utime = nsec_to_clock_t(utime); + st->stime = nsec_to_clock_t(stime); + st->cutime = nsec_to_clock_t(cutime); + st->cstime = 
nsec_to_clock_t(cstime); + + st->threads = num_threads; + + return 0; +} + +static int fill_task_statm(struct task_struct *task, struct sk_buff *skb, int whole) +{ + struct task_diag_statm *st; + struct nlattr *attr; + + unsigned long text, lib, swap, ptes, anon, file, shmem; + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + unsigned long stack_vm, data_vm, locked_vm, pinned_vm; + struct mm_struct *mm; + + mm = get_task_mm(task); + if (!mm) + return 0; + + anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + + /* + * Note: to minimize their overhead, mm maintains hiwater_vm and + * hiwater_rss only when about to *lower* total_vm or rss. Any + * collector of these hiwater stats must therefore get total_vm + * and rss too, which will usually be the higher. Barriers? not + * worth the effort, such snapshots can always be inconsistent. + */ + hiwater_vm = total_vm = mm->total_vm; + if (hiwater_vm < mm->hiwater_vm) + hiwater_vm = mm->hiwater_vm; + hiwater_rss = total_rss = anon + file + shmem; + if (hiwater_rss < mm->hiwater_rss) + hiwater_rss = mm->hiwater_rss; + + text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; + lib = mm->exec_vm - text; + swap = get_mm_counter(mm, MM_SWAPENTS); + ptes = mm_pgtables_bytes(mm); + + data_vm = mm->data_vm; + stack_vm = mm->stack_vm; + locked_vm = mm->locked_vm; + pinned_vm = mm->pinned_vm; + + mmput(mm); + + attr = nla_reserve(skb, TASK_DIAG_STATM, sizeof(*st)); + if (!attr) + return -EMSGSIZE; + + st = nla_data(attr); + + st->anon = anon; + st->file = file; + st->shmem = shmem; + st->hiwater_vm = hiwater_vm; + st->hiwater_rss = hiwater_rss; + st->text = text; + st->lib = lib; + st->swap = swap; + st->ptes = ptes; + st->total_rss = total_rss; + st->total_vm = total_vm; + st->data_vm = data_vm; + st->stack_vm = stack_vm; + st->locked_vm = locked_vm; + st->pinned_vm = pinned_vm; + + return 0; +} + +static int 
fill_task_cmdline(struct task_struct *tsk, struct sk_buff *skb)
+{
+	unsigned long arg_start, arg_end;
+	struct nlattr *attr;
+	long nr_read, len;
+	struct mm_struct *mm;
+	int rc = 0;
+	void *pos;
+
+	/*
+	 * Append the argument area of @tsk as a TASK_DIAG_CMDLINE attribute.
+	 * Returns 0 (also when there is no mm or the area cannot be read in
+	 * full, in which case no attribute is emitted) or -EMSGSIZE.
+	 */
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+	arg_start = mm->arg_start;
+	arg_end = mm->arg_end;
+	up_read(&mm->mmap_sem);
+
+	/* Skip an inverted range quietly; it is not worth a BUG(). */
+	if (arg_start > arg_end)
+		goto out;
+
+	len = arg_end - arg_start;
+
+	pos = nlmsg_get_pos(skb);
+
+	attr = nla_reserve(skb, TASK_DIAG_CMDLINE, len);
+	if (!attr) {
+		rc = -EMSGSIZE;
+		goto out;
+	}
+
+	nr_read = access_remote_vm(mm, arg_start, nla_data(attr), len, 0);
+	if (nr_read != len)
+		nlmsg_trim(skb, pos);
+
+out:
+	/* Drop the reference taken by get_task_mm() on every exit path. */
+	mmput(mm);
+	return rc;
+}
+
+/*
+ * Emit one TASK_DIAG_CMD_GET message for @tsk into @skb, carrying the
+ * attributes selected by req->show_flags. When @cb is given, cb->attr is
+ * the index of the first attribute still to be emitted, so a message that
+ * did not fit can be continued; @cb may be NULL.
+ */
+static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
+			  struct task_diag_pid *req,
+			  struct task_diag_cb *cb, struct pid_namespace *pidns,
+			  struct user_namespace *userns)
+{
+	u64 show_flags = req->show_flags;
+	struct nlmsghdr *nlh;
+	struct task_diag_msg *msg;
+	int err = 0, i = 0, n = 0;
+	bool progress = false;
+	int flags = 0;
+	u32 seq = 0;
+
+	if (cb) {
+		n = cb->attr;
+		flags |= NLM_F_MULTI;
+		/* only a request with a callback carries a sequence number */
+		seq = cb->nlh->nlmsg_seq;
+	}
+
+	nlh = nlmsg_put(skb, 0, seq,
+			TASK_DIAG_CMD_GET, sizeof(*msg), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	msg = nlmsg_data(nlh);
+	msg->pid = task_pid_nr_ns(tsk, pidns);
+	msg->tgid = task_tgid_nr_ns(tsk, pidns);
+	/* The payload is not pre-zeroed, so assign flags rather than OR. */
+	msg->flags = TASK_DIAG_FLAG_CONT;
+
+	if (show_flags & TASK_DIAG_SHOW_BASE) {
+		if (i >= n)
+			err = fill_task_base(tsk, skb, pidns);
+		if (err)
+			goto err;
+		i++;
+	}
+
+	if (show_flags & TASK_DIAG_SHOW_CRED) {
+		if (i >= n)
+			err = fill_creds(tsk, skb, userns);
+		if (err)
+			goto err;
+		i++;
+	}
+
+	if (show_flags & TASK_DIAG_SHOW_VMA) {
+		bool dump_vma = true;
+
+		/* if the request is to dump all threads of all processes
+		 * only show VMAs for group leader.
+ */ + if ((req->dump_strategy == TASK_DIAG_DUMP_ALL_THREAD || + req->dump_strategy == TASK_DIAG_DUMP_THREAD) && + !thread_group_leader(tsk)) + dump_vma = false; + + if (dump_vma && i >= n) + err = fill_vma(tsk, skb, cb, &progress, show_flags); + if (err) + goto err; + i++; + } + + if (show_flags & TASK_DIAG_SHOW_STAT) { + int whole = 1; + + if (req->dump_strategy == TASK_DIAG_DUMP_ALL_THREAD || + req->dump_strategy == TASK_DIAG_DUMP_THREAD) + whole = 0; + + if (i >= n) + err = fill_task_stat(tsk, skb, whole); + if (err) + goto err; + i++; + } + + if (show_flags & TASK_DIAG_SHOW_STATM) { + if (i >= n) + err = fill_task_statm(tsk, skb, 1); + if (err) + goto err; + i++; + } + + if (show_flags & TASK_DIAG_SHOW_CMDLINE) { + if (i >= n) + err = fill_task_cmdline(tsk, skb); + if (err) + goto err; + i++; + } + + msg->flags &= ~TASK_DIAG_FLAG_CONT; + + nlmsg_end(skb, nlh); + if (cb) + cb->attr = 0; + + return 0; +err: + if (err == -EMSGSIZE && (i > n || progress)) { + if (cb) + cb->attr = i; + nlmsg_end(skb, nlh); + } else + nlmsg_cancel(skb, nlh); + + return err; +} + +struct task_iter { + struct task_diag_pid req; + struct pid_namespace *ns; + struct task_struct *parent; + + struct task_diag_cb *cb; + + struct tgid_iter tgid; + loff_t pos; + struct task_struct *task; +}; + +static void iter_stop(struct task_iter *iter) +{ + struct task_struct *task; + + if (iter->parent) + put_task_struct(iter->parent); + + switch (iter->req.dump_strategy) { + case TASK_DIAG_DUMP_ALL: + task = iter->tgid.task; + break; + case TASK_DIAG_DUMP_ALL_THREAD: + /* release both tgid task and thread task */ + if (iter->task) + put_task_struct(iter->task); + task = iter->tgid.task; + break; + default: + task = iter->task; + } + if (task) + put_task_struct(task); +} + +static struct task_struct * +task_diag_next_child(struct task_struct *parent, + struct task_struct *prev, loff_t pos) +{ + struct task_struct *task; + + read_lock(&tasklist_lock); + task = task_next_child(parent, prev, pos); + if 
(prev) + put_task_struct(prev); + if (task) + get_task_struct(task); + read_unlock(&tasklist_lock); + + return task; +} + +static struct task_struct *iter_start(struct task_iter *iter) +{ + if (iter->req.pid > 0) { + rcu_read_lock(); + iter->parent = find_task_by_pid_ns(iter->req.pid, iter->ns); + if (iter->parent) + get_task_struct(iter->parent); + rcu_read_unlock(); + } + + switch (iter->req.dump_strategy) { + case TASK_DIAG_DUMP_ONE: + if (iter->parent == NULL) + return ERR_PTR(-ESRCH); + iter->pos = iter->cb->pos; + if (iter->pos == 0) { + iter->task = iter->parent; + iter->parent = NULL; + } else + iter->task = NULL; + return iter->task; + + case TASK_DIAG_DUMP_THREAD: + if (iter->parent == NULL) + return ERR_PTR(-ESRCH); + + iter->pos = iter->cb->pos; + iter->task = task_first_tid(task_pid(iter->parent), + iter->cb->pid,iter->pos, iter->ns); + return iter->task; + + case TASK_DIAG_DUMP_CHILDREN: + if (iter->parent == NULL) + return ERR_PTR(-ESRCH); + + iter->pos = iter->cb->pos; + iter->task = task_diag_next_child(iter->parent, NULL, iter->pos); + return iter->task; + + case TASK_DIAG_DUMP_ALL: + iter->tgid.tgid = iter->cb->pid; + iter->tgid.task = NULL; + iter->tgid = next_tgid(iter->ns, iter->tgid); + return iter->tgid.task; + + case TASK_DIAG_DUMP_ALL_THREAD: + iter->pos = iter->cb->pos; + iter->tgid.tgid = iter->cb->pid; + iter->tgid.task = NULL; + iter->tgid = next_tgid(iter->ns, iter->tgid); + if (!iter->tgid.task) + return NULL; + + iter->task = task_first_tid(task_pid(iter->tgid.task), + 0, iter->pos, iter->ns); + if (!iter->task) { + iter->pos = 0; + iter->tgid.tgid += 1; + iter->tgid = next_tgid(iter->ns, iter->tgid); + iter->task = iter->tgid.task; + if (iter->task) + get_task_struct(iter->task); + } + return iter->task; + } + + return ERR_PTR(-EINVAL); +} + +static struct task_struct *iter_next(struct task_iter *iter) +{ + switch (iter->req.dump_strategy) { + case TASK_DIAG_DUMP_ONE: + iter->pos++; + iter->cb->pos = iter->pos; + if (iter->task) + 
put_task_struct(iter->task); + iter->task = NULL; + return NULL; + + case TASK_DIAG_DUMP_THREAD: + iter->pos++; + iter->task = task_next_tid(iter->task); + iter->cb->pos = iter->pos; + if (iter->task) + iter->cb->pid = task_pid_nr_ns(iter->task, iter->ns); + else + iter->cb->pid = -1; + return iter->task; + case TASK_DIAG_DUMP_CHILDREN: + iter->pos++; + iter->task = task_diag_next_child(iter->parent, iter->task, iter->pos); + iter->cb->pos = iter->pos; + return iter->task; + + case TASK_DIAG_DUMP_ALL: + iter->tgid.tgid += 1; + iter->tgid = next_tgid(iter->ns, iter->tgid); + iter->cb->pid = iter->tgid.tgid; + return iter->tgid.task; + + case TASK_DIAG_DUMP_ALL_THREAD: + iter->pos++; + iter->task = task_next_tid(iter->task); + if (!iter->task) { + iter->pos = 0; + iter->tgid.tgid += 1; + iter->tgid = next_tgid(iter->ns, iter->tgid); + iter->task = iter->tgid.task; + if (iter->task) + get_task_struct(iter->task); + } + + /* save current position */ + iter->cb->pid = iter->tgid.tgid; + iter->cb->pos = iter->pos; + + return iter->task; + } + + return NULL; +} + +static int __taskdiag_dumpit(struct task_iter *iter, + struct task_diag_cb *cb, struct task_struct **start) +{ + struct user_namespace *userns = current_user_ns(); + struct task_struct *task = *start; + int rc; + + for (; task; task = iter_next(iter)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + continue; + + rc = task_diag_fill(task, cb->resp, &iter->req, + cb, iter->ns, userns); + if (rc < 0) { + if (rc != -EMSGSIZE) + return rc; + break; + } + } + *start = task; + + return 0; +} + +static int taskdiag_dumpit(struct task_diag_cb *cb, + struct pid_namespace *pidns, + struct msghdr *msg, size_t len) +{ + struct sk_buff *skb = cb->resp; + struct task_struct *task; + struct task_iter iter; + struct nlattr *na; + size_t copied; + int err; + + if (nlmsg_len(cb->nlh) < sizeof(iter.req)) + return -EINVAL; + + na = nlmsg_data(cb->nlh); + if (na->nla_type < 0) + return -EINVAL; + + memcpy(&iter.req, na, 
sizeof(iter.req)); + + iter.ns = pidns; + iter.cb = cb; + iter.parent = NULL; + iter.pos = 0; + iter.task = NULL; + + task = iter_start(&iter); + if (IS_ERR(task)) + return PTR_ERR(task); + + copied = 0; + while (1) { + err = __taskdiag_dumpit(&iter, cb, &task); + if (err < 0) + goto err; + if (skb->len == 0) + break; + + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err < 0) + goto err; + + copied += skb->len; + + skb_trim(skb, 0); + if (skb_tailroom(skb) + copied > len) + break; + + if (signal_pending(current)) + break; + } + + iter_stop(&iter); + return copied; +err: + iter_stop(&iter); + return err; +} + +static ssize_t task_diag_write(struct file *f, const char __user *buf, + size_t len, loff_t *off) +{ + struct task_diag_cb *cb = f->private_data; + struct sk_buff *skb; + struct msghdr msg; + struct iovec iov; + int err; + + if (cb->req) + return -EBUSY; + if (len < nlmsg_total_size(0)) + return -EINVAL; + + err = import_single_range(WRITE, (void __user *) buf, len, + &iov, &msg.msg_iter); + if (unlikely(err)) + return err; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_flags = 0; + + skb = nlmsg_new(len, GFP_KERNEL); + if (skb == NULL) + return -ENOMEM; + + if (memcpy_from_msg(skb_put(skb, len), &msg, len)) { + kfree_skb(skb); + return -EFAULT; + } + + memset(cb, 0, sizeof(*cb)); + cb->req = skb; + cb->nlh = nlmsg_hdr(skb); + + return len; +} + +static ssize_t task_diag_read(struct file *file, char __user *ubuf, + size_t len, loff_t *off) +{ + struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info; + struct task_diag_cb *cb = file->private_data; + struct iovec iov; + struct msghdr msg; + int size, err; + + if (cb->req == NULL) + return 0; + + err = import_single_range(READ, ubuf, len, &iov, &msg.msg_iter); + if (unlikely(err)) + goto err; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_name = NULL; + msg.msg_namelen = 0; + + if (!cb->resp) { + size = min_t(size_t, 
len, 16384);
+		cb->resp = alloc_skb(size, GFP_KERNEL);
+		if (cb->resp == NULL) {
+			err = -ENOMEM;
+			goto err;
+		}
+		/* Trim skb to allocated size. */
+		skb_reserve(cb->resp, skb_tailroom(cb->resp) - size);
+	}
+
+	err = taskdiag_dumpit(cb, ns, &msg, len);
+
+err:
+	/*
+	 * cb->resp is still NULL when we bail out before (or because of) its
+	 * allocation; skb_trim() must not be handed a NULL skb.
+	 */
+	if (cb->resp)
+		skb_trim(cb->resp, 0);
+	/* An error or an empty result ends the transaction: drop the request. */
+	if (err <= 0) {
+		kfree_skb(cb->req);
+		cb->req = NULL;
+	}
+
+	return err;
+}
+
+/* Allocate the zeroed per-open state (request/response skbs and cursor). */
+static int task_diag_open (struct inode *inode, struct file *f)
+{
+	f->private_data = kzalloc(sizeof(struct task_diag_cb), GFP_KERNEL);
+	if (f->private_data == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Free the pending request and response skbs together with the state. */
+static int task_diag_release(struct inode *inode, struct file *f)
+{
+	struct task_diag_cb *cb = f->private_data;
+
+	kfree_skb(cb->req);
+	kfree_skb(cb->resp);
+
+	kfree(f->private_data);
+	return 0;
+}
+
+static const struct file_operations task_diag_fops = {
+	.owner = THIS_MODULE,
+	.open = task_diag_open,
+	.release = task_diag_release,
+	.write = task_diag_write,
+	.read = task_diag_read,
+};
+
+static __init int task_diag_init(void)
+{
+	if (!proc_create("task-diag", S_IRUGO | S_IWUGO, NULL, &task_diag_fops))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static __exit void task_diag_exit(void)
+{
+	remove_proc_entry("task-diag", NULL);
+}
+
+module_init(task_diag_init);
+module_exit(task_diag_exit);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5ea1d64cb0b4c1..0c766a8f818774 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -402,25 +402,6 @@ const struct file_operations proc_pid_maps_operations = {
 #define PSS_SHIFT 12
 
 #ifdef CONFIG_PROC_PAGE_MONITOR
-struct mem_size_stats {
-	unsigned long resident;
-	unsigned long shared_clean;
-	unsigned long shared_dirty;
-	unsigned long private_clean;
-	unsigned long private_dirty;
-	unsigned long referenced;
-	unsigned long anonymous;
-	unsigned long lazyfree;
-	unsigned long anonymous_thp;
-	unsigned long shmem_thp;
-	unsigned long swap;
-	unsigned long shared_hugetlb;
-	unsigned long private_hugetlb;
-	u64 pss;
-	u64 pss_locked;
-	u64
swap_pss; - bool check_shmem_swap; -}; static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty) @@ -564,7 +545,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, } #endif -static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, +int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; diff --git a/include/uapi/linux/cgroupstats.h b/include/uapi/linux/cgroupstats.h index aa306e4cd6c1f8..40e1405f5dbdf1 100644 --- a/include/uapi/linux/cgroupstats.h +++ b/include/uapi/linux/cgroupstats.h @@ -38,21 +38,6 @@ struct cgroupstats { __u64 nr_io_wait; /* Number of tasks waiting on IO */ }; -/* - * Commands sent from userspace - * Not versioned. New commands should only be inserted at the enum's end - * prior to __CGROUPSTATS_CMD_MAX - */ - -enum { - CGROUPSTATS_CMD_UNSPEC = __TASKSTATS_CMD_MAX, /* Reserved */ - CGROUPSTATS_CMD_GET, /* user->kernel request/get-response */ - CGROUPSTATS_CMD_NEW, /* kernel->user event */ - __CGROUPSTATS_CMD_MAX, -}; - -#define CGROUPSTATS_CMD_MAX (__CGROUPSTATS_CMD_MAX - 1) - enum { CGROUPSTATS_TYPE_UNSPEC = 0, /* Reserved */ CGROUPSTATS_TYPE_CGROUP_STATS, /* contains name + stats */ diff --git a/include/uapi/linux/task_diag.h b/include/uapi/linux/task_diag.h new file mode 100644 index 00000000000000..8e43da6f55cf68 --- /dev/null +++ b/include/uapi/linux/task_diag.h @@ -0,0 +1,216 @@ +#ifndef _LINUX_TASK_DIAG_H +#define _LINUX_TASK_DIAG_H + +#include +#include +#include + +#define TASK_DIAG_CMD_GET 0xd101U + +struct task_diag_msg { + __u32 pid; + __u32 tgid; + __u32 flags; +}; + +#define TASK_DIAG_FLAG_CONT 0x00000001 + +enum { + TASK_DIAG_BASE = 0, + TASK_DIAG_CRED, + TASK_DIAG_VMA, + TASK_DIAG_VMA_STAT, + TASK_DIAG_STAT, + TASK_DIAG_STATM, + TASK_DIAG_CMDLINE, + + __TASK_DIAG_ATTR_MAX +#define TASK_DIAG_ATTR_MAX (__TASK_DIAG_ATTR_MAX - 1) +}; + +#define TASK_DIAG_SHOW_BASE 
(1ULL << TASK_DIAG_BASE) +#define TASK_DIAG_SHOW_CRED (1ULL << TASK_DIAG_CRED) +#define TASK_DIAG_SHOW_VMA (1ULL << TASK_DIAG_VMA) +#define TASK_DIAG_SHOW_VMA_STAT (1ULL << TASK_DIAG_VMA_STAT) +#define TASK_DIAG_SHOW_STAT (1ULL << TASK_DIAG_STAT) +#define TASK_DIAG_SHOW_STATM (1ULL << TASK_DIAG_STATM) +#define TASK_DIAG_SHOW_CMDLINE (1ULL << TASK_DIAG_CMDLINE) + +enum { + TASK_DIAG_RUNNING, + TASK_DIAG_INTERRUPTIBLE, + TASK_DIAG_UNINTERRUPTIBLE, + TASK_DIAG_STOPPED, + TASK_DIAG_TRACE_STOP, + TASK_DIAG_DEAD, + TASK_DIAG_ZOMBIE, + TASK_DIAG_PARKED, + TASK_DIAG_IDLE, +}; + +#define TASK_DIAG_COMM_LEN 16 + +struct task_diag_base { + __u32 tgid; + __u32 pid; + __u32 ppid; + __u32 tpid; + __u32 sid; + __u32 pgid; + __u8 state; + char comm[TASK_DIAG_COMM_LEN]; +}; + +struct task_diag_caps { + __u32 cap[_LINUX_CAPABILITY_U32S_3]; +}; + +struct task_diag_creds { + struct task_diag_caps cap_inheritable; + struct task_diag_caps cap_permitted; + struct task_diag_caps cap_effective; + struct task_diag_caps cap_bset; + + __u32 uid; + __u32 euid; + __u32 suid; + __u32 fsuid; + __u32 gid; + __u32 egid; + __u32 sgid; + __u32 fsgid; +}; + +#define TASK_DIAG_VMA_F_READ (1ULL << 0) +#define TASK_DIAG_VMA_F_WRITE (1ULL << 1) +#define TASK_DIAG_VMA_F_EXEC (1ULL << 2) +#define TASK_DIAG_VMA_F_SHARED (1ULL << 3) +#define TASK_DIAG_VMA_F_MAYREAD (1ULL << 4) +#define TASK_DIAG_VMA_F_MAYWRITE (1ULL << 5) +#define TASK_DIAG_VMA_F_MAYEXEC (1ULL << 6) +#define TASK_DIAG_VMA_F_MAYSHARE (1ULL << 7) +#define TASK_DIAG_VMA_F_GROWSDOWN (1ULL << 8) +#define TASK_DIAG_VMA_F_PFNMAP (1ULL << 9) +#define TASK_DIAG_VMA_F_DENYWRITE (1ULL << 10) +#define TASK_DIAG_VMA_F_MPX (1ULL << 11) +#define TASK_DIAG_VMA_F_LOCKED (1ULL << 12) +#define TASK_DIAG_VMA_F_IO (1ULL << 13) +#define TASK_DIAG_VMA_F_SEQ_READ (1ULL << 14) +#define TASK_DIAG_VMA_F_RAND_READ (1ULL << 15) +#define TASK_DIAG_VMA_F_DONTCOPY (1ULL << 16) +#define TASK_DIAG_VMA_F_DONTEXPAND (1ULL << 17) +#define TASK_DIAG_VMA_F_ACCOUNT (1ULL << 18) 
+#define TASK_DIAG_VMA_F_NORESERVE (1ULL << 19) +#define TASK_DIAG_VMA_F_HUGETLB (1ULL << 20) +#define TASK_DIAG_VMA_F_ARCH_1 (1ULL << 21) +#define TASK_DIAG_VMA_F_DONTDUMP (1ULL << 22) +#define TASK_DIAG_VMA_F_SOFTDIRTY (1ULL << 23) +#define TASK_DIAG_VMA_F_MIXEDMAP (1ULL << 24) +#define TASK_DIAG_VMA_F_HUGEPAGE (1ULL << 25) +#define TASK_DIAG_VMA_F_NOHUGEPAGE (1ULL << 26) +#define TASK_DIAG_VMA_F_MERGEABLE (1ULL << 27) + +struct task_diag_vma_stat { + __u64 resident; + __u64 shared_clean; + __u64 shared_dirty; + __u64 private_clean; + __u64 private_dirty; + __u64 referenced; + __u64 anonymous; + __u64 anonymous_thp; + __u64 swap; + __u64 pss; +} __attribute__((__aligned__(NLA_ALIGNTO))); + +/* task_diag_vma must be NLA_ALIGN'ed */ +struct task_diag_vma { + __u64 start, end; + __u64 vm_flags; + __u64 pgoff; + __u32 major; + __u32 minor; + __u64 inode; + __u32 generation; + __u16 vma_len; + __u16 name_off; + __u16 name_len; + __u16 stat_off; + __u16 stat_len; +} __attribute__((__aligned__(NLA_ALIGNTO))); + +static inline char *task_diag_vma_name(struct task_diag_vma *vma) +{ + if (!vma->name_len) + return NULL; + + return ((char *)vma) + vma->name_off; +} + +static inline +struct task_diag_vma_stat *task_diag_vma_stat(struct task_diag_vma *vma) +{ + if (!vma->stat_len) + return NULL; + + return ((void *)vma) + vma->stat_off; +} + +#define task_diag_for_each_vma(vma, attr) \ + for (vma = nla_data(attr); \ + (void *) vma < nla_data(attr) + nla_len(attr); \ + vma = (void *) vma + vma->vma_len) + +struct task_diag_stat { + __u64 minflt; + __u64 cminflt; + __u64 majflt; + __u64 cmajflt; + __u64 utime; + __u64 stime; + __u64 cutime; + __u64 cstime; + + __u32 threads; +}; + +struct task_diag_statm { + __u64 anon; + __u64 file; + __u64 shmem; + __u64 total_vm; + __u64 total_rss; + __u64 hiwater_vm; + __u64 hiwater_rss; + __u64 text; + __u64 lib; + __u64 swap; + __u64 ptes; + __u64 locked_vm; + __u64 pinned_vm; + __u64 data_vm; + __u64 stack_vm; +}; + +#define 
TASK_DIAG_DUMP_ALL 0 +#define TASK_DIAG_DUMP_ONE 1 +#define TASK_DIAG_DUMP_ALL_THREAD 2 +#define TASK_DIAG_DUMP_CHILDREN 3 +#define TASK_DIAG_DUMP_THREAD 4 + +struct task_diag_pid { + __u64 show_flags; + __u64 dump_strategy; + + __u32 pid; +}; + +enum { + TASK_DIAG_CMD_ATTR_UNSPEC = 0, + TASK_DIAG_CMD_ATTR_GET, + __TASK_DIAG_CMD_ATTR_MAX, +}; + +#define TASK_DIAG_CMD_ATTR_MAX (__TASK_DIAG_CMD_ATTR_MAX - 1) + +#endif /* _LINUX_TASK_DIAG_H */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index b7aa7bb2349f7b..f06af282b97cab 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -177,9 +177,16 @@ enum { TASKSTATS_CMD_UNSPEC = 0, /* Reserved */ TASKSTATS_CMD_GET, /* user->kernel request/get-response */ TASKSTATS_CMD_NEW, /* kernel->user event */ + __TASKSTATS_CMD_RESERVED, + + CGROUPSTATS_CMD_GET, /* user->kernel request/get-response */ + CGROUPSTATS_CMD_NEW, /* kernel->user event */ + __TASKSTATS_CMD_MAX, }; +#define __CGROUPSTATS_CMD_MAX __TASKSTATS_CMD_MAX +#define CGROUPSTATS_CMD_MAX (__CGROUPSTATS_CMD_MAX - 1) #define TASKSTATS_CMD_MAX (__TASKSTATS_CMD_MAX - 1) enum { diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index f1fe492c8e17d0..d4ab3d91e3d74b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -46,6 +46,7 @@ TARGETS += user TARGETS += vm TARGETS += x86 TARGETS += zram +TARGETS += task_diag #Please keep the TARGETS list alphabetically sorted # Run "make quicktest=1 run_tests" or # "make quicktest=1 kselftest" from top level Makefile diff --git a/tools/testing/selftests/task_diag/.gitignore b/tools/testing/selftests/task_diag/.gitignore new file mode 100644 index 00000000000000..f963a1f37a7f79 --- /dev/null +++ b/tools/testing/selftests/task_diag/.gitignore @@ -0,0 +1,4 @@ +task_diag +task_diag_all +task_proc_all +fork diff --git a/tools/testing/selftests/task_diag/Makefile b/tools/testing/selftests/task_diag/Makefile new file 
mode 100644
index 00000000000000..c99772316f49ae
--- /dev/null
+++ b/tools/testing/selftests/task_diag/Makefile
@@ -0,0 +1,19 @@
+all: task_diag_all fork task_proc_all
+
+CFLAGS += -g -Wall -O2 -I/usr/include/libnl3
+# Libraries go in LDLIBS so the implicit link rule places them after the objects.
+LDLIBS += -lnl-3
+TEST_PROGS := run.sh
+include ../lib.mk
+
+task_diag_all.o: task_diag_all.c task_diag_comm.h
+task_diag_comm.o: task_diag_comm.c task_diag_comm.h
+
+task_diag_all: task_diag_all.o task_diag_comm.o
+fork: fork.c
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ -lpthread
+
+task_proc_all: task_proc_all.c
+
+clean:
+	rm -rf task_diag task_diag_all task_diag_comm.o task_diag_all.o task_diag.o fork task_proc_all
diff --git a/tools/testing/selftests/task_diag/_run.sh b/tools/testing/selftests/task_diag/_run.sh
new file mode 100755
index 00000000000000..d2e854430565d6
--- /dev/null
+++ b/tools/testing/selftests/task_diag/_run.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# bash is required: "set -o pipefail" and "exec -a" are not portable to plain sh.
+set -o pipefail
+set -e -x
+
+./fork 1000 10
+
+nprocesses=`./task_diag_all all --maps | grep 'pid.*tgid.*ppid.*comm fork$' | wc -l`
+nthreads=`./task_diag_all All --smaps --cred | grep 'pid.*tgid.*ppid.*comm fork$' | wc -l`
+nchildren=`./task_diag_all children --pid 1 | grep 'pid.*tgid.*ppid.*comm fork$' | wc -l`
+
+./task_diag_all one --pid 1 --cred
+
+( exec -a fork_thread ./fork 1 1234 )
+pid=`pidof fork_thread`
+ntaskthreads=`./task_diag_all thread --maps --cred --smaps --pid $pid | grep 'pid.*tgid.*ppid.*comm' | wc -l`
+killall -9 fork
+
+[ "$nthreads" -eq 10000 ] &&
+[ "$nprocesses" -eq 1000 ] &&
+[ "$nchildren" -eq 1000 ] &&
+[ "$ntaskthreads" -eq 1234 ] &&
+true || {
+	echo "Unexpected number of tasks $nthreads:$nprocesses:$nchildren:$ntaskthreads" 1>&2
+	exit 1
+}
diff --git a/tools/testing/selftests/task_diag/fork.c b/tools/testing/selftests/task_diag/fork.c
new file mode 100644
index 00000000000000..ebddedd21bfeb0
--- /dev/null
+++ b/tools/testing/selftests/task_diag/fork.c
@@ -0,0 +1,58 @@
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+void *f(void *arg)
+{
+	unsigned long t = (unsigned long) arg;
+
+ sleep(t); + return NULL; +} + +/* usage: fork nproc [mthreads [sleep]] */ +int main(int argc, char **argv) +{ + int i, j, n, m = 0; + unsigned long t_sleep = 1000; + pthread_attr_t attr; + pthread_t id; + + if (argc < 2) { + fprintf(stderr, "usage: fork nproc [mthreads [sleep]]\n"); + return 1; + } + + n = atoi(argv[1]); + + if (argc > 2) + m = atoi(argv[2]); + + if (argc > 3) + t_sleep = atoi(argv[3]); + + pthread_attr_init(&attr); + + for (i = 0; i < n; i++) { + pid_t pid; + + pid = fork(); + if (pid < 0) { + printf("Unable to fork: %m\n"); + return 1; + } + if (pid == 0) { + if (m) { + for (j = 0; j < m-1; ++j) + pthread_create(&id, &attr, f, (void *)t_sleep); + } + + sleep(t_sleep); + return 0; + } + } + + return 0; +} diff --git a/tools/testing/selftests/task_diag/run.sh b/tools/testing/selftests/task_diag/run.sh new file mode 100755 index 00000000000000..28a8550903e861 --- /dev/null +++ b/tools/testing/selftests/task_diag/run.sh @@ -0,0 +1 @@ +unshare -p -f -m --mount-proc ./_run.sh && { echo PASS; exit 0; } || { echo FAIL; exit 1; } diff --git a/tools/testing/selftests/task_diag/task_diag.h b/tools/testing/selftests/task_diag/task_diag.h new file mode 120000 index 00000000000000..d20a38c7f19b95 --- /dev/null +++ b/tools/testing/selftests/task_diag/task_diag.h @@ -0,0 +1 @@ +../../../../include/uapi/linux/task_diag.h \ No newline at end of file diff --git a/tools/testing/selftests/task_diag/task_diag_all.c b/tools/testing/selftests/task_diag/task_diag_all.c new file mode 100644 index 00000000000000..52ab1bba3e27e3 --- /dev/null +++ b/tools/testing/selftests/task_diag/task_diag_all.c @@ -0,0 +1,164 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "task_diag.h" +#include "task_diag_comm.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +#ifndef NETLINK_SCM_PID +#define NETLINK_SCM_PID 11 +#endif + +static void usage(char *name) +{ + pr_err("Usage: %s command [options]", name); + pr_err( 
+"Commands:\n" +"\tall - dump all processes\n" +"\tAll - dump all threads\n" +"\tthreads - dump all thread for the specified process\n" +"\tchildren - dump all thread for the specified process\n" +"\tone - dump the specified process\n" +"Options:\n" +"\t-p|--pid - PID of the required process\n" +"\t-m|--maps - dump memory regions\n" +"\t-s|--smaps - dump statistics for memory regions\n" +"\t-c|--cred - dump credentials\n" +"\t-l|--cmdline - dump command line\n" +"\t-q|--quiet - do not write anything to standard output\n" +); +} +int main(int argc, char *argv[]) +{ + int exit_status = 1, fd; + struct task_diag_pid *req; + char nl_req[4096]; + struct nlmsghdr *hdr = (void *)nl_req; + int last_pid = 0; + int opt, idx; + int err, size = 0; + static const char short_opts[] = "p:cmslqx"; + static struct option long_opts[] = { + { "pid", required_argument, 0, 'p' }, + { "maps", no_argument, 0, 'm' }, + { "smaps", no_argument, 0, 's' }, + { "cred", no_argument, 0, 'c' }, + { "cmdline", no_argument, 0, 'l' }, + { "quiet", no_argument, 0, 'q' }, + { "stat", no_argument, 0, 'x' }, + {}, + }; + + hdr->nlmsg_len = nlmsg_total_size(0); + + req = nlmsg_data(hdr); + size += nla_total_size(sizeof(*req)); + + hdr->nlmsg_len += size; + + + req->show_flags = TASK_DIAG_SHOW_BASE; + + if (argc < 2) { + pr_err("Usage: %s type pid scm_pid", argv[0]); + return 1; + } + + req->pid = 0; /* dump all tasks by default */ + + switch (argv[1][0]) { + case 'c': + req->dump_strategy = TASK_DIAG_DUMP_CHILDREN; + break; + case 't': + req->dump_strategy = TASK_DIAG_DUMP_THREAD; + break; + case 'o': + req->dump_strategy = TASK_DIAG_DUMP_ONE; + break; + case 'a': + req->dump_strategy = TASK_DIAG_DUMP_ALL; + req->pid = 0; + break; + case 'A': + req->dump_strategy = TASK_DIAG_DUMP_ALL_THREAD; + req->pid = 0; + break; + default: + usage(argv[0]); + return 1; + } + + while (1) { + idx = -1; + opt = getopt_long(argc, argv, short_opts, long_opts, &idx); + if (opt == -1) + break; + switch (opt) { + case 'p': + 
req->pid = atoi(optarg); + break; + case 'c': + req->show_flags |= TASK_DIAG_SHOW_CRED; + break; + case 'm': + req->show_flags |= TASK_DIAG_SHOW_VMA; + break; + case 's': + req->show_flags |= TASK_DIAG_SHOW_VMA_STAT | TASK_DIAG_SHOW_VMA; + break; + case 'l': + req->show_flags |= TASK_DIAG_SHOW_CMDLINE; + break; + case 'q': + quiet = 1; + break; + case 'x': + req->show_flags |= TASK_DIAG_SHOW_STAT; + break; + default: + usage(argv[0]); + return 1; + } + } + + fd = open("/proc/task-diag", O_RDWR); + if (fd < 0) + return -1; + + if (write(fd, hdr, hdr->nlmsg_len) != hdr->nlmsg_len) + return -1; + + while (1) { + char buf[163840]; + size = read(fd, buf, sizeof(buf)); + + if (size < 0) + goto err; + + if (size == 0) + break; + + err = nlmsg_receive(buf, size, &show_task, &last_pid); + if (err < 0) + goto err; + + if (err == 0) + break; + } + + exit_status = 0; +err: + return exit_status; +} diff --git a/tools/testing/selftests/task_diag/task_diag_comm.c b/tools/testing/selftests/task_diag/task_diag_comm.c new file mode 100644 index 00000000000000..9440e2f82ed7a0 --- /dev/null +++ b/tools/testing/selftests/task_diag/task_diag_comm.c @@ -0,0 +1,211 @@ +#include +#include +#include + +#include +#include +#include + +#include "task_diag.h" +#include "task_diag_comm.h" + +int quiet; + +#define PSS_SHIFT 12 + +int nlmsg_receive(void *buf, int len, int (*cb)(struct nlmsghdr *, void *), void *args) +{ + struct nlmsghdr *hdr; + + for (hdr = (struct nlmsghdr *)buf; + NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) { + + if (hdr->nlmsg_type == NLMSG_DONE) { + int *len = (int *)NLMSG_DATA(hdr); + + if (*len < 0) { + pr_err("ERROR %d reported by netlink (%s)\n", + *len, strerror(-*len)); + return *len; + } + + return 0; + } + + if (hdr->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(hdr); + + if (hdr->nlmsg_len - sizeof(*hdr) < sizeof(struct nlmsgerr)) { + pr_err("ERROR truncated\n"); + return -1; + } + + if (err->error == 0) + return 0; + + 
return -1;
+		}
+		if (cb && cb(hdr, args))
+			return -1;
+	}
+
+	return 1;
+}
+
+/*
+ * nlmsg_receive() callback: print every attribute of one task_diag message.
+ *
+ * @arg points to an int holding the pid of the previous message; it is used
+ * to tell a new task from the continuation of the previous one and is then
+ * updated to the pid carried by @hdr.  Always returns 0.
+ */
+int show_task(struct nlmsghdr *hdr, void *arg)
+{
+	int msg_len;
+	struct msgtemplate *msg;
+	struct task_diag_msg *diag_msg;
+	struct nlattr *na;
+	int *last_pid = arg;
+	int len;
+
+	msg_len = NLMSG_PAYLOAD(hdr, 0);
+
+	msg = (struct msgtemplate *)hdr;
+	diag_msg = NLMSG_DATA(msg);
+
+#if 1
+	if (diag_msg->pid != *last_pid)
+		pr_info("Start getting information about %d\n", diag_msg->pid);
+	else
+		pr_info("Continue getting information about %d\n", diag_msg->pid);
+#endif
+	*last_pid = diag_msg->pid;
+
+	na = ((void *) diag_msg) + NLMSG_ALIGN(sizeof(*diag_msg));
+	len = NLMSG_ALIGN(sizeof(*diag_msg));
+	while (len < msg_len) {
+		/* A length shorter than the header would never advance the walk. */
+		if (na->nla_len < NLA_HDRLEN)
+			break;
+		len += NLA_ALIGN(na->nla_len);
+		switch (na->nla_type) {
+		case TASK_DIAG_BASE:
+		{
+			struct task_diag_base *msg;
+
+			/* For nested attributes, na follows */
+			msg = NLA_DATA(na);
+			pr_info("pid %5d tgid %5d ppid %5d sid %5d pgid %5d comm %s\n",
+				msg->pid, msg->tgid, msg->ppid, msg->sid, msg->pgid, msg->comm);
+		}
+		break;
+
+		case TASK_DIAG_CRED:
+		{
+			struct task_diag_creds *creds;
+
+			creds = NLA_DATA(na);
+			pr_info("uid: %d %d %d %d\n", creds->uid,
+				creds->euid, creds->suid, creds->fsuid);
+			/* The gid line must print the gid fields, not the uids again. */
+			pr_info("gid: %d %d %d %d\n", creds->gid,
+				creds->egid, creds->sgid, creds->fsgid);
+			pr_info("CapInh: %08x%08x\n",
+				creds->cap_inheritable.cap[1],
+				creds->cap_inheritable.cap[0]);
+			pr_info("CapPrm: %08x%08x\n",
+				creds->cap_permitted.cap[1],
+				creds->cap_permitted.cap[0]);
+			pr_info("CapEff: %08x%08x\n",
+				creds->cap_effective.cap[1],
+				creds->cap_effective.cap[0]);
+			pr_info("CapBnd: %08x%08x\n", creds->cap_bset.cap[1],
+				creds->cap_bset.cap[0]);
+		}
+		break;
+
+		case TASK_DIAG_CMDLINE:
+		{
+			char *cmdline = NLA_DATA(na);
+			long i;
+
+			/*
+			 * Turn the NUL separators into spaces and print all but
+			 * the final byte.  Bounding the output with a precision
+			 * avoids writing a terminator, which for an empty
+			 * attribute used to land at cmdline[-1].
+			 */
+			for (i = 0; i + 1 < nla_len(na); i++)
+				if (cmdline[i] == 0)
+					cmdline[i] = ' ';
+			pr_info("cmdline: %.*s\n", (int)i, cmdline);
+		}
+		break;
+
+		case TASK_DIAG_VMA:
+		{
+			struct task_diag_vma *vma_tmp, vma;
+
task_diag_for_each_vma(vma_tmp, na) { + char *name; + struct task_diag_vma_stat *stat_tmp, stat; + + name = task_diag_vma_name(vma_tmp); + if (name == NULL) + name = ""; + + memcpy(&vma, vma_tmp, sizeof(vma)); + pr_info("%016llx-%016llx %016llx %s\n", + vma.start, vma.end, vma.vm_flags, name); + + stat_tmp = task_diag_vma_stat(vma_tmp); + if (stat_tmp) + memcpy(&stat, stat_tmp, sizeof(stat)); + else + memset(&stat, 0, sizeof(stat)); + + pr_info( + "Size: %8llu kB\n" + "Rss: %8llu kB\n" + "Pss: %8llu kB\n" + "Shared_Clean: %8llu kB\n" + "Shared_Dirty: %8llu kB\n" + "Private_Clean: %8llu kB\n" + "Private_Dirty: %8llu kB\n" + "Referenced: %8llu kB\n" + "Anonymous: %8llu kB\n" + "AnonHugePages: %8llu kB\n" + "Swap: %8llu kB\n", + (vma.end - vma.start) >> 10, + stat.resident >> 10, + (stat.pss >> (10 + PSS_SHIFT)), + stat.shared_clean >> 10, + stat.shared_dirty >> 10, + stat.private_clean >> 10, + stat.private_dirty >> 10, + stat.referenced >> 10, + stat.anonymous >> 10, + stat.anonymous_thp >> 10, + stat.swap >> 10); + } + } + break; + case TASK_DIAG_STAT: + { + struct task_diag_stat *stat; + stat = NLA_DATA(na); + + pr_info( + "minflt: %llu\n" + "cminflt: %llu\n" + "majflt: %llu\n" + "cmajflt: %llu\n" + "utime: %llu\n" + "stime: %llu\n" + "cutime: %llu\n" + "cstime: %llu\n" + "threads: %u\n", + stat->minflt, + stat->cminflt, + stat->majflt, + stat->cmajflt, + stat->utime, + stat->stime, + stat->cutime, + stat->cstime, + stat->threads); + } + break; + default: + pr_info("Unknown nla_type %d\n", + na->nla_type); + } + na = ((void *) diag_msg) + len; + } + + return 0; +} diff --git a/tools/testing/selftests/task_diag/task_diag_comm.h b/tools/testing/selftests/task_diag/task_diag_comm.h new file mode 100644 index 00000000000000..40e83b79f8b262 --- /dev/null +++ b/tools/testing/selftests/task_diag/task_diag_comm.h @@ -0,0 +1,34 @@ +#ifndef __TASK_DIAG_COMM__ +#define __TASK_DIAG_COMM__ + +#include + +#include "task_diag.h" + +/* + * Generic macros for dealing with netlink 
sockets. Might be duplicated
+ * elsewhere. It is recommended that commercial grade applications use
+ * libnl or libnetlink and use the interfaces provided by the library
+ */
+#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
+#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN))
+#define NLA_PAYLOAD(len) ((len) - NLA_HDRLEN)
+
+#define pr_err(fmt, ...) \
+	fprintf(stderr, "%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
+
+#define pr_perror(fmt, ...) \
+	fprintf(stderr, fmt " : %m\n", ##__VA_ARGS__)
+
+extern int quiet;
+#define pr_info(fmt, arg...) \
+	do { \
+		if (!quiet) \
+			printf(fmt, ##arg); \
+	} while (0)
+
+int nlmsg_receive(void *buf, int len, int (*cb)(struct nlmsghdr *, void *), void *args);
+extern int show_task(struct nlmsghdr *hdr, void *arg);
+
+#endif /* __TASK_DIAG_COMM__ */
diff --git a/tools/testing/selftests/task_diag/task_proc_all.c b/tools/testing/selftests/task_diag/task_proc_all.c
new file mode 100644
index 00000000000000..15b934cd47d147
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_proc_all.c
@@ -0,0 +1,88 @@
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/*
+ * Baseline for the task_diag benchmarks: walk /proc and, for every numeric
+ * entry, open /proc/<pid>/<file> and (unless -R/--noread is given) read from
+ * it once.  <file> is the first non-option argument.
+ *
+ * Prints the number of tasks handled.  Exits with 0 on success and with 1 on
+ * a usage error, when no task could be handled, or on any failure other than
+ * a task exiting in the middle of the scan.
+ */
+int main(int argc, char **argv)
+{
+	DIR *d;
+	int fd, tasks = 0, ret = 1;
+	struct dirent *de;
+	char buf[4096 * 4];
+	static const char short_opts[] = "R";
+	static struct option long_opts[] = {
+		{ "noread", no_argument, 0, 'R'},
+		{}
+	};
+	bool noread = false;
+	int idx, opt;
+
+	while (1) {
+		idx = -1;
+		opt = getopt_long(argc, argv, short_opts, long_opts, &idx);
+		if (opt == -1)
+			break;
+		switch (opt) {
+		case 'R':
+			noread = true;
+			break;
+		default:
+			return 1;
+		}
+	}
+
+	if (optind >= argc)
+		return 1;
+
+	d = opendir("/proc");
+	if (d == NULL)
+		return 1;
+
+	while ((de = readdir(d))) {
+		if (de->d_name[0] < '0' || de->d_name[0] > '9')
+			continue;
+		snprintf(buf, sizeof(buf), "/proc/%s/%s", de->d_name, argv[optind]);
+		fd = open(buf, O_RDONLY);
+		if (fd < 0) {
+			/* The task may have exited since readdir() returned it. */
+			if (errno == ENOENT)
+				continue;
+			goto out;
+		}
+		if (!noread && read(fd, buf, sizeof(buf)) < 0) {
+			/* procfs reports ESRCH when the task exited after open(). */
+			bool gone = errno == ESRCH;
+
+			close(fd);
+			if (gone)
+				continue;
+			goto out;
+		}
+		close(fd);
+		tasks++;
+	}
+
+	/* Not a single task handled: most likely <file> does not exist. */
+	if (tasks == 0)
+		goto out;
+
+	printf("tasks: %d\n", tasks);
+	ret = 0;
+out:
+	closedir(d);
+	return ret;
+}