diff --git a/0000-RFC-kernel-add-a-netlink-interface-to-get-informatio.patch b/0000-RFC-kernel-add-a-netlink-interface-to-get-informatio.patch
new file mode 100644
index 00000000000000..833c1afd041faf
--- /dev/null
+++ b/0000-RFC-kernel-add-a-netlink-interface-to-get-informatio.patch
@@ -0,0 +1,265 @@
+From 29e6df3db77234a44a680344a61eb5bd735f6d8e Mon Sep 17 00:00:00 2001
+From: Andrey Vagin <avagin@openvz.org>
+Date: Mon, 16 Feb 2015 19:20:52 +0300
+Subject: [PATCH 0/15] task_diag: add a new interface to get information
+ about processes (v3)
+
+The current interface is a bunch of files in /proc/PID. While this appears to
+be simple, there are a number of problems with it.
+
+* Lots of syscalls
+
+  At least three syscalls per each PID are required — open(), read(), and
+  close()
+
+* Variety of formats
+
+  There are many different formats used by files in /proc/PID/ hierarchy.
+  Therefore, there is a need to write parser for each such format.
+
+* Non-extendable formats
+
+  Some formats in /proc/PID are non-extendable. For example, /proc/PID/maps
+  last column (file name) is optional, therefore there is no way to add more
+  columns without breaking the format.
+
+* Slow read due to extra info
+  Sometimes getting information is slow due to extra attributes that are not
+  always needed. For example, /proc/PID/smaps contains VmFlags field (which
+  can't be added to /proc/PID/maps, see previous item), but it also contains
+  page stats that take long time to generate.
+
+	$ time cat /proc/*/maps > /dev/null
+	real	0m0.061s
+	user	0m0.002s
+	sys	0m0.059s
+
+
+	$ time cat /proc/*/smaps > /dev/null
+	real	0m0.253s
+	user	0m0.004s
+	sys	0m0.247s
+
+Proposed solution
+-----------------
+
+The proposed solution is the /proc/task-diag file, which operates based on the
+following principles:
+
+* Transactional: write request, read response
+* Netlink message format (same as used by sock_diag; binary and extendable)
+* Ability to specify a set of processes to get info about
+* Optimal grouping of attributes
+  No attribute in a group should noticeably increase the response time
+
+The user-kernel interface is encapsulated in include/uapi/linux/task_diag.h
+
+A request is described by the task_diag_pid structure:
+
+struct task_diag_pid {
+       __u64   show_flags;	/* specify which information is required */
+       __u64   dump_strategy;   /* specify a group of processes */
+
+       __u32   pid;
+};
+
+dump_strategy specifies a group of processes:
+/* system wide strategies (the pid field is ignored) */
+TASK_DIAG_DUMP_ALL	  - all processes
+TASK_DIAG_DUMP_ALL_THREAD - all threads
+/* per-process strategies */
+TASK_DIAG_DUMP_CHILDREN	 - all children
+TASK_DIAG_DUMP_THREAD	 - all threads
+TASK_DIAG_DUMP_ONE	 - one process
+
+show_flags specifies which information is required.  If we set the
+TASK_DIAG_SHOW_BASE flag, the response message will contain the TASK_DIAG_BASE
+attribute which is described by the task_diag_base structure.
+
+struct task_diag_base {
+	__u32	tgid;
+	__u32	pid;
+	__u32	ppid;
+	__u32	tpid;
+	__u32	sid;
+	__u32	pgid;
+	__u8	state;
+	char	comm[TASK_DIAG_COMM_LEN];
+};
+
+In the future, it can be extended with optional attributes. The request
+describes which task properties are required and for which processes they
+are required.
+
+A response can be divided into a few netlink packets. Each task is described
+by a netlink message. If all information about a process doesn't fit into a
+message, the TASK_DIAG_FLAG_CONT flag will be set and the next message will
+continue describing the same process.
+
+Task diag is much faster than the proc file system. We don't need to create
+a new file descriptor for each task; we only need to send a request and read
+the response. Information about several tasks can be obtained in one
+request-response iteration.
+
+As for security, task_diag always works as procfs with hidepid = 2 (highest
+level of security).
+
+I have compared performance of procfs and task-diag for the
+"ps ax -o pid,ppid" command.
+
+ps uses /proc/PID/* files:
+$ time ./ps/pscommand ax | wc -l
+50089
+
+real    0m1.596s
+user    0m0.475s
+sys     0m1.126s
+
+ps uses the task_diag interface
+$ time ./ps/pscommand ax | wc -l
+50089
+
+real    0m0.148s
+user    0m0.069s
+sys     0m0.086s
+
+Read /proc/PID/stat for 30K tasks:
+$ time ./task_proc_all > /dev/null
+
+real	0m0.258s
+user	0m0.019s
+sys	0m0.232s
+
+Get the same information via task_diag:
+$ time ./task_diag_all > /dev/null
+
+real	0m0.052s
+user	0m0.013s
+sys	0m0.036s
+
+And here are statistics on syscalls which were called by each
+command.
+
+$ perf trace -s -o log -- ./task_proc_all > /dev/null
+
+ Summary of events:
+
+ task_proc_all (30781), 180785 events, 100.0%, 0.000 msec
+
+   syscall            calls      min       avg       max      stddev
+                               (msec)    (msec)    (msec)        (%)
+   --------------- -------- --------- --------- ---------     ------
+   read               30111     0.000     0.013     0.107      0.21%
+   write                  1     0.008     0.008     0.008      0.00%
+   open               30111     0.007     0.012     0.145      0.24%
+   close              30112     0.004     0.011     0.110      0.20%
+   fstat                  3     0.009     0.013     0.016     16.15%
+   mmap                   8     0.011     0.020     0.027     11.24%
+   mprotect               4     0.019     0.023     0.028      8.33%
+   munmap                 1     0.026     0.026     0.026      0.00%
+   brk                    8     0.007     0.015     0.024     11.94%
+   ioctl                  1     0.007     0.007     0.007      0.00%
+   access                 1     0.019     0.019     0.019      0.00%
+   execve                 1     0.000     0.000     0.000      0.00%
+   getdents              29     0.008     1.010     2.215      8.88%
+   arch_prctl             1     0.016     0.016     0.016      0.00%
+   openat                 1     0.021     0.021     0.021      0.00%
+
+
+$ perf trace -s -o log -- ./task_diag_all > /dev/null
+ Summary of events:
+
+ task_diag_all (30762), 717 events, 98.9%, 0.000 msec
+
+   syscall            calls      min       avg       max      stddev
+                               (msec)    (msec)    (msec)        (%)
+   --------------- -------- --------- --------- ---------     ------
+   read                   2     0.000     0.008     0.016    100.00%
+   write                197     0.008     0.019     0.041      3.00%
+   open                   2     0.023     0.029     0.036     22.45%
+   close                  3     0.010     0.012     0.014     11.34%
+   fstat                  3     0.012     0.044     0.106     70.52%
+   mmap                   8     0.014     0.031     0.054     18.88%
+   mprotect               4     0.016     0.023     0.027     10.93%
+   munmap                 1     0.022     0.022     0.022      0.00%
+   brk                    1     0.040     0.040     0.040      0.00%
+   ioctl                  1     0.011     0.011     0.011      0.00%
+   access                 1     0.032     0.032     0.032      0.00%
+   getpid                 1     0.012     0.012     0.012      0.00%
+   socket                 1     0.032     0.032     0.032      0.00%
+   sendto                 2     0.032     0.095     0.157     65.77%
+   recvfrom             129     0.009     0.235     0.418      2.45%
+   bind                   1     0.018     0.018     0.018      0.00%
+   execve                 1     0.000     0.000     0.000      0.00%
+   arch_prctl             1     0.012     0.012     0.012      0.00%
+
+You can find the test programs from this experiment in tools/testing/selftests/task_diag.
+
+The idea of this functionality was suggested by Pavel Emelyanov (xemul@),
+when he found that operations on /proc make up a significant part
+of checkpointing time.
+
+Ten years ago there was an attempt to add a netlink interface for accessing
+/proc information:
+http://lwn.net/Articles/99600/
+
+Links
+-----
+
+kernel: https://github.com/avagin/linux-task-diag
+procps: https://github.com/avagin/procps-task-diag
+wiki: https://criu.org/Task-diag
+
+Changes from the first version:
+-------------------------------
+
+David Ahern implemented all required functionality to use task_diag in
+perf.
+
+Below you can find his results, which show how it affects performance.
+> Using the fork test command:
+>    10,000 processes; 10k proc with 5 threads = 50,000 tasks
+>    reading /proc: 11.3 sec
+>    task_diag:      2.2 sec
+>
+> @7,440 tasks, reading /proc is at 0.77 sec and task_diag at 0.096
+>
+> 128 instances of sepcjbb, 80,000+ tasks:
+>     reading /proc: 32.1 sec
+>     task_diag:      3.9 sec
+>
+> So overall much snappier startup times.
+
+Many thanks to David Ahern for the help with improving task_diag.
+
+Changes from the second version:
+--------------------------------
+
+Use a proc transaction file instead of the netlink interface.
+Andy Lutomirski pointed out security problems related to netlink sockets:
+
+> Slightly off-topic, but this netlink is really rather bad as an
+> example of how fds can be used as capabilities (in the real capability
+> sense, not the Linux capabilities sense).  You call socket and get a
+> socket.  That socket captures f_cred.  Then you drop privs, and you
+> assume that the socket you're holding on to retains the right to do
+> certain things.
+>
+> This breaks pretty badly when, through things such as this patch set,
+> existing code that creates netlink sockets suddenly starts capturing
+> brand-new rights that didn't exist as part of a netlink socket before.
+
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Cyrill Gorcunov <gorcunov@openvz.org>
+Cc: Pavel Emelyanov <xemul@parallels.com>
+Cc: Roger Luethi <rl@hellgate.ch>
+Cc: Arnd Bergmann <arnd@arndb.de>
+Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
+Cc: David Ahern <dsahern@gmail.com>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Pavel Odintsov <pavel.odintsov@gmail.com>
+Signed-off-by: Andrey Vagin <avagin@openvz.org>
+--
+2.1.0
+
diff --git a/Documentation/accounting/task_diag.txt b/Documentation/accounting/task_diag.txt
new file mode 100644
index 00000000000000..ff486b91dc382a
--- /dev/null
+++ b/Documentation/accounting/task_diag.txt
@@ -0,0 +1,57 @@
+The task-diag interface provides information about running processes (roughly
+the same info that is now available from /proc/PID/* files). Compared to
+/proc/PID/* files, it is faster, more flexible and provides data in a binary
+format. Task-diag was created using the basic idea of sock_diag.
+
+Interface
+---------
+
+Here is the /proc/task-diag file, which operates based on the following
+principles:
+
+* Transactional: write request, read response
+* Netlink message format (same as used by sock_diag; binary and extendable)
+
+The user-kernel interface is encapsulated in include/uapi/linux/task_diag.h
+
+Request
+-------
+
+A request is described by the task_diag_pid structure.
+
+struct task_diag_pid {
+	__u64	show_flags;	/* TASK_DIAG_SHOW_* */
+	__u64	dump_strategy;	/* TASK_DIAG_DUMP_* */
+
+	__u32	pid;
+};
+
+dump_strategy specifies a group of processes:
+/* per-process strategies */
+TASK_DIAG_DUMP_CHILDREN	- all children
+TASK_DIAG_DUMP_THREAD	- all threads
+TASK_DIAG_DUMP_ONE	- one process
+/* system wide strategies (the pid field is ignored) */
+TASK_DIAG_DUMP_ALL	  - all processes
+TASK_DIAG_DUMP_ALL_THREAD - all threads
+
+show_flags specifies which information is required.  If we set the
+TASK_DIAG_SHOW_BASE flag, the response message will contain the TASK_DIAG_BASE
+attribute which is described by the task_diag_base structure.
+
+In the future, it can be extended with optional attributes. The request
+describes which task properties are required and for which processes they
+are required.
+
+Response
+--------
+
+A response can be divided into a few packets. Each task is described by a
+netlink message. If all information about a process doesn't fit into a message,
+the TASK_DIAG_FLAG_CONT flag will be set and the next message will continue
+describing the same process.
+
+Examples
+--------
+
+A few examples can be found in tools/testing/selftests/task_diag/
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 817c02b13b1d54..a5ece4874607ad 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -97,3 +97,16 @@ config PROC_CHILDREN
 
 	  Say Y if you are running any user-space software which takes benefit from
 	  this interface. For example, rkt is such a piece of software.
+
+config TASK_DIAG
+	bool "Task-diag support (/proc/task-diag)"
+	depends on NET
+	default n
+	help
+	  Export selected properties for tasks/processes through the /proc/task-diag
+	  transaction file. Unlike the rest of the proc file system, task_diag
+	  returns information in a binary format (netlink) and lets the caller
+	  specify which properties are required.
+
+	  Say N if unsure.
+
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ead487e8051087..d8ecc1f65a3519 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -33,3 +33,6 @@ proc-$(CONFIG_PROC_KCORE)	+= kcore.o
 proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PRINTK)	+= kmsg.o
 proc-$(CONFIG_PROC_PAGE_MONITOR)	+= page.o
+
+obj-$(CONFIG_TASK_DIAG) += task_diag.o
+
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0ceb3b6b37e731..9e4390866221ca 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -651,31 +651,25 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
 }
 
 #ifdef CONFIG_PROC_CHILDREN
-static struct pid *
-get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
+struct task_struct *task_next_child(struct task_struct *parent,
+				struct task_struct *prev, loff_t pos)
 {
-	struct task_struct *start, *task;
-	struct pid *pid = NULL;
-
-	read_lock(&tasklist_lock);
-
-	start = pid_task(proc_pid(inode), PIDTYPE_PID);
-	if (!start)
-		goto out;
+	struct task_struct *task;
 
 	/*
 	 * Lets try to continue searching first, this gives
 	 * us significant speedup on children-rich processes.
 	 */
-	if (pid_prev) {
-		task = pid_task(pid_prev, PIDTYPE_PID);
-		if (task && task->real_parent == start &&
+	if (prev) {
+		task = prev;
+		if (task && task->real_parent == parent &&
 		    !(list_empty(&task->sibling))) {
-			if (list_is_last(&task->sibling, &start->children))
+			if (list_is_last(&task->sibling, &parent->children)) {
+				task = NULL;
 				goto out;
+			}
 			task = list_first_entry(&task->sibling,
 						struct task_struct, sibling);
-			pid = get_pid(task_pid(task));
 			goto out;
 		}
 	}
@@ -695,12 +689,31 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
 	 * So one need to stop or freeze the leader and all
 	 * its children to get a precise result.
 	 */
-	list_for_each_entry(task, &start->children, sibling) {
-		if (pos-- == 0) {
-			pid = get_pid(task_pid(task));
-			break;
-		}
+	list_for_each_entry(task, &parent->children, sibling) {
+		if (pos-- == 0)
+			goto out;
 	}
+	task = NULL;
+out:
+	return task;
+}
+
+static struct pid *
+get_children_pid(struct inode *inode, struct pid *prev_pid, loff_t pos)
+{
+	struct task_struct *start, *task, *prev;
+	struct pid *pid = NULL;
+
+	read_lock(&tasklist_lock);
+	start = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (!start)
+		goto out;
+
+	prev = prev_pid ? pid_task(prev_pid, PIDTYPE_PID) : NULL;
+
+	task = task_next_child(start, prev, pos);
+	if (task)
+		pid = get_pid(task_pid(task));
 
 out:
 	read_unlock(&tasklist_lock);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7e9f07bf260d20..8278e93ca2b45d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3169,11 +3169,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
  * Find the first task with tgid >= tgid
  *
  */
-struct tgid_iter {
-	unsigned int tgid;
-	struct task_struct *task;
-};
-static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
+struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
 {
 	struct pid *pid;
 
@@ -3476,7 +3472,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
  * In the case of a seek we start with the leader and walk nr
  * threads past it.
  */
-static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
+struct task_struct *task_first_tid(struct pid *pid, int tid, loff_t f_pos,
 					struct pid_namespace *ns)
 {
 	struct task_struct *pos, *task;
@@ -3525,7 +3521,7 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
  *
  * The reference to the input task_struct is released.
  */
-static struct task_struct *next_tid(struct task_struct *start)
+struct task_struct *task_next_tid(struct task_struct *start)
 {
 	struct task_struct *pos = NULL;
 	rcu_read_lock();
@@ -3561,9 +3557,9 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 	ns = proc_pid_ns(inode);
 	tid = (int)file->f_version;
 	file->f_version = 0;
-	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
+	for (task = task_first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
 	     task;
-	     task = next_tid(task), ctx->pos++) {
+	     task = task_next_tid(task), ctx->pos++) {
 		char name[10 + 1];
 		unsigned int len;
 		tid = task_pid_nr_ns(task, ns);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5185d7f6a51ee8..9851bea31ece3b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -304,3 +304,40 @@ extern unsigned long task_statm(struct mm_struct *,
 				unsigned long *, unsigned long *,
 				unsigned long *, unsigned long *);
 extern void task_mem(struct seq_file *, struct mm_struct *);
+
+struct tgid_iter {
+	unsigned int tgid;
+	struct task_struct *task;
+};
+struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter);
+
+struct task_struct *task_next_child(struct task_struct *parent,
+				struct task_struct *prev, loff_t pos);
+struct task_struct *task_first_tid(struct pid *pid, int tid, loff_t f_pos,
+					struct pid_namespace *ns);
+struct task_struct *task_next_tid(struct task_struct *start);
+
+struct mem_size_stats {
+	bool first;
+	unsigned long resident;
+	unsigned long shared_clean;
+	unsigned long shared_dirty;
+	unsigned long private_clean;
+	unsigned long private_dirty;
+	unsigned long referenced;
+	unsigned long anonymous;
+	unsigned long lazyfree;
+	unsigned long anonymous_thp;
+	unsigned long shmem_thp;
+	unsigned long swap;
+	unsigned long shared_hugetlb;
+	unsigned long private_hugetlb;
+	u64 pss;
+	u64 pss_locked;
+	u64 swap_pss;
+	bool check_shmem_swap;
+};
+
+struct mm_walk;
+int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			   struct mm_walk *walk);
diff --git a/fs/proc/task_diag.c b/fs/proc/task_diag.c
new file mode 100644
index 00000000000000..5269906a447cf7
--- /dev/null
+++ b/fs/proc/task_diag.c
@@ -0,0 +1,1062 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/task_diag.h>
+#include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/cputime.h>
+
+#include <net/netlink.h>
+
+#include "internal.h"
+
+struct task_diag_cb {
+	struct sk_buff		*req;
+	struct sk_buff		*resp;
+	const struct nlmsghdr	*nlh;	/* nlmsg_seq is copied into replies */
+	loff_t			pos;
+	pid_t			pid;
+	int			attr;	/* first attribute index to fill; see task_diag_fill() */
+	union { /* per-attribute */
+		struct {
+			unsigned long mark; /* vm_start of the last VMA written by fill_vma(), 0 when done */
+		} vma;
+	};
+};
+
+/*
+ * TASK_DIAG_* state codes, indexed by task_state_index(). The table
+ * mirrors the kernel's state "bitmap" of reasons to sleep, where
+ * "running" is zero; get_task_state() checks at build time that it
+ * has 1 + ilog2(TASK_REPORT_MAX) entries.
+ */
+static const __u8 task_state_array[] = {
+	TASK_DIAG_RUNNING,
+	TASK_DIAG_INTERRUPTIBLE,
+	TASK_DIAG_UNINTERRUPTIBLE,
+	TASK_DIAG_STOPPED,
+	TASK_DIAG_TRACE_STOP,
+	TASK_DIAG_DEAD,
+	TASK_DIAG_ZOMBIE,
+	TASK_DIAG_PARKED,
+	TASK_DIAG_IDLE,
+};
+
+static inline __u8 get_task_state(struct task_struct *tsk)
+{
+	/* Map the task_state_index() of @tsk to its TASK_DIAG_* state code. */
+	BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
+	return task_state_array[task_state_index(tsk)];
+}
+
+/*
+ * Add a TASK_DIAG_BASE attribute describing @p to @skb, with ids as seen
+ * from @ns. Returns 0 on success or -EMSGSIZE if @skb has no room for it.
+ */
+static int fill_task_base(struct task_struct *p,
+			  struct sk_buff *skb, struct pid_namespace *ns)
+{
+	struct task_diag_base *base;
+	struct nlattr *attr;
+	char tcomm[sizeof(p->comm)];
+	struct task_struct *tracer;
+
+	attr = nla_reserve(skb, TASK_DIAG_BASE, sizeof(struct task_diag_base));
+	if (!attr)
+		return -EMSGSIZE;
+
+	/* Zero the payload so no uninitialized skb bytes (padding) are sent. */
+	base = nla_data(attr);
+	memset(base, 0, sizeof(*base));
+
+	rcu_read_lock();
+	base->ppid = pid_alive(p) ?
+		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+	tracer = ptrace_parent(p);
+	if (tracer)	/* tpid stays 0 when ptrace_parent() returns NULL */
+		base->tpid = task_pid_nr_ns(tracer, ns);
+	base->tgid = task_tgid_nr_ns(p, ns);
+	base->pid  = task_pid_nr_ns(p, ns);
+	base->sid  = task_session_nr_ns(p, ns);
+	base->pgid = task_pgrp_nr_ns(p, ns);
+	rcu_read_unlock();
+
+	/* comm was zeroed above; copying one byte less keeps it terminated. */
+	get_task_comm(tcomm, p);
+	strncpy(base->comm, tcomm, TASK_DIAG_COMM_LEN - 1);
+	base->state = get_task_state(p);
+	return 0;
+}
+
+static inline void caps2diag(struct task_diag_caps *diag, const kernel_cap_t *cap)
+{
+	int word = _LINUX_CAPABILITY_U32S_3;
+
+	while (word--)	/* copy each capability word of @cap into @diag */
+		diag->cap[word] = cap->cap[word];
+}
+
+/*
+ * Add a TASK_DIAG_CRED attribute to @skb: the capability sets of @p and its
+ * user/group ids mapped into @user_ns. Returns 0, or -EMSGSIZE if no room.
+ */
+static int fill_creds(struct task_struct *p, struct sk_buff *skb,
+					struct user_namespace *user_ns)
+{
+	struct nlattr *nla = nla_reserve(skb, TASK_DIAG_CRED,
+					 sizeof(struct task_diag_creds));
+	struct task_diag_creds *out;
+	const struct cred *c;
+
+	if (!nla)
+		return -EMSGSIZE;
+	out = nla_data(nla);
+
+	c = get_task_cred(p);	/* paired with put_cred() below */
+	out->uid   = from_kuid_munged(user_ns, c->uid);
+	out->gid   = from_kgid_munged(user_ns, c->gid);
+	out->euid  = from_kuid_munged(user_ns, c->euid);
+	out->egid  = from_kgid_munged(user_ns, c->egid);
+	out->suid  = from_kuid_munged(user_ns, c->suid);
+	out->sgid  = from_kgid_munged(user_ns, c->sgid);
+	out->fsuid = from_kuid_munged(user_ns, c->fsuid);
+	out->fsgid = from_kgid_munged(user_ns, c->fsgid);
+	caps2diag(&out->cap_inheritable, &c->cap_inheritable);
+	caps2diag(&out->cap_permitted, &c->cap_permitted);
+	caps2diag(&out->cap_effective, &c->cap_effective);
+	caps2diag(&out->cap_bset, &c->cap_bset);
+	put_cred(c);
+
+	return 0;
+}
+
+/* Translate the VM_* bits set in vma->vm_flags into TASK_DIAG_VMA_F_* bits. */
+static u64 get_vma_flags(struct vm_area_struct *vma)
+{
+	/*
+	 * Indexed by VM_* bit number. Bits we don't know about keep the
+	 * default value 0 and so contribute nothing to the result.
+	 */
+	static const u64 diag_bit[BITS_PER_LONG] = {
+		[0 ... (BITS_PER_LONG-1)] = 0,
+		[ilog2(VM_READ)]	= TASK_DIAG_VMA_F_READ,
+		[ilog2(VM_WRITE)]	= TASK_DIAG_VMA_F_WRITE,
+		[ilog2(VM_EXEC)]	= TASK_DIAG_VMA_F_EXEC,
+		[ilog2(VM_SHARED)]	= TASK_DIAG_VMA_F_SHARED,
+		[ilog2(VM_MAYREAD)]	= TASK_DIAG_VMA_F_MAYREAD,
+		[ilog2(VM_MAYWRITE)]	= TASK_DIAG_VMA_F_MAYWRITE,
+		[ilog2(VM_MAYEXEC)]	= TASK_DIAG_VMA_F_MAYEXEC,
+		[ilog2(VM_MAYSHARE)]	= TASK_DIAG_VMA_F_MAYSHARE,
+		[ilog2(VM_GROWSDOWN)]	= TASK_DIAG_VMA_F_GROWSDOWN,
+		[ilog2(VM_PFNMAP)]	= TASK_DIAG_VMA_F_PFNMAP,
+		[ilog2(VM_DENYWRITE)]	= TASK_DIAG_VMA_F_DENYWRITE,
+#ifdef CONFIG_X86_INTEL_MPX
+		[ilog2(VM_MPX)]		= TASK_DIAG_VMA_F_MPX,
+#endif
+		[ilog2(VM_LOCKED)]	= TASK_DIAG_VMA_F_LOCKED,
+		[ilog2(VM_IO)]		= TASK_DIAG_VMA_F_IO,
+		[ilog2(VM_SEQ_READ)]	= TASK_DIAG_VMA_F_SEQ_READ,
+		[ilog2(VM_RAND_READ)]	= TASK_DIAG_VMA_F_RAND_READ,
+		[ilog2(VM_DONTCOPY)]	= TASK_DIAG_VMA_F_DONTCOPY,
+		[ilog2(VM_DONTEXPAND)]	= TASK_DIAG_VMA_F_DONTEXPAND,
+		[ilog2(VM_ACCOUNT)]	= TASK_DIAG_VMA_F_ACCOUNT,
+		[ilog2(VM_NORESERVE)]	= TASK_DIAG_VMA_F_NORESERVE,
+		[ilog2(VM_HUGETLB)]	= TASK_DIAG_VMA_F_HUGETLB,
+		[ilog2(VM_ARCH_1)]	= TASK_DIAG_VMA_F_ARCH_1,
+		[ilog2(VM_DONTDUMP)]	= TASK_DIAG_VMA_F_DONTDUMP,
+#ifdef CONFIG_MEM_SOFT_DIRTY
+		[ilog2(VM_SOFTDIRTY)]	= TASK_DIAG_VMA_F_SOFTDIRTY,
+#endif
+		[ilog2(VM_MIXEDMAP)]	= TASK_DIAG_VMA_F_MIXEDMAP,
+		[ilog2(VM_HUGEPAGE)]	= TASK_DIAG_VMA_F_HUGEPAGE,
+		[ilog2(VM_NOHUGEPAGE)]	= TASK_DIAG_VMA_F_NOHUGEPAGE,
+		[ilog2(VM_MERGEABLE)]	= TASK_DIAG_VMA_F_MERGEABLE,
+	};
+	unsigned long rest = vma->vm_flags;
+	u64 diag = 0;
+	size_t bit;
+
+	for (bit = 0; rest; bit++, rest >>= 1)
+		if (rest & 1)
+			diag |= diag_bit[bit];
+
+	return diag;
+}
+
+/*
+ * Describe @vma in @diag_vma: address range, translated flags and, for
+ * file mappings, the device, inode, generation and file offset.
+ *
+ * A zeroed tmp variable is filled and then copied out to deal with
+ * alignment issues: diag_vma contains u64 elements, which means extended
+ * load operations can be used and those can require 8-byte alignment
+ * (e.g., sparc). Zeroing it first also keeps uninitialized stack bytes
+ * (padding, fields set later by the caller) out of the netlink message.
+ */
+static void fill_diag_vma(struct vm_area_struct *vma,
+			  struct task_diag_vma *diag_vma)
+{
+	struct task_diag_vma tmp;
+
+	memset(&tmp, 0, sizeof(tmp));
+
+	tmp.start = vma->vm_start;
+	tmp.end = vma->vm_end;
+	tmp.vm_flags = get_vma_flags(vma);
+
+	/* Mappings without a file keep the zeroed file fields. */
+	if (vma->vm_file) {
+		struct inode *inode = file_inode(vma->vm_file);
+		dev_t dev;
+
+		dev = inode->i_sb->s_dev;
+		tmp.major = MAJOR(dev);
+		tmp.minor = MINOR(dev);
+		tmp.inode = inode->i_ino;
+		tmp.generation = inode->i_generation;
+		tmp.pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
+	}
+
+	memcpy(diag_vma, &tmp, sizeof(*diag_vma));
+}
+
+/*
+ * Pick a name for @vma: the path of the mapped file (rendered into @page by
+ * d_path(), so the result may be an ERR_PTR), else the name given by
+ * vm_ops->name(), else whatever arch_vma_name() returns (may be NULL).
+ */
+static const char *get_vma_name(struct vm_area_struct *vma, char *page)
+{
+	const char *name;
+
+	if (vma->vm_file)
+		return d_path(&vma->vm_file->f_path, page, PAGE_SIZE);
+
+	if (vma->vm_ops && vma->vm_ops->name) {
+		name = vma->vm_ops->name(vma);
+		if (name)
+			return name;
+	}
+
+	return arch_vma_name(vma);
+}
+
+/*
+ * Collect smaps-style counters for @vma into @stat (caller holds mmap_sem).
+ */
+static void fill_diag_vma_stat(struct vm_area_struct *vma,
+				struct task_diag_vma_stat *stat)
+{
+	struct mem_size_stats acc;
+	struct task_diag_vma_stat out;
+	struct mm_walk walk = {
+		.mm = vma->vm_mm,
+		.pmd_entry = smaps_pte_range,
+		.private = &acc,
+	};
+
+	memset(&out, 0, sizeof(out));
+	memset(&acc, 0, sizeof(acc));
+	walk_page_vma(vma, &walk);
+
+	out.swap		= acc.swap;
+	out.anonymous_thp	= acc.anonymous_thp;
+	out.anonymous		= acc.anonymous;
+	out.referenced		= acc.referenced;
+	out.private_dirty	= acc.private_dirty;
+	out.private_clean	= acc.private_clean;
+	out.shared_clean	= acc.shared_clean;
+	out.pss			= acc.pss;
+	out.resident		= acc.resident;
+	memcpy(stat, &out, sizeof(*stat));
+}
+
+/*
+ * Add a TASK_DIAG_VMA attribute with one record per VMA of @p; each record
+ * is a task_diag_vma optionally followed by a task_diag_vma_stat and the
+ * mapping name. When @skb fills up, the vm_start of the last complete record
+ * is saved in cb->vma.mark so that the next call continues after it.
+ * Returns 0 when all VMAs are dumped (or @p has no mm), otherwise an error.
+ */
+static int fill_vma(struct task_struct *p, struct sk_buff *skb,
+		    struct task_diag_cb *cb, bool *progress, u64 show_flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct nlattr *attr = NULL;
+	struct task_diag_vma *diag_vma;
+	unsigned long mark = 0;
+	char *page;
+	int rc = -EMSGSIZE, size;
+
+	if (cb)
+		mark = cb->vma.mark;
+
+	mm = get_task_mm(p);	/* reads p->mm under task_lock() */
+	if (!mm)
+		return 0;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page) {
+		mmput(mm);
+		return -ENOMEM;
+	}
+
+	size = NLA_ALIGN(sizeof(struct task_diag_vma));
+	if (show_flags & TASK_DIAG_SHOW_VMA_STAT)
+		size += NLA_ALIGN(sizeof(struct task_diag_vma_stat));
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		unsigned char *b = skb_tail_pointer(skb);
+		const char *name;
+		void *pfile;
+
+		/* already sent by a previous call */
+		if (mark >= vma->vm_start)
+			continue;
+
+		/* setup pointer for next map */
+		if (attr == NULL) {
+			attr = nla_reserve(skb, TASK_DIAG_VMA, size);
+			if (!attr)
+				goto err;
+			diag_vma = nla_data(attr);
+		} else {
+			diag_vma = nla_reserve_nohdr(skb, size);
+			if (diag_vma == NULL) {
+				nlmsg_trim(skb, b);
+				goto out;
+			}
+		}
+
+		fill_diag_vma(vma, diag_vma);
+
+		if (show_flags & TASK_DIAG_SHOW_VMA_STAT) {
+			struct task_diag_vma_stat *stat;
+
+			stat = (void *) diag_vma + NLA_ALIGN(sizeof(*diag_vma));
+			fill_diag_vma_stat(vma, stat);
+			diag_vma->stat_len = sizeof(struct task_diag_vma_stat);
+			diag_vma->stat_off = (void *) stat - (void *)diag_vma;
+		} else {
+			diag_vma->stat_len = 0;
+			diag_vma->stat_off = 0;
+		}
+
+		name = get_vma_name(vma, page);
+		if (IS_ERR(name)) {
+			nlmsg_trim(skb, b);
+			rc = PTR_ERR(name);
+			goto out;
+		}
+
+		if (name) {
+			diag_vma->name_len = strlen(name) + 1;
+			/* reserves NLA_ALIGN(len) */
+			pfile = nla_reserve_nohdr(skb, diag_vma->name_len);
+			if (pfile == NULL) {
+				nlmsg_trim(skb, b);
+				goto out;
+			}
+			diag_vma->name_off = pfile - (void *) diag_vma;
+			memcpy(pfile, name, diag_vma->name_len);
+		} else {
+			diag_vma->name_len = 0;
+			diag_vma->name_off = 0;
+		}
+
+		mark = vma->vm_start;
+		diag_vma->vma_len = skb_tail_pointer(skb) - (unsigned char *) diag_vma;
+		*progress = true;
+	}
+
+	rc = 0;
+	mark = 0;
+out:
+	if (*progress)
+		attr->nla_len = skb_tail_pointer(skb) - (unsigned char *) attr;
+err:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	free_page((unsigned long) page);
+	if (cb)
+		cb->vma.mark = mark;
+
+	return rc;
+}
+
+/*
+ * Add a TASK_DIAG_STAT attribute (page faults, cpu times, thread count) for
+ * @task to @skb; with @whole set, live threads and signal_struct totals are
+ * summed. Returns 0, or -EMSGSIZE if @skb has no room.
+ */
+static int fill_task_stat(struct task_struct *task, struct sk_buff *skb, int whole)
+{
+	struct task_diag_stat *st;
+	struct nlattr *attr;
+	int num_threads = 0;
+	unsigned long cmin_flt = 0, cmaj_flt = 0;
+	unsigned long  min_flt = 0,  maj_flt = 0;
+	u64 cutime, cstime, utime, stime;
+	u64 cgtime, gtime;	/* NOTE(review): computed below but never stored in @st */
+	unsigned long flags;
+
+	attr = nla_reserve(skb, TASK_DIAG_STAT, sizeof(struct task_diag_stat));
+	if (!attr)
+		return -EMSGSIZE;
+	st = nla_data(attr);
+
+	cutime = cstime = utime = stime = 0;
+	cgtime = gtime = 0;
+	if (lock_task_sighand(task, &flags)) {
+		struct signal_struct *sig = task->signal;
+
+		num_threads = get_nr_threads(task);
+		cmin_flt = sig->cmin_flt;
+		cmaj_flt = sig->cmaj_flt;
+		cutime = sig->cutime;
+		cstime = sig->cstime;
+		cgtime = sig->cgtime;
+
+		/* add up live thread stats at the group level */
+		if (whole) {
+			struct task_struct *t = task;
+
+			do {
+				min_flt += t->min_flt;
+				maj_flt += t->maj_flt;
+				gtime += task_gtime(t);
+			} while_each_thread(task, t);
+
+			min_flt += sig->min_flt;
+			maj_flt += sig->maj_flt;
+			thread_group_cputime_adjusted(task, &utime, &stime);
+			gtime += sig->gtime;
+		}
+
+		unlock_task_sighand(task, &flags);
+	}
+
+	if (!whole) {	/* report the values of @task alone */
+		min_flt = task->min_flt;
+		maj_flt = task->maj_flt;
+		task_cputime_adjusted(task, &utime, &stime);
+		gtime = task_gtime(task);
+	}
+
+	st->minflt	= min_flt;
+	st->cminflt	= cmin_flt;
+	st->majflt	= maj_flt;
+	st->cmajflt	= cmaj_flt;
+	st->utime	= nsec_to_clock_t(utime);
+	st->stime	= nsec_to_clock_t(stime);
+	st->cutime	= nsec_to_clock_t(cutime);
+	st->cstime	= nsec_to_clock_t(cstime);
+	st->threads	= num_threads;
+
+	return 0;
+}
+
+/*
+ * Add a TASK_DIAG_STATM attribute with a snapshot of the memory counters of
+ * @task's mm to @skb. Returns 0 on success or if the task has no mm, and
+ * -EMSGSIZE if @skb has no room. @whole is not used here.
+ */
+static int fill_task_statm(struct task_struct *task, struct sk_buff *skb, int whole)
+{
+	unsigned long anon, file, shmem, swap, rss, vm, peak_rss, peak_vm;
+	unsigned long text, lib, ptes, data, stack, locked, pinned;
+	struct mm_struct *mm = get_task_mm(task);
+	struct task_diag_statm *st;
+	struct nlattr *attr;
+
+	if (!mm)
+		return 0;
+
+	/* per-type mm counters and the totals derived from them */
+	anon  = get_mm_counter(mm, MM_ANONPAGES);
+	file  = get_mm_counter(mm, MM_FILEPAGES);
+	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+	swap  = get_mm_counter(mm, MM_SWAPENTS);
+	rss   = anon + file + shmem;
+	vm    = mm->total_vm;
+
+	/*
+	 * The mm updates hiwater_vm and hiwater_rss only when total_vm or rss
+	 * is about to be lowered, so the real peak is the larger of the
+	 * recorded high-water mark and the current value. No barriers are
+	 * used: such snapshots can always be inconsistent.
+	 */
+	peak_vm  = vm < mm->hiwater_vm ? mm->hiwater_vm : vm;
+	peak_rss = rss < mm->hiwater_rss ? mm->hiwater_rss : rss;
+
+	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT;
+	lib  = mm->exec_vm - text;
+	ptes = mm_pgtables_bytes(mm);
+
+	data   = mm->data_vm;
+	stack  = mm->stack_vm;
+	locked = mm->locked_vm;
+	pinned = mm->pinned_vm;
+
+	/* everything is sampled: the mm reference is not needed any more */
+	mmput(mm);
+
+	attr = nla_reserve(skb, TASK_DIAG_STATM, sizeof(*st));
+	if (!attr)
+		return -EMSGSIZE;
+	st = nla_data(attr);
+
+	st->total_vm	= vm;
+	st->hiwater_vm	= peak_vm;
+	st->total_rss	= rss;
+	st->hiwater_rss	= peak_rss;
+	st->anon	= anon;
+	st->file	= file;
+	st->shmem	= shmem;
+	st->swap	= swap;
+	st->text	= text;
+	st->lib		= lib;
+	st->ptes	= ptes;
+	st->data_vm	= data;
+	st->stack_vm	= stack;
+	st->locked_vm	= locked;
+	st->pinned_vm	= pinned;
+
+	return 0;
+}
+
+/*
+ * Append a TASK_DIAG_CMDLINE attribute holding the raw bytes of the
+ * argument area [mm->arg_start, mm->arg_end) of @tsk.
+ *
+ * If the area cannot be read completely the attribute is removed again
+ * and 0 is returned, so a partially readable command line is silently
+ * omitted rather than reported as an error.
+ *
+ * Returns 0 on success or when nothing is emitted (no mm, inconsistent
+ * boundaries, short read), and -EMSGSIZE when @skb has no room.
+ */
+static int fill_task_cmdline(struct task_struct *tsk, struct sk_buff *skb)
+{
+	unsigned long arg_start, arg_end;
+	struct nlattr *attr;
+	long nr_read, len;
+	struct mm_struct *mm;
+	void *pos;
+	int err = 0;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+	arg_start = mm->arg_start;
+	arg_end = mm->arg_end;
+	up_read(&mm->mmap_sem);
+
+	/* Do not crash the kernel on inconsistent boundaries, just skip. */
+	if (arg_start > arg_end)
+		goto out;
+
+	len = arg_end - arg_start;
+
+	/* Remember the tail so the attribute can be rolled back on a short read. */
+	pos = nlmsg_get_pos(skb);
+
+	attr = nla_reserve(skb, TASK_DIAG_CMDLINE, len);
+	if (!attr) {
+		err = -EMSGSIZE;
+		goto out;
+	}
+
+	nr_read = access_remote_vm(mm, arg_start, nla_data(attr), len, 0);
+	if (nr_read != len)
+		nlmsg_trim(skb, pos);
+
+out:
+	/* Drop the reference taken by get_task_mm() on every path. */
+	mmput(mm);
+	return err;
+}
+
+/*
+ * Build one response message describing @tsk in @skb.
+ *
+ * The message starts with a struct task_diag_msg followed by one attribute
+ * group per bit set in req->show_flags.  Groups are numbered in emission
+ * order by @i; when a previous call ran out of space, cb->attr holds the
+ * index of the group to resume from (@n) and earlier groups are skipped.
+ *
+ * TASK_DIAG_FLAG_CONT is set in the header while the message is being
+ * built and cleared only once every requested group was emitted, so a
+ * message finalized on the -EMSGSIZE path keeps the flag.
+ *
+ * Returns 0 when the task was dumped completely.  On -EMSGSIZE the partial
+ * message is kept if any progress was made (and cb->attr is updated),
+ * otherwise the message is cancelled.  Other errors cancel the message.
+ */
+static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
+			  struct task_diag_pid *req,
+			  struct task_diag_cb *cb, struct pid_namespace *pidns,
+			  struct user_namespace *userns)
+{
+	u64 show_flags = req->show_flags;
+	struct nlmsghdr *nlh;
+	struct task_diag_msg *msg;
+	int err = 0, i = 0, n = 0;
+	bool progress = false;
+	int flags = 0;
+	u32 seq = 0;
+
+	if (cb) {
+		n = cb->attr;
+		seq = cb->nlh->nlmsg_seq;
+		flags |= NLM_F_MULTI;
+	}
+
+	nlh = nlmsg_put(skb, 0, seq,
+			TASK_DIAG_CMD_GET, sizeof(*msg), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	msg = nlmsg_data(nlh);
+	msg->pid  = task_pid_nr_ns(tsk, pidns);
+	msg->tgid = task_tgid_nr_ns(tsk, pidns);
+	/* The payload is not zeroed by nlmsg_put(): assign, do not OR. */
+	msg->flags = TASK_DIAG_FLAG_CONT;
+
+	if (show_flags & TASK_DIAG_SHOW_BASE) {
+		if (i >= n)
+			err = fill_task_base(tsk, skb, pidns);
+		if (err)
+			goto err;
+		i++;
+	}
+
+	if (show_flags & TASK_DIAG_SHOW_CRED) {
+		if (i >= n)
+			err = fill_creds(tsk, skb, userns);
+		if (err)
+			goto err;
+		i++;
+	}
+
+	if (show_flags & TASK_DIAG_SHOW_VMA) {
+		bool dump_vma = true;
+
+		/* if the request is to dump all threads of all processes
+		 * only show VMAs for group leader.
+		 */
+		if ((req->dump_strategy == TASK_DIAG_DUMP_ALL_THREAD ||
+		     req->dump_strategy == TASK_DIAG_DUMP_THREAD) &&
+		    !thread_group_leader(tsk))
+			dump_vma = false;
+
+		if (dump_vma && i >= n)
+			err = fill_vma(tsk, skb, cb, &progress, show_flags);
+		if (err)
+			goto err;
+		i++;
+	}
+
+	if (show_flags & TASK_DIAG_SHOW_STAT) {
+		int whole = 1;
+
+		/* Per-thread dumps report the thread's own numbers only. */
+		if (req->dump_strategy == TASK_DIAG_DUMP_ALL_THREAD ||
+		    req->dump_strategy == TASK_DIAG_DUMP_THREAD)
+			whole = 0;
+
+		if (i >= n)
+			err = fill_task_stat(tsk, skb, whole);
+		if (err)
+			goto err;
+		i++;
+	}
+
+	if (show_flags & TASK_DIAG_SHOW_STATM) {
+		if (i >= n)
+			err = fill_task_statm(tsk, skb, 1);
+		if (err)
+			goto err;
+		i++;
+	}
+
+	if (show_flags & TASK_DIAG_SHOW_CMDLINE) {
+		if (i >= n)
+			err = fill_task_cmdline(tsk, skb);
+		if (err)
+			goto err;
+		i++;
+	}
+
+	/* Everything requested was emitted: this is the last message. */
+	msg->flags &= ~TASK_DIAG_FLAG_CONT;
+
+	nlmsg_end(skb, nlh);
+	if (cb)
+		cb->attr = 0;
+
+	return 0;
+err:
+	/*
+	 * Keep a partial message only if it carries something new: either a
+	 * whole group was added since the resume point (i > n) or the failing
+	 * group reported progress of its own.
+	 */
+	if (err == -EMSGSIZE && (i > n || progress)) {
+		if (cb)
+			cb->attr = i;
+		nlmsg_end(skb, nlh);
+	} else
+		nlmsg_cancel(skb, nlh);
+
+	return err;
+}
+
+/* Cursor state shared by iter_start(), iter_next() and iter_stop(). */
+struct task_iter {
+	struct task_diag_pid	req;	/* copy of the user request */
+	struct pid_namespace	*ns;	/* namespace used for pid lookups */
+	struct task_struct	*parent; /* task found for req.pid, referenced */
+
+	struct task_diag_cb	*cb;	/* per-file state, stores the resume position */
+
+	struct tgid_iter	tgid;	/* thread-group cursor (DUMP_ALL, DUMP_ALL_THREAD) */
+	loff_t			pos;	/* index within the current walk */
+	struct task_struct	*task;	/* current task of the walk */
+};
+
+/*
+ * Release every task reference still held by @iter.
+ *
+ * Which fields carry a reference depends on the dump strategy: the
+ * thread-group walks keep one in iter->tgid.task (and, for the per-thread
+ * variant, a second one in iter->task); all other strategies keep it in
+ * iter->task.  iter->parent is dropped in every case.
+ */
+static void iter_stop(struct task_iter *iter)
+{
+	bool by_tgid = iter->req.dump_strategy == TASK_DIAG_DUMP_ALL ||
+		       iter->req.dump_strategy == TASK_DIAG_DUMP_ALL_THREAD;
+	struct task_struct *last;
+
+	if (iter->parent)
+		put_task_struct(iter->parent);
+
+	/* release both tgid task and thread task */
+	if (iter->req.dump_strategy == TASK_DIAG_DUMP_ALL_THREAD && iter->task)
+		put_task_struct(iter->task);
+
+	last = by_tgid ? iter->tgid.task : iter->task;
+	if (last)
+		put_task_struct(last);
+}
+
+/*
+ * Return the child of @parent that follows @prev at index @pos, with a
+ * reference taken, or NULL when there is none.  The reference held on
+ * @prev (if any) is dropped.  The lookup runs under tasklist_lock.
+ */
+static struct task_struct *
+task_diag_next_child(struct task_struct *parent,
+			struct task_struct *prev, loff_t pos)
+{
+	struct task_struct *child;
+
+	read_lock(&tasklist_lock);
+
+	child = task_next_child(parent, prev, pos);
+
+	/* @prev was only needed as the starting point of the lookup. */
+	if (prev != NULL)
+		put_task_struct(prev);
+	if (child != NULL)
+		get_task_struct(child);
+
+	read_unlock(&tasklist_lock);
+
+	return child;
+}
+
+/*
+ * Position @iter on the first task to dump, resuming from the position
+ * saved in iter->cb by a previous read.
+ *
+ * When req.pid is positive the matching task is looked up in iter->ns and
+ * kept, referenced, in iter->parent.
+ *
+ * Returns the first task (NULL when the walk is already finished),
+ * ERR_PTR(-ESRCH) when a strategy needs req.pid and no such task exists,
+ * or ERR_PTR(-EINVAL) for an unknown strategy.  No reference is left
+ * behind when an ERR_PTR is returned.
+ */
+static struct task_struct *iter_start(struct task_iter *iter)
+{
+	if (iter->req.pid > 0) {
+		rcu_read_lock();
+		iter->parent = find_task_by_pid_ns(iter->req.pid, iter->ns);
+		if (iter->parent)
+			get_task_struct(iter->parent);
+		rcu_read_unlock();
+	}
+
+	switch (iter->req.dump_strategy) {
+	case TASK_DIAG_DUMP_ONE:
+		if (iter->parent == NULL)
+			return ERR_PTR(-ESRCH);
+		iter->pos = iter->cb->pos;
+		if (iter->pos == 0) {
+			/* The reference moves from parent to task. */
+			iter->task = iter->parent;
+			iter->parent = NULL;
+		} else
+			iter->task = NULL;
+		return iter->task;
+
+	case TASK_DIAG_DUMP_THREAD:
+		if (iter->parent == NULL)
+			return ERR_PTR(-ESRCH);
+
+		iter->pos = iter->cb->pos;
+		iter->task = task_first_tid(task_pid(iter->parent),
+					    iter->cb->pid, iter->pos, iter->ns);
+		return iter->task;
+
+	case TASK_DIAG_DUMP_CHILDREN:
+		if (iter->parent == NULL)
+			return ERR_PTR(-ESRCH);
+
+		iter->pos = iter->cb->pos;
+		iter->task = task_diag_next_child(iter->parent, NULL, iter->pos);
+		return iter->task;
+
+	case TASK_DIAG_DUMP_ALL:
+		iter->tgid.tgid = iter->cb->pid;
+		iter->tgid.task = NULL;
+		iter->tgid = next_tgid(iter->ns, iter->tgid);
+		return iter->tgid.task;
+
+	case TASK_DIAG_DUMP_ALL_THREAD:
+		iter->pos = iter->cb->pos;
+		iter->tgid.tgid = iter->cb->pid;
+		iter->tgid.task = NULL;
+		iter->tgid = next_tgid(iter->ns, iter->tgid);
+		if (!iter->tgid.task)
+			return NULL;
+
+		iter->task = task_first_tid(task_pid(iter->tgid.task),
+						0, iter->pos, iter->ns);
+		if (!iter->task) {
+			/* No thread at the saved position: move to the next group. */
+			iter->pos = 0;
+			iter->tgid.tgid += 1;
+			iter->tgid = next_tgid(iter->ns, iter->tgid);
+			iter->task = iter->tgid.task;
+			if (iter->task)
+				get_task_struct(iter->task);
+		}
+		return iter->task;
+
+	default:
+		break;
+	}
+
+	/*
+	 * Unknown strategy.  taskdiag_dumpit() returns straight away on an
+	 * ERR_PTR without calling iter_stop(), so the reference taken on
+	 * the parent above has to be dropped here.
+	 */
+	if (iter->parent) {
+		put_task_struct(iter->parent);
+		iter->parent = NULL;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Advance @iter to the next task of the walk and record the new position
+ * in iter->cb so that a later read can resume from it.
+ *
+ * Returns the next task or NULL when the walk is finished (also for an
+ * unknown strategy).
+ *
+ * NOTE(review): task_next_tid() and next_tgid() are defined outside this
+ * chunk; the code below relies on them dropping the reference of the task
+ * they are handed - confirm against their definitions.
+ */
+static struct task_struct *iter_next(struct task_iter *iter)
+{
+	switch (iter->req.dump_strategy) {
+	case TASK_DIAG_DUMP_ONE:
+		/* A single task: after the first one the walk is over. */
+		iter->pos++;
+		iter->cb->pos = iter->pos;
+		if (iter->task)
+			put_task_struct(iter->task);
+		iter->task = NULL;
+		return NULL;
+
+	case TASK_DIAG_DUMP_THREAD:
+		iter->pos++;
+		iter->task = task_next_tid(iter->task);
+		iter->cb->pos = iter->pos;
+		/* Save the pid of the thread to resume from, -1 when done. */
+		if (iter->task)
+			iter->cb->pid = task_pid_nr_ns(iter->task, iter->ns);
+		else
+			iter->cb->pid = -1;
+		return iter->task;
+	case TASK_DIAG_DUMP_CHILDREN:
+		iter->pos++;
+		iter->task = task_diag_next_child(iter->parent, iter->task, iter->pos);
+		iter->cb->pos = iter->pos;
+		return iter->task;
+
+	case TASK_DIAG_DUMP_ALL:
+		iter->tgid.tgid += 1;
+		iter->tgid = next_tgid(iter->ns, iter->tgid);
+		iter->cb->pid = iter->tgid.tgid;
+		return iter->tgid.task;
+
+	case TASK_DIAG_DUMP_ALL_THREAD:
+		iter->pos++;
+		iter->task = task_next_tid(iter->task);
+		if (!iter->task) {
+			/* Threads of this group exhausted: go to the next group. */
+			iter->pos = 0;
+			iter->tgid.tgid += 1;
+			iter->tgid = next_tgid(iter->ns, iter->tgid);
+			iter->task = iter->tgid.task;
+			/* iter->task and iter->tgid.task each own a reference. */
+			if (iter->task)
+				get_task_struct(iter->task);
+		}
+
+		/* save current position */
+		iter->cb->pid = iter->tgid.tgid;
+		iter->cb->pos = iter->pos;
+
+		return iter->task;
+	}
+
+	return NULL;
+}
+
+/*
+ * Fill cb->resp with messages for consecutive tasks, starting at *@start.
+ *
+ * Tasks the caller may not inspect (ptrace read access check) are
+ * skipped.  The loop ends when the walk is finished or when a task does
+ * not fit any more (-EMSGSIZE); in both cases *@start is updated to the
+ * task to continue with (NULL at the end) and 0 is returned.  Any other
+ * error is returned as is, leaving *@start untouched.
+ */
+static int __taskdiag_dumpit(struct task_iter *iter,
+			     struct task_diag_cb *cb, struct task_struct **start)
+{
+	struct user_namespace *userns = current_user_ns();
+	struct task_struct *cur = *start;
+
+	while (cur) {
+		if (ptrace_may_access(cur, PTRACE_MODE_READ_FSCREDS)) {
+			int rc = task_diag_fill(cur, cb->resp, &iter->req,
+						cb, iter->ns, userns);
+
+			/* Buffer full: stay on this task for the next round. */
+			if (rc == -EMSGSIZE)
+				break;
+			if (rc < 0)
+				return rc;
+		}
+		cur = iter_next(iter);
+	}
+
+	*start = cur;
+
+	return 0;
+}
+
+/*
+ * Serve one read(): walk the tasks selected by the pending request and
+ * copy the generated messages to @msg until the walk ends, the next
+ * buffer might not fit into the @len bytes offered by the reader, or a
+ * signal is pending.
+ *
+ * The request is a struct task_diag_pid placed directly after the netlink
+ * header of cb->nlh.
+ *
+ * Returns the number of bytes copied (0 when the dump is complete) or a
+ * negative error code.
+ */
+static int taskdiag_dumpit(struct task_diag_cb *cb,
+				struct pid_namespace *pidns,
+				struct msghdr *msg, size_t len)
+{
+	struct sk_buff *skb = cb->resp;
+	struct task_struct *task;
+	struct task_iter iter;
+	size_t copied;
+	int err;
+
+	/*
+	 * nlmsg_len comes from user space: make sure the header is sane and
+	 * the claimed message lies within what was actually written before
+	 * reading the request out of it.
+	 */
+	if (!nlmsg_ok(cb->nlh, cb->req->len) ||
+	    nlmsg_len(cb->nlh) < (int)sizeof(iter.req))
+		return -EINVAL;
+
+	/* Start from a fully defined iterator (tgid cursor included). */
+	memset(&iter, 0, sizeof(iter));
+	memcpy(&iter.req, nlmsg_data(cb->nlh), sizeof(iter.req));
+
+	iter.ns     = pidns;
+	iter.cb     = cb;
+
+	task = iter_start(&iter);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	copied = 0;
+	while (1) {
+		err = __taskdiag_dumpit(&iter, cb, &task);
+		if (err < 0)
+			goto err;
+		/* Nothing was produced: the walk is over. */
+		if (skb->len == 0)
+			break;
+
+		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+		if (err < 0)
+			goto err;
+
+		copied += skb->len;
+
+		/* Reuse the buffer, but only if a full one would still fit. */
+		skb_trim(skb, 0);
+		if (skb_tailroom(skb) + copied > len)
+			break;
+
+		if (signal_pending(current))
+			break;
+	}
+
+	iter_stop(&iter);
+	return copied;
+err:
+	iter_stop(&iter);
+	return err;
+}
+
+/*
+ * write() handler: accept one netlink-formatted request and store it in
+ * the per-file state so that subsequent read() calls can serve it.
+ *
+ * Returns @len on success, -EBUSY while a previous request is still being
+ * served, -EINVAL for a message shorter than a netlink header or whose
+ * header length does not match the bytes written, -ENOMEM or -EFAULT on
+ * allocation or copy failure.
+ */
+static ssize_t task_diag_write(struct file *f, const char __user *buf,
+						size_t len, loff_t *off)
+{
+	struct task_diag_cb *cb = f->private_data;
+	struct sk_buff *skb;
+	struct msghdr msg;
+	struct iovec iov;
+	int err;
+
+	if (cb->req)
+		return -EBUSY;
+	if (len < nlmsg_total_size(0))
+		return -EINVAL;
+
+	err = import_single_range(WRITE, (void __user *) buf, len,
+						&iov, &msg.msg_iter);
+	if (unlikely(err))
+		return err;
+
+	msg.msg_name = NULL;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
+	msg.msg_flags = 0;
+
+	skb = nlmsg_new(len, GFP_KERNEL);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	if (memcpy_from_msg(skb_put(skb, len), &msg, len)) {
+		kfree_skb(skb);
+		return -EFAULT;
+	}
+
+	/* The header length is user controlled: it must fit what was written. */
+	if (!nlmsg_ok(nlmsg_hdr(skb), len)) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	/*
+	 * A response buffer may be left over from the previous request;
+	 * free it before the state is wiped, otherwise it would be leaked.
+	 */
+	kfree_skb(cb->resp);
+
+	memset(cb, 0, sizeof(*cb));
+	cb->req = skb;
+	cb->nlh = nlmsg_hdr(skb);
+
+	return len;
+}
+
+/*
+ * read() handler: produce the next part of the response for the request
+ * stored by write().
+ *
+ * The response buffer is allocated on the first read, sized after the
+ * reader's buffer but capped at 16384 bytes.  When the dump is complete
+ * (0 is returned) or fails, the request is released so that a new one
+ * can be written.
+ *
+ * Returns the number of bytes stored in @ubuf, 0 at the end of the dump
+ * or when no request is pending, or a negative error code.
+ */
+static ssize_t task_diag_read(struct file *file, char __user *ubuf,
+						size_t len, loff_t *off)
+{
+	struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
+	struct task_diag_cb *cb = file->private_data;
+	struct iovec iov;
+	struct msghdr msg;
+	int size, err;
+
+	if (cb->req == NULL)
+		return 0;
+
+	err = import_single_range(READ, ubuf, len, &iov, &msg.msg_iter);
+	if (unlikely(err))
+		goto err;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_flags = 0;
+
+	if (!cb->resp) {
+		size = min_t(size_t, len, 16384);
+		cb->resp = alloc_skb(size, GFP_KERNEL);
+		if (cb->resp == NULL) {
+			err = -ENOMEM;
+			goto err;
+		}
+		/* Trim skb to allocated size. */
+		skb_reserve(cb->resp, skb_tailroom(cb->resp) - size);
+	}
+
+	err = taskdiag_dumpit(cb, ns, &msg, len);
+
+err:
+	/* The buffer does not exist yet if the first read failed early. */
+	if (cb->resp)
+		skb_trim(cb->resp, 0);
+	if (err <= 0) {
+		kfree_skb(cb->req);
+		cb->req = NULL;
+	}
+
+	return err;
+}
+
+/* open() handler: attach a zeroed struct task_diag_cb to the file. */
+static int task_diag_open(struct inode *inode, struct file *f)
+{
+	struct task_diag_cb *cb = kzalloc(sizeof(*cb), GFP_KERNEL);
+
+	f->private_data = cb;
+
+	return cb ? 0 : -ENOMEM;
+}
+
+/* release() handler: free the pending request, the response buffer and the state. */
+static int task_diag_release(struct inode *inode, struct file *f)
+{
+	struct task_diag_cb *state = f->private_data;
+
+	kfree_skb(state->resp);
+	kfree_skb(state->req);
+	kfree(state);
+
+	return 0;
+}
+
+/*
+ * File operations of the task-diag proc file: a request is submitted with
+ * write() and the response is fetched with one or more read() calls.
+ */
+static const struct file_operations task_diag_fops = {
+	.owner		= THIS_MODULE,
+	.open		= task_diag_open,
+	.release	= task_diag_release,
+	.write		= task_diag_write,
+	.read		= task_diag_read,
+};
+
+/* Create the "task-diag" proc entry, readable and writable by everyone. */
+static __init int task_diag_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create("task-diag", S_IRUGO | S_IWUGO, NULL, &task_diag_fops);
+
+	return entry ? 0 : -ENOMEM;
+}
+
+/* Remove the proc entry created by task_diag_init(). */
+static __exit void task_diag_exit(void)
+{
+	remove_proc_entry("task-diag", NULL);
+}
+
+/* Register the init and exit hooks defined above. */
+module_init(task_diag_init);
+module_exit(task_diag_exit);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5ea1d64cb0b4c1..0c766a8f818774 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -402,25 +402,6 @@ const struct file_operations proc_pid_maps_operations = {
 #define PSS_SHIFT 12
 
 #ifdef CONFIG_PROC_PAGE_MONITOR
-struct mem_size_stats {
-	unsigned long resident;
-	unsigned long shared_clean;
-	unsigned long shared_dirty;
-	unsigned long private_clean;
-	unsigned long private_dirty;
-	unsigned long referenced;
-	unsigned long anonymous;
-	unsigned long lazyfree;
-	unsigned long anonymous_thp;
-	unsigned long shmem_thp;
-	unsigned long swap;
-	unsigned long shared_hugetlb;
-	unsigned long private_hugetlb;
-	u64 pss;
-	u64 pss_locked;
-	u64 swap_pss;
-	bool check_shmem_swap;
-};
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
 		bool compound, bool young, bool dirty)
@@ -564,7 +545,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 }
 #endif
 
-static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			   struct mm_walk *walk)
 {
 	struct vm_area_struct *vma = walk->vma;
diff --git a/include/uapi/linux/cgroupstats.h b/include/uapi/linux/cgroupstats.h
index aa306e4cd6c1f8..40e1405f5dbdf1 100644
--- a/include/uapi/linux/cgroupstats.h
+++ b/include/uapi/linux/cgroupstats.h
@@ -38,21 +38,6 @@ struct cgroupstats {
 	__u64	nr_io_wait;		/* Number of tasks waiting on IO */
 };
 
-/*
- * Commands sent from userspace
- * Not versioned. New commands should only be inserted at the enum's end
- * prior to __CGROUPSTATS_CMD_MAX
- */
-
-enum {
-	CGROUPSTATS_CMD_UNSPEC = __TASKSTATS_CMD_MAX,	/* Reserved */
-	CGROUPSTATS_CMD_GET,		/* user->kernel request/get-response */
-	CGROUPSTATS_CMD_NEW,		/* kernel->user event */
-	__CGROUPSTATS_CMD_MAX,
-};
-
-#define CGROUPSTATS_CMD_MAX (__CGROUPSTATS_CMD_MAX - 1)
-
 enum {
 	CGROUPSTATS_TYPE_UNSPEC = 0,	/* Reserved */
 	CGROUPSTATS_TYPE_CGROUP_STATS,	/* contains name + stats */
diff --git a/include/uapi/linux/task_diag.h b/include/uapi/linux/task_diag.h
new file mode 100644
index 00000000000000..8e43da6f55cf68
--- /dev/null
+++ b/include/uapi/linux/task_diag.h
@@ -0,0 +1,216 @@
+#ifndef _LINUX_TASK_DIAG_H
+#define _LINUX_TASK_DIAG_H
+
+#include <linux/types.h>
+#include <linux/netlink.h>
+#include <linux/capability.h>
+
+#define TASK_DIAG_CMD_GET 0xd101U
+
+/*
+ * Fixed header placed right after the netlink header of every response
+ * message; the attributes follow it.
+ */
+struct task_diag_msg {
+	__u32 pid;	/* task pid as seen from the reader's pid namespace */
+	__u32 tgid;	/* thread group id in the same namespace */
+	__u32 flags;	/* TASK_DIAG_FLAG_* */
+};
+
+/* More messages about the same task follow (the data did not fit in one). */
+#define TASK_DIAG_FLAG_CONT 0x00000001
+
+/*
+ * Attribute types found in a response.  The same numbers are used as bit
+ * positions of the TASK_DIAG_SHOW_* request flags.
+ */
+enum {
+	TASK_DIAG_BASE	= 0,
+	TASK_DIAG_CRED,
+	TASK_DIAG_VMA,
+	TASK_DIAG_VMA_STAT,
+	TASK_DIAG_STAT,
+	TASK_DIAG_STATM,
+	TASK_DIAG_CMDLINE,
+
+	__TASK_DIAG_ATTR_MAX
+#define TASK_DIAG_ATTR_MAX (__TASK_DIAG_ATTR_MAX - 1)
+};
+
+/* Bits of task_diag_pid.show_flags selecting which attributes are reported. */
+#define TASK_DIAG_SHOW_BASE	(1ULL << TASK_DIAG_BASE)
+#define TASK_DIAG_SHOW_CRED	(1ULL << TASK_DIAG_CRED)
+#define TASK_DIAG_SHOW_VMA	(1ULL << TASK_DIAG_VMA)
+#define TASK_DIAG_SHOW_VMA_STAT	(1ULL << TASK_DIAG_VMA_STAT)
+#define TASK_DIAG_SHOW_STAT	(1ULL << TASK_DIAG_STAT)
+#define TASK_DIAG_SHOW_STATM	(1ULL << TASK_DIAG_STATM)
+#define TASK_DIAG_SHOW_CMDLINE	(1ULL << TASK_DIAG_CMDLINE)
+
+/*
+ * Task states.
+ * NOTE(review): presumably the values of task_diag_base.state; the code
+ * that fills that field is not part of this header - confirm.
+ */
+enum {
+	TASK_DIAG_RUNNING,
+	TASK_DIAG_INTERRUPTIBLE,
+	TASK_DIAG_UNINTERRUPTIBLE,
+	TASK_DIAG_STOPPED,
+	TASK_DIAG_TRACE_STOP,
+	TASK_DIAG_DEAD,
+	TASK_DIAG_ZOMBIE,
+	TASK_DIAG_PARKED,
+	TASK_DIAG_IDLE,
+};
+
+/* Size of the comm[] buffer in struct task_diag_base. */
+#define TASK_DIAG_COMM_LEN 16
+
+/*
+ * Payload of the TASK_DIAG_BASE attribute: identifiers of a task.
+ * NOTE(review): field meanings below are inferred from the names, the
+ * filling code is outside this header - confirm.
+ */
+struct task_diag_base {
+	__u32	tgid;
+	__u32	pid;
+	__u32	ppid;
+	__u32	tpid;	/* presumably the tracer's pid */
+	__u32	sid;
+	__u32	pgid;
+	__u8	state;
+	char	comm[TASK_DIAG_COMM_LEN];
+};
+
+/* One capability set, stored as _LINUX_CAPABILITY_U32S_3 32-bit words. */
+struct task_diag_caps {
+	__u32 cap[_LINUX_CAPABILITY_U32S_3];
+};
+
+/* Payload of the TASK_DIAG_CRED attribute: capability sets and user/group ids. */
+struct task_diag_creds {
+	struct task_diag_caps cap_inheritable;
+	struct task_diag_caps cap_permitted;
+	struct task_diag_caps cap_effective;
+	struct task_diag_caps cap_bset;
+
+	__u32 uid;
+	__u32 euid;
+	__u32 suid;
+	__u32 fsuid;
+	__u32 gid;
+	__u32 egid;
+	__u32 sgid;
+	__u32 fsgid;
+};
+
+/*
+ * Flag bits of a memory area.
+ * NOTE(review): presumably the bits of task_diag_vma.vm_flags - confirm
+ * against the code that fills that field.
+ */
+#define TASK_DIAG_VMA_F_READ		(1ULL <<  0)
+#define TASK_DIAG_VMA_F_WRITE		(1ULL <<  1)
+#define TASK_DIAG_VMA_F_EXEC		(1ULL <<  2)
+#define TASK_DIAG_VMA_F_SHARED		(1ULL <<  3)
+#define TASK_DIAG_VMA_F_MAYREAD		(1ULL <<  4)
+#define TASK_DIAG_VMA_F_MAYWRITE	(1ULL <<  5)
+#define TASK_DIAG_VMA_F_MAYEXEC		(1ULL <<  6)
+#define TASK_DIAG_VMA_F_MAYSHARE	(1ULL <<  7)
+#define TASK_DIAG_VMA_F_GROWSDOWN	(1ULL <<  8)
+#define TASK_DIAG_VMA_F_PFNMAP		(1ULL <<  9)
+#define TASK_DIAG_VMA_F_DENYWRITE	(1ULL << 10)
+#define TASK_DIAG_VMA_F_MPX		(1ULL << 11)
+#define TASK_DIAG_VMA_F_LOCKED		(1ULL << 12)
+#define TASK_DIAG_VMA_F_IO		(1ULL << 13)
+#define TASK_DIAG_VMA_F_SEQ_READ	(1ULL << 14)
+#define TASK_DIAG_VMA_F_RAND_READ	(1ULL << 15)
+#define TASK_DIAG_VMA_F_DONTCOPY	(1ULL << 16)
+#define TASK_DIAG_VMA_F_DONTEXPAND	(1ULL << 17)
+#define TASK_DIAG_VMA_F_ACCOUNT		(1ULL << 18)
+#define TASK_DIAG_VMA_F_NORESERVE	(1ULL << 19)
+#define TASK_DIAG_VMA_F_HUGETLB		(1ULL << 20)
+#define TASK_DIAG_VMA_F_ARCH_1		(1ULL << 21)
+#define TASK_DIAG_VMA_F_DONTDUMP	(1ULL << 22)
+#define TASK_DIAG_VMA_F_SOFTDIRTY	(1ULL << 23)
+#define TASK_DIAG_VMA_F_MIXEDMAP	(1ULL << 24)
+#define TASK_DIAG_VMA_F_HUGEPAGE	(1ULL << 25)
+#define TASK_DIAG_VMA_F_NOHUGEPAGE	(1ULL << 26)
+#define TASK_DIAG_VMA_F_MERGEABLE	(1ULL << 27)
+
+/*
+ * Optional per-area statistics, located inside a task_diag_vma record at
+ * task_diag_vma.stat_off.
+ * NOTE(review): units are not visible in this header; the selftest shifts
+ * the values right by 10 to print kB (pss additionally by 12) - confirm.
+ */
+struct task_diag_vma_stat {
+	__u64 resident;
+	__u64 shared_clean;
+	__u64 shared_dirty;
+	__u64 private_clean;
+	__u64 private_dirty;
+	__u64 referenced;
+	__u64 anonymous;
+	__u64 anonymous_thp;
+	__u64 swap;
+	__u64 pss;
+} __attribute__((__aligned__(NLA_ALIGNTO)));
+
+/*
+ * One memory area.  A TASK_DIAG_VMA attribute carries a sequence of these
+ * variable-length records; the name and the statistics, when present, are
+ * stored inside the record at the given offsets from its start.
+ *
+ * task_diag_vma must be NLA_ALIGN'ed
+ */
+struct task_diag_vma {
+	__u64 start, end;
+	__u64 vm_flags;
+	__u64 pgoff;
+	__u32 major;
+	__u32 minor;
+	__u64 inode;
+	__u32 generation;
+	__u16 vma_len;		/* distance to the next record, in bytes */
+	__u16 name_off;		/* offset of the name from the record start */
+	__u16 name_len;		/* 0 when the area has no name */
+	__u16 stat_off;		/* offset of struct task_diag_vma_stat */
+	__u16 stat_len;		/* 0 when no statistics are attached */
+} __attribute__((__aligned__(NLA_ALIGNTO)));
+
+/* Return the name stored in the record @vma, or NULL if it has none. */
+static inline char *task_diag_vma_name(struct task_diag_vma *vma)
+{
+	return vma->name_len ? (char *)vma + vma->name_off : NULL;
+}
+
+/*
+ * Return the statistics stored in the record @vma, or NULL if none are
+ * attached.  The offset is applied through a char pointer: arithmetic on
+ * void pointers is a compiler extension and this header is also built by
+ * user space.
+ */
+static inline
+struct task_diag_vma_stat *task_diag_vma_stat(struct task_diag_vma *vma)
+{
+	if (!vma->stat_len)
+		return NULL;
+
+	return (struct task_diag_vma_stat *)((char *)vma + vma->stat_off);
+}
+
+/*
+ * Iterate over the task_diag_vma records packed in the attribute @attr.
+ * Each step advances by the record's own vma_len, so a record with a
+ * zero vma_len would never terminate the loop.
+ */
+#define task_diag_for_each_vma(vma, attr)				\
+	for ((vma) = nla_data(attr);					\
+	     (char *)(vma) < (char *)nla_data(attr) + nla_len(attr);	\
+	     (vma) = (void *)((char *)(vma) + (vma)->vma_len))
+
+/*
+ * Payload of the TASK_DIAG_STAT attribute: page fault counters and CPU
+ * times.  The times are converted with nsec_to_clock_t() by the kernel;
+ * the "c" prefixed fields are the corresponding children totals.
+ */
+struct task_diag_stat {
+	__u64 minflt;
+	__u64 cminflt;
+	__u64 majflt;
+	__u64 cmajflt;
+	__u64 utime;
+	__u64 stime;
+	__u64 cutime;
+	__u64 cstime;
+
+	__u32 threads;	/* number of threads */
+};
+
+/*
+ * Payload of the TASK_DIAG_STATM attribute: memory usage of a task's
+ * address space, sampled from its mm.
+ */
+struct task_diag_statm {
+	__u64 anon;		/* MM_ANONPAGES counter */
+	__u64 file;		/* MM_FILEPAGES counter */
+	__u64 shmem;		/* MM_SHMEMPAGES counter */
+	__u64 total_vm;
+	__u64 total_rss;	/* anon + file + shmem */
+	__u64 hiwater_vm;	/* never below total_vm */
+	__u64 hiwater_rss;	/* never below total_rss */
+	__u64 text;		/* pages spanned by the code segment */
+	__u64 lib;		/* exec_vm minus text */
+	__u64 swap;		/* MM_SWAPENTS counter */
+	__u64 ptes;		/* page table size as given by mm_pgtables_bytes() */
+	__u64 locked_vm;
+	__u64 pinned_vm;
+	__u64 data_vm;
+	__u64 stack_vm;
+};
+
+/* Values of task_diag_pid.dump_strategy: which set of tasks to report. */
+#define TASK_DIAG_DUMP_ALL		0	/* every thread group */
+#define TASK_DIAG_DUMP_ONE		1	/* only the task given by pid */
+#define TASK_DIAG_DUMP_ALL_THREAD	2	/* every thread of every thread group */
+#define TASK_DIAG_DUMP_CHILDREN		3	/* children of the task given by pid */
+#define TASK_DIAG_DUMP_THREAD		4	/* threads of the task given by pid */
+
+/*
+ * Request written to the task-diag file, placed directly after the
+ * netlink header.
+ */
+struct task_diag_pid {
+	__u64	show_flags;	/* mask of TASK_DIAG_SHOW_* */
+	__u64	dump_strategy;	/* one of TASK_DIAG_DUMP_* */
+
+	__u32	pid;		/* target task; required by ONE, THREAD and CHILDREN */
+};
+
+/*
+ * Command attribute types.
+ * NOTE(review): not referenced by the code visible around this header;
+ * possibly a leftover of an earlier interface - confirm.
+ */
+enum {
+	TASK_DIAG_CMD_ATTR_UNSPEC = 0,
+	TASK_DIAG_CMD_ATTR_GET,
+	__TASK_DIAG_CMD_ATTR_MAX,
+};
+
+#define TASK_DIAG_CMD_ATTR_MAX (__TASK_DIAG_CMD_ATTR_MAX - 1)
+
+#endif /* _LINUX_TASK_DIAG_H */
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index b7aa7bb2349f7b..f06af282b97cab 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -177,9 +177,16 @@ enum {
 	TASKSTATS_CMD_UNSPEC = 0,	/* Reserved */
 	TASKSTATS_CMD_GET,		/* user->kernel request/get-response */
 	TASKSTATS_CMD_NEW,		/* kernel->user event */
+	__TASKSTATS_CMD_RESERVED,
+
+	CGROUPSTATS_CMD_GET,		/* user->kernel request/get-response */
+	CGROUPSTATS_CMD_NEW,		/* kernel->user event */
+
 	__TASKSTATS_CMD_MAX,
 };
 
+#define __CGROUPSTATS_CMD_MAX __TASKSTATS_CMD_MAX
+#define CGROUPSTATS_CMD_MAX (__CGROUPSTATS_CMD_MAX - 1)
 #define TASKSTATS_CMD_MAX (__TASKSTATS_CMD_MAX - 1)
 
 enum {
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index f1fe492c8e17d0..d4ab3d91e3d74b 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -46,6 +46,7 @@ TARGETS += user
 TARGETS += vm
 TARGETS += x86
 TARGETS += zram
+TARGETS += task_diag
 #Please keep the TARGETS list alphabetically sorted
 # Run "make quicktest=1 run_tests" or
 # "make quicktest=1 kselftest" from top level Makefile
diff --git a/tools/testing/selftests/task_diag/.gitignore b/tools/testing/selftests/task_diag/.gitignore
new file mode 100644
index 00000000000000..f963a1f37a7f79
--- /dev/null
+++ b/tools/testing/selftests/task_diag/.gitignore
@@ -0,0 +1,4 @@
+task_diag
+task_diag_all
+task_proc_all
+fork
diff --git a/tools/testing/selftests/task_diag/Makefile b/tools/testing/selftests/task_diag/Makefile
new file mode 100644
index 00000000000000..c99772316f49ae
--- /dev/null
+++ b/tools/testing/selftests/task_diag/Makefile
@@ -0,0 +1,18 @@
+all: task_diag_all fork task_proc_all
+
+CFLAGS += -g -Wall -O2 -I/usr/include/libnl3
+# Libraries belong in LDLIBS: the implicit link rule puts LDLIBS after the
+# objects, which is where the linker needs them to resolve symbols.
+LDLIBS += -lnl-3
+TEST_PROGS := run.sh
+include ../lib.mk
+
+task_diag_all.o: task_diag_all.c task_diag_comm.h
+task_diag_comm.o: task_diag_comm.c task_diag_comm.h
+
+task_diag_all: task_diag_all.o task_diag_comm.o
+fork: fork.c
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ -lpthread
+
+task_proc_all: task_proc_all.c
+
+clean:
+	rm -rf task_diag task_diag_all task_diag_comm.o task_diag_all.o task_diag.o fork task_proc_all
diff --git a/tools/testing/selftests/task_diag/_run.sh b/tools/testing/selftests/task_diag/_run.sh
new file mode 100755
index 00000000000000..d2e854430565d6
--- /dev/null
+++ b/tools/testing/selftests/task_diag/_run.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Needs bash: "exec -a" below is a bash builtin option and "pipefail" is
+# not available in every /bin/sh.
+set -o pipefail
+set -e -x
+
+# 1000 processes with 10 threads each.
+./fork 1000 10
+
+nprocesses=`./task_diag_all all --maps | grep 'pid.*tgid.*ppid.*comm fork$' | wc -l`
+nthreads=`./task_diag_all All --smaps --cred | grep 'pid.*tgid.*ppid.*comm fork$' | wc -l`
+nchildren=`./task_diag_all children --pid 1 | grep 'pid.*tgid.*ppid.*comm fork$' | wc -l`
+
+./task_diag_all one --pid 1 --cred
+
+# One more process with 1234 threads, found by its argv[0].
+( exec -a fork_thread ./fork 1 1234 )
+pid=`pidof fork_thread`
+ntaskthreads=`./task_diag_all thread --maps --cred --smaps --pid $pid |  grep 'pid.*tgid.*ppid.*comm' | wc -l`
+killall -9 fork
+
+[ "$nthreads"     -eq 10000 ] &&
+[ "$nprocesses"   -eq 1000  ] &&
+[ "$nchildren"    -eq 1000  ] &&
+[ "$ntaskthreads" -eq 1234  ] &&
+true ||  {
+	echo "Unexpected number of tasks $nthreads:$nprocesses:$nchildren:$ntaskthreads" 1>&2
+	exit 1
+}
diff --git a/tools/testing/selftests/task_diag/fork.c b/tools/testing/selftests/task_diag/fork.c
new file mode 100644
index 00000000000000..ebddedd21bfeb0
--- /dev/null
+++ b/tools/testing/selftests/task_diag/fork.c
@@ -0,0 +1,58 @@
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+/* Thread body: sleep for the number of seconds passed in @arg, then exit. */
+void *f(void *arg)
+{
+	unsigned long seconds = (unsigned long) arg;
+
+	sleep(seconds);
+
+	return NULL;
+}
+
+/*
+ * usage: fork nproc [mthreads [sleep]]
+ *
+ * Fork nproc children.  Each child starts mthreads - 1 additional threads
+ * (so that it has mthreads threads in total) and sleeps for "sleep"
+ * seconds (1000 by default).  The parent returns as soon as all children
+ * are forked.
+ */
+int main(int argc, char **argv)
+{
+	int i, j, n, m = 0;
+	unsigned long t_sleep = 1000;
+	pthread_attr_t attr;
+	pthread_t id;
+
+	if (argc < 2) {
+		fprintf(stderr, "usage: fork nproc [mthreads [sleep]]\n");
+		return 1;
+	}
+
+	n = atoi(argv[1]);
+
+	if (argc > 2)
+		m = atoi(argv[2]);
+
+	if (argc > 3)
+		t_sleep = atoi(argv[3]);
+
+	pthread_attr_init(&attr);
+
+	for (i = 0; i < n; i++) {
+		pid_t pid;
+
+		pid = fork();
+		if (pid < 0) {
+			fprintf(stderr, "Unable to fork: %m\n");
+			return 1;
+		}
+		if (pid == 0) {
+			for (j = 0; j < m - 1; ++j) {
+				/* pthread_create() returns the error, errno is not set. */
+				int err = pthread_create(&id, &attr, f, (void *)t_sleep);
+
+				if (err) {
+					fprintf(stderr, "Unable to create a thread: %s\n",
+						strerror(err));
+					return 1;
+				}
+			}
+
+			sleep(t_sleep);
+			return 0;
+		}
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/task_diag/run.sh b/tools/testing/selftests/task_diag/run.sh
new file mode 100755
index 00000000000000..28a8550903e861
--- /dev/null
+++ b/tools/testing/selftests/task_diag/run.sh
@@ -0,0 +1 @@
+#!/bin/sh
+# Run the test in fresh pid and mount namespaces with a private /proc.
+unshare -p -f -m --mount-proc ./_run.sh && { echo PASS; exit 0; } || { echo FAIL; exit 1; }
diff --git a/tools/testing/selftests/task_diag/task_diag.h b/tools/testing/selftests/task_diag/task_diag.h
new file mode 120000
index 00000000000000..d20a38c7f19b95
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_diag.h
@@ -0,0 +1 @@
+../../../../include/uapi/linux/task_diag.h
\ No newline at end of file
diff --git a/tools/testing/selftests/task_diag/task_diag_all.c b/tools/testing/selftests/task_diag/task_diag_all.c
new file mode 100644
index 00000000000000..52ab1bba3e27e3
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_diag_all.c
@@ -0,0 +1,164 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <getopt.h>
+
+#include <linux/netlink.h>
+#include <netlink/msg.h>
+
+#include "task_diag.h"
+#include "task_diag_comm.h"
+
+#ifndef SOL_NETLINK
+#define SOL_NETLINK	270
+#endif
+
+#ifndef NETLINK_SCM_PID
+#define NETLINK_SCM_PID	11
+#endif
+
+/* Print the command line help for the program called @name to stderr. */
+static void usage(char *name)
+{
+	pr_err("Usage: %s command [options]", name);
+	pr_err(
+"Commands:\n"
+"\tall         - dump all processes\n"
+"\tAll         - dump all threads\n"
+"\tthreads     - dump all threads of the specified process\n"
+"\tchildren    - dump all children of the specified process\n"
+"\tone         - dump the specified process\n"
+"Options:\n"
+"\t-p|--pid    - PID of the required process\n"
+"\t-m|--maps   - dump memory regions\n"
+"\t-s|--smaps  - dump statistics for memory regions\n"
+"\t-c|--cred   - dump credentials\n"
+"\t-l|--cmdline - dump command line\n"
+"\t-x|--stat   - dump task statistics\n"
+"\t-q|--quiet - do  not  write  anything  to  standard   output\n"
+);
+}
+/*
+ * Build a request from the command line, write it to /proc/task-diag and
+ * print every message read back until the kernel reports the end of the
+ * dump.  The first argument selects the dump strategy by its first
+ * letter; options select the target pid and the attributes to show.
+ *
+ * Exits with 0 on success and a non-zero status on any error.
+ */
+int main(int argc, char *argv[])
+{
+	int exit_status = 1, fd;
+	struct task_diag_pid *req;
+	/* Zeroed so that no uninitialized stack bytes are sent to the kernel. */
+	char nl_req[4096] = { 0 };
+	struct nlmsghdr *hdr = (void *)nl_req;
+	int last_pid = 0;
+	int opt, idx;
+	int err, size = 0;
+	static const char short_opts[] = "p:cmslqx";
+	static struct option long_opts[] = {
+		{ "pid",	required_argument, 0, 'p' },
+		{ "maps",	no_argument, 0, 'm' },
+		{ "smaps",	no_argument, 0, 's' },
+		{ "cred",	no_argument, 0, 'c' },
+		{ "cmdline",	no_argument, 0, 'l' },
+		{ "quiet",	no_argument, 0, 'q' },
+		{ "stat",	no_argument, 0, 'x' },
+		{},
+	};
+
+	hdr->nlmsg_len = nlmsg_total_size(0);
+
+	req = nlmsg_data(hdr);
+	size += nla_total_size(sizeof(*req));
+
+	hdr->nlmsg_len += size;
+
+
+	req->show_flags = TASK_DIAG_SHOW_BASE;
+
+	if (argc < 2) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	req->pid = 0; /* dump all tasks by default */
+
+	switch (argv[1][0]) {
+	case 'c':
+		req->dump_strategy = TASK_DIAG_DUMP_CHILDREN;
+		break;
+	case 't':
+		req->dump_strategy = TASK_DIAG_DUMP_THREAD;
+		break;
+	case 'o':
+		req->dump_strategy = TASK_DIAG_DUMP_ONE;
+		break;
+	case 'a':
+		req->dump_strategy = TASK_DIAG_DUMP_ALL;
+		req->pid = 0;
+		break;
+	case 'A':
+		req->dump_strategy = TASK_DIAG_DUMP_ALL_THREAD;
+		req->pid = 0;
+		break;
+	default:
+		usage(argv[0]);
+		return 1;
+	}
+
+	while (1) {
+		idx = -1;
+		opt = getopt_long(argc, argv, short_opts, long_opts, &idx);
+		if (opt == -1)
+			break;
+		switch (opt) {
+		case 'p':
+			req->pid = atoi(optarg);
+			break;
+		case 'c':
+			req->show_flags |= TASK_DIAG_SHOW_CRED;
+			break;
+		case 'm':
+			req->show_flags |= TASK_DIAG_SHOW_VMA;
+			break;
+		case 's':
+			req->show_flags |= TASK_DIAG_SHOW_VMA_STAT | TASK_DIAG_SHOW_VMA;
+			break;
+		case 'l':
+			req->show_flags |= TASK_DIAG_SHOW_CMDLINE;
+			break;
+		case 'q':
+			quiet = 1;
+			break;
+		case 'x':
+			req->show_flags |= TASK_DIAG_SHOW_STAT;
+			break;
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	fd = open("/proc/task-diag", O_RDWR);
+	if (fd < 0) {
+		pr_perror("Unable to open /proc/task-diag");
+		return 1;
+	}
+
+	if (write(fd, hdr, hdr->nlmsg_len) != hdr->nlmsg_len) {
+		pr_perror("Unable to send the request");
+		goto err;
+	}
+
+	while (1) {
+		char buf[163840];
+		size = read(fd, buf, sizeof(buf));
+
+		if (size < 0) {
+			pr_perror("Unable to read the response");
+			goto err;
+		}
+
+		/* A zero-length read marks the end of the dump. */
+		if (size == 0)
+			break;
+
+		err = nlmsg_receive(buf, size, &show_task, &last_pid);
+		if (err < 0)
+			goto err;
+
+		if (err == 0)
+			break;
+	}
+
+	exit_status = 0;
+err:
+	close(fd);
+	return exit_status;
+}
diff --git a/tools/testing/selftests/task_diag/task_diag_comm.c b/tools/testing/selftests/task_diag/task_diag_comm.c
new file mode 100644
index 00000000000000..9440e2f82ed7a0
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_diag_comm.c
@@ -0,0 +1,211 @@
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <linux/netlink.h>
+#include <netlink/cli/utils.h>
+
+#include "task_diag.h"
+#include "task_diag_comm.h"
+
+int quiet;
+
+#define PSS_SHIFT 12
+
+/*
+ * Walk the netlink messages stored in the @len bytes at @buf and hand
+ * each data message to @cb (when given) together with @args.
+ *
+ * Returns 0 when an NLMSG_DONE or a zero NLMSG_ERROR message ends the
+ * stream, 1 when the buffer was consumed without an end marker, and a
+ * negative value on error (the status carried by NLMSG_DONE, or -1).
+ */
+int nlmsg_receive(void *buf, int len, int (*cb)(struct nlmsghdr *, void *), void *args)
+{
+	struct nlmsghdr *hdr = buf;
+
+	while (NLMSG_OK(hdr, len)) {
+		switch (hdr->nlmsg_type) {
+		case NLMSG_DONE: {
+			int status = *(int *)NLMSG_DATA(hdr);
+
+			if (status < 0) {
+				pr_err("ERROR %d reported by netlink (%s)\n",
+					status, strerror(-status));
+				return status;
+			}
+
+			return 0;
+		}
+		case NLMSG_ERROR: {
+			struct nlmsgerr *report = (struct nlmsgerr *)NLMSG_DATA(hdr);
+
+			if (hdr->nlmsg_len - sizeof(*hdr) < sizeof(struct nlmsgerr)) {
+				pr_err("ERROR truncated\n");
+				return -1;
+			}
+
+			return report->error == 0 ? 0 : -1;
+		}
+		default:
+			break;
+		}
+
+		if (cb && cb(hdr, args))
+			return -1;
+
+		hdr = NLMSG_NEXT(hdr, len);
+	}
+
+	return 1;
+}
+
+/*
+ * Print one response message: the task it is about, followed by every
+ * attribute it carries.
+ *
+ * @arg points to an int holding the pid of the previously printed
+ * message; it tells a new task from the continuation of the same one and
+ * is updated on return.
+ *
+ * Returns 0 on success and -1 when a malformed attribute is found.
+ */
+int show_task(struct nlmsghdr *hdr, void *arg)
+{
+	int msg_len;
+	struct task_diag_msg *diag_msg;
+	struct nlattr *na;
+	int *last_pid = arg;
+	int len;
+
+	msg_len = NLMSG_PAYLOAD(hdr, 0);
+
+	diag_msg = NLMSG_DATA(hdr);
+
+	if (diag_msg->pid != *last_pid)
+		pr_info("Start getting information about %d\n", diag_msg->pid);
+	else
+		pr_info("Continue getting information about %d\n", diag_msg->pid);
+	*last_pid = diag_msg->pid;
+
+	na = ((void *) diag_msg) + NLMSG_ALIGN(sizeof(*diag_msg));
+	len = NLMSG_ALIGN(sizeof(*diag_msg));
+	while (len < msg_len) {
+		/* An attribute shorter than its header would loop forever. */
+		if (na->nla_len < NLA_HDRLEN) {
+			pr_err("Malformed attribute (len %d)", na->nla_len);
+			return -1;
+		}
+
+		len += NLA_ALIGN(na->nla_len);
+		switch (na->nla_type) {
+		case TASK_DIAG_BASE:
+		{
+			struct task_diag_base *base;
+
+			base = NLA_DATA(na);
+			pr_info("pid %5d tgid %5d ppid %5d sid %5d pgid %5d comm %s\n",
+				base->pid, base->tgid, base->ppid, base->sid, base->pgid, base->comm);
+		}
+		break;
+
+		case TASK_DIAG_CRED:
+		{
+			struct task_diag_creds *creds;
+
+			creds = NLA_DATA(na);
+			pr_info("uid: %d %d %d %d\n", creds->uid,
+					creds->euid, creds->suid, creds->fsuid);
+			pr_info("gid: %d %d %d %d\n", creds->gid,
+					creds->egid, creds->sgid, creds->fsgid);
+			pr_info("CapInh: %08x%08x\n",
+						creds->cap_inheritable.cap[1],
+						creds->cap_inheritable.cap[0]);
+			pr_info("CapPrm: %08x%08x\n",
+						creds->cap_permitted.cap[1],
+						creds->cap_permitted.cap[0]);
+			pr_info("CapEff: %08x%08x\n",
+						creds->cap_effective.cap[1],
+						creds->cap_effective.cap[0]);
+			pr_info("CapBnd: %08x%08x\n", creds->cap_bset.cap[1],
+						creds->cap_bset.cap[0]);
+		}
+		break;
+
+		case TASK_DIAG_CMDLINE:
+		{
+			char *cmdline = NLA_DATA(na);
+			int i, n = nla_len(na);
+
+			/* Arguments are NUL separated: join them with spaces. */
+			for (i = 0; i < n; i++)
+				if (cmdline[i] == 0)
+					cmdline[i] = ' ';
+			/* Terminate in place; an empty attribute has no byte to use. */
+			if (n > 0)
+				cmdline[n - 1] = 0;
+			pr_info("cmdline: %s\n", n > 0 ? cmdline : "");
+		}
+		break;
+
+		case TASK_DIAG_VMA:
+		{
+			struct task_diag_vma *vma_tmp, vma;
+
+			task_diag_for_each_vma(vma_tmp, na) {
+				char *name;
+				struct task_diag_vma_stat *stat_tmp, stat;
+
+				name = task_diag_vma_name(vma_tmp);
+				if (name == NULL)
+					name = "";
+
+				/* Copy out: records are not guaranteed to be 8-byte aligned. */
+				memcpy(&vma, vma_tmp, sizeof(vma));
+				pr_info("%016llx-%016llx %016llx %s\n",
+					vma.start, vma.end, vma.vm_flags, name);
+
+				stat_tmp = task_diag_vma_stat(vma_tmp);
+				if (stat_tmp)
+					memcpy(&stat, stat_tmp, sizeof(stat));
+				else
+					memset(&stat, 0, sizeof(stat));
+
+				pr_info(
+					   "Size:           %8llu kB\n"
+					   "Rss:            %8llu kB\n"
+					   "Pss:            %8llu kB\n"
+					   "Shared_Clean:   %8llu kB\n"
+					   "Shared_Dirty:   %8llu kB\n"
+					   "Private_Clean:  %8llu kB\n"
+					   "Private_Dirty:  %8llu kB\n"
+					   "Referenced:     %8llu kB\n"
+					   "Anonymous:      %8llu kB\n"
+					   "AnonHugePages:  %8llu kB\n"
+					   "Swap:           %8llu kB\n",
+					   (vma.end - vma.start) >> 10,
+					   stat.resident >> 10,
+					   (stat.pss >> (10 + PSS_SHIFT)),
+					   stat.shared_clean  >> 10,
+					   stat.shared_dirty  >> 10,
+					   stat.private_clean >> 10,
+					   stat.private_dirty >> 10,
+					   stat.referenced >> 10,
+					   stat.anonymous >> 10,
+					   stat.anonymous_thp >> 10,
+					   stat.swap >> 10);
+			}
+		}
+		break;
+		case TASK_DIAG_STAT:
+		{
+			struct task_diag_stat *stat;
+			stat = NLA_DATA(na);
+
+			pr_info(
+				"minflt: %llu\n"
+				"cminflt: %llu\n"
+				"majflt: %llu\n"
+				"cmajflt: %llu\n"
+				"utime: %llu\n"
+				"stime: %llu\n"
+				"cutime: %llu\n"
+				"cstime: %llu\n"
+				"threads: %u\n",
+				stat->minflt,
+				stat->cminflt,
+				stat->majflt,
+				stat->cmajflt,
+				stat->utime,
+				stat->stime,
+				stat->cutime,
+				stat->cstime,
+				stat->threads);
+		}
+		break;
+		default:
+			pr_info("Unknown nla_type %d\n",
+				na->nla_type);
+		}
+		na = ((void *) diag_msg) + len;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/task_diag/task_diag_comm.h b/tools/testing/selftests/task_diag/task_diag_comm.h
new file mode 100644
index 00000000000000..40e83b79f8b262
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_diag_comm.h
@@ -0,0 +1,49 @@
+#ifndef __TASK_DIAG_COMM__
+#define __TASK_DIAG_COMM__
+
+#include <stdio.h>
+
+#include "task_diag.h"
+
+/*
+ * Generic macros for dealing with netlink sockets. Might be duplicated
+ * elsewhere. It is recommended that commercial grade applications use
+ * libnl or libnetlink and use the interfaces provided by the library
+ *
+ * NOTE(review): these rely on NLMSG_DATA, NLMSG_PAYLOAD, GENL_HDRLEN and
+ * NLA_HDRLEN being declared before the macros are expanded; presumably
+ * task_diag.h or the including .c file pulls in the netlink headers.
+ */
+
+/* Address GENL_HDRLEN bytes past NLMSG_DATA(glh). */
+#define GENLMSG_DATA(glh)	((void *)((char *)NLMSG_DATA(glh) + GENL_HDRLEN))
+/* NLMSG_PAYLOAD(glh, 0) reduced by GENL_HDRLEN. */
+#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+/* Address NLA_HDRLEN bytes past the attribute header na. */
+#define NLA_DATA(na)		((void *)((char *)(na) + NLA_HDRLEN))
+/* Attribute length len reduced by NLA_HDRLEN. */
+#define NLA_PAYLOAD(len)	((len) - NLA_HDRLEN)
+
+/* Print to stderr, prefixed with the calling function name and line. */
+#define pr_err(fmt, ...)				\
+		fprintf(stderr, "%s:%d" fmt"\n", __func__, __LINE__, ##__VA_ARGS__)
+
+/* Print to stderr, followed by the errno text (%m is a glibc extension). */
+#define pr_perror(fmt, ...)				\
+		fprintf(stderr, fmt " : %m\n", ##__VA_ARGS__)
+
+/* Non-zero silences pr_info(); defined outside this header. */
+extern int quiet;
+
+/* printf() to stdout unless quiet is set. */
+#define pr_info(fmt, ...)			\
+	do {					\
+		if (!quiet)			\
+			printf(fmt, ##__VA_ARGS__);	\
+	} while (0)
+
+/* Helpers declared for the task_diag tests; implemented outside this header. */
+int nlmsg_receive(void *buf, int len, int (*cb)(struct nlmsghdr *, void *), void *args);
+extern int show_task(struct nlmsghdr *hdr, void *arg);
+
+#endif /* __TASK_DIAG_COMM__ */
diff --git a/tools/testing/selftests/task_diag/task_proc_all.c b/tools/testing/selftests/task_diag/task_proc_all.c
new file mode 100644
index 00000000000000..15b934cd47d147
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_proc_all.c
@@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <stdbool.h>
+
+
+/*
+ * Walk /proc and, for every entry whose name starts with a digit, open
+ * /proc/<entry>/<file>, where <file> is the first non-option argument.
+ * Unless -R/--noread is given, one read() of up to sizeof(buf) bytes is
+ * issued on each opened file. The number of files processed is printed
+ * as "tasks: N".
+ *
+ * Returns 0 on success and 1 on a usage error or on the first failing
+ * opendir()/open()/read(); the failure is reported on stderr.
+ */
+int main(int argc, char **argv)
+{
+	DIR *d;
+	int fd = -1, tasks = 0, ret = 1;
+	struct dirent *de;
+	char buf[4096 * 4];
+	static const char short_opts[] = "R";
+	static struct option long_opts[] = {
+		{ "noread",	no_argument,	0, 'R'},
+		{}
+	};
+	bool noread = false;
+	int idx, opt;
+
+	while (1) {
+		idx = -1;
+		opt = getopt_long(argc, argv, short_opts, long_opts, &idx);
+		if (opt == -1)
+			break;
+		switch (opt) {
+		case 'R':
+			noread = true;
+			break;
+		default:
+			goto usage;
+		}
+	}
+
+	if (optind >= argc)
+		goto usage;
+
+	d = opendir("/proc");
+	if (d == NULL) {
+		perror("opendir(/proc)");
+		return 1;
+	}
+
+	while ((de = readdir(d))) {
+		if (de->d_name[0] < '0' || de->d_name[0] > '9')
+			continue;
+		/* buf holds the path first and is then reused for the data. */
+		snprintf(buf, sizeof(buf), "/proc/%s/%s", de->d_name, argv[optind]);
+		fd = open(buf, O_RDONLY);
+		if (fd < 0) {
+			/*
+			 * NOTE(review): a task exiting between readdir() and
+			 * open() also ends up here and aborts the whole run.
+			 */
+			perror(buf);
+			goto out;
+		}
+		if (!noread && read(fd, buf, sizeof(buf)) < 0) {
+			perror("read");
+			goto out;
+		}
+		close(fd);
+		fd = -1;
+		tasks++;
+	}
+
+	printf("tasks: %d\n", tasks);
+	ret = 0;
+out:
+	if (fd >= 0)
+		close(fd);
+	closedir(d);
+	return ret;
+usage:
+	fprintf(stderr, "Usage: task_proc_all [-R|--noread] FILE\n");
+	return 1;
+}