From fdc8c87e5bb56abc25a788263472135d46a8e5fa Mon Sep 17 00:00:00 2001 From: Mao Date: Sat, 23 Nov 2024 18:21:19 +0000 Subject: [PATCH 1/5] Very basic instant checkpoint implementation with read/write syscall PoC --- .config | 2 + Kconfig | 6 + fs/read_write.c | 34 +++- include/linux/ick.h | 30 +++ include/linux/sched.h | 5 + init/init_task.c | 4 + kernel/Makefile | 2 + kernel/fork.c | 4 + kernel/ick.c | 425 ++++++++++++++++++++++++++++++++++++++++++ mm/memory.c | 14 ++ mm/mmap.c | 10 + mm/mprotect.c | 5 + 12 files changed, 540 insertions(+), 1 deletion(-) create mode 100644 include/linux/ick.h create mode 100644 kernel/ick.c diff --git a/.config b/.config index 4fbf402746d765..8e4cf38650d82f 100644 --- a/.config +++ b/.config @@ -3432,3 +3432,5 @@ CONFIG_ARCH_USE_MEMTEST=y # # end of Rust hacking # end of Kernel hacking + +CONFIG_ICK=y diff --git a/Kconfig b/Kconfig index 745bc773f56706..9b13ab8e4f292e 100644 --- a/Kconfig +++ b/Kconfig @@ -30,3 +30,9 @@ source "lib/Kconfig" source "lib/Kconfig.debug" source "Documentation/Kconfig" + +config ICK + bool "ick" + default y + help + instant checkpoint diff --git a/fs/read_write.c b/fs/read_write.c index 90e283b31ca181..f7c0b5eb99bad7 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -24,6 +24,8 @@ #include #include +#include +#include const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, @@ -607,8 +609,19 @@ static inline loff_t *file_ppos(struct file *file) ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { - struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; + struct fd f; + + if (fd == 0 && current->hack_target && !current->ick_data) { + trace_printk("ick checkpoint on hacked process %s[%u]\n", current->comm, current->pid); + ret = ick_checkpoint_proc(); + if (ret) { + pr_err("sys_read: ick checkpoint failed: %pe\n", ERR_PTR(ret)); + return ret; + } + } + + f = fdget_pos(fd); if (f.file) { loff_t pos, *ppos = file_ppos(f.file); @@ -652,6 +665,25 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { + if (fd == 1 && current->hack_target && current->ick_data) { + char data[64]; + int ret = copy_from_user(data, buf, min_t(size_t, count, sizeof(data))); + if (ret) { + pr_err("sys_write: copy_from_user failed: %pe\n", ERR_PTR(ret)); + return ret; + } + data[sizeof(data) - 1] = '\0'; + trace_printk("hacked process attempted write with data %s\n", data); + ick_revert_proc(); + + // Restart the original syscall + // XXX: copied from do_syscall_x64 - doesn't correctly handle all cases + // (e.g. x32_sys_call) but is fine for us to just restart sys_read + struct pt_regs *regs = current_pt_regs(); + long nr = regs->orig_ax; + return x64_sys_call(regs, nr); + } + return ksys_write(fd, buf, count); } diff --git a/include/linux/ick.h b/include/linux/ick.h new file mode 100644 index 00000000000000..cac65d0aa6d986 --- /dev/null +++ b/include/linux/ick.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_ICK_H +#define _LINUX_ICK_H + +#include +#include + +struct ick_checked_process { + struct thread_struct saved_state; + struct pt_regs saved_regs; + struct rb_root modified_pages_tree; + struct spinlock tree_lock; +}; + +struct ick_modified_page { + unsigned long addr; + struct rb_node node; + // Don't include a whole page of data here, otherwise this struct will be just + // a bit over PAGE_SIZE, which makes memory allocation inefficient + u8 *orig_page_content; +}; + +#ifdef CONFIG_ICK +int ick_checkpoint_proc(void); +int ick_revert_proc(void); +vm_fault_t ick_do_wp_page(struct vm_fault *vmf); +#endif + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index f8d150343d42d9..a2997fe6801eff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1568,6 +1568,11 @@ struct task_struct { struct user_event_mm *user_event_mm; #endif +#ifdef CONFIG_ICK + struct ick_checked_process *ick_data; +#endif + + bool hack_target; /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/init/init_task.c b/init/init_task.c index eeb110c65fe22f..516056f9796919 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -207,6 +207,10 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_ICK + .ick_data = NULL, +#endif + .hack_target = false, }; EXPORT_SYMBOL(init_task); diff --git a/kernel/Makefile b/kernel/Makefile index 3c13240dfc9f09..a46c0f448759a0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -160,3 +160,5 @@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) clean-files := kheaders_data.tar.xz kheaders.md5 + +obj-$(CONFIG_ICK) += ick.o diff --git a/kernel/fork.c b/kernel/fork.c index cc760491f20127..9b593c0175d025 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1192,6 +1192,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->mm_cid_active = 0; tsk->migrate_from_cpu = -1; #endif + + // ick doesn't support multiple tasks, and should prevent fork calls + BUG_ON(tsk->ick_data); + return tsk; free_stack: diff --git a/kernel/ick.c b/kernel/ick.c new file mode 100644 index 00000000000000..c054ec8ef45801 --- /dev/null +++ b/kernel/ick.c @@ -0,0 +1,425 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * ick: Instant checkpoint + * + * Provides a mechanism to "checkpoint" a process at some syscall entry, saving + * its register and marking its writable memory pages as read-only, such that + * any attempted writes will cause the current content of these pages to be + * saved before allowing the write to proceed. + * + * At some later point in time, the process can be reverted back to the state + * when it first made the checkpoint-ing syscall. + * + * To simplify the implementation, we only support single-threaded processes, + * and we do not allow the process to make any syscalls other than read / write + * to/from stdin/stdout/stderr. We also do not support things like huge pages. + * + * This is designed for quick brute-forcing of e.g. CTF binaries. A checkpoint + * can be made when it first tries reading from stdin for a "password", and then + * the whole process can be quickly reverted (in a matter of microseconds) to + * try a different password if the one provided earlier was incorrect, and the + * process tries to write a message to stdout saying so. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +static int __ick_mark_pages(struct task_struct *task, + struct ick_checked_process *ick_data); +static int __ick_revert_process(struct task_struct *task); +static void __ick_cleanup(struct task_struct *task); + +int ick_checkpoint_proc(void) { + struct ick_checked_process *ick_data; + int ret; + + struct task_struct *curr_task = current; + + pid_t pid = curr_task->pid; + + if (!thread_group_empty(curr_task)) { + pr_alert("ick: %s[%d] is not single-threaded\n", curr_task->comm, pid); + return -EINVAL; + } + + if (curr_task->ick_data) { + pr_alert("ick: %s[%d] already has a checkpoint\n", curr_task->comm, pid); + return -EEXIST; + } + + ick_data = kzalloc(sizeof(*ick_data), GFP_KERNEL); + if (!ick_data) { + return -ENOMEM; + } + ick_data->modified_pages_tree = RB_ROOT; + spin_lock_init(&ick_data->tree_lock); + + // Save registers +#if defined(__x86_64__) + struct pt_regs *regs = task_pt_regs(curr_task); + memcpy(&ick_data->saved_regs, regs, sizeof(struct pt_regs)); + // TODO: Save other thread states, see e.g. fork or switch_to implementation + // process_64.c:__switch_to + // or process.c:copy_thread + // However we do not need to store fpus as this is a syscall kernel entry and + // fpus are caller-saved. + + current_save_fsgs(); + ick_data->saved_state.fsindex = current->thread.fsindex; + ick_data->saved_state.fsbase = current->thread.fsbase; + ick_data->saved_state.gsindex = current->thread.gsindex; + ick_data->saved_state.gsbase = current->thread.gsbase; + + savesegment(es, ick_data->saved_state.es); + savesegment(ds, ick_data->saved_state.ds); +#else +#error "Unsupported architecture" +#endif + + ret = __ick_mark_pages(curr_task, ick_data); + if (ret) { + goto free_ickdata; + } + + curr_task->ick_data = ick_data; + trace_printk("ick: Checkpointed %s[%d], ip = %px, sp = %px\n", curr_task->comm, pid, (void*)regs->ip, (void*)regs->sp); + + return 0; + +free_ickdata: + kfree(ick_data); + curr_task->ick_data = NULL; + return ret; +} + +// Stop monitoring a process +int ick_revert_proc(void) { + struct task_struct *curr_task = current; + + pid_t pid = curr_task->pid; + + if (!curr_task->ick_data) { + pr_alert("ick: ick_revert_proc called on %s[%d] which is not under ick checkpoint\n", curr_task->comm, pid); + return -EINVAL; + } + + __ick_revert_process(curr_task); + __ick_cleanup(curr_task); + + return 0; +} + +// Checkpoint the process: mark pages as read-only and hook VMAs +static int __ick_mark_pages(struct task_struct *task, + struct ick_checked_process *ick_data) { + struct mm_struct *mm; + struct vm_area_struct *vma; + int ret = 0; + struct mmu_gather tlb; + MA_STATE(mas, &task->mm->mm_mt, 0, ULONG_MAX); + + mm = task->mm; + if (!mm) { + return -EINVAL; + } + + // This is the lock used for the maple tree as well + mmap_write_lock_killable(mm); + tlb_gather_mmu(&tlb, task->mm); + + while ((vma = mas_find(&mas, ULONG_MAX))) { + if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) { + trace_printk("Skipping VMA %lx-%lx (%s) as not VM_WRITE\n", vma->vm_start, + vma->vm_end, + vma->vm_file ? (char *)vma->vm_file->f_path.dentry->d_iname + : "anon"); + BUG_ON(vma->vm_page_prot.pgprot & VM_WRITE); + continue; + } + + trace_printk( + "Marking VMA %lx-%lx (%lu KiB of %s) as read-only\n", vma->vm_start, + vma->vm_end, (vma->vm_end - vma->vm_start) / 1024, + vma->vm_file ? (char *)vma->vm_file->f_path.dentry->d_iname : "anon"); + + // TODO: we can't do this - kernel will just treat this as an access error and not call our fault handler... + // See access_error(error_code, vma) + // Even when page is read-only, if the VMA is "supposed" to be written, it + // should be marked VM_WRITE | VM_MAYWRITE (VM_MAYWRITE might not be set if + // the page is CoW, which is not the case here) + + // vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); // calls vma_start_write + // vma_set_page_prot(vma); + + vma_set_page_prot(vma); + change_protection(&tlb, vma, vma->vm_start, vma->vm_end, 0); + + if (ret) { + pr_alert("%s[%d]: ick: Failed to mark VMA pages as read-only\n", __func__, + current->pid); + send_sig(SIGKILL, task, 0); + goto unlock_and_ret; + } + } + +unlock_and_ret: + tlb_finish_mmu(&tlb); + mmap_write_unlock(mm); // Calls vma_end_write_all + return ret; +} + +vm_fault_t ick_do_wp_page(struct vm_fault *vmf) { + unsigned long addr = (unsigned long)vmf->address; + unsigned long page_addr = addr & PAGE_MASK; + struct task_struct *task = current; + struct ick_checked_process *ick_data = task->ick_data; + struct ick_modified_page *mod_page; + struct rb_node **new; + struct rb_node *parent = NULL; + long ret; + + BUG_ON(!ick_data); + BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); + + spin_lock(&ick_data->tree_lock); + new = &ick_data->modified_pages_tree.rb_node; + while (*new) { + parent = *new; + mod_page = rb_entry(parent, struct ick_modified_page, node); + + if (page_addr < mod_page->addr) + new = &parent->rb_left; + else if (page_addr > mod_page->addr) + new = &parent->rb_right; + else { + // Page already in tree, so it's already copied before. Ignore. + trace_printk("Already in tree page 0x%px hti wp fault at 0x%px again\n", (void*)page_addr, (void*)addr); + spin_unlock(&ick_data->tree_lock); + return VM_FAULT_LOCKED; + } + } + + u8 *copied_page_content = vmalloc(PAGE_SIZE); + if (!copied_page_content) { + spin_unlock(&ick_data->tree_lock); + return VM_FAULT_OOM; + } + + mod_page = vmalloc(sizeof(*mod_page)); + mod_page->addr = page_addr; + rb_link_node(&mod_page->node, parent, new); + rb_insert_color(&mod_page->node, &ick_data->modified_pages_tree); + mod_page->orig_page_content = copied_page_content; + trace_printk("CoWing page 0x%px following wp fault at 0x%px\n", (void*) page_addr, (void*) addr); + ret = copy_from_user_nofault(copied_page_content, (void *)page_addr, PAGE_SIZE); + if (ret) { + pr_alert("ick: Failed to copy page 0x%px following wp fault at 0x%px: %pe\n", (void*)page_addr, (void*)addr, ERR_PTR(ret)); + spin_unlock(&ick_data->tree_lock); + return VM_FAULT_SIGBUS; + } + + spin_unlock(&ick_data->tree_lock); + + return VM_FAULT_LOCKED; +} + +// copied from process_64.c +enum which_selector { FS, GS }; + +// copied from process_64.c +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) { + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +// copied from process_64.c +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) { + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + +// copied from process_64.c +static noinstr void __wrgsbase_inactive(unsigned long gsbase) { + lockdep_assert_irqs_disabled(); + + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); + } else { + instrumentation_begin(); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } +} + +// copied from process_64.c +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, + struct thread_struct *next) { + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. */ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, next->fsindex, next->fsbase, + FS); + load_seg_legacy(prev->gsindex, prev->gsbase, next->gsindex, next->gsbase, + GS); + } +} + +// Restore the process to the checkpointed state +static int __ick_revert_process(struct task_struct *task) { + struct ick_checked_process *ick_data; + struct ick_modified_page *mod_page, *tmp; + struct mm_struct *mm; + unsigned long addr; + int ret = 0; + struct pt_regs *regs; + + BUG_ON(task != + current); // TODO: we should probably just get rid of the task argument + + ick_data = task->ick_data; + BUG_ON(!ick_data); + + mm = task->mm; + if (!mm) { + return -EINVAL; + } + + rbtree_postorder_for_each_entry_safe(mod_page, tmp, &ick_data->modified_pages_tree, node) { + addr = mod_page->addr; + u8 *orig_page_content = mod_page->orig_page_content; + trace_printk("Restoring CoW'd page at 0x%px\n", (void*)addr); + ret = copy_to_user_nofault((void *)addr, orig_page_content, PAGE_SIZE); + if (ret) { + pr_alert("ick: Failed to copy page content for 0x%px back: %pe\n", (void*)addr, ERR_PTR(ret)); + return ret; + } + vfree(orig_page_content); + vfree(mod_page); + } + + ick_data->modified_pages_tree = RB_ROOT; + + // Restore registers +#if defined(__x86_64__) + { + regs = task_pt_regs(task); + memcpy(regs, &ick_data->saved_regs, sizeof(struct pt_regs)); + + current_save_fsgs(); + x86_fsgsbase_load(&task->thread, &ick_data->saved_state); + task->thread.fsindex = ick_data->saved_state.fsindex; + task->thread.fsbase = ick_data->saved_state.fsbase; + task->thread.gsindex = ick_data->saved_state.gsindex; + task->thread.gsbase = ick_data->saved_state.gsbase; + + task->thread.es = ick_data->saved_state.es; + loadsegment(es, ick_data->saved_state.es); + task->thread.ds = ick_data->saved_state.ds; + loadsegment(ds, ick_data->saved_state.ds); + } +#else +#error "Unsupported architecture" +#endif + + trace_printk("Restored process %s[%d], ip = %px, sp = %px\n", task->comm, task->pid, (void*)regs->ip, (void*)regs->sp); + + return 0; +} + +// Cleanup function called when the task_struct is freed +static void __ick_cleanup(struct task_struct *task) { + struct ick_checked_process *ick_data = task->ick_data; + + if (!ick_data) { + return; + } + + if (!RB_EMPTY_ROOT(&ick_data->modified_pages_tree)) { + struct ick_modified_page *mod_page, *tmp; + rbtree_postorder_for_each_entry_safe(mod_page, tmp, &ick_data->modified_pages_tree, node) { + vfree(mod_page->orig_page_content); + vfree(mod_page); + } + } + + kfree(ick_data); + task->ick_data = NULL; +} diff --git a/mm/memory.c b/mm/memory.c index ebfc9768f801af..631ea4fcc5012e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -92,6 +92,8 @@ #include "internal.h" #include "swap.h" +#include + #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif @@ -3695,6 +3697,18 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (vmf->page) folio = page_folio(vmf->page); +#ifdef CONFIG_ICK + if (current->ick_data) { + vm_fault_t ret = ick_do_wp_page(vmf); + if (ret & (VM_FAULT_ERROR|VM_FAULT_OOM|VM_FAULT_NOPAGE)) { + pr_alert("ick_do_wp_page failed on address %px, ret = (vm_fault_t)0x%x\n", (void *)vmf->address, ret); + if (!(ret & VM_FAULT_LOCKED)) + pte_unmap_unlock(vmf->pte, vmf->ptl); + return ret; + } + } +#endif + /* * Shared mapping: we are guaranteed to have VM_WRITE and * FAULT_FLAG_WRITE set at this point. diff --git a/mm/mmap.c b/mm/mmap.c index d0dfc85b209bbc..7680c3d9e617b1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1266,6 +1266,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!len) return -EINVAL; + if (current->ick_data) { + pr_err("do_mmap: mmap blocked while process in ick\n"); + return -EPERM; + } + /* * Does the application expect PROT_READ to imply PROT_EXEC? * @@ -2801,6 +2806,11 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; + if (current->ick_data) { + pr_err("do_vmi_unmap: munmap blocked while process in ick\n"); + return -EPERM; + } + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; diff --git a/mm/mprotect.c b/mm/mprotect.c index 222ab434da549d..7b1a8553c606b1 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -693,6 +693,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, struct mmu_gather tlb; struct vma_iterator vmi; + if (current->ick_data) { + pr_err("do_mprotect_pkey: mprotect blocked while process in ick\n"); + return -EPERM; + } + start = untagged_addr(start); prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); From 5e948fc61e8957319ea2147bc55006d44e0ed7d7 Mon Sep 17 00:00:00 2001 From: Mao Date: Sat, 23 Nov 2024 18:54:05 +0000 Subject: [PATCH 2/5] Allow revert without re-checkpointing --- fs/read_write.c | 11 ++--------- include/linux/ick.h | 3 ++- kernel/exit.c | 5 +++++ kernel/ick.c | 25 ++++++++++++++----------- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index f7c0b5eb99bad7..12940299a47b7e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -674,14 +674,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, } data[sizeof(data) - 1] = '\0'; trace_printk("hacked process attempted write with data %s\n", data); - ick_revert_proc(); - - // Restart the original syscall - // XXX: copied from do_syscall_x64 - doesn't correctly handle all cases - // (e.g. x32_sys_call) but is fine for us to just restart sys_read - struct pt_regs *regs = current_pt_regs(); - long nr = regs->orig_ax; - return x64_sys_call(regs, nr); + ick_revert_proc(false); } return ksys_write(fd, buf, count); @@ -733,7 +726,7 @@ ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, f = fdget(fd); if (f.file) { ret = -ESPIPE; - if (f.file->f_mode & FMODE_PWRITE) + if (f.file->f_mode & FMODE_PWRITE) ret = vfs_write(f.file, buf, count, &pos); fdput(f); } diff --git a/include/linux/ick.h b/include/linux/ick.h index cac65d0aa6d986..c8f99728dce420 100644 --- a/include/linux/ick.h +++ b/include/linux/ick.h @@ -23,8 +23,9 @@ struct ick_modified_page { #ifdef CONFIG_ICK int ick_checkpoint_proc(void); -int ick_revert_proc(void); +int ick_revert_proc(bool reset_ick); vm_fault_t ick_do_wp_page(struct vm_fault *vmf); +void ick_cleanup(struct task_struct *task); #endif #endif diff --git a/kernel/exit.c b/kernel/exit.c index 7430852a857129..5d28a6a8d3c3a1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -70,6 +70,7 @@ #include #include #include +#include #include @@ -824,6 +825,10 @@ void __noreturn do_exit(long code) WARN_ON(irqs_disabled()); +#ifdef CONFIG_ICK + ick_cleanup(tsk); +#endif + synchronize_group_exit(tsk, code); WARN_ON(tsk->plug); diff --git a/kernel/ick.c b/kernel/ick.c index c054ec8ef45801..ac0aa71ebb81df 100644 --- a/kernel/ick.c +++ b/kernel/ick.c @@ -48,7 +48,6 @@ static int __ick_mark_pages(struct task_struct *task, struct ick_checked_process *ick_data); static int __ick_revert_process(struct task_struct *task); -static void __ick_cleanup(struct task_struct *task); int ick_checkpoint_proc(void) { struct ick_checked_process *ick_data; @@ -114,7 +113,7 @@ int ick_checkpoint_proc(void) { } // Stop monitoring a process -int ick_revert_proc(void) { +int ick_revert_proc(bool reset_ick) { struct task_struct *curr_task = current; pid_t pid = curr_task->pid; @@ -125,7 +124,10 @@ int ick_revert_proc(void) { } __ick_revert_process(curr_task); - __ick_cleanup(curr_task); + + if (reset_ick) { + ick_cleanup(curr_task); + } return 0; } @@ -345,7 +347,8 @@ static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, // Restore the process to the checkpointed state static int __ick_revert_process(struct task_struct *task) { struct ick_checked_process *ick_data; - struct ick_modified_page *mod_page, *tmp; + struct ick_modified_page *mod_page; + struct rb_node *node; struct mm_struct *mm; unsigned long addr; int ret = 0; @@ -362,7 +365,10 @@ static int __ick_revert_process(struct task_struct *task) { return -EINVAL; } - rbtree_postorder_for_each_entry_safe(mod_page, tmp, &ick_data->modified_pages_tree, node) { + // Do it in order for better cache locality + for (node = rb_first(&ick_data->modified_pages_tree); node; + node = rb_next(node)) { + mod_page = rb_entry(node, struct ick_modified_page, node); addr = mod_page->addr; u8 *orig_page_content = mod_page->orig_page_content; trace_printk("Restoring CoW'd page at 0x%px\n", (void*)addr); @@ -371,12 +377,8 @@ static int __ick_revert_process(struct task_struct *task) { pr_alert("ick: Failed to copy page content for 0x%px back: %pe\n", (void*)addr, ERR_PTR(ret)); return ret; } - vfree(orig_page_content); - vfree(mod_page); } - ick_data->modified_pages_tree = RB_ROOT; - // Restore registers #if defined(__x86_64__) { @@ -404,14 +406,15 @@ static int __ick_revert_process(struct task_struct *task) { return 0; } -// Cleanup function called when the task_struct is freed -static void __ick_cleanup(struct task_struct *task) { +void ick_cleanup(struct task_struct *task) { struct ick_checked_process *ick_data = task->ick_data; if (!ick_data) { return; } + trace_printk("Cleaning up ick data for %s[%d]\n", task->comm, task->pid); + if (!RB_EMPTY_ROOT(&ick_data->modified_pages_tree)) { struct ick_modified_page *mod_page, *tmp; rbtree_postorder_for_each_entry_safe(mod_page, tmp, &ick_data->modified_pages_tree, node) { From 765b9489d74dc04968940f0da58f952436dbdfc8 Mon Sep 17 00:00:00 2001 From: Mao Date: Sat, 23 Nov 2024 19:57:11 +0000 Subject: [PATCH 3/5] Clean up, document issue --- include/linux/ick.h | 1 + kernel/ick.c | 53 +++++++++++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/include/linux/ick.h b/include/linux/ick.h index c8f99728dce420..f488712eb0d6e0 100644 --- a/include/linux/ick.h +++ b/include/linux/ick.h @@ -11,6 +11,7 @@ struct ick_checked_process { struct pt_regs saved_regs; struct rb_root modified_pages_tree; struct spinlock tree_lock; + bool reverting; }; struct ick_modified_page { diff --git a/kernel/ick.c b/kernel/ick.c index ac0aa71ebb81df..81a14b93294159 100644 --- a/kernel/ick.c +++ b/kernel/ick.c @@ -137,7 +137,6 @@ static int __ick_mark_pages(struct task_struct *task, struct ick_checked_process *ick_data) { struct mm_struct *mm; struct vm_area_struct *vma; - int ret = 0; struct mmu_gather tlb; MA_STATE(mas, &task->mm->mm_mt, 0, ULONG_MAX); @@ -176,19 +175,11 @@ static int __ick_mark_pages(struct task_struct *task, vma_set_page_prot(vma); change_protection(&tlb, vma, vma->vm_start, vma->vm_end, 0); - - if (ret) { - pr_alert("%s[%d]: ick: Failed to mark VMA pages as read-only\n", __func__, - current->pid); - send_sig(SIGKILL, task, 0); - goto unlock_and_ret; - } } -unlock_and_ret: tlb_finish_mmu(&tlb); mmap_write_unlock(mm); // Calls vma_end_write_all - return ret; + return 0; } vm_fault_t ick_do_wp_page(struct vm_fault *vmf) { @@ -204,6 +195,10 @@ vm_fault_t ick_do_wp_page(struct vm_fault *vmf) { BUG_ON(!ick_data); BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); + if (READ_ONCE(ick_data->reverting)) { + return VM_FAULT_LOCKED; + } + spin_lock(&ick_data->tree_lock); new = &ick_data->modified_pages_tree.rb_node; while (*new) { @@ -354,8 +349,8 @@ static int __ick_revert_process(struct task_struct *task) { int ret = 0; struct pt_regs *regs; - BUG_ON(task != - current); // TODO: we should probably just get rid of the task argument + BUG_ON(task != current); + // TODO: we should probably just get rid of the task argument ick_data = task->ick_data; BUG_ON(!ick_data); @@ -365,6 +360,9 @@ static int __ick_revert_process(struct task_struct *task) { return -EINVAL; } + BUG_ON(xchg(&ick_data->reverting, true)); + + spin_lock(&ick_data->tree_lock); // Do it in order for better cache locality for (node = rb_first(&ick_data->modified_pages_tree); node; node = rb_next(node)) { @@ -375,9 +373,13 @@ static int __ick_revert_process(struct task_struct *task) { ret = copy_to_user_nofault((void *)addr, orig_page_content, PAGE_SIZE); if (ret) { pr_alert("ick: Failed to copy page content for 0x%px back: %pe\n", (void*)addr, ERR_PTR(ret)); + spin_unlock(&ick_data->tree_lock); return ret; } } + spin_unlock(&ick_data->tree_lock); + + WRITE_ONCE(ick_data->reverting, false); // Restore registers #if defined(__x86_64__) @@ -385,17 +387,20 @@ static int __ick_revert_process(struct task_struct *task) { regs = task_pt_regs(task); memcpy(regs, &ick_data->saved_regs, sizeof(struct pt_regs)); - current_save_fsgs(); - x86_fsgsbase_load(&task->thread, &ick_data->saved_state); - task->thread.fsindex = ick_data->saved_state.fsindex; - task->thread.fsbase = ick_data->saved_state.fsbase; - task->thread.gsindex = ick_data->saved_state.gsindex; - task->thread.gsbase = ick_data->saved_state.gsbase; - - task->thread.es = ick_data->saved_state.es; - loadsegment(es, ick_data->saved_state.es); - task->thread.ds = ick_data->saved_state.ds; - loadsegment(ds, ick_data->saved_state.ds); + // XXX: there is some issue with this code which causes recursive page + // faults when spin_unlock above is preempted for some reason??? + + // current_save_fsgs(); + // x86_fsgsbase_load(&task->thread, &ick_data->saved_state); + // task->thread.fsindex = ick_data->saved_state.fsindex; + // task->thread.fsbase = ick_data->saved_state.fsbase; + // task->thread.gsindex = ick_data->saved_state.gsindex; + // task->thread.gsbase = ick_data->saved_state.gsbase; + + // task->thread.es = ick_data->saved_state.es; + // loadsegment(es, ick_data->saved_state.es); + // task->thread.ds = ick_data->saved_state.ds; + // loadsegment(ds, ick_data->saved_state.ds); } #else #error "Unsupported architecture" @@ -415,6 +420,7 @@ void ick_cleanup(struct task_struct *task) { trace_printk("Cleaning up ick data for %s[%d]\n", task->comm, task->pid); + spin_lock(&ick_data->tree_lock); if (!RB_EMPTY_ROOT(&ick_data->modified_pages_tree)) { struct ick_modified_page *mod_page, *tmp; rbtree_postorder_for_each_entry_safe(mod_page, tmp, &ick_data->modified_pages_tree, node) { @@ -422,6 +428,7 @@ void ick_cleanup(struct task_struct *task) { vfree(mod_page); } } + spin_unlock(&ick_data->tree_lock); kfree(ick_data); task->ick_data = NULL; From b1769661b1a80c9c4883cd01367537c66039c9e3 Mon Sep 17 00:00:00 2001 From: Mao Date: Sat, 23 Nov 2024 19:58:54 +0000 Subject: [PATCH 4/5] Remove confusing note --- kernel/ick.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/kernel/ick.c b/kernel/ick.c index 81a14b93294159..b8bcce2060122a 100644 --- a/kernel/ick.c +++ b/kernel/ick.c @@ -164,15 +164,6 @@ static int __ick_mark_pages(struct task_struct *task, vma->vm_end, (vma->vm_end - vma->vm_start) / 1024, vma->vm_file ? (char *)vma->vm_file->f_path.dentry->d_iname : "anon"); - // TODO: we can't do this - kernel will just treat this as an access error and not call our fault handler... - // See access_error(error_code, vma) - // Even when page is read-only, if the VMA is "supposed" to be written, it - // should be marked VM_WRITE | VM_MAYWRITE (VM_MAYWRITE might not be set if - // the page is CoW, which is not the case here) - - // vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); // calls vma_start_write - // vma_set_page_prot(vma); - vma_set_page_prot(vma); change_protection(&tlb, vma, vma->vm_start, vma->vm_end, 0); } From 9f4d4a2cd284acedfdfe11857702094595b01ccf Mon Sep 17 00:00:00 2001 From: Mao Date: Sun, 24 Nov 2024 13:57:45 +0000 Subject: [PATCH 5/5] make oldconfig --- .config | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/.config b/.config index 8e4cf38650d82f..5cdedd6c2bf43d 100644 --- a/.config +++ b/.config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.11.0 Kernel Configuration +# Linux/x86 6.12.0 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 14.2.1 20240910" CONFIG_CC_IS_GCC=y @@ -11,6 +11,8 @@ CONFIG_AS_VERSION=24300 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=24300 CONFIG_LLD_VERSION=0 +CONFIG_RUSTC_VERSION=106701 +CONFIG_RUSTC_LLVM_VERSION=150006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y @@ -127,6 +129,7 @@ CONFIG_PREEMPT_BUILD=y CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set +# CONFIG_PREEMPT_RT is not set CONFIG_PREEMPT_COUNT=y CONFIG_PREEMPTION=y CONFIG_PREEMPT_DYNAMIC=y @@ -262,7 +265,7 @@ CONFIG_CACHESTAT_SYSCALL=y # CONFIG_PC104 is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_SELFTEST is not set -# CONFIG_KALLSYMS_ALL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y CONFIG_HAVE_PERF_EVENTS=y @@ -427,12 +430,12 @@ CONFIG_MTRR_SANITIZER=y CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=0 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=1 CONFIG_X86_PAT=y -CONFIG_ARCH_USES_PG_UNCACHED=y CONFIG_X86_UMIP=y CONFIG_CC_HAS_IBT=y CONFIG_X86_CET=y CONFIG_X86_KERNEL_IBT=y CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +CONFIG_ARCH_PKEY_BITS=4 CONFIG_X86_INTEL_TSX_MODE_OFF=y # CONFIG_X86_INTEL_TSX_MODE_ON is not set # CONFIG_X86_INTEL_TSX_MODE_AUTO is not set @@ -453,6 +456,7 @@ CONFIG_ARCH_SUPPORTS_KEXEC_SIG_FORCE=y CONFIG_ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG=y CONFIG_ARCH_SUPPORTS_KEXEC_JUMP=y CONFIG_ARCH_SUPPORTS_CRASH_DUMP=y +CONFIG_ARCH_DEFAULT_CRASH_DUMP=y CONFIG_ARCH_SUPPORTS_CRASH_HOTPLUG=y CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION=y CONFIG_PHYSICAL_START=0x1000000 @@ -472,6 +476,7 @@ CONFIG_LEGACY_VSYSCALL_XONLY=y CONFIG_MODIFY_LDT_SYSCALL=y # CONFIG_STRICT_SIGALTSTACK_SIZE is not set CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set # end of Processor type and features CONFIG_CC_HAS_NAMED_AS=y @@ -692,6 +697,7 @@ CONFIG_MMU_GATHER_RCU_TABLE_FREE=y CONFIG_MMU_GATHER_MERGE_VMAS=y CONFIG_MMU_LAZY_TLB_REFCOUNT=y CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_ARCH_HAVE_EXTRA_ELF_NOTES=y CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS=y CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y CONFIG_HAVE_CMPXCHG_LOCAL=y @@ -748,10 +754,10 @@ CONFIG_HAVE_NOINSTR_HACK=y CONFIG_HAVE_NOINSTR_VALIDATION=y CONFIG_HAVE_UACCESS_VALIDATION=y CONFIG_HAVE_STACK_VALIDATION=y -CONFIG_HAVE_RELIABLE_STACKTRACE=y CONFIG_OLD_SIGSUSPEND3=y CONFIG_COMPAT_OLD_SIGACTION=y # CONFIG_COMPAT_32BIT_TIME is not set +CONFIG_ARCH_SUPPORTS_RT=y CONFIG_HAVE_ARCH_VMAP_STACK=y CONFIG_VMAP_STACK=y CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET=y @@ -803,10 +809,7 @@ CONFIG_MODULES=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_MODULE_SIG is not set -CONFIG_MODULE_COMPRESS_NONE=y -# CONFIG_MODULE_COMPRESS_GZIP is not set -# CONFIG_MODULE_COMPRESS_XZ is not set -# CONFIG_MODULE_COMPRESS_ZSTD is not set +# CONFIG_MODULE_COMPRESS is not set # CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is not set CONFIG_MODPROBE_PATH="/sbin/modprobe" # CONFIG_TRIM_UNUSED_KSYMS is not set @@ -873,7 +876,6 @@ CONFIG_COREDUMP=y # CONFIG_SWAP=y # CONFIG_ZSWAP is not set -CONFIG_HAVE_ZSMALLOC=y # # Slab allocator options @@ -902,8 +904,9 @@ CONFIG_EXCLUSIVE_SYSTEM_RAM=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y # CONFIG_MEMORY_HOTPLUG is not set CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y -CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_SPLIT_PTE_PTLOCKS=y CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_SPLIT_PMD_PTLOCKS=y CONFIG_COMPACTION=y CONFIG_COMPACT_UNEVICTABLE_DEFAULT=1 # CONFIG_PAGE_REPORTING is not set @@ -925,6 +928,9 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set CONFIG_PGTABLE_HAS_HUGE_LEAVES=y +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PUD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y @@ -940,6 +946,7 @@ CONFIG_ZONE_DMA=y CONFIG_ZONE_DMA32=y CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y CONFIG_ARCH_HAS_PKEYS=y +CONFIG_ARCH_USES_PG_ARCH_2=y CONFIG_VM_EVENT_COUNTERS=y # CONFIG_PERCPU_STATS is not set # CONFIG_GUP_TEST is not set @@ -1331,6 +1338,8 @@ CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_NULL_BLK is not set # CONFIG_BLK_DEV_FD is not set # CONFIG_BLK_DEV_PCIESSD_MTIP32XX is not set +# CONFIG_ZRAM is not set +CONFIG_ZRAM_DEF_COMP="unset-value" # CONFIG_BLK_DEV_LOOP is not set # CONFIG_BLK_DEV_DRBD is not set # CONFIG_BLK_DEV_NBD is not set @@ -1591,6 +1600,7 @@ CONFIG_NET_VENDOR_REALTEK=y # CONFIG_8139CP is not set # CONFIG_8139TOO is not set # CONFIG_R8169 is not set +# CONFIG_RTASE is not set CONFIG_NET_VENDOR_RENESAS=y CONFIG_NET_VENDOR_ROCKER=y CONFIG_NET_VENDOR_SAMSUNG=y @@ -1956,6 +1966,7 @@ CONFIG_THERMAL=y # CONFIG_THERMAL_NETLINK is not set # CONFIG_THERMAL_STATISTICS is not set # CONFIG_THERMAL_DEBUGFS is not set +# CONFIG_THERMAL_CORE_TESTING is not set CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 CONFIG_THERMAL_HWMON=y CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y @@ -2202,6 +2213,7 @@ CONFIG_DMA_ACPI=y # CONFIG_PLX_DMA is not set # CONFIG_XILINX_DMA is not set # CONFIG_XILINX_XDMA is not set +# CONFIG_AMD_QDMA is not set # CONFIG_AMD_PTDMA is not set # CONFIG_QCOM_HIDMA_MGMT is not set # CONFIG_QCOM_HIDMA is not set @@ -2606,6 +2618,7 @@ CONFIG_HUGETLBFS=y # CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_HUGETLB_PAGE=y CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y # CONFIG_CONFIGFS_FS is not set # end of Pseudo filesystems @@ -2703,6 +2716,9 @@ CONFIG_IO_WQ=y # # CONFIG_KEYS is not set # CONFIG_SECURITY_DMESG_RESTRICT is not set +CONFIG_PROC_MEM_ALWAYS_FORCE=y +# CONFIG_PROC_MEM_FORCE_PTRACE is not set +# CONFIG_PROC_MEM_NO_FORCE is not set CONFIG_SECURITY=y CONFIG_SECURITYFS=y # CONFIG_SECURITY_NETWORK is not set @@ -3012,7 +3028,7 @@ CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_IOPORT_MAP=y CONFIG_HAS_DMA=y -CONFIG_DMA_OPS=y +CONFIG_DMA_OPS_HELPERS=y CONFIG_NEED_SG_DMA_FLAGS=y CONFIG_NEED_SG_DMA_LENGTH=y CONFIG_NEED_DMA_MAP_STATE=y @@ -3096,7 +3112,10 @@ CONFIG_FRAME_WARN=2048 # CONFIG_DEBUG_SECTION_MISMATCH is not set CONFIG_SECTION_MISMATCH_WARN_ONLY=y # CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B is not set +CONFIG_ARCH_WANT_FRAME_POINTERS=y +CONFIG_FRAME_POINTER=y CONFIG_OBJTOOL=y +# CONFIG_STACK_VALIDATION is not set # CONFIG_VMLINUX_MAP is not set # CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set # end of Compile-time checks and compiler options @@ -3370,8 +3389,8 @@ CONFIG_IO_DELAY_0X80=y # CONFIG_DEBUG_NMI_SELFTEST is not set CONFIG_X86_DEBUG_FPU=y # CONFIG_PUNIT_ATOM_DEBUG is not set -CONFIG_UNWINDER_ORC=y -# CONFIG_UNWINDER_FRAME_POINTER is not set +# CONFIG_UNWINDER_ORC is not set +CONFIG_UNWINDER_FRAME_POINTER=y # end of x86 Debugging # @@ -3389,6 +3408,7 @@ CONFIG_RUNTIME_TESTING_MENU=y # CONFIG_LKDTM is not set # CONFIG_TEST_MIN_HEAP is not set # CONFIG_TEST_DIV64 is not set +# CONFIG_TEST_MULDIV64 is not set # CONFIG_BACKTRACE_SELF_TEST is not set # CONFIG_TEST_REF_TRACKER is not set # CONFIG_RBTREE_TEST is not set