diff --git a/.config b/.config index 4fbf402746d765..5cdedd6c2bf43d 100644 --- a/.config +++ b/.config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.11.0 Kernel Configuration +# Linux/x86 6.12.0 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 14.2.1 20240910" CONFIG_CC_IS_GCC=y @@ -11,6 +11,8 @@ CONFIG_AS_VERSION=24300 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=24300 CONFIG_LLD_VERSION=0 +CONFIG_RUSTC_VERSION=106701 +CONFIG_RUSTC_LLVM_VERSION=150006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y @@ -127,6 +129,7 @@ CONFIG_PREEMPT_BUILD=y CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set +# CONFIG_PREEMPT_RT is not set CONFIG_PREEMPT_COUNT=y CONFIG_PREEMPTION=y CONFIG_PREEMPT_DYNAMIC=y @@ -262,7 +265,7 @@ CONFIG_CACHESTAT_SYSCALL=y # CONFIG_PC104 is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_SELFTEST is not set -# CONFIG_KALLSYMS_ALL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y CONFIG_HAVE_PERF_EVENTS=y @@ -427,12 +430,12 @@ CONFIG_MTRR_SANITIZER=y CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=0 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=1 CONFIG_X86_PAT=y -CONFIG_ARCH_USES_PG_UNCACHED=y CONFIG_X86_UMIP=y CONFIG_CC_HAS_IBT=y CONFIG_X86_CET=y CONFIG_X86_KERNEL_IBT=y CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +CONFIG_ARCH_PKEY_BITS=4 CONFIG_X86_INTEL_TSX_MODE_OFF=y # CONFIG_X86_INTEL_TSX_MODE_ON is not set # CONFIG_X86_INTEL_TSX_MODE_AUTO is not set @@ -453,6 +456,7 @@ CONFIG_ARCH_SUPPORTS_KEXEC_SIG_FORCE=y CONFIG_ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG=y CONFIG_ARCH_SUPPORTS_KEXEC_JUMP=y CONFIG_ARCH_SUPPORTS_CRASH_DUMP=y +CONFIG_ARCH_DEFAULT_CRASH_DUMP=y CONFIG_ARCH_SUPPORTS_CRASH_HOTPLUG=y CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION=y CONFIG_PHYSICAL_START=0x1000000 @@ -472,6 +476,7 @@ CONFIG_LEGACY_VSYSCALL_XONLY=y CONFIG_MODIFY_LDT_SYSCALL=y # CONFIG_STRICT_SIGALTSTACK_SIZE is not set 
CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set # end of Processor type and features CONFIG_CC_HAS_NAMED_AS=y @@ -692,6 +697,7 @@ CONFIG_MMU_GATHER_RCU_TABLE_FREE=y CONFIG_MMU_GATHER_MERGE_VMAS=y CONFIG_MMU_LAZY_TLB_REFCOUNT=y CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_ARCH_HAVE_EXTRA_ELF_NOTES=y CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS=y CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y CONFIG_HAVE_CMPXCHG_LOCAL=y @@ -748,10 +754,10 @@ CONFIG_HAVE_NOINSTR_HACK=y CONFIG_HAVE_NOINSTR_VALIDATION=y CONFIG_HAVE_UACCESS_VALIDATION=y CONFIG_HAVE_STACK_VALIDATION=y -CONFIG_HAVE_RELIABLE_STACKTRACE=y CONFIG_OLD_SIGSUSPEND3=y CONFIG_COMPAT_OLD_SIGACTION=y # CONFIG_COMPAT_32BIT_TIME is not set +CONFIG_ARCH_SUPPORTS_RT=y CONFIG_HAVE_ARCH_VMAP_STACK=y CONFIG_VMAP_STACK=y CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET=y @@ -803,10 +809,7 @@ CONFIG_MODULES=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_MODULE_SIG is not set -CONFIG_MODULE_COMPRESS_NONE=y -# CONFIG_MODULE_COMPRESS_GZIP is not set -# CONFIG_MODULE_COMPRESS_XZ is not set -# CONFIG_MODULE_COMPRESS_ZSTD is not set +# CONFIG_MODULE_COMPRESS is not set # CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is not set CONFIG_MODPROBE_PATH="/sbin/modprobe" # CONFIG_TRIM_UNUSED_KSYMS is not set @@ -873,7 +876,6 @@ CONFIG_COREDUMP=y # CONFIG_SWAP=y # CONFIG_ZSWAP is not set -CONFIG_HAVE_ZSMALLOC=y # # Slab allocator options @@ -902,8 +904,9 @@ CONFIG_EXCLUSIVE_SYSTEM_RAM=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y # CONFIG_MEMORY_HOTPLUG is not set CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y -CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_SPLIT_PTE_PTLOCKS=y CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_SPLIT_PMD_PTLOCKS=y CONFIG_COMPACTION=y CONFIG_COMPACT_UNEVICTABLE_DEFAULT=1 # CONFIG_PAGE_REPORTING is not set @@ -925,6 +928,9 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y CONFIG_THP_SWAP=y # CONFIG_READ_ONLY_THP_FOR_FS is not set CONFIG_PGTABLE_HAS_HUGE_LEAVES=y +CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y +CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y 
+CONFIG_ARCH_SUPPORTS_PUD_PFNMAP=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y @@ -940,6 +946,7 @@ CONFIG_ZONE_DMA=y CONFIG_ZONE_DMA32=y CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y CONFIG_ARCH_HAS_PKEYS=y +CONFIG_ARCH_USES_PG_ARCH_2=y CONFIG_VM_EVENT_COUNTERS=y # CONFIG_PERCPU_STATS is not set # CONFIG_GUP_TEST is not set @@ -1331,6 +1338,8 @@ CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_NULL_BLK is not set # CONFIG_BLK_DEV_FD is not set # CONFIG_BLK_DEV_PCIESSD_MTIP32XX is not set +# CONFIG_ZRAM is not set +CONFIG_ZRAM_DEF_COMP="unset-value" # CONFIG_BLK_DEV_LOOP is not set # CONFIG_BLK_DEV_DRBD is not set # CONFIG_BLK_DEV_NBD is not set @@ -1591,6 +1600,7 @@ CONFIG_NET_VENDOR_REALTEK=y # CONFIG_8139CP is not set # CONFIG_8139TOO is not set # CONFIG_R8169 is not set +# CONFIG_RTASE is not set CONFIG_NET_VENDOR_RENESAS=y CONFIG_NET_VENDOR_ROCKER=y CONFIG_NET_VENDOR_SAMSUNG=y @@ -1956,6 +1966,7 @@ CONFIG_THERMAL=y # CONFIG_THERMAL_NETLINK is not set # CONFIG_THERMAL_STATISTICS is not set # CONFIG_THERMAL_DEBUGFS is not set +# CONFIG_THERMAL_CORE_TESTING is not set CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 CONFIG_THERMAL_HWMON=y CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y @@ -2202,6 +2213,7 @@ CONFIG_DMA_ACPI=y # CONFIG_PLX_DMA is not set # CONFIG_XILINX_DMA is not set # CONFIG_XILINX_XDMA is not set +# CONFIG_AMD_QDMA is not set # CONFIG_AMD_PTDMA is not set # CONFIG_QCOM_HIDMA_MGMT is not set # CONFIG_QCOM_HIDMA is not set @@ -2606,6 +2618,7 @@ CONFIG_HUGETLBFS=y # CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_HUGETLB_PAGE=y CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y # CONFIG_CONFIGFS_FS is not set # end of Pseudo filesystems @@ -2703,6 +2716,9 @@ CONFIG_IO_WQ=y # # CONFIG_KEYS is not set # CONFIG_SECURITY_DMESG_RESTRICT is not set +CONFIG_PROC_MEM_ALWAYS_FORCE=y +# CONFIG_PROC_MEM_FORCE_PTRACE is not set +# CONFIG_PROC_MEM_NO_FORCE 
is not set
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
 # CONFIG_SECURITY_NETWORK is not set
@@ -3012,7 +3028,7 @@ CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_IOPORT_MAP=y
 CONFIG_HAS_DMA=y
-CONFIG_DMA_OPS=y
+CONFIG_DMA_OPS_HELPERS=y
 CONFIG_NEED_SG_DMA_FLAGS=y
 CONFIG_NEED_SG_DMA_LENGTH=y
 CONFIG_NEED_DMA_MAP_STATE=y
@@ -3096,7 +3112,10 @@ CONFIG_FRAME_WARN=2048
 # CONFIG_DEBUG_SECTION_MISMATCH is not set
 CONFIG_SECTION_MISMATCH_WARN_ONLY=y
 # CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B is not set
+CONFIG_ARCH_WANT_FRAME_POINTERS=y
+CONFIG_FRAME_POINTER=y
 CONFIG_OBJTOOL=y
+# CONFIG_STACK_VALIDATION is not set
 # CONFIG_VMLINUX_MAP is not set
 # CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set
 # end of Compile-time checks and compiler options
@@ -3370,8 +3389,8 @@ CONFIG_IO_DELAY_0X80=y
 # CONFIG_DEBUG_NMI_SELFTEST is not set
 CONFIG_X86_DEBUG_FPU=y
 # CONFIG_PUNIT_ATOM_DEBUG is not set
-CONFIG_UNWINDER_ORC=y
-# CONFIG_UNWINDER_FRAME_POINTER is not set
+# CONFIG_UNWINDER_ORC is not set
+CONFIG_UNWINDER_FRAME_POINTER=y
 # end of x86 Debugging
 
 #
@@ -3389,6 +3408,7 @@ CONFIG_RUNTIME_TESTING_MENU=y
 # CONFIG_LKDTM is not set
 # CONFIG_TEST_MIN_HEAP is not set
 # CONFIG_TEST_DIV64 is not set
+# CONFIG_TEST_MULDIV64 is not set
 # CONFIG_BACKTRACE_SELF_TEST is not set
 # CONFIG_TEST_REF_TRACKER is not set
 # CONFIG_RBTREE_TEST is not set
@@ -3432,3 +3452,5 @@ CONFIG_ARCH_USE_MEMTEST=y
 #
 # end of Rust hacking
 # end of Kernel hacking
+
+CONFIG_ICK=y
diff --git a/Kconfig b/Kconfig
index 745bc773f56706..9b13ab8e4f292e 100644
--- a/Kconfig
+++ b/Kconfig
@@ -30,3 +30,15 @@ source "lib/Kconfig"
 source "lib/Kconfig.debug"
 
 source "Documentation/Kconfig"
+
+config ICK
+	bool "ick"
+	depends on X86_64
+	default y
+	help
+	  Instant checkpoint: snapshot a single-threaded process at syscall
+	  entry (registers plus copy-on-write tracking of its writable
+	  pages) so that it can later be reverted to that point quickly.
+
+	  This is a debugging/experimentation aid that hooks the read and
+	  write syscalls and the page-fault path. If unsure, say N.
diff --git a/fs/read_write.c b/fs/read_write.c
index 64dc24afdb3a7f..01c0041b6fb84e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -24,6 +24,8 @@
 
 #include
 #include
+#include
+#include
 
 const struct file_operations generic_ro_fops = {
 	.llseek		= generic_file_llseek,
@@ -700,8 +702,29 @@ static inline loff_t *file_ppos(struct file *file)
 
 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 {
-	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
+	struct fd f;
+
+#ifdef CONFIG_ICK
+	/*
+	 * A hack target's first read from stdin is the checkpoint trigger:
+	 * snapshot the task before any input is consumed.
+	 */
+	if (fd == 0 && current->hack_target && !current->ick_data) {
+		int err;
+
+		trace_printk("ick checkpoint on hacked process %s[%u]\n",
+			     current->comm, current->pid);
+		err = ick_checkpoint_proc();
+		if (err) {
+			pr_err("sys_read: ick checkpoint failed: %pe\n",
+			       ERR_PTR(err));
+			return err;
+		}
+	}
+#endif
+
+	f = fdget_pos(fd);
 
 	if (fd_file(f)) {
 		loff_t pos, *ppos = file_ppos(fd_file(f));
@@ -745,6 +768,32 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 		size_t, count)
 {
+#ifdef CONFIG_ICK
+	/*
+	 * A checkpointed hack target writing to stdout is taken as the
+	 * "wrong input" signal: log what it tried to print and roll the
+	 * task back to its checkpoint.
+	 */
+	if (fd == 1 && current->hack_target && current->ick_data) {
+		char data[64];
+		size_t len = min_t(size_t, count, sizeof(data) - 1);
+		int err;
+
+		/* copy_from_user() returns the bytes left uncopied, not an errno. */
+		if (copy_from_user(data, buf, len)) {
+			pr_err("sys_write: copy_from_user failed: %pe\n",
+			       ERR_PTR(-EFAULT));
+			return -EFAULT;
+		}
+		/* Terminate at the copied length so no stale stack is logged. */
+		data[len] = '\0';
+		trace_printk("hacked process attempted write with data %s\n", data);
+		err = ick_revert_proc(false);
+		if (err)
+			return err;
+	}
+#endif
+
 	return ksys_write(fd, buf, count);
 }
 
diff --git a/include/linux/ick.h b/include/linux/ick.h
new file mode 100644
index 00000000000000..f488712eb0d6e0
--- /dev/null
+++ b/include/linux/ick.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_ICK_H
+#define _LINUX_ICK_H
+
+#include
+#include
+
+/* Checkpoint state of one task; hangs off task_struct::ick_data. */
+struct ick_checked_process {
+	/* only the FS/GS/ES/DS fields are filled in at checkpoint time */
+	struct thread_struct saved_state;
+	/* user register frame captured at checkpoint time */
+	struct pt_regs saved_regs;
+	/* struct ick_modified_page nodes, keyed by page address */
+	struct rb_root modified_pages_tree;
+	/* protects modified_pages_tree */
+	spinlock_t tree_lock;
+	/* set while saved pages are being copied back */
+	bool reverting;
+};
+
+/* Pre-write copy of one page modified since the checkpoint. */
+struct ick_modified_page {
+	/* page-aligned user address of the page */
+	unsigned long addr;
+	struct rb_node node;
+	/*
+	 * Don't include a whole page of data here, otherwise this struct will
+	 * be just a bit over PAGE_SIZE, which makes memory allocation
+	 * inefficient.
+	 */
+	u8 *orig_page_content;
+};
+
+#ifdef CONFIG_ICK
+int ick_checkpoint_proc(void);
+int ick_revert_proc(bool reset_ick);
+vm_fault_t ick_do_wp_page(struct vm_fault *vmf);
+void ick_cleanup(struct task_struct *task);
+#endif
+
+#endif /* _LINUX_ICK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bb343136ddd05d..a1af75cfdaefbe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1592,6 +1592,14 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_ICK
+	/* Checkpoint state; NULL while the task has no checkpoint. */
+	struct ick_checked_process	*ick_data;
+#endif
+
+	/* Marks the task as one the read/write syscall hooks checkpoint. */
+	bool				hack_target;
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/init/init_task.c b/init/init_task.c
index 136a8231355ab7..dec7012c5d9060 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -219,6 +219,10 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 #ifdef CONFIG_SECCOMP_FILTER
 	.seccomp	= { .filter_count = ATOMIC_INIT(0) },
 #endif
+#ifdef CONFIG_ICK
+	.ick_data	= NULL,
+#endif
+	.hack_target	= false,
 };
 EXPORT_SYMBOL(init_task);
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 87866b037fbed3..d129c357e52102 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -159,3 +159,5 @@ $(obj)/kheaders_data.tar.xz: FORCE
 	$(call cmd,genikh)
 
 clean-files := kheaders_data.tar.xz kheaders.md5
+
+obj-$(CONFIG_ICK) += ick.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 619f0014c33be4..cf643240ed5ac0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -70,6 +70,7 @@
 #include
 #include
 #include
+#include
 
 #include
 
@@ -881,6 +882,10 @@ void __noreturn do_exit(long code)
 
 	WARN_ON(irqs_disabled());
 
+#ifdef CONFIG_ICK
+	ick_cleanup(tsk);
+#endif
+
 	synchronize_group_exit(tsk, code);
 
 	WARN_ON(tsk->plug);
diff --git a/kernel/fork.c b/kernel/fork.c
index 22f43721d031d4..2a6e9fc003081f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1195,6 +1195,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->mm_cid_active = 0;
 	tsk->migrate_from_cpu = -1;
 #endif
+
+#ifdef CONFIG_ICK
+	/*
+	 * ick doesn't support multiple tasks, and should prevent fork calls.
+	 * ick_data only exists under CONFIG_ICK, so the check must be too.
+	 */
+	BUG_ON(tsk->ick_data);
+#endif
+
return tsk; free_stack: diff --git a/kernel/ick.c b/kernel/ick.c new file mode 100644 index 00000000000000..b8bcce2060122a --- /dev/null +++ b/kernel/ick.c @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * ick: Instant checkpoint + * + * Provides a mechanism to "checkpoint" a process at some syscall entry, saving + * its register and marking its writable memory pages as read-only, such that + * any attempted writes will cause the current content of these pages to be + * saved before allowing the write to proceed. + * + * At some later point in time, the process can be reverted back to the state + * when it first made the checkpoint-ing syscall. + * + * To simplify the implementation, we only support single-threaded processes, + * and we do not allow the process to make any syscalls other than read / write + * to/from stdin/stdout/stderr. We also do not support things like huge pages. + * + * This is designed for quick brute-forcing of e.g. CTF binaries. A checkpoint + * can be made when it first tries reading from stdin for a "password", and then + * the whole process can be quickly reverted (in a matter of microseconds) to + * try a different password if the one provided earlier was incorrect, and the + * process tries to write a message to stdout saying so. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +static int __ick_mark_pages(struct task_struct *task, + struct ick_checked_process *ick_data); +static int __ick_revert_process(struct task_struct *task); + +int ick_checkpoint_proc(void) { + struct ick_checked_process *ick_data; + int ret; + + struct task_struct *curr_task = current; + + pid_t pid = curr_task->pid; + + if (!thread_group_empty(curr_task)) { + pr_alert("ick: %s[%d] is not single-threaded\n", curr_task->comm, pid); + return -EINVAL; + } + + if (curr_task->ick_data) { + pr_alert("ick: %s[%d] already has a checkpoint\n", curr_task->comm, pid); + return -EEXIST; + } + + ick_data = kzalloc(sizeof(*ick_data), GFP_KERNEL); + if (!ick_data) { + return -ENOMEM; + } + ick_data->modified_pages_tree = RB_ROOT; + spin_lock_init(&ick_data->tree_lock); + + // Save registers +#if defined(__x86_64__) + struct pt_regs *regs = task_pt_regs(curr_task); + memcpy(&ick_data->saved_regs, regs, sizeof(struct pt_regs)); + // TODO: Save other thread states, see e.g. fork or switch_to implementation + // process_64.c:__switch_to + // or process.c:copy_thread + // However we do not need to store fpus as this is a syscall kernel entry and + // fpus are caller-saved. 
+ + current_save_fsgs(); + ick_data->saved_state.fsindex = current->thread.fsindex; + ick_data->saved_state.fsbase = current->thread.fsbase; + ick_data->saved_state.gsindex = current->thread.gsindex; + ick_data->saved_state.gsbase = current->thread.gsbase; + + savesegment(es, ick_data->saved_state.es); + savesegment(ds, ick_data->saved_state.ds); +#else +#error "Unsupported architecture" +#endif + + ret = __ick_mark_pages(curr_task, ick_data); + if (ret) { + goto free_ickdata; + } + + curr_task->ick_data = ick_data; + trace_printk("ick: Checkpointed %s[%d], ip = %px, sp = %px\n", curr_task->comm, pid, (void*)regs->ip, (void*)regs->sp); + + return 0; + +free_ickdata: + kfree(ick_data); + curr_task->ick_data = NULL; + return ret; +} + +// Stop monitoring a process +int ick_revert_proc(bool reset_ick) { + struct task_struct *curr_task = current; + + pid_t pid = curr_task->pid; + + if (!curr_task->ick_data) { + pr_alert("ick: ick_revert_proc called on %s[%d] which is not under ick checkpoint\n", curr_task->comm, pid); + return -EINVAL; + } + + __ick_revert_process(curr_task); + + if (reset_ick) { + ick_cleanup(curr_task); + } + + return 0; +} + +// Checkpoint the process: mark pages as read-only and hook VMAs +static int __ick_mark_pages(struct task_struct *task, + struct ick_checked_process *ick_data) { + struct mm_struct *mm; + struct vm_area_struct *vma; + struct mmu_gather tlb; + MA_STATE(mas, &task->mm->mm_mt, 0, ULONG_MAX); + + mm = task->mm; + if (!mm) { + return -EINVAL; + } + + // This is the lock used for the maple tree as well + mmap_write_lock_killable(mm); + tlb_gather_mmu(&tlb, task->mm); + + while ((vma = mas_find(&mas, ULONG_MAX))) { + if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) { + trace_printk("Skipping VMA %lx-%lx (%s) as not VM_WRITE\n", vma->vm_start, + vma->vm_end, + vma->vm_file ? 
(char *)vma->vm_file->f_path.dentry->d_iname + : "anon"); + BUG_ON(vma->vm_page_prot.pgprot & VM_WRITE); + continue; + } + + trace_printk( + "Marking VMA %lx-%lx (%lu KiB of %s) as read-only\n", vma->vm_start, + vma->vm_end, (vma->vm_end - vma->vm_start) / 1024, + vma->vm_file ? (char *)vma->vm_file->f_path.dentry->d_iname : "anon"); + + vma_set_page_prot(vma); + change_protection(&tlb, vma, vma->vm_start, vma->vm_end, 0); + } + + tlb_finish_mmu(&tlb); + mmap_write_unlock(mm); // Calls vma_end_write_all + return 0; +} + +vm_fault_t ick_do_wp_page(struct vm_fault *vmf) { + unsigned long addr = (unsigned long)vmf->address; + unsigned long page_addr = addr & PAGE_MASK; + struct task_struct *task = current; + struct ick_checked_process *ick_data = task->ick_data; + struct ick_modified_page *mod_page; + struct rb_node **new; + struct rb_node *parent = NULL; + long ret; + + BUG_ON(!ick_data); + BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); + + if (READ_ONCE(ick_data->reverting)) { + return VM_FAULT_LOCKED; + } + + spin_lock(&ick_data->tree_lock); + new = &ick_data->modified_pages_tree.rb_node; + while (*new) { + parent = *new; + mod_page = rb_entry(parent, struct ick_modified_page, node); + + if (page_addr < mod_page->addr) + new = &parent->rb_left; + else if (page_addr > mod_page->addr) + new = &parent->rb_right; + else { + // Page already in tree, so it's already copied before. Ignore. 
+ trace_printk("Already in tree page 0x%px hti wp fault at 0x%px again\n", (void*)page_addr, (void*)addr); + spin_unlock(&ick_data->tree_lock); + return VM_FAULT_LOCKED; + } + } + + u8 *copied_page_content = vmalloc(PAGE_SIZE); + if (!copied_page_content) { + spin_unlock(&ick_data->tree_lock); + return VM_FAULT_OOM; + } + + mod_page = vmalloc(sizeof(*mod_page)); + mod_page->addr = page_addr; + rb_link_node(&mod_page->node, parent, new); + rb_insert_color(&mod_page->node, &ick_data->modified_pages_tree); + mod_page->orig_page_content = copied_page_content; + trace_printk("CoWing page 0x%px following wp fault at 0x%px\n", (void*) page_addr, (void*) addr); + ret = copy_from_user_nofault(copied_page_content, (void *)page_addr, PAGE_SIZE); + if (ret) { + pr_alert("ick: Failed to copy page 0x%px following wp fault at 0x%px: %pe\n", (void*)page_addr, (void*)addr, ERR_PTR(ret)); + spin_unlock(&ick_data->tree_lock); + return VM_FAULT_SIGBUS; + } + + spin_unlock(&ick_data->tree_lock); + + return VM_FAULT_LOCKED; +} + +// copied from process_64.c +enum which_selector { FS, GS }; + +// copied from process_64.c +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) { + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +// copied from process_64.c +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) { + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. 
+ */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + +// copied from process_64.c +static noinstr void __wrgsbase_inactive(unsigned long gsbase) { + lockdep_assert_irqs_disabled(); + + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); + } else { + instrumentation_begin(); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } +} + +// copied from process_64.c +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, + struct thread_struct *next) { + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. 
*/ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, next->fsindex, next->fsbase, + FS); + load_seg_legacy(prev->gsindex, prev->gsbase, next->gsindex, next->gsbase, + GS); + } +} + +// Restore the process to the checkpointed state +static int __ick_revert_process(struct task_struct *task) { + struct ick_checked_process *ick_data; + struct ick_modified_page *mod_page; + struct rb_node *node; + struct mm_struct *mm; + unsigned long addr; + int ret = 0; + struct pt_regs *regs; + + BUG_ON(task != current); + // TODO: we should probably just get rid of the task argument + + ick_data = task->ick_data; + BUG_ON(!ick_data); + + mm = task->mm; + if (!mm) { + return -EINVAL; + } + + BUG_ON(xchg(&ick_data->reverting, true)); + + spin_lock(&ick_data->tree_lock); + // Do it in order for better cache locality + for (node = rb_first(&ick_data->modified_pages_tree); node; + node = rb_next(node)) { + mod_page = rb_entry(node, struct ick_modified_page, node); + addr = mod_page->addr; + u8 *orig_page_content = mod_page->orig_page_content; + trace_printk("Restoring CoW'd page at 0x%px\n", (void*)addr); + ret = copy_to_user_nofault((void *)addr, orig_page_content, PAGE_SIZE); + if (ret) { + pr_alert("ick: Failed to copy page content for 0x%px back: %pe\n", (void*)addr, ERR_PTR(ret)); + spin_unlock(&ick_data->tree_lock); + return ret; + } + } + spin_unlock(&ick_data->tree_lock); + + WRITE_ONCE(ick_data->reverting, false); + + // Restore registers +#if defined(__x86_64__) + { + regs = task_pt_regs(task); + memcpy(regs, &ick_data->saved_regs, sizeof(struct pt_regs)); + + // XXX: there is some issue with this code which causes recursive page + // faults when spin_unlock above is preempted for some reason??? 
+ + // current_save_fsgs(); + // x86_fsgsbase_load(&task->thread, &ick_data->saved_state); + // task->thread.fsindex = ick_data->saved_state.fsindex; + // task->thread.fsbase = ick_data->saved_state.fsbase; + // task->thread.gsindex = ick_data->saved_state.gsindex; + // task->thread.gsbase = ick_data->saved_state.gsbase; + + // task->thread.es = ick_data->saved_state.es; + // loadsegment(es, ick_data->saved_state.es); + // task->thread.ds = ick_data->saved_state.ds; + // loadsegment(ds, ick_data->saved_state.ds); + } +#else +#error "Unsupported architecture" +#endif + + trace_printk("Restored process %s[%d], ip = %px, sp = %px\n", task->comm, task->pid, (void*)regs->ip, (void*)regs->sp); + + return 0; +} + +void ick_cleanup(struct task_struct *task) { + struct ick_checked_process *ick_data = task->ick_data; + + if (!ick_data) { + return; + } + + trace_printk("Cleaning up ick data for %s[%d]\n", task->comm, task->pid); + + spin_lock(&ick_data->tree_lock); + if (!RB_EMPTY_ROOT(&ick_data->modified_pages_tree)) { + struct ick_modified_page *mod_page, *tmp; + rbtree_postorder_for_each_entry_safe(mod_page, tmp, &ick_data->modified_pages_tree, node) { + vfree(mod_page->orig_page_content); + vfree(mod_page); + } + } + spin_unlock(&ick_data->tree_lock); + + kfree(ick_data); + task->ick_data = NULL; +} diff --git a/mm/memory.c b/mm/memory.c index bdf77a3ec47bc2..5a6aee2b1c1787 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -92,6 +92,8 @@ #include "internal.h" #include "swap.h" +#include + #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. 
#endif @@ -3696,6 +3698,18 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (vmf->page) folio = page_folio(vmf->page); +#ifdef CONFIG_ICK + if (current->ick_data) { + vm_fault_t ret = ick_do_wp_page(vmf); + if (ret & (VM_FAULT_ERROR|VM_FAULT_OOM|VM_FAULT_NOPAGE)) { + pr_alert("ick_do_wp_page failed on address %px, ret = (vm_fault_t)0x%x\n", (void *)vmf->address, ret); + if (!(ret & VM_FAULT_LOCKED)) + pte_unmap_unlock(vmf->pte, vmf->ptl); + return ret; + } + } +#endif + /* * Shared mapping: we are guaranteed to have VM_WRITE and * FAULT_FLAG_WRITE set at this point. diff --git a/mm/mmap.c b/mm/mmap.c index 79d541f1502b22..62976e266dd0b0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -294,6 +294,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!len) return -EINVAL; + if (current->ick_data) { + pr_err("do_mmap: mmap blocked while process in ick\n"); + return -EPERM; + } + /* * Does the application expect PROT_READ to imply PROT_EXEC? * diff --git a/mm/mprotect.c b/mm/mprotect.c index 6f450af3252eba..ddc952d1a575c7 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -718,6 +718,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, struct mmu_gather tlb; struct vma_iterator vmi; + if (current->ick_data) { + pr_err("do_mprotect_pkey: mprotect blocked while process in ick\n"); + return -EPERM; + } + start = untagged_addr(start); prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); diff --git a/mm/vma.c b/mm/vma.c index 7621384d64cf5f..115e790ebe8949 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1383,6 +1383,11 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; + if (current->ick_data) { + pr_err("do_vmi_unmap: munmap blocked while process in ick\n"); + return -EPERM; + } + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL;