From baa17d408549f3b83d93fdbb975375fc97769221 Mon Sep 17 00:00:00 2001 From: julia Date: Wed, 28 May 2025 15:18:43 +1000 Subject: [PATCH 1/2] use the initial task untypes for no kernel patch Previously, microkit used a kernel patch which added two extra parameters to the seL4 kernel entry code: `extra_device_addr_p` and `extra_device_size`. This allowed for microkit to pass the protection domains ELFs and system invocation data to the initial task without the data being zeroed by seL4 upon an Untyped Retype invocation. This was done by making it appear as extra device untypes. That had several downsides. For one, having a kernel patch is not a great situation to live in. Furthermore, on certain platforms, e.g. RISC-V SMP, it would be impossible to add these two extra parameters as it would require extra C-ABI registers which don't exist. Instead, what this change does, is pass the extra data as part of the initial task region. This is fine, as it was *already* contiguous with the initial task (monitor) ELF file; being present in memory just before it. What this looks like from the initial task's perspective is that it gets capabilities to Frames in the seL4_BootInfo data. These frames are for the initial task's memory. As they are already frame objects, we don't have to do untype retypes for these objects. As they are already mapped into the initial task memory, we also don't have to create page tables mappings via the monitor bootstrap invocations. Signed-off-by: julia --- loader/src/loader.c | 10 +- monitor/src/main.c | 2 +- tool/microkit/src/loader.rs | 61 ++-- tool/microkit/src/main.rs | 565 +++++++++++++++++------------------- tool/microkit/src/sdf.rs | 1 + tool/microkit/src/sel4.rs | 4 +- 6 files changed, 306 insertions(+), 337 deletions(-) diff --git a/loader/src/loader.c b/loader/src/loader.c index e6da97dd9..c8c3e9f19 100644 --- a/loader/src/loader.c +++ b/loader/src/loader.c @@ -64,8 +64,6 @@ struct loader_data { uintptr_t ui_p_reg_end; uintptr_t pv_offset; uintptr_t v_entry; - uintptr_t extra_device_addr_p; - uintptr_t extra_device_size; uintptr_t num_regions; struct region regions[]; @@ -77,9 +75,7 @@ typedef void (*sel4_entry)( intptr_t pv_offset, uintptr_t v_entry, uintptr_t dtb_addr_p, - uintptr_t dtb_size, - uintptr_t extra_device_addr_p, - uintptr_t extra_device_size + uintptr_t dtb_size ); static void *memcpy(void *dst, const void *src, size_t sz) @@ -650,9 +646,7 @@ static void start_kernel(void) loader_data->pv_offset, loader_data->v_entry, 0, - 0, - loader_data->extra_device_addr_p, - loader_data->extra_device_size + 0 ); } diff --git a/monitor/src/main.c b/monitor/src/main.c index 4f73796a0..a7c11ec4e 100644 --- a/monitor/src/main.c +++ b/monitor/src/main.c @@ -116,7 +116,7 @@ seL4_Word bootstrap_invocation_count; seL4_Word bootstrap_invocation_data[BOOTSTRAP_INVOCATION_DATA_SIZE]; seL4_Word system_invocation_count; -seL4_Word *system_invocation_data = (void *)0x80000000; +seL4_Word *system_invocation_data; struct untyped_info untyped_info; diff --git a/tool/microkit/src/loader.rs b/tool/microkit/src/loader.rs index 94be093ba..959bfe130 100644 --- a/tool/microkit/src/loader.rs +++ b/tool/microkit/src/loader.rs @@ -116,8 +116,6 @@ struct LoaderHeader64 { ui_p_reg_end: u64, pv_offset: u64, v_entry: u64, - extra_device_addr_p: u64, - extra_device_size: u64, num_regions: u64, } @@ -129,18 +127,20 @@ pub struct Loader<'a> { } impl<'a> Loader<'a> { + #[allow(clippy::too_many_arguments)] pub fn new( config: &Config, loader_elf_path: &Path, kernel_elf: &'a ElfFile, - initial_task_elf: &'a ElfFile, - initial_task_phys_base: Option, - reserved_region: MemoryRegion, + monitor_elf: &'a ElfFile, + monitor_phys_subregion: MemoryRegion, + monitor_virt_subregion: MemoryRegion, + // includes monitor subregion + initial_task_phys_region: MemoryRegion, + initial_task_virt_region: MemoryRegion, + // ... excluding the monitor?? system_regions: Vec<(u64, &'a [u8])>, ) -> Loader<'a> { - // Note: If initial_task_phys_base is not None, then it just this address - // as the base physical address of the initial task, rather than the address - // that comes from the initial_task_elf file. let elf = ElfFile::from_path(loader_elf_path).unwrap(); let sz = elf.word_size; let magic = match sz { @@ -193,26 +193,26 @@ impl<'a> Loader<'a> { // (and indeed initial did support multi-segment ELF files). However // it adds significant complexity, and the calling functions enforce // only single-segment ELF files, so we keep things simple here. - let initial_task_segments: Vec<_> = initial_task_elf - .segments - .iter() - .filter(|s| s.loadable) - .collect(); - assert!(initial_task_segments.len() == 1); - let segment = &initial_task_segments[0]; + let monitor_segments: Vec<_> = monitor_elf.segments.iter().filter(|s| s.loadable).collect(); + assert!(monitor_segments.len() == 1); + let segment = &monitor_segments[0]; assert!(segment.loadable); - let inittask_first_vaddr = segment.virt_addr; - let inittask_last_vaddr = round_up(segment.virt_addr + segment.mem_size(), kb(4)); + let monitor_first_vaddr = segment.virt_addr; + let monitor_last_vaddr = round_up(segment.virt_addr + segment.mem_size(), kb(4)); - let inittask_first_paddr = match initial_task_phys_base { - Some(paddr) => paddr, - None => segment.phys_addr, - }; - let inittask_p_v_offset = inittask_first_vaddr - inittask_first_paddr; + // TODO: Why are we even rederiving it here, tbh? + assert_eq!(monitor_virt_subregion.base, monitor_first_vaddr); + assert_eq!(monitor_virt_subregion.end, monitor_last_vaddr); + // Initial task contains the PD elfs and invocation data and monitor. + // So the initial task contains the monitor at the very end. + assert_eq!(initial_task_virt_region.end, monitor_last_vaddr); + assert!(initial_task_virt_region.base < monitor_first_vaddr); + + let monitor_first_paddr = monitor_phys_subregion.base; // Note: For now we include any zeroes. We could optimize in the future - regions.push((inittask_first_paddr, &segment.data)); + regions.push((monitor_first_paddr, &segment.data)); // Determine the pagetable variables assert!(kernel_first_vaddr.is_some()); @@ -253,16 +253,15 @@ impl<'a> Loader<'a> { let kernel_entry = kernel_elf.entry; - let pv_offset = inittask_first_paddr.wrapping_sub(inittask_first_vaddr); + // wrapping_sub because phys could be higher/lower in vmem than virt + // and the kernel is in C and so (unsigned) overflow wraps. + let pv_offset = (initial_task_phys_region.base).wrapping_sub(initial_task_virt_region.base); - let ui_p_reg_start = inittask_first_paddr; - let ui_p_reg_end = inittask_last_vaddr - inittask_p_v_offset; + let ui_p_reg_start = initial_task_phys_region.base; + let ui_p_reg_end = initial_task_phys_region.end; assert!(ui_p_reg_end > ui_p_reg_start); - let v_entry = initial_task_elf.entry; - - let extra_device_addr_p = reserved_region.base; - let extra_device_size = reserved_region.size(); + let v_entry = monitor_elf.entry; let mut all_regions = Vec::with_capacity(regions.len() + system_regions.len()); for region_set in [regions, system_regions] { @@ -306,8 +305,6 @@ impl<'a> Loader<'a> { ui_p_reg_end, pv_offset, v_entry, - extra_device_addr_p, - extra_device_size, num_regions: all_regions.len() as u64, }; diff --git a/tool/microkit/src/main.rs b/tool/microkit/src/main.rs index 985bba8b3..9b3dc2539 100644 --- a/tool/microkit/src/main.rs +++ b/tool/microkit/src/main.rs @@ -108,6 +108,7 @@ struct MonitorConfig { bootstrap_invocation_count_symbol_name: &'static str, bootstrap_invocation_data_symbol_name: &'static str, system_invocation_count_symbol_name: &'static str, + system_invocation_data_symbol_name: &'static str, } impl MonitorConfig { @@ -122,11 +123,14 @@ struct InitSystem<'a> { cnode_cap: u64, cnode_mask: u64, invocations: &'a mut Vec, - cap_slot: u64, + next_system_cap_slot: u64, last_fixed_address: u64, normal_untyped: &'a mut ObjectAllocator, device_untyped: &'a mut ObjectAllocator, cap_address_names: &'a mut HashMap, + // seL4_BootInfo caps (sharedFrames, userImageFrames, etc). + // TODO: at the moment just userImageFrames + unused_boot_info_frames: Vec, objects: Vec, } @@ -136,26 +140,29 @@ impl<'a> InitSystem<'a> { config: &'a Config, cnode_cap: u64, cnode_mask: u64, - first_available_cap_slot: u64, + starting_system_cap_slot: u64, normal_untyped: &'a mut ObjectAllocator, device_untyped: &'a mut ObjectAllocator, invocations: &'a mut Vec, cap_address_names: &'a mut HashMap, + unused_boot_info_frames: Vec, ) -> InitSystem<'a> { InitSystem { config, cnode_cap, cnode_mask, invocations, - cap_slot: first_available_cap_slot, + next_system_cap_slot: starting_system_cap_slot, last_fixed_address: 0, normal_untyped, device_untyped, cap_address_names, + unused_boot_info_frames, objects: Vec::new(), } } + #[allow(dead_code)] pub fn reserve(&mut self, allocations: Vec<(&UntypedObject, u64)>) { for alloc in allocations { self.device_untyped.reserve(alloc); @@ -225,16 +232,16 @@ impl<'a> InitSystem<'a> { root: self.cnode_cap, node_index: 1, node_depth: 1, - node_offset: self.cap_slot, + node_offset: self.next_system_cap_slot, num_objects: 1, }, )); - self.cap_slot += 1; + self.next_system_cap_slot += 1; } } - let object_cap = self.cap_slot; - self.cap_slot += 1; + let object_cap = self.next_system_cap_slot; + self.next_system_cap_slot += 1; self.invocations.push(Invocation::new( self.config, InvocationArgs::UntypedRetype { @@ -262,6 +269,29 @@ impl<'a> InitSystem<'a> { kernel_object } + pub fn allocate_from_bootinfo_frames( + &mut self, + phys_address: u64, + object_type: ObjectType, + name: String, + ) -> Object { + let boot_info_frame_idx = self + .unused_boot_info_frames + .iter() + .position(|o| o.phys_addr == phys_address) + .expect("couldn't find the boot info frame at phys addr"); + + let boot_info_frame = self.unused_boot_info_frames[boot_info_frame_idx]; + assert_eq!(boot_info_frame.object_type, object_type); + + self.unused_boot_info_frames.remove(boot_info_frame_idx); + self.objects.push(boot_info_frame); + self.cap_address_names + .insert(boot_info_frame.cap_addr, name); + + boot_info_frame + } + pub fn allocate_objects( &mut self, object_type: ObjectType, @@ -307,8 +337,8 @@ impl<'a> InitSystem<'a> { std::process::exit(1); } ); - let base_cap_slot = self.cap_slot; - self.cap_slot += count; + let base_cap_slot = self.next_system_cap_slot; + self.next_system_cap_slot += count; let mut to_alloc = count; let mut alloc_cap_slot = base_cap_slot; @@ -360,7 +390,6 @@ struct BuiltSystem { bootstrap_invocations: Vec, system_invocations: Vec, kernel_boot_info: BootInfo, - reserved_region: MemoryRegion, fault_ep_cap_address: u64, reply_cap_address: u64, cap_lookup: HashMap, @@ -368,12 +397,19 @@ struct BuiltSystem { vm_tcb_caps: Vec, sched_caps: Vec, ntfn_caps: Vec, + // subset of the initial task pd_elf_regions: Vec>, pd_setvar_values: Vec>, pd_stack_addrs: Vec, kernel_objects: Vec, + + // full is temp. initial_task_virt_region: MemoryRegion, initial_task_phys_region: MemoryRegion, + initial_task_monitor_virt_subregion: MemoryRegion, + initial_task_monitor_phys_subregion: MemoryRegion, + initial_task_invocations_virt_subregion: MemoryRegion, + initial_task_invocations_phys_subregion: MemoryRegion, } pub fn pd_write_symbols( @@ -649,7 +685,6 @@ fn emulate_kernel_boot( kernel_elf: &ElfFile, initial_task_phys_region: MemoryRegion, initial_task_virt_region: MemoryRegion, - reserved_region: MemoryRegion, ) -> BootInfo { assert!(initial_task_phys_region.size() == initial_task_virt_region.size()); let partial_info = kernel_partial_boot(config, kernel_elf); @@ -657,8 +692,9 @@ fn emulate_kernel_boot( let device_memory = partial_info.device_memory; let boot_region = partial_info.boot_region; + // The initial task memory does not appear in the untypes, but instead + // appears in the userImageFrames part of the bootinfo as frames. normal_memory.remove_region(initial_task_phys_region.base, initial_task_phys_region.end); - normal_memory.remove_region(reserved_region.base, reserved_region.end); // Now, the tricky part! determine which memory is used for the initial task objects let initial_objects_size = calculate_rootserver_size(config, initial_task_virt_region); @@ -688,21 +724,47 @@ fn emulate_kernel_boot( let fixed_cap_count = 0x10; let sched_control_cap_count = 1; let paging_cap_count = get_arch_n_paging(config, initial_task_virt_region); + + let sched_control_cap = fixed_cap_count + paging_cap_count; + + // TODO: Does the kernel allocate these like this, or is it done purely in 4k? + let user_image_frames = { + let mut frames = Vec::new(); + let user_image_pages = initial_task_virt_region.size() / config.minimum_page_size; + let mut phys_addr = initial_task_phys_region.base; + + for i in 1..=user_image_pages { + // TODO: make the loader able to check these frames. + frames.push(Object { + object_type: ObjectType::SmallPage, + cap_addr: i + sched_control_cap, + phys_addr, + }); + + phys_addr += config.minimum_page_size; + } + + frames + }; + + assert_eq!( + initial_task_phys_region.size(), + initial_task_virt_region.size() + ); let page_cap_count = initial_task_virt_region.size() / config.minimum_page_size; + assert_eq!(page_cap_count, user_image_frames.len() as u64); + let first_untyped_cap = fixed_cap_count + paging_cap_count + sched_control_cap_count + page_cap_count; - let sched_control_cap = fixed_cap_count + paging_cap_count; let max_bits = match config.arch { Arch::Aarch64 => 47, Arch::Riscv64 => 38, }; - let device_regions: Vec = [ - reserved_region.aligned_power_of_two_regions(config, max_bits), - device_memory.aligned_power_of_two_regions(config, max_bits), - ] - .concat(); + let device_regions: Vec = + device_memory.aligned_power_of_two_regions(config, max_bits); let normal_regions: Vec = [ + // TODO: ?? boot_region.aligned_power_of_two_regions(config, max_bits), normal_memory.aligned_power_of_two_regions(config, max_bits), ] @@ -723,7 +785,7 @@ fn emulate_kernel_boot( BootInfo { fixed_cap_count, paging_cap_count, - page_cap_count, + user_image_frames, sched_control_cap, first_available_cap, untyped_objects, @@ -754,55 +816,97 @@ fn build_system( let system_cnode_bits = system_cnode_size.ilog2() as u64; - // Emulate kernel boot - - // Determine physical memory region used by the monitor - let initial_task_size = phys_mem_region_from_elf(monitor_elf, config.minimum_page_size).size(); - - // Determine physical memory region for 'reserved' memory. + // ==== Emulate kernel boot ==== // - // The 'reserved' memory region will not be touched by seL4 during boot - // and allows the monitor (initial task) to create memory regions - // from this area, which can then be made available to the appropriate - // protection domains - let mut pd_elf_size = 0; - for pd_elf in pd_elf_files { - for r in phys_mem_regions_from_elf(pd_elf, config.minimum_page_size) { - pd_elf_size += r.size(); + // Determine physical memory region used by the monitor (the initial task) + let monitor_size = phys_mem_region_from_elf(monitor_elf, config.minimum_page_size).size(); + let pd_elfs_size = { + let mut size = 0; + for pd_elf in pd_elf_files { + for r in phys_mem_regions_from_elf(pd_elf, config.minimum_page_size) { + size += r.size(); + } } - } - let reserved_size = invocation_table_size + pd_elf_size; + + size + }; + + // The 'initial task' region consists of the monitor ELF (the real initial task) + // but also the invocation table and the size of the PD elfs. + // If we don't include these here or otherwise reserve the memory, seL4 + // will clear it and we need it after bootup. + let initial_task_size = monitor_size + invocation_table_size + pd_elfs_size; // Now that the size is determined, find a free region in the physical memory // space. let (mut available_memory, kernel_boot_region) = emulate_kernel_boot_partial(config, kernel_elf); - // The kernel relies on the reserved region being allocated above the kernel + // The kernel relies on the initial task being allocated above the kernel // boot/ELF region, so we have the end of the kernel boot region as the lower - // bound for allocating the reserved region. - let reserved_base = available_memory.allocate_from(reserved_size, kernel_boot_region.end); - assert!(kernel_boot_region.base < reserved_base); - // The kernel relies on the initial task being allocated above the reserved - // region, so we have the address of the end of the reserved region as the - // lower bound for allocating the initial task. - let initial_task_phys_base = - available_memory.allocate_from(initial_task_size, reserved_base + reserved_size); - assert!(reserved_base < initial_task_phys_base); - - let initial_task_phys_region = MemoryRegion::new( - initial_task_phys_base, - initial_task_phys_base + initial_task_size, + // bound for allocating the initial task. + let initial_task_phys_region = { + let initial_task_phys_base = + available_memory.allocate_from(initial_task_size, kernel_boot_region.end); + assert!(kernel_boot_region.base < initial_task_phys_base); + + MemoryRegion::new( + initial_task_phys_base, + initial_task_phys_base + initial_task_size, + ) + }; + + // TODO: this can be different. + let monitor_virt_region = virt_mem_region_from_elf(monitor_elf, config.minimum_page_size); + + let initial_task_virt_region = MemoryRegion::new( + // TODO: is the best spot for the invoc table + elfs before the monitor, + // or after? note they have to be contiguously mapped. + monitor_virt_region.base - (invocation_table_size + pd_elfs_size), + monitor_virt_region.end, ); - let initial_task_virt_region = virt_mem_region_from_elf(monitor_elf, config.minimum_page_size); - let reserved_region = MemoryRegion::new(reserved_base, reserved_base + reserved_size); + let monitor_phys_region = MemoryRegion::new( + initial_task_phys_region.base + (invocation_table_size + pd_elfs_size), + initial_task_phys_region.base + (invocation_table_size + pd_elfs_size + monitor_size), + ); // Now that the reserved region has been allocated we can determine the specific // region of physical memory required for the inovcation table itself, and // all the ELF segments - let invocation_table_region = - MemoryRegion::new(reserved_base, reserved_base + invocation_table_size); + let invocation_table_phys_region = MemoryRegion::new( + initial_task_phys_region.base, + initial_task_phys_region.base + invocation_table_size, + ); + let invocation_table_virt_region = MemoryRegion::new( + initial_task_virt_region.base, + initial_task_virt_region.base + invocation_table_size, + ); + let pd_elfs_phys_region = MemoryRegion::new( + invocation_table_phys_region.end, + invocation_table_phys_region.end + pd_elfs_size, + ); + let pd_elfs_virt_region = MemoryRegion::new( + monitor_virt_region.base - pd_elfs_size, + monitor_virt_region.base, + ); + + // contiguous and matches + assert_eq!( + initial_task_phys_region.base, + invocation_table_phys_region.base + ); + assert_eq!(invocation_table_phys_region.end, pd_elfs_phys_region.base); + assert_eq!(pd_elfs_phys_region.end, monitor_phys_region.base); + assert_eq!(monitor_phys_region.end, initial_task_phys_region.end); + // same for virt. + assert_eq!( + initial_task_virt_region.base, + invocation_table_virt_region.base + ); + assert_eq!(invocation_table_virt_region.end, pd_elfs_virt_region.base); + assert_eq!(pd_elfs_virt_region.end, monitor_virt_region.base); + assert_eq!(monitor_virt_region.end, initial_task_virt_region.end); // 1.3 With both the initial task region and reserved region determined the kernel // boot can be emulated. This provides the boot info information which is needed @@ -812,7 +916,6 @@ fn build_system( kernel_elf, initial_task_phys_region, initial_task_virt_region, - reserved_region, ); for ut in &kernel_boot_info.untyped_objects { @@ -984,172 +1087,6 @@ fn build_system( }, )); - // 2.2 At this point it is necessary to get the frames containing the - // main system invocations into the virtual address space. (Remember the - // invocations we are writing out here actually _execute_ at run time! - // It is a bit weird that we talk about mapping in the invocation data - // before we have even generated the invocation data!). - // - // This needs a few steps: - // - // 1. Turn untyped into page objects - // 2. Map the page objects into the address space - // - - // 2.2.1: The memory for the system invocation data resides at the start - // of the reserved region. We can retype multiple frames as a time ( - // which reduces the number of invocations we need). However, it is possible - // that the region spans multiple untyped objects. - // At this point in time we assume we will map the area using the minimum - // page size. It would be good in the future to use super pages (when - // it makes sense to - this would reduce memory usage, and the number of - // invocations required to set up the address space - let pages_required = invocation_table_size / config.minimum_page_size; - let base_page_cap = 0; - for pta in base_page_cap..base_page_cap + pages_required { - cap_address_names.insert( - system_cap_address_mask | pta, - "SmallPage: monitor invocation table".to_string(), - ); - } - - let mut remaining_pages = pages_required; - let mut invocation_table_allocations = Vec::new(); - let mut cap_slot = base_page_cap; - let mut phys_addr = invocation_table_region.base; - - let boot_info_device_untypeds: Vec<&UntypedObject> = kernel_boot_info - .untyped_objects - .iter() - .filter(|o| o.is_device) - .collect(); - for ut in boot_info_device_untypeds { - let ut_pages = ut.region.size() / config.minimum_page_size; - let retype_page_count = min(ut_pages, remaining_pages); - - let mut retypes_remaining = retype_page_count; - while retypes_remaining > 0 { - let num_retypes = min(retypes_remaining, config.fan_out_limit); - bootstrap_invocations.push(Invocation::new( - config, - InvocationArgs::UntypedRetype { - untyped: ut.cap, - object_type: ObjectType::SmallPage, - size_bits: 0, - root: root_cnode_cap, - node_index: 1, - node_depth: 1, - node_offset: cap_slot, - num_objects: num_retypes, - }, - )); - - retypes_remaining -= num_retypes; - cap_slot += num_retypes; - } - - remaining_pages -= retype_page_count; - phys_addr += retype_page_count * config.minimum_page_size; - invocation_table_allocations.push((ut, phys_addr)); - if remaining_pages == 0 { - break; - } - } - - // 2.2.1: Now that physical pages have been allocated it is possible to setup - // the virtual memory objects so that the pages can be mapped into virtual memory - // At this point we map into the arbitrary address of 0x0.8000.0000 (i.e.: 2GiB) - // We arbitrary limit the maximum size to be 128MiB. This allows for at least 1 million - // invocations to occur at system startup. This should be enough for any reasonable - // sized system. - // - // Before mapping it is necessary to install page tables that can cover the region. - let large_page_size = ObjectType::LargePage.fixed_size(config).unwrap(); - let page_table_size = ObjectType::PageTable.fixed_size(config).unwrap(); - let page_tables_required = - util::round_up(invocation_table_size, large_page_size) / large_page_size; - let page_table_allocation = kao - .alloc_n(page_table_size, page_tables_required) - .unwrap_or_else(|| panic!("Internal error: failed to allocate page tables")); - let base_page_table_cap = cap_slot; - - for pta in base_page_table_cap..base_page_table_cap + page_tables_required { - cap_address_names.insert( - system_cap_address_mask | pta, - "PageTable: monitor".to_string(), - ); - } - - assert!(page_tables_required <= config.fan_out_limit); - bootstrap_invocations.push(Invocation::new( - config, - InvocationArgs::UntypedRetype { - untyped: page_table_allocation.untyped_cap_address, - object_type: ObjectType::PageTable, - size_bits: 0, - root: root_cnode_cap, - node_index: 1, - node_depth: 1, - node_offset: cap_slot, - num_objects: page_tables_required, - }, - )); - cap_slot += page_tables_required; - - let page_table_vaddr: u64 = 0x8000_0000; - // Now that the page tables are allocated they can be mapped into vspace - let bootstrap_pt_attr = match config.arch { - Arch::Aarch64 => ArmVmAttributes::default(), - Arch::Riscv64 => RiscvVmAttributes::default(), - }; - let mut pt_map_invocation = Invocation::new( - config, - InvocationArgs::PageTableMap { - page_table: system_cap_address_mask | base_page_table_cap, - vspace: INIT_VSPACE_CAP_ADDRESS, - vaddr: page_table_vaddr, - attr: bootstrap_pt_attr, - }, - ); - pt_map_invocation.repeat( - page_tables_required as u32, - InvocationArgs::PageTableMap { - page_table: 1, - vspace: 0, - vaddr: ObjectType::LargePage.fixed_size(config).unwrap(), - attr: 0, - }, - ); - bootstrap_invocations.push(pt_map_invocation); - - // Finally, once the page tables are allocated the pages can be mapped - let page_vaddr: u64 = 0x8000_0000; - let bootstrap_page_attr = match config.arch { - Arch::Aarch64 => ArmVmAttributes::default() | ArmVmAttributes::ExecuteNever as u64, - Arch::Riscv64 => RiscvVmAttributes::default() | RiscvVmAttributes::ExecuteNever as u64, - }; - let mut map_invocation = Invocation::new( - config, - InvocationArgs::PageMap { - page: system_cap_address_mask | base_page_cap, - vspace: INIT_VSPACE_CAP_ADDRESS, - vaddr: page_vaddr, - rights: Rights::Read as u64, - attr: bootstrap_page_attr, - }, - ); - map_invocation.repeat( - pages_required as u32, - InvocationArgs::PageMap { - page: 1, - vspace: 0, - vaddr: config.minimum_page_size, - rights: 0, - attr: 0, - }, - ); - bootstrap_invocations.push(map_invocation); - // 3. Now we can start setting up the system based on the information // the user provided in the System Description Format. // @@ -1167,75 +1104,88 @@ fn build_system( // as needed by MRs // Page table structs: // as needed by protection domains based on mappings required - let mut phys_addr_next = reserved_base + invocation_table_size; - // Now we create additional MRs (and mappings) for the ELF files. - let mut pd_elf_regions: Vec> = Vec::with_capacity(system.protection_domains.len()); let mut extra_mrs = Vec::new(); let mut pd_extra_maps: HashMap<&ProtectionDomain, Vec> = HashMap::new(); - for (i, pd) in system.protection_domains.iter().enumerate() { - pd_elf_regions.push(Vec::with_capacity(pd_elf_files[i].segments.len())); - for (seg_idx, segment) in pd_elf_files[i].segments.iter().enumerate() { - if !segment.loadable { - continue; - } - let segment_phys_addr = phys_addr_next + (segment.virt_addr % config.minimum_page_size); - pd_elf_regions[i].push(Region::new( - format!("PD-ELF {}-{}", pd.name, seg_idx), - segment_phys_addr, - segment.data.len() as u64, - seg_idx, - )); + let pd_elf_regions = { + let mut phys_addr_next = pd_elfs_phys_region.base; + + // Now we create additional MRs (and mappings) for the ELF files. + // But: the ELF files are special, and are part of the initial task + // => we do not create them from untypes, but from pre-existing frames + // from the boot info. + let mut pd_elf_regions: Vec> = + Vec::with_capacity(system.protection_domains.len()); + for (i, pd) in system.protection_domains.iter().enumerate() { + pd_elf_regions.push(Vec::with_capacity(pd_elf_files[i].segments.len())); + for (seg_idx, segment) in pd_elf_files[i].segments.iter().enumerate() { + if !segment.loadable { + continue; + } - let mut perms = 0; - if segment.is_readable() { - perms |= SysMapPerms::Read as u8; - } - if segment.is_writable() { - perms |= SysMapPerms::Write as u8; - } - if segment.is_executable() { - perms |= SysMapPerms::Execute as u8; - } + let segment_phys_addr = + phys_addr_next + (segment.virt_addr % config.minimum_page_size); + pd_elf_regions[i].push(Region::new( + format!("PD-ELF {}-{}", pd.name, seg_idx), + segment_phys_addr, + segment.data.len() as u64, + seg_idx, + )); - let base_vaddr = util::round_down(segment.virt_addr, config.minimum_page_size); - let end_vaddr = util::round_up( - segment.virt_addr + segment.mem_size(), - config.minimum_page_size, - ); - let aligned_size = end_vaddr - base_vaddr; - let name = format!("ELF:{}-{}", pd.name, seg_idx); - let mr = SysMemoryRegion { - name, - size: aligned_size, - page_size: PageSize::Small, - page_count: aligned_size / PageSize::Small as u64, - phys_addr: Some(phys_addr_next), - text_pos: None, - kind: SysMemoryRegionKind::Elf, - }; - phys_addr_next += aligned_size; - - let mp = SysMap { - mr: mr.name.clone(), - vaddr: base_vaddr, - perms, - cached: true, - text_pos: None, - }; - if let Some(extra_maps) = pd_extra_maps.get_mut(pd) { - extra_maps.push(mp); - } else { - pd_extra_maps.insert(pd, vec![mp]); - } + let mut perms = 0; + if segment.is_readable() { + perms |= SysMapPerms::Read as u8; + } + if segment.is_writable() { + perms |= SysMapPerms::Write as u8; + } + if segment.is_executable() { + perms |= SysMapPerms::Execute as u8; + } + + let base_vaddr = util::round_down(segment.virt_addr, config.minimum_page_size); + let end_vaddr = util::round_up( + segment.virt_addr + segment.mem_size(), + config.minimum_page_size, + ); + let aligned_size = end_vaddr - base_vaddr; + let name = format!("ELF:{}-{}", pd.name, seg_idx); + + // This memory region is special! + let mr = SysMemoryRegion { + name, + size: aligned_size, + page_size: PageSize::Small, + page_count: aligned_size / PageSize::Small as u64, + phys_addr: Some(phys_addr_next), + text_pos: None, + kind: SysMemoryRegionKind::Elf, + }; + phys_addr_next += aligned_size; + + let mp = SysMap { + mr: mr.name.clone(), + vaddr: base_vaddr, + perms, + cached: true, + text_pos: None, + }; + if let Some(extra_maps) = pd_extra_maps.get_mut(pd) { + extra_maps.push(mp); + } else { + pd_extra_maps.insert(pd, vec![mp]); + } - // Add to extra_mrs at the end to avoid movement issues with the MR since it's used in - // constructing the SysMap struct - extra_mrs.push(mr); + // Add to extra_mrs at the end to avoid movement issues with the MR since it's used in + // constructing the SysMap struct + extra_mrs.push(mr); + } } - } - assert!(phys_addr_next - (reserved_base + invocation_table_size) == pd_elf_size); + assert!(phys_addr_next == pd_elfs_phys_region.end); + + pd_elf_regions + }; // Here we create a memory region/mapping for the stack for each PD. // We allocate the stack at the highest possible virtual address that the @@ -1281,14 +1231,14 @@ fn build_system( config, root_cnode_cap, system_cap_address_mask, - cap_slot, + /* starting_system_cap_slot */ 0, &mut kao, &mut kad, &mut system_invocations, &mut cap_address_names, + kernel_boot_info.user_image_frames.clone(), ); - init_system.reserve(invocation_table_allocations); let mut mr_pages: HashMap<&SysMemoryRegion, Vec> = HashMap::new(); // 3.1 Work out how many fixed page objects are required @@ -1321,7 +1271,14 @@ fn build_system( "Page({} {}): MR={} @ {:x}", page_size_human, page_size_label, mr.name, phys_addr ); - let page = init_system.allocate_fixed_object(phys_addr, obj_type, name); + let page = if mr.kind == SysMemoryRegionKind::Elf { + // These ones can from the user image (initial task) frames that + // the kernel passes in the boot info. + init_system.allocate_from_bootinfo_frames(phys_addr, obj_type, name) + } else { + init_system.allocate_fixed_object(phys_addr, obj_type, name) + }; + mr_pages.get_mut(mr).unwrap().push(page); } @@ -1710,7 +1667,7 @@ fn build_system( let vm_cnode_objs = &cnode_objs[system.protection_domains.len()..]; - let mut cap_slot = init_system.cap_slot; + let mut cap_slot = init_system.next_system_cap_slot; let kernel_objects = init_system.objects; // Create all the necessary interrupt handler objects. These aren't @@ -2787,7 +2744,6 @@ fn build_system( bootstrap_invocations, system_invocations, kernel_boot_info, - reserved_region, fault_ep_cap_address: fault_ep_endpoint_object.cap_addr, reply_cap_address: reply_obj.cap_addr, cap_lookup: cap_address_names, @@ -2801,6 +2757,10 @@ fn build_system( kernel_objects, initial_task_phys_region, initial_task_virt_region, + initial_task_monitor_phys_subregion: monitor_phys_region, + initial_task_monitor_virt_subregion: monitor_virt_region, + initial_task_invocations_phys_subregion: invocation_table_phys_region, + initial_task_invocations_virt_subregion: invocation_table_virt_region, }) } @@ -2822,10 +2782,11 @@ fn write_report( " # of page table caps: {:>8}", comma_sep_u64(built_system.kernel_boot_info.paging_cap_count) )?; + // TODO: print out the details? writeln!( buf, " # of page caps : {:>8}", - comma_sep_u64(built_system.kernel_boot_info.page_cap_count) + comma_sep_u64(built_system.kernel_boot_info.user_image_frames.len() as u64) )?; writeln!( buf, @@ -2838,7 +2799,10 @@ fn write_report( writeln!(buf, " {}", region)?; } } - writeln!(buf, "\n# Monitor (Initial Task) Info\n")?; + writeln!( + buf, + "\n# Initial Task Info (Monitor + Invocation Data + PD Elfs)\n" + )?; writeln!( buf, " virtual memory : {}", @@ -3309,6 +3273,7 @@ fn main() -> Result<(), String> { bootstrap_invocation_count_symbol_name: "bootstrap_invocation_count", bootstrap_invocation_data_symbol_name: "bootstrap_invocation_data", system_invocation_count_symbol_name: "system_invocation_count", + system_invocation_data_symbol_name: "system_invocation_data", }; let kernel_elf = ElfFile::from_path(&kernel_elf_path)?; @@ -3468,6 +3433,14 @@ fn main() -> Result<(), String> { monitor_config.bootstrap_invocation_count_symbol_name, &built_system.bootstrap_invocations.len().to_le_bytes(), )?; + monitor_elf.write_symbol( + // address + monitor_config.system_invocation_data_symbol_name, + &built_system + .initial_task_invocations_virt_subregion + .base + .to_le_bytes(), + )?; monitor_elf.write_symbol( monitor_config.system_invocation_count_symbol_name, &built_system.system_invocations.len().to_le_bytes(), @@ -3550,10 +3523,11 @@ fn main() -> Result<(), String> { } report_buf.flush().unwrap(); - let mut loader_regions: Vec<(u64, &[u8])> = vec![( - built_system.reserved_region.base, - &built_system.invocation_data, - )]; + #[rustfmt::skip] + let mut loader_regions: Vec<(u64, &[u8])> = vec![ + // TODO: we should really have the monitor elf as a region here, no? + (built_system.initial_task_invocations_phys_subregion.base, &built_system.invocation_data) + ]; for (i, regions) in built_system.pd_elf_regions.iter().enumerate() { for r in regions { loader_regions.push((r.addr, r.data(&pd_elf_files[i]))); @@ -3564,9 +3538,12 @@ fn main() -> Result<(), String> { &kernel_config, Path::new(&loader_elf_path), &kernel_elf, + // see the above todo as part of loader regions &monitor_elf, - Some(built_system.initial_task_phys_region.base), - built_system.reserved_region, + built_system.initial_task_monitor_phys_subregion, + built_system.initial_task_monitor_virt_subregion, + built_system.initial_task_phys_region, + built_system.initial_task_virt_region, loader_regions, ); loader.write_image(Path::new(args.output)); diff --git a/tool/microkit/src/sdf.rs b/tool/microkit/src/sdf.rs index c41c46e43..fe44931e3 100644 --- a/tool/microkit/src/sdf.rs +++ b/tool/microkit/src/sdf.rs @@ -92,6 +92,7 @@ pub struct SysMap { #[derive(Debug, PartialEq, Eq, Hash, Clone)] pub enum SysMemoryRegionKind { User, + // This one is special and is allocated from the existing frames in the initial task Elf, Stack, } diff --git a/tool/microkit/src/sel4.rs b/tool/microkit/src/sel4.rs index 8184c2c23..a04d79d94 100644 --- a/tool/microkit/src/sel4.rs +++ b/tool/microkit/src/sel4.rs @@ -14,7 +14,7 @@ pub struct BootInfo { pub fixed_cap_count: u64, pub sched_control_cap: u64, pub paging_cap_count: u64, - pub page_cap_count: u64, + pub user_image_frames: Vec, pub untyped_objects: Vec, pub first_available_cap: u64, } @@ -40,7 +40,7 @@ pub struct PlatformConfig { /// The cap_address refers to a cap address that addresses this cap. /// The cap_address is is intended to be valid within the context of the /// initial task. -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone)] pub struct Object { /// Type of kernel object pub object_type: ObjectType, From c0e15d5ba17ea913efa3502c9880ab1e4d1d0888 Mon Sep 17 00:00:00 2001 From: julia Date: Wed, 28 May 2025 15:19:30 +1000 Subject: [PATCH 2/2] TODO: can we check that things match(?) Signed-off-by: julia --- monitor/src/debug.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/monitor/src/debug.c b/monitor/src/debug.c index 1a4eaed4e..2ecc1e77d 100644 --- a/monitor/src/debug.c +++ b/monitor/src/debug.c @@ -95,6 +95,25 @@ void dump_bootinfo(seL4_BootInfo *bi) puthex64(bi->extraBIPages.end - 1); puts("\n"); +#if 0 + for (i = bi->userImageFrames.start; i < bi->userImageFrames.end; i++) { + // seL4_DebugCapIdentify(i); + // puts("userImageFramesList["); + // puthex32(i); + // puts("] = slot: "); + // puthex32(bi->userImageFrames.start + i); + // puts(", paddr: "); + // puthex64(bi->untypedList[i].paddr); + // puts(" - "); + // puthex64(bi->untypedList[i].paddr + (1UL << bi->untypedList[i].sizeBits)); + // puts(" ("); + // puts(bi->untypedList[i].isDevice ? "device" : "normal"); + // puts(") bits: "); + // puthex32(bi->untypedList[i].sizeBits); + // puts("\n"); + } +#endif + #if 1 for (i = 0; i < bi->untyped.end - bi->untyped.start; i++) { puts("untypedList[");