/*	$NetBSD: mm.c,v 1.28 2021/05/04 21:09:16 khorben Exp $	*/

/*
 * Copyright (c) 2017-2020 The NetBSD Foundation, Inc. All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "prekern.h"

#define ELFROUND	64

static const uint8_t pads[4] = {
	[BTSEG_NONE] = 0x00,
	[BTSEG_TEXT] = 0xCC,
	[BTSEG_RODATA] = 0x00,
	[BTSEG_DATA] = 0x00
};

#define MM_PROT_READ	0x00
#define MM_PROT_WRITE	0x01
#define MM_PROT_EXECUTE	0x02

static const pt_entry_t protection_codes[3] = {
	[MM_PROT_READ] = PTE_NX,
	[MM_PROT_WRITE] = PTE_W | PTE_NX,
	[MM_PROT_EXECUTE] = 0, /* RWX does not exist */
};

struct bootspace bootspace;

extern paddr_t kernpa_start, kernpa_end;
vaddr_t iom_base;

paddr_t pa_avail = 0;
static const vaddr_t tmpva = (PREKERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);

void
mm_init(paddr_t first_pa)
{
	pa_avail = first_pa;
}

static void
mm_enter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
	if (PTE_BASE[pl1_i(va)] & PTE_P) {
		fatal("mm_enter_pa: mapping already present");
	}
	PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

static void
mm_reenter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
	PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

static void
mm_flush_va(vaddr_t va)
{
	asm volatile("invlpg (%0)" ::"r" (va) : "memory");
}

static paddr_t
mm_palloc(size_t npages)
{
	paddr_t pa;
	size_t i;

	/* Allocate the physical pages */
	pa = pa_avail;
	pa_avail += npages * PAGE_SIZE;

	/* Zero them out */
	for (i = 0; i < npages; i++) {
		mm_reenter_pa(pa + i * PAGE_SIZE, tmpva,
		    MM_PROT_READ|MM_PROT_WRITE);
		mm_flush_va(tmpva);
		memset((void *)tmpva, 0, PAGE_SIZE);
	}

	return pa;
}

static bool
mm_pte_is_valid(pt_entry_t pte)
{
	return ((pte & PTE_P) != 0);
}

static void
mm_mprotect(vaddr_t startva, size_t size, pte_prot_t prot)
{
	size_t i, npages;
	vaddr_t va;
	paddr_t pa;

	ASSERT(size % PAGE_SIZE == 0);
	npages = size / PAGE_SIZE;

	for (i = 0; i < npages; i++) {
		va = startva + i * PAGE_SIZE;
		pa = (PTE_BASE[pl1_i(va)] & PTE_FRAME);
		mm_reenter_pa(pa, va, prot);
		mm_flush_va(va);
	}
}

void
mm_bootspace_mprotect(void)
{
	pte_prot_t prot;
	size_t i;

	/* Remap the kernel segments with proper permissions. */
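	/*
	 * Text becomes read+execute and rodata becomes read-only below.
	 * BTSEG_DATA segments are skipped: they keep the writable,
	 * non-executable mapping (PTE_W | PTE_NX) that mm_enter_pa() gave
	 * them with MM_PROT_READ|MM_PROT_WRITE.
	 */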
	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_TEXT) {
			prot = MM_PROT_READ|MM_PROT_EXECUTE;
		} else if (bootspace.segs[i].type == BTSEG_RODATA) {
			prot = MM_PROT_READ;
		} else {
			continue;
		}

		mm_mprotect(bootspace.segs[i].va, bootspace.segs[i].sz, prot);
	}

	print_state(STATE_NORMAL, "Segments protection updated");
}

static size_t
mm_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
{
	size_t npages;

	npages = roundup((endva / PAGE_SIZE), (pgsz / PAGE_SIZE)) -
	    rounddown((startva / PAGE_SIZE), (pgsz / PAGE_SIZE));
	return (npages / (pgsz / PAGE_SIZE));
}

static void
mm_map_tree(vaddr_t startva, vaddr_t endva)
{
	size_t i, nL4e, nL3e, nL2e;
	size_t L4e_idx, L3e_idx, L2e_idx;
	paddr_t pa;

	/* Build L4. */
	L4e_idx = pl4_i(startva);
	nL4e = mm_nentries_range(startva, endva, NBPD_L4);
	ASSERT(L4e_idx == 511);
	ASSERT(nL4e == 1);
	if (!mm_pte_is_valid(L4_BASE[L4e_idx])) {
		pa = mm_palloc(1);
		L4_BASE[L4e_idx] = pa | PTE_P | PTE_W;
	}

	/* Build L3. */
	L3e_idx = pl3_i(startva);
	nL3e = mm_nentries_range(startva, endva, NBPD_L3);
	for (i = 0; i < nL3e; i++) {
		if (mm_pte_is_valid(L3_BASE[L3e_idx+i])) {
			continue;
		}
		pa = mm_palloc(1);
		L3_BASE[L3e_idx+i] = pa | PTE_P | PTE_W;
	}

	/* Build L2. */
	L2e_idx = pl2_i(startva);
	nL2e = mm_nentries_range(startva, endva, NBPD_L2);
	for (i = 0; i < nL2e; i++) {
		if (mm_pte_is_valid(L2_BASE[L2e_idx+i])) {
			continue;
		}
		pa = mm_palloc(1);
		L2_BASE[L2e_idx+i] = pa | PTE_P | PTE_W;
	}
}

static vaddr_t
mm_randva_kregion(size_t size, size_t pagesz)
{
	vaddr_t sva, eva;
	vaddr_t randva;
	uint64_t rnd;
	size_t i;
	bool ok;

	while (1) {
		prng_get_rand(&rnd, sizeof(rnd));
		randva = rounddown(KASLR_WINDOW_BASE +
		    rnd % (KASLR_WINDOW_SIZE - size), pagesz);

		/* Detect collisions */
		ok = true;
		for (i = 0; i < BTSPACE_NSEGS; i++) {
			if (bootspace.segs[i].type == BTSEG_NONE) {
				continue;
			}
			sva = bootspace.segs[i].va;
			eva = sva + bootspace.segs[i].sz;

			if ((sva <= randva) && (randva < eva)) {
				ok = false;
				break;
			}
			if ((sva < randva + size) && (randva + size <= eva)) {
				ok = false;
				break;
			}
			if (randva < sva && eva < (randva + size)) {
				ok = false;
				break;
			}
		}
		if (ok) {
			break;
		}
	}

	mm_map_tree(randva, randva + size);

	return randva;
}

static paddr_t
bootspace_get_kern_segs_end_pa(void)
{
	paddr_t pa, max = 0;
	size_t i;

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_NONE) {
			continue;
		}
		pa = bootspace.segs[i].pa + bootspace.segs[i].sz;
		if (pa > max)
			max = pa;
	}

	return max;
}

static void
bootspace_addseg(int type, vaddr_t va, paddr_t pa, size_t sz)
{
	size_t i;

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_NONE) {
			bootspace.segs[i].type = type;
			bootspace.segs[i].va = va;
			bootspace.segs[i].pa = pa;
			bootspace.segs[i].sz = sz;
			return;
		}
	}

	fatal("bootspace_addseg: segments full");
}

static size_t
mm_shift_segment(vaddr_t va, size_t pagesz, size_t elfsz, size_t elfalign)
{
	size_t shiftsize, offset;
	uint64_t rnd;

	/*
	 * If possible, shift the segment in memory using a random offset. Once
	 * shifted the segment remains in the same page, of size pagesz. Make
	 * sure to respect the ELF alignment constraint.
	 */
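	/*
	 * Illustrative numbers (not taken from an actual kernel build): with
	 * pagesz = NBPD_L2 (2MB), elfsz = 0x100040 and elfalign = 64, we get
	 * shiftsize = roundup(0x100040, 0x200000) - roundup(0x100040, 64)
	 *           = 0x200000 - 0x100040 = 0xFFFC0,
	 * so the segment can be moved by any 64-byte-aligned offset in
	 * [0, 0xFFFC0] and still fit within the 2MB page.
	 */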
	if (elfalign == 0) {
		elfalign = ELFROUND;
	}
	ASSERT(pagesz >= elfalign);
	ASSERT(pagesz % elfalign == 0);
	shiftsize = roundup(elfsz, pagesz) - roundup(elfsz, elfalign);
	if (shiftsize == 0) {
		return 0;
	}

	prng_get_rand(&rnd, sizeof(rnd));
	offset = roundup(rnd % shiftsize, elfalign);
	ASSERT((va + offset) % elfalign == 0);

	memmove((void *)(va + offset), (void *)va, elfsz);

	return offset;
}

static void
mm_map_head(void)
{
	size_t i, npages, size;
	uint64_t rnd;
	vaddr_t randva;

	/*
	 * The HEAD window is 1GB below the main KASLR window. This is to
	 * ensure that head always comes first in virtual memory. The reason
	 * for that is that we use (headva + sh_offset), and sh_offset is
	 * unsigned.
	 */

	/*
	 * To get the size of the head, we take a look at the read-only
	 * mapping of the kernel we created in locore. We're identity mapped,
	 * so kernpa = kernva.
	 */
	size = elf_get_head_size((vaddr_t)kernpa_start);
	npages = size / PAGE_SIZE;

	/*
	 * Choose a random range of VAs in the HEAD window, and create the page
	 * tree for it.
	 */
	prng_get_rand(&rnd, sizeof(rnd));
	randva = rounddown(HEAD_WINDOW_BASE + rnd % (HEAD_WINDOW_SIZE - size),
	    PAGE_SIZE);
	mm_map_tree(randva, randva + size);

	/* Enter the area and build the ELF info */
	for (i = 0; i < npages; i++) {
		mm_enter_pa(kernpa_start + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}
	elf_build_head(randva);

	/* Register the values in bootspace */
	bootspace.head.va = randva;
	bootspace.head.pa = kernpa_start;
	bootspace.head.sz = size;
}

vaddr_t
mm_map_segment(int segtype, paddr_t pa, size_t elfsz, size_t elfalign)
{
	size_t i, npages, size, pagesz, offset;
	vaddr_t randva;
	char pad;

	if (elfsz <= PAGE_SIZE) {
		pagesz = NBPD_L1;
	} else {
		pagesz = NBPD_L2;
	}

	/* Create the page tree */
	size = roundup(elfsz, pagesz);
	randva = mm_randva_kregion(size, pagesz);

	/* Enter the segment */
	npages = size / PAGE_SIZE;
	for (i = 0; i < npages; i++) {
		mm_enter_pa(pa + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Shift the segment in memory */
	offset = mm_shift_segment(randva, pagesz, elfsz, elfalign);
	ASSERT(offset + elfsz <= size);

	/* Fill the paddings */
	pad = pads[segtype];
	memset((void *)randva, pad, offset);
	memset((void *)(randva + offset + elfsz), pad, size - elfsz - offset);

	/* Register the bootspace information */
	bootspace_addseg(segtype, randva, pa, size);

	return (randva + offset);
}

static void
mm_map_boot(void)
{
	size_t i, npages, size;
	vaddr_t randva;
	paddr_t bootpa;

	/*
	 * The "boot" region is special: its page tree has a fixed size, but
	 * the number of pages entered is lower.
	 */
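	/*
	 * Concretely: the tree built below covers (NKL2_KIMG_ENTRIES + 1) L2
	 * entries of VA, but only (pa_avail - bootpa) bytes of RAM plus
	 * IOM_SIZE bytes of ISA I/O MEM get entered; the VA left over at the
	 * end of "boot" is later used by the kernel (e.g. for the module_map).
	 */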

	/* Create the page tree, starting at a random VA */
	size = (NKL2_KIMG_ENTRIES + 1) * NBPD_L2;
	randva = mm_randva_kregion(size, PAGE_SIZE);

	/* The "boot" region begins right after the kernel segments */
	bootpa = bootspace_get_kern_segs_end_pa();

	/*
	 * The prekern consumed some EXTRA memory up until pa_avail; this
	 * covers REL/RELA/SYM/STR and EXTRA.
	 */
	size = (pa_avail - bootpa);
	npages = size / PAGE_SIZE;

	/* Enter the whole area linearly */
	for (i = 0; i < npages; i++) {
		mm_enter_pa(bootpa + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Fix up the ELF sections located in the "boot" region */
	elf_fixup_boot(randva, bootpa);

	/* Map the ISA I/O MEM right after EXTRA, in pure VA */
	iom_base = randva + npages * PAGE_SIZE;
	npages = IOM_SIZE / PAGE_SIZE;
	for (i = 0; i < npages; i++) {
		mm_enter_pa(IOM_BEGIN + i * PAGE_SIZE,
		    iom_base + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Register the values in bootspace */
	bootspace.boot.va = randva;
	bootspace.boot.pa = bootpa;
	bootspace.boot.sz = (size_t)(iom_base + IOM_SIZE) -
	    (size_t)bootspace.boot.va;

	/* Initialize the values that are located in the "boot" region */
	extern uint64_t PDPpaddr;
	bootspace.spareva = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
	bootspace.pdir = bootspace.boot.va + (PDPpaddr - bootspace.boot.pa);
	bootspace.smodule = (vaddr_t)iom_base + IOM_SIZE;
	bootspace.emodule = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
}

/*
 * The bootloader has set up the following layout of physical memory:
 * +------------+--------------+------------+------------------------+-------+
 * | ELF HEADER | SECT HEADERS | KERN SECTS | REL/RELA/SYM/STR SECTS | EXTRA |
 * +------------+--------------+------------+------------------------+-------+
 * This was done in the loadfile_elf32.c:loadfile_dynamic() function.
 *
 * We abstract this layout into several "regions":
 * +---------------------------+------------+--------------------------------+
 * | Head region               | Kern segs  | Boot region                    |
 * +---------------------------+------------+--------------------------------+
 *
 * There is a variable number of independent regions we create: one head,
 * several kernel segments, one boot. They are all mapped at random VAs.
 *
 * "Head" contains the ELF Header and ELF Section Headers, and we use them to
 * map the rest of the regions. Head must be placed *before* the other
 * regions, in both virtual memory and physical memory.
 *
 * The "Kernel Segments" contain the kernel SHT_NOBITS and SHT_PROGBITS
 * sections, in a 1:1 manner (one segment is associated with one section).
 * The segments are mapped at random VAs and referenced in bootspace.segs[].
 *
 * "Boot" contains miscellaneous information:
 *  - The ELF Rel/Rela/Sym/Str sections of the kernel
 *  - Some extra memory the prekern has consumed so far
 *  - The ISA I/O MEM, in pure VA
 *  - Eventually the module_map, in pure VA (the kernel uses the available VA
 *    at the end of "boot")
 * Boot is placed *after* the other regions in physical memory. In virtual
 * memory, however, there is no constraint, so its VA is randomly selected in
 * the main KASLR window.
 *
 * At the end of this function, the bootspace structure is fully constructed.
 */
void
mm_map_kernel(void)
{
	memset(&bootspace, 0, sizeof(bootspace));

	mm_map_head();
	print_state(STATE_NORMAL, "Head region mapped");

	elf_map_sections();
	print_state(STATE_NORMAL, "Segments mapped");

	mm_map_boot();
	print_state(STATE_NORMAL, "Boot region mapped");
}