/**
 * \file
 * \brief pmap management
 *
 * x86_64 specific management of page tables
 *
 * Warning: This code is coupled with the code in slot_alloc/ and pinned.c
 *
 * The maximum number of slots required to map a BASE_PAGE_SIZE
 * sized page is the number of page table levels + 1.
 * The sum for x86_64 is 4.
 *
 * Warning: Additional slots will be required to map a BASE_PAGE_SIZE sized
 * page if we also track the actual frames that are mapped.
 * Currently this is not the case.
 */

/*
 * Copyright (c) 2009-2013 ETH Zurich.
 * Copyright (c) 2014 HP Labs.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetstr. 6, CH-8092 Zurich. Attn: Systems Group.
 */
#include <barrelfish/barrelfish.h>
#include <barrelfish/dispatch.h>
#include "target/x86/pmap_x86.h"
// Size of virtual region mapped by a single PML4 entry
#define PML4_MAPPING_SIZE ((genvaddr_t)512*512*512*BASE_PAGE_SIZE)

// Location and size of virtual address space reserved for mapping
// frames backing refill_slabs
#define META_DATA_RESERVED_BASE (PML4_MAPPING_SIZE * (disp_get_core_id() + 1))
#define META_DATA_RESERVED_SIZE (X86_64_BASE_PAGE_SIZE * 80000)
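
// Example (illustrative): one PML4 entry spans 512 * 512 * 512 * 4 KiB
// = 512 GiB of virtual address space. META_DATA_RESERVED_BASE therefore
// places core 0's metadata region at 512 GiB, core 1's at 1 TiB, and so on,
// so the per-core regions backing refill_slabs never overlap.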
/**
 * \brief Translate generic vregion flags to architecture specific pmap flags
 */
static paging_x86_64_flags_t vregion_to_pmap_flag(vregion_flags_t vregion_flags)
{
    paging_x86_64_flags_t pmap_flags =
        PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE;

    if (!(vregion_flags & VREGION_FLAGS_GUARD)) {
        if (vregion_flags & VREGION_FLAGS_WRITE) {
            pmap_flags |= PTABLE_READ_WRITE;
        }
        if (vregion_flags & VREGION_FLAGS_EXECUTE) {
            pmap_flags &= ~PTABLE_EXECUTE_DISABLE;
        }
        if (vregion_flags & VREGION_FLAGS_NOCACHE) {
            pmap_flags |= PTABLE_CACHE_DISABLED;
        }
    }

    return pmap_flags;
}
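
// Example (illustrative, assuming VREGION_FLAGS_READ_WRITE == READ|WRITE):
// a read/write request yields PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE
// | PTABLE_READ_WRITE, i.e. a user-accessible, writable, non-executable
// mapping. Guard regions skip the whole if-block and thus end up neither
// writable nor executable.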
// returns whether va1 and va2 share a page directory entry
// not using X86_64_PDIR_BASE() macro as this would give false positives (same
// entry in different directories)
static inline bool is_same_pdir(genvaddr_t va1, genvaddr_t va2)
{
    return (va1>>X86_64_LARGE_PAGE_BITS) == ((va2-1)>>X86_64_LARGE_PAGE_BITS);
}

// returns whether va1 and va2 share a page directory pointer table entry
static inline bool is_same_pdpt(genvaddr_t va1, genvaddr_t va2)
{
    return (va1>>X86_64_HUGE_PAGE_BITS) == ((va2-1)>>X86_64_HUGE_PAGE_BITS);
}

// returns whether va1 and va2 share a page map level 4 entry
static inline bool is_same_pml4(genvaddr_t va1, genvaddr_t va2)
{
    // the base macros work here as we only have one pml4.
    return X86_64_PML4_BASE(va1) == X86_64_PML4_BASE(va2-1);
}
// size indicates how many bits to shift
static inline genvaddr_t get_addr_prefix(genvaddr_t va, uint8_t size)
{
    return va >> size;
}
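
// Example (illustrative): with map_bits = X86_64_BASE_PAGE_BITS +
// X86_64_PTABLE_BITS (12 + 9 = 21), two addresses have the same prefix
// exactly when they fall under the same leaf page table (one leaf table
// covers 512 * 4 KiB = 2 MiB). do_map() and unmap() below use this to
// decide whether a region still needs another leaf table.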
/**
 * \brief Returns the vnode for the pdpt mapping a given vspace address
 */
static inline errval_t get_pdpt(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdpt)
{
    errval_t err;
    struct vnode *root = &pmap->root;
    assert(root != NULL);

    // PML4 mapping
    if((*pdpt = find_vnode(root, X86_64_PML4_BASE(base))) == NULL) {
        err = alloc_vnode(pmap, root, ObjType_VNode_x86_64_pdpt,
                            X86_64_PML4_BASE(base), pdpt);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}
/**
 * \brief Returns the vnode for the page directory mapping a given vspace
 * address
 */
static inline errval_t get_pdir(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdir)
{
    errval_t err;
    struct vnode *pdpt;
    err = get_pdpt(pmap, base, &pdpt);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdpt != NULL);

    // PDPT mapping
    if((*pdir = find_vnode(pdpt, X86_64_PDPT_BASE(base))) == NULL) {
        err = alloc_vnode(pmap, pdpt, ObjType_VNode_x86_64_pdir,
                            X86_64_PDPT_BASE(base), pdir);
        if (err_is_fail(err)) {
            printf("failure mapping pdpt\n");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}
/**
 * \brief Returns the vnode for the pagetable mapping a given vspace address
 */
static inline errval_t get_ptable(struct pmap_x86 *pmap, genvaddr_t base,
                                  struct vnode **ptable)
{
    errval_t err;
    struct vnode *pdir;
    err = get_pdir(pmap, base, &pdir);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdir != NULL);

    // PDIR mapping
    if((*ptable = find_vnode(pdir, X86_64_PDIR_BASE(base))) == NULL) {
        err = alloc_vnode(pmap, pdir, ObjType_VNode_x86_64_ptable,
                            X86_64_PDIR_BASE(base), ptable);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}
/**
 * \brief Returns the vnode for the page directory pointer table mapping for a
 * given vspace address
 */
static inline struct vnode *find_pdpt(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *root = &pmap->root;
    assert(root != NULL);

    // PML4 mapping
    return find_vnode(root, X86_64_PML4_BASE(base));
}
/**
 * \brief Returns the vnode for the page directory mapping a given vspace
 * address, without performing allocations as get_pdir() does
 */
static inline struct vnode *find_pdir(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *pdpt = find_pdpt(pmap, base);

    if (pdpt) {
        // PDPT mapping
        return find_vnode(pdpt, X86_64_PDPT_BASE(base));
    }

    return NULL;
}
/**
 * \brief Returns the vnode for the pagetable mapping a given vspace address,
 * without performing allocations as get_ptable() does
 */
static inline struct vnode *find_ptable(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *pdir = find_pdir(pmap, base);

    if (pdir) {
        // PDIR mapping
        return find_vnode(pdir, X86_64_PDIR_BASE(base));
    }

    return NULL;
}
static errval_t do_single_map(struct pmap_x86 *pmap, genvaddr_t vaddr,
                              genvaddr_t vend, struct capref frame,
                              size_t offset, size_t pte_count,
                              vregion_flags_t flags)
{
    if (pte_count == 0) {
        debug_printf("do_single_map: pte_count == 0, called from %p\n",
                     __builtin_return_address(0));
        return SYS_ERR_OK;
    }
    assert(pte_count > 0);

    paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags);

    // Get the paging structure and set paging relevant parameters
    struct vnode *ptable;
    errval_t err;
    size_t table_base;

    // get the right paging table and address part
    if(flags & VREGION_FLAGS_LARGE) {
        //large 2M pages, mapped into pdir
        err = get_pdir(pmap, vaddr, &ptable);
        table_base = X86_64_PDIR_BASE(vaddr);
    } else if (flags & VREGION_FLAGS_HUGE) {
        //huge 1GB pages, mapped into pdpt
        err = get_pdpt(pmap, vaddr, &ptable);
        table_base = X86_64_PDPT_BASE(vaddr);
    } else {
        //normal 4K pages, mapped into ptable
        err = get_ptable(pmap, vaddr, &ptable);
        table_base = X86_64_PTABLE_BASE(vaddr);
    }
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_GET_PTABLE);
    }
    assert(ptable->is_vnode);

    // check if there is an overlapping mapping
    if (has_vnode(ptable, table_base, pte_count, false)) {
        if (has_vnode(ptable, table_base, pte_count, true)) {
            printf("page already exists in 0x%"
                    PRIxGENVADDR"--0x%"PRIxGENVADDR"\n", vaddr, vend);
            return LIB_ERR_PMAP_EXISTING_MAPPING;
        } else {
            // clean out empty page tables. We do this here because we benefit
            // from having the page tables in place when doing lots of small
            // mappings
            remove_empty_vnodes(pmap, ptable, table_base, pte_count);
        }
    }

    // setup userspace mapping
    struct vnode *page = slab_alloc(&pmap->slab);
    assert(page);
    page->is_vnode = false;
    page->entry = table_base;
    page->next = ptable->u.vnode.children;
    ptable->u.vnode.children = page;
    page->u.frame.cap = frame;
    page->u.frame.offset = offset;
    page->u.frame.flags = flags;
    page->u.frame.pte_count = pte_count;

    // do map
    err = vnode_map(ptable->u.vnode.cap, frame, table_base,
                    pmap_flags, offset, pte_count);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_VNODE_MAP);
    }

    return SYS_ERR_OK;
}
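
// Example (illustrative, using the usual x86_64 index macros): the level
// that receives the PTEs follows the mapping flags. A VREGION_FLAGS_LARGE
// request for vaddr 0x200000 installs its entry directly in the pdir
// (X86_64_PDIR_BASE(0x200000) == 1), while the default 4 KiB case walks
// down to the leaf ptable and uses X86_64_PTABLE_BASE(vaddr) as the
// starting slot; pte_count contiguous entries are then written by a
// single vnode_map() invocation.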
/**
 * \brief Called when enough slabs exist for the given mapping
 */
static errval_t do_map(struct pmap_x86 *pmap, genvaddr_t vaddr,
                       struct capref frame, size_t offset, size_t size,
                       vregion_flags_t flags, size_t *retoff, size_t *retsize)
{
    errval_t err;

    // determine page size and relevant address part
    size_t page_size = X86_64_BASE_PAGE_SIZE;
    size_t table_base = X86_64_PTABLE_BASE(vaddr);
    uint8_t map_bits = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS;
    bool debug_out = false;

    // get base address and size of frame
    struct frame_identity fi;
    err = invoke_frame_identify(frame, &fi);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_DO_MAP);
    }

    if ((flags & VREGION_FLAGS_HUGE) &&
        (vaddr & X86_64_HUGE_PAGE_MASK) == 0 &&
        fi.bits >= X86_64_HUGE_PAGE_BITS &&
        ((fi.base & X86_64_HUGE_PAGE_MASK) == 0))
    {
        // huge page branch (1GB)
        page_size = X86_64_HUGE_PAGE_SIZE;
        table_base = X86_64_PDPT_BASE(vaddr);
        map_bits = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS;

        // remove large flag, if we're doing huge mapping
        flags &= ~VREGION_FLAGS_LARGE;
    } else if ((flags & VREGION_FLAGS_LARGE) &&
               (vaddr & X86_64_LARGE_PAGE_MASK) == 0 &&
               fi.bits >= X86_64_LARGE_PAGE_BITS &&
               ((fi.base & X86_64_LARGE_PAGE_MASK) == 0))
    {
        // large page branch (2MB)
        page_size = X86_64_LARGE_PAGE_SIZE;
        table_base = X86_64_PDIR_BASE(vaddr);
        map_bits = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS;
    } else {
        // remove large/huge flags
        flags &= ~(VREGION_FLAGS_LARGE|VREGION_FLAGS_HUGE);
    }

    // round to the next full page and calculate end address and #ptes
    size = ROUND_UP(size, page_size);
    size_t pte_count = DIVIDE_ROUND_UP(size, page_size);
    genvaddr_t vend = vaddr + size;

    if (offset+size > (1ULL<<fi.bits)) {
        debug_printf("do_map: offset=%zu; size=%zu; frame size=%zu\n",
                offset, size, ((size_t)1<<fi.bits));
        return LIB_ERR_PMAP_FRAME_SIZE;
    }
    if (debug_out) {
        genpaddr_t paddr = fi.base + offset;

        debug_printf("do_map: 0x%"
                PRIxGENVADDR"--0x%"PRIxGENVADDR" -> 0x%"PRIxGENPADDR
                "; pte_count = %zd; frame bits = %zd; page size = 0x%zx\n",
                vaddr, vend, paddr, pte_count, (size_t)fi.bits, page_size);
    }
    // all mapping on one leaf table?
    if (is_same_pdir(vaddr, vend) ||
        (flags & VREGION_FLAGS_LARGE && is_same_pdpt(vaddr, vend)) ||
        (flags & VREGION_FLAGS_HUGE && is_same_pml4(vaddr, vend))) {
        // fast path
        if (debug_out) {
            debug_printf("  do_map: fast path: %zd\n", pte_count);
        }
        err = do_single_map(pmap, vaddr, vend, frame, offset, pte_count, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }
    }
    else { // multiple leaf page tables
        // first leaf
        uint32_t c = X86_64_PTABLE_SIZE - table_base;
        if (debug_out) {
            debug_printf("  do_map: slow path: first leaf %"PRIu32"\n", c);
        }
        genvaddr_t temp_end = vaddr + c * page_size;
        err = do_single_map(pmap, vaddr, temp_end, frame, offset, c, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }

        // map full leaves
        while (get_addr_prefix(temp_end, map_bits) <
               get_addr_prefix(vend, map_bits))
        {
            // update vars
            vaddr = temp_end;
            temp_end = vaddr + X86_64_PTABLE_SIZE * page_size;
            offset += c * page_size;
            c = X86_64_PTABLE_SIZE;

            // get next copy of frame cap
            struct capref next;
            err = slot_alloc(&next);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
            err = cap_copy(next, frame);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
            frame = next;

            // do the mapping for this full leaf
            if (debug_out) {
                debug_printf("  do_map: slow path: full leaf\n");
            }
            err = do_single_map(pmap, vaddr, temp_end, frame, offset,
                    X86_64_PTABLE_SIZE, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
        }

        // map remaining part
        offset += c * page_size;

        // calculate remaining pages (subtract ptable bits from map_bits to
        // get #ptes of last-level instead of 2nd-to-last).
        c = get_addr_prefix(vend, map_bits-X86_64_PTABLE_BITS) -
            get_addr_prefix(temp_end, map_bits-X86_64_PTABLE_BITS);

        if (c) {
            // get next copy of frame cap
            struct capref next;
            err = slot_alloc(&next);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
            err = cap_copy(next, frame);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }

            if (debug_out) {
                debug_printf("do_map: slow path: last leaf %"PRIu32"\n", c);
            }
            err = do_single_map(pmap, temp_end, vend, next, offset, c, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
        }
    }

    if (retoff) {
        *retoff = offset;
    }
    if (retsize) {
        *retsize = size;
    }
    return SYS_ERR_OK;
}
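
// Worked example (illustrative): mapping 4 MiB of 4 KiB pages at vaddr
// 0x400000. vend = 0x800000, so is_same_pdir() fails (0x400000 >> 21 == 2,
// 0x7fffff >> 21 == 3) and the slow path runs: the first leaf gets
// c = 512 - X86_64_PTABLE_BASE(0x400000) = 512 PTEs (2 MiB), the while
// loop maps one more full leaf of 512 PTEs from a fresh copy of the frame
// cap, and the remaining-part count then computes to 0, so the trailing
// do_single_map() is skipped.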
/// Compute upper limit on number of slabs required to perform a mapping
static size_t max_slabs_for_mapping(size_t bytes)
{
    size_t max_pages = DIVIDE_ROUND_UP(bytes, X86_64_BASE_PAGE_SIZE);
    size_t max_ptable = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    size_t max_pdir = DIVIDE_ROUND_UP(max_ptable, X86_64_PTABLE_SIZE);
    size_t max_pdpt = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE);
    return max_pages + max_ptable + max_pdir + max_pdpt;
}

static size_t max_slabs_for_mapping_large(size_t bytes)
{
    size_t max_pages = DIVIDE_ROUND_UP(bytes, X86_64_LARGE_PAGE_SIZE);
    size_t max_pdir = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    size_t max_pdpt = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE);
    return max_pages + max_pdir + max_pdpt;
}

static size_t max_slabs_for_mapping_huge(size_t bytes)
{
    size_t max_pages = DIVIDE_ROUND_UP(bytes, X86_64_HUGE_PAGE_SIZE);
    size_t max_pdpt = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    return max_pages + max_pdpt;
}
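
// Worked example (illustrative): an 8 MiB mapping with 4 KiB pages needs at
// most 2048 page vnodes, 4 ptables, 1 pdir and 1 pdpt, i.e.
// max_slabs_for_mapping(8 MiB) = 2048 + 4 + 1 + 1 = 2054 slabs. The same
// 8 MiB with 2 MiB pages needs only 4 + 1 + 1 = 6.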
/**
 * \brief Refill slabs used for metadata
 *
 * \param pmap     The pmap to refill in
 * \param request  The number of slabs the allocator must have
 *                 when the function returns
 *
 * When the current pmap is initialized, it reserves some virtual address
 * space for metadata. This reserved address space is used here.
 *
 * Can only be called for the current pmap.
 * Will recursively call into itself till it has enough slabs.
 */
static errval_t refill_slabs(struct pmap_x86 *pmap, size_t request)
{
    errval_t err;

    /* Keep looping till we have #request slabs */
    while (slab_freecount(&pmap->slab) < request) {
        // Amount of bytes required for #request
        size_t bytes = SLAB_STATIC_SIZE(request - slab_freecount(&pmap->slab),
                                        sizeof(struct vnode));

        /* Get a frame of that size */
        struct capref cap;
        err = frame_alloc(&cap, bytes, &bytes);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_FRAME_ALLOC);
        }

        /* If we do not have enough slabs to map the frame in, recurse */
        size_t required_slabs_for_frame = max_slabs_for_mapping(bytes);
        if (slab_freecount(&pmap->slab) < required_slabs_for_frame) {
            // If we recurse, we require more slabs than to map a single page
            assert(required_slabs_for_frame > 4);

            err = refill_slabs(pmap, required_slabs_for_frame);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_SLAB_REFILL);
            }
        }

        /* Perform mapping */
        genvaddr_t genvaddr = pmap->vregion_offset;
        pmap->vregion_offset += (genvaddr_t)bytes;
        assert(pmap->vregion_offset < vregion_get_base_addr(&pmap->vregion) +
               vregion_get_size(&pmap->vregion));

        err = do_map(pmap, genvaddr, cap, 0, bytes,
                     VREGION_FLAGS_READ_WRITE, NULL, NULL);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }

        /* Grow the slab */
        lvaddr_t buf = vspace_genvaddr_to_lvaddr(genvaddr);
        slab_grow(&pmap->slab, (void*)buf, bytes);
    }

    return SYS_ERR_OK;
}
/// Minimally refill the slab allocator
static errval_t min_refill_slabs(struct pmap_x86 *pmap)
{
    return refill_slabs(pmap, 5);
}
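
// Note (assumption, not spelled out above): the constant 5 matches the
// "minimum amount required to map a page" headroom added in map() below,
// i.e. enough vnode slabs for a worst-case single mapping (new pdpt, pdir
// and ptable vnodes plus the page vnode itself) with a little slack.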
/**
 * \brief Create page mappings
 *
 * \param pmap     The pmap object
 * \param vaddr    The virtual address to create the mapping for
 * \param frame    The frame cap to map in
 * \param offset   Offset into the frame cap
 * \param size     Size of the mapping
 * \param flags    Flags for the mapping
 * \param retoff   If non-NULL, filled in with adjusted offset of mapped region
 * \param retsize  If non-NULL, filled in with adjusted size of mapped region
 */
static errval_t map(struct pmap *pmap, genvaddr_t vaddr, struct capref frame,
                    size_t offset, size_t size, vregion_flags_t flags,
                    size_t *retoff, size_t *retsize)
{
    errval_t err;
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    struct frame_identity fi;
    err = invoke_frame_identify(frame, &fi);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_FRAME_IDENTIFY);
    }

    size_t max_slabs;
    // Adjust the parameters to page boundaries
    // TODO: overestimating needed slabs shouldn't hurt much in the long run,
    // and would keep the code easier to read and possibly faster due to less
    // branching
    if ((flags & VREGION_FLAGS_LARGE) &&
        (vaddr & X86_64_LARGE_PAGE_MASK) == 0 &&
        (fi.base & X86_64_LARGE_PAGE_MASK) == 0 &&
        (1UL<<fi.bits) >= offset+size) {
        //case large pages (2MB)
        size += LARGE_PAGE_OFFSET(offset);
        size = ROUND_UP(size, LARGE_PAGE_SIZE);
        offset -= LARGE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping_large(size);
    } else if ((flags & VREGION_FLAGS_HUGE) &&
               (vaddr & X86_64_HUGE_PAGE_MASK) == 0 &&
               (fi.base & X86_64_HUGE_PAGE_MASK) == 0 &&
               (1UL<<fi.bits) >= offset+size) {
        // case huge pages (1GB)
        size += HUGE_PAGE_OFFSET(offset);
        size = ROUND_UP(size, HUGE_PAGE_SIZE);
        offset -= HUGE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping_huge(size);
    } else {
        //case normal pages (4KB)
        size += BASE_PAGE_OFFSET(offset);
        size = ROUND_UP(size, BASE_PAGE_SIZE);
        offset -= BASE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping(size);
    }

    // Refill slab allocator if necessary
    size_t slabs_free = slab_freecount(&x86->slab);

    max_slabs += 5; // minimum amount required to map a page
    if (slabs_free < max_slabs) {
        struct pmap *mypmap = get_current_pmap();
        if (pmap == mypmap) {
            err = refill_slabs(x86, max_slabs);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_SLAB_REFILL);
            }
        } else {
            size_t bytes = SLAB_STATIC_SIZE(max_slabs - slabs_free,
                                            sizeof(struct vnode));
            void *buf = malloc(bytes);
            if (!buf) {
                return LIB_ERR_MALLOC_FAIL;
            }
            slab_grow(&x86->slab, buf, bytes);
        }
    }

    err = do_map(x86, vaddr, frame, offset, size, flags, retoff, retsize);
    return err;
}
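
// Usage sketch (illustrative, names hypothetical): callers normally go
// through the struct pmap function table set up in pmap_funcs below, e.g.
//
//   struct pmap *p = get_current_pmap();
//   size_t retsize;
//   err = p->f.map(p, my_vaddr, my_frame, /*offset*/ 0, my_bytes,
//                  VREGION_FLAGS_READ_WRITE, NULL, &retsize);
//
// my_vaddr, my_frame and my_bytes stand in for values obtained elsewhere
// (e.g. from determine_addr_raw() and frame_alloc()).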
/**
 * \brief Find mapping for `vaddr` in `pmap`.
 * \arg pmap the pmap to search in
 * \arg vaddr the virtual address to search for
 * \arg outpt the last-level page table meta-data we found, if any
 * \arg outpage the page meta-data we found, if any
 * \returns `true` iff we found a mapping for vaddr
 */
static bool find_mapping(struct pmap_x86 *pmap, genvaddr_t vaddr,
                         struct vnode **outpt, struct vnode **outpage)
{
    struct vnode *pdpt = NULL, *pdir = NULL, *pt = NULL, *page = NULL;

    // find page and last-level page table (can be pdir or pdpt)
    if ((pdpt = find_pdpt(pmap, vaddr)) != NULL) {
        page = find_vnode(pdpt, X86_64_PDPT_BASE(vaddr));
        if (page && page->is_vnode) { // not 1G pages
            pdir = page;
            page = find_vnode(pdir, X86_64_PDIR_BASE(vaddr));
            if (page && page->is_vnode) { // not 2M pages
                pt = page;
                page = find_vnode(pt, X86_64_PTABLE_BASE(vaddr));
            } else {
                pt = pdir;
            }
        } else {
            pt = pdpt;
        }
    }

    if (outpt) {
        *outpt = pt;
    }
    if (outpage) {
        *outpage = page;
    }
    return page != NULL;
}
static errval_t do_single_unmap(struct pmap_x86 *pmap, genvaddr_t vaddr,
                                size_t pte_count, bool delete_cap)
{
    errval_t err;
    struct vnode *pt = NULL, *page = NULL;

    if (!find_mapping(pmap, vaddr, &pt, &page)) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }
    assert(pt && pt->is_vnode && page && !page->is_vnode);

    if (page->u.frame.pte_count == pte_count) {
        err = vnode_unmap(pt->u.vnode.cap, page->u.frame.cap, page->entry,
                          page->u.frame.pte_count);
        if (err_is_fail(err)) {
            printf("vnode_unmap returned error: %s (%d)\n",
                    err_getstring(err), err_no(err));
            return err_push(err, LIB_ERR_VNODE_UNMAP);
        }

        // Free up the resources
        if (delete_cap) {
            err = cap_destroy(page->u.frame.cap);
            if (err_is_fail(err)) {
                printf("delete_cap\n");
                return err_push(err, LIB_ERR_PMAP_DO_SINGLE_UNMAP);
            }
        }
        remove_vnode(pt, page);
        slab_free(&pmap->slab, page);
    }

    return SYS_ERR_OK;
}
static inline bool is_large_page(struct vnode *p)
{
    return !p->is_vnode && p->u.frame.flags & VREGION_FLAGS_LARGE;
}

static inline bool is_huge_page(struct vnode *p)
{
    return !p->is_vnode && p->u.frame.flags & VREGION_FLAGS_HUGE;
}
/**
 * \brief Remove page mappings
 *
 * \param pmap     The pmap object
 * \param vaddr    The start of the virtual region to remove
 * \param size     The size of virtual region to remove
 * \param retsize  If non-NULL, filled in with the actual size removed
 */
static errval_t unmap(struct pmap *pmap, genvaddr_t vaddr, size_t size,
                      size_t *retsize)
{
    //printf("[unmap] 0x%"PRIxGENVADDR", %zu\n", vaddr, size);
    errval_t err, ret = SYS_ERR_OK;
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    //determine if we unmap a larger page
    struct vnode* page = NULL;

    if (!find_mapping(x86, vaddr, NULL, &page)) {
        //TODO: better error --> LIB_ERR_PMAP_NOT_MAPPED
        return LIB_ERR_PMAP_UNMAP;
    }

    assert(!page->is_vnode);

    size_t page_size = X86_64_BASE_PAGE_SIZE;
    size_t table_base = X86_64_PTABLE_BASE(vaddr);
    uint8_t map_bits = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS;
    if (is_large_page(page)) {
        //large 2M page
        page_size = X86_64_LARGE_PAGE_SIZE;
        table_base = X86_64_PDIR_BASE(vaddr);
        map_bits = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS;
    } else if (is_huge_page(page)) {
        //huge 1GB page
        page_size = X86_64_HUGE_PAGE_SIZE;
        table_base = X86_64_PDPT_BASE(vaddr);
        map_bits = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS;
    }
    if (page->entry > table_base) {
        debug_printf("trying to partially unmap region\n");
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    // TODO: match new policy of map when implemented
    size = ROUND_UP(size, page_size);
    genvaddr_t vend = vaddr + size;

    if (is_same_pdir(vaddr, vend) ||
        (is_same_pdpt(vaddr, vend) && is_large_page(page)) ||
        (is_same_pml4(vaddr, vend) && is_huge_page(page)))
    {
        // fast path
        err = do_single_unmap(x86, vaddr, size / page_size, false);
        if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
            printf("error fast path\n");
            return err_push(err, LIB_ERR_PMAP_UNMAP);
        }
    }
    else { // slow path
        // unmap first leaf
        uint32_t c = X86_64_PTABLE_SIZE - table_base;

        err = do_single_unmap(x86, vaddr, c, false);
        if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
            printf("error first leaf\n");
            return err_push(err, LIB_ERR_PMAP_UNMAP);
        }

        // unmap full leaves
        vaddr += c * page_size;
        while (get_addr_prefix(vaddr, map_bits) < get_addr_prefix(vend, map_bits)) {
            c = X86_64_PTABLE_SIZE;
            err = do_single_unmap(x86, vaddr, X86_64_PTABLE_SIZE, true);
            if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
                printf("error while loop\n");
                return err_push(err, LIB_ERR_PMAP_UNMAP);
            }
            vaddr += c * page_size;
        }

        // unmap remaining part
        // subtracting ptable bits from map_bits to get #ptes in last-level table
        // instead of 2nd-to-last.
        c = get_addr_prefix(vend, map_bits-X86_64_PTABLE_BITS) -
            get_addr_prefix(vaddr, map_bits-X86_64_PTABLE_BITS);
        assert(c < X86_64_PTABLE_SIZE);
        if (c) {
            err = do_single_unmap(x86, vaddr, c, true);
            if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
                printf("error remaining part\n");
                return err_push(err, LIB_ERR_PMAP_UNMAP);
            }
        }
    }

    if (retsize) {
        *retsize = size;
    }

    //printf("[unmap] exiting\n");
    return ret;
}
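
// Note (observation from the code above): the fast-path and first-leaf
// unmaps pass delete_cap == false because they operate on the caller's
// original frame capability, while the full-leaf and remaining-part unmaps
// pass true, matching do_map()'s scheme of creating a fresh cap_copy() of
// the frame for every leaf table after the first.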
static errval_t do_single_modify_flags(struct pmap_x86 *pmap, genvaddr_t vaddr,
                                       size_t pages, vregion_flags_t flags)
{
    errval_t err = SYS_ERR_OK;

    struct vnode *pt = NULL, *page = NULL;

    if (!find_mapping(pmap, vaddr, &pt, &page)) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }
    assert(pt && pt->is_vnode && page && !page->is_vnode);

    uint16_t ptentry = X86_64_PTABLE_BASE(vaddr);
    size_t pagesize = BASE_PAGE_SIZE;
    if (is_large_page(page)) {
        //large 2M page
        ptentry = X86_64_PDIR_BASE(vaddr);
        pagesize = LARGE_PAGE_SIZE;
    } else if (is_huge_page(page)) {
        //huge 1GB page
        ptentry = X86_64_PDPT_BASE(vaddr);
        pagesize = HUGE_PAGE_SIZE;
    }

    if (inside_region(pt, ptentry, pages)) {
        // we're modifying part of a valid mapped region
        // arguments to invocation: invoke frame cap, first affected
        // page (as offset from first page in mapping), #affected
        // pages, new flags. Invocation mask flags based on capability
        // access permissions.
        size_t off = ptentry - page->entry;
        paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags);
        // calculate TLB flushing hint
        genvaddr_t va_hint = 0;
        if (pages == 1) {
            // do assisted selective flush for single page
            va_hint = vaddr & ~X86_64_BASE_PAGE_MASK;
        }
        err = invoke_frame_modify_flags(page->u.frame.cap, off, pages,
                                        pmap_flags, va_hint);
        return err;
    } else {
        // overlaps some region border
        // XXX: need better error
        return LIB_ERR_PMAP_EXISTING_MAPPING;
    }

    return SYS_ERR_OK;
}
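
// Note: only single-page changes get the selective-flush hint; presumably
// (assumption, not verified against the kernel side) a va_hint of 0 makes
// the kernel fall back to a coarser, full TLB flush.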
/**
 * \brief Modify page mapping
 *
 * \param pmap     The pmap object
 * \param vaddr    The first virtual address for which to change the flags
 * \param size     The length of the region to change in bytes
 * \param flags    New flags for the mapping
 * \param retsize  If non-NULL, filled in with the actual size modified
 */
static errval_t modify_flags(struct pmap *pmap, genvaddr_t vaddr, size_t size,
                             vregion_flags_t flags, size_t *retsize)
{
    errval_t err;
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    //determine if we modify a larger page
    struct vnode* page = NULL;

    if (!find_mapping(x86, vaddr, NULL, &page)) {
        return LIB_ERR_PMAP_NOT_MAPPED;
    }

    assert(page && !page->is_vnode);

    size_t page_size = X86_64_BASE_PAGE_SIZE;
    size_t table_base = X86_64_PTABLE_BASE(vaddr);
    uint8_t map_bits = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS;
    if (is_large_page(page)) {
        //large 2M page
        page_size = X86_64_LARGE_PAGE_SIZE;
        table_base = X86_64_PDIR_BASE(vaddr);
        map_bits = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS;
    } else if (is_huge_page(page)) {
        //huge 1GB page
        page_size = X86_64_HUGE_PAGE_SIZE;
        table_base = X86_64_PDPT_BASE(vaddr);
        map_bits = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS;
    }

    // TODO: match new policy of map when implemented
    size = ROUND_UP(size, page_size);
    genvaddr_t vend = vaddr + size;

    size_t pages = size / page_size;

    // vaddr and vend specify begin and end of the region (inside a mapping)
    // that should receive the new set of flags
    if (is_same_pdir(vaddr, vend) ||
        (is_same_pdpt(vaddr, vend) && is_large_page(page)) ||
        (is_same_pml4(vaddr, vend) && is_huge_page(page))) {
        // fast path
        err = do_single_modify_flags(x86, vaddr, pages, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
        }
    }
    else { // slow path
        // modify first part
        uint32_t c = X86_64_PTABLE_SIZE - table_base;
        err = do_single_modify_flags(x86, vaddr, c, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
        }

        // modify full leaves
        vaddr += c * page_size;
        while (get_addr_prefix(vaddr, map_bits) < get_addr_prefix(vend, map_bits)) {
            c = X86_64_PTABLE_SIZE;
            err = do_single_modify_flags(x86, vaddr, X86_64_PTABLE_SIZE, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
            }
            vaddr += c * page_size;
        }

        // modify remaining part
        c = get_addr_prefix(vend, map_bits-X86_64_PTABLE_BITS) -
            get_addr_prefix(vaddr, map_bits-X86_64_PTABLE_BITS);
        if (c) {
            err = do_single_modify_flags(x86, vaddr, c, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
            }
        }
    }

    if (retsize) {
        *retsize = size;
    }

    //printf("[modify_flags] exiting\n");
    return SYS_ERR_OK;
}
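
// Usage sketch (illustrative): to make an already-mapped, writable region
// read-only one could call
//
//   err = p->f.modify_flags(p, region_base, region_bytes,
//                           VREGION_FLAGS_READ, &retsize);
//
// vregion_to_pmap_flag() then leaves PTABLE_READ_WRITE cleared for the
// affected PTEs. region_base/region_bytes are placeholders for the
// caller's values.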
/**
 * \brief Query existing page mapping
 *
 * \param pmap      The pmap object
 * \param vaddr     The virtual address to query
 * \param retvaddr  Returns the base virtual address of the mapping
 * \param retsize   Returns the actual size of the mapping
 * \param retcap    Returns the cap mapped at this address
 * \param retoffset Returns the offset within the cap that is mapped
 * \param retflags  Returns the flags for this mapping
 *
 * All of the ret parameters are optional.
 */
static errval_t lookup(struct pmap *pmap, genvaddr_t vaddr,
                       genvaddr_t *retvaddr, size_t *retsize,
                       struct capref *retcap, genvaddr_t *retoffset,
                       vregion_flags_t *retflags)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    uint32_t base = X86_64_PTABLE_BASE(vaddr);
    // Find the page table
    struct vnode *ptable = find_ptable(x86, vaddr);
    if (ptable == NULL) {
        ptable = find_pdir(x86, vaddr);
        if (ptable == NULL) {
            return LIB_ERR_PMAP_FIND_VNODE;
        }
        base = X86_64_PDIR_BASE(vaddr);
    }

    // Find the page
    struct vnode *vn = find_vnode(ptable, base);
    if (vn == NULL) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    if (retvaddr) {
        *retvaddr = vaddr & ~(genvaddr_t)BASE_PAGE_MASK;
    }

    if (retsize) {
        *retsize = BASE_PAGE_SIZE;
    }

    if (retcap) {
        *retcap = vn->u.frame.cap;
    }

    if (retoffset) {
        *retoffset = vn->u.frame.offset;
    }

    if (retflags) {
        *retflags = vn->u.frame.flags;
    }

    return SYS_ERR_OK;
}
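
// Note (observation): lookup() reports results at base-page granularity
// only; retvaddr is rounded to BASE_PAGE_SIZE and retsize is always
// BASE_PAGE_SIZE, even when the underlying mapping was created with
// 2 MiB or 1 GiB pages.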
static errval_t dump(struct pmap *pmap, struct pmap_dump_info *buf, size_t buflen, size_t *items_written)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;
    struct pmap_dump_info *buf_ = buf;

    struct vnode *pml4 = &x86->root;
    struct vnode *pdpt, *pdir, *pt, *frame;
    assert(pml4 != NULL);

    *items_written = 0;

    // iterate over PML4 entries
    size_t pml4_index, pdpt_index, pdir_index;
    for (pdpt = pml4->u.vnode.children; pdpt != NULL; pdpt = pdpt->next) {
        pml4_index = pdpt->entry;
        // iterate over pdpt entries
        for (pdir = pdpt->u.vnode.children; pdir != NULL; pdir = pdir->next) {
            pdpt_index = pdir->entry;
            // iterate over pdir entries
            for (pt = pdir->u.vnode.children; pt != NULL; pt = pt->next) {
                pdir_index = pt->entry;
                // iterate over pt entries
                for (frame = pt->u.vnode.children; frame != NULL; frame = frame->next) {
                    if (*items_written < buflen) {
                        buf_->pml4_index = pml4_index;
                        buf_->pdpt_index = pdpt_index;
                        buf_->pdir_index = pdir_index;
                        buf_->pt_index = frame->entry;
                        buf_->cap = frame->u.frame.cap;
                        buf_->offset = frame->u.frame.offset;
                        buf_->flags = frame->u.frame.flags;
                        buf_++;
                        (*items_written)++;
                    }
                }
            }
        }
    }
    return SYS_ERR_OK;
}
static errval_t determine_addr_raw(struct pmap *pmap, size_t size,
                                   size_t alignment, genvaddr_t *retvaddr)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    struct vnode *walk_pml4 = x86->root.u.vnode.children;
    assert(walk_pml4 != NULL); // assume there's always at least one existing entry

    if (alignment == 0) {
        alignment = BASE_PAGE_SIZE;
    } else {
        alignment = ROUND_UP(alignment, BASE_PAGE_SIZE);
    }
    size = ROUND_UP(size, alignment);
    assert(size < 512ul * 1024 * 1024 * 1024); // pml4 size

    // try to find free pml4 entry
    bool f[512];
    for (int i = 0; i < 512; i++) {
        f[i] = true;
    }

    //debug_printf("entry: %d\n", walk_pml4->entry);
    f[walk_pml4->entry] = false;
    while (walk_pml4) {
        //debug_printf("looping over pml4 entries\n");
        assert(walk_pml4->is_vnode);
        f[walk_pml4->entry] = false;
        walk_pml4 = walk_pml4->next;
    }
    genvaddr_t first_free = 16;
    for (; first_free < 512; first_free++) {
        //debug_printf("f[%"PRIuGENVADDR"] = %d\n", first_free, f[first_free]);
        if (f[first_free]) {
            break;
        }
    }
    //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free);
    if (first_free < 512) {
        //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free);
        *retvaddr = first_free << 39;
        return SYS_ERR_OK;
    } else {
        return LIB_ERR_OUT_OF_VIRTUAL_ADDR;
    }
}
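
// Example (illustrative): each PML4 entry covers 1 << 39 bytes = 512 GiB,
// so returning first_free << 39 hands out an entire, currently unused,
// 512 GiB-aligned slice of the address space. The search starts at entry
// 16 (virtual address 16 * 512 GiB = 8 TiB), leaving the lower entries
// untouched.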
static struct pmap_funcs pmap_funcs = {
    .determine_addr = pmap_x86_determine_addr,
    .determine_addr_raw = determine_addr_raw,
    .map = map,
    .unmap = unmap,
    .lookup = lookup,
    .modify_flags = modify_flags,
    .serialise = pmap_x86_serialise,
    .deserialise = pmap_x86_deserialise,
    .dump = dump,
};
/**
 * \brief Initialize a x86 pmap object
 *
 * \param pmap Pmap object of type x86
 */
errval_t pmap_x86_64_init(struct pmap *pmap, struct vspace *vspace,
                          struct capref vnode,
                          struct slot_allocator *opt_slot_alloc)
{
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    /* Generic portion */
    pmap->f = pmap_funcs;
    pmap->vspace = vspace;

    if (opt_slot_alloc != NULL) {
        pmap->slot_alloc = opt_slot_alloc;
    } else { /* use default allocator for this dispatcher */
        pmap->slot_alloc = get_default_slot_allocator();
    }

    /* x86 specific portion */
    slab_init(&x86->slab, sizeof(struct vnode), NULL);
    slab_grow(&x86->slab, x86->slab_buffer,
              sizeof(x86->slab_buffer));
    x86->refill_slabs = min_refill_slabs;

    x86->root.is_vnode = true;
    x86->root.u.vnode.cap = vnode;
    x86->root.u.vnode.children = NULL;
    x86->root.next = NULL;

    // choose a minimum mappable VA for most domains; enough to catch NULL
    // pointer derefs with suitably large offsets
    x86->min_mappable_va = 64 * 1024;

    // maximum mappable VA is derived from X86_64_MEMORY_OFFSET in kernel
    x86->max_mappable_va = (genvaddr_t)0xffffff8000000000;

    return SYS_ERR_OK;
}
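
// Note: 0xffffff8000000000 is the top of the 64-bit address space minus
// 1 << 39 bytes, i.e. everything below the last PML4 entry; that final
// 512 GiB slot is presumably left to the kernel, per the
// X86_64_MEMORY_OFFSET comment above.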
/**
 * \brief Initialize the current pmap. Reserve space for metadata
 *
 * This code is coupled with #vspace_current_init()
 */
errval_t pmap_x86_64_current_init(bool init_domain)
{
    struct pmap_x86 *x86 = (struct pmap_x86*)get_current_pmap();

    // To reserve a block of virtual address space, a vregion representing
    // the address space is required. We construct a superficial one here
    // and add it to the vregion list.
    struct vregion *vregion = &x86->vregion;
    vregion->vspace = NULL;
    vregion->memobj = NULL;
    vregion->base = META_DATA_RESERVED_BASE;
    vregion->offset = 0;
    vregion->size = META_DATA_RESERVED_SIZE;
    vregion->next = NULL;

    struct vspace *vspace = x86->p.vspace;
    assert(!vspace->head);
    vspace->head = vregion;

    x86->vregion_offset = x86->vregion.base;

    // We don't know the vnode layout for the first part of our address space
    // (which was set up by the kernel), so we avoid mapping there until we
    // are told about it.
    x86->min_mappable_va = META_DATA_RESERVED_BASE;