/**
 * \file
 * \brief x86-64 architecture initialization.
 */

/*
 * Copyright (c) 2007, 2008, 2009, 2010, 2011, ETH Zurich.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Haldeneggsteig 4, CH-8092 Zurich. Attn: Systems Group.
 */
#include <paging_kernel_arch.h>
#include <kernel_multiboot.h>
#include <getopt/getopt.h>
#include <arch/x86/conio.h>
#include <arch/x86/pic.h>
#include <arch/x86/apic.h>
#include <arch/x86/mcheck.h>
#include <arch/x86/perfmon.h>
#include <arch/x86/rtc.h>
#include <target/x86/barrelfish_kpi/coredata_target.h>
#include <arch/x86/timing.h>
#include <arch/x86/startup_x86.h>
#include <arch/x86/start_aps.h>
#include <arch/x86/ipi_notify.h>
#include <barrelfish_kpi/cpu_arch.h>
#include <target/x86_64/barrelfish_kpi/cpu_target.h>
#include <dev/xapic_dev.h> // XXX
#include <dev/ia32_dev.h>
#include <dev/amd64_dev.h>
/**
 * Used to store the address of the global struct passed during boot,
 * across kernel relocation.
 */
static uint64_t addr_global;
/**
 * \brief Kernel stack.
 *
 * This is the one and only kernel stack for a kernel instance.
 */
uintptr_t x86_64_kernel_stack[X86_64_KERNEL_STACK_SIZE/sizeof(uintptr_t)];
/**
 * \brief Global Task State Segment (TSS).
 *
 * This is the global, static and only Task State Segment (TSS). It is used
 * for interrupt and exception handling (stack setup) while in user-space.
 */
static struct task_state_segment tss __attribute__ ((aligned (4)));
/**
 * \brief Global Descriptor Table (GDT) for the processor this kernel is
 * running on.
 *
 * This descriptor table is completely static, as segments are basically
 * turned off in 64-bit mode. They map flat-mode code and stack segments for
 * both kernel- and user-space and the only Task State Segment (TSS).
 */
union segment_descriptor gdt[] __attribute__ ((aligned (4))) = {
    [NULL_SEL] = {                  // Null segment
        .raw = 0
    },
    [KCODE_SEL] = {                 // Kernel code segment
        .d = {
            .privilege_level = SEL_KPL,
        }
    },
    [KSTACK_SEL] = {                // Kernel stack segment
        .d = {
            .privilege_level = SEL_KPL,
        }
    },
    [USTACK_SEL] = {                // User stack segment
        .d = {
            .privilege_level = SEL_UPL,
        }
    },
    [UCODE_SEL] = {                 // User code segment
        .d = {
            .privilege_level = SEL_UPL,
        }
    },
    [TSS_LO_SEL] = {                // Global Task State Segment (TSS), lower 8 bytes
        .sys_lo = {
            .lo_limit = sizeof(tss) & 0xffff,
            .privilege_level = SEL_KPL,
            .hi_limit = (sizeof(tss) >> 16) & 0xf,
        }
    },
    [TSS_HI_SEL] = {                // Global Task State Segment (TSS), upper 8 bytes
        .sys_hi = {
            .base = 0               // base address inserted by gdt_reset()
        }
    },
    [LDT_LO_SEL] = {                // Local descriptor table (LDT), lower 8 bytes
        .sys_lo = {
            .lo_limit = 0,          // # 4k pages (since granularity = 1)
            .lo_base = 0,           // changed by context switch path when doing lldt
            .privilege_level = SEL_UPL,
        }
    },
    [LDT_HI_SEL] = {                // Local descriptor table (LDT), upper 8 bytes
        .sys_hi = {
            .base = 0               // changed by context switch path when doing lldt
        }
    },
};

union segment_descriptor *ldt_descriptor = &gdt[LDT_LO_SEL];
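/*
 * Illustrative sketch (not part of the boot path): a segment selector, as
 * later loaded into CS or SS, is the descriptor's GDT index shifted left
 * by 3, with the table-indicator bit (bit 2; 0 = GDT) clear and the
 * requested privilege level in bits 1..0 -- presumably what the GSEL()
 * macro used throughout this file composes. The helper name below is
 * hypothetical, for illustration only.
 */
static inline uint16_t example_gdt_selector(uint16_t index, uint16_t rpl)
{
    // e.g. index = KCODE_SEL, rpl = SEL_KPL for the kernel code selector
    return (uint16_t)((index << 3) | rpl);
}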
/**
 * Bootup PML4, used to map both low (identity-mapped) memory and relocated
 * memory at the same time.
 */
static union x86_64_pdir_entry boot_pml4[PTABLE_SIZE]
__attribute__ ((aligned(BASE_PAGE_SIZE)));
/**
 * Bootup low-map PDPT and hi-map PDPT.
 */
static union x86_64_pdir_entry boot_pdpt[PTABLE_SIZE]
__attribute__ ((aligned(BASE_PAGE_SIZE))),
    boot_pdpt_hi[PTABLE_SIZE] __attribute__ ((aligned(BASE_PAGE_SIZE)));
/**
 * Bootup low-map PDIR, hi-map PDIR, and 1GB PDIR.
 */
static union x86_64_ptable_entry boot_pdir[PTABLE_SIZE]
__attribute__ ((aligned(BASE_PAGE_SIZE))),
    boot_pdir_hi[PTABLE_SIZE] __attribute__ ((aligned(BASE_PAGE_SIZE))),
    boot_pdir_1GB[PTABLE_SIZE] __attribute__ ((aligned(BASE_PAGE_SIZE)));
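/*
 * Note on the boot tables above: with x86-64 4-level paging, each table
 * level holds 512 entries of 8 bytes (assuming PTABLE_SIZE is the
 * canonical 512), i.e. exactly one 4 KByte page -- hence the
 * BASE_PAGE_SIZE alignment, which the hardware requires for page-table
 * bases.
 */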
/**
 * This flag is set to true once the IDT is initialized and exceptions can be
 * caught.
 */
bool idt_initialized = false;
/**
 * \brief Setup bootup page table.
 *
 * This function sets up the page table needed to boot the kernel
 * proper. The table identity-maps the first 1 GByte of physical
 * memory in order to have access to various data structures and the
 * first MByte containing bootloader-passed data structures. It also
 * identity-maps the local copy of the kernel in low memory and
 * aliases it in kernel address space.
 *
 * \param base  Start address of kernel image in physical address space.
 * \param size  Size of kernel image.
 */
static void paging_init(lpaddr_t base, size_t size)
{
    lvaddr_t vbase = local_phys_to_mem(base);

    // Align base to kernel page size
    if(base & X86_64_MEM_PAGE_MASK) {
        size += base & X86_64_MEM_PAGE_MASK;
        base -= base & X86_64_MEM_PAGE_MASK;
    }

    // Align vbase to kernel page size
    if(vbase & X86_64_MEM_PAGE_MASK) {
        vbase -= vbase & X86_64_MEM_PAGE_MASK;
    }

    // Align size to kernel page size
    if(size & X86_64_MEM_PAGE_MASK) {
        size += X86_64_MEM_PAGE_SIZE - (size & X86_64_MEM_PAGE_MASK);
    }

    // XXX: Cannot currently map more than one table of pages
    assert(size <= X86_64_MEM_PAGE_SIZE * X86_64_PTABLE_SIZE);
    /* assert(size <= MEM_PAGE_SIZE); */

    for(size_t i = 0; i < size; i += X86_64_MEM_PAGE_SIZE,
            base += X86_64_MEM_PAGE_SIZE, vbase += X86_64_MEM_PAGE_SIZE) {
        // No kernel image above 4 GByte
        assert(base < ((lpaddr_t)4 << 30));

        // Identity-map the kernel's physical region, so we don't lose ground
        paging_x86_64_map_table(&boot_pml4[X86_64_PML4_BASE(base)], (lpaddr_t)boot_pdpt);
        paging_x86_64_map_table(&boot_pdpt[X86_64_PDPT_BASE(base)], (lpaddr_t)boot_pdir);
        paging_x86_64_map_large(&boot_pdir[X86_64_PDIR_BASE(base)], base, PTABLE_PRESENT
                                | PTABLE_READ_WRITE | PTABLE_USER_SUPERVISOR);

        // Alias the same region at MEMORY_OFFSET
        paging_x86_64_map_table(&boot_pml4[X86_64_PML4_BASE(vbase)], (lpaddr_t)boot_pdpt_hi);
        paging_x86_64_map_table(&boot_pdpt_hi[X86_64_PDPT_BASE(vbase)], (lpaddr_t)boot_pdir_hi);
        paging_x86_64_map_large(&boot_pdir_hi[X86_64_PDIR_BASE(vbase)], base, PTABLE_PRESENT
                                | PTABLE_READ_WRITE | PTABLE_USER_SUPERVISOR);
    }

    // Identity-map the first 1 GByte of physical memory for bootloader data
    paging_x86_64_map_table(&boot_pml4[0], (lpaddr_t)boot_pdpt);
    paging_x86_64_map_table(&boot_pdpt[0], (lpaddr_t)boot_pdir_1GB);
    for (int i = 0; i < X86_64_PTABLE_SIZE; i++) {
        paging_x86_64_map_large(&boot_pdir_1GB[X86_64_PDIR_BASE(X86_64_MEM_PAGE_SIZE * i)],
                                X86_64_MEM_PAGE_SIZE * i, PTABLE_PRESENT
                                | PTABLE_READ_WRITE | PTABLE_USER_SUPERVISOR);
    }

    // Activate new page tables
    paging_x86_64_context_switch((lpaddr_t)boot_pml4);
}
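/*
 * Illustrative sketch (not used by the boot path): the X86_64_*_BASE()
 * macros above select the canonical x86-64 table indices, 9 bits per
 * level. Hypothetical equivalents, for illustration only:
 */
static inline size_t example_pml4_index(uint64_t addr) { return (addr >> 39) & 0x1ff; }
static inline size_t example_pdpt_index(uint64_t addr) { return (addr >> 30) & 0x1ff; }
static inline size_t example_pdir_index(uint64_t addr) { return (addr >> 21) & 0x1ff; }
// The remaining 21 bits are the offset within one 2 MByte large page.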
/**
 * \brief Setup default GDT.
 *
 * Loads the GDT register with the default GDT and reloads CS and SS
 * to point to the new entries. Resets all other segment registers to null.
 * Finally, completes setup of GDT to include TSS base address mapping and
 * loads TSS into task register.
 */
static void gdt_reset(void)
{
    lvaddr_t ptss = (lvaddr_t)&tss;
    struct region_descriptor region = {
        .rd_limit = sizeof(gdt),
        .rd_base = (uint64_t)&gdt
    };

    // Load default GDT
    __asm volatile("lgdt %[region]" :: [region] "m" (region));

    // Reload segments
    __asm volatile("mov %[null], %%ds      \n\t"
                   "mov %[null], %%es      \n\t"
                   "mov %[ss], %%ss        \n\t"
                   "mov %[null], %%gs      \n\t"
                   "mov %[null], %%fs      \n\t"
                   "pushq %[cs]            \n\t"          // new CS
                   "lea 1f(%%rip), %%rax   \n\t"          // jumps to after lret
                   "pushq %%rax            \n\t"          // new IP
                   "lretq                  \n\t"          // fake return
                   "1:                     \n\t"          // we'll continue here
                   : /* No output */
                   : [null] "r" (0),
                     [ss] "r" (GSEL(KSTACK_SEL, SEL_KPL)),
                     [cs] "i" (GSEL(KCODE_SEL, SEL_KPL))
                   : "rax"
                   );

    // Complete setup of TSS descriptor (by inserting base address of TSS)
    gdt[TSS_LO_SEL].sys_lo.lo_base = ptss & 0xffffff;
    gdt[TSS_LO_SEL].sys_lo.hi_base = (ptss >> 24) & 0xff;
    gdt[TSS_HI_SEL].sys_hi.base = ptss >> 32;

    // Complete setup of TSS
    tss.rsp[0] = (lvaddr_t)&x86_64_kernel_stack[X86_64_KERNEL_STACK_SIZE / sizeof(uintptr_t)];

    // Load task state register
    __asm volatile("ltr %%ax" :: "a" (GSEL(TSS_LO_SEL, SEL_KPL)));
}
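/*
 * Worked example for the TSS descriptor setup above: a 64-bit system
 * descriptor spreads its base address across two consecutive GDT slots.
 * For, say, ptss = 0xffffff801234abcd:
 *   sys_lo.lo_base = 0x34abcd     (bits 23..0)
 *   sys_lo.hi_base = 0x12         (bits 31..24)
 *   sys_hi.base    = 0xffffff80   (bits 63..32)
 * tss.rsp[0] points one past the end of x86_64_kernel_stack because the
 * x86 stack grows downward. Strictly speaking, the GDTR limit should be
 * sizeof(gdt) - 1 (the offset of the last valid byte); using sizeof(gdt)
 * over-reports by one byte, which is harmless here.
 */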
/**
 * \brief Relocates the active stack.
 *
 * This function relocates the stack by adding 'offset' to the stack
 * pointer.
 *
 * \param offset        Offset to add to the stack pointer.
 */
static inline void __attribute__ ((always_inline))
relocate_stack(lvaddr_t offset)
{
    __asm volatile("add %[stack], %%rsp\n\t"
                   : /* No output */
                   : [stack] "er" (offset)
                   );
}
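/*
 * Note: relocate_stack() only works because it is forced inline. As a
 * real call it would have to return through an address pushed on the
 * pre-relocation stack after RSP has already been moved; inlined, it
 * compiles to a single add to RSP with no stack traffic of its own.
 */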
/**
 * \brief Enable SYSCALL/SYSRET fast system calls.
 *
 * This function enables the SYSCALL/SYSRET pair of fast system calls in
 * long mode. Also sets the IA32_STAR and IA32_FMASK MSRs to point to the
 * user-space base selector and RFLAGS mask for SYSCALL/SYSRET fast system
 * calls.
 */
static inline void enable_fast_syscalls(void)
{
    // Segment selector bases for both kernel- and user-space for fast
    // system calls
    ia32_star_t star = ia32_star_rd(NULL);
    star = ia32_star_call_insert(star, GSEL(KCODE_SEL,  SEL_KPL));
    star = ia32_star_ret_insert (star, GSEL(KSTACK_SEL, SEL_UPL));
    ia32_star_wr(NULL, star);

    // Set IA32_LSTAR MSR to point to kernel-space system call multiplexer
    ia32_lstar_wr(NULL, (lvaddr_t)syscall_entry);

    // Set IA32_FMASK MSR to our OS's RFLAGS mask
    // We mask out everything (including interrupts).
    ia32_fmask_v_wrf(NULL, ~(RFLAGS_ALWAYS1));

    // Enable fast system calls
    ia32_efer_sce_wrf(NULL, 1);
}
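/*
 * Why the 'ret' base above is GSEL(KSTACK_SEL, SEL_UPL): on SYSRET to
 * 64-bit mode the CPU loads CS from STAR[63:48] + 16 and SS from
 * STAR[63:48] + 8. With KSTACK_SEL, USTACK_SEL and UCODE_SEL occupying
 * consecutive GDT slots (8 bytes apart), that lands exactly on the user
 * code and user stack segments. SYSCALL symmetrically takes kernel CS
 * from STAR[47:32] (the 'call' field) and kernel SS from STAR[47:32] + 8,
 * i.e. KCODE_SEL and KSTACK_SEL.
 */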
static inline void enable_tlb_flush_filter(void)
{
    uint32_t eax, ebx, ecx, edx;

    // Must read "AuthenticAMD"
    cpuid(0, &eax, &ebx, &ecx, &edx);
    if(ebx != 0x68747541 || ecx != 0x444d4163 || edx != 0x69746e65) {
        return;
    }

    // Is at least family 0fh?
    cpuid(1, &eax, &ebx, &ecx, &edx);
    if(((eax >> 8) & 0xf) != 0xf) {
        return;
    }

    debug(SUBSYS_STARTUP, "Enabling TLB flush filter\n");
    ia32_amd_hwcr_ffdis_wrf(NULL, 1);
}
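/*
 * The vendor check above compares "AuthenticAMD" as three little-endian
 * 32-bit words: ebx = "Auth" = 0x68747541, edx = "enti" = 0x69746e65,
 * ecx = "cAMD" = 0x444d4163. Note that in AMD's documentation HWCR.FFDIS
 * is the TLB flush filter *disable* bit, so writing 1 here turns the
 * filter off on family 0fh parts, the wording of the debug message
 * notwithstanding.
 */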
static inline void enable_monitor_mwait(void)
{
    uint32_t eax, ebx, ecx, edx;

    if (has_monitor_mwait()) {
        cpuid(5, &eax, &ebx, &ecx, &edx);
        debug(SUBSYS_STARTUP, "MONITOR/MWAIT supported: "
              "min size %u bytes, max %u bytes. %s %s\n",
              eax, ebx, (ecx & 2) ? "IBE" : "", (ecx & 1) ? "EMX" : "");
    } else {
        debug(SUBSYS_STARTUP, "MONITOR/MWAIT are not supported.\n");
    }
}
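/*
 * CPUID leaf 5 semantics for the debug output above: EAX and EBX report
 * the smallest and largest MONITOR line size in bytes; ECX bit 0 (EMX)
 * flags support for enumerating MONITOR/MWAIT extensions, and ECX bit 1
 * (IBE) the ability to break from MWAIT on an interrupt even when
 * interrupts are masked.
 */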
/**
 * \brief Continue kernel initialization in kernel address space.
 *
 * This function resets paging to map out low memory and map in physical
 * address space, relocating all remaining data structures. It resets the
 * Global Descriptor Table for flat mode and to exclude legacy segments from
 * boot initialization code. It sets up the IDT for exception and interrupt
 * handling, initializes the local APIC and enables interrupts. After that it
 * calls kernel_startup(), which should not return (if it does, this function
 * halts the kernel).
 */
static void __attribute__ ((noreturn, noinline)) text_init(void)
{
    // Reset global and locks to point to the memory in the pristine image
    global = (struct global*)addr_global;

    /*
     * Reset paging once more to use relocated data structures and map in
     * whole of kernel and available physical memory. Map out low memory.
     */
    paging_x86_64_reset();

    // Relocate global to "memory"
    global = (struct global*)local_phys_to_mem((lpaddr_t)global);

    // Relocate glbl_core_data to "memory"
    glbl_core_data = (struct x86_core_data *)
        local_phys_to_mem((lpaddr_t)glbl_core_data);

    /*
     * Use the new physical address space for video memory -- no function
     * that ends up calling a conio.c function may be called between
     * paging_reset() and conio_relocate_vidmem()!
     */
    conio_relocate_vidmem(local_phys_to_mem(VIDEO_MEM));

    // Re-map physical memory
    /* XXX: Currently we are statically mapping a fixed amount of
       memory. We should not map in more memory than the machine
       actually has, or else if the kernel tries to access addresses
       not backed by real memory, it will experience weird faults
       instead of a simple pagefault.

       Ideally, we should use the ACPI information to figure out which
       memory to map in. Look at ticket #218 for more information.
    */
    if(paging_x86_64_map_memory(0, X86_64_PADDR_SPACE_LIMIT) != 0) {
        panic("error while mapping physical memory!");
    }

    /*
     * Also reset the global descriptor table (GDT), so we get
     * segmentation again and can catch interrupts/exceptions (the IDT
     * relies on an installed GDT).
     */
    gdt_reset();

    // Arch-independent early startup
    kernel_startup_early();

    // XXX: re-init the serial driver, in case the port changed after parsing args
    serial_console_init(false);

    // Setup IDT
    setup_default_idt();
    idt_initialized = true;

    // Enable machine check reporting
    mcheck_init();

    // Initialize local APIC
    apic_init();

    // do not remove/change this printf: needed by regression harness
    printf("Barrelfish CPU driver starting on x86_64 apic_id %u\n", apic_id);

    // Initialize classic (8259A) PIC
    pic_init();

    // Initialize real-time clock
    rtc_init();

    // Initialize local APIC timer
    if (kernel_ticks_enabled) {
        bool periodic = true;
#ifdef CONFIG_ONESHOT_TIMER
        // we probably need a global variable like kernel_ticks_enabled
        periodic = false;
#endif
        apic_timer_init(false, periodic);
        timing_apic_timer_set_ms(kernel_timeslice);
    } else {
        printk(LOG_WARN, "APIC timer disabled: NO timeslicing\n");
    }

    // Initialize IPI notification mechanism
    ipi_notify_init();

    // Enable SYSCALL/SYSRET fast system calls
    enable_fast_syscalls();

    // Enable "no execute" page-level protection bit
    ia32_efer_nxe_wrf(NULL, 1);

    // Enable FPU and MMX

    // Enable user-mode RDPMC opcode
    amd64_cr4_pce_wrf(NULL, 1);

    // AMD64: Check if TLB flush filter is enabled
    enable_tlb_flush_filter();

    // Enable global pages
    amd64_cr4_pge_wrf(NULL, 1);

    // Check/Enable MONITOR/MWAIT opcodes
    enable_monitor_mwait();

    // Setup Page Attribute Table MSR
    configure_page_attribute_table();

    // Call main kernel startup function -- this should never return
    kernel_startup();

    // Returning here will crash! -- low pages not mapped anymore!
}
/**
 * \brief Architecture-specific initialization function.
 *
 * This function is called by the bootup code in boot.S to initialize
 * architecture-specific stuff. It is expected to call the kernel main
 * loop. This function never returns.
 *
 * The kernel expects one of two magic values in 'magic' that determine how it
 * has been booted. If 'magic' is #MULTIBOOT_INFO_MAGIC the kernel has been
 * booted by a (Multiboot-compliant) bootloader and this is the first image on
 * the boot CPU. It will relocate itself to a default position. If 'magic' is
 * #KERNEL_BOOT_MAGIC it has been booted by another image of itself and is
 * running on a so-called application CPU.
 *
 * This function sets up new page tables to alias the kernel
 * at #MEMORY_OFFSET. It also does any relocations necessary to the
 * "position-independent" code to make it run at the new location (e.g.
 * relocating the GOT). After all relocations, it calls text_init() of
 * the relocated image, which destroys the lower alias and may never return.
 *
 * For BSP kernels, the void pointer is of type multiboot_info; for
 * application CPUs, it is of type global. Global carries a pointer to
 * multiboot_info. Global also contains pointers to memory that is shared
 * between kernels.
 *
 * \param magic         Boot magic value
 * \param pointer       Pointer to Multiboot Info or to Global structure
 */
void arch_init(uint64_t magic, void *pointer)
{
    // Sanitize the screen
    conio_cls();

    // Initialize serial, only initialize HW if we are the first kernel
    serial_console_init((magic == MULTIBOOT_INFO_MAGIC));

    void __attribute__ ((noreturn)) (*reloc_text_init)(void) =
        (void *)local_phys_to_mem((lpaddr_t)text_init);

    struct Elf64_Shdr *rela, *symtab;
    struct multiboot_info *mb = NULL;

    apic_bsp = magic == MULTIBOOT_INFO_MAGIC;

    /*
     * If this is the boot image, make the Multiboot information structure
     * globally known. Otherwise the passed value should equal the original
     * structure. If the magic value does not match what we expect, we cannot
     * proceed safely.
     */
    switch (magic) {
    case MULTIBOOT_INFO_MAGIC:
        mb = (struct multiboot_info *)pointer;

        // Construct the global structure and store its address to retrieve
        // it across relocation
        memset(&global->locks, 0, sizeof(global->locks));
        addr_global = (uint64_t)global;
        break;

    case KERNEL_BOOT_MAGIC:
        global = (struct global*)pointer;
        // Store the address of global to retrieve it across relocation
        addr_global = (uint64_t)global;
        break;

    default:
        panic("Magic value does not match! (0x%x != 0x%lx != 0x%x)",
              KERNEL_BOOT_MAGIC, magic, MULTIBOOT_INFO_MAGIC);
    }
    /* Determine page-aligned physical address past end of multiboot */
    lvaddr_t dest = (lvaddr_t)&_start_kernel;
    if (dest & (BASE_PAGE_SIZE - 1)) {
        dest &= ~(BASE_PAGE_SIZE - 1);
        dest += BASE_PAGE_SIZE;
    }
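    /*
     * Equivalent to dest = ROUND_UP(dest, BASE_PAGE_SIZE) -- the same
     * helper used for start_free_ram below: round up to the next page
     * boundary, but leave dest untouched if it is already aligned.
     */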
    // XXX: print kernel address for debugging with gdb
    printf("Kernel starting at address 0x%"PRIxLVADDR"\n",
           local_phys_to_mem(dest));

    struct x86_coredata_elf *elf;
    uint32_t multiboot_flags;
    if (mb != NULL) { /* Multiboot info was passed */
        multiboot_flags = mb->flags;
        elf = (struct x86_coredata_elf *)&mb->syms.elf;

        // We need the ELF section header table for relocation
        if (!(multiboot_flags & MULTIBOOT_INFO_FLAG_HAS_ELF_SYMS)) {
            panic("Multiboot information structure does not include ELF section "
                  "header information -- Relocation impossible!");
        }

        // Determine where free RAM starts
        glbl_core_data->start_free_ram =
            ROUND_UP(max(multiboot_end_addr(mb), (uintptr_t)&_end_kernel),
                     BASE_PAGE_SIZE);

        glbl_core_data->mods_addr = mb->mods_addr;
        glbl_core_data->mods_count = mb->mods_count;
        glbl_core_data->cmdline = mb->cmdline;
        glbl_core_data->mmap_length = mb->mmap_length;
        glbl_core_data->mmap_addr = mb->mmap_addr;
    } else { /* No multiboot info, use the core_data struct */
        struct x86_core_data *core_data =
            (struct x86_core_data*)(dest - BASE_PAGE_SIZE);
        multiboot_flags = core_data->multiboot_flags;
        elf = &core_data->elf;
        glbl_core_data = core_data;
        core_data->cmdline = (lpaddr_t)&core_data->kernel_cmdline;
        my_core_id = core_data->dst_core_id;

        kcb_current = (struct kcb*) glbl_core_data->kcb;
        if (core_data->module_end > 4ul * (1ul << 30)) {
            panic("The cpu module is outside the initial 4GB mapping."
                  " Either move the module or increase initial mapping.");
        }
    }
    // We're only able to process Elf64_Rela entries
    if (elf->size != sizeof(struct Elf64_Shdr)) {
        panic("ELF section header entry size mismatch!");
    }

    // Find relocation section
    rela = elf64_find_section_header_type((struct Elf64_Shdr *)
                                          (lpaddr_t)elf->addr,
                                          elf->num, SHT_RELA);
    if (rela == NULL) {
        panic("Kernel image does not include relocation section!");
    }

    // Find symbol table section
    symtab = elf64_find_section_header_type((struct Elf64_Shdr *)
                                            (lpaddr_t)elf->addr,
                                            elf->num, SHT_DYNSYM);
    if (symtab == NULL) {
        panic("Kernel image does not include symbol table!");
    }

    // Alias kernel on top of memory, keep low memory
    paging_init((lpaddr_t)&_start_kernel, SIZE_KERNEL_IMAGE);

    // Relocate kernel image for top of memory
    elf64_relocate(X86_64_MEMORY_OFFSET + (lvaddr_t)&_start_kernel,
                   (lvaddr_t)&_start_kernel,
                   (struct Elf64_Rela *)(rela->sh_addr - X86_64_START_KERNEL_PHYS + &_start_kernel),
                   rela->sh_size,
                   (struct Elf64_Sym *)(symtab->sh_addr - X86_64_START_KERNEL_PHYS + &_start_kernel),
                   symtab->sh_size,
                   X86_64_START_KERNEL_PHYS, &_start_kernel);

    /*** Aliased kernel available now -- low memory still mapped ***/

    // Relocate stack to aliased location
    relocate_stack(X86_64_MEMORY_OFFSET);

    // Call aliased text_init() function and continue initialization
    reloc_text_init();
}