/**
 * \file
 * \brief x86-64 architecture initialization.
 */

/*
 * Copyright (c) 2007, 2008, 2009, 2010, 2011, ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Haldeneggsteig 4, CH-8092 Zurich. Attn: Systems Group.
 */

#include <kernel.h>
#include <string.h>
#include <paging_kernel_arch.h>
#include <elf/elf.h>
#include <kernel_multiboot.h>
#include <irq.h>
#include <getopt/getopt.h>
#include <syscall.h>
#include <arch/x86/conio.h>
#include <arch/x86/pic.h>
#include <arch/x86/apic.h>
#include <arch/x86/mcheck.h>
#include <arch/x86/perfmon.h>
#include <arch/x86/rtc.h>
#include <target/x86/barrelfish_kpi/coredata_target.h>
#include <arch/x86/timing.h>
#include <arch/x86/startup_x86.h>
#include <arch/x86/ipi_notify.h>
#include <barrelfish_kpi/cpu_arch.h>
#include <target/x86_64/barrelfish_kpi/cpu_target.h>

#include <dev/xapic_dev.h> // XXX
#include <dev/ia32_dev.h>
#include <dev/amd64_dev.h>

/**
 * Used to store the address of the global struct passed during boot across
 * kernel relocations.
 */
static uint64_t addr_global;

/**
 * \brief Kernel stack.
 *
 * This is the one and only kernel stack for a kernel instance.
 */
uintptr_t x86_64_kernel_stack[X86_64_KERNEL_STACK_SIZE/sizeof(uintptr_t)];

/**
 * \brief Global Task State Segment (TSS).
 *
 * This is the global, static and only Task State Segment (TSS). It is used
 * for interrupt and exception handling (stack setup) while in user-space.
 */
static struct task_state_segment tss __attribute__ ((aligned (4)));

/**
 * \brief Global Descriptor Table (GDT) for the processor this kernel is running on.
 *
 * This descriptor table is completely static, as segments are basically
 * turned off in 64-bit mode. They map flat-mode code and stack segments for
 * both kernel- and user-space and the only Task State Segment (TSS).
 */
union segment_descriptor gdt[] __attribute__ ((aligned (4))) = {
    [NULL_SEL] = {      // Null segment
        .raw = 0
    },
    [KCODE_SEL] = {     // Kernel code segment
        .d = {
            .type = 0xa,        // execute/read code segment
            .system_desc = 1, .privilege_level = SEL_KPL,
            .present = 1, .long_mode = 1
        }
    },
    [KSTACK_SEL] = {    // Kernel stack segment
        .d = {
            .type = 2,          // read/write data segment
            .system_desc = 1, .privilege_level = SEL_KPL, .present = 1
        }
    },
    [USTACK_SEL] = {    // User stack segment
        .d = {
            .type = 2,
            .system_desc = 1, .privilege_level = SEL_UPL, .present = 1
        }
    },
    [UCODE_SEL] = {     // User code segment
        .d = {
            .type = 0xa,
            .system_desc = 1, .privilege_level = SEL_UPL,
            .present = 1, .long_mode = 1
        }
    },
    [TSS_LO_SEL] = {    // Global Task State Segment (TSS), lower 8 bytes
        .sys_lo = {
            .lo_limit = sizeof(tss) & 0xffff,
            .type = SDT_SYSTSS,
            .privilege_level = SEL_KPL,
            .present = 1,
            .hi_limit = (sizeof(tss) >> 16) & 0xf
        }
    },
    [TSS_HI_SEL] = {    // Global Task State Segment (TSS), upper 8 bytes
        .sys_hi = {
            .base = 0           // TSS base address inserted by gdt_reset()
        }
    },
    [LDT_LO_SEL] = {    // Local descriptor table (LDT), lower 8 bytes
        .sys_lo = {
            .lo_limit = 0,      // # 4k pages (since granularity = 1)
            .lo_base = 0,       // changed by context switch path when doing lldt
            .type = 2,          // LDT
            .privilege_level = SEL_UPL,
            .present = 1,
            .granularity = 1
        }
    },
    [LDT_HI_SEL] = {    // Local descriptor table (LDT), upper 8 bytes
        .sys_hi = {
            .base = 0           // changed by context switch path when doing lldt
        }
    },
};

union segment_descriptor *ldt_descriptor = &gdt[LDT_LO_SEL];
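
/*
 * For reference (a sketch, assuming the usual Barrelfish definition of
 * GSEL() as (index << 3) | rpl): a segment selector is a GDT index shifted
 * left by three, OR'd with the requested privilege level, so e.g.
 *
 *   GSEL(KCODE_SEL, SEL_KPL)   // kernel code, ring 0
 *   GSEL(UCODE_SEL, SEL_UPL)   // user code, ring 3
 *
 * are the values loaded into CS in gdt_reset() below and handed to
 * SYSCALL/SYSRET via the STAR MSR in enable_fast_syscalls().
 */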

/**
 * Bootup PML4, used to map both low (identity-mapped) memory and relocated
 * memory at the same time.
 */
static union x86_64_pdir_entry boot_pml4[PTABLE_SIZE]
__attribute__ ((aligned(BASE_PAGE_SIZE)));

/**
 * Bootup low-map PDPT and hi-map PDPT.
 */
static union x86_64_pdir_entry boot_pdpt[PTABLE_SIZE]
__attribute__ ((aligned(BASE_PAGE_SIZE))),
    boot_pdpt_hi[PTABLE_SIZE] __attribute__ ((aligned(BASE_PAGE_SIZE)));

/**
 * Bootup low-map PDIR, hi-map PDIR, and 1GB PDIR.
 */
static union x86_64_ptable_entry boot_pdir[PTABLE_SIZE]
__attribute__ ((aligned(BASE_PAGE_SIZE))),
    boot_pdir_hi[PTABLE_SIZE] __attribute__ ((aligned(BASE_PAGE_SIZE))),
    boot_pdir_1GB[PTABLE_SIZE] __attribute__ ((aligned(BASE_PAGE_SIZE)));

/**
 * This flag is set to true once the IDT is initialized and exceptions can be
 * caught.
 */
bool idt_initialized = false;

/**
 * \brief Setup bootup page table.
 *
 * This function sets up the page table needed to boot the kernel
 * proper. The table identity maps the first 1 GByte of physical
 * memory in order to have access to various data structures and the
 * first MByte containing bootloader-passed data structures. It also
 * identity maps the local copy of the kernel in low memory and
 * aliases it in kernel address space.
 *
 * \param base  Start address of kernel image in physical address space.
 * \param size  Size of kernel image.
 */
static void paging_init(lpaddr_t base, size_t size)
{
    lvaddr_t vbase = local_phys_to_mem(base);

    // Align base to kernel page size
    if(base & X86_64_MEM_PAGE_MASK) {
        size += base & X86_64_MEM_PAGE_MASK;
        base -= base & X86_64_MEM_PAGE_MASK;
    }

    // Align vbase to kernel page size
    if(vbase & X86_64_MEM_PAGE_MASK) {
        vbase -= vbase & X86_64_MEM_PAGE_MASK;
    }

    // Align size to kernel page size
    if(size & X86_64_MEM_PAGE_MASK) {
        size += X86_64_MEM_PAGE_SIZE - (size & X86_64_MEM_PAGE_MASK);
    }

    // XXX: Cannot currently map more than one table of pages
    // (512 entries x 2 MByte large pages = 1 GByte of kernel image)
    assert(size <= X86_64_MEM_PAGE_SIZE * X86_64_PTABLE_SIZE);
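
    /*
     * Sketch of what the loop below builds (with X86_64_MEM_PAGE_SIZE being
     * the 2 MByte large-page size and X86_64_{PML4,PDPT,PDIR}_BASE()
     * extracting bits 47-39, 38-30 and 29-21 of an address):
     *
     *   boot_pml4[addr >> 39 & 511] --> boot_pdpt  (or boot_pdpt_hi)
     *   boot_pdpt[addr >> 30 & 511] --> boot_pdir  (or boot_pdir_hi)
     *   boot_pdir[addr >> 21 & 511] --> 2 MByte frame at addr
     *
     * The same physical frames get two mappings: once at their physical
     * (identity) address and once at the aliased virtual address vbase.
     */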
    for(size_t i = 0; i < size; i += X86_64_MEM_PAGE_SIZE,
            base += X86_64_MEM_PAGE_SIZE, vbase += X86_64_MEM_PAGE_SIZE) {
        // No kernel image above 4 GByte
        assert(base < ((lpaddr_t)4 << 30));

        // Identity-map the kernel's physical region, so we don't lose ground
        paging_x86_64_map_table(&boot_pml4[X86_64_PML4_BASE(base)], (lpaddr_t)boot_pdpt);
        paging_x86_64_map_table(&boot_pdpt[X86_64_PDPT_BASE(base)], (lpaddr_t)boot_pdir);
        paging_x86_64_map_large(&boot_pdir[X86_64_PDIR_BASE(base)], base, PTABLE_PRESENT
                                | PTABLE_READ_WRITE | PTABLE_USER_SUPERVISOR);

        // Alias the same region at MEMORY_OFFSET
        paging_x86_64_map_table(&boot_pml4[X86_64_PML4_BASE(vbase)], (lpaddr_t)boot_pdpt_hi);
        paging_x86_64_map_table(&boot_pdpt_hi[X86_64_PDPT_BASE(vbase)], (lpaddr_t)boot_pdir_hi);
        paging_x86_64_map_large(&boot_pdir_hi[X86_64_PDIR_BASE(vbase)], base, PTABLE_PRESENT
                                | PTABLE_READ_WRITE | PTABLE_USER_SUPERVISOR);
    }

    // Identity-map the first 1G of physical memory for bootloader data
    paging_x86_64_map_table(&boot_pml4[0], (lpaddr_t)boot_pdpt);
    paging_x86_64_map_table(&boot_pdpt[0], (lpaddr_t)boot_pdir_1GB);
    for (int i = 0; i < X86_64_PTABLE_SIZE; i++) {
        paging_x86_64_map_large(&boot_pdir_1GB[X86_64_PDIR_BASE(X86_64_MEM_PAGE_SIZE * i)],
                                X86_64_MEM_PAGE_SIZE * i, PTABLE_PRESENT
                                | PTABLE_READ_WRITE | PTABLE_USER_SUPERVISOR);
    }

    // Activate new page tables
    paging_x86_64_context_switch((lpaddr_t)boot_pml4);
}

/**
 * \brief Setup default GDT.
 *
 * Loads the GDT register with the default GDT and reloads CS and SS
 * to point to the new entries. Resets all other segment registers to null.
 * Finally, completes setup of GDT to include TSS base address mapping and
 * loads TSS into task register.
 */
static void gdt_reset(void)
{
    lvaddr_t ptss = (lvaddr_t)&tss;
    struct region_descriptor region = {
        .rd_limit = sizeof(gdt),
        .rd_base = (uint64_t)&gdt
    };

    // Load GDT register
    __asm volatile("lgdt %[region]" :: [region] "m" (region));
296 __asm volatile("mov %[null], %%ds \n\t"
297 "mov %[null], %%es \n\t"
298 "mov %[ss], %%ss \n\t"
299 "mov %[null], %%gs \n\t"
300 "mov %[null], %%fs \n\t"
301 "pushq %[cs] \n\t" // new CS
302 "lea 1f(%%rip), %%rax \n\t" // jumps to after lret
303 "pushq %%rax \n\t" // new IP
304 "lretq \n\t" // fake return
305 "1: \n\t" // we'll continue here
309 [ss] "r" (GSEL(KSTACK_SEL, SEL_KPL)),
310 [cs] "i" (GSEL(KCODE_SEL, SEL_KPL))

    // Complete setup of TSS descriptor (by inserting base address of TSS)
    gdt[TSS_LO_SEL].sys_lo.lo_base = ptss & 0xffffff;
    gdt[TSS_LO_SEL].sys_lo.hi_base = (ptss >> 24) & 0xff;
    gdt[TSS_HI_SEL].sys_hi.base = ptss >> 32;
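
    /*
     * Note: in long mode a TSS descriptor is 16 bytes and spans two 8-byte
     * GDT slots. Base bits 23:0 and 31:24 go into the low slot
     * (lo_base/hi_base above) and bits 63:32 into the high slot, which is
     * why both TSS_LO_SEL and TSS_HI_SEL exist.
     */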

    // Complete setup of TSS: rsp[0] is the stack the CPU switches to on a
    // user-to-kernel (ring 3 to ring 0) transition; stacks grow down, so
    // point it one past the end of the kernel stack array
    tss.rsp[0] = (lvaddr_t)&x86_64_kernel_stack[X86_64_KERNEL_STACK_SIZE / sizeof(uintptr_t)];

    // Load task state register
    __asm volatile("ltr %%ax" :: "a" (GSEL(TSS_LO_SEL, SEL_KPL)));
}

/**
 * \brief Relocates the active stack.
 *
 * This function relocates the stack, by adding 'offset' to the stack
 * pointer.
 *
 * \param offset        Offset to add to the stack pointer.
 */
static inline void __attribute__ ((always_inline))
relocate_stack(lvaddr_t offset)
{
    __asm volatile("add %[stack], %%rsp\n\t"
                   : /* No output */
                   : [stack] "er" (offset)
                   );
}
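
/*
 * Why always_inline: a real call to relocate_stack() would push a return
 * address onto the old (pre-relocation) stack and then try to return
 * through it after %rsp has been rebased. Inlined, the add executes in the
 * caller's frame, and the caller (arch_init() below) immediately calls the
 * relocated text_init() without touching pre-relocation stack data again.
 */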

/**
 * \brief Enable SYSCALL/SYSRET fast system calls.
 *
 * This function enables the SYSCALL/SYSRET pair of fast system calls in
 * long mode. Also sets the IA32_STAR and IA32_FMASK MSRs to point to the
 * user-space base selector and RFLAGS mask for SYSCALL/SYSRET fast system
 * calls.
 */
static inline void enable_fast_syscalls(void)
{
    // Segment selector bases for both kernel- and user-space for fast
    // system calls
    ia32_star_t star = ia32_star_rd(NULL);
    star = ia32_star_call_insert(star, GSEL(KCODE_SEL,  SEL_KPL));
    star = ia32_star_ret_insert( star, GSEL(KSTACK_SEL, SEL_UPL));
    ia32_star_wr(NULL, star);
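
    /*
     * Why the *stack* selector with user RPL goes into the SYSRET field
     * (a worked example, assuming the GDT layout above with selectors
     * 8 bytes apart): SYSRET loads CS = STAR[63:48] + 16 and
     * SS = STAR[63:48] + 8. With GSEL(KSTACK_SEL, SEL_UPL) as the base,
     * base + 8 lands on USTACK_SEL and base + 16 on UCODE_SEL, both with
     * RPL 3. SYSCALL symmetrically loads CS = STAR[47:32] and
     * SS = STAR[47:32] + 8, i.e. KCODE_SEL and then KSTACK_SEL.
     */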

    // Set ia32_lstar MSR to point to kernel-space system call multiplexer
    ia32_lstar_wr(NULL, (lvaddr_t)syscall_entry);

    // Set ia32_fmask MSR to the RFLAGS mask applied on SYSCALL
    // We mask out everything (including interrupts).
    ia32_fmask_v_wrf(NULL, ~(RFLAGS_ALWAYS1) );

    // Enable fast system calls
    ia32_efer_sce_wrf(NULL, 1);
}

static inline void enable_tlb_flush_filter(void)
{
    uint32_t eax, ebx, ecx, edx;

    // Must read "AuthenticAMD"
    cpuid(0, &eax, &ebx, &ecx, &edx);
    if(ebx != 0x68747541 || ecx != 0x444d4163 || edx != 0x69746e65) {
        return;
    }

    // Is at least family 0fh?
    cpuid(1, &eax, &ebx, &ecx, &edx);
    if(((eax >> 8) & 0xf) != 0xf) {
        return;
    }

    // Note: HWCR.FFDIS is the flush-filter *disable* bit, so writing 1
    // turns the TLB flush filter off (required for safe SMP operation on
    // AMD family 0fh parts)
    debug(SUBSYS_STARTUP, "Disabling TLB flush filter\n");
    ia32_amd_hwcr_ffdis_wrf(NULL, 1);
}

static inline void enable_monitor_mwait(void)
{
    uint32_t eax, ebx, ecx, edx;

    // CPUID.01H:ECX[3] indicates MONITOR/MWAIT support
    cpuid(1, &eax, &ebx, &ecx, &edx);

    if (ecx & (1 << 3)) {
        // CPUID.05H: EAX = smallest monitor-line size, EBX = largest,
        // ECX[0] = extensions enumerated (EMX), ECX[1] = interrupt break-event (IBE)
        cpuid(5, &eax, &ebx, &ecx, &edx);
        debug(SUBSYS_STARTUP, "MONITOR/MWAIT supported: "
              "min size %u bytes, max %u bytes. %s %s\n",
              eax, ebx, (ecx & 2) ? "IBE" : "", (ecx & 1) ? "EMX" : "");
    }
}

/**
 * \brief Continue kernel initialization in kernel address space.
 *
 * This function resets paging to map out low memory and map in physical
 * address space, relocating all remaining data structures. It resets the
 * Global Descriptor Table for flat mode and to exclude legacy segments from
 * boot initialization code. It sets up the IDT for exception and interrupt
 * handling, initializes the local APIC and enables interrupts. After that it
 * calls kernel_startup(), which should not return (if it does, this function
 * halts the kernel).
 */
static void __attribute__ ((noreturn, noinline)) text_init(void)
{
    // Reset global and locks to point to the memory in the pristine image
    global = (struct global*)addr_global;

    /*
     * Reset paging once more to use relocated data structures and map in
     * whole of kernel and available physical memory. Map out low memory.
     */
    paging_x86_64_reset();

    // Relocate global to "memory"
    global = (struct global*)local_phys_to_mem((lpaddr_t)global);

    // Relocate glbl_core_data to "memory"
    glbl_core_data = (struct x86_core_data *)
        local_phys_to_mem((lpaddr_t)glbl_core_data);

    /*
     * Use new physical address space for video memory -- no function that
     * ends up calling into conio.c may be called between paging_reset()
     * and conio_relocate_vidmem()!
     */
    conio_relocate_vidmem(local_phys_to_mem(VIDEO_MEM));

    // Re-map physical memory
    /* XXX: Currently we are statically mapping a fixed amount of
       memory. We should not map in more memory than the machine
       actually has. Or else if the kernel tries to access addresses
       not backed by real memory, it will experience weird faults
       instead of a simple pagefault.

       Ideally, we should use the ACPI information to figure out which
       memory to map in. Look at ticket #218 for more information. */
    if(paging_x86_64_map_memory(0, X86_64_PADDR_SPACE_LIMIT) != 0) {
        panic("error while mapping physical memory!");
    }

    /*
     * Also reset the global descriptor table (GDT), so we get
     * segmentation again and can catch interrupts/exceptions (the IDT
     * needs the GDT).
     */
    gdt_reset();

    // Arch-independent early startup
    kernel_startup_early();

    // XXX: re-init the serial driver, in case the port changed after parsing args
    serial_console_init();

    // Setup IDT
    setup_default_idt();
    idt_initialized = true;

    // Enable machine check reporting
    mcheck_init();

    // Initialize local APIC
    apic_init();

    // do not remove/change this printf: needed by regression harness
    printf("Barrelfish CPU driver starting on x86_64 apic_id %u\n", apic_id);

    // Initialize classic (8259A) PIC
    pic_init();

    // Initialize real-time clock
    rtc_init();

    // Initialize local APIC timer
    if (kernel_ticks_enabled) {
        timing_calibrate();
        bool periodic = true;
#ifdef CONFIG_ONESHOT_TIMER
        // we probably need a global variable like kernel_ticks_enabled
        periodic = false;
#endif
        apic_timer_init(false, periodic);
        timing_apic_timer_set_ms(kernel_timeslice);
    } else {
        printk(LOG_WARN, "APIC timer disabled: NO timeslicing\n");
    }

    // Initialize IPI notification mechanism
    ipi_notify_init();

    // Enable SYSCALL/SYSRET fast system calls
    enable_fast_syscalls();

    // Enable "no execute" page-level protection bit
    ia32_efer_nxe_wrf(NULL, 1);

    // Enable FPU and MMX
    enable_fpu();

    // Enable user-mode RDPMC opcode
    amd64_cr4_pce_wrf(NULL, 1);

    // AMD64: disable the TLB flush filter (see enable_tlb_flush_filter())
    enable_tlb_flush_filter();

    // Enable global pages
    amd64_cr4_pge_wrf(NULL, 1);

    // Check/Enable MONITOR/MWAIT opcodes
    enable_monitor_mwait();

    // Call main kernel startup function -- this should never return
    kernel_startup();

    halt();
    // Returning here will crash! -- low pages not mapped anymore!
}

/**
 * \brief Architecture-specific initialization function.
 *
 * This function is called by the bootup code in boot.S to initialize
 * architecture-specific stuff. It is expected to call the kernel main
 * loop. This function never returns.
 *
 * The kernel expects one of two magic values in 'magic' that determine how it
 * has been booted. If 'magic' is #MULTIBOOT_INFO_MAGIC the kernel has been
 * booted by a (Multiboot-compliant) bootloader and this is the first image on
 * the boot CPU. It will relocate itself to a default position. If 'magic' is
 * #KERNEL_BOOT_MAGIC it has been booted by another image of itself and is
 * running on a (so-called) application CPU.
 *
 * This function sets up new page tables to alias the kernel
 * at #MEMORY_OFFSET. It also does any relocations necessary to the
 * "position-independent" code to make it run at the new location (e.g.
 * relocating the GOT). After all relocations, it calls text_init() of
 * the relocated image, which destroys the lower alias and may never return.
 *
 * For BSP kernels, the void pointer is of type multiboot_info; for
 * application CPUs, it is of type global. Global carries a pointer to
 * multiboot_info and also contains pointers to memory that is shared
 * between kernels.
 *
 * \param magic   Boot magic value
 * \param pointer Pointer to Multiboot info or to Global structure
 */
void arch_init(uint64_t magic, void *pointer)
{
    // Sanitize the screen
    conio_cls();
    serial_console_init();

    void __attribute__ ((noreturn)) (*reloc_text_init)(void) =
        (void *)local_phys_to_mem((lpaddr_t)text_init);
    struct Elf64_Shdr *rela, *symtab;
    struct multiboot_info *mb = NULL;

    /*
     * If this is the boot image, make Multiboot information structure globally
     * known. Otherwise the passed value should equal the original structure.
     * If magic value does not match what we expect, we cannot proceed safely.
     */
    switch (magic) {
    case MULTIBOOT_INFO_MAGIC:
        mb = (struct multiboot_info *)pointer;

        // Construct the global structure and store its address to retrieve it
        // across relocation
        memset(&global->locks, 0, sizeof(global->locks));
        addr_global = (uint64_t)global;
        break;

    case KERNEL_BOOT_MAGIC:
        global = (struct global*)pointer;
        // Store the address of global to retrieve it across relocation
        addr_global = (uint64_t)global;
        break;

    default:
        panic("Magic value does not match! (0x%x != 0x%lx != 0x%x)",
              KERNEL_BOOT_MAGIC, magic, MULTIBOOT_INFO_MAGIC);
    }

    /* determine page-aligned physical address past end of multiboot */
    lvaddr_t dest = (lvaddr_t)&_start_kernel;
    if (dest & (BASE_PAGE_SIZE - 1)) {
        dest &= ~(BASE_PAGE_SIZE - 1);
        dest += BASE_PAGE_SIZE;
    }

    // XXX: print kernel address for debugging with gdb
    printf("Kernel starting at address 0x%"PRIxLVADDR"\n",
           local_phys_to_mem(dest));

    struct x86_coredata_elf *elf;
    uint32_t multiboot_flags;
    if (mb != NULL) { /* Multiboot info was passed */
        multiboot_flags = mb->flags;
        elf = (struct x86_coredata_elf *)&mb->syms.elf;

        // We need the ELF section header table for relocation
        if (!(multiboot_flags & MULTIBOOT_INFO_FLAG_HAS_ELF_SYMS)) {
            panic("Multiboot information structure does not include ELF section "
                  "header information -- relocation impossible!");
        }

        // Determine where free RAM starts
        glbl_core_data->start_free_ram =
            ROUND_UP(max(multiboot_end_addr(mb), (uintptr_t)&_end_kernel),
                     BASE_PAGE_SIZE);

        glbl_core_data->mods_addr = mb->mods_addr;
        glbl_core_data->mods_count = mb->mods_count;
        glbl_core_data->cmdline = mb->cmdline;
        glbl_core_data->mmap_length = mb->mmap_length;
        glbl_core_data->mmap_addr = mb->mmap_addr;
    } else { /* No multiboot info, use the core_data struct */
        // The spawning kernel has placed the core_data page directly below
        // the relocated kernel image
        struct x86_core_data *core_data =
            (struct x86_core_data*)(dest - BASE_PAGE_SIZE);
        multiboot_flags = core_data->multiboot_flags;
        elf = &core_data->elf;
        glbl_core_data = core_data;
        core_data->cmdline = (lpaddr_t)&core_data->kernel_cmdline;
        my_core_id = core_data->dst_core_id;

        if (core_data->module_end > 4ul * (1ul << 20)) {
            panic("The cpu module is outside the initial 4MB mapping."
                  " Either move the module or increase initial mapping.");
        }
    }

    // We're only able to process Elf64_Rela entries
    if (elf->size != sizeof(struct Elf64_Shdr)) {
        panic("ELF section header entry size mismatch!");
    }

    // Find relocation section
    rela = elf64_find_section_header_type((struct Elf64_Shdr *)
                                          (lpaddr_t)elf->addr,
                                          elf->num, SHT_RELA);
    if (rela == NULL) {
        panic("Kernel image does not include relocation section!");
    }

    // Find symbol table section
    symtab = elf64_find_section_header_type((struct Elf64_Shdr *)
                                            (lpaddr_t)elf->addr,
                                            elf->num, SHT_DYNSYM);
    if (symtab == NULL) {
        panic("Kernel image does not include symbol table!");
    }

    // Alias kernel on top of memory, keep low memory
    paging_init((lpaddr_t)&_start_kernel, SIZE_KERNEL_IMAGE);
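
    /*
     * About the pointer arithmetic below (an explanatory note; the kernel
     * is assumed to be linked at its physical load address,
     * X86_64_START_KERNEL_PHYS): sh_addr values in the section headers are
     * link-time addresses, so "sh_addr - X86_64_START_KERNEL_PHYS +
     * &_start_kernel" converts them into pointers into the currently
     * running low-memory copy. elf64_relocate() then rebases every symbol
     * by the difference to the high alias at X86_64_MEMORY_OFFSET.
     */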
    // Relocate kernel image for top of memory
    elf64_relocate(X86_64_MEMORY_OFFSET + (lvaddr_t)&_start_kernel,
                   (lvaddr_t)&_start_kernel,
                   (struct Elf64_Rela *)(rela->sh_addr - X86_64_START_KERNEL_PHYS + &_start_kernel),
                   rela->sh_size,
                   (struct Elf64_Sym *)(symtab->sh_addr - X86_64_START_KERNEL_PHYS + &_start_kernel),
                   symtab->sh_size,
                   X86_64_START_KERNEL_PHYS, &_start_kernel);

    /*** Aliased kernel available now -- low memory still mapped ***/

    // Relocate stack to aliased location
    relocate_stack(X86_64_MEMORY_OFFSET);

    // Call aliased text_init() function and continue initialization
    reloc_text_init();
}