/**
 * \file
 * \brief x86-32 architecture initialization.
 */

/*
 * Copyright (c) 2007, 2008, 2009, 2010, 2011, 2012, ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Haldeneggsteig 4, CH-8092 Zurich. Attn: Systems Group.
 */
#include <paging_kernel_arch.h>
#include <kernel_multiboot.h>
#include <getopt/getopt.h>
#include <arch/x86/conio.h>
#include <arch/x86/pic.h>
#include <arch/x86/apic.h>
#include <arch/x86/perfmon_intel.h>
#include <arch/x86/perfmon_amd.h>
#include <arch/x86/rtc.h>
#include <arch/x86/ipi_notify.h>
#include <target/x86/barrelfish_kpi/coredata_target.h>
#include <arch/x86/timing.h>
#include <arch/x86/startup_x86.h>
#include <arch/x86/start_aps.h>
#include <dev/xapic_dev.h> // XXX
/**
 * Used to store the address of the global struct, passed during boot,
 * across kernel relocation.
 */
// XXX: This won't work if this kernel is not relocated from a pristine image!
static uint32_t addr_global;
/**
 * EFLAGS mask for fast system calls. Put values to mask out here.
 * We mask out everything (including interrupts).
 */
#define SYSCALL_FMASK   (~(EFLAGS_ALWAYS1) & 0xffffffff)
/**
 * Segment selector bases for both kernel- and user-space for fast
 * system calls.
 */
#define SYSCALL_STAR \
    ((((uint64_t)GSEL(KSTACK_SEL, SEL_UPL)) << 48) | \
     ((uint64_t)GSEL(KCODE_SEL, SEL_KPL) << 32))
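/*
 * For reference (architectural layout, not Barrelfish-specific): in the
 * IA32_STAR MSR, bits 47:32 hold the selector base loaded by SYSCALL
 * (CS = base, SS = base + 8) and bits 63:48 the base loaded by SYSRET in
 * legacy mode (CS = base with RPL forced to 3, SS = base + 8). The macro
 * above packs the two bases into those fields.
 */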
/**
 * \brief Kernel stack.
 *
 * This is the one and only kernel stack for a kernel instance.
 */
uintptr_t x86_32_kernel_stack[X86_32_KERNEL_STACK_SIZE / sizeof(uintptr_t)];
/**
 * \brief Global Task State Segment (TSS).
 *
 * This is the global, static and only Task State Segment (TSS). It is used
 * for interrupt and exception handling (stack setup) while in user-space.
 */
static struct task_state_segment tss __attribute__ ((aligned (4)));
/**
 * \brief Global Descriptor Table (GDT) for the processor this kernel is
 * running on.
 *
 * This descriptor table is completely static. It maps flat-mode code and
 * stack segments for both kernel- and user-space and the only Task State
 * Segment (TSS).
 */
static union segment_descriptor gdt[] __attribute__ ((aligned (4))) = {
    {   // Null segment, required as the first GDT entry
        .raw = 0
    },
    {   // Kernel code segment
        .d = {
            .privilege_level = SEL_KPL,
        }
    },
    {   // Kernel stack segment
        .d = {
            .privilege_level = SEL_KPL,
        }
    },
    {   // User stack segment
        .d = {
            .privilege_level = SEL_UPL,
        }
    },
    {   // User code segment
        .d = {
            .privilege_level = SEL_UPL,
        }
    },
    {   // Global Task State Segment (TSS)
        .tss = {
            .lo_limit = sizeof(tss) & 0xffff,
            .privilege_level = SEL_KPL,
            .hi_limit = (sizeof(tss) >> 16) & 0xf,
        }
    },
    {   // Dispatcher "segment"
        .d = {
            .privilege_level = SEL_UPL,
        }
    },
};
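/**
 * Pointer to the current dispatcher GDT entry (gdt[6], the dispatcher
 * "segment" above). Presumably rewritten on every dispatch so that
 * user-space reaches its current dispatcher through this segment.
 */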
volatile union segment_descriptor *curdisp = &gdt[6];
#ifdef CONFIG_PAE
/**
 * Bootup PDPTE.
 */
static union x86_32_pdpte_entry boot_pdpte[X86_32_PDPTE_SIZE]
__attribute__ ((aligned(X86_32_BASE_PAGE_SIZE)));

/**
 * Bootup low-map PDIR and hi-map PDIR.
 */
static union x86_32_ptable_entry boot_pdir[X86_32_PTABLE_SIZE]
__attribute__ ((aligned(X86_32_BASE_PAGE_SIZE))),
    boot_pdir_hi[X86_32_PTABLE_SIZE] __attribute__ ((aligned(X86_32_BASE_PAGE_SIZE)));
#elif defined(CONFIG_PSE)
/**
 * Bootup PDIR of large pages.
 */
static union x86_32_ptable_entry boot_pdir[X86_32_PTABLE_SIZE]
__attribute__ ((aligned(X86_32_BASE_PAGE_SIZE)));
#else
/**
 * Bootup PDIR of page tables.
 */
static union x86_32_pdir_entry boot_pdir[X86_32_PTABLE_SIZE]
__attribute__ ((aligned(X86_32_BASE_PAGE_SIZE)));

/**
 * Bootup low-map PTABLE and hi-map PTABLE.
 */
static union x86_32_ptable_entry
boot_ptable[MEM_PTABLE_SIZE][X86_32_PTABLE_SIZE]
__attribute__ ((aligned(X86_32_BASE_PAGE_SIZE)));
#endif
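/*
 * Note: which of the three layouts above is compiled in depends on the
 * paging mode: with PAE the kernel uses a PDPTE plus PDIRs of 2 MByte
 * large pages, with PSE a single PDIR of 4 MByte large pages, and with
 * plain 32-bit paging a PDIR of page tables mapping 4 KByte pages.
 */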
/**
 * This flag is set to true once the IDT is initialized and exceptions can be
 * handled.
 */
bool idt_initialized = false;
/**
 * \brief Setup bootup page table.
 *
 * This function sets up the page table needed to boot the kernel proper.
 * The table identity maps low physical memory in kernel-sized pages
 * (2 MBytes with PAE, 4 MBytes with PSE) in order to have access to the
 * first MByte containing bootloader-passed data structures. It also
 * identity maps the local copy of the kernel in low memory and aliases it
 * in kernel address space.
 */
static void paging_init(void)
{
    lvaddr_t vbase = X86_32_MEMORY_OFFSET, base = 0;

    // Align vbase to kernel page size
    if(vbase & X86_32_MEM_PAGE_MASK) {
        vbase -= vbase & X86_32_MEM_PAGE_MASK;
    }

#ifdef CONFIG_PAE
    for(size_t i = 0; i < X86_32_PTABLE_SIZE; i++,
            base += X86_32_MEM_PAGE_SIZE, vbase += X86_32_MEM_PAGE_SIZE) {
        // Identity-map the kernel's physical region, so we don't lose ground
        paging_x86_32_map_pdpte(&boot_pdpte[X86_32_PDPTE_BASE(base)],
                                (lpaddr_t)boot_pdir);
        paging_x86_32_map_large(&boot_pdir[X86_32_PDIR_BASE(base)], base,
                                X86_32_PTABLE_PRESENT
                                | X86_32_PTABLE_READ_WRITE
                                | X86_32_PTABLE_USER_SUPERVISOR);

        // Alias the same region at MEMORY_OFFSET
        paging_x86_32_map_pdpte(&boot_pdpte[X86_32_PDPTE_BASE(vbase)],
                                (lpaddr_t)boot_pdir_hi);
        paging_x86_32_map_large(&boot_pdir_hi[X86_32_PDIR_BASE(vbase)], base,
                                X86_32_PTABLE_PRESENT
                                | X86_32_PTABLE_READ_WRITE
                                | X86_32_PTABLE_USER_SUPERVISOR);
    }

    // Activate new page tables
    paging_x86_32_context_switch((lpaddr_t)boot_pdpte);
#else
    for(size_t i = 0; i < X86_32_PADDR_SPACE_LIMIT; i += X86_32_MEM_PAGE_SIZE,
            base += X86_32_MEM_PAGE_SIZE, vbase += X86_32_MEM_PAGE_SIZE) {
#ifdef CONFIG_PSE
        // Identity-map the kernel's physical region, so we don't lose ground
        paging_x86_32_map_large(&boot_pdir[X86_32_PDIR_BASE(base)], base,
                                X86_32_PTABLE_PRESENT
                                | X86_32_PTABLE_READ_WRITE
                                | X86_32_PTABLE_USER_SUPERVISOR);

        // Alias the same region at MEMORY_OFFSET
        paging_x86_32_map_large(&boot_pdir[X86_32_PDIR_BASE(vbase)], base,
                                X86_32_PTABLE_PRESENT
                                | X86_32_PTABLE_READ_WRITE
                                | X86_32_PTABLE_USER_SUPERVISOR);
#else
        // Identity-map the kernel's physical region, so we don't lose ground
        paging_x86_32_map_table(&boot_pdir[X86_32_PDIR_BASE(base)],
                                (lpaddr_t)boot_ptable[X86_32_PDIR_BASE(base)]);
        paging_x86_32_map(&boot_ptable[X86_32_PDIR_BASE(base)][X86_32_PTABLE_BASE(base)],
                          base,
                          X86_32_PTABLE_PRESENT
                          | X86_32_PTABLE_READ_WRITE
                          | X86_32_PTABLE_USER_SUPERVISOR);

        // Alias the same region at MEMORY_OFFSET
        paging_x86_32_map_table(&boot_pdir[X86_32_PDIR_BASE(vbase)],
                                (lpaddr_t)boot_ptable[X86_32_PDIR_BASE(base)]);
#endif
    }

    // Activate new page tables
    paging_x86_32_context_switch((lpaddr_t)boot_pdir);
#endif
}
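/*
 * For reference: X86_32_PDIR_BASE() and X86_32_PTABLE_BASE() extract the
 * table indices from a virtual address -- without PAE these are bits 31:22
 * and 21:12 respectively; with PAE the PDPTE index is bits 31:30 and the
 * PDIR index shrinks to bits 29:21.
 */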
/**
 * \brief Setup default GDT.
 *
 * Loads the GDT register with the default GDT and reloads CS and SS
 * to point to the new entries. Resets all other segment registers to null.
 * Finally, completes setup of GDT to include TSS base address mapping and
 * loads TSS into task register.
 */
static void gdt_reset(void)
{
    lvaddr_t ptss = (lvaddr_t)&tss;
    struct region_descriptor region = {
        .rd_limit = sizeof(gdt),
        .rd_base = (uint32_t)&gdt
    };

    // Load GDT
    __asm volatile("lgdt %[region]" :: [region] "m" (region));

    // Reload segment registers
    __asm volatile("mov %[ds], %%ds    \n\t"
                   "mov %[ds], %%es    \n\t"
                   "mov %[ss], %%ss    \n\t"
                   "mov %[null], %%gs  \n\t"
                   "mov %[null], %%fs  \n\t"
                   "pushl %[cs]        \n\t" // new CS
                   "lea 1f, %%eax      \n\t" // jumps to after lret
                   "pushl %%eax        \n\t" // new IP
                   "lretl              \n\t" // fake return
                   "1:                 \n\t" // we'll continue here
                   : /* No output */
                   : [null] "r" (0),
                     [ss] "r" (GSEL(KSTACK_SEL, SEL_KPL)),
                     [cs] "i" (GSEL(KCODE_SEL, SEL_KPL)),
                     [ds] "r" (GSEL(USTACK_SEL, SEL_UPL))
                   : "eax"
                   );
    // Complete setup of TSS descriptor (by inserting base address of TSS)
    gdt[TSS_SEL].tss.lo_base = ptss & 0xffffff;
    gdt[TSS_SEL].tss.hi_base = (ptss >> 24) & 0xff;

    // Complete setup of TSS
    tss.esp0 = (lvaddr_t)&x86_32_kernel_stack[X86_32_KERNEL_STACK_SIZE / sizeof(uintptr_t)];
    tss.ss0 = GSEL(KSTACK_SEL, SEL_KPL);

    // Load task state register
    __asm volatile("ltr %%ax" :: "a" (GSEL(TSS_SEL, SEL_KPL)));
}
/**
 * \brief Relocates the active stack.
 *
 * This function relocates the stack, by adding 'offset' to the stack
 * pointer.
 *
 * \param offset        Offset to add to the stack pointer.
 */
static inline void __attribute__ ((always_inline))
relocate_stack(lvaddr_t offset)
{
    __asm volatile("add %[stack], %%esp\n\t"
                   :
                   : [stack] "g" (offset)
                   );
}
/**
 * \brief Enable SYSCALL/SYSRET fast system calls.
 *
 * This function enables the SYSCALL/SYSRET pair of fast system calls. It
 * also sets the IA32_STAR and IA32_FMASK MSRs to the user-space base
 * selector and EFLAGS mask for SYSCALL/SYSRET fast system calls.
 */
static inline void enable_fast_syscalls(void)
{
    // Set IA32_STAR MSR to point to user-space base selector
    wrmsr(MSR_IA32_STAR, SYSCALL_STAR);

    // Set IA32_LSTAR MSR to point to kernel-space system call multiplexer
    wrmsr(MSR_IA32_LSTAR, (lvaddr_t)syscall_entry);

    // Set IA32_FMASK MSR to our OS's EFLAGS mask
    wrmsr(MSR_IA32_FMASK, SYSCALL_FMASK);

    // Enable fast system calls
    addmsr(MSR_IA32_EFER, IA32_EFER_SCE);
}
#define CR0_CD  (1 << 30)
#define CR0_NW  (1 << 29)
#define CR0_PG  (1 << 31)
#define CR4_MPE (1 << 11)
#define CR4_PCE (1 << 8)
#define CR4_PGE (1 << 7)
#define CR4_PAE (1 << 5)
#define CR4_PSE (1 << 4)
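/*
 * For reference: CR0.CD/CR0.NW are the cache-disable/not-write-through
 * bits and CR0.PG turns on paging; CR4.PCE permits RDPMC in user mode,
 * CR4.PGE enables global pages, and CR4.PAE/CR4.PSE select the extended
 * page-table formats used by the boot tables above.
 */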
static inline void enable_user_rdpmc(void)
{
    uint32_t cr4;

    __asm volatile("mov %%cr4, %[cr4]" : [cr4] "=r" (cr4));
    cr4 |= CR4_PCE;
    __asm volatile("mov %[cr4], %%cr4" :: [cr4] "r" (cr4));
}
static inline void enable_tlb_flush_filter(void)
{
    uint32_t eax, ebx, ecx, edx;

    // Must read "AuthenticAMD"
    cpuid(0, &eax, &ebx, &ecx, &edx);
    if(ebx != 0x68747541 || ecx != 0x444d4163 || edx != 0x69746e65) {
        return;
    }

    // Is at least family 0fh?
    cpuid(1, &eax, &ebx, &ecx, &edx);
    if(((eax >> 8) & 0xf) != 0xf) {
        return;
    }

    debug(SUBSYS_STARTUP, "Enabling TLB flush filter\n");
    uint64_t hwcr = rdmsr(MSR_AMD_HWCR);
    hwcr &= ~AMD_HWCR_FFDIS;
    wrmsr(MSR_AMD_HWCR, hwcr);
}
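/*
 * For reference: CPUID leaf 0 returns the vendor string in EBX, EDX, ECX
 * ("Auth", "enti", "cAMD" as the little-endian constants above), and the
 * family field tested against 0fh sits in bits 11:8 of leaf 1's EAX. The
 * flush filter is enabled by clearing the FFDIS (flush-filter disable)
 * bit in the AMD HWCR MSR.
 */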
static inline void enable_pge(void)
{
    uint32_t cr4;

    __asm volatile("mov %%cr4, %[cr4]" : [cr4] "=r" (cr4));
    cr4 |= CR4_PGE;
    __asm volatile("mov %[cr4], %%cr4" :: [cr4] "r" (cr4));
}

static inline void enable_pae(void)
{
    uint32_t cr4;

    __asm volatile("mov %%cr4, %[cr4]" : [cr4] "=r" (cr4));
    cr4 |= CR4_PAE;
    __asm volatile("mov %[cr4], %%cr4" :: [cr4] "r" (cr4));
}

static inline void enable_pse(void)
{
    uint32_t cr4;

    __asm volatile("mov %%cr4, %[cr4]" : [cr4] "=r" (cr4));
    cr4 |= CR4_PSE;
    __asm volatile("mov %[cr4], %%cr4" :: [cr4] "r" (cr4));
}

static inline void enable_pg(void)
{
    uint32_t cr0;

    __asm volatile("mov %%cr0, %[cr0]" : [cr0] "=r" (cr0));
    cr0 |= CR0_PG;
    __asm volatile("mov %[cr0], %%cr0" :: [cr0] "r" (cr0));
}
static inline void enable_monitor_mwait(void)
{
    uint32_t eax, ebx, ecx, edx;

    cpuid(1, &eax, &ebx, &ecx, &edx);

    if (ecx & (1 << 3)) {
        cpuid(5, &eax, &ebx, &ecx, &edx);
        debug(SUBSYS_STARTUP, "MONITOR/MWAIT supported: "
              "min size %"PRIu32" bytes, max %"PRIu32" bytes. %s %s\n",
              eax, ebx, (ecx & 2) ? "IBE" : "", (ecx & 1) ? "EMX" : "");
    }
}
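/*
 * For reference: CPUID.1:ECX bit 3 advertises MONITOR/MWAIT; leaf 5 then
 * reports the smallest and largest monitor-line sizes in EAX and EBX, and
 * its ECX enumerates the EMX and IBE extensions tested above.
 */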
/**
 * \brief Continue kernel initialization in kernel address space.
 *
 * This function resets paging to map out low memory and map in physical
 * address space, relocating all remaining data structures. It resets the
 * Global Descriptor Table for flat mode and to exclude legacy segments from
 * boot initialization code. It sets up the IDT for exception and interrupt
 * handling, initializes the local APIC and enables interrupts. After that it
 * calls kernel_startup(), which should not return (if it does, this function
 * halts the kernel).
 */
static void __attribute__ ((noreturn, noinline)) text_init(void)
{
    // Relocate global to "memory"
    global = (struct global*)local_phys_to_mem((lpaddr_t)addr_global);

    // Relocate glbl_core_data to "memory"
    glbl_core_data = (struct x86_core_data *)
        local_phys_to_mem((lpaddr_t)glbl_core_data);

    // Map out low memory
    paging_x86_32_reset();

    /*
     * Use the new physical address space for video memory -- no function
     * that ends up calling a conio.c function may be called between
     * paging_reset() and conio_relocate_vidmem()!
     */
    conio_relocate_vidmem(local_phys_to_mem(VIDEO_MEM));

    /*
     * Also reset the global descriptor table (GDT), so we get
     * segmentation again and can catch interrupts/exceptions (the IDT
     * is set up below).
     */
    gdt_reset();

    // Arch-independent early startup
    kernel_startup_early();

    // XXX: re-init the serial driver, in case the port changed after parsing args
    serial_console_init(false);

    // Setup IDT
    setup_default_idt();
    idt_initialized = true;

    // Initialize local APIC
    apic_init();

    // do not remove/change this printf: needed by regression harness
    printf("Barrelfish CPU driver starting on x86_32 core %u\n", apic_id);

    // Initialize classic (8259A) PIC
    pic_init();

    // Initialize real-time clock
    rtc_init();

    // Initialize local APIC timer
    if (kernel_ticks_enabled) {
        timing_calibrate();
        apic_timer_init(false, true);
        timing_apic_timer_set_ms(kernel_timeslice);
    } else {
        printk(LOG_WARN, "APIC timer disabled: NO timeslicing\n");
    }
    // Initialize IPI notification mechanism
    ipi_notify_init();

    // Enable SYSCALL/SYSRET fast system calls
    /* enable_fast_syscalls(); */

    // Enable "no execute" page-level protection bit
    addmsr(MSR_IA32_EFER, IA32_EFER_NXE);
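    /*
     * Note: the NX bit only exists in the PAE page-table entry format, so
     * EFER.NXE takes effect here only when the kernel is built with
     * CONFIG_PAE.
     */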
    // Enable FPU and MMX
    enable_fpu();

    // Enable user-mode RDPMC opcode
    enable_user_rdpmc();

    // AMD64: enable the TLB flush filter if supported
    enable_tlb_flush_filter();

    // Enable global pages
    enable_pge();

    // Check/enable MONITOR/MWAIT opcodes
    enable_monitor_mwait();

    // Call main kernel startup function -- this should never return
    kernel_startup();

    halt();
    // Returning here will crash! -- low pages not mapped anymore!
}
/**
 * \brief Architecture-specific initialization function.
 *
 * This function is called by the bootup code in boot.S to initialize
 * architecture-specific stuff. It is expected to call the kernel main
 * loop. This function never returns.
 *
 * The kernel expects one of two magic values in 'magic' that determine how it
 * has been booted. If 'magic' is #MULTIBOOT_INFO_MAGIC the kernel has been
 * booted by a (Multiboot-compliant) bootloader and this is the first image on
 * the boot CPU. It will relocate itself to a default position. If 'magic' is
 * #KERNEL_BOOT_MAGIC it has been booted by another image of itself and is
 * running on a (so-called) application CPU. It expects 'dest' to be a physical
 * address pointing to the base of a memory area to relocate itself to.
 *
 * After performing some sanity checks on the kernel image, this function
 * relocates the kernel to its high-memory alias and then calls text_init(),
 * at the offset of the aliased copy, to continue initialization there.
 * text_init() should never return.
 *
 * For BSP kernels, the void pointer is of type multiboot_info; for application
 * CPUs, it is of type global. Global carries a pointer to multiboot_info.
 * Global also contains pointers to memory that is shared between kernels.
 *
 * \param magic         Boot magic value
 * \param pointer       Pointer to Multiboot Info or to Global structure
 */
void arch_init(uint32_t magic, void *pointer)
{
    // Sanitize the screen
    conio_cls();

    // Initialize serial, only initialize HW if we are the first kernel
    serial_console_init((magic == MULTIBOOT_INFO_MAGIC));

    /* determine page-aligned physical address past end of multiboot */
    lvaddr_t dest = (lvaddr_t)&_start_kernel;
    if (dest & (BASE_PAGE_SIZE - 1)) {
        dest &= ~(BASE_PAGE_SIZE - 1);
        dest += BASE_PAGE_SIZE;
    }

    // XXX: print kernel address for debugging with gdb
    printf("Kernel starting at address 0x%"PRIxLVADDR"\n",
           local_phys_to_mem(dest));

    void __attribute__ ((noreturn)) (*reloc_text_init)(void) =
        (void *)local_phys_to_mem((lpaddr_t)text_init);
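    /*
     * text_init() is called through this translated pointer so that
     * execution continues at the kernel's high-memory alias once the
     * relocation below is done, allowing the low identity mapping to be
     * discarded there.
     */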
    struct Elf32_Shdr *rela, *symtab;
    struct x86_coredata_elf *elf;
    /*
     * If this is the boot image, make the Multiboot information structure
     * globally known. Otherwise the passed value should equal the original
     * structure. If the magic value does not match what we expect, we cannot
     * proceed safely.
     */
    switch (magic) {
    case MULTIBOOT_INFO_MAGIC:
        {
            struct multiboot_info *mb = (struct multiboot_info *)pointer;

            elf = (struct x86_coredata_elf *)&mb->syms.elf;
            // We need the ELF section header table for relocation
            if (!(mb->flags & MULTIBOOT_INFO_FLAG_HAS_ELF_SYMS)) {
                panic("Multiboot information structure does not include ELF"
                      " section header information -- relocation impossible!");
            }
            assert(mb->flags & MULTIBOOT_INFO_FLAG_HAS_MMAP);

            // Determine where free RAM starts
            memset(glbl_core_data, 0, sizeof(struct x86_core_data));
            glbl_core_data->start_free_ram =
                ROUND_UP(max(multiboot_end_addr(mb), (uintptr_t)&_end_kernel),
                         BASE_PAGE_SIZE);

            glbl_core_data->mods_addr = mb->mods_addr;
            glbl_core_data->mods_count = mb->mods_count;
            glbl_core_data->cmdline = mb->cmdline;
            glbl_core_data->mmap_length = mb->mmap_length;
            glbl_core_data->mmap_addr = mb->mmap_addr;
        }
        break;
    case KERNEL_BOOT_MAGIC:
        global = (struct global*)pointer;
        // Store the address of global to retrieve it across relocation
        addr_global = (uint32_t)global;
        memset(&global->locks, 0, sizeof(global->locks));

        struct x86_core_data *core_data =
            (struct x86_core_data*)(dest - BASE_PAGE_SIZE);
        glbl_core_data = core_data;
        glbl_core_data->cmdline = (lpaddr_t)&core_data->kernel_cmdline;
        my_core_id = core_data->dst_core_id;
        kcb_current = (struct kcb *)(lpaddr_t)glbl_core_data->kcb;
        elf = &core_data->elf;
        break;

    default:
        panic("Magic value does not match! (0x%x != 0x%"PRIx32" != 0x%x)",
              KERNEL_BOOT_MAGIC, magic, MULTIBOOT_INFO_MAGIC);
    }
    if(magic != KERNEL_BOOT_MAGIC) {
        // Construct the global structure and store its address to retrieve
        // it across relocation
        memset(&global->locks, 0, sizeof(global->locks));
        addr_global = (uint32_t)global;
    }
    // We're only able to process section headers of the size we were
    // compiled against
    if (elf->size != sizeof(struct Elf32_Shdr)) {
        panic("ELF section header entry size mismatch!");
    }

    // Find relocation section
    rela = elf32_find_section_header_type((struct Elf32_Shdr *)
                                          (lpaddr_t)elf->addr,
                                          elf->num, SHT_REL);
    if (rela == NULL) {
        panic("Kernel image does not include relocation section!");
    }

    // Find symbol table section
    symtab = elf32_find_section_header_type((struct Elf32_Shdr *)
                                            (lpaddr_t)elf->addr,
                                            elf->num, SHT_DYNSYM);
    if (symtab == NULL) {
        panic("Kernel image does not include symbol table!");
    }

    // Kernel has to fit in the mappable area
    assert((lvaddr_t)&_end_kernel < X86_32_PADDR_SPACE_LIMIT);
    // Set up the bootup page table and map an alias at MEMORY_OFFSET
    paging_init();

#ifdef CONFIG_PAE
    // Put CPU in PAE mode
    enable_pae();
#elif defined(CONFIG_PSE)
    // Enable page-size extensions
    enable_pse();
#endif

    // Enable paging
    enable_pg();
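    /*
     * Note the ordering: paging_init() installs the boot tables (via
     * paging_x86_32_context_switch()) while translation is still off,
     * CR4.PAE/CR4.PSE select the page-table format, and only setting
     * CR0.PG in enable_pg() actually turns paging on.
     */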
    // Relocate the kernel image to the top of memory
    elf32_relocate(X86_32_MEMORY_OFFSET + (lvaddr_t)&_start_kernel,
                   (lvaddr_t)&_start_kernel,
                   (struct Elf32_Rel *)(rela->sh_addr - X86_32_START_KERNEL_PHYS + &_start_kernel),
                   rela->sh_size,
                   (struct Elf32_Sym *)(symtab->sh_addr - X86_32_START_KERNEL_PHYS + &_start_kernel),
                   symtab->sh_size,
                   X86_32_START_KERNEL_PHYS, &_start_kernel);
    /*** Aliased kernel available now -- low memory still mapped ***/

    // Relocate stack to the aliased location
    relocate_stack(X86_32_MEMORY_OFFSET);

    // Call aliased text_init() function and continue initialization
    reloc_text_init();
}