7 * Copyright (c) 2009, 2010, ETH Zurich.
10 * This file is distributed under the terms in the attached LICENSE file.
11 * If you do not find this file, copies can be found by writing to:
12 * ETH Zurich D-INFK, Universitaetstrasse 6, CH-8092 Zurich. Attn: Systems Group.
18 #include <barrelfish/lmp_endpoints.h>
31 #include "pci_devices.h"
32 #include "pci_ethernet.h"
33 #include <driverkit/hwmodel.h>
34 #include <driverkit/iommu.h>
37 #define VMCB_SIZE 0x1000 // 4KB
40 #define IOPM_SIZE 0x3000 // 12KB
41 #define MSRPM_SIZE 0x2000 // 8KB
43 #define IOBMP_A_SIZE 0x1000 // 4KB
44 #define IOBMP_B_SIZE 0x1000 // 4KB
45 #define MSRPM_SIZE 0x1000 // 4KB
48 #define RM_MEM_SIZE (0x100000 + BASE_PAGE_SIZE) // 1MB + A20 gate space
50 #define APIC_BASE 0xfee00000
52 #define SERIAL_DRIVER "serial0.raw"
55 extern uint16_t saved_exit_reason;
56 extern uint64_t saved_exit_qual, saved_rip;
58 // List of MSRs that are saved on VM-exit and loaded on VM-entry.
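// These are syscall-related MSRs (KERNEL_GS_BASE, STAR, LSTAR, CSTAR, SFMASK) that have no
// dedicated guest-state fields in the VMCS, so they are swapped through the MSR store/load area.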
59 static uint32_t msr_list[VMX_MSR_COUNT] =
60 {X86_MSR_KERNEL_GS_BASE, X86_MSR_STAR, X86_MSR_LSTAR, X86_MSR_CSTAR, X86_MSR_SFMASK};
62 // Saved priority of the most recently asserted IRQ.
63 uint8_t interrupt_priority = 0;
67 static inline int vmx_guest_msr_index(uint32_t msr_index)
69 for (int i = 0; i < VMX_MSR_COUNT; i++) {
70 if (msr_list[i] == msr_index) {
77 static void initialize_guest_msr_area(struct guest *g)
79 struct msr_entry *guest_msr_area = (struct msr_entry *)g->msr_area_va;
81 // The values of the MSRs in the guest MSR area are all set to 0.
82 for (int i = 0; i < VMX_MSR_COUNT; i++) {
83 guest_msr_area[i].index = msr_list[i];
84 guest_msr_area[i].val = 0x0;
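// Note: the same frame serves as both the VM-exit MSR-store area and the VM-entry MSR-load
// area (both fields below point to msr_area_pa), so values the guest wrote before an exit
// are restored on the next entry.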
87 errval_t err = invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXIT_MSR_STORE_F, g->msr_area_pa);
88 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXIT_MSR_STORE_CNT, VMX_MSR_COUNT);
89 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_MSR_LOAD_F, g->msr_area_pa);
90 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_MSR_LOAD_CNT, VMX_MSR_COUNT);
91 assert(err_is_ok(err));
95 lvaddr_t guest_offset = 0;
96 static struct guest __guest;
97 static struct guest *__guestp = NULL;
100 /// stores the last used guest ASID
101 static uint32_t last_guest_asid = 0;
104 // FIXME: this is somewhat broken by design... we should emit proper exceptions
105 // to the guest as opposed to just halting the VM
106 #define guest_assert(g, e) \
107 ((e) ? (void)0 : (handle_vmexit_unhandeled(g), assert(e)))
110 guest_slot_alloc(struct guest *g, struct capref *ret)
112 return g->slot_alloc.a.alloc(&g->slot_alloc.a, ret);
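// Maps 'frame' at the fixed address 'vaddr' in the given vspace: the frame is wrapped in a
// one-frame memobj and a fixed vregion, and the mapping is faulted in eagerly so the guest
// memory is backed immediately.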
115 errval_t guest_vspace_map_wrapper(struct vspace *vspace, lvaddr_t vaddr,
116 struct capref frame, size_t size)
119 struct vregion *vregion = NULL;
120 struct memobj_one_frame *memobj = NULL;
123 vregion = malloc(sizeof(struct vregion));
125 err = LIB_ERR_MALLOC_FAIL;
128 memobj = malloc(sizeof(struct memobj_one_frame));
130 err = LIB_ERR_MALLOC_FAIL;
134 // Create the objects
135 err = memobj_create_one_frame(memobj, size, 0);
136 if (err_is_fail(err)) {
137 err = err_push(err, LIB_ERR_MEMOBJ_CREATE_ANON);
140 err = memobj->m.f.fill(&memobj->m, 0, frame, size);
141 if (err_is_fail(err)) {
142 err = err_push(err, LIB_ERR_MEMOBJ_FILL);
145 err = vregion_map_fixed(vregion, vspace, &memobj->m, 0, size, vaddr,
146 VREGION_FLAGS_READ | VREGION_FLAGS_WRITE | VREGION_FLAGS_EXECUTE);
147 if (err_is_fail(err)) {
148 err = LIB_ERR_VSPACE_MAP;
151 err = memobj->m.f.pagefault(&memobj->m, vregion, 0, 0);
152 if (err_is_fail(err)) {
153 err = err_push(err, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
159 error: // XXX: proper cleanup
170 #define GUEST_VSPACE_SIZE 1073741824UL // 1GB
172 #define GUEST_VSPACE_SIZE (1ul<<32) // 4GB
174 static errval_t vspace_map_wrapper(lvaddr_t vaddr, struct capref frame,
178 static struct memobj_anon *memobj = NULL;
179 static struct vregion *vregion = NULL;
180 static bool initialized = false;
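// The static memobj/vregion are set up only once: they reserve GUEST_VSPACE_SIZE of the
// monitor's own vspace, and guest_offset records the base of that window so guest-physical
// addresses can be translated with guest_to_host().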
184 memobj = malloc(sizeof(struct memobj_anon));
186 return LIB_ERR_MALLOC_FAIL;
188 vregion = malloc(sizeof(struct vregion));
190 return LIB_ERR_MALLOC_FAIL;
193 // Create a memobj and vregion
194 err = memobj_create_anon(memobj, GUEST_VSPACE_SIZE, 0);
195 if (err_is_fail(err)) {
196 return err_push(err, LIB_ERR_MEMOBJ_CREATE_ANON);
198 err = vregion_map(vregion, get_current_vspace(), &memobj->m, 0,
199 GUEST_VSPACE_SIZE, VREGION_FLAGS_READ_WRITE);
200 if (err_is_fail(err)) {
201 return err_push(err, LIB_ERR_VREGION_MAP);
204 guest_offset = vregion_get_base_addr(vregion);
209 err = memobj->m.f.fill(&memobj->m, vaddr, frame, size);
210 if (err_is_fail(err)) {
211 return err_push(err, LIB_ERR_MEMOBJ_FILL);
213 err = memobj->m.f.pagefault(&memobj->m, vregion, vaddr, 0);
214 if (err_is_fail(err)) {
215 return err_push(err, LIB_ERR_MEMOBJ_PAGEFAULT_HANDLER);
220 // allocates 'bytes' of memory for the guest starting at guest-physical address 'guest_paddr'
221 // and also maps it into the vspace of the monitor
223 alloc_guest_mem(struct guest *g, lvaddr_t guest_paddr, size_t bytes)
227 // only allow multiples of the page size to be allocated
228 assert(bytes > 0 && (bytes & BASE_PAGE_MASK) == 0);
229 // do not allow allocation outside of the guest's physical memory
230 assert(guest_paddr + bytes <= g->mem_high_va);
236 int32_t node_id_self = driverkit_hwmodel_get_my_node_id();
237 int32_t node_id_ram = driverkit_hwmodel_lookup_dram_node_id();
238 int32_t nodes_data[] = {node_id_self, 0};
240 err = driverkit_hwmodel_frame_alloc(&cap, bytes, node_id_ram, nodes_data);
241 if (err_is_fail(err)) {
246 if (err_is_fail(err)) {
247 return err_push(err, LIB_ERR_SLOT_ALLOC);
249 err = frame_create(cap, bytes, NULL);
250 if (err_is_fail(err)) {
251 return err_push(err, LIB_ERR_FRAME_CREATE);
255 // Map into the guest vspace
256 err = guest_vspace_map_wrapper(&g->vspace, guest_paddr, cap, bytes);
257 if (err_is_fail(err)) {
261 // Create a copy of the capability to map in our vspace
262 struct capref host_cap;
263 err = slot_alloc(&host_cap);
264 if (err_is_fail(err)) {
267 err = cap_copy(host_cap, cap);
268 if (err_is_fail(err)) {
272 // Map into my vspace
273 err = vspace_map_wrapper(guest_to_host(guest_paddr), host_cap, bytes);
274 if (err_is_fail(err)) {
278 struct frame_identity frameid = { .base = 0, .bytes = 0 };
279 errval_t r = frame_identify(cap, &frameid);
280 assert(err_is_ok(r));
281 VMKIT_PCI_DEBUG("alloc_guest_mem: frameid.base: 0x%lx, frameid.bytes: %zd, "
282 "g->mem_low_va: 0x%lx, g->mem_high_va: 0x%lx\n",
283 frameid.base, frameid.bytes, g->mem_low_va, g->mem_high_va);
289 initialize_iopm (struct guest *self) {
290 // intercept all IO port accesses (for now)
292 memset((void*)self->iopm_va, 0xFF, IOPM_SIZE);
294 memset((void*)self->iobmp_a_va, 0xFF, IOBMP_A_SIZE);
295 memset((void*)self->iobmp_b_va, 0xFF, IOBMP_B_SIZE);
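// Layout of the MSR permission map (per the AMD manual, mirrored by the offsets used below):
// byte offset 0x0 covers MSRs 0x0-0x1fff, offset 0x800 covers 0xc0000000-0xc0001fff and
// offset 0x1000 covers 0xc0010000-0xc0011fff, with two bits (read/write) per MSR.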
299 // access_mode: 0 allow all access, 1 intercept reads, 2 intercept writes, 3 intercept all
301 set_msr_access (struct guest *g, uint32_t msr, int access_mode)
303 assert(access_mode >= 0 && access_mode <= 3);
305 // a region of 2K bytes represents the access bits of 8K MSRs; each
306 // MSR therefore takes two bits (one for rdmsr and one for wrmsr)
307 uintptr_t byte_offset = (msr & 0xffff) / 4;
308 int bit_offset = ((msr & 0xffff) % 4) * 2;
312 } else if (msr >= 0xc0000000 && msr < 0xc0002000) {
313 byte_offset += 0x800;
314 } else if (msr >= 0xc0010000 && msr < 0xc0012000) {
315 byte_offset += 0x1000;
317 assert(!"not reached");
320 assert(byte_offset < MSRPM_SIZE);
322 // read the byte holding the relevant bits
323 uint8_t val = *(uint8_t *)(g->msrpm_va + byte_offset);
324 // set the access params according to the arguments
325 val = (val & ~(0x3 << bit_offset)) | (access_mode << bit_offset);
326 // store the modified value back in the map
327 *(uint8_t *)(g->msrpm_va + byte_offset) = val;
329 //printf("MSR: msr %x, byte_offset %lx, bit_offset %x, val %x\n", msr, byte_offset, bit_offset, val);
333 initialize_msrpm (struct guest *g) {
334 // intercept all MSR accesses (for now)
335 memset((void*)g->msrpm_va, 0xff, MSRPM_SIZE);
337 // allow accesses to the performance counter and event-select MSRs
338 set_msr_access (g, 0xc0010000, 0);
339 set_msr_access (g, 0xc0010001, 0);
340 set_msr_access (g, 0xc0010002, 0);
341 set_msr_access (g, 0xc0010003, 0);
342 set_msr_access (g, 0xc0010004, 0);
343 set_msr_access (g, 0xc0010005, 0);
344 set_msr_access (g, 0xc0010006, 0);
345 set_msr_access (g, 0xc0010007, 0);
349 #define INIT_DATA_SEGREG(vmcb,x) \
351 amd_vmcb_seg_attrib_t __sa = { \
356 amd_vmcb_##x## _attrib_wr((vmcb), __sa); \
357 amd_vmcb_##x## _selector_wr((vmcb), 0x0); \
358 amd_vmcb_##x## _base_wr((vmcb), 0x0); \
359 amd_vmcb_##x## _limit_wr((vmcb), 0xffff); \
362 #define INIT_CODE_SEGREG(vmcb,x) \
364 amd_vmcb_seg_attrib_t __sa = { \
369 amd_vmcb_##x## _attrib_wr((vmcb), __sa); \
370 amd_vmcb_##x## _selector_wr((vmcb), 0xf000); \
371 amd_vmcb_##x## _base_wr((vmcb), 0xffff0000); \
372 amd_vmcb_##x## _limit_wr((vmcb), 0xffff); \
375 #define INIT_SYS_SEGREG(vmcb,x) \
377 amd_vmcb_seg_attrib_t __sa = { \
381 amd_vmcb_##x## _attrib_wr((vmcb), __sa); \
382 amd_vmcb_##x## _selector_wr((vmcb), 0x0); \
383 amd_vmcb_##x## _base_wr((vmcb), 0x0); \
384 amd_vmcb_##x## _limit_wr((vmcb), 0xffff); \
387 /* This method initializes a new VMCB memory region and sets the initial
388 * machine state as defined by the AMD64 architecture specification */
391 initialize_vmcb (struct guest *self) {
392 amd_vmcb_initialize(&self->vmcb, (mackerel_addr_t)self->vmcb_va);
394 // 1. Initialize intercepts
396 /* For now we just intercept everything */
398 amd_vmcb_cr_access_wr_raw(&self->vmcb, ~0u);
399 amd_vmcb_cr_access_rdcr2_wrf(&self->vmcb, 0);
400 amd_vmcb_cr_access_wrcr2_wrf(&self->vmcb, 0);
401 amd_vmcb_cr_access_rdcr4_wrf(&self->vmcb, 0);
402 amd_vmcb_cr_access_wrcr4_wrf(&self->vmcb, 0);
404 // FIXME: ignoring DR accesses may be insecure
405 //amd_vmcb_dr_access_wr_raw(&self->vmcb, ~0u);
406 amd_vmcb_exceptions_wr_raw(&self->vmcb, ~0u);
407 amd_vmcb_exceptions_vector7_wrf(&self->vmcb, 0);
408 amd_vmcb_exceptions_vector14_wrf(&self->vmcb, 0);
410 amd_vmcb_intercepts_wr_raw(&self->vmcb, 0x1fffffffffff);
411 amd_vmcb_intercepts_pushf_wrf(&self->vmcb, 0);
412 amd_vmcb_intercepts_popf_wrf(&self->vmcb, 0);
413 amd_vmcb_intercepts_invlpg_wrf(&self->vmcb, 0);
414 amd_vmcb_intercepts_rdtsc_wrf(&self->vmcb, 0);
415 amd_vmcb_intercepts_rdtscp_wrf(&self->vmcb, 0);
416 amd_vmcb_intercepts_iret_wrf(&self->vmcb, 0);
417 amd_vmcb_intercepts_wbinvd_wrf(&self->vmcb, 0);
418 amd_vmcb_intercepts_pause_wrf(&self->vmcb, 0);
419 amd_vmcb_intercepts_vintr_wrf(&self->vmcb, 0);
421 // 2. Setup some config fields
423 // physical addresses of the IOPM and the MSRPM
424 amd_vmcb_iopm_base_pa_wr(&self->vmcb, self->iopm_pa);
425 amd_vmcb_msrpm_base_pa_wr(&self->vmcb, self->msrpm_pa);
427 // FIXME: use real asid allocator. BF does not know about tagged TLBs atm
428 amd_vmcb_tlb_guest_asid_wrf(&self->vmcb, ++last_guest_asid);
429 // enable virtual intr masking
430 amd_vmcb_vintr_vintr_masking_wrf(&self->vmcb, 1);
431 // enable nested paging
432 amd_vmcb_np_enable_wrf(&self->vmcb, 1);
434 /* 3. Guest state initialization
435 * according to Intel's Manual 3A: Table 9-1. */
437 // Bit 1 of RFLAGS is reserved and must be set; we also set the ID flag (bit 21)
438 // to indicate that the CPUID instruction is supported.
439 amd_vmcb_rflags_wr_raw(&self->vmcb, 0x00200002);
440 amd_vmcb_rip_wr(&self->vmcb, 0x0000fff0);
441 amd_vmcb_cr0_wr_raw(&self->vmcb, 0x60000010);
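// RIP 0xfff0 combined with the CS base 0xffff0000 set below yields the usual reset vector
// 0xfffffff0; CR0 0x60000010 (ET, CD and NW set) is the architected reset value.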
443 INIT_CODE_SEGREG(&self->vmcb, cs);
444 INIT_DATA_SEGREG(&self->vmcb, ss);
445 INIT_DATA_SEGREG(&self->vmcb, ds);
446 INIT_DATA_SEGREG(&self->vmcb, es);
447 INIT_DATA_SEGREG(&self->vmcb, fs);
448 INIT_DATA_SEGREG(&self->vmcb, gs);
450 INIT_SYS_SEGREG(&self->vmcb, gdtr);
451 INIT_SYS_SEGREG(&self->vmcb, idtr);
452 INIT_SYS_SEGREG(&self->vmcb, ldtr);
453 INIT_SYS_SEGREG(&self->vmcb, tr);
455 amd_vmcb_dr6_wr(&self->vmcb, 0xffff0ff0);
456 amd_vmcb_dr7_wr(&self->vmcb, 0x00000400);
458 // taken from the linux SVM source
459 amd_vmcb_gpat_wr(&self->vmcb, 0x0007040600070406ul);
461 // svm requires guest EFER.SVME to be set
462 amd_vmcb_efer_svme_wrf(&self->vmcb, 1);
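// LMP handler invoked when the kernel signals a VM-exit on the monitor endpoint: it consumes
// the (empty) message, dispatches the exit, and re-registers itself for the next notification.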
467 idc_handler(void *arg)
469 struct guest *g = arg;
473 struct lmp_recv_buf buf = { .buflen = 0 };
474 err = lmp_endpoint_recv(g->monitor_ep, &buf, NULL);
475 assert(err_is_ok(err));
478 guest_handle_vmexit(g);
481 struct event_closure cl = {
482 .handler = idc_handler,
485 err = lmp_endpoint_register(g->monitor_ep, get_default_waitset(), cl);
486 assert(err_is_ok(err));
489 /* This method duplicates some code from spawndomain since we need to spawn very
492 spawn_guest_domain (struct guest *self) {
495 // create the guest virtual address space
496 struct capref vnode_cap;
497 err = guest_slot_alloc(self, &vnode_cap);
498 assert(err_is_ok(err));
499 err = vnode_create(vnode_cap, ObjType_VNode_x86_64_pml4);
500 assert(err_is_ok(err));
502 struct pmap *pmap = malloc(sizeof(struct pmap_x86));
504 err = pmap_x86_64_init(pmap, &self->vspace, vnode_cap, NULL);
505 assert(err_is_ok(err));
506 err = vspace_init(&self->vspace, pmap);
507 assert(err_is_ok(err));
510 err = guest_slot_alloc(self, &self->dcb_cap);
511 assert(err_is_ok(err));
512 err = dispatcher_create(self->dcb_cap);
513 assert(err_is_ok(err));
516 struct capref ep_cap;
518 // use minimum-sized endpoint, because we don't need to buffer >1 vmexit
519 err = endpoint_create(LMP_RECV_LENGTH, &ep_cap, &self->monitor_ep);
520 assert(err_is_ok(err));
522 // register to receive on this endpoint
523 struct event_closure cl = {
524 .handler = idc_handler,
527 err = lmp_endpoint_register(self->monitor_ep, get_default_waitset(), cl);
528 assert(err_is_ok(err));
531 err = invoke_dispatcher_setup_guest(self->dcb_cap, ep_cap, vnode_cap,
532 self->vmcb_cap, self->ctrl_cap);
533 assert(err_is_ok(err));
536 initialize_guest_msr_area(self);
539 err += invoke_dispatcher_vmwrite(self->dcb_cap, VMX_IOBMP_A_F, self->iobmp_a_pa);
540 err += invoke_dispatcher_vmwrite(self->dcb_cap, VMX_IOBMP_B_F, self->iobmp_b_pa);
541 err += invoke_dispatcher_vmwrite(self->dcb_cap, VMX_MSRBMP_F, self->msrpm_pa);
542 assert(err_is_ok(err));
544 // set up the guest's physical address space
545 self->mem_low_va = 0;
546 // FIXME: Hardcoded guest memory size
547 // allocate the memory used for real mode
548 // This is not 100% necessary since one could also catch the pagefaults.
549 // If we allocate the whole memory at once we use fewer caps and reduce
550 // the risk of running out of CSpace.
552 self->mem_high_va = 0x80000000;
553 err = alloc_guest_mem(self, 0x0, 0x80000000);
555 self->mem_high_va = GUEST_VSPACE_SIZE;
556 err = alloc_guest_mem(self, 0x0, GUEST_VSPACE_SIZE);
558 assert_err(err, "alloc_guest_mem");
562 install_grub_stage2 (struct guest *g, void *img, size_t img_size)
566 /* the grub image goes to 0x8000 according to
567 * http://www.gnu.org/software/grub/manual/html_node/Memory-map.html */
568 memcpy((void *)(guest_to_host(g->mem_low_va + 0x8000)), img, img_size);
569 // according to the grub stage2 source, its entry point is at 0x8200
571 amd_vmcb_rip_wr(&g->vmcb, 0x8200);
572 // switch to the first segment
573 amd_vmcb_cs_selector_wr(&g->vmcb, 0x0);
574 amd_vmcb_cs_base_wr(&g->vmcb, 0x0);
575 amd_vmcb_cs_limit_wr(&g->vmcb, 0xffff);
577 errval_t err = invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, 0x8200);
578 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CS_SEL, 0x0);
579 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CS_BASE, 0x0);
580 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CS_LIM, 0xffff);
581 assert(err_is_ok(err));
588 install_debug_app (struct guest *g)
590 //static uint8_t app[] = { 0xcd, 0x20 };
591 static uint8_t app[] = { 0xcd, 0x20, 0x90, 0x90, 0x90, 0x90, 0x90 };
592 memcpy((void *)g->rm_mem_va, app, sizeof(app));
593 amd_vmcb_rip_wr(&g->vmcb, 0x0);
594 // disable nested paging in real mode
595 amd_vmcb_np_enable_wrf(&g->vmcb, 0);
596 // enable paged real mode
597 //amd_vmcb_cr0_pg_wrf(&g->vmcb, 0x1);
598 //g->save_area->cr0 |= X86_CR0_PE_MASK;
599 amd_vmcb_rsp_wr(&g->vmcb, 0x1000);
600 amd_vmcb_cs_selector_wr(&g->vmcb, 0x0);
601 amd_vmcb_cs_base_wr(&g->vmcb, 0x0);
602 amd_vmcb_cs_limit_wr(&g->vmcb, 0xffff);
603 //g->save_area->cs.selector = 0x1000;
604 //g->save_area->cs.base = 0x10000;
605 //g->save_area->cs.base = 0x1ffff;
610 virq_pending (void *ud, uint8_t *irq, uint8_t *irq_prio)
614 struct guest *g = ud;
616 if (amd_vmcb_vintr_rd(&g->vmcb).virq == 1) {
619 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_ENTRY_INTR_INFO, &info);
620 assert(err_is_ok(err));
621 if (!!(info & (1UL << 31))) {
625 *irq = amd_vmcb_vintr_rd(&g->vmcb).vintr_vector;
630 if (irq_prio != NULL) {
632 *irq_prio = amd_vmcb_vintr_rd(&g->vmcb).vintr_prio;
634 *irq_prio = interrupt_priority;
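// Returns whether the guest currently accepts interrupts, i.e. whether the IF flag
// (RFLAGS bit 9) is set in the guest's RFLAGS.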
645 virq_accepting (void *ud)
649 struct guest *g = ud;
651 uint64_t guest_rflags;
652 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
653 assert(err_is_ok(err));
654 return (guest_rflags & (1UL << 9));
659 virq_handler (void *ud, uint8_t irq, uint8_t irq_prio)
663 struct guest *g = ud;
665 // tell the hw extensions that there is a virtual IRQ pending
667 amd_vmcb_vintr_virq_wrf(&g->vmcb, 1);
668 amd_vmcb_vintr_vintr_prio_wrf(&g->vmcb, irq_prio);
669 amd_vmcb_vintr_vintr_vector_wrf(&g->vmcb, irq);
670 amd_vmcb_vintr_v_ign_tpr_wrf(&g->vmcb, 1);
672 uint64_t guest_rflags;
673 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
674 assert(guest_rflags & (1UL << 9));
676 uint64_t info = (0 << 8 /*HWINTR*/) | (1UL << 31 /*INTR VALID*/) | irq;
677 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_ENTRY_INTR_INFO, info);
679 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_ACTIV_STATE, 0x0);
680 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_INTR_STATE, 0x0);
681 assert(err_is_ok(err));
683 interrupt_priority = irq_prio;
685 // if the guest is currently waiting then we have to restart it to make
689 guest_make_runnable(g, true);
694 guest_setup (struct guest *g)
698 // initialize the guest's slot allocator
699 err = two_level_slot_alloc_init(&g->slot_alloc);
700 assert_err(err, "two_level_slot_alloc_init");
702 struct frame_identity fi;
704 // allocate memory for the vmcb
705 err = guest_slot_alloc(g, &g->vmcb_cap);
706 assert_err(err, "guest_cspace_alloc");
707 err = frame_create(g->vmcb_cap, VMCB_SIZE, NULL);
708 assert_err(err, "frame_create");
709 err = frame_identify(g->vmcb_cap, &fi);
710 assert_err(err, "frame_identify");
711 g->vmcb_pa = fi.base;
712 err = vspace_map_one_frame_attr((void**)&g->vmcb_va, VMCB_SIZE, g->vmcb_cap,
713 VREGION_FLAGS_READ_WRITE_NOCACHE,
715 if (err_is_fail(err)) {
716 DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
720 err = frame_alloc(&g->ctrl_cap, sizeof(struct guest_control), NULL);
721 assert_err(err, "frame_alloc");
722 size_t size = ROUND_UP(sizeof(struct guest_control), BASE_PAGE_SIZE);
723 err = vspace_map_one_frame_attr((void**)&g->ctrl, size, g->ctrl_cap,
724 VREGION_FLAGS_READ_WRITE_NOCACHE,
726 if (err_is_fail(err)) {
727 DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
729 g->ctrl->num_vm_exits_with_monitor_invocation = 0;
730 g->ctrl->num_vm_exits_without_monitor_invocation = 0;
732 // allocate memory for the iopm
733 err = frame_alloc(&g->iopm_cap, IOPM_SIZE, NULL);
734 assert_err(err, "frame_alloc");
735 err = frame_identify(g->iopm_cap, &fi);
736 assert_err(err, "frame_identify");
737 g->iopm_pa = fi.base;
738 err = vspace_map_one_frame_attr((void**)&g->iopm_va, IOPM_SIZE, g->iopm_cap,
739 VREGION_FLAGS_READ_WRITE_NOCACHE,
741 if (err_is_fail(err)) {
742 DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
745 // allocate memory for I/O bitmap A
746 err = frame_alloc(&g->iobmp_a_cap, IOBMP_A_SIZE, NULL);
747 assert_err(err, "frame_alloc");
748 err = frame_identify(g->iobmp_a_cap, &fi);
749 assert_err(err, "frame_identify");
750 g->iobmp_a_pa = fi.base;
751 err = vspace_map_one_frame_attr((void**)&g->iobmp_a_va, IOBMP_A_SIZE, g->iobmp_a_cap,
752 VREGION_FLAGS_READ_WRITE_NOCACHE,
754 if (err_is_fail(err)) {
755 DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
758 // allocate memory for I/O bitmap B
759 err = frame_alloc(&g->iobmp_b_cap, IOBMP_B_SIZE, NULL);
760 assert_err(err, "frame_alloc");
761 err = frame_identify(g->iobmp_b_cap, &fi);
762 assert_err(err, "frame_identify");
763 g->iobmp_b_pa = fi.base;
764 err = vspace_map_one_frame_attr((void**)&g->iobmp_b_va, IOBMP_B_SIZE, g->iobmp_b_cap,
765 VREGION_FLAGS_READ_WRITE_NOCACHE,
767 if (err_is_fail(err)) {
768 DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
771 // allocate memory for the guest MSR store/load area
772 err = frame_alloc(&g->msr_area_cap, VMX_MSR_AREA_SIZE, NULL);
773 assert_err(err, "frame_alloc");
774 err = frame_identify(g->msr_area_cap, &fi);
775 assert_err(err, "frame_identify");
776 g->msr_area_pa = fi.base;
777 err = vspace_map_one_frame_attr((void**)&g->msr_area_va, VMX_MSR_AREA_SIZE,
779 VREGION_FLAGS_READ_WRITE_NOCACHE,
781 if (err_is_fail(err)) {
782 DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
785 // allocate memory for the msrpm
786 err = frame_alloc(&g->msrpm_cap, MSRPM_SIZE, NULL);
787 assert_err(err, "frame_alloc");
788 err = frame_identify(g->msrpm_cap, &fi);
789 assert_err(err, "frame_identify");
790 g->msrpm_pa = fi.base;
791 err = vspace_map_one_frame_attr((void**)&g->msrpm_va, MSRPM_SIZE,
793 VREGION_FLAGS_READ_WRITE_NOCACHE,
795 if (err_is_fail(err)) {
796 DEBUG_ERR(err, "vspace_map_one_frame_attr failed");
799 // initialize the allocated structures
805 // spawn the guest domain
806 spawn_guest_domain(g);
807 assert (grub_image != NULL);
808 install_grub_stage2(g, grub_image, grub_image_size);
809 //install_debug_app(g);
811 // add virtual hardware
812 g->apic = apic_new(APIC_BASE);
813 g->lpc = lpc_new(virq_handler, virq_pending,
818 if (hdd0_image != NULL) {
819 g->hdds[0] = hdd_new_from_memory(hdd0_image, hdd0_image_size);
822 g->console = console_new();
823 g->serial_ports[0] = pc16550d_new(0x3f8, 4, g->lpc);
825 // FIXME: Which virtual uart port is connected to which host port
826 // should be adjustable from the command line or a configuration
828 pc16550d_attach_to_host_uart(g->serial_ports[0], SERIAL_DRIVER);
829 g->serial_ports[1] = pc16550d_new(0x2f8, 3, g->lpc);
830 g->serial_ports[2] = pc16550d_new(0x3e8, 4, g->lpc);
831 g->serial_ports[3] = pc16550d_new(0x2e8, 3, g->lpc);
832 g->serial_port_count = 4;
835 init_host_devices(g->pci);
837 // struct pci_device *ethernet = pci_ethernet_new(g->lpc, g);
838 // int r = pci_attach_device(g->pci, 0, 2, ethernet);
841 // struct pci_device *vmkitmon_eth = pci_vmkitmon_eth_new(g->lpc, g);
842 // r = pci_attach_device(g->pci, 0, 3, vmkitmon_eth);
845 // set up bios memory
846 // FIXME: find a modular way to do this
847 *(uint16_t *)guest_to_host(g->mem_low_va + 0x400) = 0x3f8; // COM1
848 *(uint16_t *)guest_to_host(g->mem_low_va + 0x402) = 0x2f8; // COM2
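// 0x400 and 0x402 are the BIOS Data Area words holding the base I/O ports of COM1 and COM2.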
854 * \brief Create a new guest.
856 * This function creates a new guest. It will do everything necessary to make
857 * the guest accept images to run. It will create a new domain and assign some
858 * memory to that domain. Afterwards it will load a bios into the memory and
859 * set the guest's initial IP to the POST entry of the BIOS.
861 * \return The pointer to the newly created structure describing the guest.
866 // support the allocation of one guest for now
867 assert(__guestp == NULL);
869 memset(__guestp, 0, sizeof(struct guest));
870 guest_setup(__guestp);
875 run_realmode (struct guest *g)
879 realmode_switch_to(g);
881 assert(r == REALMODE_ERR_OK);
882 realmode_switch_from(g);
884 guest_handle_vmexit(g);
890 // Return true if the "Enable EPT" Secondary Processor-based control is
891 // set in the VMCS, else false.
892 static inline bool vmx_ept_enabled(struct guest *g)
894 uint64_t sp_controls;
895 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_SEC_PROC, &sp_controls);
896 assert(err_is_ok(err));
897 return ((sp_controls & SP_CLTS_ENABLE_EPT) != 0);
900 // Set or clear the "Descriptor-table exiting" Secondary Processor-based
901 // control if val is 1 or 0, respectively.
902 static inline void vmx_intercept_desc_table_wrf(struct guest *g, int val)
904 assert(val == 0 || val == 1);
906 uint64_t sec_proc_ctrls;
907 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_SEC_PROC, &sec_proc_ctrls);
909 uint64_t prim_proc_ctrls;
910 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXEC_PRIM_PROC, &prim_proc_ctrls);
911 assert(prim_proc_ctrls & PP_CLTS_SEC_CTLS);
912 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXEC_SEC_PROC,
913 sec_proc_ctrls | SP_CLTS_DESC_TABLE);
915 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_EXEC_SEC_PROC,
916 sec_proc_ctrls & ~SP_CLTS_DESC_TABLE);
918 assert(err_is_ok(err));
922 // Before entering the guest, synchronize the CR0 shadow with the guest
923 // CR0 value that is potentially changed in the real-mode emulator.
924 static inline void vmx_set_cr0_shadow(struct guest *g)
927 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &cr0_shadow);
928 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_CR0_RD_SHADOW, cr0_shadow);
929 assert(err_is_ok(err));
934 * \brief Marks a guest as runnable.
936 * A call to this method will update the guest's runnable state and, if made
937 * runnable, yield the remaining time slice to the guest domain.
939 * \return Zero on success, non-zero on error
942 guest_make_runnable (struct guest *g, bool run)
948 /* If the guest is currently in real mode (CR0.PE flag clear) then we do not
949 * schedule the domain to run the virtualization but run the real-mode
952 if (UNLIKELY(run && amd_vmcb_cr0_rd(&g->vmcb).pe == 0)) {
953 if (!g->emulated_before_exit) {
954 // do the inverse of the code below
955 amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 1);
956 amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 1);
957 amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 1);
958 amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 1);
959 amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 1);
960 amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 1);
961 amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 1);
962 amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 1);
963 amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 1);
964 amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 1);
965 amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 1);
966 amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 1);
967 amd_vmcb_intercepts_intn_wrf(&g->vmcb, 1);
969 // mark guest as emulated
970 g->emulated_before_exit = true;
974 err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
975 assert(err_is_ok(err));
976 if (UNLIKELY(run && (guest_cr0 & CR0_PE) == 0)) {
977 if (!g->emulated_before_exit) {
978 vmx_intercept_desc_table_wrf(g, 1);
979 g->emulated_before_exit = true;
982 #if 0 /* why create a thread for this? it seems fine without! -AB */
983 struct thread *t = thread_create((thread_func_t)run_realmode, g);
985 err = thread_detach(t);
986 assert(err_is_ok(err));
993 /* every time we move the machine from emulated to virtualized execution we need
994 * to adjust some intercepts */
995 if (UNLIKELY(run && g->emulated_before_exit)) {
997 // we enforce NP to be enabled (no shadow paging support)
998 assert(amd_vmcb_np_rd(&g->vmcb).enable == 1);
1000 // disable GDTR intercept
1001 amd_vmcb_intercepts_rdgdtr_wrf(&g->vmcb, 0);
1002 amd_vmcb_intercepts_wrgdtr_wrf(&g->vmcb, 0);
1003 // disable LDTR intercept
1004 amd_vmcb_intercepts_rdldtr_wrf(&g->vmcb, 0);
1005 amd_vmcb_intercepts_wrldtr_wrf(&g->vmcb, 0);
1006 // disable IDTR intercept
1007 amd_vmcb_intercepts_rdidtr_wrf(&g->vmcb, 0);
1008 amd_vmcb_intercepts_wridtr_wrf(&g->vmcb, 0);
1009 // disable TR intercept
1010 amd_vmcb_intercepts_rdtr_wrf(&g->vmcb, 0);
1011 amd_vmcb_intercepts_wrtr_wrf(&g->vmcb, 0);
1012 // disable non-essential CR0 access intercepts
1013 amd_vmcb_cr_access_rdcr0_wrf(&g->vmcb, 0);
1014 amd_vmcb_cr_access_wrcr0_wrf(&g->vmcb, 0);
1015 // disable CR3 access intercepts
1016 assert(amd_vmcb_np_rd(&g->vmcb).enable != 0);
1017 amd_vmcb_cr_access_rdcr3_wrf(&g->vmcb, 0);
1018 amd_vmcb_cr_access_wrcr3_wrf(&g->vmcb, 0);
1019 // disable INTn intercept
1020 // we have to be outside of real mode for this to work
1021 assert(amd_vmcb_cr0_rd(&g->vmcb).pe != 0);
1022 amd_vmcb_intercepts_intn_wrf(&g->vmcb, 0);
1024 bool ept_enabled = vmx_ept_enabled(g);
1025 assert(ept_enabled);
1026 vmx_intercept_desc_table_wrf(g, 0);
1027 assert(guest_cr0 & CR0_PE);
1028 vmx_set_cr0_shadow(g);
1030 // mark guest as not emulated
1031 g->emulated_before_exit = false;
1034 // update the guest domain's runnable state
1035 err = invoke_dispatcher(g->dcb_cap, NULL_CAP, NULL_CAP, NULL_CAP, NULL_CAP, run);
1036 assert_err(err, "dispatcher_make_runnable");
1037 // yield the dispatcher
1039 thread_yield_dispatcher(NULL_CAP);
1045 /* VMEXIT handlers */
1047 #define HANDLER_ERR_OK (0)
1048 #define HANDLER_ERR_FATAL (-1)
1052 handle_vmexit_unhandeled (struct guest *g)
1054 printf("Unhandeled guest vmexit:\n");
1055 printf(" code:\t %lx\n", amd_vmcb_exitcode_rd(&g->vmcb));
1056 printf(" info1:\t %lx\n", amd_vmcb_exitinfo1_rd(&g->vmcb));
1057 printf(" info2:\t %lx\n", amd_vmcb_exitinfo2_rd(&g->vmcb));
1058 printf(" intinfo: %lx\n", amd_vmcb_exitintinfo_rd(&g->vmcb));
1060 printf("VMCB save area:\n");
1061 printf(" cr0:\t%lx\n", amd_vmcb_cr0_rd_raw(&g->vmcb));
1062 printf(" cr2:\t%lx\n", amd_vmcb_cr2_rd_raw(&g->vmcb));
1063 printf(" cr3:\t%lx\n", amd_vmcb_cr3_rd_raw(&g->vmcb));
1064 printf(" cr4:\t%lx\n", amd_vmcb_cr4_rd_raw(&g->vmcb));
1065 printf(" efer:\t%lx\n", amd_vmcb_efer_rd_raw(&g->vmcb));
1066 printf(" rip:\t%lx\n", amd_vmcb_rip_rd_raw(&g->vmcb));
1067 printf(" cs:\tselector %x, base %lx, limit %x, attrib %x\n",
1068 amd_vmcb_cs_selector_rd(&g->vmcb), amd_vmcb_cs_base_rd(&g->vmcb),
1069 amd_vmcb_cs_limit_rd(&g->vmcb), amd_vmcb_cs_attrib_rd_raw(&g->vmcb));
1070 printf(" ds:\tselector %x, base %lx, limit %x, attrib %x\n",
1071 amd_vmcb_ds_selector_rd(&g->vmcb), amd_vmcb_ds_base_rd(&g->vmcb),
1072 amd_vmcb_ds_limit_rd(&g->vmcb), amd_vmcb_ds_attrib_rd_raw(&g->vmcb));
1073 printf(" es:\tselector %x, base %lx, limit %x, attrib %x\n",
1074 amd_vmcb_es_selector_rd(&g->vmcb), amd_vmcb_es_base_rd(&g->vmcb),
1075 amd_vmcb_es_limit_rd(&g->vmcb), amd_vmcb_es_attrib_rd_raw(&g->vmcb));
1076 printf(" ss:\tselector %x, base %lx, limit %x, attrib %x\n",
1077 amd_vmcb_ss_selector_rd(&g->vmcb), amd_vmcb_ss_base_rd(&g->vmcb),
1078 amd_vmcb_ss_limit_rd(&g->vmcb), amd_vmcb_ss_attrib_rd_raw(&g->vmcb));
1079 printf(" rax:\t%lx\n", amd_vmcb_rax_rd_raw(&g->vmcb));
1080 printf(" rbx:\t%lx\n", g->ctrl->regs.rbx);
1081 printf(" rcx:\t%lx\n", g->ctrl->regs.rcx);
1082 printf(" rdx:\t%lx\n", g->ctrl->regs.rdx);
1083 printf(" rsi:\t%lx\n", g->ctrl->regs.rsi);
1084 printf(" rdi:\t%lx\n", g->ctrl->regs.rdi);
1086 return HANDLER_ERR_FATAL;
1090 handle_vmexit_unhandeled (struct guest *g)
1092 printf("Unhandeled guest vmexit:\n");
1093 printf(" exit reason:\t %"PRIu16"\n", saved_exit_reason);
1094 printf(" exit qualification:\t %"PRIx64"\n", saved_exit_qual);
1095 printf(" next rip (I/O instruction):\t %"PRIx64"\n", saved_rip);
1098 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GPADDR_F, &gpaddr);
1099 printf(" guest physical-address:\t %"PRIx64"\n", gpaddr);
1101 uint64_t guest_cr0, guest_cr3, guest_cr4;
1102 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1103 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
1104 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR4, &guest_cr4);
1106 uint64_t guest_efer, guest_rip;
1107 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &guest_efer);
1108 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1110 uint64_t guest_cs_sel, guest_cs_base, guest_cs_lim,
1112 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_SEL, &guest_cs_sel);
1113 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_BASE, &guest_cs_base);
1114 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_LIM, &guest_cs_lim);
1115 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_ACCESS, &guest_cs_access);
1117 uint64_t guest_ds_sel, guest_ds_base, guest_ds_lim,
1119 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_SEL, &guest_ds_sel);
1120 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
1121 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_LIM, &guest_ds_lim);
1122 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_ACCESS, &guest_ds_access);
1124 uint64_t guest_es_sel, guest_es_base, guest_es_lim,
1126 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_SEL, &guest_es_sel);
1127 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_BASE, &guest_es_base);
1128 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_LIM, &guest_es_lim);
1129 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_ACCESS, &guest_es_access);
1131 uint64_t guest_ss_sel, guest_ss_base, guest_ss_lim,
1133 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_SEL, &guest_ss_sel);
1134 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_BASE, &guest_ss_base);
1135 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_LIM, &guest_ss_lim);
1136 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SS_ACCESS, &guest_ss_access);
1137 assert(err_is_ok(err));
1139 printf("VMCS save area:\n");
1140 printf(" cr0:\t%lx\n", guest_cr0);
1141 printf(" cr3:\t%lx\n", guest_cr3);
1142 printf(" cr4:\t%lx\n", guest_cr4);
1143 printf(" efer:\t%lx\n", guest_efer);
1144 printf(" rip:\t%lx\n", guest_rip);
1145 printf(" cs:\tselector %lx, base %lx, limit %lx, access %lx\n",
1146 guest_cs_sel, guest_cs_base, guest_cs_lim, guest_cs_access);
1147 printf(" ds:\tselector %lx, base %lx, limit %lx, access %lx\n",
1148 guest_ds_sel, guest_ds_base, guest_ds_lim, guest_ds_access);
1149 printf(" es:\tselector %lx, base %lx, limit %lx, access %lx\n",
1150 guest_es_sel, guest_es_base, guest_es_lim, guest_es_access);
1151 printf(" ss:\tselector %lx, base %lx, limit %lx, access %lx\n",
1152 guest_ss_sel, guest_ss_base, guest_ss_lim, guest_ss_access);
1153 printf(" rax:\t%lx\n", g->ctrl->regs.rax);
1154 printf(" rbx:\t%lx\n", g->ctrl->regs.rbx);
1155 printf(" rcx:\t%lx\n", g->ctrl->regs.rcx);
1156 printf(" rdx:\t%lx\n", g->ctrl->regs.rdx);
1157 printf(" rsi:\t%lx\n", g->ctrl->regs.rsi);
1158 printf(" rdi:\t%lx\n", g->ctrl->regs.rdi);
1160 return HANDLER_ERR_FATAL;
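// Software walk of the guest's 64-bit (4-level) page tables: guest-physical table addresses
// are turned into monitor-virtual pointers via guest_to_host(), and 1GB, 2MB and 4KB mappings
// are handled.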
1164 static inline uint64_t
1165 lookup_paddr_long_mode (struct guest *g, uint64_t vaddr)
1167 union x86_lm_va va = { .raw = vaddr };
1168 uint64_t *page_table;
1170 // get a pointer to the pml4 table
1172 page_table = (uint64_t *)guest_to_host(amd_vmcb_cr3_rd(&g->vmcb));
1175 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
1176 assert(err_is_ok(err));
1177 page_table = (uint64_t *)guest_to_host(guest_cr3);
1180 union x86_lm_pml4_entry pml4e = { .raw = page_table[va.u.pml4_idx] };
1181 assert (pml4e.u.p == 1);
1183 // get a pointer to the pdp table
1184 page_table = (uint64_t *)guest_to_host(pml4e.u.pdp_base_pa << 12);
1186 union x86_lm_pdp_entry pdpe = { .raw = page_table[va.u.pdp_idx] };
1187 assert(pdpe.u.p == 1);
1188 // check for 1GB page (PS bit set)
1189 if (pdpe.u.ps == 1) {
1190 return (pdpe.u1gb.base_pa << 30) | va.u1gb.pa_offset;
1193 // get a pointer to the pd table
1194 page_table = (uint64_t *)guest_to_host(pdpe.u.pd_base_pa << 12);
1196 union x86_lm_pd_entry pde = { .raw = page_table[va.u.pd_idx] };
1198 printf("g2h %lx, pml4e %p %lx, pdpe %p %lx, pde %p %lx\n",
1199 guest_to_host(0), &pml4e, pml4e.raw, &pdpe, pdpe.raw, &pde, pde.raw);
1201 assert(pde.u.p == 1);
1202 // check for 2MB page (PS bit set)
1203 if (pde.u.ps == 1) {
1204 return (pde.u2mb.base_pa << 21) | va.u2mb.pa_offset;
1207 // get a pointer to the page table
1208 page_table = (uint64_t *)guest_to_host(pde.u.pt_base_pa << 12);
1209 // get the page table entry
1210 union x86_lm_pt_entry pte = { .raw = page_table[va.u.pt_idx] };
1211 assert(pte.u.p == 1);
1213 return (pte.u.base_pa << 12) | va.u.pa_offset;
1216 static inline uint32_t
1217 lookup_paddr_legacy_mode (struct guest *g, uint32_t vaddr)
1219 // printf("lookup_paddr_legacy_mode enter\n");
1220 // PAE not supported
1222 guest_assert(g, amd_vmcb_cr4_rd(&g->vmcb).pae == 0);
1225 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR4, &guest_cr4);
1226 guest_assert(g, (guest_cr4 & CR4_PAE) == 0);
1228 union x86_legm_va va = { .raw = vaddr };
1229 uint32_t *page_table;
1231 // get a pointer to the pd table
1233 page_table = (uint32_t *)guest_to_host(amd_vmcb_cr3_rd(&g->vmcb));
1236 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR3, &guest_cr3);
1237 assert(err_is_ok(err));
1238 page_table = (uint32_t *)guest_to_host(guest_cr3);
1242 union x86_legm_pd_entry pde = { .raw = page_table[va.u.pd_idx] };
1243 assert (pde.u.p == 1);
1244 // check for 4MB page (PS bit set)
1245 if (pde.u.ps == 1) {
1246 return (pde.u4mb.base_pa << 22) | va.u4mb.pa_offset;
1249 // get a pointer to the page table
1250 page_table = (uint32_t *)guest_to_host(pde.u.pt_base_pa << 12);
1251 // get the page table entry
1252 union x86_legm_pt_entry pte = { .raw = page_table[va.u.pt_idx] };
1253 assert(pte.u.p == 1);
1255 return (pte.u.base_pa << 12) | va.u.pa_offset;
1258 // returns a pointer to a byte array starting at the current instruction
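// Three cases are handled below: paging disabled (real mode, CS base plus RIP with
// segmentation), long mode (walk of the 4-level page tables) and legacy paged protected
// mode (walk of the 2-level page tables). Compatibility mode is not supported.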
1260 get_instr_arr (struct guest *g, uint8_t **arr)
1263 if (UNLIKELY(amd_vmcb_cr0_rd(&g->vmcb).pg == 0)) {
1266 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1267 if (UNLIKELY((guest_cr0 & CR0_PG) == 0)) {
1269 //printf("Segmentation active!\n");
1271 // take segmentation into account
1273 *arr = (uint8_t *)(guest_to_host(g->mem_low_va) +
1274 amd_vmcb_cs_base_rd(&g->vmcb) +
1275 amd_vmcb_rip_rd(&g->vmcb));
1277 uint64_t guest_cs_base, guest_rip;
1278 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_BASE, &guest_cs_base);
1279 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1280 *arr = (uint8_t *)(guest_to_host(g->mem_low_va) +
1281 guest_cs_base + guest_rip);
1286 if (amd_vmcb_efer_rd(&g->vmcb).lma == 1) {
1288 uint64_t guest_efer;
1289 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &guest_efer);
1290 if (guest_efer & EFER_LMA) {
1294 if (amd_vmcb_cs_attrib_rd(&g->vmcb).l == 1) {
1296 *arr = (uint8_t *)guest_to_host(lookup_paddr_long_mode(g,
1297 amd_vmcb_rip_rd(&g->vmcb)));
1299 uint64_t cs_access_rights, guest_rip;
1300 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CS_ACCESS, &cs_access_rights);
1301 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1302 if (cs_access_rights & ACCESS_RIGHTS_LONG_MODE) {
1303 *arr = (uint8_t *)guest_to_host(lookup_paddr_long_mode(g,
1307 // compatibility mode
1308 guest_assert(g, !"compatibility mode not supported yet");
1311 // Legacy (aka. Paged Protected) Mode
1313 assert(amd_vmcb_cr0_rd(&g->vmcb).pe == 1);
1315 *arr = (uint8_t *)guest_to_host(lookup_paddr_legacy_mode(g,
1316 amd_vmcb_rip_rd(&g->vmcb)));
1318 assert(guest_cr0 & CR0_PE);
1321 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1322 *arr = (uint8_t *)guest_to_host(lookup_paddr_legacy_mode(g,
1328 assert(err_is_ok(err));
1330 return HANDLER_ERR_OK;
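// regnum follows the x86 ModRM register encoding:
// 0=RAX, 1=RCX, 2=RDX, 3=RBX, 4=RSP, 5=RBP, 6=RSI, 7=RDI.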
1333 static inline uint64_t
1334 get_reg_val_by_reg_num (struct guest *g, uint8_t regnum) {
1337 return guest_get_rax(g);
1339 return guest_get_rcx(g);
1341 return guest_get_rdx(g);
1343 return guest_get_rbx(g);
1345 return guest_get_rsp(g);
1347 return guest_get_rbp(g);
1349 return guest_get_rsi(g);
1351 return guest_get_rdi(g);
1353 assert(!"not reached");
1359 set_reg_val_by_reg_num (struct guest *g, uint8_t regnum, uint64_t val) {
1362 guest_set_rax(g, val);
1365 guest_set_rcx(g, val);
1368 guest_set_rdx(g, val);
1371 guest_set_rbx(g, val);
1374 guest_set_rsp(g, val);
1377 guest_set_rbp(g, val);
1380 guest_set_rsi(g, val);
1383 guest_set_rdi(g, val);
1386 assert(!"not reached");
1392 handle_vmexit_cr_access (struct guest *g)
1395 uint8_t *code = NULL;
1398 if (g->emulated_before_exit) {
1399 assert(saved_exit_reason == VMX_EXIT_REASON_CR_ACCESS);
1400 assert(((saved_exit_qual >> 0) & 0xf) == 0);
1403 // fetch the location of the code
1404 r = get_instr_arr(g, &code);
1405 if (r != HANDLER_ERR_OK) {
1408 assert(code != NULL);
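// 0x0f 0x20 is MOV r,CRn (read) and 0x0f 0x22 is MOV CRn,r (write); only these forms are handled.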
1410 assert(code[0] == 0x0f && (code[1] == 0x20 || code[1] == 0x22));
1413 bool read = (code[1] == 0x20);
1414 union x86_modrm mod;
1417 // FIXME: use proper exception
1418 assert(mod.u.mod == 3);
1423 switch (mod.u.regop) {
1426 val = amd_vmcb_cr0_rd_raw(&g->vmcb);
1428 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &val);
1432 printf("CR access: unknown CR source register\n");
1433 return handle_vmexit_unhandeled(g);
1437 val = get_reg_val_by_reg_num(g, mod.u.rm);
1445 guest_set_rax(g, val);
1448 guest_set_rcx(g, val);
1451 guest_set_rdx(g, val);
1454 guest_set_rbx(g, val);
1457 printf("CR access: unknown GPR destination register\n");
1458 return handle_vmexit_unhandeled(g);
1462 switch (mod.u.regop) {
1465 amd_vmcb_cr0_wr_raw(&g->vmcb, val);
1467 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_CR0, val);
1472 // ignore writes to CR4
1473 // allow writing to CR4 by doing nothing for this case
1476 printf("CR access: unknown CR destination register\n");
1477 return handle_vmexit_unhandeled(g);
1481 // advance the rip beyond the instruction
1483 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 3);
1486 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1487 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 3);
1488 assert(err_is_ok(err));
1490 return HANDLER_ERR_OK;
1494 handle_vmexit_ldt (struct guest *g)
1497 uint8_t *code = NULL;
1500 // this handler supports only real-mode
1502 assert(amd_vmcb_cr0_rd(&g->vmcb).pe == 0);
1505 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1506 assert((guest_cr0 & CR0_PE) == 0);
1508 // fetch the location of the code
1509 r = get_instr_arr(g, &code);
1510 if (r != HANDLER_ERR_OK) {
1513 mem = (uint8_t *)guest_to_host(g->mem_low_va);
1514 assert(code != NULL);
1516 assert (code[0] == 0x0f && code[1] == 0x01);
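// 0x0f 0x01 with ModRM reg field 2 is LGDT and reg field 3 is LIDT; nothing else is handled here.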
1518 // check for relevant instruction prefixes
1519 bool addr32 = code[-2] == 0x67 || code[-1] == 0x67;
1520 bool op32 = code[-2] == 0x66 || code[-1] == 0x66;
1522 union x86_modrm modrm = { .raw = code[2] };
1524 assert(modrm.u.regop == 2 || modrm.u.regop == 3);
1525 guest_assert(g, op32);
1529 // bytes 3-6 hold a 32-bit address of a mem location where the first word
1530 // holds the limit and the following dword holds the base
1531 addr = *(uint32_t *)&code[3];
1533 // bytes 3-4 hold a 16-bit address of a mem location where the first word
1534 // holds the limit and the following dword holds the base
1535 // this address is relative to the DS base
1537 addr = *(uint16_t *)&code[3] + amd_vmcb_ds_base_rd(&g->vmcb);
1539 uint64_t guest_ds_base;
1540 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
1541 addr = *(uint16_t *)&code[3] + guest_ds_base;
1545 // sanity check on the addr
1546 // FIXME: raise a proper exception
1547 if (addr > g->mem_high_va) {
1548 printf("Memory access beyond physical address space\n");
1549 return HANDLER_ERR_FATAL;
1552 // load the actual register
1553 if (modrm.u.regop == 2) {
1556 amd_vmcb_gdtr_limit_wr(&g->vmcb, *(uint16_t*)(mem + addr));
1557 amd_vmcb_gdtr_base_wr(&g->vmcb, *(uint32_t*)(mem + addr + 2));
1559 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GDTR_LIM,
1560 *(uint16_t*)(mem + addr));
1561 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GDTR_BASE,
1562 *(uint32_t*)(mem + addr + 2));
1565 } else if (modrm.u.regop == 3) {
1568 amd_vmcb_idtr_limit_wr(&g->vmcb, *(uint16_t*)(mem + addr));
1569 amd_vmcb_idtr_base_wr(&g->vmcb, *(uint32_t*)(mem + addr + 2));
1571 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_IDTR_LIM,
1572 *(uint16_t*)(mem + addr));
1573 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_IDTR_BASE,
1574 *(uint32_t*)(mem + addr + 2));
1577 assert(!"not reached");
1580 // advance the rip beyond the instruction
1583 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 7);
1585 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 5);
1589 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
1591 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 7);
1593 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 5);
1595 assert(err_is_ok(err));
1597 return HANDLER_ERR_OK;
1601 static inline void vmx_vmcs_rflags_cf_wrf(struct guest *g, int val) {
1602 assert(val == 0 || val == 1);
1603 uint64_t guest_rflags;
1604 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
1606 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS,
1607 guest_rflags | RFLAGS_CF);
1609 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RFLAGS,
1610 guest_rflags & (~RFLAGS_CF));
1612 assert(err_is_ok(err));
1617 handle_vmexit_swint (struct guest *g)
1620 uint8_t *code = NULL;
1622 r = get_instr_arr(g, &code);
1623 if (r != HANDLER_ERR_OK) {
1626 assert (code != NULL);
1628 // check for the correct instruction
1629 assert(code[0] == 0xcd);
1631 // the interrupt number follows the INT (0xcd) opcode
1632 uint8_t int_num = code[1];
1634 // check whether the guest is in real mode
1636 if (amd_vmcb_cr0_rd(&g->vmcb).pe == 0) {
1638 uint64_t guest_ds_base, es_guest_base;
1639 uint64_t guest_cr0, guest_rip;
1640 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_CR0, &guest_cr0);
1641 if ((guest_cr0 & CR0_PE) == 0) {
1643 // in real mode the interrupts starting at 0x10 have a different meaning
1644 // examine the sw interrupt
1647 r = console_handle_int10(g->console, g);
1648 if (r != HANDLER_ERR_OK) {
1649 printf("Unhandeled method on INT 0x10\n");
1650 return handle_vmexit_unhandeled(g);
1654 switch (guest_get_ax(g)) {
1655 case 0: // GET MEMORY SIZE
1656 // our VM always has 1MB of base memory
1657 // AX holds the amount of 1KB memory blocks starting at
1658 // addr 0 which is 640 (640 KiB)
1659 guest_set_ax(g, 640);
1662 printf("Unhandeled method on INT 0x12\n");
1663 return handle_vmexit_unhandeled(g);
1667 // Bootable CD-ROM - GET STATUS
1668 if (guest_get_ax(g) == 0x4b01) {
1671 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1673 vmx_vmcs_rflags_cf_wrf(g, 1);
1677 else if (guest_get_ah(g) == 0) {
1678 for (int i = 0; i < g->hdd_count; i++) {
1679 hdd_reset(g->hdds[i]);
1682 // DISK - GET DRIVE PARAMETERS (PC,XT286,CONV,PS,ESDI,SCSI)
1683 else if (guest_get_ah(g) == 0x08) {
1684 uint8_t dl = guest_get_dl(g);
1686 // only respond to installed hard disks
1687 if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
1691 r = hdd_get_geometry_chs(g->hdds[dl & 0x7f], &c, &h, &s);
1694 // set some return values for success
1697 amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1699 vmx_vmcs_rflags_cf_wrf(g, 0);
1702 // store the geometry into the correct registers
1703 guest_set_cx(g, c << 6 | (s & 0x3f));
1705 guest_set_dl(g, g->hdd_count);
1708 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1710 vmx_vmcs_rflags_cf_wrf(g, 1);
1712 // it is not really clear to me what ah should contain
1713 // when the drive is not present, so set it to FF
1717 // INT 13 Extensions - INSTALLATION CHECK
1718 else if (guest_get_ah(g) == 0x41 && guest_get_bx(g) == 0x55aa) {
1720 amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1722 vmx_vmcs_rflags_cf_wrf(g, 0);
1724 guest_set_bx(g, 0xaa55);
1725 guest_set_ah(g, 0x01); // Drive extensions 1.x
1727 guest_set_cx(g, 0x5);
1729 // IBM/MS INT 13 Extensions - EXTENDED READ
1730 else if (guest_get_ah(g) == 0x42) {
1731 uint8_t dl = guest_get_dl(g);
1733 // only respond to installed hard disks
1734 if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
1736 amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1738 vmx_vmcs_rflags_cf_wrf(g, 0);
1742 struct disk_access_block {
1746 // pointer to the data buffer formatted like
1748 uint32_t transfer_buffer;
1749 uint64_t abs_block_number;
1750 } __attribute__ ((packed));
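// this matches the layout of the INT 13h Extensions disk address packet (DAP)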
1752 // memory location of the disk access block
1754 uintptr_t mem = guest_to_host(g->mem_low_va) +
1755 amd_vmcb_ds_base_rd(&g->vmcb) +
1758 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
1759 uintptr_t mem = guest_to_host(g->mem_low_va) +
1760 guest_ds_base + guest_get_si(g);
1763 struct disk_access_block *dap = (void *)mem;
1765 if (dap->size < 0x10) {
1767 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1769 vmx_vmcs_rflags_cf_wrf(g, 1);
1773 // dap->transfer_buffer points to a real-mode segment:offset address
1774 // resolve it according to those rules
1775 mem = guest_to_host(g->mem_low_va) +
1776 ((dap->transfer_buffer >> 16) << 4) +
1777 (dap->transfer_buffer & 0xffff);
1779 size_t count = dap->count;
1780 r = hdd_read_blocks(g->hdds[dl & 0x7f],
1781 dap->abs_block_number,
1785 if (r != HANDLER_ERR_OK) {
1787 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1789 vmx_vmcs_rflags_cf_wrf(g, 1);
1796 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1798 vmx_vmcs_rflags_cf_wrf(g, 1);
1800 // it is not really clear to me what ah should contain
1801 // when the drive is not present, so set it to FF
1805 // IBM/MS INT 13 Extensions - GET DRIVE PARAMETERS
1806 else if (guest_get_ah(g) == 0x48) {
1807 uint8_t dl = guest_get_dl(g);
1809 // only respond to installed hard disks
1810 if ((dl >> 7) && ((dl & 0x7f) < g->hdd_count)) {
1811 // structure to hold drive info
1812 struct drive_params {
1818 uint64_t total_sectors;
1819 uint16_t bytes_per_sector;
1820 } __attribute__ ((packed));
1822 // memory where the drive info shall be stored
1824 uintptr_t mem = guest_to_host(g->mem_low_va) +
1825 amd_vmcb_ds_base_rd(&g->vmcb) +
1828 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_DS_BASE, &guest_ds_base);
1829 uintptr_t mem = guest_to_host(g->mem_low_va) +
1830 guest_ds_base + guest_get_si(g);
1833 struct drive_params *drp = (void *)mem;
1836 if (drp->size < sizeof(struct drive_params)) {
1838 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1840 vmx_vmcs_rflags_cf_wrf(g, 1);
1844 amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1846 vmx_vmcs_rflags_cf_wrf(g, 0);
1850 drp->size = sizeof(struct drive_params);
1851 // CHS invalid, no removable drive, etc
1856 drp->total_sectors = hdd_get_blocks_count(
1857 g->hdds[dl & 0x7f]);
1858 drp->bytes_per_sector = 512; // FIXME: Hardcoded
1862 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1864 vmx_vmcs_rflags_cf_wrf(g, 1);
1866 // it is not really clear to me what ah should contain
1867 // when the drive is not present, so set it to FF
1868 guest_set_ah(g, 0x1);
1871 printf("Unhandeled method on INT 0x13\n");
1872 return handle_vmexit_unhandeled(g);
1877 if (guest_get_ax(g) == 0x2401) {
1878 g->a20_gate_enabled = true;
1880 amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1882 vmx_vmcs_rflags_cf_wrf(g, 0);
1886 // APM INSTALLATION CHECK
1887 else if (guest_get_ax(g) == 0x5300) {
1888 // we do not support APM - set carry flag to indicate error
1890 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1892 vmx_vmcs_rflags_cf_wrf(g, 1);
1896 else if (guest_get_ax(g) == 0x5304) {
1897 // we do not support APM - set carry flag to indicate error
1899 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1901 vmx_vmcs_rflags_cf_wrf(g, 1);
1904 // GET MEMORY SIZE FOR >64M CONFIGURATIONS
1905 else if (guest_get_ax(g) == 0xe801) {
1906 // we do not support this BIOS call
1907 // both grub and linux may also use the 0xe820 call
1909 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1911 vmx_vmcs_rflags_cf_wrf(g, 1);
1914 // GET SYSTEM MEMORY MAP
1915 // EDX has to contain 0x534d4150 (== 'SMAP')
1916 else if (guest_get_ax(g) == 0xe820 &&
1917 guest_get_edx(g) == 0x534d4150) {
1918 // we return two entries: the base memory below 640 KiB and the memory above 1 MiB
1919 if (guest_get_ebx(g) > 1 || guest_get_ecx(g) < 20) {
1920 // wrong input params -> report error
1922 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1924 vmx_vmcs_rflags_cf_wrf(g, 1);
1927 // taken from http://www.ctyme.com/intr/rb-1741.htm
1929 uintptr_t addr = guest_to_host(g->mem_low_va) +
1930 amd_vmcb_es_base_rd(&g->vmcb) +
1933 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_ES_BASE, &es_guest_base);
1934 uintptr_t addr = guest_to_host(g->mem_low_va) +
1935 es_guest_base + guest_get_di(g);
1937 // set EAX to 'SMAP'
1938 guest_set_eax(g, 0x534D4150);
1939 // returned bytes (always 20)
1940 guest_set_ecx(g, 20);
1942 switch (guest_get_ebx(g)) {
1945 assert(g->mem_low_va == 0);
1947 *(uint64_t *)addr = 0;
1948 // size of the memory block
1949 *(uint64_t *)(addr + 8) = 0xa0000; // 640 KiB
1950 // mem type, 1 == "memory, available to the OS"
1951 *(uint32_t *)(addr + 16) = 1;
1952 // indicate that there is more data
1953 guest_set_ebx(g, 1);
1957 assert(g->mem_high_va > 0x100000);
1959 *(uint64_t *)addr = 0x100000; // 1 MiB
1960 // size of the memory block
1961 *(uint64_t *)(addr + 8) = g->mem_high_va - 0x100000;
1962 // mem type, 1 == "memory, available to the OS"
1963 *(uint32_t *)(addr + 16) = 1;
1964 // indicate that there is no more data
1965 guest_set_ebx(g, 0);
1968 assert(!"not reached");
1974 amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
1976 vmx_vmcs_rflags_cf_wrf(g, 0);
1980 // SYSTEM - Get Intel SpeedStep (IST) information
1981 else if (guest_get_ax(g) == 0xe980) {
1982 // not supported yet
1984 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1986 vmx_vmcs_rflags_cf_wrf(g, 1);
1989 // SYSTEM - GET CONFIGURATION (XT >1986/1/10,AT mdl 3x9,
1991 // GRUB BUG: it puts 0xc0 into AX instead of AH
1992 else if (guest_get_ax(g) == 0xc0) {
1993 // we do not support this
1995 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
1997 vmx_vmcs_rflags_cf_wrf(g, 1);
1999 guest_set_ah(g, 0x80);
2001 // GET EXTENDED MEMORY SIZE
2002 else if (guest_get_ah(g) == 0x88) {
2003 // calculate number of 1KB chunks starting from 1MB but not
2005 assert(((g->mem_high_va - g->mem_low_va) & 0x3ff) == 0);
2006 guest_set_ax(g, MIN(0x3c00 /* 16MB */,
2007 (g->mem_high_va - g->mem_low_va) / 1024));
2008 // indicate no error occurred
2010 amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2012 vmx_vmcs_rflags_cf_wrf(g, 0);
2015 // SYSTEM - GET CONFIGURATION (XT >1986/1/10,AT mdl 3x9,
2017 else if (guest_get_ah(g) == 0xc0) {
2018 // we do not support this
2020 amd_vmcb_rflags_cf_wrf(&g->vmcb, 1);
2022 vmx_vmcs_rflags_cf_wrf(g, 1);
2024 guest_set_ah(g, 0x80);
2025 // SYSTEM - SET BIOS MODE
2026 } else if (guest_get_ah(g) == 0xec) {
2027 // I do not really know the use of this BIOS call and Linux
2028 // expects no action whatsoever
2030 printf("Unhandeled method on INT 0x15\n");
2031 return handle_vmexit_unhandeled(g);
2035 // KEYBOARD - SET TYPEMATIC RATE AND DELAY
2036 if (guest_get_ah(g) == 0x3) {
2038 } else if (guest_get_ah(g) == 0x2) {
2039 // Return keyboard flags
2040 guest_set_al(g, 0x0);
2042 printf("Unhandeled method on INT 0x16\n");
2043 return handle_vmexit_unhandeled(g);
2047 // TIME - GET REAL-TIME CLOCK TIME (AT,XT286,PS)
2048 if (guest_get_ah(g) == 0x2) {
2050 lpc_rtc_get_time_bcd(g->lpc, &h, &m, &s);
2057 amd_vmcb_rflags_cf_wrf(&g->vmcb, 0);
2059 vmx_vmcs_rflags_cf_wrf(g, 0);
2062 printf("Unhandeled method on INT 0x1a\n");
2063 return handle_vmexit_unhandeled(g);
2067 printf("handle_vmexit_swint: Unhandeled real-mode interrupt "
2068 "0x%x (%d).\n", int_num, int_num);
2069 return handle_vmexit_unhandeled(g);
2072 printf("vmkitmon: encountered INT instruction outside real mode\n");
2073 return handle_vmexit_unhandeled(g);
2076 // advance the rip beyond the instruction
2078 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
2080 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2081 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
2082 assert(err_is_ok(err));
2084 return HANDLER_ERR_OK;
2087 static inline enum opsize
2088 io_access_size_to_opsize (enum x86_io_access io)
2090 if (io & X86_IO_ACCESS_SZ8) {
2092 } else if (io & X86_IO_ACCESS_SZ16) {
2094 } else if (io & X86_IO_ACCESS_SZ32) {
2103 handle_vmexit_ioio (struct guest *g)
2107 uint64_t info1 = amd_vmcb_exitinfo1_rd(&g->vmcb);
2108 enum x86_io_access io;
2109 uint16_t port = info1 >> 16;
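// On SVM, EXITINFO1 for IOIO exits carries the port number in bits 16-31 and the access
// type in bit 0 (1 = IN, 0 = OUT), which is what the masking below relies on.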
2112 if (!g->emulated_before_exit) {
2113 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_QUAL, &saved_exit_qual);
2114 uint64_t instr_len, guest_rip;
2115 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_INSTR_LEN, &instr_len);
2116 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2117 saved_rip = guest_rip + instr_len;
2119 uint16_t port = (saved_exit_qual >> 16) & 0xffff;
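// In the VMX I/O exit qualification, bits 0-2 encode the access size (0 = 1 byte, 1 = 2 bytes,
// 3 = 4 bytes), bit 3 the direction (1 = IN, 0 = OUT) and bits 16-31 the port number.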
2124 bool newapi = false; // needed as a transition
2127 // copy the access flags
2128 // FIXME: this severely exploits the way the x86_io_access flags are set up
2130 io |= info1 & SVM_IOIO_TYPE_MASK;
2132 // gather some params for the io access
2133 write = (io & X86_IO_ACCESS_TYPE) == 0;
2134 size = OPSIZE_8; // make gcc happy
2135 if (io & X86_IO_ACCESS_SZ8) {
2137 } else if (io & X86_IO_ACCESS_SZ16) {
2139 } else if (io & X86_IO_ACCESS_SZ32) {
2143 write = ((saved_exit_qual >> 3) & 0x1) == 0;
2145 if ((saved_exit_qual & 0x7) == 0) {
2147 } else if ((saved_exit_qual & 0x7) == 1) {
2149 } else if ((saved_exit_qual & 0x7) == 3) {
2152 assert(!"Invalid size of access value");
2155 // fetch the source val if necessary
2159 val = guest_get_al(g);
2162 val = guest_get_ax(g);
2165 val = guest_get_eax(g);
2168 assert(!"not reached");
2173 // assign the request to the corresponding subsystem
2176 case 0x20: // primary PIC
2177 case 0x21: // primary PIC
2182 case 0x61: // NMI Controller
2191 case 0xa0: // secondary PIC
2192 case 0xa1: // secondary PIC
2194 r = lpc_handle_pio_write(g->lpc, port, size, val);
2195 guest_assert(g, r == 0);
2197 r = lpc_handle_pio_read(g->lpc, port, size, &val);
2205 // we currently do not support a keyboard
2212 // some apps use writing to this port as a method to delay execution
2213 // so we just do nothing
2218 // coprocessor IGNNE# - do nothing for now
2222 // FIXME: this should not be hardcoded !
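// (the legacy UART register bases are COM1 = 0x3f8, COM2 = 0x2f8,
//  COM3 = 0x3e8 and COM4 = 0x2e8; the expression below maps a port within
//  any of these ranges to the COM index 0-3)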
2260 com = (port & 0xf0) == 0xf0 ? !(port & 0x100) : !(port & 0x100) + 2;
2261 assert(com >= 0 && com < 4);
2263 r = pc16550d_handle_pio_write(g->serial_ports[com], port,
2267 r = pc16550d_handle_pio_read(g->serial_ports[com], port,
2275 // PCI config space (address)
2280 // PCI config space (data)
2286 r = pci_handle_pio_write(g->pci, port, size, val);
2288 r = pci_handle_pio_read(g->pci, port, size, &val);
2295 // the default is to return 0xff and to ignore writes
2302 // set the destination when necessary
2303 if (newapi && !write) {
2306 guest_set_al(g, val);
2309 guest_set_ax(g, val);
2312 guest_set_eax(g, val);
2315 assert(!"not reached");
2320 // the rIP of the instruction following IN/OUT is stored in the exitinfo2 field
2322 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_exitinfo2_rd(&g->vmcb));
2324 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, saved_rip);
2325 assert(err_is_ok(err));
2327 return HANDLER_ERR_OK;
2331 handle_vmexit_msr (struct guest *g) {
2333 bool write = amd_vmcb_exitinfo1_rd(&g->vmcb) == 1;
2337 bool write = (saved_exit_reason == VMX_EXIT_REASON_WRMSR);
2338 struct msr_entry *guest_msr_area = (struct msr_entry *)g->msr_area_va;
2340 uint32_t msr = guest_get_ecx(g);
2343 // there may be writes or reads to MSRs
2345 // fetch the value to write from EDX:EAX
2346 val = ((uint64_t)guest_get_edx(g) << 32) | guest_get_eax(g);
2348 // store the value to be written into the corresponding location
2350 case X86_MSR_SYSENTER_CS:
2352 amd_vmcb_sysenter_cs_wr(&g->vmcb, val);
2354 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_CS, val);
2357 case X86_MSR_SYSENTER_ESP:
2359 amd_vmcb_sysenter_esp_wr(&g->vmcb, val);
2361 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_ESP, val);
2364 case X86_MSR_SYSENTER_EIP:
2366 amd_vmcb_sysenter_eip_wr(&g->vmcb, val);
2368 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_SYSENTER_EIP, val);
2373 amd_vmcb_efer_wr_raw(&g->vmcb, val);
2375 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_EFER_F, val);
2378 case X86_MSR_FS_BASE:
2380 amd_vmcb_fs_base_wr(&g->vmcb, val);
2382 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_FS_BASE, val);
2385 case X86_MSR_GS_BASE:
2387 amd_vmcb_gs_base_wr(&g->vmcb, val);
2389 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_GS_BASE, val);
2393 case X86_MSR_KERNEL_GS_BASE:
2394 amd_vmcb_kernel_gs_base_wr(&g->vmcb, val);
2397 amd_vmcb_star_wr(&g->vmcb, val);
2400 amd_vmcb_lstar_wr(&g->vmcb, val);
2403 amd_vmcb_cstar_wr(&g->vmcb, val);
2405 case X86_MSR_SFMASK:
2406 amd_vmcb_sfmask_wr(&g->vmcb, val);
2409 printf("MSR: unhandled MSR write access to %x\n", msr);
2410 return handle_vmexit_unhandeled(g);
2413 case 0x8b: // IA32_BIOS_SIGN_ID
2416 case X86_MSR_BIOS_SIGN_ID:
2420 msr_index = vmx_guest_msr_index(msr);
2421 if (msr_index == -1) {
2422 printf("MSR: unhandled MSR write access to %x\n", msr);
2423 return handle_vmexit_unhandeled(g);
2425 guest_msr_area[msr_index].val = val;
2430 // read the value from the corresponding location
2432 case X86_MSR_SYSENTER_CS:
2434 val = amd_vmcb_sysenter_cs_rd(&g->vmcb);
2436 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_CS, &val);
2439 case X86_MSR_SYSENTER_ESP:
2441 val = amd_vmcb_sysenter_esp_rd(&g->vmcb);
2443 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_ESP, &val);
2446 case X86_MSR_SYSENTER_EIP:
2448 val = amd_vmcb_sysenter_eip_rd(&g->vmcb);
2450 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_SYSENTER_EIP, &val);
2455 val = amd_vmcb_efer_rd_raw(&g->vmcb);
2457 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_EFER_F, &val);
2460 case X86_MSR_FS_BASE:
2462 val = amd_vmcb_fs_base_rd(&g->vmcb);
2464 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_FS_BASE, &val);
2467 case X86_MSR_GS_BASE:
2469 val = amd_vmcb_gs_base_rd(&g->vmcb);
2471 err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_GS_BASE, &val);
2475 case X86_MSR_KERNEL_GS_BASE:
2476 val = amd_vmcb_kernel_gs_base_rd(&g->vmcb);
2479 val = amd_vmcb_star_rd(&g->vmcb);
2482 val = amd_vmcb_lstar_rd(&g->vmcb);
2485 val = amd_vmcb_cstar_rd(&g->vmcb);
2487 case X86_MSR_SFMASK:
2488 val = amd_vmcb_sfmask_rd(&g->vmcb);
2491 printf("MSR: unhandled MSR read access to %x\n", msr);
2492 return handle_vmexit_unhandeled(g);
2510 case 0x1a0: // IA32_MISC_ENABLE
2511 val = 0x1; // Fast-Strings Enable
2520 msr_index = vmx_guest_msr_index(msr);
2521 if (msr_index == -1) {
2522 printf("MSR: unhandled MSR read access to %x\n", msr);
2523 return handle_vmexit_unhandeled(g);
2525 val = guest_msr_area[msr_index].val;
2528 case X86_MSR_APIC_BASE:
2529 case X86_MSR_BIOS_SIGN_ID:
2530 case X86_MSR_MTRRCAP:
2531 case X86_MSR_MCG_CAP:
2532 case X86_MSR_MCG_STATUS:
2534 case X86_MTRR_DEF_TYPE:
2537 case X86_MSR_MISC_ENABLE:
2538 val = 0x1; // enable fast-string instructions
2541 msr_index = vmx_guest_msr_index(msr);
2542 if (msr_index == -1) {
2543 printf("MSR: unhandled MSR read access to %x\n", msr);
2544 return handle_vmexit_unhandeled(g);
2546 val = guest_msr_area[msr_index].val;
2552 // store the value in EDX:EAX
2553 guest_set_eax(g, val);
2554 guest_set_edx(g, val >> 32);
2557 // advance the rip beyond the current instruction
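// (RDMSR and WRMSR both encode as two bytes, 0F 32 and 0F 30, hence +2)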
2559 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
2562 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2563 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
2564 assert(err_is_ok(err));
2566 return HANDLER_ERR_OK;
2570 handle_vmexit_cpuid (struct guest *g) {
2571 uint32_t eax, ebx, ecx, edx;
2572 uint32_t func = guest_get_eax(g);
2574 /* the register values are copied from an emulated Pentium processor in QEMU */
2577 // Processor Vendor and Largest Standard Function Number
2580 // largest supported standard (or extended) function number
2581 eax = func == 0 ? 0x1 : 0x80000000;
2582 // string "AuthenticAMD"
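// (CPUID returns the vendor string in EBX, EDX, ECX:
//  EBX = 0x68747541 "Auth", EDX = 0x69746e65 "enti", ECX = 0x444d4163 "cAMD")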
2588 // Family, Model, Stepping Identifiers
2590 // we simulate an AMD K6-3D
2591 // Family 5, Model 8, Stepping 12
2593 // no brand, clflush size 16, no multiprocessing, no local apic
2595 // support the popcnt instr
2597 // support some basic features
2602 // for any other request, use the host's answer
2603 // FIXME: this is probably not a good idea ;)
2604 cpuid(func, &eax, &ebx, &ecx, &edx);
2605 printf("handle_vmexit_cpuid: CPUID: func %x, host reports: eax %x, "
2606 "ebx %x, ecx %x, edx %x\n", func, eax, ebx, ecx, edx);
2635 guest_set_eax(g, eax);
2636 guest_set_ebx(g, ebx);
2637 guest_set_ecx(g, ecx);
2638 guest_set_edx(g, edx);
2640 // advance the rip beyond the instruction
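// (CPUID encodes as two bytes, 0F A2, hence +2)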
2642 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 2);
2645 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2646 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 2);
2647 assert(err_is_ok(err));
2649 return HANDLER_ERR_OK;
2653 handle_vmexit_vmmcall (struct guest *g) {
2654 /*printf("VMMCALL: tsc %lu, exits with mon invocation %lu, exits w/o mon "
2655 "invocation %lu\n", rdtsc(),
2656 g->ctrl->num_vm_exits_with_monitor_invocation,
2657 g->ctrl->num_vm_exits_without_monitor_invocation);*/
2659 // advance the rip beyond the instruction
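// (VMMCALL / VMCALL encode as three bytes, 0F 01 D9 / 0F 01 C1, hence +3)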
2661 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 3);
2664 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2665 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 3);
2666 assert(err_is_ok(err));
2668 return HANDLER_ERR_OK;
2672 handle_vmexit_hlt (struct guest *g) {
2673 // the guest has nothing to do - poll our IRQ sources for pending IRQs
2674 // if they do not assert a virtual IRQ then we will do nothing
2675 lpc_pic_process_irqs(g->lpc);
2677 // advance the rip beyond the instruction
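// (HLT is a single-byte instruction, F4, hence +1)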
2679 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) + 1);
2682 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2683 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip + 1);
2686 // running HLT with IRQs masked does not make any sense
2687 // FIXME: this assert is silly; shutting down the VM would be the right way
2689 guest_assert(g, amd_vmcb_rflags_rd(&g->vmcb).intrf == 1);
2691 uint64_t guest_rflags;
2692 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RFLAGS, &guest_rflags);
2693 assert(err_is_ok(err));
2694 guest_assert(g, guest_rflags & RFLAGS_IF);
2696 if (virq_pending(g, NULL, NULL)) {
2697 // there is an IRQ pending, proceed as normal, the CPU will take it
2699 // there is really nothing to do - stop the VM and wait
2700 g->runnable = false;
2703 return HANDLER_ERR_OK;
2707 decode_mov_instr_length (struct guest *g, uint8_t *code)
2711 // we only support long mode for now
2712 //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);
2714 // all non-special MOV instructions use one opcode byte and at least a ModRM byte
2717 // check for the REX prefix
2718 if ((code[0] >> 4) == 0x4) {
2722 // precaution because I did not check all variants of MOV; at least these two
2723 // variants are supported
2724 assert(code[0] == 0x89 || code[0] == 0x8b);
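// (worked example: 89 45 08 = mov [rbp+8], eax is opcode + ModRM (mod = 01,
//  rm = rbp) + disp8 = 3 bytes; with a REX.W prefix, 48 89 45 08, it is 4)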
2726 union x86_modrm modrm = { .raw = code[1] };
2727 // check for displacements
2728 if (modrm.u.mod == 0x1) {
2731 } else if (modrm.u.mod == 0x2) {
2736 // check for SIB byte
2737 if (modrm.u.rm == 0x4 && modrm.u.mod != 0x3) {
2744 // finds out whether a MOV instruction is a read or a write with respect to memory
2747 decode_mov_is_write (struct guest *g, uint8_t *code)
2749 // check for the REX prefix
2750 if ((code[0] >> 4) == 0x4) {
2754 // we only support one move variant (in each direction) for now
2755 assert(code[0] == 0x89 || code[0] == 0x8b);
2757 union x86_modrm modrm = { .raw = code[1] };
2758 // not defined for reg to reg moves
2759 assert(modrm.u.mod != 3);
2761 return code[0] == 0x89; // 0x89 ==> MOV reg -> mem
2764 static inline enum opsize
2765 decode_mov_op_size (struct guest *g, uint8_t *code)
2768 printf("EFER: 0x%lx\n", amd_vmcb_efer_rd_raw(&g->vmcb));
2769 printf("Code: 0x%lx\n", *((uint64_t *)code));
2770 printf("Code[0]: 0x%x, Code[1]: 0x%x, Code[2]: 0x%x, Code[3]: 0x%x\n", code[0],code[1],code[2],code[3]);
2771 printf("Guest EAX: 0x%x\n", guest_get_eax(g));
2772 printf("Guest EBX: 0x%x\n", guest_get_ebx(g));
2773 printf("Guest ECX: 0x%x\n", guest_get_ecx(g));
2775 printf("Guest EDX: 0x%x\n", guest_get_edx(g));
2776 printf("Guest RDI: 0x%lx\n", guest_get_rdi(g));
2777 printf("Guest RSI: 0x%lx\n", guest_get_rsi(g));
2778 printf("Guest RSP: 0x%lx\n", guest_get_rsp(g));
2779 printf("Guest RBP: 0x%lx\n", guest_get_rbp(g));
2782 // we only support long mode for now
2783 //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);
2785 // check for the REX prefix
2786 if ((code[0] >> 4) == 0x4 && code[0] & 0x48) {
2793 static inline uint64_t
2794 decode_mov_src_val (struct guest *g, uint8_t *code) {
2796 // we only support long mode for now
2797 //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);
2799 // check for the REX prefix
2800 if ((code[0] >> 4) == 0x4) {
2804 // we only support one variant for now
2805 assert(code[0] == 0x89);
2807 union x86_modrm modrm = { .raw = code[1] };
2808 return get_reg_val_by_reg_num(g, modrm.u.regop);
2813 decode_mov_dest_val (struct guest *g, uint8_t *code, uint64_t val)
2815 // we only support long mode for now
2816 //assert(amd_vmcb_efer_rd(&g->vmcb).lma == 1);
2818 // check for the REX prefix
2819 if ((code[0] >> 4) == 0x4) {
2823 // we only support one variant for now
2824 assert(code[0] == 0x8b);
2826 union x86_modrm modrm = { .raw = code[1] };
2827 set_reg_val_by_reg_num(g, modrm.u.regop, val);
2831 #define TDBAL_OFFSET 0x3800
2832 #define TDBAH_OFFSET 0x3804
2833 #define RDBAL_OFFSET 0x2800
2834 #define RDBAH_OFFSET 0x2804
2835 #define TDT_OFFSET 0x3818 // Transmit Descriptor Tail; writing it triggers transmission
2836 #define TCTL_OFFSET 0x400 // Transmit Control
2838 #define IMS_OFFSET 0xd0 // Interrupt Mask Set/Read Register
2839 #define ICS_OFFSET 0xc8 // Interrupt Cause Set Register
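// The descriptor-base registers above (TDBAL/TDBAH, RDBAL/RDBAH) hold the
// guest-physical addresses of the e1000 descriptor rings; presumably they
// have to be rewritten with host-physical addresses before reaching the real
// device, which is what register_needs_translation() below selects for.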
2841 static int register_needs_translation(uint64_t addr){
2843 addr == TDBAL_OFFSET ||
2844 addr == TDBAH_OFFSET ||
2845 addr == RDBAL_OFFSET ||
2846 addr == RDBAH_OFFSET
2856 #define MMIO_MASK(bytes) (~(~(bytes) + 1)) // equivalent to ~(-(bytes)) and to ((bytes) - 1): the low-bit mask for a power-of-two size
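// (for instance, assuming a 128 KB BAR: MMIO_MASK(0x20000) == 0x1ffff, so a
//  fault at bar->paddr + 0x3818 yields the register offset TDT_OFFSET)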
2859 handle_vmexit_npf (struct guest *g) {
2862 uint64_t fault_addr = amd_vmcb_exitinfo2_rd(&g->vmcb);
2864 uint64_t fault_addr, guest_rip;
2865 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_GPADDR_F, &fault_addr);
2866 assert(err_is_ok(err));
2868 uint8_t *code = NULL;
2870 // check for fault inside the guest physical memory region
2871 if (fault_addr >= g->mem_low_va && fault_addr < g->mem_high_va) {
2872 // allocate the missing memory
2873 alloc_guest_mem(g, fault_addr & ~BASE_PAGE_MASK, BASE_PAGE_SIZE);
2874 // do not advance the RIP, it is safe (and necessary) to
2875 // replay the faulting instruction
2876 return HANDLER_ERR_OK;
2879 // fetch the location of the faulting code
2880 r = get_instr_arr(g, &code);
2884 switch (fault_addr & ~BASE_PAGE_MASK) {
2889 assert(g->apic != NULL);
2890 size = decode_mov_op_size(g, code);
2891 if (decode_mov_is_write(g, code)) {
2892 val = decode_mov_src_val(g, code);
2893 r = apic_handle_mmio_write(g->apic, fault_addr, size, val);
2896 r = apic_handle_mmio_read(g->apic, fault_addr, size, &val);
2898 decode_mov_dest_val(g, code, val);
2901 // advance the rip beyond the instruction
2903 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) +
2904 decode_mov_instr_length(g, code));
2906 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2907 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip +
2908 decode_mov_instr_length(g, code));
2909 assert(err_is_ok(err));
2911 return HANDLER_ERR_OK;
2915 // Check whether this is an access to PCI device memory
2917 for(int bus_i = 0; bus_i<256; bus_i++){
2918 for(int dev_i = 0; dev_i < 32; dev_i++){
2919 struct pci_bus *bus = g->pci->bus[bus_i];
2921 struct pci_device* dev = bus->device[dev_i];
2923 for(int bar_i=0; bar_i<5; bar_i++){
2924 struct bar_info *curbar = &dev->bars[bar_i];
2925 if(curbar->paddr <= fault_addr && fault_addr < curbar->paddr + curbar->bytes){
2926 if(decode_mov_is_write(g, code)){
2927 uint64_t val = decode_mov_src_val(g, code);
2928 if(dev->mem_write) {
2929 dev->mem_write(dev, MMIO_MASK(curbar->bytes) & fault_addr, bar_i, val );
2936 dev->mem_read(dev, MMIO_MASK(curbar->bytes) & fault_addr, bar_i, (uint32_t*)&val);
2937 decode_mov_dest_val(g, code, val);
2943 amd_vmcb_rip_wr(&g->vmcb, amd_vmcb_rip_rd(&g->vmcb) +
2944 decode_mov_instr_length(g, code));
2946 err += invoke_dispatcher_vmread(g->dcb_cap, VMX_GUEST_RIP, &guest_rip);
2947 err += invoke_dispatcher_vmwrite(g->dcb_cap, VMX_GUEST_RIP, guest_rip +
2948 decode_mov_instr_length(g, code));
2949 assert(err_is_ok(err));
2951 return HANDLER_ERR_OK;
2960 printf("vmkitmon: access to an unknown memory location: %lx\n", fault_addr);
2961 return handle_vmexit_unhandeled(g);
2964 typedef int (*vmexit_handler)(struct guest *g);
2967 static vmexit_handler vmexit_handlers[0x8c] = {
2968 [SVM_VMEXIT_CR0_READ] = handle_vmexit_cr_access,
2969 [SVM_VMEXIT_CR0_WRITE] = handle_vmexit_cr_access,
2970 [SVM_VMEXIT_CR0_SEL_WRITE] = handle_vmexit_cr_access,
2971 [SVM_VMEXIT_SWINT] = handle_vmexit_swint,
2972 [SVM_VMEXIT_IDTR_WRITE] = handle_vmexit_ldt,
2973 [SVM_VMEXIT_GDTR_WRITE] = handle_vmexit_ldt,
2974 [SVM_VMEXIT_IOIO] = handle_vmexit_ioio,
2975 [SVM_VMEXIT_MSR] = handle_vmexit_msr,
2976 [SVM_VMEXIT_CPUID] = handle_vmexit_cpuid,
2977 [SVM_VMEXIT_VMMCALL] = handle_vmexit_vmmcall,
2978 [SVM_VMEXIT_HLT] = handle_vmexit_hlt
2981 static vmexit_handler vmexit_handlers[0x8c] = {
2982 [VMX_EXIT_REASON_CPUID] = handle_vmexit_cpuid,
2983 [VMX_EXIT_REASON_HLT] = handle_vmexit_hlt,
2984 [VMX_EXIT_REASON_VMCALL] = handle_vmexit_vmmcall,
2985 [VMX_EXIT_REASON_CR_ACCESS] = handle_vmexit_cr_access,
2986 [VMX_EXIT_REASON_INOUT] = handle_vmexit_ioio,
2987 [VMX_EXIT_REASON_RDMSR] = handle_vmexit_msr,
2988 [VMX_EXIT_REASON_WRMSR] = handle_vmexit_msr,
2989 [VMX_EXIT_REASON_GDTR_IDTR] = handle_vmexit_ldt,
2990 [VMX_EXIT_REASON_EPT_FAULT] = handle_vmexit_npf,
2991 [VMX_EXIT_REASON_SWINT] = handle_vmexit_swint
2996 guest_handle_vmexit (struct guest *g) {
2997 //struct pci_ethernet * eth = (struct pci_ethernet * ) g->pci->bus[0]->device[2]->state;//
2998 //printf("guest_handle_vmexit\n");
2999 vmexit_handler handler;
3001 uint64_t exitcode = amd_vmcb_exitcode_rd(&g->vmcb);
3002 if (exitcode == SVM_VMEXIT_NPF) {
3003 handler = handle_vmexit_npf;
3004 } else if (LIKELY(vmexit_handlers[exitcode] != NULL)) {
3005 handler = vmexit_handlers[exitcode];
3007 handle_vmexit_unhandeled(g);
3011 if (!g->emulated_before_exit) {
3012 errval_t err = invoke_dispatcher_vmread(g->dcb_cap, VMX_EXIT_REASON,
3013 (uint64_t *)&saved_exit_reason);
3014 assert(err_is_ok(err));
3017 if (LIKELY(vmexit_handlers[saved_exit_reason] != NULL)) {
3018 handler = vmexit_handlers[saved_exit_reason];
3020 handle_vmexit_unhandeled(g);
3025 if (LIKELY(r == HANDLER_ERR_OK)) {
3027 guest_make_runnable(g, true);