akaros/kern/arch/x86/vmm/vmm.c
/* Copyright 2015 Google Inc.
 *
 * See LICENSE for details.
 */

/* We're not going to fall into the trap of only compiling support
 * for AMD OR Intel for an image. It all gets compiled in, and which
 * one you use depends on cpuinfo, not a compile-time
 * switch. That's proven to be the best strategy.  Conditionally
 * compiling in support is the path to hell.
 */
#include <assert.h>
#include <pmap.h>
#include <smp.h>
#include <kmalloc.h>

#include <ros/vmm.h>
#include "intel/vmx.h"
#include "vmm.h"
#include <trap.h>
#include <umem.h>

#include <arch/x86.h>
#include <ros/procinfo.h>

/* TODO: have better cpuid info storage and checks */
bool x86_supports_vmx = FALSE;

/* Figure out what kind of CPU we are on, and if it supports any reasonable
 * virtualization. For now, if we're not some sort of newer intel, don't
 * bother. This does all cores. Again, note, we make these decisions at runtime,
 * to avoid getting into the problems that compile-time decisions can cause.
 * At this point, of course, it's still all intel.
 */
void vmm_init(void)
{
        int ret;

        /* Check first for intel capabilities. This is hence two back-to-back
         * implementation-dependent checks. That's ok, it's all msr dependent.
         */
        ret = intel_vmm_init();
        if (!ret) {
                x86_supports_vmx = TRUE;
                return;
        }

        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_init failed, ret %d\n", ret);
        return;
}

void vmm_pcpu_init(void)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        pcpui->guest_pcoreid = -1;
        if (!x86_supports_vmx)
                return;
        if (!intel_vmm_pcpu_init()) {
                printd("vmm_pcpu_init worked\n");
                return;
        }
        /* TODO: AMD. Will we ever care? It's not clear. */
        printk("vmm_pcpu_init failed\n");
}

/* Ensures a process is ready to run virtual machines, though it may have no
 * guest pcores yet.  Typically, this is called by other vmm functions.  Caller
 * holds the qlock.  Throws on error. */
void __vmm_struct_init(struct proc *p)
{
        struct vmm *vmm = &p->vmm;

        if (vmm->vmmcp)
                return;
        if (!x86_supports_vmx)
                error(ENODEV, "This CPU does not support VMX");
        vmm->vmmcp = TRUE;
        vmm->amd = 0;
        vmx_setup_vmx_vmm(&vmm->vmx);
        for (int i = 0; i < VMM_VMEXIT_NR_TYPES; i++)
                vmm->vmexits[i] = 0;
        vmm->nr_guest_pcores = 0;
        vmm->guest_pcores = NULL;
        vmm->gpc_array_elem = 0;
}

/* Helper, grows the array of guest_pcores in vmm.  Concurrent readers
 * (lookup_guest_pcore) need to use a seq-lock-style of concurrency.  They could
 * read the old array even after we free it. */
static void __vmm_grow_gpc_array(struct vmm *vmm, unsigned int new_nr_gpcs)
{
        struct guest_pcore **new_array, **old_array;
        size_t new_nr_elem;

        if (new_nr_gpcs <= vmm->gpc_array_elem)
                return;
        /* TODO: (RCU) we could defer the free, maybe with an RCU-safe krealloc.
         */
        old_array = vmm->guest_pcores;
        new_nr_elem = MAX(vmm->gpc_array_elem * 2, new_nr_gpcs);
        new_array = kzmalloc(new_nr_elem * sizeof(void*), MEM_WAIT);
        memcpy(new_array, vmm->guest_pcores,
               sizeof(void*) * vmm->nr_guest_pcores);
        wmb();  /* all elements written before changing pointer */
        vmm->guest_pcores = new_array;
        wmb();  /* ptr written before potentially clobbering it. */
        kfree(old_array);
        /* Track the new capacity, so the size check above can skip the
         * realloc next time. */
        vmm->gpc_array_elem = new_nr_elem;
}

/* Adds gpcs to the VMM.  Caller holds the qlock; throws on error. */
void __vmm_add_gpcs(struct proc *p, unsigned int nr_more_gpcs,
                    struct vmm_gpcore_init *u_gpcis)
{
        struct vmm *vmm = &p->vmm;
        struct vmm_gpcore_init gpci;
        unsigned int new_nr_gpcs;

        if (!nr_more_gpcs)
                return;
        new_nr_gpcs = vmm->nr_guest_pcores + nr_more_gpcs;
        if ((new_nr_gpcs < vmm->nr_guest_pcores) || (new_nr_gpcs > 10000))
                error(EINVAL, "Can't add %u new gpcs", new_nr_gpcs);
        __vmm_grow_gpc_array(vmm, new_nr_gpcs);
        for (int i = 0; i < nr_more_gpcs; i++) {
                if (copy_from_user(&gpci, &u_gpcis[i],
                                   sizeof(struct vmm_gpcore_init)))
                        error(EINVAL, "Bad pointer %p for gps", u_gpcis);
                vmm->guest_pcores[vmm->nr_guest_pcores] =
                        create_guest_pcore(p, &gpci);
                /* concurrent readers will check nr_guest_pcores first */
                wmb();
                vmm->nr_guest_pcores++;
        }
}

/* Has no concurrency protection - only call this when you know you have the
 * only ref to vmm.  For instance, from __proc_free, where there is only one ref
 * to the proc (and thus proc.vmm). */
void __vmm_struct_cleanup(struct proc *p)
{
        struct vmm *vmm = &p->vmm;

        if (!vmm->vmmcp)
                return;
        for (int i = 0; i < vmm->nr_guest_pcores; i++) {
                if (vmm->guest_pcores[i])
                        destroy_guest_pcore(vmm->guest_pcores[i]);
        }
        kfree(vmm->guest_pcores);
        ept_flush(p->env_pgdir.eptp);
        vmm->vmmcp = FALSE;
}

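/* Sends a poke IPI to the physical core currently running guest_pcoreid, if
 * any.  Best effort: returns 0 for a valid gpc even if the poke wasn't sent,
 * and -1 (with error set) if the gpc doesn't exist. */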
int vmm_poke_guest(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        int pcoreid;

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        if (!gpc) {
                set_error(ENOENT, "Bad guest_pcoreid %d", guest_pcoreid);
                return -1;
        }
        /* We're doing an unlocked peek; it could change immediately.  This is a
         * best effort service. */
        pcoreid = ACCESS_ONCE(gpc->cpu);
        if (pcoreid == -1) {
                /* So we know that we'll miss the poke for the posted IRQ.  We
                 * could return an error.  However, error handling for this case
                 * isn't particularly helpful (yet).  The absence of the error
                 * does not mean the IRQ was posted.  We'll still return 0,
                 * meaning "the user didn't mess up; we tried." */
                return 0;
        }
        send_ipi(pcoreid, I_POKE_GUEST);
        return 0;
}

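/* Returns the guest_pcore for guest_pcoreid, or NULL if it doesn't exist.
 * Lock-free reader: rereads the array pointer, seq-lock style, in case
 * __vmm_grow_gpc_array() swapped the array out from under us. */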
struct guest_pcore *lookup_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore **array;
        struct guest_pcore *ret;

        if (guest_pcoreid < 0)
                return NULL;
        /* nr_guest_pcores only grows; the gpc is published (with a wmb) before
         * the count is bumped in __vmm_add_gpcs(). */
        if (guest_pcoreid >= p->vmm.nr_guest_pcores)
                return NULL;
        /* TODO: (RCU) Synchronizing with __vmm_grow_gpc_array() */
        do {
                array = ACCESS_ONCE(p->vmm.guest_pcores);
                ret = array[guest_pcoreid];
                rmb();  /* read ret before rereading array pointer */
        } while (array != ACCESS_ONCE(p->vmm.guest_pcores));
        return ret;
}

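/* Claims guest_pcoreid for this physical core and loads its VMCS, xcr0, and
 * manually managed MSRs.  Returns the gpc on success, or NULL if it doesn't
 * exist or is already loaded on another core. */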
struct guest_pcore *load_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        if (!gpc)
                return NULL;
        assert(pcpui->guest_pcoreid == -1);
        spin_lock(&p->vmm.lock);
        if (gpc->cpu != -1) {
                spin_unlock(&p->vmm.lock);
                return NULL;
        }
        gpc->cpu = core_id();
        spin_unlock(&p->vmm.lock);
        /* We've got dibs on the gpc; we don't need to hold the lock any longer.
         */
        pcpui->guest_pcoreid = guest_pcoreid;
        vmx_load_guest_pcore(gpc);
        /* Load guest's xcr0 */
        lxcr0(gpc->xcr0);

        /* Manual MSR save/restore */
        write_kern_gsbase(gpc->msr_kern_gs_base);
        if (gpc->msr_star != AKAROS_MSR_STAR)
                write_msr(MSR_STAR, gpc->msr_star);
        if (gpc->msr_lstar != AKAROS_MSR_LSTAR)
                write_msr(MSR_LSTAR, gpc->msr_lstar);
        if (gpc->msr_sfmask != AKAROS_MSR_SFMASK)
                write_msr(MSR_SFMASK, gpc->msr_sfmask);

        return gpc;
}

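/* Inverse of load_guest_pcore(): saves the guest's xcr0 and manually managed
 * MSRs into the gpc, restores Akaros's values, and releases the gpc so another
 * core can load it. */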
void unload_guest_pcore(struct proc *p, int guest_pcoreid)
{
        struct guest_pcore *gpc;
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        gpc = lookup_guest_pcore(p, guest_pcoreid);
        assert(gpc);
        spin_lock(&p->vmm.lock);
        assert(gpc->cpu != -1);
        vmx_unload_guest_pcore(gpc);
        gpc->cpu = -1;

        /* Save guest's xcr0 and restore Akaros's default. */
        gpc->xcr0 = rxcr0();
        lxcr0(__proc_global_info.x86_default_xcr0);

        /* We manage these MSRs manually. */
        gpc->msr_kern_gs_base = read_kern_gsbase();
        gpc->msr_star = read_msr(MSR_STAR);
        gpc->msr_lstar = read_msr(MSR_LSTAR);
        gpc->msr_sfmask = read_msr(MSR_SFMASK);

        write_kern_gsbase((uint64_t)pcpui);
        if (gpc->msr_star != AKAROS_MSR_STAR)
                write_msr(MSR_STAR, AKAROS_MSR_STAR);
        if (gpc->msr_lstar != AKAROS_MSR_LSTAR)
                write_msr(MSR_LSTAR, AKAROS_MSR_LSTAR);
        if (gpc->msr_sfmask != AKAROS_MSR_SFMASK)
                write_msr(MSR_SFMASK, AKAROS_MSR_SFMASK);

        /* As soon as we unlock, this gpc can be started on another core */
        spin_unlock(&p->vmm.lock);
        pcpui->guest_pcoreid = -1;
}

/* Emulated MSR.  For now, an MSR value and a pointer to a helper that
 * performs the requested operation.
 */
struct emmsr {
        uint32_t reg;
        char *name;
        bool (*f)(struct emmsr *msr, struct vm_trapframe *vm_tf,
                  uint32_t opcode);
        bool written;
        uint32_t edx, eax;
};

static bool emsr_miscenable(struct emmsr *msr, struct vm_trapframe *vm_tf,
                            uint32_t opcode);
static bool emsr_readonly(struct emmsr *msr, struct vm_trapframe *vm_tf,
                          uint32_t opcode);
static bool emsr_readzero(struct emmsr *msr, struct vm_trapframe *vm_tf,
                          uint32_t opcode);
static bool emsr_fakewrite(struct emmsr *msr, struct vm_trapframe *vm_tf,
                           uint32_t opcode);
static bool emsr_ok(struct emmsr *msr, struct vm_trapframe *vm_tf,
                    uint32_t opcode);
static bool emsr_fake_apicbase(struct emmsr *msr, struct vm_trapframe *vm_tf,
                               uint32_t opcode);
static bool emsr_lapic_icr(struct emmsr *msr, struct vm_trapframe *vm_tf,
                           uint32_t opcode);

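/* The MSRs we emulate.  vmm_emulate_msr() scans this table and dispatches to
 * the matching handler. */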
struct emmsr emmsrs[] = {
        {MSR_LAPIC_ICR, "MSR_LAPIC_ICR", emsr_lapic_icr},
        {MSR_IA32_MISC_ENABLE, "MSR_IA32_MISC_ENABLE", emsr_miscenable},
        {MSR_IA32_SYSENTER_CS, "MSR_IA32_SYSENTER_CS", emsr_ok},
        {MSR_IA32_SYSENTER_EIP, "MSR_IA32_SYSENTER_EIP", emsr_ok},
        {MSR_IA32_SYSENTER_ESP, "MSR_IA32_SYSENTER_ESP", emsr_ok},
        {MSR_IA32_UCODE_REV, "MSR_IA32_UCODE_REV", emsr_fakewrite},
        {MSR_CSTAR, "MSR_CSTAR", emsr_fakewrite},
        {MSR_IA32_VMX_BASIC_MSR, "MSR_IA32_VMX_BASIC_MSR", emsr_fakewrite},
        {MSR_IA32_VMX_PINBASED_CTLS_MSR, "MSR_IA32_VMX_PINBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS_MSR, "MSR_IA32_VMX_PROCBASED_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_PROCBASED_CTLS2, "MSR_IA32_VMX_PROCBASED_CTLS2",
         emsr_fakewrite},
        {MSR_IA32_VMX_EXIT_CTLS_MSR, "MSR_IA32_VMX_EXIT_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_VMX_ENTRY_CTLS_MSR, "MSR_IA32_VMX_ENTRY_CTLS_MSR",
         emsr_fakewrite},
        {MSR_IA32_ENERGY_PERF_BIAS, "MSR_IA32_ENERGY_PERF_BIAS",
         emsr_fakewrite},
        {MSR_LBR_SELECT, "MSR_LBR_SELECT", emsr_ok},
        {MSR_LBR_TOS, "MSR_LBR_TOS", emsr_ok},
        {MSR_LBR_NHM_FROM, "MSR_LBR_NHM_FROM", emsr_ok},
        {MSR_LBR_NHM_TO, "MSR_LBR_NHM_TO", emsr_ok},
        {MSR_LBR_CORE_FROM, "MSR_LBR_CORE_FROM", emsr_ok},
        {MSR_LBR_CORE_TO, "MSR_LBR_CORE_TO", emsr_ok},

        // grumble.
        {MSR_OFFCORE_RSP_0, "MSR_OFFCORE_RSP_0", emsr_ok},
        {MSR_OFFCORE_RSP_1, "MSR_OFFCORE_RSP_1", emsr_ok},
        // louder.
        {MSR_PEBS_LD_LAT_THRESHOLD, "MSR_PEBS_LD_LAT_THRESHOLD", emsr_ok},
        // aaaaaahhhhhhhhhhhhhhhhhhhhh
        {MSR_ARCH_PERFMON_EVENTSEL0, "MSR_ARCH_PERFMON_EVENTSEL0", emsr_ok},
        {MSR_ARCH_PERFMON_EVENTSEL1, "MSR_ARCH_PERFMON_EVENTSEL1", emsr_ok},
        {MSR_IA32_PERF_CAPABILITIES, "MSR_IA32_PERF_CAPABILITIES",
         emsr_readzero},
        // unsafe.
        {MSR_IA32_APICBASE, "MSR_IA32_APICBASE", emsr_fake_apicbase},

        // mostly harmless.
        {MSR_TSC_AUX, "MSR_TSC_AUX", emsr_fakewrite},
        {MSR_RAPL_POWER_UNIT, "MSR_RAPL_POWER_UNIT", emsr_readzero},
        {MSR_IA32_MCG_CAP, "MSR_IA32_MCG_CAP", emsr_readzero},
        {MSR_IA32_DEBUGCTLMSR, "MSR_IA32_DEBUGCTLMSR", emsr_fakewrite},

        // TBD
        {MSR_IA32_TSC_DEADLINE, "MSR_IA32_TSC_DEADLINE", emsr_fakewrite},
};

/* Here are the rules for IPI injection:
 * 1) The guest can't sleep if notif is set.
 * 2) Userspace must wake the guest if notif is set, unconditionally.
 * 3) Whoever sets notif must make sure the interrupt gets injected.
 *
 * This allows the kernel to set notif and possibly lose a race with a
 * concurrently halting / vmexiting guest.
 *
 * Guest sleeping happens in userspace in the halt/mwait vmexit handler.  If
 * userspace (vmm_interrupt_guest()) sees notif set, it must try to wake the
 * guest - even if the user didn't set notif.  If the kernel sets notif, it
 * might be able to know the guest is running.  But if that fails, we have to
 * kick it back to userspace (return false here).  In that case, even though
 * userspace didn't set notif, it must attempt to wake the guest.
 *
 * For 3, the kernel can often know if the guest is running.  Then it can send
 * the posted IPI, then reconfirm the guest is running.  If that fails, or if it
 * *might* have failed, the guest still needs to get the IRQ.  The next time the
 * guest runs after notif was set, the interrupt will be injected.  If the
 * kernel kicks it back to userspace, the guest will wake or will fail to halt
 * (due to notif being set), and the next time it runs, the kernel will inject
 * the IPI (when we pop the vmtf).
 *
 * There's another case: the kernel sets notif, reads the coreid, sends the IPI,
 * and then sees the coreid is changed.  If the coreid is -1, the GPC isn't
 * loaded/running, and we kick back to userspace (as above).  If the coreid is
 * not -1, it is running somewhere else.  It might have missed the IPI, but
 * since the guest was popped on a core after notif was set, the IRQ was
 * posted/injected. */
static bool emsr_lapic_icr_write(struct emmsr *msr, struct vm_trapframe *tf)
{
        uint32_t destination = tf->tf_rdx & 0xffffffff;
        uint8_t vector = tf->tf_rax & 0xff;
        uint8_t type = (tf->tf_rax >> 8) & 0x7;
        struct guest_pcore *gpc;
        int target_coreid;

        if (type != 0 || destination == 0xffffffff)
                return false;
        gpc = lookup_guest_pcore(current, destination);
        if (!gpc)
                return false;
        SET_BITMASK_BIT_ATOMIC((void*)gpc->posted_irq_desc, vector);
        cmb();  /* atomic does the MB, order set write before test read */
        /* We got lucky and squeezed our IRQ in with someone else's */
        if (test_bit(VMX_POSTED_OUTSTANDING_NOTIF, (void*)gpc->posted_irq_desc))
                return true;
        SET_BITMASK_BIT_ATOMIC((void*)gpc->posted_irq_desc,
                               VMX_POSTED_OUTSTANDING_NOTIF);
        cmb();  /* atomic does the MB, order set write before read of cpu */
        target_coreid = ACCESS_ONCE(gpc->cpu);
        if (target_coreid == -1)
                return false;
        /* If it's us, we'll send_ipi when we restart the VMTF.  Note this is
         * rare: the guest will usually use the self_ipi virtualization. */
        if (target_coreid != core_id())
                send_ipi(target_coreid, I_POKE_GUEST);
        /* No MBs needed here: only that it happens after setting notif */
        if (ACCESS_ONCE(gpc->cpu) == -1)
                return false;
        return true;
}

static bool emsr_lapic_icr(struct emmsr *msr, struct vm_trapframe *tf,
                           uint32_t opcode)
{
        if (opcode == VMM_MSR_EMU_READ)
                return false;
        return emsr_lapic_icr_write(msr, tf);
}

/* This may be the only register that needs special handling.
 * If there are others, we might want to extend the emmsr struct.
 */
bool emsr_miscenable(struct emmsr *msr, struct vm_trapframe *vm_tf,
                     uint32_t opcode)
{
        uint64_t val;
        uint32_t eax, edx;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        eax = low32(val);
        eax |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
        edx = high32(val);
        /* we just let them read the misc msr for now. */
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = eax;
                vm_tf->tf_rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) vm_tf->tf_rax == eax)
                    && ((uint32_t) vm_tf->tf_rdx == edx))
                        return TRUE;
        }
        printk("%s: Wanted to write 0x%x%x, but could not; value was 0x%x%x\n",
               msr->name, (uint32_t) vm_tf->tf_rdx, (uint32_t) vm_tf->tf_rax,
               edx, eax);
        return FALSE;
}

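/* Reads pass through to the real MSR; writes are rejected. */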
bool emsr_readonly(struct emmsr *msr, struct vm_trapframe *vm_tf,
                   uint32_t opcode)
{
        uint64_t val;

        if (read_msr_safe(msr->reg, &val))
                return FALSE;
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = low32(val);
                vm_tf->tf_rdx = high32(val);
                return TRUE;
        }

        printk("%s: Tried to write a readonly register\n", msr->name);
        return FALSE;
}

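/* Reads return zero; writes are rejected. */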
bool emsr_readzero(struct emmsr *msr, struct vm_trapframe *vm_tf,
                   uint32_t opcode)
{
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = 0;
                vm_tf->tf_rdx = 0;
                return TRUE;
        }

        printk("%s: Tried to write a readonly register\n", msr->name);
        return FALSE;
}

/* pretend to write it, but don't write it. */
bool emsr_fakewrite(struct emmsr *msr, struct vm_trapframe *vm_tf,
                    uint32_t opcode)
{
        uint32_t eax, edx;
        uint64_t val;

        if (!msr->written) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
                eax = low32(val);
                edx = high32(val);
        } else {
                eax = msr->eax;
                edx = msr->edx;
        }
        /* Reads return the fake value if one was written, else the real MSR. */
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = eax;
                vm_tf->tf_rdx = edx;
                return TRUE;
        } else {
                msr->edx = vm_tf->tf_rdx;
                msr->eax = vm_tf->tf_rax;
                msr->written = TRUE;
        }
        return TRUE;
}

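/* Passes the access through to the real MSR, for both reads and writes. */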
bool emsr_ok(struct emmsr *msr, struct vm_trapframe *vm_tf,
             uint32_t opcode)
{
        uint64_t val;

        if (opcode == VMM_MSR_EMU_READ) {
                if (read_msr_safe(msr->reg, &val))
                        return FALSE;
                vm_tf->tf_rax = low32(val);
                vm_tf->tf_rdx = high32(val);
        } else {
                val = (vm_tf->tf_rdx << 32) | (vm_tf->tf_rax & 0xffffffff);
                if (write_msr_safe(msr->reg, val))
                        return FALSE;
        }
        return TRUE;
}

/* pretend to write it, but don't write it. */
bool emsr_fake_apicbase(struct emmsr *msr, struct vm_trapframe *vm_tf,
                        uint32_t opcode)
{
        uint32_t eax, edx;

        if (!msr->written) {
                /* TODO: tightly coupled to the addr in vmrunkernel.  We want
                 * this func to return the val that vmrunkernel put into the
                 * VMCS. */
                eax = 0xfee00d00;
                if (vm_tf->tf_guest_pcoreid != 0) {
                        // Remove BSP bit if not core 0
                        eax = 0xfee00c00;
                }
                edx = 0;
        } else {
                edx = msr->edx;
                eax = msr->eax;
        }
        /* Reads return the fake APIC base, not the real MSR. */
        if (opcode == VMM_MSR_EMU_READ) {
                vm_tf->tf_rax = eax;
                vm_tf->tf_rdx = edx;
                return TRUE;
        } else {
                /* if they are writing what is already written, that's ok. */
                if (((uint32_t) vm_tf->tf_rax == eax)
                    && ((uint32_t) vm_tf->tf_rdx == edx))
                        return TRUE;
                msr->edx = vm_tf->tf_rdx;
                msr->eax = vm_tf->tf_rax;
                msr->written = TRUE;
        }
        return TRUE;
}

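/* Emulates the MSR access described by vm_tf (the MSR number is in rcx).
 * Returns TRUE if one of the handlers in emmsrs handled it, FALSE otherwise. */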
bool vmm_emulate_msr(struct vm_trapframe *vm_tf, int op)
{
        for (int i = 0; i < ARRAY_SIZE(emmsrs); i++) {
                if (emmsrs[i].reg != vm_tf->tf_rcx)
                        continue;
                return emmsrs[i].f(&emmsrs[i], vm_tf, op);
        }
        return FALSE;
}