akaros/kern/arch/x86/vmm/intel/vmx.c
   1//#define DEBUG
   2/**
   3 *  vmx.c - The Intel VT-x driver for Dune
   4 *
   5 * This file is derived from Linux KVM VT-x support.
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   8 *
   9 * Original Authors:
  10 *   Avi Kivity   <avi@qumranet.com>
  11 *   Yaniv Kamay  <yaniv@qumranet.com>
  12 *
  13 * This modified version is simpler because it avoids the following
  14 * features that are not requirements for Dune:
  15 *  * Real-mode emulation
  16 *  * Nested VT-x support
  17 *  * I/O hardware emulation
  18 *  * Any of the more esoteric X86 features and registers
  19 *  * KVM-specific functionality
  20 *
  21 * In essence we provide only the minimum functionality needed to run
  22 * a process in vmx non-root mode rather than the full hardware emulation
  23 * needed to support an entire OS.
  24 *
  25 * This driver is a research prototype and as such has the following
  26 * limitations:
  27 *
  28 * FIXME: Backward compatibility is currently a non-goal, and only recent
  29 * full-featured (EPT, PCID, VPID, etc.) Intel hardware is supported by this
  30 * driver.
  31 *
   32 * FIXME: Eventually we should handle concurrent users of VT-x more
  33 * gracefully instead of requiring exclusive access. This would allow
  34 * Dune to interoperate with KVM and other HV solutions.
  35 *
  36 * FIXME: We need to support hotplugged physical CPUs.
  37 *
  38 * Authors:
  39 *   Adam Belay   <abelay@stanford.edu>
  40 */
  41
  42/* Basic flow.
  43 * Yep, it's confusing. This is in part because the vmcs is used twice, for two different things.
   44 * You're left with the feeling that they got partway through and realized they had to have one for each of:
  45 *
  46 * 1) your CPU is going to be capable of running VMs, and you need state for
  47 * that.
  48 *
  49 * 2) you're about to start a guest, and you need state for that.
  50 *
   51 * So there is "get the cpu set up to be able to run VMs" stuff, and "now
   52 * let's start a guest" stuff.  In Akaros, CPUs will always be set up
   53 * to run a VM if that is possible. Processes can flip themselves into
   54 * a VM, and that will require another VMCS.
  55 *
  56 * So: at kernel startup time, the SMP boot stuff calls
  57 * k/a/x86/vmm/vmm.c:vmm_init, which calls arch-dependent bits, which
   58 * in the case of this file is intel_vmm_init. That runs some code
  59 * that sets up stuff for ALL sockets, based on the capabilities of
  60 * the socket it runs on. If any cpu supports vmx, it assumes they all
  61 * do. That's a realistic assumption. So the call_function_all is kind
  62 * of stupid, really; it could just see what's on the current cpu and
  63 * assume it's on all. HOWEVER: there are systems in the wild that
  64 * can run VMs on some but not all CPUs, due to BIOS mistakes, so we
   65 * might as well allow for the chance that we'll only allow VMMCPs on a
  66 * subset (not implemented yet however).  So: probe all CPUs, get a
  67 * count of how many support VMX and, for now, assume they all do
  68 * anyway.
  69 *
  70 * Next, call setup_vmcs_config to configure the GLOBAL vmcs_config struct,
  71 * which contains all the naughty bits settings for all the cpus that can run a
  72 * VM.
  73 * Realistically, all VMX-capable cpus in a system will have identical
  74 * configurations.
  75 * So: 0 or more cpus can run VMX; all cpus which can run VMX will have the same
  76 * configuration.
  77 *
  78 * configure the msr_bitmap. This is the bitmap of MSRs which the
  79 * guest can manipulate.  Currently, we only allow GS and FS base.
  80 *
  81 * Reserve bit 0 in the vpid bitmap as guests can not use that
  82 *
   83 * Set up what we call the vmxarea. The vmxarea is per-cpu, not
   84 * per-guest. Once set up, it is left alone.  The ONLY thing we set in
   85 * there is the revision id. The vmxarea is page-sized per cpu and
  86 * page-aligned. Note that it can be smaller, but why bother? We know
  87 * the max size and alignment, and it's convenient.
  88 *
  89 * Now that it is set up, enable vmx on all cpus. This involves
  90 * testing VMXE in cr4, to see if we've been here before (TODO: delete
  91 * this test), then testing MSR_IA32_FEATURE_CONTROL to see if we can
   92 * do a VM, then setting VMXE in cr4, calling vmxon (does a vmxon
  93 * instruction), and syncing vpid's and ept's.  Now the CPU is ready
  94 * to host guests.
  95 *
  96 * Setting up a guest.
  97 * We divide this into two things: vmm_proc_init and vm_run.
  98 * Currently, on Intel, vmm_proc_init does nothing.
  99 *
 100 * vm_run is really complicated. It is called with a coreid, and
 101 * vmctl struct. On intel, it calls vmx_launch. vmx_launch is set
 102 * up for a few test cases. If rip is 1, it sets the guest rip to
 103 * a function which will deref 0 and should exit with failure 2. If rip is 0,
 104 * it calls an infinite loop in the guest.
 105 *
 106 * The sequence of operations:
 107 * create a vcpu
 108 * while (1) {
 109 * get a vcpu
 110 * disable irqs (required or you can't enter the VM)
 111 * vmx_run_vcpu()
 112 * enable irqs
 113 * manage the vm exit
 114 * }
 115 *
 116 * get a vcpu
 117 * See if the current cpu has a vcpu. If so, and is the same as the vcpu we
 118 * want, vmcs_load(vcpu->vmcs) -- i.e. issue a VMPTRLD.
 119 *
 120 * If it's not the same, see if the vcpu thinks it is on the core. If it is not,
 121 * call __vmx_get_cpu_helper on the other cpu, to free it up. Else vmcs_clear
  122 * the one attached to this cpu. Then vmcs_load the vmcs for vcpu on this
 123 * cpu, call __vmx_setup_cpu, mark this vcpu as being attached to this cpu,
 124 * done.
 125 *
 126 * vmx_run_vcpu this one gets messy, mainly because it's a giant wad
 127 * of inline assembly with embedded CPP crap. I suspect we'll want to
 128 * un-inline it someday, but maybe not.  It's called with a vcpu
 129 * struct from which it loads guest state, and to which it stores
 130 * non-virtualized host state. It issues a vmlaunch or vmresume
  131 * instruction as appropriate, and on return, it evaluates whether the
 132 * launch/resume had an error in that operation. Note this is NOT the
 133 * same as an error while in the virtual machine; this is an error in
 134 * startup due to misconfiguration. Depending on what is returned it's
  135 * either a failed vm startup or an exit for any of many reasons.
 136 *
 137 */
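/* A minimal sketch (illustrative only) of the run loop described above.  The
 * names vmx_create_vcpu(), vmx_get_cpu(), vmx_run_vcpu(), and
 * vmx_handle_exit() are placeholders for whatever the current code calls
 * those steps, not necessarily functions in this file. */
#if 0
static void vmx_launch_sketch(void)
{
	struct vmx_vcpu *vcpu = vmx_create_vcpu();	/* allocates the VMCS, etc. */
	int8_t irq_state = 0;
	int ret;

	while (1) {
		vmx_get_cpu(vcpu);		/* VMPTRLD this vcpu's VMCS on this core */
		disable_irqsave(&irq_state);	/* can't enter the VM with IRQs on */
		ret = vmx_run_vcpu(vcpu);	/* vmlaunch or vmresume */
		enable_irqsave(&irq_state);
		if (vmx_handle_exit(vcpu, ret) < 0)
			break;			/* unhandled exit; bail out */
	}
}
#endif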
 138
 139/* basically: only rename those globals that might conflict
 140 * with existing names. Leave all else the same.
 141 * this code is more modern than the other code, yet still
 142 * well encapsulated, it seems.
 143 */
 144#include <kmalloc.h>
 145#include <string.h>
 146#include <stdio.h>
 147#include <assert.h>
 148#include <error.h>
 149#include <pmap.h>
 150#include <sys/queue.h>
 151#include <smp.h>
 152#include <kref.h>
 153#include <atomic.h>
 154#include <alarm.h>
 155#include <event.h>
 156#include <umem.h>
 157#include <bitops.h>
 158#include <arch/types.h>
 159#include <syscall.h>
 160#include <arch/io.h>
 161#include <percpu.h>
 162
 163#include <ros/vmm.h>
 164#include "vmx.h"
 165#include "../vmm.h"
 166
 167#include <trap.h>
 168
 169#include <smp.h>
 170#include <ros/procinfo.h>
 171
 172#define currentcpu (&per_cpu_info[core_id()])
 173
 174static unsigned long *msr_bitmap;
 175#define VMX_IO_BITMAP_SZ                (1 << 16) /* 64 KB */
 176static unsigned long *io_bitmap;
 177
 178int x86_ept_pte_fix_ups = 0;
 179
 180struct vmx_capability vmx_capability;
 181struct vmcs_config vmcs_config;
 182
 183char * const VMX_EXIT_REASON_NAMES[] = {
 184        VMX_EXIT_REASONS
 185};
 186
 187static char *cr_access_type[] = {
 188        "move to cr",
 189        "move from cr",
 190        "clts",
 191        "lmsw"
 192};
 193
 194static char *cr_gpr[] = {
 195        "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
 196        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
 197};
 198
 199static int guest_cr_num[16] = {
 200        GUEST_CR0,
 201        -1,
 202        -1,
 203        GUEST_CR3,
 204        GUEST_CR4,
 205        -1,
 206        -1,
 207        -1,
 208        -1,     /* 8? */
 209        -1, -1, -1, -1, -1, -1, -1
 210};
 211
 212static __always_inline unsigned long vmcs_readl(unsigned long field);
 213
 214/* See section 24-3 of The Good Book */
 215void show_cr_access(uint64_t val)
 216{
 217        int crnr = val & 0xf;
 218        int type = (val >> 4) & 3;
  219        int reg = (val >> 8) & 0xf;
 220
 221        print_lock();
 222        printk("%s: %d: ", cr_access_type[type], crnr);
 223        if (type < 2) {
 224                printk("%s", cr_gpr[reg]);
 225                if (guest_cr_num[crnr] > -1) {
  226                        printk(": 0x%lx", vmcs_readl(guest_cr_num[crnr]));
 227                }
 228        }
 229        printk("\n");
 230        print_unlock();
 231}
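/* Example decode (hypothetical exit-qualification value): val = 0xc04 gives
 * crnr = 4, type = 0 ("move to cr") and reg = (0xc04 >> 8) & 0xf = 12, so
 * show_cr_access() above prints "move to cr: 4: r12: 0x<guest CR4>". */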
 232
 233void ept_flush(uint64_t eptp)
 234{
 235        ept_sync_context(eptp);
 236}
 237
 238static void vmcs_clear(struct vmcs *vmcs)
 239{
 240        uint64_t phys_addr = PADDR(vmcs);
 241        uint8_t error;
 242
 243        asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
 244                      : "=qm"(error)
 245                      : "a"(&phys_addr), "m"(phys_addr)
 246                      :"cc", "memory");
 247        if (error)
 248                printk("vmclear fail: %p/%llx\n", vmcs, phys_addr);
 249}
 250
 251static void vmcs_load(struct vmcs *vmcs)
 252{
 253        uint64_t phys_addr = PADDR(vmcs);
 254        uint8_t error;
 255
 256        asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
 257                      : "=qm"(error)
 258                      : "a"(&phys_addr), "m"(phys_addr)
 259                      : "cc", "memory");
 260        if (error)
 261                printk("vmptrld %p/%llx failed\n", vmcs, phys_addr);
 262}
 263
 264/* Returns the paddr pointer of the current CPU's VMCS region, or -1 if none. */
 265static physaddr_t vmcs_get_current(void)
 266{
 267        physaddr_t vmcs_paddr;
 268
 269        /* RAX contains the addr of the location to store the VMCS pointer.  The
 270         * compiler doesn't know the ASM will deref that pointer, hence the =m
 271         */
 272        asm volatile (ASM_VMX_VMPTRST_RAX:"=m"(vmcs_paddr):"a"(&vmcs_paddr));
 273        return vmcs_paddr;
 274}
 275
 276static __always_inline unsigned long vmcs_readl(unsigned long field)
 277{
 278        return vmcs_read(field);
 279}
 280
 281static __always_inline uint16_t vmcs_read16(unsigned long field)
 282{
 283        return vmcs_readl(field);
 284}
 285
 286static __always_inline uint32_t vmcs_read32(unsigned long field)
 287{
 288        return vmcs_readl(field);
 289}
 290
 291static __always_inline uint64_t vmcs_read64(unsigned long field)
 292{
 293        return vmcs_readl(field);
 294}
 295
 296void vmwrite_error(unsigned long field, unsigned long value)
 297{
 298        printk("vmwrite error: reg %lx value %lx (err %d)\n",
 299               field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 300}
 301
 302void vmcs_writel(unsigned long field, unsigned long value)
 303{
 304        if (!vmcs_write(field, value))
 305                vmwrite_error(field, value);
 306}
 307
 308static void vmcs_write16(unsigned long field, uint16_t value)
 309{
 310        vmcs_writel(field, value);
 311}
 312
 313static void vmcs_write32(unsigned long field, uint32_t value)
 314{
 315        vmcs_writel(field, value);
 316}
 317
 318static void vmcs_write64(unsigned long field, uint64_t value)
 319{
 320        vmcs_writel(field, value);
 321}
 322
 323void vapic_status_dump_kernel(void *vapic);
 324
 325static bool vmx_control_can_be_changed(struct vmxec *v, uint32_t ctl)
 326{
 327        return v->hw_changeable & v->policy_changeable & ctl;
 328}
 329
 330/*
 331 * A note on Things You Can't Make Up.
 332 * or
 333 * "George, you can type this shit, but you can't say it" -- Harrison Ford
 334 *
 335 * There are 5 VMCS 32-bit words that control guest permissions. If
 336 * you set these correctly, you've got a guest that will behave. If
 337 * you get even one bit wrong, you've got a guest that will chew your
 338 * leg off. Some bits must be 1, some must be 0, and some can be set
 339 * either way. To add to the fun, the docs are sort of a docudrama or,
 340 * as the quote goes, "interesting if true."
 341 *
 342 * To determine what bit can be set in what VMCS 32-bit control word,
 343 * there are 5 corresponding 64-bit MSRs.  And, to make it even more
 344 * fun, the standard set of MSRs have errors in them, i.e. report
 345 * incorrect values, for legacy reasons, and so you are supposed to
 346 * "look around" to another set, which have correct bits in
 347 * them. There are four such 'correct' registers, and they have _TRUE_
 348 * in the names as you can see below. We test for the value of VMCS
 349 * control bits in the _TRUE_ registers if possible. The fifth
 350 * register, CPU Secondary Exec Controls, which came later, needs no
 351 * _TRUE_ variant.
 352 *
 353 * For each MSR, the high 32 bits tell you what bits can be "1" by a
 354 * "1" in that position; the low 32 bits tell you what bit can be "0"
 355 * by a "0" in that position. So, for each of 32 bits in a given VMCS
 356 * control word, there is a pair of bits in an MSR that tells you what
 357 * values it can take. The two bits, of which there are *four*
 358 * combinations, describe the *three* possible operations on a
 359 * bit. The two bits, taken together, form an untruth table: There are
 360 * three possibilities: The VMCS bit can be set to 0 or 1, or it can
 361 * only be 0, or only 1. The fourth combination is not supposed to
 362 * happen.
 363 *
 364 * So: there is the 1 bit from the upper 32 bits of the msr.
 365 * If this bit is set, then the bit can be 1. If clear, it can not be 1.
 366 *
 367 * Then there is the 0 bit, from low 32 bits. If clear, the VMCS bit
 368 * can be 0. If 1, the VMCS bit can not be 0.
 369 *
 370 * SO, let's call the 1 bit R1, and the 0 bit R0, we have:
 371 *  R1 R0
 372 *  0 0 -> must be 0
 373 *  1 0 -> can be 1, can be 0
 374 *  0 1 -> can not be 1, can not be 0. --> JACKPOT! Not seen yet.
 375 *  1 1 -> must be one.
 376 *
 377 * It's also pretty hard to know what you can and can't set, and
 378 * that's led to inadvertent opening of permissions at times.  Because
 379 * of this complexity we've decided on the following: the driver must
 380 * define EVERY bit, UNIQUELY, for each of the 5 registers, that it wants
 381 * set. Further, for any bit that's settable, the driver must specify
 382 * a setting; for any bit that's reserved, the driver settings must
 383 * match that bit. If there are reserved bits we don't specify, that's
 384 * ok; we'll take them as is.
 385 *
 386 * We use a set-means-set, and set-means-clear model, i.e. we use a
 387 * 32-bit word to contain the bits we want to be 1, indicated by one;
 388 * and another 32-bit word in which a bit we want to be 0 is indicated
 389 * by a 1. This allows us to easily create masks of all bits we're
 390 * going to set, for example.
 391 *
 392 * We have two 32-bit numbers for each 32-bit VMCS field: bits we want
 393 * set and bits we want clear.  If you read the MSR for that field,
 394 * compute the reserved 0 and 1 settings, and | them together, they
 395 * need to result in 0xffffffff. You can see that we can create other
 396 * tests for conflicts (i.e. overlap).
 397 *
 398 * At this point, I've tested check_vmx_controls in every way
 399 * possible, because I kept screwing the bitfields up. You'll get a nice
 400 * error it won't work at all, which is what we want: a
 401 * failure-prone setup, where even errors that might result in correct
 402 * values are caught -- "right answer, wrong method, zero credit." If there's
 403 * weirdness in the bits, we don't want to run.
 404 * The try_set stuff adds particular ugliness but we have to have it.
 405 */
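/* Worked example with hypothetical MSR values (not from real hardware):
 * suppose the control MSR reads back high = 0x0000ff1f, low = 0x00000016.
 *   reserved_1      = low & high                 = 0x00000016  (must be 1)
 *   reserved_0      = ~low & ~high               = 0xffff00e0  (must be 0)
 *   changeable_bits = ~(reserved_0 | reserved_1) = 0x0000ff09
 * The driver's must_be_0/must_be_1/try_set_* masks then have to cover every
 * changeable bit, and ORed with reserved_0 | reserved_1 must give
 * 0xffffffff, which is exactly what check_vmxec_controls() verifies. */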
 406static bool check_vmxec_controls(struct vmxec *v, bool have_true_msr,
 407                                 uint32_t *result)
 408{
 409        bool err = false;
 410        uint32_t vmx_msr_low, vmx_msr_high;
 411        uint64_t msr_val;
 412        uint32_t reserved_0, reserved_1, changeable_bits, try0, try1;
 413
 414        if (have_true_msr)
 415                msr_val = read_msr(v->truemsr);
 416        else
 417                msr_val = read_msr(v->msr);
 418        vmx_msr_high = high32(msr_val);
 419        vmx_msr_low = low32(msr_val);
 420
 421        if (vmx_msr_low & ~vmx_msr_high)
 422                warn("JACKPOT: Conflicting VMX ec ctls for %s, high 0x%08x low 0x%08x",
 423                     v->name, vmx_msr_high, vmx_msr_low);
 424
 425        reserved_0 = (~vmx_msr_low) & (~vmx_msr_high);
 426        reserved_1 = vmx_msr_low & vmx_msr_high;
 427        changeable_bits = ~(reserved_0 | reserved_1);
 428        v->hw_changeable = changeable_bits;
 429
 430        /*
 431         * this is very much as follows:
 432         * accept the things I cannot change,
 433         * change the things I can,
 434         * know the difference.
 435         */
 436
 437        /* Conflict. Don't try to both set and reset bits. */
 438        if ((v->must_be_1 & (v->must_be_0 | v->try_set_1 | v->try_set_0)) ||
 439            (v->must_be_0 & (v->try_set_1 | v->try_set_0)) ||
 440            (v->try_set_1 & v->try_set_0)) {
 441                printk("%s: must 0 (0x%x) and must be 1 (0x%x) and try_set_0 (0x%x) and try_set_1 (0x%x) overlap\n",
 442                       v->name, v->must_be_0, v->must_be_1, v->try_set_0,
 443                       v->try_set_1);
 444                err = true;
 445        }
 446
 447        /* coverage */
 448        if (((v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1)
 449             & changeable_bits) != changeable_bits) {
  450                printk("%s: Need to cover 0x%x and have 0x%x,0x%x,0x%x,0x%x\n",
 451                       v->name, changeable_bits, v->must_be_0, v->must_be_1,
 452                       v->try_set_0, v->try_set_1);
 453                err = true;
 454        }
 455
 456        if ((v->must_be_0 | v->must_be_1 | v->try_set_0 | v->try_set_1
 457             | reserved_0 | reserved_1) != 0xffffffff) {
 458                printk("%s: incomplete coverage: have 0x%x, want 0x%x\n",
 459                       v->name, v->must_be_0 | v->must_be_1 | v->try_set_0 |
 460                       v->try_set_1 | reserved_0 | reserved_1, 0xffffffff);
 461                err = true;
 462        }
 463
 464        /* Don't try to change bits that can't be changed. */
 465        if ((v->must_be_0 & (reserved_0 | changeable_bits)) != v->must_be_0) {
 466                printk("%s: set to 0 (0x%x) can't be done\n", v->name,
 467                       v->must_be_0);
 468                err = true;
 469        }
 470
 471        if ((v->must_be_1 & (reserved_1 | changeable_bits)) != v->must_be_1) {
 472                printk("%s: set to 1 (0x%x) can't be done\n", v->name,
 473                       v->must_be_1);
 474                err = true;
 475        }
  476        // Note we don't REQUIRE that try_set_0 or try_set_1 be possible. We
 477        // just want to try it.
 478
 479        // Clear bits in try_set that can't be set.
 480        try1 = v->try_set_1 & (reserved_1 | changeable_bits);
 481
 482        /* If there's been any error at all, spill our guts and return. */
 483        if (err) {
 484                printk("%s: vmx_msr_high 0x%x, vmx_msr_low 0x%x, ",
 485                           v->name, vmx_msr_high, vmx_msr_low);
  486                printk("must_be_0 0x%x, try_set_0 0x%x, reserved_0 0x%x, ",
  487                           v->must_be_0, v->try_set_0, reserved_0);
  488                printk("must_be_1 0x%x, try_set_1 0x%x, reserved_1 0x%x,",
  489                           v->must_be_1, v->try_set_1, reserved_1);
 490                printk(" reserved_0 0x%x", reserved_0);
 491                printk(" changeable_bits 0x%x\n", changeable_bits);
 492                return false;
 493        }
 494
 495        *result = v->must_be_1 | try1 | reserved_1;
 496
 497        printk("%s: check_vmxec_controls succeeds with result 0x%x\n",
 498                   v->name, *result);
 499        return true;
 500}
 501
 502/*
 503 * We're trying to make this as readable as possible. Realistically, it will
 504 * rarely if ever change, if the past is any guide.
 505 */
 506static struct vmxec pbec = {
 507        .name = "Pin Based Execution Controls",
 508        .msr = MSR_IA32_VMX_PINBASED_CTLS,
 509        .truemsr = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 510
 511        .must_be_1 = (PIN_BASED_EXT_INTR_MASK |
 512                      PIN_BASED_NMI_EXITING |
 513                      PIN_BASED_VIRTUAL_NMIS |
 514                      PIN_BASED_POSTED_INTR),
 515
 516        .must_be_0 = (PIN_BASED_VMX_PREEMPTION_TIMER),
 517};
 518
 519static struct vmxec cbec = {
 520        .name = "CPU Based Execution Controls",
 521        .msr = MSR_IA32_VMX_PROCBASED_CTLS,
 522        .truemsr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
 523
 524        .must_be_1 = (
 525                      CPU_BASED_MWAIT_EXITING |
 526                      CPU_BASED_HLT_EXITING |
 527                      CPU_BASED_TPR_SHADOW |
 528                      CPU_BASED_RDPMC_EXITING |
 529                      CPU_BASED_CR8_LOAD_EXITING |
 530                      CPU_BASED_CR8_STORE_EXITING |
 531                      CPU_BASED_USE_MSR_BITMAPS |
 532                      CPU_BASED_USE_IO_BITMAPS |
 533                      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS),
 534
 535        .must_be_0 = (
 536                      CPU_BASED_VIRTUAL_INTR_PENDING |
 537                      CPU_BASED_INVLPG_EXITING |
 538                      CPU_BASED_USE_TSC_OFFSETING |
 539                      CPU_BASED_RDTSC_EXITING |
 540                      CPU_BASED_CR3_LOAD_EXITING |
 541                      CPU_BASED_CR3_STORE_EXITING |
 542                      CPU_BASED_MOV_DR_EXITING |
 543                      CPU_BASED_VIRTUAL_NMI_PENDING |
 544                      CPU_BASED_MONITOR_TRAP |
 545                      CPU_BASED_PAUSE_EXITING |
 546                      CPU_BASED_UNCOND_IO_EXITING),
 547
 548        .try_set_0 = (CPU_BASED_MONITOR_EXITING),
 549        .policy_changeable = (
 550                              CPU_BASED_HLT_EXITING |
 551                              CPU_BASED_PAUSE_EXITING |
 552                              CPU_BASED_MWAIT_EXITING |
 553                              0),
 554};
 555
 556static struct vmxec cb2ec = {
 557        .name = "CPU Based 2nd Execution Controls",
 558        .msr = MSR_IA32_VMX_PROCBASED_CTLS2,
 559        .truemsr = MSR_IA32_VMX_PROCBASED_CTLS2,
 560
 561        .must_be_1 = (SECONDARY_EXEC_ENABLE_EPT |
 562                      SECONDARY_EXEC_APIC_REGISTER_VIRT |
 563                      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 564                      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 565                      SECONDARY_EXEC_ENABLE_INVPCID |
 566                      SECONDARY_EXEC_WBINVD_EXITING),
 567
 568        .must_be_0 = (
 569                      SECONDARY_EXEC_DESCRIPTOR_EXITING |
 570                      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 571                      SECONDARY_EXEC_ENABLE_VPID |
 572                      SECONDARY_EXEC_UNRESTRICTED_GUEST |
 573                      SECONDARY_EXEC_PAUSE_LOOP_EXITING |
 574                      SECONDARY_EXEC_RDRAND_EXITING |
 575                      SECONDARY_EXEC_ENABLE_VMFUNC |
 576                      SECONDARY_EXEC_SHADOW_VMCS |
 577                      SECONDARY_EXEC_RDSEED_EXITING |
 578                      SECONDARY_EPT_VE |
 579                      SECONDARY_ENABLE_XSAV_RESTORE),
 580
 581        .try_set_1 = SECONDARY_EXEC_RDTSCP,
 582
 583        .try_set_0 = SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_PML
 584
 585};
 586
 587static struct vmxec vmentry = {
 588        .name = "VMENTRY controls",
 589        .msr = MSR_IA32_VMX_ENTRY_CTLS,
 590        .truemsr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
 591        /* exact order from vmx.h; only the first two are enabled. */
 592
 593        .must_be_1 =  (VM_ENTRY_LOAD_DEBUG_CONTROLS | /* can't set to 0 */
 594                       VM_ENTRY_LOAD_IA32_EFER |
 595                       VM_ENTRY_IA32E_MODE),
 596
 597        .must_be_0 = (VM_ENTRY_SMM |
 598                      VM_ENTRY_DEACT_DUAL_MONITOR |
 599                      VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
 600                      VM_ENTRY_LOAD_IA32_PAT),
 601};
 602
 603static struct vmxec vmexit = {
 604        .name = "VMEXIT controls",
 605        .msr = MSR_IA32_VMX_EXIT_CTLS,
 606        .truemsr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
 607
 608        .must_be_1 = (VM_EXIT_SAVE_DEBUG_CONTROLS |     /* can't set to 0 */
 609                      VM_EXIT_ACK_INTR_ON_EXIT |
 610                      VM_EXIT_SAVE_IA32_EFER |
 611                      VM_EXIT_LOAD_IA32_EFER |
 612                      VM_EXIT_HOST_ADDR_SPACE_SIZE),    /* 64 bit */
 613
 614        .must_be_0 = (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
 615                      VM_EXIT_SAVE_IA32_PAT |
 616                      VM_EXIT_LOAD_IA32_PAT |
 617                      VM_EXIT_SAVE_VMX_PREEMPTION_TIMER),
 618};
 619
 620static void setup_vmcs_config(void *p)
 621{
 622        int *ret = p;
 623        struct vmcs_config *vmcs_conf = &vmcs_config;
 624        uint32_t vmx_msr_high;
 625        uint64_t vmx_msr;
 626        bool have_true_msrs = false;
 627        bool ok;
 628
 629        *ret = -EIO;
 630
 631        vmx_msr = read_msr(MSR_IA32_VMX_BASIC);
 632        vmx_msr_high = vmx_msr >> 32;
 633
 634        /*
  635         * If bit 55 (VMX_BASIC_TRUE_CTLS) is set, then we
 636         * can go for the true MSRs.  Else, we ask you to get a better CPU.
 637         */
 638        if (vmx_msr & VMX_BASIC_TRUE_CTLS) {
 639                have_true_msrs = true;
 640                printd("Running with TRUE MSRs\n");
 641        } else {
 642                printk("Running with non-TRUE MSRs, this is old hardware\n");
 643        }
 644
 645        /*
 646         * Don't worry that one or more of these might fail and leave
 647         * the VMCS in some kind of incomplete state. If one of these
 648         * fails, the caller is going to discard the VMCS.
 649         * It is written this way to ensure we get results of all tests and
 650         * avoid BMAFR behavior.
 651         */
 652        ok = check_vmxec_controls(&pbec, have_true_msrs,
 653                                  &vmcs_conf->pin_based_exec_ctrl);
 654        ok = check_vmxec_controls(&cbec, have_true_msrs,
 655                                  &vmcs_conf->cpu_based_exec_ctrl) && ok;
 656        /* Only check cb2ec if we're still ok, o/w we may GPF */
 657        ok = ok && check_vmxec_controls(&cb2ec, have_true_msrs,
 658                                        &vmcs_conf->cpu_based_2nd_exec_ctrl);
 659        ok = check_vmxec_controls(&vmentry, have_true_msrs,
 660                                  &vmcs_conf->vmentry_ctrl) && ok;
 661        ok = check_vmxec_controls(&vmexit, have_true_msrs,
 662                                  &vmcs_conf->vmexit_ctrl) && ok;
 663        if (!ok) {
  664                printk("vmx exec controls are no good.\n");
 665                return;
 666        }
 667        assert(cpu_has_secondary_exec_ctrls());
 668
 669        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
 670        if ((vmx_msr_high & 0x1fff) > PGSIZE) {
  671                printk("(vmx_msr_high & 0x1fff) is 0x%x, > PAGE_SIZE 0x%x\n",
 672                           vmx_msr_high & 0x1fff, PGSIZE);
 673                return;
 674        }
 675
 676        /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
 677        if (vmx_msr & VMX_BASIC_64) {
 678                printk("VMX doesn't support 64 bit width!\n");
 679                return;
 680        }
 681
 682        if (((vmx_msr & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT)
 683                != VMX_BASIC_MEM_TYPE_WB) {
 684                printk("VMX doesn't support WB memory for VMCS accesses!\n");
 685                return;
 686        }
 687
 688        vmcs_conf->size = vmx_msr_high & 0x1fff;
 689        vmcs_conf->revision_id = (uint32_t) vmx_msr;
 690
 691        /* Read in the caps for runtime checks.  This MSR is only available if
 692         * secondary controls and ept or vpid is on, which we check earlier */
 693        vmx_msr = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
 694        vmx_capability.vpid = high32(vmx_msr);
 695        vmx_capability.ept = low32(vmx_msr);
 696
 697        *ret = 0;
 698}
 699
 700static struct vmcs *__vmx_alloc_vmcs(int node)
 701{
 702        struct vmcs *vmcs;
 703
 704        vmcs = kpages_alloc(vmcs_config.size, MEM_WAIT);
 705        if (!vmcs)
 706                error(ENOMEM, "__vmx_alloc_vmcs: Could not get %d contig bytes",
 707                      vmcs_config.size);
 708        memset(vmcs, 0, vmcs_config.size);
 709        vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
 710        printd("%d: set rev id %d\n", core_id(), vmcs->revision_id);
 711        return vmcs;
 712}
 713
 714/**
 715 * vmx_alloc_vmcs - allocates a VMCS region
 716 *
 717 * NOTE: Assumes the new region will be used by the current CPU.
 718 *
 719 * Returns a valid VMCS region.
 720 */
 721static struct vmcs *vmx_alloc_vmcs(void)
 722{
 723        return __vmx_alloc_vmcs(numa_id());
 724}
 725
 726/**
 727 * vmx_free_vmcs - frees a VMCS region
 728 */
 729static void vmx_free_vmcs(struct vmcs *vmcs)
 730{
 731        kpages_free(vmcs, vmcs_config.size);
 732}
 733
 734/*
 735 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
 736 * will not change in the lifetime of the guest.
 737 * Note that host-state that does change is set elsewhere. E.g., host-state
 738 * that is set differently for each CPU is set in __vmx_setup_pcpu(), not here.
 739 */
 740static void vmx_setup_constant_host_state(void)
 741{
 742        pseudodesc_t dt;
 743
 744        vmcs_writel(HOST_CR0, rcr0() & ~X86_CR0_TS);    /* 22.2.3 */
 745        vmcs_writel(HOST_CR4, rcr4());  /* 22.2.3, 22.2.5 */
 746        vmcs_writel(HOST_CR3, rcr3());  /* 22.2.3 */
 747
 748        vmcs_write16(HOST_CS_SELECTOR, GD_KT);  /* 22.2.4 */
 749        vmcs_write16(HOST_DS_SELECTOR, GD_KD);  /* 22.2.4 */
 750        vmcs_write16(HOST_ES_SELECTOR, GD_KD);  /* 22.2.4 */
 751        vmcs_write16(HOST_SS_SELECTOR, GD_KD);  /* 22.2.4 */
 752        vmcs_write16(HOST_TR_SELECTOR, GD_TSS); /* 22.2.4 */
 753
 754        native_store_idt(&dt);
 755        vmcs_writel(HOST_IDTR_BASE, dt.pd_base);        /* 22.2.4 */
 756
 757        extern void vmexit_handler(void);
 758        vmcs_writel(HOST_RIP, (unsigned long)vmexit_handler);
 759
 760        vmcs_write32(HOST_IA32_SYSENTER_CS, read_msr(MSR_IA32_SYSENTER_CS));
 761        vmcs_writel(HOST_IA32_SYSENTER_EIP, read_msr(MSR_IA32_SYSENTER_EIP));
 762
 763        vmcs_write32(HOST_IA32_EFER, read_msr(MSR_EFER));
 764
 765        if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
 766                vmcs_write64(HOST_IA32_PAT, read_msr(MSR_IA32_CR_PAT));
 767
 768        vmcs_write16(HOST_FS_SELECTOR, 0);      /* 22.2.4 */
 769        vmcs_write16(HOST_GS_SELECTOR, 0);      /* 22.2.4 */
 770        vmcs_write(HOST_FS_BASE, 0);
 771}
 772
 773/* Set up the per-core VMCS fields.  This is the host state that varies from
 774 * core to core, which the hardware will switch for us on VM enters/exits. */
 775static void __vmx_setup_pcpu(struct guest_pcore *gpc)
 776{
 777        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 778        struct vmx_vmm *vmx = &gpc->proc->vmm.vmx;
 779
 780        vmcs_write(HOST_TR_BASE, (uintptr_t)pcpui->tss);
 781        vmcs_writel(HOST_GDTR_BASE, (uintptr_t)pcpui->gdt);
 782        vmcs_write(HOST_GS_BASE, (uintptr_t)pcpui);
 783        /* TODO: we might need to also set HOST_IA32_PERF_GLOBAL_CTRL.  Need to
 784         * think about how perf will work with VMs */
 785        /* Userspace can request changes to the ctls.  They take effect when we
 786         * reload the GPC, which occurs after a transition from userspace to VM.
 787         */
 788        vmcs_write(PIN_BASED_VM_EXEC_CONTROL, vmx->pin_exec_ctls);
 789        vmcs_write(CPU_BASED_VM_EXEC_CONTROL, vmx->cpu_exec_ctls);
 790        vmcs_write(SECONDARY_VM_EXEC_CONTROL, vmx->cpu2_exec_ctls);
 791}
 792
 793uint64_t construct_eptp(physaddr_t root_hpa)
 794{
 795        uint64_t eptp;
 796
 797        /* set WB memory and 4 levels of walk.  we checked these in ept_init */
 798        eptp = VMX_EPT_MEM_TYPE_WB | (VMX_EPT_GAW_4_LVL <<
 799                                      VMX_EPT_GAW_EPTP_SHIFT);
 800        if (cpu_has_vmx_ept_ad_bits())
 801                eptp |= VMX_EPT_AD_ENABLE_BIT;
 802        eptp |= (root_hpa & PAGE_MASK);
 803
 804        return eptp;
 805}
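/* Worked example for construct_eptp(), assuming the usual encodings (WB
 * memtype = 6 in bits 2:0, 4-level walk = 3 in bits 5:3, A/D-enable = bit 6):
 * root_hpa = 0x113000 on a CPU with EPT A/D support gives
 * eptp = 0x113000 | 0x5e = 0x11305e. */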
 806
 807/* Helper: some fields of the VMCS need a physical page address, e.g. the VAPIC
 808 * page.  We have the user address.  This converts the user to phys addr and
 809 * sets that up in the VMCS.  Throws on error. */
 810static void vmcs_set_pgaddr(struct proc *p, void *u_addr,
 811                            unsigned long field, char *what)
 812{
 813        uintptr_t kva;
 814        physaddr_t paddr;
 815
 816        /* Enforce page alignment */
 817        kva = uva2kva(p, ROUNDDOWN(u_addr, PGSIZE), PGSIZE, PROT_WRITE);
 818        if (!kva)
 819                error(EINVAL, "Unmapped pgaddr %p for VMCS page %s",
 820                      u_addr, what);
 821
 822        paddr = PADDR(kva);
 823        /* TODO: need to pin the page.  A munmap would actually be okay
 824         * (though probably we should kill the process), but we need to
 825         * keep the page from being reused.  A refcnt would do the trick,
 826         * which we decref when we destroy the guest core/vcpu. Note that
 827         * this is an assert, not an error, because it represents an error
 828         * in the kernel itself. */
 829        assert(!PGOFF(paddr));
 830        vmcs_writel(field, paddr);
 831        /* Pages are inserted twice.  Once, with the full paddr.  The next field
 832         * is the upper 32 bits of the paddr. */
 833        vmcs_writel(field + 1, paddr >> 32);
 834}
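/* For example, assuming the SDM field encodings: calling this with
 * POSTED_INTR_DESC_ADDR (0x2016) writes the paddr there and then writes the
 * upper 32 bits to field 0x2017, i.e. the corresponding _HIGH field. */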
 835
 836/**
 837 * vmx_setup_initial_guest_state - configures the initial state of guest
 838 * registers and the VMCS.  Throws on error.
 839 */
 840static void vmx_setup_initial_guest_state(struct proc *p,
 841                                          struct vmm_gpcore_init *gpci)
 842{
 843        unsigned long cr4 = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSXMMEXCPT |
 844                X86_CR4_PGE | X86_CR4_OSFXSR;
 845        uint32_t protected_mode = X86_CR0_PG | X86_CR0_PE;
 846
 847        /*
 848         * Allow guest to use xsave and read/write fs/gs base.
 849         * We require these features to be present on the cpu.
 850         */
 851        assert(cpu_has_feat(CPU_FEAT_X86_XSAVE));
 852        assert(cpu_has_feat(CPU_FEAT_X86_FSGSBASE));
 853        cr4 |= X86_CR4_RDWRGSFS;
 854        cr4 |= X86_CR4_OSXSAVE;
 855        /* configure control and data registers */
 856        vmcs_writel(GUEST_CR0, protected_mode | X86_CR0_WP |
 857                                X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
 858        vmcs_writel(CR0_READ_SHADOW, protected_mode | X86_CR0_WP |
 859                                X86_CR0_MP | X86_CR0_ET | X86_CR0_NE);
 860        vmcs_writel(GUEST_CR3, rcr3());
 861        vmcs_writel(GUEST_CR4, cr4);
 862        /* The only bits that matter in this shadow are those that are
 863         * set in CR4_GUEST_HOST_MASK.  TODO: do we need to separate
 864         * the setting of this value from that of
 865         * CR4_GUEST_HOST_MASK? */
 866        vmcs_writel(CR4_READ_SHADOW, 0);
 867        vmcs_writel(GUEST_IA32_EFER, EFER_LME | EFER_LMA |
 868                                EFER_SCE | EFER_NX /*| EFER_FFXSR */ );
 869        vmcs_writel(GUEST_GDTR_BASE, 0);
 870        vmcs_writel(GUEST_GDTR_LIMIT, 0);
 871        vmcs_writel(GUEST_IDTR_BASE, 0);
 872        vmcs_writel(GUEST_IDTR_LIMIT, 0);
 873        vmcs_writel(GUEST_RIP, 0xdeadbeef);
 874        vmcs_writel(GUEST_RSP, 0xdeadbeef);
 875        vmcs_writel(GUEST_RFLAGS, FL_RSVD_1);
 876        vmcs_writel(GUEST_DR7, 0);
 877
 878        /* guest segment bases */
 879        vmcs_writel(GUEST_CS_BASE, 0);
 880        vmcs_writel(GUEST_DS_BASE, 0);
 881        vmcs_writel(GUEST_ES_BASE, 0);
 882        enforce_user_canon(&gpci->fsbase);
 883        vmcs_writel(GUEST_FS_BASE, gpci->fsbase);
 884        enforce_user_canon(&gpci->gsbase);
 885        vmcs_writel(GUEST_GS_BASE, gpci->gsbase);
 886        vmcs_writel(GUEST_SS_BASE, 0);
 887
 888        /* guest segment access rights */
 889        vmcs_writel(GUEST_CS_AR_BYTES, 0xA09B);
 890        vmcs_writel(GUEST_DS_AR_BYTES, 0xA093);
 891        vmcs_writel(GUEST_ES_AR_BYTES, 0xA093);
 892        vmcs_writel(GUEST_FS_AR_BYTES, 0xA093);
 893        vmcs_writel(GUEST_GS_AR_BYTES, 0xA093);
 894        vmcs_writel(GUEST_SS_AR_BYTES, 0xA093);
 895
 896        /* guest segment limits */
 897        vmcs_write32(GUEST_CS_LIMIT, 0xFFFFFFFF);
 898        vmcs_write32(GUEST_DS_LIMIT, 0xFFFFFFFF);
 899        vmcs_write32(GUEST_ES_LIMIT, 0xFFFFFFFF);
 900        vmcs_write32(GUEST_FS_LIMIT, 0xFFFFFFFF);
 901        vmcs_write32(GUEST_GS_LIMIT, 0xFFFFFFFF);
 902        vmcs_write32(GUEST_SS_LIMIT, 0xFFFFFFFF);
 903
 904        /* configure segment selectors */
 905        vmcs_write16(GUEST_CS_SELECTOR, 0);
 906        vmcs_write16(GUEST_DS_SELECTOR, 0);
 907        vmcs_write16(GUEST_ES_SELECTOR, 0);
 908        vmcs_write16(GUEST_FS_SELECTOR, 0);
 909        vmcs_write16(GUEST_GS_SELECTOR, 0);
 910        vmcs_write16(GUEST_SS_SELECTOR, 0);
 911        vmcs_write16(GUEST_TR_SELECTOR, 0);
 912
 913        /* guest LDTR */
 914        vmcs_write16(GUEST_LDTR_SELECTOR, 0);
 915        vmcs_writel(GUEST_LDTR_AR_BYTES, 0x0082);
 916        vmcs_writel(GUEST_LDTR_BASE, 0);
 917        vmcs_writel(GUEST_LDTR_LIMIT, 0);
 918
 919        /* guest TSS */
 920        vmcs_writel(GUEST_TR_BASE, 0);
 921        vmcs_writel(GUEST_TR_AR_BYTES, 0x0080 | AR_TYPE_BUSY_64_TSS);
 922        vmcs_writel(GUEST_TR_LIMIT, 0xff);
 923
 924        /* initialize sysenter */
 925        vmcs_write32(GUEST_SYSENTER_CS, 0);
 926        vmcs_writel(GUEST_SYSENTER_ESP, 0);
 927        vmcs_writel(GUEST_SYSENTER_EIP, 0);
 928
 929        /* other random initialization */
 930        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 931        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
 932        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 933        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 934        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);      /* 22.2.1 */
 935
 936        /* Initialize posted interrupt notification vector */
 937        vmcs_write16(POSTED_NOTIFICATION_VEC, I_POKE_GUEST);
 938
 939        /* Clear the EOI exit bitmap */
 940        vmcs_writel(EOI_EXIT_BITMAP0, 0);
 941        vmcs_writel(EOI_EXIT_BITMAP0_HIGH, 0);
 942        vmcs_writel(EOI_EXIT_BITMAP1, 0);
 943        vmcs_writel(EOI_EXIT_BITMAP1_HIGH, 0);
 944        vmcs_writel(EOI_EXIT_BITMAP2, 0);
 945        vmcs_writel(EOI_EXIT_BITMAP2_HIGH, 0);
 946        vmcs_writel(EOI_EXIT_BITMAP3, 0);
 947        vmcs_writel(EOI_EXIT_BITMAP3_HIGH, 0);
 948
 949        /* Initialize parts based on the users info. */
 950        vmcs_set_pgaddr(p, gpci->posted_irq_desc, POSTED_INTR_DESC_ADDR,
 951                        "posted_irq_desc");
 952        vmcs_set_pgaddr(p, gpci->vapic_addr, VIRTUAL_APIC_PAGE_ADDR,
 953                        "vapic_addr");
 954        vmcs_set_pgaddr(p, gpci->apic_addr, APIC_ACCESS_ADDR, "apic_addr");
 955}
 956
 957static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
 958                                            uint32_t msr)
 959{
 960        int f = sizeof(unsigned long);
 961
 962        /*
 963         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 964         * have the write-low and read-high bitmap offsets the wrong way round.
 965         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 966         */
 967        if (msr <= 0x1fff) {
 968                __clear_bit(msr, msr_bitmap + 0x000 / f);       /* read-low */
 969                __clear_bit(msr, msr_bitmap + 0x800 / f);       /* write-low */
 970        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 971                msr &= 0x1fff;
 972                __clear_bit(msr, msr_bitmap + 0x400 / f);       /* read-high */
 973                __clear_bit(msr, msr_bitmap + 0xc00 / f);       /* write-high */
 974        }
 975}
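/* Worked example for __vmx_disable_intercept_for_msr(): MSR_LSTAR is
 * 0xc0000082, so msr &= 0x1fff leaves 0x82 and we clear bit 0x82 in the
 * read-high (offset 0x400) and write-high (offset 0xc00) bitmaps.  A low
 * MSR such as MSR_IA32_SYSENTER_CS (0x174) would instead use bit 0x174 of
 * the read-low (0x000) and write-low (0x800) bitmaps. */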
 976
 977/* note the io_bitmap is big enough for the 64K port space. */
 978static void __vmx_disable_intercept_for_io(unsigned long *io_bitmap,
 979                                           uint16_t port)
 980{
 981        __clear_bit(port, io_bitmap);
 982}
 983
 984static void dumpmsrs(void)
 985{
 986        int i;
 987        int set[] = {
 988                MSR_LSTAR,
 989                MSR_FS_BASE,
 990                MSR_GS_BASE,
 991                MSR_KERNEL_GS_BASE,
 992                MSR_SFMASK,
 993                MSR_IA32_PEBS_ENABLE
 994        };
 995
 996        for (i = 0; i < ARRAY_SIZE(set); i++) {
 997                printk("%p: %p\n", set[i], read_msr(set[i]));
 998        }
 999        printk("core id %d\n", core_id());
1000}
1001
1002/* Notes on autoloading.  We can't autoload FS_BASE or GS_BASE, according to the
1003 * manual, but that's because they are automatically saved and restored when all
1004 * of the other architectural registers are saved and restored, such as cs, ds,
1005 * es, and other fun things. (See 24.4.1).  We need to make sure we don't
1006 * accidentally intercept them too, since they are magically autoloaded.
1007 *
1008 * We'll need to be careful of any MSR we neither autoload nor intercept
1009 * whenever we vmenter/vmexit, and we intercept by default.
1010 *
 1011 * Other MSRs, such as MSR_IA32_PEBS_ENABLE, only work on certain
 1012 * architectures. */
1013static void setup_msr(struct guest_pcore *gpc)
1014{
1015        /* Since PADDR(msr_bitmap) is non-zero, and the bitmap is all 0xff, we
1016         * now intercept all MSRs */
1017        vmcs_write64(MSR_BITMAP, PADDR(msr_bitmap));
1018
1019        vmcs_write64(IO_BITMAP_A, PADDR(io_bitmap));
1020        vmcs_write64(IO_BITMAP_B, PADDR((uintptr_t)io_bitmap +
1021                                        (VMX_IO_BITMAP_SZ / 2)));
1022
1023        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1024        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
1025        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
1026}
1027
1028void vmx_setup_vmx_vmm(struct vmx_vmm *vmx)
1029{
1030        vmx->pin_exec_ctls = vmcs_config.pin_based_exec_ctrl;
1031        vmx->cpu_exec_ctls = vmcs_config.cpu_based_exec_ctrl;
1032        vmx->cpu2_exec_ctls = vmcs_config.cpu_based_2nd_exec_ctrl;
1033}
1034
1035/**
1036 *  vmx_setup_vmcs - configures the vmcs with starting parameters
1037 */
1038static void vmx_setup_vmcs(struct guest_pcore *gpc)
1039{
1040        vmcs_write16(VIRTUAL_PROCESSOR_ID, 0);
1041        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1042
1043        vmcs_write64(EPT_POINTER, gpc_get_eptp(gpc));
1044
1045        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1046        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1047        vmcs_write32(CR3_TARGET_COUNT, 0);      /* 22.2.1 */
1048
1049        setup_msr(gpc);
1050
1051        vmcs_config.vmentry_ctrl |= VM_ENTRY_IA32E_MODE;
1052
1053        vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1054        vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1055
1056        vmcs_writel(CR0_GUEST_HOST_MASK, 0);    // ~0ul);
1057
1058        /* Mask some bits in CR4 as host-owned by setting them in this
1059         * VMCS entry.  For example, for now, we mark the CR4_VMXE bit
1060         * as host owned.  Right now, when Linux boots, it wants to
1061         * set CR4_VMXE to 0 at first, which is fine -- we do not want
1062         * to think about nested virtualization yet. But if we don't
1063         * mark this bit as host owned we get a VMEXIT. Marking
1064         * CR4_VMXE as host owned means that the writes will succeed
1065         * with no vmexit if the value written matches the
1066         * corresponding bit in the shadow register. */
1067        vmcs_writel(CR4_GUEST_HOST_MASK, CR4_VMXE);
1068
1069        //kvm_write_tsc(&vmx->gpc, 0);
1070        vmcs_writel(TSC_OFFSET, 0);
1071
1072        vmx_setup_constant_host_state();
1073}
1074
1075/**
1076 * create_guest_pcore - allocates and initializes a guest physical core
1077 *
1078 * Returns: A new VCPU structure
1079 */
1080struct guest_pcore *create_guest_pcore(struct proc *p,
1081                                       struct vmm_gpcore_init *gpci)
1082{
1083        ERRSTACK(2);
1084        int8_t state = 0;
1085        struct guest_pcore *gpc = kmalloc(sizeof(struct guest_pcore), MEM_WAIT);
1086
1087        if (!gpc)
1088                error(ENOMEM, "create_guest_pcore could not allocate gpc");
1089
1090        if (waserror()) {
1091                kfree(gpc);
1092                nexterror();
1093        }
1094
1095        memset(gpc, 0, sizeof(*gpc));
1096
1097        /* Warning: p here is uncounted (weak) reference */
1098        gpc->proc = p;
1099        gpc->vmcs = vmx_alloc_vmcs();
1100        if (waserror()) {
1101                vmx_free_vmcs(gpc->vmcs);
1102                nexterror();
1103        }
1104        printd("%d: gpc->vmcs is %p\n", core_id(), gpc->vmcs);
1105        gpc->cpu = -1;
1106        gpc->vmcs_core_id = -1;
1107        gpc->should_vmresume = FALSE;
1108
1109        disable_irqsave(&state);
1110        vmx_load_guest_pcore(gpc);
1111        vmx_setup_vmcs(gpc);
1112        vmx_setup_initial_guest_state(p, gpci);
1113        vmx_unload_guest_pcore(gpc);
1114        enable_irqsave(&state);
1115
1116        gpc->xcr0 = __proc_global_info.x86_default_xcr0;
1117
1118        gpc->posted_irq_desc = gpci->posted_irq_desc;
1119        poperror();
1120        poperror();
1121        return gpc;
1122}
1123
1124/**
1125 * destroy_guest_pcore - destroys and frees an existing guest physical core
1126 * @gpc: the GPC to destroy
1127 */
1128void destroy_guest_pcore(struct guest_pcore *gpc)
1129{
1130        vmx_free_vmcs(gpc->vmcs);
1131        kfree(gpc);
1132}
1133
1134static void vmx_step_instruction(void)
1135{
1136        vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) +
1137                    vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
1138}
1139
1140/**
1141 * __vmx_enable - low-level enable of VMX mode on the current CPU
1142 * @vmxon_buf: an opaque buffer for use as the VMXON region
1143 */
1144static int __vmx_enable(struct vmcs *vmxon_buf)
1145{
1146        uint64_t phys_addr = PADDR(vmxon_buf);
1147        uint64_t old, test_bits;
1148
1149        if (rcr4() & X86_CR4_VMXE) {
1150                panic("Should never have this happen");
1151                return -EBUSY;
1152        }
1153
1154        old = read_msr(MSR_IA32_FEATURE_CONTROL);
1155
1156        test_bits = FEATURE_CONTROL_LOCKED;
1157        test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1158
1159        if (0)  // tboot_enabled())
1160                test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1161
1162        if ((old & test_bits) != test_bits) {
1163                /* If it's locked, then trying to set it will cause a GPF.
1164                 * No Dune for you!
1165                 */
1166                if (old & FEATURE_CONTROL_LOCKED) {
1167                        printk("Dune: MSR_IA32_FEATURE_CONTROL is locked!\n");
1168                        return -1;
1169                }
1170
1171                /* enable and lock */
1172                write_msr(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1173        }
1174        lcr4(rcr4() | X86_CR4_VMXE);
1175
1176        __vmxon(phys_addr);
1177        vpid_sync_gpc_global(); /* good idea, even if we aren't using vpids */
1178        ept_sync_global();
1179
1180        return 0;
1181}
1182
1183/**
1184 * vmx_disable - disables VMX mode on the current CPU
1185 */
1186static void vmx_disable(void *unused)
1187{
1188        if (currentcpu->vmx_enabled) {
1189                __vmxoff();
1190                lcr4(rcr4() & ~X86_CR4_VMXE);
1191                currentcpu->vmx_enabled = 0;
1192        }
1193}
1194
 1195/* Probe the cpu to see if it can do vmx.
 1196 * Returns TRUE if VT-x is supported, FALSE otherwise.
 1197 */
1198static bool probe_cpu_vmx(void)
1199{
1200        /* The best way to test this code is:
1201         * wrmsr -p <cpu> 0x3a 1
1202         * This will lock vmx off; then modprobe dune.
1203         * Frequently, however, systems have all 0x3a registers set to 5,
1204         * meaning testing is impossible, as vmx can not be disabled.
1205         * We have to simulate it being unavailable in most cases.
1206         * The 'test' variable provides an easy way to simulate
1207         * unavailability of vmx on some, none, or all cpus.
1208         */
1209        if (!cpu_has_vmx()) {
1210                printk("Machine does not support VT-x\n");
1211                return FALSE;
1212        } else {
1213                printk("Machine supports VT-x\n");
1214                return TRUE;
1215        }
1216}
1217
1218static int ept_init(void)
1219{
1220        if (!cpu_has_vmx_ept()) {
1221                printk("VMX doesn't support EPT!\n");
1222                return -1;
1223        }
1224        if (!cpu_has_vmx_eptp_writeback()) {
1225                printk("VMX EPT doesn't support WB memory!\n");
1226                return -1;
1227        }
1228        if (!cpu_has_vmx_ept_4levels()) {
1229                printk("VMX EPT doesn't support 4 level walks!\n");
1230                return -1;
1231        }
1232        switch (arch_max_jumbo_page_shift()) {
1233        case PML3_SHIFT:
1234                if (!cpu_has_vmx_ept_1g_page()) {
1235                        printk("VMX EPT doesn't support 1 GB pages!\n");
1236                        return -1;
1237                }
1238                break;
1239        case PML2_SHIFT:
1240                if (!cpu_has_vmx_ept_2m_page()) {
1241                        printk("VMX EPT doesn't support 2 MB pages!\n");
1242                        return -1;
1243                }
1244                break;
1245        default:
1246                printk("Unexpected jumbo page size %d\n",
1247                       arch_max_jumbo_page_shift());
1248                return -1;
1249        }
1250        if (!cpu_has_vmx_ept_ad_bits()) {
1251                printk("VMX EPT doesn't support accessed/dirty!\n");
1252                x86_ept_pte_fix_ups |= EPTE_A | EPTE_D;
1253        }
1254        if (!cpu_has_vmx_invept() || !cpu_has_vmx_invept_global()) {
1255                printk("VMX EPT can't invalidate PTEs/TLBs!\n");
1256                return -1;
1257        }
1258
1259        return 0;
1260}
1261
1262/**
1263 * vmx_init sets up physical core data areas that are required to run a vm at
1264 * all.  These data areas are not connected to a specific user process in any
1265 * way. Instead, they are in some sense externalizing what would other wise be a
1266 * very large ball of state that would be inside the CPU.
1267 */
1268int intel_vmm_init(void)
1269{
 1270        int ret;
1271
1272        if (!probe_cpu_vmx()) {
1273                return -EOPNOTSUPP;
1274        }
1275
1276        setup_vmcs_config(&ret);
1277
1278        if (ret) {
1279                printk("setup_vmcs_config failed: %d\n", ret);
1280                return ret;
1281        }
1282
1283        msr_bitmap = (unsigned long *)kpage_zalloc_addr();
1284        if (!msr_bitmap) {
1285                printk("Could not allocate msr_bitmap\n");
1286                return -ENOMEM;
1287        }
1288        io_bitmap = (unsigned long *)kpages_alloc(VMX_IO_BITMAP_SZ, MEM_WAIT);
1289        if (!io_bitmap) {
 1290                printk("Could not allocate io_bitmap\n");
1291                kfree(msr_bitmap);
1292                return -ENOMEM;
1293        }
1294        /* FIXME: do we need APIC virtualization (flexpriority?) */
1295
1296        memset(msr_bitmap, 0xff, PAGE_SIZE);
1297
1298        /* The following MSRs are virtualized to the vapic page so there is no
1299         * write or read from the actual MSR. */
1300        memset((void *)msr_bitmap + INTEL_X2APIC_MSR_START, 0,
1301               INTEL_X2APIC_MSR_LENGTH);
1302        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_LAPIC_EOI);
1303        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_LAPIC_TPR);
1304        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_LAPIC_SELF_IPI);
1305
1306        memset(io_bitmap, 0xff, VMX_IO_BITMAP_SZ);
1307
1308        /* These are the only MSRs that are not intercepted.  The hardware takes
1309         * care of FS_BASE, GS_BASE, and EFER.  We do the rest manually when
1310         * loading and unloading guest pcores. */
1311        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
1312        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);
1313        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_EFER);
1314        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE);
1315        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_LSTAR);
1316        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_STAR);
1317        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_SFMASK);
1318
1319        /* If we pretend to be a processor without this cpuid feature, we should
1320         * intercept and inject a GPF. */
1321        __vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PRED_CMD);
1322
1323        /* TODO: this might be dangerous, since they can do more than just read
1324         * the CMOS */
1325        __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_IDX);
1326        __vmx_disable_intercept_for_io(io_bitmap, CMOS_RAM_DATA);
1327
1328        if ((ret = ept_init())) {
1329                printk("EPT init failed, %d\n", ret);
1330                return ret;
1331        }
1332        printk("VMX setup succeeded\n");
1333        /* If this isn't true (we have VMX but not mwait), then we'll have to
1334         * look closely at CPU_BASED_MWAIT_EXITING. */
1335        assert(cpu_has_feat(CPU_FEAT_X86_MWAIT));
1336        return 0;
1337}
1338
1339int intel_vmm_pcpu_init(void)
1340{
1341        struct vmcs *vmxon_buf;
1342        int ret;
1343
1344        vmxon_buf = __vmx_alloc_vmcs(core_id());
1345        if (!vmxon_buf) {
1346                printk("setup_vmxarea failed on node %d\n", core_id());
1347                return -1;
1348        }
1349
1350        ret = __vmx_enable(vmxon_buf);
1351        if (ret)
1352                goto failed;
1353        currentcpu->vmx_enabled = 1;
1354        return 0;
1355failed:
1356        printk("Failed to enable VMX on core %d, err = %d\n", core_id(), ret);
1357        return ret;
1358}
1359
1360
1361void vapic_status_dump_kernel(void *vapic)
1362{
1363        uint32_t *p = (uint32_t *)vapic;
1364        int i;
1365        printk("-- BEGIN KERNEL APIC STATUS DUMP --\n");
1366        for (i = 0x100/sizeof(*p); i < 0x180/sizeof(*p); i+=4) {
1367                printk("VISR : 0x%x: 0x%08x\n", i, p[i]);
1368        }
1369        for (i = 0x200/sizeof(*p); i < 0x280/sizeof(*p); i+=4) {
1370                printk("VIRR : 0x%x: 0x%08x\n", i, p[i]);
1371        }
1372        i = 0x0B0/sizeof(*p);
1373        printk("EOI FIELD : 0x%x, 0x%08x\n", i, p[i]);
1374
1375        printk("-- END KERNEL APIC STATUS DUMP --\n");
1376}
1377
1378static DEFINE_PERCPU(struct guest_pcore *, gpc_to_clear_to);
1379
1380/* Note this is set up to allow spurious pokes.  Someone could arbitrarily send
1381 * us this KMSG at any time.  We only actually clear when we've previously
 1382 * unloaded the GPC.  gpc_to_clear_to is only set while we're just 'caching' it.
 1383 */
1384void vmx_clear_vmcs(void)
1385{
1386        struct guest_pcore *gpc;
1387        int8_t irq_state = 0;
1388
1389        disable_irqsave(&irq_state);
1390        gpc = PERCPU_VAR(gpc_to_clear_to);
1391        if (gpc) {
1392                vmcs_clear(gpc->vmcs);
1393                ept_sync_context(gpc_get_eptp(gpc));
1394                gpc->should_vmresume = FALSE;
1395                wmb(); /* write -1 after clearing */
1396                gpc->vmcs_core_id = -1;
1397                PERCPU_VAR(gpc_to_clear_to) = NULL;
1398        }
1399        enable_irqsave(&irq_state);
1400}
1401
1402static void __clear_vmcs(uint32_t srcid, long a0, long a1, long a2)
1403{
1404        vmx_clear_vmcs();
1405}
1406
1407/* We are safe from races on GPC, other than vmcs and vmcs_core_id.  For
1408 * instance, only one core can be loading or unloading a particular GPC at a
1409 * time.  Other cores write to our GPC's vmcs_core_id and vmcs (doing a
1410 * vmcs_clear).  Once they write vmcs_core_id != -1, it's ours. */
1411void vmx_load_guest_pcore(struct guest_pcore *gpc)
1412{
1413        int remote_core;
1414
1415        assert(!irq_is_enabled());
1416        if (gpc->vmcs_core_id == core_id()) {
1417                PERCPU_VAR(gpc_to_clear_to) = NULL;
1418                return;
1419        }
1420        /* Clear ours *before* waiting on someone else; avoids deadlock
1421         * (circular wait). */
1422        __clear_vmcs(0, 0, 0, 0);
1423        remote_core = ACCESS_ONCE(gpc->vmcs_core_id);
1424        if (remote_core != -1) {
1425                /* This is a bit nasty.  It requires the remote core to receive
1426                 * interrupts, which means we're now waiting indefinitely for
1427                 * them to enable IRQs.  They can wait on another core, and so
1428                 * on.  We cleared our vmcs first, so that we won't deadlock on
1429                 * *this*.
1430                 *
1431                 * However, this means we can't wait on another core with IRQs
1432                 * disabled for any *other* reason.  For instance, if some other
1433                 * subsystem decides to have one core wait with IRQs disabled on
1434                 * another, the core that has our VMCS could be waiting on us to
1435                 * do something that we'll never do. */
1436                send_kernel_message(remote_core, __clear_vmcs, 0, 0, 0,
1437                                    KMSG_IMMEDIATE);
1438                while (gpc->vmcs_core_id != -1)
1439                        cpu_relax();
1440        }
1441        vmcs_load(gpc->vmcs);
1442        __vmx_setup_pcpu(gpc);
1443        gpc->vmcs_core_id = core_id();
1444}
1445
1446void vmx_unload_guest_pcore(struct guest_pcore *gpc)
1447{
1448        /* We don't have to worry about races yet.  No one will try to load gpc
1449         * until we've returned and unlocked, and no one will clear an old VMCS
1450         * to this GPC, since it was cleared before we finished loading (above).
1451         */
1452        assert(!irq_is_enabled());
1453        gpc->vmcs_core_id = core_id();
1454        PERCPU_VAR(gpc_to_clear_to) = gpc;
1455}
1456
1457uint64_t gpc_get_eptp(struct guest_pcore *gpc)
1458{
1459        return gpc->proc->env_pgdir.eptp;
1460}
1461
1462int vmx_ctl_get_exits(struct vmx_vmm *vmx)
1463{
1464        int ret = 0;
1465
1466        if (vmx->cpu_exec_ctls & CPU_BASED_HLT_EXITING)
1467                ret |= VMM_CTL_EXIT_HALT;
1468        if (vmx->cpu_exec_ctls & CPU_BASED_PAUSE_EXITING)
1469                ret |= VMM_CTL_EXIT_PAUSE;
1470        if (vmx->cpu_exec_ctls & CPU_BASED_MWAIT_EXITING)
1471                ret |= VMM_CTL_EXIT_MWAIT;
1472        return ret;
1473}
1474
1475int vmx_ctl_set_exits(struct vmx_vmm *vmx, int vmm_exits)
1476{
1477        int toggle_want;
1478        int vmx_toggle_do = 0;
1479
1480        toggle_want = (vmx_ctl_get_exits(vmx) ^ vmm_exits) & VMM_CTL_ALL_EXITS;
1481        if (toggle_want & VMM_CTL_EXIT_HALT) {
1482            if (!vmx_control_can_be_changed(&cbec, CPU_BASED_HLT_EXITING))
1483                        error(ENOSYS, "VMX can't toggle EXIT_HALT");
1484                vmx_toggle_do |= CPU_BASED_HLT_EXITING;
1485        }
1486        if (toggle_want & VMM_CTL_EXIT_PAUSE) {
1487            if (!vmx_control_can_be_changed(&cbec, CPU_BASED_PAUSE_EXITING))
1488                        error(ENOSYS, "VMX can't toggle EXIT_PAUSE");
1489                vmx_toggle_do |= CPU_BASED_PAUSE_EXITING;
1490        }
1491        if (toggle_want & VMM_CTL_EXIT_MWAIT) {
1492            if (!vmx_control_can_be_changed(&cbec, CPU_BASED_MWAIT_EXITING))
1493                        error(ENOSYS, "VMX can't toggle EXIT_MWAIT");
1494                vmx_toggle_do |= CPU_BASED_MWAIT_EXITING;
1495        }
1496        /* This is being read concurrently by load_guest_pcore. */
1497        WRITE_ONCE(vmx->cpu_exec_ctls, vmx->cpu_exec_ctls ^ vmx_toggle_do);
1498        return 0;
1499}
1500