akaros/kern/arch/x86/process64.c
#include <arch/arch.h>
#include <trap.h>
#include <process.h>
#include <pmap.h>
#include <smp.h>
#include <arch/fsgsbase.h>

#include <string.h>
#include <assert.h>
#include <stdio.h>

static void __attribute__((noreturn)) proc_pop_hwtf(struct hw_trapframe *tf)
{
        /* for both HW and SW, note we pass an offset into the TF, beyond the fs
         * and gs bases */
        if (x86_hwtf_is_partial(tf)) {
                swap_gs();
        } else {
                write_gsbase(tf->tf_gsbase);
                write_fsbase(tf->tf_fsbase);
        }
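        /* The pops below walk the GPRs in hw_trapframe order, starting at
         * tf_rax.  The addq skips the trapno and err words (0x10 bytes), so
         * the iretq then consumes RIP, CS, RFLAGS, RSP, and SS from the rest
         * of the frame. */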
        asm volatile (".globl __asm_pop_hwtf_start;"
                      "__asm_pop_hwtf_start:    "
                      "movq %0, %%rsp;          "
                      "popq %%rax;              "
                      "popq %%rbx;              "
                      "popq %%rcx;              "
                      "popq %%rdx;              "
                      "popq %%rbp;              "
                      "popq %%rsi;              "
                      "popq %%rdi;              "
                      "popq %%r8;               "
                      "popq %%r9;               "
                      "popq %%r10;              "
                      "popq %%r11;              "
                      "popq %%r12;              "
                      "popq %%r13;              "
                      "popq %%r14;              "
                      "popq %%r15;              "
                      "addq $0x10, %%rsp;       "
                      "iretq;                   "
                      ".globl __asm_pop_hwtf_end;"
                      "__asm_pop_hwtf_end:      "
                      : : "g" (&tf->tf_rax) : "memory");
        panic("iretq failed");
}

static void __attribute__((noreturn)) proc_pop_swtf(struct sw_trapframe *tf)
{
        if (x86_swtf_is_partial(tf)) {
                swap_gs();
        } else {
                write_gsbase(tf->tf_gsbase);
                write_fsbase(tf->tf_fsbase);
        }
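        /* sysretq (the rex.w sysret below) loads RIP from %rcx and RFLAGS from
         * %r11, which is why tf_rip is popped into %rcx and %r11 is set to
         * just FL_IF (interrupts enabled, nothing else). */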
        /* We need to 0 out any registers that aren't part of the sw_tf and that
         * we won't use/clobber on the out-path.  While these aren't part of the
         * sw_tf, we also don't want to leak any kernel register content. */
        asm volatile (".globl __asm_pop_swtf_start;"
                      "__asm_pop_swtf_start:    "
                      "movq %0, %%rsp;          "
                      "movq $0, %%rax;          "
                      "movq $0, %%rdx;          "
                      "movq $0, %%rsi;          "
                      "movq $0, %%rdi;          "
                      "movq $0, %%r8;           "
                      "movq $0, %%r9;           "
                      "movq $0, %%r10;          "
                      "popq %%rbx;              "
                      "popq %%rbp;              "
                      "popq %%r12;              "
                      "popq %%r13;              "
                      "popq %%r14;              "
                      "popq %%r15;              "
                      "movq %1, %%r11;          "
                      "popq %%rcx;              "
                      "popq %%rsp;              "
                      "rex.w sysret;            "
                      ".globl __asm_pop_swtf_end;"
                      "__asm_pop_swtf_end:      "
                      : : "g"(&tf->tf_rbx), "i"(FL_IF) : "memory");
        panic("sysret failed");
}

/* If popping a VM TF fails for some reason, we need to reflect it back to the
 * user.  It is possible that the reflection fails.  We still need to run
 * something, and it's a lousy time to try something else.  So we'll give them a
 * TF that will probably fault right away and kill them. */
static void __attribute__((noreturn)) handle_bad_vm_tf(struct vm_trapframe *tf)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

        tf->tf_exit_reason |= VMX_EXIT_REASONS_FAILED_VMENTRY;
        tf->tf_flags |= VMCTX_FL_HAS_FAULT;
        if (reflect_current_context()) {
                printk("[kernel] Unable to reflect after a bad VM enter\n");
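                /* 0xcafebabe is a deliberately bogus entry point; the fresh
                 * ctx should fault almost immediately, per the comment
                 * above. */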
                proc_init_ctx(pcpui->cur_ctx, 0, 0xcafebabe, 0, 0);
        }
        proc_pop_ctx(pcpui->cur_ctx);
}

static void __attribute__((noreturn)) proc_pop_vmtf(struct vm_trapframe *tf)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
        struct proc *p = pcpui->cur_proc;
        struct guest_pcore *gpc;

        if (x86_vmtf_is_partial(tf)) {
                gpc = lookup_guest_pcore(p, tf->tf_guest_pcoreid);
                assert(gpc);
                assert(pcpui->guest_pcoreid == tf->tf_guest_pcoreid);
                assert(gpc->should_vmresume);
        } else {
                gpc = load_guest_pcore(p, tf->tf_guest_pcoreid);
                if (!gpc) {
                        tf->tf_exit_reason = EXIT_REASON_GUEST_IN_USE;
                        handle_bad_vm_tf(tf);
                }
        }
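        /* GUEST_RSP, GUEST_CR3, GUEST_RIP, and GUEST_RFLAGS are guest-state
         * fields in the VMCS; the CPU loads them on vmentry.  The other GPRs
         * are not part of the VMCS, so the asm below restores them by hand
         * right before launching/resuming. */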
        vmcs_write(GUEST_RSP, tf->tf_rsp);
        vmcs_write(GUEST_CR3, tf->tf_cr3);
        vmcs_write(GUEST_RIP, tf->tf_rip);
        vmcs_write(GUEST_RFLAGS, tf->tf_rflags);
        /* The host stacktop could have changed, even if we are still a partial
         * context.  Consider a vmcall that blocks.  We'll restart the partial
         * context, but be on a new stack.  set_stack_top() doesn't really know
         * about the VMCS. */
        vmcs_write(HOST_RSP, pcpui->stacktop);
        /* cr2 is not part of the VMCS state; we need to save/restore it
         * manually */
        lcr2(tf->tf_cr2);
        vmcs_write(VM_ENTRY_INTR_INFO_FIELD, tf->tf_trap_inject);
        /* Someone may have tried poking the guest and posting an IRQ, but the
         * IPI missed (concurrent vmexit).  In these cases, the 'outstanding
         * notification' bit should still be set, and we can resend the IPI.
         * This will arrive after we vmenter, since IRQs are currently disabled.
         */
        if (test_bit(VMX_POSTED_OUTSTANDING_NOTIF, gpc->posted_irq_desc))
                send_self_ipi(I_POKE_GUEST);
        /* The first time a VMCS is started after being loaded, it must be
         * launched.  Subsequent starts must be resumes.  Once the VMCS is
         * cleared, we start with a launch again.  Note this is the VMCS, not
         * the GPC unload. */
        if (gpc->should_vmresume) {
                tf->tf_flags |= VMCTX_FL_VMRESUME;
        } else {
                tf->tf_flags &= ~VMCTX_FL_VMRESUME;
                gpc->should_vmresume = TRUE;
        }
        /* vmlaunch/resume can fail, so we need to be able to return from this.
         * Thus we can't clobber rsp via the popq style of setting the
         * registers.  Likewise, we don't want to lose rbp via the clobber list.
         *
         * Partial contexts have already been launched, so we resume them. */
        asm volatile (".globl __asm_pop_vmtf_start;"
                      "__asm_pop_vmtf_start:     "
                      "testl $"STRINGIFY(VMCTX_FL_VMRESUME)", %c[flags](%0);"
                      "pushq %%rbp;              " /* save in case we fail */
                      "movq %c[rbx](%0), %%rbx;  "
                      "movq %c[rcx](%0), %%rcx;  "
                      "movq %c[rdx](%0), %%rdx;  "
                      "movq %c[rbp](%0), %%rbp;  "
                      "movq %c[rsi](%0), %%rsi;  "
                      "movq %c[rdi](%0), %%rdi;  "
                      "movq %c[r8](%0),  %%r8;   "
                      "movq %c[r9](%0),  %%r9;   "
                      "movq %c[r10](%0), %%r10;  "
                      "movq %c[r11](%0), %%r11;  "
                      "movq %c[r12](%0), %%r12;  "
                      "movq %c[r13](%0), %%r13;  "
                      "movq %c[r14](%0), %%r14;  "
                      "movq %c[r15](%0), %%r15;  "
                      "movq %c[rax](%0), %%rax;  " /* clobber our *tf last */
                      "jnz 1f;                   " /* jump if resume */
                      ASM_VMX_VMLAUNCH";         " /* non-resume gets launched */
                      "jmp 2f;                   "
                      "1: "ASM_VMX_VMRESUME";    "
                      "2: popq %%rbp;            " /* vmlaunch failed */
                      ".globl __asm_pop_vmtf_end;"
                      "__asm_pop_vmtf_end:       "
                      :
                      : "a" (tf),
                        [rax]"i"(offsetof(struct vm_trapframe, tf_rax)),
                        [rbx]"i"(offsetof(struct vm_trapframe, tf_rbx)),
                        [rcx]"i"(offsetof(struct vm_trapframe, tf_rcx)),
                        [rdx]"i"(offsetof(struct vm_trapframe, tf_rdx)),
                        [rbp]"i"(offsetof(struct vm_trapframe, tf_rbp)),
                        [rsi]"i"(offsetof(struct vm_trapframe, tf_rsi)),
                        [rdi]"i"(offsetof(struct vm_trapframe, tf_rdi)),
                         [r8]"i"(offsetof(struct vm_trapframe, tf_r8)),
                         [r9]"i"(offsetof(struct vm_trapframe, tf_r9)),
                        [r10]"i"(offsetof(struct vm_trapframe, tf_r10)),
                        [r11]"i"(offsetof(struct vm_trapframe, tf_r11)),
                        [r12]"i"(offsetof(struct vm_trapframe, tf_r12)),
                        [r13]"i"(offsetof(struct vm_trapframe, tf_r13)),
                        [r14]"i"(offsetof(struct vm_trapframe, tf_r14)),
                        [r15]"i"(offsetof(struct vm_trapframe, tf_r15)),
                        [flags]"i"(offsetof(struct vm_trapframe, tf_flags))
                      : "cc", "memory", "rbx", "rcx", "rdx", "rsi", "rdi",
                        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
        /* vmlaunch/resume failed.  It could be for a few reasons, including
         * things like launching instead of resuming, not having a VMCS loaded,
         * failing a host-state area check, etc.  Those are kernel problems.
         *
         * The user should not be able to trigger these problems.  The user
         * could trigger a problem loading the guest-state area, such as a
         * non-canonical address for RIP.  Those sorts of errors should appear
         * to be a normal vmexit with some flags set.
         *
         * Any failed vmlaunch/resume is likely a kernel bug, but we'll still
         * reflect it to the user for debuggability.
         *
         * Also, we should always have a non-shadow VMCS loaded, so the failure
         * is a VMfailValid: ZF is set and the error code can be read from
         * VM_INSTRUCTION_ERROR.  (VMfailInvalid, meaning no current VMCS,
         * would set CF instead.) */
        assert(read_flags() & FL_ZF);
        tf->tf_exit_reason = EXIT_REASON_VMENTER_FAILED;
        tf->tf_exit_qual = vmcs_read(VM_INSTRUCTION_ERROR);
        tf->tf_flags |= VMCTX_FL_PARTIAL;
        warn("vmlaunch / vmresume failed, check userspace's reflected fault");
        handle_bad_vm_tf(tf);
}

void proc_pop_ctx(struct user_context *ctx)
{
        disable_irq();
        switch (ctx->type) {
        case ROS_HW_CTX:
                proc_pop_hwtf(&ctx->tf.hw_tf);
                break;
        case ROS_SW_CTX:
                proc_pop_swtf(&ctx->tf.sw_tf);
                break;
        case ROS_VM_CTX:
                proc_pop_vmtf(&ctx->tf.vm_tf);
                break;
        default:
                /* We should have caught this when securing the ctx */
                panic("Unknown context type %d!", ctx->type);
        }
}

void proc_init_ctx(struct user_context *ctx, uint32_t vcoreid, uintptr_t entryp,
                   uintptr_t stack_top, uintptr_t tls_desc)
{
        struct sw_trapframe *sw_tf = &ctx->tf.sw_tf;

        /* Zero the entire structure for any type, to prevent potential
         * disclosure. */
        memset(ctx, 0, sizeof(struct user_context));
        ctx->type = ROS_SW_CTX;
        /* Stack pointers in x86 C functions need to be such that adding or
         * subtracting 8 will result in 16 byte alignment (AMD64 ABI), which we
         * call an odd-8-byte alignment.  The reason is so that input arguments
         * (on the stack) are 16 byte aligned.  The extra 8 bytes is the
         * retaddr, pushed on the stack.  Compilers know they can subtract 8 to
         * get 16 byte alignment for instructions like movaps.
         *
         * However, the kernel will start contexts at 16 byte aligned stacks.
         * This is because glibc's _start (in ASM) expects this.  Parlib x86's
         * vcore entry does the same.
         *
         * We init contexts for both an elf startup as well as vcore entry.  It
         * is up to the caller (including the user) to make sure the stack is
         * aligned properly.  elf.c doesn't know about these concerns, so if it
         * messes up, there's nothing we can really do, since the args are just
         * wrong.  ld will fail immediately though, so we'll find out quickly.
         */
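        /* Concretely: the ABI requires %rsp to be 16-byte aligned at the point
         * of a call; the call pushes an 8-byte retaddr, so a normal function
         * entry sees an odd-8-byte aligned %rsp.  _start has no retaddr pushed,
         * which is why it instead expects a 16-byte aligned stack. */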
        sw_tf->tf_rsp = stack_top;
        sw_tf->tf_rip = entryp;
        sw_tf->tf_rbp = 0;      /* for potential backtraces */
        sw_tf->tf_mxcsr = 0x00001f80;   /* x86 default mxcsr */
        sw_tf->tf_fpucw = 0x037f;       /* x86 default FP CW */
        /* Coupled closely with user's entry.S.  id is the vcoreid, which
         * entry.S uses to determine what to do.  vcoreid == 0 is the main
         * core/context. */
        sw_tf->tf_rbx = vcoreid;
        sw_tf->tf_fsbase = tls_desc;
        proc_secure_ctx(ctx);
}

static void proc_secure_hwtf(struct hw_trapframe *tf)
{
        enforce_user_canon(&tf->tf_gsbase);
        enforce_user_canon(&tf->tf_fsbase);
        enforce_user_canon(&tf->tf_rip);
        enforce_user_canon(&tf->tf_rsp);
        /* GD_UD is the user data segment selector in the GDT, and
         * GD_UT is the user text segment selector (see inc/memlayout.h).
         * The low 2 bits of each segment register contain the
         * Requestor Privilege Level (RPL); 3 means user mode. */
        tf->tf_ss = GD_UD | 3;
        tf->tf_cs = GD_UT | 3;
        /* Always 1: interrupts */
        tf->tf_rflags |= FL_IF;
        /* Always 0: IOPL must be set to 0.  VM (virtual 8086) probably doesn't
         * matter - SDM says it can't get modified via iret anyways.  VIF and
         * VIP are also virtual-8086 mode stuff.  Supposedly NT is settable by
         * userspace, but there's no good reason for it.  Rather be paranoid. */
        tf->tf_rflags &= ~(FL_IOPL_MASK | FL_VM | FL_NT | FL_VIF | FL_VIP);
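        /* Reserved RFLAGS bits: FL_RSVD_1 covers the must-be-one bits (OR them
         * in); FL_RSVD_0, used as an AND mask, clears the must-be-zero bits. */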
        tf->tf_rflags |= FL_RSVD_1;
        tf->tf_rflags &= FL_RSVD_0;
        x86_hwtf_clear_partial(tf);
}

static void proc_secure_swtf(struct sw_trapframe *tf)
{
        enforce_user_canon(&tf->tf_gsbase);
        enforce_user_canon(&tf->tf_fsbase);
        enforce_user_canon(&tf->tf_rip);
        enforce_user_canon(&tf->tf_rsp);
        /* The kernel doesn't actually load the mxcsr or the fpucw, but we can
         * still sanitize it in case we ever do load it (loading an MXCSR with
         * reserved bits set raises #GP). */
        tf->tf_mxcsr &= MXCSR_RSVD_0;
        x86_swtf_clear_partial(tf);
}

static void proc_secure_vmtf(struct vm_trapframe *tf)
{
        /* The user can say whatever it wants for the bulk of the TF.  If they
         * mess up something in the guest-area, it'll be treated like a vmexit.
         * There are a few things in the TF that we use on the kernel side.
         *
         * If guest_pcoreid is bad (not a guest_pcore), we'll fail to load the
         * GPC and reflect the fault to userspace.
         *
         * Regarding tf_flags, some are informational for the user, and some
         * are for the kernel's own use:
         * - VMCTX_FL_PARTIAL: We clear this below
         * - VMCTX_FL_VMRESUME: Used to temporarily carry a bool in pop_vmtf,
         *   but we never trust the value in the VM TF.
         * These are write-only from the kernel and passed to the user:
         * - VMCTX_FL_HAS_FAULT
         * - VMCTX_FL_EPT_VMR_BACKED */
        x86_vmtf_clear_partial(tf);
}

void proc_secure_ctx(struct user_context *ctx)
{
        switch (ctx->type) {
        case ROS_HW_CTX:
                proc_secure_hwtf(&ctx->tf.hw_tf);
                break;
        case ROS_SW_CTX:
                proc_secure_swtf(&ctx->tf.sw_tf);
                break;
        case ROS_VM_CTX:
                proc_secure_vmtf(&ctx->tf.vm_tf);
                break;
        default:
                /* If we aren't another ctx type, we're assuming (and forcing) a
                 * HW ctx.  If this is somehow fucked up, userspace should die
                 * rather quickly. */
                ctx->type = ROS_HW_CTX;
                proc_secure_hwtf(&ctx->tf.hw_tf);
        }
}

/* Called when we are currently running an address space on our core and want
 * to abandon it.  We need a known good pgdir before releasing the old one.  We
 * decref, since current no longer tracks the proc (and current no longer
 * protects the cr3). */
void __abandon_core(void)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
        struct proc *old_proc;

        lcr3(boot_cr3);
        old_proc = pcpui->cur_proc;
        pcpui->cur_proc = NULL;
        proc_decref(old_proc);
}

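/* Arch-specific hook for when a core stops owning a proc.  The only x86 work
 * here is clearing any VMCS that may still be loaded on this core. */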
void __clear_owning_proc(uint32_t coreid)
{
        vmx_clear_vmcs();
}