x86: Fixes context security
[akaros.git] / kern / arch / x86 / process64.c
1 #include <arch/arch.h>
2 #include <trap.h>
3 #include <process.h>
4 #include <pmap.h>
5 #include <smp.h>
6
7 #include <string.h>
8 #include <assert.h>
9 #include <stdio.h>
10
11 void proc_pop_ctx(struct user_context *ctx)
12 {
13         disable_irq();
14         /* for both HW and SW, note we pass an offset into the TF, beyond the fs and
15          * gs bases */
16         if (ctx->type == ROS_HW_CTX) {
17                 struct hw_trapframe *tf = &ctx->tf.hw_tf;
18                 write_msr(MSR_GS_BASE, (uint64_t)tf->tf_gsbase);
19                 write_msr(MSR_FS_BASE, (uint64_t)tf->tf_fsbase);
20                 asm volatile ("movq %0, %%rsp;          "
21                               "popq %%rax;              "
22                               "popq %%rbx;              "
23                               "popq %%rcx;              "
24                               "popq %%rdx;              "
25                               "popq %%rbp;              "
26                               "popq %%rsi;              "
27                               "popq %%rdi;              "
28                               "popq %%r8;               "
29                               "popq %%r9;               "
30                               "popq %%r10;              "
31                               "popq %%r11;              "
32                               "popq %%r12;              "
33                               "popq %%r13;              "
34                               "popq %%r14;              "
35                               "popq %%r15;              "
36                               "addq $0x10, %%rsp;       "
37                               "iretq                    "
38                               : : "g" (&tf->tf_rax) : "memory");
39                 panic("iretq failed");
40         } else {
41                 struct sw_trapframe *tf = &ctx->tf.sw_tf;
42                 write_msr(MSR_GS_BASE, (uint64_t)tf->tf_gsbase);
43                 write_msr(MSR_FS_BASE, (uint64_t)tf->tf_fsbase);
44                 /* We need to 0 out any registers that aren't part of the sw_tf and that
45                  * we won't use/clobber on the out-path.  While these aren't part of the
46                  * sw_tf, we also don't want to leak any kernel register content. */
47                 asm volatile ("movq %0, %%rsp;          "
48                               "movq $0, %%rax;          "
49                                           "movq $0, %%rdx;          "
50                                           "movq $0, %%rsi;          "
51                                           "movq $0, %%rdi;          "
52                                           "movq $0, %%r8;           "
53                                           "movq $0, %%r9;           "
54                                           "movq $0, %%r10;          "
55                               "popq %%rbx;              "
56                               "popq %%rbp;              "
57                               "popq %%r12;              "
58                               "popq %%r13;              "
59                               "popq %%r14;              "
60                               "popq %%r15;              "
61                                           "movq %1, %%r11;          "
62                               "popq %%rcx;              "
63                               "popq %%rsp;              "
64                               "rex.w sysret             "
65                               : : "g"(&tf->tf_rbx), "i"(FL_IF) : "memory");
66                 panic("sysret failed");
67         }
68         panic("Unknown context type!\n");
69 }
70
/* Helper: if *addr isn't a canonical user address, poison it.  Use this when
 * you need a canonical address (like MSR_FS_BASE).
 *
 * An address is accepted only when bits 47 and above are all zero; anything
 * else is overwritten in place with a fixed poison value, which itself passes
 * the check. */
static void enforce_user_canon(uintptr_t *addr)
{
	/* Lowest bit that must be clear in an accepted address. */
	enum { USER_CANON_SHIFT = 47 };
	/* Recognizable junk value that still satisfies the check above. */
	const uintptr_t poison = 0x5a5a5a5a;

	if ((*addr >> USER_CANON_SHIFT) != 0)
		*addr = poison;
}
78
79 /* TODO: consider using a SW context */
80 void proc_init_ctx(struct user_context *ctx, uint32_t vcoreid, uintptr_t entryp,
81                    uintptr_t stack_top, uintptr_t tls_desc)
82 {
83         struct hw_trapframe *tf = &ctx->tf.hw_tf;
84         /* zero the entire structure for any type, prevent potential disclosure */
85         memset(ctx, 0, sizeof(struct user_context));
86         ctx->type = ROS_HW_CTX;
87         /* Stack pointers in a fresh stackframe need to be such that adding or
88          * subtracting 8 will result in 16 byte alignment (AMD64 ABI).  The reason
89          * is so that input arguments (on the stack) are 16 byte aligned.  The
90          * extra 8 bytes is the retaddr, pushed on the stack.  Compilers know they
91          * can subtract 8 to get 16 byte alignment for instructions like movaps. */
92         tf->tf_rsp = ROUNDDOWN(stack_top, 16) - 8;
93         tf->tf_rip = entryp;
94         /* Coupled closely with user's entry.S.  id is the vcoreid, which entry.S
95          * uses to determine what to do.  vcoreid == 0 is the main core/context. */
96         tf->tf_rax = vcoreid;
97         tf->tf_fsbase = tls_desc;
98         proc_secure_ctx(ctx);
99 }
100
101 void proc_secure_ctx(struct user_context *ctx)
102 {
103         if (ctx->type == ROS_SW_CTX) {
104                 struct sw_trapframe *tf = &ctx->tf.sw_tf;
105                 enforce_user_canon(&tf->tf_gsbase);
106                 enforce_user_canon(&tf->tf_fsbase);
107                 enforce_user_canon(&tf->tf_rip);
108         } else {
109                 /* If we aren't SW, we're assuming (and forcing) a HW ctx.  If this is
110                  * somehow fucked up, userspace should die rather quickly. */
111                 struct hw_trapframe *tf = &ctx->tf.hw_tf;
112                 ctx->type = ROS_HW_CTX;
113                 enforce_user_canon(&tf->tf_gsbase);
114                 enforce_user_canon(&tf->tf_fsbase);
115                 /* GD_UD is the user data segment selector in the GDT, and
116                  * GD_UT is the user text segment selector (see inc/memlayout.h).
117                  * The low 2 bits of each segment register contains the
118                  * Requestor Privilege Level (RPL); 3 means user mode. */
119                 tf->tf_ss = GD_UD | 3;
120                 tf->tf_cs = GD_UT | 3;
121                 tf->tf_rflags |= FL_IF;
122         }
123 }
124
125 /* Called when we are currently running an address space on our core and want to
126  * abandon it.  We need a known good pgdir before releasing the old one.  We
127  * decref, since current no longer tracks the proc (and current no longer
128  * protects the cr3). */
129 void __abandon_core(void)
130 {
131         struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
132         lcr3(boot_cr3);
133         proc_decref(pcpui->cur_proc);
134         pcpui->cur_proc = 0;
135 }