Added perfmon interrupt handling to allow overflow based profiling
[akaros.git] / kern / arch / x86 / process64.c
#include <arch/arch.h>
#include <trap.h>
#include <process.h>
#include <pmap.h>
#include <smp.h>

#include <string.h>
#include <assert.h>
#include <stdio.h>

void proc_pop_ctx(struct user_context *ctx)
{
        disable_irq();
        /* for both HW and SW, note we pass an offset into the TF, beyond the fs and
         * gs bases */
        if (ctx->type == ROS_HW_CTX) {
                struct hw_trapframe *tf = &ctx->tf.hw_tf;

                if (x86_hwtf_is_partial(tf)) {
                        swap_gs();
                } else {
                        write_msr(MSR_GS_BASE, (uint64_t)tf->tf_gsbase);
                        write_msr(MSR_FS_BASE, (uint64_t)tf->tf_fsbase);
                }
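                /* The pops below must mirror the hw_trapframe layout, starting
                 * at tf_rax.  After r15, the addq skips 0x10 bytes, which
                 * should be the trap number and error code slots (nothing
                 * architectural to restore there), leaving rsp pointed at the
                 * iretq frame proper: rip, cs, rflags, rsp, ss. */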
                asm volatile ("movq %0, %%rsp;          "
                              "popq %%rax;              "
                              "popq %%rbx;              "
                              "popq %%rcx;              "
                              "popq %%rdx;              "
                              "popq %%rbp;              "
                              "popq %%rsi;              "
                              "popq %%rdi;              "
                              "popq %%r8;               "
                              "popq %%r9;               "
                              "popq %%r10;              "
                              "popq %%r11;              "
                              "popq %%r12;              "
                              "popq %%r13;              "
                              "popq %%r14;              "
                              "popq %%r15;              "
                              "addq $0x10, %%rsp;       "
                              "iretq                    "
                              : : "g" (&tf->tf_rax) : "memory");
                panic("iretq failed");
        } else {
                struct sw_trapframe *tf = &ctx->tf.sw_tf;

                if (x86_swtf_is_partial(tf)) {
                        swap_gs();
                } else {
                        write_msr(MSR_GS_BASE, (uint64_t)tf->tf_gsbase);
                        write_msr(MSR_FS_BASE, (uint64_t)tf->tf_fsbase);
                }
                /* We need to 0 out any registers that aren't part of the sw_tf
                 * and that we won't use/clobber on the out-path.  Even though
                 * they aren't saved in the sw_tf, we don't want to leak any
                 * kernel register content. */
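                /* sysret hard-codes its sources: it loads rip from rcx and
                 * rflags from r11, which is why tf_rip is popped into rcx
                 * below and r11 is loaded with FL_IF (interrupts enabled, all
                 * other flags clear).  The rex.w prefix makes it a 64-bit
                 * sysretq. */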
                asm volatile ("movq %0, %%rsp;          "
                              "movq $0, %%rax;          "
                              "movq $0, %%rdx;          "
                              "movq $0, %%rsi;          "
                              "movq $0, %%rdi;          "
                              "movq $0, %%r8;           "
                              "movq $0, %%r9;           "
                              "movq $0, %%r10;          "
                              "popq %%rbx;              "
                              "popq %%rbp;              "
                              "popq %%r12;              "
                              "popq %%r13;              "
                              "popq %%r14;              "
                              "popq %%r15;              "
                              "movq %1, %%r11;          "
                              "popq %%rcx;              "
                              "popq %%rsp;              "
                              "rex.w sysret             "
                              : : "g"(&tf->tf_rbx), "i"(FL_IF) : "memory");
                panic("sysret failed");
        }
        panic("Unknown context type!\n");
}

/* Helper: if *addr isn't a canonical user address, poison it.  Use this when
 * you need a canonical address (like MSR_FS_BASE) */
static void enforce_user_canon(uintptr_t *addr)
{
        if (*addr >> 47 != 0)
                *addr = 0x5a5a5a5a;
}
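
/* Note the poison value is itself canonical: wrmsr of a non-canonical address
 * to MSR_FS_BASE would GP-fault in the kernel, so we presumably substitute a
 * canonical but almost certainly unmapped address and let userspace fault on
 * use instead.  e.g. a kernel-half base like 0xffff800000000000 has bits set
 * above bit 47 and gets replaced. */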

void proc_init_ctx(struct user_context *ctx, uint32_t vcoreid, uintptr_t entryp,
                   uintptr_t stack_top, uintptr_t tls_desc)
{
        struct sw_trapframe *sw_tf = &ctx->tf.sw_tf;
        /* zero the entire structure for any type, prevent potential disclosure */
        memset(ctx, 0, sizeof(struct user_context));
        ctx->type = ROS_SW_CTX;
        /* Stack pointers in a fresh stack frame need to be 16 byte aligned
         * (AMD64 ABI).  If we call this function from within load_elf(), it
         * should already be aligned properly, but we round again here for good
         * measure.  We used to subtract an extra 8 bytes here to allow us to
         * write our _start() function in C instead of assembly.  This was
         * necessary to account for a preamble inserted by the compiler, which
         * assumed a return address was pushed on the stack.  Now that we
         * properly pass our arguments on the stack, we will have to rewrite
         * our _start() function in assembly to handle things properly. */
        sw_tf->tf_rsp = ROUNDDOWN(stack_top, 16);
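        /* e.g. a stack_top of 0x7f7f00001238 rounds down to 0x7f7f00001230; an
         * already-aligned 0x7f7f00001230 stays put. */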
        sw_tf->tf_rip = entryp;
        sw_tf->tf_rbp = 0;                      /* for potential backtraces */
        sw_tf->tf_mxcsr = 0x00001f80;           /* x86 default mxcsr */
        sw_tf->tf_fpucw = 0x037f;               /* x86 default FP CW */
        /* Coupled closely with user's entry.S.  rbx holds the vcoreid, which
         * entry.S uses to determine what to do.  vcoreid == 0 is the main
         * core/context. */
        sw_tf->tf_rbx = vcoreid;
        sw_tf->tf_fsbase = tls_desc;
        proc_secure_ctx(ctx);
}
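
/* A minimal usage sketch (names here are illustrative, not actual callers): a
 * fresh main context for vcore 0, entering at the ELF entry point, might be
 * built as:
 *
 *      struct user_context ctx;
 *
 *      proc_init_ctx(&ctx, 0, elf_entry, ustack_top, tls_base);
 *
 * after which ctx is a secured ROS_SW_CTX, ready for proc_pop_ctx(). */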

void proc_secure_ctx(struct user_context *ctx)
{
        if (ctx->type == ROS_SW_CTX) {
                struct sw_trapframe *tf = &ctx->tf.sw_tf;
                enforce_user_canon(&tf->tf_gsbase);
                enforce_user_canon(&tf->tf_fsbase);
                enforce_user_canon(&tf->tf_rip);
                x86_swtf_clear_partial(tf);
        } else {
                /* If we aren't SW, we're assuming (and forcing) a HW ctx.  If
                 * this is somehow fucked up, userspace should die rather
                 * quickly. */
                struct hw_trapframe *tf = &ctx->tf.hw_tf;
                ctx->type = ROS_HW_CTX;
                enforce_user_canon(&tf->tf_gsbase);
                enforce_user_canon(&tf->tf_fsbase);
                /* GD_UD is the user data segment selector in the GDT, and
                 * GD_UT is the user text segment selector (see inc/memlayout.h).
                 * The low 2 bits of each segment register contain the
                 * Requestor Privilege Level (RPL); 3 means user mode. */
                tf->tf_ss = GD_UD | 3;
                tf->tf_cs = GD_UT | 3;
                tf->tf_rflags |= FL_IF;
                x86_hwtf_clear_partial(tf);
        }
}
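
/* Clearing the partial flag marks the context as full.  Judging from
 * proc_pop_ctx() above, a full context gets its fs/gs bases restored
 * explicitly via wrmsr, while a partial one relies on swapgs, which only works
 * while the hardware still holds the user's gs base from trap entry. */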

/* Called when we are currently running an address space on our core and want
 * to abandon it.  We need a known good pgdir before releasing the old one.  We
 * decref, since current no longer tracks the proc (and current no longer
 * protects the cr3). */
void __abandon_core(void)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
        lcr3(boot_cr3);
        proc_decref(pcpui->cur_proc);
        pcpui->cur_proc = 0;
}