#include <arch/console.h>
#include <ros/common.h>
#include <arch/mptables.h>
/* Interrupt descriptor table.  64-bit needs 16-byte alignment (I think). */
gatedesc_t __attribute__((aligned (16))) idt[256] = { { 0 } };

/* Interrupt handler table: each element is a linked list of handlers for a
 * given IRQ.  Modification requires holding the lock (TODO: RCU). */
struct irq_handler *irq_handlers[NUM_IRQS];
spinlock_t irq_handler_wlock = SPINLOCK_INITIALIZER_IRQSAVE;
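
/* A minimal sketch (not this file's actual dispatch loop) of the intended
 * discipline for this table: writers take irq_handler_wlock and publish with a
 * wmb() (see register_irq() below), while readers walk the list lock-free:
 *
 *	for (struct irq_handler *h = irq_handlers[vec]; h; h = h->next)
 *		h->isr(hw_tf, h->data);
 */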
const char *x86_trapname(int trapno)
{
	static const char *const excnames[] = {
		"Non-Maskable Interrupt",
		"BOUND Range Exceeded",
		"Device Not Available",
		"Coprocessor Segment Overrun",
		"Segment Not Present",
		"x87 FPU Floating-Point Error",
		"SIMD Floating-Point Exception"
	};

	if (trapno < sizeof(excnames)/sizeof(excnames[0]))
		return excnames[trapno];
	if (trapno == T_SYSCALL)
		return "System call";
	return "(unknown trap)";
}
/* Set stacktop for the current core to be the stack the kernel will start on
 * when trapping/interrupting from userspace.  Don't use this until after
 * smp_percpu_init().  We can probably get the TSS by reading the task register
 * and then the GDT.  Still, it's a pain. */
void set_stack_top(uintptr_t stacktop)
{
	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

	/* No need to reload the task register; this takes effect immediately. */
	x86_set_stacktop_tss(pcpui->tss, stacktop);
	/* Also need to make sure sysenters come in correctly. */
	x86_set_sysenter_stacktop(stacktop);
}
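
/* For reference, a sketch of the alternative the comment above mentions:
 * recover the TSS via the task register and the GDT instead of pcpui.  The
 * 'str' instruction is architectural; gdt_desc_to_tss() is a hypothetical
 * helper, not something this file provides.
 *
 *	uint16_t tr;
 *
 *	asm volatile ("str %0" : "=r"(tr));
 *	taskstate_t *tss = gdt_desc_to_tss(&gdt[tr >> 3]);
 */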
/* Note the check implies we are only on a one-page stack (or the first page). */
uintptr_t get_stack_top(void)
{
	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
	uintptr_t stacktop;

	/* so we can check this in interrupt handlers (before smp_boot()) */
	/* TODO: These are dangerous - it assumes we're on a one-page stack.  If we
	 * change it to KSTKSIZE, then we assume stacks are KSTKSIZE-aligned. */
	if (!pcpui->tss)
		return ROUNDUP(read_sp(), PGSIZE);
	stacktop = x86_get_stacktop_tss(pcpui->tss);
	if (stacktop != ROUNDUP(read_sp(), PGSIZE))
		panic("Bad stacktop: %p esp one is %p\n", stacktop,
		      ROUNDUP(read_sp(), PGSIZE));
	return stacktop;
}
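
/* Sketch of the KSTKSIZE variant the TODO above alludes to; it assumes kernel
 * stacks are both KSTKSIZE-sized and KSTKSIZE-aligned:
 *
 *	return ROUNDUP(read_sp(), KSTKSIZE);
 */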
/* Sends a non-maskable interrupt; the handler will print a trapframe. */
void send_nmi(uint32_t os_coreid)
{
	/* NMI / IPI for x86 are limited to 8 bits */
	uint8_t hw_core = (uint8_t)get_hw_coreid(os_coreid);
	/* This table is made in trapentry$BITS.S by each macro in that file.
	 * It is laid out such that the ith entry is the ith trap handler's
	 * (uintptr_t) trap addr, then (uint32_t) trap number. */
	struct trapinfo { uintptr_t trapaddr; uint32_t trapnumber; }
	       __attribute__((packed));
	extern struct trapinfo trap_tbl[];
	extern struct trapinfo trap_tbl_end[];
	int i, trap_tbl_size = trap_tbl_end - trap_tbl;
	extern void ISR_default(void);
	extern void ISR_syscall(void);

	/* set all to default, to catch everything */
	for (i = 0; i < 256; i++)
		SETGATE(idt[i], 0, GD_KT, &ISR_default, 0);

	/* set all entries that have real trap handlers.
	 * we need to stop short of the last one, since the last is the default
	 * handler with a fake interrupt number (500) that is out of bounds of
	 * the handler table. */
	for (i = 0; i < trap_tbl_size - 1; i++)
		SETGATE(idt[trap_tbl[i].trapnumber], 0, GD_KT, trap_tbl[i].trapaddr, 0);

	/* Sanity check: the syscall gate must point at ISR_syscall. */
	assert((uintptr_t)ISR_syscall ==
	       ((uintptr_t)idt[T_SYSCALL].gd_off_63_32 << 32 |
	        (uintptr_t)idt[T_SYSCALL].gd_off_31_16 << 16 |
	        (uintptr_t)idt[T_SYSCALL].gd_off_15_0));
	/* turn on trap-based syscall handling and other user-accessible ints.
	 * DPL 3 means this can be triggered by the int instruction. */
	idt[T_SYSCALL].gd_dpl = 3;
	idt[T_BRKPT].gd_dpl = 3;
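	/* For example, once gd_dpl is 3 a CPL-3 program can execute "int3", or
	 * "int" with the syscall vector, and land in these handlers; with a DPL-0
	 * gate the same instructions would raise a general protection fault. */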
	/* Set up our kernel stack when changing rings */
	/* Note: we want 16-byte aligned kernel stack frames (AMD 2:8.9.3) */
	x86_set_stacktop_tss(&ts, (uintptr_t)bootstacktop);
	x86_sysenter_init((uintptr_t)bootstacktop);

#ifdef CONFIG_KTHREAD_POISON
	*kstack_bottom_addr((uintptr_t)bootstacktop) = 0xdeadbeef;
#endif /* CONFIG_KTHREAD_POISON */

	/* Initialize the TSS field of the gdt.  The size of the TSS desc differs
	 * between 64 and 32 bit, hence the pointer acrobatics */
	syssegdesc_t *ts_slot = (syssegdesc_t*)&gdt[GD_TSS >> 3];
	*ts_slot = (syssegdesc_t)SEG_SYS_SMALL(STS_T32A, (uintptr_t)&ts,
	                                       sizeof(taskstate_t), 0);

	/* Init the IDT PD.  Need to do this before ltr for some reason.  (Doing
	 * this between ltr and lidt causes the machine to reboot...) */
	idt_pd.pd_lim = sizeof(idt) - 1;
	idt_pd.pd_base = (uintptr_t)idt;

	asm volatile("lidt %0" : : "m"(idt_pd));
	int ncleft = MAX_NUM_CORES;
	int num_cores_mpacpi;

	ncleft = mpsinit(ncleft);
	ncleft = mpacpi(ncleft);
	num_cores_mpacpi = MAX_NUM_CORES - ncleft;
	printk("MP and ACPI found %d cores\n", num_cores_mpacpi);
	if (num_cores != num_cores_mpacpi)
		warn("Topology (%d) and MP/ACPI (%d) differ on num_cores!", num_cores,
		     num_cores_mpacpi);

	/* the lapic IRQs need to be unmasked on a per-core basis */
	register_irq(IdtLAPIC_TIMER, timer_interrupt, NULL,
	             MKBUS(BusLAPIC, 0, 0, 0));
	register_irq(IdtLAPIC_ERROR, handle_lapic_error, NULL,
	             MKBUS(BusLAPIC, 0, 0, 0));
	register_irq(I_KERNEL_MSG, handle_kmsg_ipi, NULL, MKBUS(BusIPI, 0, 0, 0));
}
static void handle_fperr(struct hw_trapframe *hw_tf)
{
	uint16_t fpcw, fpsw;
	uint32_t mxcsr;

	asm volatile ("fnstcw %0" : "=m"(fpcw));
	asm volatile ("fnstsw %0" : "=m"(fpsw));
	asm volatile ("stmxcsr %0" : "=m"(mxcsr));
	print_trapframe(hw_tf);
	printk("Core %d: FP ERR, CW: 0x%04x, SW: 0x%04x, MXCSR 0x%08x\n", core_id(),
	       fpcw, fpsw, mxcsr);
	printk("Core %d: The following faults are unmasked:\n", core_id());
	if (fpsw & ~fpcw & FP_EXCP_IE) {
		printk("\tInvalid Operation: ");
		if (fpsw & FP_SW_SF) {
			if (fpsw & FP_SW_C1)
				printk("Stack overflow\n");
			else
				printk("Stack underflow\n");
		} else {
			printk("invalid arithmetic operand\n");
		}
	}
	if (fpsw & ~fpcw & FP_EXCP_DE)
		printk("\tDenormalized operand\n");
	if (fpsw & ~fpcw & FP_EXCP_ZE)
		printk("\tDivide by zero\n");
	if (fpsw & ~fpcw & FP_EXCP_OE)
		printk("\tNumeric Overflow\n");
	if (fpsw & ~fpcw & FP_EXCP_UE)
		printk("\tNumeric Underflow\n");
	if (fpsw & ~fpcw & FP_EXCP_PE)
		printk("\tInexact result (precision)\n");
	printk("Killing the process.\n");
	proc_destroy(current);
}
static bool __handle_page_fault(struct hw_trapframe *hw_tf, unsigned long *aux)
{
	uintptr_t fault_va = rcr2();
	int prot = hw_tf->tf_err & PF_ERROR_WRITE ? PROT_WRITE : PROT_READ;
	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
	int err;

	/* safe to reenable after rcr2 */
	if (!pcpui->cur_proc) {
		/* still catch KPFs */
		assert((hw_tf->tf_cs & 3) == 0);
		print_trapframe(hw_tf);
		backtrace_kframe(hw_tf);
		panic("Proc-less Page Fault in the Kernel at %p!", fault_va);
	}
	/* TODO - handle kernel page faults.  This is dangerous, since we might be
	 * holding locks in the kernel and could deadlock when we HPF.  For now, I'm
	 * just disabling the lock checker, since it'll flip out when it sees there
	 * is a kernel trap.  Will need to think about this a bit, esp when we
	 * properly handle bad addrs and whatnot.
	 *
	 * Also consider turning on IRQs globally while we call HPF. */
	if (in_kernel(hw_tf))
		pcpui->__lock_checking_enabled--;
	err = handle_page_fault(pcpui->cur_proc, fault_va, prot);
	if (in_kernel(hw_tf))
		pcpui->__lock_checking_enabled++;
	if (in_kernel(hw_tf)) {
		print_trapframe(hw_tf);
		backtrace_kframe(hw_tf);
		panic("Proc-ful Page Fault in the Kernel at %p!", fault_va);
		/* if we want to do something like kill a process or other code, be
		 * aware we are in a sort of irq-like context, meaning the main
		 * kernel code we 'interrupted' could be holding locks - even
		 * irqsave ones. */
	}
	hw_tf->tf_err |= PF_VMR_BACKED;
	/* useful debugging */
	printk("[%08x] user %s fault va %p ip %p on core %d with err %d\n",
	       current->pid, prot & PROT_READ ? "READ" : "WRITE", fault_va,
	       hw_tf->tf_rip, core_id(), err);
	print_trapframe(hw_tf);
	/* Turn this on to help debug bad function pointers */
	printd("rsp %p\n\t 0(rsp): %p\n\t 8(rsp): %p\n\t 16(rsp): %p\n"
	       "\t24(rsp): %p\n", hw_tf->tf_rsp,
	       *(uintptr_t*)(hw_tf->tf_rsp + 0),
	       *(uintptr_t*)(hw_tf->tf_rsp + 8),
	       *(uintptr_t*)(hw_tf->tf_rsp + 16),
	       *(uintptr_t*)(hw_tf->tf_rsp + 24));
/* Certain traps want IRQs enabled, such as the syscall.  Others can't handle
 * it, like the page fault handler.  Turn them on on a case-by-case basis. */
static void trap_dispatch(struct hw_trapframe *hw_tf)
{
	struct per_cpu_info *pcpui;
	bool handled = TRUE;
	unsigned long aux = 0;

	// Handle processor exceptions.
	switch (hw_tf->tf_trapno) {
	case T_NMI:
		/* Temporarily disable deadlock detection when we print.  We could
		 * deadlock if we were printing when we NMIed. */
		pcpui = &per_cpu_info[core_id()];
		pcpui->__lock_checking_enabled--;
		/* This is a bit hacky, but we don't have a decent API yet */
		extern bool mon_verbose_trace;
		if (mon_verbose_trace) {
			print_trapframe(hw_tf);
			backtrace_kframe(hw_tf);
		}
		char *fn_name = get_fn_name(x86_get_ip_hw(hw_tf));
		printk("Core %d is at %p (%s)\n", core_id(), x86_get_ip_hw(hw_tf),
		       fn_name);
		print_kmsgs(core_id());
		pcpui->__lock_checking_enabled++;
		break;
	case T_ILLOP:
	{
		/* TODO: this can PF if there is a concurrent unmap/PM removal. */
		uintptr_t ip = x86_get_ip_hw(hw_tf);
		pcpui = &per_cpu_info[core_id()];
		pcpui->__lock_checking_enabled--;	/* for print debugging */
		/* We will muck with the actual TF.  If we're dealing with
		 * userspace, we need to make sure we edit the actual TF that will
		 * get restarted (pcpui), and not the TF on the kstack (which aren't
		 * the same).  See set_current_ctx() for more info. */
		if (!in_kernel(hw_tf))
			hw_tf = &pcpui->cur_ctx->tf.hw_tf;
		printd("bad opcode, eip: %p, next 3 bytes: %x %x %x\n", ip,
		       *(uint8_t*)(ip + 0),
		       *(uint8_t*)(ip + 1),
		       *(uint8_t*)(ip + 2));
		/* rdtscp: 0f 01 f9 */
		if (*(uint8_t*)(ip + 0) == 0x0f &&
		    *(uint8_t*)(ip + 1) == 0x01 &&
		    *(uint8_t*)(ip + 2) == 0xf9) {
			x86_fake_rdtscp(hw_tf);
			pcpui->__lock_checking_enabled++;	/* for print debugging */
			return;
		}
		pcpui->__lock_checking_enabled++;	/* for print debugging */
		break;
	}
	case T_PGFLT:
		handled = __handle_page_fault(hw_tf, &aux);
		break;
	case T_SYSCALL:
		// check for userspace, for now
		assert(hw_tf->tf_cs != GD_KT);
		/* Set up and run the async calls */
		/* TODO: this is using the wrong reg1 for traps for 32 bit */
		prep_syscalls(current,
		              (struct syscall*)x86_get_systrap_arg0(hw_tf),
		              (unsigned int)x86_get_systrap_arg1(hw_tf));
		break;
	default:
		if (hw_tf->tf_cs == GD_KT) {
			print_trapframe(hw_tf);
			panic("Damn Damn! Unhandled trap in the kernel!");
		}
		handled = FALSE;
	}
	if (!handled)
		reflect_unhandled_trap(hw_tf->tf_trapno, hw_tf->tf_err, aux);
}
/* Helper.  For now, this copies out the TF to pcpui.  Eventually, we should
 * consider doing this in trapentry.S.
 *
 * TODO: consider having this return the tf used, so we can set tf in trap and
 * irq handlers to edit the TF that will get restarted.  Right now, the kernel
 * uses and restarts tf, but userspace restarts the old pcpui tf.  It is
 * tempting to do this, but note that tf stays on the stack of the kthread,
 * while pcpui->cur_ctx is for the core we trapped in on.  Meaning if we ever
 * block, suddenly cur_ctx is pointing to some old clobbered state that was
 * already returned to and can't be trusted.  Meanwhile tf can always be trusted
 * (like with an in_kernel() check).  The only types of traps from the user that
 * can be expected to have editable trapframes are ones that don't block. */
static void set_current_ctx_hw(struct per_cpu_info *pcpui,
                               struct hw_trapframe *hw_tf)
{
	assert(!irq_is_enabled());
	assert(!pcpui->cur_ctx);
	pcpui->actual_ctx.type = ROS_HW_CTX;
	pcpui->actual_ctx.tf.hw_tf = *hw_tf;
	pcpui->cur_ctx = &pcpui->actual_ctx;
}

static void set_current_ctx_sw(struct per_cpu_info *pcpui,
                               struct sw_trapframe *sw_tf)
{
	assert(!irq_is_enabled());
	assert(!pcpui->cur_ctx);
	pcpui->actual_ctx.type = ROS_SW_CTX;
	pcpui->actual_ctx.tf.sw_tf = *sw_tf;
	pcpui->cur_ctx = &pcpui->actual_ctx;
}
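
/* Sketch of the pattern the comment above describes: a handler that will not
 * block and wants its edits to survive the return to userspace should redirect
 * to the context that actually gets restarted.  This mirrors the T_ILLOP path
 * in trap_dispatch(); the two-byte advance is only an example:
 *
 *	if (!in_kernel(hw_tf))
 *		hw_tf = &pcpui->cur_ctx->tf.hw_tf;
 *	x86_advance_ip(hw_tf, 2);
 */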
/* If the interrupt interrupted a halt, we advance past it.  Made to work with
 * x86's custom cpu_halt() in arch/arch.h.  Note this nearly never gets called.
 * I needed to insert exactly one 'nop' in cpu_halt() (that isn't there now) to
 * get the interrupt to trip on the hlt, o/w the hlt will execute before the
 * interrupt arrives (even with a pending interrupt that should hit right after
 * an interrupt_enable (sti)).  This was on the i7. */
static void abort_halt(struct hw_trapframe *hw_tf)
{
	/* Don't care about user TFs.  Incidentally, dereferencing user EIPs is
	 * reading userspace memory, which can be dangerous.  It can page fault,
	 * like immediately after a fork (which doesn't populate the pages). */
	if (!in_kernel(hw_tf))
		return;
	/* the halt instruction is 0xf4, and its size is 1 byte */
	if (*(uint8_t*)x86_get_ip_hw(hw_tf) == 0xf4)
		x86_advance_ip(hw_tf, 1);
}
void trap(struct hw_trapframe *hw_tf)
{
	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

	/* Copy out the TF for now */
	if (!in_kernel(hw_tf)) {
		set_current_ctx_hw(pcpui, hw_tf);
		/* ignoring state for nested kernel traps.  should be rare. */
		__set_cpu_state(pcpui, CPU_STATE_KERNEL);
	} else {
		inc_ktrap_depth(pcpui);
	}
	printd("Incoming TRAP %d on core %d, TF at %p\n", hw_tf->tf_trapno,
	       core_id(), hw_tf);
	if ((hw_tf->tf_cs & ~3) != GD_UT && (hw_tf->tf_cs & ~3) != GD_KT) {
		print_trapframe(hw_tf);
		panic("Trapframe with invalid CS!");
	}
	trap_dispatch(hw_tf);
	/* Return to the current process, which should be runnable.  If we're the
	 * kernel, we should just return naturally.  Note that current and tf need
	 * to still be okay (might not be after blocking) */
	if (in_kernel(hw_tf)) {
		dec_ktrap_depth(pcpui);
		return;
	}
	proc_restartcore();
}
static bool vector_is_irq(int apic_vec)
{
	/* arguably, we could limit them to MaxIdtIOAPIC */
	return (IdtPIC <= apic_vec) && (apic_vec <= IdtMAX);
}

/* Note IRQs are disabled unless explicitly turned on.
 *
 * In general, we should only get trapno's >= PIC1_OFFSET (32).  Anything else
 * should be a trap.  Even if we don't use the PIC, that should be the standard.
 * It is possible to get a spurious LAPIC IRQ with vector 15 (or similar), but
 * the spurious check should catch that.
 *
 * Note that from hardware's perspective (PIC, etc), IRQs start from 0, but they
 * are all mapped up at PIC1_OFFSET for the cpu / irq_handler (e.g., ISA IRQ 4
 * arrives as vector PIC1_OFFSET + 4 = 36). */
void handle_irq(struct hw_trapframe *hw_tf)
{
	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
	struct irq_handler *irq_h;

	/* Copy out the TF for now */
	if (!in_kernel(hw_tf))
		set_current_ctx_hw(pcpui, hw_tf);
	if (!in_irq_ctx(pcpui))
		__set_cpu_state(pcpui, CPU_STATE_IRQ);
	inc_irq_depth(pcpui);
	/* Coupled with cpu_halt() and smp_idle() */
	abort_halt(hw_tf);

	if (hw_tf->tf_trapno != IdtLAPIC_TIMER)	/* timer irq */
		if (hw_tf->tf_trapno != I_KERNEL_MSG)
			if (hw_tf->tf_trapno != 65)	/* qemu serial tends to get this one */
				printd("Incoming IRQ, ISR: %d on core %d\n", hw_tf->tf_trapno,
				       core_id());
	/* TODO: RCU read lock */
	irq_h = irq_handlers[hw_tf->tf_trapno];
	if (!irq_h) {
		warn_once("Received IRQ %d, had no handler registered!",
		          hw_tf->tf_trapno);
		/* If we don't have an IRQ handler, we don't know how to EOI.  Odds
		 * are, it's a LAPIC IRQ, such as I_TESTING */
		if (!lapic_check_spurious(hw_tf->tf_trapno))
			lapic_send_eoi(hw_tf->tf_trapno);
		goto out_no_eoi;
	}
	if (irq_h->check_spurious(hw_tf->tf_trapno))
		goto out_no_eoi;
	/* Can now be interrupted/nested by higher priority IRQs, but not by our
	 * current IRQ vector, until we EOI. */
	enable_irq();
	while (irq_h) {
		irq_h->isr(hw_tf, irq_h->data);
		irq_h = irq_h->next;
	}
	// if we're a general purpose IPI function call, down the cpu_list
	extern handler_wrapper_t handler_wrappers[NUM_HANDLER_WRAPPERS];
	if ((I_SMP_CALL0 <= hw_tf->tf_trapno) &&
	    (hw_tf->tf_trapno <= I_SMP_CALL_LAST))
		down_checklist(handler_wrappers[hw_tf->tf_trapno & 0x0f].cpu_list);
	disable_irq();
	/* Keep in sync with ipi_is_pending */
	irq_handlers[hw_tf->tf_trapno]->eoi(hw_tf->tf_trapno);
out_no_eoi:
	dec_irq_depth(pcpui);
	if (!in_irq_ctx(pcpui))
		__set_cpu_state(pcpui, CPU_STATE_KERNEL);
	/* Return to the current process, which should be runnable.  If we're the
	 * kernel, we should just return naturally.  Note that current and tf need
	 * to still be okay (might not be after blocking) */
	if (in_kernel(hw_tf))
		return;
	proc_restartcore();
}
/* The irq field may be ignored based on the type of Bus. */
int register_irq(int irq, isr_t handler, void *irq_arg, uint32_t tbdf)
{
	struct irq_handler *irq_h;
	int vector;

	irq_h = kzmalloc(sizeof(struct irq_handler), 0);
	irq_h->dev_irq = irq;
	vector = bus_irq_setup(irq_h);
	printk("IRQ %d, vector %d (0x%x), type %s\n", irq, vector, vector,
	       irq_h->type);
	assert(irq_h->check_spurious && irq_h->eoi);
	irq_h->isr = handler;
	irq_h->data = irq_arg;
	irq_h->apic_vector = vector;

	spin_lock_irqsave(&irq_handler_wlock);
	irq_h->next = irq_handlers[vector];
	wmb();	/* make sure irq_h is done before publishing to readers */
	irq_handlers[vector] = irq_h;
	spin_unlock_irqsave(&irq_handler_wlock);
	/* Most IRQs other than the BusIPI should need their irq unmasked.
	 * Might need to pass the irq_h, in case unmask needs more info.
	 * The lapic IRQs need to be unmasked on a per-core basis. */
	if (irq_h->unmask && strcmp(irq_h->type, "lapic"))
		irq_h->unmask(irq_h, vector);
	return 0;
}
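
/* A hedged usage sketch: the device, IRQ number, and handler below are made
 * up, but the call pattern matches the register_irq() calls in the IDT setup
 * above.  bus_irq_setup() picks the vector and the EOI/mask ops from the tbdf.
 *
 *	static void my_uart_isr(struct hw_trapframe *hw_tf, void *data)
 *	{
 *		// drain the FIFO, wake any readers, etc.
 *	}
 *
 *	register_irq(4, my_uart_isr, NULL, MKBUS(BusISA, 0, 0, 0));
 */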
/* These routing functions only allow the routing of an irq to a single core.
 * If we want to route to multiple cores, we'll probably need to set up logical
 * groups or something and take some additional parameters. */
static int route_irq_h(struct irq_handler *irq_h, int os_coreid)
{
	int hw_coreid;

	if (!irq_h->route_irq) {
		printk("[kernel] apic_vec %d, type %s cannot be routed\n",
		       irq_h->apic_vector, irq_h->type);
		return -1;
	}
	if (os_coreid >= MAX_NUM_CORES) {
		printk("[kernel] os_coreid %d out of range!\n", os_coreid);
		return -1;
	}
	hw_coreid = get_hw_coreid(os_coreid);
	if (hw_coreid == -1) {
		printk("[kernel] os_coreid %d not a valid hw core!\n", os_coreid);
		return -1;
	}
	irq_h->route_irq(irq_h, irq_h->apic_vector, hw_coreid);
	return 0;
}

/* Routes all irqs for a given apic_vector to os_coreid.  Returns 0 if all of
 * them succeeded.  -1 if there were none or if any of them failed.  We don't
 * share IRQs often (if ever anymore), so this shouldn't be an issue. */
int route_irqs(int apic_vec, int os_coreid)
{
	struct irq_handler *irq_h;
	int ret = -1;

	if (!vector_is_irq(apic_vec)) {
		printk("[kernel] vector %d is not an IRQ vector!\n", apic_vec);
		return -1;
	}
	irq_h = irq_handlers[apic_vec];
	while (irq_h) {
		assert(irq_h->apic_vector == apic_vec);
		ret = route_irq_h(irq_h, os_coreid);
		irq_h = irq_h->next;
	}
	return ret;
}
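
/* Usage sketch (the vector arithmetic is an assumption): pin whatever was
 * registered on ISA IRQ 4 to OS core 0, assuming that IRQ landed at
 * PIC1_OFFSET + 4.
 *
 *	route_irqs(PIC1_OFFSET + 4, 0);
 */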
/* It's a moderate pain in the ass to put these in bit-specific files (header
 * hell with the set_current_ helpers) */
void sysenter_callwrapper(struct syscall *sysc, unsigned long count,
                          struct sw_trapframe *sw_tf)
{
	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];

	set_current_ctx_sw(pcpui, sw_tf);
	__set_cpu_state(pcpui, CPU_STATE_KERNEL);
	/* Once we've set_current_ctx, we can enable interrupts.  This used to be
	 * mandatory (we had immediate KMSGs that would muck with cur_ctx).  Now it
	 * should only help for sanity/debugging. */
	enable_irq();
	/* Set up and run the async calls */
	prep_syscalls(current, sysc, count);
	/* If you use pcpui again, reread it, since you might have migrated */
	proc_restartcore();
}
/* Declared in x86/arch.h */
void send_ipi(uint32_t os_coreid, uint8_t vector)
{
	int hw_coreid = get_hw_coreid(os_coreid);

	if (hw_coreid == -1) {
		panic("Unmapped OS coreid (OS %d)!\n", os_coreid);
		return;
	}
	__send_ipi(hw_coreid, vector);
}