diff --git a/kern/src/elf.c b/kern/src/elf.c
index 6cf23b1..233b202 100644
--- a/kern/src/elf.c
+++ b/kern/src/elf.c
 #include <pmap.h>
 #include <smp.h>
 #include <arch/arch.h>
+#include <umem.h>
 
-#ifdef KERN64
+#ifdef CONFIG_64BIT
 # define elf_field(obj, field) (elf64 ? (obj##64)->field : (obj##32)->field)
 #else
 # define elf_field(obj, field) ((obj##32)->field)
 #endif
 
+/* Check if the file is a valid elf file (i.e. by checking for ELF_MAGIC in the
+ * header) */
+bool is_valid_elf(struct file *f)
+{
+       elf64_t h;
+       off64_t o = 0;
+       uintptr_t c = switch_to_ktask();
+
+       if (f->f_op->read(f, (char*)&h, sizeof(elf64_t), &o) != sizeof(elf64_t)) {
+               goto fail;
+       }
+       if (h.e_magic != ELF_MAGIC) {
+               goto fail;
+       }
+success:
+       switch_back_from_ktask(c);
+       return TRUE;
+fail:
+       switch_back_from_ktask(c);
+       return FALSE;
+}
+
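For context, is_valid_elf() is meant as a cheap sanity check before committing to a full load.  A minimal sketch of a caller, assuming an exec-style path that already holds an open struct file (the wrapper below is illustrative, not part of this commit):

	/* Sketch only: guard an exec-style path with is_valid_elf() before
	 * doing the full load.  try_load_binary() is a hypothetical wrapper,
	 * written as if it lived in the same kernel context as elf.c. */
	static int try_load_binary(struct proc *p, struct file *f,
	                           int argc, char *argv[], int envc, char *envp[])
	{
		if (!is_valid_elf(f)) {
			printk("[kernel] exec: %s is not an elf\n", file_name(f));
			return -1;
		}
		return load_elf(p, f, argc, argv, envc, envp);
	}
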
+static uintptr_t populate_stack(struct proc *p, int argc, char *argv[],
+                                                int envc, char *envp[],
+                                                int auxc, elf_aux_t auxv[])
+{
+       /* Map in pages for p's stack. */
+       int flags = MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE;
+       uintptr_t stacksz = USTACK_NUM_PAGES*PGSIZE;
+       if (do_mmap(p, USTACKTOP-stacksz, stacksz, PROT_READ | PROT_WRITE,
+                   flags, NULL, 0) == MAP_FAILED)
+               return 0;
+
+       /* Function to get the lengths of the argument and environment strings. */
+       int get_lens(int argc, char *argv[], int arg_lens[])
+       {
+               int total = 0;
+               for (int i = 0; i < argc; i++) {
+                       arg_lens[i] = strlen(argv[i]) + 1;
+                       total += arg_lens[i];
+               }
+               return total;
+       }
+
+       /* Function to help map the argument and environment strings to their
+        * final location. */
+       int remap(int argc, char *argv[], char *new_argv[],
+              char new_argbuf[], int arg_lens[])
+       {
+               int offset = 0;
+               char *temp_argv[argc + 1];
+               for (int i = 0; i < argc; i++) {
+                       if (memcpy_to_user(p, new_argbuf + offset, argv[i], arg_lens[i]))
+                               return -1;
+                       temp_argv[i] = new_argbuf + offset;
+                       offset += arg_lens[i];
+               }
+               temp_argv[argc] = NULL;
+               if (memcpy_to_user(p, new_argv, temp_argv, sizeof(temp_argv)))
+                       return -1;
+               return offset;
+       }
+
+       /* Start tracking the size of the buffer necessary to hold all of our data
+        * on the stack. Preallocate space for argc, argv, envp, and auxv in this
+        * buffer. */
+       int bufsize = 0;
+       bufsize += 1 * sizeof(size_t);
+       bufsize += (auxc + 1) * sizeof(elf_aux_t);
+       bufsize += (envc + 1) * sizeof(char**);
+       bufsize += (argc + 1) * sizeof(char**);
+
+       /* Add in the size of the env and arg strings. */
+       int arg_lens[argc];
+       int env_lens[envc];
+       bufsize += get_lens(argc, argv, arg_lens);
+       bufsize += get_lens(envc, envp, env_lens);
+
+       /* Adjust bufsize so that our buffer will ultimately be 16 byte aligned. */
+       bufsize = ROUNDUP(bufsize, 16);
+
+       /* Set up pointers to all of the appropriate data regions we map to. */
+       size_t *new_argc = (size_t*)(USTACKTOP - bufsize);
+       char **new_argv = (char**)(new_argc + 1);
+       char **new_envp = new_argv + argc + 1;
+       elf_aux_t *new_auxv = (elf_aux_t*)(new_envp + envc + 1);
+       char *new_argbuf = (char*)(new_auxv + auxc + 1);
+
+       /* Verify that all data associated with our argv, envp, and auxv arrays
+        * (and any corresponding strings they point to) will fit in the space
+        * allotted. */
+       if (bufsize > ARG_MAX)
+               return 0;
+
+       /* Map argc into its final location. */
+       if (memcpy_to_user(p, new_argc, &argc, sizeof(size_t)))
+               return 0;
+
+       /* Map all data for argv and envp into its final location. */
+       int offset = 0;
+       offset = remap(argc, argv, new_argv, new_argbuf, arg_lens);
+       if (offset == -1)
+               return 0;
+       offset = remap(envc, envp, new_envp, new_argbuf + offset, env_lens);
+       if (offset == -1)
+               return 0;
+
+       /* Map auxv into its final location. */
+       elf_aux_t null_aux = {0, 0};
+       if (memcpy_to_user(p, new_auxv, auxv, auxc * sizeof(elf_aux_t)))
+               return 0;
+       if (memcpy_to_user(p, new_auxv + auxc, &null_aux, sizeof(elf_aux_t)))
+               return 0;
+
+       return USTACKTOP - bufsize;
+}
+
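The layout populate_stack() leaves behind is worth spelling out: at the returned stack_top sits argc, followed at increasing addresses by the NULL-terminated argv pointer array, the NULL-terminated envp array, the auxv vector terminated by a {0, 0} entry, and finally the argument and environment strings those arrays point into, with the slack from the 16-byte ROUNDUP sitting just below USTACKTOP.  A user-space sketch of how a startup stub could walk that layout, given the initial stack pointer the kernel hands it (types and names here are illustrative, not Akaros's actual parlib startup code):

	/* Sketch only: user-space view of the stack built by populate_stack().
	 * 'sp' is the initial stack pointer, i.e. the stack_top returned above. */
	#include <stddef.h>
	#include <stdint.h>

	struct elf_aux { long id; long val; };	/* stand-in for the kernel's elf_aux_t */

	static void parse_startup_stack(uintptr_t sp)
	{
		size_t argc = *(size_t*)sp;
		char **argv = (char**)(sp + sizeof(size_t));
		char **envp = argv + argc + 1;		/* argv is NULL-terminated */
		size_t envc = 0;

		while (envp[envc])			/* so is envp */
			envc++;
		/* auxv follows envp's NULL and ends with a {0, 0} entry; the
		 * strings argv/envp point to live above auxv, below USTACKTOP. */
		struct elf_aux *auxv = (struct elf_aux*)(envp + envc + 1);

		(void)auxv;	/* unused in this sketch */
	}
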
 /* We need the writable flag for ld.  Even though the elf header says it wants
  * RX (and not W) for its main program header, it will page fault (eip 56f0,
  * 46f0 after being relocated to 0x1000, va 0x20f4). */
-static int load_one_elf(struct proc *p, struct file *f, uintptr_t pgoffset,
+static int load_one_elf(struct proc *p, struct file *f, uintptr_t pg_num,
                         elf_info_t *ei, bool writable)
 {
        int ret = -1;
        ei->phdr = -1;
        ei->dynamic = 0;
        ei->highest_addr = 0;
-       off_t f_off = 0;
+       off64_t f_off = 0;
        void* phdrs = 0;
-       int mm_perms, mm_flags = MAP_FIXED;
-       
-       /* When reading on behalf of the kernel, we need to make sure no proc is
-        * "current".  This is a bit ghetto (TODO: KFOP) */
-       struct proc *cur_proc = current;
-       current = 0;
+       int mm_perms, mm_flags;
+
+       /* When reading on behalf of the kernel, we need to switch to a ktask so
+        * the VFS (and maybe other places) know. (TODO: KFOP) */
+       uintptr_t old_ret = switch_to_ktask();
 
        /* Read in ELF header. */
        elf64_t elfhdr_storage;
        elf32_t* elfhdr32 = (elf32_t*)&elfhdr_storage;
        elf64_t* elfhdr64 = &elfhdr_storage;
-       if (f->f_op->read(f, (char*)elfhdr64, sizeof(elf64_t), &f_off) == -1)
+       if (f->f_op->read(f, (char*)elfhdr64, sizeof(elf64_t), &f_off)
+               != sizeof(elf64_t)) {
+               /* if you ever debug this, be sure to 0 out elfhdr_storage in advance */
+               printk("[kernel] load_one_elf: failed to read file\n");
                goto fail;
-
+       }
+       if (elfhdr64->e_magic != ELF_MAGIC) {
+               printk("[kernel] load_one_elf: file is not an elf!\n");
+               goto fail;
+       }
        bool elf32 = elfhdr32->e_ident[ELF_IDENT_CLASS] == ELFCLASS32;
        bool elf64 = elfhdr64->e_ident[ELF_IDENT_CLASS] == ELFCLASS64;
-       if (elf64 == elf32)
+       if (elf64 == elf32) {
+               printk("[kernel] load_one_elf: ID as both 32 and 64 bit\n");
                goto fail;
-       #ifndef KERN64
-       if(elf64)
+       }
+       #ifndef CONFIG_64BIT
+       if (elf64) {
+               printk("[kernel] load_one_elf: 64 bit elf on 32 bit kernel\n");
                goto fail;
+       }
+       #endif
+       /* Not sure what RISCV's 64 bit kernel can do here, so this check is x86
+        * only */
+       #ifdef CONFIG_X86
+       if (elf32) {
+               printk("[kernel] load_one_elf: 32 bit elf on 64 bit kernel\n");
+               goto fail;
+       }
        #endif
 
        size_t phsz = elf64 ? sizeof(proghdr64_t) : sizeof(proghdr32_t);
@@ -54,12 +190,17 @@ static int load_one_elf(struct proc *p, struct file *f, uintptr_t pgoffset,
        uint16_t e_phoff = elf_field(elfhdr, e_phoff);
 
        /* Read in program headers. */
-       if (e_phnum > 10000 || e_phoff % (elf32 ? 4 : 8) != 0)
-         goto fail;
+       if (e_phnum > 10000 || e_phoff % (elf32 ? 4 : 8) != 0) {
+               printk("[kernel] load_one_elf: Bad program headers\n");
+               goto fail;
+       }
        phdrs = kmalloc(e_phnum * phsz, 0);
        f_off = e_phoff;
-       if (!phdrs || f->f_op->read(f, phdrs, e_phnum * phsz, &f_off) == -1)
+       if (!phdrs || f->f_op->read(f, phdrs, e_phnum * phsz, &f_off) !=
+                     e_phnum * phsz) {
+               printk("[kernel] load_one_elf: could not get program headers\n");
                goto fail;
+       }
        for (int i = 0; i < e_phnum; i++) {
                proghdr32_t* ph32 = (proghdr32_t*)phdrs + i;
                proghdr64_t* ph64 = (proghdr64_t*)phdrs + i;
@@ -75,7 +216,8 @@ static int load_one_elf(struct proc *p, struct file *f, uintptr_t pgoffset,
                p_flags |= (writable ? ELF_PROT_WRITE : 0);
                /* All mmaps need to be fixed to their VAs.  If the program wants it to
                 * be a writable region, we also need the region to be private. */
-               mm_flags = MAP_FIXED | (p_flags & ELF_PROT_WRITE ? MAP_PRIVATE : 0);
+               mm_flags = MAP_FIXED |
+                          (p_flags & ELF_PROT_WRITE ? MAP_PRIVATE : MAP_SHARED);
 
                if (p_type == ELF_PROG_PHDR)
                        ei->phdr = p_va;
@@ -83,27 +225,37 @@ static int load_one_elf(struct proc *p, struct file *f, uintptr_t pgoffset,
                        f_off = p_offset;
                        ssize_t maxlen = sizeof(ei->interp);
                        ssize_t bytes = f->f_op->read(f, ei->interp, maxlen, &f_off);
-                       if (bytes == -1)
-                         goto fail;
+                       /* trying to catch errors.  don't know how big it could be, but it
+                        * should be more than 0. */
+                       if (bytes <= 0) {
+                               printk("[kernel] load_one_elf: could not read ei->interp\n");
+                               goto fail;
+                       }
 
                        maxlen = MIN(maxlen, bytes);
-                       if (strnlen(ei->interp, maxlen) == maxlen)
-                         goto fail;
+                       if (strnlen(ei->interp, maxlen) == maxlen) {
+                               printk("[kernel] load_one_elf: interpreter name too long\n");
+                               goto fail;
+                       }
 
                        ei->dynamic = 1;
                }
                else if (p_type == ELF_PROG_LOAD && p_memsz) {
-                       if (p_align % PGSIZE)
+                       if (p_align % PGSIZE) {
+                               printk("[kernel] load_one_elf: not page aligned\n");
                                goto fail;
-                       if (p_offset % PGSIZE != p_va % PGSIZE)
+                       }
+                       if (p_offset % PGSIZE != p_va % PGSIZE) {
+                               printk("[kernel] load_one_elf: offset and vaddr misaligned\n");
                                goto fail;
+                       }
 
                        uintptr_t filestart = ROUNDDOWN(p_offset, PGSIZE);
                        uintptr_t filesz = p_offset + p_filesz - filestart;
 
                        uintptr_t memstart = ROUNDDOWN(p_va, PGSIZE);
                        uintptr_t memsz = ROUNDUP(p_va + p_memsz, PGSIZE) - memstart;
-                       memstart += pgoffset * PGSIZE;
+                       memstart += pg_num * PGSIZE;
 
                        if (memstart + memsz > ei->highest_addr)
                                ei->highest_addr = memstart + memsz;
@@ -125,8 +277,10 @@ static int load_one_elf(struct proc *p, struct file *f, uintptr_t pgoffset,
                                if (filesz - partial) {
                                        /* Map the complete pages. */
                                        if (do_mmap(p, memstart, filesz - partial, mm_perms,
-                                                   mm_flags, f, filestart) == MAP_FAILED)
+                                                   mm_flags, f, filestart) == MAP_FAILED) {
+                                               printk("[kernel] load_one_elf: complete mmap failed\n");
                                                goto fail;
+                                       }
                                }
                                /* Note that we (probably) only need to do this zeroing the end
                                 * of a partial file page when we are dealing with
@@ -135,19 +289,34 @@ static int load_one_elf(struct proc *p, struct file *f, uintptr_t pgoffset,
                                        /* Need our own populated, private copy of the page so that
                                         * we can zero the remainder - and not zero chunks of the
                                         * real file in the page cache. */
+                                       mm_flags &= ~MAP_SHARED;
                                        mm_flags |= MAP_PRIVATE | MAP_POPULATE;
 
                                        /* Map the final partial page. */
                                        uintptr_t last_page = memstart + filesz - partial;
                                        if (do_mmap(p, last_page, PGSIZE, mm_perms, mm_flags,
-                                                   f, filestart + filesz - partial) == MAP_FAILED)
+                                                   f, filestart + filesz - partial) == MAP_FAILED) {
+                                               printk("[kernel] load_one_elf: partial mmap failed\n");
                                                goto fail;
+                                       }
 
-                                       /* Zero the end of it. */
-                                       pte_t *pte = pgdir_walk(p->env_pgdir, (void*)last_page, 0);
-                                       assert(pte);
-                                       void* last_page_kva = ppn2kva(PTE2PPN(*pte));
-                                       memset(last_page_kva + partial, 0, PGSIZE - partial);
+                                       /* Zero the end of it.  This is a huge pain in the ass.  The
+                                        * filesystems should zero out the last bits of a page if
+                                        * the file doesn't fill the last page.  But we're dealing
+                                        * with windows into otherwise complete files. */
+                                       pte_t pte = pgdir_walk(p->env_pgdir, (void*)last_page, 0);
+                                       /* if we were able to get a PTE, then there is a real page
+                                        * backing the VMR, and we need to zero the excess.  if
+                                        * there isn't, then the page fault code should handle it.
+                                        * since we set populate above, we should have a PTE, except
+                                        * in cases where the offset + len window exceeded the file
+                                        * size.  in this case, we let them mmap it, but didn't
+                                        * populate it.  there will be a PF right away if someone
+                                        * tries to use this.  check out do_mmap for more info. */
+                                       if (pte_walk_okay(pte)) {
+                                               void* last_page_kva = KADDR(pte_get_paddr(pte));
+                                               memset(last_page_kva + partial, 0, PGSIZE - partial);
+                                       }
 
                                        filesz = ROUNDUP(filesz, PGSIZE);
                                }
@@ -156,8 +325,10 @@ static int load_one_elf(struct proc *p, struct file *f, uintptr_t pgoffset,
                        if (filesz < memsz)
                                if (do_mmap(p, memstart + filesz, memsz-filesz,
                                            PROT_READ | PROT_WRITE, MAP_PRIVATE,
-                                               NULL, 0) == MAP_FAILED)
+                                               NULL, 0) == MAP_FAILED) {
+                                       printk("[kernel] load_one_elf: anon mmap failed\n");
                                        goto fail;
+                               }
                }
        }
        /* map in program headers anyway if not present in binary.
@@ -167,80 +338,102 @@ static int load_one_elf(struct proc *p, struct file *f, uintptr_t pgoffset,
                uintptr_t filesz = e_phoff + (e_phnum * phsz) - filestart;
                void *phdr_addr = do_mmap(p, 0, filesz, PROT_READ | PROT_WRITE,
                                          MAP_PRIVATE, f, filestart);
-               if (phdr_addr == MAP_FAILED)
+               if (phdr_addr == MAP_FAILED) {
+                       printk("[kernel] load_one_elf: prog header mmap failed\n");
                        goto fail;
+               }
                ei->phdr = (long)phdr_addr + e_phoff;
        }
-       ei->entry = elf_field(elfhdr, e_entry) + pgoffset*PGSIZE;
+       ei->entry = elf_field(elfhdr, e_entry) + pg_num * PGSIZE;
        ei->phnum = e_phnum;
        ei->elf64 = elf64;
        ret = 0;
-       goto out;
+       /* Fall-through */
 fail:
-       printk("[kernel] Load failed during loadelf of file %s!\n", file_name(f));
-out:
        if (phdrs)
                kfree(phdrs);
-       current = cur_proc;
+       switch_back_from_ktask(old_ret);
        return ret;
 }
 
-int load_elf(struct proc* p, struct file* f)
+int load_elf(struct proc* p, struct file* f,
+             int argc, char *argv[], int envc, char *envp[])
 {
        elf_info_t ei, interp_ei;
        if (load_one_elf(p, f, 0, &ei, FALSE))
                return -1;
 
        if (ei.dynamic) {
-               struct file *interp = do_file_open(ei.interp, 0, 0);
+               struct file *interp = do_file_open(ei.interp, O_READ, 0);
                if (!interp)
                        return -1;
-               /* Load dynamic linker one page into the address space */
-               int error = load_one_elf(p, interp, 1, &interp_ei, TRUE);
+               /* Load dynamic linker at 1M. Obvious MIB joke avoided.
+                * It used to be loaded at page 1, but the existence of valid addresses
+                * that low masked bad derefs through NULL pointer structs. This in turn
+                * helped us waste a full day debugging a bug in the Go runtime. True!
+                * Note that MMAP_LOWEST_VA also has this value but we want to make this
+                * explicit. */
+               int error = load_one_elf(p, interp, MMAP_LD_FIXED_VA >> PGSHIFT,
+                                        &interp_ei, TRUE);
                kref_put(&interp->f_kref);
                if (error)
                        return -1;
        }
 
-       // fill in auxiliary info for dynamic linker/runtime
-       elf_aux_t auxp[] = {{ELF_AUX_PHDR, ei.phdr},
+       /* Set up the auxiliary info for dynamic linker/runtime */
+       elf_aux_t auxv[] = {{ELF_AUX_PHDR, ei.phdr},
                            {ELF_AUX_PHENT, sizeof(proghdr32_t)},
                            {ELF_AUX_PHNUM, ei.phnum},
-                           {ELF_AUX_ENTRY, ei.entry},
-                           #ifdef __sparc_v8__
-                           {ELF_AUX_HWCAP, ELF_HWCAP_SPARC_FLUSH},
-                           #endif
-                           {0, 0}};
-
-       // put auxp after argv, envp in procinfo
-       int auxp_pos = -1;
-       for (int i = 0, zeros = 0; i < PROCINFO_MAX_ARGP; i++)
-               if (p->procinfo->argp[i] == NULL)
-                       if (++zeros == 2)
-                               auxp_pos = i + 1;
-       if (auxp_pos == -1 ||
-           auxp_pos + sizeof(auxp) / sizeof(char*) >= PROCINFO_MAX_ARGP)
+                           {ELF_AUX_ENTRY, ei.entry}};
+       int auxc = sizeof(auxv)/sizeof(auxv[0]);
+
+       /* Populate the stack with the required info. */
+       uintptr_t stack_top = populate_stack(p, argc, argv, envc, envp, auxc, auxv);
+       if (!stack_top)
                return -1;
-       memcpy(p->procinfo->argp+auxp_pos,auxp,sizeof(auxp));
 
+       /* Initialize the process as an SCP. */
        uintptr_t core0_entry = ei.dynamic ? interp_ei.entry : ei.entry;
-       proc_init_trapframe(&p->env_tf,0,core0_entry,USTACKTOP);
-       p->env_entry = ei.entry;
+       proc_init_ctx(&p->scp_ctx, 0, core0_entry, stack_top, 0);
 
-       int flags = MAP_FIXED | MAP_ANONYMOUS;
-       #ifdef __sparc_v8__
-       flags |= MAP_POPULATE; // SPARC stacks must be mapped in
-       #endif
-       uintptr_t stacksz = USTACK_NUM_PAGES*PGSIZE;
-       if (do_mmap(p, USTACKTOP-stacksz, stacksz, PROT_READ | PROT_WRITE,
-                   flags, NULL, 0) == MAP_FAILED)
-               return -1;
-
-       // Set the heap bottom and top to just past where the text 
-       // region has been loaded
-       p->heap_top = (void*)ei.highest_addr;
-       p->procinfo->heap_bottom = p->heap_top;
+       p->procinfo->program_end = ei.highest_addr;
+       p->args_base = (void *) stack_top;
 
        return 0;
 }
 
+ssize_t get_startup_argc(struct proc *p)
+{
+       const char *sptr = (const char *) p->args_base;
+       ssize_t argc = 0;
+
+       /* TODO,DL: Use copy_from_user() when available.
+        */
+       if (memcpy_from_user(p, &argc, sptr, sizeof(size_t)))
+               return -1;
+
+       return argc;
+}
+
+char *get_startup_argv(struct proc *p, size_t idx, char *argp,
+                                          size_t max_size)
+{
+       size_t stack_space = (const char *) USTACKTOP - (const char *) p->args_base;
+       const char *sptr = (const char *) p->args_base + sizeof(size_t) +
+               idx * sizeof(char *);
+       const char *argv = NULL;
+
+       /* TODO,DL: Use copy_from_user() when available.
+        */
+       if (memcpy_from_user(p, &argv, sptr, sizeof(char *)))
+               return NULL;
+
+       /* TODO,DL: Use strncpy_from_user() when available.
+        */
+       max_size = MIN(max_size, stack_space);
+       if (memcpy_from_user(p, argp, argv, max_size))
+               return NULL;
+       argp[max_size - 1] = 0;
+
+       return argp;
+}
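
These two helpers give later kernel code a way to read the arguments back out of the user stack it just built.  A minimal sketch of a hypothetical caller, e.g. for a debug printout (the buffer size and loop are illustrative assumptions, not an API defined by this commit):

	/* Sketch only: dump the new process's startup arguments, assuming the
	 * same kernel context as elf.c. */
	static void print_startup_args(struct proc *p)
	{
		char buf[128];
		ssize_t argc = get_startup_argc(p);

		if (argc <= 0)
			return;
		for (ssize_t i = 0; i < argc; i++) {
			if (!get_startup_argv(p, i, buf, sizeof(buf)))
				break;
			printk("[kernel] startup arg %d: %s\n", (int)i, buf);
		}
	}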