Fix vm code
[akaros.git] / kern / drivers / dev / vm.c
1 //#define DEBUG
2 /* Copyright 2014 Google Inc.
3  * Copyright (c) 2013 The Regents of the University of California
4  * Barret Rhoden <brho@cs.berkeley.edu>
5  * See LICENSE for details.
6  *
7  * devvm/#V: a device for VMs
8  *
9  */
10
11 #include <kmalloc.h>
12 #include <string.h>
13 #include <stdio.h>
14 #include <assert.h>
15 #include <error.h>
16 #include <pmap.h>
17 #include <sys/queue.h>
18 #include <smp.h>
19 #include <kref.h>
20 #include <atomic.h>
21 #include <alarm.h>
22 #include <event.h>
23 #include <umem.h>
24 #include <devalarm.h>
25 #include <arch/types.h>
26 #include <arch/vm.h>
27 #include <arch/emulate.h>
28 #include <arch/vmdebug.h>
29
/* Qid path types.  TYPE() extracts one of these values from the low bits of a
 * qid's path.  Qctl and Qimage must stay consecutive: vmgen() walks the
 * contents of a vm directory by counting up from Qctl. */
enum {
	Qtopdir = 1,
	Qclone = 2,
	Qstat = 3,
	Qvmdir = 4,
	Qctl = 5,
	Qimage = 6,
};
39
/* Qid paths for per-VM files pack two things into one 64-bit value: the
 * physical address of the struct vm (shifted left by ADDR_SHIFT) and the qid
 * type (one of the Q* values) in the low ADDR_SHIFT bits.
 *
 * This paddr/kaddr trick is a bit dangerous.  It only works so long as we don't
 * need all 64 bits for a physical address (48 is the current norm on x86_64),
 * since the shift throws away the top ADDR_SHIFT bits.  We're probably going to
 * move to a model where we put the VM index or something into the qid, but
 * this works for now.
 *
 *   QID2VM(q)      - recover the struct vm pointer from a qid
 *   TYPE(q)        - recover the Q* type from a qid
 *   QID(ptr, type) - build a qid path from a struct vm pointer and a Q* type
 *
 * Every macro argument is parenthesized so that compound expressions (e.g. a
 * ternary passed as 'type') expand with the intended precedence. */
#define ADDR_SHIFT 5
#define QID2VM(q) ((struct vm *)KADDR((q).path >> ADDR_SHIFT))
#define TYPE(q) ((q).path & ((1 << ADDR_SHIFT) - 1))
#define QID(ptr, type) ((PADDR(ptr) << ADDR_SHIFT) | (type))
49
50 /* vm's have an image.
51  * Note that the image can be read even as it is running. */
52 struct vm {
53         struct vm *next;
54         struct kref kref;
55         /* should this be an array of pages? Hmm. */
56         void *image;
57         unsigned long imagesize;
58         int id;
59         struct litevm *archvm;
60 };
61
62 static spinlock_t vmlock;
63 /* array, not linked list. We expect few, might as well be cache friendly. */
64 static struct vm *vms = NULL;
65 static int nvm = 0;
66 static int vmok = 0;
67
68 static spinlock_t vmidlock[1];
69 static struct kref vmid[1] = { {(void *)1, fake_release} };
70
71 /* we'll need this somewhere more generic. */
72 static void readn(struct chan *c, void *vp, long n)
73 {
74         print_func_entry();
75         char *p;
76         long nn;
77         int total = 0, want = n;
78
79         p = vp;
80         while (n > 0) {
81                 nn = devtab[c->type].read(c, p, n, c->offset);
82                 printk("readn: Got %d@%lld\n", nn, c->offset);
83                 if (nn == 0)
84                         error("%s: wanted %d, got %d", Eshort, total, want);
85                 c->offset += nn;
86                 p += nn;
87                 n -= nn;
88                 total += nn;
89         }
90         print_func_exit();
91 }
92
93 static void vm_release(struct kref *kref)
94 {
95         print_func_entry();
96         struct vm *v = container_of(kref, struct vm, kref);
97         spin_lock_irqsave(&vmlock);
98         /* cute trick. Save the last element of the array in place of the
99          * one we're deleting. Reduce nvm. Don't realloc; that way, next
100          * time we add a vm the allocator will just return.
101          * Well, this is stupid, because when we do this, we break
102          * the QIDs, which have pointers embedded in them.
103          * darn it, may have to use a linked list. Nope, will probably
104          * just walk the array until we find a matching id. Still ... yuck.
105          */
106         if (v != &vms[nvm - 1]) {
107                 /* free the image ... oops */
108                 /* get rid of the kref. */
109                 *v = vms[nvm - 1];
110         }
111         nvm--;
112         spin_unlock(&vmlock);
113         print_func_exit();
114 }
115
116 /* VM ids run in the range 1..infinity. But vmx.c wants them
117  * 0-based.
118  */
119 static int newvmid(void)
120 {
121         print_func_entry();
122         int id;
123         spin_lock_irqsave(vmidlock);
124         id = kref_refcnt(vmid);
125         kref_get(vmid, 1);
126         spin_unlock(vmidlock);
127         print_func_exit();
128         return id - 1;
129 }
130
131 static int vmgen(struct chan *c, char *entry_name,
132                                  struct dirtab *unused, int unused_nr_dirtab,
133                                  int s, struct dir *dp)
134 {
135         print_func_entry();
136         struct qid q;
137         struct vm *vm_i;
138         printd("GEN s %d\n", s);
139         /* Whether we're in one dir or at the top, .. still takes us to the top. */
140         if (s == DEVDOTDOT) {
141                 mkqid(&q, Qtopdir, 0, QTDIR);
142                 devdir(c, c->qid, "#V", 0, eve, 0555, dp);
143                 print_func_exit();
144                 return 1;
145         }
146         printd("TYPE %d\n", TYPE(c->qid));
147         switch (TYPE(c->qid)) {
148                 case Qtopdir:
149                         printd("Qtopdir s %d nvm %d\n", s, nvm);
150                         /* Generate elements for the top level dir.  We support clone, stat,
151                          * vm dirs at the top level */
152                         if (s == 0) {
153                                 mkqid(&q, Qclone, 0, QTFILE);
154                                 devdir(c, q, "clone", 0, eve, 0666, dp);
155                                 print_func_exit();
156                                 return 1;
157                         }
158                         s--;
159                         if (s == 0) {
160                                 mkqid(&q, Qstat, 0, QTFILE);
161                                 devdir(c, q, "stat", 0, eve, 0666, dp);
162                                 print_func_exit();
163                                 return 1;
164                         }
165                         s--;    /* 1 -> 0th element, 2 -> 1st element, etc */
166                         spin_lock_irqsave(&vmlock);
167                         if (s >= nvm) {
168                                 printd("DONE qtopdir\n");
169                                 spin_unlock(&vmlock);
170                                 print_func_exit();
171                                 return -1;
172                         }
173                         vm_i = &vms[s];
174                         snprintf(get_cur_genbuf(), GENBUF_SZ, "vm%d", vm_i->id);
175                         spin_unlock(&vmlock);
176                         mkqid(&q, QID(vm_i, Qvmdir), 0, QTDIR);
177                         devdir(c, q, get_cur_genbuf(), 0, eve, 0555, dp);
178                         print_func_exit();
179                         return 1;
180                 case Qvmdir:
181                         /* Gen the contents of the vm dirs */
182                         s += Qctl;      /* first time through, start on Qctl */
183                         switch (s) {
184                                 case Qctl:
185                                         mkqid(&q, QID(QID2VM(c->qid), Qctl), 0, QTFILE);
186                                         devdir(c, q, "ctl", 0, eve, 0666, dp);
187                                         print_func_exit();
188                                         return 1;
189                                 case Qimage:
190                                         mkqid(&q, QID(QID2VM(c->qid), Qimage), 0, QTFILE);
191                                         devdir(c, q, "image", 0, eve, 0666, dp);
192                                         print_func_exit();
193                                         return 1;
194                         }
195                         print_func_exit();
196                         return -1;
197                         /* Need to also provide a direct hit for Qclone and all other files (at
198                          * all levels of the hierarchy).  Every file is both
199                          * generated (via the s increments in their respective directories) and
200                          * directly gen-able.  devstat() will call gen with a specific path in
201                          * the qid.  In these cases, we make a dir for whatever they are asking
202                          * for.  Note the qid stays the same.  I think this is what the old
203                          * plan9 comments above devgen were talking about for (ii).
204                          *
205                          * We don't need to do this for the directories - devstat will look for
206                          * the a directory by path and fail.  Then it will manually build the
207                          * stat output (check the -1 case in devstat). */
208                 case Qclone:
209                         devdir(c, c->qid, "clone", 0, eve, 0666, dp);
210                         print_func_exit();
211                         return 1;
212                 case Qstat:
213                         devdir(c, c->qid, "stat", 0, eve, 0444, dp);
214                         print_func_exit();
215                         return 1;
216                 case Qctl:
217                         devdir(c, c->qid, "ctl", 0, eve, 0666, dp);
218                         print_func_exit();
219                         return 1;
220                 case Qimage:
221                         devdir(c, c->qid, "image", 0, eve, 0666, dp);
222                         print_func_exit();
223                         return 1;
224         }
225         print_func_exit();
226         return -1;
227 }
228
229 static void vminit(void)
230 {
231         print_func_entry();
232         int i;
233         spinlock_init_irqsave(&vmlock);
234         spinlock_init_irqsave(vmidlock);
235         i = vmx_init();
236         if (i == 0)
237                 vmok = 1;
238         printk("vminit: litevm_init returns %d\n", i);
239
240         print_func_exit();
241 }
242
243 static struct chan *vmattach(char *spec)
244 {
245         print_func_entry();
246         if (!vmok)
247                 error("No VMs available");
248         struct chan *c = devattach('V', spec);
249         mkqid(&c->qid, Qtopdir, 0, QTDIR);
250         print_func_exit();
251         return c;
252 }
253
254 static struct walkqid *vmwalk(struct chan *c, struct chan *nc, char **name,
255                                                           int nname)
256 {
257         print_func_entry();
258         print_func_exit();
259         return devwalk(c, nc, name, nname, 0, 0, vmgen);
260 }
261
262 static int vmstat(struct chan *c, uint8_t * db, int n)
263 {
264         print_func_entry();
265         print_func_exit();
266         return devstat(c, db, n, 0, 0, vmgen);
267 }
268
269 /* It shouldn't matter if p = current is DYING.  We'll eventually fail to insert
270  * the open chan into p's fd table, then decref the chan. */
271 static struct chan *vmopen(struct chan *c, int omode)
272 {
273         print_func_entry();
274         ERRSTACK(1);
275         struct vm *v = QID2VM(c->qid);
276         printk("vmopen: v is %p\n", v);
277         if (waserror()) {
278                 nexterror();
279         }
280         switch (TYPE(c->qid)) {
281                 case Qtopdir:
282                 case Qvmdir:
283                         if (omode & ORCLOSE)
284                                 error(Eperm);
285                         if (!IS_RDONLY(omode))
286                                 error(Eisdir);
287                         break;
288                 case Qclone:
289                         spin_lock_irqsave(&vmlock);
290                         vms = krealloc(vms, sizeof(vms[0]) * (nvm + 1), 0);
291                         v = &vms[nvm];
292                         nvm++;
293                         spin_unlock(&vmlock);
294                         kref_init(&v->kref, vm_release, 1);
295                         v->id = newvmid();
296                         mkqid(&c->qid, QID(v, Qctl), 0, QTFILE);
297                         c->aux = v;
298                         printd("New VM id %d\n", v->id);
299                         v->archvm = vmx_open();
300                         if (!v->archvm) {
301                                 printk("vm_open failed\n");
302                                 error("vm_open failed");
303                         }
304                         if (vmx_create_vcpu(v->archvm, v->id) < 0) {
305                                 printk("vm_create failed");
306                                 error("vm_create failed");
307                         }
308                         break;
309                 case Qstat:
310                         break;
311                 case Qctl:
312                 case Qimage:
313                         c->aux = QID2VM(c->qid);
314                         printk("open qctl: aux is %p\n", c->aux);
315                         break;
316         }
317         c->mode = openmode(omode);
318         /* Assumes c is unique (can't be closed concurrently */
319         c->flag |= COPEN;
320         c->offset = 0;
321         poperror();
322         print_func_exit();
323         return c;
324 }
325
326 static void vmcreate(struct chan *c, char *name, int omode, uint32_t perm)
327 {
328         print_func_entry();
329         error(Eperm);
330         print_func_exit();
331 }
332
333 static void vmremove(struct chan *c)
334 {
335         print_func_entry();
336         error(Eperm);
337         print_func_exit();
338 }
339
/* Wstat op: not supported; always throws "No vmwstat". */
static int vmwstat(struct chan *c, uint8_t *dp, int n)
{
	print_func_entry();
	error("No vmwstat");
	print_func_exit();
	return 0;
}
347
348 static void vmclose(struct chan *c)
349 {
350         print_func_entry();
351         struct vm *v = c->aux;
352         if (!v) {
353                 print_func_exit();
354                 return;
355         }
356         /* There are more closes than opens.  For instance, sysstat doesn't open,
357          * but it will close the chan it got from namec.  We only want to clean
358          * up/decref chans that were actually open. */
359         if (!(c->flag & COPEN)) {
360                 print_func_exit();
361                 return;
362         }
363         switch (TYPE(c->qid)) {
364                         /* for now, leave the VM active even when we close ctl */
365                 case Qctl:
366                         break;
367                 case Qimage:
368                         kref_put(&v->kref);
369                         break;
370         }
371         print_func_exit();
372 }
373
374 static long vmread(struct chan *c, void *ubuf, long n, int64_t offset)
375 {
376         print_func_entry();
377         struct vm *v = c->aux;
378         printd("VMREAD\n");
379         switch (TYPE(c->qid)) {
380                 case Qtopdir:
381                 case Qvmdir:
382                         print_func_exit();
383                         return devdirread(c, ubuf, n, 0, 0, vmgen);
384                 case Qstat:
385                         print_func_exit();
386                         return readnum(offset, ubuf, n, nvm, NUMSIZE32);
387                 case Qctl:
388                         assert(v);
389                         print_func_exit();
390                         return readnum(offset, ubuf, n, v->id, NUMSIZE32);
391                 case Qimage:
392                         assert(v);
393                         print_func_exit();
394                         return readmem(offset, ubuf, n, v->image, v->imagesize);
395                 default:
396                         panic("Bad QID %p in devvm", c->qid.path);
397         }
398         print_func_exit();
399         return 0;
400 }
401
402 static long vmwrite(struct chan *c, void *ubuf, long n, int64_t unused)
403 {
404         print_func_entry();
405         ERRSTACK(3);
406         char buf[32];
407         struct cmdbuf *cb;
408         struct vm *vm;
409         struct litevm *litevm;
410         uint64_t hexval;
411         printd("vmwrite(%p, %p, %d)\n", c, ubuf, n);
412         switch (TYPE(c->qid)) {
413                 case Qtopdir:
414                 case Qvmdir:
415                 case Qstat:
416                         error(Eperm);
417                 case Qctl:
418                         vm = c->aux;
419                         cb = parsecmd(ubuf, n);
420                         if (waserror()) {
421                                 kfree(cb);
422                                 nexterror();
423                         }
424                         if (!strcmp(cb->f[0], "run")) {
425                                 int ret;
426                                 if (cb->nf != 4)
427                                         error("usage: run vcpu emulated mmio_completed");
428                                 litevm = vm->archvm;
429                                 struct litevm_run vmr;
430                                 vmr.vcpu = strtoul(cb->f[1], NULL, 0);
431                                 vmr.emulated = strtoul(cb->f[2], NULL, 0);
432                                 vmr.mmio_completed = strtoul(cb->f[3], NULL, 0);
433                                 ret = vm_run(litevm, &vmr);
434                                 printk("vm_run returns %d\n", ret);
435                                 print_func_exit();
436                                 return ret;
437                         } else if (!strcmp(cb->f[0], "stop")) {
438                                 error("can't stop a vm yet");
439                         } else if (!strcmp(cb->f[0], "fillmem")) {
440                                 struct chan *file;
441                                 void *v;
442                                 vm = c->aux;
443                                 litevm = vm->archvm;
444                                 uint64_t filesize;
445                                 struct litevm_memory_region vmr;
446                                 int got;
447
448                                 if (cb->nf != 6)
449                                         error("usage: mapmem file slot flags addr size");
450                                 vmr.slot = strtoul(cb->f[2], NULL, 0);
451                                 vmr.flags = strtoul(cb->f[3], NULL, 0);
452                                 vmr.guest_phys_addr = strtoul(cb->f[4], NULL, 0);
453                                 filesize = strtoul(cb->f[5], NULL, 0);
454                                 vmr.memory_size = (filesize + 4095) & ~4095ULL;
455
456                                 file = namec(cb->f[1], Aopen, OREAD, 0);
457                                 printk("after namec file is %p\n", file);
458                                 if (waserror()) {
459                                         cclose(file);
460                                         nexterror();
461                                 }
462                                 /* at some point we want to mmap from the kernel
463                                  * but we don't have that yet. This all needs
464                                  * rethinking but the abstractions of kvm do too.
465                                  */
466                                 v = kmalloc(vmr.memory_size, KMALLOC_WAIT);
467                                 if (waserror()) {
468                                         kfree(v);
469                                         nexterror();
470                                 }
471
472                                 readn(file, v, filesize);
473                                 vmr.init_data = v;
474
475                                 if (vm_set_memory_region(litevm, &vmr))
476                                         error("vm_set_memory_region failed");
477                                 poperror();
478                                 poperror();
479                                 kfree(v);
480                                 cclose(file);
481
482                         } else if (!strcmp(cb->f[0], "region")) {
483                                 void *v;
484                                 struct litevm_memory_region vmr;
485                                 litevm = vm->archvm;
486                                 if (cb->nf != 5)
487                                         error("usage: mapmem slot flags addr size");
488                                 vmr.slot = strtoul(cb->f[2], NULL, 0);
489                                 vmr.flags = strtoul(cb->f[3], NULL, 0);
490                                 vmr.guest_phys_addr = strtoul(cb->f[4], NULL, 0);
491                                 vmr.memory_size = strtoul(cb->f[5], NULL, 0);
492                                 if (vm_set_memory_region(litevm, &vmr))
493                                         error("vm_set_memory_region failed");
494                         } else {
495                                 error("%s: not implemented", cb->f[0]);
496                         }
497                         kfree(cb);
498                         poperror();
499                         break;
500                 case Qimage:
501                         error("can't write an image yet");
502                         break;
503                 default:
504                         panic("Bad QID %p in devvm", c->qid.path);
505         }
506         print_func_exit();
507         return n;
508 }
509
510 struct dev vmdevtab __devtab = {
511         'V',
512         "vm",
513
514         devreset,
515         vminit,
516         devshutdown,
517         vmattach,
518         vmwalk,
519         vmstat,
520         vmopen,
521         vmcreate,
522         vmclose,
523         vmread,
524         devbread,
525         vmwrite,
526         devbwrite,
527         vmremove,
528         vmwstat,
529         devpower,
530 //  devconfig,
531         devchaninfo,
532 };