Run a VM.
[akaros.git] / kern / drivers / dev / vm.c
1 //#define DEBUG
2 /* Copyright 2014 Google Inc.
3  * Copyright (c) 2013 The Regents of the University of California
4  * Barret Rhoden <brho@cs.berkeley.edu>
5  * See LICENSE for details.
6  *
7  * devvm/#V: a device for VMs
8  *
9  */
10
11 #include <kmalloc.h>
12 #include <string.h>
13 #include <stdio.h>
14 #include <assert.h>
15 #include <error.h>
16 #include <pmap.h>
17 #include <sys/queue.h>
18 #include <smp.h>
19 #include <kref.h>
20 #include <atomic.h>
21 #include <alarm.h>
22 #include <event.h>
23 #include <umem.h>
24 #include <devalarm.h>
25 #include <arch/types.h>
26 #include <arch/vm.h>
27 #include <arch/emulate.h>
28 #include <arch/vmdebug.h>
29
/* Qid path types: which file of the #V tree a qid names.  TYPE() pulls this
 * value out of the low ID_SHIFT bits of qid.path, so every value here has to
 * stay below (1 << ID_SHIFT). */
enum {
	Qtopdir = 1,	/* the top-level directory */
	Qclone = 2,	/* "clone": opening it makes a new VM */
	Qstat = 3,	/* "stat": reads back the number of VMs */
	Qvmdir = 4,	/* a per-VM directory, named "vm<id>" */
	Qctl = 5,	/* a VM's "ctl" file */
	Qimage = 6,	/* a VM's "image" file */
};
39
/* A qid path packs two fields: the low ID_SHIFT bits carry the file TYPE (one
 * of the Q* values) and the bits above them carry the index into the vms
 * array.  We reserve the right to turn that index into an id later.
 */
#define ID_SHIFT 5
44 /* vm's have an image.
45  * Note that the image can be read even as it is running. */
46 struct vm {
47         struct kref kref;
48         /* should this be an array of pages? Hmm. */
49         void *image;
50         unsigned long imagesize;
51         int id; // not used yet. 
52         struct litevm *archvm;
53 };
54
55 static spinlock_t vmlock;
56 /* array, not linked list. We expect few, might as well be cache friendly. */
57 static struct vm *vms = NULL;
58 static int nvm = 0;
59 static int vmok = 0;
60
61 static spinlock_t vmidlock[1];
62 static struct kref vmid[1] = { {(void *)1, fake_release} };
63
64 static inline struct vm *
65 QID2VM(struct qid q)
66 {
67         return &vms[((q).path >> ID_SHIFT)];
68 }
69
70 static inline int 
71 TYPE(struct qid q)
72 {
73         return ((q).path & ((1 << ID_SHIFT) - 1));
74 }
75
76 static inline int QID(int index, int type)
77 {
78         return ((index << ID_SHIFT) | type);
79 }
80
81 /* we'll need this somewhere more generic. */
82 static void readn(struct chan *c, void *vp, long n)
83 {
84         //print_func_entry();
85         char *p;
86         long nn;
87         int total = 0, want = n;
88
89         p = vp;
90         while (n > 0) {
91                 nn = devtab[c->type].read(c, p, n, c->offset);
92                 printk("readn: Got %d@%lld\n", nn, c->offset);
93                 if (nn == 0)
94                         error("%s: wanted %d, got %d", Eshort, want, total);
95                 c->offset += nn;
96                 p += nn;
97                 n -= nn;
98                 total += nn;
99         }
100         //print_func_exit();
101 }
102
103 /* not called yet.  -- we have to unlink the vm */
104 static void vm_release(struct kref *kref)
105 {
106         //print_func_entry();
107         struct vm *v = container_of(kref, struct vm, kref);
108         spin_lock_irqsave(&vmlock);
109         /* cute trick. Save the last element of the array in place of the
110          * one we're deleting. Reduce nvm. Don't realloc; that way, next
111          * time we add a vm the allocator will just return.
112          * Well, this is stupid, because when we do this, we break
113          * the QIDs, which have pointers embedded in them.
114          * darn it, may have to use a linked list. Nope, will probably
115          * just walk the array until we find a matching id. Still ... yuck.
116          */
117         if (v != &vms[nvm - 1]) {
118                 /* free the image ... oops */
119                 /* get rid of the kref. */
120                 *v = vms[nvm - 1];
121         }
122         nvm--;
123         spin_unlock(&vmlock);
124         //print_func_exit();
125 }
126
127 /* VM ids run in the range 1..infinity. But vmx.c wants them
128  * 0-based.
129  */
130 static int newvmid(void)
131 {
132         //print_func_entry();
133         int id;
134         spin_lock_irqsave(vmidlock);
135         id = kref_refcnt(vmid);
136         kref_get(vmid, 1);
137         spin_unlock(vmidlock);
138         //print_func_exit();
139         return id - 1;
140 }
141
142 static int vmgen(struct chan *c, char *entry_name,
143                                  struct dirtab *unused, int unused_nr_dirtab,
144                                  int s, struct dir *dp)
145 {
146         //print_func_entry();
147         struct qid q;
148         struct vm *vm_i;
149         printd("GEN s %d\n", s);
150         /* Whether we're in one dir or at the top, .. still takes us to the top. */
151         if (s == DEVDOTDOT) {
152                 mkqid(&q, Qtopdir, 0, QTDIR);
153                 devdir(c, c->qid, "#V", 0, eve, 0555, dp);
154                 //print_func_exit();
155                 return 1;
156         }
157         printd("TYPE %d\n", TYPE(c->qid));
158         switch (TYPE(c->qid)) {
159                 case Qtopdir:
160                         printd("Qtopdir s %d nvm %d\n", s, nvm);
161                         /* Generate elements for the top level dir.  We support clone, stat,
162                          * vm dirs at the top level */
163                         if (s == 0) {
164                                 mkqid(&q, Qclone, 0, QTFILE);
165                                 devdir(c, q, "clone", 0, eve, 0666, dp);
166                                 //print_func_exit();
167                                 return 1;
168                         }
169                         s--;
170                         if (s == 0) {
171                                 mkqid(&q, Qstat, 0, QTFILE);
172                                 devdir(c, q, "stat", 0, eve, 0666, dp);
173                                 //print_func_exit();
174                                 return 1;
175                         }
176                         s--;    /* 1 -> 0th element, 2 -> 1st element, etc */
177                         spin_lock_irqsave(&vmlock);
178                         if (s >= nvm) {
179                                 printd("DONE qtopdir\n");
180                                 spin_unlock(&vmlock);
181                                 //print_func_exit();
182                                 return -1;
183                         }
184                         vm_i = &vms[s];
185                         snprintf(get_cur_genbuf(), GENBUF_SZ, "vm%d", vm_i->id);
186                         spin_unlock(&vmlock);
187                         mkqid(&q, QID(s, Qvmdir), 0, QTDIR);
188                         devdir(c, q, get_cur_genbuf(), 0, eve, 0555, dp);
189                         //print_func_exit();
190                         return 1;
191                 case Qvmdir:
192                         /* Gen the contents of the vm dirs */
193                         s += Qctl;      /* first time through, start on Qctl */
194                         switch (s) {
195                                 case Qctl:
196                                         mkqid(&q, QID(s-Qctl, Qctl), 0, QTFILE);
197                                         devdir(c, q, "ctl", 0, eve, 0666, dp);
198                                         //print_func_exit();
199                                         return 1;
200                                 case Qimage:
201                                         mkqid(&q, QID(s-Qctl, Qimage), 0, QTFILE);
202                                         devdir(c, q, "image", 0, eve, 0666, dp);
203                                         //print_func_exit();
204                                         return 1;
205                         }
206                         //print_func_exit();
207                         return -1;
208                         /* Need to also provide a direct hit for Qclone and all other files (at
209                          * all levels of the hierarchy).  Every file is both
210                          * generated (via the s increments in their respective directories) and
211                          * directly gen-able.  devstat() will call gen with a specific path in
212                          * the qid.  In these cases, we make a dir for whatever they are asking
213                          * for.  Note the qid stays the same.  I think this is what the old
214                          * plan9 comments above devgen were talking about for (ii).
215                          *
216                          * We don't need to do this for the directories - devstat will look for
217                          * the a directory by path and fail.  Then it will manually build the
218                          * stat output (check the -1 case in devstat). */
219                 case Qclone:
220                         devdir(c, c->qid, "clone", 0, eve, 0666, dp);
221                         //print_func_exit();
222                         return 1;
223                 case Qstat:
224                         devdir(c, c->qid, "stat", 0, eve, 0444, dp);
225                         //print_func_exit();
226                         return 1;
227                 case Qctl:
228                         devdir(c, c->qid, "ctl", 0, eve, 0666, dp);
229                         //print_func_exit();
230                         return 1;
231                 case Qimage:
232                         devdir(c, c->qid, "image", 0, eve, 0666, dp);
233                         //print_func_exit();
234                         return 1;
235         }
236         //print_func_exit();
237         return -1;
238 }
239
240 static void vminit(void)
241 {
242         //print_func_entry();
243         int i;
244         spinlock_init_irqsave(&vmlock);
245         spinlock_init_irqsave(vmidlock);
246         i = vmx_init();
247         if (i == 0)
248                 vmok = 1;
249         printk("vminit: litevm_init returns %d\n", i);
250
251         //print_func_exit();
252 }
253
254 static struct chan *vmattach(char *spec)
255 {
256         //print_func_entry();
257         if (!vmok)
258                 error("No VMs available");
259         struct chan *c = devattach('V', spec);
260         mkqid(&c->qid, Qtopdir, 0, QTDIR);
261         //print_func_exit();
262         return c;
263 }
264
265 static struct walkqid *vmwalk(struct chan *c, struct chan *nc, char **name,
266                                                           int nname)
267 {
268         //print_func_entry();
269         //print_func_exit();
270         return devwalk(c, nc, name, nname, 0, 0, vmgen);
271 }
272
273 static int vmstat(struct chan *c, uint8_t * db, int n)
274 {
275         //print_func_entry();
276         //print_func_exit();
277         return devstat(c, db, n, 0, 0, vmgen);
278 }
279
280 /* It shouldn't matter if p = current is DYING.  We'll eventually fail to insert
281  * the open chan into p's fd table, then decref the chan. */
282 static struct chan *vmopen(struct chan *c, int omode)
283 {
284         //print_func_entry();
285         ERRSTACK(1);
286         struct vm *v = QID2VM(c->qid);
287         printk("vmopen: v is %p\n", v);
288         if (waserror()) {
289                 nexterror();
290         }
291         switch (TYPE(c->qid)) {
292                 case Qtopdir:
293                 case Qvmdir:
294                         if (omode & ORCLOSE)
295                                 error(Eperm);
296                         if (!IS_RDONLY(omode))
297                                 error(Eisdir);
298                         break;
299                 case Qclone:
300                         spin_lock_irqsave(&vmlock);
301                         vms = krealloc(vms, sizeof(vms[0]) * (nvm + 1), 0);
302                         v = &vms[nvm];
303                         nvm++;
304                         spin_unlock(&vmlock);
305                         kref_init(&v->kref, vm_release, 1);
306                         v->id = newvmid();
307                         mkqid(&c->qid, QID(nvm, Qctl), 0, QTFILE);
308                         c->aux = v;
309                         printd("New VM id %d\n", v->id);
310                         v->archvm = vmx_open();
311                         if (!v->archvm) {
312                                 printk("vm_open failed\n");
313                                 error("vm_open failed");
314                         }
315                         if (vmx_create_vcpu(v->archvm, v->id) < 0) {
316                                 printk("vm_create failed");
317                                 error("vm_create failed");
318                         }
319                         printk("Qclone open: id %d, v is %p, v->archvm is %p\n", 
320                                         nvm-1,
321                                         v, v->archvm);
322                         break;
323                 case Qstat:
324                         break;
325                 case Qctl:
326                 case Qimage:
327                         c->aux = QID2VM(c->qid);
328                         printk("open qctl: aux (vm) is %p\n", c->aux);
329                         break;
330         }
331         c->mode = openmode(omode);
332         /* Assumes c is unique (can't be closed concurrently */
333         c->flag |= COPEN;
334         c->offset = 0;
335         poperror();
336         //print_func_exit();
337         return c;
338 }
339
340 static void vmcreate(struct chan *c, char *name, int omode, uint32_t perm)
341 {
342         //print_func_entry();
343         error(Eperm);
344         //print_func_exit();
345 }
346
347 static void vmremove(struct chan *c)
348 {
349         //print_func_entry();
350         error(Eperm);
351         //print_func_exit();
352 }
353
/* Wstat hook: not supported, always throws.  The trailing return only
 * satisfies the signature. */
static int vmwstat(struct chan *c, uint8_t * dp, int n)
{
	error("No vmwstat");

	return 0;
}
361
362 static void vmclose(struct chan *c)
363 {
364         //print_func_entry();
365         struct vm *v = c->aux;
366         if (!v) {
367                 //print_func_exit();
368                 return;
369         }
370         /* There are more closes than opens.  For instance, sysstat doesn't open,
371          * but it will close the chan it got from namec.  We only want to clean
372          * up/decref chans that were actually open. */
373         if (!(c->flag & COPEN)) {
374                 //print_func_exit();
375                 return;
376         }
377         switch (TYPE(c->qid)) {
378                         /* for now, leave the VM active even when we close ctl */
379                 case Qctl:
380                         break;
381                 case Qimage:
382                         kref_put(&v->kref);
383                         break;
384         }
385         //print_func_exit();
386 }
387
388 static long vmread(struct chan *c, void *ubuf, long n, int64_t offset)
389 {
390         //print_func_entry();
391         struct vm *v = c->aux;
392         printd("VMREAD\n");
393         switch (TYPE(c->qid)) {
394                 case Qtopdir:
395                 case Qvmdir:
396                         //print_func_exit();
397                         return devdirread(c, ubuf, n, 0, 0, vmgen);
398                 case Qstat:
399                         //print_func_exit();
400                         return readnum(offset, ubuf, n, nvm, NUMSIZE32);
401                 case Qctl:
402                         assert(v);
403                         //print_func_exit();
404                         return readnum(offset, ubuf, n, v->id, NUMSIZE32);
405                 case Qimage:
406                         assert(v);
407                         //print_func_exit();
408                         return readmem(offset, ubuf, n, v->image, v->imagesize);
409                 default:
410                         panic("Bad QID %p in devvm", c->qid.path);
411         }
412         //print_func_exit();
413         return 0;
414 }
415
416 static long vmwrite(struct chan *c, void *ubuf, long n, int64_t unused)
417 {
418         //print_func_entry();
419         ERRSTACK(3);
420         char buf[32];
421         struct cmdbuf *cb;
422         struct vm *vm;
423         struct litevm *litevm;
424         uint64_t hexval;
425         printd("vmwrite(%p, %p, %d)\n", c, ubuf, n);
426         switch (TYPE(c->qid)) {
427                 case Qtopdir:
428                 case Qvmdir:
429                 case Qstat:
430                         error(Eperm);
431                 case Qctl:
432                         vm = c->aux;
433                         litevm = vm->archvm;
434                         printk("qctl: vm is %p, litevm is %p\n", vm, litevm);
435                         cb = parsecmd(ubuf, n);
436                         if (waserror()) {
437                                 kfree(cb);
438                                 nexterror();
439                         }
440                         if (!strcmp(cb->f[0], "run")) {
441                                 int ret;
442                                 if (cb->nf != 4)
443                                         error("usage: run vcpu emulated mmio_completed");
444                                 struct litevm_run vmr;
445                                 vmr.vcpu = strtoul(cb->f[1], NULL, 0);
446                                 vmr.emulated = strtoul(cb->f[2], NULL, 0);
447                                 vmr.mmio_completed = strtoul(cb->f[3], NULL, 0);
448                                 ret = vm_run(litevm, &vmr);
449                                 printk("vm_run returns %d\n", ret);
450                                 //print_func_exit();
451                                 return ret;
452                         } else if (!strcmp(cb->f[0], "stop")) {
453                                 error("can't stop a vm yet");
454                         } else if (!strcmp(cb->f[0], "mapmem")) {
455                                 struct chan *file;
456                                 void *v;
457                                 vm = c->aux;
458                                 uint64_t filesize;
459                                 struct litevm_memory_region vmr;
460                                 int got;
461
462                                 if (cb->nf != 6)
463                                         error("usage: mapmem file slot flags addr size");
464                                 vmr.slot = strtoul(cb->f[2], NULL, 0);
465                                 vmr.flags = strtoul(cb->f[3], NULL, 0);
466                                 vmr.guest_phys_addr = strtoul(cb->f[4], NULL, 0);
467                                 filesize = strtoul(cb->f[5], NULL, 0);
468                                 vmr.memory_size = (filesize + 4095) & ~4095ULL;
469
470                                 file = namec(cb->f[1], Aopen, OREAD, 0);
471                                 printk("after namec file is %p\n", file);
472                                 if (waserror()) {
473                                         printk("File open, alloc bad\n");
474                                         cclose(file);
475                                         nexterror();
476                                 }
477                                 /* at some point we want to mmap from the kernel
478                                  * but we don't have that yet. This all needs
479                                  * rethinking but the abstractions of kvm do too.
480                                  */
481                                 v = kmalloc(vmr.memory_size, KMALLOC_WAIT);
482                                 if (waserror()) {
483                                         printk("memory allocated, read bad %s\n", 
484                                                 current_errstr());
485                                         kfree(v);
486                                         nexterror();
487                                 }
488
489                                 readn(file, v, filesize);
490                                 vmr.init_data = v;
491
492                                 if (vm_set_memory_region(litevm, &vmr))
493                                         error("vm_set_memory_region failed");
494                                 void monitor(void *);
495                                 monitor(NULL);
496                                 poperror();
497                                 poperror();
498                                 kfree(v);
499                                 cclose(file);
500
501                         } else if (!strcmp(cb->f[0], "region")) {
502                                 void *v;
503                                 struct litevm_memory_region vmr;
504                                 if (cb->nf != 5)
505                                         error("usage: mapmem slot flags addr size");
506                                 vmr.slot = strtoul(cb->f[1], NULL, 0);
507                                 vmr.flags = strtoul(cb->f[2], NULL, 0);
508                                 vmr.guest_phys_addr = strtoul(cb->f[3], NULL, 0);
509                                 vmr.memory_size = strtoul(cb->f[4], NULL, 0);
510                                 vmr.init_data = NULL;
511                                 if (vm_set_memory_region(litevm, &vmr))
512                                         error("vm_set_memory_region failed");
513                         } else {
514                                 error("%s: not implemented", cb->f[0]);
515                         }
516                         kfree(cb);
517                         poperror();
518                         break;
519                 case Qimage:
520                         error("can't write an image yet");
521                         break;
522                 default:
523                         panic("Bad QID %p in devvm", c->qid.path);
524         }
525         //print_func_exit();
526         return n;
527 }
528
529 struct dev vmdevtab __devtab = {
530         'V',
531         "vm",
532
533         devreset,
534         vminit,
535         devshutdown,
536         vmattach,
537         vmwalk,
538         vmstat,
539         vmopen,
540         vmcreate,
541         vmclose,
542         vmread,
543         devbread,
544         vmwrite,
545         devbwrite,
546         vmremove,
547         vmwstat,
548         devpower,
549 //  devconfig,
550         devchaninfo,
551 };