9c7abb7310e4e8046325300b0b33ed3fcce5ae9d
[akaros.git] / kern / drivers / dev / vm.c
1 //#define DEBUG
2 /* Copyright 2014 Google Inc.
3  * Copyright (c) 2013 The Regents of the University of California
4  * Barret Rhoden <brho@cs.berkeley.edu>
5  * See LICENSE for details.
6  *
7  * devvm/#V: a device for VMs
8  *
9  */
10
11 #include <kmalloc.h>
12 #include <string.h>
13 #include <stdio.h>
14 #include <assert.h>
15 #include <error.h>
16 #include <pmap.h>
17 #include <sys/queue.h>
18 #include <smp.h>
19 #include <kref.h>
20 #include <atomic.h>
21 #include <alarm.h>
22 #include <event.h>
23 #include <umem.h>
24 #include <devalarm.h>
25 #include <arch/types.h>
26 #include <arch/vm.h>
27 #include <arch/emulate.h>
28 #include <arch/vmdebug.h>
29
/* Kinds of files served by this device.  TYPE() extracts one of these values
 * from the low bits of a qid path. */
enum {
	Qtopdir = 1,	/* top-level directory */
	Qclone = 2,	/* "clone" */
	Qstat = 3,	/* "stat" */
	Qvmdir = 4,	/* per-VM directory */
	Qctl = 5,	/* "ctl" inside a VM directory */
	Qimage = 6,	/* "image" inside a VM directory */
};
39
/* A qid path packs two things: the file TYPE in the low ID_SHIFT bits and the
 * index into the vms array in the bits above them.
 * We reserve the right to make the index an id later.
 */
#define ID_SHIFT 5
44 /* vm's have an image.
45  * Note that the image can be read even as it is running. */
46 struct vm {
47         struct kref kref;
48         /* should this be an array of pages? Hmm. */
49         void *image;
50         unsigned long imagesize;
51         int id; // not used yet. 
52         struct litevm *archvm;
53 };
54
55 static spinlock_t vmlock;
56 /* array, not linked list. We expect few, might as well be cache friendly. */
57 static struct vm *vms = NULL;
58 static int nvm = 0;
59 static int vmok = 0;
60
61 static spinlock_t vmidlock[1];
62 static struct kref vmid[1] = { {(void *)1, fake_release} };
63
/* Functions defined outside this file.  It is not clear which header should
 * carry these declarations, so they live here. */
int vmx_init(void);
struct litevm *vmx_open(void);
int vmx_create_vcpu(struct litevm *litevm, int n);
int vm_set_memory_region(struct litevm *litevm,
                         struct litevm_memory_region *mem);
int vm_run(struct litevm *litevm, struct litevm_run *litevm_run);
72
73 static inline struct vm *
74 QID2VM(struct qid q)
75 {
76         return &vms[((q).path >> ID_SHIFT)];
77 }
78
79 static inline int 
80 TYPE(struct qid q)
81 {
82         return ((q).path & ((1 << ID_SHIFT) - 1));
83 }
84
85 static inline int QID(int index, int type)
86 {
87         return ((index << ID_SHIFT) | type);
88 }
89
90 /* we'll need this somewhere more generic. */
91 static void readn(struct chan *c, void *vp, long n)
92 {
93         //print_func_entry();
94         char *p;
95         long nn;
96         int total = 0, want = n;
97
98         p = vp;
99         while (n > 0) {
100                 nn = devtab[c->type].read(c, p, n, c->offset);
101                 printk("readn: Got %d@%lld\n", nn, c->offset);
102                 if (nn == 0)
103                         error("%s: wanted %d, got %d", Eshort, want, total);
104                 c->offset += nn;
105                 p += nn;
106                 n -= nn;
107                 total += nn;
108         }
109         //print_func_exit();
110 }
111
112 /* not called yet.  -- we have to unlink the vm */
113 static void vm_release(struct kref *kref)
114 {
115         //print_func_entry();
116         struct vm *v = container_of(kref, struct vm, kref);
117         spin_lock_irqsave(&vmlock);
118         /* cute trick. Save the last element of the array in place of the
119          * one we're deleting. Reduce nvm. Don't realloc; that way, next
120          * time we add a vm the allocator will just return.
121          * Well, this is stupid, because when we do this, we break
122          * the QIDs, which have pointers embedded in them.
123          * darn it, may have to use a linked list. Nope, will probably
124          * just walk the array until we find a matching id. Still ... yuck.
125          */
126         if (v != &vms[nvm - 1]) {
127                 /* free the image ... oops */
128                 /* get rid of the kref. */
129                 *v = vms[nvm - 1];
130         }
131         nvm--;
132         spin_unlock(&vmlock);
133         //print_func_exit();
134 }
135
136 /* VM ids run in the range 1..infinity. But vmx.c wants them
137  * 0-based.
138  */
139 static int newvmid(void)
140 {
141         //print_func_entry();
142         int id;
143         spin_lock_irqsave(vmidlock);
144         id = kref_refcnt(vmid);
145         kref_get(vmid, 1);
146         spin_unlock(vmidlock);
147         //print_func_exit();
148         return id - 1;
149 }
150
151 static int vmgen(struct chan *c, char *entry_name,
152                                  struct dirtab *unused, int unused_nr_dirtab,
153                                  int s, struct dir *dp)
154 {
155         //print_func_entry();
156         struct qid q;
157         struct vm *vm_i;
158         printd("GEN s %d\n", s);
159         /* Whether we're in one dir or at the top, .. still takes us to the top. */
160         if (s == DEVDOTDOT) {
161                 mkqid(&q, Qtopdir, 0, QTDIR);
162                 devdir(c, c->qid, "#V", 0, eve, 0555, dp);
163                 //print_func_exit();
164                 return 1;
165         }
166         printd("TYPE %d\n", TYPE(c->qid));
167         switch (TYPE(c->qid)) {
168                 case Qtopdir:
169                         printd("Qtopdir s %d nvm %d\n", s, nvm);
170                         /* Generate elements for the top level dir.  We support clone, stat,
171                          * vm dirs at the top level */
172                         if (s == 0) {
173                                 mkqid(&q, Qclone, 0, QTFILE);
174                                 devdir(c, q, "clone", 0, eve, 0666, dp);
175                                 //print_func_exit();
176                                 return 1;
177                         }
178                         s--;
179                         if (s == 0) {
180                                 mkqid(&q, Qstat, 0, QTFILE);
181                                 devdir(c, q, "stat", 0, eve, 0666, dp);
182                                 //print_func_exit();
183                                 return 1;
184                         }
185                         s--;    /* 1 -> 0th element, 2 -> 1st element, etc */
186                         spin_lock_irqsave(&vmlock);
187                         if (s >= nvm) {
188                                 printd("DONE qtopdir\n");
189                                 spin_unlock(&vmlock);
190                                 //print_func_exit();
191                                 return -1;
192                         }
193                         vm_i = &vms[s];
194                         snprintf(get_cur_genbuf(), GENBUF_SZ, "vm%d", vm_i->id);
195                         spin_unlock(&vmlock);
196                         mkqid(&q, QID(s, Qvmdir), 0, QTDIR);
197                         devdir(c, q, get_cur_genbuf(), 0, eve, 0555, dp);
198                         //print_func_exit();
199                         return 1;
200                 case Qvmdir:
201                         /* Gen the contents of the vm dirs */
202                         s += Qctl;      /* first time through, start on Qctl */
203                         switch (s) {
204                                 case Qctl:
205                                         mkqid(&q, QID(s-Qctl, Qctl), 0, QTFILE);
206                                         devdir(c, q, "ctl", 0, eve, 0666, dp);
207                                         //print_func_exit();
208                                         return 1;
209                                 case Qimage:
210                                         mkqid(&q, QID(s-Qctl, Qimage), 0, QTFILE);
211                                         devdir(c, q, "image", 0, eve, 0666, dp);
212                                         //print_func_exit();
213                                         return 1;
214                         }
215                         //print_func_exit();
216                         return -1;
217                         /* Need to also provide a direct hit for Qclone and all other files
218                          * (at all levels of the hierarchy).  Every file is both generated
219                          * (via the s increments in their respective directories) and
220                          * directly gen-able.  devstat() will call gen with a specific path
221                          * in the qid.  In these cases, we make a dir for whatever they are
222                          * asking for.  Note the qid stays the same.  I think this is what
223                          * the old plan9 comments above devgen were talking about for (ii).
224                          *
225                          * We don't need to do this for the directories - devstat will look
226                          * for the a directory by path and fail.  Then it will manually
227                          * build the stat output (check the -1 case in devstat). */
228                 case Qclone:
229                         devdir(c, c->qid, "clone", 0, eve, 0666, dp);
230                         //print_func_exit();
231                         return 1;
232                 case Qstat:
233                         devdir(c, c->qid, "stat", 0, eve, 0444, dp);
234                         //print_func_exit();
235                         return 1;
236                 case Qctl:
237                         devdir(c, c->qid, "ctl", 0, eve, 0666, dp);
238                         //print_func_exit();
239                         return 1;
240                 case Qimage:
241                         devdir(c, c->qid, "image", 0, eve, 0666, dp);
242                         //print_func_exit();
243                         return 1;
244         }
245         //print_func_exit();
246         return -1;
247 }
248
249 static void vminit(void)
250 {
251         //print_func_entry();
252         int i;
253         spinlock_init_irqsave(&vmlock);
254         spinlock_init_irqsave(vmidlock);
255         i = vmx_init();
256         if (i == 0)
257                 vmok = 1;
258         printk("vminit: litevm_init returns %d\n", i);
259
260         //print_func_exit();
261 }
262
263 static struct chan *vmattach(char *spec)
264 {
265         //print_func_entry();
266         if (!vmok)
267                 error("No VMs available");
268         struct chan *c = devattach('V', spec);
269         mkqid(&c->qid, Qtopdir, 0, QTDIR);
270         //print_func_exit();
271         return c;
272 }
273
274 static struct walkqid *vmwalk(struct chan *c, struct chan *nc, char **name,
275                                                           int nname)
276 {
277         //print_func_entry();
278         //print_func_exit();
279         return devwalk(c, nc, name, nname, 0, 0, vmgen);
280 }
281
282 static int vmstat(struct chan *c, uint8_t * db, int n)
283 {
284         //print_func_entry();
285         //print_func_exit();
286         return devstat(c, db, n, 0, 0, vmgen);
287 }
288
289 /* It shouldn't matter if p = current is DYING.  We'll eventually fail to insert
290  * the open chan into p's fd table, then decref the chan. */
291 static struct chan *vmopen(struct chan *c, int omode)
292 {
293         //print_func_entry();
294         ERRSTACK(1);
295         struct vm *v = QID2VM(c->qid);
296         printk("vmopen: v is %p\n", v);
297         if (waserror()) {
298                 nexterror();
299         }
300         switch (TYPE(c->qid)) {
301                 case Qtopdir:
302                 case Qvmdir:
303                         if (omode & ORCLOSE)
304                                 error(Eperm);
305                         if (!IS_RDONLY(omode))
306                                 error(Eisdir);
307                         break;
308                 case Qclone:
309                         spin_lock_irqsave(&vmlock);
310                         vms = krealloc(vms, sizeof(vms[0]) * (nvm + 1), 0);
311                         v = &vms[nvm];
312                         nvm++;
313                         spin_unlock(&vmlock);
314                         kref_init(&v->kref, vm_release, 1);
315                         v->id = newvmid();
316                         mkqid(&c->qid, QID(nvm, Qctl), 0, QTFILE);
317                         c->aux = v;
318                         printd("New VM id %d\n", v->id);
319                         v->archvm = vmx_open();
320                         if (!v->archvm) {
321                                 printk("vm_open failed\n");
322                                 error("vm_open failed");
323                         }
324                         if (vmx_create_vcpu(v->archvm, v->id) < 0) {
325                                 printk("vm_create failed");
326                                 error("vm_create failed");
327                         }
328                         printk("Qclone open: id %d, v is %p, v->archvm is %p\n", 
329                                         nvm-1,
330                                         v, v->archvm);
331                         break;
332                 case Qstat:
333                         break;
334                 case Qctl:
335                 case Qimage:
336                         c->aux = QID2VM(c->qid);
337                         printk("open qctl: aux (vm) is %p\n", c->aux);
338                         break;
339         }
340         c->mode = openmode(omode);
341         /* Assumes c is unique (can't be closed concurrently */
342         c->flag |= COPEN;
343         c->offset = 0;
344         poperror();
345         //print_func_exit();
346         return c;
347 }
348
349 static void vmcreate(struct chan *c, char *name, int omode, uint32_t perm)
350 {
351         //print_func_entry();
352         error(Eperm);
353         //print_func_exit();
354 }
355
356 static void vmremove(struct chan *c)
357 {
358         //print_func_entry();
359         error(Eperm);
360         //print_func_exit();
361 }
362
363 static int vmwstat(struct chan *c, uint8_t * dp, int n)
364 {
365         //print_func_entry();
366         error("No vmwstat");
367         //print_func_exit();
368         return 0;
369 }
370
371 static void vmclose(struct chan *c)
372 {
373         //print_func_entry();
374         struct vm *v = c->aux;
375         if (!v) {
376                 //print_func_exit();
377                 return;
378         }
379         /* There are more closes than opens.  For instance, sysstat doesn't open,
380          * but it will close the chan it got from namec.  We only want to clean
381          * up/decref chans that were actually open. */
382         if (!(c->flag & COPEN)) {
383                 //print_func_exit();
384                 return;
385         }
386         switch (TYPE(c->qid)) {
387                         /* for now, leave the VM active even when we close ctl */
388                 case Qctl:
389                         break;
390                 case Qimage:
391                         kref_put(&v->kref);
392                         break;
393         }
394         //print_func_exit();
395 }
396
397 static long vmread(struct chan *c, void *ubuf, long n, int64_t offset)
398 {
399         //print_func_entry();
400         struct vm *v = c->aux;
401         printd("VMREAD\n");
402         switch (TYPE(c->qid)) {
403                 case Qtopdir:
404                 case Qvmdir:
405                         //print_func_exit();
406                         return devdirread(c, ubuf, n, 0, 0, vmgen);
407                 case Qstat:
408                         //print_func_exit();
409                         return readnum(offset, ubuf, n, nvm, NUMSIZE32);
410                 case Qctl:
411                         assert(v);
412                         //print_func_exit();
413                         return readnum(offset, ubuf, n, v->id, NUMSIZE32);
414                 case Qimage:
415                         assert(v);
416                         //print_func_exit();
417                         return readmem(offset, ubuf, n, v->image, v->imagesize);
418                 default:
419                         panic("Bad QID %p in devvm", c->qid.path);
420         }
421         //print_func_exit();
422         return 0;
423 }
424
425 static long vmwrite(struct chan *c, void *ubuf, long n, int64_t unused)
426 {
427         //print_func_entry();
428         ERRSTACK(3);
429         char buf[32];
430         struct cmdbuf *cb;
431         struct vm *vm;
432         struct litevm *litevm;
433         uint64_t hexval;
434         printd("vmwrite(%p, %p, %d)\n", c, ubuf, n);
435         switch (TYPE(c->qid)) {
436                 case Qtopdir:
437                 case Qvmdir:
438                 case Qstat:
439                         error(Eperm);
440                 case Qctl:
441                         vm = c->aux;
442                         litevm = vm->archvm;
443                         printk("qctl: vm is %p, litevm is %p\n", vm, litevm);
444                         cb = parsecmd(ubuf, n);
445                         if (waserror()) {
446                                 kfree(cb);
447                                 nexterror();
448                         }
449                         if (!strcmp(cb->f[0], "run")) {
450                                 int ret;
451                                 if (cb->nf != 4)
452                                         error("usage: run vcpu emulated mmio_completed");
453                                 struct litevm_run vmr;
454                                 vmr.vcpu = strtoul(cb->f[1], NULL, 0);
455                                 vmr.emulated = strtoul(cb->f[2], NULL, 0);
456                                 vmr.mmio_completed = strtoul(cb->f[3], NULL, 0);
457                                 disable_irq();
458                                 ret = vm_run(litevm, &vmr);
459                                 enable_irq();
460                                 printk("vm_run returns %d\n", ret);
461                                 //print_func_exit();
462                                 return ret;
463                         } else if (!strcmp(cb->f[0], "stop")) {
464                                 error("can't stop a vm yet");
465                         } else if (!strcmp(cb->f[0], "mapmem")) {
466                                 struct chan *file;
467                                 void *v;
468                                 vm = c->aux;
469                                 uint64_t filesize;
470                                 struct litevm_memory_region vmr;
471                                 int got;
472
473                                 if (cb->nf != 6)
474                                         error("usage: mapmem file slot flags addr size");
475                                 vmr.slot = strtoul(cb->f[2], NULL, 0);
476                                 vmr.flags = strtoul(cb->f[3], NULL, 0);
477                                 vmr.guest_phys_addr = strtoul(cb->f[4], NULL, 0);
478                                 filesize = strtoul(cb->f[5], NULL, 0);
479                                 vmr.memory_size = (filesize + 4095) & ~4095ULL;
480
481                                 file = namec(cb->f[1], Aopen, OREAD, 0);
482                                 printk("after namec file is %p\n", file);
483                                 if (waserror()) {
484                                         printk("File open, alloc bad\n");
485                                         cclose(file);
486                                         nexterror();
487                                 }
488                                 /* at some point we want to mmap from the kernel
489                                  * but we don't have that yet. This all needs
490                                  * rethinking but the abstractions of kvm do too.
491                                  */
492                                 v = kmalloc(vmr.memory_size, KMALLOC_WAIT);
493                                 if (waserror()) {
494                                         printk("memory allocated, read bad %s\n", 
495                                                 current_errstr());
496                                         kfree(v);
497                                         nexterror();
498                                 }
499
500                                 readn(file, v, filesize);
501                                 vmr.init_data = v;
502
503                                 if (vm_set_memory_region(litevm, &vmr))
504                                         error("vm_set_memory_region failed");
505                                 void monitor(void *);
506                                 monitor(NULL);
507                                 poperror();
508                                 poperror();
509                                 kfree(v);
510                                 cclose(file);
511
512                         } else if (!strcmp(cb->f[0], "region")) {
513                                 void *v;
514                                 struct litevm_memory_region vmr;
515                                 if (cb->nf != 5)
516                                         error("usage: mapmem slot flags addr size");
517                                 vmr.slot = strtoul(cb->f[1], NULL, 0);
518                                 vmr.flags = strtoul(cb->f[2], NULL, 0);
519                                 vmr.guest_phys_addr = strtoul(cb->f[3], NULL, 0);
520                                 vmr.memory_size = strtoul(cb->f[4], NULL, 0);
521                                 vmr.init_data = NULL;
522                                 if (vm_set_memory_region(litevm, &vmr))
523                                         error("vm_set_memory_region failed");
524                         } else {
525                                 error("%s: not implemented", cb->f[0]);
526                         }
527                         kfree(cb);
528                         poperror();
529                         break;
530                 case Qimage:
531                         error("can't write an image yet");
532                         break;
533                 default:
534                         panic("Bad QID %p in devvm", c->qid.path);
535         }
536         //print_func_exit();
537         return n;
538 }
539
540 struct dev vmdevtab __devtab = {
541         'V',
542         "vm",
543
544         devreset,
545         vminit,
546         devshutdown,
547         vmattach,
548         vmwalk,
549         vmstat,
550         vmopen,
551         vmcreate,
552         vmclose,
553         vmread,
554         devbread,
555         vmwrite,
556         devbwrite,
557         vmremove,
558         vmwstat,
559         devpower,
560 //  devconfig,
561         devchaninfo,
562 };