67fbe5fd5718bad83b6f2c8b31f592278f1d4244
[akaros.git] / kern / drivers / dev / vm.c
1 //#define DEBUG
2 /* Copyright 2014 Google Inc.
3  * Copyright (c) 2013 The Regents of the University of California
4  * Barret Rhoden <brho@cs.berkeley.edu>
5  * See LICENSE for details.
6  *
7  * devvm/#V: a device for VMs
8  *
9  */
10
11 #include <kmalloc.h>
12 #include <string.h>
13 #include <stdio.h>
14 #include <assert.h>
15 #include <error.h>
16 #include <pmap.h>
17 #include <sys/queue.h>
18 #include <smp.h>
19 #include <kref.h>
20 #include <atomic.h>
21 #include <alarm.h>
22 #include <event.h>
23 #include <umem.h>
24 #include <devalarm.h>
25 #include <arch/types.h>
26 #include <arch/vm.h>
27 #include <arch/emulate.h>
28 #include <arch/vmdebug.h>
29
/* Kinds of file served by this device.  TYPE() extracts one of these values
 * from the low bits of a qid path. */
enum {
	Qtopdir = 1,
	Qclone = 2,
	Qstat = 3,
	Qvmdir = 4,
	Qctl = 5,
	Qimage = 6,
};
39
/* A qid path packs two fields: the file TYPE in the low ID_SHIFT bits and the
 * index into the vms array in the bits above them.
 * We reserve the right to turn that index into an id later.
 */
#define ID_SHIFT 5
44 /* vm's have an image.
45  * Note that the image can be read even as it is running. */
46 struct vm {
47         struct kref kref;
48         /* should this be an array of pages? Hmm. */
49         void *image;
50         unsigned long imagesize;
51         int id; // not used yet. 
52         struct litevm *archvm;
53 };
54
55 static spinlock_t vmlock;
56 /* array, not linked list. We expect few, might as well be cache friendly. */
57 static struct vm *vms = NULL;
58 static int nvm = 0;
59 static int vmok = 0;
60
61 static spinlock_t vmidlock[1];
62 static struct kref vmid[1] = { {(void *)1, fake_release} };
63
/* Entry points provided outside this file.  It is not clear which header
 * they belong in, so they are declared here for now. */
struct litevm *vmx_open(void);
int vmx_init(void);
int vmx_create_vcpu(struct litevm *litevm, int n);
int vm_run(struct litevm *litevm, struct litevm_run *litevm_run);
int vm_set_memory_region(struct litevm *litevm,
						 struct litevm_memory_region *mem);
72
73 static inline struct vm *
74 QID2VM(struct qid q)
75 {
76         return &vms[((q).path >> ID_SHIFT)];
77 }
78
79 static inline int 
80 TYPE(struct qid q)
81 {
82         return ((q).path & ((1 << ID_SHIFT) - 1));
83 }
84
85 static inline int QID(int index, int type)
86 {
87         return ((index << ID_SHIFT) | type);
88 }
89
90 /* we'll need this somewhere more generic. */
91 static void readn(struct chan *c, void *vp, long n)
92 {
93         //print_func_entry();
94         char *p;
95         long nn;
96         int total = 0, want = n;
97
98         p = vp;
99         while (n > 0) {
100                 nn = devtab[c->type].read(c, p, n, c->offset);
101                 printk("readn: Got %d@%lld\n", nn, c->offset);
102                 if (nn == 0)
103                         error("%s: wanted %d, got %d", Eshort, want, total);
104                 c->offset += nn;
105                 p += nn;
106                 n -= nn;
107                 total += nn;
108         }
109         //print_func_exit();
110 }
111
112 /* not called yet.  -- we have to unlink the vm */
113 static void vm_release(struct kref *kref)
114 {
115         //print_func_entry();
116         struct vm *v = container_of(kref, struct vm, kref);
117         spin_lock_irqsave(&vmlock);
118         /* cute trick. Save the last element of the array in place of the
119          * one we're deleting. Reduce nvm. Don't realloc; that way, next
120          * time we add a vm the allocator will just return.
121          * Well, this is stupid, because when we do this, we break
122          * the QIDs, which have pointers embedded in them.
123          * darn it, may have to use a linked list. Nope, will probably
124          * just walk the array until we find a matching id. Still ... yuck.
125          */
126         if (v != &vms[nvm - 1]) {
127                 /* free the image ... oops */
128                 /* get rid of the kref. */
129                 *v = vms[nvm - 1];
130         }
131         nvm--;
132         spin_unlock(&vmlock);
133         //print_func_exit();
134 }
135
136 /* VM ids run in the range 1..infinity. But vmx.c wants them
137  * 0-based.
138  */
139 static int newvmid(void)
140 {
141         //print_func_entry();
142         int id;
143         spin_lock_irqsave(vmidlock);
144         id = kref_refcnt(vmid);
145         kref_get(vmid, 1);
146         spin_unlock(vmidlock);
147         //print_func_exit();
148         return id - 1;
149 }
150
151 static int vmgen(struct chan *c, char *entry_name,
152                                  struct dirtab *unused, int unused_nr_dirtab,
153                                  int s, struct dir *dp)
154 {
155         //print_func_entry();
156         struct qid q;
157         struct vm *vm_i;
158         printd("GEN s %d\n", s);
159         /* Whether we're in one dir or at the top, .. still takes us to the top. */
160         if (s == DEVDOTDOT) {
161                 mkqid(&q, Qtopdir, 0, QTDIR);
162                 devdir(c, c->qid, "#V", 0, eve, 0555, dp);
163                 //print_func_exit();
164                 return 1;
165         }
166         printd("TYPE %d\n", TYPE(c->qid));
167         switch (TYPE(c->qid)) {
168                 case Qtopdir:
169                         printd("Qtopdir s %d nvm %d\n", s, nvm);
170                         /* Generate elements for the top level dir.  We support clone, stat,
171                          * vm dirs at the top level */
172                         if (s == 0) {
173                                 mkqid(&q, Qclone, 0, QTFILE);
174                                 devdir(c, q, "clone", 0, eve, 0666, dp);
175                                 //print_func_exit();
176                                 return 1;
177                         }
178                         s--;
179                         if (s == 0) {
180                                 mkqid(&q, Qstat, 0, QTFILE);
181                                 devdir(c, q, "stat", 0, eve, 0666, dp);
182                                 //print_func_exit();
183                                 return 1;
184                         }
185                         s--;    /* 1 -> 0th element, 2 -> 1st element, etc */
186                         spin_lock_irqsave(&vmlock);
187                         if (s >= nvm) {
188                                 printd("DONE qtopdir\n");
189                                 spin_unlock(&vmlock);
190                                 //print_func_exit();
191                                 return -1;
192                         }
193                         vm_i = &vms[s];
194                         snprintf(get_cur_genbuf(), GENBUF_SZ, "vm%d", vm_i->id);
195                         spin_unlock(&vmlock);
196                         mkqid(&q, QID(s, Qvmdir), 0, QTDIR);
197                         devdir(c, q, get_cur_genbuf(), 0, eve, 0555, dp);
198                         //print_func_exit();
199                         return 1;
200                 case Qvmdir:
201                         /* Gen the contents of the vm dirs */
202                         s += Qctl;      /* first time through, start on Qctl */
203                         switch (s) {
204                                 case Qctl:
205                                         mkqid(&q, QID(s-Qctl, Qctl), 0, QTFILE);
206                                         devdir(c, q, "ctl", 0, eve, 0666, dp);
207                                         //print_func_exit();
208                                         return 1;
209                                 case Qimage:
210                                         mkqid(&q, QID(s-Qctl, Qimage), 0, QTFILE);
211                                         devdir(c, q, "image", 0, eve, 0666, dp);
212                                         //print_func_exit();
213                                         return 1;
214                         }
215                         //print_func_exit();
216                         return -1;
217                         /* Need to also provide a direct hit for Qclone and all other files (at
218                          * all levels of the hierarchy).  Every file is both
219                          * generated (via the s increments in their respective directories) and
220                          * directly gen-able.  devstat() will call gen with a specific path in
221                          * the qid.  In these cases, we make a dir for whatever they are asking
222                          * for.  Note the qid stays the same.  I think this is what the old
223                          * plan9 comments above devgen were talking about for (ii).
224                          *
225                          * We don't need to do this for the directories - devstat will look for
226                          * the a directory by path and fail.  Then it will manually build the
227                          * stat output (check the -1 case in devstat). */
228                 case Qclone:
229                         devdir(c, c->qid, "clone", 0, eve, 0666, dp);
230                         //print_func_exit();
231                         return 1;
232                 case Qstat:
233                         devdir(c, c->qid, "stat", 0, eve, 0444, dp);
234                         //print_func_exit();
235                         return 1;
236                 case Qctl:
237                         devdir(c, c->qid, "ctl", 0, eve, 0666, dp);
238                         //print_func_exit();
239                         return 1;
240                 case Qimage:
241                         devdir(c, c->qid, "image", 0, eve, 0666, dp);
242                         //print_func_exit();
243                         return 1;
244         }
245         //print_func_exit();
246         return -1;
247 }
248
249 static void vminit(void)
250 {
251         //print_func_entry();
252         int i;
253         spinlock_init_irqsave(&vmlock);
254         spinlock_init_irqsave(vmidlock);
255         i = vmx_init();
256         if (i == 0)
257                 vmok = 1;
258         printk("vminit: litevm_init returns %d\n", i);
259
260         //print_func_exit();
261 }
262
263 static struct chan *vmattach(char *spec)
264 {
265         //print_func_entry();
266         if (!vmok)
267                 error("No VMs available");
268         struct chan *c = devattach('V', spec);
269         mkqid(&c->qid, Qtopdir, 0, QTDIR);
270         //print_func_exit();
271         return c;
272 }
273
274 static struct walkqid *vmwalk(struct chan *c, struct chan *nc, char **name,
275                                                           int nname)
276 {
277         //print_func_entry();
278         //print_func_exit();
279         return devwalk(c, nc, name, nname, 0, 0, vmgen);
280 }
281
282 static int vmstat(struct chan *c, uint8_t * db, int n)
283 {
284         //print_func_entry();
285         //print_func_exit();
286         return devstat(c, db, n, 0, 0, vmgen);
287 }
288
289 /* It shouldn't matter if p = current is DYING.  We'll eventually fail to insert
290  * the open chan into p's fd table, then decref the chan. */
291 static struct chan *vmopen(struct chan *c, int omode)
292 {
293         //print_func_entry();
294         ERRSTACK(1);
295         struct vm *v = QID2VM(c->qid);
296         printk("vmopen: v is %p\n", v);
297         if (waserror()) {
298                 nexterror();
299         }
300         switch (TYPE(c->qid)) {
301                 case Qtopdir:
302                 case Qvmdir:
303                         if (omode & ORCLOSE)
304                                 error(Eperm);
305                         if (!IS_RDONLY(omode))
306                                 error(Eisdir);
307                         break;
308                 case Qclone:
309                         spin_lock_irqsave(&vmlock);
310                         vms = krealloc(vms, sizeof(vms[0]) * (nvm + 1), 0);
311                         v = &vms[nvm];
312                         nvm++;
313                         spin_unlock(&vmlock);
314                         kref_init(&v->kref, vm_release, 1);
315                         v->id = newvmid();
316                         mkqid(&c->qid, QID(nvm, Qctl), 0, QTFILE);
317                         c->aux = v;
318                         printd("New VM id %d\n", v->id);
319                         v->archvm = vmx_open();
320                         if (!v->archvm) {
321                                 printk("vm_open failed\n");
322                                 error("vm_open failed");
323                         }
324                         if (vmx_create_vcpu(v->archvm, v->id) < 0) {
325                                 printk("vm_create failed");
326                                 error("vm_create failed");
327                         }
328                         printk("Qclone open: id %d, v is %p, v->archvm is %p\n", 
329                                         nvm-1,
330                                         v, v->archvm);
331                         break;
332                 case Qstat:
333                         break;
334                 case Qctl:
335                 case Qimage:
336                         c->aux = QID2VM(c->qid);
337                         printk("open qctl: aux (vm) is %p\n", c->aux);
338                         break;
339         }
340         c->mode = openmode(omode);
341         /* Assumes c is unique (can't be closed concurrently */
342         c->flag |= COPEN;
343         c->offset = 0;
344         poperror();
345         //print_func_exit();
346         return c;
347 }
348
349 static void vmcreate(struct chan *c, char *name, int omode, uint32_t perm)
350 {
351         //print_func_entry();
352         error(Eperm);
353         //print_func_exit();
354 }
355
356 static void vmremove(struct chan *c)
357 {
358         //print_func_entry();
359         error(Eperm);
360         //print_func_exit();
361 }
362
/* Wstat hook: not supported, always throws.  The return statement only
 * satisfies the signature. */
static int vmwstat(struct chan *c, uint8_t * dp, int n)
{
	error("No vmwstat");
	return 0;
}
370
371 static void vmclose(struct chan *c)
372 {
373         //print_func_entry();
374         struct vm *v = c->aux;
375         if (!v) {
376                 //print_func_exit();
377                 return;
378         }
379         /* There are more closes than opens.  For instance, sysstat doesn't open,
380          * but it will close the chan it got from namec.  We only want to clean
381          * up/decref chans that were actually open. */
382         if (!(c->flag & COPEN)) {
383                 //print_func_exit();
384                 return;
385         }
386         switch (TYPE(c->qid)) {
387                         /* for now, leave the VM active even when we close ctl */
388                 case Qctl:
389                         break;
390                 case Qimage:
391                         kref_put(&v->kref);
392                         break;
393         }
394         //print_func_exit();
395 }
396
397 static long vmread(struct chan *c, void *ubuf, long n, int64_t offset)
398 {
399         //print_func_entry();
400         struct vm *v = c->aux;
401         printd("VMREAD\n");
402         switch (TYPE(c->qid)) {
403                 case Qtopdir:
404                 case Qvmdir:
405                         //print_func_exit();
406                         return devdirread(c, ubuf, n, 0, 0, vmgen);
407                 case Qstat:
408                         //print_func_exit();
409                         return readnum(offset, ubuf, n, nvm, NUMSIZE32);
410                 case Qctl:
411                         assert(v);
412                         //print_func_exit();
413                         return readnum(offset, ubuf, n, v->id, NUMSIZE32);
414                 case Qimage:
415                         assert(v);
416                         //print_func_exit();
417                         return readmem(offset, ubuf, n, v->image, v->imagesize);
418                 default:
419                         panic("Bad QID %p in devvm", c->qid.path);
420         }
421         //print_func_exit();
422         return 0;
423 }
424
425 static long vmwrite(struct chan *c, void *ubuf, long n, int64_t unused)
426 {
427         //print_func_entry();
428         ERRSTACK(3);
429         char buf[32];
430         struct cmdbuf *cb;
431         struct vm *vm;
432         struct litevm *litevm;
433         uint64_t hexval;
434         printd("vmwrite(%p, %p, %d)\n", c, ubuf, n);
435         switch (TYPE(c->qid)) {
436                 case Qtopdir:
437                 case Qvmdir:
438                 case Qstat:
439                         error(Eperm);
440                 case Qctl:
441                         vm = c->aux;
442                         litevm = vm->archvm;
443                         printk("qctl: vm is %p, litevm is %p\n", vm, litevm);
444                         cb = parsecmd(ubuf, n);
445                         if (waserror()) {
446                                 kfree(cb);
447                                 nexterror();
448                         }
449                         if (!strcmp(cb->f[0], "run")) {
450                                 int ret;
451                                 if (cb->nf != 4)
452                                         error("usage: run vcpu emulated mmio_completed");
453                                 struct litevm_run vmr;
454                                 vmr.vcpu = strtoul(cb->f[1], NULL, 0);
455                                 vmr.emulated = strtoul(cb->f[2], NULL, 0);
456                                 vmr.mmio_completed = strtoul(cb->f[3], NULL, 0);
457                                 ret = vm_run(litevm, &vmr);
458                                 printk("vm_run returns %d\n", ret);
459                                 //print_func_exit();
460                                 return ret;
461                         } else if (!strcmp(cb->f[0], "stop")) {
462                                 error("can't stop a vm yet");
463                         } else if (!strcmp(cb->f[0], "mapmem")) {
464                                 struct chan *file;
465                                 void *v;
466                                 vm = c->aux;
467                                 uint64_t filesize;
468                                 struct litevm_memory_region vmr;
469                                 int got;
470
471                                 if (cb->nf != 6)
472                                         error("usage: mapmem file slot flags addr size");
473                                 vmr.slot = strtoul(cb->f[2], NULL, 0);
474                                 vmr.flags = strtoul(cb->f[3], NULL, 0);
475                                 vmr.guest_phys_addr = strtoul(cb->f[4], NULL, 0);
476                                 filesize = strtoul(cb->f[5], NULL, 0);
477                                 vmr.memory_size = (filesize + 4095) & ~4095ULL;
478
479                                 file = namec(cb->f[1], Aopen, OREAD, 0);
480                                 printk("after namec file is %p\n", file);
481                                 if (waserror()) {
482                                         printk("File open, alloc bad\n");
483                                         cclose(file);
484                                         nexterror();
485                                 }
486                                 /* at some point we want to mmap from the kernel
487                                  * but we don't have that yet. This all needs
488                                  * rethinking but the abstractions of kvm do too.
489                                  */
490                                 v = kmalloc(vmr.memory_size, KMALLOC_WAIT);
491                                 if (waserror()) {
492                                         printk("memory allocated, read bad %s\n", 
493                                                 current_errstr());
494                                         kfree(v);
495                                         nexterror();
496                                 }
497
498                                 readn(file, v, filesize);
499                                 vmr.init_data = v;
500
501                                 if (vm_set_memory_region(litevm, &vmr))
502                                         error("vm_set_memory_region failed");
503                                 void monitor(void *);
504                                 monitor(NULL);
505                                 poperror();
506                                 poperror();
507                                 kfree(v);
508                                 cclose(file);
509
510                         } else if (!strcmp(cb->f[0], "region")) {
511                                 void *v;
512                                 struct litevm_memory_region vmr;
513                                 if (cb->nf != 5)
514                                         error("usage: mapmem slot flags addr size");
515                                 vmr.slot = strtoul(cb->f[1], NULL, 0);
516                                 vmr.flags = strtoul(cb->f[2], NULL, 0);
517                                 vmr.guest_phys_addr = strtoul(cb->f[3], NULL, 0);
518                                 vmr.memory_size = strtoul(cb->f[4], NULL, 0);
519                                 vmr.init_data = NULL;
520                                 if (vm_set_memory_region(litevm, &vmr))
521                                         error("vm_set_memory_region failed");
522                         } else {
523                                 error("%s: not implemented", cb->f[0]);
524                         }
525                         kfree(cb);
526                         poperror();
527                         break;
528                 case Qimage:
529                         error("can't write an image yet");
530                         break;
531                 default:
532                         panic("Bad QID %p in devvm", c->qid.path);
533         }
534         //print_func_exit();
535         return n;
536 }
537
538 struct dev vmdevtab __devtab = {
539         'V',
540         "vm",
541
542         devreset,
543         vminit,
544         devshutdown,
545         vmattach,
546         vmwalk,
547         vmstat,
548         vmopen,
549         vmcreate,
550         vmclose,
551         vmread,
552         devbread,
553         vmwrite,
554         devbwrite,
555         vmremove,
556         vmwstat,
557         devpower,
558 //  devconfig,
559         devchaninfo,
560 };