9587fe7d2453c918d53f62edaf4ac120e998077b
[akaros.git] / kern / drivers / dev / vm.c
1 //#define DEBUG
2 /* Copyright 2014 Google Inc.
3  * Copyright (c) 2013 The Regents of the University of California
4  * Barret Rhoden <brho@cs.berkeley.edu>
5  * See LICENSE for details.
6  *
7  * devvm/#V: a device for VMs
8  *
9  */
10
11 #include <kmalloc.h>
12 #include <string.h>
13 #include <stdio.h>
14 #include <assert.h>
15 #include <error.h>
16 #include <pmap.h>
17 #include <sys/queue.h>
18 #include <smp.h>
19 #include <kref.h>
20 #include <atomic.h>
21 #include <alarm.h>
22 #include <event.h>
23 #include <umem.h>
24 #include <devalarm.h>
25 #include <arch/types.h>
26 #include <arch/vm.h>
27 #include <arch/emulate.h>
28 #include <arch/vmdebug.h>
29
/* Kinds of file served by this device.  The value is kept in the low bits of
 * a qid path and recovered with TYPE().  Values start at 1, not 0. */
enum {
	Qtopdir = 1,	/* the root directory of the device */
	Qclone = 2,	/* top-level "clone" file */
	Qstat = 3,	/* top-level "stat" file */
	Qvmdir = 4,	/* a per-VM directory */
	Qctl = 5,	/* "ctl" file inside a VM directory */
	Qimage = 6,	/* "image" file inside a VM directory */
};
39
/* A qid path packs two things: the file type in the low ADDR_SHIFT bits and
 * the physical address of the owning struct vm in the bits above that.
 *
 * This paddr/kaddr trick is a bit dangerous.  It only works as long as a
 * physical address does not need all 64 bits (48 is the current norm on
 * x86_64), since the address is shifted left by ADDR_SHIFT.  We're probably
 * going to move to a model where we put the VM index or something similar
 * into the qid, but this works for now.
 *
 * QID2VM(q): recover the struct vm pointer from a qid.
 * TYPE(q):   recover the file type (one of the Q* values) from a qid.
 * QID(ptr, type): build a qid path from a kernel pointer and a file type.
 */
#define ADDR_SHIFT 5
#define QID2VM(q) ((struct vm*)KADDR(((q).path >> ADDR_SHIFT)))
#define TYPE(q) ((q).path & ((1 << ADDR_SHIFT) - 1))
/* 'type' is parenthesized so that low-precedence argument expressions (e.g. a
 * conditional) are not swallowed by the '|'. */
#define QID(ptr, type) ((PADDR(ptr) << ADDR_SHIFT) | (type))
49
/* Per-VM state tracked by this device.  Instances live in the 'vms' array.
 *
 * vm's have an image.
 * Note that the image can be read even as it is running. */
struct vm {
	/* NOTE(review): not referenced by any code visible in this file. */
	struct vm *next;
	/* Reference count; vm_release() runs when it drops to zero. */
	struct kref kref;
	/* should this be an array of pages? Hmm. */
	/* Exposed read-only through the "image" file (see vmread()).
	 * NOTE(review): nothing visible in this file ever assigns image or
	 * imagesize. */
	void *image;
	unsigned long imagesize;
	/* Id handed out by newvmid(); also used to name the "vm<id>" directory. */
	int id;
	/* Arch-specific VM handle returned by vmx_open(). */
	struct litevm *archvm;
};
61
/* Taken around accesses to 'vms' and 'nvm' when the table is grown, shrunk or
 * listed. */
static spinlock_t vmlock;
/* array, not linked list. We expect few, might as well be cache friendly. */
/* The array is grown with krealloc() on each clone open; nvm is the number of
 * live entries. */
static struct vm *vms = NULL;
static int nvm = 0;

/* 'vmid' is a kref used purely as a counter: newvmid() reads its refcount as
 * the next id and then bumps it.  It starts at 1 and its release method is
 * fake_release.  'vmidlock' is taken around that read-and-bump. */
static spinlock_t vmidlock[1];
static struct kref vmid[1] = { {(void *)1, fake_release} };
69
70 /* we'll need this somewhere more generic. */
71 static void readn(struct chan *c, void *vp, long n)
72 {
73         print_func_entry();
74         char *p;
75         long nn;
76         int total = 0, want = n;
77
78         p = vp;
79         while (n > 0) {
80                 nn = devtab[c->type].read(c, p, n, c->offset);
81                 printk("readn: Got %d@%lld\n", nn, c->offset);
82                 if (nn == 0)
83                         error("%s: wanted %d, got %d", Eshort, total, want);
84                 c->offset += nn;
85                 p += nn;
86                 n -= nn;
87                 total += nn;
88         }
89         print_func_exit();
90 }
91
92 static void vm_release(struct kref *kref)
93 {
94         print_func_entry();
95         struct vm *v = container_of(kref, struct vm, kref);
96         spin_lock_irqsave(&vmlock);
97         /* cute trick. Save the last element of the array in place of the
98          * one we're deleting. Reduce nvm. Don't realloc; that way, next
99          * time we add a vm the allocator will just return.
100          * Well, this is stupid, because when we do this, we break
101          * the QIDs, which have pointers embedded in them.
102          * darn it, may have to use a linked list. Nope, will probably
103          * just walk the array until we find a matching id. Still ... yuck.
104          */
105         if (v != &vms[nvm - 1]) {
106                 /* free the image ... oops */
107                 /* get rid of the kref. */
108                 *v = vms[nvm - 1];
109         }
110         nvm--;
111         spin_unlock(&vmlock);
112         print_func_exit();
113 }
114
115 /* VM ids run in the range 1..infinity. But vmx.c wants them
116  * 0-based.
117  */
118 static int newvmid(void)
119 {
120         print_func_entry();
121         int id;
122         spin_lock_irqsave(vmidlock);
123         id = kref_refcnt(vmid);
124         kref_get(vmid, 1);
125         spin_unlock(vmidlock);
126         print_func_exit();
127         return id - 1;
128 }
129
130 static int vmgen(struct chan *c, char *entry_name,
131                                  struct dirtab *unused, int unused_nr_dirtab,
132                                  int s, struct dir *dp)
133 {
134         print_func_entry();
135         struct qid q;
136         struct vm *vm_i;
137         printd("GEN s %d\n", s);
138         /* Whether we're in one dir or at the top, .. still takes us to the top. */
139         if (s == DEVDOTDOT) {
140                 mkqid(&q, Qtopdir, 0, QTDIR);
141                 devdir(c, c->qid, "#V", 0, eve, 0555, dp);
142                 print_func_exit();
143                 return 1;
144         }
145         printd("TYPE %d\n", TYPE(c->qid));
146         switch (TYPE(c->qid)) {
147                 case Qtopdir:
148                         printd("Qtopdir s %d nvm %d\n", s, nvm);
149                         /* Generate elements for the top level dir.  We support clone, stat,
150                          * vm dirs at the top level */
151                         if (s == 0) {
152                                 mkqid(&q, Qclone, 0, QTFILE);
153                                 devdir(c, q, "clone", 0, eve, 0666, dp);
154                                 print_func_exit();
155                                 return 1;
156                         }
157                         s--;
158                         if (s == 0) {
159                                 mkqid(&q, Qstat, 0, QTFILE);
160                                 devdir(c, q, "stat", 0, eve, 0666, dp);
161                                 print_func_exit();
162                                 return 1;
163                         }
164                         s--;    /* 1 -> 0th element, 2 -> 1st element, etc */
165                         spin_lock_irqsave(&vmlock);
166                         if (s >= nvm) {
167                                 printd("DONE qtopdir\n");
168                                 spin_unlock(&vmlock);
169                                 print_func_exit();
170                                 return -1;
171                         }
172                         vm_i = &vms[s];
173                         snprintf(get_cur_genbuf(), GENBUF_SZ, "vm%d", vm_i->id);
174                         spin_unlock(&vmlock);
175                         mkqid(&q, QID(vm_i, Qvmdir), 0, QTDIR);
176                         devdir(c, q, get_cur_genbuf(), 0, eve, 0555, dp);
177                         print_func_exit();
178                         return 1;
179                 case Qvmdir:
180                         /* Gen the contents of the vm dirs */
181                         s += Qctl;      /* first time through, start on Qctl */
182                         switch (s) {
183                                 case Qctl:
184                                         mkqid(&q, QID(QID2VM(c->qid), Qctl), 0, QTFILE);
185                                         devdir(c, q, "ctl", 0, eve, 0666, dp);
186                                         print_func_exit();
187                                         return 1;
188                                 case Qimage:
189                                         mkqid(&q, QID(QID2VM(c->qid), Qimage), 0, QTFILE);
190                                         devdir(c, q, "image", 0, eve, 0666, dp);
191                                         print_func_exit();
192                                         return 1;
193                         }
194                         print_func_exit();
195                         return -1;
196                         /* Need to also provide a direct hit for Qclone and all other files (at
197                          * all levels of the hierarchy).  Every file is both
198                          * generated (via the s increments in their respective directories) and
199                          * directly gen-able.  devstat() will call gen with a specific path in
200                          * the qid.  In these cases, we make a dir for whatever they are asking
201                          * for.  Note the qid stays the same.  I think this is what the old
202                          * plan9 comments above devgen were talking about for (ii).
203                          *
204                          * We don't need to do this for the directories - devstat will look for
205                          * the a directory by path and fail.  Then it will manually build the
206                          * stat output (check the -1 case in devstat). */
207                 case Qclone:
208                         devdir(c, c->qid, "clone", 0, eve, 0666, dp);
209                         print_func_exit();
210                         return 1;
211                 case Qstat:
212                         devdir(c, c->qid, "stat", 0, eve, 0444, dp);
213                         print_func_exit();
214                         return 1;
215                 case Qctl:
216                         devdir(c, c->qid, "ctl", 0, eve, 0666, dp);
217                         print_func_exit();
218                         return 1;
219                 case Qimage:
220                         devdir(c, c->qid, "image", 0, eve, 0666, dp);
221                         print_func_exit();
222                         return 1;
223         }
224         print_func_exit();
225         return -1;
226 }
227
228 static void vminit(void)
229 {
230         return;
231         print_func_entry();
232         int i;
233         spinlock_init_irqsave(&vmlock);
234         spinlock_init_irqsave(vmidlock);
235         i = vmx_init();
236         printk("vminit: litevm_init returns %d\n", i);
237
238         print_func_exit();
239 }
240
241 static struct chan *vmattach(char *spec)
242 {
243         print_func_entry();
244         struct chan *c = devattach('V', spec);
245         mkqid(&c->qid, Qtopdir, 0, QTDIR);
246         print_func_exit();
247         return c;
248 }
249
250 static struct walkqid *vmwalk(struct chan *c, struct chan *nc, char **name,
251                                                           int nname)
252 {
253         print_func_entry();
254         print_func_exit();
255         return devwalk(c, nc, name, nname, 0, 0, vmgen);
256 }
257
258 static int vmstat(struct chan *c, uint8_t * db, int n)
259 {
260         print_func_entry();
261         print_func_exit();
262         return devstat(c, db, n, 0, 0, vmgen);
263 }
264
265 /* It shouldn't matter if p = current is DYING.  We'll eventually fail to insert
266  * the open chan into p's fd table, then decref the chan. */
267 static struct chan *vmopen(struct chan *c, int omode)
268 {
269         print_func_entry();
270         ERRSTACK(1);
271         struct vm *v = QID2VM(c->qid);
272         printk("vmopen: v is %p\n", v);
273         if (waserror()) {
274                 nexterror();
275         }
276         switch (TYPE(c->qid)) {
277                 case Qtopdir:
278                 case Qvmdir:
279                         if (omode & ORCLOSE)
280                                 error(Eperm);
281                         if (!IS_RDONLY(omode))
282                                 error(Eisdir);
283                         break;
284                 case Qclone:
285                         spin_lock_irqsave(&vmlock);
286                         vms = krealloc(vms, sizeof(vms[0]) * (nvm + 1), 0);
287                         v = &vms[nvm];
288                         nvm++;
289                         spin_unlock(&vmlock);
290                         kref_init(&v->kref, vm_release, 1);
291                         v->id = newvmid();
292                         mkqid(&c->qid, QID(v, Qctl), 0, QTFILE);
293                         c->aux = v;
294                         printd("New VM id %d\n", v->id);
295                         v->archvm = vmx_open();
296                         if (!v->archvm) {
297                                 printk("vm_open failed\n");
298                                 error("vm_open failed");
299                         }
300                         if (vmx_create_vcpu(v->archvm, v->id) < 0) {
301                                 printk("vm_create failed");
302                                 error("vm_create failed");
303                         }
304                         break;
305                 case Qstat:
306                         break;
307                 case Qctl:
308                 case Qimage:
309                         c->aux = QID2VM(c->qid);
310                         printk("open qctl: aux is %p\n", c->aux);
311                         break;
312         }
313         c->mode = openmode(omode);
314         /* Assumes c is unique (can't be closed concurrently */
315         c->flag |= COPEN;
316         c->offset = 0;
317         poperror();
318         print_func_exit();
319         return c;
320 }
321
322 static void vmcreate(struct chan *c, char *name, int omode, uint32_t perm)
323 {
324         print_func_entry();
325         error(Eperm);
326         print_func_exit();
327 }
328
329 static void vmremove(struct chan *c)
330 {
331         print_func_entry();
332         error(Eperm);
333         print_func_exit();
334 }
335
/* Wstat hook: not supported; always throws "No vmwstat". */
static int vmwstat(struct chan *c, uint8_t * dp, int n)
{
	print_func_entry();

	error("No vmwstat");

	print_func_exit();
	return 0;
}
343
344 static void vmclose(struct chan *c)
345 {
346         print_func_entry();
347         struct vm *v = c->aux;
348         if (!v) {
349                 print_func_exit();
350                 return;
351         }
352         /* There are more closes than opens.  For instance, sysstat doesn't open,
353          * but it will close the chan it got from namec.  We only want to clean
354          * up/decref chans that were actually open. */
355         if (!(c->flag & COPEN)) {
356                 print_func_exit();
357                 return;
358         }
359         switch (TYPE(c->qid)) {
360                         /* for now, leave the VM active even when we close ctl */
361                 case Qctl:
362                         break;
363                 case Qimage:
364                         kref_put(&v->kref);
365                         break;
366         }
367         print_func_exit();
368 }
369
370 static long vmread(struct chan *c, void *ubuf, long n, int64_t offset)
371 {
372         print_func_entry();
373         struct vm *v = c->aux;
374         printd("VMREAD\n");
375         switch (TYPE(c->qid)) {
376                 case Qtopdir:
377                 case Qvmdir:
378                         print_func_exit();
379                         return devdirread(c, ubuf, n, 0, 0, vmgen);
380                 case Qstat:
381                         print_func_exit();
382                         return readnum(offset, ubuf, n, nvm, NUMSIZE32);
383                 case Qctl:
384                         assert(v);
385                         print_func_exit();
386                         return readnum(offset, ubuf, n, v->id, NUMSIZE32);
387                 case Qimage:
388                         assert(v);
389                         print_func_exit();
390                         return readmem(offset, ubuf, n, v->image, v->imagesize);
391                 default:
392                         panic("Bad QID %p in devvm", c->qid.path);
393         }
394         print_func_exit();
395         return 0;
396 }
397
398 static long vmwrite(struct chan *c, void *ubuf, long n, int64_t unused)
399 {
400         print_func_entry();
401         ERRSTACK(3);
402         char buf[32];
403         struct cmdbuf *cb;
404         struct vm *vm;
405         struct litevm *litevm;
406         uint64_t hexval;
407         printd("vmwrite(%p, %p, %d)\n", c, ubuf, n);
408         switch (TYPE(c->qid)) {
409                 case Qtopdir:
410                 case Qvmdir:
411                 case Qstat:
412                         error(Eperm);
413                 case Qctl:
414                         vm = c->aux;
415                         cb = parsecmd(ubuf, n);
416                         if (waserror()) {
417                                 kfree(cb);
418                                 nexterror();
419                         }
420                         if (!strcmp(cb->f[0], "run")) {
421                                 int ret;
422                                 if (cb->nf != 4)
423                                         error("usage: run vcpu emulated mmio_completed");
424                                 litevm = vm->archvm;
425                                 struct litevm_run vmr;
426                                 vmr.vcpu = strtoul(cb->f[1], NULL, 0);
427                                 vmr.emulated = strtoul(cb->f[2], NULL, 0);
428                                 vmr.mmio_completed = strtoul(cb->f[3], NULL, 0);
429                                 ret = vm_run(litevm, &vmr);
430                                 printk("vm_run returns %d\n", ret);
431                                 print_func_exit();
432                                 return ret;
433                         } else if (!strcmp(cb->f[0], "stop")) {
434                                 error("can't stop a vm yet");
435                         } else if (!strcmp(cb->f[0], "fillmem")) {
436                                 struct chan *file;
437                                 void *v;
438                                 vm = c->aux;
439                                 litevm = vm->archvm;
440                                 uint64_t filesize;
441                                 struct litevm_memory_region vmr;
442                                 int got;
443
444                                 if (cb->nf != 6)
445                                         error("usage: mapmem file slot flags addr size");
446                                 vmr.slot = strtoul(cb->f[2], NULL, 0);
447                                 vmr.flags = strtoul(cb->f[3], NULL, 0);
448                                 vmr.guest_phys_addr = strtoul(cb->f[4], NULL, 0);
449                                 filesize = strtoul(cb->f[5], NULL, 0);
450                                 vmr.memory_size = (filesize + 4095) & ~4095ULL;
451
452                                 file = namec(cb->f[1], Aopen, OREAD, 0);
453                                 printk("after namec file is %p\n", file);
454                                 if (waserror()) {
455                                         cclose(file);
456                                         nexterror();
457                                 }
458                                 /* at some point we want to mmap from the kernel
459                                  * but we don't have that yet. This all needs
460                                  * rethinking but the abstractions of kvm do too.
461                                  */
462                                 v = kmalloc(vmr.memory_size, KMALLOC_WAIT);
463                                 if (waserror()) {
464                                         kfree(v);
465                                         nexterror();
466                                 }
467
468                                 readn(file, v, filesize);
469                                 vmr.init_data = v;
470
471                                 if (vm_set_memory_region(litevm, &vmr))
472                                         error("vm_set_memory_region failed");
473                                 poperror();
474                                 poperror();
475                                 kfree(v);
476                                 cclose(file);
477
478                         } else if (!strcmp(cb->f[0], "region")) {
479                                 void *v;
480                                 struct litevm_memory_region vmr;
481                                 litevm = vm->archvm;
482                                 if (cb->nf != 5)
483                                         error("usage: mapmem slot flags addr size");
484                                 vmr.slot = strtoul(cb->f[2], NULL, 0);
485                                 vmr.flags = strtoul(cb->f[3], NULL, 0);
486                                 vmr.guest_phys_addr = strtoul(cb->f[4], NULL, 0);
487                                 vmr.memory_size = strtoul(cb->f[5], NULL, 0);
488                                 if (vm_set_memory_region(litevm, &vmr))
489                                         error("vm_set_memory_region failed");
490                         } else {
491                                 error("%s: not implemented", cb->f[0]);
492                         }
493                         kfree(cb);
494                         poperror();
495                         break;
496                 case Qimage:
497                         error("can't write an image yet");
498                         break;
499                 default:
500                         panic("Bad QID %p in devvm", c->qid.path);
501         }
502         print_func_exit();
503         return n;
504 }
505
/* Device table entry for this driver: device character 'V', name "vm".
 *
 * The initializer is positional.  Operations this file does not implement
 * are filled with the generic dev* routines; the rest are the vm* functions
 * defined in this file.
 * NOTE(review): the order must match the field order of struct dev, which is
 * declared elsewhere - confirm before adding or re-enabling a slot (e.g. the
 * commented-out devconfig). */
struct dev vmdevtab __devtab = {
	'V',
	"vm",

	devreset,
	vminit,
	devshutdown,
	vmattach,
	vmwalk,
	vmstat,
	vmopen,
	vmcreate,
	vmclose,
	vmread,
	devbread,
	vmwrite,
	devbwrite,
	vmremove,
	vmwstat,
	devpower,
//  devconfig,
	devchaninfo,
};