Remove the BUILD_INFO_FILE variable
[akaros.git] / kern / arch / x86 / devarch.c
index 7c60738..2a45f6e 100644 (file)
@@ -1,4 +1,4 @@
-/* 
+/*
  * This file is part of the UCB release of Plan 9. It is subject to the license
  * terms in the LICENSE file found in the top-level directory of this
  * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
@@ -7,39 +7,52 @@
  * in the LICENSE file.
  */
 
+#include <ros/memops.h>
 #include <vfs.h>
-#include <kfs.h>
-#include <slab.h>
 #include <kmalloc.h>
 #include <kref.h>
+#include <kthread.h>
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
-#include <error.h>
-#include <cpio.h>
+#include <err.h>
 #include <pmap.h>
+#include <umem.h>
 #include <smp.h>
 #include <ip.h>
 #include <time.h>
+#include <bitops.h>
+#include <core_set.h>
+#include <address_range.h>
+#include <arch/ros/perfmon.h>
+#include <arch/topology.h>
+#include <arch/perfmon.h>
+#include <arch/ros/msr-index.h>
+#include <arch/msr.h>
+#include <arch/devarch.h>
+
+#define REAL_MEM_SIZE (1024 * 1024)
+
+struct perf_context {
+       struct perfmon_session *ps;
+       size_t resp_size;
+       uint8_t *resp;
+};
 
-typedef struct IOMap IOMap;
-struct IOMap
-{
-       IOMap   *next;
-       int     reserved;
-       char    tag[13];
-       uint32_t        start;
-       uint32_t        end;
+struct io_map {
+       struct io_map *next;
+       int reserved;
+       char tag[13];
+       uint32_t start;
+       uint32_t end;
 };
 
-static struct
-{
+static struct {
        spinlock_t lock;
-       IOMap   *map;
-       IOMap   *free;
-       IOMap   maps[32];               // some initial free maps
-
-       qlock_t ql;                     // lock for reading map
+       struct io_map *map;
+       struct io_map *free;
+       struct io_map maps[32];                         // some initial free maps
+       qlock_t ql;                                     // lock for reading map
 } iomap;
 
 enum {
@@ -49,135 +62,213 @@ enum {
        Qiow,
        Qiol,
        Qgdb,
-       Qbase,
-       Qmapram,
        Qrealmem,
+       Qmsr,
+       Qperf,
 
-       Qmax = 16,
+       Qmax,
 };
 
-typedef long Rdwrfn(struct chan*, void*, long, int64_t);
-
-static Rdwrfn *readfn[Qmax];
-static Rdwrfn *writefn[Qmax];
+enum {
+       Linelen = 31,
+};
 
+struct dev archdevtab;
 static struct dirtab archdir[Qmax] = {
-       {".",           { Qdir, 0, QTDIR },     0,      0555},
-       {"ioalloc",     { Qioalloc, 0 },        0,      0444},
-       {"iob",         { Qiob, 0 },            0,      0660},
-       {"iow",         { Qiow, 0 },            0,      0660},
-       {"iol",         { Qiol, 0 },            0,      0660},
-       {"gdb",         { Qgdb, 0},             0,      0660},
-       {"mapram",      { Qmapram, 0 }, 0,      0444},
-       {"realmodemem", { Qrealmem, 0 },        0,      0660},
+       {".", {Qdir, 0, QTDIR}, 0, 0555},
+       {"ioalloc", {Qioalloc, 0}, 0, 0444},
+       {"iob", {Qiob, 0}, 0, 0666},
+       {"iow", {Qiow, 0}, 0, 0666},
+       {"iol", {Qiol, 0}, 0, 0666},
+       {"gdb", {Qgdb, 0}, 0, 0660},
+       {"realmem", {Qrealmem, 0}, 0, 0444},
+       {"msr", {Qmsr, 0}, 0, 0666},
+       {"perf", {Qperf, 0}, 0, 0666},
+};
+/* White-list entries must be ordered by start address, and must never
+ * overlap. */
+#define MSR_MAX_VAR_COUNTERS 16
+#define MSR_MAX_FIX_COUNTERS 4
+
+static struct address_range msr_rd_wlist[] = {
+       ADDRESS_RANGE(0x00000000, 0xffffffff),
+};
+static struct address_range msr_wr_wlist[] = {
+       ADDRESS_RANGE(MSR_IA32_PERFCTR0,
+                                 MSR_IA32_PERFCTR0 + MSR_MAX_VAR_COUNTERS - 1),
+       ADDRESS_RANGE(MSR_ARCH_PERFMON_EVENTSEL0,
+                                 MSR_ARCH_PERFMON_EVENTSEL0 + MSR_MAX_VAR_COUNTERS - 1),
+       ADDRESS_RANGE(MSR_IA32_PERF_CTL, MSR_IA32_PERF_CTL),
+       ADDRESS_RANGE(MSR_CORE_PERF_FIXED_CTR0,
+                                 MSR_CORE_PERF_FIXED_CTR0 + MSR_MAX_FIX_COUNTERS - 1),
+       ADDRESS_RANGE(MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL),
 };
-spinlock_t archwlock;  /* the lock is only for changing archdir */
-int narchdir = Qbase;
 int gdbactive = 0;
 
-/*
- * Add a file to the #P listing.  Once added, you can't delete it.
- * You can't add a file with the same name as one already there,
- * and you get a pointer to the Dirtab entry so you can do things
- * like change the Qid version.  Changing the Qid path is disallowed.
- */
-struct dirtab*
-addarchfile(char *name, int perm, Rdwrfn *rdfn, Rdwrfn *wrfn)
+//
+//  Allocate some I/O port space and remember who it was
+//  allocated to.  If port < 0, find a free region.
+//
+int ioalloc(int port, int size, int align, char *tag)
 {
+       struct io_map *map, **l;
        int i;
-       struct dirtab d;
-       struct dirtab *dp;
-
-       memset(&d, 0, sizeof d);
-       strncpy(d.name,  name, sizeof(d.name));
-       d.perm = perm;
 
-       spin_lock(&archwlock);
-       if(narchdir >= Qmax){
-               spin_unlock(&archwlock);
-               return NULL;
+       spin_lock(&(&iomap)->lock);
+       if (port < 0) {
+               // find a free port above 0x400 and below 0x1000
+               port = 0x400;
+               for (l = &iomap.map; *l; l = &(*l)->next) {
+                       map = *l;
+                       if (map->start < 0x400)
+                               continue;
+                       i = map->start - port;
+                       if (i > size)
+                               break;
+                       if (align > 0)
+                               port = ((port + align - 1) / align) * align;
+                       else
+                               port = map->end;
+               }
+               if (*l == NULL) {
+                       spin_unlock(&(&iomap)->lock);
+                       return -1;
+               }
+       } else {
+               // Only 64KB I/O space on the x86.
+               if ((port + size) > 0x10000) {
+                       spin_unlock(&(&iomap)->lock);
+                       return -1;
+               }
+               // see if the space clashes with previously allocated ports
+               for (l = &iomap.map; *l; l = &(*l)->next) {
+                       map = *l;
+                       if (map->end <= port)
+                               continue;
+                       if (map->reserved && map->start == port && map->end == port + size) {
+                               map->reserved = 0;
+                               spin_unlock(&(&iomap)->lock);
+                               return map->start;
+                       }
+                       if (map->start >= port + size)
+                               break;
+                       spin_unlock(&(&iomap)->lock);
+                       return -1;
+               }
+       }
+       map = iomap.free;
+       if (map == NULL) {
+               printd("ioalloc: out of maps");
+               spin_unlock(&(&iomap)->lock);
+               return port;
        }
+       iomap.free = map->next;
+       map->next = *l;
+       map->start = port;
+       map->end = port + size;
+       strlcpy(map->tag, tag, sizeof(map->tag));
+       *l = map;
 
-       for(i=0; i<narchdir; i++)
-               if(strcmp(archdir[i].name, name) == 0){
-                       spin_unlock(&archwlock);
-                       return NULL;
+       archdir[0].qid.vers++;
+
+       spin_unlock(&(&iomap)->lock);
+       return map->start;
+}
+
+void iofree(int port)
+{
+       struct io_map *map, **l;
+
+       spin_lock(&(&iomap)->lock);
+       for (l = &iomap.map; *l; l = &(*l)->next) {
+               if ((*l)->start == port) {
+                       map = *l;
+                       *l = map->next;
+                       map->next = iomap.free;
+                       iomap.free = map;
+                       break;
                }
+               if ((*l)->start > port)
+                       break;
+       }
+       archdir[0].qid.vers++;
+       spin_unlock(&(&iomap)->lock);
+}
 
-       d.qid.path = narchdir;
-       archdir[narchdir] = d;
-       readfn[narchdir] = rdfn;
-       writefn[narchdir] = wrfn;
-       dp = &archdir[narchdir++];
-       spin_unlock(&archwlock);
+int iounused(int start, int end)
+{
+       struct io_map *map;
 
-       return dp;
+       for (map = iomap.map; map; map = map->next) {
+               if (((start >= map->start) && (start < map->end)) ||
+                   ((start <= map->start) && (end > map->start)))
+                       return 0;
+       }
+       return 1;
 }
 
-void
-ioinit(void)
+void ioinit(void)
 {
        int i;
        char *excluded = "";
 
-       for(i = 0; i < ARRAY_SIZE(iomap.maps)-1; i++)
-               iomap.maps[i].next = &iomap.maps[i+1];
+       panic("Akaros doesn't do IO port allocation yet.  Don't init.");
+       for (i = 0; i < ARRAY_SIZE(iomap.maps) - 1; i++)
+               iomap.maps[i].next = &iomap.maps[i + 1];
        iomap.maps[i].next = NULL;
        iomap.free = iomap.maps;
        char *s;
-       
+
        s = excluded;
        while (s && *s != '\0' && *s != '\n') {
                char *ends;
                int io_s, io_e;
-               
+
                io_s = (int)strtol(s, &ends, 0);
                if (ends == NULL || ends == s || *ends != '-') {
                        printd("ioinit: cannot parse option string\n");
                        break;
                }
                s = ++ends;
-               
+
                io_e = (int)strtol(s, &ends, 0);
                if (ends && *ends == ',')
                        *ends++ = '\0';
                s = ends;
-               
-#warning "how do we do io allocate"
-               //ioalloc(io_s, io_e - io_s + 1, 0, "pre-allocated");
+
+               ioalloc(io_s, io_e - io_s + 1, 0, "pre-allocated");
        }
 }
 
 // Reserve a range to be ioalloced later.
 // This is in particular useful for exchangable cards, such
 // as pcmcia and cardbus cards.
-int
-ioreserve( int unused_int, int size, int align, char *tag)
+int ioreserve(int unused_int, int size, int align, char *tag)
 {
-       IOMap *map, **l;
+       struct io_map *map, **l;
        int i, port;
 
        spin_lock(&(&iomap)->lock);
        // find a free port above 0x400 and below 0x1000
        port = 0x400;
-       for(l = &iomap.map; *l; l = &(*l)->next){
+       for (l = &iomap.map; *l; l = &(*l)->next) {
                map = *l;
                if (map->start < 0x400)
                        continue;
                i = map->start - port;
-               if(i > size)
+               if (i > size)
                        break;
-               if(align > 0)
-                       port = ((port+align-1)/align)*align;
+               if (align > 0)
+                       port = ((port + align - 1) / align) * align;
                else
                        port = map->end;
        }
-       if(*l == NULL){
+       if (*l == NULL) {
                spin_unlock(&(&iomap)->lock);
                return -1;
        }
        map = iomap.free;
-       if(map == NULL){
+       if (map == NULL) {
                printd("ioalloc: out of maps");
                spin_unlock(&(&iomap)->lock);
                return port;
@@ -187,8 +278,7 @@ ioreserve( int unused_int, int size, int align, char *tag)
        map->start = port;
        map->end = port + size;
        map->reserved = 1;
-       strncpy(map->tag, tag, sizeof(map->tag));
-       map->tag[sizeof(map->tag)-1] = 0;
+       strlcpy(map->tag, tag, sizeof(map->tag));
        *l = map;
 
        archdir[0].qid.vers++;
@@ -197,234 +287,335 @@ ioreserve( int unused_int, int size, int align, char *tag)
        return map->start;
 }
 
-//
-//     alloc some io port space and remember who it was
-//     alloced to.  if port < 0, find a free region.
-//
-int
-ioalloc(int port, int size, int align, char *tag)
+static void checkport(int start, int end)
 {
-       IOMap *map, **l;
-       int i;
-
-       spin_lock(&(&iomap)->lock);
-       if(port < 0){
-               // find a free port above 0x400 and below 0x1000
-               port = 0x400;
-               for(l = &iomap.map; *l; l = &(*l)->next){
-                       map = *l;
-                       if (map->start < 0x400)
-                               continue;
-                       i = map->start - port;
-                       if(i > size)
-                               break;
-                       if(align > 0)
-                               port = ((port+align-1)/align)*align;
-                       else
-                               port = map->end;
-               }
-               if(*l == NULL){
-                       spin_unlock(&(&iomap)->lock);
-                       return -1;
-               }
-       } else {
-               // Only 64KB I/O space on the x86.
-               if((port+size) > 0x10000){
-                       spin_unlock(&(&iomap)->lock);
-                       return -1;
-               }
-               // see if the space clashes with previously allocated ports
-               for(l = &iomap.map; *l; l = &(*l)->next){
-                       map = *l;
-                       if(map->end <= port)
-                               continue;
-                       if(map->reserved && map->start == port && map->end == port + size) {
-                               map->reserved = 0;
-                               spin_unlock(&(&iomap)->lock);
-                               return map->start;
-                       }
-                       if(map->start >= port+size)
-                               break;
-                       spin_unlock(&(&iomap)->lock);
-                       return -1;
-               }
-       }
-       map = iomap.free;
-       if(map == NULL){
-               printd("ioalloc: out of maps");
-               spin_unlock(&(&iomap)->lock);
-               return port;
-       }
-       iomap.free = map->next;
-       map->next = *l;
-       map->start = port;
-       map->end = port + size;
-       strncpy(map->tag, tag, sizeof(map->tag));
-       map->tag[sizeof(map->tag)-1] = 0;
-       *l = map;
-
-       archdir[0].qid.vers++;
+       /* standard vga regs are OK */
+       if (start >= 0x2b0 && end <= 0x2df + 1)
+               return;
+       if (start >= 0x3c0 && end <= 0x3da + 1)
+               return;
 
-       spin_unlock(&(&iomap)->lock);
-       return map->start;
+       if (iounused(start, end))
+               return;
+       error(EPERM, ERROR_FIXME);
 }
 
-void
-iofree(int port)
+static struct chan *archattach(char *spec)
 {
-       IOMap *map, **l;
-
-       spin_lock(&(&iomap)->lock);
-       for(l = &iomap.map; *l; l = &(*l)->next){
-               if((*l)->start == port){
-                       map = *l;
-                       *l = map->next;
-                       map->next = iomap.free;
-                       iomap.free = map;
-                       break;
-               }
-               if((*l)->start > port)
-                       break;
-       }
-       archdir[0].qid.vers++;
-       spin_unlock(&(&iomap)->lock);
+       return devattach(archdevtab.name, spec);
 }
 
-int
-iounused(int start, int end)
+struct walkqid *archwalk(struct chan *c, struct chan *nc, char **name,
+                                                int nname)
 {
-       IOMap *map;
-
-       for(map = iomap.map; map; map = map->next){
-               if(((start >= map->start) && (start < map->end))
-                  || ((start <= map->start) && (end > map->start)))
-                       return 0;
-       }
-       return 1;
+       return devwalk(c, nc, name, nname, archdir, Qmax, devgen);
 }
 
-static void
-checkport(int start, int end)
+static int archstat(struct chan *c, uint8_t * dp, int n)
 {
-       /* standard vga regs are OK */
-       if(start >= 0x2b0 && end <= 0x2df+1)
-               return;
-       if(start >= 0x3c0 && end <= 0x3da+1)
-               return;
+       archdir[Qrealmem].length = REAL_MEM_SIZE;
 
-       if(iounused(start, end))
-               return;
-       error(Eperm);
+       return devstat(c, dp, n, archdir, Qmax, devgen);
 }
 
-static struct chan*
-archattach(char* spec)
+static struct perf_context *arch_create_perf_context(void)
 {
-       return devattach('P', spec);
+       ERRSTACK(1);
+       struct perf_context *pc = kzmalloc(sizeof(struct perf_context),
+                                          MEM_WAIT);
+
+       if (waserror()) {
+               kfree(pc);
+               nexterror();
+       }
+       pc->ps = perfmon_create_session();
+       poperror();
+
+       return pc;
 }
 
-struct walkqid*
-archwalk(struct chan* c, struct chan *nc, char** name, int nname)
+static void arch_free_perf_context(struct perf_context *pc)
 {
-       return devwalk(c, nc, name, nname, archdir, narchdir, devgen);
+       if (likely(pc)) {
+               perfmon_close_session(pc->ps);
+               kfree(pc->resp);
+               kfree(pc);
+       }
 }
 
-static int
-archstat(struct chan* c, uint8_t* dp, int n)
+static const uint8_t *arch_read_core_set(struct core_set *cset,
+                                         const uint8_t *kptr,
+                                         const uint8_t *ktop)
 {
-       return devstat(c, dp, n, archdir, narchdir, devgen);
+       int i, nb;
+       uint32_t n;
+
+       error_assert(EBADMSG, (kptr + sizeof(uint32_t)) <= ktop);
+       kptr = get_le_u32(kptr, &n);
+       error_assert(EBADMSG, (kptr + n) <= ktop);
+       core_set_init(cset);
+       nb = MIN((int) n * 8, num_cores);
+       for (i = 0; i < nb; i++) {
+               if (test_bit(i, (const unsigned long *) kptr))
+                       core_set_setcpu(cset, i);
+       }
+
+       return kptr + n;
 }
 
-static struct chan*
-archopen(struct chan* c, int omode)
+static long arch_perf_write(struct perf_context *pc, const void *udata,
+                            long usize)
 {
-       return devopen(c, omode, archdir, narchdir, devgen);
+       ERRSTACK(1);
+       void *kdata;
+       const uint8_t *kptr, *ktop;
+
+       kfree(pc->resp);
+       pc->resp = NULL;
+       pc->resp_size = 0;
+
+       kdata = user_memdup_errno(current, udata, usize);
+       if (unlikely(!kdata))
+               return -1;
+       if (waserror()) {
+               kfree(kdata);
+               nexterror();
+       }
+       kptr = kdata;
+       ktop = kptr + usize;
+       error_assert(EBADMSG, (kptr + 1) <= ktop);
+       switch (*kptr++) {
+               case PERFMON_CMD_COUNTER_OPEN: {
+                       int ped;
+                       struct perfmon_event pev;
+                       struct core_set cset;
+
+                       error_assert(EBADMSG, (kptr + 3 * sizeof(uint64_t)) <= ktop);
+                       perfmon_init_event(&pev);
+                       kptr = get_le_u64(kptr, &pev.event);
+                       kptr = get_le_u64(kptr, &pev.flags);
+                       kptr = get_le_u64(kptr, &pev.trigger_count);
+                       kptr = arch_read_core_set(&cset, kptr, ktop);
+
+                       ped = perfmon_open_event(&cset, pc->ps, &pev);
+
+                       pc->resp_size = sizeof(uint32_t);
+                       pc->resp = kmalloc(pc->resp_size, MEM_WAIT);
+                       put_le_u32(pc->resp, (uint32_t) ped);
+                       break;
+               }
+               case PERFMON_CMD_COUNTER_STATUS: {
+                       int i;
+                       uint32_t ped;
+                       uint8_t *rptr;
+                       uint64_t *mvalues;
+                       struct perfmon_status *pef;
+
+                       error_assert(EBADMSG, (kptr + sizeof(uint32_t)) <= ktop);
+                       kptr = get_le_u32(kptr, &ped);
+
+                       pef = perfmon_get_event_status(pc->ps, (int) ped);
+
+                       mvalues = kzmalloc(num_cores * sizeof(mvalues),
+                                          MEM_WAIT);
+                       for (i = 0; i < num_cores; i++)
+                               mvalues[i] = pef->cores_values[i];
+
+                       pc->resp_size = 3 * sizeof(uint64_t) + sizeof(uint32_t) +
+                               num_cores * sizeof(uint64_t);
+                       pc->resp = kmalloc(pc->resp_size, MEM_WAIT);
+
+                       rptr = put_le_u64(pc->resp, pef->ev.event);
+                       rptr = put_le_u64(rptr, pef->ev.flags);
+                       rptr = put_le_u64(rptr, pef->ev.trigger_count);
+                       rptr = put_le_u32(rptr, num_cores);
+                       for (i = 0; i < num_cores; i++)
+                               rptr = put_le_u64(rptr, mvalues[i]);
+                       kfree(mvalues);
+                       perfmon_free_event_status(pef);
+                       break;
+               }
+               case PERFMON_CMD_COUNTER_CLOSE: {
+                       uint32_t ped;
+
+                       error_assert(EBADMSG, (kptr + sizeof(uint32_t)) <= ktop);
+                       kptr = get_le_u32(kptr, &ped);
+
+                       perfmon_close_event(pc->ps, (int) ped);
+                       break;
+               }
+               case PERFMON_CMD_CPU_CAPS: {
+                       uint8_t *rptr;
+                       struct perfmon_cpu_caps pcc;
+
+                       kptr++;
+                       perfmon_get_cpu_caps(&pcc);
+
+                       pc->resp_size = 6 * sizeof(uint32_t);
+                       pc->resp = kmalloc(pc->resp_size, MEM_WAIT);
+
+                       rptr = put_le_u32(pc->resp, pcc.perfmon_version);
+                       rptr = put_le_u32(rptr, pcc.proc_arch_events);
+                       rptr = put_le_u32(rptr, pcc.bits_x_counter);
+                       rptr = put_le_u32(rptr, pcc.counters_x_proc);
+                       rptr = put_le_u32(rptr, pcc.bits_x_fix_counter);
+                       rptr = put_le_u32(rptr, pcc.fix_counters_x_proc);
+                       break;
+               }
+               default:
+                       error(EINVAL, "Invalid perfmon command: 0x%x", kptr[-1]);
+       }
+       poperror();
+       kfree(kdata);
+
+       return (long) (kptr - (const uint8_t *) kdata);
 }
 
-static void
-archclose(struct chan*unused)
+static struct chan *archopen(struct chan *c, int omode)
 {
+       c = devopen(c, omode, archdir, Qmax, devgen);
+       switch ((uint32_t) c->qid.path) {
+               case Qperf:
+                       if (!perfmon_supported())
+                               error(ENODEV, "perf is not supported");
+                       assert(!c->aux);
+                       c->aux = arch_create_perf_context();
+                       break;
+       }
+
+       return c;
 }
 
-enum
+static void archclose(struct chan *c)
 {
-       Linelen= 31,
-};
+       switch ((uint32_t) c->qid.path) {
+               case Qperf:
+                       if (c->aux) {
+                               arch_free_perf_context((struct perf_context *) c->aux);
+                               c->aux = NULL;
+                       }
+                       break;
+       }
+}
 
-static long
-archread(struct chan *c, void *a, long n, int64_t offset)
+static long archread(struct chan *c, void *a, long n, int64_t offset)
 {
        char *buf, *p;
-       int port;
+       int err, port;
+       uint64_t *values;
        uint16_t *sp;
        uint32_t *lp;
-       IOMap *map;
-       Rdwrfn *fn;
-
-       switch((uint32_t)c->qid.path){
-
-       case Qdir:
-               return devdirread(c, a, n, archdir, narchdir, devgen);
-
-       case Qgdb:
-               p = gdbactive ? "1" : "0";
-               return readstr(offset, a, n, p);
-       case Qiob:
-               port = offset;
-               checkport(offset, offset+n);
-               for(p = a; port < offset+n; port++)
-                       *p++ = inb(port);
-               return n;
-
-       case Qiow:
-               if(n & 1)
-                       error(Ebadarg);
-               checkport(offset, offset+n);
-               sp = a;
-               for(port = offset; port < offset+n; port += 2)
-                       *sp++ = inw(port);
-               return n;
-
-       case Qiol:
-               if(n & 3)
-                       error(Ebadarg);
-               checkport(offset, offset+n);
-               lp = a;
-               for(port = offset; port < offset+n; port += 4)
-                       *lp++ = inl(port);
-               return n;
-
-       case Qioalloc:
-               break;
-
-       default:
-               if(c->qid.path < narchdir && (fn = readfn[c->qid.path]))
-                       return fn(c, a, n, offset);
-               error(Eperm);
-               break;
+       struct io_map *map;
+       struct core_set cset;
+       struct msr_address msra;
+       struct msr_value msrv;
+
+       switch ((uint32_t) c->qid.path) {
+               case Qdir:
+                       return devdirread(c, a, n, archdir, Qmax, devgen);
+               case Qgdb:
+                       p = gdbactive ? "1" : "0";
+                       return readstr(offset, a, n, p);
+               case Qiob:
+                       port = offset;
+                       checkport(offset, offset + n);
+                       for (p = a; port < offset + n; port++)
+                               *p++ = inb(port);
+                       return n;
+               case Qiow:
+                       if (n & 1)
+                               error(EINVAL, ERROR_FIXME);
+                       checkport(offset, offset + n);
+                       sp = a;
+                       for (port = offset; port < offset + n; port += 2)
+                               *sp++ = inw(port);
+                       return n;
+               case Qiol:
+                       if (n & 3)
+                               error(EINVAL, ERROR_FIXME);
+                       checkport(offset, offset + n);
+                       lp = a;
+                       for (port = offset; port < offset + n; port += 4)
+                               *lp++ = inl(port);
+                       return n;
+               case Qioalloc:
+                       break;
+               case Qrealmem:
+                       return readmem(offset, a, n, KADDR(0), REAL_MEM_SIZE);
+               case Qmsr:
+                       if (!address_range_find(msr_rd_wlist, ARRAY_SIZE(msr_rd_wlist),
+                                               (uintptr_t) offset))
+                               error(EPERM, "MSR 0x%x not in read whitelist", offset);
+                       core_set_init(&cset);
+                       core_set_fill_available(&cset);
+                       msr_set_address(&msra, (uint32_t) offset);
+                       values = kzmalloc(num_cores * sizeof(uint64_t),
+                                         MEM_WAIT);
+                       if (!values)
+                               error(ENOMEM, ERROR_FIXME);
+                       msr_set_values(&msrv, values, num_cores);
+
+                       err = msr_cores_read(&cset, &msra, &msrv);
+
+                       if (likely(!err)) {
+                               if (n >= num_cores * sizeof(uint64_t)) {
+                                       if (!memcpy_to_user_errno(current, a, values,
+                                                                 num_cores * sizeof(uint64_t)))
+                                               n = num_cores * sizeof(uint64_t);
+                                       else
+                                               n = -1;
+                               } else {
+                                       kfree(values);
+                                       error(ERANGE, "Not enough space for MSR read");
+                               }
+                       } else {
+                               switch (-err) {
+                               case (EFAULT):
+                                       error(-err, "read_msr() faulted on MSR 0x%x", offset);
+                               case (ERANGE):
+                                       error(-err, "Not enough space for MSR read");
+                               };
+                               error(-err, "MSR read failed");
+                       }
+                       kfree(values);
+                       return n;
+               case Qperf: {
+                       struct perf_context *pc = (struct perf_context *) c->aux;
+
+                       assert(pc);
+                       if (pc->resp && ((size_t) offset < pc->resp_size)) {
+                               n = MIN(n, (long) pc->resp_size - (long) offset);
+                               if (memcpy_to_user_errno(current, a, pc->resp + offset, n))
+                                       n = -1;
+                       } else {
+                               n = 0;
+                       }
+
+                       return n;
+               }
+               default:
+                       error(EINVAL, ERROR_FIXME);
        }
 
-       if((buf = kzmalloc(n, 0)) == NULL)
-               error(Enomem);
+       if ((buf = kzmalloc(n, 0)) == NULL)
+               error(ENOMEM, ERROR_FIXME);
        p = buf;
-       n = n/Linelen;
-       offset = offset/Linelen;
-
-       switch((uint32_t)c->qid.path){
-       case Qioalloc:
-               spin_lock(&(&iomap)->lock);
-               for(map = iomap.map; n > 0 && map != NULL; map = map->next){
-                       if(offset-- > 0)
-                               continue;
-                       snprintf(p, n*Linelen, "%#8p %#8p %-12.12s\n", map->start, map->end-1, map->tag);
-                       p += Linelen;
-                       n--;
-               }
-               spin_unlock(&(&iomap)->lock);
-               break;
-       case Qmapram:
-               error("Not yet");
-               break;
+       n = n / Linelen;
+       offset = offset / Linelen;
+
+       switch ((uint32_t) c->qid.path) {
+               case Qioalloc:
+                       spin_lock(&(&iomap)->lock);
+                       for (map = iomap.map; n > 0 && map != NULL; map = map->next) {
+                               if (offset-- > 0)
+                                       continue;
+                               snprintf(p, n * Linelen, "%#8p %#8p %-12.12s\n", map->start,
+                                        map->end - 1, map->tag);
+                               p += Linelen;
+                               n--;
+                       }
+                       spin_unlock(&(&iomap)->lock);
+                       break;
        }
 
        n = p - buf;
@@ -434,163 +625,120 @@ archread(struct chan *c, void *a, long n, int64_t offset)
        return n;
 }
 
-static long
-archwrite(struct chan *c, void *a, long n, int64_t offset)
+static long archwrite(struct chan *c, void *a, long n, int64_t offset)
 {
        char *p;
-       int port;
+       int port, err;
+       uint64_t value;
        uint16_t *sp;
        uint32_t *lp;
-       Rdwrfn *fn;
-
-       switch((uint32_t)c->qid.path){
-
-       case Qgdb:
-               p = a;
-               if (n != 1)
-                       error("Gdb: Write one byte, '1' or '0'");
-               if (*p == '1')
-                       gdbactive = 1;
-               else if (*p == '0')
-                       gdbactive = 0;
-               else
-                       error("Gdb: must be 1 or 0");
-               return 1;
-
-       case Qiob:
-               p = a;
-               checkport(offset, offset+n);
-               for(port = offset; port < offset+n; port++)
-                       outb(port, *p++);
-               return n;
-
-       case Qiow:
-               if(n & 1)
-                       error(Ebadarg);
-               checkport(offset, offset+n);
-               sp = a;
-               for(port = offset; port < offset+n; port += 2)
-                       outw(port, *sp++);
-               return n;
-
-       case Qiol:
-               if(n & 3)
-                       error(Ebadarg);
-               checkport(offset, offset+n);
-               lp = a;
-               for(port = offset; port < offset+n; port += 4)
-                       outl(port, *lp++);
-               return n;
-
-       default:
-               if(c->qid.path < narchdir && (fn = writefn[c->qid.path]))
-                       return fn(c, a, n, offset);
-               error(Eperm);
-               break;
-       }
-       return 0;
-}
-
-struct dev archdevtab = {
-       'P',
-       "arch",
-
-       devreset,
-       devinit,
-       devshutdown,
-       archattach,
-       archwalk,
-       archstat,
-       archopen,
-       devcreate,
-       archclose,
-       archread,
-       devbread,
-       archwrite,
-       devbwrite,
-       devremove,
-       devwstat,
-};
-
-/*
- */
-void
-nop(void)
-{
-}
-
-//void (*coherence)(void) = mfence;
-#warning "need memory fence"
-#define coherence()
-
-
-static long
-cputyperead(struct chan*unused, void *a, long n, int64_t off)
-{
-       char buf[512], *s, *e;
-       int i, k;
-       error("unimplemented");
-#if 0
-       e = buf+sizeof buf;
-       s = seprintf(buf, e, "%s %d\n", "AMD64", 0);
-       k = m->ncpuinfoe - m->ncpuinfos;
-       if(k > 4)
-               k = 4;
-       for(i = 0; i < k; i++)
-               s = seprintf(s, e, "%#8.8ux %#8.8ux %#8.8ux %#8.8ux\n",
-                       m->cpuinfo[i][0], m->cpuinfo[i][1],
-                       m->cpuinfo[i][2], m->cpuinfo[i][3]);
-       return readstr(off, a, n, buf);
-#endif
-}
+       struct core_set cset;
+       struct msr_address msra;
+       struct msr_value msrv;
+
+       switch ((uint32_t) c->qid.path) {
+               case Qgdb:
+                       p = a;
+                       if (n != 1)
+                               error(EINVAL, "Gdb: Write one byte, '1' or '0'");
+                       if (*p == '1')
+                               gdbactive = 1;
+                       else if (*p == '0')
+                               gdbactive = 0;
+                       else
+                               error(EINVAL, "Gdb: must be 1 or 0");
+                       return 1;
+               case Qiob:
+                       p = a;
+                       checkport(offset, offset + n);
+                       for (port = offset; port < offset + n; port++)
+                               outb(port, *p++);
+                       return n;
+               case Qiow:
+                       if (n & 1)
+                               error(EINVAL, ERROR_FIXME);
+                       checkport(offset, offset + n);
+                       sp = a;
+                       for (port = offset; port < offset + n; port += 2)
+                               outw(port, *sp++);
+                       return n;
+               case Qiol:
+                       if (n & 3)
+                               error(EINVAL, ERROR_FIXME);
+                       checkport(offset, offset + n);
+                       lp = a;
+                       for (port = offset; port < offset + n; port += 4)
+                               outl(port, *lp++);
+                       return n;
+               case Qmsr:
+                       if (!address_range_find(msr_wr_wlist, ARRAY_SIZE(msr_wr_wlist),
+                                               (uintptr_t) offset))
+                               error(EPERM, "MSR 0x%x not in write whitelist", offset);
+                       if (n != sizeof(uint64_t))
+                               error(EINVAL, "Tried to write more than a u64 (%p)", n);
+                       if (memcpy_from_user_errno(current, &value, a, sizeof(value)))
+                               return -1;
+
+                       core_set_init(&cset);
+                       core_set_fill_available(&cset);
+                       msr_set_address(&msra, (uint32_t) offset);
+                       msr_set_value(&msrv, value);
+
+                       err = msr_cores_write(&cset, &msra, &msrv);
+                       if (unlikely(err)) {
+                               switch (-err) {
+                               case (EFAULT):
+                                       error(-err, "write_msr() faulted on MSR 0x%x", offset);
+                               case (ERANGE):
+                                       error(-err, "Not enough space for MSR write");
+                               };
+                               error(-err, "MSR write failed");
+                       }
+                       return sizeof(uint64_t);
+               case Qperf: {
+                       struct perf_context *pc = (struct perf_context *) c->aux;
 
+                       assert(pc);
 
-static long
-rmemrw(int isr, void *a, long n, int64_t off)
-{
-       if(off < 0)
-               error("offset must be >= 0");
-       if(n < 0)
-               error("count must be >= 0");
-       if(isr){
-               if(off >= MB)
-                       error("offset must be < 1MB");
-               if(off+n >= MB)
-                       n = MB - off;
-               memmove(a, KADDR((uint32_t)off), n);
-       } else {
-               /* realmode buf page ok, allow vga framebuf's access */
-               if(off >= MB)
-                       error("offset must be < 1MB");
-               if(off+n > MB && (off < 0xA0000 || off+n > 0xB0000+0x10000))
-                       error("bad offset/count in write");
-               memmove(KADDR((uint32_t)off), a, n);
+                       return arch_perf_write(pc, a, n);
+               }
+               default:
+                       error(EINVAL, ERROR_FIXME);
        }
-       return n;
+       return 0;
 }
 
-static long
-rmemread(struct chan*unused, void *a, long n, int64_t off)
+static void archinit(void)
 {
-       return rmemrw(1, a, n, off);
-}
+       int ret;
 
-static long
-rmemwrite(struct chan*unused, void *a, long n, int64_t off)
-{
-       return rmemrw(0, a, n, off);
+       ret = address_range_init(msr_rd_wlist, ARRAY_SIZE(msr_rd_wlist));
+       assert(!ret);
+       ret = address_range_init(msr_wr_wlist, ARRAY_SIZE(msr_wr_wlist));
+       assert(!ret);
 }
 
-void
-archinit(void)
-{
-       spinlock_init(&archwlock);
-       addarchfile("cputype", 0444, cputyperead, NULL);
-       addarchfile("realmodemem", 0660, rmemread, rmemwrite);
-}
+struct dev archdevtab __devtab = {
+       .name = "arch",
+
+       .reset = devreset,
+       .init = archinit,
+       .shutdown = devshutdown,
+       .attach = archattach,
+       .walk = archwalk,
+       .stat = archstat,
+       .open = archopen,
+       .create = devcreate,
+       .close = archclose,
+       .read = archread,
+       .bread = devbread,
+       .write = archwrite,
+       .bwrite = devbwrite,
+       .remove = devremove,
+       .wstat = devwstat,
+};
 
-void
-archreset(void)
+void archreset(void)
 {
        int i;
 
@@ -600,15 +748,15 @@ archreset(void)
         * The reset register (0xcf9) is usually in one of the bridge
         * chips. The actual location and sequence could be extracted from
         * ACPI but why bother, this is the end of the line anyway.
-       print("Takes a licking and keeps on ticking...\n");
+        print("Takes a licking and keeps on ticking...\n");
         */
-       i = inb(0xcf9);                                 /* ICHx reset control */
+       i = inb(0xcf9); /* ICHx reset control */
        i &= 0x06;
-       outb(0xcf9, i|0x02);                            /* SYS_RST */
+       outb(0xcf9, i | 0x02);  /* SYS_RST */
        udelay(1000);
-       outb(0xcf9, i|0x06);                            /* RST_CPU transition */
+       outb(0xcf9, i | 0x06);  /* RST_CPU transition */
 
-       udelay(100*1000);
+       udelay(100 * 1000);
 
        /* some broken hardware -- as well as qemu -- might
         * never reboot anyway with cf9. This is a standard
@@ -616,10 +764,9 @@ archreset(void)
         * broken stuff -- like qemu. If there is no
         * keyboard it will do no harm.
         */
-       for(;;){
-               (void) inb(0x64);
+       for (;;) {
+               (void)inb(0x64);
                outb(0x64, 0xFE);
-               udelay(100*1000);
+               udelay(100 * 1000);
        }
 }
-