9ns: Fix devtab function pointer signatures
[akaros.git] / kern / drivers / dev / kprof.c
index 69b05b1..cbc7b1c 100644 (file)
  * in the LICENSE file.
  */
 
  * in the LICENSE file.
  */
 
-// get_fn_name is slowing down the kprocread
-//     have an array of translated fns
-//     or a "next" iterator, since we're walking in order
-//
-// irqsave locks
-//
-// kprof struct should be a ptr, have them per core
-//             we'll probably need to track the length still, so userspace knows how
-//             big it is
-//
-//             will also want more files in the kprof dir for each cpu or something
-//
-// maybe don't use slot 0 and 1 as total and 'not kernel' ticks
-//
-// fix the failed assert XXX
-
+#include <ros/profiler_records.h>
+#include <arch/time.h>
 #include <vfs.h>
 #include <vfs.h>
-#include <kfs.h>
 #include <slab.h>
 #include <kmalloc.h>
 #include <kref.h>
 #include <slab.h>
 #include <kmalloc.h>
 #include <kref.h>
+#include <atomic.h>
+#include <kthread.h>
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
 #include <error.h>
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
 #include <error.h>
-#include <cpio.h>
 #include <pmap.h>
 #include <smp.h>
 #include <pmap.h>
 #include <smp.h>
-#include <ip.h>
+#include <time.h>
+#include <circular_buffer.h>
+#include <umem.h>
 #include <profiler.h>
 #include <profiler.h>
+#include <kprof.h>
+#include <ros/procinfo.h>
+#include <init.h>
 
 
-struct dev kprofdevtab;
-
-static char *devname(void)
-{
-       return kprofdevtab.name;
-}
-
-#define LRES   3               /* log of PC resolution */
-#define CELLSIZE       8       /* sizeof of count cell */
-
-struct kprof
-{
-       uintptr_t       minpc;
-       uintptr_t       maxpc;
-       int     nbuf;
-       int     time;
-       uint64_t        *buf;   /* keep in sync with cellsize */
-       size_t          buf_sz;
-       spinlock_t lock;
-       struct queue *systrace;
-       bool            mpstat_ipi;
-};
-struct kprof kprof;
+#define KTRACE_BUFFER_SIZE (128 * 1024)
+#define TRACE_PRINTK_BUFFER_SIZE (8 * 1024)
 
 
-/* output format. Nice fixed size. That makes it seekable.
- * small subtle bit here. You have to convert offset FROM FORMATSIZE units
- * to CELLSIZE units in a few places.
- */
-char *outformat = "%016llx %29.29s %016llx\n";
-#define FORMATSIZE 64
-enum{
+enum {
        Kprofdirqid = 0,
        Kprofdataqid,
        Kprofctlqid,
        Kprofdirqid = 0,
        Kprofdataqid,
        Kprofctlqid,
-       Kprofoprofileqid,
+       Kptracectlqid,
        Kptraceqid,
        Kprintxqid,
        Kmpstatqid,
        Kmpstatrawqid,
 };
 
        Kptraceqid,
        Kprintxqid,
        Kmpstatqid,
        Kmpstatrawqid,
 };
 
-struct dirtab kproftab[]={
-       {".",           {Kprofdirqid, 0, QTDIR},0,      DMDIR|0550},
-       {"kpdata",      {Kprofdataqid},         0,      0600},
-       {"kpctl",       {Kprofctlqid},          0,      0600},
-       {"kpoprofile",  {Kprofoprofileqid},     0,      0600},
-       {"kptrace",     {Kptraceqid},           0,      0600},
-       {"kprintx",     {Kprintxqid},           0,      0600},
-       {"mpstat",      {Kmpstatqid},           0,      0600},
-       {"mpstat-raw",  {Kmpstatrawqid},                0,      0600},
+struct trace_printk_buffer {
+       atomic_t in_use;
+       char buffer[TRACE_PRINTK_BUFFER_SIZE];
 };
 
 };
 
-static size_t mpstatraw_len(void);
-static size_t mpstat_len(void);
+struct kprof {
+       qlock_t lock;
+       bool mpstat_ipi;
+       bool profiling;
+       bool opened;
+};
+
+struct dev kprofdevtab;
+struct dirtab kproftab[] = {
+       {".",                   {Kprofdirqid,           0, QTDIR}, 0,   DMDIR|0550},
+       {"kpdata",              {Kprofdataqid},         0,      0600},
+       {"kpctl",               {Kprofctlqid},          0,      0600},
+       {"kptrace_ctl", {Kptracectlqid},        0,      0660},
+       {"kptrace",             {Kptraceqid},           0,      0600},
+       {"kprintx",             {Kprintxqid},           0,      0600},
+       {"mpstat",              {Kmpstatqid},           0,      0600},
+       {"mpstat-raw",  {Kmpstatrawqid},        0,      0600},
+};
 
 
-static struct alarm_waiter *oprof_alarms;
-static unsigned int oprof_timer_period = 1000;
+static struct kprof kprof;
+static bool ktrace_init_done = FALSE;
+static spinlock_t ktrace_lock = SPINLOCK_INITIALIZER_IRQSAVE;
+static struct circular_buffer ktrace_data;
+static char ktrace_buffer[KTRACE_BUFFER_SIZE];
+static char kprof_control_usage[128];
 
 
-static void oprof_alarm_handler(struct alarm_waiter *waiter,
-                                struct hw_trapframe *hw_tf)
+static size_t mpstat_len(void)
 {
 {
-       int coreid = core_id();
-       struct timer_chain *tchain = &per_cpu_info[coreid].tchain;
+       size_t each_row = 7 + NR_CPU_STATES * 26;
 
 
-       profiler_add_hw_sample(hw_tf);
-       reset_alarm_rel(tchain, waiter, oprof_timer_period);
+       return each_row * (num_cores + 1) + 1;
 }
 
 }
 
-static struct chan*
-kprofattach(char *spec)
+static size_t mpstatraw_len(void)
 {
 {
-       // Did we initialise completely?
-       if ( !(oprof_alarms && kprof.buf && kprof.systrace) )
-               error(ENOMEM, NULL);
+       size_t header_row = 27 + NR_CPU_STATES * 7 + 1;
+       size_t cpu_row = 7 + NR_CPU_STATES * 17;
 
 
-       return devattach(devname(), spec);
+       return header_row + cpu_row * num_cores + 1;
 }
 
 }
 
-static void
-kproftimer(uintptr_t pc)
+static char *devname(void)
 {
 {
-       if(kprof.time == 0)
-               return;
-
-       /*
-        * if the pc corresponds to the idle loop, don't consider it.
+       return kprofdevtab.name;
+}
 
 
-       if(m->inidle)
-               return;
-        */
-       /*
-        *  if the pc is coming out of spllo or splx,
-        *  use the pc saved when we went splhi.
+static struct chan *kprof_attach(char *spec)
+{
+       return devattach(devname(), spec);
+}
 
 
-       if(pc>=PTR2UINT(spllo) && pc<=PTR2UINT(spldone))
-               pc = m->splpc;
-        */
+/* Start collecting samples from perf events into the profiler.
+ *
+ * This command only runs if the user successfully opened kpctl, which gives
+ * them a profiler (the global profiler, for now). */
+static void kprof_start_profiler(void)
+{
+       ERRSTACK(1);
 
 
-//     ilock(&kprof);
-       /* this is weird. What we do is assume that all the time since the last
-        * measurement went into this PC. It's the best
-        * we can do I suppose. And we are sampling at 1 ms. for now.
-        * better ideas welcome.
-        */
-       kprof.buf[0] += 1; //Total count of ticks.
-       if(kprof.minpc<=pc && pc<kprof.maxpc){
-               pc -= kprof.minpc;
-               pc >>= LRES;
-               kprof.buf[pc] += 1;
-       }else
-               kprof.buf[1] += 1; // Why?
-//     iunlock(&kprof);
+       qlock(&kprof.lock);
+       if (waserror()) {
+               qunlock(&kprof.lock);
+               nexterror();
+       }
+       if (!kprof.profiling) {
+               profiler_start();
+               kprof.profiling = TRUE;
+       }
+       poperror();
+       qunlock(&kprof.lock);
 }
 
 }
 
-static void setup_timers(void)
+/* Stops collecting samples from perf events.
+ *
+ * This command only runs if the user successfully opened kpctl, which gives
+ * them a profiler (the global profiler, for now). */
+static void kprof_stop_profiler(void)
 {
 {
-       void kprof_alarm(struct alarm_waiter *waiter, struct hw_trapframe *hw_tf)
-       {
-               struct timer_chain *tchain = &per_cpu_info[core_id()].tchain;
-               kproftimer(get_hwtf_pc(hw_tf));
-               set_awaiter_rel(waiter, 1000);
-               set_alarm(tchain, waiter);
+       ERRSTACK(1);
+
+       qlock(&kprof.lock);
+       if (waserror()) {
+               qunlock(&kprof.lock);
+               nexterror();
+       }
+       if (kprof.profiling) {
+               profiler_stop();
+               kprof.profiling = FALSE;
        }
        }
-       struct timer_chain *tchain = &per_cpu_info[core_id()].tchain;
-       struct alarm_waiter *waiter = kmalloc(sizeof(struct alarm_waiter), 0);
-       init_awaiter_irq(waiter, kprof_alarm);
-       set_awaiter_rel(waiter, 1000);
-       set_alarm(tchain, waiter);
+       poperror();
+       qunlock(&kprof.lock);
 }
 
 }
 
-static void kprofinit(void)
+/* Makes each core flush its results into the profiler queue.  You can do this
+ * while the profiler is still running.  However, this does not hang up the
+ * queue, so reads on kpdata will block. */
+static void kprof_flush_profiler(void)
 {
 {
-       uint32_t n;
-
-       static_assert(CELLSIZE == sizeof kprof.buf[0]); // kprof size
+       ERRSTACK(1);
 
 
-       /* allocate when first used */
-       kprof.minpc = KERN_LOAD_ADDR;
-       kprof.maxpc = (uintptr_t) &etext;
-       kprof.nbuf = (kprof.maxpc-kprof.minpc) >> LRES;
-       n = kprof.nbuf*CELLSIZE;
-       kprof.buf = kzmalloc(n, KMALLOC_WAIT);
-       if (kprof.buf)
-               kprof.buf_sz = n;
+       qlock(&kprof.lock);
+       if (waserror()) {
+               qunlock(&kprof.lock);
+               nexterror();
+       }
+       if (kprof.profiling)
+               profiler_trace_data_flush();
+       poperror();
+       qunlock(&kprof.lock);
+}
 
 
-       /* no, i'm not sure how we should do this yet. */
+static void kprof_init(void)
+{
        profiler_init();
        profiler_init();
-       oprof_alarms = kzmalloc(sizeof(struct alarm_waiter) * num_cores,
-                               KMALLOC_WAIT);
-       if (!oprof_alarms)
-               error(ENOMEM, NULL);
 
 
-       for (int i = 0; i < num_cores; i++)
-               init_awaiter_irq(&oprof_alarms[i], oprof_alarm_handler);
+       qlock_init(&kprof.lock);
+       kprof.profiling = FALSE;
+       kprof.opened = FALSE;
 
 
-       kprof.systrace = qopen(2 << 20, 0, 0, 0);
-       if (!kprof.systrace) {
-               printk("systrace allocate failed. No system call tracing\n");
-       }
-       kprof.mpstat_ipi = TRUE;
+       for (int i = 0; i < ARRAY_SIZE(kproftab); i++)
+               kproftab[i].length = 0;
 
 
-       kproftab[Kprofdataqid].length = kprof.nbuf * FORMATSIZE;
+       kprof.mpstat_ipi = TRUE;
        kproftab[Kmpstatqid].length = mpstat_len();
        kproftab[Kmpstatrawqid].length = mpstatraw_len();
        kproftab[Kmpstatqid].length = mpstat_len();
        kproftab[Kmpstatrawqid].length = mpstatraw_len();
+
+       strlcpy(kprof_control_usage, "start|stop|flush",
+               sizeof(kprof_control_usage));
+       profiler_append_configure_usage(kprof_control_usage,
+                                       sizeof(kprof_control_usage));
 }
 
 }
 
-static void kprofshutdown(void)
+static void kprof_shutdown(void)
 {
 {
-       kfree(oprof_alarms); oprof_alarms = NULL;
-       kfree(kprof.buf); kprof.buf = NULL;
-       qfree(kprof.systrace); kprof.systrace = NULL;
-       profiler_cleanup();
 }
 
 }
 
-static struct walkqid*
-kprofwalk(struct chan *c, struct chan *nc, char **name, int nname)
+static struct walkqid *kprof_walk(struct chan *c, struct chan *nc, char **name,
+                                  unsigned int nname)
 {
        return devwalk(c, nc, name, nname, kproftab, ARRAY_SIZE(kproftab), devgen);
 }
 
 {
        return devwalk(c, nc, name, nname, kproftab, ARRAY_SIZE(kproftab), devgen);
 }
 
-static int
-kprofstat(struct chan *c, uint8_t *db, int n)
+static size_t kprof_profdata_size(void)
 {
 {
-       kproftab[Kprofoprofileqid].length = profiler_size();
-       if (kprof.systrace)
-               kproftab[Kptraceqid].length = qlen(kprof.systrace);
-       else
-               kproftab[Kptraceqid].length = 0;
+       return profiler_size();
+}
+
+static long kprof_profdata_read(void *dest, long size, int64_t off)
+{
+       return profiler_read(dest, size);
+}
+
+static size_t kprof_stat(struct chan *c, uint8_t *db, size_t n)
+{
+       kproftab[Kprofdataqid].length = kprof_profdata_size();
+       kproftab[Kptraceqid].length = kprof_tracedata_size();
 
        return devstat(c, db, n, kproftab, ARRAY_SIZE(kproftab), devgen);
 }
 
 
        return devstat(c, db, n, kproftab, ARRAY_SIZE(kproftab), devgen);
 }
 
-static struct chan*
-kprofopen(struct chan *c, int omode)
+static struct chan *kprof_open(struct chan *c, int omode)
 {
 {
-       if(c->qid.type & QTDIR){
-               if(openmode(omode) != O_READ)
-                       error(EPERM, NULL);
+       if (c->qid.type & QTDIR) {
+               if (openmode(omode) != O_READ)
+                       error(EPERM, ERROR_FIXME);
+       }
+       switch ((int) c->qid.path) {
+       case Kprofctlqid:
+               /* We have one global profiler.  Only one FD may be opened at a time for
+                * it.  If we ever have separate profilers, we can create the profiler
+                * here, and every open would get a separate instance. */
+               qlock(&kprof.lock);
+               if (kprof.opened) {
+                       qunlock(&kprof.lock);
+                       error(EBUSY, "Global profiler is already open");
+               }
+               kprof.opened = TRUE;
+               /* TODO: have a real creation function for a non-global profiler */
+               profiler_setup();
+               qunlock(&kprof.lock);
+               break;
        }
        c->mode = openmode(omode);
        c->flag |= COPEN;
        }
        c->mode = openmode(omode);
        c->flag |= COPEN;
@@ -243,21 +236,25 @@ kprofopen(struct chan *c, int omode)
        return c;
 }
 
        return c;
 }
 
-static void
-kprofclose(struct chan*unused)
-{
-}
-
-static size_t mpstat_len(void)
+static void kprof_close(struct chan *c)
 {
 {
-       size_t each_row = 7 + NR_CPU_STATES * 26;
-       return each_row * (num_cores + 1) + 1;
+       if (c->flag & COPEN) {
+               switch ((int) c->qid.path) {
+               case Kprofctlqid:
+                       kprof_stop_profiler();
+                       qlock(&kprof.lock);
+                       profiler_cleanup();
+                       kprof.opened = FALSE;
+                       qunlock(&kprof.lock);
+                       break;
+               }
+       }
 }
 
 static long mpstat_read(void *va, long n, int64_t off)
 {
        size_t bufsz = mpstat_len();
 }
 
 static long mpstat_read(void *va, long n, int64_t off)
 {
        size_t bufsz = mpstat_len();
-       char *buf = kmalloc(bufsz, KMALLOC_WAIT);
+       char *buf = kmalloc(bufsz, MEM_WAIT);
        int len = 0;
        struct per_cpu_info *pcpui;
        uint64_t cpu_total;
        int len = 0;
        struct per_cpu_info *pcpui;
        uint64_t cpu_total;
@@ -280,7 +277,7 @@ static long mpstat_read(void *va, long n, int64_t off)
                        cpu_total += pcpui->state_ticks[j];
                cpu_total = MAX(cpu_total, 1);  /* for the divide later */
                for (int j = 0; j < NR_CPU_STATES; j++) {
                        cpu_total += pcpui->state_ticks[j];
                cpu_total = MAX(cpu_total, 1);  /* for the divide later */
                for (int j = 0; j < NR_CPU_STATES; j++) {
-                       tsc2timespec(pcpui->state_ticks[j], &ts);
+                       ts = tsc2timespec(pcpui->state_ticks[j]);
                        len += snprintf(buf + len, bufsz - len, "%10d.%06d (%3d%%)%s",
                                        ts.tv_sec, ts.tv_nsec / 1000,
                                        MIN((pcpui->state_ticks[j] * 100) / cpu_total, 100),
                        len += snprintf(buf + len, bufsz - len, "%10d.%06d (%3d%%)%s",
                                        ts.tv_sec, ts.tv_nsec / 1000,
                                        MIN((pcpui->state_ticks[j] * 100) / cpu_total, 100),
@@ -292,17 +289,10 @@ static long mpstat_read(void *va, long n, int64_t off)
        return n;
 }
 
        return n;
 }
 
-static size_t mpstatraw_len(void)
-{
-       size_t header_row = 27 + NR_CPU_STATES * 7 + 1;
-       size_t cpu_row = 7 + NR_CPU_STATES * 17;
-       return header_row + cpu_row * num_cores + 1;
-}
-
 static long mpstatraw_read(void *va, long n, int64_t off)
 {
        size_t bufsz = mpstatraw_len();
 static long mpstatraw_read(void *va, long n, int64_t off)
 {
        size_t bufsz = mpstatraw_len();
-       char *buf = kmalloc(bufsz, KMALLOC_WAIT);
+       char *buf = kmalloc(bufsz, MEM_WAIT);
        int len = 0;
        struct per_cpu_info *pcpui;
 
        int len = 0;
        struct per_cpu_info *pcpui;
 
@@ -311,7 +301,7 @@ static long mpstatraw_read(void *va, long n, int64_t off)
 
        /* header line: version, num_cores, tsc freq, state names */
        len += snprintf(buf + len, bufsz - len, "v%03d %5d %16llu", 1, num_cores,
 
        /* header line: version, num_cores, tsc freq, state names */
        len += snprintf(buf + len, bufsz - len, "v%03d %5d %16llu", 1, num_cores,
-                       system_timing.tsc_freq);
+                       __proc_global_info.tsc_freq);
        for (int j = 0; j < NR_CPU_STATES; j++)
                len += snprintf(buf + len, bufsz - len, " %6s", cpu_state_names[j]);
        len += snprintf(buf + len, bufsz - len, "\n");
        for (int j = 0; j < NR_CPU_STATES; j++)
                len += snprintf(buf + len, bufsz - len, " %6s", cpu_state_names[j]);
        len += snprintf(buf + len, bufsz - len, "\n");
@@ -330,90 +320,21 @@ static long mpstatraw_read(void *va, long n, int64_t off)
        return n;
 }
 
        return n;
 }
 
-static long
-kprofread(struct chan *c, void *va, long n, int64_t off)
+static size_t kprof_read(struct chan *c, void *va, size_t n, off64_t off)
 {
        uint64_t w, *bp;
        char *a, *ea;
        uintptr_t offset = off;
        uint64_t pc;
 {
        uint64_t w, *bp;
        char *a, *ea;
        uintptr_t offset = off;
        uint64_t pc;
-       int snp_ret, ret = 0;
 
 
-       switch((int)c->qid.path){
+       switch ((int) c->qid.path) {
        case Kprofdirqid:
                return devdirread(c, va, n, kproftab, ARRAY_SIZE(kproftab), devgen);
        case Kprofdirqid:
                return devdirread(c, va, n, kproftab, ARRAY_SIZE(kproftab), devgen);
-
        case Kprofdataqid:
        case Kprofdataqid:
-
-               if (n < FORMATSIZE){
-                       n = 0;
-                       break;
-               }
-               a = va;
-               ea = a + n;
-
-               /* we check offset later before deref bp.  offset / FORMATSIZE is how
-                * many entries we're skipping/offsetting. */
-               bp = kprof.buf + offset/FORMATSIZE;
-               pc = kprof.minpc + ((offset/FORMATSIZE)<<LRES);
-               while((a < ea) && (n >= FORMATSIZE)){
-                       /* what a pain. We need to manage the
-                        * fact that the *prints all make room for
-                        * \0
-                        */
-                       char print[FORMATSIZE+1];
-                       char *name;
-                       int amt_read;
-
-                       if (pc >= kprof.maxpc)
-                               break;
-                       /* pc is also our exit for bp.  should be in lockstep */
-                       // XXX this assert fails, fix it!
-                       //assert(bp < kprof.buf + kprof.nbuf);
-                       /* do not attempt to filter these results based on w < threshold.
-                        * earlier, we computed bp/pc based on assuming a full-sized file,
-                        * and skipping entries will result in read() calls thinking they
-                        * received earlier entries when they really received later ones.
-                        * imagine a case where there are 1000 skipped items, and read()
-                        * asks for chunks of 32.  it'll get chunks of the next 32 valid
-                        * items, over and over (1000/32 times). */
-                       w = *bp++;
-
-                       if (pc == kprof.minpc)
-                               name = "Total";
-                       else if (pc == kprof.minpc + 8)
-                               name = "User";
-                       else
-                               name = get_fn_name(pc);
-
-                       snp_ret = snprintf(print, sizeof(print), outformat, pc, name, w);
-                       assert(snp_ret == FORMATSIZE);
-                       if ((pc != kprof.minpc) && (pc != kprof.minpc + 8))
-                               kfree(name);
-
-                       amt_read = readmem(offset % FORMATSIZE, a, n, print, FORMATSIZE);
-                       offset = 0;     /* future loops have no offset */
-
-                       a += amt_read;
-                       n -= amt_read;
-                       ret += amt_read;
-
-                       pc += (1 << LRES);
-               }
-               n = ret;
-               break;
-       case Kprofoprofileqid:
-               n = profiler_read(va, n);
+               n = kprof_profdata_read(va, n, off);
                break;
        case Kptraceqid:
                break;
        case Kptraceqid:
-               if (kprof.systrace) {
-                       printd("Kptraceqid: kprof.systrace %p len %p\n", kprof.systrace, qlen(kprof.systrace));
-                       if (qlen(kprof.systrace) > 0)
-                               n = qread(kprof.systrace, va, n);
-                       else
-                               n = 0;
-               } else
-                       error(EFAIL, "no systrace queue");
+               n = kprof_tracedata_read(va, n, off);
                break;
        case Kprintxqid:
                n = readstr(offset, va, n, printx_on ? "on" : "off");
                break;
        case Kprintxqid:
                n = readstr(offset, va, n, printx_on ? "on" : "off");
@@ -431,111 +352,66 @@ kprofread(struct chan *c, void *va, long n, int64_t off)
        return n;
 }
 
        return n;
 }
 
-static void kprof_clear(struct kprof *kp)
-{
-       spin_lock(&kp->lock);
-       memset(kp->buf, 0, kp->buf_sz);
-       spin_unlock(&kp->lock);
-}
-
-static void manage_oprof_timer(int coreid, struct cmdbuf *cb)
-{
-       struct timer_chain *tchain = &per_cpu_info[coreid].tchain;
-       struct alarm_waiter *waiter = &oprof_alarms[coreid];
-       if (!strcmp(cb->f[2], "on")) {
-               /* pcpu waiters already inited.  will set/reset each time (1 ms
-                * default). */
-               reset_alarm_rel(tchain, waiter, oprof_timer_period);
-       } else if (!strcmp(cb->f[2], "off")) {
-               /* since the alarm handler runs and gets reset within IRQ context, then
-                * we should never fail to cancel the alarm if it was already running
-                * (tchain locks synchronize us).  but it might not be set at all, which
-                * is fine. */
-               unset_alarm(tchain, waiter);
-       } else {
-               error(EFAIL, "optimer needs on|off");
-       }
-}
-
-static long
-kprofwrite(struct chan *c, void *a, long n, int64_t unused)
+static size_t kprof_write(struct chan *c, void *a, size_t n, off64_t unused)
 {
        ERRSTACK(1);
 {
        ERRSTACK(1);
-       uintptr_t pc;
-       struct cmdbuf *cb;
-       char *ctlstring = "startclr|start|stop|clear|opstart|opstop|optimer";
-       cb = parsecmd(a, n);
+       struct cmdbuf *cb = parsecmd(a, n);
 
        if (waserror()) {
                kfree(cb);
                nexterror();
        }
 
        if (waserror()) {
                kfree(cb);
                nexterror();
        }
-
-       switch((int)(c->qid.path)){
+       switch ((int) c->qid.path) {
        case Kprofctlqid:
                if (cb->nf < 1)
        case Kprofctlqid:
                if (cb->nf < 1)
-                       error(EFAIL, ctlstring);
-
-               /* Kprof: a "which kaddr are we at when the timer goes off".  not used
-                * much anymore */
-               if (!strcmp(cb->f[0], "startclr")) {
-                       kprof_clear(&kprof);
-                       kprof.time = 1;
-               } else if (!strcmp(cb->f[0], "start")) {
-                       kprof.time = 1;
-                       /* this sets up the timer on the *calling* core! */
-                       setup_timers();
+                       error(EFAIL, kprof_control_usage);
+               if (profiler_configure(cb))
+                       break;
+               if (!strcmp(cb->f[0], "start")) {
+                       kprof_start_profiler();
+               } else if (!strcmp(cb->f[0], "flush")) {
+                       kprof_flush_profiler();
                } else if (!strcmp(cb->f[0], "stop")) {
                } else if (!strcmp(cb->f[0], "stop")) {
-                       /* TODO: stop the timers! */
-                       kprof.time = 0;
-               } else if (!strcmp(cb->f[0], "clear")) {
-                       kprof_clear(&kprof);
-
-               /* oprof: samples and traces using oprofile */
-               } else if (!strcmp(cb->f[0], "optimer")) {
-                       if (cb->nf < 3)
-                               error(EFAIL, "optimer [<0|1|..|n|all> <on|off>] [period USEC]");
-                       if (!strcmp(cb->f[1], "period")) {
-                               oprof_timer_period = strtoul(cb->f[2], 0, 10);
-                       } else if (!strcmp(cb->f[1], "all")) {
-                               for (int i = 0; i < num_cores; i++)
-                                       manage_oprof_timer(i, cb);
-                       } else {
-                               int pcoreid = strtoul(cb->f[1], 0, 10);
-                               if (pcoreid >= num_cores)
-                                       error(EFAIL, "no such coreid %d", pcoreid);
-                               manage_oprof_timer(pcoreid, cb);
-                       }
-               } else if (!strcmp(cb->f[0], "opstart")) {
-                       profiler_control_trace(1);
-               } else if (!strcmp(cb->f[0], "opstop")) {
-                       profiler_control_trace(0);
+                       kprof_stop_profiler();
                } else {
                } else {
-                       error(EFAIL, ctlstring);
+                       error(EFAIL, kprof_control_usage);
                }
                break;
                }
                break;
+       case Kptracectlqid:
+               if (cb->nf < 1)
+                       error(EFAIL, "Bad kptrace_ctl option (reset)");
+               if (!strcmp(cb->f[0], "clear")) {
+                       spin_lock_irqsave(&ktrace_lock);
+                       circular_buffer_clear(&ktrace_data);
+                       spin_unlock_irqsave(&ktrace_lock);
+               }
+               break;
+       case Kptraceqid:
+               if (a && (n > 0)) {
+                       char *uptr = user_strdup_errno(current, a, n);
 
 
-               /* The format is a long as text. We strtoul, and jam it into the
-                * trace buffer.
-                */
-       case Kprofoprofileqid:
-               pc = strtoul(a, 0, 0);
-               profiler_add_trace(pc);
+                       if (uptr) {
+                               trace_printk("%s", uptr);
+                               user_memdup_free(current, uptr);
+                       } else {
+                               n = -1;
+                       }
+               }
                break;
        case Kprintxqid:
                if (!strncmp(a, "on", 2))
                        set_printx(1);
                else if (!strncmp(a, "off", 3))
                        set_printx(0);
                break;
        case Kprintxqid:
                if (!strncmp(a, "on", 2))
                        set_printx(1);
                else if (!strncmp(a, "off", 3))
                        set_printx(0);
-               else if (!strncmp(a, "toggle", 6))      /* why not. */
+               else if (!strncmp(a, "toggle", 6))
                        set_printx(2);
                else
                        set_printx(2);
                else
-                       error(EFAIL, "invalid option to Kprintx %s\n", a);
+                       error(EFAIL, "Invalid option to Kprintx %s\n", a);
                break;
        case Kmpstatqid:
        case Kmpstatrawqid:
                if (cb->nf < 1)
                break;
        case Kmpstatqid:
        case Kmpstatrawqid:
                if (cb->nf < 1)
-                       error(EFAIL, "mpstat bad option (reset|ipi|on|off)");
+                       error(EFAIL, "Bad mpstat option (reset|ipi|on|off)");
                if (!strcmp(cb->f[0], "reset")) {
                        for (int i = 0; i < num_cores; i++)
                                reset_cpu_state_ticks(i);
                if (!strcmp(cb->f[0], "reset")) {
                        for (int i = 0; i < num_cores; i++)
                                reset_cpu_state_ticks(i);
@@ -545,7 +421,7 @@ kprofwrite(struct chan *c, void *a, long n, int64_t unused)
                        /* TODO: disable the ticks */ ;
                } else if (!strcmp(cb->f[0], "ipi")) {
                        if (cb->nf < 2)
                        /* TODO: disable the ticks */ ;
                } else if (!strcmp(cb->f[0], "ipi")) {
                        if (cb->nf < 2)
-                               error(EFAIL, "need another arg: ipi [on|off]");
+                               error(EFAIL, "Need another arg: ipi [on|off]");
                        if (!strcmp(cb->f[1], "on"))
                                kprof.mpstat_ipi = TRUE;
                        else if (!strcmp(cb->f[1], "off"))
                        if (!strcmp(cb->f[1], "on"))
                                kprof.mpstat_ipi = TRUE;
                        else if (!strcmp(cb->f[1], "off"))
@@ -553,74 +429,167 @@ kprofwrite(struct chan *c, void *a, long n, int64_t unused)
                        else
                                error(EFAIL, "ipi [on|off]");
                } else {
                        else
                                error(EFAIL, "ipi [on|off]");
                } else {
-                       error(EFAIL, "mpstat bad option (reset|ipi|on|off)");
+                       error(EFAIL, "Bad mpstat option (reset|ipi|on|off)");
                }
                break;
        default:
                }
                break;
        default:
-               error(EBADFD, NULL);
+               error(EBADFD, ERROR_FIXME);
        }
        kfree(cb);
        poperror();
        return n;
 }
 
        }
        kfree(cb);
        poperror();
        return n;
 }
 
-void kprof_write_sysrecord(char *pretty_buf, size_t len)
+size_t kprof_tracedata_size(void)
+{
+       return circular_buffer_size(&ktrace_data);
+}
+
+size_t kprof_tracedata_read(void *data, size_t size, size_t offset)
 {
 {
-       int wrote;
-       if (kprof.systrace) {
-               /* TODO: need qio work so we can simply add the buf as extra data */
-               wrote = qiwrite(kprof.systrace, pretty_buf, len);
-               /* based on the current queue settings, we only drop when we're running
-                * out of memory.  odds are, we won't make it this far. */
-               if (wrote != len)
-                       printk("DROPPED %s", pretty_buf);
+       spin_lock_irqsave(&ktrace_lock);
+       if (likely(ktrace_init_done))
+               size = circular_buffer_read(&ktrace_data, data, size, offset);
+       else
+               size = 0;
+       spin_unlock_irqsave(&ktrace_lock);
+
+       return size;
+}
+
+void kprof_dump_data(void)
+{
+       void *buf;
+       size_t len = kprof_tracedata_size();
+
+       buf = kmalloc(len, MEM_WAIT);
+       kprof_tracedata_read(buf, len, 0);
+       printk("%s", buf);
+       kfree(buf);
+}
+
+void kprof_tracedata_write(const char *pretty_buf, size_t len)
+{
+       spin_lock_irqsave(&ktrace_lock);
+       if (unlikely(!ktrace_init_done)) {
+               circular_buffer_init(&ktrace_data, sizeof(ktrace_buffer),
+                                    ktrace_buffer);
+               ktrace_init_done = TRUE;
        }
        }
+       circular_buffer_write(&ktrace_data, pretty_buf, len);
+       spin_unlock_irqsave(&ktrace_lock);
 }
 
 }
 
-void trace_printk(const char *fmt, ...)
+static struct trace_printk_buffer *kprof_get_printk_buffer(void)
+{
+       static struct trace_printk_buffer boot_tpb;
+       static struct trace_printk_buffer *cpu_tpbs;
+       static atomic_t alloc_done;
+
+       if (unlikely(booting))
+               return &boot_tpb;
+       if (unlikely(!cpu_tpbs)) {
+               /* Poor man's per-CPU data structure. I really do not like littering
+                * global data structures with module-specific data.
+                * We cannot take the ktrace_lock to protect the kzmalloc() call, as
+                * that might trigger printk()s, and we would reenter here.
+                * Let only one core into the kzmalloc() path, and let the others get
+                * the boot_tpb until finished.
+                */
+               if (!atomic_cas(&alloc_done, 0, 1))
+                       return &boot_tpb;
+               cpu_tpbs = kzmalloc(num_cores * sizeof(struct trace_printk_buffer), 0);
+       }
+
+       return cpu_tpbs + core_id_early();
+}
+
+void trace_vprintk(const char *fmt, va_list args)
 {
 {
-       va_list ap;
-       struct timespec ts_now;
-       size_t bufsz = 160;     /* 2x terminal width */
-       size_t len = 0;
-       char *buf = kmalloc(bufsz, 0);
+       struct print_buf {
+               char *ptr;
+               char *top;
+       };
+
+       void emit_print_buf_str(struct print_buf *pb, const char *str, ssize_t size)
+       {
+               if (size < 0) {
+                       for (; *str && (pb->ptr < pb->top); str++)
+                               *(pb->ptr++) = *str;
+               } else {
+                       for (; (size > 0) && (pb->ptr < pb->top); str++, size--)
+                               *(pb->ptr++) = *str;
+               }
+       }
 
 
-       if (!buf)
+       static const size_t bufsz = TRACE_PRINTK_BUFFER_SIZE;
+       static const size_t usr_bufsz = (3 * bufsz) / 8;
+       static const size_t kp_bufsz = bufsz - usr_bufsz;
+       struct trace_printk_buffer *tpb = kprof_get_printk_buffer();
+       struct timespec ts_now = { 0, 0 };
+       struct print_buf pb;
+       char *usrbuf = tpb->buffer, *kpbuf = tpb->buffer + usr_bufsz;
+       const char *utop, *uptr;
+       char hdr[64];
+
+       if (!atomic_cas(&tpb->in_use, 0, 1))
                return;
                return;
-       tsc2timespec(read_tsc(), &ts_now);
-       len += snprintf(buf + len, bufsz - len, "[%7d.%09d] /* ", ts_now.tv_sec,
-                       ts_now.tv_nsec);
-       va_start(ap, fmt);
-       len += vsnprintf(buf + len, bufsz - len, fmt, ap);
-       va_start(ap, fmt);
-       va_end(ap);
-       len += snprintf(buf + len, bufsz - len, " */\n");
-       va_start(ap, fmt);
+       if (likely(__proc_global_info.tsc_freq))
+               ts_now = tsc2timespec(read_tsc());
+       snprintf(hdr, sizeof(hdr), "[%lu.%09lu]:cpu%d: ", ts_now.tv_sec,
+                ts_now.tv_nsec, core_id_early());
+
+       pb.ptr = usrbuf + vsnprintf(usrbuf, usr_bufsz, fmt, args);
+       pb.top = usrbuf + usr_bufsz;
+
+       if (pb.ptr[-1] != '\n')
+               emit_print_buf_str(&pb, "\n", 1);
        /* snprintf null terminates the buffer, and does not count that as part of
        /* snprintf null terminates the buffer, and does not count that as part of
-        * the len.  if we maxed out the buffer, let's make sure it has a \n */
-       if (len == bufsz - 1) {
-               assert(buf[bufsz - 1] == '\0');
-               buf[bufsz - 2] = '\n';
+        * the len.  If we maxed out the buffer, let's make sure it has a \n.
+        */
+       if (pb.ptr == pb.top)
+               pb.ptr[-1] = '\n';
+       utop = pb.ptr;
+
+       pb.ptr = kpbuf;
+       pb.top = kpbuf + kp_bufsz;
+       for (uptr = usrbuf; uptr < utop;) {
+               const char *nlptr = memchr(uptr, '\n', utop - uptr);
+
+               if (nlptr == NULL)
+                       nlptr = utop;
+               emit_print_buf_str(&pb, hdr, -1);
+               emit_print_buf_str(&pb, uptr, (nlptr - uptr) + 1);
+               uptr = nlptr + 1;
        }
        }
-       kprof_write_sysrecord(buf, len);
-       kfree(buf);
+       kprof_tracedata_write(kpbuf, pb.ptr - kpbuf);
+       atomic_set(&tpb->in_use, 0);
+}
+
+void trace_printk(const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       trace_vprintk(fmt, args);
+       va_end(args);
 }
 
 struct dev kprofdevtab __devtab = {
        .name = "kprof",
 
        .reset = devreset,
 }
 
 struct dev kprofdevtab __devtab = {
        .name = "kprof",
 
        .reset = devreset,
-       .init = kprofinit,
-       .shutdown = kprofshutdown,
-       .attach = kprofattach,
-       .walk = kprofwalk,
-       .stat = kprofstat,
-       .open = kprofopen,
+       .init = kprof_init,
+       .shutdown = kprof_shutdown,
+       .attach = kprof_attach,
+       .walk = kprof_walk,
+       .stat = kprof_stat,
+       .open = kprof_open,
        .create = devcreate,
        .create = devcreate,
-       .close = kprofclose,
-       .read = kprofread,
+       .close = kprof_close,
+       .read = kprof_read,
        .bread = devbread,
        .bread = devbread,
-       .write = kprofwrite,
+       .write = kprof_write,
        .bwrite = devbwrite,
        .remove = devremove,
        .wstat = devwstat,
        .bwrite = devbwrite,
        .remove = devremove,
        .wstat = devwstat,