akaros/kern/drivers/dev/kprof.c
/*
 * This file is part of the UCB release of Plan 9. It is subject to the license
 * terms in the LICENSE file found in the top-level directory of this
 * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
 * part of the UCB release of Plan 9, including this file, may be copied,
 * modified, propagated, or distributed except according to the terms contained
 * in the LICENSE file.
 */

#include <ros/profiler_records.h>
#include <arch/time.h>
#include <slab.h>
#include <kmalloc.h>
#include <kref.h>
#include <atomic.h>
#include <kthread.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <pmap.h>
#include <smp.h>
#include <time.h>
#include <circular_buffer.h>
#include <umem.h>
#include <profiler.h>
#include <kprof.h>
#include <ros/procinfo.h>
#include <init.h>

#define KTRACE_BUFFER_SIZE (128 * 1024)
#define TRACE_PRINTK_BUFFER_SIZE (8 * 1024)

enum {
	Kprofdirqid = 0,
	Kprofdataqid,
	Kprofctlqid,
	Kptracectlqid,
	Kptraceqid,
	Kprintxqid,
	Kmpstatqid,
	Kmpstatrawqid,
};

struct trace_printk_buffer {
	atomic_t in_use;
	char buffer[TRACE_PRINTK_BUFFER_SIZE];
};

struct kprof {
	qlock_t lock;
	bool mpstat_ipi;
	bool profiling;
	bool opened;
};

struct dev kprofdevtab;
struct dirtab kproftab[] = {
	{".",		{Kprofdirqid, 0, QTDIR},0,	DMDIR|0550},
	{"kpdata",	{Kprofdataqid},		0,	0600},
	{"kpctl",	{Kprofctlqid},		0,	0600},
	{"kptrace_ctl",	{Kptracectlqid},	0,	0660},
	{"kptrace",	{Kptraceqid},		0,	0600},
	{"kprintx",	{Kprintxqid},		0,	0600},
	{"mpstat",	{Kmpstatqid},		0,	0600},
	{"mpstat-raw",	{Kmpstatrawqid},	0,	0600},
};
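
/* Usage sketch, assuming the kprof device (#K) is bound at /prof (the mount
 * point is up to userspace; the file names come from kproftab above):
 *
 *	echo start > /prof/kpctl	# opening kpctl grabs the profiler
 *	# ... run the workload ...
 *	echo stop > /prof/kpctl
 *	echo flush > /prof/kpctl	# push per-core samples to the queue
 *	cat /prof/kpdata > kpdata.out	# extract the sample data
 */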

static struct kprof kprof;
static bool ktrace_init_done = FALSE;
static spinlock_t ktrace_lock = SPINLOCK_INITIALIZER_IRQSAVE;
static struct circular_buffer ktrace_data;
static char ktrace_buffer[KTRACE_BUFFER_SIZE];
static char kprof_control_usage[128];

static size_t mpstat_len(void)
{
	size_t each_row = 7 + NR_CPU_STATES * 26;

	return each_row * (num_cores + 1) + 1;
}

static size_t mpstatraw_len(void)
{
	size_t header_row = 27 + NR_CPU_STATES * 7 + 1;
	size_t cpu_row = 7 + NR_CPU_STATES * 17;

	return header_row + cpu_row * num_cores + 1;
}

static char *devname(void)
{
	return kprofdevtab.name;
}

static struct chan *kprof_attach(char *spec)
{
	return devattach(devname(), spec);
}

/* Start collecting samples from perf events into the profiler.
 *
 * This command only runs if the user successfully opened kpctl, which gives
 * them a profiler (the global profiler, for now). */
static void kprof_start_profiler(void)
{
	ERRSTACK(1);

	qlock(&kprof.lock);
	if (waserror()) {
		qunlock(&kprof.lock);
		nexterror();
	}
	if (!kprof.profiling) {
		profiler_start();
		kprof.profiling = TRUE;
	}
	poperror();
	qunlock(&kprof.lock);
}

/* Stops collecting samples from perf events.
 *
 * This command only runs if the user successfully opened kpctl, which gives
 * them a profiler (the global profiler, for now). */
static void kprof_stop_profiler(void)
{
	ERRSTACK(1);

	qlock(&kprof.lock);
	if (waserror()) {
		qunlock(&kprof.lock);
		nexterror();
	}
	if (kprof.profiling) {
		profiler_stop();
		kprof.profiling = FALSE;
	}
	poperror();
	qunlock(&kprof.lock);
}

/* Makes each core flush its results into the profiler queue.  You can do this
 * while the profiler is still running.  However, this does not hang up the
 * queue, so reads on kpdata will block. */
static void kprof_flush_profiler(void)
{
	ERRSTACK(1);

	qlock(&kprof.lock);
	if (waserror()) {
		qunlock(&kprof.lock);
		nexterror();
	}
	if (kprof.profiling)
		profiler_trace_data_flush();
	poperror();
	qunlock(&kprof.lock);
}

static void kprof_init(void)
{
	qlock_init(&kprof.lock);
	kprof.profiling = FALSE;
	kprof.opened = FALSE;

	for (int i = 0; i < ARRAY_SIZE(kproftab); i++)
		kproftab[i].length = 0;

	kprof.mpstat_ipi = TRUE;
	kproftab[Kmpstatqid].length = mpstat_len();
	kproftab[Kmpstatrawqid].length = mpstatraw_len();

	strlcpy(kprof_control_usage, "start|stop|flush",
		sizeof(kprof_control_usage));
	profiler_append_configure_usage(kprof_control_usage,
					sizeof(kprof_control_usage));
}

static struct walkqid *kprof_walk(struct chan *c, struct chan *nc, char **name,
				  unsigned int nname)
{
	return devwalk(c, nc, name, nname, kproftab, ARRAY_SIZE(kproftab),
		       devgen);
}

static size_t kprof_profdata_size(void)
{
	return profiler_size();
}

static long kprof_profdata_read(void *dest, long size, int64_t off)
{
	return profiler_read(dest, size);
}

static size_t kprof_stat(struct chan *c, uint8_t *db, size_t n)
{
	kproftab[Kprofdataqid].length = kprof_profdata_size();
	kproftab[Kptraceqid].length = kprof_tracedata_size();

	return devstat(c, db, n, kproftab, ARRAY_SIZE(kproftab), devgen);
}

static struct chan *kprof_open(struct chan *c, int omode)
{
	if (c->qid.type & QTDIR) {
		if (openmode(omode) != O_READ)
			error(EPERM, ERROR_FIXME);
	}
	switch ((int) c->qid.path) {
	case Kprofctlqid:
		/* We have one global profiler.  Only one FD may be opened at a
		 * time for it.  If we ever have separate profilers, we can
		 * create the profiler here, and every open would get a separate
		 * instance. */
		qlock(&kprof.lock);
		if (kprof.opened) {
			qunlock(&kprof.lock);
			error(EBUSY, "Global profiler is already open");
		}
		if (profiler_setup() < 0) {
			qunlock(&kprof.lock);
			error(ENOMEM, "failed to set up profiler");
		}
		kprof.opened = TRUE;
		qunlock(&kprof.lock);
		break;
	}
	c->mode = openmode(omode);
	c->flag |= COPEN;
	c->offset = 0;
	return c;
}

static void kprof_close(struct chan *c)
{
	if (c->flag & COPEN) {
		switch ((int) c->qid.path) {
		case Kprofctlqid:
			kprof_stop_profiler();
			qlock(&kprof.lock);
			profiler_cleanup();
			kprof.opened = FALSE;
			qunlock(&kprof.lock);
			break;
		}
	}
}

static long mpstat_read(void *va, long n, int64_t off)
{
	size_t bufsz = mpstat_len();
	char *buf = kmalloc(bufsz, MEM_WAIT);
	int len = 0;
	struct per_cpu_info *pcpui;
	uint64_t cpu_total;
	struct timespec ts;

	/* The IPI interferes with the other cores; it can be disabled via the
	 * 'ipi off' ctl command. */
	if (kprof.mpstat_ipi)
		send_broadcast_ipi(I_POKE_CORE);

	len += snprintf(buf + len, bufsz - len, "  CPU: ");
	for (int j = 0; j < NR_CPU_STATES; j++)
		len += snprintf(buf + len, bufsz - len, "%23s%s",
				cpu_state_names[j],
				j != NR_CPU_STATES - 1 ? " " : "  \n");

	for (int i = 0; i < num_cores; i++) {
		pcpui = &per_cpu_info[i];
		cpu_total = 0;
		len += snprintf(buf + len, bufsz - len, "%5d: ", i);
		for (int j = 0; j < NR_CPU_STATES; j++)
			cpu_total += pcpui->state_ticks[j];
		cpu_total = MAX(cpu_total, 1);	/* for the divide later */
		for (int j = 0; j < NR_CPU_STATES; j++) {
			ts = tsc2timespec(pcpui->state_ticks[j]);
			len += snprintf(buf + len, bufsz - len,
					"%10d.%06d (%3d%%)%s",
					ts.tv_sec, ts.tv_nsec / 1000,
					MIN((pcpui->state_ticks[j] * 100) /
					    cpu_total, 100),
					j != NR_CPU_STATES - 1 ? ", " : " \n");
		}
	}
	n = readstr(off, va, n, buf);
	kfree(buf);
	return n;
}
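
/* Illustrative mpstat output: one sec.usec (percent) column per CPU state,
 * one row per core.  The column headers come from cpu_state_names[]; every
 * value and name below is made up:
 *
 *  CPU:                  state0                  state1
 *      0:      12.000345 ( 40%),       17.000123 ( 60%)
 *      1:       0.001234 (  1%),       29.993456 ( 99%)
 */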

static long mpstatraw_read(void *va, long n, int64_t off)
{
	size_t bufsz = mpstatraw_len();
	char *buf = kmalloc(bufsz, MEM_WAIT);
	int len = 0;
	struct per_cpu_info *pcpui;

	/* could spit it all out in binary, though then it'd be harder to
	 * process the data across a mnt (if we export #K).  probably not a big
	 * deal. */

	/* header line: version, num_cores, tsc freq, state names */
	len += snprintf(buf + len, bufsz - len, "v%03d %5d %16llu", 1,
			num_cores, __proc_global_info.tsc_freq);
	for (int j = 0; j < NR_CPU_STATES; j++)
		len += snprintf(buf + len, bufsz - len, " %6s",
				cpu_state_names[j]);
	len += snprintf(buf + len, bufsz - len, "\n");

	for (int i = 0; i < num_cores; i++) {
		pcpui = &per_cpu_info[i];
		len += snprintf(buf + len, bufsz - len, "%5d: ", i);
		for (int j = 0; j < NR_CPU_STATES; j++) {
			len += snprintf(buf + len, bufsz - len, "%16llx%s",
					pcpui->state_ticks[j],
					j != NR_CPU_STATES - 1 ? " " : "\n");
		}
	}
	n = readstr(off, va, n, buf);
	kfree(buf);
	return n;
}
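
/* Illustrative mpstat-raw output: a header carrying the format version, core
 * count, TSC frequency, and state names, then one row of raw hex tick counts
 * per core (all values and names below are made up):
 *
 *	v001     4       2000000000 state0 state1
 *	    0:      16e5fca5c82     2d9ac4e09f1
 */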

static size_t kprof_read(struct chan *c, void *va, size_t n, off64_t off)
{
	uint64_t w, *bp;
	char *a, *ea;
	uintptr_t offset = off;
	uint64_t pc;

	switch ((int) c->qid.path) {
	case Kprofdirqid:
		return devdirread(c, va, n, kproftab, ARRAY_SIZE(kproftab),
				  devgen);
	case Kprofdataqid:
		n = kprof_profdata_read(va, n, off);
		break;
	case Kptraceqid:
		n = kprof_tracedata_read(va, n, off);
		break;
	case Kprintxqid:
		n = readstr(offset, va, n, printx_on ? "on" : "off");
		break;
	case Kmpstatqid:
		n = mpstat_read(va, n, offset);
		break;
	case Kmpstatrawqid:
		n = mpstatraw_read(va, n, offset);
		break;
	default:
		n = 0;
		break;
	}
	return n;
}

static size_t kprof_write(struct chan *c, void *a, size_t n, off64_t unused)
{
	ERRSTACK(1);
	struct cmdbuf *cb = parsecmd(a, n);

	if (waserror()) {
		kfree(cb);
		nexterror();
	}
	switch ((int) c->qid.path) {
	case Kprofctlqid:
		if (cb->nf < 1)
			error(EFAIL, kprof_control_usage);
		if (profiler_configure(cb))
			break;
		if (!strcmp(cb->f[0], "start")) {
			kprof_start_profiler();
		} else if (!strcmp(cb->f[0], "flush")) {
			kprof_flush_profiler();
		} else if (!strcmp(cb->f[0], "stop")) {
			kprof_stop_profiler();
		} else {
			error(EFAIL, kprof_control_usage);
		}
		break;
	case Kptracectlqid:
		if (cb->nf < 1)
			error(EFAIL, "Bad kptrace_ctl option (clear)");
		if (!strcmp(cb->f[0], "clear")) {
			spin_lock_irqsave(&ktrace_lock);
			circular_buffer_clear(&ktrace_data);
			spin_unlock_irqsave(&ktrace_lock);
		}
		break;
	case Kptraceqid:
		if (a && (n > 0)) {
			char *uptr = user_strdup_errno(current, a, n);

			if (uptr) {
				trace_printk("%s", uptr);
				user_memdup_free(current, uptr);
			} else {
				n = -1;
			}
		}
		break;
	case Kprintxqid:
		if (cb->nf < 1)
			error(EFAIL, "no printx option: (on|off|toggle)");
		if (!strcmp(cb->f[0], "on"))
			set_printx(1);
		else if (!strcmp(cb->f[0], "off"))
			set_printx(0);
		else if (!strcmp(cb->f[0], "toggle"))
			set_printx(2);
		else
			error(EFAIL, "bad printx option: (on|off|toggle)");
		break;
	case Kmpstatqid:
	case Kmpstatrawqid:
		if (cb->nf < 1)
			error(EFAIL, "Bad mpstat option (reset|ipi|on|off)");
		if (!strcmp(cb->f[0], "reset")) {
			for (int i = 0; i < num_cores; i++)
				reset_cpu_state_ticks(i);
		} else if (!strcmp(cb->f[0], "on")) {
			/* TODO: enable the ticks */ ;
		} else if (!strcmp(cb->f[0], "off")) {
			/* TODO: disable the ticks */ ;
		} else if (!strcmp(cb->f[0], "ipi")) {
			if (cb->nf < 2)
				error(EFAIL, "Need another arg: ipi [on|off]");
			if (!strcmp(cb->f[1], "on"))
				kprof.mpstat_ipi = TRUE;
			else if (!strcmp(cb->f[1], "off"))
				kprof.mpstat_ipi = FALSE;
			else
				error(EFAIL, "ipi [on|off]");
		} else {
			error(EFAIL, "Bad mpstat option (reset|ipi|on|off)");
		}
		break;
	default:
		error(EBADFD, ERROR_FIXME);
	}
	kfree(cb);
	poperror();
	return n;
}

size_t kprof_tracedata_size(void)
{
	return circular_buffer_size(&ktrace_data);
}

size_t kprof_tracedata_read(void *data, size_t size, size_t offset)
{
	spin_lock_irqsave(&ktrace_lock);
	if (likely(ktrace_init_done))
		size = circular_buffer_read(&ktrace_data, data, size, offset);
	else
		size = 0;
	spin_unlock_irqsave(&ktrace_lock);

	return size;
}

void kprof_dump_data(void)
{
	void *buf;
	size_t len = kprof_tracedata_size();

	buf = kmalloc(len + 1, MEM_WAIT);
	len = kprof_tracedata_read(buf, len, 0);
	/* The trace data is raw text with no terminator; NUL terminate it
	 * before handing it to printk("%s"). */
	((char *) buf)[len] = '\0';
	printk("%s", buf);
	kfree(buf);
}

void kprof_tracedata_write(const char *pretty_buf, size_t len)
{
	spin_lock_irqsave(&ktrace_lock);
	if (unlikely(!ktrace_init_done)) {
		circular_buffer_init(&ktrace_data, sizeof(ktrace_buffer),
				     ktrace_buffer);
		ktrace_init_done = TRUE;
	}
	circular_buffer_write(&ktrace_data, pretty_buf, len);
	spin_unlock_irqsave(&ktrace_lock);
}

static struct trace_printk_buffer *kprof_get_printk_buffer(void)
{
	static struct trace_printk_buffer boot_tpb;
	static struct trace_printk_buffer *cpu_tpbs;
	static atomic_t alloc_done;

	if (unlikely(booting))
		return &boot_tpb;
	if (unlikely(!cpu_tpbs)) {
		/* A poor man's per-CPU data structure.  I really do not like
		 * littering global data structures with module-specific data.
		 * We cannot take the ktrace_lock to protect the kzmalloc()
		 * call, as that might trigger printk()s, and we would reenter
		 * here.  Let only one core into the kzmalloc() path, and let
		 * the others get the boot_tpb until finished. */
		if (!atomic_cas(&alloc_done, 0, 1))
			return &boot_tpb;
		cpu_tpbs = kzmalloc(num_cores *
				    sizeof(struct trace_printk_buffer), 0);
	}

	return cpu_tpbs + core_id_early();
}
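
/* Example of the race the CAS above guards against: two cores call
 * trace_printk() concurrently before cpu_tpbs exists.  Only one wins the
 * atomic_cas() and performs the kzmalloc(); the loser (and any later arrival
 * that still sees cpu_tpbs == NULL) is handed boot_tpb, whose in_use flag
 * makes trace_vprintk() drop messages rather than interleave them. */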

void trace_vprintk(const char *fmt, va_list args)
{
	struct print_buf {
		char *ptr;
		char *top;
	};

	void emit_print_buf_str(struct print_buf *pb, const char *str,
				ssize_t size)
	{
		if (size < 0) {
			for (; *str && (pb->ptr < pb->top); str++)
				*(pb->ptr++) = *str;
		} else {
			for (; (size > 0) && (pb->ptr < pb->top); str++, size--)
				*(pb->ptr++) = *str;
		}
	}

	static const size_t bufsz = TRACE_PRINTK_BUFFER_SIZE;
	static const size_t usr_bufsz = (3 * bufsz) / 8;
	static const size_t kp_bufsz = bufsz - usr_bufsz;
	struct trace_printk_buffer *tpb = kprof_get_printk_buffer();
	struct timespec ts_now = { 0, 0 };
	struct print_buf pb;
	char *usrbuf = tpb->buffer, *kpbuf = tpb->buffer + usr_bufsz;
	const char *utop, *uptr;
	char hdr[64];

	if (!atomic_cas(&tpb->in_use, 0, 1))
		return;
	if (likely(__proc_global_info.tsc_freq))
		ts_now = tsc2timespec(read_tsc());
	snprintf(hdr, sizeof(hdr), "[%lu.%09lu]:cpu%d: ", ts_now.tv_sec,
		 ts_now.tv_nsec, core_id_early());

	pb.ptr = usrbuf + vsnprintf(usrbuf, usr_bufsz, fmt, args);
	pb.top = usrbuf + usr_bufsz;

	if (pb.ptr[-1] != '\n')
		emit_print_buf_str(&pb, "\n", 1);
	/* snprintf null terminates the buffer, and does not count that as part
	 * of the len.  If we maxed out the buffer, let's make sure it has a \n.
	 */
	if (pb.ptr == pb.top)
		pb.ptr[-1] = '\n';
	utop = pb.ptr;

	pb.ptr = kpbuf;
	pb.top = kpbuf + kp_bufsz;
	for (uptr = usrbuf; uptr < utop;) {
		const char *nlptr = memchr(uptr, '\n', utop - uptr);

		if (nlptr == NULL)
			nlptr = utop;
		emit_print_buf_str(&pb, hdr, -1);
		emit_print_buf_str(&pb, uptr, (nlptr - uptr) + 1);
		uptr = nlptr + 1;
	}
	kprof_tracedata_write(kpbuf, pb.ptr - kpbuf);
	atomic_set(&tpb->in_use, 0);
}

void trace_printk(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	trace_vprintk(fmt, args);
	va_end(args);
}
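
/* Usage sketch: trace_printk() takes an ordinary printf-style format, and
 * trace_vprintk() prefixes each output line with a timestamp/core header.
 * A hypothetical call such as
 *
 *	trace_printk("pid %d faulted at %p\n", pid, addr);
 *
 * would surface in the kptrace file roughly as (values made up):
 *
 *	[1234.000567890]:cpu2: pid 42 faulted at 0xffff800012345678
 */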

struct dev kprofdevtab __devtab = {
	.name = "kprof",

	.reset = devreset,
	.init = kprof_init,
	.shutdown = devshutdown,
	.attach = kprof_attach,
	.walk = kprof_walk,
	.stat = kprof_stat,
	.open = kprof_open,
	.create = devcreate,
	.close = kprof_close,
	.read = kprof_read,
	.bread = devbread,
	.write = kprof_write,
	.bwrite = devbwrite,
	.remove = devremove,
	.wstat = devwstat,
};