Export epoch time via proc_global_info (XCC)
[akaros.git] / kern / src / profiler.c
index c76c936..c94caed 100644
@@ -1,7 +1,25 @@
 /* Copyright (c) 2015 Google Inc
  * Davide Libenzi <dlibenzi@google.com>
  * See LICENSE for details.
- */
+ *
+ * This controls the emitting, collecting, and exporting of samples for perf
+ * events.  Examples of events are PMU counter overflows, mmaps, and process
+ * creation.
+ *
+ * Events are collected in a central qio queue.  High-frequency events (e.g.
+ * IRQ backtraces) are collected in per-core buffers, which are flushed to the
+ * central queue when they fill up or on command.  Lower-frequency events (e.g.
+ * profiler_notify_mmap()) just go straight to the central queue.
+ *
+ * Currently there is one global profiler.  Kprof is careful to only have one
+ * open profiler at a time.  We assert that this is true.  TODO: stop using the
+ * global profiler!
+ *
+ * A few other notes:
+ * - profiler_control_trace() controls the per-core trace collection.  When it
+ *   is disabled, it also flushes the per-core blocks to the central queue.
+ * - The collection of mmap and comm samples is independent of trace collection.
+ *   Those will occur whenever the profiler is open (refcnt check, for now). */
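
The header comment above describes a two-tier design: high-frequency samples are
staged in a per-core buffer and flushed into one central queue, while
low-frequency control events go straight to the central queue.  For illustration
only, a self-contained model of that flow (plain C with made-up names; the real
code stages samples in struct block and hands them to a qio queue rather than
copying into arrays):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PCPU_BUF_SZ     4096
#define CENTRAL_LIMIT   (64 * 1024)

struct pcpu_buf {
        uint8_t data[PCPU_BUF_SZ];
        size_t used;
};

struct central_queue {
        uint8_t data[CENTRAL_LIMIT];
        size_t used;
        size_t dropped;         /* bytes dropped once the queue is full */
};

/* Flush a per-core buffer into the central queue; drop it if the queue is
 * full, roughly the "willing to lose traces" policy described below. */
static void flush_pcpu(struct central_queue *q, struct pcpu_buf *b)
{
        if (q->used + b->used <= CENTRAL_LIMIT) {
                memcpy(q->data + q->used, b->data, b->used);
                q->used += b->used;
        } else {
                q->dropped += b->used;
        }
        b->used = 0;
}

/* High-frequency path: stage in the per-core buffer, flush when it fills. */
static void emit_sample(struct central_queue *q, struct pcpu_buf *b,
                        const void *rec, size_t len)
{
        if (b->used + len > PCPU_BUF_SZ)
                flush_pcpu(q, b);
        memcpy(b->data + b->used, rec, len);
        b->used += len;
}

int main(void)
{
        struct central_queue q = {0};
        struct pcpu_buf core0 = {0};
        uint64_t fake_pc = 0xc0de;

        for (int i = 0; i < 1000; i++)
                emit_sample(&q, &core0, &fake_pc, sizeof(fake_pc));
        flush_pcpu(&q, &core0); /* the "on command" flush, like profiler_cpu_flush() */
        printf("central queue holds %zu bytes, dropped %zu\n", q.used, q.dropped);
        return 0;
}
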
 
 #include <ros/common.h>
 #include <ros/mman.h>
 #include "profiler.h"
 
 #define PROFILER_MAX_PRG_PATH  256
-#define PROFILER_BT_DEPTH 16
 
 #define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)
 
+/* Do not rely on the contents of the PCPU ctx with IRQs enabled. */
 struct profiler_cpu_context {
        struct block *block;
        int cpu;
        int tracing;
-       size_t dropped_data_size;
+       size_t dropped_data_cnt;
 };
 
 static int profiler_queue_limit = 64 * 1024 * 1024;
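
VBE_MAX_SIZE(t) above is the worst case for a 7-bits-per-byte variable-length
encoding of a value of type t: a uint64_t carries 64 payload bits, so it needs
at most (8 * 8 + 6) / 7 = 10 output bytes.  The vb_encode_uint64() named in the
next hunk is that encoder; since its body is mostly outside this diff, the
version below is a sketch of the standard base-128 scheme consistent with that
bound, not a copy of the kernel's code:

#include <assert.h>
#include <stdint.h>

/* Same worst-case formula as in the file above: 7 payload bits per byte. */
#define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)

/* Base-128 varint: low 7 bits per byte, MSB set on every byte but the last.
 * Returns the advanced write pointer, like vb_encode_uint64() does. */
static char *vb_encode_u64_sketch(char *data, uint64_t n)
{
        while (n >= 0x80) {
                *data++ = (char) ((n & 0x7f) | 0x80);
                n >>= 7;
        }
        *data++ = (char) n;
        return data;
}

int main(void)
{
        char buf[VBE_MAX_SIZE(uint64_t)];
        char *end = vb_encode_u64_sketch(buf, UINT64_MAX);

        assert(sizeof(buf) == 10);      /* (8 * 8 + 6) / 7 */
        assert(end - buf == 10);        /* 64 bits / 7 bits per byte, rounded up */
        return 0;
}
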
@@ -64,21 +82,19 @@ static inline char *vb_encode_uint64(char *data, uint64_t n)
 static struct block *profiler_buffer_write(struct profiler_cpu_context *cpu_buf,
                                            struct block *b)
 {
+       /* qpass will drop b if the queue is over its limit.  We're willing to lose
+        * traces, but we won't lose 'control' events, such as MMAP and PID. */
        if (b) {
-               qibwrite(profiler_queue, b);
-
-               if (qlen(profiler_queue) > profiler_queue_limit) {
-                       b = qget(profiler_queue);
-                       if (likely(b)) {
-                               cpu_buf->dropped_data_size += BLEN(b);
-                               freeb(b);
-                       }
-               }
+               if (qpass(profiler_queue, b) < 0)
+                       cpu_buf->dropped_data_cnt++;
        }
-
-       return iallocb(profiler_cpu_buffer_size);
+       return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
 }
 
+/* Helper, paired with profiler_cpu_buffer_write_commit.  Ensures there is
+ * enough room in the pcpu block for our write.  May alloc a new one.
+ *
+ * IRQs must be disabled before calling, until after write_commit. */
 static char *profiler_cpu_buffer_write_reserve(
        struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
 {
@@ -94,6 +110,9 @@ static char *profiler_cpu_buffer_write_reserve(
        return (char *) b->wp;
 }
 
+/* Helper, paired with write_reserve.  Finalizes the writing into the block's
+ * main body of @size bytes.  IRQs must be disabled until after this is called.
+ */
 static inline void profiler_cpu_buffer_write_commit(
        struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
 {
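
The two comments above define the emission protocol for per-core records:
disable IRQs, reserve room for the payload plus the variable-length envelope,
encode, then commit only the bytes actually written.  A condensed sketch of a
caller for a hypothetical record type (not part of the patch; the real callers,
profiler_push_kernel_trace64() and friends, appear in the hunks below):

/* Illustrative only: push one hypothetical fixed-size record.  IRQs stay off
 * from the reserve until after the commit, per the comments above. */
static void push_example_record(struct profiler_cpu_context *cpu_buf,
                                uint64_t value)
{
        size_t size = sizeof(uint64_t);
        struct block *b;
        void *resptr, *ptr;
        int8_t irq_state = 0;

        disable_irqsave(&irq_state);
        resptr = profiler_cpu_buffer_write_reserve(
            cpu_buf, size + profiler_max_envelope_size(), &b);
        ptr = resptr;
        if (likely(ptr)) {
                /* Envelope, simplified here to a made-up type tag. */
                ptr = vb_encode_uint64(ptr, 42 /* PROFTYPE_EXAMPLE, hypothetical */);
                memcpy(ptr, &value, size);
                ptr += size;
                /* Commit only what was actually written into the block. */
                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
        enable_irqsave(&irq_state);
}
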
@@ -109,12 +128,16 @@ static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
                                          const uintptr_t *trace, size_t count,
                                          uint64_t info)
 {
-       size_t i, size = sizeof(struct proftype_kern_trace64) +
+       struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
+       size_t size = sizeof(struct proftype_kern_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
-       char *resptr = profiler_cpu_buffer_write_reserve(
-               cpu_buf, size + profiler_max_envelope_size(), &b);
-       char *ptr = resptr;
+       void *resptr, *ptr;
+
+       assert(!irq_is_enabled());
+       resptr = profiler_cpu_buffer_write_reserve(
+           cpu_buf, size + profiler_max_envelope_size(), &b);
+       ptr = resptr;
 
        if (likely(ptr)) {
                struct proftype_kern_trace64 *record;
@@ -127,9 +150,13 @@ static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
 
                record->info = info;
                record->tstamp = nsec();
+               if (is_ktask(pcpui->cur_kthread) || !pcpui->cur_proc)
+                       record->pid = -1;
+               else
+                       record->pid = pcpui->cur_proc->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
-               for (i = 0; i < count; i++)
+               for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];
 
                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
@@ -140,12 +167,15 @@ static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
                                        struct proc *p, const uintptr_t *trace,
                                        size_t count, uint64_t info)
 {
-       size_t i, size = sizeof(struct proftype_user_trace64) +
+       size_t size = sizeof(struct proftype_user_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
-       char *resptr = profiler_cpu_buffer_write_reserve(
-               cpu_buf, size + profiler_max_envelope_size(), &b);
-       char *ptr = resptr;
+       void *resptr, *ptr;
+
+       assert(!irq_is_enabled());
+       resptr = profiler_cpu_buffer_write_reserve(
+           cpu_buf, size + profiler_max_envelope_size(), &b);
+       ptr = resptr;
 
        if (likely(ptr)) {
                struct proftype_user_trace64 *record;
@@ -161,7 +191,7 @@ static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
                record->pid = p->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
-               for (i = 0; i < count; i++)
+               for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];
 
                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
@@ -171,12 +201,12 @@ static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
 static void profiler_push_pid_mmap(struct proc *p, uintptr_t addr, size_t msize,
                                    size_t offset, const char *path)
 {
-       size_t i, plen = strlen(path) + 1,
-               size = sizeof(struct proftype_pid_mmap64) + plen;
-       char *resptr = kmalloc(size + profiler_max_envelope_size(), 0);
+       size_t plen = strlen(path) + 1;
+       size_t size = sizeof(struct proftype_pid_mmap64) + plen;
+       void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);
 
        if (likely(resptr)) {
-               char *ptr = resptr;
+               void *ptr = resptr;
                struct proftype_pid_mmap64 *record;
 
                ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64);
@@ -200,12 +230,12 @@ static void profiler_push_pid_mmap(struct proc *p, uintptr_t addr, size_t msize,
 
 static void profiler_push_new_process(struct proc *p)
 {
-       size_t i, plen = strlen(p->binary_path) + 1,
-               size = sizeof(struct proftype_new_process) + plen;
-       char *resptr = kmalloc(size + profiler_max_envelope_size(), 0);
+       size_t plen = strlen(p->binary_path) + 1;
+       size_t size = sizeof(struct proftype_new_process) + plen;
+       void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);
 
        if (likely(resptr)) {
-               char *ptr = resptr;
+               void *ptr = resptr;
                struct proftype_new_process *record;
 
                ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS);
@@ -244,8 +274,10 @@ static void profiler_emit_current_system_status(void)
                nexterror();
        }
 
-       for (size_t i = 0; i < pset.num_processes; i++)
+       for (size_t i = 0; i < pset.num_processes; i++) {
+               profiler_notify_new_process(pset.procs[i]);
                enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]);
+       }
 
        poperror();
        proc_free_set(&pset);
@@ -257,7 +289,7 @@ static void free_cpu_buffers(void)
        profiler_percpu_ctx = NULL;
 
        if (profiler_queue) {
-               qclose(profiler_queue);
+               qfree(profiler_queue);
                profiler_queue = NULL;
        }
 }
@@ -265,8 +297,15 @@ static void free_cpu_buffers(void)
 static void alloc_cpu_buffers(void)
 {
        ERRSTACK(1);
-       int i;
 
+       /* It is very important that we enqueue and dequeue entire records at once.
+        * If we leave partial records, the entire stream will be corrupt.  Our
+        * reader does its best to make sure it has room for complete records
+        * (checks qlen()).
+        *
+        * If we ever get corrupt streams, try making this a Qmsg.  Though it
+        * doesn't help every situation - we have issues with writes greater than
+        * Maxatomic regardless. */
        profiler_queue = qopen(profiler_queue_limit, 0, NULL, NULL);
        if (!profiler_queue)
                error(ENOMEM, ERROR_FIXME);
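
The comment above leaves Qmsg as a fallback if record boundaries ever get
split.  With the qopen() signature used in this file (the second argument is
the message-mode flag, zero in the call above), that change would look roughly
like the line below, with the caveat the comment already gives about writes
larger than Maxatomic:

        /* Hypothetical variant of the qopen() above: a message-oriented queue,
         * so reads do not cross block (i.e. record batch) boundaries. */
        profiler_queue = qopen(profiler_queue_limit, Qmsg, NULL, NULL);
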
@@ -275,13 +314,10 @@ static void alloc_cpu_buffers(void)
                nexterror();
        }
 
-       qdropoverflow(profiler_queue, TRUE);
-       qnonblock(profiler_queue, TRUE);
-
        profiler_percpu_ctx =
-           kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, KMALLOC_WAIT);
+           kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, MEM_WAIT);
 
-       for (i = 0; i < num_cores; i++) {
+       for (int i = 0; i < num_cores; i++) {
                struct profiler_cpu_context *b = &profiler_percpu_ctx[i];
 
                b->cpu = i;
@@ -310,26 +346,30 @@ int profiler_configure(struct cmdbuf *cb)
                        error(EFAIL, "Profiler already running");
                profiler_queue_limit = (int) profiler_get_checked_value(
                        cb->f[1], 1024, 1024 * 1024, max_pmem / 32);
-       } else if (!strcmp(cb->f[0], "prof_cpubufsz")) {
+               return 1;
+       }
+       if (!strcmp(cb->f[0], "prof_cpubufsz")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_cpubufsz KB");
                profiler_cpu_buffer_size = (size_t) profiler_get_checked_value(
                        cb->f[1], 1024, 16 * 1024, 1024 * 1024);
-       } else {
-               return 0;
+               return 1;
        }
 
-       return 1;
+       return 0;
 }
 
-const char* const *profiler_configure_cmds(void)
+void profiler_append_configure_usage(char *msgbuf, size_t buflen)
 {
-       static const char * const cmds[] = {
-               "prof_qlimit", "prof_cpubufsz",
-               NULL
+       const char * const cmds[] = {
+               "prof_qlimit",
+               "prof_cpubufsz",
        };
 
-       return cmds;
+       for (int i = 0; i < ARRAY_SIZE(cmds); i++) {
+               strlcat(msgbuf, "|", buflen);
+               strlcat(msgbuf, cmds[i], buflen);
+       }
 }
 
 static void profiler_release(struct kref *kref)
@@ -372,8 +412,8 @@ void profiler_setup(void)
                qunlock(&profiler_mtx);
                nexterror();
        }
-       if (!profiler_queue)
-               alloc_cpu_buffers();
+       assert(!profiler_queue);
+       alloc_cpu_buffers();
 
        /* Do this only when everything is initialized (as last init operation).
         */
@@ -392,11 +432,15 @@ void profiler_cleanup(void)
 
 static void profiler_cpu_flush(struct profiler_cpu_context *cpu_buf)
 {
+       int8_t irq_state = 0;
+
+       disable_irqsave(&irq_state);
        if (cpu_buf->block && profiler_queue) {
                qibwrite(profiler_queue, cpu_buf->block);
 
                cpu_buf->block = NULL;
        }
+       enable_irqsave(&irq_state);
 }
 
 static void profiler_core_trace_enable(void *opaque)
@@ -408,7 +452,7 @@ static void profiler_core_trace_enable(void *opaque)
                profiler_cpu_flush(cpu_buf);
 }
 
-void profiler_control_trace(int onoff)
+static void profiler_control_trace(int onoff)
 {
        struct core_set cset;
 
@@ -420,6 +464,20 @@ void profiler_control_trace(int onoff)
                        (void *) (uintptr_t) onoff);
 }
 
+void profiler_start(void)
+{
+       assert(profiler_queue);
+       profiler_control_trace(1);
+       qreopen(profiler_queue);
+}
+
+void profiler_stop(void)
+{
+       assert(profiler_queue);
+       profiler_control_trace(0);
+       qhangup(profiler_queue, 0);
+}
+
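
profiler_start() and profiler_stop() replace the formerly exported
profiler_control_trace() (now static) with a pair that also manages the central
queue: start re-opens the queue and enables per-core collection, stop disables
collection (which also flushes the per-core blocks, per the header comment) and
hangs up the queue.  A sketch of the call order expected from the kprof side
(illustrative ordering only; profiler_cleanup()'s body is not shown in this
diff):

        profiler_setup();       /* asserts no queue yet; allocs queue + pcpu ctxs */
        profiler_start();       /* qreopen()s the queue, enables per-core tracing */
        /* ... PMU overflows, mmap and comm events flow into profiler_queue ... */
        profiler_stop();        /* disables tracing (flushing pcpu blocks), qhangup()s */
        profiler_cleanup();     /* tear-down; body not shown in this diff */
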
 static void profiler_core_flush(void *opaque)
 {
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
@@ -438,65 +496,31 @@ void profiler_trace_data_flush(void)
        smp_do_in_cores(&cset, profiler_core_flush, NULL);
 }
 
-void profiler_add_trace(uintptr_t pc, uint64_t info)
-{
-       if (is_user_raddr((void *) pc, 1))
-               profiler_add_user_backtrace(pc, 0, info);
-       else
-               profiler_add_kernel_backtrace(pc, 0, info);
-}
-
-void profiler_add_kernel_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
+void profiler_push_kernel_backtrace(uintptr_t *pc_list, size_t nr_pcs,
+                                    uint64_t info)
 {
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
 
-               if (profiler_percpu_ctx && cpu_buf->tracing) {
-                       uintptr_t trace[PROFILER_BT_DEPTH];
-                       size_t n = 1;
-
-                       trace[0] = pc;
-                       if (likely(fp))
-                               n = backtrace_list(pc, fp, trace + 1,
-                                                  PROFILER_BT_DEPTH - 1) + 1;
-
-                       profiler_push_kernel_trace64(cpu_buf, trace, n, info);
-               }
+               if (profiler_percpu_ctx && cpu_buf->tracing)
+                       profiler_push_kernel_trace64(cpu_buf, pc_list, nr_pcs, info);
                kref_put(&profiler_kref);
        }
 }
 
-void profiler_add_user_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
+void profiler_push_user_backtrace(uintptr_t *pc_list, size_t nr_pcs,
+                                  uint64_t info)
 {
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct proc *p = current;
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
 
-               if (p && profiler_percpu_ctx && cpu_buf->tracing) {
-                       uintptr_t trace[PROFILER_BT_DEPTH];
-                       size_t n = 1;
-
-                       trace[0] = pc;
-                       if (likely(fp))
-                               n = backtrace_user_list(pc, fp, trace + 1,
-                                                       PROFILER_BT_DEPTH - 1) + 1;
-
-                       profiler_push_user_trace64(cpu_buf, p, trace, n, info);
-               }
+               if (profiler_percpu_ctx && cpu_buf->tracing)
+                       profiler_push_user_trace64(cpu_buf, p, pc_list, nr_pcs, info);
                kref_put(&profiler_kref);
        }
 }
 
-void profiler_add_hw_sample(struct hw_trapframe *hw_tf, uint64_t info)
-{
-       if (in_kernel(hw_tf))
-               profiler_add_kernel_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
-                                             info);
-       else
-               profiler_add_user_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
-                                           info);
-}
-
 int profiler_size(void)
 {
        return profiler_queue ? qlen(profiler_queue) : 0;
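
The removed entry points (profiler_add_trace(), profiler_add_kernel_backtrace(),
profiler_add_hw_sample()) walked the stack themselves, bounded by the now-removed
PROFILER_BT_DEPTH.  After this change the caller collects the PC list and hands
it over, so the backtrace depth and the user/kernel split live at the sample
site.  A sketch of such a caller, reusing backtrace_list() exactly as the
removed code did, with a caller-chosen depth:

/* Illustrative caller, not part of this patch.  MAX_BT_DEPTH is a
 * caller-chosen bound; the old PROFILER_BT_DEPTH was 16. */
#define MAX_BT_DEPTH 16

static void sample_kernel_ctx(uintptr_t pc, uintptr_t fp, uint64_t info)
{
        uintptr_t pc_list[MAX_BT_DEPTH];
        size_t nr_pcs = 1;

        pc_list[0] = pc;
        if (fp)
                nr_pcs += backtrace_list(pc, fp, pc_list + 1, MAX_BT_DEPTH - 1);
        profiler_push_kernel_backtrace(pc_list, nr_pcs, info);
}
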