Enable PFM sampling to pass a 64-bit info value
author     Davide Libenzi <dlibenzi@google.com>
Sat, 12 Dec 2015 21:35:31 +0000 (13:35 -0800)
committer  Barret Rhoden <brho@cs.berkeley.edu>
Wed, 16 Dec 2015 21:27:59 +0000 (16:27 -0500)
Enable PFM sampling to pass a 64-bit info value to allow userspace to
distinguish among samples.
As part of this CL, enabling and disabling the profiler have been improved
to use the call-in-all-cores APIs.
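
The info value is a tagged word: the top four bits carry a domain
(PROF_DOM_TIMER or PROF_DOM_PMU) and the low 60 bits carry domain data,
namely the timer period for timer samples, or the event/umask encoding
(with bit 16 set for fixed counters) for PMU samples. A minimal sketch of
how a userspace consumer of the profiler records might classify samples
with the macros added in this CL; the helper name is hypothetical:

#include <stdio.h>
#include <stdint.h>
#include <ros/profiler_records.h>

/* Hypothetical helper: classify one sample record by its info word. */
static void print_sample_info(uint64_t info)
{
	uint64_t data = PROF_INFO_DATA(info);

	switch (PROF_INFO_DOM(info)) {
	case PROF_DOM_TIMER:
		/* Data is the kprof timer period for this sample. */
		printf("timer sample, period=%llu\n",
		       (unsigned long long) data);
		break;
	case PROF_DOM_PMU:
		/* Data is (mask << 8) | event; bit 16 marks a fixed counter. */
		printf("pmu sample, event=%#llx mask=%#llx%s\n",
		       (unsigned long long) (data & 0xff),
		       (unsigned long long) ((data >> 8) & 0xff),
		       (data & (1 << 16)) ? " (fixed)" : "");
		break;
	default:
		printf("unknown domain %llu\n",
		       (unsigned long long) PROF_INFO_DOM(info));
	}
}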

Signed-off-by: Davide Libenzi <dlibenzi@google.com>
Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
kern/arch/x86/perfmon.c
kern/drivers/dev/kprof.c
kern/include/profiler.h
kern/include/ros/profiler_records.h
kern/src/profiler.c

diff --git a/kern/arch/x86/perfmon.c b/kern/arch/x86/perfmon.c
index f444ea6..810be3c 100644
@@ -5,6 +5,7 @@
 
 #include <sys/types.h>
 #include <arch/ros/msr-index.h>
+#include <arch/ros/membar.h>
 #include <arch/x86.h>
 #include <arch/msr.h>
 #include <arch/uaccess.h>
@@ -55,13 +56,13 @@ static void perfmon_read_cpu_caps(struct perfmon_cpu_caps *pcc)
 
        cpuid(0x0a, 0, &a, &b, &c, &d);
 
-       ZERO_DATA(*pcc);
-       pcc->perfmon_version = a & 0xff;
        pcc->proc_arch_events = a >> 24;
        pcc->bits_x_counter = (a >> 16) & 0xff;
        pcc->counters_x_proc = (a >> 8) & 0xff;
        pcc->bits_x_fix_counter = (d >> 5) & 0xff;
        pcc->fix_counters_x_proc = d & 0x1f;
+       wmb_f();
+       pcc->perfmon_version = a & 0xff;
 }
 
 static void perfmon_enable_event(int event, bool enable)
@@ -96,11 +97,11 @@ static uint64_t perfmon_get_fixevent_mask(const struct perfmon_event *pev,
 {
        uint64_t m = 0;
 
-       if (pev->u.b.inten)
+       if (PMEV_GET_EN(pev->event))
                m |= 1 << 3;
-       if (pev->u.b.os)
+       if (PMEV_GET_OS(pev->event))
                m |= (1 << 0);
-       if (pev->u.b.usr)
+       if (PMEV_GET_USR(pev->event))
                m |= (1 << 1);
 
        m <<= eventno * FIXCNTR_NBITS;
@@ -119,14 +120,14 @@ static void perfmon_do_cores_alloc(void *opaque)
        if (perfmon_is_fixed_event(&pa->ev)) {
                uint64_t fxctrl_value = read_msr(MSR_CORE_PERF_FIXED_CTR_CTRL), tmp;
 
-               i = pa->ev.u.b.event;
+               i = PMEV_GET_EVENT(pa->ev.event);
                if (i >= (int) cpu_caps.fix_counters_x_proc) {
                        i = -EINVAL;
                } else if (fxctrl_value & (FIXCNTR_MASK << i)) {
                        i = -EBUSY;
                } else {
                        cctx->fixed_counters[i] = pa->ev;
-                       cctx->fixed_counters[i].u.b.en = 1;
+                       PMEV_SET_EN(cctx->fixed_counters[i].event, 1);
 
                        tmp = perfmon_get_fixevent_mask(&pa->ev, i, fxctrl_value);
 
@@ -138,7 +139,7 @@ static void perfmon_do_cores_alloc(void *opaque)
                }
        } else {
                for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
-                       if (cctx->counters[i].u.v == 0) {
+                       if (cctx->counters[i].event == 0) {
                                if (!perfmon_event_available(i))
                                        warn_once("Counter %d is free but not available", i);
                                else
@@ -147,13 +148,13 @@ static void perfmon_do_cores_alloc(void *opaque)
                }
                if (i < (int) cpu_caps.counters_x_proc) {
                        cctx->counters[i] = pa->ev;
-                       cctx->counters[i].u.b.en = 1;
+                       PMEV_SET_EN(cctx->counters[i].event, 1);
 
                        perfmon_enable_event(i, TRUE);
 
                        write_msr(MSR_IA32_PERFCTR0 + i, -(int64_t) pa->ev.trigger_count);
                        write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + i,
-                                         cctx->counters[i].u.v);
+                                         cctx->counters[i].event);
                } else {
                        i = -ENOSPC;
                }
@@ -285,6 +286,11 @@ static struct perfmon_status *perfmon_alloc_status(void)
        return pef;
 }
 
+static void perfmon_arm_irq(void)
+{
+       write_mmreg32(LAPIC_LVT_PERFMON, IdtLAPIC_PCINT);
+}
+
 void perfmon_init(void)
 {
        int i;
@@ -293,6 +299,11 @@ void perfmon_init(void)
        lcr4(rcr4() | CR4_PCE);
 
        /* This will be called from every core, no need to execute more than once.
+        * All the calls to perfmon_init() will be done when the cores boot, so
+        * there will be no perfmon users calling it while perfmon_read_cpu_caps()
+        * is executing.
+        * All the cores will be writing the same values, so even from that POV,
+        * no serialization is required.
         */
        if (cpu_caps.perfmon_version == 0)
                perfmon_read_cpu_caps(&cpu_caps);
@@ -308,7 +319,18 @@ void perfmon_init(void)
        for (i = 0; i < (int) cpu_caps.fix_counters_x_proc; i++)
                write_msr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
 
-       write_mmreg32(LAPIC_LVT_PERFMON, IdtLAPIC_PCINT);
+       perfmon_arm_irq();
+}
+
+static uint64_t perfmon_make_sample_event(const struct perfmon_event *pev)
+{
+       uint64_t ei = ((uint64_t) PMEV_GET_MASK(pev->event) << 8) |
+               PMEV_GET_EVENT(pev->event);
+
+       if (perfmon_is_fixed_event(pev))
+               ei |= 1 << 16;
+
+       return PROF_MKINFO(PROF_DOM_PMU, ei);
 }
 
 void perfmon_interrupt(struct hw_trapframe *hw_tf, void *data)
@@ -317,28 +339,32 @@ void perfmon_interrupt(struct hw_trapframe *hw_tf, void *data)
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        uint64_t gctrl, status;
 
-       profiler_add_hw_sample(hw_tf);
-
        spin_lock_irqsave(&cctx->lock);
        /* We need to save the global control status, because we need to disable
         * counters in order to be able to reset their values.
         * We will restore the global control status on exit.
         */
+       status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-       status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);
        for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
                if (status & ((uint64_t) 1 << i)) {
-                       if (cctx->counters[i].u.v)
+                       if (cctx->counters[i].event) {
+                               profiler_add_hw_sample(
+                                       hw_tf, perfmon_make_sample_event(cctx->counters + i));
                                write_msr(MSR_IA32_PERFCTR0 + i,
                                                  -(int64_t) cctx->counters[i].trigger_count);
+                       }
                }
        }
        for (i = 0; i < (int) cpu_caps.fix_counters_x_proc; i++) {
                if (status & ((uint64_t) 1 << (32 + i))) {
-                       if (cctx->fixed_counters[i].u.v)
+                       if (cctx->fixed_counters[i].event) {
+                               profiler_add_hw_sample(
+                                       hw_tf, perfmon_make_sample_event(cctx->fixed_counters + i));
                                write_msr(MSR_CORE_PERF_FIXED_CTR0 + i,
                                                  -(int64_t) cctx->fixed_counters[i].trigger_count);
+                       }
                }
        }
        write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, status);
@@ -346,8 +372,10 @@ void perfmon_interrupt(struct hw_trapframe *hw_tf, void *data)
        spin_unlock_irqsave(&cctx->lock);
 
        /* We need to re-arm the IRQ as the PFM IRQ gets masked on trigger.
+        * Note that KVM and real HW seem to behave differently WRT re-arming the
+        * IRQ: KVM does not mask the IRQ on trigger, while real HW does.
         */
-       write_mmreg32(LAPIC_LVT_PERFMON, IdtLAPIC_PCINT);
+       perfmon_arm_irq();
 }
 
 void perfmon_get_cpu_caps(struct perfmon_cpu_caps *pcc)
diff --git a/kern/drivers/dev/kprof.c b/kern/drivers/dev/kprof.c
index fe2194a..b9bf2c3 100644
@@ -7,6 +7,7 @@
  * in the LICENSE file.
  */
 
+#include <ros/profiler_records.h>
 #include <arch/time.h>
 #include <vfs.h>
 #include <slab.h>
@@ -97,7 +98,8 @@ static void kprof_alarm_handler(struct alarm_waiter *waiter,
        int coreid = core_id();
        struct timer_chain *tchain = &per_cpu_info[coreid].tchain;
 
-       profiler_add_hw_sample(hw_tf);
+       profiler_add_hw_sample(hw_tf, PROF_MKINFO(PROF_DOM_TIMER,
+                                                                                         kprof_timer_period));
        reset_alarm_rel(tchain, waiter, kprof_timer_period);
 }
 
@@ -197,6 +199,12 @@ static void kprof_stop_profiler(void)
        qunlock(&kprof.lock);
 }
 
+static void kprof_flush_profiler(void)
+{
+       if (kprof.profiling)
+               profiler_trace_data_flush();
+}
+
 static void kprof_init(void)
 {
        int i;
@@ -410,7 +418,7 @@ static void kprof_manage_timer(int coreid, struct cmdbuf *cb)
 
 static void kprof_usage_fail(void)
 {
-       static const char *ctlstring = "clear|start|stop|timer";
+       static const char *ctlstring = "clear|start|stop|flush|timer";
        const char * const *cmds = profiler_configure_cmds();
        char msgbuf[128];
 
@@ -457,6 +465,8 @@ static long kprof_write(struct chan *c, void *a, long n, int64_t unused)
                        }
                } else if (!strcmp(cb->f[0], "start")) {
                        kprof_start_profiler();
+               } else if (!strcmp(cb->f[0], "flush")) {
+                       kprof_flush_profiler();
                } else if (!strcmp(cb->f[0], "stop")) {
                        kprof_stop_profiler();
                } else {
@@ -464,7 +474,7 @@ static long kprof_write(struct chan *c, void *a, long n, int64_t unused)
                }
                break;
        case Kprofdataqid:
-               profiler_add_trace((uintptr_t) strtoul(a, 0, 0));
+               profiler_add_trace((uintptr_t) strtoul(a, 0, 0), 0);
                break;
        case Kptraceqid:
                if (a && (n > 0)) {
diff --git a/kern/include/profiler.h b/kern/include/profiler.h
index 862c87f..aedd748 100644
@@ -18,11 +18,12 @@ const char * const *profiler_configure_cmds(void);
 void profiler_init(void);
 void profiler_setup(void);
 void profiler_cleanup(void);
-void profiler_add_kernel_backtrace(uintptr_t pc, uintptr_t fp);
-void profiler_add_user_backtrace(uintptr_t pc, uintptr_t fp);
-void profiler_add_trace(uintptr_t pc);
+void profiler_add_kernel_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info);
+void profiler_add_user_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info);
+void profiler_add_trace(uintptr_t pc, uint64_t info);
 void profiler_control_trace(int onoff);
-void profiler_add_hw_sample(struct hw_trapframe *hw_tf);
+void profiler_trace_data_flush(void);
+void profiler_add_hw_sample(struct hw_trapframe *hw_tf, uint64_t info);
 int profiler_size(void);
 int profiler_read(void *va, int n);
 void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
diff --git a/kern/include/ros/profiler_records.h b/kern/include/ros/profiler_records.h
index 73309a6..c08ffb4 100644
@@ -7,9 +7,22 @@
 
 #include <sys/types.h>
 
+#define PROF_DOM_SHIFT (8 * sizeof(uint64_t) - 4)
+#define PROF_INFO_MASK (((uint64_t) 1 << PROF_DOM_SHIFT) - 1)
+
+#define PROF_MKINFO(dom, dinfo)                                                                \
+       (((uint64_t) (dom) << PROF_DOM_SHIFT) | ((dinfo) & PROF_INFO_MASK))
+
+#define PROF_INFO_DOM(i) ((uint64_t) (i) >> PROF_DOM_SHIFT)
+#define PROF_INFO_DATA(i) ((i) & PROF_INFO_MASK)
+
+#define PROF_DOM_TIMER 1
+#define PROF_DOM_PMU 2
+
 #define PROFTYPE_KERN_TRACE64  1
 
 struct proftype_kern_trace64 {
+       uint64_t info;
        uint64_t tstamp;
        uint16_t cpu;
        uint16_t num_traces;
@@ -19,6 +32,7 @@ struct proftype_kern_trace64 {
 #define PROFTYPE_USER_TRACE64  2
 
 struct proftype_user_trace64 {
+       uint64_t info;
        uint64_t tstamp;
        uint32_t pid;
        uint16_t cpu;
diff --git a/kern/src/profiler.c b/kern/src/profiler.c
index a6b9a99..89b35e8 100644
@@ -21,6 +21,7 @@
 #include <elf.h>
 #include <ns.h>
 #include <err.h>
+#include <core_set.h>
 #include <string.h>
 #include "profiler.h"
 
@@ -39,7 +40,6 @@ struct profiler_cpu_context {
 static int profiler_queue_limit = 64 * 1024 * 1024;
 static size_t profiler_cpu_buffer_size = 65536;
 static qlock_t profiler_mtx = QLOCK_INITIALIZER(profiler_mtx);
-static int tracing;
 static struct kref profiler_kref;
 static struct profiler_cpu_context *profiler_percpu_ctx;
 static struct queue *profiler_queue;
@@ -106,7 +106,8 @@ static inline size_t profiler_max_envelope_size(void)
 }
 
 static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
-                                                                                const uintptr_t *trace, size_t count)
+                                                                                const uintptr_t *trace, size_t count,
+                                                                                uint64_t info)
 {
        size_t i, size = sizeof(struct proftype_kern_trace64) +
                count * sizeof(uint64_t);
@@ -124,6 +125,7 @@ static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
                record = (struct proftype_kern_trace64 *) ptr;
                ptr += size;
 
+               record->info = info;
                record->tstamp = nsec();
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
@@ -136,7 +138,7 @@ static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
 
 static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
                                                                           struct proc *p, const uintptr_t *trace,
-                                                                          size_t count)
+                                                                          size_t count, uint64_t info)
 {
        size_t i, size = sizeof(struct proftype_user_trace64) +
                count * sizeof(uint64_t);
@@ -154,6 +156,7 @@ static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
                record = (struct proftype_user_trace64 *) ptr;
                ptr += size;
 
+               record->info = info;
                record->tstamp = nsec();
                record->pid = p->pid;
                record->cpu = cpu_buf->cpu;
@@ -248,21 +251,6 @@ static void profiler_emit_current_system_status(void)
        proc_free_set(&pset);
 }
 
-static inline bool profiler_is_tracing(struct profiler_cpu_context *cpu_buf)
-{
-       if (unlikely(cpu_buf->tracing < 0)) {
-               if (cpu_buf->block) {
-                       qibwrite(profiler_queue, cpu_buf->block);
-
-                       cpu_buf->block = NULL;
-               }
-
-               cpu_buf->tracing = 0;
-       }
-
-       return (cpu_buf->tracing != 0) ? TRUE : FALSE;
-}
-
 static void free_cpu_buffers(void)
 {
        kfree(profiler_percpu_ctx);
@@ -387,12 +375,12 @@ void profiler_setup(void)
        if (!profiler_queue)
                alloc_cpu_buffers();
 
-       profiler_emit_current_system_status();
-
        /* Do this only when everything is initialized (as last init operation).
         */
        __kref_get(&profiler_kref, 1);
 
+       profiler_emit_current_system_status();
+
        poperror();
        qunlock(&profiler_mtx);
 }
@@ -402,37 +390,68 @@ void profiler_cleanup(void)
        kref_put(&profiler_kref);
 }
 
+static void profiler_cpu_flush(struct profiler_cpu_context *cpu_buf)
+{
+       if (cpu_buf->block && profiler_queue) {
+               qibwrite(profiler_queue, cpu_buf->block);
+
+               cpu_buf->block = NULL;
+       }
+}
+
+static void profiler_core_trace_enable(void *opaque)
+{
+       struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
+
+       cpu_buf->tracing = (int) (opaque != NULL);
+       if (!cpu_buf->tracing)
+               profiler_cpu_flush(cpu_buf);
+}
+
 void profiler_control_trace(int onoff)
 {
-       int core;
+       struct core_set cset;
 
-       tracing = onoff;
-       for (core = 0; core < num_cores; core++) {
-               struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core);
+       error_assert(EINVAL, profiler_percpu_ctx);
 
-               /*
-                * We cannot access directly other CPU buffers from here, in order
-                * to issue a flush. So, when disabling, we set tracing = -1, and
-                * we let profiler_is_tracing() to perform it at the next timer tick.
-                */
-               cpu_buf->tracing = onoff ? 1 : -1;
-       }
+       core_set_init(&cset);
+       core_set_fill_available(&cset);
+       smp_do_in_cores(&cset, profiler_core_trace_enable,
+                                       (void *) (uintptr_t) onoff);
+}
+
+static void profiler_core_flush(void *opaque)
+{
+       struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
+
+       profiler_cpu_flush(cpu_buf);
+}
+
+void profiler_trace_data_flush(void)
+{
+       struct core_set cset;
+
+       error_assert(EINVAL, profiler_percpu_ctx);
+
+       core_set_init(&cset);
+       core_set_fill_available(&cset);
+       smp_do_in_cores(&cset, profiler_core_flush, NULL);
 }
 
-void profiler_add_trace(uintptr_t pc)
+void profiler_add_trace(uintptr_t pc, uint64_t info)
 {
        if (is_user_raddr((void *) pc, 1))
-               profiler_add_user_backtrace(pc, 0);
+               profiler_add_user_backtrace(pc, 0, info);
        else
-               profiler_add_kernel_backtrace(pc, 0);
+               profiler_add_kernel_backtrace(pc, 0, info);
 }
 
-void profiler_add_kernel_backtrace(uintptr_t pc, uintptr_t fp)
+void profiler_add_kernel_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
 {
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
 
-               if (profiler_percpu_ctx && profiler_is_tracing(cpu_buf)) {
+               if (profiler_percpu_ctx && cpu_buf->tracing) {
                        uintptr_t trace[PROFILER_BT_DEPTH];
                        size_t n = 1;
 
@@ -441,19 +460,19 @@ void profiler_add_kernel_backtrace(uintptr_t pc, uintptr_t fp)
                                n = backtrace_list(pc, fp, trace + 1,
                                                                   PROFILER_BT_DEPTH - 1) + 1;
 
-                       profiler_push_kernel_trace64(cpu_buf, trace, n);
+                       profiler_push_kernel_trace64(cpu_buf, trace, n, info);
                }
                kref_put(&profiler_kref);
        }
 }
 
-void profiler_add_user_backtrace(uintptr_t pc, uintptr_t fp)
+void profiler_add_user_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
 {
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct proc *p = current;
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
 
-               if (p && profiler_percpu_ctx && profiler_is_tracing(cpu_buf)) {
+               if (p && profiler_percpu_ctx && cpu_buf->tracing) {
                        uintptr_t trace[PROFILER_BT_DEPTH];
                        size_t n = 1;
 
@@ -462,18 +481,20 @@ void profiler_add_user_backtrace(uintptr_t pc, uintptr_t fp)
                                n = backtrace_user_list(pc, fp, trace + 1,
                                                                                PROFILER_BT_DEPTH - 1) + 1;
 
-                       profiler_push_user_trace64(cpu_buf, p, trace, n);
+                       profiler_push_user_trace64(cpu_buf, p, trace, n, info);
                }
                kref_put(&profiler_kref);
        }
 }
 
-void profiler_add_hw_sample(struct hw_trapframe *hw_tf)
+void profiler_add_hw_sample(struct hw_trapframe *hw_tf, uint64_t info)
 {
        if (in_kernel(hw_tf))
-               profiler_add_kernel_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf));
+               profiler_add_kernel_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
+                                                                         info);
        else
-               profiler_add_user_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf));
+               profiler_add_user_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
+                                                                       info);
 }
 
 int profiler_size(void)
@@ -490,7 +511,7 @@ void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
                                                  int flags, struct file *f, size_t offset)
 {
        if (kref_get_not_zero(&profiler_kref, 1)) {
-               if (f && (prot & PROT_EXEC) && profiler_percpu_ctx && tracing) {
+               if (f && (prot & PROT_EXEC) && profiler_percpu_ctx) {
                        char path_buf[PROFILER_MAX_PRG_PATH];
                        char *path = file_abs_path(f, path_buf, sizeof(path_buf));
 
@@ -504,7 +525,7 @@ void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
 void profiler_notify_new_process(struct proc *p)
 {
        if (kref_get_not_zero(&profiler_kref, 1)) {
-               if (profiler_percpu_ctx && tracing && p->binary_path)
+               if (profiler_percpu_ctx && p->binary_path)
                        profiler_push_new_process(p);
                kref_put(&profiler_kref);
        }
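
The new "flush" kprof control maps to profiler_trace_data_flush(): it pushes
any partially filled per-core profiler blocks into the profiler queue so
already collected samples become readable without stopping the profiler. A
minimal userspace sketch, assuming the kprof device is bound at /prof; the
helper name is hypothetical:

#include <fcntl.h>
#include <unistd.h>

/* Ask kprof to flush per-core sample buffers into the output queue. */
static int kprof_flush(void)
{
	int fd = open("/prof/kpctl", O_WRONLY);

	if (fd < 0)
		return -1;
	/* One of the command strings kprof_write() parses:
	 * clear|start|stop|flush|timer. */
	write(fd, "flush", 5);
	close(fd);
	return 0;
}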