perf: Use a user_data blob for perf_event (XCC)
diff --git a/kern/arch/x86/perfmon.c b/kern/arch/x86/perfmon.c
index f444ea6..05e46a3 100644
--- a/kern/arch/x86/perfmon.c
+++ b/kern/arch/x86/perfmon.c
@@ -5,6 +5,7 @@
 
 #include <sys/types.h>
 #include <arch/ros/msr-index.h>
+#include <arch/ros/membar.h>
 #include <arch/x86.h>
 #include <arch/msr.h>
 #include <arch/uaccess.h>
@@ -55,105 +56,165 @@ static void perfmon_read_cpu_caps(struct perfmon_cpu_caps *pcc)
 
        cpuid(0x0a, 0, &a, &b, &c, &d);
 
-       ZERO_DATA(*pcc);
-       pcc->perfmon_version = a & 0xff;
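+       /* CPUID leaf 0x0a layout, per the Intel SDM: EAX[7:0] version,
+        * EAX[15:8] GP counters per core, EAX[23:16] GP counter bit width,
+        * EAX[31:24] number of arch events, EDX[4:0] fixed counters per core,
+        * EDX[12:5] fixed counter bit width. */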
        pcc->proc_arch_events = a >> 24;
        pcc->bits_x_counter = (a >> 16) & 0xff;
        pcc->counters_x_proc = (a >> 8) & 0xff;
        pcc->bits_x_fix_counter = (d >> 5) & 0xff;
        pcc->fix_counters_x_proc = d & 0x1f;
+       pcc->perfmon_version = a & 0xff;
 }
 
-static void perfmon_enable_event(int event, bool enable)
+static void perfmon_enable_event(int idx, uint64_t event)
 {
-       uint64_t gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
+       uint64_t gctrl;
 
-       if (enable)
-               write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl | (1 << event));
-       else
-               write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl & ~(1 << event));
+       /* Events need to be enabled in both MSRs */
+       write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + idx, event);
+       gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
+       write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl | (1 << idx));
 }
 
-static void perfmon_enable_fix_event(int event, bool enable)
+static void perfmon_disable_event(int idx)
 {
-       uint64_t gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
+       uint64_t gctrl;
 
-       if (enable)
-               write_msr(MSR_CORE_PERF_GLOBAL_CTRL,
-                                 gctrl | ((uint64_t) 1 << (32 + event)));
-       else
-               write_msr(MSR_CORE_PERF_GLOBAL_CTRL,
-                                 gctrl & ~((uint64_t) 1 << (32 + event)));
+       /* Events can be disabled in either location.  We could just clear the
+        * global ctrl, but we use the contents of EVENTSEL to say if the counter is
+        * available or not. */
+       write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + idx, 0);
+       gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
+       write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl & ~(1 << idx));
 }
 
-static bool perfmon_event_available(uint32_t event)
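+/* A counter whose EVENTSEL is zero is free; perfmon_disable_event() clears
+ * EVENTSEL for exactly this reason. */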
+static bool perfmon_event_available(uint32_t idx)
 {
-       return read_msr(MSR_ARCH_PERFMON_EVENTSEL0 + event) == 0;
+       return read_msr(MSR_ARCH_PERFMON_EVENTSEL0 + idx) == 0;
 }
 
-static uint64_t perfmon_get_fixevent_mask(const struct perfmon_event *pev,
-                                                                                 int eventno, uint64_t base)
+/* Helper.  Given an event, a fixed counter index, and the contents of the fixed
+ * counter ctl MSR, output the value for the fixed counter ctl that will enable
+ * the event at idx. */
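+/* Each fixed counter owns a 4-bit field in MSR_CORE_PERF_FIXED_CTR_CTRL:
+ * bit 0 counts ring 0 (OS), bit 1 counts ring 3 (USR), bit 2 is AnyThread
+ * (version 3 and later), and bit 3 enables the PMI. */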
+static uint64_t perfmon_apply_fixevent_mask(uint64_t event, int idx,
+                                            uint64_t base)
 {
        uint64_t m = 0;
 
-       if (pev->u.b.inten)
-               m |= 1 << 3;
-       if (pev->u.b.os)
+       if (PMEV_GET_OS(event))
                m |= (1 << 0);
-       if (pev->u.b.usr)
+       if (PMEV_GET_USR(event))
                m |= (1 << 1);
-
-       m <<= eventno * FIXCNTR_NBITS;
-       m |= base & ~(FIXCNTR_MASK << (eventno * FIXCNTR_NBITS));
+       if (PMEV_GET_ANYTH(event) && (cpu_caps.perfmon_version >= 3))
+               m |= (1 << 2);
+       if (PMEV_GET_INTEN(event))
+               m |= (1 << 3);
+       /* Enforce the in-use convention: if the event is enabled but neither OS
+        * nor USR was set, set both.  The counter's ctl field must be non-zero
+        * for us to consider it in use. */
+       if (PMEV_GET_EN(event) && !PMEV_GET_OS(event) && !PMEV_GET_USR(event))
+               m |= (1 << 0) | (1 << 1);
+
+       m <<= idx * FIXCNTR_NBITS;
+       m |= base & ~(FIXCNTR_MASK << (idx * FIXCNTR_NBITS));
 
        return m;
 }
 
+/* These helpers take the fxctrl_value to save on a rdmsr. */
+static void perfmon_enable_fix_event(int idx, uint64_t event,
+                                     uint64_t fxctrl_value)
+{
+       uint64_t gctrl, fx;
+
+       /* Enable in both locations: the bits in FIXED and the bit in GLOBAL. */
+       fx = perfmon_apply_fixevent_mask(event, idx, fxctrl_value);
+       write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL, fx);
+       gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
+       write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl | ((uint64_t) 1 << (32 + idx)));
+}
+
+static void perfmon_disable_fix_event(int idx, uint64_t fxctrl_value)
+{
+       uint64_t gctrl;
+
+       /* Events can be disabled in either location.  We could just clear the
+        * global ctrl, but we use the bits of fxctrl to say if the counter is
+        * available or not. */
+       write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL,
+                 fxctrl_value & ~(FIXCNTR_MASK << (idx * FIXCNTR_NBITS)));
+       gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
+       write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl & ~((uint64_t) 1 << (32 + idx)));
+}
+
+static bool perfmon_fix_event_available(uint32_t idx, uint64_t fxctrl_value)
+{
+       return (fxctrl_value & (FIXCNTR_MASK << (idx * FIXCNTR_NBITS))) == 0;
+}
+
+/* Helper to set a fixed perfcounter to trigger/overflow after count events.
+ * Anytime you set a perfcounter to something non-zero, you need to use this
+ * helper. */
+static void perfmon_set_fixed_trigger(unsigned int idx, uint64_t count)
+{
+       int64_t write_val = -(int64_t)count;
+
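+       /* Counters count up and raise the PMI on overflow, so writing -count,
+        * truncated to the counter's bit width, triggers after count events. */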
+       write_val &= (1ULL << cpu_caps.bits_x_fix_counter) - 1;
+       write_msr(MSR_CORE_PERF_FIXED_CTR0 + idx, write_val);
+}
+
+/* Helper to set a regular perfcounter to trigger/overflow after count events.
+ * Anytime you set a perfcounter to something non-zero, you ought to use this
+ * helper. */
+static void perfmon_set_unfixed_trigger(unsigned int idx, uint64_t count)
+{
+       int64_t write_val = -(int64_t)count;
+
+       write_val &= (1ULL << cpu_caps.bits_x_counter) - 1;
+       write_msr(MSR_IA32_PERFCTR0 + idx, write_val);
+}
+
 static void perfmon_do_cores_alloc(void *opaque)
 {
        struct perfmon_alloc *pa = (struct perfmon_alloc *) opaque;
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        int i;
+       struct perfmon_event *pev;
 
        spin_lock_irqsave(&cctx->lock);
        if (perfmon_is_fixed_event(&pa->ev)) {
-               uint64_t fxctrl_value = read_msr(MSR_CORE_PERF_FIXED_CTR_CTRL), tmp;
+               uint64_t fxctrl_value = read_msr(MSR_CORE_PERF_FIXED_CTR_CTRL);
 
-               i = pa->ev.u.b.event;
+               i = PMEV_GET_EVENT(pa->ev.event);
                if (i >= (int) cpu_caps.fix_counters_x_proc) {
                        i = -EINVAL;
-               } else if (fxctrl_value & (FIXCNTR_MASK << i)) {
+               } else if (!perfmon_fix_event_available(i, fxctrl_value)) {
                        i = -EBUSY;
                } else {
+                       /* Keep a copy of pa->ev for later.  pa is read-only and shared. */
                        cctx->fixed_counters[i] = pa->ev;
-                       cctx->fixed_counters[i].u.b.en = 1;
-
-                       tmp = perfmon_get_fixevent_mask(&pa->ev, i, fxctrl_value);
-
-                       perfmon_enable_fix_event(i, TRUE);
-
-                       write_msr(MSR_CORE_PERF_FIXED_CTR0 + i,
-                                         -(int64_t) pa->ev.trigger_count);
-                       write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL, tmp);
+                       pev = &cctx->fixed_counters[i];
+                       if (PMEV_GET_INTEN(pev->event))
+                               perfmon_set_fixed_trigger(i, pev->trigger_count);
+                       else
+                               write_msr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
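+                       /* Writing OVF_CTRL clears that counter's stale overflow status */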
+                       write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, 1ULL << (32 + i));
+                       perfmon_enable_fix_event(i, pev->event, fxctrl_value);
                }
        } else {
                for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
-                       if (cctx->counters[i].u.v == 0) {
-                               if (!perfmon_event_available(i))
-                                       warn_once("Counter %d is free but not available", i);
-                               else
-                                       break;
+                       if (cctx->counters[i].event == 0) {
+                               /* kernel bug if the MSRs don't agree with our bookkeeping */
+                               assert(perfmon_event_available(i));
+                               break;
                        }
                }
                if (i < (int) cpu_caps.counters_x_proc) {
                        cctx->counters[i] = pa->ev;
-                       cctx->counters[i].u.b.en = 1;
-
-                       perfmon_enable_event(i, TRUE);
-
-                       write_msr(MSR_IA32_PERFCTR0 + i, -(int64_t) pa->ev.trigger_count);
-                       write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + i,
-                                         cctx->counters[i].u.v);
+                       pev = &cctx->counters[i];
+                       if (PMEV_GET_INTEN(pev->event))
+                               perfmon_set_unfixed_trigger(i, pev->trigger_count);
+                       else
+                               write_msr(MSR_IA32_PERFCTR0 + i, 0);
+                       write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, 1ULL << i);
+                       perfmon_enable_event(i, pev->event);
                } else {
                        i = -ENOSPC;
                }
@@ -172,28 +233,20 @@ static void perfmon_do_cores_free(void *opaque)
 
        spin_lock_irqsave(&cctx->lock);
        if (perfmon_is_fixed_event(&pa->ev)) {
-               unsigned int ccbitsh = ccno * FIXCNTR_NBITS;
                uint64_t fxctrl_value = read_msr(MSR_CORE_PERF_FIXED_CTR_CTRL);
 
                if ((ccno >= cpu_caps.fix_counters_x_proc) ||
-                       !(fxctrl_value & (FIXCNTR_MASK << ccbitsh))) {
+                   perfmon_fix_event_available(ccno, fxctrl_value)) {
                        err = -ENOENT;
                } else {
                        perfmon_init_event(&cctx->fixed_counters[ccno]);
-
-                       perfmon_enable_fix_event((int) ccno, FALSE);
-
-                       write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL,
-                                         fxctrl_value & ~(FIXCNTR_MASK << ccbitsh));
+                       perfmon_disable_fix_event((int) ccno, fxctrl_value);
                        write_msr(MSR_CORE_PERF_FIXED_CTR0 + ccno, 0);
                }
        } else {
                if (ccno < (int) cpu_caps.counters_x_proc) {
                        perfmon_init_event(&cctx->counters[ccno]);
-
-                       perfmon_enable_event((int) ccno, FALSE);
-
-                       write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + ccno, 0);
+                       perfmon_disable_event((int) ccno);
                        write_msr(MSR_IA32_PERFCTR0 + ccno, 0);
                } else {
                        err = -ENOENT;
@@ -214,15 +267,15 @@ static void perfmon_do_cores_status(void *opaque)
        spin_lock_irqsave(&cctx->lock);
        if (perfmon_is_fixed_event(&env->pa->ev))
                env->pef->cores_values[coreno] =
-                       read_msr(MSR_CORE_PERF_FIXED_CTR0 + ccno);
+                   read_msr(MSR_CORE_PERF_FIXED_CTR0 + ccno);
        else
                env->pef->cores_values[coreno] =
-                       read_msr(MSR_IA32_PERFCTR0 + ccno);
+                   read_msr(MSR_IA32_PERFCTR0 + ccno);
        spin_unlock_irqsave(&cctx->lock);
 }
 
 static void perfmon_setup_alloc_core_set(const struct perfmon_alloc *pa,
-                                                                                struct core_set *cset)
+                                         struct core_set *cset)
 {
        int i;
 
@@ -265,8 +318,8 @@ static struct perfmon_alloc *perfmon_create_alloc(const struct perfmon_event *pe
 {
        int i;
        struct perfmon_alloc *pa = kzmalloc(sizeof(struct perfmon_alloc) +
-                                                                               num_cores * sizeof(counter_t),
-                                                                               KMALLOC_WAIT);
+                                               num_cores * sizeof(counter_t),
+                                           MEM_WAIT);
 
        kref_init(&pa->ref, perfmon_release_alloc, 1);
        pa->ev = *pev;
@@ -279,24 +332,36 @@ static struct perfmon_alloc *perfmon_create_alloc(const struct perfmon_event *pe
 static struct perfmon_status *perfmon_alloc_status(void)
 {
        struct perfmon_status *pef = kzmalloc(sizeof(struct perfmon_status) +
-                                                                                 num_cores * sizeof(uint64_t),
-                                                                                 KMALLOC_WAIT);
+                                                 num_cores * sizeof(uint64_t),
+                                             MEM_WAIT);
 
        return pef;
 }
 
-void perfmon_init(void)
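+/* Point the local APIC perfmon LVT entry at our vector.  Writing the LVT also
+ * unmasks it; the entry gets masked whenever the PMI fires. */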
+static void perfmon_arm_irq(void)
+{
+       apicrput(MSR_LAPIC_LVT_PERFMON, IdtLAPIC_PCINT);
+}
+
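+/* Architectural perfmon version 2 added the GLOBAL_{CTRL,STATUS,OVF_CTRL}
+ * MSRs, which this code relies on throughout. */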
+bool perfmon_supported(void)
+{
+       return cpu_caps.perfmon_version >= 2;
+}
+
+void perfmon_global_init(void)
+{
+       perfmon_read_cpu_caps(&cpu_caps);
+}
+
+void perfmon_pcpu_init(void)
 {
        int i;
 
+       if (!perfmon_supported())
+               return;
        /* Enable user level access to the performance counters */
        lcr4(rcr4() | CR4_PCE);
 
-       /* This will be called from every core, no need to execute more than once.
-        */
-       if (cpu_caps.perfmon_version == 0)
-               perfmon_read_cpu_caps(&cpu_caps);
-
        /* Reset all the counters and selectors to zero.
         */
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -308,7 +373,12 @@ void perfmon_init(void)
        for (i = 0; i < (int) cpu_caps.fix_counters_x_proc; i++)
                write_msr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
 
-       write_mmreg32(LAPIC_LVT_PERFMON, IdtLAPIC_PCINT);
+       perfmon_arm_irq();
+}
+
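+/* The value attached to each profiler sample is the opaque user_data blob that
+ * userspace supplied when it opened the event. */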
+static uint64_t perfmon_make_sample_event(const struct perfmon_event *pev)
+{
+       return pev->user_data;
 }
 
 void perfmon_interrupt(struct hw_trapframe *hw_tf, void *data)
@@ -317,28 +387,31 @@ void perfmon_interrupt(struct hw_trapframe *hw_tf, void *data)
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        uint64_t gctrl, status;
 
-       profiler_add_hw_sample(hw_tf);
-
        spin_lock_irqsave(&cctx->lock);
        /* We need to save the global control status, because we need to disable
         * counters in order to be able to reset their values.
         * We will restore the global control status on exit.
         */
+       status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-       status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);
        for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
                if (status & ((uint64_t) 1 << i)) {
-                       if (cctx->counters[i].u.v)
-                               write_msr(MSR_IA32_PERFCTR0 + i,
-                                                 -(int64_t) cctx->counters[i].trigger_count);
+                       if (cctx->counters[i].event) {
+                               profiler_add_hw_sample(
+                                   hw_tf, perfmon_make_sample_event(cctx->counters + i));
+                               perfmon_set_unfixed_trigger(i, cctx->counters[i].trigger_count);
+                       }
                }
        }
        for (i = 0; i < (int) cpu_caps.fix_counters_x_proc; i++) {
                if (status & ((uint64_t) 1 << (32 + i))) {
-                       if (cctx->fixed_counters[i].u.v)
-                               write_msr(MSR_CORE_PERF_FIXED_CTR0 + i,
-                                                 -(int64_t) cctx->fixed_counters[i].trigger_count);
+                       if (cctx->fixed_counters[i].event) {
+                               profiler_add_hw_sample(
+                                   hw_tf, perfmon_make_sample_event(cctx->fixed_counters + i));
+                               perfmon_set_fixed_trigger(i,
+                                       cctx->fixed_counters[i].trigger_count);
+                       }
                }
        }
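+       /* Ack (clear) every overflow bit we observed in GLOBAL_STATUS */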
        write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, status);
@@ -346,8 +419,10 @@ void perfmon_interrupt(struct hw_trapframe *hw_tf, void *data)
        spin_unlock_irqsave(&cctx->lock);
 
        /* We need to re-arm the IRQ as the PFM IRQ gets masked on trigger.
+        * Note that KVM and real HW seem to handle this differently: KVM does
+        * not mask the IRQ, while real HW does.
         */
-       write_mmreg32(LAPIC_LVT_PERFMON, IdtLAPIC_PCINT);
+       perfmon_arm_irq();
 }
 
 void perfmon_get_cpu_caps(struct perfmon_cpu_caps *pcc)
@@ -356,7 +431,7 @@ void perfmon_get_cpu_caps(struct perfmon_cpu_caps *pcc)
 }
 
 static int perfmon_install_session_alloc(struct perfmon_session *ps,
-                                                                                struct perfmon_alloc *pa)
+                                         struct perfmon_alloc *pa)
 {
        int i;
 
@@ -369,13 +444,13 @@ static int perfmon_install_session_alloc(struct perfmon_session *ps,
                i = -ENFILE;
        spin_unlock(&ps->lock);
        if (unlikely(i < 0))
-               error(-i, NULL);
+               error(-i, ERROR_FIXME);
 
        return i;
 }
 
 int perfmon_open_event(const struct core_set *cset, struct perfmon_session *ps,
-                                          const struct perfmon_event *pev)
+                       const struct perfmon_event *pev)
 {
        ERRSTACK(1);
        int i;
@@ -385,6 +460,10 @@ int perfmon_open_event(const struct core_set *cset, struct perfmon_session *ps,
                perfmon_destroy_alloc(pa);
                nexterror();
        }
+       /* Ensure we're turning on the event.  The user could have forgotten to set
+        * it.  Our tracking of whether or not a counter is in use depends on the
+        * event being enabled, or at least on some bit being set. */
+       PMEV_SET_EN(pa->ev.event, 1);
        smp_do_in_cores(cset, perfmon_do_cores_alloc, pa);
 
        for (i = 0; i < num_cores; i++) {
@@ -410,12 +489,12 @@ int perfmon_open_event(const struct core_set *cset, struct perfmon_session *ps,
 }
 
 static void perfmon_alloc_get(struct perfmon_session *ps, int ped, bool reset,
-                                                         struct perfmon_alloc **ppa)
+                              struct perfmon_alloc **ppa)
 {
        struct perfmon_alloc *pa;
 
        if (unlikely((ped < 0) || (ped >= ARRAY_SIZE(ps->allocs))))
-               error(EBADFD, NULL);
+               error(EBADFD, ERROR_FIXME);
        spin_lock(&ps->lock);
        pa = ps->allocs[ped];
        if (likely(pa)) {
@@ -426,7 +505,7 @@ static void perfmon_alloc_get(struct perfmon_session *ps, int ped, bool reset,
        }
        spin_unlock(&ps->lock);
        if (unlikely(!pa))
-               error(ENOENT, NULL);
+               error(ENOENT, ERROR_FIXME);
        *ppa = pa;
 }
 
@@ -439,7 +518,7 @@ void perfmon_close_event(struct perfmon_session *ps, int ped)
 }
 
 struct perfmon_status *perfmon_get_event_status(struct perfmon_session *ps,
-                                                                                               int ped)
+                                                int ped)
 {
        struct core_set cset;
        struct perfmon_status_env env;
@@ -462,11 +541,10 @@ void perfmon_free_event_status(struct perfmon_status *pef)
 
 static void perfmon_release_session(struct kref *kref)
 {
-       struct perfmon_session *ps = container_of(kref, struct perfmon_session,
-                                                                                         ref);
-       int i;
+       struct perfmon_session *ps =
+           container_of(kref, struct perfmon_session, ref);
 
-       for (i = 0; i < ARRAY_SIZE(ps->allocs); i++) {
+       for (int i = 0; i < ARRAY_SIZE(ps->allocs); i++) {
                struct perfmon_alloc *pa = ps->allocs[i];
 
                if (pa)
@@ -478,7 +556,7 @@ static void perfmon_release_session(struct kref *kref)
 struct perfmon_session *perfmon_create_session(void)
 {
        struct perfmon_session *ps = kzmalloc(sizeof(struct perfmon_session),
-                                                                                 KMALLOC_WAIT);
+                                             MEM_WAIT);
 
        kref_init(&ps->ref, perfmon_release_session, 1);
        spinlock_init(&ps->lock);