akaros/kern/arch/x86/perfmon.c
/* Copyright (c) 2015 Google Inc
 * Davide Libenzi <dlibenzi@google.com>
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Manages the setting and reading of hardware perf counters across all cores,
 * including generating samples in response to counter overflow interrupts.
 *
 * The hardware interface is pretty straightforward - it's mostly setting and
 * unsetting fixed and unfixed events, sometimes with interrupts and trigger
 * counts.
 *
 * The 'command' to the cores is a struct perfmon_alloc.  This tells the core
 * which event to set up (this is the perfmon_event).  The cores respond in
 * counters[], saying which of their counters it is using for that event.  If
 * the cores are given different alloc requests, it is possible that they might
 * choose different counters[] for the same event.
 *
 * These perfmon_allocs are collected in a perfmon_session.  The session is just
 * a bunch of allocs, which are referred to by index (the 'ped').  Currently,
 * the session is grabbed by whoever opens the perf FD in devarch, and closed
 * when that FD is closed.  They are 1:1 with devarch's perf_contexts.
 *
 * The values for the counters are extracted with perfmon_get_event_status(),
 * which uses a struct perfmon_status to collect the results.  We pass the
 * perfmon_alloc as part of the perfmon_status_env, since we need to tell the
 * core which counter we're talking about.
 *
 * You can have multiple sessions, but if you try to install the same counter in
 * multiple, concurrent sessions, the hardware might complain (it definitely
 * will if it is a fixed event). */

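/* A minimal usage sketch of the external interface below (error handling
 * omitted), assuming the caller already has a core_set 'cset' and a filled-in
 * perfmon_event 'ev' - normally devarch builds these from writes to the perf
 * FD:
 *
 *	struct perfmon_session *ps = perfmon_create_session();
 *	int ped = perfmon_open_event(&cset, ps, &ev);
 *	struct perfmon_status *st = perfmon_get_event_status(ps, ped);
 *
 * At this point st->cores_values[i] holds core i's counter value.  When done:
 *
 *	perfmon_free_event_status(st);
 *	perfmon_close_event(ps, ped);
 *	perfmon_close_session(ps);
 */
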
#include <sys/types.h>
#include <arch/ros/msr-index.h>
#include <arch/x86.h>
#include <arch/msr.h>
#include <arch/uaccess.h>
#include <ros/errno.h>
#include <assert.h>
#include <trap.h>
#include <smp.h>
#include <atomic.h>
#include <core_set.h>
#include <percpu.h>
#include <kmalloc.h>
#include <err.h>
#include <string.h>
#include <profiler.h>
#include <arch/perfmon.h>

#define FIXCNTR_NBITS 4
#define FIXCNTR_MASK (((uint64_t) 1 << FIXCNTR_NBITS) - 1)

struct perfmon_cpu_context {
        spinlock_t lock;
        struct perfmon_event counters[MAX_VAR_COUNTERS];
        struct perfmon_event fixed_counters[MAX_FIX_COUNTERS];
};

struct perfmon_status_env {
        struct perfmon_alloc *pa;
        struct perfmon_status *pef;
};

static struct perfmon_cpu_caps cpu_caps;
static DEFINE_PERCPU(struct perfmon_cpu_context, counters_env);
DEFINE_PERCPU_INIT(perfmon_counters_env_init);

#define PROFILER_BT_DEPTH 16

struct sample_snapshot {
        struct user_context             ctx;
        uintptr_t                       pc_list[PROFILER_BT_DEPTH];
        size_t                          nr_pcs;
};
static DEFINE_PERCPU(struct sample_snapshot, sample_snapshots);

static void perfmon_counters_env_init(void)
{
        for (int i = 0; i < num_cores; i++) {
                struct perfmon_cpu_context *cctx =
                        _PERCPU_VARPTR(counters_env, i);

                spinlock_init_irqsave(&cctx->lock);
        }
}

static void perfmon_read_cpu_caps(struct perfmon_cpu_caps *pcc)
{
        uint32_t a, b, c, d;

        cpuid(0x0a, 0, &a, &b, &c, &d);

        pcc->proc_arch_events = a >> 24;
        pcc->bits_x_counter = (a >> 16) & 0xff;
        pcc->counters_x_proc = (a >> 8) & 0xff;
        pcc->bits_x_fix_counter = (d >> 5) & 0xff;
        pcc->fix_counters_x_proc = d & 0x1f;
        pcc->perfmon_version = a & 0xff;
}

static void perfmon_enable_event(int idx, uint64_t event)
{
        uint64_t gctrl;

        /* Events need to be enabled in both MSRs */
        write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + idx, event);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl | (1 << idx));
}

static void perfmon_disable_event(int idx)
{
        uint64_t gctrl;

        /* Events can be disabled in either location.  We could just clear the
         * global ctrl, but we use the contents of EVENTSEL to say if the
         * counter is available or not. */
        write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + idx, 0);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl & ~(1 << idx));
}

static bool perfmon_event_available(uint32_t idx)
{
        return read_msr(MSR_ARCH_PERFMON_EVENTSEL0 + idx) == 0;
}

/* Helper.  Given an event, a fixed counter index, and the contents of the fixed
 * counter ctl MSR, output the value for the fixed counter ctl that will enable
 * the event at idx. */
static uint64_t perfmon_apply_fixevent_mask(uint64_t event, int idx,
                                            uint64_t base)
{
        uint64_t m = 0;

        if (PMEV_GET_OS(event))
                m |= (1 << 0);
        if (PMEV_GET_USR(event))
                m |= (1 << 1);
        if (PMEV_GET_ANYTH(event))
                m |= (1 << 2);
        if (PMEV_GET_INTEN(event))
                m |= (1 << 3);
        /* Enable enforcement: we need at least one bit set so that this fixed
         * counter appears to be in use. */
        if (PMEV_GET_EN(event) && !PMEV_GET_OS(event) && !PMEV_GET_USR(event))
                m |= (1 << 0) | (1 << 1);

        m <<= idx * FIXCNTR_NBITS;
        m |= base & ~(FIXCNTR_MASK << (idx * FIXCNTR_NBITS));

        return m;
}

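/* A worked example of the masking above: each fixed counter owns a 4-bit field
 * in MSR_CORE_PERF_FIXED_CTR_CTRL (bit 0: count in ring 0, bit 1: count in
 * ring 3, bit 2: any-thread, bit 3: PMI on overflow).  For idx = 1 with OS and
 * INTEN set, m ends up as 0x9 << 4 = 0x90, and the function returns
 * (base & ~0xf0) | 0x90, leaving the other counters' fields untouched.  The
 * global enable lives elsewhere: MSR_CORE_PERF_GLOBAL_CTRL uses bit i for PMC
 * i and bit 32 + i for fixed counter i (see the enable/disable helpers above
 * and below). */
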
/* These helpers take the fxctrl_value to save on a rdmsr. */
static void perfmon_enable_fix_event(int idx, uint64_t event,
                                     uint64_t fxctrl_value)
{
        uint64_t gctrl, fx;

        /* Enable in both locations: the bits in FIXED and the bit in GLOBAL. */
        fx = perfmon_apply_fixevent_mask(event, idx, fxctrl_value);
        write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL, fx);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL,
                  gctrl | ((uint64_t) 1 << (32 + idx)));
}

static void perfmon_disable_fix_event(int idx, uint64_t fxctrl_value)
{
        uint64_t gctrl;

        /* Events can be disabled in either location.  We could just clear the
         * global ctrl, but we use the bits of fxctrl to say if the counter is
         * available or not. */
        write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL,
                  fxctrl_value & ~(FIXCNTR_MASK << (idx * FIXCNTR_NBITS)));
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL,
                  gctrl & ~((uint64_t) 1 << (32 + idx)));
}

static bool perfmon_fix_event_available(uint32_t idx, uint64_t fxctrl_value)
{
        return (fxctrl_value & (FIXCNTR_MASK << (idx * FIXCNTR_NBITS))) == 0;
}

/* Helper to set a fixed perfcounter to trigger/overflow after count events.
 * Anytime you set a perfcounter to something non-zero, you need to use this
 * helper. */
static void perfmon_set_fixed_trigger(unsigned int idx, uint64_t count)
{
        int64_t write_val = -(int64_t)count;

        write_val &= (1ULL << cpu_caps.bits_x_fix_counter) - 1;
        write_msr(MSR_CORE_PERF_FIXED_CTR0 + idx, write_val);
}

/* Helper to set a regular perfcounter to trigger/overflow after count events.
 * Anytime you set a perfcounter to something non-zero, you ought to use this
 * helper. */
static void perfmon_set_unfixed_trigger(unsigned int idx, uint64_t count)
{
        int64_t write_val = -(int64_t)count;

        write_val &= (1ULL << cpu_caps.bits_x_counter) - 1;
        write_msr(MSR_IA32_PERFCTR0 + idx, write_val);
}

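/* A worked example of the trigger math above: the counters count up and raise
 * a PMI on overflow, so to get a sample every 'count' events we preload the
 * counter with -count, truncated to the counter's width.  With 48-bit counters
 * and count = 0x1000, write_val = 0xfffffffff000 (i.e. 2^48 - 0x1000), which
 * overflows after another 0x1000 events. */
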
/* Helper: sets errno/errstr based on the error code returned from the core.  We
 * don't have a great way to get errors back from smp_do_in_cores() commands.
 * We use negative counter values (e.g. i = -EBUSY) to signal an error of a
 * certain type.  This converts that to something useful for userspace. */
static void perfmon_convert_error(int err_code, int core_id)
{
        switch (err_code) {
        case EBUSY:
                set_error(err_code, "Fixed perf counter is busy on core %d",
                          core_id);
                break;
        case ENOSPC:
                set_error(err_code, "Perf counter idx out of range on core %d",
                          core_id);
                break;
        case ENOENT:
                set_error(err_code, "Perf counter not set on core %d", core_id);
                break;
        default:
                set_error(err_code, "Unknown perf counter error on core %d",
                          core_id);
                break;
        };
}

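/* For example, if core 3 has no free counter, perfmon_do_cores_alloc() below
 * stores -ENOSPC in pa->cores_counters[3]; perfmon_open_event() then spots the
 * negative value and calls perfmon_convert_error(ENOSPC, 3) before failing the
 * open. */
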
static void perfmon_do_cores_alloc(void *opaque)
{
        struct perfmon_alloc *pa = (struct perfmon_alloc *) opaque;
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        int i;
        struct perfmon_event *pev;

        spin_lock_irqsave(&cctx->lock);
        if (perfmon_is_fixed_event(&pa->ev)) {
                uint64_t fxctrl_value = read_msr(MSR_CORE_PERF_FIXED_CTR_CTRL);

                i = PMEV_GET_EVENT(pa->ev.event);
                if (i >= (int) cpu_caps.fix_counters_x_proc) {
                        i = -ENOSPC;
                } else if (!perfmon_fix_event_available(i, fxctrl_value)) {
                        i = -EBUSY;
                } else {
                        /* Keep a copy of pa->ev for later.  pa is read-only and
                         * shared. */
                        cctx->fixed_counters[i] = pa->ev;
                        pev = &cctx->fixed_counters[i];
                        if (PMEV_GET_INTEN(pev->event))
                                perfmon_set_fixed_trigger(i,
                                                          pev->trigger_count);
                        else
                                write_msr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
                        write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL,
                                  1ULL << (32 + i));
                        perfmon_enable_fix_event(i, pev->event, fxctrl_value);
                }
        } else {
                for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
                        if (cctx->counters[i].event == 0) {
                                /* kernel bug if the MSRs don't agree with our
                                 * bookkeeping */
                                assert(perfmon_event_available(i));
                                break;
                        }
                }
                if (i < (int) cpu_caps.counters_x_proc) {
                        cctx->counters[i] = pa->ev;
                        pev = &cctx->counters[i];
                        if (PMEV_GET_INTEN(pev->event))
                                perfmon_set_unfixed_trigger(i,
                                                            pev->trigger_count);
                        else
                                write_msr(MSR_IA32_PERFCTR0 + i, 0);
                        write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, 1ULL << i);
                        perfmon_enable_event(i, pev->event);
                } else {
                        i = -ENOSPC;
                }
        }
        spin_unlock_irqsave(&cctx->lock);

        pa->cores_counters[core_id()] = (counter_t) i;
}

static void perfmon_do_cores_free(void *opaque)
{
        struct perfmon_alloc *pa = (struct perfmon_alloc *) opaque;
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        int err = 0, coreno = core_id();
        counter_t ccno = pa->cores_counters[coreno];

        spin_lock_irqsave(&cctx->lock);
        if (perfmon_is_fixed_event(&pa->ev)) {
                uint64_t fxctrl_value = read_msr(MSR_CORE_PERF_FIXED_CTR_CTRL);

                if ((ccno >= cpu_caps.fix_counters_x_proc) ||
                    perfmon_fix_event_available(ccno, fxctrl_value)) {
                        err = -ENOENT;
                } else {
                        perfmon_init_event(&cctx->fixed_counters[ccno]);
                        perfmon_disable_fix_event((int) ccno, fxctrl_value);
                        write_msr(MSR_CORE_PERF_FIXED_CTR0 + ccno, 0);
                }
        } else {
                if (ccno < (int) cpu_caps.counters_x_proc) {
                        perfmon_init_event(&cctx->counters[ccno]);
                        perfmon_disable_event((int) ccno);
                        write_msr(MSR_IA32_PERFCTR0 + ccno, 0);
                } else {
                        err = -ENOENT;
                }
        }
        spin_unlock_irqsave(&cctx->lock);

        pa->cores_counters[coreno] = (counter_t) err;
}

/* Helper: Reads a fixed counter's value.  Returns the max amount possible if
 * the counter overflowed. */
static uint64_t perfmon_read_fixed_counter(int ccno)
{
        uint64_t overflow_status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);

        if (overflow_status & (1ULL << (32 + ccno)))
                return (1ULL << cpu_caps.bits_x_fix_counter) - 1;
        else
                return read_msr(MSR_CORE_PERF_FIXED_CTR0 + ccno);
}

/* Helper: Reads an unfixed counter's value.  Returns the max amount possible if
 * the counter overflowed. */
static uint64_t perfmon_read_unfixed_counter(int ccno)
{
        uint64_t overflow_status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);

        if (overflow_status & (1ULL << ccno))
                return (1ULL << cpu_caps.bits_x_counter) - 1;
        else
                return read_msr(MSR_IA32_PERFCTR0 + ccno);
}

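/* Note that the two read helpers above saturate rather than report a wrapped
 * value: with 48-bit counters, for example, an overflowed counter reads back
 * as 0xffffffffffff (2^48 - 1) instead of whatever small value it wrapped
 * around to. */
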
static void perfmon_do_cores_status(void *opaque)
{
        struct perfmon_status_env *env = (struct perfmon_status_env *) opaque;
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        int coreno = core_id();
        counter_t ccno = env->pa->cores_counters[coreno];

        spin_lock_irqsave(&cctx->lock);
        if (perfmon_is_fixed_event(&env->pa->ev))
                env->pef->cores_values[coreno] =
                        perfmon_read_fixed_counter(ccno);
        else
                env->pef->cores_values[coreno] =
                        perfmon_read_unfixed_counter(ccno);
        spin_unlock_irqsave(&cctx->lock);
}

static void perfmon_setup_alloc_core_set(const struct perfmon_alloc *pa,
                                         struct core_set *cset)
{
        int i;

        core_set_init(cset);
        for (i = 0; i < num_cores; i++) {
                if (pa->cores_counters[i] >= 0)
                        core_set_setcpu(cset, i);
        }
}

static void perfmon_cleanup_cores_alloc(struct perfmon_alloc *pa)
{
        struct core_set cset;

        perfmon_setup_alloc_core_set(pa, &cset);
        smp_do_in_cores(&cset, perfmon_do_cores_free, pa);
}

static void perfmon_free_alloc(struct perfmon_alloc *pa)
{
        kfree(pa);
}

static void perfmon_destroy_alloc(struct perfmon_alloc *pa)
{
        perfmon_cleanup_cores_alloc(pa);
        perfmon_free_alloc(pa);
}

static struct perfmon_alloc *perfmon_create_alloc(const struct perfmon_event *pev)
{
        int i;
        struct perfmon_alloc *pa = kzmalloc(sizeof(struct perfmon_alloc) +
                                            num_cores * sizeof(counter_t),
                                            MEM_WAIT);

        pa->ev = *pev;
        for (i = 0; i < num_cores; i++)
                pa->cores_counters[i] = INVALID_COUNTER;

        return pa;
}

static struct perfmon_status *perfmon_status_alloc(void)
{
        struct perfmon_status *pef = kzmalloc(sizeof(struct perfmon_status) +
                                              num_cores * sizeof(uint64_t),
                                              MEM_WAIT);

        return pef;
}

static void perfmon_arm_irq(void)
{
        /* Actually, the vector is ignored, I'm just adding T_NMI to avoid
         * confusion.  The important part is the NMI-bits (0x4) */
        apicrput(MSR_LAPIC_LVT_PERFMON, (0x4 << 8) | T_NMI);
}

bool perfmon_supported(void)
{
        return cpu_caps.perfmon_version >= 2;
}

void perfmon_global_init(void)
{
        perfmon_read_cpu_caps(&cpu_caps);
}

void perfmon_pcpu_init(void)
{
        int i;

        if (!perfmon_supported())
                return;
        /* Enable user level access to the performance counters */
        lcr4(rcr4() | CR4_PCE);

        /* Reset all the counters and selectors to zero. */
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
        for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
                write_msr(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
                write_msr(MSR_IA32_PERFCTR0 + i, 0);
        }
        write_msr(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
        for (i = 0; i < (int) cpu_caps.fix_counters_x_proc; i++)
                write_msr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);

        perfmon_arm_irq();
}

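/* The 64-bit 'info' tag attached to each profiler sample below is just the
 * event's user_data, passed through unchanged from whoever opened the event
 * (presumably so userspace perf tooling can match samples back to the event
 * that generated them). */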
static uint64_t perfmon_make_sample_event(const struct perfmon_event *pev)
{
        return pev->user_data;
}

/* Called from NMI context! */
void perfmon_snapshot_hwtf(struct hw_trapframe *hw_tf)
{
        struct sample_snapshot *sample = PERCPU_VARPTR(sample_snapshots);
        uintptr_t pc = get_hwtf_pc(hw_tf);
        uintptr_t fp = get_hwtf_fp(hw_tf);

        sample->ctx.type = ROS_HW_CTX;
        sample->ctx.tf.hw_tf = *hw_tf;
        if (in_kernel(hw_tf)) {
                sample->nr_pcs = backtrace_list(pc, fp, sample->pc_list,
                                                PROFILER_BT_DEPTH);
        } else {
                sample->nr_pcs = backtrace_user_list(pc, fp, sample->pc_list,
                                                     PROFILER_BT_DEPTH);
        }
}

/* Called from NMI context, *and* this cannot fault (e.g. breakpoint tracing)!
 * The latter restriction is due to the vmexit NMI handler not being
 * interruptible.  Because of this, we just copy out the VM TF. */
void perfmon_snapshot_vmtf(struct vm_trapframe *vm_tf)
{
        struct sample_snapshot *sample = PERCPU_VARPTR(sample_snapshots);

        sample->ctx.type = ROS_VM_CTX;
        sample->ctx.tf.vm_tf = *vm_tf;
        sample->nr_pcs = 1;
        sample->pc_list[0] = get_vmtf_pc(vm_tf);
}

static void profiler_add_sample(uint64_t info)
{
        struct sample_snapshot *sample = PERCPU_VARPTR(sample_snapshots);

        /* We shouldn't need to worry about another NMI that concurrently mucks
         * with the sample.  The PMU won't rearm the interrupt until we're done
         * here.  In the event that we do get another NMI from another source,
         * we may get a weird backtrace in the perf output. */
        switch (sample->ctx.type) {
        case ROS_HW_CTX:
                if (in_kernel(&sample->ctx.tf.hw_tf)) {
                        profiler_push_kernel_backtrace(sample->pc_list,
                                                       sample->nr_pcs, info);
                } else {
                        profiler_push_user_backtrace(sample->pc_list,
                                                     sample->nr_pcs, info);
                }
                break;
        case ROS_VM_CTX:
                /* TODO: add VM support to perf.  For now, just treat it like a
                 * user addr.  Note that the address is a guest-virtual address,
                 * not guest-physical (which would be host virtual), and our
                 * VM_CTXs don't make a distinction between user and kernel TFs
                 * (yet). */
                profiler_push_user_backtrace(sample->pc_list, sample->nr_pcs,
                                             info);
                break;
        default:
                warn("Bad perf sample type %d!", sample->ctx.type);
        }
}

void perfmon_interrupt(struct hw_trapframe *hw_tf, void *data)
{
        int i;
        struct perfmon_cpu_context *cctx = PERCPU_VARPTR(counters_env);
        uint64_t gctrl, status;

        spin_lock_irqsave(&cctx->lock);
        /* Save the global control register and clear it: the counters need to
         * be disabled in order to reset their values.  The global control is
         * restored on exit. */
        status = read_msr(MSR_CORE_PERF_GLOBAL_STATUS);
        gctrl = read_msr(MSR_CORE_PERF_GLOBAL_CTRL);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
        for (i = 0; i < (int) cpu_caps.counters_x_proc; i++) {
                if (status & ((uint64_t) 1 << i)) {
                        if (cctx->counters[i].event) {
                                profiler_add_sample(
                                    perfmon_make_sample_event(
                                        cctx->counters + i));
                                perfmon_set_unfixed_trigger(i,
                                        cctx->counters[i].trigger_count);
                        }
                }
        }
        for (i = 0; i < (int) cpu_caps.fix_counters_x_proc; i++) {
                if (status & ((uint64_t) 1 << (32 + i))) {
                        if (cctx->fixed_counters[i].event) {
                                profiler_add_sample(
                                    perfmon_make_sample_event(
                                        cctx->fixed_counters + i));
                                perfmon_set_fixed_trigger(i,
                                        cctx->fixed_counters[i].trigger_count);
                        }
                }
        }
        write_msr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, status);
        write_msr(MSR_CORE_PERF_GLOBAL_CTRL, gctrl);
        spin_unlock_irqsave(&cctx->lock);

        /* We need to re-arm the IRQ as the PFM IRQ gets masked on trigger.
         * Note that KVM and real HW seem to do two different things WRT
         * re-arming the IRQ: KVM does not mask the IRQ, while real HW does. */
        perfmon_arm_irq();
}

void perfmon_get_cpu_caps(struct perfmon_cpu_caps *pcc)
{
        memcpy(pcc, &cpu_caps, sizeof(*pcc));
}

static int perfmon_install_session_alloc(struct perfmon_session *ps,
                                         struct perfmon_alloc *pa)
{
        qlock(&ps->qlock);
        for (int i = 0; i < ARRAY_SIZE(ps->allocs); i++) {
                if (!ps->allocs[i]) {
                        ps->allocs[i] = pa;
                        qunlock(&ps->qlock);
                        return i;
                }
        }
        qunlock(&ps->qlock);
        error(ENFILE, "Too many perf allocs in the session");
}

int perfmon_open_event(const struct core_set *cset, struct perfmon_session *ps,
                       const struct perfmon_event *pev)
{
        ERRSTACK(1);
        int i;
        struct perfmon_alloc *pa = perfmon_create_alloc(pev);

        if (waserror()) {
                perfmon_destroy_alloc(pa);
                nexterror();
        }
        /* Ensure the user did not set reserved bits or otherwise give us a bad
         * event.  pev (now pa->ev) must be a valid IA32_PERFEVTSEL MSR. */
        pa->ev.event &= 0xffffffff;
        if (cpu_caps.perfmon_version < 3)
                PMEV_SET_ANYTH(pa->ev.event, 0);
        /* Ensure we're turning on the event.  The user could have forgotten to
         * set it.  Our tracking of whether or not a counter is in use depends
         * on it being enabled, or at least that some bit is set. */
        PMEV_SET_EN(pa->ev.event, 1);
        smp_do_in_cores(cset, perfmon_do_cores_alloc, pa);

        for (i = 0; i < num_cores; i++) {
                if (core_set_getcpu(cset, i)) {
                        counter_t ccno = pa->cores_counters[i];

                        if (unlikely(ccno < 0)) {
                                perfmon_destroy_alloc(pa);
                                perfmon_convert_error(-(int)ccno, i);
                                return -1;
                        }
                }
        }
        /* The perfmon_alloc data structure will not be visible to userspace
         * until perfmon_install_session_alloc() completes, and by that time
         * the smp_do_in_cores(perfmon_do_cores_alloc) will have run on all
         * cores.  The perfmon_alloc will never be changed once published. */
        i = perfmon_install_session_alloc(ps, pa);
        poperror();

        return i;
}

/* Helper, looks up a pa, given ped.  Hold the qlock. */
static struct perfmon_alloc *__lookup_pa(struct perfmon_session *ps, int ped)
{
        struct perfmon_alloc *pa;

        if (unlikely((ped < 0) || (ped >= ARRAY_SIZE(ps->allocs))))
                error(EBADFD, "Perf event %d out of range", ped);
        pa = ps->allocs[ped];
        if (!pa)
                error(ENOENT, "No perf alloc for event %d", ped);
        return pa;
}

void perfmon_close_event(struct perfmon_session *ps, int ped)
{
        ERRSTACK(1);
        struct perfmon_alloc *pa;

        qlock(&ps->qlock);
        if (waserror()) {
                qunlock(&ps->qlock);
                nexterror();
        };
        /* lookup does the error checking */
        pa = __lookup_pa(ps, ped);
        ps->allocs[ped] = NULL;
        poperror();
        qunlock(&ps->qlock);
        perfmon_destroy_alloc(pa);
}

/* Fetches the status (i.e. PMU counters) of event ped from all applicable
 * cores.  Returns a perfmon_status, which the caller should free. */
struct perfmon_status *perfmon_get_event_status(struct perfmon_session *ps,
                                                int ped)
{
        ERRSTACK(1);
        struct core_set cset;
        struct perfmon_status_env env;

        /* qlock keeps the PA alive.  We don't want to spin, since the spinners
         * might prevent the smp_do_in_cores(), resulting in a deadlock. */
        qlock(&ps->qlock);
        if (waserror()) {
                qunlock(&ps->qlock);
                nexterror();
        };
        env.pa = __lookup_pa(ps, ped);
        env.pef = perfmon_status_alloc();

        perfmon_setup_alloc_core_set(env.pa, &cset);
        smp_do_in_cores(&cset, perfmon_do_cores_status, &env);

        poperror();
        qunlock(&ps->qlock);

        return env.pef;
}

void perfmon_free_event_status(struct perfmon_status *pef)
{
        kfree(pef);
}

struct perfmon_session *perfmon_create_session(void)
{
        struct perfmon_session *ps = kzmalloc(sizeof(struct perfmon_session),
                                              MEM_WAIT);

        qlock_init(&ps->qlock);
        return ps;
}

void perfmon_close_session(struct perfmon_session *ps)
{
        struct perfmon_alloc *pa;

        for (int i = 0; i < ARRAY_SIZE(ps->allocs); i++) {
                pa = ps->allocs[i];
                if (pa)
                        perfmon_destroy_alloc(pa);
        }
        kfree(ps);
}