x86: Panic if there is no x2APIC
diff --git a/kern/src/profiler.c b/kern/src/profiler.c
index 772390f..c94caed 100644
--- a/kern/src/profiler.c
+++ b/kern/src/profiler.c
@@ -1,36 +1,64 @@
+/* Copyright (c) 2015 Google Inc
+ * Davide Libenzi <dlibenzi@google.com>
+ * See LICENSE for details.
+ *
+ * This controls the emitting, collecting, and exporting of samples for perf
+ * events.  Examples of events are PMU counter overflows, mmaps, and process
+ * creation.
+ *
+ * Events are collected in a central qio queue.  High-frequency events (e.g.
+ * IRQ backtraces) are collected in per-core buffers, which are flushed to the
+ * central queue when they fill up or on command.  Lower-frequency events (e.g.
+ * profiler_notify_mmap()) just go straight to the central queue.
+ *
+ * Currently there is one global profiler.  Kprof is careful to only have one
+ * open profiler at a time.  We assert that this is true.  TODO: stop using the
+ * global profiler!
+ *
+ * A few other notes:
+ * - profiler_control_trace() controls the per-core trace collection.  When it
+ *   is disabled, it also flushes the per-core blocks to the central queue.
+ * - The collection of mmap and comm samples is independent of trace collection.
+ *   Those will occur whenever the profiler is open (refcnt check, for now). */
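A minimal sketch of the lifecycle implied by the comment above, assuming a single kprof-like caller and ignoring error handling (the wrapper below is illustrative, not part of this commit; the profiler_* calls are the ones defined in this file):

void example_profile_session(void *buf, int len)
{
	profiler_setup();	/* open: allocate the queue and per-cpu buffers */
	profiler_start();	/* enable per-core trace collection */
	/* ... sample sources call profiler_push_kernel_backtrace() et al ... */
	profiler_stop();	/* disable tracing, flush, and hang up the queue */
	while (profiler_size() > 0)
		profiler_read(buf, len);	/* drain the central queue */
	profiler_cleanup();	/* drop the kref; the last put frees the buffers */
}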
 
 #include <ros/common.h>
+#include <ros/mman.h>
+#include <sys/types.h>
 #include <smp.h>
 #include <trap.h>
 #include <kthread.h>
+#include <env.h>
+#include <process.h>
+#include <mm.h>
+#include <vfs.h>
 #include <kmalloc.h>
+#include <pmap.h>
+#include <kref.h>
 #include <atomic.h>
-#include <sys/types.h>
+#include <umem.h>
+#include <elf.h>
+#include <ns.h>
+#include <err.h>
+#include <core_set.h>
+#include <string.h>
 #include "profiler.h"
 
-struct op_sample {
-       uint64_t hdr;
-       uint64_t event;
-       uint64_t data[0];
-};
+#define PROFILER_MAX_PRG_PATH  256
 
-struct op_entry {
-       struct op_sample *sample;
-       size_t size;
-       uint64_t *data;
-};
+#define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)
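For example, VBE_MAX_SIZE(uint64_t) is (8 * 8 + 6) / 7 = 10 bytes, the usual worst case for a base-128 varint encoding of a 64-bit value.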
 
+/* Do not rely on the contents of the PCPU ctx with IRQs enabled. */
 struct profiler_cpu_context {
-       spinlock_t lock;
-       int tracing;
        struct block *block;
+       int cpu;
+       int tracing;
+       size_t dropped_data_cnt;
 };
 
-static int profiler_queue_limit = 1024;
+static int profiler_queue_limit = 64 * 1024 * 1024;
 static size_t profiler_cpu_buffer_size = 65536;
-static size_t profiler_backtrace_depth = 16;
-static struct semaphore mtx = SEMAPHORE_INITIALIZER(mtx, 1);
-static int profiler_users = 0;
+static qlock_t profiler_mtx = QLOCK_INITIALIZER(profiler_mtx);
+static struct kref profiler_kref;
 static struct profiler_cpu_context *profiler_percpu_ctx;
 static struct queue *profiler_queue;
 
@@ -39,26 +67,220 @@ static inline struct profiler_cpu_context *profiler_get_cpu_ctx(int cpu)
        return profiler_percpu_ctx + cpu;
 }
 
-static inline uint64_t profiler_create_header(int cpu, size_t nbt)
+static inline char *vb_encode_uint64(char *data, uint64_t n)
+{
+       /* Classical variable-byte encoding.  Encodes 7 bits at a time, using
+        * bit 7 of each byte as the end-of-sequence indicator (the sequence
+        * ends at the first byte whose bit 7 is zero). */
+       for (; n >= 0x80; n >>= 7)
+               *data++ = (char) (n | 0x80);
+       *data++ = (char) n;
+
+       return data;
+}
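The matching decoder is not part of this diff; below is a minimal sketch of one, assuming the same little-endian base-128 layout (the function name is illustrative):

static const char *vb_decode_uint64(const char *data, uint64_t *pval)
{
	uint64_t val = 0;
	int shift = 0;
	uint8_t byte;

	/* Accumulate 7 bits per byte, low-order group first; a clear bit 7
	 * marks the final byte. */
	do {
		byte = (uint8_t) *data++;
		val |= (uint64_t) (byte & 0x7f) << shift;
		shift += 7;
	} while (byte & 0x80);
	*pval = val;
	return data;
}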
+
+static struct block *profiler_buffer_write(struct profiler_cpu_context *cpu_buf,
+                                           struct block *b)
+{
+       /* qpass will drop b if the queue is over its limit.  We're willing to
+        * lose traces, but we won't lose 'control' events, such as MMAP and PID. */
+       if (b) {
+               if (qpass(profiler_queue, b) < 0)
+                       cpu_buf->dropped_data_cnt++;
+       }
+       return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
+}
+
+/* Helper, paired with profiler_cpu_buffer_write_commit.  Ensures there is
+ * enough room in the pcpu block for our write.  May alloc a new one.
+ *
+ * IRQs must be disabled before calling, until after write_commit. */
+static char *profiler_cpu_buffer_write_reserve(
+       struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
+{
+       struct block *b = cpu_buf->block;
+
+       if (unlikely((!b) || (b->lim - b->wp) < size)) {
+               cpu_buf->block = b = profiler_buffer_write(cpu_buf, b);
+               if (unlikely(!b))
+                       return NULL;
+       }
+       *pb = b;
+
+       return (char *) b->wp;
+}
+
+/* Helper, paired with write_reserve.  Finalizes the writing into the block's
+ * main body of @size bytes.  IRQs must be disabled until after this is called.
+ */
+static inline void profiler_cpu_buffer_write_commit(
+       struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
 {
-       return (((uint64_t) 0xee01) << 48) | ((uint64_t) cpu << 16) |
-               (uint64_t) nbt;
+       b->wp += size;
 }
 
-static inline size_t profiler_cpu_buffer_add_data(struct op_entry *entry,
-                                                                                                 const uintptr_t *values,
-                                                                                                 size_t count)
+static inline size_t profiler_max_envelope_size(void)
 {
-       size_t i;
+       return 2 * VBE_MAX_SIZE(uint64_t);
+}
+
+static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
+                                         const uintptr_t *trace, size_t count,
+                                         uint64_t info)
+{
+       struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
+       size_t size = sizeof(struct proftype_kern_trace64) +
+               count * sizeof(uint64_t);
+       struct block *b;
+       void *resptr, *ptr;
+
+       assert(!irq_is_enabled());
+       resptr = profiler_cpu_buffer_write_reserve(
+           cpu_buf, size + profiler_max_envelope_size(), &b);
+       ptr = resptr;
+
+       if (likely(ptr)) {
+               struct proftype_kern_trace64 *record;
+
+               ptr = vb_encode_uint64(ptr, PROFTYPE_KERN_TRACE64);
+               ptr = vb_encode_uint64(ptr, size);
+
+               record = (struct proftype_kern_trace64 *) ptr;
+               ptr += size;
+
+               record->info = info;
+               record->tstamp = nsec();
+               if (is_ktask(pcpui->cur_kthread) || !pcpui->cur_proc)
+                       record->pid = -1;
+               else
+                       record->pid = pcpui->cur_proc->pid;
+               record->cpu = cpu_buf->cpu;
+               record->num_traces = count;
+               for (size_t i = 0; i < count; i++)
+                       record->trace[i] = (uint64_t) trace[i];
+
+               profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
+       }
+}
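Putting the envelope and the record together, each entry in the stream is [varint type][varint size][size bytes of payload].  A sketch of how a consumer could walk a flushed buffer (the walker, and the vb_decode_uint64() helper it assumes, are illustrative rather than code from this commit):

static void walk_profiler_records(const char *data, const char *end)
{
	while (data < end) {
		uint64_t type, size;

		data = vb_decode_uint64(data, &type);
		data = vb_decode_uint64(data, &size);
		if (type == PROFTYPE_KERN_TRACE64) {
			const struct proftype_kern_trace64 *rec =
				(const struct proftype_kern_trace64 *) data;

			/* rec->num_traces PCs follow in rec->trace[]. */
			(void) rec;
		}
		data += size;	/* size covers the struct plus its trace[] */
	}
}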
+
+static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
+                                       struct proc *p, const uintptr_t *trace,
+                                       size_t count, uint64_t info)
+{
+       size_t size = sizeof(struct proftype_user_trace64) +
+               count * sizeof(uint64_t);
+       struct block *b;
+       void *resptr, *ptr;
+
+       assert(!irq_is_enabled());
+       resptr = profiler_cpu_buffer_write_reserve(
+           cpu_buf, size + profiler_max_envelope_size(), &b);
+       ptr = resptr;
+
+       if (likely(ptr)) {
+               struct proftype_user_trace64 *record;
 
-       if (unlikely(count > entry->size))
-               count = entry->size;
-       for (i = 0; i < count; i++)
-               entry->data[i] = (uint64_t) values[i];
-       entry->size -= count;
-       entry->data += count;
+               ptr = vb_encode_uint64(ptr, PROFTYPE_USER_TRACE64);
+               ptr = vb_encode_uint64(ptr, size);
 
-       return entry->size;
+               record = (struct proftype_user_trace64 *) ptr;
+               ptr += size;
+
+               record->info = info;
+               record->tstamp = nsec();
+               record->pid = p->pid;
+               record->cpu = cpu_buf->cpu;
+               record->num_traces = count;
+               for (size_t i = 0; i < count; i++)
+                       record->trace[i] = (uint64_t) trace[i];
+
+               profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
+       }
+}
+
+static void profiler_push_pid_mmap(struct proc *p, uintptr_t addr, size_t msize,
+                                   size_t offset, const char *path)
+{
+       size_t plen = strlen(path) + 1;
+       size_t size = sizeof(struct proftype_pid_mmap64) + plen;
+       void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);
+
+       if (likely(resptr)) {
+               void *ptr = resptr;
+               struct proftype_pid_mmap64 *record;
+
+               ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64);
+               ptr = vb_encode_uint64(ptr, size);
+
+               record = (struct proftype_pid_mmap64 *) ptr;
+               ptr += size;
+
+               record->tstamp = nsec();
+               record->pid = p->pid;
+               record->addr = addr;
+               record->size = msize;
+               record->offset = offset;
+               memcpy(record->path, path, plen);
+
+               qiwrite(profiler_queue, resptr, (int) (ptr - resptr));
+
+               kfree(resptr);
+       }
+}
+
+static void profiler_push_new_process(struct proc *p)
+{
+       size_t plen = strlen(p->binary_path) + 1;
+       size_t size = sizeof(struct proftype_new_process) + plen;
+       void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);
+
+       if (likely(resptr)) {
+               void *ptr = resptr;
+               struct proftype_new_process *record;
+
+               ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS);
+               ptr = vb_encode_uint64(ptr, size);
+
+               record = (struct proftype_new_process *) ptr;
+               ptr += size;
+
+               record->tstamp = nsec();
+               record->pid = p->pid;
+               memcpy(record->path, p->binary_path, plen);
+
+               qiwrite(profiler_queue, resptr, (int) (ptr - resptr));
+
+               kfree(resptr);
+       }
+}
+
+static void profiler_emit_current_system_status(void)
+{
+       void enum_proc(struct vm_region *vmr, void *opaque)
+       {
+               struct proc *p = (struct proc *) opaque;
+
+               profiler_notify_mmap(p, vmr->vm_base, vmr->vm_end - vmr->vm_base,
+                                    vmr->vm_prot, vmr->vm_flags, vmr->vm_file,
+                                    vmr->vm_foff);
+       }
+
+       ERRSTACK(1);
+       struct process_set pset;
+
+       proc_get_set(&pset);
+       if (waserror()) {
+               proc_free_set(&pset);
+               nexterror();
+       }
+
+       for (size_t i = 0; i < pset.num_processes; i++) {
+               profiler_notify_new_process(pset.procs[i]);
+               enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]);
+       }
+
+       poperror();
+       proc_free_set(&pset);
 }
 
 static void free_cpu_buffers(void)
@@ -66,233 +288,269 @@ static void free_cpu_buffers(void)
        kfree(profiler_percpu_ctx);
        profiler_percpu_ctx = NULL;
 
-       qclose(profiler_queue);
-       profiler_queue = NULL;
+       if (profiler_queue) {
+               qfree(profiler_queue);
+               profiler_queue = NULL;
+       }
 }
 
-static int alloc_cpu_buffers(void)
+static void alloc_cpu_buffers(void)
 {
-       int i;
+       ERRSTACK(1);
 
+       /* It is very important that we enqueue and dequeue entire records at once.
+        * If we leave partial records, the entire stream will be corrupt.  Our
+        * reader does its best to make sure it has room for complete records
+        * (checks qlen()).
+        *
+        * If we ever get corrupt streams, try making this a Qmsg.  Though it
+        * doesn't help every situation - we have issues with writes greater than
+        * Maxatomic regardless. */
        profiler_queue = qopen(profiler_queue_limit, 0, NULL, NULL);
        if (!profiler_queue)
-               return -ENOMEM;
-
-       qdropoverflow(profiler_queue, 1);
-       qnonblock(profiler_queue, 1);
+               error(ENOMEM, ERROR_FIXME);
+       if (waserror()) {
+               free_cpu_buffers();
+               nexterror();
+       }
 
        profiler_percpu_ctx =
-               kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, KMALLOC_WAIT);
-       if (!profiler_percpu_ctx)
-               goto fail;
+           kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, MEM_WAIT);
 
-       for (i = 0; i < num_cores; i++) {
+       for (int i = 0; i < num_cores; i++) {
                struct profiler_cpu_context *b = &profiler_percpu_ctx[i];
 
-               b->tracing = 0;
-               spinlock_init_irqsave(&b->lock);
+               b->cpu = i;
        }
-
-       return 0;
-
-fail:
-       qclose(profiler_queue);
-       profiler_queue = NULL;
-       return -ENOMEM;
 }
 
-int profiler_init(void)
+static long profiler_get_checked_value(const char *value, long k, long minval,
+                                       long maxval)
 {
-       int error = 0;
+       long lvalue = strtol(value, NULL, 0) * k;
 
-       sem_down(&mtx);
-       if (!profiler_queue)
-               error = alloc_cpu_buffers();
-       profiler_users++;
-       sem_up(&mtx);
+       if (lvalue < minval)
+               error(EFAIL, "Value should be greater than %ld", minval);
+       if (lvalue > maxval)
+               error(EFAIL, "Value should be lower than %ld", maxval);
 
-       return error;
+       return lvalue;
 }
 
-void profiler_cleanup(void)
+int profiler_configure(struct cmdbuf *cb)
 {
-       sem_down(&mtx);
-       profiler_users--;
-       if (profiler_users == 0)
-               free_cpu_buffers();
-       sem_up(&mtx);
+       if (!strcmp(cb->f[0], "prof_qlimit")) {
+               if (cb->nf < 2)
+                       error(EFAIL, "prof_qlimit KB");
+               if (kref_refcnt(&profiler_kref) > 0)
+                       error(EFAIL, "Profiler already running");
+               profiler_queue_limit = (int) profiler_get_checked_value(
+                       cb->f[1], 1024, 1024 * 1024, max_pmem / 32);
+               return 1;
+       }
+       if (!strcmp(cb->f[0], "prof_cpubufsz")) {
+               if (cb->nf < 2)
+                       error(EFAIL, "prof_cpubufsz KB");
+               profiler_cpu_buffer_size = (size_t) profiler_get_checked_value(
+                       cb->f[1], 1024, 16 * 1024, 1024 * 1024);
+               return 1;
+       }
+
+       return 0;
 }
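For example, writing "prof_qlimit 65536" sets profiler_queue_limit to 65536 * 1024 bytes (64 MB, the compiled-in default); the value is rejected unless it falls within [1024 * 1024, max_pmem / 32].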
 
-static struct block *profiler_cpu_buffer_write_reserve(
-       struct profiler_cpu_context *cpu_buf, struct op_entry *entry, size_t size)
+void profiler_append_configure_usage(char *msgbuf, size_t buflen)
 {
-       struct block *b = cpu_buf->block;
-    size_t totalsize = sizeof(struct op_sample) +
-               size * sizeof(entry->sample->data[0]);
-
-       if (unlikely((!b) || (b->lim - b->wp) < totalsize)) {
-               if (b)
-                       qibwrite(profiler_queue, b);
-               /* For now. Later, we will grab a block off the
-                * emptyblock queue.
-                */
-               cpu_buf->block = b = iallocb(profiler_cpu_buffer_size);
-        if (unlikely(!b)) {
-                       printk("%s: fail\n", __func__);
-                       return NULL;
-               }
+       const char * const cmds[] = {
+               "prof_qlimit",
+               "prof_cpubufsz",
+       };
+
+       for (int i = 0; i < ARRAY_SIZE(cmds); i++) {
+               strlcat(msgbuf, "|", buflen);
+               strlcat(msgbuf, cmds[i], buflen);
        }
-       entry->sample = (struct op_sample *) b->wp;
-       entry->size = size;
-       entry->data = entry->sample->data;
+}
 
-       b->wp += totalsize;
+static void profiler_release(struct kref *kref)
+{
+       bool got_reference = FALSE;
+
+       assert(kref == &profiler_kref);
+       qlock(&profiler_mtx);
+       /* Make sure we did not race with a profiler_setup() that grabbed
+        * profiler_mtx just before us and re-initialized the profiler for a
+        * new user.
+        * If we instead race with another profiler_release() (the user did a
+        * profiler_setup() immediately followed by a profiler_cleanup()), we
+        * are fine, because free_cpu_buffers() can be called multiple times.
+        */
+       if (!kref_get_not_zero(kref, 1))
+               free_cpu_buffers();
+       else
+               got_reference = TRUE;
+       qunlock(&profiler_mtx);
+       /* We cannot call kref_put() while holding profiler_mtx, as such a call
+        * might trigger another call to profiler_release().
+        */
+       if (got_reference)
+               kref_put(kref);
+}
 
-       return b;
+void profiler_init(void)
+{
+       assert(kref_refcnt(&profiler_kref) == 0);
+       kref_init(&profiler_kref, profiler_release, 0);
 }
 
-static inline int profiler_add_sample(struct profiler_cpu_context *cpu_buf,
-                                                                         uintptr_t pc, unsigned long event)
+void profiler_setup(void)
 {
        ERRSTACK(1);
-       struct op_entry entry;
-       struct block *b;
 
+       qlock(&profiler_mtx);
        if (waserror()) {
-               poperror();
-               printk("%s: failed\n", __func__);
-               return 1;
+               qunlock(&profiler_mtx);
+               nexterror();
        }
+       assert(!profiler_queue);
+       alloc_cpu_buffers();
 
-       b = profiler_cpu_buffer_write_reserve(cpu_buf, &entry, 0);
-       if (likely(b)) {
-               entry.sample->hdr = profiler_create_header(core_id(), 1);
-               entry.sample->event = (uint64_t) event;
-               profiler_cpu_buffer_add_data(&entry, &pc, 1);
-       }
-       poperror();
+       /* Do this only once everything else is initialized (it must be the
+        * last init operation). */
+       __kref_get(&profiler_kref, 1);
 
-       return b == NULL;
-}
+       profiler_emit_current_system_status();
 
-static inline void profiler_begin_trace(struct profiler_cpu_context *cpu_buf)
-{
-       cpu_buf->tracing = 1;
+       poperror();
+       qunlock(&profiler_mtx);
 }
 
-static inline void profiler_end_trace(struct profiler_cpu_context *cpu_buf)
+void profiler_cleanup(void)
 {
-       cpu_buf->tracing = 0;
+       kref_put(&profiler_kref);
 }
 
-static void profiler_cpubuf_flushone(int core, int newbuf)
+static void profiler_cpu_flush(struct profiler_cpu_context *cpu_buf)
 {
-       struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core);
+       int8_t irq_state = 0;
 
-       spin_lock_irqsave(&cpu_buf->lock);
-       if (cpu_buf->block) {
-               printk("Core %d has data\n", core);
+       disable_irqsave(&irq_state);
+       if (cpu_buf->block && profiler_queue) {
                qibwrite(profiler_queue, cpu_buf->block);
-               printk("After qibwrite in %s, profiler_queue len %d\n",
-                          __func__, qlen(profiler_queue));
-       }
-       if (newbuf)
-               cpu_buf->block = iallocb(profiler_cpu_buffer_size);
-       else
+
                cpu_buf->block = NULL;
-       spin_unlock_irqsave(&cpu_buf->lock);
+       }
+       enable_irqsave(&irq_state);
 }
 
-void profiler_control_trace(int onoff)
+static void profiler_core_trace_enable(void *opaque)
 {
-       int core;
+       struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
 
-       for (core = 0; core < num_cores; core++) {
-               struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core);
+       cpu_buf->tracing = (int) (opaque != NULL);
+       if (!cpu_buf->tracing)
+               profiler_cpu_flush(cpu_buf);
+}
 
-               cpu_buf->tracing = onoff;
-               if (onoff) {
-                       printk("Enable tracing on %d\n", core);
-               } else {
-                       printk("Disable tracing on %d\n", core);
-                       profiler_cpubuf_flushone(core, 0);
-               }
-       }
+static void profiler_control_trace(int onoff)
+{
+       struct core_set cset;
+
+       error_assert(EINVAL, profiler_percpu_ctx);
+
+       core_set_init(&cset);
+       core_set_fill_available(&cset);
+       smp_do_in_cores(&cset, profiler_core_trace_enable,
+                       (void *) (uintptr_t) onoff);
 }
 
-void profiler_add_trace(uintptr_t pc)
+void profiler_start(void)
 {
-       struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
+       assert(profiler_queue);
+       profiler_control_trace(1);
+       qreopen(profiler_queue);
+}
 
-       if (profiler_percpu_ctx && cpu_buf->tracing)
-               profiler_add_sample(cpu_buf, pc, nsec());
+void profiler_stop(void)
+{
+       assert(profiler_queue);
+       profiler_control_trace(0);
+       qhangup(profiler_queue, 0);
 }
 
-/* Format for samples:
- * first word:
- * high 8 bits is ee, which is an invalid address on amd64.
- * next 8 bits is protocol version
- * next 16 bits is unused, MBZ. Later, we can make it a packet type.
- * next 16 bits is core id
- * next 8 bits is unused
- * next 8 bits is # PCs following. This should be at least 1, for one EIP.
- *
- * second word is time in ns.
- *
- * Third and following words are PCs, there must be at least one of them.
- */
-void profiler_add_backtrace(uintptr_t pc, uintptr_t fp)
-{
-       int cpu = core_id();
-       struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(cpu);
-
-       if (profiler_percpu_ctx && cpu_buf->tracing) {
-               struct op_entry entry;
-               struct block *b;
-               uintptr_t bt_pcs[profiler_backtrace_depth];
-               size_t n = backtrace_list(pc, fp, bt_pcs, profiler_backtrace_depth);
-
-               b = profiler_cpu_buffer_write_reserve(cpu_buf, &entry, n);
-               if (likely(b)) {
-                       entry.sample->hdr = profiler_create_header(cpu, n);
-                       entry.sample->event = nsec();
-                       profiler_cpu_buffer_add_data(&entry, bt_pcs, n);
-               }
-       }
+static void profiler_core_flush(void *opaque)
+{
+       struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
+
+       profiler_cpu_flush(cpu_buf);
 }
 
-void profiler_add_userpc(uintptr_t pc)
+void profiler_trace_data_flush(void)
 {
-       int cpu = core_id();
-       struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(cpu);
+       struct core_set cset;
 
-       if (profiler_percpu_ctx && cpu_buf->tracing) {
-               struct op_entry entry;
-               struct block *b = profiler_cpu_buffer_write_reserve(cpu_buf,
-                                                                                                                       &entry, 1);
+       error_assert(EINVAL, profiler_percpu_ctx);
 
-               if (likely(b)) {
-                       entry.sample->hdr = profiler_create_header(cpu, 1);
-                       entry.sample->event = nsec();
-                       profiler_cpu_buffer_add_data(&entry, &pc, 1);
-               }
+       core_set_init(&cset);
+       core_set_fill_available(&cset);
+       smp_do_in_cores(&cset, profiler_core_flush, NULL);
+}
+
+void profiler_push_kernel_backtrace(uintptr_t *pc_list, size_t nr_pcs,
+                                    uint64_t info)
+{
+       if (kref_get_not_zero(&profiler_kref, 1)) {
+               struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
+
+               if (profiler_percpu_ctx && cpu_buf->tracing)
+                       profiler_push_kernel_trace64(cpu_buf, pc_list, nr_pcs, info);
+               kref_put(&profiler_kref);
        }
 }
 
-void profiler_add_hw_sample(struct hw_trapframe *hw_tf)
+void profiler_push_user_backtrace(uintptr_t *pc_list, size_t nr_pcs,
+                                  uint64_t info)
 {
-       if (in_kernel(hw_tf))
-               profiler_add_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf));
-       else
-               profiler_add_userpc(get_hwtf_pc(hw_tf));
+       if (kref_get_not_zero(&profiler_kref, 1)) {
+               struct proc *p = current;
+               struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
+
+               if (profiler_percpu_ctx && cpu_buf->tracing)
+                       profiler_push_user_trace64(cpu_buf, p, pc_list, nr_pcs, info);
+               kref_put(&profiler_kref);
+       }
 }
 
 int profiler_size(void)
 {
-       return qlen(profiler_queue);
+       return profiler_queue ? qlen(profiler_queue) : 0;
 }
 
 int profiler_read(void *va, int n)
 {
-       return qread(profiler_queue, va, n);
+       return profiler_queue ? qread(profiler_queue, va, n) : 0;
+}
+
+void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
+                          int flags, struct file *f, size_t offset)
+{
+       if (kref_get_not_zero(&profiler_kref, 1)) {
+               if (f && (prot & PROT_EXEC) && profiler_percpu_ctx) {
+                       char path_buf[PROFILER_MAX_PRG_PATH];
+                       char *path = file_abs_path(f, path_buf, sizeof(path_buf));
+
+                       if (likely(path))
+                               profiler_push_pid_mmap(p, addr, size, offset, path);
+               }
+               kref_put(&profiler_kref);
+       }
+}
+
+void profiler_notify_new_process(struct proc *p)
+{
+       if (kref_get_not_zero(&profiler_kref, 1)) {
+               if (profiler_percpu_ctx && p->binary_path)
+                       profiler_push_new_process(p);
+               kref_put(&profiler_kref);
+       }
 }