1 /* Copyright (c) 2015 Google Inc
2  * Davide Libenzi <dlibenzi@google.com>
3  * See LICENSE for details.
4  *
5  * This controls the emitting, collecting, and exporting of samples for perf
6  * events.  Examples of events are PMU counter overflows, mmaps, and process
7  * creation.
8  *
9  * Events are collected in a central qio queue.  High-frequency events (e.g.
10  * profiler_add_hw_sample()) are collected in per-core buffers, which are
11  * flushed to the central queue when they fill up or on command.
12  * Lower-frequency events (e.g. profiler_notify_mmap()) just go straight to the
13  * central queue.
14  *
15  * Currently there is one global profiler.  Kprof is careful to only have one
16  * open profiler at a time.  We assert that this is true.  TODO: stop using the
17  * global profiler!
18  *
19  * A few other notes:
20  * - profiler_control_trace() controls per-core trace collection.  Disabling
21  *   it also flushes the per-core blocks to the central queue.
22  * - The collection of mmap and comm samples is independent of trace collection.
23  *   Those will occur whenever the profiler is open (refcnt check, for now). */
24
25 #include <ros/common.h>
26 #include <ros/mman.h>
27 #include <sys/types.h>
28 #include <smp.h>
29 #include <trap.h>
30 #include <kthread.h>
31 #include <env.h>
32 #include <process.h>
33 #include <mm.h>
34 #include <vfs.h>
35 #include <kmalloc.h>
36 #include <pmap.h>
37 #include <kref.h>
38 #include <atomic.h>
39 #include <umem.h>
40 #include <elf.h>
41 #include <ns.h>
42 #include <err.h>
43 #include <core_set.h>
44 #include <string.h>
45 #include "profiler.h"
46
47 #define PROFILER_MAX_PRG_PATH   256
48 #define PROFILER_BT_DEPTH 16
49
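/* Worked example for the macro below: for a uint64_t, (8 * 8 + 6) / 7 = 10,
 * i.e. ten 7-bit groups cover 64 bits, so an encoded value takes at most 10
 * bytes.  The "+ 6" rounds the division up. */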
50 #define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)
51
52 /* Do not rely on the contents of the PCPU ctx with IRQs enabled. */
53 struct profiler_cpu_context {
54         struct block *block;            /* block being filled on this core */
55         int cpu;                        /* core this context belongs to */
56         int tracing;                    /* nonzero while trace collection is on */
57         size_t dropped_data_cnt;        /* blocks dropped by a full central queue */
58 };
59
60 static int profiler_queue_limit = 64 * 1024 * 1024;
61 static size_t profiler_cpu_buffer_size = 65536;
62 static qlock_t profiler_mtx = QLOCK_INITIALIZER(profiler_mtx);
63 static struct kref profiler_kref;
64 static struct profiler_cpu_context *profiler_percpu_ctx;
65 static struct queue *profiler_queue;
66
67 static inline struct profiler_cpu_context *profiler_get_cpu_ctx(int cpu)
68 {
69         return profiler_percpu_ctx + cpu;
70 }
71
72 static inline char *vb_encode_uint64(char *data, uint64_t n)
73 {
74         /* Classic variable-byte encoding: emits 7 bits of n per byte, least
75          * significant group first, setting bit 7 on every byte except the
76          * last one (a clear bit 7 marks the end of the sequence). */
77         for (; n >= 0x80; n >>= 7)
78                 *data++ = (char) (n | 0x80);
79         *data++ = (char) n;
80
81         return data;
82 }
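
/* Illustrative only (not part of the original file): a matching decoder sketch
 * for the encoding above, shown to document the wire format.  The name
 * vb_decode_uint64() is hypothetical; actual decoding happens in the userspace
 * perf tooling that consumes this stream. */
static inline const char *vb_decode_uint64(const char *data, uint64_t *pn)
{
        const unsigned char *p = (const unsigned char *) data;
        uint64_t n = 0;
        int shift = 0;

        /* Bytes with bit 7 set carry seven more bits; the final byte of a
         * sequence has bit 7 clear. */
        while (*p & 0x80) {
                n |= (uint64_t) (*p++ & 0x7f) << shift;
                shift += 7;
        }
        n |= (uint64_t) *p++ << shift;
        *pn = n;

        return (const char *) p;
}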
83
84 static struct block *profiler_buffer_write(struct profiler_cpu_context *cpu_buf,
85                                            struct block *b)
86 {
 87         /* qpass() will drop b if the queue is over its limit.  We're willing to
 88          * lose traces, but we won't lose 'control' events, such as MMAP and PID. */
89         if (b) {
90                 if (qpass(profiler_queue, b) < 0)
91                         cpu_buf->dropped_data_cnt++;
92         }
93         return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
94 }
95
96 /* Helper, paired with profiler_cpu_buffer_write_commit.  Ensures there is
97  * enough room in the pcpu block for our write.  May alloc a new one.
98  *
99  * IRQs must be disabled before calling, until after write_commit. */
100 static char *profiler_cpu_buffer_write_reserve(
101         struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
102 {
103         struct block *b = cpu_buf->block;
104
105         if (unlikely((!b) || (b->lim - b->wp) < size)) {
106                 cpu_buf->block = b = profiler_buffer_write(cpu_buf, b);
107                 if (unlikely(!b))
108                         return NULL;
109         }
110         *pb = b;
111
112         return (char *) b->wp;
113 }
114
115 /* Helper, paired with write_reserve.  Finalizes the write of @size bytes into
116  * the block's main body.  IRQs must be disabled until after this is called.
117  */
118 static inline void profiler_cpu_buffer_write_commit(
119         struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
120 {
121         b->wp += size;
122 }
123
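/* Each record in the stream is preceded by an "envelope": a VBE-encoded type
 * tag followed by a VBE-encoded payload size.  Both are uint64_t, so the
 * envelope takes at most 2 * VBE_MAX_SIZE(uint64_t) bytes. */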
124 static inline size_t profiler_max_envelope_size(void)
125 {
126         return 2 * VBE_MAX_SIZE(uint64_t);
127 }
128
129 static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
130                                          const uintptr_t *trace, size_t count,
131                                          uint64_t info)
132 {
133         struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
134         size_t size = sizeof(struct proftype_kern_trace64) +
135                 count * sizeof(uint64_t);
136         struct block *b;
137         void *resptr, *ptr;
138
139         assert(!irq_is_enabled());
140         resptr = profiler_cpu_buffer_write_reserve(
141             cpu_buf, size + profiler_max_envelope_size(), &b);
142         ptr = resptr;
143
144         if (likely(ptr)) {
145                 struct proftype_kern_trace64 *record;
146
147                 ptr = vb_encode_uint64(ptr, PROFTYPE_KERN_TRACE64);
148                 ptr = vb_encode_uint64(ptr, size);
149
150                 record = (struct proftype_kern_trace64 *) ptr;
151                 ptr += size;
152
153                 record->info = info;
154                 record->tstamp = nsec();
155                 if (is_ktask(pcpui->cur_kthread) || !pcpui->cur_proc)
156                         record->pid = -1;
157                 else
158                         record->pid = pcpui->cur_proc->pid;
159                 record->cpu = cpu_buf->cpu;
160                 record->num_traces = count;
161                 for (size_t i = 0; i < count; i++)
162                         record->trace[i] = (uint64_t) trace[i];
163
164                 profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
165         }
166 }
167
168 static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
169                                        struct proc *p, const uintptr_t *trace,
170                                        size_t count, uint64_t info)
171 {
172         size_t size = sizeof(struct proftype_user_trace64) +
173                 count * sizeof(uint64_t);
174         struct block *b;
175         void *resptr, *ptr;
176
177         assert(!irq_is_enabled());
178         resptr = profiler_cpu_buffer_write_reserve(
179             cpu_buf, size + profiler_max_envelope_size(), &b);
180         ptr = resptr;
181
182         if (likely(ptr)) {
183                 struct proftype_user_trace64 *record;
184
185                 ptr = vb_encode_uint64(ptr, PROFTYPE_USER_TRACE64);
186                 ptr = vb_encode_uint64(ptr, size);
187
188                 record = (struct proftype_user_trace64 *) ptr;
189                 ptr += size;
190
191                 record->info = info;
192                 record->tstamp = nsec();
193                 record->pid = p->pid;
194                 record->cpu = cpu_buf->cpu;
195                 record->num_traces = count;
196                 for (size_t i = 0; i < count; i++)
197                         record->trace[i] = (uint64_t) trace[i];
198
199                 profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
200         }
201 }
202
203 static void profiler_push_pid_mmap(struct proc *p, uintptr_t addr, size_t msize,
204                                    size_t offset, const char *path)
205 {
206         size_t plen = strlen(path) + 1;
207         size_t size = sizeof(struct proftype_pid_mmap64) + plen;
208         void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);
209
210         if (likely(resptr)) {
211                 void *ptr = resptr;
212                 struct proftype_pid_mmap64 *record;
213
214                 ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64);
215                 ptr = vb_encode_uint64(ptr, size);
216
217                 record = (struct proftype_pid_mmap64 *) ptr;
218                 ptr += size;
219
220                 record->tstamp = nsec();
221                 record->pid = p->pid;
222                 record->addr = addr;
223                 record->size = msize;
224                 record->offset = offset;
225                 memcpy(record->path, path, plen);
226
227                 qiwrite(profiler_queue, resptr, (int) (ptr - resptr));
228
229                 kfree(resptr);
230         }
231 }
232
233 static void profiler_push_new_process(struct proc *p)
234 {
235         size_t plen = strlen(p->binary_path) + 1;
236         size_t size = sizeof(struct proftype_new_process) + plen;
237         void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);
238
239         if (likely(resptr)) {
240                 void *ptr = resptr;
241                 struct proftype_new_process *record;
242
243                 ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS);
244                 ptr = vb_encode_uint64(ptr, size);
245
246                 record = (struct proftype_new_process *) ptr;
247                 ptr += size;
248
249                 record->tstamp = nsec();
250                 record->pid = p->pid;
251                 memcpy(record->path, p->binary_path, plen);
252
253                 qiwrite(profiler_queue, resptr, (int) (ptr - resptr));
254
255                 kfree(resptr);
256         }
257 }
258
259 static void profiler_emit_current_system_status(void)
260 {
261         void enum_proc(struct vm_region *vmr, void *opaque)
262         {
263                 struct proc *p = (struct proc *) opaque;
264
265                 profiler_notify_mmap(p, vmr->vm_base, vmr->vm_end - vmr->vm_base,
266                                      vmr->vm_prot, vmr->vm_flags, vmr->vm_file,
267                                      vmr->vm_foff);
268         }
269
270         ERRSTACK(1);
271         struct process_set pset;
272
273         proc_get_set(&pset);
274         if (waserror()) {
275                 proc_free_set(&pset);
276                 nexterror();
277         }
278
279         for (size_t i = 0; i < pset.num_processes; i++) {
280                 profiler_notify_new_process(pset.procs[i]);
281                 enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]);
282         }
283
284         poperror();
285         proc_free_set(&pset);
286 }
287
288 static void free_cpu_buffers(void)
289 {
290         kfree(profiler_percpu_ctx);
291         profiler_percpu_ctx = NULL;
292
293         if (profiler_queue) {
294                 qfree(profiler_queue);
295                 profiler_queue = NULL;
296         }
297 }
298
299 static void alloc_cpu_buffers(void)
300 {
301         ERRSTACK(1);
302
303         /* It is very important that we enqueue and dequeue entire records at once.
304          * If we leave partial records, the entire stream will be corrupt.  Our
305          * reader does its best to make sure it has room for complete records
306          * (checks qlen()).
307          *
308          * If we ever get corrupt streams, try making this a Qmsg.  Though it
309          * doesn't help every situation - we have issues with writes greater than
310          * Maxatomic regardless. */
311         profiler_queue = qopen(profiler_queue_limit, 0, NULL, NULL);
312         if (!profiler_queue)
313                 error(ENOMEM, ERROR_FIXME);
314         if (waserror()) {
315                 free_cpu_buffers();
316                 nexterror();
317         }
318
319         profiler_percpu_ctx =
320             kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, MEM_WAIT);
321
322         for (int i = 0; i < num_cores; i++) {
323                 struct profiler_cpu_context *b = &profiler_percpu_ctx[i];
324
325                 b->cpu = i;
326         }
327 }
328
329 static long profiler_get_checked_value(const char *value, long k, long minval,
330                                        long maxval)
331 {
332         long lvalue = strtol(value, NULL, 0) * k;
333
334         if (lvalue < minval)
335                 error(EFAIL, "Value must be at least %ld", minval);
336         if (lvalue > maxval)
337                 error(EFAIL, "Value must be at most %ld", maxval);
338
339         return lvalue;
340 }
341
342 int profiler_configure(struct cmdbuf *cb)
343 {
344         if (!strcmp(cb->f[0], "prof_qlimit")) {
345                 if (cb->nf < 2)
346                         error(EFAIL, "prof_qlimit KB");
347                 if (kref_refcnt(&profiler_kref) > 0)
348                         error(EFAIL, "Profiler already running");
349                 profiler_queue_limit = (int) profiler_get_checked_value(
350                         cb->f[1], 1024, 1024 * 1024, max_pmem / 32);
351                 return 1;
352         }
353         if (!strcmp(cb->f[0], "prof_cpubufsz")) {
354                 if (cb->nf < 2)
355                         error(EFAIL, "prof_cpubufsz KB");
356                 profiler_cpu_buffer_size = (size_t) profiler_get_checked_value(
357                         cb->f[1], 1024, 16 * 1024, 1024 * 1024);
358                 return 1;
359         }
360
361         return 0;
362 }
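
/* A minimal usage sketch, not from the original file: how a write to a kprof
 * ctl-style file could be routed into profiler_configure().  It assumes the
 * Plan 9-style parsecmd() helper and that kfree() is the right way to release
 * the cmdbuf; both are assumptions, and error handling is elided. */
static void profiler_configure_sketch(char *cmd, size_t len)
{
        struct cmdbuf *cb = parsecmd(cmd, len);

        /* E.g. cmd = "prof_qlimit 65536" requests a 64 MiB central queue: the
         * argument is in KB and is multiplied by 1024 above. */
        if (!profiler_configure(cb))
                error(EFAIL, "unrecognized profiler command");
        kfree(cb);
}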
363
364 void profiler_append_configure_usage(char *msgbuf, size_t buflen)
365 {
366         const char * const cmds[] = {
367                 "prof_qlimit",
368                 "prof_cpubufsz",
369         };
370
371         for (int i = 0; i < ARRAY_SIZE(cmds); i++) {
372                 strlcat(msgbuf, "|", buflen);
373                 strlcat(msgbuf, cmds[i], buflen);
374         }
375 }
376
377 static void profiler_release(struct kref *kref)
378 {
379         bool got_reference = FALSE;
380
381         assert(kref == &profiler_kref);
382         qlock(&profiler_mtx);
383         /* Make sure we did not race with a profiler_setup() that grabbed
384          * profiler_mtx just before us and re-initialized the profiler for a
385          * new user.
386          * If we race here with another profiler_release() (the user did a
387          * profiler_setup() immediately followed by a profiler_cleanup()), we
388          * are fine, because free_cpu_buffers() can be called multiple times.
389          */
390         if (!kref_get_not_zero(kref, 1))
391                 free_cpu_buffers();
392         else
393                 got_reference = TRUE;
394         qunlock(&profiler_mtx);
395         /* We cannot call kref_put() while holding profiler_mtx, since that
396          * call might trigger another call to profiler_release().
397          */
398         if (got_reference)
399                 kref_put(kref);
400 }
401
402 void profiler_init(void)
403 {
404         assert(kref_refcnt(&profiler_kref) == 0);
405         kref_init(&profiler_kref, profiler_release, 0);
406 }
407
408 void profiler_setup(void)
409 {
410         ERRSTACK(1);
411
412         qlock(&profiler_mtx);
413         if (waserror()) {
414                 qunlock(&profiler_mtx);
415                 nexterror();
416         }
417         assert(!profiler_queue);
418         alloc_cpu_buffers();
419
420         /* Take the kref only once everything is initialized; this must be the
421          * last init operation. */
422         __kref_get(&profiler_kref, 1);
423
424         profiler_emit_current_system_status();
425
426         poperror();
427         qunlock(&profiler_mtx);
428 }
429
430 void profiler_cleanup(void)
431 {
432         kref_put(&profiler_kref);
433 }
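
/* Rough lifecycle sketch (illustrative, not from the original file) of how a
 * single session, e.g. driven by kprof, uses this API:
 *
 *	profiler_init();              boot-time, once
 *	profiler_setup();             open: alloc queue and per-core buffers
 *	profiler_start();             enable per-core tracing
 *	... profiler_add_hw_sample() et al. fire from IRQ/NMI context ...
 *	profiler_stop();              disable tracing, hang up the queue
 *	profiler_trace_data_flush();  push any leftover per-core blocks
 *	... profiler_read() drains the central queue ...
 *	profiler_cleanup();           drop the kref; buffers freed on last put
 */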
434
435 static void profiler_cpu_flush(struct profiler_cpu_context *cpu_buf)
436 {
437         int8_t irq_state = 0;
438
439         disable_irqsave(&irq_state);
440         if (cpu_buf->block && profiler_queue) {
441                 qibwrite(profiler_queue, cpu_buf->block);
442
443                 cpu_buf->block = NULL;
444         }
445         enable_irqsave(&irq_state);
446 }
447
448 static void profiler_core_trace_enable(void *opaque)
449 {
450         struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
451
452         cpu_buf->tracing = (int) (opaque != NULL);
453         if (!cpu_buf->tracing)
454                 profiler_cpu_flush(cpu_buf);
455 }
456
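/* Enabling/disabling runs on each core via smp_do_in_cores(), so every core
 * touches only its own profiler_cpu_context (see the note above about not
 * relying on the PCPU ctx with IRQs enabled). */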
457 static void profiler_control_trace(int onoff)
458 {
459         struct core_set cset;
460
461         error_assert(EINVAL, profiler_percpu_ctx);
462
463         core_set_init(&cset);
464         core_set_fill_available(&cset);
465         smp_do_in_cores(&cset, profiler_core_trace_enable,
466                         (void *) (uintptr_t) onoff);
467 }
468
469 void profiler_start(void)
470 {
471         assert(profiler_queue);
472         profiler_control_trace(1);
473         qreopen(profiler_queue);
474 }
475
476 void profiler_stop(void)
477 {
478         assert(profiler_queue);
479         profiler_control_trace(0);
480         qhangup(profiler_queue, 0);
481 }
482
483 static void profiler_core_flush(void *opaque)
484 {
485         struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
486
487         profiler_cpu_flush(cpu_buf);
488 }
489
490 void profiler_trace_data_flush(void)
491 {
492         struct core_set cset;
493
494         error_assert(EINVAL, profiler_percpu_ctx);
495
496         core_set_init(&cset);
497         core_set_fill_available(&cset);
498         smp_do_in_cores(&cset, profiler_core_flush, NULL);
499 }
500
501 void profiler_add_trace(uintptr_t pc, uint64_t info)
502 {
503         if (is_user_raddr((void *) pc, 1))
504                 profiler_add_user_backtrace(pc, 0, info);
505         else
506                 profiler_add_kernel_backtrace(pc, 0, info);
507 }
508
509 void profiler_add_kernel_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
510 {
511         if (kref_get_not_zero(&profiler_kref, 1)) {
512                 struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
513
514                 if (profiler_percpu_ctx && cpu_buf->tracing) {
515                         uintptr_t trace[PROFILER_BT_DEPTH];
516                         size_t n = 1;
517
518                         trace[0] = pc;
519                         if (likely(fp))
520                                 n = backtrace_list(pc, fp, trace + 1,
521                                                    PROFILER_BT_DEPTH - 1) + 1;
522
523                         profiler_push_kernel_trace64(cpu_buf, trace, n, info);
524                 }
525                 kref_put(&profiler_kref);
526         }
527 }
528
529 void profiler_add_user_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
530 {
531         if (kref_get_not_zero(&profiler_kref, 1)) {
532                 struct proc *p = current;
533                 struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());
534
535                 if (p && profiler_percpu_ctx && cpu_buf->tracing) {
536                         uintptr_t trace[PROFILER_BT_DEPTH];
537                         size_t n = 1;
538
539                         trace[0] = pc;
540                         if (likely(fp))
541                                 n = backtrace_user_list(pc, fp, trace + 1,
542                                                         PROFILER_BT_DEPTH - 1) + 1;
543
544                         profiler_push_user_trace64(cpu_buf, p, trace, n, info);
545                 }
546                 kref_put(&profiler_kref);
547         }
548 }
549
550 void profiler_add_hw_sample(struct hw_trapframe *hw_tf, uint64_t info)
551 {
552         if (in_kernel(hw_tf))
553                 profiler_add_kernel_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
554                                               info);
555         else
556                 profiler_add_user_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
557                                             info);
558 }
559
560 int profiler_size(void)
561 {
562         return profiler_queue ? qlen(profiler_queue) : 0;
563 }
564
565 int profiler_read(void *va, int n)
566 {
567         return profiler_queue ? qread(profiler_queue, va, n) : 0;
568 }
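
/* Sketch only (not in the original): how a reader such as kprof's data file
 * might drain the profiler.  Per the comment in alloc_cpu_buffers(), the
 * reader should pull whole records where it can; checking profiler_size()
 * before each read keeps this example from blocking on an empty queue. */
static size_t profiler_drain_sketch(char *buf, size_t bufsz)
{
        size_t copied = 0;

        while (copied < bufsz && profiler_size() > 0) {
                int ret = profiler_read(buf + copied, bufsz - copied);

                if (ret <= 0)
                        break;
                copied += ret;
        }
        return copied;
}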
569
570 void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
571                           int flags, struct file *f, size_t offset)
572 {
573         if (kref_get_not_zero(&profiler_kref, 1)) {
574                 if (f && (prot & PROT_EXEC) && profiler_percpu_ctx) {
575                         char path_buf[PROFILER_MAX_PRG_PATH];
576                         char *path = file_abs_path(f, path_buf, sizeof(path_buf));
577
578                         if (likely(path))
579                                 profiler_push_pid_mmap(p, addr, size, offset, path);
580                 }
581                 kref_put(&profiler_kref);
582         }
583 }
584
585 void profiler_notify_new_process(struct proc *p)
586 {
587         if (kref_get_not_zero(&profiler_kref, 1)) {
588                 if (profiler_percpu_ctx && p->binary_path)
589                         profiler_push_new_process(p);
590                 kref_put(&profiler_kref);
591         }
592 }