kern/src/profiler.c
/* Copyright (c) 2015 Google Inc
 * Davide Libenzi <dlibenzi@google.com>
 * See LICENSE for details.
 *
 * This controls the emitting, collecting, and exporting of samples for perf
 * events.  Examples of events are PMU counter overflows, mmaps, and process
 * creation.
 *
 * Events are collected in a central qio queue.  High-frequency events (e.g.
 * IRQ backtraces) are collected in per-core buffers, which are flushed to the
 * central queue when they fill up or on command.  Lower-frequency events (e.g.
 * profiler_notify_mmap()) just go straight to the central queue.
 *
 * Currently there is one global profiler.  Kprof is careful to have only one
 * open profiler at a time.  We assert that this is true.  TODO: stop using the
 * global profiler!
 *
 * A few other notes:
 * - profiler_control_trace() controls the per-core trace collection.  When it
 *   is disabled, it also flushes the per-core blocks to the central queue.
 * - The collection of mmap and comm samples is independent of trace collection.
 *   Those will occur whenever the profiler is open (refcnt check, for now). */

#include <ros/common.h>
#include <ros/mman.h>
#include <sys/types.h>
#include <smp.h>
#include <trap.h>
#include <kthread.h>
#include <env.h>
#include <process.h>
#include <mm.h>
#include <vfs.h>
#include <kmalloc.h>
#include <pmap.h>
#include <kref.h>
#include <atomic.h>
#include <umem.h>
#include <elf.h>
#include <ns.h>
#include <err.h>
#include <core_set.h>
#include <string.h>
#include "profiler.h"

#define PROFILER_MAX_PRG_PATH   256

#define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)

/* Do not rely on the contents of the PCPU ctx with IRQs enabled. */
struct profiler_cpu_context {
        struct block *block;
        int cpu;
        int tracing;
        size_t dropped_data_cnt;
};

static int profiler_queue_limit = 64 * 1024 * 1024;
static size_t profiler_cpu_buffer_size = 65536;
static qlock_t profiler_mtx = QLOCK_INITIALIZER(profiler_mtx);
static struct kref profiler_kref;
static struct profiler_cpu_context *profiler_percpu_ctx;
static struct queue *profiler_queue;

static inline struct profiler_cpu_context *profiler_get_cpu_ctx(int cpu)
{
        return profiler_percpu_ctx + cpu;
}

static inline char *vb_encode_uint64(char *data, uint64_t n)
{
        /* Classic variable-byte encoding: emit 7 bits at a time, least
         * significant group first, using bit 7 of each byte as a continuation
         * flag (it is clear on the final byte of the sequence). */
        for (; n >= 0x80; n >>= 7)
                *data++ = (char) (n | 0x80);
        *data++ = (char) n;

        return data;
}
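
/* For reference, a matching decoder (a minimal, illustrative sketch: this file
 * only ever encodes, and the stream is decoded by whoever consumes the kprof
 * data).  The helper name vb_decode_uint64() is ours, not an existing API. */
static inline const char *vb_decode_uint64(const char *data, uint64_t *pn)
{
        uint64_t n = 0;
        int shift = 0;

        /* Bytes with bit 7 set carry seven more payload bits, LSB group first. */
        while (*data & 0x80) {
                n |= (uint64_t) (*data++ & 0x7f) << shift;
                shift += 7;
        }
        /* The final byte has bit 7 clear and holds the most significant bits. */
        n |= (uint64_t) *data++ << shift;
        *pn = n;

        return data;
}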

static struct block *profiler_buffer_write(struct profiler_cpu_context *cpu_buf,
                                           struct block *b)
{
        /* qpass will drop b if the queue is over its limit.  we're willing to lose
         * traces, but we won't lose 'control' events, such as MMAP and PID. */
        if (b) {
                if (qpass(profiler_queue, b) < 0)
                        cpu_buf->dropped_data_cnt++;
        }
        return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
}

/* Helper, paired with profiler_cpu_buffer_write_commit.  Ensures there is
 * enough room in the pcpu block for our write.  May alloc a new one.
 *
 * IRQs must be disabled before calling, until after write_commit. */
static char *profiler_cpu_buffer_write_reserve(
        struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
{
        struct block *b = cpu_buf->block;

        if (unlikely((!b) || (b->lim - b->wp) < size)) {
                cpu_buf->block = b = profiler_buffer_write(cpu_buf, b);
                if (unlikely(!b))
                        return NULL;
        }
        *pb = b;

        return (char *) b->wp;
}

/* Helper, paired with write_reserve.  Finalizes the writing into the block's
 * main body of @size bytes.  IRQs must be disabled until after this is called.
 */
static inline void profiler_cpu_buffer_write_commit(
        struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
{
        b->wp += size;
}

static inline size_t profiler_max_envelope_size(void)
{
        return 2 * VBE_MAX_SIZE(uint64_t);
}
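
/* Every record the profiler emits is framed as:
 *
 *      [varint type][varint size][record payload of 'size' bytes]
 *
 * where the two varints together fit within profiler_max_envelope_size() bytes.
 * Below is a minimal consumer-side sketch of walking such a stream
 * (illustrative only; it reuses the vb_decode_uint64() sketch above, and in
 * practice this parsing is done by tools reading the kprof data, not here): */
static inline void profiler_walk_stream_example(const char *buf, size_t len)
{
        const char *ptr = buf;

        while (ptr < buf + len) {
                uint64_t type, size;

                ptr = vb_decode_uint64(ptr, &type);
                ptr = vb_decode_uint64(ptr, &size);
                /* 'ptr' now points at one of the proftype_* records (e.g.
                 * proftype_kern_trace64), as selected by 'type'. */
                ptr += size;
        }
}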

static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
                                         const uintptr_t *trace, size_t count,
                                         uint64_t info)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
        size_t size = sizeof(struct proftype_kern_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr, *ptr;

        assert(!irq_is_enabled());
        resptr = profiler_cpu_buffer_write_reserve(
            cpu_buf, size + profiler_max_envelope_size(), &b);
        ptr = resptr;

        if (likely(ptr)) {
                struct proftype_kern_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_KERN_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_kern_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                if (is_ktask(pcpui->cur_kthread) || !pcpui->cur_proc)
                        record->pid = -1;
                else
                        record->pid = pcpui->cur_proc->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
                                       struct proc *p, const uintptr_t *trace,
                                       size_t count, uint64_t info)
{
        size_t size = sizeof(struct proftype_user_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr, *ptr;

        assert(!irq_is_enabled());
        resptr = profiler_cpu_buffer_write_reserve(
            cpu_buf, size + profiler_max_envelope_size(), &b);
        ptr = resptr;

        if (likely(ptr)) {
                struct proftype_user_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_USER_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_user_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                record->pid = p->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

static void profiler_push_pid_mmap(struct proc *p, uintptr_t addr, size_t msize,
                                   size_t offset, const char *path)
{
        size_t plen = strlen(path) + 1;
        size_t size = sizeof(struct proftype_pid_mmap64) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_pid_mmap64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_pid_mmap64 *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                record->addr = addr;
                record->size = msize;
                record->offset = offset;
                memcpy(record->path, path, plen);

                qiwrite(profiler_queue, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

static void profiler_push_new_process(struct proc *p)
{
        size_t plen = strlen(p->binary_path) + 1;
        size_t size = sizeof(struct proftype_new_process) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_new_process *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_new_process *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                memcpy(record->path, p->binary_path, plen);

                qiwrite(profiler_queue, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

static void profiler_emit_current_system_status(void)
{
        void enum_proc(struct vm_region *vmr, void *opaque)
        {
                struct proc *p = (struct proc *) opaque;

                profiler_notify_mmap(p, vmr->vm_base, vmr->vm_end - vmr->vm_base,
                                     vmr->vm_prot, vmr->vm_flags, vmr->vm_file,
                                     vmr->vm_foff);
        }

        ERRSTACK(1);
        struct process_set pset;

        proc_get_set(&pset);
        if (waserror()) {
                proc_free_set(&pset);
                nexterror();
        }

        for (size_t i = 0; i < pset.num_processes; i++) {
                profiler_notify_new_process(pset.procs[i]);
                enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]);
        }

        poperror();
        proc_free_set(&pset);
}

static void free_cpu_buffers(void)
{
        kfree(profiler_percpu_ctx);
        profiler_percpu_ctx = NULL;

        if (profiler_queue) {
                qfree(profiler_queue);
                profiler_queue = NULL;
        }
}

static void alloc_cpu_buffers(void)
{
        ERRSTACK(1);

        /* It is very important that we enqueue and dequeue entire records at once.
         * If we leave partial records, the entire stream will be corrupt.  Our
         * reader does its best to make sure it has room for complete records
         * (checks qlen()).
         *
         * If we ever get corrupt streams, try making this a Qmsg.  Though it
         * doesn't help every situation - we have issues with writes greater than
         * Maxatomic regardless. */
        profiler_queue = qopen(profiler_queue_limit, 0, NULL, NULL);
        if (!profiler_queue)
                error(ENOMEM, ERROR_FIXME);
        if (waserror()) {
                free_cpu_buffers();
                nexterror();
        }

        profiler_percpu_ctx =
            kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, MEM_WAIT);

        for (int i = 0; i < num_cores; i++) {
                struct profiler_cpu_context *b = &profiler_percpu_ctx[i];

                b->cpu = i;
        }
        poperror();
}

static long profiler_get_checked_value(const char *value, long k, long minval,
                                       long maxval)
{
        long lvalue = strtol(value, NULL, 0) * k;

        if (lvalue < minval)
                error(EFAIL, "Value should be at least %ld", minval);
        if (lvalue > maxval)
                error(EFAIL, "Value should be at most %ld", maxval);

        return lvalue;
}

int profiler_configure(struct cmdbuf *cb)
{
        if (!strcmp(cb->f[0], "prof_qlimit")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_qlimit KB");
                if (kref_refcnt(&profiler_kref) > 0)
                        error(EFAIL, "Profiler already running");
                profiler_queue_limit = (int) profiler_get_checked_value(
                        cb->f[1], 1024, 1024 * 1024, max_pmem / 32);
                return 1;
        }
        if (!strcmp(cb->f[0], "prof_cpubufsz")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_cpubufsz KB");
                profiler_cpu_buffer_size = (size_t) profiler_get_checked_value(
                        cb->f[1], 1024, 16 * 1024, 1024 * 1024);
                return 1;
        }

        return 0;
}
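
/* For example (illustrative; the exact path depends on where the kprof device
 * is mounted, but these are the strings profiler_configure() parses, with the
 * numeric argument given in KB):
 *
 *      prof_qlimit 65536       (central queue limit; only while the profiler
 *                               is not running)
 *      prof_cpubufsz 64        (per-core buffer size)
 *
 * Kprof is expected to hand these to us from writes to its control file. */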

void profiler_append_configure_usage(char *msgbuf, size_t buflen)
{
        const char * const cmds[] = {
                "prof_qlimit",
                "prof_cpubufsz",
        };

        for (int i = 0; i < ARRAY_SIZE(cmds); i++) {
                strlcat(msgbuf, "|", buflen);
                strlcat(msgbuf, cmds[i], buflen);
        }
}

static void profiler_release(struct kref *kref)
{
        bool got_reference = FALSE;

        assert(kref == &profiler_kref);
        qlock(&profiler_mtx);
        /* Make sure we did not race with profiler_setup(), which may have
         * grabbed profiler_mtx just before us and re-initialized the profiler
         * for a new user.
         * If we race here with another profiler_release() (the user did a
         * profiler_setup() immediately followed by a profiler_cleanup()), we
         * are fine, because free_cpu_buffers() can be called multiple times.
         */
        if (!kref_get_not_zero(kref, 1))
                free_cpu_buffers();
        else
                got_reference = TRUE;
        qunlock(&profiler_mtx);
        /* We cannot call kref_put() while holding profiler_mtx, since such a
         * call might trigger another call to profiler_release().
         */
        if (got_reference)
                kref_put(kref);
}

void profiler_init(void)
{
        assert(kref_refcnt(&profiler_kref) == 0);
        kref_init(&profiler_kref, profiler_release, 0);
}

void profiler_setup(void)
{
        ERRSTACK(1);

        qlock(&profiler_mtx);
        if (waserror()) {
                qunlock(&profiler_mtx);
                nexterror();
        }
        assert(!profiler_queue);
        alloc_cpu_buffers();

        /* Do this only once everything else is initialized (it must be the
         * last init operation). */
        __kref_get(&profiler_kref, 1);

        profiler_emit_current_system_status();

        poperror();
        qunlock(&profiler_mtx);
}

void profiler_cleanup(void)
{
        kref_put(&profiler_kref);
}

static void profiler_cpu_flush(struct profiler_cpu_context *cpu_buf)
{
        int8_t irq_state = 0;

        disable_irqsave(&irq_state);
        if (cpu_buf->block && profiler_queue) {
                qibwrite(profiler_queue, cpu_buf->block);

                cpu_buf->block = NULL;
        }
        enable_irqsave(&irq_state);
}

static void profiler_core_trace_enable(void *opaque)
{
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

        cpu_buf->tracing = (int) (opaque != NULL);
        if (!cpu_buf->tracing)
                profiler_cpu_flush(cpu_buf);
}

static void profiler_control_trace(int onoff)
{
        struct core_set cset;

        error_assert(EINVAL, profiler_percpu_ctx);

        core_set_init(&cset);
        core_set_fill_available(&cset);
        smp_do_in_cores(&cset, profiler_core_trace_enable,
                        (void *) (uintptr_t) onoff);
}

void profiler_start(void)
{
        assert(profiler_queue);
        profiler_control_trace(1);
        qreopen(profiler_queue);
}

void profiler_stop(void)
{
        assert(profiler_queue);
        profiler_control_trace(0);
        qhangup(profiler_queue, 0);
}
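
/* The expected call sequence from the profiler's user (kprof, in practice),
 * shown as a minimal illustrative sketch -- the function name and buffer here
 * are made up, and error handling (waserror()) is omitted: */
static inline void profiler_lifecycle_example(void)
{
        char buf[4096];
        int ret;

        profiler_setup();               /* allocate queue + per-core buffers */
        profiler_start();               /* enable per-core trace collection */
        /* ... profiler_push_*_backtrace() and the notify hooks add samples ... */
        profiler_trace_data_flush();    /* optional: flush per-core blocks now */
        profiler_stop();                /* disable tracing, flush, hang up queue */
        ret = profiler_read(buf, sizeof(buf));  /* drain (part of) the stream */
        profiler_cleanup();             /* drop our ref; last ref frees buffers */
        (void) ret;
}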

static void profiler_core_flush(void *opaque)
{
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

        profiler_cpu_flush(cpu_buf);
}

void profiler_trace_data_flush(void)
{
        struct core_set cset;

        error_assert(EINVAL, profiler_percpu_ctx);

        core_set_init(&cset);
        core_set_fill_available(&cset);
        smp_do_in_cores(&cset, profiler_core_flush, NULL);
}

void profiler_push_kernel_backtrace(uintptr_t *pc_list, size_t nr_pcs,
                                    uint64_t info)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

                if (profiler_percpu_ctx && cpu_buf->tracing)
                        profiler_push_kernel_trace64(cpu_buf, pc_list, nr_pcs, info);
                kref_put(&profiler_kref);
        }
}

void profiler_push_user_backtrace(uintptr_t *pc_list, size_t nr_pcs,
                                  uint64_t info)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct proc *p = current;
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

                if (profiler_percpu_ctx && cpu_buf->tracing)
                        profiler_push_user_trace64(cpu_buf, p, pc_list, nr_pcs, info);
                kref_put(&profiler_kref);
        }
}

int profiler_size(void)
{
        return profiler_queue ? qlen(profiler_queue) : 0;
}

int profiler_read(void *va, int n)
{
        return profiler_queue ? qread(profiler_queue, va, n) : 0;
}

void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
                          int flags, struct file *f, size_t offset)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                if (f && (prot & PROT_EXEC) && profiler_percpu_ctx) {
                        char path_buf[PROFILER_MAX_PRG_PATH];
                        char *path = file_abs_path(f, path_buf, sizeof(path_buf));

                        if (likely(path))
                                profiler_push_pid_mmap(p, addr, size, offset, path);
                }
                kref_put(&profiler_kref);
        }
}

void profiler_notify_new_process(struct proc *p)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                if (profiler_percpu_ctx && p->binary_path)
                        profiler_push_new_process(p);
                kref_put(&profiler_kref);
        }
}