/* Copyright (c) 2015 Google Inc
 * Davide Libenzi <dlibenzi@google.com>
 * See LICENSE for details.
 *
 * This controls the emitting, collecting, and exporting of samples for perf
 * events.  Examples of events are PMU counter overflows, mmaps, and process
 * creation.
 *
 * Events are collected in a central qio queue.  High-frequency events (e.g.
 * IRQ backtraces) are collected in per-core buffers, which are flushed to the
 * central queue when they fill up or on command.  Lower-frequency events (e.g.
 * profiler_notify_mmap()) just go straight to the central queue.
 *
 * Currently there is one global profiler.  Kprof is careful to only have one
 * open profiler at a time.  We assert that this is true.  TODO: stop using the
 * global profiler!
 *
 * A few other notes:
 * - profiler_control_trace() controls the per-core trace collection.  When it
 *   is disabled, it also flushes the per-core blocks to the central queue.
 * - The collection of mmap and comm samples is independent of trace collection.
 *   Those will occur whenever the profiler is open (refcnt check, for now). */

#include <ros/common.h>
#include <ros/mman.h>
#include <sys/types.h>
#include <smp.h>
#include <trap.h>
#include <kthread.h>
#include <env.h>
#include <process.h>
#include <mm.h>
#include <kmalloc.h>
#include <pmap.h>
#include <kref.h>
#include <atomic.h>
#include <umem.h>
#include <elf.h>
#include <ns.h>
#include <err.h>
#include <core_set.h>
#include <string.h>
#include "profiler.h"

#define PROFILER_MAX_PRG_PATH   256

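/* Worst-case size of the variable-byte encoding of a value of type t: one
 * output byte per 7 bits of input, rounded up. */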
#define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)

/* Do not rely on the contents of the PCPU ctx with IRQs enabled. */
struct profiler_cpu_context {
        struct block *block;
        int cpu;
        int tracing;
        size_t dropped_data_cnt;
};

static int profiler_queue_limit = 64 * 1024 * 1024;
static size_t profiler_cpu_buffer_size = 65536;
static qlock_t profiler_mtx = QLOCK_INITIALIZER(profiler_mtx);
static struct kref profiler_kref;
static struct profiler_cpu_context *profiler_percpu_ctx;
static struct queue *profiler_queue;

static inline struct profiler_cpu_context *profiler_get_cpu_ctx(int cpu)
{
        return profiler_percpu_ctx + cpu;
}

static inline char *vb_encode_uint64(char *data, uint64_t n)
{
        /* Classic variable-byte encoding: emit 7 bits at a time, using bit 7
         * of each byte as a continuation flag (it is clear on the final byte).
         */
        for (; n >= 0x80; n >>= 7)
                *data++ = (char) (n | 0x80);
        *data++ = (char) n;

        return data;
}

static struct block *profiler_buffer_write(struct profiler_cpu_context *cpu_buf,
                                           struct block *b)
{
        /* qpass will drop b if the queue is over its limit.  We're willing to
         * lose traces, but we won't lose 'control' events, such as MMAP and
         * PID. */
        if (b) {
                if (qpass(profiler_queue, b) < 0)
                        cpu_buf->dropped_data_cnt++;
        }
        return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
}

/* Helper, paired with profiler_cpu_buffer_write_commit.  Ensures there is
 * enough room in the pcpu block for our write.  May alloc a new one.
 *
 * IRQs must be disabled before calling, until after write_commit. */
static char *profiler_cpu_buffer_write_reserve(
        struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
{
        struct block *b = cpu_buf->block;

        if (unlikely((!b) || (b->lim - b->wp) < size)) {
                cpu_buf->block = b = profiler_buffer_write(cpu_buf, b);
                if (unlikely(!b))
                        return NULL;
        }
        *pb = b;

        return (char *) b->wp;
}

/* Helper, paired with write_reserve.  Finalizes the writing into the block's
 * main body of @size bytes.  IRQs must be disabled until after this is called.
 */
static inline void profiler_cpu_buffer_write_commit(
        struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
{
        b->wp += size;
}

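/* Every record in the queue is preceded by an 'envelope': two variable-byte
 * encoded uint64s holding the record type and the record size. */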
static inline size_t profiler_max_envelope_size(void)
{
        return 2 * VBE_MAX_SIZE(uint64_t);
}

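/* Writes a PROFTYPE_KERN_TRACE64 record (a kernel backtrace, tagged with the
 * current pid, core, and timestamp) into the per-cpu block.  IRQs must be
 * disabled by the caller. */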
static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
                                         const uintptr_t *trace, size_t count,
                                         uint64_t info)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
        size_t size = sizeof(struct proftype_kern_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr, *ptr;

        assert(!irq_is_enabled());
        resptr = profiler_cpu_buffer_write_reserve(
            cpu_buf, size + profiler_max_envelope_size(), &b);
        ptr = resptr;

        if (likely(ptr)) {
                struct proftype_kern_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_KERN_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_kern_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                if (is_ktask(pcpui->cur_kthread) || !pcpui->cur_proc)
                        record->pid = -1;
                else
                        record->pid = pcpui->cur_proc->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

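/* Writes a PROFTYPE_USER_TRACE64 record (a user backtrace for @p) into the
 * per-cpu block.  IRQs must be disabled by the caller. */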
static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
                                       struct proc *p, const uintptr_t *trace,
                                       size_t count, uint64_t info)
{
        size_t size = sizeof(struct proftype_user_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr, *ptr;

        assert(!irq_is_enabled());
        resptr = profiler_cpu_buffer_write_reserve(
            cpu_buf, size + profiler_max_envelope_size(), &b);
        ptr = resptr;

        if (likely(ptr)) {
                struct proftype_user_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_USER_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_user_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                record->pid = p->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

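/* Emits a PROFTYPE_PID_MMAP64 record straight to the central queue.  Mmaps are
 * low-frequency 'control' events, so they bypass the per-cpu buffers. */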
static void profiler_push_pid_mmap(struct proc *p, uintptr_t addr, size_t msize,
                                   size_t offset, const char *path)
{
        size_t plen = strlen(path) + 1;
        size_t size = sizeof(struct proftype_pid_mmap64) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_pid_mmap64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_pid_mmap64 *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                record->addr = addr;
                record->size = msize;
                record->offset = offset;
                memcpy(record->path, path, plen);

                qiwrite(profiler_queue, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

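/* Emits a PROFTYPE_NEW_PROCESS record (pid plus binary path) straight to the
 * central queue. */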
static void profiler_push_new_process(struct proc *p)
{
        size_t plen = strlen(p->binary_path) + 1;
        size_t size = sizeof(struct proftype_new_process) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_new_process *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_new_process *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                memcpy(record->path, p->binary_path, plen);

                qiwrite(profiler_queue, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

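/* Snapshots the current system state: emits a new-process record and the mmap
 * records for every existing process, so the profile covers processes that
 * were created before the profiler was opened. */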
static void profiler_emit_current_system_status(void)
{
        void enum_proc(struct vm_region *vmr, void *opaque)
        {
                struct proc *p = (struct proc *) opaque;

                profiler_notify_mmap(p, vmr->vm_base, vmr->vm_end - vmr->vm_base,
                                     vmr->vm_prot, vmr->vm_flags, vmr->__vm_foc,
                                     vmr->vm_foff);
        }

        ERRSTACK(1);
        struct process_set pset;

        proc_get_set(&pset);
        if (waserror()) {
                proc_free_set(&pset);
                nexterror();
        }

        for (size_t i = 0; i < pset.num_processes; i++) {
                profiler_notify_new_process(pset.procs[i]);
                enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]);
        }

        poperror();
        proc_free_set(&pset);
}

static void free_cpu_buffers(void)
{
        kfree(profiler_percpu_ctx);
        profiler_percpu_ctx = NULL;

        if (profiler_queue) {
                qfree(profiler_queue);
                profiler_queue = NULL;
        }
}

static void alloc_cpu_buffers(void)
{
        ERRSTACK(1);

        /* It is very important that we enqueue and dequeue entire records at once.
         * If we leave partial records, the entire stream will be corrupt.  Our
         * reader does its best to make sure it has room for complete records
         * (checks qlen()).
         *
         * If we ever get corrupt streams, try making this a Qmsg.  Though it
         * doesn't help every situation - we have issues with writes greater than
         * Maxatomic regardless. */
        profiler_queue = qopen(profiler_queue_limit, 0, NULL, NULL);
        if (!profiler_queue)
                error(ENOMEM, ERROR_FIXME);
        if (waserror()) {
                free_cpu_buffers();
                nexterror();
        }

        profiler_percpu_ctx =
            kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, MEM_WAIT);

        for (int i = 0; i < num_cores; i++) {
                struct profiler_cpu_context *b = &profiler_percpu_ctx[i];

                b->cpu = i;
        }
        /* Pop the frame pushed by waserror() above before returning. */
        poperror();
}

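/* Parses @value, scales it by @k, and throws an error unless the result lies
 * within [@minval, @maxval]. */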
static long profiler_get_checked_value(const char *value, long k, long minval,
                                       long maxval)
{
        long lvalue = strtol(value, NULL, 0) * k;

        if (lvalue < minval)
                error(EFAIL, "Value must be at least %ld", minval);
        if (lvalue > maxval)
                error(EFAIL, "Value must be at most %ld", maxval);

        return lvalue;
}

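/* Handles the 'prof_qlimit' and 'prof_cpubufsz' configuration commands.
 * Returns 1 if the command was consumed, 0 if it is not a profiler command. */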
int profiler_configure(struct cmdbuf *cb)
{
        if (!strcmp(cb->f[0], "prof_qlimit")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_qlimit KB");
                if (kref_refcnt(&profiler_kref) > 0)
                        error(EFAIL, "Profiler already running");
                profiler_queue_limit = (int) profiler_get_checked_value(
                        cb->f[1], 1024, 1024 * 1024, max_pmem / 32);
                return 1;
        }
        if (!strcmp(cb->f[0], "prof_cpubufsz")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_cpubufsz KB");
                profiler_cpu_buffer_size = (size_t) profiler_get_checked_value(
                        cb->f[1], 1024, 16 * 1024, 1024 * 1024);
                return 1;
        }

        return 0;
}

void profiler_append_configure_usage(char *msgbuf, size_t buflen)
{
        const char * const cmds[] = {
                "prof_qlimit",
                "prof_cpubufsz",
        };

        for (int i = 0; i < ARRAY_SIZE(cmds); i++) {
                strlcat(msgbuf, "|", buflen);
                strlcat(msgbuf, cmds[i], buflen);
        }
}

static void profiler_release(struct kref *kref)
{
        bool got_reference = FALSE;

        assert(kref == &profiler_kref);
        qlock(&profiler_mtx);
        /* Make sure we did not race with profiler_setup(), which could have
         * grabbed profiler_mtx just before us and re-initialized the profiler
         * for a new user.
         * If we race here with another profiler_release() (the user did a
         * profiler_setup() immediately followed by a profiler_cleanup()), we
         * are fine because free_cpu_buffers() can be called multiple times.
         */
        if (!kref_get_not_zero(kref, 1))
                free_cpu_buffers();
        else
                got_reference = TRUE;
        qunlock(&profiler_mtx);
        /* We cannot call kref_put() while holding profiler_mtx, since such a
         * call might trigger another call to profiler_release().
         */
        if (got_reference)
                kref_put(kref);
}

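/* One-time initialization: the kref starts at zero.  profiler_setup() takes
 * the first reference, and profiler_release() tears everything down when the
 * count drops back to zero. */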
void profiler_init(void)
{
        assert(kref_refcnt(&profiler_kref) == 0);
        kref_init(&profiler_kref, profiler_release, 0);
}

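/* Builds the global profiler: allocates the central queue and the per-cpu
 * buffers, takes the profiler reference, and emits a snapshot of the currently
 * running processes. */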
void profiler_setup(void)
{
        ERRSTACK(1);

        qlock(&profiler_mtx);
        if (waserror()) {
                qunlock(&profiler_mtx);
                nexterror();
        }
        assert(!profiler_queue);
        alloc_cpu_buffers();

        /* Do this only when everything is initialized (as the last init
         * operation). */
        __kref_get(&profiler_kref, 1);

        profiler_emit_current_system_status();

        poperror();
        qunlock(&profiler_mtx);
}

void profiler_cleanup(void)
{
        kref_put(&profiler_kref);
}

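/* Flushes @cpu_buf's pending block, if any, to the central queue. */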
static void profiler_cpu_flush(struct profiler_cpu_context *cpu_buf)
{
        int8_t irq_state = 0;

        disable_irqsave(&irq_state);
        if (cpu_buf->block && profiler_queue) {
                qibwrite(profiler_queue, cpu_buf->block);

                cpu_buf->block = NULL;
        }
        enable_irqsave(&irq_state);
}

static void profiler_core_trace_enable(void *opaque)
{
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

        cpu_buf->tracing = (int) (opaque != NULL);
        if (!cpu_buf->tracing)
                profiler_cpu_flush(cpu_buf);
}

static void profiler_control_trace(int onoff)
{
        struct core_set cset;

        error_assert(EINVAL, profiler_percpu_ctx);

        core_set_init(&cset);
        core_set_fill_available(&cset);
        smp_do_in_cores(&cset, profiler_core_trace_enable,
                        (void *) (uintptr_t) onoff);
}

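/* Enables trace collection on all available cores and reopens the central
 * queue.  profiler_stop() below is the mirror operation: it disables
 * collection (flushing the per-core blocks) and hangs up the queue. */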
void profiler_start(void)
{
        assert(profiler_queue);
        profiler_control_trace(1);
        qreopen(profiler_queue);
}

void profiler_stop(void)
{
        assert(profiler_queue);
        profiler_control_trace(0);
        qhangup(profiler_queue, 0);
}

static void profiler_core_flush(void *opaque)
{
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

        profiler_cpu_flush(cpu_buf);
}

void profiler_trace_data_flush(void)
{
        struct core_set cset;

        error_assert(EINVAL, profiler_percpu_ctx);

        core_set_init(&cset);
        core_set_fill_available(&cset);
        smp_do_in_cores(&cset, profiler_core_flush, NULL);
}

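/* Entry point for kernel backtrace samples (e.g. from PMU counter overflows):
 * the trace is recorded only if the profiler is open and this core is
 * tracing. */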
void profiler_push_kernel_backtrace(uintptr_t *pc_list, size_t nr_pcs,
                                    uint64_t info)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

                if (profiler_percpu_ctx && cpu_buf->tracing)
                        profiler_push_kernel_trace64(cpu_buf, pc_list, nr_pcs, info);
                kref_put(&profiler_kref);
        }
}

void profiler_push_user_backtrace(uintptr_t *pc_list, size_t nr_pcs,
                                  uint64_t info)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct proc *p = current;
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

                if (profiler_percpu_ctx && cpu_buf->tracing)
                        profiler_push_user_trace64(cpu_buf, p, pc_list, nr_pcs, info);
                kref_put(&profiler_kref);
        }
}

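/* Read-side interface: profiler_size() reports how much data is queued, and
 * profiler_read() copies up to @n bytes of it into @va. */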
int profiler_size(void)
{
        return profiler_queue ? qlen(profiler_queue) : 0;
}

int profiler_read(void *va, int n)
{
        return profiler_queue ? qread(profiler_queue, va, n) : 0;
}

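/* Called on mmap: only executable, file-backed mappings are recorded, since
 * those are the ones needed to map user PCs back to their binaries. */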
void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
                          int flags, struct file_or_chan *foc, size_t offset)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                if (foc && (prot & PROT_EXEC) && profiler_percpu_ctx) {
                        char path_buf[PROFILER_MAX_PRG_PATH];
                        char *path = foc_abs_path(foc, path_buf, sizeof(path_buf));

                        if (likely(path))
                                profiler_push_pid_mmap(p, addr, size, offset, path);
                }
                kref_put(&profiler_kref);
        }
}

void profiler_notify_new_process(struct proc *p)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                if (profiler_percpu_ctx && p->binary_path)
                        profiler_push_new_process(p);
                kref_put(&profiler_kref);
        }
}