prof: fix disgusting synchronization
kern/src/profiler.c (akaros.git)
/* Copyright (c) 2015 Google Inc
 * Davide Libenzi <dlibenzi@google.com>
 * See LICENSE for details.
 *
 * This controls the emitting, collecting, and exporting of samples for perf
 * events.  Examples of events are PMU counter overflows, mmaps, and process
 * creation.
 *
 * Events are collected in a central qio queue.  High-frequency events (e.g.
 * IRQ backtraces()) are collected in per-core buffers, which are flushed to the
 * central queue when they fill up or on command.  Lower-frequency events (e.g.
 * profiler_notify_mmap()) just go straight to the central queue.
 *
 * Currently there is one global profiler.  Kprof is careful to only have one
 * open profiler at a time.  See profiler.h for more info.  The only sync we do
 * in this file is for the functions that are not called while holding the kprof
 * mutex - specifically the RCU-protected backtrace sampling code.
 *
 * A few other notes:
 * - profiler_control_trace() controls the per-core trace collection.  When it
 *   is disabled, it also flushes the per-core blocks to the central queue.
 * - The collection of mmap and comm samples is independent of trace collection.
 *   Those will occur whenever the profiler is open, even if it is not started.
 * - Looks like we don't bother with munmap records.  Not sure if perf can
 *   handle it or not. */

#include <ros/common.h>
#include <ros/mman.h>
#include <sys/types.h>
#include <smp.h>
#include <trap.h>
#include <kthread.h>
#include <env.h>
#include <process.h>
#include <mm.h>
#include <kmalloc.h>
#include <pmap.h>
#include <atomic.h>
#include <umem.h>
#include <elf.h>
#include <ns.h>
#include <err.h>
#include <core_set.h>
#include <string.h>
#include "profiler.h"

#define PROFILER_MAX_PRG_PATH   256

#define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)

/* Do not rely on the contents of the PCPU ctx with IRQs enabled. */
struct profiler_cpu_context {
        struct block *block;
        int cpu;
        bool tracing;
        size_t dropped_data_cnt;
};

/* These are a little hokey, and are currently global vars */
static int profiler_queue_limit = 64 * 1024 * 1024;
static size_t profiler_cpu_buffer_size = 65536;

struct profiler {
        struct profiler_cpu_context *pcpu_ctx;
        struct queue *qio;
        bool tracing;
};

static struct profiler __rcu *gbl_prof;

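/* Returns the per-core context for @cpu within @prof. */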
static struct profiler_cpu_context *profiler_get_cpu_ctx(struct profiler *prof,
                                                         int cpu)
{
        return prof->pcpu_ctx + cpu;
}

static inline char *vb_encode_uint64(char *data, uint64_t n)
{
        /* Classical variable bytes encoding. Encodes 7 bits at a time, using
         * bit number 7 in the byte, as indicator of end of sequence (when
         * zero). */
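        /* For example, 300 (0x12c) encodes to the two bytes 0xac 0x02: the
         * low seven bits first with bit 7 set (more bytes follow), then the
         * remaining bits with bit 7 clear (end of sequence). */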
        for (; n >= 0x80; n >>= 7)
                *data++ = (char) (n | 0x80);
        *data++ = (char) n;

        return data;
}

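/* Hands the current per-core block (if any) off to the central queue and
 * returns a freshly allocated block for the caller to keep writing into.  The
 * allocation is MEM_ATOMIC since this runs with IRQs disabled; it can fail. */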
static struct block *profiler_buffer_write(struct profiler *prof,
                                           struct profiler_cpu_context *cpu_buf,
                                           struct block *b)
{
        /* qpass will drop b if the queue is over its limit.  we're willing to
         * lose traces, but we won't lose 'control' events, such as MMAP and
         * PID. */
        if (b) {
                if (qpass(prof->qio, b) < 0)
                        cpu_buf->dropped_data_cnt++;
        }
        return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
}

/* Helper, paired with profiler_cpu_buffer_write_commit.  Ensures there is
 * enough room in the pcpu block for our write.  May alloc a new one.
 *
 * IRQs must be disabled before calling, until after write_commit. */
static char *profiler_cpu_buffer_write_reserve(struct profiler *prof,
        struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
{
        struct block *b = cpu_buf->block;

        if (unlikely((!b) || (b->lim - b->wp) < size)) {
                cpu_buf->block = b = profiler_buffer_write(prof, cpu_buf, b);
                if (unlikely(!b))
                        return NULL;
        }
        *pb = b;

        return (char *) b->wp;
}

/* Helper, paired with write_reserve.  Finalizes the writing into the block's
 * main body of @size bytes.  IRQs must be disabled until after this is called.
 */
static inline void profiler_cpu_buffer_write_commit(
        struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
{
        b->wp += size;
}

static inline size_t profiler_max_envelope_size(void)
{
        return 2 * VBE_MAX_SIZE(uint64_t);
}

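/* Packs a kernel backtrace into a proftype_kern_trace64 record in the per-core
 * buffer.  The record is preceded by a varint-encoded envelope (type, then
 * payload size).  Must be called with IRQs disabled; if the reserve fails, the
 * sample is dropped. */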
static void profiler_push_kernel_trace64(struct profiler *prof,
                                         struct profiler_cpu_context *cpu_buf,
                                         const uintptr_t *trace, size_t count,
                                         uint64_t info)
{
        struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
        size_t size = sizeof(struct proftype_kern_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr, *ptr;

        assert(!irq_is_enabled());
        resptr = profiler_cpu_buffer_write_reserve(prof,
            cpu_buf, size + profiler_max_envelope_size(), &b);
        ptr = resptr;

        if (likely(ptr)) {
                struct proftype_kern_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_KERN_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_kern_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                if (is_ktask(pcpui->cur_kthread) || !pcpui->cur_proc)
                        record->pid = -1;
                else
                        record->pid = pcpui->cur_proc->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

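/* Like profiler_push_kernel_trace64(), but records a user backtrace for @p.
 * Must be called with IRQs disabled. */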
static void profiler_push_user_trace64(struct profiler *prof,
                                       struct profiler_cpu_context *cpu_buf,
                                       struct proc *p, const uintptr_t *trace,
                                       size_t count, uint64_t info)
{
        size_t size = sizeof(struct proftype_user_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr, *ptr;

        assert(!irq_is_enabled());
        resptr = profiler_cpu_buffer_write_reserve(prof,
            cpu_buf, size + profiler_max_envelope_size(), &b);
        ptr = resptr;

        if (likely(ptr)) {
                struct proftype_user_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_USER_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_user_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                record->pid = p->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

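/* Emits a PID_MMAP64 record (pid, addr, size, offset, path) straight to the
 * central queue.  This is a low-frequency 'control' event, so it bypasses the
 * per-core buffers; the temporary record is kmalloc'd and freed here. */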
static void profiler_push_pid_mmap(struct profiler *prof, struct proc *p,
                                   uintptr_t addr, size_t msize, size_t offset,
                                   const char *path)
{
        size_t plen = strlen(path) + 1;
        size_t size = sizeof(struct proftype_pid_mmap64) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), MEM_ATOMIC);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_pid_mmap64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_pid_mmap64 *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                record->addr = addr;
                record->size = msize;
                record->offset = offset;
                memcpy(record->path, path, plen);

                qiwrite(prof->qio, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

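/* Emits a NEW_PROCESS record (pid and binary path) straight to the central
 * queue; this is the profiler's 'comm' sample. */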
static void profiler_push_new_process(struct profiler *prof, struct proc *p)
{
        size_t plen = strlen(p->binary_path) + 1;
        size_t size = sizeof(struct proftype_new_process) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), MEM_ATOMIC);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_new_process *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_new_process *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                memcpy(record->path, p->binary_path, plen);

                qiwrite(prof->qio, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

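/* Walks every existing process and emits its NEW_PROCESS and mmap records, so
 * a freshly opened profiler starts with a complete picture of the system. */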
static void profiler_emit_current_system_status(void)
{
        void enum_proc(struct vm_region *vmr, void *opaque)
        {
                struct proc *p = (struct proc *) opaque;

                profiler_notify_mmap(p, vmr->vm_base,
                                     vmr->vm_end - vmr->vm_base,
                                     vmr->vm_prot, vmr->vm_flags, vmr->__vm_foc,
                                     vmr->vm_foff);
        }

        struct process_set pset;

        proc_get_set(&pset);

        for (size_t i = 0; i < pset.num_processes; i++) {
                profiler_notify_new_process(pset.procs[i]);
                enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]);
        }

        proc_free_set(&pset);
}

static long profiler_get_checked_value(const char *value, long k, long minval,
                                       long maxval)
{
        long lvalue = strtol(value, NULL, 0) * k;

        if (lvalue < minval)
                error(EFAIL, "Value should be greater than %ld", minval);
        if (lvalue > maxval)
                error(EFAIL, "Value should be lower than %ld", maxval);

        return lvalue;
}

/* TODO: This configure stuff is a little hokey.  You have to configure before
 * it's been opened, meaning before you have the kprofctlqid, but you can't
 * configure until you have the chan.  To use this, you'd need to open, then
 * config, then close, then hope that the global settings stick around, then
 * open and run it.
 *
 * Also note that no one uses this. */
int profiler_configure(struct cmdbuf *cb)
{
        if (!strcmp(cb->f[0], "prof_qlimit")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_qlimit KB");
                /* If the profiler is already running, this won't take effect
                 * until the next open.  Feel free to change this. */
                WRITE_ONCE(profiler_queue_limit,
                           profiler_get_checked_value(cb->f[1], 1024,
                                                      1024 * 1024,
                                                      max_pmem / 32));
                return 1;
        }
        if (!strcmp(cb->f[0], "prof_cpubufsz")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_cpubufsz KB");
                WRITE_ONCE(profiler_cpu_buffer_size,
                           profiler_get_checked_value(cb->f[1], 1024,
                                                      16 * 1024,
                                                      1024 * 1024));
                return 1;
        }

        return 0;
}

void profiler_append_configure_usage(char *msgbuf, size_t buflen)
{
        const char * const cmds[] = {
                "prof_qlimit",
                "prof_cpubufsz",
        };

        for (int i = 0; i < ARRAY_SIZE(cmds); i++) {
                strlcat(msgbuf, "|", buflen);
                strlcat(msgbuf, cmds[i], buflen);
        }
}

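/* Allocates the global profiler, its central queue, and the per-core contexts,
 * publishes it via rcu_assign_pointer(), and emits the current system status.
 * Kprof ensures there is at most one open profiler at a time. */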
int profiler_setup(void)
{
        struct profiler *prof;

        assert(!rcu_dereference_check(gbl_prof, true));
        prof = kzmalloc(sizeof(struct profiler), MEM_WAIT);
        /* It is very important that we enqueue and dequeue entire records at
         * once.  If we leave partial records, the entire stream will be
         * corrupt.  Our reader does its best to make sure it has room for
         * complete records (checks qlen()).
         *
         * If we ever get corrupt streams, try making this a Qmsg.  Though it
         * doesn't help every situation - we have issues with writes greater
         * than Maxatomic regardless. */
        prof->qio = qopen(profiler_queue_limit, 0, NULL, NULL);
        if (!prof->qio) {
                kfree(prof);
                return -1;
        }
        prof->pcpu_ctx = kzmalloc(sizeof(struct profiler_cpu_context)
                                  * num_cores, MEM_WAIT);
        for (int i = 0; i < num_cores; i++) {
                struct profiler_cpu_context *b = &prof->pcpu_ctx[i];

                b->cpu = i;
        }
        rcu_assign_pointer(gbl_prof, prof);
        profiler_emit_current_system_status();
        return 0;
}

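/* Unpublishes the global profiler and waits for an RCU grace period so all
 * in-flight samplers (rcu_read_lock() sections) are done before the per-core
 * contexts and queue are freed. */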
void profiler_cleanup(void)
{
        struct profiler *prof = rcu_dereference_protected(gbl_prof, true);

        RCU_INIT_POINTER(gbl_prof, NULL);
        synchronize_rcu();
        kfree(prof->pcpu_ctx);
        qfree(prof->qio);
        kfree(prof);
}

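/* Flushes this core's partially filled block to the central queue.  IRQs are
 * disabled around the hand-off so we don't race with the sampling fast path on
 * this core. */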
static void profiler_cpu_flush(struct profiler *prof,
                               struct profiler_cpu_context *cpu_buf)
{
        int8_t irq_state = 0;

        disable_irqsave(&irq_state);
        if (cpu_buf->block) {
                qibwrite(prof->qio, cpu_buf->block);

                cpu_buf->block = NULL;
        }
        enable_irqsave(&irq_state);
}

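/* smp_do_in_cores() callback: applies the profiler's global tracing setting to
 * this core, flushing the core's block when tracing gets turned off. */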
static void __profiler_core_trace_enable(void *opaque)
{
        struct profiler *prof = opaque;
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(prof,
                                                                    core_id());

        cpu_buf->tracing = prof->tracing;
        if (!cpu_buf->tracing)
                profiler_cpu_flush(prof, cpu_buf);
}

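/* Turns per-core trace collection on or off on all available cores.  Disabling
 * also flushes each core's block to the central queue.  Blocks until every
 * core has run the callback. */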
static void profiler_control_trace(struct profiler *prof, int onoff)
{
        struct core_set cset;

        assert(prof);

        core_set_init(&cset);
        core_set_fill_available(&cset);
        prof->tracing = onoff;
        /* Note this blocks until all cores have run the function. */
        smp_do_in_cores(&cset, __profiler_core_trace_enable, prof);
}

/* This must only be called by the Kprofctlqid FD holder, ensuring that the
 * profiler exists.  Not thread-safe. */
void profiler_start(void)
{
        struct profiler *prof = rcu_dereference_protected(gbl_prof, true);

        profiler_control_trace(prof, 1);
        qreopen(prof->qio);
}

/* This must only be called by the Kprofctlqid FD holder, ensuring that the
 * profiler exists.  Not thread-safe. */
void profiler_stop(void)
{
        struct profiler *prof = rcu_dereference_protected(gbl_prof, true);

        profiler_control_trace(prof, 0);
        qhangup(prof->qio, 0);
}

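/* smp_do_in_cores() callback: flushes this core's block.  @opaque is the
 * profiler. */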
static void __profiler_core_flush(void *opaque)
{
        struct profiler *prof = opaque;
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(prof,
                                                                    core_id());

        profiler_cpu_flush(prof, cpu_buf);
}

/* This must only be called by the Kprofctlqid FD holder, ensuring that the
 * profiler exists. */
void profiler_trace_data_flush(void)
{
        struct profiler *prof = rcu_dereference_protected(gbl_prof, true);
        struct core_set cset;

        core_set_init(&cset);
        core_set_fill_available(&cset);
        /* __profiler_core_flush() dereferences its opaque argument, so pass
         * the profiler rather than NULL. */
        smp_do_in_cores(&cset, __profiler_core_flush, prof);
}

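/* Sampling fast path: appends a kernel backtrace to this core's buffer if the
 * profiler is open and tracing.  This is the RCU-protected path mentioned in
 * the header comment - it is not called under the kprof mutex and can race
 * with profiler_cleanup(). */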
void profiler_push_kernel_backtrace(uintptr_t *pc_list, size_t nr_pcs,
                                    uint64_t info)
{
        struct profiler *prof;

        rcu_read_lock();
        prof = rcu_dereference(gbl_prof);
        if (prof) {
                struct profiler_cpu_context *cpu_buf =
                        profiler_get_cpu_ctx(prof, core_id());

                if (cpu_buf->tracing)
                        profiler_push_kernel_trace64(prof, cpu_buf, pc_list,
                                                     nr_pcs, info);
        }
        rcu_read_unlock();
}

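/* Same as profiler_push_kernel_backtrace(), but for a backtrace of the current
 * process's user context. */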
void profiler_push_user_backtrace(uintptr_t *pc_list, size_t nr_pcs,
                                  uint64_t info)
{
        struct profiler *prof;

        rcu_read_lock();
        prof = rcu_dereference(gbl_prof);
        if (prof) {
                struct profiler_cpu_context *cpu_buf =
                        profiler_get_cpu_ctx(prof, core_id());

                if (cpu_buf->tracing)
                        profiler_push_user_trace64(prof, cpu_buf, current,
                                                   pc_list, nr_pcs, info);
        }
        rcu_read_unlock();
}

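/* Returns the number of bytes currently queued in the central queue, or 0 if
 * no profiler is open. */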
size_t profiler_size(void)
{
        struct profiler *prof;
        size_t ret;

        rcu_read_lock();
        prof = rcu_dereference(gbl_prof);
        ret = prof ? qlen(prof->qio) : 0;
        rcu_read_unlock();
        return ret;
}

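/* Copies up to @n bytes of sample data from the central queue into @va,
 * returning the number of bytes read (0 if no profiler is open). */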
size_t profiler_read(void *va, size_t n)
{
        struct profiler *prof;
        size_t ret;

        rcu_read_lock();
        prof = rcu_dereference(gbl_prof);
        ret = prof ? qread(prof->qio, va, n) : 0;
        rcu_read_unlock();
        return ret;
}

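/* Hook called on mmap: if the profiler is open and the mapping is executable
 * and file-backed, emit a PID_MMAP64 record so samples can later be resolved
 * against the mapped binary. */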
void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
                          int flags, struct file_or_chan *foc, size_t offset)
{
        struct profiler *prof;

        rcu_read_lock();
        prof = rcu_dereference(gbl_prof);
        if (prof) {
                if (foc && (prot & PROT_EXEC)) {
                        char path_buf[PROFILER_MAX_PRG_PATH];
                        char *path = foc_abs_path(foc, path_buf,
                                                  sizeof(path_buf));

                        if (likely(path))
                                profiler_push_pid_mmap(prof, p, addr, size,
                                                       offset, path);
                }
        }
        rcu_read_unlock();
}

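/* Hook called on process creation (and by profiler_setup() for existing
 * processes): emits a NEW_PROCESS record if the profiler is open and the proc
 * has a binary path. */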
void profiler_notify_new_process(struct proc *p)
{
        struct profiler *prof;

        rcu_read_lock();
        prof = rcu_dereference(gbl_prof);
        if (prof && p->binary_path)
                profiler_push_new_process(prof, p);
        rcu_read_unlock();
}