perf: Fix racy access to cpu_buf->block
[akaros.git] / kern / src / profiler.c
/* Copyright (c) 2015 Google Inc
 * Davide Libenzi <dlibenzi@google.com>
 * See LICENSE for details.
 */

#include <ros/common.h>
#include <ros/mman.h>
#include <sys/types.h>
#include <smp.h>
#include <trap.h>
#include <kthread.h>
#include <env.h>
#include <process.h>
#include <mm.h>
#include <vfs.h>
#include <kmalloc.h>
#include <pmap.h>
#include <kref.h>
#include <atomic.h>
#include <umem.h>
#include <elf.h>
#include <ns.h>
#include <err.h>
#include <core_set.h>
#include <string.h>
#include "profiler.h"

#define PROFILER_MAX_PRG_PATH   256
#define PROFILER_BT_DEPTH 16

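/* Worst-case size of the variable-byte encoding of a type: 7 payload bits per
 * emitted byte, so a value of N bytes needs at most (8 * N + 6) / 7 output
 * bytes (10 for a uint64_t). */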
#define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)

/* Do not rely on the contents of the PCPU ctx with IRQs enabled. */
struct profiler_cpu_context {
        struct block *block;
        int cpu;
        int tracing;
        size_t dropped_data_cnt;
};

static int profiler_queue_limit = 64 * 1024 * 1024;
static size_t profiler_cpu_buffer_size = 65536;
static qlock_t profiler_mtx = QLOCK_INITIALIZER(profiler_mtx);
static struct kref profiler_kref;
static struct profiler_cpu_context *profiler_percpu_ctx;
static struct queue *profiler_queue;

static inline struct profiler_cpu_context *profiler_get_cpu_ctx(int cpu)
{
        return profiler_percpu_ctx + cpu;
}

static inline char *vb_encode_uint64(char *data, uint64_t n)
{
        /* Classic variable-byte encoding: emit 7 bits at a time, low bits
         * first, using bit 7 of each byte as a continuation flag (clear on the
         * final byte of the sequence).
         */
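        /* For example, 300 (0x12c) encodes as 0xac 0x02: 0x2c with the
         * continuation bit set, then 0x02. */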
        for (; n >= 0x80; n >>= 7)
                *data++ = (char) (n | 0x80);
        *data++ = (char) n;

        return data;
}

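/* Ships the current pcpu block (if any) to profiler_queue and hands back a
 * freshly allocated one.  The allocation is MEM_ATOMIC, since our callers run
 * with IRQs disabled. */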
static struct block *profiler_buffer_write(struct profiler_cpu_context *cpu_buf,
                                           struct block *b)
{
        /* qpass will drop b if the queue is over its limit.  we're willing to lose
         * traces, but we won't lose 'control' events, such as MMAP and PID. */
        if (b) {
                if (qpass(profiler_queue, b) < 0)
                        cpu_buf->dropped_data_cnt++;
        }
        return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
}

/* Helper, paired with profiler_cpu_buffer_write_commit.  Ensures there is
 * enough room in the pcpu block for our write.  May alloc a new one.
 *
 * IRQs must be disabled before calling, until after write_commit. */
static char *profiler_cpu_buffer_write_reserve(
        struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
{
        struct block *b = cpu_buf->block;

        if (unlikely((!b) || (b->lim - b->wp) < size)) {
                cpu_buf->block = b = profiler_buffer_write(cpu_buf, b);
                if (unlikely(!b))
                        return NULL;
        }
        *pb = b;

        return (char *) b->wp;
}

/* Helper, paired with write_reserve.  Finalizes the writing into the block's
 * main body of @size bytes.  IRQs must be disabled until after this is called.
 */
static inline void profiler_cpu_buffer_write_commit(
        struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
{
        b->wp += size;
}

static inline size_t profiler_max_envelope_size(void)
{
        return 2 * VBE_MAX_SIZE(uint64_t);
}

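/* Emits a PROFTYPE_KERN_TRACE64 record into the pcpu block: a varint type tag,
 * a varint payload size, then the payload itself carrying the backtrace. */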
static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
                                         const uintptr_t *trace, size_t count,
                                         uint64_t info)
{
        size_t size = sizeof(struct proftype_kern_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr, *ptr;

        assert(!irq_is_enabled());
        resptr = profiler_cpu_buffer_write_reserve(
            cpu_buf, size + profiler_max_envelope_size(), &b);
        ptr = resptr;

        if (likely(ptr)) {
                struct proftype_kern_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_KERN_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_kern_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

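/* Same record layout as the kernel trace, plus the pid of the process whose
 * user backtrace was sampled. */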
static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
                                       struct proc *p, const uintptr_t *trace,
                                       size_t count, uint64_t info)
{
        size_t size = sizeof(struct proftype_user_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr, *ptr;

        assert(!irq_is_enabled());
        resptr = profiler_cpu_buffer_write_reserve(
            cpu_buf, size + profiler_max_envelope_size(), &b);
        ptr = resptr;

        if (likely(ptr)) {
                struct proftype_user_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_USER_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_user_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                record->pid = p->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

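/* Emits a PROFTYPE_PID_MMAP64 record straight into profiler_queue (not the
 * pcpu block), describing an executable mapping in @p's address space so the
 * consumer can map samples back to the backing file. */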
static void profiler_push_pid_mmap(struct proc *p, uintptr_t addr, size_t msize,
                                   size_t offset, const char *path)
{
        size_t plen = strlen(path) + 1;
        size_t size = sizeof(struct proftype_pid_mmap64) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_pid_mmap64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_pid_mmap64 *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                record->addr = addr;
                record->size = msize;
                record->offset = offset;
                memcpy(record->path, path, plen);

                qiwrite(profiler_queue, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

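/* Emits a PROFTYPE_NEW_PROCESS record carrying @p's binary path, again
 * directly into profiler_queue. */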
static void profiler_push_new_process(struct proc *p)
{
        size_t plen = strlen(p->binary_path) + 1;
        size_t size = sizeof(struct proftype_new_process) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_new_process *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_new_process *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                memcpy(record->path, p->binary_path, plen);

                qiwrite(profiler_queue, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

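/* Replays the current system state into the queue: one NEW_PROCESS record per
 * existing process, plus MMAP records for their executable mappings, so the
 * consumer can attribute samples from code that was already running when
 * profiling started. */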
static void profiler_emit_current_system_status(void)
{
        void enum_proc(struct vm_region *vmr, void *opaque)
        {
                struct proc *p = (struct proc *) opaque;

                profiler_notify_mmap(p, vmr->vm_base, vmr->vm_end - vmr->vm_base,
                                     vmr->vm_prot, vmr->vm_flags, vmr->vm_file,
                                     vmr->vm_foff);
        }

        ERRSTACK(1);
        struct process_set pset;

        proc_get_set(&pset);
        if (waserror()) {
                proc_free_set(&pset);
                nexterror();
        }

        for (size_t i = 0; i < pset.num_processes; i++) {
                profiler_notify_new_process(pset.procs[i]);
                enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]);
        }

        poperror();
        proc_free_set(&pset);
}

static void free_cpu_buffers(void)
{
        kfree(profiler_percpu_ctx);
        profiler_percpu_ctx = NULL;

        if (profiler_queue) {
                qfree(profiler_queue);
                profiler_queue = NULL;
        }
}

static void alloc_cpu_buffers(void)
{
        ERRSTACK(1);

        /* It is very important that we enqueue and dequeue entire records at once.
         * If we leave partial records, the entire stream will be corrupt.  Our
         * reader does its best to make sure it has room for complete records
         * (checks qlen()).
         *
         * If we ever get corrupt streams, try making this a Qmsg.  Though it
         * doesn't help every situation - we have issues with writes greater than
         * Maxatomic regardless. */
        profiler_queue = qopen(profiler_queue_limit, 0, NULL, NULL);
        if (!profiler_queue)
                error(ENOMEM, ERROR_FIXME);
        if (waserror()) {
                free_cpu_buffers();
                nexterror();
        }

        profiler_percpu_ctx =
            kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, MEM_WAIT);

        for (int i = 0; i < num_cores; i++) {
                struct profiler_cpu_context *b = &profiler_percpu_ctx[i];

                b->cpu = i;
        }
}

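/* Parses @value as a number, scales it by @k, and throws an error unless the
 * result falls within [minval, maxval]. */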
static long profiler_get_checked_value(const char *value, long k, long minval,
                                       long maxval)
{
        long lvalue = strtol(value, NULL, 0) * k;

        if (lvalue < minval)
                error(EFAIL, "Value should be greater than %ld", minval);
        if (lvalue > maxval)
                error(EFAIL, "Value should be lower than %ld", maxval);

        return lvalue;
}

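/* Handles the profiler configure commands.  Both take a size in KB (e.g.
 * "prof_qlimit 65536" requests a 64 MiB queue).  Returns 1 if the command was
 * consumed, 0 otherwise. */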
int profiler_configure(struct cmdbuf *cb)
{
        if (!strcmp(cb->f[0], "prof_qlimit")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_qlimit KB");
                if (kref_refcnt(&profiler_kref) > 0)
                        error(EFAIL, "Profiler already running");
                profiler_queue_limit = (int) profiler_get_checked_value(
                        cb->f[1], 1024, 1024 * 1024, max_pmem / 32);
                return 1;
        }
        if (!strcmp(cb->f[0], "prof_cpubufsz")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_cpubufsz KB");
                profiler_cpu_buffer_size = (size_t) profiler_get_checked_value(
                        cb->f[1], 1024, 16 * 1024, 1024 * 1024);
                return 1;
        }

        return 0;
}

void profiler_append_configure_usage(char *msgbuf, size_t buflen)
{
        const char * const cmds[] = {
                "prof_qlimit",
                "prof_cpubufsz",
        };

        for (int i = 0; i < ARRAY_SIZE(cmds); i++) {
                strlcat(msgbuf, "|", buflen);
                strlcat(msgbuf, cmds[i], buflen);
        }
}

static void profiler_release(struct kref *kref)
{
        bool got_reference = FALSE;

        assert(kref == &profiler_kref);
        qlock(&profiler_mtx);
        /* Make sure we did not race with a profiler_setup() that grabbed
         * profiler_mtx just before us and re-initialized the profiler for a
         * new user.
         * If we race here with another profiler_release() (the user did a
         * profiler_setup() immediately followed by a profiler_cleanup()), we
         * are fine, because free_cpu_buffers() can be called multiple times.
         */
        if (!kref_get_not_zero(kref, 1))
                free_cpu_buffers();
        else
                got_reference = TRUE;
        qunlock(&profiler_mtx);
        /* We cannot call kref_put() while holding profiler_mtx, as such a call
         * might trigger another call to profiler_release().
         */
        if (got_reference)
                kref_put(kref);
}

void profiler_init(void)
{
        assert(kref_refcnt(&profiler_kref) == 0);
        kref_init(&profiler_kref, profiler_release, 0);
}

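/* Sets up the profiler for a new user: allocates the queue and pcpu buffers on
 * first use, takes a profiler_kref reference, and emits the current system
 * status.  Paired with profiler_cleanup(). */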
void profiler_setup(void)
{
        ERRSTACK(1);

        qlock(&profiler_mtx);
        if (waserror()) {
                qunlock(&profiler_mtx);
                nexterror();
        }
        if (!profiler_queue)
                alloc_cpu_buffers();

        /* Do this only when everything is initialized (as last init operation).
         */
        __kref_get(&profiler_kref, 1);

        profiler_emit_current_system_status();

        poperror();
        qunlock(&profiler_mtx);
}

void profiler_cleanup(void)
{
        kref_put(&profiler_kref);
}

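/* Pushes this core's partially filled block (if any) into profiler_queue.
 * IRQs are disabled around the handoff so we do not race with samplers
 * running on this core. */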
static void profiler_cpu_flush(struct profiler_cpu_context *cpu_buf)
{
        int8_t irq_state = 0;

        disable_irqsave(&irq_state);
        if (cpu_buf->block && profiler_queue) {
                qibwrite(profiler_queue, cpu_buf->block);

                cpu_buf->block = NULL;
        }
        enable_irqsave(&irq_state);
}

static void profiler_core_trace_enable(void *opaque)
{
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

        cpu_buf->tracing = (int) (opaque != NULL);
        if (!cpu_buf->tracing)
                profiler_cpu_flush(cpu_buf);
}

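/* Enables or disables tracing on all available cores.  Disabling also flushes
 * each core's pending block. */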
void profiler_control_trace(int onoff)
{
        struct core_set cset;

        error_assert(EINVAL, profiler_percpu_ctx);

        core_set_init(&cset);
        core_set_fill_available(&cset);
        smp_do_in_cores(&cset, profiler_core_trace_enable,
                        (void *) (uintptr_t) onoff);
}

static void profiler_core_flush(void *opaque)
{
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

        profiler_cpu_flush(cpu_buf);
}

void profiler_trace_data_flush(void)
{
        struct core_set cset;

        error_assert(EINVAL, profiler_percpu_ctx);

        core_set_init(&cset);
        core_set_fill_available(&cset);
        smp_do_in_cores(&cset, profiler_core_flush, NULL);
}

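/* Records a single-PC sample (no frame pointer), routed to the user or kernel
 * path based on the address. */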
void profiler_add_trace(uintptr_t pc, uint64_t info)
{
        if (is_user_raddr((void *) pc, 1))
                profiler_add_user_backtrace(pc, 0, info);
        else
                profiler_add_kernel_backtrace(pc, 0, info);
}

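/* Records a kernel backtrace sample.  With a frame pointer we walk up to
 * PROFILER_BT_DEPTH frames; without one, only the PC is recorded.  The
 * kref_get_not_zero() keeps the profiler alive for the duration of the push. */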
void profiler_add_kernel_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

                if (profiler_percpu_ctx && cpu_buf->tracing) {
                        uintptr_t trace[PROFILER_BT_DEPTH];
                        size_t n = 1;

                        trace[0] = pc;
                        if (likely(fp))
                                n = backtrace_list(pc, fp, trace + 1,
                                                   PROFILER_BT_DEPTH - 1) + 1;

                        profiler_push_kernel_trace64(cpu_buf, trace, n, info);
                }
                kref_put(&profiler_kref);
        }
}

void profiler_add_user_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct proc *p = current;
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

                if (p && profiler_percpu_ctx && cpu_buf->tracing) {
                        uintptr_t trace[PROFILER_BT_DEPTH];
                        size_t n = 1;

                        trace[0] = pc;
                        if (likely(fp))
                                n = backtrace_user_list(pc, fp, trace + 1,
                                                        PROFILER_BT_DEPTH - 1) + 1;

                        profiler_push_user_trace64(cpu_buf, p, trace, n, info);
                }
                kref_put(&profiler_kref);
        }
}

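/* Entry point for samples taken from a hardware trap frame (e.g. PMU
 * interrupts): dispatches to the kernel or user backtrace path based on where
 * the trap hit. */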
void profiler_add_hw_sample(struct hw_trapframe *hw_tf, uint64_t info)
{
        if (in_kernel(hw_tf))
                profiler_add_kernel_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
                                              info);
        else
                profiler_add_user_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
                                            info);
}

int profiler_size(void)
{
        return profiler_queue ? qlen(profiler_queue) : 0;
}

int profiler_read(void *va, int n)
{
        return profiler_queue ? qread(profiler_queue, va, n) : 0;
}

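/* Hook called on mmap.  Only executable, file-backed mappings are recorded,
 * since those are what the consumer needs for symbol resolution. */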
void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
                          int flags, struct file *f, size_t offset)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                if (f && (prot & PROT_EXEC) && profiler_percpu_ctx) {
                        char path_buf[PROFILER_MAX_PRG_PATH];
                        char *path = file_abs_path(f, path_buf, sizeof(path_buf));

                        if (likely(path))
                                profiler_push_pid_mmap(p, addr, size, offset, path);
                }
                kref_put(&profiler_kref);
        }
}

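/* Hook called when a process has a binary path set; emits a NEW_PROCESS record
 * if the profiler is active. */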
void profiler_notify_new_process(struct proc *p)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                if (profiler_percpu_ctx && p->binary_path)
                        profiler_push_new_process(p);
                kref_put(&profiler_kref);
        }
}