perf: Fix memory leak in the profiler
[akaros.git] kern/src/profiler.c
/* Copyright (c) 2015 Google Inc
 * Davide Libenzi <dlibenzi@google.com>
 * See LICENSE for details.
 */

#include <ros/common.h>
#include <ros/mman.h>
#include <sys/types.h>
#include <smp.h>
#include <trap.h>
#include <kthread.h>
#include <env.h>
#include <process.h>
#include <mm.h>
#include <vfs.h>
#include <kmalloc.h>
#include <pmap.h>
#include <kref.h>
#include <atomic.h>
#include <umem.h>
#include <elf.h>
#include <ns.h>
#include <err.h>
#include <core_set.h>
#include <string.h>
#include "profiler.h"

#define PROFILER_MAX_PRG_PATH   256
#define PROFILER_BT_DEPTH 16

#define VBE_MAX_SIZE(t) ((8 * sizeof(t) + 6) / 7)

struct profiler_cpu_context {
        struct block *block;
        int cpu;
        int tracing;
        size_t dropped_data_size;
};

static int profiler_queue_limit = 64 * 1024 * 1024;
static size_t profiler_cpu_buffer_size = 65536;
static qlock_t profiler_mtx = QLOCK_INITIALIZER(profiler_mtx);
static struct kref profiler_kref;
static struct profiler_cpu_context *profiler_percpu_ctx;
static struct queue *profiler_queue;

static inline struct profiler_cpu_context *profiler_get_cpu_ctx(int cpu)
{
        return profiler_percpu_ctx + cpu;
}

static inline char *vb_encode_uint64(char *data, uint64_t n)
{
        /* Classic variable-byte encoding: emit 7 bits at a time, using bit 7
         * of each byte as a continuation flag (clear on the last byte of the
         * sequence).
         */
        for (; n >= 0x80; n >>= 7)
                *data++ = (char) (n | 0x80);
        *data++ = (char) n;

        return data;
}

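/* Push the current block (if any) onto the profiler queue and allocate a fresh
 * one.  If the queue has grown past profiler_queue_limit, drop the oldest
 * block and account its size in dropped_data_size.
 */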
static struct block *profiler_buffer_write(struct profiler_cpu_context *cpu_buf,
                                           struct block *b)
{
        if (b) {
                qibwrite(profiler_queue, b);

                if (qlen(profiler_queue) > profiler_queue_limit) {
                        b = qget(profiler_queue);
                        if (likely(b)) {
                                cpu_buf->dropped_data_size += BLEN(b);
                                freeb(b);
                        }
                }
        }

        return block_alloc(profiler_cpu_buffer_size, MEM_ATOMIC);
}

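/* Reserve 'size' bytes of contiguous space in the per-CPU block, posting the
 * current block to the queue and grabbing a new one if it cannot hold the
 * request.  Returns the write position (also handing the block back via 'pb'),
 * or NULL if no block could be allocated.
 */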
static char *profiler_cpu_buffer_write_reserve(
        struct profiler_cpu_context *cpu_buf, size_t size, struct block **pb)
{
        struct block *b = cpu_buf->block;

        if (unlikely((!b) || (b->lim - b->wp) < size)) {
                cpu_buf->block = b = profiler_buffer_write(cpu_buf, b);
                if (unlikely(!b))
                        return NULL;
        }
        *pb = b;

        return (char *) b->wp;
}

static inline void profiler_cpu_buffer_write_commit(
        struct profiler_cpu_context *cpu_buf, struct block *b, size_t size)
{
        b->wp += size;
}

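/* Worst-case size of a record envelope: the type tag plus the payload size,
 * both variable-byte encoded.
 */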
static inline size_t profiler_max_envelope_size(void)
{
        return 2 * VBE_MAX_SIZE(uint64_t);
}

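/* Emit a PROFTYPE_KERN_TRACE64 record (timestamp, CPU and kernel backtrace)
 * into the per-CPU buffer.
 */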
static void profiler_push_kernel_trace64(struct profiler_cpu_context *cpu_buf,
                                         const uintptr_t *trace, size_t count,
                                         uint64_t info)
{
        size_t size = sizeof(struct proftype_kern_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr = profiler_cpu_buffer_write_reserve(
            cpu_buf, size + profiler_max_envelope_size(), &b);
        void *ptr = resptr;

        if (likely(ptr)) {
                struct proftype_kern_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_KERN_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_kern_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

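/* Emit a PROFTYPE_USER_TRACE64 record; same as the kernel variant, plus the
 * PID of the process that owns the backtrace.
 */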
static void profiler_push_user_trace64(struct profiler_cpu_context *cpu_buf,
                                       struct proc *p, const uintptr_t *trace,
                                       size_t count, uint64_t info)
{
        size_t size = sizeof(struct proftype_user_trace64) +
                count * sizeof(uint64_t);
        struct block *b;
        void *resptr = profiler_cpu_buffer_write_reserve(
            cpu_buf, size + profiler_max_envelope_size(), &b);
        void *ptr = resptr;

        if (likely(ptr)) {
                struct proftype_user_trace64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_USER_TRACE64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_user_trace64 *) ptr;
                ptr += size;

                record->info = info;
                record->tstamp = nsec();
                record->pid = p->pid;
                record->cpu = cpu_buf->cpu;
                record->num_traces = count;
                for (size_t i = 0; i < count; i++)
                        record->trace[i] = (uint64_t) trace[i];

                profiler_cpu_buffer_write_commit(cpu_buf, b, ptr - resptr);
        }
}

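/* Emit a PROFTYPE_PID_MMAP64 record describing an executable mapping, so that
 * the trace consumer can resolve user PCs back to the mapped binary.
 */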
static void profiler_push_pid_mmap(struct proc *p, uintptr_t addr, size_t msize,
                                   size_t offset, const char *path)
{
        size_t plen = strlen(path) + 1;
        size_t size = sizeof(struct proftype_pid_mmap64) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_pid_mmap64 *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_PID_MMAP64);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_pid_mmap64 *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                record->addr = addr;
                record->size = msize;
                record->offset = offset;
                memcpy(record->path, path, plen);

                qiwrite(profiler_queue, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

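/* Emit a PROFTYPE_NEW_PROCESS record carrying the binary path of a newly
 * created process.
 */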
static void profiler_push_new_process(struct proc *p)
{
        size_t plen = strlen(p->binary_path) + 1;
        size_t size = sizeof(struct proftype_new_process) + plen;
        void *resptr = kmalloc(size + profiler_max_envelope_size(), 0);

        if (likely(resptr)) {
                void *ptr = resptr;
                struct proftype_new_process *record;

                ptr = vb_encode_uint64(ptr, PROFTYPE_NEW_PROCESS);
                ptr = vb_encode_uint64(ptr, size);

                record = (struct proftype_new_process *) ptr;
                ptr += size;

                record->tstamp = nsec();
                record->pid = p->pid;
                memcpy(record->path, p->binary_path, plen);

                qiwrite(profiler_queue, resptr, (int) (ptr - resptr));

                kfree(resptr);
        }
}

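/* Walk every process and report its VM regions through profiler_notify_mmap(),
 * so that samples from already-running processes can be attributed to their
 * binaries.
 */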
static void profiler_emit_current_system_status(void)
{
        void enum_proc(struct vm_region *vmr, void *opaque)
        {
                struct proc *p = (struct proc *) opaque;

                profiler_notify_mmap(p, vmr->vm_base, vmr->vm_end - vmr->vm_base,
                                     vmr->vm_prot, vmr->vm_flags, vmr->vm_file,
                                     vmr->vm_foff);
        }

        ERRSTACK(1);
        struct process_set pset;

        proc_get_set(&pset);
        if (waserror()) {
                proc_free_set(&pset);
                nexterror();
        }

        for (size_t i = 0; i < pset.num_processes; i++)
                enumerate_vmrs(pset.procs[i], enum_proc, pset.procs[i]);

        poperror();
        proc_free_set(&pset);
}

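/* Release the per-CPU contexts and the profiler queue.  Safe to call more than
 * once; see profiler_release().
 */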
static void free_cpu_buffers(void)
{
        kfree(profiler_percpu_ctx);
        profiler_percpu_ctx = NULL;

        if (profiler_queue) {
                qfree(profiler_queue);
                profiler_queue = NULL;
        }
}

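/* Allocate the shared profiler queue (set to drop on overflow) and the per-CPU
 * buffer contexts.
 */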
static void alloc_cpu_buffers(void)
{
        ERRSTACK(1);

        profiler_queue = qopen(profiler_queue_limit, 0, NULL, NULL);
        if (!profiler_queue)
                error(ENOMEM, ERROR_FIXME);
        if (waserror()) {
                free_cpu_buffers();
                nexterror();
        }

        qdropoverflow(profiler_queue, TRUE);

        profiler_percpu_ctx =
            kzmalloc(sizeof(*profiler_percpu_ctx) * num_cores, MEM_WAIT);

        for (int i = 0; i < num_cores; i++) {
                struct profiler_cpu_context *b = &profiler_percpu_ctx[i];

                b->cpu = i;
        }
        /* Pairs with the waserror() above; keeps the error stack balanced on
         * the success path.
         */
        poperror();
}

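/* Parse 'value', scale it by 'k', and error out unless the result lies within
 * [minval, maxval].
 */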
static long profiler_get_checked_value(const char *value, long k, long minval,
                                       long maxval)
{
        long lvalue = strtol(value, NULL, 0) * k;

        if (lvalue < minval)
                error(EFAIL, "Value must be at least %ld", minval);
        if (lvalue > maxval)
                error(EFAIL, "Value must be at most %ld", maxval);

        return lvalue;
}

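/* Handle profiler configuration commands: "prof_qlimit" sets the queue limit
 * and "prof_cpubufsz" the per-CPU buffer size, both given in KB.  Returns 1 if
 * the command was consumed, 0 otherwise.
 */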
int profiler_configure(struct cmdbuf *cb)
{
        if (!strcmp(cb->f[0], "prof_qlimit")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_qlimit KB");
                if (kref_refcnt(&profiler_kref) > 0)
                        error(EFAIL, "Profiler already running");
                profiler_queue_limit = (int) profiler_get_checked_value(
                        cb->f[1], 1024, 1024 * 1024, max_pmem / 32);
                return 1;
        }
        if (!strcmp(cb->f[0], "prof_cpubufsz")) {
                if (cb->nf < 2)
                        error(EFAIL, "prof_cpubufsz KB");
                profiler_cpu_buffer_size = (size_t) profiler_get_checked_value(
                        cb->f[1], 1024, 16 * 1024, 1024 * 1024);
                return 1;
        }

        return 0;
}

void profiler_append_configure_usage(char *msgbuf, size_t buflen)
{
        const char * const cmds[] = {
                "prof_qlimit",
                "prof_cpubufsz",
        };

        for (int i = 0; i < ARRAY_SIZE(cmds); i++) {
                strlcat(msgbuf, "|", buflen);
                strlcat(msgbuf, cmds[i], buflen);
        }
}

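/* kref release callback: tears down the buffers, unless profiler_setup() has
 * re-armed the profiler in the meantime.
 */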
static void profiler_release(struct kref *kref)
{
        bool got_reference = FALSE;

        assert(kref == &profiler_kref);
        qlock(&profiler_mtx);
        /* Make sure we did not race with profiler_setup(), which may have
         * grabbed profiler_mtx just before us and re-initialized the profiler
         * for a new user.
         * If we race here with another profiler_release() (the user did a
         * profiler_setup() immediately followed by a profiler_cleanup()), we
         * are fine, because free_cpu_buffers() can be called multiple times.
         */
        if (!kref_get_not_zero(kref, 1))
                free_cpu_buffers();
        else
                got_reference = TRUE;
        qunlock(&profiler_mtx);
        /* We cannot call kref_put() while holding profiler_mtx, as such a call
         * might trigger another call to profiler_release().
         */
        if (got_reference)
                kref_put(kref);
}

void profiler_init(void)
{
        assert(kref_refcnt(&profiler_kref) == 0);
        kref_init(&profiler_kref, profiler_release, 0);
}

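/* Prepare the profiler for a new session: allocate the buffers if needed, take
 * a reference on profiler_kref, and emit the current system status.
 */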
void profiler_setup(void)
{
        ERRSTACK(1);

        qlock(&profiler_mtx);
        if (waserror()) {
                qunlock(&profiler_mtx);
                nexterror();
        }
        if (!profiler_queue)
                alloc_cpu_buffers();

        /* Take the reference only once everything is initialized (this must be
         * the last init operation).
         */
        __kref_get(&profiler_kref, 1);

        profiler_emit_current_system_status();

        poperror();
        qunlock(&profiler_mtx);
}

void profiler_cleanup(void)
{
        kref_put(&profiler_kref);
}

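/* Post a core's partially filled block onto the profiler queue. */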
static void profiler_cpu_flush(struct profiler_cpu_context *cpu_buf)
{
        if (cpu_buf->block && profiler_queue) {
                qibwrite(profiler_queue, cpu_buf->block);

                cpu_buf->block = NULL;
        }
}

static void profiler_core_trace_enable(void *opaque)
{
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

        cpu_buf->tracing = (int) (opaque != NULL);
        if (!cpu_buf->tracing)
                profiler_cpu_flush(cpu_buf);
}

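/* Turn tracing on or off on all available cores.  Cores being switched off
 * flush their pending data.
 */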
void profiler_control_trace(int onoff)
{
        struct core_set cset;

        error_assert(EINVAL, profiler_percpu_ctx);

        core_set_init(&cset);
        core_set_fill_available(&cset);
        smp_do_in_cores(&cset, profiler_core_trace_enable,
                        (void *) (uintptr_t) onoff);
}

static void profiler_core_flush(void *opaque)
{
        struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

        profiler_cpu_flush(cpu_buf);
}

void profiler_trace_data_flush(void)
{
        struct core_set cset;

        error_assert(EINVAL, profiler_percpu_ctx);

        core_set_init(&cset);
        core_set_fill_available(&cset);
        smp_do_in_cores(&cset, profiler_core_flush, NULL);
}

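/* Record a single-PC sample (no frame pointer), routed to the user or kernel
 * path depending on the address.
 */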
void profiler_add_trace(uintptr_t pc, uint64_t info)
{
        if (is_user_raddr((void *) pc, 1))
                profiler_add_user_backtrace(pc, 0, info);
        else
                profiler_add_kernel_backtrace(pc, 0, info);
}

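/* Record a kernel sample; when a frame pointer is supplied, walk the stack for
 * up to PROFILER_BT_DEPTH frames.
 */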
void profiler_add_kernel_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

                if (profiler_percpu_ctx && cpu_buf->tracing) {
                        uintptr_t trace[PROFILER_BT_DEPTH];
                        size_t n = 1;

                        trace[0] = pc;
                        if (likely(fp))
                                n = backtrace_list(pc, fp, trace + 1,
                                                   PROFILER_BT_DEPTH - 1) + 1;

                        profiler_push_kernel_trace64(cpu_buf, trace, n, info);
                }
                kref_put(&profiler_kref);
        }
}

void profiler_add_user_backtrace(uintptr_t pc, uintptr_t fp, uint64_t info)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                struct proc *p = current;
                struct profiler_cpu_context *cpu_buf = profiler_get_cpu_ctx(core_id());

                if (p && profiler_percpu_ctx && cpu_buf->tracing) {
                        uintptr_t trace[PROFILER_BT_DEPTH];
                        size_t n = 1;

                        trace[0] = pc;
                        if (likely(fp))
                                n = backtrace_user_list(pc, fp, trace + 1,
                                                        PROFILER_BT_DEPTH - 1) + 1;

                        profiler_push_user_trace64(cpu_buf, p, trace, n, info);
                }
                kref_put(&profiler_kref);
        }
}

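/* Record a sample taken from a hardware trapframe, attributing it to kernel or
 * user space depending on where the trap hit.
 */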
void profiler_add_hw_sample(struct hw_trapframe *hw_tf, uint64_t info)
{
        if (in_kernel(hw_tf))
                profiler_add_kernel_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
                                              info);
        else
                profiler_add_user_backtrace(get_hwtf_pc(hw_tf), get_hwtf_fp(hw_tf),
                                            info);
}

int profiler_size(void)
{
        return profiler_queue ? qlen(profiler_queue) : 0;
}

int profiler_read(void *va, int n)
{
        return profiler_queue ? qread(profiler_queue, va, n) : 0;
}

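/* Note an executable, file-backed mapping for process 'p', so that user PCs
 * falling inside it can later be mapped back to the binary.
 */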
void profiler_notify_mmap(struct proc *p, uintptr_t addr, size_t size, int prot,
                          int flags, struct file *f, size_t offset)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                if (f && (prot & PROT_EXEC) && profiler_percpu_ctx) {
                        char path_buf[PROFILER_MAX_PRG_PATH];
                        char *path = file_abs_path(f, path_buf, sizeof(path_buf));

                        if (likely(path))
                                profiler_push_pid_mmap(p, addr, size, offset, path);
                }
                kref_put(&profiler_kref);
        }
}

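/* Note a newly created process, recording its binary path if available. */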
void profiler_notify_new_process(struct proc *p)
{
        if (kref_get_not_zero(&profiler_kref, 1)) {
                if (profiler_percpu_ctx && p->binary_path)
                        profiler_push_new_process(p);
                kref_put(&profiler_kref);
        }
}