/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"
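
/*
 * Usage sketch (kept under #if 0 so it is not built): how a periodic
 * sampling hook might feed this core's buffer, as described in the header
 * comment above.  The hook name, its argument, and the event number are
 * assumptions for illustration only; oprofile_add_sample(), defined below,
 * is the real entry point.
 */
#if 0
static void profile_tick_sketch(void *regs)
{
	/* Log one PC/event pair for the interrupted context.  regs is passed
	 * through untouched; event 0 stands in for "the default counter". */
	oprofile_add_sample(regs, 0);
}
#endif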

#define OP_BUFFER_FLAGS 0

/* we allocate an array of these and set the pointer in pcpui */
struct oprofile_cpu_buffer *op_cpu_buffer;

/* this one queue is used by #K to get all events. */
struct queue *opq;

/* this is run from core 0 for all cpu buffers. */
static void wq_sync_buffer(void);
unsigned long oprofile_cpu_buffer_size = 65536;
unsigned long oprofile_backtrace_depth = 8;

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

/*
 * Resets the cpu buffer to a sane state.
 *
 * reset these to invalid values; the next sample collected will
 * populate the buffer with proper values to initialize the buffer
 */
static inline void op_cpu_buffer_reset(int cpu)
{
	/* index by the requested cpu, not the calling core */
	struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[cpu];

	cpu_buf->last_is_kernel = -1;
	cpu_buf->last_proc = NULL;
}

/* returns the remaining free size of data in the entry */
static inline
int op_cpu_buffer_add_data(struct op_entry *entry, unsigned long val)
{
	assert(entry->size >= 0);
	if (!entry->size) {
		return 0;
	}
	*entry->data = val;
	entry->size--;
	entry->data++;
	return entry->size;
}

/* returns the size of data in the entry */
static inline int op_cpu_buffer_get_size(struct op_entry *entry)
{
	return entry->size;
}

/* returns 0 if empty or the size of data including the current value */
static inline
int op_cpu_buffer_get_data(struct op_entry *entry, unsigned long *val)
{
	int size = entry->size;
	if (!size) {
		return 0;
	}
	*val = *entry->data;
	entry->size--;
	entry->data++;
	return size;
}
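
/*
 * Usage sketch (kept under #if 0 so it is not built): draining the data
 * words of an entry with the accessors above.  The entry is assumed to have
 * been filled in elsewhere; the printk format is illustrative only.
 */
#if 0
static void drain_entry_sketch(struct op_entry *entry)
{
	unsigned long val;

	/* get_data() returns the count including the word just read, so the
	 * loop stops exactly when the entry is exhausted. */
	while (op_cpu_buffer_get_data(entry, &val))
		printk("data word: %p\n", (void *)val);
}
#endif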

unsigned long oprofile_get_cpu_buffer_size(void)
{
	return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
	struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];

	cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
	kfree(op_cpu_buffer);
	/* clear the pointer so a later alloc_cpu_buffers() reallocates */
	op_cpu_buffer = NULL;
	/* we can just leave the queue set up; it will then always return EOF */
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
	/* should probably start using waserror() here. The fail stuff just gets
	 * ugly.
	 */
	int i;
	unsigned long buffer_size = oprofile_cpu_buffer_size;
	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
						 RB_EVENT_HDR_SIZE);
	/* this can get called lots of times. Things might have been freed.
	 * So be careful.
	 */
	/* what limit? No idea. */
	if (!opq)
		opq = qopen(1024, Qmsg, NULL, NULL);
	if (!opq)
		goto fail;

	/* we *really* don't want to block. Losing data is better. */
	qnoblock(opq, 1);
	if (!op_cpu_buffer) {
		printk("Allocate %lu bytes\n", sizeof(*op_cpu_buffer) * num_cpus);
		op_cpu_buffer =
			kzmalloc(sizeof(*op_cpu_buffer) * num_cpus, KMALLOC_WAIT);
		if (!op_cpu_buffer)
			goto fail;

		for (i = 0; i < num_cpus; i++) {
			struct oprofile_cpu_buffer *b = &op_cpu_buffer[i];
			/* short term: for each event, we're going to kmalloc a
			 * sample and shove it into the opq.
			 * Long term: TBD. One option is to create a big damn Block and
			 * add to it as needed. Once the block is full we can push
			 * it onto the opq. That will actually be pretty fast and easy
			 * if we make the block page-sized. Far, far simpler than the
			 * Linux tracebuffer stuff.
			 */
			b->last_proc = NULL;
			b->last_is_kernel = -1;
			b->tracing = 1;
			b->buffer_size = buffer_size;
			b->sample_received = 0;
			b->sample_lost_overflow = 0;
			b->backtrace_aborted = 0;
			b->sample_invalid_eip = 0;
			b->cpu = i;
			b->fullqueue = qopen(1024, Qmsg, NULL, NULL);
			b->emptyqueue = qopen(1024, Qmsg, NULL, NULL);
		}
	}

	return 0;

fail:
	free_cpu_buffers();
	return -ENOMEM;
}

void start_cpu_work(void)
{
	int i;

	work_enabled = 1;
	/* task starts here.
	   schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
	 */
}

void end_cpu_work(void)
{
	work_enabled = 0;
}

/* placeholder. Not used yet. */
void flush_cpu_work(void)
{
	int i;
	struct oprofile_cpu_buffer *b = &op_cpu_buffer[core_id()];
}

/* Not used since we're not doing per-cpu buffering yet. */
struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
	return NULL;
}

static struct block *op_cpu_buffer_write_reserve(struct oprofile_cpu_buffer *cpu_buf,
	struct op_entry *entry, int size)
{
	struct block *b;
	int totalsize = sizeof(struct op_sample) +
		size * sizeof(entry->sample->data[0]);

	b = cpu_buf->block;
	/* we might have run out. Compare against totalsize, since that is how
	 * far wp will advance below. */
	if (!b || (b->lim - b->wp) < totalsize) {
		if (b)
			qbwrite(opq, b);
		/* For now. Later, we will grab a block off the
		 * emptyblock queue.
		 */
		cpu_buf->block = b = allocb(oprofile_cpu_buffer_size);
		if (!b) {
			printk("%s: fail\n", __func__);
			return NULL;
		}
	}
	entry->sample = (void *)b->wp;
	entry->size = size;
	entry->data = entry->sample->data;

	b->wp += totalsize;
	return b;
}

static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
	    int is_kernel, struct proc *proc)
{
	struct block *b;
	struct op_entry entry;
	unsigned long flags;
	int size;
	ERRSTACK(1);

	flags = 0;

	if (waserror()) {
		poperror();
		printk("%s: failed\n", __func__);
		return 1;
	}

	if (backtrace)
		flags |= TRACE_BEGIN;

	/* notice a switch from user->kernel or vice versa */
	is_kernel = !!is_kernel;
	if (cpu_buf->last_is_kernel != is_kernel) {
		cpu_buf->last_is_kernel = is_kernel;
		flags |= KERNEL_CTX_SWITCH;
		if (is_kernel)
			flags |= IS_KERNEL;
	}

	/* notice a proc switch */
	if (cpu_buf->last_proc != proc) {
		cpu_buf->last_proc = proc;
		flags |= USER_CTX_SWITCH;
	}

	if (!flags) {
		poperror();
		/* nothing to do */
		return 0;
	}

	if (flags & USER_CTX_SWITCH)
		size = 1;
	else
		size = 0;

	b = op_cpu_buffer_write_reserve(cpu_buf, &entry, size);
	if (!b) {
		poperror();
		return 1;
	}

	entry.sample->eip = ESCAPE_CODE;
	entry.sample->event = flags;

	if (size)
		op_cpu_buffer_add_data(&entry, (unsigned long)proc);

	poperror();
	return 0;
}

static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
	      unsigned long pc, unsigned long event)
{
	ERRSTACK(1);
	struct op_entry entry;
	struct op_sample *sample;
	struct block *b;

	if (waserror()) {
		poperror();
		printk("%s: failed\n", __func__);
		return 1;
	}

	b = op_cpu_buffer_write_reserve(cpu_buf, &entry, 0);
	if (!b) {
		poperror();
		return 1;
	}

	sample = entry.sample;
	sample->eip = pc;
	sample->event = event;
	poperror();
	return 0;
}

/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
	   unsigned long backtrace, int is_kernel, unsigned long event,
	   struct proc *proc)
{
	struct proc *tsk = proc ? proc : current;

	cpu_buf->sample_received++;

	if (pc == ESCAPE_CODE) {
		cpu_buf->sample_invalid_eip++;
		return 0;
	}

	/* Ah, so great: the op_add_* helpers return 1 on failure, while this
	 * function returns 0 on failure. What a cluster.
	 */
	if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
		goto fail;

	if (op_add_sample(cpu_buf, pc, event))
		goto fail;

	return 1;

fail:
	cpu_buf->sample_lost_overflow++;
	return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc,
			  void /*struct pt_regs */ *const regs,
			  unsigned long event, int is_kernel, struct proc *proc)
{
	struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];
	unsigned long backtrace = oprofile_backtrace_depth;

	/*
	 * if log_sample() fails we can't backtrace since we lost the
	 * source of this event
	 */
	if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event, proc))
		return;

	if (!backtrace) {
		return;
	}
#if 0
	oprofile_begin_trace(cpu_buf);
	oprofile_ops.backtrace(regs, backtrace);
	oprofile_end_trace(cpu_buf);
#endif
}

void oprofile_add_ext_hw_sample(unsigned long pc,
				void /*struct pt_regs */ *const regs,
				unsigned long event, int is_kernel,
				struct proc *proc)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel, proc);
}

void oprofile_add_ext_sample(unsigned long pc,
			     void /*struct pt_regs */ *const regs,
			     unsigned long event, int is_kernel)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

void oprofile_add_sample(void /*struct pt_regs */ *const regs,
			 unsigned long event)
{
	int is_kernel;
	unsigned long pc;

	if (regs) {
		is_kernel = 0;	/* FIXME: user_mode(regs) */
		pc = 0;		/* FIXME: profile_pc(regs) */
	} else {
		is_kernel = 0;	/* This value will not be used */
		pc = ESCAPE_CODE;	/* as this causes an early return. */
	}

	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.  See the usage
 * sketch after oprofile_write_commit() below.
 */
void
oprofile_write_reserve(struct op_entry *entry,
		       void /*struct pt_regs */ *const regs,
		       unsigned long pc, int code, int size)
{
	ERRSTACK(1);
	struct op_sample *sample;
	struct block *b;
	int is_kernel = 0;	/* FIXME: user_mode(regs) */
	struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];

	if (waserror()) {
		printk("%s: failed\n", __func__);
		poperror();
		goto fail;
	}
	cpu_buf->sample_received++;

	/* no backtraces for samples with data */
	if (op_add_code(cpu_buf, 0, is_kernel, current)) {
		poperror();
		goto fail;
	}

	b = op_cpu_buffer_write_reserve(cpu_buf, entry, size + 2);
	if (!b) {
		poperror();
		goto fail;
	}
	sample = entry->sample;
	sample->eip = ESCAPE_CODE;
	sample->event = 0;	/* no flags */

	op_cpu_buffer_add_data(entry, code);
	op_cpu_buffer_add_data(entry, pc);
	poperror();
	return;
fail:
	entry->event = NULL;
	cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
	if (!entry->event) {
		return 0;
	}
	return op_cpu_buffer_add_data(entry, val);
}

int oprofile_add_data64(struct op_entry *entry, uint64_t val)
{
	if (!entry->event) {
		return 0;
	}
	/*
	 * the function returns 0 to indicate a too small buffer, even if
	 * there is some space left
	 */
	if (op_cpu_buffer_get_size(entry) < 2) {
		return 0;
	}
	if (!op_cpu_buffer_add_data(entry, (uint32_t) val)) {
		return 0;
	}
	return op_cpu_buffer_add_data(entry, (uint32_t) (val >> 32));
}

int oprofile_write_commit(struct op_entry *entry)
{
	/* not much to do at present. In future, we might write the Block
	 * to opq.
	 */
	return 0;
}
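
/*
 * Usage sketch (kept under #if 0 so it is not built): recording one sample
 * with two extra data words, following the reserve/add_data/commit sequence
 * documented above oprofile_write_reserve().  The code value (13) and the
 * payload words are made up for illustration.
 */
#if 0
static void record_extended_sample_sketch(void *regs, unsigned long pc)
{
	struct op_entry entry;

	/* Reserve room for the escape header plus two data words. */
	oprofile_write_reserve(&entry, regs, pc, 13, 2);
	oprofile_add_data(&entry, 0x1234);	/* first payload word */
	oprofile_add_data(&entry, 0x5678);	/* second payload word */
	oprofile_write_commit(&entry);		/* currently a no-op; see above */
}
#endif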

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
	struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];
	log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
}

void oprofile_add_trace(unsigned long pc)
{
	struct oprofile_cpu_buffer *cpu_buf = &op_cpu_buffer[core_id()];

	if (!cpu_buf->tracing) {
		return;
	}

	/*
	 * broken frame can give an eip with the same value as an
	 * escape code, abort the trace if we get it
	 */
	if (pc == ESCAPE_CODE)
		goto fail;

	if (op_add_sample(cpu_buf, pc, 0))
		goto fail;

	return;
fail:
	printk("%s: fail. Turning off tracing on cpu %d\n", __func__, core_id());
	cpu_buf->tracing = 0;
	cpu_buf->backtrace_aborted++;
	return;
}