qio: Report partial progress for NONBLOCK queues
[akaros.git] / tests / strace.c
1 /* Copyright (c) 2016-2017 Google Inc., All Rights Reserved.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * Ron Minnich <rminnich@google.com>
4  * See LICENSE for details.
5  *
6  * TODO:
7  * - Do per-syscall output formatting, like interpreting arguments
8  */
9
10 #include <stdlib.h>
11 #include <stdio.h>
12 #include <unistd.h>
13 #include <fcntl.h>
14 #include <argp.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <sys/param.h>
18 #include <parlib/parlib.h>
19 #include <parlib/bitmask.h>
20
21 struct strace_opts {
22         FILE                                            *outfile;
23         char                                            *trace_set;
24         char                                            **cmd_argv;
25         int                                                     cmd_argc;
26         int                                                     pid;
27         bool                                            follow_children;
28         bool                                            verbose;
29         bool                                            raw_output;
30         bool                                            with_time;
31         bool                                            drop_overflow;
32 };
33 static struct strace_opts opts;
34
35 static const char *doc = "strace -- trace syscalls of a process";
36 static const char *args_doc = "-p PID\nPROGRAM [ARGS]\n";
37
38 static struct argp_option argp_opts[] = {
39         {"output", 'o', "FILE", 0, "Print output to file (default stderr)"},
40         {"pid", 'p', "PID", 0, "Process to attach to"},
41         {"follow", 'f', 0, 0, "Trace children"},
42         {"verbose", 'v', 0, 0, "Print extra info, e.g. syscalls we're tracing"},
43         {0, 0, 0, 0, ""},
44         {"traceset", 'e', "TRACE_SET", 0,
45          "Comma-separated list of syscalls by name (e.g. openat) and sets to trace.  Use '!' to negate (might need to escape the '!'), and traces are handled in order (e.g. -e path,\\!openat)\n"
46          },
47         {"    Available sets:", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
48                         "\n"
49                         "- path: syscalls that take file paths\n"
50                         "- fd: syscalls that take FDs\n"
51                         "- file: path and fd sets\n"
52                         "- mem: memory related (mmap, shared mem)\n"
53                         "- life: process lifetime (create, fork)\n"
54                         "- proc: anything process related (yields, pop_ctxs, life)\n"
55                         "- sched: requests or yields to the kernel (all resources)\n"
56                         "- vmm: syscalls mostly for VMs\n"
57         },
58         {0, 0, 0, 0, ""},
59         {"drop", 'd', 0, 0, "Drop syscalls on overflow"},
60         {"raw", 'r', 0, 0, "Raw, untranslated output, with timestamps"},
61         {"time", 't', 0, 0, "Print timestamps"},
62         {0, 'h', 0, OPTION_HIDDEN, 0},
63         { 0 }
64 };
65
66 struct trace_set {
67         char                                            *name;
68         unsigned int                            syscs[];
69 };
70
71 /* To add a trace set, create one here, add it to all_trace_sets, and update the
72  * help field in argp_opts. */
73
74 /* If you change this, update 'file' below. */
75 static struct trace_set path_trace_set = { "path",
76         {SYS_proc_create,
77          SYS_exec,
78          SYS_openat,
79          SYS_stat,
80          SYS_lstat,
81          SYS_access,
82          SYS_link,
83          SYS_unlink,
84          SYS_symlink,
85          SYS_readlink,
86          SYS_chdir,
87          SYS_mkdir,
88          SYS_rmdir,
89          SYS_nbind,
90          SYS_nmount,
91          SYS_nunmount,
92          SYS_wstat,
93          SYS_rename,
94          0}
95 };
96
97 /* If you change this, update 'file' below.
98  *
99  * Technically tcgetattr/tcsetattr are FDs, but it's mostly noise.  This also
100  * tracks openat, since that's the source for all FDs */
101 static struct trace_set fd_trace_set = { "fd",
102         {SYS_openat,
103          SYS_mmap,
104          SYS_read,
105          SYS_write,
106          SYS_openat,
107          SYS_close,
108          SYS_fstat,
109          SYS_fcntl,
110          SYS_llseek,
111          SYS_fchdir,
112          SYS_nmount,
113          SYS_fd2path,
114          SYS_fwstat,
115          SYS_dup_fds_to,
116          SYS_tap_fds,
117          SYS_abort_sysc_fd,
118          0}
119 };
120
121 /* This is the manually-created contents of 'path' and 'fd' */
122 static struct trace_set file_trace_set = { "file",
123         {/* From 'path' */
124          SYS_proc_create,
125          SYS_exec,
126          SYS_openat,
127          SYS_stat,
128          SYS_lstat,
129          SYS_access,
130          SYS_link,
131          SYS_unlink,
132          SYS_symlink,
133          SYS_readlink,
134          SYS_chdir,
135          SYS_mkdir,
136          SYS_rmdir,
137          SYS_nbind,
138          SYS_nmount,
139          SYS_nunmount,
140          SYS_wstat,
141          SYS_rename,
142          SYS_openat,
143          SYS_mmap,
144          SYS_read,
145          SYS_write,
146
147          /* From 'fd' */
148          SYS_openat,
149          SYS_close,
150          SYS_fstat,
151          SYS_fcntl,
152          SYS_llseek,
153          SYS_fchdir,
154          SYS_nmount,
155          SYS_fd2path,
156          SYS_fwstat,
157          SYS_dup_fds_to,
158          SYS_tap_fds,
159          SYS_abort_sysc_fd,
160          0}
161 };
162
163 static struct trace_set mem_trace_set = { "mem",
164         {SYS_mmap,
165          SYS_mprotect,
166          SYS_munmap,
167          SYS_shared_page_alloc,
168          SYS_shared_page_free,
169          SYS_populate_va,
170          0}
171 };
172
173 /* These are all in 'proc'; keep them in sync. */
174 static struct trace_set life_trace_set = { "life",
175         {SYS_proc_create,
176          SYS_proc_run,
177          SYS_proc_destroy,
178          SYS_fork,
179          SYS_exec,
180          SYS_waitpid,
181          SYS_change_to_m,
182          0}
183 };
184
185 /* There's some misc stuff lumped in here.  block/nanosleep are signs of threads
186  * sleeping on something, which is usually useful.  aborting syscs are also
187  * often signs of control flow. */
188 static struct trace_set proc_trace_set = { "proc",
189         {SYS_block,
190          SYS_nanosleep,
191          SYS_getpcoreid,
192          SYS_getvcoreid,
193          SYS_proc_yield,
194          SYS_change_vcore,
195          SYS_notify,
196          SYS_self_notify,
197          SYS_send_event,
198          SYS_vc_entry,
199          SYS_halt_core,
200          SYS_pop_ctx,
201          SYS_abort_sysc,
202          SYS_abort_sysc_fd,
203
204          /* From 'life' */
205          SYS_proc_create,
206          SYS_proc_run,
207          SYS_proc_destroy,
208          SYS_fork,
209          SYS_exec,
210          SYS_waitpid,
211          SYS_change_to_m,
212          0}
213 };
214
215 static struct trace_set sched_trace_set = { "sched",
216         {SYS_provision,
217          SYS_proc_yield,
218          SYS_poke_ksched,
219          0}
220 };
221
222 static struct trace_set vmm_trace_set = { "vmm",
223         {SYS_vmm_setup,
224          SYS_vmm_poke_guest,
225          SYS_pop_ctx,
226          0}
227 };
228
229 static struct trace_set *all_trace_sets[] = {
230         &path_trace_set,
231         &fd_trace_set,
232         &file_trace_set,
233         &mem_trace_set,
234         &life_trace_set,
235         &proc_trace_set,
236         &sched_trace_set,
237         &vmm_trace_set,
238 };
239
240 static DECL_BITMASK(traceset_bm, MAX_SYSCALL_NR);
241
242
243 static error_t parse_strace_opt(int key, char *arg, struct argp_state *state)
244 {
245         struct strace_opts *s_opts = state->input;
246
247         switch (key) {
248         case 'o':
249                 s_opts->outfile = fopen(arg, "wb");
250                 if (!s_opts->outfile) {
251                         fprintf(stderr, "Unable to open file '%s' for writing: %s\n",
252                                         arg, strerror(errno));
253                         exit(1);
254                 }
255                 break;
256         case 'e':
257                 s_opts->trace_set = arg;
258                 break;
259         case 'p':
260                 s_opts->pid = atoi(arg);
261                 if (!s_opts->pid)
262                         argp_error(state, "Cannot trace pid 0 (won't exist)");
263                 break;
264         case 'f':
265                 s_opts->follow_children = TRUE;
266                 break;
267         case 'v':
268                 s_opts->verbose = TRUE;
269                 break;
270         case 'r':
271                 s_opts->raw_output = TRUE;
272                 break;
273         case 't':
274                 s_opts->with_time = TRUE;
275                 break;
276         case 'd':
277                 s_opts->drop_overflow = TRUE;
278                 break;
279         case ARGP_KEY_ARG:
280                 if (s_opts->pid)
281                         argp_error(state, "PID already set, can't launch a process too");
282                 s_opts->cmd_argc = state->argc - state->next + 1;
283                 s_opts->cmd_argv = malloc(sizeof(char*) * (s_opts->cmd_argc + 1));
284                 assert(s_opts->cmd_argv);
285                 s_opts->cmd_argv[0] = arg;
286                 memcpy(&s_opts->cmd_argv[1], &state->argv[state->next],
287                        sizeof(char*) * (s_opts->cmd_argc - 1));
288                 s_opts->cmd_argv[s_opts->cmd_argc] = NULL;
289                 state->next = state->argc;
290                 break;
291         case ARGP_KEY_END:
292                 if (!(s_opts->cmd_argc || s_opts->pid))
293                         argp_error(state, "Need either -p or a command to run");
294                 /* Note we never fclose outfile.  It'll flush when we exit.  o/w, we'll
295                  * need to be careful whether we're closing stderr or not. */
296                 if (!s_opts->outfile)
297                         s_opts->outfile = stderr;
298                 break;
299         case 'h':
300                 argp_usage(state);
301                 break;
302         default:
303                 return ARGP_ERR_UNKNOWN;
304         }
305         return 0;
306 }
307
308 static bool handle_trace_set(char *tok, bool clear)
309 {
310         struct trace_set *ts;
311         unsigned int sysc_nr;
312
313         for (int i = 0; i < COUNT_OF(all_trace_sets); i++) {
314                 ts = all_trace_sets[i];
315                 if (!strcmp(ts->name, tok)) {
316                         for (int j = 0; j < MAX_SYSCALL_NR; j++) {
317                                 sysc_nr = ts->syscs[j];
318                                 /* 0-terminated list */
319                                 if (!sysc_nr)
320                                         break;
321                                 if (clear)
322                                         CLR_BITMASK_BIT(traceset_bm, sysc_nr);
323                                 else
324                                         SET_BITMASK_BIT(traceset_bm, sysc_nr);
325                         }
326                         return TRUE;
327                 }
328         }
329         return FALSE;
330 }
331
332 static char *resolve_syscall_alias(char *tok)
333 {
334         if (!strcmp(tok, "open")) {
335                 tok = "openat";
336                 return tok;
337         }
338         return tok;
339 }
340
341 static bool handle_raw_syscall(char *tok, bool clear)
342 {
343         for (int i = 0; i < __syscall_tbl_sz; i++) {
344                 if (!__syscall_tbl[i])
345                         continue;
346                 tok = resolve_syscall_alias(tok);
347                 if (!strcmp(__syscall_tbl[i], tok)) {
348                         if (clear)
349                                 CLR_BITMASK_BIT(traceset_bm, i);
350                         else
351                                 SET_BITMASK_BIT(traceset_bm, i);
352                         return TRUE;
353                 }
354         }
355         return FALSE;
356 }
357
358 static void build_ignore_list(char *trace_set)
359 {
360         char *tok, *tok_save = 0;
361         bool clear = FALSE;
362
363         if (!trace_set) {
364                 if (opts.verbose)
365                         fprintf(stderr, "# Tracing all syscalls\n");
366                 return;
367         }
368         for (tok = strtok_r(trace_set, ",", &tok_save);
369              tok;
370                  tok = strtok_r(NULL, ",", &tok_save)) {
371
372                 if (tok[0] == '!') {
373                         clear = TRUE;
374                         tok++;
375                 }
376                 if (handle_trace_set(tok, clear))
377                         continue;
378                 if (handle_raw_syscall(tok, clear))
379                         continue;
380                 /* You could imaging continuing, but this error would probably be
381                  * missed in the output stream and we'd be wondering why we weren't
382                  * getting a syscall, due to a typo. */
383                 fprintf(stderr, "Unknown trace_set argument %s, aborting!\n",
384                         tok);
385                 exit(-1);
386         }
387         if (opts.verbose) {
388                 for (int i = 0; i < MAX_SYSCALL_NR; i++) {
389                         if (GET_BITMASK_BIT(traceset_bm, i))
390                                 fprintf(stderr, "# Tracing syscall %s (%d)\n",
391                                         __syscall_tbl[i], i);
392                 }
393         }
394 }
395
396 /* Removes the timestamp part of the line.  Use the return string in place of
397  * the full line you pass in. */
398 static char *remove_timestamps(char *full_line)
399 {
400         char *close_brace;
401
402         /* Format: E [  13655.986589401]-[      0.000000000] Syscall.  The seconds
403          * field may vary in size, so we need to find the second ']'. */
404         close_brace = strchr(full_line, ']');
405         if (!close_brace)
406                 return full_line;
407         close_brace = strchr(close_brace + 1, ']');
408         if (!close_brace)
409                 return full_line;
410         /* move starting E or X marker */
411         *close_brace = full_line[0];
412         return close_brace;
413 }
414
415 static void parse_traces(int fd)
416 {
417         char *line, *_line;
418         ssize_t ret;
419
420         line = malloc(SYSTR_BUF_SZ);
421         assert(line);
422
423         while ((ret = read(fd, line, SYSTR_BUF_SZ)) > 0) {
424                 /* make sure each line ends in \n\0. */
425                 line[ret - 1] = '\n';
426                 line[MIN(ret, SYSTR_BUF_SZ - 1)] = 0;
427                 _line = line;
428                 if (opts.raw_output) {
429                         fprintf(opts.outfile, "%s", _line);
430                         continue;
431                 }
432                 if (!opts.with_time)
433                         _line = remove_timestamps(_line);
434                 fprintf(opts.outfile, "%s", _line);
435         }
436         /* This is a little hokey.  If the process exited, then the qio hung up and
437          * we got a status message from the kernel.  This was the errstr of the last
438          * failed read.  However, if we're doing a -p and someone kills *us*, we'll
439          * never see this.  And catching the signal doesn't help either.  The
440          * process needs to exit (strace_shutdown).  Either that, or change the
441          * kernel to set_errstr() on close(), and coordinate with a sighandler. */
442         if (opts.verbose)
443                 fprintf(stderr, "%r\n");
444         free(line);
445 }
446
447 int main(int argc, char **argv, char **envp)
448 {
449         int fd;
450         pid_t pid;
451         static char path[2 * MAX_PATH_LEN];
452         struct syscall sysc;
453         struct argp argp = {argp_opts, parse_strace_opt, args_doc, doc};
454
455         argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, &opts);
456
457         build_ignore_list(opts.trace_set);
458
459         if (opts.cmd_argc) {
460                 pid = create_child_with_stdfds(opts.cmd_argv[0], opts.cmd_argc,
461                                                opts.cmd_argv, envp);
462                 if (pid < 0) {
463                         perror("Unable to spawn child");
464                         exit(-1);
465                 }
466                 /* We need to wait on the child asynchronously.  If we hold a ref (as
467                  * the parent), the child won't proc_free and that won't hangup/wake us
468                  * from a read. */
469                 syscall_async(&sysc, SYS_waitpid, pid, NULL, 0, 0, 0, 0);
470         } else {
471                 pid = opts.pid;
472         }
473
474         snprintf(path, sizeof(path), "/proc/%d/ctl", pid);
475         fd = open(path, O_WRITE);
476         if (fd < 0) {
477                 fprintf(stderr, "open %s: %r\n", path);
478                 exit(1);
479         }
480         if (opts.follow_children)
481                 snprintf(path, sizeof(path), "straceall");
482         else
483                 snprintf(path, sizeof(path), "straceme");
484         if (write(fd, path, strlen(path)) < strlen(path)) {
485                 fprintf(stderr, "write to ctl %s: %r\n", path);
486                 exit(1);
487         }
488         if (opts.drop_overflow) {
489                 snprintf(path, sizeof(path), "strace_drop on");
490                 if (write(fd, path, strlen(path)) < strlen(path)) {
491                         fprintf(stderr, "write to ctl %s: %r\n", path);
492                         exit(1);
493                 }
494         }
495         close(fd);
496
497         if (opts.trace_set) {
498                 snprintf(path, sizeof(path), "/proc/%d/strace_traceset", pid);
499                 fd = open(path, O_RDWR);
500                 if (fd < 0) {
501                         fprintf(stderr, "open %s: %r\n", path);
502                         exit(1);
503                 }
504                 if (write(fd, traceset_bm, COUNT_OF(traceset_bm))
505                     < COUNT_OF(traceset_bm)) {
506                         fprintf(stderr, "write to strace_ignore: %r\n");
507                         exit(1);
508                 }
509                 close(fd);
510         }
511
512         snprintf(path, sizeof(path), "/proc/%d/strace", pid);
513         fd = open(path, O_READ);
514         if (!fd) {
515                 fprintf(stderr, "open %s: %r\n", path);
516                 exit(1);
517         }
518
519         if (opts.cmd_argc) {
520                 /* now that we've set up the tracing, we can run the process.  isn't it
521                  * great that the process doesn't immediately start when you make it? */
522                 sys_proc_run(pid);
523         }
524
525         parse_traces(fd);
526         return 0;
527 }