strace: Support all sorts of crazy things!
[akaros.git] / tests / strace.c
1 /* Copyright (c) 2016-2017 Google Inc., All Rights Reserved.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * Ron Minnich <rminnich@google.com>
4  * See LICENSE for details.
5  *
6  * TODO:
7  * - Do per-syscall output formatting, like interpreting arguments
8  */
9
10 #include <stdlib.h>
11 #include <stdio.h>
12 #include <unistd.h>
13 #include <fcntl.h>
14 #include <argp.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <sys/param.h>
18 #include <parlib/parlib.h>
19 #include <parlib/bitmask.h>
20
21 struct strace_opts {
22         FILE                                            *outfile;
23         char                                            *trace_set;
24         char                                            **cmd_argv;
25         int                                                     cmd_argc;
26         int                                                     pid;
27         bool                                            follow_children;
28         bool                                            verbose;
29         bool                                            raw_output;
30         bool                                            with_time;
31         bool                                            drop_overflow;
32 };
33 static struct strace_opts opts;
34
35 static const char *doc = "strace -- trace syscalls of a process";
36 static const char *args_doc = "-p PID\nPROGRAM [ARGS]\n";
37
38 static struct argp_option argp_opts[] = {
39         {"output", 'o', "FILE", 0, "Print output to file (default stderr)"},
40         {"pid", 'p', "PID", 0, "Process to attach to"},
41         {"follow", 'f', 0, 0, "Trace children"},
42         {"verbose", 'v', 0, 0, "Print extra info, e.g. syscalls we're tracing"},
43         {0, 0, 0, 0, ""},
44         {"traceset", 'e', "TRACE_SET", 0,
45          "Comma-separated list of syscalls by name (e.g. openat) and sets to trace.  Use '!' to negate (might need to escape the '!'), and traces are handled in order (e.g. -e path,\\!openat)\n"
46          },
47         {"    Available sets:", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
48                         "\n"
49                         "- path: syscalls that take file paths\n"
50                         "- fd: syscalls that take FDs\n"
51                         "- file: path and fd sets\n"
52                         "- mem: memory related (mmap, shared mem)\n"
53                         "- life: process lifetime (create, fork)\n"
54                         "- proc: anything process related (yields, pop_ctxs, life)\n"
55                         "- sched: requests or yields to the kernel (all resources)\n"
56                         "- vmm: syscalls mostly for VMs\n"
57         },
58         {0, 0, 0, 0, ""},
59         {"drop", 'd', 0, 0, "Drop syscalls on overflow"},
60         {"raw", 'r', 0, 0, "Raw, untranslated output, with timestamps"},
61         {"time", 't', 0, 0, "Print timestamps"},
62         {0, 'h', 0, OPTION_HIDDEN, 0},
63         { 0 }
64 };
65
66 struct trace_set {
67         char                                            *name;
68         unsigned int                            syscs[];
69 };
70
71 /* To add a trace set, create one here, add it to all_trace_sets, and update the
72  * help field in argp_opts. */
73
74 /* If you change this, update 'file' below. */
75 static struct trace_set path_trace_set = { "path",
76         {SYS_proc_create,
77          SYS_exec,
78          SYS_openat,
79          SYS_stat,
80          SYS_lstat,
81          SYS_access,
82          SYS_link,
83          SYS_unlink,
84          SYS_symlink,
85          SYS_readlink,
86          SYS_chdir,
87          SYS_mkdir,
88          SYS_rmdir,
89          SYS_nbind,
90          SYS_nmount,
91          SYS_nunmount,
92          SYS_wstat,
93          SYS_rename,
94          0}
95 };
96
97 /* If you change this, update 'file' below.
98  *
99  * Technically tcgetattr/tcsetattr are FDs, but it's mostly noise.  This also
100  * tracks openat, since that's the source for all FDs */
101 static struct trace_set fd_trace_set = { "fd",
102         {SYS_openat,
103          SYS_mmap,
104          SYS_read,
105          SYS_write,
106          SYS_openat,
107          SYS_close,
108          SYS_fstat,
109          SYS_fcntl,
110          SYS_llseek,
111          SYS_fchdir,
112          SYS_nmount,
113          SYS_fd2path,
114          SYS_fwstat,
115          SYS_dup_fds_to,
116          SYS_tap_fds,
117          SYS_abort_sysc_fd,
118          0}
119 };
120
121 /* This is the manually-created contents of 'path' and 'fd' */
122 static struct trace_set file_trace_set = { "file",
123         {/* From 'path' */
124          SYS_proc_create,
125          SYS_exec,
126          SYS_openat,
127          SYS_stat,
128          SYS_lstat,
129          SYS_access,
130          SYS_link,
131          SYS_unlink,
132          SYS_symlink,
133          SYS_readlink,
134          SYS_chdir,
135          SYS_mkdir,
136          SYS_rmdir,
137          SYS_nbind,
138          SYS_nmount,
139          SYS_nunmount,
140          SYS_wstat,
141          SYS_rename,
142          SYS_openat,
143          SYS_mmap,
144          SYS_read,
145          SYS_write,
146
147          /* From 'fd' */
148          SYS_openat,
149          SYS_close,
150          SYS_fstat,
151          SYS_fcntl,
152          SYS_llseek,
153          SYS_fchdir,
154          SYS_nmount,
155          SYS_fd2path,
156          SYS_fwstat,
157          SYS_dup_fds_to,
158          SYS_tap_fds,
159          SYS_abort_sysc_fd,
160          0}
161 };
162
163 static struct trace_set mem_trace_set = { "mem",
164         {SYS_mmap,
165          SYS_mprotect,
166          SYS_munmap,
167          SYS_shared_page_alloc,
168          SYS_shared_page_free,
169          SYS_populate_va,
170          0}
171 };
172
173 /* These are all in 'proc'; keep them in sync. */
174 static struct trace_set life_trace_set = { "life",
175         {SYS_proc_create,
176          SYS_proc_run,
177          SYS_proc_destroy,
178          SYS_fork,
179          SYS_exec,
180          SYS_waitpid,
181          SYS_change_to_m,
182          0}
183 };
184
185 /* There's some misc stuff lumped in here.  block/nanosleep are signs of threads
186  * sleeping on something, which is usually useful.  aborting syscs are also
187  * often signs of control flow. */
188 static struct trace_set proc_trace_set = { "proc",
189         {SYS_block,
190          SYS_nanosleep,
191          SYS_getpcoreid,
192          SYS_getvcoreid,
193          SYS_proc_yield,
194          SYS_change_vcore,
195          SYS_notify,
196          SYS_self_notify,
197          SYS_vc_entry,
198          SYS_halt_core,
199          SYS_pop_ctx,
200          SYS_abort_sysc,
201          SYS_abort_sysc_fd,
202
203          /* From 'life' */
204          SYS_proc_create,
205          SYS_proc_run,
206          SYS_proc_destroy,
207          SYS_fork,
208          SYS_exec,
209          SYS_waitpid,
210          SYS_change_to_m,
211          0}
212 };
213
214 static struct trace_set sched_trace_set = { "sched",
215         {SYS_provision,
216          SYS_proc_yield,
217          SYS_poke_ksched,
218          0}
219 };
220
221 static struct trace_set vmm_trace_set = { "vmm",
222         {SYS_vmm_setup,
223          SYS_vmm_poke_guest,
224          SYS_pop_ctx,
225          0}
226 };
227
228 static struct trace_set *all_trace_sets[] = {
229         &path_trace_set,
230         &fd_trace_set,
231         &file_trace_set,
232         &mem_trace_set,
233         &life_trace_set,
234         &proc_trace_set,
235         &sched_trace_set,
236         &vmm_trace_set,
237 };
238
239 static DECL_BITMASK(traceset_bm, MAX_SYSCALL_NR);
240
241
242 static error_t parse_strace_opt(int key, char *arg, struct argp_state *state)
243 {
244         struct strace_opts *s_opts = state->input;
245
246         switch (key) {
247         case 'o':
248                 s_opts->outfile = fopen(arg, "wb");
249                 if (!s_opts->outfile) {
250                         fprintf(stderr, "Unable to open file '%s' for writing: %s\n",
251                                         arg, strerror(errno));
252                         exit(1);
253                 }
254                 break;
255         case 'e':
256                 s_opts->trace_set = arg;
257                 break;
258         case 'p':
259                 s_opts->pid = atoi(arg);
260                 if (!s_opts->pid)
261                         argp_error(state, "Cannot trace pid 0 (won't exist)");
262                 break;
263         case 'f':
264                 s_opts->follow_children = TRUE;
265                 break;
266         case 'v':
267                 s_opts->verbose = TRUE;
268                 break;
269         case 'r':
270                 s_opts->raw_output = TRUE;
271                 break;
272         case 't':
273                 s_opts->with_time = TRUE;
274                 break;
275         case 'd':
276                 s_opts->drop_overflow = TRUE;
277                 break;
278         case ARGP_KEY_ARG:
279                 if (s_opts->pid)
280                         argp_error(state, "PID already set, can't launch a process too");
281                 s_opts->cmd_argc = state->argc - state->next + 1;
282                 s_opts->cmd_argv = malloc(sizeof(char*) * (s_opts->cmd_argc + 1));
283                 assert(s_opts->cmd_argv);
284                 s_opts->cmd_argv[0] = arg;
285                 memcpy(&s_opts->cmd_argv[1], &state->argv[state->next],
286                        sizeof(char*) * (s_opts->cmd_argc - 1));
287                 s_opts->cmd_argv[s_opts->cmd_argc] = NULL;
288                 state->next = state->argc;
289                 break;
290         case ARGP_KEY_END:
291                 if (!(s_opts->cmd_argc || s_opts->pid))
292                         argp_error(state, "Need either -p or a command to run");
293                 /* Note we never fclose outfile.  It'll flush when we exit.  o/w, we'll
294                  * need to be careful whether we're closing stderr or not. */
295                 if (!s_opts->outfile)
296                         s_opts->outfile = stderr;
297                 break;
298         case 'h':
299                 argp_usage(state);
300                 break;
301         default:
302                 return ARGP_ERR_UNKNOWN;
303         }
304         return 0;
305 }
306
307 static bool handle_trace_set(char *tok, bool clear)
308 {
309         struct trace_set *ts;
310         unsigned int sysc_nr;
311
312         for (int i = 0; i < COUNT_OF(all_trace_sets); i++) {
313                 ts = all_trace_sets[i];
314                 if (!strcmp(ts->name, tok)) {
315                         for (int j = 0; j < MAX_SYSCALL_NR; j++) {
316                                 sysc_nr = ts->syscs[j];
317                                 /* 0-terminated list */
318                                 if (!sysc_nr)
319                                         break;
320                                 if (clear)
321                                         CLR_BITMASK_BIT(traceset_bm, sysc_nr);
322                                 else
323                                         SET_BITMASK_BIT(traceset_bm, sysc_nr);
324                         }
325                         return TRUE;
326                 }
327         }
328         return FALSE;
329 }
330
331 static char *resolve_syscall_alias(char *tok)
332 {
333         if (!strcmp(tok, "open")) {
334                 tok = "openat";
335                 return tok;
336         }
337         return tok;
338 }
339
340 static bool handle_raw_syscall(char *tok, bool clear)
341 {
342         for (int i = 0; i < __syscall_tbl_sz; i++) {
343                 if (!__syscall_tbl[i])
344                         continue;
345                 tok = resolve_syscall_alias(tok);
346                 if (!strcmp(__syscall_tbl[i], tok)) {
347                         if (clear)
348                                 CLR_BITMASK_BIT(traceset_bm, i);
349                         else
350                                 SET_BITMASK_BIT(traceset_bm, i);
351                         return TRUE;
352                 }
353         }
354         return FALSE;
355 }
356
357 static void build_ignore_list(char *trace_set)
358 {
359         char *tok, *tok_save = 0;
360         bool clear = FALSE;
361
362         if (!trace_set) {
363                 if (opts.verbose)
364                         fprintf(stderr, "# Tracing all syscalls\n");
365                 return;
366         }
367         for (tok = strtok_r(trace_set, ",", &tok_save);
368              tok;
369                  tok = strtok_r(NULL, ",", &tok_save)) {
370
371                 if (tok[0] == '!') {
372                         clear = TRUE;
373                         tok++;
374                 }
375                 if (handle_trace_set(tok, clear))
376                         continue;
377                 if (handle_raw_syscall(tok, clear))
378                         continue;
379                 /* You could imaging continuing, but this error would probably be
380                  * missed in the output stream and we'd be wondering why we weren't
381                  * getting a syscall, due to a typo. */
382                 fprintf(stderr, "Unknown trace_set argument %s, aborting!\n",
383                         tok);
384                 exit(-1);
385         }
386         if (opts.verbose) {
387                 for (int i = 0; i < MAX_SYSCALL_NR; i++) {
388                         if (GET_BITMASK_BIT(traceset_bm, i))
389                                 fprintf(stderr, "# Tracing syscall %s (%d)\n",
390                                         __syscall_tbl[i], i);
391                 }
392         }
393 }
394
395 static void parse_traces(int fd)
396 {
397         char *line, *_line;
398         ssize_t ret;
399
400         line = malloc(SYSTR_BUF_SZ);
401         assert(line);
402
403         while ((ret = read(fd, line, SYSTR_BUF_SZ)) > 0) {
404                 /* make sure each line ends in \n\0. */
405                 line[ret - 1] = '\n';
406                 line[MIN(ret, SYSTR_BUF_SZ - 1)] = 0;
407                 _line = line;
408                 if (opts.raw_output) {
409                         fprintf(opts.outfile, "%s", _line);
410                         continue;
411                 }
412                 if (!opts.with_time) {
413                         /* move starting E or X marker */
414                         _line[40] = _line[0];
415                         /* skip over the timestamp */
416                         _line = &_line[40];
417                 }
418                 fprintf(opts.outfile, "%s", _line);
419         }
420         /* This is a little hokey.  If the process exited, then the qio hung up and
421          * we got a status message from the kernel.  This was the errstr of the last
422          * failed read.  However, if we're doing a -p and someone kills *us*, we'll
423          * never see this.  And catching the signal doesn't help either.  The
424          * process needs to exit (strace_shutdown).  Either that, or change the
425          * kernel to set_errstr() on close(), and coordinate with a sighandler. */
426         if (opts.verbose)
427                 fprintf(stderr, "%r\n");
428         free(line);
429 }
430
431 int main(int argc, char **argv, char **envp)
432 {
433         int fd;
434         pid_t pid;
435         static char path[2 * MAX_PATH_LEN];
436         struct syscall sysc;
437         struct argp argp = {argp_opts, parse_strace_opt, args_doc, doc};
438
439         argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, &opts);
440
441         build_ignore_list(opts.trace_set);
442
443         if (opts.cmd_argc) {
444                 pid = create_child_with_stdfds(opts.cmd_argv[0], opts.cmd_argc,
445                                                opts.cmd_argv, envp);
446                 if (pid < 0) {
447                         perror("Unable to spawn child");
448                         exit(-1);
449                 }
450                 /* We need to wait on the child asynchronously.  If we hold a ref (as
451                  * the parent), the child won't proc_free and that won't hangup/wake us
452                  * from a read. */
453                 syscall_async(&sysc, SYS_waitpid, pid, NULL, 0, 0, 0, 0);
454         } else {
455                 pid = opts.pid;
456         }
457
458         snprintf(path, sizeof(path), "/proc/%d/ctl", pid);
459         fd = open(path, O_WRITE);
460         if (fd < 0) {
461                 fprintf(stderr, "open %s: %r\n", path);
462                 exit(1);
463         }
464         if (opts.follow_children)
465                 snprintf(path, sizeof(path), "straceall");
466         else
467                 snprintf(path, sizeof(path), "straceme");
468         if (write(fd, path, strlen(path)) < strlen(path)) {
469                 fprintf(stderr, "write to ctl %s: %r\n", path);
470                 exit(1);
471         }
472         if (opts.drop_overflow) {
473                 snprintf(path, sizeof(path), "strace_drop on");
474                 if (write(fd, path, strlen(path)) < strlen(path)) {
475                         fprintf(stderr, "write to ctl %s: %r\n", path);
476                         exit(1);
477                 }
478         }
479         close(fd);
480
481         if (opts.trace_set) {
482                 snprintf(path, sizeof(path), "/proc/%d/strace_traceset", pid);
483                 fd = open(path, O_RDWR);
484                 if (fd < 0) {
485                         fprintf(stderr, "open %s: %r\n", path);
486                         exit(1);
487                 }
488                 if (write(fd, traceset_bm, COUNT_OF(traceset_bm))
489                     < COUNT_OF(traceset_bm)) {
490                         fprintf(stderr, "write to strace_ignore: %r\n");
491                         exit(1);
492                 }
493                 close(fd);
494         }
495
496         snprintf(path, sizeof(path), "/proc/%d/strace", pid);
497         fd = open(path, O_READ);
498         if (!fd) {
499                 fprintf(stderr, "open %s: %r\n", path);
500                 exit(1);
501         }
502
503         if (opts.cmd_argc) {
504                 /* now that we've set up the tracing, we can run the process.  isn't it
505                  * great that the process doesn't immediately start when you make it? */
506                 sys_proc_run(pid);
507         }
508
509         parse_traces(fd);
510         return 0;
511 }