strace: Fix issues with a few syscalls
[akaros.git] / tests / strace.c
1 /* Copyright (c) 2016-2017 Google Inc., All Rights Reserved.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * Ron Minnich <rminnich@google.com>
4  * See LICENSE for details.
5  *
6  * TODO:
7  * - Do per-syscall output formatting, like interpreting arguments
8  */
9
10 #include <stdlib.h>
11 #include <stdio.h>
12 #include <unistd.h>
13 #include <fcntl.h>
14 #include <argp.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <sys/param.h>
18 #include <parlib/parlib.h>
19 #include <parlib/bitmask.h>
20
21 struct strace_opts {
22         FILE                                            *outfile;
23         char                                            *trace_set;
24         char                                            **cmd_argv;
25         int                                                     cmd_argc;
26         int                                                     pid;
27         bool                                            follow_children;
28         bool                                            verbose;
29         bool                                            raw_output;
30         bool                                            with_time;
31         bool                                            drop_overflow;
32 };
33 static struct strace_opts opts;
34
35 static const char *doc = "strace -- trace syscalls of a process";
36 static const char *args_doc = "-p PID\nPROGRAM [ARGS]\n";
37
38 static struct argp_option argp_opts[] = {
39         {"output", 'o', "FILE", 0, "Print output to file (default stderr)"},
40         {"pid", 'p', "PID", 0, "Process to attach to"},
41         {"follow", 'f', 0, 0, "Trace children"},
42         {"verbose", 'v', 0, 0, "Print extra info, e.g. syscalls we're tracing"},
43         {0, 0, 0, 0, ""},
44         {"traceset", 'e', "TRACE_SET", 0,
45          "Comma-separated list of syscalls by name (e.g. openat) and sets to trace.  Use '!' to negate (might need to escape the '!'), and traces are handled in order (e.g. -e path,\\!openat)\n"
46          },
47         {"    Available sets:", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
48                         "\n"
49                         "- path: syscalls that take file paths\n"
50                         "- fd: syscalls that take FDs\n"
51                         "- file: path and fd sets\n"
52                         "- mem: memory related (mmap, shared mem)\n"
53                         "- life: process lifetime (create, fork)\n"
54                         "- proc: anything process related (yields, pop_ctxs, life)\n"
55                         "- sched: requests or yields to the kernel (all resources)\n"
56                         "- vmm: syscalls mostly for VMs\n"
57         },
58         {0, 0, 0, 0, ""},
59         {"drop", 'd', 0, 0, "Drop syscalls on overflow"},
60         {"raw", 'r', 0, 0, "Raw, untranslated output, with timestamps"},
61         {"time", 't', 0, 0, "Print timestamps"},
62         {0, 'h', 0, OPTION_HIDDEN, 0},
63         { 0 }
64 };
65
66 struct trace_set {
67         char                                            *name;
68         unsigned int                            syscs[];
69 };
70
71 /* To add a trace set, create one here, add it to all_trace_sets, and update the
72  * help field in argp_opts. */
73
74 /* If you change this, update 'file' below. */
75 static struct trace_set path_trace_set = { "path",
76         {SYS_proc_create,
77          SYS_exec,
78          SYS_openat,
79          SYS_stat,
80          SYS_lstat,
81          SYS_access,
82          SYS_link,
83          SYS_unlink,
84          SYS_symlink,
85          SYS_readlink,
86          SYS_chdir,
87          SYS_getcwd,
88          SYS_mkdir,
89          SYS_rmdir,
90          SYS_nbind,
91          SYS_nmount,
92          SYS_nunmount,
93          SYS_wstat,
94          SYS_rename,
95          0}
96 };
97
98 /* If you change this, update 'file' below.
99  *
100  * Technically tcgetattr/tcsetattr are FDs, but it's mostly noise.  This also
101  * tracks openat, since that's the source for all FDs */
102 static struct trace_set fd_trace_set = { "fd",
103         {SYS_openat,
104          SYS_mmap,
105          SYS_read,
106          SYS_write,
107          SYS_openat,
108          SYS_close,
109          SYS_fstat,
110          SYS_fcntl,
111          SYS_llseek,
112          SYS_fchdir,
113          SYS_nmount,
114          SYS_fd2path,
115          SYS_fwstat,
116          SYS_dup_fds_to,
117          SYS_tap_fds,
118          SYS_abort_sysc_fd,
119          0}
120 };
121
122 /* This is the manually-created contents of 'path' and 'fd' */
123 static struct trace_set file_trace_set = { "file",
124         {/* From 'path' */
125          SYS_proc_create,
126          SYS_exec,
127          SYS_openat,
128          SYS_stat,
129          SYS_lstat,
130          SYS_access,
131          SYS_link,
132          SYS_unlink,
133          SYS_symlink,
134          SYS_readlink,
135          SYS_chdir,
136          SYS_getcwd,
137          SYS_mkdir,
138          SYS_rmdir,
139          SYS_nbind,
140          SYS_nmount,
141          SYS_nunmount,
142          SYS_wstat,
143          SYS_rename,
144          SYS_openat,
145          SYS_mmap,
146          SYS_read,
147          SYS_write,
148
149          /* From 'fd' */
150          SYS_openat,
151          SYS_close,
152          SYS_fstat,
153          SYS_fcntl,
154          SYS_llseek,
155          SYS_fchdir,
156          SYS_nmount,
157          SYS_fd2path,
158          SYS_fwstat,
159          SYS_dup_fds_to,
160          SYS_tap_fds,
161          SYS_abort_sysc_fd,
162          0}
163 };
164
165 static struct trace_set mem_trace_set = { "mem",
166         {SYS_mmap,
167          SYS_mprotect,
168          SYS_munmap,
169          SYS_shared_page_alloc,
170          SYS_shared_page_free,
171          SYS_populate_va,
172          0}
173 };
174
175 /* These are all in 'proc'; keep them in sync. */
176 static struct trace_set life_trace_set = { "life",
177         {SYS_proc_create,
178          SYS_proc_run,
179          SYS_proc_destroy,
180          SYS_fork,
181          SYS_exec,
182          SYS_waitpid,
183          SYS_change_to_m,
184          0}
185 };
186
187 /* There's some misc stuff lumped in here.  block/nanosleep are signs of threads
188  * sleeping on something, which is usually useful.  aborting syscs are also
189  * often signs of control flow. */
190 static struct trace_set proc_trace_set = { "proc",
191         {SYS_block,
192          SYS_nanosleep,
193          SYS_getpcoreid,
194          SYS_getvcoreid,
195          SYS_proc_yield,
196          SYS_change_vcore,
197          SYS_notify,
198          SYS_self_notify,
199          SYS_send_event,
200          SYS_vc_entry,
201          SYS_halt_core,
202          SYS_pop_ctx,
203          SYS_abort_sysc,
204          SYS_abort_sysc_fd,
205
206          /* From 'life' */
207          SYS_proc_create,
208          SYS_proc_run,
209          SYS_proc_destroy,
210          SYS_fork,
211          SYS_exec,
212          SYS_waitpid,
213          SYS_change_to_m,
214          0}
215 };
216
217 static struct trace_set sched_trace_set = { "sched",
218         {SYS_provision,
219          SYS_proc_yield,
220          SYS_poke_ksched,
221          0}
222 };
223
224 static struct trace_set vmm_trace_set = { "vmm",
225         {SYS_vmm_add_gpcs,
226          SYS_vmm_poke_guest,
227          SYS_vmm_ctl,
228          SYS_pop_ctx,
229          0}
230 };
231
232 static struct trace_set *all_trace_sets[] = {
233         &path_trace_set,
234         &fd_trace_set,
235         &file_trace_set,
236         &mem_trace_set,
237         &life_trace_set,
238         &proc_trace_set,
239         &sched_trace_set,
240         &vmm_trace_set,
241 };
242
243 static DECL_BITMASK(traceset_bm, MAX_SYSCALL_NR);
244
245
246 static error_t parse_strace_opt(int key, char *arg, struct argp_state *state)
247 {
248         struct strace_opts *s_opts = state->input;
249
250         switch (key) {
251         case 'o':
252                 s_opts->outfile = fopen(arg, "wb");
253                 if (!s_opts->outfile) {
254                         fprintf(stderr, "Unable to open file '%s' for writing: %s\n",
255                                         arg, strerror(errno));
256                         exit(1);
257                 }
258                 break;
259         case 'e':
260                 s_opts->trace_set = arg;
261                 break;
262         case 'p':
263                 s_opts->pid = atoi(arg);
264                 if (!s_opts->pid)
265                         argp_error(state, "Cannot trace pid 0 (won't exist)");
266                 break;
267         case 'f':
268                 s_opts->follow_children = TRUE;
269                 break;
270         case 'v':
271                 s_opts->verbose = TRUE;
272                 break;
273         case 'r':
274                 s_opts->raw_output = TRUE;
275                 break;
276         case 't':
277                 s_opts->with_time = TRUE;
278                 break;
279         case 'd':
280                 s_opts->drop_overflow = TRUE;
281                 break;
282         case ARGP_KEY_ARG:
283                 if (s_opts->pid)
284                         argp_error(state, "PID already set, can't launch a process too");
285                 s_opts->cmd_argc = state->argc - state->next + 1;
286                 s_opts->cmd_argv = malloc(sizeof(char*) * (s_opts->cmd_argc + 1));
287                 assert(s_opts->cmd_argv);
288                 s_opts->cmd_argv[0] = arg;
289                 memcpy(&s_opts->cmd_argv[1], &state->argv[state->next],
290                        sizeof(char*) * (s_opts->cmd_argc - 1));
291                 s_opts->cmd_argv[s_opts->cmd_argc] = NULL;
292                 state->next = state->argc;
293                 break;
294         case ARGP_KEY_END:
295                 if (!(s_opts->cmd_argc || s_opts->pid))
296                         argp_error(state, "Need either -p or a command to run");
297                 /* Note we never fclose outfile.  It'll flush when we exit.  o/w, we'll
298                  * need to be careful whether we're closing stderr or not. */
299                 if (!s_opts->outfile)
300                         s_opts->outfile = stderr;
301                 break;
302         case 'h':
303                 argp_usage(state);
304                 break;
305         default:
306                 return ARGP_ERR_UNKNOWN;
307         }
308         return 0;
309 }
310
311 static bool handle_trace_set(char *tok, bool clear)
312 {
313         struct trace_set *ts;
314         unsigned int sysc_nr;
315
316         for (int i = 0; i < COUNT_OF(all_trace_sets); i++) {
317                 ts = all_trace_sets[i];
318                 if (!strcmp(ts->name, tok)) {
319                         for (int j = 0; j < MAX_SYSCALL_NR; j++) {
320                                 sysc_nr = ts->syscs[j];
321                                 /* 0-terminated list */
322                                 if (!sysc_nr)
323                                         break;
324                                 if (clear)
325                                         CLR_BITMASK_BIT(traceset_bm, sysc_nr);
326                                 else
327                                         SET_BITMASK_BIT(traceset_bm, sysc_nr);
328                         }
329                         return TRUE;
330                 }
331         }
332         return FALSE;
333 }
334
335 static char *resolve_syscall_alias(char *tok)
336 {
337         if (!strcmp(tok, "open")) {
338                 tok = "openat";
339                 return tok;
340         }
341         return tok;
342 }
343
344 static bool handle_raw_syscall(char *tok, bool clear)
345 {
346         for (int i = 0; i < __syscall_tbl_sz; i++) {
347                 if (!__syscall_tbl[i])
348                         continue;
349                 tok = resolve_syscall_alias(tok);
350                 if (!strcmp(__syscall_tbl[i], tok)) {
351                         if (clear)
352                                 CLR_BITMASK_BIT(traceset_bm, i);
353                         else
354                                 SET_BITMASK_BIT(traceset_bm, i);
355                         return TRUE;
356                 }
357         }
358         return FALSE;
359 }
360
361 static void build_ignore_list(char *trace_set)
362 {
363         char *tok, *tok_save = 0;
364         bool clear = FALSE;
365
366         if (!trace_set) {
367                 if (opts.verbose)
368                         fprintf(stderr, "# Tracing all syscalls\n");
369                 return;
370         }
371         for (tok = strtok_r(trace_set, ",", &tok_save);
372              tok;
373                  tok = strtok_r(NULL, ",", &tok_save)) {
374
375                 if (tok[0] == '!') {
376                         clear = TRUE;
377                         tok++;
378                 }
379                 if (handle_trace_set(tok, clear))
380                         continue;
381                 if (handle_raw_syscall(tok, clear))
382                         continue;
383                 /* You could imaging continuing, but this error would probably be
384                  * missed in the output stream and we'd be wondering why we weren't
385                  * getting a syscall, due to a typo. */
386                 fprintf(stderr, "Unknown trace_set argument %s, aborting!\n",
387                         tok);
388                 exit(-1);
389         }
390         if (opts.verbose) {
391                 for (int i = 0; i < MAX_SYSCALL_NR; i++) {
392                         if (GET_BITMASK_BIT(traceset_bm, i))
393                                 fprintf(stderr, "# Tracing syscall %s (%d)\n",
394                                         __syscall_tbl[i], i);
395                 }
396         }
397 }
398
399 /* Removes the timestamp part of the line.  Use the return string in place of
400  * the full line you pass in. */
401 static char *remove_timestamps(char *full_line)
402 {
403         char *close_brace;
404
405         /* Format: E [  13655.986589401]-[      0.000000000] Syscall.  The seconds
406          * field may vary in size, so we need to find the second ']'. */
407         close_brace = strchr(full_line, ']');
408         if (!close_brace)
409                 return full_line;
410         close_brace = strchr(close_brace + 1, ']');
411         if (!close_brace)
412                 return full_line;
413         /* move starting E or X marker */
414         *close_brace = full_line[0];
415         return close_brace;
416 }
417
418 static void parse_traces(int fd)
419 {
420         char *line, *_line;
421         ssize_t ret;
422
423         line = malloc(SYSTR_BUF_SZ);
424         assert(line);
425
426         while ((ret = read(fd, line, SYSTR_BUF_SZ)) > 0) {
427                 /* make sure each line ends in \n\0. */
428                 line[ret - 1] = '\n';
429                 line[MIN(ret, SYSTR_BUF_SZ - 1)] = 0;
430                 _line = line;
431                 if (opts.raw_output) {
432                         fprintf(opts.outfile, "%s", _line);
433                         continue;
434                 }
435                 if (!opts.with_time)
436                         _line = remove_timestamps(_line);
437                 fprintf(opts.outfile, "%s", _line);
438         }
439         /* This is a little hokey.  If the process exited, then the qio hung up and
440          * we got a status message from the kernel.  This was the errstr of the last
441          * failed read.  However, if we're doing a -p and someone kills *us*, we'll
442          * never see this.  And catching the signal doesn't help either.  The
443          * process needs to exit (strace_shutdown).  Either that, or change the
444          * kernel to set_errstr() on close(), and coordinate with a sighandler. */
445         if (opts.verbose)
446                 fprintf(stderr, "%r\n");
447         free(line);
448 }
449
450 int main(int argc, char **argv, char **envp)
451 {
452         int fd;
453         pid_t pid;
454         static char path[2 * MAX_PATH_LEN];
455         struct syscall sysc;
456         struct argp argp = {argp_opts, parse_strace_opt, args_doc, doc};
457
458         argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, &opts);
459
460         build_ignore_list(opts.trace_set);
461
462         if (opts.cmd_argc) {
463                 pid = create_child_with_stdfds(opts.cmd_argv[0], opts.cmd_argc,
464                                                opts.cmd_argv, envp);
465                 if (pid < 0) {
466                         perror("Unable to spawn child");
467                         exit(-1);
468                 }
469                 /* We need to wait on the child asynchronously.  If we hold a ref (as
470                  * the parent), the child won't proc_free and that won't hangup/wake us
471                  * from a read. */
472                 syscall_async(&sysc, SYS_waitpid, pid, NULL, 0, 0, 0, 0);
473         } else {
474                 pid = opts.pid;
475         }
476
477         snprintf(path, sizeof(path), "/proc/%d/ctl", pid);
478         fd = open(path, O_WRITE);
479         if (fd < 0) {
480                 fprintf(stderr, "open %s: %r\n", path);
481                 exit(1);
482         }
483         if (opts.follow_children)
484                 snprintf(path, sizeof(path), "straceall");
485         else
486                 snprintf(path, sizeof(path), "straceme");
487         if (write(fd, path, strlen(path)) < strlen(path)) {
488                 fprintf(stderr, "write to ctl %s: %r\n", path);
489                 exit(1);
490         }
491         if (opts.drop_overflow) {
492                 snprintf(path, sizeof(path), "strace_drop on");
493                 if (write(fd, path, strlen(path)) < strlen(path)) {
494                         fprintf(stderr, "write to ctl %s: %r\n", path);
495                         exit(1);
496                 }
497         }
498         close(fd);
499
500         if (opts.trace_set) {
501                 snprintf(path, sizeof(path), "/proc/%d/strace_traceset", pid);
502                 fd = open(path, O_RDWR);
503                 if (fd < 0) {
504                         fprintf(stderr, "open %s: %r\n", path);
505                         exit(1);
506                 }
507                 if (write(fd, traceset_bm, COUNT_OF(traceset_bm))
508                     < COUNT_OF(traceset_bm)) {
509                         fprintf(stderr, "write to strace_ignore: %r\n");
510                         exit(1);
511                 }
512                 close(fd);
513         }
514
515         snprintf(path, sizeof(path), "/proc/%d/strace", pid);
516         fd = open(path, O_READ);
517         if (!fd) {
518                 fprintf(stderr, "open %s: %r\n", path);
519                 exit(1);
520         }
521
522         if (opts.cmd_argc) {
523                 /* now that we've set up the tracing, we can run the process.  isn't it
524                  * great that the process doesn't immediately start when you make it? */
525                 sys_proc_run(pid);
526         }
527
528         parse_traces(fd);
529         return 0;
530 }