cons: disable dangerous conswrites()
[akaros.git] / tests / strace.c
1 /* Copyright (c) 2016-2017 Google Inc., All Rights Reserved.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * Ron Minnich <rminnich@google.com>
4  * See LICENSE for details.
5  *
6  * TODO:
7  * - Do per-syscall output formatting, like interpreting arguments
8  */
9
10 #include <stdlib.h>
11 #include <stdio.h>
12 #include <unistd.h>
13 #include <fcntl.h>
14 #include <argp.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <sys/param.h>
18 #include <parlib/parlib.h>
19 #include <parlib/bitmask.h>
20
21 struct strace_opts {
22         FILE                    *outfile;
23         char                    *trace_set;
24         char                    **cmd_argv;
25         int                     cmd_argc;
26         int                     pid;
27         bool                    follow_children;
28         bool                    verbose;
29         bool                    raw_output;
30         bool                    with_time;
31         bool                    drop_overflow;
32 };
33 static struct strace_opts opts;
34
35 static const char *doc = "strace -- trace syscalls of a process";
36 static const char *args_doc = "-p PID\nPROGRAM [ARGS]\n";
37
38 static struct argp_option argp_opts[] = {
39         {"output", 'o', "FILE", 0, "Print output to file (default stderr)"},
40         {"pid", 'p', "PID", 0, "Process to attach to"},
41         {"follow", 'f', 0, 0, "Trace children"},
42         {"verbose", 'v', 0, 0, "Print extra info, e.g. syscalls we're tracing"},
43         {0, 0, 0, 0, ""},
44         {"traceset", 'e', "TRACE_SET", 0,
45          "Comma-separated list of syscalls by name (e.g. openat) and sets to trace.  Use '!' to negate (might need to escape the '!'), and traces are handled in order (e.g. -e path,\\!openat)\n"
46          },
47         {"    Available sets:", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
48                  "\n"
49                  "- path: syscalls that take file paths\n"
50                  "- fd: syscalls that take FDs\n"
51                  "- file: path and fd sets\n"
52                  "- mem: memory related (mmap, shared mem)\n"
53                  "- life: process lifetime (create, fork)\n"
54                  "- proc: anything process related (yields, pop_ctxs, life)\n"
55                  "- sched: requests or yields to the kernel (all resources)\n"
56                  "- vmm: syscalls mostly for VMs\n"
57         },
58         {0, 0, 0, 0, ""},
59         {"drop", 'd', 0, 0, "Drop syscalls on overflow"},
60         {"raw", 'r', 0, 0, "Raw, untranslated output, with timestamps"},
61         {"time", 't', 0, 0, "Print timestamps"},
62         {0, 'h', 0, OPTION_HIDDEN, 0},
63         { 0 }
64 };
65
66 struct trace_set {
67         char                    *name;
68         unsigned int            syscs[];
69 };
70
71 /* To add a trace set, create one here, add it to all_trace_sets, and update the
72  * help field in argp_opts. */
73
74 /* If you change this, update 'file' below. */
75 static struct trace_set path_trace_set = { "path",
76         {SYS_proc_create,
77          SYS_exec,
78          SYS_openat,
79          SYS_stat,
80          SYS_lstat,
81          SYS_access,
82          SYS_link,
83          SYS_unlink,
84          SYS_symlink,
85          SYS_readlink,
86          SYS_chdir,
87          SYS_getcwd,
88          SYS_mkdir,
89          SYS_rmdir,
90          SYS_nbind,
91          SYS_nmount,
92          SYS_nunmount,
93          SYS_wstat,
94          SYS_rename,
95          0}
96 };
97
98 /* If you change this, update 'file' below.
99  *
100  * Technically tcgetattr/tcsetattr are FDs, but it's mostly noise.  This also
101  * tracks openat, since that's the source for all FDs */
102 static struct trace_set fd_trace_set = { "fd",
103         {SYS_openat,
104          SYS_mmap,
105          SYS_read,
106          SYS_write,
107          SYS_openat,
108          SYS_close,
109          SYS_fstat,
110          SYS_fcntl,
111          SYS_llseek,
112          SYS_fchdir,
113          SYS_nmount,
114          SYS_fd2path,
115          SYS_fwstat,
116          SYS_dup_fds_to,
117          SYS_tap_fds,
118          SYS_abort_sysc_fd,
119          0}
120 };
121
122 /* This is the manually-created contents of 'path' and 'fd' */
123 static struct trace_set file_trace_set = { "file",
124         {/* From 'path' */
125          SYS_proc_create,
126          SYS_exec,
127          SYS_openat,
128          SYS_stat,
129          SYS_lstat,
130          SYS_access,
131          SYS_link,
132          SYS_unlink,
133          SYS_symlink,
134          SYS_readlink,
135          SYS_chdir,
136          SYS_getcwd,
137          SYS_mkdir,
138          SYS_rmdir,
139          SYS_nbind,
140          SYS_nmount,
141          SYS_nunmount,
142          SYS_wstat,
143          SYS_rename,
144          SYS_openat,
145          SYS_mmap,
146          SYS_read,
147          SYS_write,
148
149          /* From 'fd' */
150          SYS_openat,
151          SYS_close,
152          SYS_fstat,
153          SYS_fcntl,
154          SYS_llseek,
155          SYS_fchdir,
156          SYS_nmount,
157          SYS_fd2path,
158          SYS_fwstat,
159          SYS_dup_fds_to,
160          SYS_tap_fds,
161          SYS_abort_sysc_fd,
162          0}
163 };
164
165 static struct trace_set mem_trace_set = { "mem",
166         {SYS_mmap,
167          SYS_mprotect,
168          SYS_munmap,
169          SYS_shared_page_alloc,
170          SYS_shared_page_free,
171          SYS_populate_va,
172          0}
173 };
174
175 /* These are all in 'proc'; keep them in sync. */
176 static struct trace_set life_trace_set = { "life",
177         {SYS_proc_create,
178          SYS_proc_run,
179          SYS_proc_destroy,
180          SYS_fork,
181          SYS_exec,
182          SYS_waitpid,
183          SYS_change_to_m,
184          0}
185 };
186
187 /* There's some misc stuff lumped in here.  block/nanosleep are signs of threads
188  * sleeping on something, which is usually useful.  aborting syscs are also
189  * often signs of control flow. */
190 static struct trace_set proc_trace_set = { "proc",
191         {SYS_block,
192          SYS_nanosleep,
193          SYS_getpcoreid,
194          SYS_getvcoreid,
195          SYS_proc_yield,
196          SYS_change_vcore,
197          SYS_notify,
198          SYS_self_notify,
199          SYS_send_event,
200          SYS_vc_entry,
201          SYS_halt_core,
202          SYS_pop_ctx,
203          SYS_abort_sysc,
204          SYS_abort_sysc_fd,
205
206          /* From 'life' */
207          SYS_proc_create,
208          SYS_proc_run,
209          SYS_proc_destroy,
210          SYS_fork,
211          SYS_exec,
212          SYS_waitpid,
213          SYS_change_to_m,
214          0}
215 };
216
217 static struct trace_set sched_trace_set = { "sched",
218         {SYS_provision,
219          SYS_proc_yield,
220          SYS_poke_ksched,
221          0}
222 };
223
224 static struct trace_set vmm_trace_set = { "vmm",
225         {SYS_vmm_add_gpcs,
226          SYS_vmm_poke_guest,
227          SYS_vmm_ctl,
228          SYS_pop_ctx,
229          0}
230 };
231
232 static struct trace_set *all_trace_sets[] = {
233         &path_trace_set,
234         &fd_trace_set,
235         &file_trace_set,
236         &mem_trace_set,
237         &life_trace_set,
238         &proc_trace_set,
239         &sched_trace_set,
240         &vmm_trace_set,
241 };
242
243 static DECL_BITMASK(traceset_bm, MAX_SYSCALL_NR);
244
245
246 static error_t parse_strace_opt(int key, char *arg, struct argp_state *state)
247 {
248         struct strace_opts *s_opts = state->input;
249
250         switch (key) {
251         case 'o':
252                 s_opts->outfile = fopen(arg, "wb");
253                 if (!s_opts->outfile) {
254                         fprintf(stderr,
255                                 "Unable to open file '%s' for writing: %s\n",
256                                 arg, strerror(errno));
257                         exit(1);
258                 }
259                 break;
260         case 'e':
261                 s_opts->trace_set = arg;
262                 break;
263         case 'p':
264                 s_opts->pid = atoi(arg);
265                 if (!s_opts->pid)
266                         argp_error(state, "Cannot trace pid 0 (won't exist)");
267                 break;
268         case 'f':
269                 s_opts->follow_children = TRUE;
270                 break;
271         case 'v':
272                 s_opts->verbose = TRUE;
273                 break;
274         case 'r':
275                 s_opts->raw_output = TRUE;
276                 break;
277         case 't':
278                 s_opts->with_time = TRUE;
279                 break;
280         case 'd':
281                 s_opts->drop_overflow = TRUE;
282                 break;
283         case ARGP_KEY_ARG:
284                 if (s_opts->pid)
285                         argp_error(state, "PID already set, can't launch a process too");
286                 s_opts->cmd_argc = state->argc - state->next + 1;
287                 s_opts->cmd_argv = malloc(sizeof(char*) * (s_opts->cmd_argc
288                                                            + 1));
289                 assert(s_opts->cmd_argv);
290                 s_opts->cmd_argv[0] = arg;
291                 memcpy(&s_opts->cmd_argv[1], &state->argv[state->next],
292                        sizeof(char*) * (s_opts->cmd_argc - 1));
293                 s_opts->cmd_argv[s_opts->cmd_argc] = NULL;
294                 state->next = state->argc;
295                 break;
296         case ARGP_KEY_END:
297                 if (!(s_opts->cmd_argc || s_opts->pid))
298                         argp_error(state, "Need either -p or a command to run");
299                 /* Note we never fclose outfile.  It'll flush when we exit.
300                  * o/w, we'll need to be careful whether we're closing stderr or
301                  * not. */
302                 if (!s_opts->outfile)
303                         s_opts->outfile = stderr;
304                 break;
305         case 'h':
306                 argp_usage(state);
307                 break;
308         default:
309                 return ARGP_ERR_UNKNOWN;
310         }
311         return 0;
312 }
313
314 static bool handle_trace_set(char *tok, bool clear)
315 {
316         struct trace_set *ts;
317         unsigned int sysc_nr;
318
319         for (int i = 0; i < COUNT_OF(all_trace_sets); i++) {
320                 ts = all_trace_sets[i];
321                 if (!strcmp(ts->name, tok)) {
322                         for (int j = 0; j < MAX_SYSCALL_NR; j++) {
323                                 sysc_nr = ts->syscs[j];
324                                 /* 0-terminated list */
325                                 if (!sysc_nr)
326                                         break;
327                                 if (clear)
328                                         CLR_BITMASK_BIT(traceset_bm, sysc_nr);
329                                 else
330                                         SET_BITMASK_BIT(traceset_bm, sysc_nr);
331                         }
332                         return TRUE;
333                 }
334         }
335         return FALSE;
336 }
337
338 static char *resolve_syscall_alias(char *tok)
339 {
340         if (!strcmp(tok, "open")) {
341                 tok = "openat";
342                 return tok;
343         }
344         return tok;
345 }
346
347 static bool handle_raw_syscall(char *tok, bool clear)
348 {
349         for (int i = 0; i < __syscall_tbl_sz; i++) {
350                 if (!__syscall_tbl[i])
351                         continue;
352                 tok = resolve_syscall_alias(tok);
353                 if (!strcmp(__syscall_tbl[i], tok)) {
354                         if (clear)
355                                 CLR_BITMASK_BIT(traceset_bm, i);
356                         else
357                                 SET_BITMASK_BIT(traceset_bm, i);
358                         return TRUE;
359                 }
360         }
361         return FALSE;
362 }
363
364 static void build_ignore_list(char *trace_set)
365 {
366         char *tok, *tok_save = 0;
367         bool clear = FALSE;
368
369         if (!trace_set) {
370                 if (opts.verbose)
371                         fprintf(stderr, "# Tracing all syscalls\n");
372                 return;
373         }
374         for (tok = strtok_r(trace_set, ",", &tok_save);
375              tok;
376                  tok = strtok_r(NULL, ",", &tok_save)) {
377
378                 if (tok[0] == '!') {
379                         clear = TRUE;
380                         tok++;
381                 }
382                 if (handle_trace_set(tok, clear))
383                         continue;
384                 if (handle_raw_syscall(tok, clear))
385                         continue;
386                 /* You could imaging continuing, but this error would probably
387                  * be missed in the output stream and we'd be wondering why we
388                  * weren't getting a syscall, due to a typo. */
389                 fprintf(stderr, "Unknown trace_set argument %s, aborting!\n",
390                         tok);
391                 exit(-1);
392         }
393         if (opts.verbose) {
394                 for (int i = 0; i < MAX_SYSCALL_NR; i++) {
395                         if (GET_BITMASK_BIT(traceset_bm, i))
396                                 fprintf(stderr, "# Tracing syscall %s (%d)\n",
397                                         __syscall_tbl[i], i);
398                 }
399         }
400 }
401
402 /* Removes the timestamp part of the line.  Use the return string in place of
403  * the full line you pass in. */
404 static char *remove_timestamps(char *full_line)
405 {
406         char *close_brace;
407
408         /* Format: E [  13655.986589401]-[      0.000000000] Syscall. 
409          * The seconds field may vary in size, so we need to find the second
410          * ']'. */
411         close_brace = strchr(full_line, ']');
412         if (!close_brace)
413                 return full_line;
414         close_brace = strchr(close_brace + 1, ']');
415         if (!close_brace)
416                 return full_line;
417         /* move starting E or X marker */
418         *close_brace = full_line[0];
419         return close_brace;
420 }
421
422 static void parse_traces(int fd)
423 {
424         char *line, *_line;
425         ssize_t ret;
426
427         line = malloc(SYSTR_BUF_SZ);
428         assert(line);
429
430         while ((ret = read(fd, line, SYSTR_BUF_SZ)) > 0) {
431                 /* make sure each line ends in \n\0. */
432                 line[ret - 1] = '\n';
433                 line[MIN(ret, SYSTR_BUF_SZ - 1)] = 0;
434                 _line = line;
435                 if (opts.raw_output) {
436                         fprintf(opts.outfile, "%s", _line);
437                         continue;
438                 }
439                 if (!opts.with_time)
440                         _line = remove_timestamps(_line);
441                 fprintf(opts.outfile, "%s", _line);
442         }
443         /* This is a little hokey.  If the process exited, then the qio hung up
444          * and we got a status message from the kernel.  This was the errstr of
445          * the last failed read.  However, if we're doing a -p and someone kills
446          * *us*, we'll never see this.  And catching the signal doesn't help
447          * either.  The process needs to exit (strace_shutdown).  Either that,
448          * or change the kernel to set_errstr() on close(), and coordinate with
449          * a sighandler. */
450         if (opts.verbose)
451                 fprintf(stderr, "%r\n");
452         free(line);
453 }
454
455 int main(int argc, char **argv, char **envp)
456 {
457         int fd;
458         pid_t pid;
459         static char path[2 * MAX_PATH_LEN];
460         struct syscall sysc;
461         struct argp argp = {argp_opts, parse_strace_opt, args_doc, doc};
462
463         argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, &opts);
464
465         build_ignore_list(opts.trace_set);
466
467         if (opts.cmd_argc) {
468                 pid = create_child_with_stdfds(opts.cmd_argv[0], opts.cmd_argc,
469                                                opts.cmd_argv, envp);
470                 if (pid < 0) {
471                         perror("Unable to spawn child");
472                         exit(-1);
473                 }
474                 /* We need to wait on the child asynchronously.  If we hold a
475                  * ref (as the parent), the child won't proc_free and that won't
476                  * hangup/wake us from a read. */
477                 syscall_async(&sysc, SYS_waitpid, pid, NULL, 0, 0, 0, 0);
478         } else {
479                 pid = opts.pid;
480         }
481
482         snprintf(path, sizeof(path), "/proc/%d/ctl", pid);
483         fd = open(path, O_WRITE);
484         if (fd < 0) {
485                 fprintf(stderr, "open %s: %r\n", path);
486                 exit(1);
487         }
488         if (opts.follow_children)
489                 snprintf(path, sizeof(path), "straceall");
490         else
491                 snprintf(path, sizeof(path), "straceme");
492         if (write(fd, path, strlen(path)) < strlen(path)) {
493                 fprintf(stderr, "write to ctl %s: %r\n", path);
494                 exit(1);
495         }
496         if (opts.drop_overflow) {
497                 snprintf(path, sizeof(path), "strace_drop on");
498                 if (write(fd, path, strlen(path)) < strlen(path)) {
499                         fprintf(stderr, "write to ctl %s: %r\n", path);
500                         exit(1);
501                 }
502         }
503         close(fd);
504
505         if (opts.trace_set) {
506                 snprintf(path, sizeof(path), "/proc/%d/strace_traceset", pid);
507                 fd = open(path, O_RDWR);
508                 if (fd < 0) {
509                         fprintf(stderr, "open %s: %r\n", path);
510                         exit(1);
511                 }
512                 if (write(fd, traceset_bm, COUNT_OF(traceset_bm))
513                     < COUNT_OF(traceset_bm)) {
514                         fprintf(stderr, "write to strace_ignore: %r\n");
515                         exit(1);
516                 }
517                 close(fd);
518         }
519
520         snprintf(path, sizeof(path), "/proc/%d/strace", pid);
521         fd = open(path, O_READ);
522         if (!fd) {
523                 fprintf(stderr, "open %s: %r\n", path);
524                 exit(1);
525         }
526
527         if (opts.cmd_argc) {
528                 /* now that we've set up the tracing, we can run the process.
529                  * isn't it great that the process doesn't immediately start
530                  * when you make it? */
531                 sys_proc_run(pid);
532         }
533
534         parse_traces(fd);
535         return 0;
536 }