vmm: Allow VMMs to change vmexit conditions (XCC)
[akaros.git] / tests / strace.c
1 /* Copyright (c) 2016-2017 Google Inc., All Rights Reserved.
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * Ron Minnich <rminnich@google.com>
4  * See LICENSE for details.
5  *
6  * TODO:
7  * - Do per-syscall output formatting, like interpreting arguments
8  */
9
10 #include <stdlib.h>
11 #include <stdio.h>
12 #include <unistd.h>
13 #include <fcntl.h>
14 #include <argp.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <sys/param.h>
18 #include <parlib/parlib.h>
19 #include <parlib/bitmask.h>
20
21 struct strace_opts {
22         FILE                                            *outfile;
23         char                                            *trace_set;
24         char                                            **cmd_argv;
25         int                                                     cmd_argc;
26         int                                                     pid;
27         bool                                            follow_children;
28         bool                                            verbose;
29         bool                                            raw_output;
30         bool                                            with_time;
31         bool                                            drop_overflow;
32 };
33 static struct strace_opts opts;
34
35 static const char *doc = "strace -- trace syscalls of a process";
36 static const char *args_doc = "-p PID\nPROGRAM [ARGS]\n";
37
38 static struct argp_option argp_opts[] = {
39         {"output", 'o', "FILE", 0, "Print output to file (default stderr)"},
40         {"pid", 'p', "PID", 0, "Process to attach to"},
41         {"follow", 'f', 0, 0, "Trace children"},
42         {"verbose", 'v', 0, 0, "Print extra info, e.g. syscalls we're tracing"},
43         {0, 0, 0, 0, ""},
44         {"traceset", 'e', "TRACE_SET", 0,
45          "Comma-separated list of syscalls by name (e.g. openat) and sets to trace.  Use '!' to negate (might need to escape the '!'), and traces are handled in order (e.g. -e path,\\!openat)\n"
46          },
47         {"    Available sets:", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
48                         "\n"
49                         "- path: syscalls that take file paths\n"
50                         "- fd: syscalls that take FDs\n"
51                         "- file: path and fd sets\n"
52                         "- mem: memory related (mmap, shared mem)\n"
53                         "- life: process lifetime (create, fork)\n"
54                         "- proc: anything process related (yields, pop_ctxs, life)\n"
55                         "- sched: requests or yields to the kernel (all resources)\n"
56                         "- vmm: syscalls mostly for VMs\n"
57         },
58         {0, 0, 0, 0, ""},
59         {"drop", 'd', 0, 0, "Drop syscalls on overflow"},
60         {"raw", 'r', 0, 0, "Raw, untranslated output, with timestamps"},
61         {"time", 't', 0, 0, "Print timestamps"},
62         {0, 'h', 0, OPTION_HIDDEN, 0},
63         { 0 }
64 };
65
66 struct trace_set {
67         char                                            *name;
68         unsigned int                            syscs[];
69 };
70
71 /* To add a trace set, create one here, add it to all_trace_sets, and update the
72  * help field in argp_opts. */
73
74 /* If you change this, update 'file' below. */
75 static struct trace_set path_trace_set = { "path",
76         {SYS_proc_create,
77          SYS_exec,
78          SYS_openat,
79          SYS_stat,
80          SYS_lstat,
81          SYS_access,
82          SYS_link,
83          SYS_unlink,
84          SYS_symlink,
85          SYS_readlink,
86          SYS_chdir,
87          SYS_mkdir,
88          SYS_rmdir,
89          SYS_nbind,
90          SYS_nmount,
91          SYS_nunmount,
92          SYS_wstat,
93          SYS_rename,
94          0}
95 };
96
97 /* If you change this, update 'file' below.
98  *
99  * Technically tcgetattr/tcsetattr are FDs, but it's mostly noise.  This also
100  * tracks openat, since that's the source for all FDs */
101 static struct trace_set fd_trace_set = { "fd",
102         {SYS_openat,
103          SYS_mmap,
104          SYS_read,
105          SYS_write,
106          SYS_openat,
107          SYS_close,
108          SYS_fstat,
109          SYS_fcntl,
110          SYS_llseek,
111          SYS_fchdir,
112          SYS_nmount,
113          SYS_fd2path,
114          SYS_fwstat,
115          SYS_dup_fds_to,
116          SYS_tap_fds,
117          SYS_abort_sysc_fd,
118          0}
119 };
120
121 /* This is the manually-created contents of 'path' and 'fd' */
122 static struct trace_set file_trace_set = { "file",
123         {/* From 'path' */
124          SYS_proc_create,
125          SYS_exec,
126          SYS_openat,
127          SYS_stat,
128          SYS_lstat,
129          SYS_access,
130          SYS_link,
131          SYS_unlink,
132          SYS_symlink,
133          SYS_readlink,
134          SYS_chdir,
135          SYS_mkdir,
136          SYS_rmdir,
137          SYS_nbind,
138          SYS_nmount,
139          SYS_nunmount,
140          SYS_wstat,
141          SYS_rename,
142          SYS_openat,
143          SYS_mmap,
144          SYS_read,
145          SYS_write,
146
147          /* From 'fd' */
148          SYS_openat,
149          SYS_close,
150          SYS_fstat,
151          SYS_fcntl,
152          SYS_llseek,
153          SYS_fchdir,
154          SYS_nmount,
155          SYS_fd2path,
156          SYS_fwstat,
157          SYS_dup_fds_to,
158          SYS_tap_fds,
159          SYS_abort_sysc_fd,
160          0}
161 };
162
163 static struct trace_set mem_trace_set = { "mem",
164         {SYS_mmap,
165          SYS_mprotect,
166          SYS_munmap,
167          SYS_shared_page_alloc,
168          SYS_shared_page_free,
169          SYS_populate_va,
170          0}
171 };
172
173 /* These are all in 'proc'; keep them in sync. */
174 static struct trace_set life_trace_set = { "life",
175         {SYS_proc_create,
176          SYS_proc_run,
177          SYS_proc_destroy,
178          SYS_fork,
179          SYS_exec,
180          SYS_waitpid,
181          SYS_change_to_m,
182          0}
183 };
184
185 /* There's some misc stuff lumped in here.  block/nanosleep are signs of threads
186  * sleeping on something, which is usually useful.  aborting syscs are also
187  * often signs of control flow. */
188 static struct trace_set proc_trace_set = { "proc",
189         {SYS_block,
190          SYS_nanosleep,
191          SYS_getpcoreid,
192          SYS_getvcoreid,
193          SYS_proc_yield,
194          SYS_change_vcore,
195          SYS_notify,
196          SYS_self_notify,
197          SYS_send_event,
198          SYS_vc_entry,
199          SYS_halt_core,
200          SYS_pop_ctx,
201          SYS_abort_sysc,
202          SYS_abort_sysc_fd,
203
204          /* From 'life' */
205          SYS_proc_create,
206          SYS_proc_run,
207          SYS_proc_destroy,
208          SYS_fork,
209          SYS_exec,
210          SYS_waitpid,
211          SYS_change_to_m,
212          0}
213 };
214
215 static struct trace_set sched_trace_set = { "sched",
216         {SYS_provision,
217          SYS_proc_yield,
218          SYS_poke_ksched,
219          0}
220 };
221
222 static struct trace_set vmm_trace_set = { "vmm",
223         {SYS_vmm_setup,
224          SYS_vmm_poke_guest,
225          SYS_vmm_ctl,
226          SYS_pop_ctx,
227          0}
228 };
229
230 static struct trace_set *all_trace_sets[] = {
231         &path_trace_set,
232         &fd_trace_set,
233         &file_trace_set,
234         &mem_trace_set,
235         &life_trace_set,
236         &proc_trace_set,
237         &sched_trace_set,
238         &vmm_trace_set,
239 };
240
241 static DECL_BITMASK(traceset_bm, MAX_SYSCALL_NR);
242
243
244 static error_t parse_strace_opt(int key, char *arg, struct argp_state *state)
245 {
246         struct strace_opts *s_opts = state->input;
247
248         switch (key) {
249         case 'o':
250                 s_opts->outfile = fopen(arg, "wb");
251                 if (!s_opts->outfile) {
252                         fprintf(stderr, "Unable to open file '%s' for writing: %s\n",
253                                         arg, strerror(errno));
254                         exit(1);
255                 }
256                 break;
257         case 'e':
258                 s_opts->trace_set = arg;
259                 break;
260         case 'p':
261                 s_opts->pid = atoi(arg);
262                 if (!s_opts->pid)
263                         argp_error(state, "Cannot trace pid 0 (won't exist)");
264                 break;
265         case 'f':
266                 s_opts->follow_children = TRUE;
267                 break;
268         case 'v':
269                 s_opts->verbose = TRUE;
270                 break;
271         case 'r':
272                 s_opts->raw_output = TRUE;
273                 break;
274         case 't':
275                 s_opts->with_time = TRUE;
276                 break;
277         case 'd':
278                 s_opts->drop_overflow = TRUE;
279                 break;
280         case ARGP_KEY_ARG:
281                 if (s_opts->pid)
282                         argp_error(state, "PID already set, can't launch a process too");
283                 s_opts->cmd_argc = state->argc - state->next + 1;
284                 s_opts->cmd_argv = malloc(sizeof(char*) * (s_opts->cmd_argc + 1));
285                 assert(s_opts->cmd_argv);
286                 s_opts->cmd_argv[0] = arg;
287                 memcpy(&s_opts->cmd_argv[1], &state->argv[state->next],
288                        sizeof(char*) * (s_opts->cmd_argc - 1));
289                 s_opts->cmd_argv[s_opts->cmd_argc] = NULL;
290                 state->next = state->argc;
291                 break;
292         case ARGP_KEY_END:
293                 if (!(s_opts->cmd_argc || s_opts->pid))
294                         argp_error(state, "Need either -p or a command to run");
295                 /* Note we never fclose outfile.  It'll flush when we exit.  o/w, we'll
296                  * need to be careful whether we're closing stderr or not. */
297                 if (!s_opts->outfile)
298                         s_opts->outfile = stderr;
299                 break;
300         case 'h':
301                 argp_usage(state);
302                 break;
303         default:
304                 return ARGP_ERR_UNKNOWN;
305         }
306         return 0;
307 }
308
309 static bool handle_trace_set(char *tok, bool clear)
310 {
311         struct trace_set *ts;
312         unsigned int sysc_nr;
313
314         for (int i = 0; i < COUNT_OF(all_trace_sets); i++) {
315                 ts = all_trace_sets[i];
316                 if (!strcmp(ts->name, tok)) {
317                         for (int j = 0; j < MAX_SYSCALL_NR; j++) {
318                                 sysc_nr = ts->syscs[j];
319                                 /* 0-terminated list */
320                                 if (!sysc_nr)
321                                         break;
322                                 if (clear)
323                                         CLR_BITMASK_BIT(traceset_bm, sysc_nr);
324                                 else
325                                         SET_BITMASK_BIT(traceset_bm, sysc_nr);
326                         }
327                         return TRUE;
328                 }
329         }
330         return FALSE;
331 }
332
333 static char *resolve_syscall_alias(char *tok)
334 {
335         if (!strcmp(tok, "open")) {
336                 tok = "openat";
337                 return tok;
338         }
339         return tok;
340 }
341
342 static bool handle_raw_syscall(char *tok, bool clear)
343 {
344         for (int i = 0; i < __syscall_tbl_sz; i++) {
345                 if (!__syscall_tbl[i])
346                         continue;
347                 tok = resolve_syscall_alias(tok);
348                 if (!strcmp(__syscall_tbl[i], tok)) {
349                         if (clear)
350                                 CLR_BITMASK_BIT(traceset_bm, i);
351                         else
352                                 SET_BITMASK_BIT(traceset_bm, i);
353                         return TRUE;
354                 }
355         }
356         return FALSE;
357 }
358
359 static void build_ignore_list(char *trace_set)
360 {
361         char *tok, *tok_save = 0;
362         bool clear = FALSE;
363
364         if (!trace_set) {
365                 if (opts.verbose)
366                         fprintf(stderr, "# Tracing all syscalls\n");
367                 return;
368         }
369         for (tok = strtok_r(trace_set, ",", &tok_save);
370              tok;
371                  tok = strtok_r(NULL, ",", &tok_save)) {
372
373                 if (tok[0] == '!') {
374                         clear = TRUE;
375                         tok++;
376                 }
377                 if (handle_trace_set(tok, clear))
378                         continue;
379                 if (handle_raw_syscall(tok, clear))
380                         continue;
381                 /* You could imaging continuing, but this error would probably be
382                  * missed in the output stream and we'd be wondering why we weren't
383                  * getting a syscall, due to a typo. */
384                 fprintf(stderr, "Unknown trace_set argument %s, aborting!\n",
385                         tok);
386                 exit(-1);
387         }
388         if (opts.verbose) {
389                 for (int i = 0; i < MAX_SYSCALL_NR; i++) {
390                         if (GET_BITMASK_BIT(traceset_bm, i))
391                                 fprintf(stderr, "# Tracing syscall %s (%d)\n",
392                                         __syscall_tbl[i], i);
393                 }
394         }
395 }
396
397 /* Removes the timestamp part of the line.  Use the return string in place of
398  * the full line you pass in. */
399 static char *remove_timestamps(char *full_line)
400 {
401         char *close_brace;
402
403         /* Format: E [  13655.986589401]-[      0.000000000] Syscall.  The seconds
404          * field may vary in size, so we need to find the second ']'. */
405         close_brace = strchr(full_line, ']');
406         if (!close_brace)
407                 return full_line;
408         close_brace = strchr(close_brace + 1, ']');
409         if (!close_brace)
410                 return full_line;
411         /* move starting E or X marker */
412         *close_brace = full_line[0];
413         return close_brace;
414 }
415
416 static void parse_traces(int fd)
417 {
418         char *line, *_line;
419         ssize_t ret;
420
421         line = malloc(SYSTR_BUF_SZ);
422         assert(line);
423
424         while ((ret = read(fd, line, SYSTR_BUF_SZ)) > 0) {
425                 /* make sure each line ends in \n\0. */
426                 line[ret - 1] = '\n';
427                 line[MIN(ret, SYSTR_BUF_SZ - 1)] = 0;
428                 _line = line;
429                 if (opts.raw_output) {
430                         fprintf(opts.outfile, "%s", _line);
431                         continue;
432                 }
433                 if (!opts.with_time)
434                         _line = remove_timestamps(_line);
435                 fprintf(opts.outfile, "%s", _line);
436         }
437         /* This is a little hokey.  If the process exited, then the qio hung up and
438          * we got a status message from the kernel.  This was the errstr of the last
439          * failed read.  However, if we're doing a -p and someone kills *us*, we'll
440          * never see this.  And catching the signal doesn't help either.  The
441          * process needs to exit (strace_shutdown).  Either that, or change the
442          * kernel to set_errstr() on close(), and coordinate with a sighandler. */
443         if (opts.verbose)
444                 fprintf(stderr, "%r\n");
445         free(line);
446 }
447
448 int main(int argc, char **argv, char **envp)
449 {
450         int fd;
451         pid_t pid;
452         static char path[2 * MAX_PATH_LEN];
453         struct syscall sysc;
454         struct argp argp = {argp_opts, parse_strace_opt, args_doc, doc};
455
456         argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, &opts);
457
458         build_ignore_list(opts.trace_set);
459
460         if (opts.cmd_argc) {
461                 pid = create_child_with_stdfds(opts.cmd_argv[0], opts.cmd_argc,
462                                                opts.cmd_argv, envp);
463                 if (pid < 0) {
464                         perror("Unable to spawn child");
465                         exit(-1);
466                 }
467                 /* We need to wait on the child asynchronously.  If we hold a ref (as
468                  * the parent), the child won't proc_free and that won't hangup/wake us
469                  * from a read. */
470                 syscall_async(&sysc, SYS_waitpid, pid, NULL, 0, 0, 0, 0);
471         } else {
472                 pid = opts.pid;
473         }
474
475         snprintf(path, sizeof(path), "/proc/%d/ctl", pid);
476         fd = open(path, O_WRITE);
477         if (fd < 0) {
478                 fprintf(stderr, "open %s: %r\n", path);
479                 exit(1);
480         }
481         if (opts.follow_children)
482                 snprintf(path, sizeof(path), "straceall");
483         else
484                 snprintf(path, sizeof(path), "straceme");
485         if (write(fd, path, strlen(path)) < strlen(path)) {
486                 fprintf(stderr, "write to ctl %s: %r\n", path);
487                 exit(1);
488         }
489         if (opts.drop_overflow) {
490                 snprintf(path, sizeof(path), "strace_drop on");
491                 if (write(fd, path, strlen(path)) < strlen(path)) {
492                         fprintf(stderr, "write to ctl %s: %r\n", path);
493                         exit(1);
494                 }
495         }
496         close(fd);
497
498         if (opts.trace_set) {
499                 snprintf(path, sizeof(path), "/proc/%d/strace_traceset", pid);
500                 fd = open(path, O_RDWR);
501                 if (fd < 0) {
502                         fprintf(stderr, "open %s: %r\n", path);
503                         exit(1);
504                 }
505                 if (write(fd, traceset_bm, COUNT_OF(traceset_bm))
506                     < COUNT_OF(traceset_bm)) {
507                         fprintf(stderr, "write to strace_ignore: %r\n");
508                         exit(1);
509                 }
510                 close(fd);
511         }
512
513         snprintf(path, sizeof(path), "/proc/%d/strace", pid);
514         fd = open(path, O_READ);
515         if (!fd) {
516                 fprintf(stderr, "open %s: %r\n", path);
517                 exit(1);
518         }
519
520         if (opts.cmd_argc) {
521                 /* now that we've set up the tracing, we can run the process.  isn't it
522                  * great that the process doesn't immediately start when you make it? */
523                 sys_proc_run(pid);
524         }
525
526         parse_traces(fd);
527         return 0;
528 }