akaros/tests/strace.c
<<
>>
Prefs
   1/* Copyright (c) 2016-2017 Google Inc., All Rights Reserved.
   2 * Barret Rhoden <brho@cs.berkeley.edu>
   3 * Ron Minnich <rminnich@google.com>
   4 * See LICENSE for details.
   5 *
   6 * TODO:
   7 * - Do per-syscall output formatting, like interpreting arguments
   8 */
   9
  10#include <stdlib.h>
  11#include <stdio.h>
  12#include <unistd.h>
  13#include <fcntl.h>
  14#include <argp.h>
  15#include <sys/types.h>
  16#include <sys/wait.h>
  17#include <sys/param.h>
  18#include <parlib/parlib.h>
  19#include <parlib/bitmask.h>
  20
  21struct strace_opts {
  22        FILE                    *outfile;
  23        char                    *trace_set;
  24        char                    **cmd_argv;
  25        int                     cmd_argc;
  26        int                     pid;
  27        bool                    follow_children;
  28        bool                    verbose;
  29        bool                    raw_output;
  30        bool                    with_time;
  31        bool                    drop_overflow;
  32};
  33static struct strace_opts opts;
  34
  35static const char *doc = "strace -- trace syscalls of a process";
  36static const char *args_doc = "-p PID\nPROGRAM [ARGS]\n";
  37
  38static struct argp_option argp_opts[] = {
  39        {"output", 'o', "FILE", 0, "Print output to file (default stderr)"},
  40        {"pid", 'p', "PID", 0, "Process to attach to"},
  41        {"follow", 'f', 0, 0, "Trace children"},
  42        {"verbose", 'v', 0, 0, "Print extra info, e.g. syscalls we're tracing"},
  43        {0, 0, 0, 0, ""},
  44        {"traceset", 'e', "TRACE_SET", 0,
  45         "Comma-separated list of syscalls by name (e.g. openat) and sets to trace.  Use '!' to negate (might need to escape the '!'), and traces are handled in order (e.g. -e path,\\!openat)\n"
  46         },
  47        {"    Available sets:", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
  48                 "\n"
  49                 "- path: syscalls that take file paths\n"
  50                 "- fd: syscalls that take FDs\n"
  51                 "- file: path and fd sets\n"
  52                 "- mem: memory related (mmap, shared mem)\n"
  53                 "- life: process lifetime (create, fork)\n"
  54                 "- proc: anything process related (yields, pop_ctxs, life)\n"
  55                 "- sched: requests or yields to the kernel (all resources)\n"
  56                 "- vmm: syscalls mostly for VMs\n"
  57        },
  58        {0, 0, 0, 0, ""},
  59        {"drop", 'd', 0, 0, "Drop syscalls on overflow"},
  60        {"raw", 'r', 0, 0, "Raw, untranslated output, with timestamps"},
  61        {"time", 't', 0, 0, "Print timestamps"},
  62        {0, 'h', 0, OPTION_HIDDEN, 0},
  63        { 0 }
  64};
  65
  66struct trace_set {
  67        char                    *name;
  68        unsigned int            syscs[];
  69};
  70
  71/* To add a trace set, create one here, add it to all_trace_sets, and update the
  72 * help field in argp_opts. */
  73
  74/* If you change this, update 'file' below. */
  75static struct trace_set path_trace_set = { "path",
  76        {SYS_proc_create,
  77         SYS_exec,
  78         SYS_openat,
  79         SYS_stat,
  80         SYS_lstat,
  81         SYS_access,
  82         SYS_link,
  83         SYS_unlink,
  84         SYS_symlink,
  85         SYS_readlink,
  86         SYS_chdir,
  87         SYS_getcwd,
  88         SYS_mkdir,
  89         SYS_rmdir,
  90         SYS_nbind,
  91         SYS_nmount,
  92         SYS_nunmount,
  93         SYS_wstat,
  94         SYS_rename,
  95         0}
  96};
  97
  98/* If you change this, update 'file' below.
  99 *
 100 * Technically tcgetattr/tcsetattr are FDs, but it's mostly noise.  This also
 101 * tracks openat, since that's the source for all FDs */
 102static struct trace_set fd_trace_set = { "fd",
 103        {SYS_openat,
 104         SYS_mmap,
 105         SYS_read,
 106         SYS_write,
 107         SYS_openat,
 108         SYS_close,
 109         SYS_fstat,
 110         SYS_fcntl,
 111         SYS_llseek,
 112         SYS_fchdir,
 113         SYS_nmount,
 114         SYS_fd2path,
 115         SYS_fwstat,
 116         SYS_dup_fds_to,
 117         SYS_tap_fds,
 118         SYS_abort_sysc_fd,
 119         0}
 120};
 121
 122/* This is the manually-created contents of 'path' and 'fd' */
 123static struct trace_set file_trace_set = { "file",
 124        {/* From 'path' */
 125         SYS_proc_create,
 126         SYS_exec,
 127         SYS_openat,
 128         SYS_stat,
 129         SYS_lstat,
 130         SYS_access,
 131         SYS_link,
 132         SYS_unlink,
 133         SYS_symlink,
 134         SYS_readlink,
 135         SYS_chdir,
 136         SYS_getcwd,
 137         SYS_mkdir,
 138         SYS_rmdir,
 139         SYS_nbind,
 140         SYS_nmount,
 141         SYS_nunmount,
 142         SYS_wstat,
 143         SYS_rename,
 144         SYS_openat,
 145         SYS_mmap,
 146         SYS_read,
 147         SYS_write,
 148
 149         /* From 'fd' */
 150         SYS_openat,
 151         SYS_close,
 152         SYS_fstat,
 153         SYS_fcntl,
 154         SYS_llseek,
 155         SYS_fchdir,
 156         SYS_nmount,
 157         SYS_fd2path,
 158         SYS_fwstat,
 159         SYS_dup_fds_to,
 160         SYS_tap_fds,
 161         SYS_abort_sysc_fd,
 162         0}
 163};
 164
 165static struct trace_set mem_trace_set = { "mem",
 166        {SYS_mmap,
 167         SYS_mprotect,
 168         SYS_munmap,
 169         SYS_shared_page_alloc,
 170         SYS_shared_page_free,
 171         SYS_populate_va,
 172         0}
 173};
 174
 175/* These are all in 'proc'; keep them in sync. */
 176static struct trace_set life_trace_set = { "life",
 177        {SYS_proc_create,
 178         SYS_proc_run,
 179         SYS_proc_destroy,
 180         SYS_fork,
 181         SYS_exec,
 182         SYS_waitpid,
 183         SYS_change_to_m,
 184         0}
 185};
 186
 187/* There's some misc stuff lumped in here.  block/nanosleep are signs of threads
 188 * sleeping on something, which is usually useful.  aborting syscs are also
 189 * often signs of control flow. */
 190static struct trace_set proc_trace_set = { "proc",
 191        {SYS_block,
 192         SYS_nanosleep,
 193         SYS_getpcoreid,
 194         SYS_getvcoreid,
 195         SYS_proc_yield,
 196         SYS_change_vcore,
 197         SYS_notify,
 198         SYS_self_notify,
 199         SYS_send_event,
 200         SYS_vc_entry,
 201         SYS_halt_core,
 202         SYS_pop_ctx,
 203         SYS_abort_sysc,
 204         SYS_abort_sysc_fd,
 205
 206         /* From 'life' */
 207         SYS_proc_create,
 208         SYS_proc_run,
 209         SYS_proc_destroy,
 210         SYS_fork,
 211         SYS_exec,
 212         SYS_waitpid,
 213         SYS_change_to_m,
 214         0}
 215};
 216
 217static struct trace_set sched_trace_set = { "sched",
 218        {SYS_provision,
 219         SYS_proc_yield,
 220         SYS_poke_ksched,
 221         0}
 222};
 223
 224static struct trace_set vmm_trace_set = { "vmm",
 225        {SYS_vmm_add_gpcs,
 226         SYS_vmm_poke_guest,
 227         SYS_vmm_ctl,
 228         SYS_pop_ctx,
 229         0}
 230};
 231
 232static struct trace_set *all_trace_sets[] = {
 233        &path_trace_set,
 234        &fd_trace_set,
 235        &file_trace_set,
 236        &mem_trace_set,
 237        &life_trace_set,
 238        &proc_trace_set,
 239        &sched_trace_set,
 240        &vmm_trace_set,
 241};
 242
 243static DECL_BITMASK(traceset_bm, MAX_SYSCALL_NR);
 244
 245
 246static error_t parse_strace_opt(int key, char *arg, struct argp_state *state)
 247{
 248        struct strace_opts *s_opts = state->input;
 249
 250        switch (key) {
 251        case 'o':
 252                s_opts->outfile = fopen(arg, "wb");
 253                if (!s_opts->outfile) {
 254                        fprintf(stderr,
 255                                "Unable to open file '%s' for writing: %s\n",
 256                                arg, strerror(errno));
 257                        exit(1);
 258                }
 259                break;
 260        case 'e':
 261                s_opts->trace_set = arg;
 262                break;
 263        case 'p':
 264                s_opts->pid = atoi(arg);
 265                if (!s_opts->pid)
 266                        argp_error(state, "Cannot trace pid 0 (won't exist)");
 267                break;
 268        case 'f':
 269                s_opts->follow_children = TRUE;
 270                break;
 271        case 'v':
 272                s_opts->verbose = TRUE;
 273                break;
 274        case 'r':
 275                s_opts->raw_output = TRUE;
 276                break;
 277        case 't':
 278                s_opts->with_time = TRUE;
 279                break;
 280        case 'd':
 281                s_opts->drop_overflow = TRUE;
 282                break;
 283        case ARGP_KEY_ARG:
 284                if (s_opts->pid)
 285                        argp_error(state, "PID already set, can't launch a process too");
 286                s_opts->cmd_argc = state->argc - state->next + 1;
 287                s_opts->cmd_argv = malloc(sizeof(char*) * (s_opts->cmd_argc
 288                                                           + 1));
 289                assert(s_opts->cmd_argv);
 290                s_opts->cmd_argv[0] = arg;
 291                memcpy(&s_opts->cmd_argv[1], &state->argv[state->next],
 292                       sizeof(char*) * (s_opts->cmd_argc - 1));
 293                s_opts->cmd_argv[s_opts->cmd_argc] = NULL;
 294                state->next = state->argc;
 295                break;
 296        case ARGP_KEY_END:
 297                if (!(s_opts->cmd_argc || s_opts->pid))
 298                        argp_error(state, "Need either -p or a command to run");
 299                /* Note we never fclose outfile.  It'll flush when we exit.
 300                 * o/w, we'll need to be careful whether we're closing stderr or
 301                 * not. */
 302                if (!s_opts->outfile)
 303                        s_opts->outfile = stderr;
 304                break;
 305        case 'h':
 306                argp_usage(state);
 307                break;
 308        default:
 309                return ARGP_ERR_UNKNOWN;
 310        }
 311        return 0;
 312}
 313
 314static bool handle_trace_set(char *tok, bool clear)
 315{
 316        struct trace_set *ts;
 317        unsigned int sysc_nr;
 318
 319        for (int i = 0; i < COUNT_OF(all_trace_sets); i++) {
 320                ts = all_trace_sets[i];
 321                if (!strcmp(ts->name, tok)) {
 322                        for (int j = 0; j < MAX_SYSCALL_NR; j++) {
 323                                sysc_nr = ts->syscs[j];
 324                                /* 0-terminated list */
 325                                if (!sysc_nr)
 326                                        break;
 327                                if (clear)
 328                                        CLR_BITMASK_BIT(traceset_bm, sysc_nr);
 329                                else
 330                                        SET_BITMASK_BIT(traceset_bm, sysc_nr);
 331                        }
 332                        return TRUE;
 333                }
 334        }
 335        return FALSE;
 336}
 337
 338static char *resolve_syscall_alias(char *tok)
 339{
 340        if (!strcmp(tok, "open")) {
 341                tok = "openat";
 342                return tok;
 343        }
 344        return tok;
 345}
 346
 347static bool handle_raw_syscall(char *tok, bool clear)
 348{
 349        for (int i = 0; i < __syscall_tbl_sz; i++) {
 350                if (!__syscall_tbl[i])
 351                        continue;
 352                tok = resolve_syscall_alias(tok);
 353                if (!strcmp(__syscall_tbl[i], tok)) {
 354                        if (clear)
 355                                CLR_BITMASK_BIT(traceset_bm, i);
 356                        else
 357                                SET_BITMASK_BIT(traceset_bm, i);
 358                        return TRUE;
 359                }
 360        }
 361        return FALSE;
 362}
 363
 364static void build_ignore_list(char *trace_set)
 365{
 366        char *tok, *tok_save = 0;
 367        bool clear = FALSE;
 368
 369        if (!trace_set) {
 370                if (opts.verbose)
 371                        fprintf(stderr, "# Tracing all syscalls\n");
 372                return;
 373        }
 374        for (tok = strtok_r(trace_set, ",", &tok_save);
 375             tok;
 376                 tok = strtok_r(NULL, ",", &tok_save)) {
 377
 378                if (tok[0] == '!') {
 379                        clear = TRUE;
 380                        tok++;
 381                }
 382                if (handle_trace_set(tok, clear))
 383                        continue;
 384                if (handle_raw_syscall(tok, clear))
 385                        continue;
 386                /* You could imaging continuing, but this error would probably
 387                 * be missed in the output stream and we'd be wondering why we
 388                 * weren't getting a syscall, due to a typo. */
 389                fprintf(stderr, "Unknown trace_set argument %s, aborting!\n",
 390                        tok);
 391                exit(-1);
 392        }
 393        if (opts.verbose) {
 394                for (int i = 0; i < MAX_SYSCALL_NR; i++) {
 395                        if (GET_BITMASK_BIT(traceset_bm, i))
 396                                fprintf(stderr, "# Tracing syscall %s (%d)\n",
 397                                        __syscall_tbl[i], i);
 398                }
 399        }
 400}
 401
 402/* Removes the timestamp part of the line.  Use the return string in place of
 403 * the full line you pass in. */
 404static char *remove_timestamps(char *full_line)
 405{
 406        char *close_brace;
 407
 408        /* Format: E [  13655.986589401]-[      0.000000000] Syscall. 
 409         * The seconds field may vary in size, so we need to find the second
 410         * ']'. */
 411        close_brace = strchr(full_line, ']');
 412        if (!close_brace)
 413                return full_line;
 414        close_brace = strchr(close_brace + 1, ']');
 415        if (!close_brace)
 416                return full_line;
 417        /* move starting E or X marker */
 418        *close_brace = full_line[0];
 419        return close_brace;
 420}
 421
 422static void parse_traces(int fd)
 423{
 424        char *line, *_line;
 425        ssize_t ret;
 426
 427        line = malloc(SYSTR_BUF_SZ);
 428        assert(line);
 429
 430        while ((ret = read(fd, line, SYSTR_BUF_SZ)) > 0) {
 431                /* make sure each line ends in \n\0. */
 432                line[ret - 1] = '\n';
 433                line[MIN(ret, SYSTR_BUF_SZ - 1)] = 0;
 434                _line = line;
 435                if (opts.raw_output) {
 436                        fprintf(opts.outfile, "%s", _line);
 437                        continue;
 438                }
 439                if (!opts.with_time)
 440                        _line = remove_timestamps(_line);
 441                fprintf(opts.outfile, "%s", _line);
 442        }
 443        /* This is a little hokey.  If the process exited, then the qio hung up
 444         * and we got a status message from the kernel.  This was the errstr of
 445         * the last failed read.  However, if we're doing a -p and someone kills
 446         * *us*, we'll never see this.  And catching the signal doesn't help
 447         * either.  The process needs to exit (strace_shutdown).  Either that,
 448         * or change the kernel to set_errstr() on close(), and coordinate with
 449         * a sighandler. */
 450        if (opts.verbose)
 451                fprintf(stderr, "%r\n");
 452        free(line);
 453}
 454
 455int main(int argc, char **argv, char **envp)
 456{
 457        int fd;
 458        pid_t pid;
 459        static char path[2 * MAX_PATH_LEN];
 460        struct syscall sysc;
 461        struct argp argp = {argp_opts, parse_strace_opt, args_doc, doc};
 462
 463        argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, &opts);
 464
 465        build_ignore_list(opts.trace_set);
 466
 467        if (opts.cmd_argc) {
 468                pid = create_child_with_stdfds(opts.cmd_argv[0], opts.cmd_argc,
 469                                               opts.cmd_argv, envp);
 470                if (pid < 0) {
 471                        perror("Unable to spawn child");
 472                        exit(-1);
 473                }
 474                /* We need to wait on the child asynchronously.  If we hold a
 475                 * ref (as the parent), the child won't proc_free and that won't
 476                 * hangup/wake us from a read. */
 477                syscall_async(&sysc, SYS_waitpid, pid, NULL, 0, 0, 0, 0);
 478        } else {
 479                pid = opts.pid;
 480        }
 481
 482        snprintf(path, sizeof(path), "/proc/%d/ctl", pid);
 483        fd = open(path, O_WRITE);
 484        if (fd < 0) {
 485                fprintf(stderr, "open %s: %r\n", path);
 486                exit(1);
 487        }
 488        if (opts.follow_children)
 489                snprintf(path, sizeof(path), "straceall");
 490        else
 491                snprintf(path, sizeof(path), "straceme");
 492        if (write(fd, path, strlen(path)) < strlen(path)) {
 493                fprintf(stderr, "write to ctl %s: %r\n", path);
 494                exit(1);
 495        }
 496        if (opts.drop_overflow) {
 497                snprintf(path, sizeof(path), "strace_drop on");
 498                if (write(fd, path, strlen(path)) < strlen(path)) {
 499                        fprintf(stderr, "write to ctl %s: %r\n", path);
 500                        exit(1);
 501                }
 502        }
 503        close(fd);
 504
 505        if (opts.trace_set) {
 506                snprintf(path, sizeof(path), "/proc/%d/strace_traceset", pid);
 507                fd = open(path, O_RDWR);
 508                if (fd < 0) {
 509                        fprintf(stderr, "open %s: %r\n", path);
 510                        exit(1);
 511                }
 512                if (write(fd, traceset_bm, COUNT_OF(traceset_bm))
 513                    < COUNT_OF(traceset_bm)) {
 514                        fprintf(stderr, "write to strace_ignore: %r\n");
 515                        exit(1);
 516                }
 517                close(fd);
 518        }
 519
 520        snprintf(path, sizeof(path), "/proc/%d/strace", pid);
 521        fd = open(path, O_READ);
 522        if (!fd) {
 523                fprintf(stderr, "open %s: %r\n", path);
 524                exit(1);
 525        }
 526
 527        if (opts.cmd_argc) {
 528                /* now that we've set up the tracing, we can run the process.
 529                 * isn't it great that the process doesn't immediately start
 530                 * when you make it? */
 531                sys_proc_run(pid);
 532        }
 533
 534        parse_traces(fd);
 535        return 0;
 536}
 537