parlib: Fix u32/u64 issue with pvcalarm
[akaros.git] / tests / lock_test.c
index 3950923..6072153 100644 (file)
@@ -1,8 +1,14 @@
-/* Copyright (c) 2013 The Regents of the University of California
+/* Copyright (c) 2013, 2014 The Regents of the University of California
  * Barret Rhoden <brho@cs.berkeley.edu>
  * See LICENSE for details.
  *
- * lock_test: microbenchmark to measure different styles of spinlocks. */
+ * lock_test: microbenchmark to measure different styles of spinlocks.
+ *
+ * to build on linux: (hacky)
+ * $ gcc -O2 -std=gnu99 -fno-stack-protector -g tests/lock_test.c -lpthread \
+ *    -lm -o linux_lock_test */
+
+#define _GNU_SOURCE /* pthread_yield */
 
 #include <stdio.h>
 #include <pthread.h>
 #include <sys/time.h>
 #include <math.h>
 #include <argp.h>
-
-#include <tsc-compat.h>
-#include <measure.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <string.h>
 
 /* OS dependent #incs */
-#include <parlib.h>
-#include <vcore.h>
-#include <timing.h>
-#include <spinlock.h>
-#include <mcs.h>
-#include <arch/arch.h>
-#include <event.h>
+#ifdef __ros__
+
+#include <parlib/parlib.h>
+#include <parlib/stdio.h>
+#include <parlib/vcore.h>
+#include <parlib/timing.h>
+#include <parlib/spinlock.h>
+#include <parlib/mcs.h>
+#include <parlib/arch/arch.h>
+#include <parlib/event.h>
+
+#include <parlib/tsc-compat.h>
+#include <benchutil/measure.h>
+
+#else
+
+#include "../user/parlib/include/parlib/tsc-compat.h"
+#include "misc-compat.h"
+#include "linux-lock-hacks.h" /* TODO: have a build system and lib / C file */
+
+#include "../user/benchutil/include/benchutil/measure.h"
+#include "../user/benchutil/measure.c"
+
+static void os_prep_work(pthread_t *worker_threads, int nr_threads)
+{
+       if (nr_threads > num_vcores())
+               printf("WARNING: %d threads requested, but only %d cores available\n",
+                      nr_threads, num_vcores());
+}
+
+static void os_post_work(pthread_t *worker_threads, int nr_threads)
+{
+       if (nr_threads > num_vcores())
+               return;
+       /* assuming we're taking cores 0..nr_threads, and we never move. */
+       for (int i = 0; i < nr_threads; i++) {
+               cpu_set_t cpuset;
+               CPU_ZERO(&cpuset);
+               CPU_SET(i, &cpuset);
+               pthread_setaffinity_np(worker_threads[i], sizeof(cpu_set_t), &cpuset);
+       }
+}
+
+#define print_preempt_trace(args...) {}
+
+__thread int __vcore_context = 0;
+
+#endif
 
 /* TODO: There's lot of work to do still, both on this program and on locking
  * and vcore code.  For some of the issues, I'll leave in the discussion /
  *                                     still spins.  they'll have to make sure their pred runs)
  *                     -adj workers doesn't matter either...
  *                             - the 2LS and preemption handling might be doing this
- *                             automatically, when handle_preempt() does a
+ *                             automatically, when handle_vc_preempt() does a
  *                             thread_paused() on its current_uthread.
  *                             - adj_workers isn't critical if we're using some locks
  *                             that check notif_pending.  eventually someone hears
  */
 
 const char *argp_program_version = "lock_test v0.1475263";
-const char *argp_program_bug_address = "<akaros@lists.eecs.berkeley.edu>";
+const char *argp_program_bug_address = "<akaros+subscribe@googlegroups.com>";
 
 static char doc[] = "lock_test -- spinlock benchmarking";
 static char args_doc[] = "-w NUM -l NUM -t LOCK";
@@ -428,6 +477,7 @@ static struct argp_option options[] = {
        {"hold",                'h', "NSEC",    0, "nsec to hold the lock"},
        {"delay",               'd', "NSEC",    0, "nsec to delay between grabs"},
        {"print",               'p', "ROWS",    0, "Print ROWS of optional measurements"},
+       {"outfile",             'o', "FILE",    0, "Print measurements to FILE"},
        { 0 }
 };
 
@@ -439,6 +489,7 @@ struct prog_args {
        int                                                     nr_print_rows;
        bool                                            fake_vc_ctx;
        bool                                            adj_workers;
+       char                                            *outfile_path;
        void *(*lock_type)(void *arg);
 };
 struct prog_args pargs = {0};
@@ -457,12 +508,6 @@ pthread_barrier_t start_test;
 /* Locking functions.  Define globals here, init them in main (if possible), and
  * use the lock_func() macro to make your thread func. */
 
-spinlock_t spin_lock = SPINLOCK_INITIALIZER;
-struct spin_pdr_lock spdr_lock = SPINPDR_INITIALIZER;
-struct mcs_lock mcs_lock = MCS_LOCK_INIT;
-struct mcs_pdr_lock mcspdr_lock;
-struct mcs_pdro_lock mcspdro_lock = MCSPDRO_LOCK_INIT;
-
 #define lock_func(lock_name, lock_cmd, unlock_cmd)                             \
 void *lock_name##_thread(void *arg)                                            \
 {                                                                              \
@@ -476,9 +521,10 @@ void *lock_name##_thread(void *arg)                                            \
        struct time_stamp *this_time;                                              \
        struct mcs_lock_qnode mcs_qnode = MCS_QNODE_INIT;                          \
        struct mcs_pdro_qnode pdro_qnode = MCSPDRO_QNODE_INIT;                     \
+       int i;                                                                     \
+       /* guessing a unique vcoreid for the __mcspdr test.  if the
         * program gets preempted for that test, things may go nuts */             \
-       pdro_qnode.vcoreid = thread_id - 1;                                        \
+       pdro_qnode.vcoreid = (thread_id + 1) % pargs.nr_threads;                   \
        /* Wait til all threads are created.  Ideally, I'd like to busywait unless
         * absolutely critical to yield */                                         \
        pthread_barrier_wait(&start_test);                                         \
@@ -488,7 +534,7 @@ void *lock_name##_thread(void *arg)                                            \
                /* tricks ourselves into believing we're in vc ctx */                  \
                __vcore_context = TRUE;                                                \
        }                                                                          \
-       for (int i = 0; i < nr_loops; i++) {                                       \
+       for (i = 0; i < nr_loops; i++) {                                           \
                if (!run_locktest)                                                     \
                        break;                                                             \
                pre_lock = read_tsc_serialized();                                      \
@@ -534,9 +580,19 @@ void *lock_name##_thread(void *arg)                                            \
                __vcore_context = FALSE;                                               \
                uth_enable_notifs();                                                   \
        }                                                                          \
-       return arg;                                                                \
+       return (void*)(long)i;                                                     \
+}
+
+#define fake_lock_func(lock_name, x1, x2)                                      \
+void *lock_name##_thread(void *arg)                                            \
+{                                                                              \
+       printf("Lock " #lock_name " not supported!\n");                            \
+       exit(-1);                                                                  \
 }
 
+spinlock_t spin_lock = SPINLOCK_INITIALIZER;
+struct mcs_lock mcs_lock = MCS_LOCK_INIT;
+
 /* Defines locking funcs like "mcs_thread" */
 lock_func(mcs,
           mcs_lock_lock(&mcs_lock, &mcs_qnode);,
@@ -544,6 +600,15 @@ lock_func(mcs,
 lock_func(mcscas,
           mcs_lock_lock(&mcs_lock, &mcs_qnode);,
           mcs_lock_unlock_cas(&mcs_lock, &mcs_qnode);)
+lock_func(spin,
+          spinlock_lock(&spin_lock);,
+          spinlock_unlock(&spin_lock);)
+
+#ifdef __ros__
+struct spin_pdr_lock spdr_lock = SPINPDR_INITIALIZER;
+struct mcs_pdr_lock mcspdr_lock;
+struct mcs_pdro_lock mcspdro_lock = MCSPDRO_LOCK_INIT;
+
 lock_func(mcspdr,
           mcs_pdr_lock(&mcspdr_lock);,
           mcs_pdr_unlock(&mcspdr_lock);)
@@ -553,12 +618,17 @@ lock_func(mcspdro,
 lock_func(__mcspdro,
           __mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);,
           __mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);)
-lock_func(spin,
-          spinlock_lock(&spin_lock);,
-          spinlock_unlock(&spin_lock);)
 lock_func(spinpdr,
           spin_pdr_lock(&spdr_lock);,
           spin_pdr_unlock(&spdr_lock);)
+#else
+
+fake_lock_func(mcspdr, 0, 0);
+fake_lock_func(mcspdro, 0, 0);
+fake_lock_func(__mcspdro, 0, 0);
+fake_lock_func(spinpdr, 0, 0);
+
+#endif
 
 static int get_acq_latency(void **data, int i, int j, uint64_t *sample)
 {
@@ -596,6 +666,8 @@ static int get_acq_timestamp(void **data, int i, int j, uint64_t *sample)
        return 0;
 }
 
+#ifdef __ros__
+
 /* Lousy event intercept.  build something similar in the event library? */
 #define MAX_NR_EVENT_TRACES 1000
 uint64_t preempts[MAX_NR_EVENT_TRACES] = {0};
@@ -605,22 +677,22 @@ atomic_t indir_idx;
 atomic_t preempt_cnt;
 atomic_t indir_cnt;
 
-static void handle_preempt(struct event_msg *ev_msg, unsigned int ev_type)
+static void trace_preempt(struct event_msg *ev_msg, unsigned int ev_type,
+                          void *data)
 {
        unsigned long my_slot = atomic_fetch_and_add(&preempt_idx, 1);
        if (my_slot < MAX_NR_EVENT_TRACES)
                preempts[my_slot] = read_tsc();
        atomic_inc(&preempt_cnt);
-       handle_vc_preempt(ev_msg, ev_type);
 }
 
-static void handle_indir(struct event_msg *ev_msg, unsigned int ev_type)
+static void trace_indir(struct event_msg *ev_msg, unsigned int ev_type,
+                        void *data)
 {
        unsigned long my_slot = atomic_fetch_and_add(&indir_idx, 1);
        if (my_slot < MAX_NR_EVENT_TRACES)
                indirs[my_slot] = read_tsc();
        atomic_inc(&indir_cnt);
-       handle_vc_indir(ev_msg, ev_type);
 }
 
 /* Helper, prints out the preempt trace */
@@ -650,7 +722,7 @@ static void print_preempt_trace(uint64_t starttsc, int nr_print_rows)
 }
 
 /* Make sure we have enough VCs for nr_threads, pref 1:1 at the start */
-static void os_prep_work(int nr_threads)
+static void os_prep_work(pthread_t *worker_threads, int nr_threads)
 {
        if (nr_threads > max_vcores()) {
                printf("Too many threads (%d) requested, can't get more than %d vc\n",
@@ -661,27 +733,31 @@ static void os_prep_work(int nr_threads)
        atomic_init(&indir_idx, 0);
        atomic_init(&preempt_cnt, 0);
        atomic_init(&indir_cnt, 0);
-       pthread_can_vcore_request(FALSE);       /* 2LS won't manage vcores */
-       pthread_lib_init();                                     /* gives us one vcore */
-       ev_handlers[EV_VCORE_PREEMPT] = handle_preempt;
-       ev_handlers[EV_CHECK_MSGS] = handle_indir;
+       parlib_never_yield = TRUE;
+       pthread_need_tls(FALSE);
+       pthread_mcp_init();                                     /* gives us one vcore */
+       register_ev_handler(EV_VCORE_PREEMPT, trace_preempt, 0);
+       register_ev_handler(EV_CHECK_MSGS, trace_indir, 0);
        if (pargs.fake_vc_ctx) {
                /* need to disable events when faking vc ctx.  since we're looping and
                 * not handling events, we could run OOM */
                clear_kevent_q(EV_VCORE_PREEMPT);
                clear_kevent_q(EV_CHECK_MSGS);
        }
-       if (vcore_request(nr_threads - 1)) {
-               printf("Failed to request %d more vcores, currently have %d\n",
-                      nr_threads - 1, num_vcores());
-               exit(-1);
-       }
+       vcore_request_total(nr_threads);
+       parlib_never_vc_request = TRUE;
        for (int i = 0; i < nr_threads; i++) {
                printd("Vcore %d mapped to pcore %d\n", i,
                       __procinfo.vcoremap[i].pcoreid);
        }
 }
 
+static void os_post_work(pthread_t *worker_threads, int nr_threads)
+{
+}
+
+#endif
+
 /* Argument parsing */
 static error_t parse_opt (int key, char *arg, struct argp_state *state)
 {
@@ -721,6 +797,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state)
                                argp_usage(state);
                        }
                        break;
+               case 'o':
+                       pargs->outfile_path = arg;
+                       break;
                case 'p':
                        pargs->nr_print_rows = atoi(arg);
                        if (pargs->nr_print_rows < 0) {
@@ -791,12 +870,13 @@ static struct argp argp = {options, parse_opt, args_doc, doc};
 int main(int argc, char** argv)
 {
        pthread_t *worker_threads;
-       void *dummy_retval;
+       void **loops_done;
        struct timeval start_tv = {0};
        struct timeval end_tv = {0};
-       long usec_diff;
+       long usec_diff, total_loops = 0;
        uint64_t starttsc;
        int nr_threads, nr_loops;
+       FILE *outfile;
        struct sample_stats acq_stats, hld_stats;
 
        argp_parse(&argp, argc, argv, 0, 0, &pargs);
@@ -804,11 +884,24 @@ int main(int argc, char** argv)
        nr_loops = pargs.nr_loops;
        mcs_pdr_init(&mcspdr_lock);
 
+       if (pargs.outfile_path) {
+               /* RDWR, CREAT, TRUNC, O666 */
+               outfile = fopen(pargs.outfile_path, "w+");
+               if (!outfile) {
+                       perror("outfile");
+                       exit(-1);
+               }
+       }
        worker_threads = malloc(sizeof(pthread_t) * nr_threads);
        if (!worker_threads) {
                perror("pthread_t malloc failed:");
                exit(-1);
        }
+       loops_done = malloc(sizeof(void*) * nr_threads);
+       if (!loops_done) {
+               perror("loops_done malloc failed");
+               exit(-1);
+       }
        printf("Making %d workers of %d loops each, %sadapting workers to vcores, "
               "and %sfaking vcore context\n", nr_threads, nr_loops,
               pargs.adj_workers ? "" : "not ",
@@ -825,9 +918,9 @@ int main(int argc, char** argv)
                }
                memset(times[i], 0, sizeof(struct time_stamp) * nr_loops);
        }
-       printf("Record tracking takes %d bytes of memory\n",
+       printf("Record tracking takes %ld bytes of memory\n",
               nr_threads * nr_loops * sizeof(struct time_stamp));
-       os_prep_work(nr_threads);       /* ensure we have enough VCs */
+       os_prep_work(worker_threads, nr_threads);       /* ensure we have enough VCs */
        /* Doing this in MCP ctx, so we might have been getting a few preempts
         * already.  Want to read start before the threads pass their barrier */
        starttsc = read_tsc();
@@ -837,10 +930,11 @@ int main(int argc, char** argv)
                                   (void*)i))
                        perror("pth_create failed");
        }
+       os_post_work(worker_threads, nr_threads);
        if (gettimeofday(&start_tv, 0))
                perror("Start time error...");
        for (int i = 0; i < nr_threads; i++) {
-               pthread_join(worker_threads[i], &dummy_retval);
+               pthread_join(worker_threads[i], &loops_done[i]);
        }
        if (gettimeofday(&end_tv, 0))
                perror("End time error...");
@@ -855,7 +949,7 @@ int main(int argc, char** argv)
 
        usec_diff = (end_tv.tv_sec - start_tv.tv_sec) * 1000000 +
                    (end_tv.tv_usec - start_tv.tv_usec);
-       printf("Time to run: %d usec\n", usec_diff);
+       printf("Time to run: %ld usec\n", usec_diff);
 
        printf("\nLock throughput:\n-----------------\n");
        /* throughput for the entire duration (in ms), 1ms steps.  print as many
@@ -865,5 +959,39 @@ int main(int argc, char** argv)
                         starttsc, nr_threads,
                         nr_loops, get_acq_timestamp);
        print_preempt_trace(starttsc, pargs.nr_print_rows);
+
+       for (int i = 0; i < nr_threads; i++) {
+               total_loops += (long)loops_done[i];
+               if (!loops_done[i])
+                       printf("WARNING: thread %d performed 0 loops!\n", i);
+       }
+       printf("Average number of loops done, per thread: %ld\n",
+              total_loops / nr_threads);
+       for (int i = 0; i < nr_threads; i++)
+               printf("\tThread %d performed %ld loops\n", i, (long)loops_done[i]);
+
+       if (pargs.outfile_path) {
+               fprintf(outfile, "#");
+               for (char **arg = argv; *arg; arg++)
+                       fprintf(outfile, " %s", *arg);
+               fprintf(outfile, "\n");
+               fprintf(outfile, "# thread_id attempt pre acq(uire) un(lock) "
+                                "tsc_overhead\n");
+               fprintf(outfile, "# acquire latency: acq - pre - tsc_overhead\n");
+               fprintf(outfile, "# hold time: un - acq - tsc_overhead\n");
+               fprintf(outfile, "# tsc_frequency %llu\n", get_tsc_freq());
+               fprintf(outfile, "# tsc_overhead is 0 on linux, hard code it with a "
+                                "value from akaros\n");
+               for (int i = 0; i < nr_threads; i++) {
+                       for (int j = 0; j < nr_loops; j++) {
+                               struct time_stamp *ts = &times[i][j];
+                               if (!ts->pre)
+                                       break; /* empty record */
+                               fprintf(outfile, "%d %d %llu %llu %llu %llu\n", i, j, ts->pre,
+                                       ts->acq, ts->un, get_tsc_overhead());
+                       }
+               }
+               fclose(outfile);
+       }
        printf("Done, exiting\n");
 }