akaros/tests/lock_test.c
   1/* Copyright (c) 2013, 2014 The Regents of the University of California
   2 * Barret Rhoden <brho@cs.berkeley.edu>
   3 * See LICENSE for details.
   4 *
   5 * lock_test: microbenchmark to measure different styles of spinlocks.
   6 */
   7
   8#define _GNU_SOURCE /* pthread_yield */
   9
  10#include <stdio.h>
  11#include <pthread.h>
  12#include <stdlib.h>
  13#include <unistd.h>
  14#include <sys/time.h>
  15#include <math.h>
  16#include <argp.h>
  17#include <sys/types.h>
  18#include <sys/stat.h>
  19#include <fcntl.h>
  20#include <assert.h>
  21#include <string.h>
  22
  23#define handle_error(msg) \
  24        do { perror("Error, aborting: " msg); exit(EXIT_FAILURE); } while (0)
  25
  26/* OS dependent #incs */
  27#ifdef __akaros__
  28
  29#include <parlib/parlib.h>
  30#include <parlib/stdio.h>
  31#include <parlib/vcore.h>
  32#include <parlib/timing.h>
  33#include <parlib/spinlock.h>
  34#include <parlib/mcs.h>
  35#include <parlib/arch/arch.h>
  36#include <parlib/event.h>
  37
  38#include <parlib/tsc-compat.h>
  39#include <benchutil/measure.h>
  40
  41// os_prep_work is down below.
  42
  43static void os_pthread_prep_work(int thread_id)
  44{
  45}
  46
  47static const char *os_name(void)
  48{
  49        return "Akaros";
  50}
  51
  52#else
  53
  54#include "../user/parlib/include/parlib/tsc-compat.h"
  55#include "../user/benchutil/include/benchutil/measure.h"
  56#include "linux/misc-compat.h"
  57#include "linux/linux-lock-hacks.h"
  58
  59static void os_prep_work(pthread_t *worker_threads, int nr_threads)
  60{
  61        if (nr_threads > max_vcores())
  62                printf("WARNING: %d threads requested, but only %d cores available\n",
  63                       nr_threads, max_vcores());
  64}
  65
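/* Pin each worker thread to its own core, so that Linux worker i roughly
 * plays the role of vcore i on Akaros.  Threads beyond the core count are
 * left unpinned. */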
  66static void os_pthread_prep_work(int thread_id)
  67{
  68        cpu_set_t cpuset;
  69
        if (thread_id >= max_vcores())
                return;
        CPU_ZERO(&cpuset);
        CPU_SET(thread_id, &cpuset);
        if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t),
                                   &cpuset))
                printf("thread_id %d failed to bind to core!\n", thread_id);
  77}
  78
  79static const char *os_name(void)
  80{
  81        return "Linux";
  82}
  83
  84#define print_preempt_trace(args...) {}
  85
  86__thread int __vcore_context = 0;
  87
  88#endif
  89
/* TODO: There's a lot of work to do still, both on this program and on locking
  91 * and vcore code.  For some of the issues, I'll leave in the discussion /
  92 * answers, in case it comes up in the future (like when I read this in 8
  93 * months).
  94 *
  95 * BUGS / COMMENTARY
  96 * Occasional deadlocks when preempting and not giving back!
  97 *      - with the new PDRs style, though that doesn't mean the older styles
  98 *      don't have this problem
  99 *      - shouldn't be any weaker than PDR.  they all check pred_vc to see
 100 *      if they are running, and if not, they make sure someone runs
 101 *      - could be weaker if we have an old value for the lockholder,
 102 *      someone outside the chain, and we made sure they ran, and they do
 103 *      nothing (spin in the 2LS or something?)
 104 *              no, they should have gotten a msg about us being preempted,
 105 *              since whoever we turn into gets the message about us swapping.
 106 *      - anyway, it's not clear if this is with MCSPDR, event delivery,
 107 *      preemption handling, or just an artifact of the test (less likely)
 108 * why aren't MCS locks in uth_ctx getting dealt with?
 109 *      - because the event is handled, but the lock holder isn't run.  the
 110 *      preemption was dealt with, but nothing saved the lock holder
 111 *      - any uthread_ctx lockholder that gets preempted will get
 112 *      interrupted, and other cores will handle the preemption.  but that
 113 *      uthread won't run again without 2LS support.  either all spinners
 114 *      need to be aware of the 'lockholder' (PDR-style), or the 2LS needs
 115 *      to know when a uthread becomes a 'lockholder' to make sure it runs
 116 *      via user-level preempts.  If the latter, this needs to happen
 117 *      atomically with grabbing the lock, or else be able to handle lots of
 118 *      fake 'lockholders' (like round-robin among all of them)
 119 * why is the delay more than the expected delay?
 120 *      because it takes ~2ms to spawn and run a process
 121 *      could do this in a separate process, instead of a script
 122 *              could also consider not using pth_test and changing prov, but
 123 *              driving it by yields and requests.  would also test the
 124 *              alarm/wakeup code (process sets alarm, goes to sleep, wakes up
 125 *              and requests X cores)
 126 * why do we get occasional preempt-storms? (lots of change_tos)
 127 *      due to the MCS-PDR chain, which i tried fixing by adjusting the number
 128 *      of workers down to the number of vcores
 129 * why isn't the worker adaptation working?
 130 *              - it actually was working, and nr_workers == nr_vcores.  that
 131 *              just wasn't the root cause.
 132 *              - was expecting it to cut down on PDR kernel traffic
 133 *      - still get periods of low perf
 134 *              like O(100) preempt msgs per big preempt/prov
 135 *              does it really take that much to work out an MCS-PDR?
 136 *      - one thing is that if we fake vc ctx, we never receive preemption
 137 *      events.  might be a bad idea.
 138 *              - in general, yeah.  faking VC and turning off events can really
 139 *              muck with things
 140 *              - these events aren't necessarily delivered to a VC who will
 141 *              check events any time soon (might be the last one in the chain)
 142 *              - the core of the issue is that we have the right amount of
 143 *              workers and vcores, but that the system isn't given a chance to
 144 *              stabilize itself.  also, if we have some VCs that are just
 145 *              sitting around, spinning in the 2LS, if those get preempted, no
 146 *              one notices or cares (when faking vc_ctx / getting no events)
 147 *      - there is a slight race where we might make someone run who isn't a
 *      lockholder.  logically, it's okay.  worst case, it would act like an
 149 *      extra preempt and different startcore, which shouldn't be too bad.
 150 *
 151 * sanity check: does throughput match latency? (2.5GHz TSC, MCS lock)
 152 *      ex: 5000 locks/ms = 5 locks/us = 200ns/lock = 500 ticks / lock
 153 *      500 ticks * 31 workers (queue) = 15000 ticks
 154 *      avg acquire time was around 14K.  seems fine..
 155 *      when our MCSPDR throughput tanks (during preempts), it's around
 156 *      400-500 locks/ms, which is around 2us/lock.  
 157 *              when the locker on a preempted chain shows up, it needs to
 158 *              change to the next one in line. 
 159 *                      - though that should be in parallel with the other
 160 *                      lockholders letting go.  shouldn't be that bad
 161 *                      - no, it is probably at the head of the chain very soon,
 162 *                      such that it is the bottleneck for the actual lock.  2us
 163 *                      seems possible
 164 *
 165 * what does it take to get out of a preemption with (old) MCS-PDR?
 166 *      - these are now called pdro locks (old)
 167 *      - for a single preempt, it will take 1..n-1 changes.  avg n/2
 168 *      - for multiple preempts, it's nr_pre * that (avg np/2, worst np)
 169 *      - for every unlock/reacquire cycle (someone unlocks, then rejoins
 170 *      the list), its nr_preempts (aka, nr_workers - nr_vcores)
 171 *      - if we need to have a specific worker get out of the chain, on
 172 *      average, it'd take n/2 cycles (p*n/2 changes)  worst: np
 173 *      - if we want to get multiple workers out, the worst case is still
 174 *      np, but as p increases, we're more likely to approach n cycles
 175 *      - so the current model is np for the initial hit (to move the
 176 *      offline VCs to the end of the chain) and another np to get our
 177 *      specific workers out of the chain and yielding (2np)
 178 *
 179 *      - but even with 1 preempt, we're getting 80-200 changes per
 180 *
 181 *      - it shouldn't matter that the sys_change_to is really slow, should
 182 *      be the same amount of changes.  however, the preempted ones are
 183 *      never really at the tail end of the chain - they should end up right
 184 *      before the lockholder often.  while the sys_change_tos are slowly
 185 *      moving towards the back of the chain, the locking code is quickly
 186 *      removing (online) nodes from the head and putting them on the back.
 187 *
 188 *      - end result: based on lock hold time and lock delay time, a
 189 *      preempted VC stays in the MCS chain (swaps btw VC/nodes), and when
 190 *      it is inside the chain, someone is polling to make them run.  with
 191 *      someone polling, it is extremely unlikely that someone outside the
 192 *      chain will win the race and be able to change_to before the in-chain
 193 *      poller.  to clarify:
 194 *              - hold time and delay time matter, since the longer they are,
 195 *              the greater the amount of time the change_to percolation has to
 196 *              get the preempted VCs to the end of the chain (where no one
 197 *              polls them).
 198 *              - at least one vcore is getting the event to handle the
 199 *              preemption of the in-chain, offline VC.  we could change it so
 200 *              every VC polls the preempt_evq, or just wait til whoever is
 201 *              getting the messages eventually checks their messages (VC0)
 202 *              - if there is an in-chain poller, they will notice the instant
 203 *              the VC map changes, and then immediately change_to (and spin on
 204 *              the proclock in the kernel).  there's almost no chance of a
 205 *              normal preempt event handler doing that faster.  (would require
 206 *              some IRQ latency or something serious).
 207 * - adding in any hold time trashes our microbenchmark's perf, but a
 208 * little delay time actually helps: (all with no preempts going on)
 209 *      - mcspdr, no delay: 4200-4400 (-w31 -l10000, no faking, etc)
 210 *      - mcspdr, d = 1: 4400-4800
 211 *      - mcspdr, d = 2: 4200-5200
 212 *      - as you add delay, it cuts down on contention for the
 213 *      lock->lock cacheline.  but if you add in too much, you'll tank
 214 *      throughput (since there is no contention at all).
 215 *      - as we increase the delay, we cut down on the chance of the
 216 *      preempt storm / preempt-stuck-in-the-chain, though it can still
 217 *      happen, even with a delay of 10us
 218 * - maybe add in the lockholder again? (removed in 73701d6bfb)
 219 *      - massively cuts performance, like 2x throughput, without
 220 *      preempts
 *      - its ability to help depends on impl:
 *              - in one version (old style), it didn't help much at all
 223 *              - in another (optimized lockholder setting), i can't
 224 *              even see the throughput hit, it recovered right away,
 225 *              with O(5) messages
 226 *              - the diff was having the lockholder assign the vcoreid
 227 *              before passing off to the next in the chain, so that
 228 *              there is less time with having "no lockholder".
 *              (there's a brief period where the lockholder says it is
 *              the next person, who still spins.  they'll have to make
 *              sure their pred runs)
 * - adj_workers doesn't matter either...
 234 *      - the 2LS and preemption handling might be doing this
 235 *      automatically, when handle_vc_preempt() does a
 236 *      thread_paused() on its current_uthread.
 237 *      - adj_workers isn't critical if we're using some locks
 238 *      that check notif_pending.  eventually someone hears
 239 *      about preempted VCs (assuming we can keep up)
 240 *
 241 * What about delays?  both hold and delay should make it easier to get
 242 * the preempted vcore to the end of the chain.  but do they have to be
 243 * too big to be reasonable?
 244 *      - yes.  hold doesn't really help much until everything is
 245 *      slower.  even with a hold of around 1.2us, we still have the
 246 *      change_to-storms and lowered throughput.
 247 *      - doing a combo helps too.  if you hold for 1ns (quite a bit
 248 *      more actually, due to the overhead of ndelay, but sufficient to
 249 *      be "doing work"), and delaying for around 7us before rejoining,
 250 *      there's only about a 1/5 chance of a single preempt messing us
 251 *      up
 252 *              - though having multiple preempts outstanding make this less
 253 *              likely to work.
 254 *              - and it seems like if we get into the storm scenario, we
 255 *              never really get out.  either we do quickly or never do.
 256 *              depending on the workload, this could be a matter of luck
 257 *
 258 * So we could try tracking the lockholder, but only looking at it when
 259 * we know someone was preempted in the chain - specifically, when our
 260 * pred is offline.  when that happens, we don't change to them, we
 261 * make sure the lockholder is running.
 262 *      - tracking takes us from 4200->2800 throughput or so for MCS
 263 *      - 5200 -> 3700 or so for MCS in vc_ctx (__MCSPDR)
 264 *      - main spike seems to be in the hold time.  bimodal distrib,
 265 *      with most below 91 (the usual is everything packed below 70) and
 266 *      a big spike around 320
 267 *
 268 * Summary:
 269 *
 * So we need to have someone outside the chain change_to the one in the
 * chain; o/w, someone will always be in the chain.  Right now, it's always
 272 * the next in line who is doing the changing, so a preempted vcore is
 273 * always still in the chain. 
 274 *
 275 * If the locking workload has some delaying, such as while holding the
 276 * lock or before reacquiring, the "change_to" storm might not be a
 277 * problem.  If it is, the only alternative I have so far is to check the
 278 * lockholder (which prevents a chain member from always ensuring their
 279 * pred runs).  This hurts the lock's scalability/performance when we
 * aren't being preempted.  On the other hand, based on what you're doing
 281 * with the lock, one more cache miss might not be as big of a deal as in
 * lock_test.  Especially since when you get stormed, your throughput could be
 * terrible and never recover.
 284 *
 285 * Similar point: you can use spinpdr locks.  They have the PDR-benefits,
 286 * and won't induce the storm of change_tos.  However, this isn't much
 287 * better for contended locks.  They perform 2-3x worse (on c89) without
 288 * preemption.  Arguably, if you were worried about the preempt storms and
 289 * want scalability, you might want to use mcspdr with lockholders.
 290 *
 * The MCSPDRS (now just called MCSPDR, these are default) locks can avoid
 292 * the storm, but at the cost of a little more in performance.  mcspdrs
 293 * style is about the same when not getting preempted from uth ctx compared
 294 * to mcspdr (slight drop).  When in vc ctx, it's about 10-20% perf hit
 295 * (PDRS gains little from --vc_ctx). 
 296 *
 297 * Turns out there is a perf hit to PDRS (and any non-stack based qnode)
 298 * when running on c89.  The issue is that after shuffling the vcores
 299 * around, they are no longer mapped nicely to pcores (VC0->PC1, VC1->PC2).
 300 * This is due to some 'false sharing' of the cachelines, caused mostly by
 301 * aggressive prefetching (notably the intel adjacent cacheline prefetcher,
 * which grabs two CLs at a time!).  Basically, stack-based qnodes end up
 * very far apart in memory.  Cranking up the padding in
 304 * qnodes in the "qnodes-in-locks" style replicates this.
 305 *
 306 * For some info on the prefetching:
 307 *      http://software.intel.com/en-us/articles/optimizing-application-performance-on-intel-coret-microarchitecture-using-hardware-implemented-prefetchers/
 308 *      http://software.intel.com/en-us/forums/topic/341769
 309 *
 310 * Here's some rough numbers of the different styles for qnodes on c89.
 311 * 'in order' is VCn->PC(n+1) (0->1, 1->2).  Worst order is with even VCs
 312 * on one socket, odds on the other.  the number of CLs is the size of a
 313 * qnode.  mcspdr is the new style (called mcspdrs in some places in this
 314 * document), with lock-based qnodes.  mcspdr2 is the same, but with
 * stack-based qnodes.  mcspdro is the old style (bad at recovery), stack
 316 * based, sometimes just called mcs-pdr
 317 *
 318 *      with prefetchers disabled (MCS and DCU)
 319 *              mcspdr   1CL  4.8-5.4 in order, 3.8-4.2 worst order
 320 *              mcspdr   2CL          in order,         worst order
 321 *              mcspdr   4CL  5.2-6.0 in order, 4.7-5.3 worst order
 322 *              mcspdr   8CL  5.4-6.7 in order, 5.2-6.2 worst order
 323 *              mcspdr  16CL  5.1-5.8 in order, 5.2-6.8 worst order
 324 *              mcspdr2 stck          in order,         worst order
 325 *              mcspdro stck  4-3.4.3 in order, 4.2-4.5 worst order
 326 *              mcspdro-vcctx 4.8-7.0 in order, 5.3-6.7 worst order
 327 *              can we see the 2 humps? 
 328 *                      mcspdr 1CL yes but less, varied, etc
 329 *                      mcspdr2 no
 330 *
 331 *      test again with worst order with prefetchers enabled
 332 *              mcspdr   1CL  3.8-4.0 in order, 2.6-2.7 worst order
 333 *              mcspdr   2CL  4.2-4.4 in order, 3.8-3.9 worst order
 334 *              mcspdr   4CL  4.5-5.2 in order, 4.0-4.2 worst order
 335 *              mcspdr   8CL  4.4-5.1 in order, 4.3-4.7 worst order
 336 *              mcspdr  16CL  4.4-4.8 in order, 4.4-5.3 worst order
 337 *              mcspdr2 stck  3.0-3.0 in order, 2.9-3.0 worst order
 338 *              mcspdro stck  4.2-4.3 in order, 4.2-4.4 worst order
 339 *              mcspdro-vcctx 5.2-6.4 in order, 5.0-5.9 worst order
 340 *              can we see the 2 humps?
 341 *                      mcspdrs 1CL yes, clearly
 342 *                      mcspdr2 no
 343 *
 344 * PROGRAM FEATURES
 345 *      - verbosity?  vcoremap, preempts, the throughput and latency histograms?
 346 *      - have a max workers option (0?) == max vcores
 347 *      - would like to randomize (within bounds) the hold/delay times
 348 *              - help avoid convoys with MCS locks
 349 *
 350 * PERFORMANCE:
 351 *
 352 *      pcore control?  (hyperthreading, core 0, cross socket?)
 353 *              want some options for controlling which threads run where, or
 354 *              which vcores are even used (like turning off hyperthreading)?
 355 *      implement ticket spinlocks?  (more fair, more effects of preempts)
 356 *              no simple way to do PDR either, other than 'check everyone'
 357 *      MCS vs MCSPDR vs __MCSPDR
 358 *              MCS seems slightly better than __MCSPDR (and it should)
 359 *              MCSPDR is a bit worse than __MCSPDR
 360 *                      - the uth_disable/enable code seems to make a
 361 *                      difference.
 362 *                      - i see why the latencies are worse, since they have
 363 *                      extra work to do, but the internal part that contends
 364 *                      with other cores shouldn't be affected, unless there's
 365 *                      some other thing going on.  Or perhaps there isn't
 366 *                      always someone waiting for the lock?
 *                      - faking VC ctx mostly negates the cost of MCSPDR vs
 *                      __MCSPDR
 *              things that made a big diff: CL aligning the qnodes, putting
 *              qnodes on stacks, reading in the vcoreid once before ensuring()
 371 *      both MCS CAS unlocks could use some branch prediction work
 372 *      spinpdr locks are 2-3x faster than spinlocks...
 373 *              test, test&set  vs the existing test&set, plus lots of asserts
 374 *
 375 *      some delay (like 10us) lowers latency while maintaining throughput
 376 *              - makes sense esp with MCS.  if you join the queue at the last
 377 *              second, you'll measure lower latency than attempting right away
 378 *              - also true for spinlocks
 379 *              - we can probably figure out the max throughput (TP = f(delay))
 380 *              for each lock type
 381 *
 382 *      hard to get steady numbers with MCS - different runs of the same test
 383 *      will vary in throughput by around 15-30% (e.g., MCS varying from 3k-4k
 384 *      L/ms)
 385 *              - happens on c89 (NUMA) and hossin (UMA)
 386 *              - spinlocks seem a little steadier.
 387 *              - for MCS locks, the order in which they line up across the
 388 *              pcores will matter.  like if on one run, i regularly hand off
 *              between cores in the same socket and only do one cross-socket
 *              step
 391 *              - run a lot of shorter ones to get a trend, for now
 *              - might be correlated with spikes in held times (last bin)
 393 *              - can't turn off legacy USB on c89 (SMM) - interferes with PXE
 394 *
 395 * PREEMPTS:
 396 * better preempt record tracking?
 397 *      i just hacked some event-intercept and timestamp code together
 398 *      maybe put it in the event library?
 399 *      the timestamps definitely helped debugging
 400 *
 401 * is it true that if uthread code never spins outside a PDR lock, then it
 402 * doesn't need preemption IPIs?  (just someone checks the event at some
 403 * point). 
 404 *      think so: so long as you make progress and when you aren't, you
 405 *      check events (like if a uthread blocks on something and enters VC
 406 *      ctx)
 407 * adjusting the number of workers, whether vcores or uthreads
 408 * - if you have more lockers than cores:
 *      - spinpdr: a worker will get starved (akaros) (without 2LS support)
 410 *              - running this from uth context will cause a handle_events
 411 *      - mcspdr will require the kernel to switch (akaros)
 412 *      - spin (akaros) might DL (o/w nothing), (linux) poor perf
 413 *      - mcs (akaros) will DL, (linux) poor perf
 414 *      - poor perf (latency spikes) comes from running the wrong thread
 415 *      sometimes
 416 *      - deadlock comes from the lack of kernel-level context switching
 417 * - if we scale workers down to the number of active vcores:
 418 *      - two things: the initial hit, and the steady state.  during the
 419 *      initial hit, we can still deadlock, since we have more lockers than
 420 *      cores
 421 *              - non-pdr (akaros) could deadlock in the initial hit
 422 *              - (akaros) steady state, everything is normal (just fewer cores)
 423 *      - how can we adjust this in linux?
 424 *              - if know how many cores you have, then futex wait the others
 425 *              - need some way to wake them back up
 426 *              - if you do this in userspace, you might need something PDR-like
 427 *              to handle when the "2LS" code gets preempted
 428 *      - as mentioned above, the problem in akaros is that the lock/unlock
 429 *      might be happening too fast to get into the steady-state and recover
 430 *      from the initial preemption
 431 * - one of our benefits is that we can adapt in userspace, with userspace
 432 * knowledge, under any circumstance.
 433 *      - we have the deadlock windows (forcing PDR).
 434 *      - in return, we can do this adaptation in userspace
 435 *      - and (arguably) anyone who does this in userspace will need PDR
 436 *
 437 * MEASUREMENT (user/parlib/measure.c)
 438 *      extract into its own library, for linux apps
 439 *      print out raw TSC times?  might help sync up diff timelines
 440 *      Need more latency bins, spinlocks vary too much
 441 *      maybe we need better high/low too, since this hist looks bad too
 442 *              or not center on the average?
 *              for printing, it's hard to know without already binning.
 444 *              maybe bin once (latency?), then use that to adjust the hist?
 445 *
 446 *      Had this on a spinlock:
 447 *      [      32 -    35656] 1565231:
 448 *      (less than 200 intermediate)
 449 *      [  286557 - 20404788]   65298: *
 450 *
 451 *      Samples per dot: 34782
 452 *      Total samples: 1640606
 453 *      Avg time   : 96658
 454 *      Stdev time : 604064.440882
 455 *      Coef Var   : 6.249503
 456 *              High coeff of var with serious outliers, adjusted bins
 457 *              50/75/90/99: 33079 / 33079 / 33079 / 290219 (-<860)
 458 *              Min / Max  : 32 / 20404788
 459 *      was 50/75/90 really within 860 of each other?
 460 *
 461 *      when we are preempted and don't even attempt anything, say for 10ms, it
 462 *      actually doesn't hurt our 50/75/90/99 too much.  we have a ridiculous
 463 *      stddev and max, and high average, but there aren't any additional
 464 *      attempts at locking to mess with the attempt-latency.  Only nr_vcores
 465 *      requests are in flight during the preemption, but we can spit out around
 466 *      5000 per ms when we aren't preempted.
 467 *
 468 */
 469
 470const char *argp_program_version = "lock_test v0.1475264";
 471const char *argp_program_bug_address = "<akaros@googlegroups.com>";
 472
 473static char doc[] = "lock_test -- spinlock benchmarking";
 474static char args_doc[] = "-w NUM -l NUM -t LOCK";
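/* e.g. (values are illustrative):
 *      lock_test -t mcs -w 4 -l 10000 -d 1000 -o samples.dat
 * runs the userspace MCS test with 4 workers, 10000 loops each, roughly 1us
 * of delay between grabs, and dumps the raw samples to samples.dat. */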
 475
 476#define OPT_VC_CTX 1
 477#define OPT_ADJ_WORKERS 2
 478
 479static struct argp_option options[] = {
 480        {"workers",     'w', "NUM",     OPTION_NO_USAGE, "Number of threads/cores (max possible)"},
 481        {0, 0, 0, 0, ""},
 482        {"loops",       'l', "NUM",     OPTION_NO_USAGE, "Number of loops per worker (10000)"},
 483        {0, 0, 0, 0, ""},
        {"type",        't', "LOCK",    OPTION_NO_USAGE, "Type of lock to use"},
 485        {0, 0, 0, 0, "Other options (not mandatory):"},
 486        {"adj_workers", OPT_ADJ_WORKERS, 0,     0,
 487                                               "Adjust workers such that the "
 488                                               "number of workers equals the "
 489                                               "number of vcores"},
 490        {"vc_ctx",      OPT_VC_CTX, 0,  0, "Run threads in mock-vcore context"},
 491        {0, 0, 0, 0, ""},
 492        {"hold",        'h', "NSEC",    0, "nsec to hold the lock"},
 493        {"delay",       'd', "NSEC",    0, "nsec to delay between grabs"},
 494        {"print",       'p', "ROWS",    0, "Print ROWS of optional measurements"},
        {"outfile",     'o', "FILE",    0, "Write raw samples to FILE"},
 496        { 0 }
 497};
 498
 499struct lock_test {
 500        const char *name;
 501        void *(*func)(void *arg);
 502        int id;
 503};
 504
 505struct prog_args {
 506        unsigned int            nr_threads;
 507        unsigned int            nr_loops;
 508        unsigned int            hold_time;
 509        unsigned int            delay_time;
 510        unsigned int            nr_print_rows;
 511        bool                    fake_vc_ctx;
 512        bool                    adj_workers;
 513        char                    *outfile_path;
 514        struct lock_test        *test;
 515};
 516struct prog_args pargs = {0};
 517
 518/* Globals */
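/* One record per lock acquisition: the TSC read just before attempting the
 * lock (pre), just after acquiring it (acq), and just after releasing it
 * (un).  The valid flag lets a test throw individual samples out. */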
 519struct lock_sample {
 520        uint64_t pre;
 521        uint64_t acq;
 522        uint64_t un;
 523        bool valid;
 524};
 525struct lock_sample **times;
 526bool run_locktest = TRUE;
 527pthread_barrier_t start_test;
 528
 529/* Locking functions.  Define globals here, init them in main (if possible), and
 530 * use the lock_func() macro to make your thread func. */
 531
 532#define lock_func(lock_name, lock_cmd, unlock_cmd)                             \
 533void *lock_name##_thread(void *arg)                                            \
 534{                                                                              \
 535        int thread_id = (long)arg;                                             \
 536        int hold_time = ACCESS_ONCE(pargs.hold_time);                          \
 537        int delay_time = ACCESS_ONCE(pargs.delay_time);                        \
 538        int nr_loops = ACCESS_ONCE(pargs.nr_loops);                            \
 539        bool fake_vc_ctx = ACCESS_ONCE(pargs.fake_vc_ctx);                     \
 540        bool adj_workers = ACCESS_ONCE(pargs.adj_workers);                     \
 541        uint64_t pre_lock, acq_lock, un_lock;                                  \
 542        struct lock_sample *this_time;                                         \
 543        struct mcs_lock_qnode mcs_qnode = MCS_QNODE_INIT;                      \
 544        struct mcs_pdro_qnode pdro_qnode = MCSPDRO_QNODE_INIT;                 \
 545        int i;                                                                 \
 546                                                                               \
 547        os_pthread_prep_work(thread_id);                                       \
        /* guessing a unique vcoreid for the __mcspdr test.  if the
 549         * program gets preempted for that test, things may go nuts */         \
        pdro_qnode.vcoreid = (thread_id + 1) % pargs.nr_threads;               \
 551        /* Wait til all threads are created.  Ideally, I'd like to busywait
 552         * unless absolutely critical to yield */                              \
 553        pthread_barrier_wait(&start_test);                                     \
 554        if (fake_vc_ctx) {                                                     \
 555                /* tells the kernel / other vcores we're in vc ctx */          \
 556                uth_disable_notifs();                                          \
 557                /* tricks ourselves into believing we're in vc ctx */          \
 558                __vcore_context = TRUE;                                        \
 559        }                                                                      \
 560        for (i = 0; i < nr_loops; i++) {                                       \
 561                if (!run_locktest)                                             \
 562                        break;                                                 \
 563                pre_lock = read_tsc_serialized();                              \
 564                                                                               \
 565                lock_cmd                                                       \
 566                                                                               \
 567                acq_lock = read_tsc_serialized();                              \
 568                if (hold_time)                                                 \
 569                        ndelay(hold_time);                                     \
 570                                                                               \
 571                unlock_cmd                                                     \
 572                                                                               \
 573                un_lock = read_tsc_serialized();                               \
 574                this_time = &times[thread_id][i];                              \
 575                this_time->pre = pre_lock;                                     \
 576                this_time->acq = acq_lock;                                     \
 577                this_time->un = un_lock;                                       \
 578                /* Can turn these on/off to control which samples we gather */ \
 579                this_time->valid = TRUE;                                       \
 580                /* this_time->valid = (num_vcores() == max_vcores());  */      \
 581                                                                               \
 582                if (delay_time)                                                \
 583                        ndelay(delay_time);                                    \
 584                /* worker thread ids are 0..n-1.  if we're one of the threads
 585                 * that's beyond the VC count, we yield. */                    \
 586                if (adj_workers && num_vcores() < thread_id + 1) {             \
 587                        if (fake_vc_ctx) {                                     \
 588                                __vcore_context = FALSE;                       \
 589                                uth_enable_notifs();                           \
 590                        }                                                      \
 591                        /* we'll come back up once we have enough VCs running*/\
 592                        pthread_yield();                                       \
 593                        if (fake_vc_ctx) {                                     \
 594                                uth_disable_notifs();                          \
 595                                __vcore_context = TRUE;                        \
 596                        }                                                      \
 597                }                                                              \
 598                cmb();                                                         \
 599        }                                                                      \
 600        /* First thread to finish stops the test */                            \
 601        run_locktest = FALSE;                                                  \
 602        if (fake_vc_ctx) {                                                     \
 603                __vcore_context = FALSE;                                       \
 604                uth_enable_notifs();                                           \
 605        }                                                                      \
 606        return (void*)(long)i;                                                 \
 607}
 608
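/* Adding a new lock type follows the same pattern as the ones below: define
 * its global lock, instantiate a thread function with lock_func(), and add an
 * entry to the appropriate tests[] array.  Hypothetical sketch (the "tas"
 * lock and its acquire/release functions are made up for illustration):
 *
 *      tas_lock_t tas_lock = TAS_LOCK_INIT;
 *
 *      lock_func(tas,
 *                tas_lock_acquire(&tas_lock);,
 *                tas_lock_release(&tas_lock);)
 *
 *      // and in tests[]: {"tas", tas_thread},
 */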
 609spinlock_t spin_lock = SPINLOCK_INITIALIZER;
 610struct mcs_lock mcs_lock = MCS_LOCK_INIT;
 611
 612/* Defines locking funcs like "mcs_thread" */
 613lock_func(mcs,
 614          mcs_lock_lock(&mcs_lock, &mcs_qnode);,
 615          mcs_lock_unlock(&mcs_lock, &mcs_qnode);)
 616lock_func(mcscas,
 617          mcs_lock_lock(&mcs_lock, &mcs_qnode);,
 618          mcs_lock_unlock_cas(&mcs_lock, &mcs_qnode);)
 619lock_func(spin,
 620          spinlock_lock(&spin_lock);,
 621          spinlock_unlock(&spin_lock);)
 622
 623#ifdef __akaros__
 624struct spin_pdr_lock spdr_lock = SPINPDR_INITIALIZER;
 625struct mcs_pdr_lock mcspdr_lock;
 626struct mcs_pdro_lock mcspdro_lock = MCSPDRO_LOCK_INIT;
 627
 628lock_func(mcspdr,
 629          mcs_pdr_lock(&mcspdr_lock);,
 630          mcs_pdr_unlock(&mcspdr_lock);)
 631lock_func(mcspdro,
 632          mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);,
 633          mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);)
 634lock_func(__mcspdro,
 635          __mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);,
 636          __mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);)
 637lock_func(spinpdr,
 638          spin_pdr_lock(&spdr_lock);,
 639          spin_pdr_unlock(&spdr_lock);)
 640
 641static struct lock_test tests[] = {
 642        {"mcs", mcs_thread},
 643        {"mcscas", mcscas_thread},
 644        {"mcspdr", mcspdr_thread},
 645        {"mcspdro", mcspdro_thread},
 646        {"__mcspdro", __mcspdro_thread},
 647        {"spin", spin_thread},
 648        {"spinpdr", spinpdr_thread},
 649        {}
 650};
 651
 652#else
 653
 654#define LOCKTEST_MCS            1
 655#define LOCKTEST_QUEUE          2
 656#define LOCKTEST_SPIN           3
 657
 658static struct lock_test tests[] = {
 659        {"mcs", mcs_thread},
 660        {"mcscas", mcscas_thread},
 661        {"spin", spin_thread},
 662        {"mcs-kernel", NULL, LOCKTEST_MCS},
 663        {"queue-kernel", NULL, LOCKTEST_QUEUE},
 664        {"spin-kernel", NULL, LOCKTEST_SPIN},
 665        {}
 666};
 667
 668#endif
 669
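/* Sample extractors handed to the measurement library (compute_stats() /
 * print_throughput()): each pulls one value out of the 2D sample array, or
 * returns -1 to skip a slot that was never filled in. */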
 670static int get_acq_latency(void **data, int i, int j, uint64_t *sample)
 671{
 672        struct lock_sample **times = (struct lock_sample**)data;
 673        /* 0 for initial time means we didn't measure */
 674        if (times[i][j].pre == 0)
 675                return -1;
        /* can optionally throw out invalid times (keep this in sync with the
         * lock_test macro, based on what you want to measure). */
 678        #if 0
 679        if (!times[i][j].valid)
 680                return -1;
 681        #endif
 682        *sample = times[i][j].acq - times[i][j].pre - get_tsc_overhead();
 683        return 0;
 684}
 685
 686static int get_hld_latency(void **data, int i, int j, uint64_t *sample)
 687{
 688        struct lock_sample **times = (struct lock_sample**)data;
 689        /* 0 for initial time means we didn't measure */
 690        if (times[i][j].pre == 0)
 691                return -1;
 692        *sample = times[i][j].un - times[i][j].acq - get_tsc_overhead();
 693        return 0;
 694}
 695
 696static int get_acq_timestamp(void **data, int i, int j, uint64_t *sample)
 697{
 698        struct lock_sample **times = (struct lock_sample**)data;
 699        /* 0 for initial time means we didn't measure */
 700        if (times[i][j].pre == 0)
 701                return -1;
 702        *sample = times[i][j].acq;
 703        return 0;
 704}
 705
 706#ifdef __akaros__
 707
 708/* Lousy event intercept.  build something similar in the event library? */
 709#define MAX_NR_EVENT_TRACES 1000
 710uint64_t preempts[MAX_NR_EVENT_TRACES] = {0};
 711uint64_t indirs[MAX_NR_EVENT_TRACES] = {0};
 712atomic_t preempt_idx;
 713atomic_t indir_idx;
 714atomic_t preempt_cnt;
 715atomic_t indir_cnt;
 716
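/* Event handlers: stamp the TSC for each preempt / indir message we receive,
 * so print_preempt_trace() can later line the events up against the lock
 * samples.  The counts keep incrementing even after the trace arrays fill. */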
 717static void trace_preempt(struct event_msg *ev_msg, unsigned int ev_type,
 718                          void *data)
 719{
 720        unsigned long my_slot = atomic_fetch_and_add(&preempt_idx, 1);
 721
 722        if (my_slot < MAX_NR_EVENT_TRACES)
 723                preempts[my_slot] = read_tsc();
 724        atomic_inc(&preempt_cnt);
 725}
 726
 727static void trace_indir(struct event_msg *ev_msg, unsigned int ev_type,
 728                        void *data)
{
        unsigned long my_slot = atomic_fetch_and_add(&indir_idx, 1);

        if (my_slot < MAX_NR_EVENT_TRACES)
 733                indirs[my_slot] = read_tsc();
 734        atomic_inc(&indir_cnt);
 735}
 736
 737/* Helper, prints out the preempt trace */
 738static void print_preempt_trace(uint64_t starttsc, int nr_print_rows)
 739{
 740        /* reusing nr_print_rows for the nr preempt/indirs rows as well */
 741
 742        int preempt_rows = MIN(MAX_NR_EVENT_TRACES, nr_print_rows);
 743        if (pargs.fake_vc_ctx) {
 744                printf("No preempt trace available when faking vc ctx\n");
 745                return;
 746        }
 747        printf("\n");
 748        printf("Nr Preempts: %d\n", atomic_read(&preempt_cnt));
 749        printf("Nr Indirs  : %d\n", atomic_read(&indir_cnt));
 750        if (preempt_rows)
 751                printf("Preempt/Indir events:\n-----------------\n");
 752        for (int i = 0; i < preempt_rows; i++) {
 753                if (preempts[i])
 754                        printf("Preempt %3d at %6llu\n",
 755                               i, tsc2msec(preempts[i] - starttsc));
 756        }
 757        for (int i = 0; i < preempt_rows; i++) {
 758                if (indirs[i])
 759                        printf("Indir   %3d at %6llu\n",
 760                               i, tsc2msec(indirs[i] - starttsc));
 761        }
 762}
 763
 764/* Make sure we have enough VCs for nr_threads, pref 1:1 at the start */
 765static void os_prep_work(pthread_t *worker_threads, int nr_threads)
 766{
 767        if (nr_threads > max_vcores()) {
 768                printf("Too many threads (%d) requested, can't get more than %d vc\n",
 769                       nr_threads, max_vcores());
 770                exit(-1);
 771        }
 772        atomic_init(&preempt_idx, 0);
 773        atomic_init(&indir_idx, 0);
 774        atomic_init(&preempt_cnt, 0);
 775        atomic_init(&indir_cnt, 0);
 776        parlib_never_yield = TRUE;
 777        pthread_need_tls(FALSE);
 778        pthread_mcp_init();             /* gives us one vcore */
 779        register_ev_handler(EV_VCORE_PREEMPT, trace_preempt, 0);
 780        register_ev_handler(EV_CHECK_MSGS, trace_indir, 0);
 781        if (pargs.fake_vc_ctx) {
 782                /* need to disable events when faking vc ctx.  since we're
 783                 * looping and not handling events, we could run OOM */
 784                clear_kevent_q(EV_VCORE_PREEMPT);
 785                clear_kevent_q(EV_CHECK_MSGS);
 786        }
 787        vcore_request_total(nr_threads);
 788        parlib_never_vc_request = TRUE;
 789        for (int i = 0; i < nr_threads; i++) {
 790                printd("Vcore %d mapped to pcore %d\n", i,
 791                       __procinfo.vcoremap[i].pcoreid);
 792        }
 793}
 794
 795#endif
 796
 797static void print_lock_types(void)
 798{
 799        printf("Available lock types:\n");
 800        for (struct lock_test *t = tests; t->name; t++)
 801                printf("\t%s\n", t->name);
 802        printf("\n");
 803}
 804
 805/* Argument parsing */
 806static error_t parse_opt (int key, char *arg, struct argp_state *state)
 807{
        struct prog_args *pargs = state->input;
        int val;

 810        switch (key) {
        case 'w':
                val = atoi(arg);
                if (val < 0) {
                        printf("Negative nr_threads...\n\n");
                        argp_usage(state);
                }
                pargs->nr_threads = val;
                break;
        case 'l':
                val = atoi(arg);
                if (val < 0) {
                        printf("Negative nr_loops...\n\n");
                        argp_usage(state);
                }
                pargs->nr_loops = val;
                break;
 825        case OPT_ADJ_WORKERS:
 826                pargs->adj_workers = TRUE;
 827                break;
 828        case OPT_VC_CTX:
 829                pargs->fake_vc_ctx = TRUE;
 830                break;
        case 'h':
                val = atoi(arg);
                if (val < 0) {
                        printf("Negative hold_time...\n\n");
                        argp_usage(state);
                }
                pargs->hold_time = val;
                break;
        case 'd':
                val = atoi(arg);
                if (val < 0) {
                        printf("Negative delay_time...\n\n");
                        argp_usage(state);
                }
                pargs->delay_time = val;
                break;
 845        case 'o':
 846                pargs->outfile_path = arg;
 847                break;
        case 'p':
                val = atoi(arg);
                if (val < 0) {
                        printf("Negative print_rows...\n\n");
                        argp_usage(state);
                }
                pargs->nr_print_rows = val;
                break;
 855        case 't':
 856                for (struct lock_test *t = tests; t->name; t++) {
 857                        if (!strcmp(t->name, arg)) {
 858                                pargs->test = t;
 859                                break;
 860                        }
 861                }
 862                if (!pargs->test) {
 863                        printf("Unknown locktype %s\n\n", arg);
 864                        print_lock_types();
 865                        argp_usage(state);
 866                }
 867                break;
 868        case ARGP_KEY_ARG:
 869                printf("Warning, extra argument %s ignored\n\n", arg);
 870                break;
 871        case ARGP_KEY_END:
 872                if (!pargs->test) {
 873                        printf("Must select a type of lock.\n\n");
 874                        print_lock_types();
 875                        argp_usage(state);
 876                        break;
 877                }
 878                break;
 879        default:
 880                return ARGP_ERR_UNKNOWN;
 881        }
 882        return 0;
 883}
 884
 885static struct argp argp = {options, parse_opt, args_doc, doc};
 886
 887struct results {
 888        void **loops_done;
 889        struct lock_sample **thread_samples;
 890};
 891
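/* Userspace test: spawn nr_threads pthreads running the chosen lock's thread
 * function, wait for them all, and return each thread's loop count plus the
 * per-loop timestamp samples. */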
 892static struct results run_pthreads_test(void)
 893{
        struct results results;
        void **loops_done;
        pthread_t *worker_threads;

#ifdef __akaros__
        /* The PDR locks only exist on Akaros */
        mcs_pdr_init(&mcspdr_lock);
#endif

 901        worker_threads = malloc(sizeof(pthread_t) * pargs.nr_threads);
 902        if (!worker_threads)
 903                handle_error("pthread_t malloc failed:");
 904        loops_done = malloc(sizeof(void*) * pargs.nr_threads);
 905        if (!loops_done)
 906                handle_error("loops_done malloc failed");
 907        pthread_barrier_init(&start_test, NULL, pargs.nr_threads);
 908
 909        times = malloc(sizeof(struct lock_sample *) * pargs.nr_threads);
 910        assert(times);
 911        for (int i = 0; i < pargs.nr_threads; i++) {
 912                times[i] = malloc(sizeof(struct lock_sample) * pargs.nr_loops);
 913                if (!times[i])
 914                        handle_error("Record keeping malloc");
 915                memset(times[i], 0,
 916                       sizeof(struct lock_sample) * pargs.nr_loops);
 917        }
 918        printd("Record tracking takes %ld bytes of memory\n",
 919               pargs.nr_threads * pargs.nr_loops * sizeof(struct lock_sample));
 920
 921        /* ensure we have enough VCs */
 922        os_prep_work(worker_threads, pargs.nr_threads);
 923
 924        for (long i = 0; i < pargs.nr_threads; i++) {
 925                if (pthread_create(&worker_threads[i], NULL, pargs.test->func,
 926                                   (void*)i))
 927                        handle_error("pth_create failed");
 928        }
 929
 930        for (int i = 0; i < pargs.nr_threads; i++)
 931                pthread_join(worker_threads[i], &loops_done[i]);
 932
 933        results.loops_done = loops_done;
 934        results.thread_samples = times;
 935        return results;
 936}
 937
 938#ifdef __akaros__
 939
 940static struct results run_kernel_mod_test(struct lock_test *t)
 941{
 942        printf("Unsupported on Akaros\n");
 943        exit(-1);
 944}
 945
 946#else
 947
 948/* Works with tests/linux/modules/mcs.ko.  Write commands in, read results back.
 949 * The test runs on a pread(off==0).  The return format is all N of the
 950 * loops_done void*, followed by a 2D array of samples for threads then loops.
 951 * (t0_l0, t0_l1, t0_l2..., t1_l0, t1_l1, t1_l2...). */
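/* For example (values are illustrative): with LOCKTEST_MCS == 1, writing
 * "1 4 10000 0 0" asks the module to run the in-kernel MCS test with 4
 * threads, 10000 loops each, and no hold or delay time. */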
 952static struct results run_kernel_mod_test(struct lock_test *t)
 953{
 954        struct results results;
 955        int fd;
 956        void *kresults;
 957        size_t sz_rets, sz_times_per, results_sz;
 958        ssize_t ret, amt = 0;
 959        struct lock_sample **thread_samples;
 960        struct lock_sample *base;
 961
 962        fd = open("/sys/kernel/mcs", O_WRONLY);
 963        if (fd < 0)
 964                handle_error("open write");
 965        if (dprintf(fd, "%u %u %u %u %u", pargs.test->id, pargs.nr_threads,
 966                    pargs.nr_loops, pargs.hold_time, pargs.delay_time) < 0)
 967                handle_error("setting opts.  too many threads?");
 968        /* For the change in parameters (notably the file size, due to
 969         * threads * loops) to take effect, you need to close and reopen. */
 970        close(fd);
 971        fd = open("/sys/kernel/mcs", O_RDONLY);
 972        if (fd < 0)
 973                handle_error("open read");
 974
 975        sz_rets = pargs.nr_threads * sizeof(void*);
 976        sz_times_per = pargs.nr_loops * sizeof(struct lock_sample);
 977        results_sz = sz_rets + pargs.nr_threads * sz_times_per;
 978        kresults = malloc(results_sz);
 979        if (!kresults)
 980                handle_error("alloc");
 981
 982        do {
 983                ret = read(fd, kresults + amt, results_sz - amt);
 984                if (ret < 0)
 985                        handle_error("read");
 986                amt += ret;
 987        } while (ret != 0);
 988
        if (amt != results_sz)
                printf("\n\nERROR: short read, got %zd wanted %zu\n\n", amt,
                       results_sz);
 991
 992        thread_samples = malloc(pargs.nr_threads * sizeof(struct lock_sample*));
 993        if (!thread_samples)
 994                handle_error("alloc");
 995        base = kresults + pargs.nr_threads * sizeof(void*);
 996        for (int i = 0; i < pargs.nr_threads; i++)
 997                thread_samples[i] = base + i * pargs.nr_loops;
 998
 999        results.loops_done = kresults;
1000        results.thread_samples = thread_samples;
1001        return results;
1002}
1003
1004#endif
1005
1006static struct results run_test(void)
1007{
1008        if (pargs.test->id)
1009                return run_kernel_mod_test(pargs.test);
1010        return run_pthreads_test();
1011}
1012
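/* Crunch the raw samples: acquire-latency and hold-time histograms, total run
 * time (earliest 'pre' to latest 'un'), per-msec throughput, the preempt
 * trace (Akaros only), and per-thread loop counts. */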
1013static void analyze(struct results *results)
1014{
1015        void **loops_done = results->loops_done;
1016        struct lock_sample **thread_samples = results->thread_samples;
1017
1018        uint64_t max_tsc = 0;
1019        uint64_t min_tsc = UINT64_MAX;
1020        unsigned long total_loops = 0;
1021        struct sample_stats acq_stats, hld_stats;
1022
1023        printf("Acquire times (TSC Ticks)\n---------------------------\n");
1024        acq_stats.get_sample = get_acq_latency;
1025        compute_stats((void**)thread_samples, pargs.nr_threads, pargs.nr_loops,
1026                      &acq_stats);
1027
1028        printf("Held times (from acq til rel done) (TSC Ticks)\n------\n");
1029        hld_stats.get_sample = get_hld_latency;
1030        compute_stats((void**)thread_samples, pargs.nr_threads, pargs.nr_loops,
1031                      &hld_stats);
1032
1033        /* compute start and end based off the data set */
1034        for (int i = 0; i < pargs.nr_threads; i++) {
1035                for (int j = 0; j < pargs.nr_loops; j++) {
1036                        if (thread_samples[i][j].pre == 0)
1037                                continue;
1038                        printd("T %d L %03d p %lu a %lu u %lu v %u\n", i, j,
1039                               thread_samples[i][j].pre,
1040                               thread_samples[i][j].acq,
1041                               thread_samples[i][j].un,
1042                               thread_samples[i][j].valid);
1043                        min_tsc = MIN(min_tsc, thread_samples[i][j].pre);
1044                        max_tsc = MAX(max_tsc, thread_samples[i][j].un);
1045                }
1046        }
1047        printf("Time to run: %ld usec\n", tsc2usec(max_tsc - min_tsc));
1048
1049        /* throughput for the entire duration (in ms), 1ms steps.  print as many
1050         * steps as they ask for (up to the end of the run). */
1051        printf("\nLock throughput:\n-----------------\n");
1052        print_throughput((void**)thread_samples,
1053                         tsc2msec(max_tsc - min_tsc),
1054                         msec2tsc(1),
1055                         pargs.nr_print_rows,
1056                         min_tsc,
1057                         pargs.nr_threads, pargs.nr_loops, get_acq_timestamp);
1058
1059        print_preempt_trace(min_tsc, pargs.nr_print_rows);
1060
1061        for (int i = 0; i < pargs.nr_threads; i++) {
1062                total_loops += (unsigned long)loops_done[i];
1063                if (!loops_done[i])
1064                        printf("WARNING: thread %d performed 0 loops!\n", i);
1065        }
1066        printd("Average number of loops done, per thread: %ld\n",
1067               total_loops / pargs.nr_threads);
1068        for (int i = 0; i < pargs.nr_threads; i++)
1069                printd("\tThread %d performed %lu loops\n",
1070                       i, (long)loops_done[i]);
1071}
1072
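/* With -o, dump the raw samples: '#' header lines describing the run, then
 * one line per acquisition (thread_id attempt pre acq un tsc_overhead). */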
1073static void save_results(struct results *results, int argc, char **argv)
1074{
1075        struct lock_sample **thread_samples = results->thread_samples;
1076        FILE *outfile;
1077
1078        if (pargs.outfile_path) {
1079                /* RDWR, CREAT, TRUNC, O666 */
1080                outfile = fopen(pargs.outfile_path, "w+");
1081                if (!outfile)
1082                        handle_error("outfile");
1083
1084                fprintf(outfile, "#");
1085                for (char **arg = argv; *arg; arg++)
1086                        fprintf(outfile, " %s", *arg);
1087                fprintf(outfile, "\n");
1088
1089                fprintf(outfile, "# test '%s', %u host cores, %u workers\n",
1090                        pargs.test->name, max_vcores(), pargs.nr_threads);
                fprintf(outfile, "# %u loops, %sadaptive, %sfaking vc ctx\n",
                        pargs.nr_loops,
1093                        pargs.adj_workers ? "" : "not ",
1094                        pargs.fake_vc_ctx ? "" : "not ");
1095                fprintf(outfile, "# %s\n", os_name());
1096
1097                fprintf(outfile, "# thread_id attempt pre acq(uire) un(lock) "
1098                                 "tsc_overhead\n");
1099                fprintf(outfile,
1100                        "# acquire latency: acq - pre - tsc_overhead\n");
1101                fprintf(outfile, "# hold time: un - acq - tsc_overhead\n");
1102                fprintf(outfile, "# tsc_frequency %llu\n", get_tsc_freq());
1103                fprintf(outfile,
1104                        "# tsc_overhead is 0 on linux, hard code it with a value from akaros\n");
1105                for (int i = 0; i < pargs.nr_threads; i++) {
1106                        for (int j = 0; j < pargs.nr_loops; j++) {
1107                                struct lock_sample *ts = &thread_samples[i][j];
1108                                if (!ts->pre)
1109                                        break; /* empty record */
1110                                fprintf(outfile, "%d %d %llu %llu %llu %llu\n",
1111                                        i, j, ts->pre, ts->acq, ts->un,
1112                                        get_tsc_overhead());
1113                        }
1114                }
1115                fclose(outfile);
1116        }
1117}
1118
1119int main(int argc, char **argv)
1120{
1121        struct results results;
1122
1123        pargs.nr_threads = max_vcores();
1124        pargs.nr_loops = 10000;
1125        pargs.nr_print_rows = 10;
1126        argp_parse(&argp, argc, argv, 0, 0, &pargs);
1127
        printf("Detected %u cores, running test '%s' on %u cores\n",
               max_vcores(), pargs.test->name, pargs.nr_threads);
        printf("%u loops, %sadapting workers to vcores, %sfaking vcore context\n",
               pargs.nr_loops,
1132               pargs.adj_workers ? "" : "not ",
1133               pargs.fake_vc_ctx ? "" : "not ");
1134        printf("All times in TSC ticks, freq: %llu\n", get_tsc_freq());
1135        printf("\n\n");
1136
1137        results = run_test();
1138        analyze(&results);
1139        save_results(&results, argc, argv);
1140
1141        return 0;
1142}
1143