akaros/tests/lock_test.c
   1/* Copyright (c) 2013, 2014 The Regents of the University of California
   2 * Barret Rhoden <brho@cs.berkeley.edu>
   3 * See LICENSE for details.
   4 *
   5 * lock_test: microbenchmark to measure different styles of spinlocks.
   6 *
   7 * to build on linux: (hacky)
   8 * $ gcc -O2 -std=gnu99 -fno-stack-protector -g tests/lock_test.c -lpthread \
   9 *    -lm -o linux_lock_test */
  10
  11#define _GNU_SOURCE /* pthread_yield */
  12
  13#include <stdio.h>
  14#include <pthread.h>
  15#include <stdlib.h>
  16#include <unistd.h>
  17#include <sys/time.h>
  18#include <math.h>
  19#include <argp.h>
  20#include <sys/types.h>
  21#include <sys/stat.h>
  22#include <fcntl.h>
  23#include <assert.h>
  24#include <string.h>
  25
  26/* OS dependent #incs */
  27#ifdef __ros__
  28
  29#include <parlib/parlib.h>
  30#include <parlib/stdio.h>
  31#include <parlib/vcore.h>
  32#include <parlib/timing.h>
  33#include <parlib/spinlock.h>
  34#include <parlib/mcs.h>
  35#include <parlib/arch/arch.h>
  36#include <parlib/event.h>
  37
  38#include <parlib/tsc-compat.h>
  39#include <benchutil/measure.h>
  40
  41#else
  42
  43#include "../user/parlib/include/parlib/tsc-compat.h"
  44#include "misc-compat.h"
  45#include "linux-lock-hacks.h" /* TODO: have a build system and lib / C file */
  46
  47#include "../user/benchutil/include/benchutil/measure.h"
  48#include "../user/benchutil/measure.c"
  49
  50static void os_prep_work(pthread_t *worker_threads, int nr_threads)
  51{
  52        if (nr_threads > num_vcores())
  53                printf("WARNING: %d threads requested, but only %d cores available\n",
  54                       nr_threads, num_vcores());
  55}
  56
  57static void os_post_work(pthread_t *worker_threads, int nr_threads)
  58{
  59        if (nr_threads > num_vcores())
  60                return;
  61        /* assuming we're taking cores 0..nr_threads, and we never move. */
  62        for (int i = 0; i < nr_threads; i++) {
  63                cpu_set_t cpuset;
  64                CPU_ZERO(&cpuset);
  65                CPU_SET(i, &cpuset);
  66                pthread_setaffinity_np(worker_threads[i], sizeof(cpu_set_t),
  67                                       &cpuset);
  68        }
  69}
  70
  71#define print_preempt_trace(args...) {}
  72
  73__thread int __vcore_context = 0;
  74
  75#endif
  76
  77/* TODO: There's lot of work to do still, both on this program and on locking
  78 * and vcore code.  For some of the issues, I'll leave in the discussion /
  79 * answers, in case it comes up in the future (like when I read this in 8
  80 * months).
  81 *
  82 * BUGS / COMMENTARY
  83 * Occasional deadlocks when preempting and not giving back!
  84 *      - with the new PDRs style, though that doesn't mean the older styles
  85 *      don't have this problem
  86 *      - shouldn't be any weaker than PDR.  they all check pred_vc to see
  87 *      if they are running, and if not, they make sure someone runs
  88 *      - could be weaker if we have an old value for the lockholder,
  89 *      someone outside the chain, and we made sure they ran, and they do
  90 *      nothing (spin in the 2LS or something?)
  91 *              no, they should have gotten a msg about us being preempted,
  92 *              since whoever we turn into gets the message about us swapping.
  93 *      - anyway, it's not clear if this is with MCSPDR, event delivery,
  94 *      preemption handling, or just an artifact of the test (less likely)
  95 * why aren't MCS locks in uth_ctx getting dealt with?
  96 *      - because the event is handled, but the lock holder isn't run.  the
  97 *      preemption was dealt with, but nothing saved the lock holder
  98 *      - any uthread_ctx lockholder that gets preempted will get
  99 *      interrupted, and other cores will handle the preemption.  but that
 100 *      uthread won't run again without 2LS support.  either all spinners
 101 *      need to be aware of the 'lockholder' (PDR-style), or the 2LS needs
 102 *      to know when a uthread becomes a 'lockholder' to make sure it runs
 103 *      via user-level preempts.  If the latter, this needs to happen
 104 *      atomically with grabbing the lock, or else be able to handle lots of
 105 *      fake 'lockholders' (like round-robin among all of them)
 106 * why is the delay more than the expected delay?
 107 *      because it takes ~2ms to spawn and run a process
 108 *      could do this in a separate process, instead of a script
 109 *              could also consider not using pth_test and changing prov, but
 110 *              driving it by yields and requests.  would also test the
 111 *              alarm/wakeup code (process sets alarm, goes to sleep, wakes up
 112 *              and requests X cores)
 113 * why do we get occasional preempt-storms? (lots of change_tos)
 114 *      due to the MCS-PDR chain, which i tried fixing by adjusting the number
 115 *      of workers down to the number of vcores
 116 * why isn't the worker adaptation working?
 117 *              - it actually was working, and nr_workers == nr_vcores.  that
 118 *              just wasn't the root cause.
 119 *              - was expecting it to cut down on PDR kernel traffic
 120 *      - still get periods of low perf
 121 *              like O(100) preempt msgs per big preempt/prov
 122 *              does it really take that much to work out an MCS-PDR?
 123 *      - one thing is that if we fake vc ctx, we never receive preemption
 124 *      events.  might be a bad idea.
 125 *              - in general, yeah.  faking VC and turning off events can really
 126 *              muck with things
 127 *              - these events aren't necessarily delivered to a VC who will
 128 *              check events any time soon (might be the last one in the chain)
 129 *              - the core of the issue is that we have the right amount of
 130 *              workers and vcores, but that the system isn't given a chance to
 131 *              stabilize itself.  also, if we have some VCs that are just
 132 *              sitting around, spinning in the 2LS, if those get preempted, no
 133 *              one notices or cares (when faking vc_ctx / getting no events)
 134 *      - there is a slight race where we might make someone run who isn't a
 135 *      lockholder.  logically, it's okay.  worst case, it would act like an
 136 *      extra preempt and different startcore, which shouldn't be too bad.
 137 *
 138 * sanity check: does throughput match latency? (2.5GHz TSC, MCS lock)
 139 *      ex: 5000 locks/ms = 5 locks/us = 200ns/lock = 500 ticks / lock
 140 *      500 ticks * 31 workers (queue) = 15000 ticks
 141 *      avg acquire time was around 14K.  seems fine..
 142 *      when our MCSPDR throughput tanks (during preempts), it's around
 143 *      400-500 locks/ms, which is around 2us/lock.  
 144 *              when the locker on a preempted chain shows up, it needs to
 145 *              change to the next one in line. 
 146 *                      - though that should be in parallel with the other
 147 *                      lockholders letting go.  shouldn't be that bad
 148 *                      - no, it is probably at the head of the chain very soon,
 149 *                      such that it is the bottleneck for the actual lock.  2us
 150 *                      seems possible
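 *
 *	A rough helper for redoing that cross-check (a sketch only: it assumes
 *	you know the TSC frequency, ignores hold/delay time, and none of these
 *	names exist in this file):
 *
 *		static uint64_t expected_acq_ticks(uint64_t tsc_hz,
 *		                                   uint64_t locks_per_ms,
 *		                                   int nr_workers)
 *		{
 *			// ticks per lock handoff at the measured rate
 *			uint64_t ticks_per_lock = tsc_hz / (locks_per_ms * 1000);
 *
 *			// a new MCS arrival waits behind everyone else in line
 *			return ticks_per_lock * (nr_workers - 1);
 *		}
 *
 *	e.g. expected_acq_ticks(2500000000, 5000, 31) = 500 * 30 = 15000 ticks,
 *	in the same ballpark as the ~14K average acquire time noted above.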
 151 *
 152 * what does it take to get out of a preemption with (old) MCS-PDR?
 153 *      - these are now called pdro locks (old)
 154 *      - for a single preempt, it will take 1..n-1 changes.  avg n/2
 155 *      - for multiple preempts, it's nr_pre * that (avg np/2, worst np)
 156 *      - for every unlock/reacquire cycle (someone unlocks, then rejoins
 157 *      the list), it's nr_preempts (aka, nr_workers - nr_vcores)
 158 *      - if we need to have a specific worker get out of the chain, on
 159 *      average, it'd take n/2 cycles (p*n/2 changes).  worst: np
 160 *      - if we want to get multiple workers out, the worst case is still
 161 *      np, but as p increases, we're more likely to approach n cycles
 162 *      - so the current model is np for the initial hit (to move the
 163 *      offline VCs to the end of the chain) and another np to get our
 164 *      specific workers out of the chain and yielding (2np)
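 *
 *	A concrete instance of the model above (illustrative numbers only):
 *	with n = 31 workers on 29 vcores, p = nr_preempts = 31 - 29 = 2
 *		- a single preempt: 1..30 changes, avg ~15
 *		- the initial hit for both preempts: avg ~np/2 = 31, worst np = 62
 *		- getting our two specific workers out of the chain and
 *		yielding: roughly another np, so ~2np = 124 changes worst case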
 165 *
 166 *      - but even with 1 preempt, we're getting 80-200 changes per preempt
 167 *
 168 *      - it shouldn't matter that the sys_change_to is really slow, should
 169 *      be the same amount of changes.  however, the preempted ones are
 170 *      never really at the tail end of the chain - they should end up right
 171 *      before the lockholder often.  while the sys_change_tos are slowly
 172 *      moving towards the back of the chain, the locking code is quickly
 173 *      removing (online) nodes from the head and putting them on the back.
 174 *
 175 *      - end result: based on lock hold time and lock delay time, a
 176 *      preempted VC stays in the MCS chain (swaps btw VC/nodes), and when
 177 *      it is inside the chain, someone is polling to make them run.  with
 178 *      someone polling, it is extremely unlikely that someone outside the
 179 *      chain will win the race and be able to change_to before the in-chain
 180 *      poller.  to clarify:
 181 *              - hold time and delay time matter, since the longer they are,
 182 *              the greater the amount of time the change_to percolation has to
 183 *              get the preempted VCs to the end of the chain (where no one
 184 *              polls them).
 185 *              - at least one vcore is getting the event to handle the
 186 *              preemption of the in-chain, offline VC.  we could change it so
 187 *              every VC polls the preempt_evq, or just wait til whoever is
 188 *              getting the messages eventually checks their messages (VC0)
 189 *              - if there is an in-chain poller, they will notice the instant
 190 *              the VC map changes, and then immediately change_to (and spin on
 191 *              the proclock in the kernel).  there's almost no chance of a
 192 *              normal preempt event handler doing that faster.  (would require
 193 *              some IRQ latency or something serious).
 194 * - adding in any hold time trashes our microbenchmark's perf, but a
 195 * little delay time actually helps: (all with no preempts going on)
 196 *      - mcspdr, no delay: 4200-4400 (-w31 -l10000, no faking, etc)
 197 *      - mcspdr, d = 1: 4400-4800
 198 *      - mcspdr, d = 2: 4200-5200
 199 *      - as you add delay, it cuts down on contention for the
 200 *      lock->lock cacheline.  but if you add in too much, you'll tank
 201 *      throughput (since there is no contention at all).
 202 *      - as we increase the delay, we cut down on the chance of the
 203 *      preempt storm / preempt-stuck-in-the-chain, though it can still
 204 *      happen, even with a delay of 10us
 205 * - maybe add in the lockholder again? (removed in 73701d6bfb)
 206 *      - massively cuts performance, like 2x throughput, without
 207 *      preempts
 208 *      - its ability to help depends on impl:
 209 *              - in one version (old style), it didn't help much at all
 210 *              - in another (optimized lockholder setting), i can't
 211 *              even see the throughput hit, it recovered right away,
 212 *              with O(5) messages
 213 *              - the diff was having the lockholder assign the vcoreid
 214 *              before passing off to the next in the chain, so that
 215 *              there is less time with having "no lockholder".
 216 *              (there's a brief period where the lockholder says it is
 217 *              the next person, who still spins.  they'll have to
 218 *              make sure their pred runs)
 220 * - adj_workers doesn't matter either...
 221 *      - the 2LS and preemption handling might be doing this
 222 *      automatically, when handle_vc_preempt() does a
 223 *      thread_paused() on its current_uthread.
 224 *      - adj_workers isn't critical if we're using some locks
 225 *      that check notif_pending.  eventually someone hears
 226 *      about preempted VCs (assuming we can keep up)
 227 *
 228 * What about delays?  both hold and delay should make it easier to get
 229 * the preempted vcore to the end of the chain.  but do they have to be
 230 * too big to be reasonable?
 231 *      - yes.  hold doesn't really help much until everything is
 232 *      slower.  even with a hold of around 1.2us, we still have the
 233 *      change_to-storms and lowered throughput.
 234 *      - doing a combo helps too.  if you hold for 1ns (quite a bit
 235 *      more actually, due to the overhead of ndelay, but sufficient to
 236 *      be "doing work"), and delaying for around 7us before rejoining,
 237 *      there's only about a 1/5 chance of a single preempt messing us
 238 *      up
 239 *              - though having multiple preempts outstanding make this less
 240 *              likely to work.
 241 *              - and it seems like if we get into the storm scenario, we
 242 *              never really get out.  either we do quickly or never do.
 243 *              depending on the workload, this could be a matter of luck
 244 *
 245 * So we could try tracking the lockholder, but only looking at it when
 246 * we know someone was preempted in the chain - specifically, when our
 247 * pred is offline.  when that happens, we don't change to them, we
 248 * make sure the lockholder is running.
 249 *      - tracking takes us from 4200->2800 throughput or so for MCS
 250 *      - 5200 -> 3700 or so for MCS in vc_ctx (__MCSPDR)
 251 *      - main spike seems to be in the hold time.  bimodal distrib,
 252 *      with most below 91 (the usual is everything packed below 70) and
 253 *      a big spike around 320
 254 *
 255 * Summary:
 256 *
 257 * So we need to have someone outside the chain change_to the one in the
 258 * chain; o/w, someone will always be in the chain.  Right now, it's always
 259 * the next in line who is doing the changing, so a preempted vcore is
 260 * always still in the chain. 
 261 *
 262 * If the locking workload has some delaying, such as while holding the
 263 * lock or before reacquiring, the "change_to" storm might not be a
 264 * problem.  If it is, the only alternative I have so far is to check the
 265 * lockholder (which prevents a chain member from always ensuring their
 266 * pred runs).  This hurts the lock's scalability/performance when we
 267 * aren't being preempted.  On the other hand, based on what you're doing
 268 * with the lock, one more cache miss might not be as big of a deal as in
 269 * lock_test.  Especially since, when you get stormed, your throughput could
 270 * be terrible and never recover.
 271 *
 272 * Similar point: you can use spinpdr locks.  They have the PDR-benefits,
 273 * and won't induce the storm of change_tos.  However, this isn't much
 274 * better for contended locks.  They perform 2-3x worse (on c89) without
 275 * preemption.  Arguably, if you were worried about the preempt storms and
 276 * want scalability, you might want to use mcspdr with lockholders.
 277 *
 278 * The MCSPDRS (now just called MCSPDR, these are the default) locks can avoid
 279 * the storm, but at the cost of a little more in performance.  mcspdrs
 280 * style is about the same when not getting preempted from uth ctx compared
 281 * to mcspdr (slight drop).  When in vc ctx, it's about 10-20% perf hit
 282 * (PDRS gains little from --vc_ctx). 
 283 *
 284 * Turns out there is a perf hit to PDRS (and any non-stack based qnode)
 285 * when running on c89.  The issue is that after shuffling the vcores
 286 * around, they are no longer mapped nicely to pcores (VC0->PC1, VC1->PC2).
 287 * This is due to some 'false sharing' of the cachelines, caused mostly by
 288 * aggressive prefetching (notably the intel adjacent cacheline prefetcher,
 289 * which grabs two CLs at a time!).  Basically, stack-based qnodes are
 290 * qnodes that are very far apart in memory.  Cranking up the padding in
 291 * qnodes in the "qnodes-in-locks" style replicates this.
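 *
 *	The "crank up the padding" experiment looks roughly like this (a sketch;
 *	the real qnode fields live in parlib, this only shows the shape of the
 *	padding trick):
 *
 *		#define CL_SIZE 64
 *
 *		struct padded_qnode {
 *			struct padded_qnode *next;
 *			int locked;
 *			// pad to two cachelines so adjacent array entries never
 *			// share a CL, even with the adjacent-CL prefetcher
 *			// pulling in pairs of lines
 *			char pad[2 * CL_SIZE - sizeof(void*) - sizeof(int)];
 *		} __attribute__((aligned(2 * CL_SIZE)));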
 292 *
 293 * For some info on the prefetching:
 294 *      http://software.intel.com/en-us/articles/optimizing-application-performance-on-intel-coret-microarchitecture-using-hardware-implemented-prefetchers/
 295 *      http://software.intel.com/en-us/forums/topic/341769
 296 *
 297 * Here's some rough numbers of the different styles for qnodes on c89.
 298 * 'in order' is VCn->PC(n+1) (0->1, 1->2).  Worst order is with even VCs
 299 * on one socket, odds on the other.  the number of CLs is the size of a
 300 * qnode.  mcspdr is the new style (called mcspdrs in some places in this
 301 * document), with lock-based qnodes.  mcspdr2 is the same, but with
 302 * stack-based qnodes.  mcspdro is the old style (bad at recovery), stack
 303 * based, sometimes just called mcs-pdr
 304 *
 305 *      with prefetchers disabled (MCS and DCU)
 306 *              mcspdr   1CL  4.8-5.4 in order, 3.8-4.2 worst order
 307 *              mcspdr   2CL          in order,         worst order
 308 *              mcspdr   4CL  5.2-6.0 in order, 4.7-5.3 worst order
 309 *              mcspdr   8CL  5.4-6.7 in order, 5.2-6.2 worst order
 310 *              mcspdr  16CL  5.1-5.8 in order, 5.2-6.8 worst order
 311 *              mcspdr2 stck          in order,         worst order
 312 *              mcspdro stck  4-3.4.3 in order, 4.2-4.5 worst order
 313 *              mcspdro-vcctx 4.8-7.0 in order, 5.3-6.7 worst order
 314 *              can we see the 2 humps? 
 315 *                      mcspdr 1CL yes but less, varied, etc
 316 *                      mcspdr2 no
 317 *
 318 *      test again with worst order with prefetchers enabled
 319 *              mcspdr   1CL  3.8-4.0 in order, 2.6-2.7 worst order
 320 *              mcspdr   2CL  4.2-4.4 in order, 3.8-3.9 worst order
 321 *              mcspdr   4CL  4.5-5.2 in order, 4.0-4.2 worst order
 322 *              mcspdr   8CL  4.4-5.1 in order, 4.3-4.7 worst order
 323 *              mcspdr  16CL  4.4-4.8 in order, 4.4-5.3 worst order
 324 *              mcspdr2 stck  3.0-3.0 in order, 2.9-3.0 worst order
 325 *              mcspdro stck  4.2-4.3 in order, 4.2-4.4 worst order
 326 *              mcspdro-vcctx 5.2-6.4 in order, 5.0-5.9 worst order
 327 *              can we see the 2 humps?
 328 *                      mcspdrs 1CL yes, clearly
 329 *                      mcspdr2 no
 330 *
 331 * PROGRAM FEATURES
 332 *      - verbosity?  vcoremap, preempts, the throughput and latency histograms?
 333 *      - have a max workers option (0?) == max vcores
 334 *      - would like to randomize (within bounds) the hold/delay times
 335 *              - help avoid convoys with MCS locks
 336 *
 337 * PERFORMANCE:
 338 *
 339 *      pcore control?  (hyperthreading, core 0, cross socket?)
 340 *              want some options for controlling which threads run where, or
 341 *              which vcores are even used (like turning off hyperthreading)?
 342 *      implement ticket spinlocks?  (more fair, more effects of preempts; sketch below)
 343 *              no simple way to do PDR either, other than 'check everyone'
 344 *      MCS vs MCSPDR vs __MCSPDR
 345 *              MCS seems slightly better than __MCSPDR (and it should)
 346 *              MCSPDR is a bit worse than __MCSPDR
 347 *                      - the uth_disable/enable code seems to make a
 348 *                      difference.
 349 *                      - i see why the latencies are worse, since they have
 350 *                      extra work to do, but the internal part that contends
 351 *                      with other cores shouldn't be affected, unless there's
 352 *                      some other thing going on.  Or perhaps there isn't
 353 *                      always someone waiting for the lock?
 354 *                      - faking VC ctx mostly negates the cost of MCSPDR vs
 355 *                      __MCSPDR
 356 *              things that made a big diff: CL aligning the qnodes, putting
 357 *              qnodes on stacks, reading in the vcoreid once before ensuring()
 358 *      both MCS CAS unlocks could use some branch prediction work
 359 *      spinpdr locks are 2-3x faster than spinlocks...
 360 *              test, test&set  vs the existing test&set, plus lots of asserts
 361 *
 362 *      some delay (like 10us) lowers latency while maintaining throughput
 363 *              - makes sense esp with MCS.  if you join the queue at the last
 364 *              second, you'll measure lower latency than attempting right away
 365 *              - also true for spinlocks
 366 *              - we can probably figure out the max throughput (TP = f(delay))
 367 *              for each lock type
 368 *
 369 *      hard to get steady numbers with MCS - different runs of the same test
 370 *      will vary in throughput by around 15-30% (e.g., MCS varying from 3k-4k
 371 *      L/ms)
 372 *              - happens on c89 (NUMA) and hossin (UMA)
 373 *              - spinlocks seem a little steadier.
 374 *              - for MCS locks, the order in which they line up across the
 375 *              pcores will matter.  like if on one run, i regularly hand off
 376 *              between cores in the same socket and only do one cross-socket
 377 *              step
 378 *              - run a lot of shorter ones to get a trend, for now
 379 *              - might be correlated with spikes in held times (last bin)
 380 *              - can't turn off legacy USB on c89 (SMM) - interferes with PXE
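 *
 *	Ticket spinlock sketch (referenced above; illustrative only, nothing in
 *	this file defines these -- it leans on the gcc __sync builtins plus the
 *	ACCESS_ONCE/cmb used elsewhere here):
 *
 *		struct ticket_lock {
 *			unsigned int next;
 *			unsigned int owner;
 *		};
 *
 *		static void ticket_lock(struct ticket_lock *lock)
 *		{
 *			// take a ticket, then wait until it is being served
 *			unsigned int me = __sync_fetch_and_add(&lock->next, 1);
 *
 *			while (ACCESS_ONCE(lock->owner) != me)
 *				cmb();
 *		}
 *
 *		static void ticket_unlock(struct ticket_lock *lock)
 *		{
 *			// hand off to the next ticket in FIFO order
 *			__sync_synchronize();
 *			lock->owner++;
 *		}
 *
 *	and, as noted above, a PDR version isn't obvious: a waiter only knows
 *	its own ticket, not which vcore holds the lock, so ensuring the
 *	lockholder runs degenerates into 'check everyone'.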
 381 *
 382 * PREEMPTS:
 383 * better preempt record tracking?
 384 *      i just hacked some event-intercept and timestamp code together
 385 *      maybe put it in the event library?
 386 *      the timestamps definitely helped debugging
 387 *
 388 * is it true that if uthread code never spins outside a PDR lock, then it
 389 * doesn't need preemption IPIs?  (just someone checks the event at some
 390 * point). 
 391 *      think so: so long as you make progress and when you aren't, you
 392 *      check events (like if a uthread blocks on something and enters VC
 393 *      ctx)
 394 * adjusting the number of workers, whether vcores or uthreads
 395 * - if you have more lockers than cores:
 396 *      - spinpdr: a worker will get starved (akaros) (without 2LS support)
 397 *              - running this from uth context will cause a handle_events
 398 *      - mcspdr will require the kernel to switch (akaros)
 399 *      - spin (akaros) might DL (o/w nothing), (linux) poor perf
 400 *      - mcs (akaros) will DL, (linux) poor perf
 401 *      - poor perf (latency spikes) comes from running the wrong thread
 402 *      sometimes
 403 *      - deadlock comes from the lack of kernel-level context switching
 404 * - if we scale workers down to the number of active vcores:
 405 *      - two things: the initial hit, and the steady state.  during the
 406 *      initial hit, we can still deadlock, since we have more lockers than
 407 *      cores
 408 *              - non-pdr (akaros) could deadlock in the initial hit
 409 *              - (akaros) steady state, everything is normal (just fewer cores)
 410 *      - how can we adjust this in linux? (see the sketch after this list)
 411 *              - if you know how many cores you have, then futex wait the others
 412 *              - need some way to wake them back up
 413 *              - if you do this in userspace, you might need something PDR-like
 414 *              to handle when the "2LS" code gets preempted
 415 *      - as mentioned above, the problem in akaros is that the lock/unlock
 416 *      might be happening too fast to get into the steady-state and recover
 417 *      from the initial preemption
 418 * - one of our benefits is that we can adapt in userspace, with userspace
 419 * knowledge, under any circumstance.
 420 *      - we have the deadlock windows (forcing PDR).
 421 *      - in return, we can do this adaptation in userspace
 422 *      - and (arguably) anyone who does this in userspace will need PDR
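 *
 *	Rough shape of the futex idea from above (a sketch only: none of these
 *	names exist in this program, and deciding what nr_active should be is
 *	the hard part that's hand-waved here):
 *
 *		#include <limits.h>
 *		#include <linux/futex.h>
 *		#include <sys/syscall.h>
 *		#include <unistd.h>
 *
 *		static int nr_active;	// how many workers should be running
 *
 *		static void park_if_excess(int thread_id)
 *		{
 *			int cur;
 *
 *			// workers beyond the active count sleep in the kernel
 *			// instead of contending for the lock
 *			while (thread_id >= (cur = ACCESS_ONCE(nr_active)))
 *				syscall(SYS_futex, &nr_active, FUTEX_WAIT, cur,
 *					NULL, NULL, 0);
 *		}
 *
 *		static void set_nr_active(int n)
 *		{
 *			nr_active = n;
 *			// wake everyone; the still-excess workers just re-park
 *			syscall(SYS_futex, &nr_active, FUTEX_WAKE, INT_MAX,
 *				NULL, NULL, 0);
 *		}
 *
 *	and whoever calls set_nr_active() can itself get preempted, which is the
 *	PDR-like problem mentioned above.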
 423 *
 424 * MEASUREMENT (user/parlib/measure.c)
 425 *      extract into its own library, for linux apps
 426 *      print out raw TSC times?  might help sync up diff timelines
 427 *      Need more latency bins, spinlocks vary too much
 428 *      maybe we need better high/low too, since this hist looks bad too
 429 *              or not center on the average?
 430 *              for printing, it's hard to know without already binning.
 431 *              maybe bin once (latency?), then use that to adjust the hist?
 432 *
 433 *      Had this on a spinlock:
 434 *      [      32 -    35656] 1565231:
 435 *      (less than 200 intermediate)
 436 *      [  286557 - 20404788]   65298: *
 437 *
 438 *      Samples per dot: 34782
 439 *      Total samples: 1640606
 440 *      Avg time   : 96658
 441 *      Stdev time : 604064.440882
 442 *      Coef Var   : 6.249503
 443 *              High coeff of var with serious outliers, adjusted bins
 444 *              50/75/90/99: 33079 / 33079 / 33079 / 290219 (-<860)
 445 *              Min / Max  : 32 / 20404788
 446 *      was 50/75/90 really within 860 of each other?
 447 *
 448 *      when we are preempted and don't even attempt anything, say for 10ms, it
 449 *      actually doesn't hurt our 50/75/90/99 too much.  we have a ridiculous
 450 *      stddev and max, and high average, but there aren't any additional
 451 *      attempts at locking to mess with the attempt-latency.  Only nr_vcores
 452 *      requests are in flight during the preemption, but we can spit out around
 453 *      5000 per ms when we aren't preempted.
 454 *
 455 */
 456
 457const char *argp_program_version = "lock_test v0.1475263";
 458const char *argp_program_bug_address = "<akaros+subscribe@googlegroups.com>";
 459
 460static char doc[] = "lock_test -- spinlock benchmarking";
 461static char args_doc[] = "-w NUM -l NUM -t LOCK";
 462
 463#define OPT_VC_CTX 1
 464#define OPT_ADJ_WORKERS 2
 465
 466static struct argp_option options[] = {
 467        {"workers",     'w', "NUM",     OPTION_NO_USAGE, "Number of threads/cores"},
 468        {0, 0, 0, 0, ""},
 469        {"loops",       'l', "NUM",     OPTION_NO_USAGE, "Number of loops per worker"},
 470        {0, 0, 0, 0, ""},
 471        {"type",        't', "LOCK",OPTION_NO_USAGE, "Type of lock to use.  "
 472                                                     "Options:\n"
 473                                                     "\tmcs\n"
 474                                                     "\tmcscas\n"
 475                                                     "\tmcspdr\n"
 476                                                     "\tmcspdro\n"
 477                                                     "\t__mcspdro\n"
 478                                                     "\tspin\n"
 479                                                     "\tspinpdr"},
 480        {0, 0, 0, 0, "Other options (not mandatory):"},
 481        {"adj_workers", OPT_ADJ_WORKERS, 0,     0,
 482                                               "Adjust workers such that the "
 483                                               "number of workers equals the "
 484                                               "number of vcores"},
 485        {"vc_ctx",      OPT_VC_CTX, 0,  0, "Run threads in mock-vcore context"},
 486        {0, 0, 0, 0, ""},
 487        {"hold",        'h', "NSEC",    0, "nsec to hold the lock"},
 488        {"delay",       'd', "NSEC",    0, "nsec to delay between grabs"},
 489        {"print",       'p', "ROWS",    0, "Print ROWS of optional measurements"},
 490        {"outfile",     'o', "FILE",    0, "Write the raw, per-sample measurements to FILE"},
 491        { 0 }
 492};
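
/* Example invocation (these particular values and the output file name are
 * arbitrary; any of the -t types listed above works the same way):
 *
 *	$ ./lock_test -w 31 -l 10000 -t mcspdr -p 10 -o mcspdr.dat
 *
 * i.e. 31 workers, 10000 loops each, MCS-PDR locks, 10 rows of throughput and
 * preempt-trace output, and the raw per-sample records written to mcspdr.dat.
 * On linux, the binary is whatever you named it in the build line up top. */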
 493
 494struct prog_args {
 495        int                     nr_threads;
 496        int                     nr_loops;
 497        int                     hold_time;
 498        int                     delay_time;
 499        int                     nr_print_rows;
 500        bool                    fake_vc_ctx;
 501        bool                    adj_workers;
 502        char                    *outfile_path;
 503        void *(*lock_type)(void *arg);
 504};
 505struct prog_args pargs = {0};
 506
 507/* Globals */
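/* Each worker records one time_stamp per loop: pre (just before attempting the
 * lock), acq (right after acquiring it), and un (right after unlocking).  The
 * stats code below turns these into acquire latency (acq - pre - tsc_overhead)
 * and hold time (un - acq - tsc_overhead). */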
 508struct time_stamp {
 509        uint64_t pre;
 510        uint64_t acq;
 511        uint64_t un;
 512        bool valid;
 513};
 514struct time_stamp **times;
 515bool run_locktest = TRUE;
 516pthread_barrier_t start_test;
 517
 518/* Locking functions.  Define globals here, init them in main (if possible), and
 519 * use the lock_func() macro to make your thread func. */
 520
 521#define lock_func(lock_name, lock_cmd, unlock_cmd)                             \
 522void *lock_name##_thread(void *arg)                                            \
 523{                                                                              \
 524        long thread_id = (long)arg;                                            \
 525        int hold_time = ACCESS_ONCE(pargs.hold_time);                          \
 526        int delay_time = ACCESS_ONCE(pargs.delay_time);                        \
 527        int nr_loops = ACCESS_ONCE(pargs.nr_loops);                            \
 528        bool fake_vc_ctx = ACCESS_ONCE(pargs.fake_vc_ctx);                     \
 529        bool adj_workers = ACCESS_ONCE(pargs.adj_workers);                     \
 530        uint64_t pre_lock, acq_lock, un_lock;                                  \
 531        struct time_stamp *this_time;                                          \
 532        struct mcs_lock_qnode mcs_qnode = MCS_QNODE_INIT;                      \
 533        struct mcs_pdro_qnode pdro_qnode = MCSPDRO_QNODE_INIT;                 \
 534        int i;                                                                 \
 535        /* guessing a unique vcoreid for the __mcspdr test.  if the
 536         * program gets preempted for that test, things may go nuts */         \
 537        pdro_qnode.vcoreid = (thread_id + 1) % pargs.nr_threads;               \
 538        /* Wait til all threads are created.  Ideally, I'd like to busywait
 539         * unless absolutely critical to yield */                              \
 540        pthread_barrier_wait(&start_test);                                     \
 541        if (fake_vc_ctx) {                                                     \
 542                /* tells the kernel / other vcores we're in vc ctx */          \
 543                uth_disable_notifs();                                          \
 544                /* tricks ourselves into believing we're in vc ctx */          \
 545                __vcore_context = TRUE;                                        \
 546        }                                                                      \
 547        for (i = 0; i < nr_loops; i++) {                                       \
 548                if (!run_locktest)                                             \
 549                        break;                                                 \
 550                pre_lock = read_tsc_serialized();                              \
 551                                                                               \
 552                lock_cmd                                                       \
 553                                                                               \
 554                acq_lock = read_tsc_serialized();                              \
 555                if (hold_time)                                                 \
 556                        ndelay(hold_time);                                     \
 557                                                                               \
 558                unlock_cmd                                                     \
 559                                                                               \
 560                un_lock = read_tsc_serialized();                               \
 561                this_time = &times[thread_id][i];                              \
 562                this_time->pre = pre_lock;                                     \
 563                this_time->acq = acq_lock;                                     \
 564                this_time->un = un_lock;                                       \
 565                /* Can turn these on/off to control which samples we gather */ \
 566                this_time->valid = TRUE;                                       \
 567                /* this_time->valid = (num_vcores() == max_vcores());  */      \
 568                                                                               \
 569                if (delay_time)                                                \
 570                        ndelay(delay_time);                                    \
 571                /* worker thread ids are 0..n-1.  if we're one of the threads
 572                 * that's beyond the VC count, we yield. */                    \
 573                if (adj_workers && num_vcores() < thread_id + 1) {             \
 574                        if (fake_vc_ctx) {                                     \
 575                                __vcore_context = FALSE;                       \
 576                                uth_enable_notifs();                           \
 577                        }                                                      \
 578                        /* we'll come back up once we have enough VCs running*/\
 579                        pthread_yield();                                       \
 580                        if (fake_vc_ctx) {                                     \
 581                                uth_disable_notifs();                          \
 582                                __vcore_context = TRUE;                        \
 583                        }                                                      \
 584                }                                                              \
 585                cmb();                                                         \
 586        }                                                                      \
 587        /* First thread to finish stops the test */                            \
 588        run_locktest = FALSE;                                                  \
 589        if (fake_vc_ctx) {                                                     \
 590                __vcore_context = FALSE;                                       \
 591                uth_enable_notifs();                                           \
 592        }                                                                      \
 593        return (void*)(long)i;                                                 \
 594}
 595
 596#define fake_lock_func(lock_name, x1, x2)                                      \
 597void *lock_name##_thread(void *arg)                                            \
 598{                                                                              \
 599        printf("Lock " #lock_name " not supported!\n");                        \
 600        exit(-1);                                                              \
 601}
 602
 603spinlock_t spin_lock = SPINLOCK_INITIALIZER;
 604struct mcs_lock mcs_lock = MCS_LOCK_INIT;
 605
 606/* Defines locking funcs like "mcs_thread" */
 607lock_func(mcs,
 608          mcs_lock_lock(&mcs_lock, &mcs_qnode);,
 609          mcs_lock_unlock(&mcs_lock, &mcs_qnode);)
 610lock_func(mcscas,
 611          mcs_lock_lock(&mcs_lock, &mcs_qnode);,
 612          mcs_lock_unlock_cas(&mcs_lock, &mcs_qnode);)
 613lock_func(spin,
 614          spinlock_lock(&spin_lock);,
 615          spinlock_unlock(&spin_lock);)
 616
 617#ifdef __ros__
 618struct spin_pdr_lock spdr_lock = SPINPDR_INITIALIZER;
 619struct mcs_pdr_lock mcspdr_lock;
 620struct mcs_pdro_lock mcspdro_lock = MCSPDRO_LOCK_INIT;
 621
 622lock_func(mcspdr,
 623          mcs_pdr_lock(&mcspdr_lock);,
 624          mcs_pdr_unlock(&mcspdr_lock);)
 625lock_func(mcspdro,
 626          mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);,
 627          mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);)
 628lock_func(__mcspdro,
 629          __mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);,
 630          __mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);)
 631lock_func(spinpdr,
 632          spin_pdr_lock(&spdr_lock);,
 633          spin_pdr_unlock(&spdr_lock);)
 634#else
 635
 636fake_lock_func(mcspdr, 0, 0);
 637fake_lock_func(mcspdro, 0, 0);
 638fake_lock_func(__mcspdro, 0, 0);
 639fake_lock_func(spinpdr, 0, 0);
 640
 641#endif
 642
 643static int get_acq_latency(void **data, int i, int j, uint64_t *sample)
 644{
 645        struct time_stamp **times = (struct time_stamp**)data;
 646        /* 0 for initial time means we didn't measure */
 647        if (times[i][j].pre == 0)
 648                return -1;
 649        /* can optionally throw out invalid times (keep this in sync with the
 650         * lock_test macro, based on what you want to measure). */
 651        #if 0
 652        if (!times[i][j].valid)
 653                return -1;
 654        #endif
 655        *sample = times[i][j].acq - times[i][j].pre - get_tsc_overhead();
 656        return 0;
 657}
 658
 659static int get_hld_latency(void **data, int i, int j, uint64_t *sample)
 660{
 661        struct time_stamp **times = (struct time_stamp**)data;
 662        /* 0 for initial time means we didn't measure */
 663        if (times[i][j].pre == 0)
 664                return -1;
 665        *sample = times[i][j].un - times[i][j].acq - get_tsc_overhead();
 666        return 0;
 667}
 668
 669static int get_acq_timestamp(void **data, int i, int j, uint64_t *sample)
 670{
 671        struct time_stamp **times = (struct time_stamp**)data;
 672        /* 0 for initial time means we didn't measure */
 673        if (times[i][j].pre == 0)
 674                return -1;
 675        *sample = times[i][j].acq;
 676        return 0;
 677}
 678
 679#ifdef __ros__
 680
 681/* Lousy event intercept.  build something similar in the event library? */
 682#define MAX_NR_EVENT_TRACES 1000
 683uint64_t preempts[MAX_NR_EVENT_TRACES] = {0};
 684uint64_t indirs[MAX_NR_EVENT_TRACES] = {0};
 685atomic_t preempt_idx;
 686atomic_t indir_idx;
 687atomic_t preempt_cnt;
 688atomic_t indir_cnt;
 689
 690static void trace_preempt(struct event_msg *ev_msg, unsigned int ev_type,
 691                          void *data)
 692{
 693        unsigned long my_slot = atomic_fetch_and_add(&preempt_idx, 1);
 694
 695        if (my_slot < MAX_NR_EVENT_TRACES)
 696                preempts[my_slot] = read_tsc();
 697        atomic_inc(&preempt_cnt);
 698}
 699
 700static void trace_indir(struct event_msg *ev_msg, unsigned int ev_type,
 701                        void *data)
 702{
 703
 704        unsigned long my_slot = atomic_fetch_and_add(&indir_idx, 1);
 705        if (my_slot < MAX_NR_EVENT_TRACES)
 706                indirs[my_slot] = read_tsc();
 707        atomic_inc(&indir_cnt);
 708}
 709
 710/* Helper, prints out the preempt trace */
 711static void print_preempt_trace(uint64_t starttsc, int nr_print_rows)
 712{
 713        /* reusing nr_print_rows for the nr preempt/indirs rows as well */
 714
 715        int preempt_rows = MIN(MAX_NR_EVENT_TRACES, nr_print_rows);
 716        if (pargs.fake_vc_ctx) {
 717                printf("No preempt trace available when faking vc ctx\n");
 718                return;
 719        }
 720        printf("\n");
 721        printf("Nr Preempts: %d\n", atomic_read(&preempt_cnt));
 722        printf("Nr Indirs  : %d\n", atomic_read(&indir_cnt));
 723        if (preempt_rows)
 724                printf("Preempt/Indir events:\n-----------------\n");
 725        for (int i = 0; i < preempt_rows; i++) {
 726                if (preempts[i])
 727                        printf("Preempt %3d at %6llu\n",
 728                               i, tsc2msec(preempts[i] - starttsc));
 729        }
 730        for (int i = 0; i < preempt_rows; i++) {
 731                if (indirs[i])
 732                        printf("Indir   %3d at %6llu\n",
 733                               i, tsc2msec(indirs[i] - starttsc));
 734        }
 735}
 736
 737/* Make sure we have enough VCs for nr_threads, pref 1:1 at the start */
 738static void os_prep_work(pthread_t *worker_threads, int nr_threads)
 739{
 740        if (nr_threads > max_vcores()) {
 741                printf("Too many threads (%d) requested, can't get more than %d vc\n",
 742                       nr_threads, max_vcores());
 743                exit(-1);
 744        }
 745        atomic_init(&preempt_idx, 0);
 746        atomic_init(&indir_idx, 0);
 747        atomic_init(&preempt_cnt, 0);
 748        atomic_init(&indir_cnt, 0);
 749        parlib_never_yield = TRUE;
 750        pthread_need_tls(FALSE);
 751        pthread_mcp_init();             /* gives us one vcore */
 752        register_ev_handler(EV_VCORE_PREEMPT, trace_preempt, 0);
 753        register_ev_handler(EV_CHECK_MSGS, trace_indir, 0);
 754        if (pargs.fake_vc_ctx) {
 755                /* need to disable events when faking vc ctx.  since we're
 756                 * looping and not handling events, we could run OOM */
 757                clear_kevent_q(EV_VCORE_PREEMPT);
 758                clear_kevent_q(EV_CHECK_MSGS);
 759        }
 760        vcore_request_total(nr_threads);
 761        parlib_never_vc_request = TRUE;
 762        for (int i = 0; i < nr_threads; i++) {
 763                printd("Vcore %d mapped to pcore %d\n", i,
 764                       __procinfo.vcoremap[i].pcoreid);
 765        }
 766}
 767
 768static void os_post_work(pthread_t *worker_threads, int nr_threads)
 769{
 770}
 771
 772#endif
 773
 774/* Argument parsing */
 775static error_t parse_opt (int key, char *arg, struct argp_state *state)
 776{
 777        struct prog_args *pargs = state->input;
 778
 779        switch (key) {
 780        case 'w':
 781                pargs->nr_threads = atoi(arg);
 782                if (pargs->nr_threads < 0) {
 783                        printf("Negative nr_threads...\n\n");
 784                        argp_usage(state);
 785                }
 786                break;
 787        case 'l':
 788                pargs->nr_loops = atoi(arg);
 789                if (pargs->nr_loops < 0) {
 790                        printf("Negative nr_loops...\n\n");
 791                        argp_usage(state);
 792                }
 793                break;
 794        case OPT_ADJ_WORKERS:
 795                pargs->adj_workers = TRUE;
 796                break;
 797        case OPT_VC_CTX:
 798                pargs->fake_vc_ctx = TRUE;
 799                break;
 800        case 'h':
 801                pargs->hold_time = atoi(arg);
 802                if (pargs->hold_time < 0) {
 803                        printf("Negative hold_time...\n\n");
 804                        argp_usage(state);
 805                }
 806                break;
 807        case 'd':
 808                pargs->delay_time = atoi(arg);
 809                if (pargs->delay_time < 0) {
 810                        printf("Negative delay_time...\n\n");
 811                        argp_usage(state);
 812                }
 813                break;
 814        case 'o':
 815                pargs->outfile_path = arg;
 816                break;
 817        case 'p':
 818                pargs->nr_print_rows = atoi(arg);
 819                if (pargs->nr_print_rows < 0) {
 820                        printf("Negative print_rows...\n\n");
 821                        argp_usage(state);
 822                }
 823                break;
 824        case 't':
 825                if (!strcmp("mcs", arg)) {
 826                        pargs->lock_type = mcs_thread;
 827                        break;
 828                }
 829                if (!strcmp("mcscas", arg)) {
 830                        pargs->lock_type = mcscas_thread;
 831                        break;
 832                }
 833                if (!strcmp("mcspdr", arg)) {
 834                        pargs->lock_type = mcspdr_thread;
 835                        break;
 836                }
 837                if (!strcmp("mcspdro", arg)) {
 838                        pargs->lock_type = mcspdro_thread;
 839                        break;
 840                }
 841                if (!strcmp("__mcspdro", arg)) {
 842                        pargs->lock_type = __mcspdro_thread;
 843                        break;
 844                }
 845                if (!strcmp("spin", arg)) {
 846                        pargs->lock_type = spin_thread;
 847                        break;
 848                }
 849                if (!strcmp("spinpdr", arg)) {
 850                        pargs->lock_type = spinpdr_thread;
 851                        break;
 852                }
 853                printf("Unknown locktype %s\n\n", arg);
 854                argp_usage(state);
 855                break;
 856        case ARGP_KEY_ARG:
 857                printf("Warning, extra argument %s ignored\n\n", arg);
 858                break;
 859        case ARGP_KEY_END:
 860                if (!pargs->nr_threads) {
 861                        printf("Must select a number of threads.\n\n");
 862                        argp_usage(state);
 863                        break;
 864                }
 865                if (!pargs->nr_loops) {
 866                        printf("Must select a number of loops.\n\n");
 867                        argp_usage(state);
 868                        break;
 869                }
 870                if (!pargs->lock_type) {
 871                        printf("Must select a type of lock.\n\n");
 872                        argp_usage(state);
 873                        break;
 874                }
 875                break;
 876        default:
 877                return ARGP_ERR_UNKNOWN;
 878        }
 879        return 0;
 880}
 881
 882static struct argp argp = {options, parse_opt, args_doc, doc};
 883
 884int main(int argc, char** argv)
 885{
 886        pthread_t *worker_threads;
 887        void **loops_done;
 888        struct timeval start_tv = {0};
 889        struct timeval end_tv = {0};
 890        long usec_diff, total_loops = 0;
 891        uint64_t starttsc;
 892        int nr_threads, nr_loops;
 893        FILE *outfile;
 894        struct sample_stats acq_stats, hld_stats;
 895
 896        argp_parse(&argp, argc, argv, 0, 0, &pargs);
 897        nr_threads = pargs.nr_threads;
 898        nr_loops = pargs.nr_loops;
 899        mcs_pdr_init(&mcspdr_lock);
 900
 901        if (pargs.outfile_path) {
 902                /* RDWR, CREAT, TRUNC, O666 */
 903                outfile = fopen(pargs.outfile_path, "w+");
 904                if (!outfile) {
 905                        perror("outfile");
 906                        exit(-1);
 907                }
 908        }
 909        worker_threads = malloc(sizeof(pthread_t) * nr_threads);
 910        if (!worker_threads) {
 911                perror("pthread_t malloc failed:");
 912                exit(-1);
 913        }
 914        loops_done = malloc(sizeof(void*) * nr_threads);
 915        if (!loops_done) {
 916                perror("loops_done malloc failed");
 917                exit(-1);
 918        }
 919        printf("Making %d workers, %d loops each, %sadapting workers to vcores, and %sfaking vcore context\n",
 920               nr_threads, nr_loops,
 921               pargs.adj_workers ? "" : "not ",
 922               pargs.fake_vc_ctx ? "" : "not ");
 923        pthread_barrier_init(&start_test, NULL, nr_threads);
 924
 925        times = malloc(sizeof(struct time_stamp *) * nr_threads);
 926        assert(times);
 927        for (int i = 0; i < nr_threads; i++) {
 928                times[i] = malloc(sizeof(struct time_stamp) * nr_loops);
 929                if (!times[i]) {
 930                        perror("Record keeping malloc");
 931                        exit(-1);
 932                }
 933                memset(times[i], 0, sizeof(struct time_stamp) * nr_loops);
 934        }
 935        printf("Record tracking takes %ld bytes of memory\n",
 936               nr_threads * nr_loops * sizeof(struct time_stamp));
 937        os_prep_work(worker_threads, nr_threads);/* ensure we have enough VCs */
 938        /* Doing this in MCP ctx, so we might have been getting a few preempts
 939         * already.  Want to read start before the threads pass their barrier */
 940        starttsc = read_tsc();
 941        /* create and join on yield */
 942        for (long i = 0; i < nr_threads; i++) {
 943                if (pthread_create(&worker_threads[i], NULL, pargs.lock_type,
 944                                   (void*)i))
 945                        perror("pth_create failed");
 946        }
 947        os_post_work(worker_threads, nr_threads);
 948        if (gettimeofday(&start_tv, 0))
 949                perror("Start time error...");
 950        for (int i = 0; i < nr_threads; i++) {
 951                pthread_join(worker_threads[i], &loops_done[i]);
 952        }
 953        if (gettimeofday(&end_tv, 0))
 954                perror("End time error...");
 955
 956        printf("Acquire times (TSC Ticks)\n---------------------------\n");
 957        acq_stats.get_sample = get_acq_latency;
 958        compute_stats((void**)times, nr_threads, nr_loops, &acq_stats);
 959
 960        printf("Held times (from acq til rel done) (TSC Ticks)\n------\n");
 961        hld_stats.get_sample = get_hld_latency;
 962        compute_stats((void**)times, nr_threads, nr_loops, &hld_stats);
 963
 964        usec_diff = (end_tv.tv_sec - start_tv.tv_sec) * 1000000 +
 965                    (end_tv.tv_usec - start_tv.tv_usec);
 966        printf("Time to run: %ld usec\n", usec_diff);
 967
 968        printf("\nLock throughput:\n-----------------\n");
 969        /* throughput for the entire duration (in ms), 1ms steps.  print as many
 970         * steps as they ask for (up to the end of the run). */
 971        print_throughput((void**)times, usec_diff / 1000 + 1, msec2tsc(1),
 972                         pargs.nr_print_rows,
 973                         starttsc, nr_threads,
 974                         nr_loops, get_acq_timestamp);
 975        print_preempt_trace(starttsc, pargs.nr_print_rows);
 976
 977        for (int i = 0; i < nr_threads; i++) {
 978                total_loops += (long)loops_done[i];
 979                if (!loops_done[i])
 980                        printf("WARNING: thread %d performed 0 loops!\n", i);
 981        }
 982        printf("Average number of loops done, per thread: %ld\n",
 983               total_loops / nr_threads);
 984        for (int i = 0; i < nr_threads; i++)
 985                printf("\tThread %d performed %ld loops\n",
 986                       i, (long)loops_done[i]);
 987
 988        if (pargs.outfile_path) {
 989                fprintf(outfile, "#");
 990                for (char **arg = argv; *arg; arg++)
 991                        fprintf(outfile, " %s", *arg);
 992                fprintf(outfile, "\n");
 993                fprintf(outfile, "# thread_id attempt pre acq(uire) un(lock) "
 994                                 "tsc_overhead\n");
 995                fprintf(outfile,
 996                        "# acquire latency: acq - pre - tsc_overhead\n");
 997                fprintf(outfile, "# hold time: un - acq - tsc_overhead\n");
 998                fprintf(outfile, "# tsc_frequency %llu\n", get_tsc_freq());
 999                fprintf(outfile,
1000                        "# tsc_overhead is 0 on linux, hard code it with a value from akaros\n");
1001                for (int i = 0; i < nr_threads; i++) {
1002                        for (int j = 0; j < nr_loops; j++) {
1003                                struct time_stamp *ts = &times[i][j];
1004                                if (!ts->pre)
1005                                        break; /* empty record */
1006                                fprintf(outfile, "%d %d %llu %llu %llu %llu\n",
1007                                        i, j, ts->pre, ts->acq, ts->un,
1008                                        get_tsc_overhead());
1009                        }
1010                }
1011                fclose(outfile);
1012        }
1013        printf("Done, exiting\n");
1014}
1015