1 /* Copyright (c) 2013 The Regents of the University of California
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * See LICENSE for details.
4  *
5  * lock_test: microbenchmark to measure different styles of spinlocks. */
6
7 #include <stdio.h>
8 #include <pthread.h>
9 #include <stdlib.h>
10 #include <unistd.h>
11 #include <sys/time.h>
12 #include <math.h>
13 #include <argp.h>
14
15 #include <tsc-compat.h>
16 #include <measure.h>
17
18 /* OS dependent #incs */
19 #include <parlib.h>
20 #include <vcore.h>
21 #include <timing.h>
22 #include <spinlock.h>
23 #include <mcs.h>
24 #include <arch/arch.h>
25 #include <event.h>
26
27 /* TODO: There's a lot of work to do still, both on this program and on locking
28  * and vcore code.  For some of the issues, I'll leave in the discussion /
29  * answers, in case it comes up in the future (like when I read this in 8
30  * months).
31  *
32  * BUGS / COMMENTARY
33  *              why aren't MCS locks in uth_ctx getting dealt with?
34  *                      - because the event is handled, but the lock holder isn't run.  the
35  *                      preemption was dealt with, but nothing saved the lock holder
36  *                      - any uthread_ctx lockholder that gets preempted will get
37  *                      interrupted, and other cores will handle the preemption.  but that
38  *                      uthread won't run again without 2LS support.  either all spinners
39  *                      need to be aware of the 'lockholder' (PDR-style), or the 2LS needs
40  *                      to know when a uthread becomes a 'lockholder' to make sure it runs
41  *                      via user-level preempts.  If the latter, this needs to happen
42  *                      atomically with grabbing the lock, or else be able to handle lots of
43  *                      fake 'lockholders' (like round-robin among all of them)
44  *              why is the delay more than the expected delay?
45  *                      because it takes ~2ms to spawn and run a process
46  *                      could do this in a separate process, instead of a script
47  *                              could also consider not using pth_test and changing prov, but
48  *                              driving it by yields and requests.  would also test the
49  *                              alarm/wakeup code (process sets alarm, goes to sleep, wakes up
50  *                              and requests X cores)
51  *              why do we get occasional preempt-storms? (lots of change_tos)
52  *                      due to the MCS-PDR chain, which i tried fixing by adjusting the
53  *                      number of workers down to the number of vcores
54  *                      why isn't the worker adaptation working?
55  *                              - it actually was working, and nr_workers == nr_vcores.  that
56  *                              just wasn't the root cause.
57  *                              - was expecting it to cut down on PDR kernel traffic
58  *                      - still get periods of low perf
59  *                              like O(100) preempt msgs per big preempt/prov
60  *                              does it really take that much to work out an MCS-PDR?
61  *                      - one thing is that if we fake vc ctx, we never receive preemption
62  *                      events.  might be a bad idea.
63  *                              - in general, yeah.  faking VC and turning off events can really
64  *                              muck with things
65  *                              - these events aren't necessarily delivered to a VC who will
66  *                              check events any time soon (might be the last one in the chain)
67  *                              - the core of the issue is that we have the right amount of
68  *                              workers and vcores, but that the system isn't given a chance to
69  *                              stabilize itself.  also, if we have some VCs that are just
70  *                              sitting around, spinning in the 2LS, if those get preempted, no
71  *                              one notices or cares (when faking vc_ctx / getting no events)
72  *                      - there is a slight race where we might make someone run who isn't a
73  *                      lockholder.  logically, it's okay.  worst case, it would act like an
74  *                      extra preempt and different startcore, which shouldn't be too bad.
75  *
76  *              sanity check: does throughput match latency? (2.5GHz TSC, MCS lock)
77  *                      ex: 5000 locks/ms = 5 locks/us = 200ns/lock = 500 ticks/lock
78  *                      500 ticks * 31 workers (queue) ~= 15,500 ticks
79  *                      avg acquire time was around 14K.  seems fine.
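 *                      (in general, a rough model for a fully saturated MCS queue:
 *                       expected acquire latency ~= ticks per lock handoff *
 *                       (nr_workers - 1), assuming every worker requeues immediately)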
80  *
81  *                      when our MCSPDR throughput tanks (during preempts), it's around
82  *                      400-500 locks/ms, which is around 2us/lock.  
83  *                              when the locker on a preempted chain shows up, it needs to
84  *                              change to the next one in line. 
85  *                                      - though that should be in parallel with the other
86  *                                      lockholders letting go.  shouldn't be that bad
87  *                                      - no, it is probably at the head of the chain very soon,
88  *                                      such that it is the bottleneck for the actual lock.  2us
89  *                                      seems possible
90  *
91  *              what does it take to get out of a preemption with MCS-PDR?
92  *                      - for a single preempt, it will take 1..n-1 changes.  avg n/2
93  *                      - for multiple preempts, it's nr_pre * that (avg np/2, worst np)
94  *                      - for every unlock/reacquire cycle (someone unlocks, then rejoins
95  *                      the list), it's nr_preempts (aka, nr_workers - nr_vcores)
96  *                      - if we need to have a specific worker get out of the chain, on
97  *                      average, it'd take n/2 cycles (p*n/2 changes)  worst: np
98  *                      - if we want to get multiple workers out, the worst case is still
99  *                      np, but as p increases, we're more likely to approach n cycles
100  *                      - so the current model is np for the initial hit (to move the
101  *                      offline VCs to the end of the chain) and another np to get our
102  *                      specific workers out of the chain and yielding (2np)
103  *
104  *                      - but even with 1 preempt, we're getting 80-200 changes per preempt
105  *
106  *                      - it shouldn't matter that the sys_change_to is really slow, should
107  *                      be the same amount of changes.  however, the preempted ones are
108  *                      never really at the tail end of the chain - they should end up right
109  *                      before the lockholder often.  while the sys_change_tos are slowly
110  *                      moving towards the back of the chain, the locking code is quickly
111  *                      removing (online) nodes from the head and putting them on the back.
112  *
113  *                      - end result: based on lock hold time and lock delay time, a
114  *                      preempted VC stays in the MCS chain (swaps btw VC/nodes), and when
115  *                      it is inside the chain, someone is polling to make them run.  with
116  *                      someone polling, it is extremely unlikely that someone outside the
117  *                      chain will win the race and be able to change_to before the in-chain
118  *                      poller.  to clarify:
119  *                              - hold time and delay time matter, since the longer they are,
120  *                              the greater the amount of time the change_to percolation has to
121  *                              get the preempted VCs to the end of the chain (where no one
122  *                              polls them).
123  *                              - at least one vcore is getting the event to handle the
124  *                              preemption of the in-chain, offline VC.  we could change it so
125  *                              every VC polls the preempt_evq, or just wait til whoever is
126  *                              getting the messages eventually checks their messages (VC0)
127  *                              - if there is an in-chain poller, they will notice the instant
128  *                              the VC map changes, and then immediately change_to (and spin on
129  *                              the proclock in the kernel).  there's almost no chance of a
130  *                              normal preempt event handler doing that faster.  (would require
131  *                              some IRQ latency or something serious).
132  *                      - adding in any hold time trashes our microbenchmark's perf, but a
133  *                      little delay time actually helps: (all with no preempts going on)
134  *                              - mcspdr, no delay: 4200-4400 (-w31 -l10000, no faking, etc)
135  *                              - mcspdr, d = 1: 4400-4800
136  *                              - mcspdr, d = 2: 4200-5200
137  *                              - as you add delay, it cuts down on contention for the
138  *                              lock->lock cacheline.  but if you add in too much, you'll tank
139  *                              throughput (since there is no contention at all).  sweet spot
140  *                              for 31 cores on c89 was around 2-3.
141  *                              - as we increase the delay, we cut down on the chance of the
142  *                              preempt storm / preempt-stuck-in-the-chain, though it can still
143  *                              happen, even with a delay of 10us
144  *                                      - though as we increase the delay, we increase the chance of
145  *                                      the preempted vcore not being in the chain, so it's not clear
146  *                                      that having a higher delay actually helped with MCS-chain
147  *                                      preemptions
148  *                      - maybe add in the lockholder again? (removed in 73701d6bfb)
149  *                              - massively cuts performance, by about 2x in throughput, even without preempts
150  *                              - its ability to help depends on the impl:
151  *                                      - in one version (old style), it didn't help much at all
152  *                                      - in another (optimized lockholder setting), i can't even
153  *                                      see the throughput hit, it recovered right away, with O(5)
154  *                                      messages
155  *                                      - the diff was having the lockholder assign the vcoreid
156  *                                      before passing off to the next in the chain, so that there
157  *                                      is less time with having "no lockholder".  (there's a brief
158  *                                      period where the lockholder says it is the next person, who
159  *                                      still spins.  they'll have to make sure their pred runs)
160  *                                              - read in lockholder_vcoreid.  if -1 or us, ensure pred,
161  *                                              o/w ensure lockholder.  when passing the lock, read
162  *                                              their vcoreid and set it.  if unlocked, set to -1, etc.
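 *                                              a rough sketch of that variant (hypothetical
 *                                              field/helper names, not the current mcs_pdr code):
 *                                                      acquire, while spinning on our qnode:
 *                                                              lh = ACCESS_ONCE(lock->lockholder_vcoreid);
 *                                                              if (lh == -1 || lh == vcore_id())
 *                                                                      ensure_vcore_runs(pred_vcoreid);
 *                                                              else
 *                                                                      ensure_vcore_runs(lh);
 *                                                      release, when passing to a successor:
 *                                                              lock->lockholder_vcoreid = succ_vcoreid;
 *                                                              (or -1 if no one is queued)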
163  *                                      - adj_workers doesn't matter either...
164  *                                              - the 2LS and preemption handling might be doing this
165  *                                              automatically, when handle_preempt() does a
166  *                                              thread_paused() on its current_uthread.
167  *                                              - adj_workers isn't critical if we're using some locks
168  *                                              that check notif_pending.  eventually someone hears
169  *                                              about preempted VCs (assuming we can keep up)
170  *                      - or spin a bit first, before ensuring?
171  *                              ensuring once every 1000 spins, for instance, didn't help break
172  *                              out of the chain, and just decreased our speed in resolving
173  *                              preempts
174  *                      - it's not enough to know that there haven't been any new
175  *                      preemptions.  your lockholder could already have been preempted.
176  *                      - and any mechanism that allows all lockers to know the lockholder
177  *                      will cause cache traffic (every lock acquire is a global cache-line
178  *                      invalidate, which everyone reacquires quickly)
179  *
180  *                      What about delays?  both hold and delay should make it easier to get
181  *                      the preempted vcore to the end of the chain.  but do they have to be
182  *                      too big to be reasonable?
183  *                              - yes.  hold doesn't really help much until everything is slower.
184  *                              even with a hold of around 1.2us, we still have the
185  *                              change_to-storms and lowered throughput.
186  *                              - doing a combo helps too.  if you hold for 1ns (quite a bit
187  *                              more actually, due to the overhead of ndelay, but sufficient to
188  *                              be "doing work") and delay for around 7us before rejoining,
189  *                              there's only about a 1/5 chance of a single preempt messing us
190  *                              up
191  *                                      - though having multiple preempts outstanding makes this less
192  *                                      likely to work.
193  *                                      - and it seems like if we get into the storm scenario, we
194  *                                      never really get out.  either we get out quickly or never do.
195  *                                      depending on the workload, this could be a matter of luck
196  *
197  *      Summary:
198  *              So we need to have someone outside the chain change_to the one in the
199  *              chain; o/w, someone will always be in the chain.  Right now, it's always
200  *              the next in line who is doing the changing, so a preempted vcore is
201  *              always still in the chain. 
202  *
203  *              If the locking workload has some delaying, such as while holding the
204  *              lock or before reacquiring, the "change_to" storm might not be a
205  *              problem.  If it is, the only alternative I have so far is to check the
206  *              lockholder (which prevents a chain member from always ensuring their
207  *              pred runs).  This hurts the lock's scalability/performance when we
208  *              aren't being preempted.  On the other hand, based on what you're doing
209  *              with the lock, one more cache miss might not be as big of a deal as in
210  *              lock_test, especially since, if you get stormed, your throughput could be
211  *              terrible and never recover.
212  *
213  *              Similar point: you can use spinpdr locks.  They have the PDR-benefits,
214  *              and won't induce the storm of change_tos.  However, this isn't much
215  *              better for contended locks.  They perform 2-3x worse (on c89) without
216  *              preemption.  Arguably, if you were worried about the preempt storms and
217  *              want scalability, you might want to use mcspdr with lockholders.
218  *
219  *
220  * PROGRAM FEATURES
221  *              - verbosity?  vcoremap, preempts, the throughput and latency histograms?
222  *              - have a max workers option (0?) == max vcores
223  *              - would like to randomize (within bounds) the hold/delay times
224  *                      - help avoid convoys with MCS locks
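 *                      a minimal sketch (hypothetical; this program doesn't currently
 *                      seed or use rand_r()):
 *                              if (delay_time)
 *                                      ndelay(delay_time / 2 +
 *                                             rand_r(&seed) % delay_time);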
225  *
226  * PERFORMANCE:
227  *              pcore control?  (hyperthreading, core 0, cross socket?)
228  *                      want some options for controlling which threads run where, or which
229  *                      vcores are even used (like turning off hyperthreading)?
230  *              implement ticket spinlocks?  (more fair, more effects of preempts)
231  *                      no simple way to do PDR either, other than 'check everyone'
232  *              MCS vs MCSPDR vs __MCSPDR
233  *                      MCS seems slightly better than __MCSPDR (and it should)
234  *                      MCSPDR is a bit worse than __MCSPDR
235  *                              - the uth_disable/enable code seems to make a difference.
236  *                              - i see why the latencies are worse, since they have extra work
237  *                              to do, but the internal part that contends with other cores
238  *                              shouldn't be affected, unless there's some other thing going on.
239  *                              Or perhaps there isn't always someone waiting for the lock?
240  *                              - faking VC ctx mostly negates the cost of MCSPDR vs __MCSPDR
241  *                      things that made a big diff: CL aligning the qnodes, putting qnodes
242  *                      on stacks, reading in the vcoreid once before ensuring()
243  *              both MCS CAS unlocks could use some branch prediction work
244  *              spinpdr locks are 2-3x faster than spinlocks...
245  *                      test, test&set  vs the existing test&set, plus lots of asserts
246  *
247  *              some delay (like 10us) lowers latency while maintaining throughput
248  *                      - makes sense esp with MCS.  if you join the queue at the last
249  *                      second, you'll measure lower latency than attempting right away
250  *                      - also true for spinlocks
251  *                      - we can probably figure out the max throughput (TP = f(delay)) for
252  *                      each lock type
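 *                      a rough model (an assumption, not measured here): with W workers,
 *                      TP(delay) ~= W / (acquire + hold + delay) while the lock is
 *                      uncontended, capped at 1 / (hold + handoff) once it saturates;
 *                      peak TP should be near the delay where those two curves cross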
253  *
254  *              hard to get steady numbers with MCS - different runs of the same test
255  *              will vary in throughput by around 15-30% (e.g., MCS varying from 3k-4k
256  *              L/ms)
257  *                      - happens on c89 (NUMA) and hossin (UMA)
258  *                      - spinlocks seem a little steadier.
259  *                      - for MCS locks, the order in which they line up across the pcores
260  *                      will matter.  like if on one run, i regularly hand off between cores
261  *                      in the same socket and only do one cross-socket step
262  *                      - run a lot of shorter ones to get a trend, for now
263  *                      - might be correlated with spikes in held times (last bin)
264  *                      - can't turn off legacy USB on c89 (SMM) - interferes with PXE
265  *
266  * PREEMPTS:
267  *              better preempt record tracking?
268  *                      i just hacked some event-intercept and timestamp code together
269  *                      maybe put it in the event library?
270  *                      the timestamps definitely helped debugging
271  *
272  *              is it true that if uthread code never spins outside a PDR lock, then it
273  *              doesn't need preemption IPIs?  (just someone checks the event at some
274  *              point). 
275  *                      think so: so long as you make progress and when you aren't, you
276  *                      check events (like if a uthread blocks on something and enters VC
277  *                      ctx)
278  *              adjusting the number of workers, whether vcores or uthreads
279  *              - if you have more lockers than cores:
280  *                      - spinpdr: a worker will get starved (akaros) (without 2LS support)
281  *                              - running this from uth context will cause a handle_events
282  *                      - mcspdr will require the kernel to switch (akaros)
283  *                      - spin (akaros) might DL (o/w nothing), (linux) poor perf
284  *                      - mcs (akaros) will DL, (linux) poor perf
285  *                      - poor perf (latency spikes) comes from running the wrong thread
286  *                      sometimes
287  *                      - deadlock comes from the lack of kernel-level context switching
288  *              - if we scale workers down to the number of active vcores:
289  *                      - two things: the initial hit, and the steady state.  during the
290  *                      initial hit, we can still deadlock, since we have more lockers than
291  *                      cores
292  *                              - non-pdr (akaros) could deadlock in the initial hit
293  *                              - (akaros) steady state, everything is normal (just fewer cores)
294  *                      - how can we adjust this in linux?
295  *                              - if you know how many cores you have, then futex-wait the others (sketch below)
296  *                              - need some way to wake them back up
297  *                              - if you do this in userspace, you might need something PDR-like
298  *                              to handle when the "2LS" code gets preempted
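 *                              a sketch of parking extra workers on Linux (hypothetical
 *                              globals; futex(2) via syscall(2)):
 *                                      while ((cur = nr_active_workers) <= my_id)
 *                                              syscall(SYS_futex, &nr_active_workers,
 *                                                      FUTEX_WAIT, cur, NULL, NULL, 0);
 *                              and after raising nr_active_workers, wake the parked workers:
 *                                      syscall(SYS_futex, &nr_active_workers, FUTEX_WAKE,
 *                                              INT_MAX, NULL, NULL, 0);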
299  *                      - as mentioned above, the problem in akaros is that the lock/unlock
300  *                      might be happening too fast to get into the steady-state and recover
301  *                      from the initial preemption
302  *              - one of our benefits is that we can adapt in userspace, with userspace
303  *              knowledge, under any circumstance.
304  *                      - we have the deadlock windows (forcing PDR).
305  *                      - in return, we can do this adaptation in userspace
306  *                      - and (arguably) anyone who does this in userspace will need PDR
307  *
308  * MEASUREMENT (user/parlib/measure.c)
309  *              extract into its own library, for linux apps
310  *              print out raw TSC times?  might help sync up diff timelines
311  *              Need more latency bins, spinlocks vary too much
312  *              maybe we need better high/low too, since this hist looks bad too
313  *                      or not center on the average?
314  *                      for printing, it's hard to know without already binning.
315  *                      maybe bin once (latency?), then use that to adjust the hist?
316  *
317  *              Had this on a spinlock:
318  *              [      32 -    35656] 1565231:
319  *              (less than 200 intermediate)
320  *          [  286557 - 20404788]   65298: *
321  *
322  *              Samples per dot: 34782
323  *              Total samples: 1640606
324  *              Avg time   : 96658
325  *              Stdev time : 604064.440882
326  *              Coef Var   : 6.249503
327  *                      High coeff of var with serious outliers, adjusted bins
328  *                      50/75/90/99: 33079 / 33079 / 33079 / 290219 (-<860)
329  *                      Min / Max  : 32 / 20404788
330  *              was 50/75/90 really within 860 of each other?
331  *
332  *              when we are preempted and don't even attempt anything, say for 10ms, it
333  *              actually doesn't hurt our 50/75/90/99 too much.  we have a ridiculous
334  *              stddev and max, and high average, but there aren't any additional
335  *              attempts at locking to mess with the attempt-latency.  Only nr_vcores
336  *              requests are in flight during the preemption, but we can spit out around
337  *              5000 per ms when we aren't preempted.
338  *
339  */
340
341 const char *argp_program_version = "lock_test v0.1475263";
342 const char *argp_program_bug_address = "<akaros@lists.eecs.berkeley.edu>";
343
344 static char doc[] = "lock_test -- spinlock benchmarking";
345 static char args_doc[] = "-w NUM -l NUM -t LOCK";
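/* Example invocation (using the values from the notes above):
 *      lock_test -w 31 -l 10000 -t mcspdr */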
346
347 #define OPT_VC_CTX 1
348 #define OPT_ADJ_WORKERS 2
349
350 static struct argp_option options[] = {
351         {"workers",             'w', "NUM",     OPTION_NO_USAGE, "Number of threads/cores"},
352         {0, 0, 0, 0, ""},
353         {"loops",               'l', "NUM",     OPTION_NO_USAGE, "Number of loops per worker"},
354         {0, 0, 0, 0, ""},
355         {"type",                't', "LOCK",   OPTION_NO_USAGE, "Type of lock to use.  "
356                                                      "Options:\n"
357                                                      "\tmcs\n"
358                                                      "\tmcscas\n"
359                                                      "\tmcspdr\n"
360                                                      "\t__mcspdr\n"
361                                                      "\tspin\n"
362                                                      "\tspinpdr"},
363         {0, 0, 0, 0, "Other options (not mandatory):"},
364         {"adj_workers", OPT_ADJ_WORKERS, 0,     0, "Adjust workers such that the "
365                                                "number of workers equals the "
366                                                "number of vcores"},
367         {"vc_ctx",              OPT_VC_CTX, 0,  0, "Run threads in mock-vcore context"},
368         {0, 0, 0, 0, ""},
369         {"hold",                'h', "NSEC",    0, "nsec to hold the lock"},
370         {"delay",               'd', "NSEC",    0, "nsec to delay between grabs"},
371         {"print",               'p', "ROWS",    0, "Print ROWS of optional measurements"},
372         { 0 }
373 };
374
375 struct prog_args {
376         int                                                     nr_threads;
377         int                                                     nr_loops;
378         int                                                     hold_time;
379         int                                                     delay_time;
380         int                                                     nr_print_rows;
381         bool                                            fake_vc_ctx;
382         bool                                            adj_workers;
383         void *(*lock_type)(void *arg);
384 };
385 struct prog_args pargs = {0};
386
387 /* Globals */
388 struct time_stamp {
389         uint64_t pre;
390         uint64_t acq;
391         uint64_t un;
392 };
393 struct time_stamp **times;
394 bool run_locktest = TRUE;
395 pthread_barrier_t start_test;
396
397 /* Locking functions.  Define globals here, init them in main (if possible), and
398  * use the lock_func() macro to make your thread func. */
399
400 spinlock_t spin_lock = SPINLOCK_INITIALIZER;
401 struct spin_pdr_lock spdr_lock = SPINPDR_INITIALIZER;
402 struct mcs_lock mcs_lock = MCS_LOCK_INIT;
403 struct mcs_pdr_lock mcspdr_lock = MCSPDR_LOCK_INIT;
404
405 #define lock_func(lock_name, lock_cmd, unlock_cmd)                             \
406 void *lock_name##_thread(void *arg)                                            \
407 {                                                                              \
408         int thread_id = (int)(long)arg;                                            \
409         int hold_time = ACCESS_ONCE(pargs.hold_time);                              \
410         int delay_time = ACCESS_ONCE(pargs.delay_time);                            \
411         int nr_loops = ACCESS_ONCE(pargs.nr_loops);                                \
412         bool fake_vc_ctx = ACCESS_ONCE(pargs.fake_vc_ctx);                         \
413         bool adj_workers = ACCESS_ONCE(pargs.adj_workers);                         \
414         uint64_t pre_lock, acq_lock, un_lock;                                      \
415         struct time_stamp *this_time;                                              \
416         struct mcs_lock_qnode mcs_qnode = MCS_QNODE_INIT;                          \
417         struct mcs_pdr_qnode pdr_qnode = MCSPDR_QNODE_INIT;                        \
418         /* guessing a unique vcoreid for the __mcspdr test.  if the
419          * program gets preempted for that test, things may go nuts */             \
420         pdr_qnode.vcoreid = thread_id - 1;                                         \
421         /* Wait til all threads are created.  Ideally, I'd like to busywait unless
422          * absolutely critical to yield */                                         \
423         pthread_barrier_wait(&start_test);                                         \
424         if (fake_vc_ctx) {                                                         \
425                 /* tells the kernel / other vcores we're in vc ctx */                  \
426                 uth_disable_notifs();                                                  \
427                 /* tricks ourselves into believing we're in vc ctx */                  \
428                 __vcore_context = TRUE;                                                \
429         }                                                                          \
430         for (int i = 0; i < nr_loops; i++) {                                       \
431                 if (!run_locktest)                                                     \
432                         break;                                                             \
433                 pre_lock = read_tsc_serialized();                                      \
434                                                                                \
435                 lock_cmd                                                               \
436                                                                                \
437                 acq_lock = read_tsc_serialized();                                      \
438                 if (hold_time)                                                         \
439                         ndelay(hold_time);                                                 \
440                                                                                \
441                 unlock_cmd                                                             \
442                                                                                \
443                 un_lock = read_tsc_serialized();                                       \
444                 this_time = &times[thread_id][i];                                      \
445                 this_time->pre = pre_lock;                                             \
446                 this_time->acq = acq_lock;                                             \
447                 this_time->un = un_lock;                                               \
448                                                                                \
449                 if (delay_time)                                                        \
450                         ndelay(delay_time);                                                \
451                 /* worker thread ids are 0..n-1.  if we're one of the threads that's
452                  * beyond the VC count, we yield. */                                   \
453                 if (adj_workers && num_vcores() < thread_id + 1) {                     \
454                         if (fake_vc_ctx) {                                                 \
455                                 __vcore_context = FALSE;                                       \
456                                 uth_enable_notifs();                                           \
457                         }                                                                  \
458                         /* we'll come back up once we have enough VCs running */           \
459                         pthread_yield();                                                   \
460                         if (fake_vc_ctx) {                                                 \
461                                 uth_disable_notifs();                                          \
462                                 __vcore_context = TRUE;                                        \
463                         }                                                                  \
464                 }                                                                      \
465                 cmb();                                                                 \
466         }                                                                          \
467         /* First thread to finish stops the test */                                \
468         run_locktest = FALSE;                                                      \
469         if (fake_vc_ctx) {                                                         \
470                 __vcore_context = FALSE;                                               \
471                 uth_enable_notifs();                                                   \
472         }                                                                          \
473         return arg;                                                                \
474 }
475
476 /* Defines locking funcs like "mcs_thread" */
477 lock_func(mcs,
478           mcs_lock_lock(&mcs_lock, &mcs_qnode);,
479           mcs_lock_unlock(&mcs_lock, &mcs_qnode);)
480 lock_func(mcscas,
481           mcs_lock_lock(&mcs_lock, &mcs_qnode);,
482           mcs_lock_unlock_cas(&mcs_lock, &mcs_qnode);)
483 lock_func(mcspdr,
484           mcs_pdr_lock(&mcspdr_lock, &pdr_qnode);,
485           mcs_pdr_unlock(&mcspdr_lock, &pdr_qnode);)
486 lock_func(__mcspdr,
487           __mcs_pdr_lock(&mcspdr_lock, &pdr_qnode);,
488           __mcs_pdr_unlock(&mcspdr_lock, &pdr_qnode);)
489 lock_func(spin,
490           spinlock_lock(&spin_lock);,
491           spinlock_unlock(&spin_lock);)
492 lock_func(spinpdr,
493           spin_pdr_lock(&spdr_lock);,
494           spin_pdr_unlock(&spdr_lock);)
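
/* Example sketch: the PERFORMANCE notes above mention trying ticket spinlocks
 * (fairer, but more sensitive to preempts, and no simple PDR).  This is a
 * hypothetical, non-PDR ticket lock using GCC's __sync builtins; it is not
 * wired into the 't' option parser. */
struct ticket_lock {
	volatile uint32_t next_ticket;
	volatile uint32_t now_serving;
};
struct ticket_lock ticket_lock = {0, 0};

static void ticket_lock_lock(struct ticket_lock *lock)
{
	/* grab the next ticket, then wait until it is the one being served */
	uint32_t my_ticket = __sync_fetch_and_add(&lock->next_ticket, 1);

	while (lock->now_serving != my_ticket)
		cmb();
}

static void ticket_lock_unlock(struct ticket_lock *lock)
{
	/* pass the lock to the next ticket holder, if any */
	lock->now_serving++;
}

lock_func(ticket,
          ticket_lock_lock(&ticket_lock);,
          ticket_lock_unlock(&ticket_lock);)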
495
496 static int get_acq_latency(void **data, int i, int j, uint64_t *sample)
497 {
498         struct time_stamp **times = (struct time_stamp**)data;
499         /* 0 for initial time means we didn't measure */
500         if (times[i][j].pre == 0)
501                 return -1;
502         *sample = times[i][j].acq - times[i][j].pre - get_tsc_overhead();
503         return 0;
504 }
505
506 static int get_hld_latency(void **data, int i, int j, uint64_t *sample)
507 {
508         struct time_stamp **times = (struct time_stamp**)data;
509         /* 0 for initial time means we didn't measure */
510         if (times[i][j].pre == 0)
511                 return -1;
512         *sample = times[i][j].un - times[i][j].acq - get_tsc_overhead();
513         return 0;
514 }
515
516 static int get_acq_timestamp(void **data, int i, int j, uint64_t *sample)
517 {
518         struct time_stamp **times = (struct time_stamp**)data;
519         /* 0 for initial time means we didn't measure */
520         if (times[i][j].pre == 0)
521                 return -1;
522         *sample = times[i][j].acq;
523         return 0;
524 }
525
526 /* Lousy event intercept.  build something similar in the event library? */
527 #define MAX_NR_EVENT_TRACES 1000
528 uint64_t preempts[MAX_NR_EVENT_TRACES] = {0};
529 uint64_t indirs[MAX_NR_EVENT_TRACES] = {0};
530 atomic_t preempt_idx;
531 atomic_t indir_idx;
532
533 static void handle_preempt(struct event_msg *ev_msg, unsigned int ev_type)
534 {
535         unsigned long my_slot = atomic_fetch_and_add(&preempt_idx, 1);
536         if (my_slot < MAX_NR_EVENT_TRACES)
537                 preempts[my_slot] = read_tsc();
538         handle_vc_preempt(ev_msg, ev_type);
539 }
540
541 static void handle_indir(struct event_msg *ev_msg, unsigned int ev_type)
542 {
543         unsigned long my_slot = atomic_fetch_and_add(&indir_idx, 1);
544         if (my_slot < MAX_NR_EVENT_TRACES)
545                 indirs[my_slot] = read_tsc();
546         handle_vc_indir(ev_msg, ev_type);
547 }
548
549 /* Helper, prints out the preempt trace */
550 static void print_preempt_trace(uint64_t starttsc, int nr_print_rows)
551 {
552         /* reusing nr_print_rows for the nr preempt/indirs rows as well */
553         int preempt_rows = MIN(MAX_NR_EVENT_TRACES, nr_print_rows);
554         if (pargs.fake_vc_ctx) {
555                 printf("No preempt trace available when faking vc ctx\n");
556                 return;
557         }
558         if (preempt_rows)
559                 printf("\nPreempt/Indir events:\n-----------------\n");
560         for (int i = 0; i < preempt_rows; i++) {
561                 if (preempts[i])
562                         printf("Preempt %3d at %6llu\n", i, tsc2msec(preempts[i]
563                                                                      - starttsc));
564         }
565         for (int i = 0; i < preempt_rows; i++) {
566                 if (indirs[i])
567                         printf("Indir   %3d at %6llu\n", i, tsc2msec(indirs[i]
568                                                                      - starttsc));
569         }
570 }
571
572 /* Make sure we have enough VCs for nr_threads, pref 1:1 at the start */
573 static void os_prep_work(int nr_threads)
574 {
575         if (nr_threads > max_vcores()) {
576                 printf("Too many threads (%d) requested, can't get more than %d vc\n",
577                        nr_threads, max_vcores());
578                 exit(-1);
579         }
580         atomic_init(&preempt_idx, 0);
581         atomic_init(&indir_idx, 0);
582         pthread_can_vcore_request(FALSE);       /* 2LS won't manage vcores */
583         pthread_lib_init();                                     /* gives us one vcore */
584         ev_handlers[EV_VCORE_PREEMPT] = handle_preempt;
585         ev_handlers[EV_CHECK_MSGS] = handle_indir;
586         if (pargs.fake_vc_ctx) {
587                 /* need to disable events when faking vc ctx.  since we're looping and
588                  * not handling events, we could run OOM */
589                 clear_kevent_q(EV_VCORE_PREEMPT);
590                 clear_kevent_q(EV_CHECK_MSGS);
591         }
592         if (vcore_request(nr_threads - 1)) {
593                 printf("Failed to request %d more vcores, currently have %d\n",
594                        nr_threads - 1, num_vcores());
595                 exit(-1);
596         }
597         for (int i = 0; i < nr_threads; i++) {
598                 printd("Vcore %d mapped to pcore %d\n", i,
599                        __procinfo.vcoremap[i].pcoreid);
600         }
601 }
602
603 /* Argument parsing */
604 static error_t parse_opt (int key, char *arg, struct argp_state *state)
605 {
606         struct prog_args *pargs = state->input;
607         switch (key) {
608                 case 'w':
609                         pargs->nr_threads = atoi(arg);
610                         if (pargs->nr_threads < 0) {
611                                 printf("Negative nr_threads...\n\n");
612                                 argp_usage(state);
613                         }
614                         break;
615                 case 'l':
616                         pargs->nr_loops = atoi(arg);
617                         if (pargs->nr_loops < 0) {
618                                 printf("Negative nr_loops...\n\n");
619                                 argp_usage(state);
620                         }
621                         break;
622                 case OPT_ADJ_WORKERS:
623                         pargs->adj_workers = TRUE;
624                         break;
625                 case OPT_VC_CTX:
626                         pargs->fake_vc_ctx = TRUE;
627                         break;
628                 case 'h':
629                         pargs->hold_time = atoi(arg);
630                         if (pargs->hold_time < 0) {
631                                 printf("Negative hold_time...\n\n");
632                                 argp_usage(state);
633                         }
634                         break;
635                 case 'd':
636                         pargs->delay_time = atoi(arg);
637                         if (pargs->delay_time < 0) {
638                                 printf("Negative delay_time...\n\n");
639                                 argp_usage(state);
640                         }
641                         break;
642                 case 'p':
643                         pargs->nr_print_rows = atoi(arg);
644                         if (pargs->nr_print_rows < 0) {
645                                 printf("Negative print_rows...\n\n");
646                                 argp_usage(state);
647                         }
648                         break;
649                 case 't':
650                         if (!strcmp("mcs", arg)) {
651                                 pargs->lock_type = mcs_thread;
652                                 break;
653                         }
654                         if (!strcmp("mcscas", arg)) {
655                                 pargs->lock_type = mcscas_thread;
656                                 break;
657                         }
658                         if (!strcmp("mcspdr", arg)) {
659                                 pargs->lock_type = mcspdr_thread;
660                                 break;
661                         }
662                         if (!strcmp("__mcspdr", arg)) {
663                                 pargs->lock_type = __mcspdr_thread;
664                                 break;
665                         }
666                         if (!strcmp("spin", arg)) {
667                                 pargs->lock_type = spin_thread;
668                                 break;
669                         }
670                         if (!strcmp("spinpdr", arg)) {
671                                 pargs->lock_type = spinpdr_thread;
672                                 break;
673                         }
674                         printf("Unknown locktype %s\n\n", arg);
675                         argp_usage(state);
676                         break;
677                 case ARGP_KEY_ARG:
678                         printf("Warning, extra argument %s ignored\n\n", arg);
679                         break;
680                 case ARGP_KEY_END:
681                         if (!pargs->nr_threads) {
682                                 printf("Must select a number of threads.\n\n");
683                                 argp_usage(state);
684                                 break;
685                         }
686                         if (!pargs->nr_loops) {
687                                 printf("Must select a number of loops.\n\n");
688                                 argp_usage(state);
689                                 break;
690                         }
691                         if (!pargs->lock_type) {
692                                 printf("Must select a type of lock.\n\n");
693                                 argp_usage(state);
694                                 break;
695                         }
696                         break;
697                 default:
698                         return ARGP_ERR_UNKNOWN;
699         }
700         return 0;
701 }
702
703 static struct argp argp = {options, parse_opt, args_doc, doc};
704
705 int main(int argc, char** argv)
706 {
707         pthread_t *worker_threads;
708         void *dummy_retval;
709         struct timeval start_tv = {0};
710         struct timeval end_tv = {0};
711         long usec_diff;
712         uint64_t starttsc;
713         int nr_threads, nr_loops;
714         struct sample_stats acq_stats, hld_stats;
715
716         argp_parse(&argp, argc, argv, 0, 0, &pargs);
717         nr_threads = pargs.nr_threads;
718         nr_loops = pargs.nr_loops;
719
720         worker_threads = malloc(sizeof(pthread_t) * nr_threads);
721         if (!worker_threads) {
722                 perror("pthread_t malloc failed:");
723                 exit(-1);
724         }
725         printf("Making %d workers of %d loops each, %sadapting workers to vcores, "
726                "and %sfaking vcore context\n", nr_threads, nr_loops,
727                pargs.adj_workers ? "" : "not ",
728                pargs.fake_vc_ctx ? "" : "not ");
729         pthread_barrier_init(&start_test, NULL, nr_threads);
730
731         times = malloc(sizeof(struct time_stamp *) * nr_threads);
732         assert(times);
733         for (int i = 0; i < nr_threads; i++) {
734                 times[i] = malloc(sizeof(struct time_stamp) * nr_loops);
735                 if (!times[i]) {
736                         perror("Record keeping malloc");
737                         exit(-1);
738                 }
739                 memset(times[i], 0, sizeof(struct time_stamp) * nr_loops);
740         }
741         printf("Record tracking takes %lu bytes of memory\n",
742                (unsigned long)(nr_threads * nr_loops * sizeof(struct time_stamp)));
743         os_prep_work(nr_threads);       /* ensure we have enough VCs */
744         /* Doing this in MCP ctx, so we might have been getting a few preempts
745          * already.  Want to read start before the threads pass their barrier */
746         starttsc = read_tsc();
747         /* create and join on yield */
748         for (int i = 0; i < nr_threads; i++) {
749                 if (pthread_create(&worker_threads[i], NULL, pargs.lock_type,
750                                    (void*)(long)i))
751                         perror("pth_create failed");
752         }
753         if (gettimeofday(&start_tv, 0))
754                 perror("Start time error...");
755         for (int i = 0; i < nr_threads; i++) {
756                 pthread_join(worker_threads[i], &dummy_retval);
757         }
758         if (gettimeofday(&end_tv, 0))
759                 perror("End time error...");
760
761         printf("Acquire times (TSC Ticks)\n---------------------------\n");
762         acq_stats.get_sample = get_acq_latency;
763         compute_stats((void**)times, nr_threads, nr_loops, &acq_stats);
764
765         printf("Held times (from acq til rel done) (TSC Ticks)\n------\n");
766         hld_stats.get_sample = get_hld_latency;
767         compute_stats((void**)times, nr_threads, nr_loops, &hld_stats);
768
769         usec_diff = (end_tv.tv_sec - start_tv.tv_sec) * 1000000 +
770                     (end_tv.tv_usec - start_tv.tv_usec);
771         printf("Time to run: %ld usec\n", usec_diff);
772
773         printf("\nLock throughput:\n-----------------\n");
774         /* throughput for the entire duration (in ms), 1ms steps.  print as many
775          * steps as they ask for (up to the end of the run). */
776         print_throughput((void**)times, usec_diff / 1000 + 1, msec2tsc(1),
777                          pargs.nr_print_rows,
778                          starttsc, nr_threads,
779                          nr_loops, get_acq_timestamp);
780         print_preempt_trace(starttsc, pargs.nr_print_rows);
781         printf("Done, exiting\n");
782 }