/* Copyright (c) 2013, 2014 The Regents of the University of California
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * lock_test: microbenchmark to measure different styles of spinlocks.
 *
 * to build on linux: (hacky)
 * $ gcc -O2 -std=gnu99 -fno-stack-protector -g tests/lock_test.c -lpthread \
 *    -lm -o linux_lock_test */

#define _GNU_SOURCE /* pthread_yield */

#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <math.h>
#include <argp.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <assert.h>
#include <string.h>

/* OS dependent #incs */
#ifdef __ros__

#include <parlib/parlib.h>
#include <parlib/vcore.h>
#include <parlib/timing.h>
#include <parlib/spinlock.h>
#include <parlib/mcs.h>
#include <parlib/arch/arch.h>
#include <parlib/event.h>

#include <parlib/tsc-compat.h>
#include <benchutil/measure.h>

#else

#include "../user/parlib/include/tsc-compat.h"
#include "misc-compat.h"
#include "linux-lock-hacks.h" /* TODO: have a build system and lib / C file */

#include "../user/benchutil/include/measure.h"
#include "../user/benchutil/measure.c"

static void os_prep_work(pthread_t *worker_threads, int nr_threads)
{
        if (nr_threads > num_vcores())
                printf("WARNING: %d threads requested, but only %d cores available\n",
                       nr_threads, num_vcores());
}

static void os_post_work(pthread_t *worker_threads, int nr_threads)
{
        if (nr_threads > num_vcores())
                return;
        /* assuming we're taking cores 0..nr_threads, and we never move. */
        for (int i = 0; i < nr_threads; i++) {
                cpu_set_t cpuset;
                CPU_ZERO(&cpuset);
                CPU_SET(i, &cpuset);
                pthread_setaffinity_np(worker_threads[i], sizeof(cpu_set_t), &cpuset);
        }
}

#define print_preempt_trace(args...) {}

__thread int __vcore_context = 0;

#endif
/* TODO: There's a lot of work to do still, both on this program and on locking
 * and vcore code.  For some of the issues, I'll leave in the discussion /
 * answers, in case it comes up in the future (like when I read this in 8
 * months).
 *
 * BUGS / COMMENTARY
 *              Occasional deadlocks when preempting and not giving back!
 *                      - with the new PDRs style, though that doesn't mean the older styles
 *                      don't have this problem
 *                      - shouldn't be any weaker than PDR.  they all check pred_vc to see
 *                      if they are running, and if not, they make sure someone runs
 *                      - could be weaker if we have an old value for the lockholder,
 *                      someone outside the chain, and we made sure they ran, and they do
 *                      nothing (spin in the 2LS or something?)
 *                              no, they should have gotten a msg about us being preempted,
 *                              since whoever we turn into gets the message about us swapping.
 *                      - anyway, it's not clear if this is with MCSPDR, event delivery,
 *                      preemption handling, or just an artifact of the test (less likely)
 *              why aren't MCS locks in uth_ctx getting dealt with?
 *                      - because the event is handled, but the lock holder isn't run.  the
 *                      preemption was dealt with, but nothing saved the lock holder
 *                      - any uthread_ctx lockholder that gets preempted will get
 *                      interrupted, and other cores will handle the preemption.  but that
 *                      uthread won't run again without 2LS support.  either all spinners
 *                      need to be aware of the 'lockholder' (PDR-style), or the 2LS needs
 *                      to know when a uthread becomes a 'lockholder' to make sure it runs
 *                      via user-level preempts.  If the latter, this needs to happen
 *                      atomically with grabbing the lock, or else be able to handle lots of
 *                      fake 'lockholders' (like round-robin among all of them)
 *              why is the delay more than the expected delay?
 *                      because it takes ~2ms to spawn and run a process
 *                      could do this in a separate process, instead of a script
 *                              could also consider not using pth_test and changing prov, but
 *                              driving it by yields and requests.  would also test the
 *                              alarm/wakeup code (process sets alarm, goes to sleep, wakes up
 *                              and requests X cores)
 *              why do we get occasional preempt-storms? (lots of change_tos)
 *                      due to the MCS-PDR chain, which i tried fixing by adjusting the
 *                      number of workers down to the number of vcores
 *                      why isn't the worker adaptation working?
 *                              - it actually was working, and nr_workers == nr_vcores.  that
 *                              just wasn't the root cause.
 *                              - was expecting it to cut down on PDR kernel traffic
 *                      - still get periods of low perf
 *                              like O(100) preempt msgs per big preempt/prov
 *                              does it really take that much to work out an MCS-PDR?
 *                      - one thing is that if we fake vc ctx, we never receive preemption
 *                      events.  might be a bad idea.
 *                              - in general, yeah.  faking VC and turning off events can really
 *                              muck with things
 *                              - these events aren't necessarily delivered to a VC who will
 *                              check events any time soon (might be the last one in the chain)
 *                              - the core of the issue is that we have the right amount of
 *                              workers and vcores, but that the system isn't given a chance to
 *                              stabilize itself.  also, if we have some VCs that are just
 *                              sitting around, spinning in the 2LS, if those get preempted, no
 *                              one notices or cares (when faking vc_ctx / getting no events)
 *                      - there is a slight race where we might make someone run who isn't a
 *                      lockholder.  logically, it's okay.  worst case, it would act like an
 *                      extra preempt and different startcore, which shouldn't be too bad.
 *
 *              sanity check: does throughput match latency? (2.5GHz TSC, MCS lock)
 *                      ex: 5000 locks/ms = 5 locks/us = 200ns/lock = 500 ticks / lock
 *                      500 ticks * 31 workers (queue) = 15000 ticks
 *                      avg acquire time was around 14K.  seems fine..
 *                      when our MCSPDR throughput tanks (during preempts), it's around
 *                      400-500 locks/ms, which is around 2us/lock.
 *                              when the locker on a preempted chain shows up, it needs to
 *                              change to the next one in line.
 *                                      - though that should be in parallel with the other
 *                                      lockholders letting go.  shouldn't be that bad
 *                                      - no, it is probably at the head of the chain very soon,
 *                                      such that it is the bottleneck for the actual lock.  2us
 *                                      seems possible
 *
 *              what does it take to get out of a preemption with (old) MCS-PDR?
 *                      - these are now called pdro locks (old)
 *                      - for a single preempt, it will take 1..n-1 changes.  avg n/2
 *                      - for multiple preempts, it's nr_pre * that (avg np/2, worst np)
 *                      - for every unlock/reacquire cycle (someone unlocks, then rejoins
 *                      the list), it's nr_preempts (aka, nr_workers - nr_vcores)
 *                      - if we need to have a specific worker get out of the chain, on
 *                      average, it'd take n/2 cycles (p*n/2 changes), worst: np
 *                      - if we want to get multiple workers out, the worst case is still
 *                      np, but as p increases, we're more likely to approach n cycles
 *                      - so the current model is np for the initial hit (to move the
 *                      offline VCs to the end of the chain) and another np to get our
 *                      specific workers out of the chain and yielding (2np)
 *
 *                      - but even with 1 preempt, we're getting 80-200 changes per
 *                      preempt
 *
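 *                      (rough scale check, assuming the -w31 runs used elsewhere
 *                      in this file: n = 31 and p = 1 would predict ~2np = 62
 *                      changes, so 80-200 is well beyond the model)
 *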
 *                      - it shouldn't matter that the sys_change_to is really slow, should
 *                      be the same amount of changes.  however, the preempted ones are
 *                      never really at the tail end of the chain - they should end up right
 *                      before the lockholder often.  while the sys_change_tos are slowly
 *                      moving towards the back of the chain, the locking code is quickly
 *                      removing (online) nodes from the head and putting them on the back.
 *
 *                      - end result: based on lock hold time and lock delay time, a
 *                      preempted VC stays in the MCS chain (swaps btw VC/nodes), and when
 *                      it is inside the chain, someone is polling to make them run.  with
 *                      someone polling, it is extremely unlikely that someone outside the
 *                      chain will win the race and be able to change_to before the in-chain
 *                      poller.  to clarify:
 *                              - hold time and delay time matter, since the longer they are,
 *                              the greater the amount of time the change_to percolation has to
 *                              get the preempted VCs to the end of the chain (where no one
 *                              polls them).
 *                              - at least one vcore is getting the event to handle the
 *                              preemption of the in-chain, offline VC.  we could change it so
 *                              every VC polls the preempt_evq, or just wait til whoever is
 *                              getting the messages eventually checks their messages (VC0)
 *                              - if there is an in-chain poller, they will notice the instant
 *                              the VC map changes, and then immediately change_to (and spin on
 *                              the proclock in the kernel).  there's almost no chance of a
 *                              normal preempt event handler doing that faster.  (would require
 *                              some IRQ latency or something serious).
 *                      - adding in any hold time trashes our microbenchmark's perf, but a
 *                      little delay time actually helps: (all with no preempts going on)
 *                              - mcspdr, no delay: 4200-4400 (-w31 -l10000, no faking, etc)
 *                              - mcspdr, d = 1: 4400-4800
 *                              - mcspdr, d = 2: 4200-5200
 *                              - as you add delay, it cuts down on contention for the
 *                              lock->lock cacheline.  but if you add in too much, you'll tank
 *                              throughput (since there is no contention at all).
 *                              - as we increase the delay, we cut down on the chance of the
 *                              preempt storm / preempt-stuck-in-the-chain, though it can still
 *                              happen, even with a delay of 10us
 *                      - maybe add in the lockholder again? (removed in 73701d6bfb)
 *                              - massively cuts performance, like 2x throughput, without
 *                              preempts
 *                              - its ability to help depends on impl:
 *                                      - in one version (old style), it didn't help much at all
 *                                      - in another (optimized lockholder setting), i can't even
 *                                      see the throughput hit, it recovered right away, with O(5)
 *                                      messages
 *                                      - the diff was having the lockholder assign the vcoreid
 *                                      before passing off to the next in the chain, so that there
 *                                      is less time with having "no lockholder".  (there's a brief
 *                                      period where the lockholder says it is the next person, who
 *                                      still spins.  they'll have to make sure their pred runs)
 *                      - adj_workers doesn't matter either...
 *                              - the 2LS and preemption handling might be doing this
 *                              automatically, when handle_vc_preempt() does a
 *                              thread_paused() on its current_uthread.
 *                              - adj_workers isn't critical if we're using some locks
 *                              that check notif_pending.  eventually someone hears
 *                              about preempted VCs (assuming we can keep up)
 *
 *                      What about delays?  both hold and delay should make it easier to get
 *                      the preempted vcore to the end of the chain.  but do they have to be
 *                      too big to be reasonable?
 *                              - yes.  hold doesn't really help much until everything is
 *                              slower.  even with a hold of around 1.2us, we still have the
 *                              change_to-storms and lowered throughput.
 *                              - doing a combo helps too.  if you hold for 1ns (quite a bit
 *                              more actually, due to the overhead of ndelay, but sufficient to
 *                              be "doing work"), and delaying for around 7us before rejoining,
 *                              there's only about a 1/5 chance of a single preempt messing us
 *                              up
 *                                      - though having multiple preempts outstanding make this less
 *                                      likely to work.
 *                                      - and it seems like if we get into the storm scenario, we
 *                                      never really get out.  either we get out quickly or never do.
 *                                      depending on the workload, this could be a matter of luck
 *
 *                      So we could try tracking the lockholder, but only looking at it when
 *                      we know someone was preempted in the chain - specifically, when our
 *                      pred is offline.  when that happens, we don't change to them, we
 *                      make sure the lockholder is running.
 *                              - tracking takes us from 4200->2800 throughput or so for MCS
 *                              - 5200 -> 3700 or so for MCS in vc_ctx (__MCSPDR)
 *                              - main spike seems to be in the hold time.  bimodal distrib,
 *                              with most below 91 (the usual is everything packed below 70) and
 *                              a big spike around 320
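 *
 *                      A rough sketch of that policy (hypothetical names; the
 *                      real parlib spin loop differs):
 *
 *                              while (qnode->locked) {
 *                                      if (vcore_is_preempted(pred_vcoreid))
 *                                              // pred is down: poke the lockholder
 *                                              // instead of change_to-ing our pred
 *                                              ensure_vcore_runs(lock->lockholder_vcoreid);
 *                                      cpu_relax();
 *                              }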
 *
 *      Summary:
 *              So we need to have someone outside the chain change_to the one in the
 *              chain; o/w, someone will always be in the chain.  Right now, it's always
 *              the next in line who is doing the changing, so a preempted vcore is
 *              always still in the chain.
 *
 *              If the locking workload has some delaying, such as while holding the
 *              lock or before reacquiring, the "change_to" storm might not be a
 *              problem.  If it is, the only alternative I have so far is to check the
 *              lockholder (which prevents a chain member from always ensuring their
 *              pred runs).  This hurts the lock's scalability/performance when we
 *              aren't being preempted.  On the other hand, based on what you're doing
 *              with the lock, one more cache miss might not be as big of a deal as in
 *              lock_test.  Especially since, when you get stormed, your throughput
 *              could be terrible and never recover.
 *
 *              Similar point: you can use spinpdr locks.  They have the PDR-benefits,
 *              and won't induce the storm of change_tos.  However, this isn't much
 *              better for contended locks.  They perform 2-3x worse (on c89) without
 *              preemption.  Arguably, if you were worried about the preempt storms and
 *              want scalability, you might want to use mcspdr with lockholders.
 *
 *              The MCSPDRS (now just called MCSPDR, these are default) locks can avoid
 *              the storm, but at the cost of a little more in performance.  mcspdrs
 *              style is about the same when not getting preempted from uth ctx compared
 *              to mcspdr (slight drop).  When in vc ctx, it's about 10-20% perf hit
 *              (PDRS gains little from --vc_ctx).
 *
 *              Turns out there is a perf hit to PDRS (and any non-stack based qnode)
 *              when running on c89.  The issue is that after shuffling the vcores
 *              around, they are no longer mapped nicely to pcores (VC0->PC1, VC1->PC2).
 *              This is due to some 'false sharing' of the cachelines, caused mostly by
 *              aggressive prefetching (notably the intel adjacent cacheline prefetcher,
 *              which grabs two CLs at a time!).  Basically, stack-based qnodes are
 *              qnodes that are very far apart in memory.  Cranking up the padding in
 *              qnodes in the "qnodes-in-locks" style replicates this.
 *
 *              For some info on the prefetching:
 *                      http://software.intel.com/en-us/articles/optimizing-application-performance-on-intel-coret-microarchitecture-using-hardware-implemented-prefetchers/
 *                      http://software.intel.com/en-us/forums/topic/341769
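 *
 *              A minimal sketch of padding qnodes out of the prefetchers' reach
 *              (illustrative only, not the actual parlib layout):
 *
 *                      struct qnode {
 *                              struct qnode    *next;
 *                              int             locked;
 *                      } __attribute__((aligned(128)));  // 2 CLs: covers the
 *                                                        // adjacent-CL prefetcher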
 *
 *              Here's some rough numbers of the different styles for qnodes on c89.
 *              'in order' is VCn->PC(n+1) (0->1, 1->2).  Worst order is with even VCs
 *              on one socket, odds on the other.  the number of CLs is the size of a
 *              qnode.  mcspdr is the new style (called mcspdrs in some places in this
 *              document), with lock-based qnodes.  mcspdr2 is the same, but with
 *              stack-based qnodes.  mcspdro is the old style (bad at recovery), stack
 *              based, sometimes just called mcs-pdr
 *
 *              with prefetchers disabled (MCS and DCU)
 *                      mcspdr   1CL  4.8-5.4 in order, 3.8-4.2 worst order
 *                      mcspdr   2CL          in order,         worst order
 *                      mcspdr   4CL  5.2-6.0 in order, 4.7-5.3 worst order
 *                      mcspdr   8CL  5.4-6.7 in order, 5.2-6.2 worst order
 *                      mcspdr  16CL  5.1-5.8 in order, 5.2-6.8 worst order
 *                      mcspdr2 stck          in order,         worst order
 *                      mcspdro stck  4-3.4.3 in order, 4.2-4.5 worst order
 *                      mcspdro-vcctx 4.8-7.0 in order, 5.3-6.7 worst order
 *                      can we see the 2 humps?
 *                              mcspdr 1CL yes but less, varied, etc
 *                              mcspdr2 no
 *
 *              test again with worst order with prefetchers enabled
 *                      mcspdr   1CL  3.8-4.0 in order, 2.6-2.7 worst order
 *                      mcspdr   2CL  4.2-4.4 in order, 3.8-3.9 worst order
 *                      mcspdr   4CL  4.5-5.2 in order, 4.0-4.2 worst order
 *                      mcspdr   8CL  4.4-5.1 in order, 4.3-4.7 worst order
 *                      mcspdr  16CL  4.4-4.8 in order, 4.4-5.3 worst order
 *                      mcspdr2 stck  3.0-3.0 in order, 2.9-3.0 worst order
 *                      mcspdro stck  4.2-4.3 in order, 4.2-4.4 worst order
 *                      mcspdro-vcctx 5.2-6.4 in order, 5.0-5.9 worst order
 *                      can we see the 2 humps?
 *                              mcspdrs 1CL yes, clearly
 *                              mcspdr2 no
 *
 * PROGRAM FEATURES
 *              - verbosity?  vcoremap, preempts, the throughput and latency histograms?
 *              - have a max workers option (0?) == max vcores
 *              - would like to randomize (within bounds) the hold/delay times
 *                      - help avoid convoys with MCS locks
 *
 * PERFORMANCE:
 *              pcore control?  (hyperthreading, core 0, cross socket?)
 *                      want some options for controlling which threads run where, or which
 *                      vcores are even used (like turning off hyperthreading)?
 *              implement ticket spinlocks?  (more fair, more effects of preempts)
 *                      no simple way to do PDR either, other than 'check everyone'
 *              MCS vs MCSPDR vs __MCSPDR
 *                      MCS seems slightly better than __MCSPDR (and it should)
 *                      MCSPDR is a bit worse than __MCSPDR
 *                              - the uth_disable/enable code seems to make a difference.
 *                              - i see why the latencies are worse, since they have extra work
 *                              to do, but the internal part that contends with other cores
 *                              shouldn't be affected, unless there's some other thing going on.
 *                              Or perhaps there isn't always someone waiting for the lock?
 *                              - faking VC ctx mostly negates the cost of MCSPDR vs __MCSPDR
 *                      things that made a big diff: CL aligning the qnodes, putting qnodes
 *                      on stacks, reading in the vcoreid once before ensuring()
 *              both MCS CAS unlocks could use some branch prediction work
 *              spinpdr locks are 2-3x faster than spinlocks...
 *                      test, test&set vs the existing test&set, plus lots of asserts
 *
 *              some delay (like 10us) lowers latency while maintaining throughput
 *                      - makes sense esp with MCS.  if you join the queue at the last
 *                      second, you'll measure lower latency than attempting right away
 *                      - also true for spinlocks
 *                      - we can probably figure out the max throughput (TP = f(delay)) for
 *                      each lock type
 *
 *              hard to get steady numbers with MCS - different runs of the same test
 *              will vary in throughput by around 15-30% (e.g., MCS varying from 3k-4k
 *              L/ms)
 *                      - happens on c89 (NUMA) and hossin (UMA)
 *                      - spinlocks seem a little steadier.
 *                      - for MCS locks, the order in which they line up across the pcores
 *                      will matter.  like if on one run, i regularly hand off between cores
 *                      in the same socket and only do one cross-socket step
 *                      - run a lot of shorter ones to get a trend, for now
 *                      - might be correlated with spikes in held times (last bin)
 *                      - can't turn off legacy USB on c89 (SMM) - interferes with PXE
 *
 * PREEMPTS:
 *              better preempt record tracking?
 *                      i just hacked some event-intercept and timestamp code together
 *                      maybe put it in the event library?
 *                      the timestamps definitely helped debugging
 *
 *              is it true that if uthread code never spins outside a PDR lock, then it
 *              doesn't need preemption IPIs?  (just someone checks the event at some
 *              point).
 *                      think so: so long as you make progress and when you aren't, you
 *                      check events (like if a uthread blocks on something and enters VC
 *                      ctx)
 *              adjusting the number of workers, whether vcores or uthreads
 *              - if you have more lockers than cores:
 *                      - with spinpdr, a worker will get starved (akaros) (without 2LS
 *                      support)
 *                              - running this from uth context will cause a handle_events
 *                      - mcspdr will require the kernel to switch (akaros)
 *                      - spin (akaros) might DL (o/w nothing), (linux) poor perf
 *                      - mcs (akaros) will DL, (linux) poor perf
 *                      - poor perf (latency spikes) comes from running the wrong thread
 *                      sometimes
 *                      - deadlock comes from the lack of kernel-level context switching
 *              - if we scale workers down to the number of active vcores:
 *                      - two things: the initial hit, and the steady state.  during the
 *                      initial hit, we can still deadlock, since we have more lockers than
 *                      cores
 *                              - non-pdr (akaros) could deadlock in the initial hit
 *                              - (akaros) steady state, everything is normal (just fewer cores)
 *                      - how can we adjust this in linux?  (see the sketch below)
 *                              - if you know how many cores you have, then futex wait the
 *                              others
 *                              - need some way to wake them back up
 *                              - if you do this in userspace, you might need something PDR-like
 *                              to handle when the "2LS" code gets preempted
 *                      - as mentioned above, the problem in akaros is that the lock/unlock
 *                      might be happening too fast to get into the steady-state and recover
 *                      from the initial preemption
 *              - one of our benefits is that we can adapt in userspace, with userspace
 *              knowledge, under any circumstance.
 *                      - we have the deadlock windows (forcing PDR).
 *                      - in return, we can do this adaptation in userspace
 *                      - and (arguably) anyone who does this in userspace will need PDR
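 *
 *                      A hypothetical sketch of the futex approach (linux only;
 *                      made-up names, and it ignores races on the target):
 *
 *                              int nr_active;  // target == nr cores we can use
 *
 *                              // worker my_id, on each loop:
 *                              while (ACCESS_ONCE(nr_active) <= my_id)
 *                                      syscall(SYS_futex, &nr_active, FUTEX_WAIT,
 *                                              nr_active, 0, 0, 0);
 *                              // whoever learns the core count changed:
 *                              nr_active = new_target;
 *                              syscall(SYS_futex, &nr_active, FUTEX_WAKE, INT_MAX,
 *                                      0, 0, 0);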
 *
 * MEASUREMENT (user/parlib/measure.c)
 *              extract into its own library, for linux apps
 *              print out raw TSC times?  might help sync up diff timelines
 *              Need more latency bins, spinlocks vary too much
 *              maybe we need better high/low too, since this hist looks bad too
 *                      or not center on the average?
 *                      for printing, it's hard to know without already binning.
 *                      maybe bin once (latency?), then use that to adjust the hist?
 *
 *              Had this on a spinlock:
 *              [      32 -    35656] 1565231:
 *              (less than 200 intermediate)
 *              [  286557 - 20404788]   65298: *
 *
 *              Samples per dot: 34782
 *              Total samples: 1640606
 *              Avg time   : 96658
 *              Stdev time : 604064.440882
 *              Coef Var   : 6.249503
 *                      High coeff of var with serious outliers, adjusted bins
 *                      50/75/90/99: 33079 / 33079 / 33079 / 290219 (-<860)
 *                      Min / Max  : 32 / 20404788
 *              was 50/75/90 really within 860 of each other?
 *
 *              when we are preempted and don't even attempt anything, say for 10ms, it
 *              actually doesn't hurt our 50/75/90/99 too much.  we have a ridiculous
 *              stddev and max, and high average, but there aren't any additional
 *              attempts at locking to mess with the attempt-latency.  Only nr_vcores
 *              requests are in flight during the preemption, but we can spit out around
 *              5000 per ms when we aren't preempted.
 *
 */

const char *argp_program_version = "lock_test v0.1475263";
const char *argp_program_bug_address = "<akaros+subscribe@googlegroups.com>";

static char doc[] = "lock_test -- spinlock benchmarking";
static char args_doc[] = "-w NUM -l NUM -t LOCK";

#define OPT_VC_CTX 1
#define OPT_ADJ_WORKERS 2

static struct argp_option options[] = {
        {"workers",             'w', "NUM",     OPTION_NO_USAGE, "Number of threads/cores"},
        {0, 0, 0, 0, ""},
        {"loops",               'l', "NUM",     OPTION_NO_USAGE, "Number of loops per worker"},
        {0, 0, 0, 0, ""},
        {"type",                't', "LOCK",    OPTION_NO_USAGE, "Type of lock to use.  "
                                                     "Options:\n"
                                                     "\tmcs\n"
                                                     "\tmcscas\n"
                                                     "\tmcspdr\n"
                                                     "\tmcspdro\n"
                                                     "\t__mcspdro\n"
                                                     "\tspin\n"
                                                     "\tspinpdr"},
        {0, 0, 0, 0, "Other options (not mandatory):"},
        {"adj_workers", OPT_ADJ_WORKERS, 0,     0, "Adjust workers such that the "
                                               "number of workers equals the "
                                               "number of vcores"},
        {"vc_ctx",              OPT_VC_CTX, 0,  0, "Run threads in mock-vcore context"},
        {0, 0, 0, 0, ""},
        {"hold",                'h', "NSEC",    0, "nsec to hold the lock"},
        {"delay",               'd', "NSEC",    0, "nsec to delay between grabs"},
        {"print",               'p', "ROWS",    0, "Print ROWS of optional measurements"},
        {"outfile",             'o', "FILE",    0, "Write raw measurements to FILE"},
        { 0 }
};

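/* Example invocation: 31 workers, 10k loops each, an MCS lock, 1us of delay
 * between grabs, 20 rows of throughput output, raw times dumped to a file:
 * $ ./lock_test -w 31 -l 10000 -t mcs -d 1000 -p 20 -o mcs.dat */
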
struct prog_args {
        int                                                     nr_threads;
        int                                                     nr_loops;
        int                                                     hold_time;
        int                                                     delay_time;
        int                                                     nr_print_rows;
        bool                                            fake_vc_ctx;
        bool                                            adj_workers;
        char                                            *outfile_path;
        void *(*lock_type)(void *arg);
};
struct prog_args pargs = {0};

/* Globals */
struct time_stamp {
        uint64_t pre;
        uint64_t acq;
        uint64_t un;
        bool valid;
};
struct time_stamp **times;
bool run_locktest = TRUE;
pthread_barrier_t start_test;

/* Locking functions.  Define globals here, init them in main (if possible), and
 * use the lock_func() macro to make your thread func. */

#define lock_func(lock_name, lock_cmd, unlock_cmd)                             \
void *lock_name##_thread(void *arg)                                            \
{                                                                              \
        long thread_id = (long)arg;                                                \
        int hold_time = ACCESS_ONCE(pargs.hold_time);                              \
        int delay_time = ACCESS_ONCE(pargs.delay_time);                            \
        int nr_loops = ACCESS_ONCE(pargs.nr_loops);                                \
        bool fake_vc_ctx = ACCESS_ONCE(pargs.fake_vc_ctx);                         \
        bool adj_workers = ACCESS_ONCE(pargs.adj_workers);                         \
        uint64_t pre_lock, acq_lock, un_lock;                                      \
        struct time_stamp *this_time;                                              \
        struct mcs_lock_qnode mcs_qnode = MCS_QNODE_INIT;                          \
        struct mcs_pdro_qnode pdro_qnode = MCSPDRO_QNODE_INIT;                     \
        int i;                                                                     \
        /* guessing a unique vcoreid for the __mcspdr test.  if the program
         * gets preempted for that test, things may go nuts */                     \
        pdro_qnode.vcoreid = (thread_id + 1) % pargs.nr_threads;                   \
        /* Wait til all threads are created.  Ideally, I'd like to busywait unless
         * absolutely critical to yield */                                         \
        pthread_barrier_wait(&start_test);                                         \
        if (fake_vc_ctx) {                                                         \
                /* tells the kernel / other vcores we're in vc ctx */                  \
                uth_disable_notifs();                                                  \
                /* tricks ourselves into believing we're in vc ctx */                  \
                __vcore_context = TRUE;                                                \
        }                                                                          \
        for (i = 0; i < nr_loops; i++) {                                           \
                if (!run_locktest)                                                     \
                        break;                                                             \
                pre_lock = read_tsc_serialized();                                      \
                                                                               \
                lock_cmd                                                               \
                                                                               \
                acq_lock = read_tsc_serialized();                                      \
                if (hold_time)                                                         \
                        ndelay(hold_time);                                                 \
                                                                               \
                unlock_cmd                                                             \
                                                                               \
                un_lock = read_tsc_serialized();                                       \
                this_time = &times[thread_id][i];                                      \
                this_time->pre = pre_lock;                                             \
                this_time->acq = acq_lock;                                             \
                this_time->un = un_lock;                                               \
                /* Can turn these on or off to control which samples we gather */      \
                this_time->valid = TRUE;                                               \
                /* this_time->valid = (num_vcores() == max_vcores());  */              \
                                                                               \
                if (delay_time)                                                        \
                        ndelay(delay_time);                                                \
                /* worker thread ids are 0..n-1.  if we're one of the threads that's
                 * beyond the VC count, we yield. */                                   \
                if (adj_workers && num_vcores() < thread_id + 1) {                     \
                        if (fake_vc_ctx) {                                                 \
                                __vcore_context = FALSE;                                       \
                                uth_enable_notifs();                                           \
                        }                                                                  \
                        /* we'll come back up once we have enough VCs running */           \
                        pthread_yield();                                                   \
                        if (fake_vc_ctx) {                                                 \
                                uth_disable_notifs();                                          \
                                __vcore_context = TRUE;                                        \
                        }                                                                  \
                }                                                                      \
                cmb();                                                                 \
        }                                                                          \
        /* First thread to finish stops the test */                                \
        run_locktest = FALSE;                                                      \
        if (fake_vc_ctx) {                                                         \
                __vcore_context = FALSE;                                               \
                uth_enable_notifs();                                                   \
        }                                                                          \
        return (void*)(long)i;                                                     \
}

#define fake_lock_func(lock_name, x1, x2)                                      \
void *lock_name##_thread(void *arg)                                            \
{                                                                              \
        printf("Lock " #lock_name " not supported!\n");                            \
        exit(-1);                                                                  \
}

spinlock_t spin_lock = SPINLOCK_INITIALIZER;
struct mcs_lock mcs_lock = MCS_LOCK_INIT;

/* Defines locking funcs like "mcs_thread" */
lock_func(mcs,
          mcs_lock_lock(&mcs_lock, &mcs_qnode);,
          mcs_lock_unlock(&mcs_lock, &mcs_qnode);)
lock_func(mcscas,
          mcs_lock_lock(&mcs_lock, &mcs_qnode);,
          mcs_lock_unlock_cas(&mcs_lock, &mcs_qnode);)
lock_func(spin,
          spinlock_lock(&spin_lock);,
          spinlock_unlock(&spin_lock);)

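/* Adding another lock type is just another lock_func() instantiation, plus a
 * 't' case in parse_opt() and the help text.  For instance, a pthread mutex
 * variant (hypothetical, not wired into the option parsing) would be: */
#if 0
pthread_mutex_t mutex_lock = PTHREAD_MUTEX_INITIALIZER;

lock_func(mutex,
          pthread_mutex_lock(&mutex_lock);,
          pthread_mutex_unlock(&mutex_lock);)
#endif
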
#ifdef __ros__
struct spin_pdr_lock spdr_lock = SPINPDR_INITIALIZER;
struct mcs_pdr_lock mcspdr_lock;
struct mcs_pdro_lock mcspdro_lock = MCSPDRO_LOCK_INIT;

lock_func(mcspdr,
          mcs_pdr_lock(&mcspdr_lock);,
          mcs_pdr_unlock(&mcspdr_lock);)
lock_func(mcspdro,
          mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);,
          mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);)
lock_func(__mcspdro,
          __mcs_pdro_lock(&mcspdro_lock, &pdro_qnode);,
          __mcs_pdro_unlock(&mcspdro_lock, &pdro_qnode);)
lock_func(spinpdr,
          spin_pdr_lock(&spdr_lock);,
          spin_pdr_unlock(&spdr_lock);)
#else

fake_lock_func(mcspdr, 0, 0);
fake_lock_func(mcspdro, 0, 0);
fake_lock_func(__mcspdro, 0, 0);
fake_lock_func(spinpdr, 0, 0);

#endif

static int get_acq_latency(void **data, int i, int j, uint64_t *sample)
{
        struct time_stamp **times = (struct time_stamp**)data;
        /* 0 for initial time means we didn't measure */
        if (times[i][j].pre == 0)
                return -1;
        /* can optionally throw out invalid times (keep this in sync with the
         * lock_test macro, based on what you want to measure). */
        #if 0
        if (!times[i][j].valid)
                return -1;
        #endif
        *sample = times[i][j].acq - times[i][j].pre - get_tsc_overhead();
        return 0;
}

static int get_hld_latency(void **data, int i, int j, uint64_t *sample)
{
        struct time_stamp **times = (struct time_stamp**)data;
        /* 0 for initial time means we didn't measure */
        if (times[i][j].pre == 0)
                return -1;
        *sample = times[i][j].un - times[i][j].acq - get_tsc_overhead();
        return 0;
}

static int get_acq_timestamp(void **data, int i, int j, uint64_t *sample)
{
        struct time_stamp **times = (struct time_stamp**)data;
        /* 0 for initial time means we didn't measure */
        if (times[i][j].pre == 0)
                return -1;
        *sample = times[i][j].acq;
        return 0;
}

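/* The getters above follow the contract the compute_stats() callers expect:
 * return -1 to skip a sample, o/w fill *sample and return 0.  A hypothetical
 * getter for the full pre->unlock cycle would look like: */
#if 0
static int get_lock_cycle(void **data, int i, int j, uint64_t *sample)
{
        struct time_stamp **times = (struct time_stamp**)data;
        /* 0 for initial time means we didn't measure */
        if (times[i][j].pre == 0)
                return -1;
        *sample = times[i][j].un - times[i][j].pre;
        return 0;
}
#endif
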
#ifdef __ros__

/* Lousy event intercept.  build something similar in the event library? */
#define MAX_NR_EVENT_TRACES 1000
uint64_t preempts[MAX_NR_EVENT_TRACES] = {0};
uint64_t indirs[MAX_NR_EVENT_TRACES] = {0};
atomic_t preempt_idx;
atomic_t indir_idx;
atomic_t preempt_cnt;
atomic_t indir_cnt;

static void trace_preempt(struct event_msg *ev_msg, unsigned int ev_type,
                          void *data)
{
        unsigned long my_slot = atomic_fetch_and_add(&preempt_idx, 1);
        if (my_slot < MAX_NR_EVENT_TRACES)
                preempts[my_slot] = read_tsc();
        atomic_inc(&preempt_cnt);
}

static void trace_indir(struct event_msg *ev_msg, unsigned int ev_type,
                        void *data)
{
        unsigned long my_slot = atomic_fetch_and_add(&indir_idx, 1);
        if (my_slot < MAX_NR_EVENT_TRACES)
                indirs[my_slot] = read_tsc();
        atomic_inc(&indir_cnt);
}

/* Helper, prints out the preempt trace */
static void print_preempt_trace(uint64_t starttsc, int nr_print_rows)
{
        /* reusing nr_print_rows for the nr preempt/indirs rows as well */
        int preempt_rows = MIN(MAX_NR_EVENT_TRACES, nr_print_rows);
        if (pargs.fake_vc_ctx) {
                printf("No preempt trace available when faking vc ctx\n");
                return;
        }
        printf("\n");
        printf("Nr Preempts: %d\n", atomic_read(&preempt_cnt));
        printf("Nr Indirs  : %d\n", atomic_read(&indir_cnt));
        if (preempt_rows)
                printf("Preempt/Indir events:\n-----------------\n");
        for (int i = 0; i < preempt_rows; i++) {
                if (preempts[i])
                        printf("Preempt %3d at %6llu\n", i, tsc2msec(preempts[i]
                                                                     - starttsc));
        }
        for (int i = 0; i < preempt_rows; i++) {
                if (indirs[i])
                        printf("Indir   %3d at %6llu\n", i, tsc2msec(indirs[i]
                                                                     - starttsc));
        }
}

/* Make sure we have enough VCs for nr_threads, pref 1:1 at the start */
static void os_prep_work(pthread_t *worker_threads, int nr_threads)
{
        if (nr_threads > max_vcores()) {
                printf("Too many threads (%d) requested, can't get more than %d vc\n",
                       nr_threads, max_vcores());
                exit(-1);
        }
        atomic_init(&preempt_idx, 0);
        atomic_init(&indir_idx, 0);
        atomic_init(&preempt_cnt, 0);
        atomic_init(&indir_cnt, 0);
        pthread_can_vcore_request(FALSE);       /* 2LS won't manage vcores */
        pthread_need_tls(FALSE);
        pthread_mcp_init();                                     /* gives us one vcore */
        register_ev_handler(EV_VCORE_PREEMPT, trace_preempt, 0);
        register_ev_handler(EV_CHECK_MSGS, trace_indir, 0);
        if (pargs.fake_vc_ctx) {
                /* need to disable events when faking vc ctx.  since we're looping and
                 * not handling events, we could run OOM */
                clear_kevent_q(EV_VCORE_PREEMPT);
                clear_kevent_q(EV_CHECK_MSGS);
        }
        if (vcore_request_more(nr_threads - 1)) {
                printf("Failed to request %d more vcores, currently have %d\n",
                       nr_threads - 1, num_vcores());
                exit(-1);
        }
        for (int i = 0; i < nr_threads; i++) {
                printd("Vcore %d mapped to pcore %d\n", i,
                       __procinfo.vcoremap[i].pcoreid);
        }
}

static void os_post_work(pthread_t *worker_threads, int nr_threads)
{
}

#endif

762
763 /* Argument parsing */
764 static error_t parse_opt (int key, char *arg, struct argp_state *state)
765 {
766         struct prog_args *pargs = state->input;
767         switch (key) {
768                 case 'w':
769                         pargs->nr_threads = atoi(arg);
770                         if (pargs->nr_threads < 0) {
771                                 printf("Negative nr_threads...\n\n");
772                                 argp_usage(state);
773                         }
774                         break;
775                 case 'l':
776                         pargs->nr_loops = atoi(arg);
777                         if (pargs->nr_loops < 0) {
778                                 printf("Negative nr_loops...\n\n");
779                                 argp_usage(state);
780                         }
781                         break;
782                 case OPT_ADJ_WORKERS:
783                         pargs->adj_workers = TRUE;
784                         break;
785                 case OPT_VC_CTX:
786                         pargs->fake_vc_ctx = TRUE;
787                         break;
788                 case 'h':
789                         pargs->hold_time = atoi(arg);
790                         if (pargs->hold_time < 0) {
791                                 printf("Negative hold_time...\n\n");
792                                 argp_usage(state);
793                         }
794                         break;
795                 case 'd':
796                         pargs->delay_time = atoi(arg);
797                         if (pargs->delay_time < 0) {
798                                 printf("Negative delay_time...\n\n");
799                                 argp_usage(state);
800                         }
801                         break;
802                 case 'o':
803                         pargs->outfile_path = arg;
804                         break;
805                 case 'p':
806                         pargs->nr_print_rows = atoi(arg);
807                         if (pargs->nr_print_rows < 0) {
808                                 printf("Negative print_rows...\n\n");
809                                 argp_usage(state);
810                         }
811                         break;
812                 case 't':
813                         if (!strcmp("mcs", arg)) {
814                                 pargs->lock_type = mcs_thread;
815                                 break;
816                         }
817                         if (!strcmp("mcscas", arg)) {
818                                 pargs->lock_type = mcscas_thread;
819                                 break;
820                         }
821                         if (!strcmp("mcspdr", arg)) {
822                                 pargs->lock_type = mcspdr_thread;
823                                 break;
824                         }
825                         if (!strcmp("mcspdro", arg)) {
826                                 pargs->lock_type = mcspdro_thread;
827                                 break;
828                         }
829                         if (!strcmp("__mcspdro", arg)) {
830                                 pargs->lock_type = __mcspdro_thread;
831                                 break;
832                         }
833                         if (!strcmp("spin", arg)) {
834                                 pargs->lock_type = spin_thread;
835                                 break;
836                         }
837                         if (!strcmp("spinpdr", arg)) {
838                                 pargs->lock_type = spinpdr_thread;
839                                 break;
840                         }
841                         printf("Unknown locktype %s\n\n", arg);
842                         argp_usage(state);
843                         break;
844                 case ARGP_KEY_ARG:
845                         printf("Warning, extra argument %s ignored\n\n", arg);
846                         break;
847                 case ARGP_KEY_END:
848                         if (!pargs->nr_threads) {
849                                 printf("Must select a number of threads.\n\n");
850                                 argp_usage(state);
851                                 break;
852                         }
853                         if (!pargs->nr_loops) {
854                                 printf("Must select a number of loops.\n\n");
855                                 argp_usage(state);
856                                 break;
857                         }
858                         if (!pargs->lock_type) {
859                                 printf("Must select a type of lock.\n\n");
860                                 argp_usage(state);
861                                 break;
862                         }
863                         break;
864                 default:
865                         return ARGP_ERR_UNKNOWN;
866         }
867         return 0;
868 }
869
870 static struct argp argp = {options, parse_opt, args_doc, doc};
871
int main(int argc, char** argv)
{
        pthread_t *worker_threads;
        void **loops_done;
        struct timeval start_tv = {0};
        struct timeval end_tv = {0};
        long usec_diff, total_loops = 0;
        uint64_t starttsc;
        int nr_threads, nr_loops;
        FILE *outfile;
        struct sample_stats acq_stats, hld_stats;

        argp_parse(&argp, argc, argv, 0, 0, &pargs);
        nr_threads = pargs.nr_threads;
        nr_loops = pargs.nr_loops;
        mcs_pdr_init(&mcspdr_lock);

        if (pargs.outfile_path) {
                /* RDWR, CREAT, TRUNC, O666 */
                outfile = fopen(pargs.outfile_path, "w+");
                if (!outfile) {
                        perror("outfile");
                        exit(-1);
                }
        }
        worker_threads = malloc(sizeof(pthread_t) * nr_threads);
        if (!worker_threads) {
                perror("pthread_t malloc failed:");
                exit(-1);
        }
        loops_done = malloc(sizeof(void*) * nr_threads);
        if (!loops_done) {
                perror("loops_done malloc failed");
                exit(-1);
        }
        printf("Making %d workers of %d loops each, %sadapting workers to vcores, "
               "and %sfaking vcore context\n", nr_threads, nr_loops,
               pargs.adj_workers ? "" : "not ",
               pargs.fake_vc_ctx ? "" : "not ");
        pthread_barrier_init(&start_test, NULL, nr_threads);

        times = malloc(sizeof(struct time_stamp *) * nr_threads);
        assert(times);
        for (int i = 0; i < nr_threads; i++) {
                times[i] = malloc(sizeof(struct time_stamp) * nr_loops);
                if (!times[i]) {
                        perror("Record keeping malloc");
                        exit(-1);
                }
                memset(times[i], 0, sizeof(struct time_stamp) * nr_loops);
        }
        printf("Record tracking takes %ld bytes of memory\n",
               nr_threads * nr_loops * sizeof(struct time_stamp));
        os_prep_work(worker_threads, nr_threads);       /* ensure we have enough VCs */
        /* Doing this in MCP ctx, so we might have been getting a few preempts
         * already.  Want to read start before the threads pass their barrier */
        starttsc = read_tsc();
        /* create and join on yield */
        for (long i = 0; i < nr_threads; i++) {
                if (pthread_create(&worker_threads[i], NULL, pargs.lock_type,
                                   (void*)i))
                        perror("pth_create failed");
        }
        os_post_work(worker_threads, nr_threads);
        if (gettimeofday(&start_tv, 0))
                perror("Start time error...");
        for (int i = 0; i < nr_threads; i++) {
                pthread_join(worker_threads[i], &loops_done[i]);
        }
        if (gettimeofday(&end_tv, 0))
                perror("End time error...");

        printf("Acquire times (TSC Ticks)\n---------------------------\n");
        acq_stats.get_sample = get_acq_latency;
        compute_stats((void**)times, nr_threads, nr_loops, &acq_stats);

        printf("Held times (from acq til rel done) (TSC Ticks)\n------\n");
        hld_stats.get_sample = get_hld_latency;
        compute_stats((void**)times, nr_threads, nr_loops, &hld_stats);

        usec_diff = (end_tv.tv_sec - start_tv.tv_sec) * 1000000 +
                    (end_tv.tv_usec - start_tv.tv_usec);
        printf("Time to run: %ld usec\n", usec_diff);

        printf("\nLock throughput:\n-----------------\n");
        /* throughput for the entire duration (in ms), 1ms steps.  print as many
         * steps as they ask for (up to the end of the run). */
        print_throughput((void**)times, usec_diff / 1000 + 1, msec2tsc(1),
                         pargs.nr_print_rows,
                         starttsc, nr_threads,
                         nr_loops, get_acq_timestamp);
        print_preempt_trace(starttsc, pargs.nr_print_rows);

        for (int i = 0; i < nr_threads; i++) {
                total_loops += (long)loops_done[i];
                if (!loops_done[i])
                        printf("WARNING: thread %d performed 0 loops!\n", i);
        }
        printf("Average number of loops done, per thread: %ld\n",
               total_loops / nr_threads);
        for (int i = 0; i < nr_threads; i++)
                printf("\tThread %d performed %ld loops\n", i, (long)loops_done[i]);

        if (pargs.outfile_path) {
                fprintf(outfile, "#");
                for (char **arg = argv; *arg; arg++)
                        fprintf(outfile, " %s", *arg);
                fprintf(outfile, "\n");
                fprintf(outfile, "# thread_id attempt pre acq(uire) un(lock) "
                                 "tsc_overhead\n");
                fprintf(outfile, "# acquire latency: acq - pre - tsc_overhead\n");
                fprintf(outfile, "# hold time: un - acq - tsc_overhead\n");
                fprintf(outfile, "# tsc_frequency %llu\n", get_tsc_freq());
                fprintf(outfile, "# tsc_overhead is 0 on linux, hard code it with a "
                                 "value from akaros\n");
                for (int i = 0; i < nr_threads; i++) {
                        for (int j = 0; j < nr_loops; j++) {
                                struct time_stamp *ts = &times[i][j];
                                if (!ts->pre)
                                        break; /* empty record */
                                fprintf(outfile, "%d %d %llu %llu %llu %llu\n", i, j, ts->pre,
                                        ts->acq, ts->un, get_tsc_overhead());
                        }
                }
                fclose(outfile);
        }
        printf("Done, exiting\n");
}