akaros/kern/arch/x86/rdtsc_test.c
<<
>>
Prefs
   1/* Barret Rhoden
   2 *
   3 * Code heavily ported from "How to Benchmark Code Execution Times on Intel(R)
   4 * IA-32 and IA-64 Instruction Set Architectures" for linux, except for
   5 * check_timing_stability().
   6 *
   7 * The idea behind this was that the traditional style of using rdtsc was to
   8 * call:
   9 *      cpuid;
  10 *      rdtsc;
  11 * since rdtsc does no serialization (meaning later instructions can get
  12 * executed before it, or vice versa).  While this first cpuid isn't a big deal,
  13 * doing this in pairs means reading the end time also measures cpuid.  This is
  14 * a problem since cpuid can vary quite a bit.
  15 *
  16 * If we use rdtscp for the end call, we can put the cpuid after rdtscp, thereby
  17 * not including cpuid's overhead (and variability) in our measurement.  That's
  18 * where the intel doc ends.  For more info, check out:
  19 *      http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
  20 *
  21 * Note that the Intel SDM says you can serialize rdtsc with lfence, such as:
  22 *      lfence;
  23 *      rdtsc;
  24 * Linux uses this (mfence on amd64, lfence on intel).  For more info:
  25 *              https://lkml.org/lkml/2008/1/2/353
  26 * Note this use of lfence before rdtsc is supposedly serializing any
  27 * instruction, not just loads.  Some stranger on the internet suggested that
  28 * while lfence only serializes memory (and not arbitrary instructions), in
  29 * actual hardware there is no point to reorder non-memory instructions around
  30 * rdtsc:
  31 *      http://stackoverflow.com/questions/12631856/difference-between-rdtscp-rdtsc-memory-and-cpuid-rdtsc
  32 *      (look for janneb's response to questions about his comment)
  33 *
  34 * Its not clear from what anyone writes as to whether or not you need to
  35 * serialize below rdtsc.  Supposedly, you'd need cpuid/lfence on both sides of
  36 * rdtsc to prevent reordering in both directions.  Andi Kleen does this in a
  37 * few places
  38 *      https://lkml.org/lkml/2008/1/7/276
  39 * though other places in the kernel suggest it is unnecessary (at least for
  40 * loads:
  41 *      http://lxr.linux.no/#linux+v3.8.2/arch/x86/kvm/x86.c#L1258
  42 * The intel docs don't mention it (otherwise we would be told to use
  43 * lfence;rdtsc;lfence).  The howto this file is based off of didn't mention it
  44 * either, other than to say rdtscp needs to serialize from below.  AFAIK,
  45 * rdtscp is like rdtsc, except that it serializes from above (and also returns
  46 * the CPU id).  If rdtscp needs to serialize from below, then so should rdtsc.
  47 *
  48 * That being said, if these rdtsc(p) calls do not need serialization from
  49 * below, then rdtscp (which provides serialization from above) should not need
  50 * any additional serialization (lfence or cpuid).
  51 *
  52 * I tried out a few options for the assembly for the start and end time
  53 * measurements, using the intel benchmark.  The benchmark reports variance, max
  54 * deviation, and minimum per inner loop (line), as well as an overall variance,
  55 * max dev, and variance of vars/mins.
  56 *
  57 * CASE    START ASM            END ASM
  58 * ---------------------------------------------------
  59 * case 0: cpuid;rdtsc;         cpuid;rdtscp;
  60 * case 1: cpuid;rdtsc;         rdtscp;cpuid; (or rdtscp;lfence)
  61 * case 2: lfence;rdtsc;        rdtscp;cpuid; (or rdtscp;lfence)
  62 * case 3: rdtscp;              rdtscp;cpuid; (or rdtscp;lfence)
  63 * case 4: rdtscp;              rdtscp;
  64 * case 5: lfence;rdtsc;        lfence;rdtsc;
  65 * case 6: lfence;rdtsc;lfence; lfence;rdtsc;lfence;
  66 *
  67 * Note I only ran these a couple times, with 1000x10000, and I did notice some
  68 * slight variation between runs (on cases 3 and 4).
  69 *
  70 * case 0:       wildly variant, variance of variances wasn't 0, etc (as
  71 * reported by intel).
  72 * case 0:  some lines     0 var, 0-8 max dev, 420 min
  73 * case 0: other lines 50-60 var,  20 max dev, 388 min
  74 *
  75 * For all of the other cases, variance of variances and of minvalues was 0.
  76 *
  77 * case 1: most lines 2-3 var, 4 max dev, 44 min, 2 var 4 max dev overall
  78 * case 2: most lines 2-3 var, 4 max dev, 44 min, 2 var 4 max dev overall
  79 * case 3: most lines   0 var, 0 max dev, 32 min, 0 var 0 max dev overall
  80 * case 4: most lines   0 var, 0 max dev, 32 min, 0 var 4 max dev overall
  81 * case 5: most lines   3 var, 4 max dev, 28 min, 2 var 4 max dev overall
  82 * case 6: most lines   3 var, 4 max dev, 44 min, 2 var 4 max dev overall
  83 *
  84 *      case 1-3: cpuid vs lfence: both seem to work the same and have no effect
  85 *      (since they are outside the loop)
  86 *
  87 * So running with rdtscp for both start and stop (case 3 and 4) had the least
  88 * amount of variance (both per line and total).  When these cases have had any
  89 * deviation, it was because one run had a min of 28, but o/w was 32.  (1 out of
  90 * 10000000, often the first run).
  91 *
  92 * All the others have a little deviation, but with a more stable min.  Again,
  93 * this is taken mostly from a small number of runs (of 1kx10k).
  94 *
  95 * Note that cases 5 and 6 have lfences inside the measurement area, and this
  96 * does not seem to cause problems the same way cpuid does.  However, lfences
  97 * inside the critical section (esp after whatever code we are measuring)
  98 * probably will have an effect on real code that has made memory accesses (keep
  99 * in mind we need to do an mfence on amd64 here).
 100 *
 101 * All that being said, it's not clear which option to use.  Ideally, we want
 102 * an isolated region of code to be measured, with very little variance and max
 103 * deviation.  If cases 1-6 are all the same in terms of protection (which I'm
 104 * not sure about), then 3-4 look nice.  However, the fact that sometimes the
 105 * min is less than 'normal', means that we could get negative numbers for some
 106 * measurements (the goal is to determine the overhead and subtract that from
 107 * our total measurement, and if we think the overhead is 32 but was actually 28
 108 * for a run, we could have issues).
 109 *
 110 * But wait, there's more:
 111 *
 112 * When we add code around (and inside) the measurement, things get even worse:
 113 * - If we put variable (a volatile btw) = j + i; in the loop, there's no real
 114 *   change.  I checked cases 1, 4 and 5, 1 being the intel recommended, 4 being
 115 *   the one with the best variance with no code, and 5 being a good symmetric
 116 *   choice (same on start and end).  Case 1 had no change at all.  4 and 5 had
 117 *   little change (min was the same, occasional deviation).  Note that case 5
 118 *   doesn't use rdtscp at the end either.
 119 * - If we put in variable = i; as well, the minimum still is unaffected, and
 120 *   there is a little more variance.  For example, for case 4, the min is still
 121 *   32, and sometimes you get a 36.
 122 *
 123 * If we add more code (like a for loop that grows in length with each outer
 124 * loop), eventually we can detect the existence of the instructions.  The Intel
 125 * author talks about this in 3.3 when he finds the resolution of the benchmark.
 126 *
 127 * My hunch is that the rdtsc(p) calls hide the latency of some previous
 128 * instructions, regardless of serialization commands.  We see this 'hiding' of
 129 * the cost of instructions regardless of whether or not the first or last
 130 * commands are rdtscp (I'm more concerned with the end time call, which is
 131 * where this hiding may be happening).  Perhaps the pipeline needs to be
 132 * drained (or something), and it takes a certain amount of time to do so,
 133 * regardless of a few extra instructions squeezed in.  Meaning we can't tell
 134 * the difference between 0 and a few cycles, and probably a few cycles are
 135 * 'free' / hidden by the rdtsc call.
 136 *
 137 * Bottom line?  Our measurements are inexact, despite the stable minimum and
 138 * low variance.  Everything will be +/- our max deviation, as well as
 139 * potentially underestimating by a few cycles/ticks.  One thing we can do is
 140 * try to see what the resolution is of the different methods.
 141 *
 142 * case 1: cpuid;rdtsc;         rdtscp;cpuid; (or rdtscp;lfence)
 143 * -------------------
 144 * loop_size:0 >>>> variance(cycles): 3; max_deviation: 8; min time: 44
 145 * loop_size:1 >>>> variance(cycles): 6; max_deviation: 28; min time: 44
 146 * loop_size:2 >>>> variance(cycles): 4; max_deviation: 16; min time: 44
 147 * loop_size:3 >>>> variance(cycles): 12; max_deviation: 44; min time: 44
 148 * loop_size:4 >>>> variance(cycles): 10; max_deviation: 32; min time: 44
 149 * loop_size:5 >>>> variance(cycles): 10; max_deviation: 32; min time: 44
 150 * loop_size:6 >>>> variance(cycles): 12; max_deviation: 36; min time: 44
 151 * loop_size:7 >>>> variance(cycles): 5; max_deviation: 32; min time: 48
 152 * loop_size:8 >>>> variance(cycles): 16; max_deviation: 52; min time: 48
 153 * loop_size:9 >>>> variance(cycles): 13; max_deviation: 48; min time: 52
 154 * loop_size:10 >>>> variance(cycles): 9; max_deviation: 36; min time: 52
 155 * loop_size:11 >>>> variance(cycles): 16; max_deviation: 64; min time: 56
 156 *
 157 * case 4: rdtscp;              rdtscp;
 158 * -------------------
 159 * loop_size:0 >>>> variance(cycles): 1; max_deviation: 20; min time: 32
 160 * loop_size:1 >>>> variance(cycles): 12; max_deviation: 36; min time: 36
 161 * loop_size:2 >>>> variance(cycles): 13; max_deviation: 32; min time: 36
 162 * loop_size:3 >>>> variance(cycles): 7; max_deviation: 32; min time: 40
 163 * loop_size:4 >>>> variance(cycles): 1; max_deviation: 16; min time: 44
 164 * loop_size:5 >>>> variance(cycles): 4; max_deviation: 28; min time: 44
 165 * loop_size:6 >>>> variance(cycles): 12; max_deviation: 48; min time: 44
 166 * loop_size:7 >>>> variance(cycles): 8; max_deviation: 32; min time: 44
 167 * loop_size:8 >>>> variance(cycles): 10; max_deviation: 48; min time: 48
 168 *
 169 * case 5: lfence;rdtsc;        lfence;rdtsc;
 170 * -------------------
 171 * loop_size:0 >>>> variance(cycles): 3; max_deviation: 12; min time: 28
 172 * loop_size:1 >>>> variance(cycles): 8; max_deviation: 28; min time: 32
 173 * loop_size:2 >>>> variance(cycles): 8; max_deviation: 28; min time: 32
 174 * loop_size:3 >>>> variance(cycles): 6; max_deviation: 28; min time: 32
 175 * loop_size:4 >>>> variance(cycles): 2; max_deviation: 24; min time: 36
 176 * loop_size:5 >>>> variance(cycles): 6; max_deviation: 28; min time: 36
 177 * loop_size:6 >>>> variance(cycles): 11; max_deviation: 44; min time: 36
 178 * loop_size:7 >>>> variance(cycles): 7; max_deviation: 32; min time: 36
 179 * loop_size:8 >>>> variance(cycles): 1; max_deviation: 16; min time: 40
 180 *
 181 * For cases 4 and 5, we notice quite quickly.  The for loop itself has some
 182 * overhead (probably more than our simple stores and adds).  So the resolution
 183 * of these methods is a little more than a loop's overhead.  For case 1, we
 184 * need about 7 loops, in addition to the overhead, until we can reliably detect
 185 * the additional instructions.  Note the deviation and variation increases for
 186 * all cases.
 187 *
 188 *
 189 * What about extra code before the measurement?  I reran the test cases with
 190 * some extra tsc-related code above the measurement (an accidental asm
 191 * insertion of lfence;rdtsc above reading the start time) and with no work in
 192 * between:
 193 *      case 1: no effect
 194 *      case 2: no effect
 195 * These both had some form of serialization (cpuid or lfence) above the rdtsc
 196 * command.  But when we try using just rdtscp (with no extra serialization:)
 197 *      case 3, normal: lines   0 var, 0 max dev, 32 min, 0 var 0 max dev
 198 *      case 3, extras: lines 2-3 var, 4 max dev, 28 min, 2 var 4 max dev
 199 * Similar deal with case 4.  Lots of 28s and deviation.  It looks like some
 200 * times the rdtsc diff is only 28, and others 32 (hence the deviation of 4).
 201 * Note this means the measurement interval is *lower*, which means the code was
 202 * *faster*.  Was the rdtscp not serializing instructions from above (which
 203 * doesn't make sense, since anything sneaking in from above should make the
 204 * code *slower*)?  Or is it because the previous command was rdtsc, which might
 205 * 'speed up' subsequent rdtscs.  I tried it again, with a little work between
 206 * the unused TSC read and the start tsc read:
 207 *      case 3, more crap : lines 2-3 var, 4 max dev, 28 min, 2 var 4 max dev
 208 * So no real change from adding minor code in between.  What about adding an
 209 * lfence above the rdtscp (so it is almost exactly like case 2)?
 210 * Our assembly code now looks like:
 211 *      lfence;
 212 *      rdtsc;
 213 *      mov %edx, (memory);     // these get overwritten
 214 *      mov %eax, (memory);     // these get overwritten
 215 *
 216 *      mov (memory), %eax;     // misc work (variable = i + j)
 217 *      add %esi, %eax;         // misc work (variable = i + j)
 218 *      mov %eax, (memory);     // misc work (variable = i + j)
 219 *
 220 *      lfence;
 221 *      rdtscp;                 // this is the real start measurement
 222 *      mov %edx, (memory);
 223 *      mov %eax, (memory);
 224 *
 225 *      // no extra work here
 226 *
 227 *      rdtscp;                 // this is the real end measurement
 228 *      mov %edx, (memory);
 229 *      mov %eax, (memory);
 230 *      cpuid;                  // this is case 3, with sync after
 231 *
 232 * Even with this extra lfence, case 3-style still shows numbers like:
 233 *      case 3, added crap: lines 2-3 var, 4 max dev, 28 min, 2 var 4 max dev
 234 * So either rdtscp is somehow faster due to internal-processor-caching (a
 235 * previous rdtsc makes the next rdtscp somewhat faster sometimes, including
 236 * after some instructions and an lfence), or the baseline case of no variation
 237 * is "wrong", and we really should expect between 28 and 32.  FWIW, the Intel
 238 * author also had a max deviation of 4 (per line).  And remember, on rare
 239 * occasions we get a 28 for case 3 and 4 (the other 9999999 times it is 32).
 240 *
 241 * Note how the modified case 3 is pretty much the same *in performance* as a
 242 * case 5.  But its code is nearly identical to case 2.  If you change the start
 243 * measurement's rdtscp to an rdtsc, the min goes from 28 -> 44 (this is case
 244 * 2).  And if you change the end measurements rdtscp to an lfence; rdtscp, we
 245 * go from 44->48 (this is no case).  Then if you change that rdtscp to an
 246 * rdtsc, we drop from 48->28 (this is case 5).  Based on this, it looks like
 247 * the different types of rdtsc take their time measurement at different points
 248 * within their execution.  rdtsc probably takes its measurement earlier in the
 249 * instruction (~16-20 cycles/ticks earlier perhaps?), based on the 48->28
 250 * back-side step and the front-side 28->44 step.
 251 *
 252 * Anyway, what matters is a relatively stable method without a lot of variance
 253 * that has a solid floor/min that we can detect at runtime (to run tests on a
 254 * given machine).  Using rdtscp for the start measurement seems unreliable
 255 * (when run alone we get 32, when run with things we get 28, on the corei7).
 256 * So even though case 3 and 4 had nice low variances and deviations, I don't
 257 * trust it, and would rather go with something that always gives me the same
 258 * result (as well as being a low result).  So case 5 will be my go-to for now.
 259 * It should have the same protection as the others (perhaps 6 is better), it is
 260 * stable, and it has a low overhead and low resolution (less capacity to hide
 261 * instruction latency).  Finally, the start and end measurements use the same
 262 * code, which is very convenient.
 263 *
 264 * This isn't conclusive - we'd need to do more tests with different workloads
 265 * on different machines, and probably talk to an intel architect.
 266 *
 267 * Still reading?  There's one more thing: System Management Mode!  This is an
 268 * interrupt context that is invisible to the OS, but we can see its effects in
 269 * our measurements.  If you run this code with the default settings, you often
 270 * won't see it (unless you have some loops).  However, if you run with
 271 * 1024x16384 (0x400 by 0x4000), you are likely to see very large max
 272 * deviations, such as 100, 600, or even 1500000.  From what I can tell, the
 273 * likelihood depends on how long the inner loop.  Using case 5 at 0x400,
 274 * 0x4000, after 3-4 runs, I had one line out of 1024 lines that was much
 275 * higher.  Three were 112, one was 1659260.  AFAIK, this is system management
 276 * mode kicking in.  You can mitigate this by disabling all types of USB legacy
 277 * support in the BIOS.  Specifically, faking USB keyboards and mice (making
 278 * them look like PS/2) and USB mass storage (making them look like a HDD) all
 279 * lead to an increase in SMIs.  For more info, check out:
 280 *      https://rt.wiki.kernel.org/index.php/HOWTO:_Build_an_RT-application
 281 * It is not sufficient to merely not use things like the USB mass storage.  It
 282 * needs to be disabled in the BIOS.  At least, this is true on my nehalem.  A
 283 * while back, we had an issue with microbenchmarks taking 10% longer if you
 284 * held down a key on the keyboard, even if the code was running on a core that
 285 * did not receive the keyboard IRQ.  Turns out this was due to a USB keyboard
 286 * in legacy mode.  The real root of this problem was SMM, which forces all
 287 * cores to enter SMM whenever any core enters SMM (hence the cross-core
 288 * interference).
 289 *
 290 * So finally, disable anything that may lead to SMM interference.  I have some
 291 * code that runs at startup that tries to determine the min time for the given
 292 * approved method of measurement (i.e., case 5), and also tries to detect SMIs
 293 * via massive latency spikes.  */
 294
 295#include <ros/common.h>
 296#include <arch/arch.h>
 297#include <stdio.h>
 298#include <kmalloc.h>
 299#include <time.h>
 300#include <smp.h>
 301#include <ros/procinfo.h>
 302
 303#define STAT_SIZE_DEF 10000
 304#define LOOP_BOUND_DEF 1000
 305
 306/* Fills in the **times with the results of the double loop measurement.  There
 307 * are many options for start and end time measurements, all inside #if 0 #endif
 308 * comments.  Copy/paste whichever you'd like to test out. */
 309static inline void filltimes(uint64_t **times, unsigned int loop_bound,
 310                             unsigned int stat_size)
 311{
 312        unsigned long flags;
 313        int i, j;
 314        uint64_t start, end;
 315        unsigned int start_low, start_high, end_low, end_high;
 316        unsigned int dummy_low, dummy_high;
 317        volatile int variable = 0;
 318        int8_t state = 0;
 319
 320        /* Variety of warmups.  recommended for cpuid... */
 321        asm volatile ("cpuid\n\t"
 322                      "rdtsc\n\t"
 323                      "cpuid\n\t"
 324                      "rdtsc\n\t"
 325                      "cpuid\n\t"
 326                      "rdtsc\n\t"
 327                      "mov %%edx, %0\n\t"
 328                      "mov %%eax, %1\n\t": "=m" (dummy_high), "=m" (dummy_low)::
 329                      "%eax", "%ebx", "%ecx", "%edx");
 330        for (j = 0; j < loop_bound; j++) {
 331                for (i = 0; i < stat_size; i++) {
 332                        variable = 0;
 333                        /* starting side, i want to make sure we always copy out
 334                         * to memory (stack), instead of sometimes using
 335                         * registers (and other times not).  if you use =a, for
 336                         * instance, with no work, the compiler will use esi and
 337                         * edi to store start_high and _low.
 338                         *
 339                         * The same concern is probably unnecessary at the end,
 340                         * but it might keep the compiler from reserving the use
 341                         * of those registers.*/
 342
 343                        #if 0 /* extra crap before the measurement code */
 344                        asm volatile (
 345                                "lfence;"
 346                                "rdtsc;"
 347                                "mov %%edx, %0;"
 348                                "mov %%eax, %1;"
 349                                : "=m" (dummy_high), "=m" (dummy_low)
 350                                :
 351                                : "%eax", "%edx");
 352
 353                        variable = i + j;
 354                        #endif
 355
 356                        asm volatile (
 357                                "lfence;"
 358                                "rdtsc;"
 359                                "mov %%edx, %0;"
 360                                "mov %%eax, %1;"
 361                                : "=m" (start_high), "=m" (start_low)
 362                                :
 363                                : "%eax", "%edx");
 364                        #if 0   /* types of start time measurements */
 365                        asm volatile (
 366                                      "cpuid;"
 367                                      "rdtsc;"
 368                                      "mov %%edx, %0;"
 369                                      "mov %%eax, %1;"
 370                                      : "=m" (start_high), "=m" (start_low)
 371                                      :
 372                                      : "%eax", "%ebx", "%ecx", "%edx");
 373                        asm volatile (
 374                                      "lfence;"
 375                                      "rdtsc;"
 376                                      "mov %%edx, %0;"
 377                                      "mov %%eax, %1;"
 378                                      : "=m" (start_high), "=m" (start_low)
 379                                      :
 380                                      : "%eax", "%edx");
 381                        asm volatile (
 382                                      "lfence;"
 383                                      "rdtsc;"
 384                                      "lfence;"
 385                                      "mov %%edx, %0;"
 386                                      "mov %%eax, %1;"
 387                                      : "=m" (start_high), "=m" (start_low)
 388                                      :
 389                                      : "%eax", "%edx");
 390
 391                        asm volatile(
 392                                     "rdtscp;"
 393                                     "mov %%edx, %0;"
 394                                     "mov %%eax, %1;"
 395                                    : "=m" (start_high), "=m" (start_low)
 396                                    :
 397                                    : "%eax", "%ecx", "%edx");
 398                        #endif
 399
 400                        /* call the function to measure here */
 401
 402                        #if 0 /* some options for code to measure */
 403                        variable = j;
 404
 405                        variable = i + j;
 406
 407                        for (int k = 0; k < j; k++)
 408                                variable = k;
 409                        #endif
 410
 411                        asm volatile("lfence;"
 412                                     "rdtsc;"
 413                                     "mov %%edx, %0;"
 414                                     "mov %%eax, %1;"
 415                                     : "=m" (end_high), "=m" (end_low)
 416                                     :
 417                                     : "%eax", "%edx");
 418                        #if 0   /* types of end time measurements */
 419                        asm volatile("cpuid;"
 420                                     "rdtsc;"
 421                                     "mov %%edx, %0;"
 422                                     "mov %%eax, %1;"
 423                                     : "=m" (end_high), "=m" (end_low)
 424                                     :
 425                                     : "%eax", "%ebx", "%ecx", "%edx");
 426                        asm volatile("lfence;"
 427                                     "rdtsc;"
 428                                     "mov %%edx, %0;"
 429                                     "mov %%eax, %1;"
 430                                     : "=m" (end_high), "=m" (end_low)
 431                                     :
 432                                     : "%eax", "%edx");
 433                        asm volatile("lfence;"
 434                                     "rdtsc;"
 435                                                  "lfence;"
 436                                     "mov %%edx, %0;"
 437                                     "mov %%eax, %1;"
 438                                     : "=m" (end_high), "=m" (end_low)
 439                                     :
 440                                     : "%eax", "%edx");
 441
 442                        asm volatile(
 443                                     "rdtscp;"
 444                                     "mov %%edx, %0;"
 445                                     "mov %%eax, %1;"
 446                                     : "=m" (end_high), "=m" (end_low)
 447                                     :
 448                                     : "%eax", "%ecx", "%edx");
 449                        asm volatile(
 450                                     "rdtscp;"
 451                                                 "lfence;"
 452                                     "mov %%edx, %0;"
 453                                     "mov %%eax, %1;"
 454                                     : "=m" (end_high), "=m" (end_low)
 455                                     :
 456                                     : "%eax", "%ecx", "%edx");
 457                        asm volatile(
 458                                     "rdtscp;"
 459                                     "mov %%edx, %0;"
 460                                     "mov %%eax, %1;"
 461                                     "cpuid;"
 462                                     : "=m" (end_high), "=m" (end_low)
 463                                     :
 464                                     : "%eax", "%ebx", "%ecx", "%edx");
 465                        #endif
 466
 467                        start = ( ((uint64_t)start_high << 32) | start_low );
 468                        end = ( ((uint64_t)end_high << 32) | end_low );
 469
 470                        if ( (int64_t)(end - start) < 0) {
 471                                printk("CRITICAL ERROR IN TAKING THE TIME!!!!\n"
 472                                       "loop(%d) stat(%d) start = %llu, "
 473                                       "end = %llu, variable = %u\n", j, i,
 474                                       start, end, variable);
 475                                times[j][i] = 0;
 476                        } else {
 477                                times[j][i] = end - start;
 478                        }
 479                }
 480        }
 481}
 482
 483/* http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance, doing pop
 484 * variance, multiplying by N/N, and not checking overflow of size*size */
 485uint64_t var_calc(uint64_t *inputs, int size)
 486{
 487        int i;
 488        uint64_t acc = 0, previous = 0, temp_var = 0;
 489
 490        for (i = 0; i < size; i++) {
 491                if (acc < previous)
 492                        goto overflow;
 493                previous = acc;
 494                acc += inputs[i];
 495        }
 496        acc = acc * acc;
 497        if (acc < previous)
 498                goto overflow;
 499        previous = 0;
 500        for (i = 0; i < size; i++) {
 501                if (temp_var < previous)
 502                        goto overflow;
 503                previous = temp_var;
 504                temp_var+= (inputs[i]*inputs[i]);
 505        }
 506        temp_var = temp_var * size;
 507        if (temp_var < previous)
 508                goto overflow;
 509        temp_var = (temp_var - acc)/(((uint64_t)(size))*((uint64_t)(size)));
 510        return (temp_var);
 511overflow:
 512        printk("CRITICAL OVERFLOW ERROR IN var_calc!!!!!!\n\n");
 513        return -1;
 514}
 515
 516int test_rdtsc(unsigned int loop_bound, unsigned int stat_size)
 517{
 518        int8_t state = 0;
 519
 520        int i = 0, j = 0, spurious = 0, k = 0;
 521        uint64_t **times;
 522        uint64_t *variances;
 523        uint64_t *min_values;
 524        uint64_t max_dev = 0, min_time = 0, max_time = 0, prev_min = 0;
 525        uint64_t tot_var = 0, max_dev_all = 0, var_of_vars = 0, var_of_mins = 0;
 526
 527        loop_bound = loop_bound ?: LOOP_BOUND_DEF;
 528        stat_size = stat_size ?: STAT_SIZE_DEF;
 529
 530        printk("Running rdtsc tests...\n");
 531
 532        times = kmalloc(loop_bound * sizeof(uint64_t*), 0);
 533        if (!times) {
 534                printk("unable to allocate memory for times\n");
 535                return 0;
 536        }
 537
 538        for (j = 0; j < loop_bound; j++) {
 539                times[j] = kmalloc(stat_size * sizeof(uint64_t), 0);
 540                if (!times[j]) {
 541                        printk("unable to allocate memory for times[%d]\n", j);
 542                        for (k = 0; k < j; k++)
 543                                kfree(times[k]);
 544                        return 0;
 545                }
 546        }
 547
 548        variances = kmalloc(loop_bound * sizeof(uint64_t), 0);
 549        if (!variances) {
 550                printk("unable to allocate memory for variances\n");
 551                // not bothering to free **times
 552                return 0;
 553        }
 554
 555        min_values = kmalloc(loop_bound * sizeof(uint64_t), 0);
 556        if (!min_values) {
 557                printk("unable to allocate memory for min_values\n");
 558                // not bothering to free **times or variances
 559                return 0;
 560        }
 561
 562        disable_irqsave(&state);
 563
 564        filltimes(times, loop_bound, stat_size);
 565
 566        enable_irqsave(&state);
 567
 568        for (j = 0; j < loop_bound; j++) {
 569                max_dev = 0;
 570                min_time = 0;
 571                max_time = 0;
 572
 573                for (i = 0; i < stat_size; i++) {
 574                        if ((min_time == 0) || (min_time > times[j][i]))
 575                                min_time = times[j][i];
 576                        if (max_time < times[j][i])
 577                                max_time = times[j][i];
 578                }
 579                max_dev = max_time - min_time;
 580                min_values[j] = min_time;
 581                if ((prev_min != 0) && (prev_min > min_time))
 582                        spurious++;
 583                if (max_dev > max_dev_all)
 584                        max_dev_all = max_dev;
 585                variances[j] = var_calc(times[j], stat_size);
 586                tot_var += variances[j];
 587
 588                printk("loop_size:%d >>>> variance(cycles): %llu; "
 589                       "max_deviation: %llu; min time: %llu\n", j, variances[j],
 590                       max_dev, min_time);
 591                prev_min = min_time;
 592        }
 593
 594        var_of_vars = var_calc(variances, loop_bound);
 595        var_of_mins = var_calc(min_values, loop_bound);
 596
 597        printk("total number of spurious min values = %d\n", spurious);
 598        /* is this next one the mean variance, not the total? */
 599        printk("total variance = %llu\n", (tot_var/loop_bound));
 600        printk("absolute max deviation = %llu\n", max_dev_all);
 601        printk("variance of variances = %llu\n", var_of_vars);
 602        printk("variance of minimum values = %llu\n", var_of_mins);
 603
 604        for (j = 0; j < loop_bound; j++) {
 605                kfree(times[j]);
 606        }
 607        kfree(times);
 608        kfree(variances);
 609        kfree(min_values);
 610        return 0;
 611}
 612
 613
 614/* Crude SMI or other TSC-instability detection. */
 615bool check_timing_stability(void)
 616{
 617        uint64_t min_overhead = UINT64_MAX;
 618        uint64_t max_overhead = 0;
 619        uint64_t start, end, diff;
 620        uint32_t edx;
 621        int8_t irq_state = 0;
 622        volatile int dummy = 0;
 623
 624        /* Don't even bother if we don't have an invariant TSC */
 625        cpuid(0x80000007, 0x0, 0, 0, 0, &edx);
 626        if (!(edx & (1 << 8))) {
 627                printk("Invariant TSC not present.  Do not benchmark!\n");
 628                return FALSE;
 629        }
 630        disable_irqsave(&irq_state);
 631        /* 2mil detected an SMI about 95% of the time on my nehalem. */
 632        for (int i = 0; i < 3000000; i++) {
 633                start = read_tsc_serialized();
 634                for (int j = 0; j < 500; j++)
 635                        dummy = j;
 636                end = read_tsc_serialized();
 637                if ((int64_t)(end - start) < 0) {
 638                        printk("TSC stability overflow error!\n");
 639                        return FALSE;
 640                }
 641                diff = end - start;
 642                min_overhead = MIN(min_overhead, diff);
 643                max_overhead = MAX(max_overhead, diff);
 644        }
 645        enable_irqsave(&irq_state);
 646        if (max_overhead - min_overhead > 50) {
 647                printk("Test TSC overhead unstable (Min: %llu, Max: %llu).  "
 648                       "Do not benchmark!\n", min_overhead, max_overhead);
 649                return FALSE;
 650        }
 651        return TRUE;
 652}
 653
 654void test_tsc_cycles(void)
 655{
 656        uint64_t start, end;
 657        int8_t irq_state = 0;
 658
 659        disable_irqsave(&irq_state);
 660        start = read_tsc_serialized();
 661        for (int i = 0; i < 1000; i++) {
 662                asm volatile ("addl $1, %%eax;"
 663                              "addl $1, %%eax;"
 664                              "addl $1, %%eax;"
 665                              "addl $1, %%eax;"
 666                              "addl $1, %%eax;"
 667                              "addl $1, %%eax;"
 668                              "addl $1, %%eax;"
 669                              "addl $1, %%eax;"
 670                              "addl $1, %%eax;"
 671                              "addl $1, %%eax;"
 672                              "addl $1, %%eax;"
 673                              "addl $1, %%eax;"
 674                              "addl $1, %%eax;"
 675                              "addl $1, %%eax;"
 676                              "addl $1, %%eax;"
 677                              "addl $1, %%eax;"
 678                              "addl $1, %%eax;"
 679                              "addl $1, %%eax;"
 680                              "addl $1, %%eax;"
 681                              "addl $1, %%eax;"
 682                              "addl $1, %%eax;"
 683                              "addl $1, %%eax;"
 684                              "addl $1, %%eax;"
 685                              "addl $1, %%eax;"
 686                              "addl $1, %%eax;"
 687                              "addl $1, %%eax;"
 688                              "addl $1, %%eax;"
 689                              "addl $1, %%eax;"
 690                              "addl $1, %%eax;"
 691                              "addl $1, %%eax;"
 692                              "addl $1, %%eax;"
 693                              "addl $1, %%eax;"
 694                              "addl $1, %%eax;"
 695                              "addl $1, %%eax;"
 696                              "addl $1, %%eax;"
 697                              "addl $1, %%eax;"
 698                              "addl $1, %%eax;"
 699                              "addl $1, %%eax;"
 700                              "addl $1, %%eax;"
 701                              "addl $1, %%eax;"
 702                              "addl $1, %%eax;"
 703                              "addl $1, %%eax;"
 704                              "addl $1, %%eax;"
 705                              "addl $1, %%eax;"
 706                              "addl $1, %%eax;"
 707                              "addl $1, %%eax;"
 708                              "addl $1, %%eax;"
 709                              "addl $1, %%eax;"
 710                              "addl $1, %%eax;"
 711                              "addl $1, %%eax;"
 712                              "addl $1, %%eax;"
 713                              "addl $1, %%eax;"
 714                              "addl $1, %%eax;"
 715                              "addl $1, %%eax;"
 716                              "addl $1, %%eax;"
 717                              "addl $1, %%eax;"
 718                              "addl $1, %%eax;"
 719                              "addl $1, %%eax;"
 720                              "addl $1, %%eax;"
 721                              "addl $1, %%eax;"
 722                              "addl $1, %%eax;"
 723                              "addl $1, %%eax;"
 724                              "addl $1, %%eax;"
 725                              "addl $1, %%eax;"
 726                              "addl $1, %%eax;"
 727                              "addl $1, %%eax;"
 728                              "addl $1, %%eax;"
 729                              "addl $1, %%eax;"
 730                              "addl $1, %%eax;"
 731                              "addl $1, %%eax;"
 732                              "addl $1, %%eax;"
 733                              "addl $1, %%eax;"
 734                              "addl $1, %%eax;"
 735                              "addl $1, %%eax;"
 736                              "addl $1, %%eax;"
 737                              "addl $1, %%eax;"
 738                              "addl $1, %%eax;"
 739                              "addl $1, %%eax;"
 740                              "addl $1, %%eax;"
 741                              "addl $1, %%eax;"
 742                              "addl $1, %%eax;"
 743                              "addl $1, %%eax;"
 744                              "addl $1, %%eax;"
 745                              "addl $1, %%eax;"
 746                              "addl $1, %%eax;"
 747                              "addl $1, %%eax;"
 748                              "addl $1, %%eax;"
 749                              "addl $1, %%eax;"
 750                              "addl $1, %%eax;"
 751                              "addl $1, %%eax;"
 752                              "addl $1, %%eax;"
 753                              "addl $1, %%eax;"
 754                              "addl $1, %%eax;"
 755                              "addl $1, %%eax;"
 756                              "addl $1, %%eax;"
 757                              "addl $1, %%eax;"
 758                              "addl $1, %%eax;"
 759                              "addl $1, %%eax;"
 760                              "addl $1, %%eax;"
 761                              "addl $1, %%eax;"
 762                              : : : "eax", "cc");
 763        }
 764        end = read_tsc_serialized();
 765        end = end - start - __proc_global_info.tsc_overhead;
 766        printk("%llu (100,000) ticks passed, run twice to load the icache\n",
 767               end);
 768
 769        enable_irqsave(&irq_state);
 770}
 771
 772static inline __attribute__((always_inline))
 773uint64_t pmc_cycles(void)
 774{
 775        unsigned int a = 0, d = 0;
 776        int ecx = (1 << 30) + 1;
 777
 778        asm volatile("lfence; rdpmc" : "=a"(a), "=d"(d) : "c"(ecx));
 779        return ((uint64_t)a) | (((uint64_t)d) << 32);
 780}
 781
 782/* run with $ perf stat m kfunc wrmsr_test 0xMSR 100000 */
 783void wrmsr_test(unsigned int msr, int loops)
 784{
 785        uint64_t start_cycles, diff_cycles;
 786        uint64_t start_time, diff_time;
 787        uint64_t msrval = read_msr(msr);
 788
 789        loops = MAX(loops, 1);
 790        start_cycles = pmc_cycles();
 791        start_time = start_timing();
 792
 793        for (int i = 0; i < loops; i++)
 794                write_msr(msr, msrval);
 795
 796        diff_cycles = pmc_cycles() - start_cycles;
 797        diff_time = stop_timing(start_time);
 798
 799        printk("msr 0x%x, cycles per: %llu, nsec per: %llu\n", msr,
 800               diff_cycles / loops, tsc2nsec(diff_time) / loops);
 801}
 802
 803/* Does a basic test for interference.  You should kfunc this, often after
 804 * starting the monitor on another core.  You can spam it with ipi_spam().
 805 * You'll also need the PMCs to run.  Easiest way is with:
 806 * $ perf stat -e cycles sleep 9999999. */
 807void interference_test(void)
 808{
 809        #define THRESHOLD 200
 810        uint64_t low_samples[THRESHOLD] = {0};
 811        uint64_t deadline = sec2tsc(5); /* assumes TSC and cycles are close */
 812        uint64_t start, diff;
 813        size_t nr_below_thresh = 0;
 814        size_t nr_over_thresh = 0;
 815        size_t total = 0;
 816        size_t max = 0;
 817
 818        deadline += pmc_cycles();
 819        enable_irq();
 820        do {
 821                total++;
 822                start = pmc_cycles();
 823                diff = pmc_cycles() - start;
 824                if (diff < COUNT_OF(low_samples))
 825                        low_samples[diff]++;
 826                max = diff > max ? diff : max;
 827                if (diff < THRESHOLD)
 828                        nr_below_thresh++;
 829                else
 830                        nr_over_thresh++;
 831                if (!start) {
 832                        warn("rdpmc got 0, is perf stat -e cycles running? (aborting)");
 833                        break;
 834                }
 835        } while (start < deadline);
 836        disable_irq();
 837
 838        printk("\nCore %d\n--------\n", core_id());
 839        for (int i = 0; i < COUNT_OF(low_samples); i++) {
 840                if (low_samples[i])
 841                        printk("\t[ %2d ] : %lu\n", i, low_samples[i]);
 842        }
 843        printk("Total loops %lu, threshold %u\n", total, THRESHOLD);
 844        printk("Nr over thresh %lu\n", nr_over_thresh);
 845        printk("Nr below thresh %lu\n", nr_below_thresh);
 846        printk("Max %lu\n", max);
 847}
 848
 849/* Kfunc this to spam a core with IPIs */
 850void ipi_spam(int coreid)
 851{
 852        for (int i = 0; i < 1000; i++) {
 853                send_ipi(coreid, I_POKE_CORE);
 854                udelay(1000);
 855        }
 856}
 857
 858/* Kfunc this to halt with IRQs off.  Note this doesn't fully work as
 859 * advertised.  Keyboard and NIC IRQs still wake it up, but LAPIC timers don't
 860 * seem to. */
 861void superhalt(void)
 862{
 863        unsigned int x86_cstate = X86_MWAIT_C2;
 864
 865        disable_irq();
 866        asm volatile("monitor" : : "a"(KERNBASE), "c"(0), "d"(0));
 867        asm volatile("mwait" : : "c"(0x0), "a"(x86_cstate) : "memory");
 868        printk("Core %d woke from superhalt!\n", core_id());
 869}
 870