tests/linux: support multiple kernel lock tests
author Barret Rhoden <brho@cs.berkeley.edu>
Thu, 25 Jun 2020 18:01:50 +0000 (14:01 -0400)
committer Barret Rhoden <brho@cs.berkeley.edu>
Thu, 25 Jun 2020 18:01:50 +0000 (14:01 -0400)
Just like in userspace, we can now specify which of several lock tests to run
in the kernel.  The module keeps a table of tests and selects one by ID,
basically the same scheme userspace already uses (rough usage sketch below).
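
Roughly, userspace now just prepends the test ID when programming the module.
The parameter values here are made-up examples; the real call is in
run_kernel_mod_test():

	fd = open("/sys/kernel/mcs", O_WRONLY);
	/* test id, nr_threads, nr_loops, hold (ns), delay (ns) */
	dprintf(fd, "%u %u %u %u %u", LOCKTEST_QUEUE, 4, 10000, 200, 0);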

There are a few other fixups.  Notably:
- Disable IRQs across multiple loops.  For whatever reason, disabling and
re-enabling IRQs on every loop (even outside the timestamping and the
critical section) hurts performance (throughput and latency), so only do it
every so often (see the loop sketch below).  Still not 100% sure about this.

- hold_time is copied onto the stack instead of being read from the global.
We could also make it __read_mostly - not sure if that would help.  The
issue is that the global is often adjacent to other cache lines in the
module, such as the cache line holding the lock.  The adjacent cacheline
prefetcher *might* pull in that cacheline in exclusive mode, which would
cause all threads to take a cache miss when reading hold_time.  You'd think
that grabbing the lock would always pull in hold_time for the lock holder,
but not necessarily: it's an MCS lock, so you might have touched the mcs_l
a long time ago.  One reason we wouldn't miss on hold_time would be if we
did READ_ONCE(lock_test) *before grabbing the lock*, which would also pull
in the cacheline.  That would only matter if the adjacent cacheline
prefetcher didn't always invalidate the line (which is true).  It's a mess.
Not 100% sure about this either.
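
The per-thread loop now looks roughly like this (paraphrasing the lock_func()
macro below; not a drop-in snippet):

	unsigned int hold_time_l = READ_ONCE(hold_time); /* local copies */
	unsigned int nr_loops_l = READ_ONCE(nr_loops);
	u64 next_irq;

	local_irq_disable();
	next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ;
	for (i = 0; i < nr_loops_l; i++) {
		/* timestamped lock / hold / unlock, using hold_time_l */
		if (next_irq < un_lock) {
			/* briefly let IRQs (and the scheduler) in, then rearm */
			local_irq_enable();
			cond_resched();
			local_irq_disable();
			next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ;
		}
	}
	local_irq_enable();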

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
tests/linux/modules/mcs.c
tests/lock_test.c

tests/linux/modules/mcs.c
index 39d07fe..5993313 100644 (file)
 #include <linux/completion.h>
 #include <asm/msr.h>
 
+/* Seems fine either way.  Userspace uses lfence; rdtsc.  So use this if you're
+ * paranoid about comparisons between user and kernel. */
+#if 1
+static inline u64 read_tsc_serialized(void)
+{
+       u32 lo, hi;
+
+       asm volatile("lfence; rdtsc" : "=a" (lo), "=d" (hi));
+       return (u64)hi << 32 | lo;
+}
+
+#else
+#define read_tsc_serialized rdtsc_ordered
+#endif
+
+static void simple_ndelay(u64 nsec)
+{
+       u64 end;
+
+       end = rdtsc() + (tsc_khz * nsec) / 1000000;
+       do {
+               cpu_relax();
+       } while (rdtsc() < end);
+}
+
 struct lock_sample {
        u64 pre;
        u64 acq;
@@ -32,9 +57,113 @@ struct lock_sample {
        bool valid;
 };
 
+/* Consider using 128 if you're worried about Adjacent Cacheline Prefetching */
+#define CL_SZ 64
+
+/* How long we'll run without IRQs enabled */
+#define MSEC_WITHOUT_IRQ 100
+
+/*********************** MCS ******************************/
+
+#define MCS_LOCK_INIT {0}
+#define MCS_QNODE_INIT {0, 0}
+
+typedef struct mcs_lock_qnode
+{
+       struct mcs_lock_qnode *next;
+       int locked;
+} __attribute__((aligned(CL_SZ))) mcs_lock_qnode_t;
+
+typedef struct mcs_lock
+{
+       mcs_lock_qnode_t *lock;
+} mcs_lock_t;
+
+/* Dirty trick to get an isolated cache line; need the attrib on the type. */
+struct {
+       struct mcs_lock ___mcs_l;
+} __attribute__((aligned(CL_SZ))) __mcs_l = {MCS_LOCK_INIT};
+#define mcs_l __mcs_l.___mcs_l
+
+void mcs_lock_init(struct mcs_lock *lock)
+{
+       memset(lock, 0, sizeof(mcs_lock_t));
+}
+
+static inline mcs_lock_qnode_t *mcs_qnode_swap(mcs_lock_qnode_t **addr,
+                                               mcs_lock_qnode_t *val)
+{
+       return (mcs_lock_qnode_t*) __sync_lock_test_and_set((void**)addr, val);
+}
+
+void notrace mcs_lock_lock(struct mcs_lock *lock, struct mcs_lock_qnode *qnode)
+{
+       mcs_lock_qnode_t *predecessor;
+
+       qnode->next = 0;
+       barrier();      /* swap provides a CPU mb() */
+       predecessor = mcs_qnode_swap(&lock->lock, qnode);
+       if (predecessor) {
+               qnode->locked = 1;
+               smp_wmb();
+               predecessor->next = qnode;
+               /* no need for a wrmb(), since this will only get unlocked
+                * after they read our previous write */
+               while (qnode->locked)
+                       cpu_relax();
+       }
+       barrier();/* just need a cmb, the swap handles the CPU wmb/wrmb() */
+}
+
+/* CAS version (no usurper) */
+void notrace mcs_lock_unlock(struct mcs_lock *lock, struct mcs_lock_qnode *qnode)
+{
+       /* Check if someone is already waiting on us to unlock */
+       if (qnode->next == 0) {
+               /* no need for CPU mbs, since there's an atomic_cas() */
+               barrier();
+               /* If we're still the lock, just swap it with 0 (unlock) and
+                * return */
+               if (__sync_bool_compare_and_swap((void**)&lock->lock, qnode, 0))
+                       return;
+               /* We failed, someone is there and we are some (maybe a
+                * different) thread's pred.  Since someone else was waiting,
+                * they should have made themselves our next.  Spin (very
+                * briefly!) til it happens. */
+               while (qnode->next == 0)
+                       cpu_relax();
+               /* Alpha wants a read_barrier_depends() here */
+               /* Now that we have a next, unlock them */
+               qnode->next->locked = 0;
+       } else {
+               /* mb()s necessary since we didn't call an atomic_swap() */
+               /* need to make sure any previous writes don't pass unlocking */
+               smp_wmb();
+               /* need to make sure any reads happen before the unlocking */
+               barrier(); //rwmb();
+               /* simply unlock whoever is next */
+               qnode->next->locked = 0;
+       }
+}
+
+/*********************** QUEUE ******************************/
+struct {
+       struct qspinlock ___qsl;
+} __attribute__((aligned(CL_SZ))) __qsl = {__ARCH_SPIN_LOCK_UNLOCKED};
+#define qsl __qsl.___qsl
+
+/*********************** SPIN ******************************/
+struct {
+       arch_spinlock_t ___asl;
+} __attribute__((aligned(CL_SZ))) __asl = {__ARCH_SPIN_LOCK_UNLOCKED};
+#define asl __asl.___asl
+
+
 /* mtx protects all variables and the test run */
 static struct mutex mtx;
 
+struct lock_test;
+static struct lock_test *test;
 static DECLARE_COMPLETION(test_done);
 
 static unsigned int nr_threads;
@@ -51,71 +180,144 @@ static void **retvals;
 static void *results;
 static size_t results_sz;
 
-static bool run_locktest __cacheline_aligned_in_smp;
-static atomic_t horses __cacheline_aligned_in_smp;
-
-static struct qspinlock l = __ARCH_SPIN_LOCK_UNLOCKED;
-
-static int __mcs_thread_lock_test(void *arg)
-{
-       long thread_id = (long)arg;
-       u64 pre_lock, acq_lock, un_lock;
-       struct lock_sample *this_time;
-       int i;
-
-       atomic_dec(&horses);
-       while (atomic_read(&horses))
-               cpu_relax();
-       for (i = 0; i < nr_loops; i++) {
-               /*
+static bool run_locktest;
+static atomic_t horses;
+
+/* Every lock type has their own function, named __lock_name_thread(). */
+#define lock_func(lock_name, pre_cmd, lock_cmd, unlock_cmd)                    \
+static int __##lock_name##_thread(void *arg)                                   \
+{                                                                              \
+       long thread_id = (long)arg;                                            \
+       u64 pre_lock, acq_lock, un_lock;                                       \
+       struct lock_sample *this_time;                                         \
+       int i;                                                                 \
+       u64 next_irq;                                                          \
+                                                                               \
+       /*                                                                     
+        * hold_time is the important one to have locally.  o/w we might cache
+        * miss during the critical section.  Why would we miss?  Perhaps
+        * because hold_time is on the adjacent cache line to the spinlock, and
+        * !(MSR 0x1A4 & 2), though that'd only make sense if Adjacent Cacheline
+        * Prefetching prefetched in exclusive mode (and thus invalidating).
+        * The others are important too, though less so.  Their miss would be
+        * outside the critical section, but if you happen to rearrange the
+        * file, they could falsely share with the lock.
+        */                                                                    \
+       unsigned int hold_time_l = READ_ONCE(hold_time);                       \
+       unsigned int delay_time_l = READ_ONCE(delay_time);                     \
+       unsigned int nr_loops_l = READ_ONCE(nr_loops);                         \
+                                                                               \
+       pre_cmd                                                                \
+                                                                               \
+       atomic_dec(&horses);                                                   \
+       while (atomic_read(&horses))                                           \
+               cpu_relax();                                                   \
+                                                                               \
+       /*                                                                     
+        * I'd like to enable/disable IRQs in the loop, but that affects the
+        * test, even if they are outside the timestamps and the critical
+        * section.  Instead, just turn them on periodically.  100ms was what I
+        * noticed didn't affect the test's throughput (Haswell).
+        */                                                                    \
+       local_irq_disable();                                                   \
+       next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ;                       \
+                                                                               \
+       for (i = 0; i < nr_loops_l; i++) {                                     \
+               /*                                                             
                 * might be able to replace this with post-processing.  let the
-                * test run, and discard all entries after the first finisher
-                */
-               if (!READ_ONCE(run_locktest))
-                       break;
-
-               local_irq_disable();
-               pre_lock = rdtsc_ordered();
-
-               queued_spin_lock(&l);
-
-               acq_lock = rdtsc_ordered();
-
-               if (hold_time)
-                       ndelay(hold_time);
-
-               queued_spin_unlock(&l);
+                * test run, and discard all entries after the first finisher  
+                */                                                            \
+               if (!READ_ONCE(run_locktest))                                  \
+                       break;                                                 \
+                                                                               \
+               pre_lock = read_tsc_serialized();                              \
+                                                                               \
+               lock_cmd                                                       \
+                                                                               \
+               acq_lock = read_tsc_serialized();                              \
+                                                                               \
+               if (hold_time_l)                                               \
+                       simple_ndelay(hold_time_l);                            \
+                                                                               \
+               unlock_cmd                                                     \
+                                                                               \
+               un_lock = read_tsc_serialized();                               \
+                                                                               \
+               this_time = &times[thread_id][i];                              \
+               this_time->pre = pre_lock;                                     \
+               this_time->acq = acq_lock;                                     \
+               this_time->un = un_lock;                                       \
+               /* Can turn these on/off to control which samples we gather */ \
+               this_time->valid = true;                                       \
+               if (delay_time_l)                                              \
+                       simple_ndelay(delay_time_l);                           \
+               /*                                                             
+                * This can throw off your delay_time.  Think of delay_time as 
+                * the least amount of time we'll wait between reacquiring the 
+                * lock.                                                       
+                */                                                            \
+               if (next_irq < un_lock) {                                      \
+                       local_irq_enable();                                    \
+                       cond_resched();         /* since we're here. */        \
+                       local_irq_disable();                                   \
+                       next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ;       \
+               }                                                              \
+       }                                                                      \
+                                                                               \
+       local_irq_enable();                                                    \
+                                                                               \
+       /* First thread to finish stops the test */                            \
+       WRITE_ONCE(run_locktest, false);                                       \
+       /*                                                                     
+        * Wakes the controller thread.  The others will be done soon, to      
+        * complete the hokey thread join.                                     
+        */                                                                    \
+       complete(&test_done);                                                  \
+                                                                               \
+       WRITE_ONCE(retvals[thread_id], (void*)(long)i);                        \
+                                                                               \
+       return 0;                                                              \
+}
 
-               un_lock = rdtsc_ordered();
+lock_func(mcs,
+         struct mcs_lock_qnode qn = MCS_QNODE_INIT;,
+         mcs_lock_lock(&mcs_l, &qn);,
+         mcs_lock_unlock(&mcs_l, &qn););
+lock_func(queue,
+         ;,
+         queued_spin_lock(&qsl);,
+         queued_spin_unlock(&qsl););
+lock_func(spin,
+         ;,
+         arch_spin_lock(&asl);,
+         arch_spin_unlock(&asl););
+
+/* ID is for userspace, name is for the kthread, func is what runs */
+struct lock_test {
+       unsigned int id;
+       const char *name;
+       int (*func)(void *);
+};
 
-               local_irq_enable();
+#define LOCKTEST_MCS           1
+#define LOCKTEST_QUEUE                 2
+#define LOCKTEST_SPIN          3
 
-               this_time = &times[thread_id][i];
-               this_time->pre = pre_lock;
-               this_time->acq = acq_lock;
-               this_time->un = un_lock;
-               /* Can turn these on/off to control which samples we gather */
-               this_time->valid = true;
-               if (delay_time)
-                       ndelay(delay_time);
-               /*
-                * This can throw off your delay_time.  Think of delay_time as
-                * the least amount of time we'll wait between reacquiring the
-                * lock.  After all, IRQs are enabled, so all bets are off.
-                */
-               cond_resched();
-       }
-       /* First thread to finish stops the test */
-       WRITE_ONCE(run_locktest, false);
-       /*
-        * Wakes the controller thread.  The others will be done soon, to
-        * complete the hokey thread join.
-        */
-       complete(&test_done);
+static struct lock_test tests[] = {
+       {LOCKTEST_MCS,          "mcs", __mcs_thread},
+       {LOCKTEST_QUEUE,      "queue", __queue_thread},
+       {LOCKTEST_SPIN,        "spin", __spin_thread},
+       {}
+};
 
-       WRITE_ONCE(retvals[thread_id], (void*)(long)i);
+static struct lock_test *get_test(unsigned int id)
+{
+       struct lock_test *ret;
 
-       return 0;
+       for (ret = tests; ret->id; ret++)
+               if (ret->id == id)
+                       return ret;
+       return NULL;
 }
 
 /*
@@ -193,8 +395,8 @@ static int mcs_lock_test(void)
        }
 
        for (i = 0; i < nr_threads; i++) {
-               threads[i] = kthread_create_on_cpu(__mcs_thread_lock_test,
-                                                  (void*)(long)i, i, "mcs-%u");
+               threads[i] = kthread_create_on_cpu(test->func,
+                                                  (void*)(long)i, i, "mcs-%u");
                if (IS_ERR(threads[i])) {
                        while (--i >= 0) {
                                /*
@@ -242,6 +444,10 @@ static ssize_t mcs_read(struct file *filp, struct kobject *kobj,
 {
        mutex_lock(&mtx);
 
+       if (!test) {
+               mutex_unlock(&mtx);
+               return -ENOLCK; /* i'd kill for errstr */
+       }
        if (!off) {
                if (mcs_lock_test()) {
                        mutex_unlock(&mtx);
@@ -291,19 +497,25 @@ static ssize_t mcs_write(struct file *filp, struct kobject *kobj,
                         struct bin_attribute *bin_attr,
                         char *buf, loff_t off, size_t count)
 {
-       unsigned int threads, loops, hold, delay;
+       unsigned int id, threads, loops, hold, delay;
        ssize_t ret;
+       struct lock_test *t;
 
        /* TODO: check_mul_overflow and whatnot, esp for the result_sz buffer */
-       ret = sscanf(buf, "%u %u %u %u", &threads, &loops, &hold,
+       ret = sscanf(buf, "%u %u %u %u %u", &id, &threads, &loops, &hold,
                     &delay);
-       if (ret != 4)
+       if (ret != 5)
                return -EINVAL;
+
+       t = get_test(id);
+       if (!t)
+               return -ENOLCK;
        if (threads > num_online_cpus())
                return -ENODEV;
        if (threads == 0)
                threads = num_online_cpus();
        mutex_lock(&mtx);
+       test = t;
        nr_threads = threads;
        nr_loops = loops;
        hold_time = hold;
@@ -340,6 +552,7 @@ static int __init mcs_init(void)
                pr_err("\n\nfucked %d !!!\n\n\n", __LINE__);
                return -1;
        }
+
        return 0;
 }
 
tests/lock_test.c
index 0dbab60..fa5c9a9 100644 (file)
@@ -21,7 +21,7 @@
 #include <string.h>
 
 #define handle_error(msg) \
-       do { perror(msg); exit(EXIT_FAILURE); } while (0)
+       do { perror("Error, aborting: " msg); exit(EXIT_FAILURE); } while (0)
 
 /* OS dependent #incs */
 #ifdef __akaros__
@@ -499,6 +499,7 @@ static struct argp_option options[] = {
 struct lock_test {
        const char *name;
        void *(*func)(void *arg);
+       int id;
 };
 
 struct prog_args {
@@ -650,11 +651,17 @@ static struct lock_test tests[] = {
 
 #else
 
+#define LOCKTEST_MCS           1
+#define LOCKTEST_QUEUE                 2
+#define LOCKTEST_SPIN          3
+
 static struct lock_test tests[] = {
        {"mcs", mcs_thread},
        {"mcscas", mcscas_thread},
-       {"mcs-kernel", NULL},
        {"spin", spin_thread},
+       {"mcs-kernel", NULL, LOCKTEST_MCS},
+       {"queue-kernel", NULL, LOCKTEST_QUEUE},
+       {"spin-kernel", NULL, LOCKTEST_SPIN},
        {}
 };
 
@@ -930,7 +937,7 @@ static struct results run_pthreads_test(void)
 
 #ifdef __akaros__
 
-static struct results run_kernel_mod_test(void)
+static struct results run_kernel_mod_test(struct lock_test *t)
 {
        printf("Unsupported on Akaros\n");
        exit(-1);
@@ -942,7 +949,7 @@ static struct results run_kernel_mod_test(void)
  * The test runs on a pread(off==0).  The return format is all N of the
  * loops_done void*, followed by a 2D array of samples for threads then loops.
  * (t0_l0, t0_l1, t0_l2..., t1_l0, t1_l1, t1_l2...). */
-static struct results run_kernel_mod_test(void)
+static struct results run_kernel_mod_test(struct lock_test *t)
 {
        struct results results;
        int fd;
@@ -955,8 +962,8 @@ static struct results run_kernel_mod_test(void)
        fd = open("/sys/kernel/mcs", O_WRONLY);
        if (fd < 0)
                handle_error("open write");
-       if (dprintf(fd, "%u %u %u %u", pargs.nr_threads, pargs.nr_loops,
-                   pargs.hold_time, pargs.delay_time) < 0)
+       if (dprintf(fd, "%u %u %u %u %u", pargs.test->id, pargs.nr_threads,
+                   pargs.nr_loops, pargs.hold_time, pargs.delay_time) < 0)
                handle_error("setting opts.  too many threads?");
        /* For the change in parameters (notably the file size, due to
         * threads * loops) to take effect, you need to close and reopen. */
@@ -998,8 +1005,8 @@ static struct results run_kernel_mod_test(void)
 
 static struct results run_test(void)
 {
-       if (!strcmp(pargs.test->name, "mcs-kernel"))
-               return run_kernel_mod_test();
+       if (pargs.test->id)
+               return run_kernel_mod_test(pargs.test);
        return run_pthreads_test();
 }