tests/lock_test: add a linux kernel module for MCS locks
author:    Barret Rhoden <brho@cs.berkeley.edu>
           Tue, 9 Jun 2020 16:59:11 +0000 (12:59 -0400)
committer: Barret Rhoden <brho@cs.berkeley.edu>
           Tue, 9 Jun 2020 17:10:41 +0000 (13:10 -0400)
This uses the in-kernel "queue" locks, which are basically special MCS
locks.  The test runs in the kernel, and we extract the information to
userspace for processing, using the same commands and analysis as for
user-space spinlocks.

Right now, there's only the one kernel lock.  We could add more, like
regular spinlocks and whatnot.  That way, you'd be able to test a
variety of spinlocks, with the low-interference you'd expect from
running inside the kernel (irqs disabled, no resched while holding
locks, etc).

One reason to add other locks would be to test *our own* locks, instead
of Linux's locks.  In that sense, the module would be a platform for
benchmarking lock performance generally, rather than just testing Linux
implementations.  That matters especially since, from early numbers, our
Linux userspace MCS locks (which are basically the ones we coded for
Akaros) are better than the in-kernel MCS locks (queue_lock()).

To build this, you'll need to set up your LINUX_KDIR variable, but also
hack the source to export a symbol.  Oh well.  You'll also need to be
running that kernel.  You should be able to hack this up for a running
machine.  (e.g. cat the symtab, find the function, cast that arbitrary
number to a function pointer, and cross your fingers).

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
tests/linux/modules/mcs.c [new file with mode: 0644]
tests/lock_test.c

diff --git a/tests/linux/modules/mcs.c b/tests/linux/modules/mcs.c
new file mode 100644 (file)
index 0000000..39d07fe
--- /dev/null
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2013, 2014 The Regents of the University of California
+ * Copyright (c) 2020 Google Inc
+ *
+ * Barret Rhoden <brho@cs.berkeley.edu>
+ *
+ * Sorry, but you'll need to change your linux source to expose this function:
+
+ EXPORT_SYMBOL_GPL(kthread_create_on_cpu);
+
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+
+#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <asm/msr.h>
+
+/*
+ * One acquire/release of the lock, as timestamped by a single thread.
+ * All timestamps are raw TSC ticks from rdtsc_ordered().
+ */
+struct lock_sample {
+       u64 pre;        /* TSC right before attempting the lock */
+       u64 acq;        /* TSC right after acquiring the lock */
+       u64 un;         /* TSC right after releasing the lock */
+       bool valid;     /* whether analysis should use this sample */
+};
+
+/* mtx protects all variables and the test run */
+static struct mutex mtx;
+
+/* Signaled by the first thread to finish; wakes the controlling reader. */
+static DECLARE_COMPLETION(test_done);
+
+/* Test parameters, set via mcs_write().  hold/delay times are in nsec. */
+static unsigned int nr_threads;
+static unsigned int nr_loops;
+static unsigned int hold_time;
+static unsigned int delay_time;
+
+/* array[nr_thread] of pointers of lock_sample[nr_loops] */
+static struct lock_sample **times;
+/* array[nr_thread] of task* */
+static struct task_struct **threads;
+/* array[nr_thread] of void* */
+static void **retvals;
+/* Packed copy of retvals + times, exported to userspace by mcs_read(). */
+static void *results;
+static size_t results_sz;
+
+/* Cacheline-aligned so these hot flags don't false-share with anything. */
+static bool run_locktest __cacheline_aligned_in_smp;
+static atomic_t horses __cacheline_aligned_in_smp;     /* start barrier */
+
+/* The lock under test: Linux's queued (MCS-style) spinlock. */
+static struct qspinlock l = __ARCH_SPIN_LOCK_UNLOCKED;
+
+/*
+ * Per-thread test body, run as a kthread pinned to cpu 'arg' (== thread_id).
+ *
+ * All threads spin on 'horses' so they start at roughly the same time, then
+ * repeatedly lock/unlock the shared qspinlock, recording TSC timestamps
+ * around each acquire/release into this thread's times[] array.
+ */
+static int __mcs_thread_lock_test(void *arg)
+{
+       long thread_id = (long)arg;
+       u64 pre_lock, acq_lock, un_lock;
+       struct lock_sample *this_time;
+       int i;
+
+       /* Hokey start barrier: last thread to decrement releases everyone. */
+       atomic_dec(&horses);
+       while (atomic_read(&horses))
+               cpu_relax();
+       for (i = 0; i < nr_loops; i++) {
+               /*
+                * might be able to replace this with post-processing.  let the
+                * test run, and discard all entries after the first finisher
+                */
+               if (!READ_ONCE(run_locktest))
+                       break;
+
+               /* IRQs off so an interrupt can't inflate the timestamps. */
+               local_irq_disable();
+               pre_lock = rdtsc_ordered();
+
+               queued_spin_lock(&l);
+
+               acq_lock = rdtsc_ordered();
+
+               if (hold_time)
+                       ndelay(hold_time);
+
+               queued_spin_unlock(&l);
+
+               un_lock = rdtsc_ordered();
+
+               local_irq_enable();
+
+               this_time = &times[thread_id][i];
+               this_time->pre = pre_lock;
+               this_time->acq = acq_lock;
+               this_time->un = un_lock;
+               /* Can turn these on/off to control which samples we gather */
+               this_time->valid = true;
+               if (delay_time)
+                       ndelay(delay_time);
+               /*
+                * This can throw off your delay_time.  Think of delay_time as
+                * the least amount of time we'll wait between reacquiring the
+                * lock.  After all, IRQs are enabled, so all bets are off.
+                */
+               cond_resched();
+       }
+       /* First thread to finish stops the test */
+       WRITE_ONCE(run_locktest, false);
+       /*
+        * Wakes the controller thread.  The others will be done soon, to
+        * complete the hokey thread join.
+        */
+       complete(&test_done);
+
+       /* i == loops completed; also flips retvals[] from -1, ending the join */
+       WRITE_ONCE(retvals[thread_id], (void*)(long)i);
+
+       return 0;
+}
+
+/*
+ * This consolidates the results in a format we will export to userspace.  We
+ * could have just used this format for the test itself, but then the times
+ * arrays wouldn't be NUMA local.
+ *
+ * Layout: nr_threads retvals (void*), then one contiguous array of nr_loops
+ * lock_samples per thread.  Returns 0 on success, -1 on allocation failure.
+ */
+static int mcs_build_output(struct lock_sample **times, void **retvals)
+{
+       int i;
+       size_t sz_rets = nr_threads * sizeof(void*);
+       size_t sz_times_per = nr_loops * sizeof(struct lock_sample);
+
+       results_sz = sz_rets + nr_threads * sz_times_per;
+
+       /* Drop any buffer from a previous run; kvfree(NULL) is a no-op. */
+       kvfree(results);
+
+       results = kvzalloc(results_sz, GFP_KERNEL);
+       if (!results) {
+               pr_err("fucked %d", __LINE__);
+               return -1;
+       }
+
+       memcpy(results, retvals, sz_rets);
+       for (i = 0; i < nr_threads; i++) {
+               memcpy(results + sz_rets + i * sz_times_per,
+                      times[i], sz_times_per);
+       }
+
+       return 0;
+}
+
+/*
+ * Runs one full test: allocates per-thread sample arrays, spawns nr_threads
+ * kthreads (thread i pinned to cpu i), waits for the first finisher, then
+ * packs everything into 'results' via mcs_build_output().
+ *
+ * Caller must hold mtx.  Returns 0 on success, -1 on failure.
+ */
+static int mcs_lock_test(void)
+{
+       int i;
+       int ret = -1;
+       size_t amt;
+
+       /* Arm the start barrier before any thread can run. */
+       atomic_set(&horses, nr_threads);
+       WRITE_ONCE(run_locktest, true);
+
+       times = kcalloc(nr_threads, sizeof(struct lock_sample *), GFP_KERNEL);
+       if (!times) {
+               pr_err("fucked %d", __LINE__);
+               return ret;
+       }
+
+       if (check_mul_overflow((size_t)nr_loops, sizeof(struct lock_sample),
+                              &amt)) {
+               pr_err("fucked %d", __LINE__);
+               goto out_times;
+       }
+       for (i = 0; i < nr_threads; i++) {
+               /* NUMA-local samples: thread i will run on cpu i. */
+               times[i] = kvzalloc_node(amt, GFP_KERNEL, cpu_to_node(i));
+               if (!times[i]) {
+                       /* we clean up the times[i]s below */
+                       pr_err("fucked %d", __LINE__);
+                       goto out_times;
+               }
+       }
+
+       retvals = kcalloc(nr_threads, sizeof(void *), GFP_KERNEL);
+       if (!retvals) {
+               pr_err("fucked %d", __LINE__);
+               goto out_times;
+       }
+       /* -1 doubles as "not done yet" for the hokey join below. */
+       for (i = 0; i < nr_threads; i++)
+               retvals[i] = (void*)-1;
+
+       threads = kcalloc(nr_threads, sizeof(struct task_struct *),
+                         GFP_KERNEL);
+       if (!threads) {
+               pr_err("fucked %d", __LINE__);
+               goto out_retvals;
+       }
+
+       for (i = 0; i < nr_threads; i++) {
+               threads[i] = kthread_create_on_cpu(__mcs_thread_lock_test,
+                                                  (void*)(long)i, i, "mcs-%u");
+               if (IS_ERR(threads[i])) {
+                       while (--i >= 0) {
+                               /*
+                                * We could recover, perhaps with something like
+                                * kthread_stop(threads[i]), but we'd need those
+                                * threads to check kthread_should_stop(),
+                                * perhaps in their hokey barrier.  I've never
+                                * had this fail, so I haven't tested it.
+                                */
+                       }
+                       pr_err("fucked %d", __LINE__);
+                       goto out_threads;
+               }
+       }
+       for (i = 0; i < nr_threads; i++) {
+               /* what's the deal with refcnting here?  it looks like an
+                * uncounted ref: create->result = current.  so once we start
+                * them, we probably can't touch this again. */
+               wake_up_process(threads[i]);
+       }
+
+       /* Hokey join.  We know when the test is done but wait for the others */
+       wait_for_completion(&test_done);
+       for (i = 0; i < nr_threads; i++) {
+               /* Each thread writes its loop count last; spin until it does */
+               while (READ_ONCE(retvals[i]) == (void*)-1)
+                       cond_resched();
+       }
+
+       ret = mcs_build_output(times, retvals);
+
+out_threads:
+       kfree(threads);
+out_retvals:
+       kfree(retvals);
+out_times:
+       /* times was kcalloc'd, so unallocated slots are NULL; kvfree is ok */
+       for (i = 0; i < nr_threads; i++)
+               kvfree(times[i]);
+       kfree(times);
+       return ret;
+}
+
+/*
+ * sysfs bin_attribute read handler: the first chunk (off == 0) runs the whole
+ * test under mtx; subsequent reads stream the packed results buffer back to
+ * userspace.  Returns bytes copied, 0 at EOF, or -1 on failure.
+ */
+static ssize_t mcs_read(struct file *filp, struct kobject *kobj,
+                       struct bin_attribute *bin_attr,
+                       char *buf, loff_t off, size_t count)
+{
+       mutex_lock(&mtx);
+
+       if (!off) {
+               if (mcs_lock_test()) {
+                       mutex_unlock(&mtx);
+                       return -1;
+               }
+       }
+       if (!results) {
+               pr_err("fucked %d", __LINE__);
+               mutex_unlock(&mtx);
+               return -1;
+       }
+       /*
+        * Check off on its own before clamping: if the caller seeks past the
+        * end, results_sz - off would underflow size_t and memcpy would run
+        * wild.  Report EOF instead.  Subtracting also avoids the count + off
+        * addition overflow the old check had.
+        */
+       if (off >= results_sz) {
+               mutex_unlock(&mtx);
+               return 0;
+       }
+       if (count > results_sz - off) {
+               pr_err("fucked off %lld count %lu sz %lu\n", off, count,
+                      results_sz);
+               count = results_sz - off;
+       }
+       memcpy(buf, results + off, count);
+
+       mutex_unlock(&mtx);
+
+       return count;
+}
+
+/* Size of the exported results: per thread, one retval plus its samples. */
+static loff_t __mcs_get_results_size(void)
+{
+       size_t per_thread;
+
+       per_thread = sizeof(void*) + nr_loops * sizeof(struct lock_sample);
+       return nr_threads * per_thread;
+}
+
+/*
+ * Propagate the new results size to the sysfs file.
+ *
+ * Unfortunately, this doesn't update the file live.  It'll only take effect the
+ * next time you open it.  So users need to write, close, open, read.
+ */
+static void __mcs_update_size(void)
+{
+       struct kernfs_node *kn = kernfs_find_and_get(kernel_kobj->sd, "mcs");
+
+       if (!kn) {
+               pr_err("fucked %d", __LINE__);
+               return;
+       }
+       kn->attr.size = __mcs_get_results_size();
+       /* kernfs_find_and_get() took a reference; drop it or we leak the kn */
+       kernfs_put(kn);
+}
+
+/*
+ * sysfs write handler: parses "nr_threads nr_loops hold_time delay_time" and
+ * stores the settings for the next test run.  nr_threads == 0 means "all
+ * online cpus".  Returns count on success or a negative errno.
+ */
+static ssize_t mcs_write(struct file *filp, struct kobject *kobj,
+                        struct bin_attribute *bin_attr,
+                        char *buf, loff_t off, size_t count)
+{
+       unsigned int threads, loops, hold, delay;
+       ssize_t ret;
+
+       /* TODO: check_mul_overflow and whatnot, esp for the result_sz buffer */
+       ret = sscanf(buf, "%u %u %u %u", &threads, &loops, &hold,
+                    &delay);
+       if (ret != 4)
+               return -EINVAL;
+       /* Threads are pinned to cpu i, so we can't exceed the online count. */
+       if (threads > num_online_cpus())
+               return -ENODEV;
+       if (threads == 0)
+               threads = num_online_cpus();
+       mutex_lock(&mtx);
+       nr_threads = threads;
+       nr_loops = loops;
+       hold_time = hold;
+       delay_time = delay;
+       __mcs_update_size();
+       mutex_unlock(&mtx);
+       return count;
+}
+
+/* /sys/kernel/mcs: write test parameters in, read packed results back. */
+struct bin_attribute mcs_attr = {
+       .attr = {
+               .name = "mcs",
+               .mode = 0666,
+       },
+       .size = 0,      /* set for real in mcs_init() and on each write */
+       .private = NULL,
+       .read = mcs_read,
+       .write = mcs_write,
+};
+
+/* Module load: set default parameters and create /sys/kernel/mcs. */
+static int __init mcs_init(void)
+{
+       int ret;
+
+       mutex_init(&mtx);
+
+       /*
+        * The user needs to set these, but start with sensible defaults in case
+        * they read without writing.
+        */
+       nr_threads = num_online_cpus();
+       nr_loops = 10000;
+       mcs_attr.size = __mcs_get_results_size();
+
+       ret = sysfs_create_bin_file(kernel_kobj, &mcs_attr);
+       if (ret) {
+               pr_err("\n\nfucked %d !!!\n\n\n", __LINE__);
+               /* propagate the real errno instead of -1 (== -EPERM) */
+               return ret;
+       }
+       return 0;
+}
+
+/* Module unload: remove the sysfs file and free the last run's results. */
+static void __exit mcs_exit(void)
+{
+       sysfs_remove_bin_file(kernel_kobj, &mcs_attr);
+       /* No readers can race us now; kvfree(NULL) is fine if no run happened */
+       kvfree(results);
+}
+
+module_init(mcs_init);
+module_exit(mcs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Barret Rhoden <brho@google.com>");
+MODULE_DESCRIPTION("MCS lock test");
index c2bd6ba..0dbab60 100644 (file)
@@ -653,6 +653,7 @@ static struct lock_test tests[] = {
 static struct lock_test tests[] = {
        {"mcs", mcs_thread},
        {"mcscas", mcscas_thread},
+       /* Runs in the kernel via mcs.ko; no userspace thread function. */
+       {"mcs-kernel", NULL},
        {"spin", spin_thread},
        {}
 };
@@ -881,7 +882,7 @@ struct results {
        struct lock_sample **thread_samples;
 };
 
-static struct results run_test(void)
+static struct results run_pthreads_test(void)
 {
        struct results results;
        void **loops_done;
@@ -927,6 +928,81 @@ static struct results run_test(void)
        return results;
 }
 
+#ifdef __akaros__
+
+/* The kernel-module test needs the Linux mcs.ko; bail out on Akaros. */
+static struct results run_kernel_mod_test(void)
+{
+       printf("Unsupported on Akaros\n");
+       exit(-1);
+}
+
+#else
+
+/* Works with tests/linux/modules/mcs.ko.  Write commands in, read results back.
+ * The test runs on a pread(off==0).  The return format is all N of the
+ * loops_done void*, followed by a 2D array of samples for threads then loops.
+ * (t0_l0, t0_l1, t0_l2..., t1_l0, t1_l1, t1_l2...). */
+static struct results run_kernel_mod_test(void)
+{
+       struct results results;
+       int fd;
+       void *kresults;
+       size_t sz_rets, sz_times_per, results_sz;
+       ssize_t ret, amt = 0;
+       struct lock_sample **thread_samples;
+       struct lock_sample *base;
+
+       fd = open("/sys/kernel/mcs", O_WRONLY);
+       if (fd < 0)
+               handle_error("open write");
+       if (dprintf(fd, "%u %u %u %u", pargs.nr_threads, pargs.nr_loops,
+                   pargs.hold_time, pargs.delay_time) < 0)
+               handle_error("setting opts.  too many threads?");
+       /* For the change in parameters (notably the file size, due to
+        * threads * loops) to take effect, you need to close and reopen. */
+       close(fd);
+       fd = open("/sys/kernel/mcs", O_RDONLY);
+       if (fd < 0)
+               handle_error("open read");
+
+       sz_rets = pargs.nr_threads * sizeof(void*);
+       sz_times_per = pargs.nr_loops * sizeof(struct lock_sample);
+       results_sz = sz_rets + pargs.nr_threads * sz_times_per;
+       kresults = malloc(results_sz);
+       if (!kresults)
+               handle_error("alloc");
+
+       /* The first read (off == 0) triggers the in-kernel test run. */
+       do {
+               ret = read(fd, kresults + amt, results_sz - amt);
+               if (ret < 0)
+                       handle_error("read");
+               amt += ret;
+       } while (ret != 0);
+       /* Done with the sysfs file; don't leak the fd. */
+       close(fd);
+
+       if (amt != results_sz)
+               printf("\n\nfucked, got %zd wanted %zu\n\n", amt, results_sz);
+
+       thread_samples = malloc(pargs.nr_threads * sizeof(struct lock_sample*));
+       if (!thread_samples)
+               handle_error("alloc");
+       /* Sample arrays start right after the nr_threads loops_done void*s. */
+       base = kresults + pargs.nr_threads * sizeof(void*);
+       for (int i = 0; i < pargs.nr_threads; i++)
+               thread_samples[i] = base + i * pargs.nr_loops;
+
+       results.loops_done = kresults;
+       results.thread_samples = thread_samples;
+       return results;
+}
+
+#endif
+
+/* Dispatch: "mcs-kernel" is driven by mcs.ko; everything else is pthreads. */
+static struct results run_test(void)
+{
+       const char *name = pargs.test->name;
+
+       if (strcmp(name, "mcs-kernel") == 0)
+               return run_kernel_mod_test();
+       return run_pthreads_test();
+}
+
 static void analyze(struct results *results)
 {
        void **loops_done = results->loops_done;