perf: Clean up perf_{session,alloc} management
[akaros.git] / kern / include / ros / bcq.h
index b5b255b..4bf7222 100644 (file)
@@ -6,22 +6,30 @@
  * consumer.
  */
 
-#ifndef ROS_INC_BCQ_H
-#define ROS_INC_BCQ_H
+#pragma once
 
 #include <ros/common.h>
 #include <ros/bcq_struct.h>
-/* Each arch has some basic atomic ops.  We need comp_and_swap for now. */
-#include <arch/atomic.h>
 #include <string.h>
 
+/* Pain in the ass includes.  Glibc has an atomic.h, and eventually userspace
+ * will have to deal with the clobbering. */
+#ifdef ROS_KERNEL
+#include <atomic.h>
+/* dequeue uses cpu_relax_vc, which is user-only.  Some kernel tests call
+ * dequeue. */
+#define cpu_relax_vc(x) cpu_relax()
+#else
+#include <parlib/arch/atomic.h>
+#include <parlib/vcore.h>
+#endif /* ROS_KERNEL */
+
 /* Bounded Concurrent Queues, untrusted consumer
  *
  * This is a producer/consumer circular buffer, in which the producer never
  * blocks and does not need to trust the data structure (which is writable by
  * the consumer).
  *
- * A producer enqueues and item, based on the indexes of the producer and
+ * A producer enqueues an item, based on the indexes of the producer and
  * consumer.  Enqueue cannot block, but can fail if the queue is full or if it
  * fails to enqueue after a certain amount of tries.
  *
@@ -55,7 +63,7 @@
  * enough so the producer knows the item is free.  If pub_idx was their item,
  * they move it forward to the next item.  If it is not, currently, they spin
  * and wait until the previous consumer finishes, and then move it forward.
- * This isn't ideal, and we can deal with this in the future.  
+ * This isn't ideal, and we can deal with this in the future.
  *
  * Enqueue will enqueue the item pointed to by elem.  Dequeue will write an
  * item into the memory pointed to by elem.
  * bcq_enqueue(&some_bcq, &some_my_type, my_size, num_fails_okay);
  * bcq_dequeue(&some_bcq, &some_my_type, my_size);
  *
+ * They both return 0 on success, or some error code on failure.
  *
  * TODO later:
- * How about an atomic_add_return for the prod?  Now that we use powers of two,
- * CAS is probably overkill.
- *
  * Automatically round up.
  *
  * Watch out for ABA.  Could use ctrs in the top of the indexes.  Not really an
@@ -102,9 +108,12 @@ struct bcq_header {
 
 #endif
 
-/* Functions */                                                                
+/* Functions */
 #define bcq_init(_bcq, _ele_type, _num_elems)                                  \
-       memset((_bcq), 0, sizeof( _ele_type ) * (_num_elems))                                 
+({                                                                             \
+       memset((_bcq), 0, sizeof(*(_bcq)));                                        \
+       assert((_num_elems) == ROUNDUPPWR2(_num_elems));                           \
+})
 
 /* Num empty buffer slots in the BCQ */
 #define BCQ_FREE_SLOTS(_p, _cp, _ne) ((_ne) - ((_p) - (_cp)))
@@ -126,6 +135,7 @@ struct bcq_header {
        uint32_t __prod, __new_prod, __cons_pub, __failctr = 0;                    \
        int __retval = 0;                                                          \
        do {                                                                       \
+               cmb();                                                                 \
                if (((_num_fail)) && (__failctr++ >= (_num_fail))) {                   \
                        __retval = -EFAIL;                                                 \
                        break;                                                             \
@@ -137,10 +147,11 @@ struct bcq_header {
                        break;                                                             \
                }                                                                      \
                __new_prod = __prod + 1;                                               \
-       } while (!atomic_comp_swap(&(_bcq)->hdr.prod_idx, __prod, __new_prod));    \
+       } while (!atomic_cas_u32(&(_bcq)->hdr.prod_idx, __prod, __new_prod));      \
        if (!__retval) {                                                           \
                /* from here out, __prod is the local __prod that we won */            \
                (_bcq)->wraps[__prod & ((_num_elems)-1)].elem = *(_elem);              \
+               wmb();                                                                 \
                (_bcq)->wraps[__prod & ((_num_elems)-1)].rdy_for_cons = TRUE;          \
        }                                                                          \
        __retval;                                                                  \
@@ -153,6 +164,7 @@ struct bcq_header {
        uint32_t __prod, __cons_pvt, __new_cons_pvt, __cons_pub;                   \
        int __retval = 0;                                                          \
        do {                                                                       \
+               cmb();                                                                 \
                __prod = (_bcq)->hdr.prod_idx;                                         \
                __cons_pvt = (_bcq)->hdr.cons_pvt_idx;                                 \
                if (BCQ_NO_WORK(__prod, __cons_pvt)) {                                 \
@@ -160,7 +172,7 @@ struct bcq_header {
                        break;                                                             \
                }                                                                      \
                __new_cons_pvt = (__cons_pvt + 1);                                     \
-       } while (!atomic_comp_swap(&(_bcq)->hdr.cons_pvt_idx, __cons_pvt,          \
+       } while (!atomic_cas_u32(&(_bcq)->hdr.cons_pvt_idx, __cons_pvt,            \
                                   __new_cons_pvt));                               \
        if (!__retval) {                                                           \
                /* from here out, __cons_pvt is the local __cons_pvt that we won */    \
@@ -171,72 +183,15 @@ struct bcq_header {
                (_bcq)->wraps[__cons_pvt & ((_num_elems)-1)].rdy_for_cons = FALSE;     \
                /* wait til we're the cons_pub, then advance it by one */              \
                while ((_bcq)->hdr.cons_pub_idx != __cons_pvt)                         \
-                       cpu_relax();                                                       \
+                       cpu_relax_vc(vcore_id());                                          \
                (_bcq)->hdr.cons_pub_idx = __cons_pvt + 1;                             \
        }                                                                          \
        __retval;                                                                  \
 })
 
-#if 0
-/* Original C Code, for posterity / debugging */
-static inline int enqueue(struct __name_bcq *bcq, __elem_t *elem,
-                          int num_fail)
-{
-       uint32_t __prod, __new_prod, __cons_pub, __failctr = 0;
-       do {
-               if ((num_fail) && (__failctr++ >= num_fail)) {
-                       printk("FAILED\n");
-                       return -EFAIL;
-               }
-               __prod = bcq->hdr.prod_idx;
-               __cons_pub = bcq->hdr.cons_pub_idx;
-       printk("# free slots : %d\n", BCQ_FREE_SLOTS(__prod, __cons_pub, __num_elems));
-
-//             printk("__prod = %p, cons_pub = %p\n", __prod, __cons_pub-1);
-//             printk("__prod mod = %p, cons_pub mod = %p\n", __prod &(__num_elems-1), (__cons_pub-1) &(__num_elems-1));
-
-               if (BCQ_FULL(__prod, __cons_pub, __num_elems)) {
-                       printk("BUSY\n");
-                       return -EBUSY;
-               }
-               __new_prod = __prod + 1;
-       } while (!atomic_comp_swap(&bcq->hdr.prod_idx, __prod, __new_prod));
-       /* from here out, __prod is the local __prod that we won */
-
-       printk("enqueuing to location %d\n", __prod & (__num_elems-1));
-
-       bcq->wraps[__prod & (__num_elems-1)].elem = *elem;
-       bcq->wraps[__prod & (__num_elems-1)].rdy_for_cons = TRUE;
-       return 0;
-}
-
-/* Similar to enqueue, spin afterwards til cons_pub is our element, then */
-/* advance it. */
-static inline int dequeue(struct __name_bcq *bcq, __elem_t *elem)
-{
-       uint32_t __prod, __cons_pvt, __new_cons_pvt, __cons_pub;
-       do {
-               __prod = bcq->hdr.prod_idx;
-               __cons_pvt = bcq->hdr.cons_pvt_idx;
-               if (BCQ_NO_WORK(__prod, __cons_pvt))
-                       return -EBUSY;
-               __new_cons_pvt = (__cons_pvt + 1);
-       } while (!atomic_comp_swap(&bcq->hdr.cons_pvt_idx, __cons_pvt,
-                                  __new_cons_pvt));
-       /* from here out, __cons_pvt is the local __cons_pvt that we won */
-       printk("dequeueing from location %d\n", __cons_pvt & (__num_elems-1));
-
-       /* wait for the producer to finish copying it in */
-       while (!bcq->wraps[__cons_pvt & (__num_elems-1)].rdy_for_cons)
-               cpu_relax();
-       *elem = bcq->wraps[__cons_pvt & (__num_elems-1)].elem;
-       bcq->wraps[__cons_pvt & (__num_elems-1)].rdy_for_cons = FALSE;
-       /* wait til we're the cons_pub, then advance it by one */
-       while (bcq->hdr.cons_pub_idx != __cons_pvt)
-               cpu_relax();
-       bcq->hdr.cons_pub_idx = __cons_pvt + 1;
-       return 0;
-}
-#endif
+/* Checks if a bcq is empty (meaning no work), instead of trying to dequeue */
+#define bcq_empty(_bcq)                                                        \
+       BCQ_NO_WORK((_bcq)->hdr.prod_idx, (_bcq)->hdr.cons_pvt_idx)
 
-#endif /* !ROS_INC_BCQ_H */
+#define bcq_nr_full(_bcq)                                                      \
+       ((_bcq)->hdr.prod_idx - (_bcq)->hdr.cons_pub_idx)