mlx4: Support TSO/GSO/LSO
[akaros.git] / kern / drivers / net / mlx4 / en_tx.c
index 7bed3a8..58b1a75 100644
  *
  */
 
-#include <asm/page.h>
 #include <linux/mlx4/cq.h>
-#include <linux/slab.h>
 #include <linux/mlx4/qp.h>
-#include <linux/skbuff.h>
-#include <linux/if_vlan.h>
-#include <linux/prefetch.h>
-#include <linux/vmalloc.h>
-#include <linux/tcp.h>
-#include <linux/ip.h>
-#include <linux/moduleparam.h>
-
 #include "mlx4_en.h"
 
 int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
-                          struct mlx4_en_tx_ring **pring, u32 size,
-                          u16 stride, int node, int queue_index)
+                          struct mlx4_en_tx_ring **pring, uint32_t size,
+                          uint16_t stride, int node, int queue_index)
 {
        struct mlx4_en_dev *mdev = priv->mdev;
        struct mlx4_en_tx_ring *ring;
        int tmp;
        int err;
 
-       ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
+       ring = kzalloc_node(sizeof(*ring), MEM_WAIT, node);
        if (!ring) {
-               ring = kzalloc(sizeof(*ring), GFP_KERNEL);
+               ring = kzmalloc(sizeof(*ring), MEM_WAIT);
                if (!ring) {
                        en_err(priv, "Failed allocating TX ring\n");
                        return -ENOMEM;
@@ -68,7 +58,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
        ring->stride = stride;
 
        tmp = size * sizeof(struct mlx4_en_tx_info);
-       ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
+       ring->tx_info = kmalloc_node(tmp, MEM_WAIT | __GFP_NOWARN, node);
        if (!ring->tx_info) {
                ring->tx_info = vmalloc(tmp);
                if (!ring->tx_info) {
@@ -80,9 +70,9 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
        en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
                 ring->tx_info, tmp);
 
-       ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
+       ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, MEM_WAIT, node);
        if (!ring->bounce_buf) {
-               ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
+               ring->bounce_buf = kmalloc(MAX_DESC_SIZE, MEM_WAIT);
                if (!ring->bounce_buf) {
                        err = -ENOMEM;
                        goto err_info;
@@ -119,15 +109,19 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
                goto err_map;
        }
 
-       err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
+       err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, MEM_WAIT);
        if (err) {
                en_err(priv, "Failed allocating qp %d\n", ring->qpn);
                goto err_reserve;
        }
        ring->qp.event = mlx4_en_sqp_event;
 
+#if 0 // AKAROS_PORT
        err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
        if (err) {
+#else
+       if (true) {
+#endif
                en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
                ring->bf.uar = &mdev->priv_uar;
                ring->bf.uar->map = mdev->uar_map;
@@ -161,7 +155,9 @@ err_bounce:
        kfree(ring->bounce_buf);
        ring->bounce_buf = NULL;
 err_info:
+#if 0 // AKAROS_PORT
        kvfree(ring->tx_info);
+#endif
        ring->tx_info = NULL;
 err_ring:
        kfree(ring);
@@ -172,6 +168,8 @@ err_ring:
 void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
                             struct mlx4_en_tx_ring **pring)
 {
+       panic("Disabled");
+#if 0 // AKAROS_PORT
        struct mlx4_en_dev *mdev = priv->mdev;
        struct mlx4_en_tx_ring *ring = *pring;
        en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);
@@ -188,6 +186,7 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
        ring->tx_info = NULL;
        kfree(ring);
        *pring = NULL;
+#endif
 }
 
 int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
@@ -215,9 +214,11 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
 
        err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
                               &ring->qp, &ring->qp_state);
+#if 0 // AKAROS_PORT
        if (!cpumask_empty(&ring->affinity_mask))
                netif_set_xps_queue(priv->dev, &ring->affinity_mask,
                                    ring->queue_index);
+#endif
 
        return err;
 }
@@ -233,7 +234,7 @@ void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
 
 static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
                              struct mlx4_en_tx_ring *ring, int index,
-                             u8 owner)
+                             uint8_t owner)
 {
        __be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
        struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
@@ -265,18 +266,19 @@ static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
 }
 
 
-static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+static uint32_t mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
                                struct mlx4_en_tx_ring *ring,
-                               int index, u8 owner, u64 timestamp)
+                               int index, uint8_t owner, uint64_t timestamp)
 {
        struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
        struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
        struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
        void *end = ring->buf + ring->buf_size;
-       struct sk_buff *skb = tx_info->skb;
+       struct block *block = tx_info->block;
        int nr_maps = tx_info->nr_maps;
        int i;
 
+#if 0 // AKAROS_PORT
        /* We do not touch skb here, so prefetch skb->users location
         * to speedup consume_skb()
         */
@@ -288,6 +290,7 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
                mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
                skb_tstamp_tx(skb, &hwts);
        }
+#endif
 
        /* Optimize the common case when there are no wraparounds */
        if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
@@ -338,13 +341,15 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
                        }
                }
        }
-       dev_consume_skb_any(skb);
+       freeb(block);
        return tx_info->nr_txbb;
 }
 
 
-int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
+int mlx4_en_free_tx_buf(struct ether *dev, struct mlx4_en_tx_ring *ring)
 {
+       panic("Disabled");
+#if 0 // AKAROS_PORT
        struct mlx4_en_priv *priv = netdev_priv(dev);
        int cnt = 0;
 
@@ -353,7 +358,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
        en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
                 ring->cons, ring->prod);
 
-       if ((u32) (ring->prod - ring->cons) > ring->size) {
+       if ((uint32_t) (ring->prod - ring->cons) > ring->size) {
                if (netif_msg_tx_err(priv))
                        en_warn(priv, "Tx consumer passed producer!\n");
                return 0;
@@ -373,36 +378,39 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
                en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);
 
        return cnt;
+#endif
 }
 
-static bool mlx4_en_process_tx_cq(struct net_device *dev,
-                                struct mlx4_en_cq *cq)
+static bool mlx4_en_process_tx_cq(struct ether *dev,
+                                 struct mlx4_en_cq *cq)
 {
        struct mlx4_en_priv *priv = netdev_priv(dev);
        struct mlx4_cq *mcq = &cq->mcq;
        struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
        struct mlx4_cqe *cqe;
-       u16 index;
-       u16 new_index, ring_index, stamp_index;
-       u32 txbbs_skipped = 0;
-       u32 txbbs_stamp = 0;
-       u32 cons_index = mcq->cons_index;
+       uint16_t index;
+       uint16_t new_index, ring_index, stamp_index;
+       uint32_t txbbs_skipped = 0;
+       uint32_t txbbs_stamp = 0;
+       uint32_t cons_index = mcq->cons_index;
        int size = cq->size;
-       u32 size_mask = ring->size_mask;
+       uint32_t size_mask = ring->size_mask;
        struct mlx4_cqe *buf = cq->buf;
-       u32 packets = 0;
-       u32 bytes = 0;
+       uint32_t packets = 0;
+       uint32_t bytes = 0;
        int factor = priv->cqe_factor;
-       u64 timestamp = 0;
+       uint64_t timestamp = 0;
        int done = 0;
        int budget = priv->tx_work_limit;
-       u32 last_nr_txbb;
-       u32 ring_cons;
+       uint32_t last_nr_txbb;
+       uint32_t ring_cons;
 
        if (!priv->port_up)
                return true;
 
+#if 0 // AKAROS_PORT
        netdev_txq_bql_complete_prefetchw(ring->tx_queue);
+#endif
 
        index = cons_index & size_mask;
        cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
@@ -418,7 +426,7 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
                 * make sure we read the CQE after we read the
                 * ownership bit
                 */
-               dma_rmb();
+               bus_rmb();
 
                if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
                             MLX4_CQE_OPCODE_ERROR)) {
@@ -471,6 +479,7 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
        ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
        ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
 
+#if 0 // AKAROS_PORT
        netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
 
        /*
@@ -481,66 +490,72 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
                netif_tx_wake_queue(ring->tx_queue);
                ring->wake_queue++;
        }
+#endif
        return done < budget;
 }
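
The completion path above and mlx4_en_free_tx_buf() both lean on free-running unsigned counters over a power-of-two ring: slots are picked with `cons_index & size_mask`, and occupancy is `prod - cons`, which stays correct even when the 32-bit counters wrap. A standalone illustration with made-up numbers (not driver code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t size = 1024, size_mask = size - 1;
	uint32_t cons = 0xfffffff0u;		/* consumer is about to wrap */
	uint32_t prod = cons + 20;		/* producer already wrapped past 0 */

	assert((uint32_t)(prod - cons) == 20);	/* occupancy survives the wrap */
	assert((cons & size_mask) == 0x3f0);	/* slot index within the ring */
	return 0;
}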
 
+static void mlx4_en_poll_tx_cq(uint32_t srcid, long a0, long a1, long a2);
+
 void mlx4_en_tx_irq(struct mlx4_cq *mcq)
 {
        struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
        struct mlx4_en_priv *priv = netdev_priv(cq->dev);
 
        if (likely(priv->port_up))
+#if 0 // AKAROS_PORT
                napi_schedule_irqoff(&cq->napi);
+#else
+               send_kernel_message(core_id(), mlx4_en_poll_tx_cq, (long)cq,
+                                   0, 0, KMSG_ROUTINE);
+#endif
        else
                mlx4_en_arm_cq(priv, cq);
 }
 
 /* TX CQ polling - called by NAPI */
-int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
+static void mlx4_en_poll_tx_cq(uint32_t srcid, long a0, long a1, long a2)
 {
-       struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
-       struct net_device *dev = cq->dev;
+       struct mlx4_en_cq *cq = (struct mlx4_en_cq *)a0;
+       struct ether *dev = cq->dev;
        struct mlx4_en_priv *priv = netdev_priv(dev);
        int clean_complete;
 
        clean_complete = mlx4_en_process_tx_cq(dev, cq);
        if (!clean_complete)
-               return budget;
+               return;
 
-       napi_complete(napi);
        mlx4_en_arm_cq(priv, cq);
-
-       return 0;
 }
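
With NAPI gone, the port defers TX completion work with a routine kernel message: mlx4_en_tx_irq() above punts to mlx4_en_poll_tx_cq() on the current core, which re-arms the CQ when it finishes under budget. A minimal sketch of that deferral pattern in isolation, assuming the Akaros kernel environment (headers omitted); send_kernel_message(), core_id(), and KMSG_ROUTINE are used exactly as above, while struct demo_dev and its fields are hypothetical.

struct demo_dev {
	unsigned long deferred_irqs;	/* work counted outside IRQ context */
};

/* Runs later as a routine kernel message on the core that took the IRQ. */
static void demo_irq_work(uint32_t srcid, long a0, long a1, long a2)
{
	struct demo_dev *dev = (struct demo_dev *)a0;

	dev->deferred_irqs++;	/* stand-in for draining a completion queue */
}

/* Runs in IRQ context: keep it short and punt the real work. */
static void demo_irq_handler(struct demo_dev *dev)
{
	send_kernel_message(core_id(), demo_irq_work, (long)dev, 0, 0,
			    KMSG_ROUTINE);
}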
 
 static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
                                                      struct mlx4_en_tx_ring *ring,
-                                                     u32 index,
+                                                     uint32_t index,
                                                      unsigned int desc_size)
 {
-       u32 copy = (ring->size - index) * TXBB_SIZE;
+       uint32_t copy = (ring->size - index) * TXBB_SIZE;
        int i;
 
        for (i = desc_size - copy - 4; i >= 0; i -= 4) {
                if ((i & (TXBB_SIZE - 1)) == 0)
                        wmb();
 
-               *((u32 *) (ring->buf + i)) =
-                       *((u32 *) (ring->bounce_buf + copy + i));
+               *((uint32_t *) (ring->buf + i)) =
+                       *((uint32_t *) (ring->bounce_buf + copy + i));
        }
 
        for (i = copy - 4; i >= 4 ; i -= 4) {
                if ((i & (TXBB_SIZE - 1)) == 0)
                        wmb();
 
-               *((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
-                       *((u32 *) (ring->bounce_buf + i));
+               *((uint32_t *) (ring->buf + index * TXBB_SIZE + i)) =
+                       *((uint32_t *) (ring->bounce_buf + i));
        }
 
        /* Return real descriptor location */
        return ring->buf + index * TXBB_SIZE;
 }
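
When a descriptor would run past the end of the ring, mlx4_send_packet() below builds it in ring->bounce_buf and mlx4_en_bounce_to_desc() copies it back in two pieces, wrapped part first. A standalone sketch of the split arithmetic with made-up numbers; TXBB_SIZE is assumed to be 64 bytes, matching mlx4_en.h.

#include <stdio.h>

#define TXBB_SIZE 64	/* assumed to match mlx4_en.h */

int main(void)
{
	unsigned ring_size = 256;		/* ring slots (TXBBs) */
	unsigned index = 254;			/* descriptor starts 2 slots from the end */
	unsigned desc_size = 3 * TXBB_SIZE;	/* descriptor spans 3 TXBBs */
	unsigned copy = (ring_size - index) * TXBB_SIZE;

	/* index + 3 > ring_size, so the descriptor is built in the bounce buf */
	printf("%u bytes stay at the ring tail, %u bytes wrap to the head\n",
	       copy, desc_size - copy);
	return 0;
}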
 
+#if 0 // AKAROS_PORT
 /* Decide if skb can be inlined in tx descriptor to avoid dma mapping
  *
  * It seems strange we do not simply use skb_copy_bits().
@@ -582,7 +597,7 @@ static int inline_size(const struct sk_buff *skb)
 
 static int get_real_size(const struct sk_buff *skb,
                         const struct skb_shared_info *shinfo,
-                        struct net_device *dev,
+                        struct ether *dev,
                         int *lso_header_size,
                         bool *inline_ok,
                         void **pfrag)
@@ -627,7 +642,7 @@ static int get_real_size(const struct sk_buff *skb,
 static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
                             const struct sk_buff *skb,
                             const struct skb_shared_info *shinfo,
-                            int real_size, u16 *vlan_tag,
+                            int real_size, uint16_t *vlan_tag,
                             int tx_ind, void *fragptr)
 {
        struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
@@ -669,17 +684,21 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
                                       skb_frag_size(&shinfo->frags[0]));
                }
 
-               dma_wmb();
+               bus_wmb();
                inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc));
        }
 }
+#endif
 
-u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
-                        void *accel_priv, select_queue_fallback_t fallback)
+uint16_t mlx4_en_select_queue(struct ether *dev, struct sk_buff *skb,
+                             void *accel_priv,
+                             select_queue_fallback_t fallback)
 {
+       panic("Disabled");
+#if 0 // AKAROS_PORT
        struct mlx4_en_priv *priv = netdev_priv(dev);
-       u16 rings_p_up = priv->num_tx_rings_p_up;
-       u8 up = 0;
+       uint16_t rings_p_up = priv->num_tx_rings_p_up;
+       uint8_t up = 0;
 
        if (dev->num_tc)
                return skb_tx_hash(dev, skb);
@@ -688,15 +707,254 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
                up = skb_vlan_tag_get(skb) >> VLAN_PRIO_SHIFT;
 
        return fallback(dev, skb) % rings_p_up + up * rings_p_up;
+#endif
 }
 
+#if 0 // AKAROS_PORT
 static void mlx4_bf_copy(void __iomem *dst, const void *src,
                         unsigned int bytecnt)
 {
        __iowrite64_copy(dst, src, bytecnt / 8);
 }
+#endif
 
-netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
+/* Akaros's TCP stack always puts the TCP header in the block main body, so we
+ * don't need to worry about those being in extra_data.  Linux stores the
+ * location of the transport header and whatnot in the SKB - we do the same. */
+static size_t get_lso_hdr_size(struct block *block)
+{
+       if (!(block->flag & Btso))
+               return 0;
+       return block->transport_header_end;
+}
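
get_lso_hdr_size() trusts only the block's own metadata, so whoever builds the packet must mark it before it reaches mlx4_send_packet(). A minimal sketch of what a sender is assumed to do; the helper name is hypothetical, while Btso, transport_header_end, and mss are the struct block fields used in this file.

/* Hypothetical helper: flag a fully built packet for TSO.  hdr_len is the
 * number of header bytes (Ethernet/IP/TCP) at the front of the block's main
 * body (starting at block->rp); mss is the per-segment payload size. */
static void mark_block_for_tso(struct block *block, size_t hdr_len,
			       uint16_t mss)
{
	block->flag |= Btso;			/* get_lso_hdr_size() keys off this */
	block->transport_header_end = hdr_len;	/* becomes lso_header_size */
	block->mss = mss;			/* copied into lso.mss_hdr_size */
}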
+
+netdev_tx_t mlx4_send_packet(struct block *block, struct ether *dev)
+{
+       struct mlx4_en_priv *priv = netdev_priv(dev);
+       struct mlx4_en_tx_ring *ring;
+       struct mlx4_en_tx_desc *tx_desc;
+       struct mlx4_wqe_data_seg *data;
+       struct mlx4_en_tx_info *tx_info;
+       int nr_txbb;
+       int desc_size;
+       int real_size;
+       uint32_t index;
+       __be32 op_own;
+       int i_frag;
+       int nr_frags = 0;
+       bool bounce = false;
+       dma_addr_t dma = 0;
+       uint32_t byte_count = 0;
+       size_t lso_header_size;
+       /* linear: there is payload to send from the block's main body, beyond
+        * the packet headers */
+       bool is_linear;
+
+       if (!priv->port_up)
+               goto tx_drop;
+
+       ring = priv->tx_ring[0]; /* TODO multi-queue support */
+
+       lso_header_size = get_lso_hdr_size(block);
+       for (i_frag = 0; i_frag < block->nr_extra_bufs; i_frag++) {
+               const struct extra_bdata *ebd;
+
+               ebd = &block->extra_data[i_frag];
+               if (ebd->base && ebd->len > 0)
+                       nr_frags++;
+       }
+       /* Transport stack should always put the packet headers in the main body. */
+       assert(!(lso_header_size > BHLEN(block)));
+       /* == means there is nothing in the block main body other than the
+        * headers, in which case we won't need an extra data_seg. */
+       is_linear = lso_header_size < BHLEN(block);
+
+       real_size = CTRL_SIZE + nr_frags * DS_SIZE;
+       if (is_linear)
+               real_size += DS_SIZE;
+       if (lso_header_size)
+               real_size += ALIGN(lso_header_size + 4, DS_SIZE);
+       if (unlikely(!real_size))
+               goto tx_drop;
+
+       /* Align descriptor to TXBB size */
+       desc_size = ALIGN(real_size, TXBB_SIZE);
+       nr_txbb = desc_size / TXBB_SIZE;
+       if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
+               en_warn(priv, "Oversized header or SG list\n");
+               goto tx_drop;
+       }
+
+       index = ring->prod & ring->size_mask;
+
+       /* See if we have enough space for whole descriptor TXBB for setting
+        * SW ownership on next descriptor; if not, use a bounce buffer. */
+       if (likely(index + nr_txbb <= ring->size))
+               tx_desc = ring->buf + index * TXBB_SIZE;
+       else {
+               tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
+               bounce = true;
+       }
+
+       /* Save skb in tx_info ring */
+       tx_info = &ring->tx_info[index];
+       tx_info->block = block;
+       tx_info->nr_txbb = nr_txbb;
+
+       data = &tx_desc->data;
+       if (lso_header_size)
+               data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
+                                                     DS_SIZE));
+
+       /* valid only for non-inline segments */
+       tx_info->data_offset = (void *)data - (void *)tx_desc;
+       tx_info->inl = 0;
+       tx_info->linear = is_linear ? 1 : 0;
+       tx_info->nr_maps = nr_frags + tx_info->linear;
+       data += tx_info->nr_maps - 1;
+
+       /* Map fragments if any */
+       for (i_frag = block->nr_extra_bufs - 1; i_frag >= 0; i_frag--) {
+               const struct extra_bdata *ebd;
+
+               ebd = &block->extra_data[i_frag];
+               if (!ebd->base || ebd->len <= 0)
+                       continue;
+
+               byte_count = ebd->len;
+               dma = dma_map_single(0, (void *)(ebd->base + ebd->off),
+                                    byte_count, DMA_TO_DEVICE);
+               if (unlikely(dma_mapping_error(0, dma)))
+                       goto tx_drop_unmap;
+
+               data->addr = cpu_to_be64(dma);
+               data->lkey = ring->mr_key;
+               bus_wmb();
+               data->byte_count = cpu_to_be32(byte_count);
+               --data;
+       }
+
+       if (tx_info->linear) {
+               byte_count = BHLEN(block) - lso_header_size;
+
+               dma = dma_map_single(0, block->rp + lso_header_size, byte_count,
+                                    DMA_TO_DEVICE);
+               if (unlikely(dma_mapping_error(0, dma)))
+                       goto tx_drop_unmap;
+
+               data->addr = cpu_to_be64(dma);
+               data->lkey = ring->mr_key;
+               bus_wmb();
+               data->byte_count = cpu_to_be32(byte_count);
+       }
+
+       /* tx completion can avoid cache line miss for common cases */
+       tx_info->map0_dma = dma;
+       tx_info->map0_byte_count = byte_count;
+
+       /* TX timestamping is not supported in this port, so never request a
+        * timestamp for this descriptor. */
+       tx_info->ts_requested = 0;
+
+       /* Prepare ctrl segment, apart from opcode+ownership */
+       tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
+       if (likely(block->flag & BCKSUM_FLAGS)) {
+               assert(block->flag & (Budpck | Btcpck));
+               tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+                                                        MLX4_WQE_CTRL_TCP_UDP_CSUM);
+               ring->tx_csum++;
+       }
+
+       if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
+               struct ethhdr *ethh;
+
+               /* Copy dst mac address to wqe. This allows loopback in eSwitch,
+                * so that VFs and PF can communicate with each other
+                */
+               ethh = (struct ethhdr *)block->rp;
+               tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest);
+               tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
+       }
+
+       /* Handle LSO (TSO) packets */
+       if (lso_header_size) {
+               int i;
+
+               /* Mark opcode as LSO */
+               op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
+                       ((ring->prod & ring->size) ?
+                               cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+               /* Fill in the LSO prefix */
+               tx_desc->lso.mss_hdr_size = cpu_to_be32(block->mss << 16 |
+                                                       lso_header_size);
+
+               /* Copy headers;
+                * note that we already verified that it is linear.
+                * brho - meaning the first lso_header_size bytes of the packet
+                * are in the main body, starting at block->rp. */
+               memcpy(tx_desc->lso.header, block->rp, lso_header_size);
+
+               ring->tso_packets++;
+
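+               /* i is the number of MSS-sized segments the NIC will emit;
+                * each segment carries its own copy of the headers, which is
+                * why nr_bytes is bumped by (i - 1) * lso_header_size below. */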
+               i = ((BLEN(block) - lso_header_size) / block->mss) +
+                       !!((BLEN(block) - lso_header_size) % block->mss);
+               tx_info->nr_bytes = BLEN(block) + (i - 1) * lso_header_size;
+               ring->packets += i;
+       } else {
+
+               /* Normal (Non LSO) packet */
+               op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+                       ((ring->prod & ring->size) ?
+                        cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+               tx_info->nr_bytes = MAX_T(unsigned int, BLEN(block), ETH_ZLEN);
+               ring->packets++;
+       }
+       ring->bytes += tx_info->nr_bytes;
+       AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, BLEN(block));
+
+       ring->prod += nr_txbb;
+
+       /* If we used a bounce buffer then copy descriptor back into place */
+       if (unlikely(bounce))
+               tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
+
+       real_size = (real_size / 16) & 0x3f; /* Clear fence bit. */
+
+       tx_desc->ctrl.vlan_tag = 0;
+       tx_desc->ctrl.ins_vlan = 0;
+       tx_desc->ctrl.fence_size = real_size;
+
+       /* Ensure new descriptor hits memory
+        * before setting ownership of this descriptor to HW
+        */
+       bus_wmb();
+       tx_desc->ctrl.owner_opcode = op_own;
+       wmb();
+       /* Since there is no iowrite*_native() that writes the
+        * value as is, without byteswapping - using the one
+        * that doesn't do byteswapping in the relevant arch
+        * endianness.
+        */
+#if defined(__LITTLE_ENDIAN)
+       write32(ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL);
+#else
+       iowrite32be(ring->doorbell_qpn,
+                   ring->bf.uar->map + MLX4_SEND_DOORBELL);
+#endif
+
+       return NETDEV_TX_OK;
+
+tx_drop_unmap:
+       en_err(priv, "DMA mapping error\n");
+
+tx_drop:
+       priv->stats.tx_dropped++;
+       return NETDEV_TX_OK;
+}
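
For context, a rough sketch of how the driver's transmit hook is assumed to feed this function: drain the device's output queue and post each block to the ring. qget() and edev->oq follow the Plan 9-style qio interface Akaros inherits; the wrapper name is hypothetical and real code would also need to handle a full TX ring.

/* Hypothetical transmit hook: pull blocks off the output queue and hand
 * each one to mlx4_send_packet(). */
static void mlx4_transmit_sketch(struct ether *edev)
{
	struct block *block;

	while ((block = qget(edev->oq)) != NULL)
		mlx4_send_packet(block, edev);
}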
+
+#if 0 // AKAROS_PORT
+netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct ether *dev)
 {
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -709,9 +967,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
        int nr_txbb;
        int desc_size;
        int real_size;
-       u32 index, bf_index;
+       uint32_t index, bf_index;
        __be32 op_own;
-       u16 vlan_tag = 0;
+       uint16_t vlan_tag = 0;
        int i_frag;
        int lso_header_size;
        void *fragptr = NULL;
@@ -719,7 +977,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
        bool send_doorbell;
        bool stop_queue;
        bool inline_ok;
-       u32 ring_cons;
+       uint32_t ring_cons;
 
        if (!priv->port_up)
                goto tx_drop;
@@ -752,7 +1010,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 
        /* Track current inflight packets for performance analysis */
        AVG_PERF_COUNTER(priv->pstats.inflight_avg,
-                        (u32)(ring->prod - ring_cons - 1));
+                        (uint32_t)(ring->prod - ring_cons - 1));
 
        /* Packet is good - grab an index and transmit it */
        index = ring->prod & ring->size_mask;
@@ -790,7 +1048,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 
        if (!tx_info->inl) {
                dma_addr_t dma = 0;
-               u32 byte_count = 0;
+               uint32_t byte_count = 0;
 
                /* Map fragments if any */
                for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
@@ -806,7 +1064,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 
                        data->addr = cpu_to_be64(dma);
                        data->lkey = ring->mr_key;
-                       dma_wmb();
+                       bus_wmb();
                        data->byte_count = cpu_to_be32(byte_count);
                        --data;
                }
@@ -823,7 +1081,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 
                        data->addr = cpu_to_be64(dma);
                        data->lkey = ring->mr_key;
-                       dma_wmb();
+                       bus_wmb();
                        data->byte_count = cpu_to_be32(byte_count);
                }
                /* tx completion can avoid cache line miss for common cases */
@@ -893,7 +1151,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
                op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
                        ((ring->prod & ring->size) ?
                         cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
-               tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
+               tx_info->nr_bytes = MAX_T(unsigned int, skb->len, ETH_ZLEN);
                ring->packets++;
        }
        ring->bytes += tx_info->nr_bytes;
@@ -936,11 +1194,11 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
                tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
                                       cpu_to_be32(real_size);
 
-               op_own |= htonl((bf_index & 0xffff) << 8);
+               op_own |= cpu_to_be32((bf_index & 0xffff) << 8);
                /* Ensure new descriptor hits memory
                 * before setting ownership of this descriptor to HW
                 */
-               dma_wmb();
+               bus_wmb();
                tx_desc->ctrl.owner_opcode = op_own;
 
                wmb();
@@ -960,7 +1218,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
                /* Ensure new descriptor hits memory
                 * before setting ownership of this descriptor to HW
                 */
-               dma_wmb();
+               bus_wmb();
                tx_desc->ctrl.owner_opcode = op_own;
                if (send_doorbell) {
                        wmb();
@@ -988,7 +1246,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
                 * Need a memory barrier to make sure ring->cons was not
                 * updated before queue was stopped.
                 */
-               smp_rmb();
+               rmb();
 
                ring_cons = ACCESS_ONCE(ring->cons);
                if (unlikely(((int)(ring->prod - ring_cons)) <=
@@ -1014,4 +1272,5 @@ tx_drop:
        priv->stats.tx_dropped++;
        return NETDEV_TX_OK;
 }
+#endif