mlx4: Support TSO/GSO/LSO
diff --git a/kern/drivers/net/mlx4/en_tx.c b/kern/drivers/net/mlx4/en_tx.c
index 1ece711..58b1a75 100644
--- a/kern/drivers/net/mlx4/en_tx.c
+++ b/kern/drivers/net/mlx4/en_tx.c
@@ -44,9 +44,9 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
        int tmp;
        int err;
 
-       ring = kzalloc_node(sizeof(*ring), KMALLOC_WAIT, node);
+       ring = kzalloc_node(sizeof(*ring), MEM_WAIT, node);
        if (!ring) {
-               ring = kzmalloc(sizeof(*ring), KMALLOC_WAIT);
+               ring = kzmalloc(sizeof(*ring), MEM_WAIT);
                if (!ring) {
                        en_err(priv, "Failed allocating TX ring\n");
                        return -ENOMEM;
@@ -58,7 +58,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
        ring->stride = stride;
 
        tmp = size * sizeof(struct mlx4_en_tx_info);
-       ring->tx_info = kmalloc_node(tmp, KMALLOC_WAIT | __GFP_NOWARN, node);
+       ring->tx_info = kmalloc_node(tmp, MEM_WAIT | __GFP_NOWARN, node);
        if (!ring->tx_info) {
                ring->tx_info = vmalloc(tmp);
                if (!ring->tx_info) {
@@ -70,9 +70,9 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
        en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
                 ring->tx_info, tmp);
 
-       ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, KMALLOC_WAIT, node);
+       ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, MEM_WAIT, node);
        if (!ring->bounce_buf) {
-               ring->bounce_buf = kmalloc(MAX_DESC_SIZE, KMALLOC_WAIT);
+               ring->bounce_buf = kmalloc(MAX_DESC_SIZE, MEM_WAIT);
                if (!ring->bounce_buf) {
                        err = -ENOMEM;
                        goto err_info;
@@ -109,7 +109,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
                goto err_map;
        }
 
-       err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, KMALLOC_WAIT);
+       err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, MEM_WAIT);
        if (err) {
                en_err(priv, "Failed allocating qp %d\n", ring->qpn);
                goto err_reserve;
@@ -494,6 +494,8 @@ static bool mlx4_en_process_tx_cq(struct ether *dev,
        return done < budget;
 }
 
+static void mlx4_en_poll_tx_cq(uint32_t srcid, long a0, long a1, long a2);
+
 void mlx4_en_tx_irq(struct mlx4_cq *mcq)
 {
        struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
@@ -503,28 +505,26 @@ void mlx4_en_tx_irq(struct mlx4_cq *mcq)
 #if 0 // AKAROS_PORT
                napi_schedule_irqoff(&cq->napi);
 #else
-               { /* TODO */ }
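+               /* Akaros: no NAPI here; run the TX completion poll via a
+                * routine kernel message on this core. */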
+               send_kernel_message(core_id(), mlx4_en_poll_tx_cq, (long)cq,
+                                   0, 0, KMSG_ROUTINE);
 #endif
        else
                mlx4_en_arm_cq(priv, cq);
 }
 
 /* TX CQ polling - called by NAPI */
-int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
+static void mlx4_en_poll_tx_cq(uint32_t srcid, long a0, long a1, long a2)
 {
-       struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
+       struct mlx4_en_cq *cq = (struct mlx4_en_cq *)a0;
        struct ether *dev = cq->dev;
        struct mlx4_en_priv *priv = netdev_priv(dev);
        int clean_complete;
 
        clean_complete = mlx4_en_process_tx_cq(dev, cq);
        if (!clean_complete)
-               return budget;
+               return;
 
-       napi_complete(napi);
        mlx4_en_arm_cq(priv, cq);
-
-       return 0;
 }
 
 static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
@@ -718,6 +718,241 @@ static void mlx4_bf_copy(void __iomem *dst, const void *src,
 }
 #endif
 
+/* Akaros's TCP stack always puts the TCP header in the block main body, so we
+ * don't need to worry about those being in extra_data.  Linux stores the
+ * location of the transport header and whatnot in the SKB - we do the same. */
+static size_t get_lso_hdr_size(struct block *block)
+{
+       if (!(block->flag & Btso))
+               return 0;
+       return block->transport_header_end;
+}
+
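+/* Akaros TX entry point: build a single WQE (ctrl segment, optional LSO
+ * segment, then data segments) for the block and ring the doorbell. */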
+netdev_tx_t mlx4_send_packet(struct block *block, struct ether *dev)
+{
+       struct mlx4_en_priv *priv = netdev_priv(dev);
+       struct mlx4_en_tx_ring *ring;
+       struct mlx4_en_tx_desc *tx_desc;
+       struct mlx4_wqe_data_seg *data;
+       struct mlx4_en_tx_info *tx_info;
+       int nr_txbb;
+       int desc_size;
+       int real_size;
+       uint32_t index;
+       __be32 op_own;
+       int i_frag;
+       int nr_frags = 0;
+       bool bounce = false;
+       dma_addr_t dma = 0;
+       uint32_t byte_count = 0;
+       size_t lso_header_size;
+       /* "linear" means there is payload to send from the block main body,
+        * beyond the headers */
+       bool is_linear;
+
+       if (!priv->port_up)
+               goto tx_drop;
+
+       ring = priv->tx_ring[0]; /* TODO multi-queue support */
+
+       lso_header_size = get_lso_hdr_size(block);
+       for (i_frag = 0; i_frag < block->nr_extra_bufs; i_frag++) {
+               const struct extra_bdata *ebd;
+
+               ebd = &block->extra_data[i_frag];
+               if (ebd->base && ebd->len > 0)
+                       nr_frags++;
+       }
+       /* Transport stack should always put the packet headers in the main body. */
+       assert(!(lso_header_size > BHLEN(block)));
+       /* Equality means the block main body holds nothing beyond the headers,
+        * in which case we won't need an extra data_seg. */
+       is_linear = lso_header_size < BHLEN(block);
+
+       real_size = CTRL_SIZE + nr_frags * DS_SIZE;
+       if (is_linear)
+               real_size += DS_SIZE;
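+       /* The LSO segment holds a 4-byte mss_hdr_size word followed by the
+        * inlined headers, padded out to a multiple of DS_SIZE. */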
+       if (lso_header_size)
+               real_size += ALIGN(lso_header_size + 4, DS_SIZE);
+       if (unlikely(!real_size))
+               goto tx_drop;
+
+       /* Align descriptor to TXBB size */
+       desc_size = ALIGN(real_size, TXBB_SIZE);
+       nr_txbb = desc_size / TXBB_SIZE;
+       if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
+               en_warn(priv, "Oversized header or SG list\n");
+               goto tx_drop;
+       }
+
+       index = ring->prod & ring->size_mask;
+
+       /* See if we have enough space for whole descriptor TXBB for setting
+        * SW ownership on next descriptor; if not, use a bounce buffer. */
+       if (likely(index + nr_txbb <= ring->size))
+               tx_desc = ring->buf + index * TXBB_SIZE;
+       else {
+               tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
+               bounce = true;
+       }
+
+       /* Save skb in tx_info ring */
+       tx_info = &ring->tx_info[index];
+       tx_info->block = block;
+       tx_info->nr_txbb = nr_txbb;
+
+       data = &tx_desc->data;
+       if (lso_header_size)
+               data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
+                                                     DS_SIZE));
+
+       /* valid only for non-inline segments */
+       tx_info->data_offset = (void *)data - (void *)tx_desc;
+       tx_info->inl = 0;
+       tx_info->linear = is_linear ? 1 : 0;
+       tx_info->nr_maps = nr_frags + tx_info->linear;
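+       /* Data segments are filled back to front, so point at the last slot. */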
+       data += tx_info->nr_maps - 1;
+
+       /* Map fragments if any */
+       for (i_frag = block->nr_extra_bufs - 1; i_frag >= 0; i_frag--) {
+               const struct extra_bdata *ebd;
+
+               ebd = &block->extra_data[i_frag];
+               if (!ebd->base || ebd->len <= 0)
+                       continue;
+
+               byte_count = ebd->len;
+               dma = dma_map_single(0, (void *)(ebd->base + ebd->off),
+                                    byte_count, DMA_TO_DEVICE);
+               if (unlikely(dma_mapping_error(0, dma)))
+                       goto tx_drop_unmap;
+
+               data->addr = cpu_to_be64(dma);
+               data->lkey = ring->mr_key;
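+               /* Order addr/lkey before byte_count, so the HW never sees a
+                * valid byte count paired with a stale address. */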
+               bus_wmb();
+               data->byte_count = cpu_to_be32(byte_count);
+               --data;
+       }
+
+       if (tx_info->linear) {
+               byte_count = BHLEN(block) - lso_header_size;
+
+               dma = dma_map_single(0, block->rp + lso_header_size, byte_count,
+                                    DMA_TO_DEVICE);
+               if (unlikely(dma_mapping_error(0, dma)))
+                       goto tx_drop_unmap;
+
+               data->addr = cpu_to_be64(dma);
+               data->lkey = ring->mr_key;
+               bus_wmb();
+               data->byte_count = cpu_to_be32(byte_count);
+       }
+
+       /* tx completion can avoid cache line miss for common cases */
+       tx_info->map0_dma = dma;
+       tx_info->map0_byte_count = byte_count;
+
+       /* This port does not support TX timestamping; leave the flag clear. */
+       tx_info->ts_requested = 0;
+
+       /* Prepare ctrl segment, apart from opcode and ownership */
+       tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
+       if (likely(block->flag & BCKSUM_FLAGS)) {
+               assert(block->flag & (Budpck | Btcpck));
+               tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+                                                        MLX4_WQE_CTRL_TCP_UDP_CSUM);
+               ring->tx_csum++;
+       }
+
+       if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
+               struct ethhdr *ethh;
+
+               /* Copy dst mac address to wqe. This allows loopback in eSwitch,
+                * so that VFs and PF can communicate with each other
+                */
+               ethh = (struct ethhdr *)block->rp;
+               tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest);
+               tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
+       }
+
+       /* Handle LSO (TSO) packets */
+       if (lso_header_size) {
+               int i;
+
+               /* Mark opcode as LSO */
+               op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
+                       ((ring->prod & ring->size) ?
+                               cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+               /* Fill in the LSO prefix */
+               tx_desc->lso.mss_hdr_size = cpu_to_be32(block->mss << 16 |
+                                                       lso_header_size);
+
+               /* Copy headers;
+                * note that we already verified that they are linear.
+                * brho - meaning the first lso_header_size bytes sit in the
+                * block main body, starting at block->rp. */
+               memcpy(tx_desc->lso.header, block->rp, lso_header_size);
+
+               ring->tso_packets++;
+
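+               /* i = number of MSS-sized segments the HW will emit; count the
+                * headers that get replicated onto each extra segment. */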
+               i = ((BLEN(block) - lso_header_size) / block->mss) +
+                       !!((BLEN(block) - lso_header_size) % block->mss);
+               tx_info->nr_bytes = BLEN(block) + (i - 1) * lso_header_size;
+               ring->packets += i;
+       } else {
+
+               /* Normal (Non LSO) packet */
+               op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+                       ((ring->prod & ring->size) ?
+                        cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+               tx_info->nr_bytes = MAX_T(unsigned int, BLEN(block), ETH_ZLEN);
+               ring->packets++;
+       }
+       ring->bytes += tx_info->nr_bytes;
+       AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, BLEN(block));
+
+       ring->prod += nr_txbb;
+
+       /* If we used a bounce buffer then copy descriptor back into place */
+       if (unlikely(bounce))
+               tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
+
+       real_size = (real_size / 16) & 0x3f; /* Clear fence bit. */
+
+       tx_desc->ctrl.vlan_tag = 0;
+       tx_desc->ctrl.ins_vlan = 0;
+       tx_desc->ctrl.fence_size = real_size;
+
+       /* Ensure new descriptor hits memory
+        * before setting ownership of this descriptor to HW
+        */
+       bus_wmb();
+       tx_desc->ctrl.owner_opcode = op_own;
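+       /* Order the ownership update before the doorbell MMIO write below. */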
+       wmb();
+       /* Since there is no iowrite*_native() that writes the
+        * value as is, without byteswapping - using the one
+        * that doesn't do byteswapping in the relevant arch
+        * endianness.
+        */
+#if defined(__LITTLE_ENDIAN)
+       write32(ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL);
+#else
+       iowrite32be(ring->doorbell_qpn,
+                   ring->bf.uar->map + MLX4_SEND_DOORBELL);
+#endif
+
+       return NETDEV_TX_OK;
+
+tx_drop_unmap:
+       en_err(priv, "DMA mapping error\n");
+
+tx_drop:
+       priv->stats.tx_dropped++;
+       return NETDEV_TX_OK;
+}
+
 #if 0 // AKAROS_PORT
 netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct ether *dev)
 {