mlx4: Support TSO/GSO/LSO
Author:     Barret Rhoden <brho@cs.berkeley.edu>
AuthorDate: Tue, 6 Jun 2017 17:01:42 +0000 (13:01 -0400)
Committer:  Barret Rhoden <brho@cs.berkeley.edu>
CommitDate: Tue, 6 Jun 2017 17:01:42 +0000 (13:01 -0400)
The changes were taken from the Linux transmit function.  I added back in
the 'linear' checks (meaning, is there data in the block main body),
ignored the 'inline' stuff, and turned on LSO (a.k.a. TSO/GSO).

Also, there was a bug with the xsum flags.  Bipck is never set by the
stack, so we weren't setting a flag that Linux always sets.  We do the IP
header xsum in software - it's just 20 bytes.  However, the Bipck flag is
used by drivers to tell the stack that the NIC did the IP header xsum.
That's just the way it works for Plan 9.  This all popped up because LSO'd
packets had incorrect header xsums.

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
kern/drivers/net/mlx4/en_netdev.c
kern/drivers/net/mlx4/en_tx.c

index 777c382..984a117 100644 (file)
@@ -2995,7 +2995,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
                        NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX |
                        NETIF_F_HW_VLAN_CTAG_FILTER;
 #else
-       dev->feat = NETIF_F_SG | NETIF_F_IP_CSUM | NETF_PADMIN;
+       dev->feat = dev->hw_features | NETIF_F_SG | NETIF_F_IP_CSUM | NETF_PADMIN;
 #endif
        dev->hw_features |= NETIF_F_LOOPBACK |
                        NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
index 1204c28..58b1a75 100644 (file)
@@ -718,6 +718,16 @@ static void mlx4_bf_copy(void __iomem *dst, const void *src,
 }
 #endif
 
+/* Akaros's TCP stack always puts the TCP header in the block main body, so we
+ * don't need to worry about those being in extra_data.  Linux stores the
+ * location of the transport header and whatnot in the SKB - we do the same. */
+static size_t get_lso_hdr_size(struct block *block)
+{
+       if (!(block->flag & Btso))
+               return 0;
+       return block->transport_header_end;
+}
+
 netdev_tx_t mlx4_send_packet(struct block *block, struct ether *dev)
 {
        struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -735,12 +745,16 @@ netdev_tx_t mlx4_send_packet(struct block *block, struct ether *dev)
        bool bounce = false;
        dma_addr_t dma = 0;
        uint32_t byte_count = 0;
+       size_t lso_header_size;
+       /* 'linear' means there is something to send in the block header */
+       bool is_linear;
 
        if (!priv->port_up)
                goto tx_drop;
 
        ring = priv->tx_ring[0]; /* TODO multi-queue support */
 
+       lso_header_size = get_lso_hdr_size(block);
        for (i_frag = 0; i_frag < block->nr_extra_bufs; i_frag++) {
                const struct extra_bdata *ebd;
 
@@ -748,8 +762,17 @@ netdev_tx_t mlx4_send_packet(struct block *block, struct ether *dev)
                if (ebd->base && ebd->len > 0)
                        nr_frags++;
        }
-
-       real_size = CTRL_SIZE + (nr_frags + 1) * DS_SIZE;
+       /* Transport stack should always put the packet headers in the main body. */
+       assert(!(lso_header_size > BHLEN(block)));
+       /* == means there is nothing in the block main body other than the headers.
+        * in which case, we won't need an extra data_seg. */
+       is_linear = lso_header_size < BHLEN(block);
+
+       real_size = CTRL_SIZE + nr_frags * DS_SIZE;
+       if (is_linear)
+               real_size += DS_SIZE;
+       if (lso_header_size)
+               real_size += ALIGN(lso_header_size + 4, DS_SIZE);
        if (unlikely(!real_size))
                goto tx_drop;
 
@@ -778,12 +801,15 @@ netdev_tx_t mlx4_send_packet(struct block *block, struct ether *dev)
        tx_info->nr_txbb = nr_txbb;
 
        data = &tx_desc->data;
+       if (lso_header_size)
+               data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
+                                                     DS_SIZE));
 
        /* valid only for none inline segments */
        tx_info->data_offset = (void *)data - (void *)tx_desc;
        tx_info->inl = 0;
-       tx_info->linear = 1;
-       tx_info->nr_maps = nr_frags + 1;
+       tx_info->linear = is_linear ? 1 : 0;
+       tx_info->nr_maps = nr_frags + tx_info->linear;
        data += tx_info->nr_maps - 1;
 
        /* Map fragments if any */
@@ -807,16 +833,19 @@ netdev_tx_t mlx4_send_packet(struct block *block, struct ether *dev)
                --data;
        }
 
-       byte_count = BHLEN(block);
+       if (tx_info->linear) {
+               byte_count = BHLEN(block) - lso_header_size;
 
-       dma = dma_map_single(0, block->rp, byte_count, DMA_TO_DEVICE);
-       if (unlikely(dma_mapping_error(0, dma)))
-               goto tx_drop_unmap;
+               dma = dma_map_single(0, block->rp + lso_header_size, byte_count,
+                                    DMA_TO_DEVICE);
+               if (unlikely(dma_mapping_error(0, dma)))
+                       goto tx_drop_unmap;
 
-       data->addr = cpu_to_be64(dma);
-       data->lkey = ring->mr_key;
-       bus_wmb();
-       data->byte_count = cpu_to_be32(byte_count);
+               data->addr = cpu_to_be64(dma);
+               data->lkey = ring->mr_key;
+               bus_wmb();
+               data->byte_count = cpu_to_be32(byte_count);
+       }
 
        /* tx completion can avoid cache line miss for common cases */
        tx_info->map0_dma = dma;
@@ -831,10 +860,10 @@ netdev_tx_t mlx4_send_packet(struct block *block, struct ether *dev)
        /* Prepare ctrl segement apart opcode+ownership */
        tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
        if (likely(block->flag & BCKSUM_FLAGS)) {
-               if (block->flag & Bipck)
-                       tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
-               if (block->flag & (Budpck | Btcpck))
-                       tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+               assert(block->flag & (Budpck | Btcpck));
+               tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+                                                        MLX4_WQE_CTRL_TCP_UDP_CSUM);
+               ring->tx_csum++;
        }
 
        if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
@@ -848,12 +877,39 @@ netdev_tx_t mlx4_send_packet(struct block *block, struct ether *dev)
                tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
        }
 
-       /* Normal (Non LSO) packet */
-       op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
-               ((ring->prod & ring->size) ?
-                cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
-       tx_info->nr_bytes = MAX_T(unsigned int, BLEN(block), ETH_ZLEN);
-       ring->packets++;
+       /* Handle LSO (TSO) packets */
+       if (lso_header_size) {
+               int i;
+
+               /* Mark opcode as LSO */
+               op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
+                       ((ring->prod & ring->size) ?
+                               cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+               /* Fill in the LSO prefix */
+               tx_desc->lso.mss_hdr_size = cpu_to_be32(block->mss << 16 |
+                                                       lso_header_size);
+
+               /* Copy headers;
+                * note that we already verified that it is linear.
+                * brho - meaning that the lso_header_size is within block->rp. */
+               memcpy(tx_desc->lso.header, block->rp, lso_header_size);
+
+               ring->tso_packets++;
+
+               i = ((BLEN(block) - lso_header_size) / block->mss) +
+                       !!((BLEN(block) - lso_header_size) % block->mss);
+               tx_info->nr_bytes = BLEN(block) + (i - 1) * lso_header_size;
+               ring->packets += i;
+       } else {
+
+               /* Normal (Non LSO) packet */
+               op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+                       ((ring->prod & ring->size) ?
+                        cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+               tx_info->nr_bytes = MAX_T(unsigned int, BLEN(block), ETH_ZLEN);
+               ring->packets++;
+       }
        ring->bytes += tx_info->nr_bytes;
        AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, BLEN(block));