Skip to content

Commit 88a85f9

Browse files
Achiad Shochat authored and davem330 committed
net/mlx5e: TX latency optimization to save DMA reads
A regular TX WQE execution involves two or more DMA reads - one to fetch the WQE, and another one per WQE gather entry. These DMA reads obviously increase the TX latency.

There are two mlx5 mechanisms to bypass these DMA reads:
1) Inline WQE
2) Blue Flame (BF)

An inline WQE contains a whole packet, thus saves the DMA read/s of the regular WQE gather entry/s. Inline WQE support was already added in the previous commit.

A BF WQE is written directly to the device I/O mapped memory, thus enables saving the DMA read that fetches the WQE. The BF WQE I/O write must be in cache line granularity, thus uses the CPU write combining mechanism. A BF WQE I/O write acts also as a TX doorbell for notifying the device of new TX WQEs. A BF WQE is written to the same I/O mapped address as the regular TX doorbell, thus this address is being mapped twice - once by ioremap() and once by io_mapping_map_wc().

While both mechanisms reduce the TX latency, they both consume more CPU cycles than a regular WQE:
- A BF WQE must still be written to host memory, in addition to being written directly to the device I/O mapped memory.
- An inline WQE involves copying the SKB data into it.

To handle this tradeoff, we introduce here a heuristic algorithm that strives to avoid using these two mechanisms in case the TX queue is being back-pressured by the device, and limit their usage rate otherwise. An inline WQE will always be "Blue Flamed" (written directly to the device I/O mapped memory) while a BF WQE may not be inlined (may contain gather entries).

Preliminary testing using netperf UDP_RR shows that the latency goes down from 17.5us to 16.9us, while the message rate (tested with pktgen) stays the same.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 58d5229 commit 88a85f9

File tree

6 files changed

+79
-19
lines changed

6 files changed

+79
-19
lines changed

drivers/net/ethernet/mellanox/mlx5/core/en.h

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060

6161
#define MLX5E_TX_CQ_POLL_BUDGET 128
6262
#define MLX5E_UPDATE_STATS_INTERVAL 200 /* msecs */
63+
#define MLX5E_SQ_BF_BUDGET 16
6364

6465
static const char vport_strings[][ETH_GSTRING_LEN] = {
6566
/* vport statistics */
@@ -268,7 +269,9 @@ struct mlx5e_sq {
268269
/* dirtied @xmit */
269270
u16 pc ____cacheline_aligned_in_smp;
270271
u32 dma_fifo_pc;
271-
u32 bf_offset;
272+
u16 bf_offset;
273+
u16 prev_cc;
274+
u8 bf_budget;
272275
struct mlx5e_sq_stats stats;
273276

274277
struct mlx5e_cq cq;
@@ -281,9 +284,10 @@ struct mlx5e_sq {
281284
struct mlx5_wq_cyc wq;
282285
u32 dma_fifo_mask;
283286
void __iomem *uar_map;
287+
void __iomem *uar_bf_map;
284288
struct netdev_queue *txq;
285289
u32 sqn;
286-
u32 bf_buf_size;
290+
u16 bf_buf_size;
287291
u16 max_inline;
288292
u16 edge;
289293
struct device *pdev;
@@ -493,8 +497,10 @@ int mlx5e_update_priv_params(struct mlx5e_priv *priv,
493497
struct mlx5e_params *new_params);
494498

495499
static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
496-
struct mlx5e_tx_wqe *wqe)
500+
struct mlx5e_tx_wqe *wqe, int bf_sz)
497501
{
502+
u16 ofst = MLX5_BF_OFFSET + sq->bf_offset;
503+
498504
/* ensure wqe is visible to device before updating doorbell record */
499505
dma_wmb();
500506

@@ -505,9 +511,15 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
505511
*/
506512
wmb();
507513

508-
mlx5_write64((__be32 *)&wqe->ctrl,
509-
sq->uar_map + MLX5_BF_OFFSET + sq->bf_offset,
510-
NULL);
514+
if (bf_sz) {
515+
__iowrite64_copy(sq->uar_bf_map + ofst, &wqe->ctrl, bf_sz);
516+
517+
/* flush the write-combining mapped buffer */
518+
wmb();
519+
520+
} else {
521+
mlx5_write64((__be32 *)&wqe->ctrl, sq->uar_map + ofst, NULL);
522+
}
511523

512524
sq->bf_offset ^= sq->bf_buf_size;
513525
}

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
514514

515515
sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
516516
sq->uar_map = sq->uar.map;
517+
sq->uar_bf_map = sq->uar.bf_map;
517518
sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
518519
sq->max_inline = param->max_inline;
519520

@@ -524,11 +525,12 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
524525
txq_ix = c->ix + tc * priv->params.num_channels;
525526
sq->txq = netdev_get_tx_queue(priv->netdev, txq_ix);
526527

527-
sq->pdev = c->pdev;
528-
sq->mkey_be = c->mkey_be;
529-
sq->channel = c;
530-
sq->tc = tc;
531-
sq->edge = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
528+
sq->pdev = c->pdev;
529+
sq->mkey_be = c->mkey_be;
530+
sq->channel = c;
531+
sq->tc = tc;
532+
sq->edge = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
533+
sq->bf_budget = MLX5E_SQ_BF_BUDGET;
532534
priv->txq_to_sq_map[txq_ix] = sq;
533535

534536
return 0;

drivers/net/ethernet/mellanox/mlx5/core/en_tx.c

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw)
5757

5858
if (notify_hw) {
5959
cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
60-
mlx5e_tx_notify_hw(sq, wqe);
60+
mlx5e_tx_notify_hw(sq, wqe, 0);
6161
}
6262
}
6363

@@ -110,15 +110,15 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
110110
}
111111

112112
static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
113-
struct sk_buff *skb)
113+
struct sk_buff *skb, bool bf)
114114
{
115115
/* Some NIC TX decisions, e.g loopback, are based on the packet
116116
* headers and occur before the data gather.
117117
* Therefore these headers must be copied into the WQE
118118
*/
119119
#define MLX5E_MIN_INLINE (ETH_HLEN + 2/*vlan tag*/)
120120

121-
if (skb_headlen(skb) <= sq->max_inline)
121+
if (bf && (skb_headlen(skb) <= sq->max_inline))
122122
return skb_headlen(skb);
123123

124124
return MLX5E_MIN_INLINE;
@@ -137,6 +137,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
137137

138138
u8 opcode = MLX5_OPCODE_SEND;
139139
dma_addr_t dma_addr = 0;
140+
bool bf = false;
140141
u16 headlen;
141142
u16 ds_cnt;
142143
u16 ihs;
@@ -149,6 +150,11 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
149150
else
150151
sq->stats.csum_offload_none++;
151152

153+
if (sq->cc != sq->prev_cc) {
154+
sq->prev_cc = sq->cc;
155+
sq->bf_budget = (sq->cc == sq->pc) ? MLX5E_SQ_BF_BUDGET : 0;
156+
}
157+
152158
if (skb_is_gso(skb)) {
153159
u32 payload_len;
154160

@@ -161,7 +167,10 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
161167
sq->stats.tso_packets++;
162168
sq->stats.tso_bytes += payload_len;
163169
} else {
164-
ihs = mlx5e_get_inline_hdr_size(sq, skb);
170+
bf = sq->bf_budget &&
171+
!skb->xmit_more &&
172+
!skb_shinfo(skb)->nr_frags;
173+
ihs = mlx5e_get_inline_hdr_size(sq, skb, bf);
165174
MLX5E_TX_SKB_CB(skb)->num_bytes = max_t(unsigned int, skb->len,
166175
ETH_ZLEN);
167176
}
@@ -233,14 +242,21 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
233242
}
234243

235244
if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) {
245+
int bf_sz = 0;
246+
247+
if (bf && sq->uar_bf_map)
248+
bf_sz = MLX5E_TX_SKB_CB(skb)->num_wqebbs << 3;
249+
236250
cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
237-
mlx5e_tx_notify_hw(sq, wqe);
251+
mlx5e_tx_notify_hw(sq, wqe, bf_sz);
238252
}
239253

240254
/* fill sq edge with nops to avoid wqe wrap around */
241255
while ((sq->pc & wq->sz_m1) > sq->edge)
242256
mlx5e_send_nop(sq, false);
243257

258+
sq->bf_budget = bf ? sq->bf_budget - 1 : 0;
259+
244260
sq->stats.packets++;
245261
return NETDEV_TX_OK;
246262

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,22 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
654654
}
655655
#endif
656656

657+
static int map_bf_area(struct mlx5_core_dev *dev)
658+
{
659+
resource_size_t bf_start = pci_resource_start(dev->pdev, 0);
660+
resource_size_t bf_len = pci_resource_len(dev->pdev, 0);
661+
662+
dev->priv.bf_mapping = io_mapping_create_wc(bf_start, bf_len);
663+
664+
return dev->priv.bf_mapping ? 0 : -ENOMEM;
665+
}
666+
667+
static void unmap_bf_area(struct mlx5_core_dev *dev)
668+
{
669+
if (dev->priv.bf_mapping)
670+
io_mapping_free(dev->priv.bf_mapping);
671+
}
672+
657673
static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
658674
{
659675
struct mlx5_priv *priv = &dev->priv;
@@ -808,10 +824,13 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
808824
goto err_stop_eqs;
809825
}
810826

827+
if (map_bf_area(dev))
828+
dev_err(&pdev->dev, "Failed to map blue flame area\n");
829+
811830
err = mlx5_irq_set_affinity_hints(dev);
812831
if (err) {
813832
dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
814-
goto err_free_comp_eqs;
833+
goto err_unmap_bf_area;
815834
}
816835

817836
MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
@@ -823,7 +842,9 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
823842

824843
return 0;
825844

826-
err_free_comp_eqs:
845+
err_unmap_bf_area:
846+
unmap_bf_area(dev);
847+
827848
free_comp_eqs(dev);
828849

829850
err_stop_eqs:
@@ -881,6 +902,7 @@ static void mlx5_dev_cleanup(struct mlx5_core_dev *dev)
881902
mlx5_cleanup_qp_table(dev);
882903
mlx5_cleanup_cq_table(dev);
883904
mlx5_irq_clear_affinity_hints(dev);
905+
unmap_bf_area(dev);
884906
free_comp_eqs(dev);
885907
mlx5_stop_eqs(dev);
886908
mlx5_free_uuars(dev, &priv->uuari);

drivers/net/ethernet/mellanox/mlx5/core/uar.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333
#include <linux/kernel.h>
3434
#include <linux/module.h>
35+
#include <linux/io-mapping.h>
3536
#include <linux/mlx5/driver.h>
3637
#include <linux/mlx5/cmd.h>
3738
#include "mlx5_core.h"
@@ -246,6 +247,10 @@ int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
246247
goto err_free_uar;
247248
}
248249

250+
if (mdev->priv.bf_mapping)
251+
uar->bf_map = io_mapping_map_wc(mdev->priv.bf_mapping,
252+
uar->index << PAGE_SHIFT);
253+
249254
return 0;
250255

251256
err_free_uar:
@@ -257,6 +262,7 @@ EXPORT_SYMBOL(mlx5_alloc_map_uar);
257262

258263
void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
259264
{
265+
io_mapping_unmap(uar->bf_map);
260266
iounmap(uar->map);
261267
mlx5_cmd_free_uar(mdev, uar->index);
262268
}

include/linux/mlx5/driver.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ struct mlx5_uar {
380380
u32 index;
381381
struct list_head bf_list;
382382
unsigned free_bf_bmap;
383-
void __iomem *wc_map;
383+
void __iomem *bf_map;
384384
void __iomem *map;
385385
};
386386

@@ -435,6 +435,8 @@ struct mlx5_priv {
435435
struct mlx5_uuar_info uuari;
436436
MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
437437

438+
struct io_mapping *bf_mapping;
439+
438440
/* pages stuff */
439441
struct workqueue_struct *pg_wq;
440442
struct rb_root page_root;

0 commit comments

Comments (0)