Skip to content

Commit 1eb8c69

Browse files
amirv authored and davem330 committed
net/mlx4_en: Add accelerated RFS support
Use RFS infrastructure and flow steering in HW to keep CPU affinity of rx interrupts and application per TCP stream. A flow steering filter is added to the HW whenever the RFS ndo callback is invoked by core networking code. Because the invocation takes place in interrupt context, the actual setup of HW is done using workqueue. Whenever new filter is added, the driver checks for expiry of existing filters. Since there's window in time between the point where the core RFS code invoked the ndo callback, to the point where the HW is configured from the workqueue context, the 2nd, 3rd etc packets from that stream will cause the net core to invoke the callback again and again. To prevent inefficient/double configuration of the HW, the filters are kept in a database which is indexed using hash function to enable fast access. Signed-off-by: Amir Vadai <amirv@mellanox.com> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent d9236c3 commit 1eb8c69

File tree

4 files changed

+342
-1
lines changed

4 files changed

+342
-1
lines changed

drivers/net/ethernet/mellanox/mlx4/en_cq.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,12 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
7777
struct mlx4_en_dev *mdev = priv->mdev;
7878
int err = 0;
7979
char name[25];
80+
struct cpu_rmap *rmap =
81+
#ifdef CONFIG_RFS_ACCEL
82+
priv->dev->rx_cpu_rmap;
83+
#else
84+
NULL;
85+
#endif
8086

8187
cq->dev = mdev->pndev[priv->port];
8288
cq->mcq.set_ci_db = cq->wqres.db.db;
@@ -91,7 +97,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
9197
sprintf(name, "%s-%d", priv->dev->name,
9298
cq->ring);
9399
/* Set IRQ for specific name (per ring) */
94-
if (mlx4_assign_eq(mdev->dev, name, NULL,
100+
if (mlx4_assign_eq(mdev->dev, name, rmap,
95101
&cq->vector)) {
96102
cq->vector = (cq->ring + 1 + priv->port)
97103
% mdev->dev->caps.num_comp_vectors;

drivers/net/ethernet/mellanox/mlx4/en_netdev.c

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
#include <linux/if_vlan.h>
3737
#include <linux/delay.h>
3838
#include <linux/slab.h>
39+
#include <linux/hash.h>
40+
#include <net/ip.h>
3941

4042
#include <linux/mlx4/driver.h>
4143
#include <linux/mlx4/device.h>
@@ -66,6 +68,299 @@ static int mlx4_en_setup_tc(struct net_device *dev, u8 up)
6668
return 0;
6769
}
6870

71+
#ifdef CONFIG_RFS_ACCEL
72+
73+
struct mlx4_en_filter {
74+
struct list_head next;
75+
struct work_struct work;
76+
77+
__be32 src_ip;
78+
__be32 dst_ip;
79+
__be16 src_port;
80+
__be16 dst_port;
81+
82+
int rxq_index;
83+
struct mlx4_en_priv *priv;
84+
u32 flow_id; /* RFS infrastructure id */
85+
int id; /* mlx4_en driver id */
86+
u64 reg_id; /* Flow steering API id */
87+
u8 activated; /* Used to prevent expiry before filter
88+
* is attached
89+
*/
90+
struct hlist_node filter_chain;
91+
};
92+
93+
static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv);
94+
95+
static void mlx4_en_filter_work(struct work_struct *work)
96+
{
97+
struct mlx4_en_filter *filter = container_of(work,
98+
struct mlx4_en_filter,
99+
work);
100+
struct mlx4_en_priv *priv = filter->priv;
101+
struct mlx4_spec_list spec_tcp = {
102+
.id = MLX4_NET_TRANS_RULE_ID_TCP,
103+
{
104+
.tcp_udp = {
105+
.dst_port = filter->dst_port,
106+
.dst_port_msk = (__force __be16)-1,
107+
.src_port = filter->src_port,
108+
.src_port_msk = (__force __be16)-1,
109+
},
110+
},
111+
};
112+
struct mlx4_spec_list spec_ip = {
113+
.id = MLX4_NET_TRANS_RULE_ID_IPV4,
114+
{
115+
.ipv4 = {
116+
.dst_ip = filter->dst_ip,
117+
.dst_ip_msk = (__force __be32)-1,
118+
.src_ip = filter->src_ip,
119+
.src_ip_msk = (__force __be32)-1,
120+
},
121+
},
122+
};
123+
struct mlx4_spec_list spec_eth = {
124+
.id = MLX4_NET_TRANS_RULE_ID_ETH,
125+
};
126+
struct mlx4_net_trans_rule rule = {
127+
.list = LIST_HEAD_INIT(rule.list),
128+
.queue_mode = MLX4_NET_TRANS_Q_LIFO,
129+
.exclusive = 1,
130+
.allow_loopback = 1,
131+
.promisc_mode = MLX4_FS_PROMISC_NONE,
132+
.port = priv->port,
133+
.priority = MLX4_DOMAIN_RFS,
134+
};
135+
int rc;
136+
__be64 mac;
137+
__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
138+
139+
list_add_tail(&spec_eth.list, &rule.list);
140+
list_add_tail(&spec_ip.list, &rule.list);
141+
list_add_tail(&spec_tcp.list, &rule.list);
142+
143+
mac = cpu_to_be64((priv->mac & MLX4_MAC_MASK) << 16);
144+
145+
rule.qpn = priv->rss_map.qps[filter->rxq_index].qpn;
146+
memcpy(spec_eth.eth.dst_mac, &mac, ETH_ALEN);
147+
memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
148+
149+
filter->activated = 0;
150+
151+
if (filter->reg_id) {
152+
rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
153+
if (rc && rc != -ENOENT)
154+
en_err(priv, "Error detaching flow. rc = %d\n", rc);
155+
}
156+
157+
rc = mlx4_flow_attach(priv->mdev->dev, &rule, &filter->reg_id);
158+
if (rc)
159+
en_err(priv, "Error attaching flow. err = %d\n", rc);
160+
161+
mlx4_en_filter_rfs_expire(priv);
162+
163+
filter->activated = 1;
164+
}
165+
166+
static inline struct hlist_head *
167+
filter_hash_bucket(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
168+
__be16 src_port, __be16 dst_port)
169+
{
170+
unsigned long l;
171+
int bucket_idx;
172+
173+
l = (__force unsigned long)src_port |
174+
((__force unsigned long)dst_port << 2);
175+
l ^= (__force unsigned long)(src_ip ^ dst_ip);
176+
177+
bucket_idx = hash_long(l, MLX4_EN_FILTER_HASH_SHIFT);
178+
179+
return &priv->filter_hash[bucket_idx];
180+
}
181+
182+
static struct mlx4_en_filter *
183+
mlx4_en_filter_alloc(struct mlx4_en_priv *priv, int rxq_index, __be32 src_ip,
184+
__be32 dst_ip, __be16 src_port, __be16 dst_port,
185+
u32 flow_id)
186+
{
187+
struct mlx4_en_filter *filter = NULL;
188+
189+
filter = kzalloc(sizeof(struct mlx4_en_filter), GFP_ATOMIC);
190+
if (!filter)
191+
return NULL;
192+
193+
filter->priv = priv;
194+
filter->rxq_index = rxq_index;
195+
INIT_WORK(&filter->work, mlx4_en_filter_work);
196+
197+
filter->src_ip = src_ip;
198+
filter->dst_ip = dst_ip;
199+
filter->src_port = src_port;
200+
filter->dst_port = dst_port;
201+
202+
filter->flow_id = flow_id;
203+
204+
filter->id = priv->last_filter_id++;
205+
206+
list_add_tail(&filter->next, &priv->filters);
207+
hlist_add_head(&filter->filter_chain,
208+
filter_hash_bucket(priv, src_ip, dst_ip, src_port,
209+
dst_port));
210+
211+
return filter;
212+
}
213+
214+
static void mlx4_en_filter_free(struct mlx4_en_filter *filter)
215+
{
216+
struct mlx4_en_priv *priv = filter->priv;
217+
int rc;
218+
219+
list_del(&filter->next);
220+
221+
rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
222+
if (rc && rc != -ENOENT)
223+
en_err(priv, "Error detaching flow. rc = %d\n", rc);
224+
225+
kfree(filter);
226+
}
227+
228+
static inline struct mlx4_en_filter *
229+
mlx4_en_filter_find(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
230+
__be16 src_port, __be16 dst_port)
231+
{
232+
struct hlist_node *elem;
233+
struct mlx4_en_filter *filter;
234+
struct mlx4_en_filter *ret = NULL;
235+
236+
hlist_for_each_entry(filter, elem,
237+
filter_hash_bucket(priv, src_ip, dst_ip,
238+
src_port, dst_port),
239+
filter_chain) {
240+
if (filter->src_ip == src_ip &&
241+
filter->dst_ip == dst_ip &&
242+
filter->src_port == src_port &&
243+
filter->dst_port == dst_port) {
244+
ret = filter;
245+
break;
246+
}
247+
}
248+
249+
return ret;
250+
}
251+
252+
static int
253+
mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
254+
u16 rxq_index, u32 flow_id)
255+
{
256+
struct mlx4_en_priv *priv = netdev_priv(net_dev);
257+
struct mlx4_en_filter *filter;
258+
const struct iphdr *ip;
259+
const __be16 *ports;
260+
__be32 src_ip;
261+
__be32 dst_ip;
262+
__be16 src_port;
263+
__be16 dst_port;
264+
int nhoff = skb_network_offset(skb);
265+
int ret = 0;
266+
267+
if (skb->protocol != htons(ETH_P_IP))
268+
return -EPROTONOSUPPORT;
269+
270+
ip = (const struct iphdr *)(skb->data + nhoff);
271+
if (ip_is_fragment(ip))
272+
return -EPROTONOSUPPORT;
273+
274+
ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
275+
276+
src_ip = ip->saddr;
277+
dst_ip = ip->daddr;
278+
src_port = ports[0];
279+
dst_port = ports[1];
280+
281+
if (ip->protocol != IPPROTO_TCP)
282+
return -EPROTONOSUPPORT;
283+
284+
spin_lock_bh(&priv->filters_lock);
285+
filter = mlx4_en_filter_find(priv, src_ip, dst_ip, src_port, dst_port);
286+
if (filter) {
287+
if (filter->rxq_index == rxq_index)
288+
goto out;
289+
290+
filter->rxq_index = rxq_index;
291+
} else {
292+
filter = mlx4_en_filter_alloc(priv, rxq_index,
293+
src_ip, dst_ip,
294+
src_port, dst_port, flow_id);
295+
if (!filter) {
296+
ret = -ENOMEM;
297+
goto err;
298+
}
299+
}
300+
301+
queue_work(priv->mdev->workqueue, &filter->work);
302+
303+
out:
304+
ret = filter->id;
305+
err:
306+
spin_unlock_bh(&priv->filters_lock);
307+
308+
return ret;
309+
}
310+
311+
void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv,
312+
struct mlx4_en_rx_ring *rx_ring)
313+
{
314+
struct mlx4_en_filter *filter, *tmp;
315+
LIST_HEAD(del_list);
316+
317+
spin_lock_bh(&priv->filters_lock);
318+
list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
319+
list_move(&filter->next, &del_list);
320+
hlist_del(&filter->filter_chain);
321+
}
322+
spin_unlock_bh(&priv->filters_lock);
323+
324+
list_for_each_entry_safe(filter, tmp, &del_list, next) {
325+
cancel_work_sync(&filter->work);
326+
mlx4_en_filter_free(filter);
327+
}
328+
}
329+
330+
static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv)
331+
{
332+
struct mlx4_en_filter *filter = NULL, *tmp, *last_filter = NULL;
333+
LIST_HEAD(del_list);
334+
int i = 0;
335+
336+
spin_lock_bh(&priv->filters_lock);
337+
list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
338+
if (i > MLX4_EN_FILTER_EXPIRY_QUOTA)
339+
break;
340+
341+
if (filter->activated &&
342+
!work_pending(&filter->work) &&
343+
rps_may_expire_flow(priv->dev,
344+
filter->rxq_index, filter->flow_id,
345+
filter->id)) {
346+
list_move(&filter->next, &del_list);
347+
hlist_del(&filter->filter_chain);
348+
} else
349+
last_filter = filter;
350+
351+
i++;
352+
}
353+
354+
if (last_filter && (&last_filter->next != priv->filters.next))
355+
list_move(&priv->filters, &last_filter->next);
356+
357+
spin_unlock_bh(&priv->filters_lock);
358+
359+
list_for_each_entry_safe(filter, tmp, &del_list, next)
360+
mlx4_en_filter_free(filter);
361+
}
362+
#endif
363+
69364
static int mlx4_en_vlan_rx_add_vid(struct net_device *dev, unsigned short vid)
70365
{
71366
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -1079,6 +1374,11 @@ void mlx4_en_free_resources(struct mlx4_en_priv *priv)
10791374
{
10801375
int i;
10811376

1377+
#ifdef CONFIG_RFS_ACCEL
1378+
free_irq_cpu_rmap(priv->dev->rx_cpu_rmap);
1379+
priv->dev->rx_cpu_rmap = NULL;
1380+
#endif
1381+
10821382
for (i = 0; i < priv->tx_ring_num; i++) {
10831383
if (priv->tx_ring[i].tx_info)
10841384
mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]);
@@ -1134,6 +1434,15 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
11341434
goto err;
11351435
}
11361436

1437+
#ifdef CONFIG_RFS_ACCEL
1438+
priv->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->rx_ring_num);
1439+
if (!priv->dev->rx_cpu_rmap)
1440+
goto err;
1441+
1442+
INIT_LIST_HEAD(&priv->filters);
1443+
spin_lock_init(&priv->filters_lock);
1444+
#endif
1445+
11371446
return 0;
11381447

11391448
err:
@@ -1241,6 +1550,9 @@ static const struct net_device_ops mlx4_netdev_ops = {
12411550
#endif
12421551
.ndo_set_features = mlx4_en_set_features,
12431552
.ndo_setup_tc = mlx4_en_setup_tc,
1553+
#ifdef CONFIG_RFS_ACCEL
1554+
.ndo_rx_flow_steer = mlx4_en_filter_rfs,
1555+
#endif
12441556
};
12451557

12461558
int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
@@ -1358,6 +1670,10 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
13581670
NETIF_F_HW_VLAN_FILTER;
13591671
dev->hw_features |= NETIF_F_LOOPBACK;
13601672

1673+
if (mdev->dev->caps.steering_mode ==
1674+
MLX4_STEERING_MODE_DEVICE_MANAGED)
1675+
dev->hw_features |= NETIF_F_NTUPLE;
1676+
13611677
mdev->pndev[port] = dev;
13621678

13631679
netif_carrier_off(dev);

drivers/net/ethernet/mellanox/mlx4/en_rx.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,9 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
389389
mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
390390
vfree(ring->rx_info);
391391
ring->rx_info = NULL;
392+
#ifdef CONFIG_RFS_ACCEL
393+
mlx4_en_cleanup_filters(priv, ring);
394+
#endif
392395
}
393396

394397
void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,

0 commit comments

Comments
 (0)