Skip to content

Commit bf5a755

Browse files
hkchu authored and davem330 committed
net-gre-gro: Add GRE support to the GRO stack
This patch builds on top of commit 299603e ("net-gro: Prepare GRO stack for the upcoming tunneling support") to add support for standard GRE (RFC1701/RFC2784/RFC2890) to the GRO stack. It also serves as an example for supporting other encapsulation protocols in the GRO stack in the future. The patch supports version 0 and all the flags (key, csum, seq#) but will flush any pkt with the S (seq#) flag. This is because the S flag is not supported by GSO, and a GRO pkt may end up in the forwarding path, thus requiring GSO support to break it up correctly. Currently the "packet_offload" structure only contains L3 (ETH_P_IP/ETH_P_IPV6) GRO offload support so the encapped pkts are limited to IP pkts (i.e., w/o L2 hdr). But support for other protocol types can be easily added, as can support for GRE variations like NVGRE. The patch also supports csum offload. Specifically, if the csum flag is on and the h/w is capable of checksumming the payload (CHECKSUM_COMPLETE), the code will take advantage of the csum computed by the h/w when validating the GRE csum. Note that commit 60769a5 "ipv4: gre: add GRO capability" already introduces GRO capability to IPv4 GRE tunnels, using the gro_cells infrastructure. But GRO is done after the GRE hdr has been removed (i.e., decapped). The following patch applies GRO when pkts first come in (before hitting the GRE tunnel code). There is some performance advantage to applying GRO as early as possible. Also, this approach is transparent to other subsystems like Open vSwitch where GRE decap is handled outside of the IP stack, hence making it harder for the gro_cells stuff to apply. On the other hand, some NICs are still not capable of hashing on the inner hdr of a GRE pkt (RSS). In that case the GRO processing of pkts from the same remote host will all happen on the same CPU and the performance may be suboptimal. I'm including some rough preliminary performance numbers below. 
Note that the performance will be highly dependent on traffic load and mix, as usual. Moreover it also depends on NIC offload features, hence the following is by no means a comprehensive study. Local testing and tuning will be needed to decide the best setting. All tests spawned 50 copies of netperf TCP_STREAM and ran for 30 secs. (super_netperf 50 -H 192.168.1.18 -l 30) An IP GRE tunnel with only the key flag on (e.g., ip tunnel add gre1 mode gre local 10.246.17.18 remote 10.246.17.17 ttl 255 key 123) is configured. The GRO support for pkts AFTER decap is controlled through the device feature of the GRE device (e.g., ethtool -K gre1 gro on/off). 1.1 ethtool -K gre1 gro off; ethtool -K eth0 gro off thruput: 9.16Gbps CPU utilization: 19% 1.2 ethtool -K gre1 gro on; ethtool -K eth0 gro off thruput: 5.9Gbps CPU utilization: 15% 1.3 ethtool -K gre1 gro off; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 12-13% 1.4 ethtool -K gre1 gro on; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 10% The following tests were performed on a different NIC that is capable of csum offload. I.e., the h/w is capable of computing IP payload csum (CHECKSUM_COMPLETE). 
2.1 ethtool -K gre1 gro on (hence will use gro_cells) 2.1.1 ethtool -K eth0 gro off; csum offload disabled thruput: 8.53Gbps CPU utilization: 9% 2.1.2 ethtool -K eth0 gro off; csum offload enabled thruput: 8.97Gbps CPU utilization: 7-8% 2.1.3 ethtool -K eth0 gro on; csum offload disabled thruput: 8.83Gbps CPU utilization: 5-6% 2.1.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.98Gbps CPU utilization: 5% 2.2 ethtool -K gre1 gro off 2.2.1 ethtool -K eth0 gro off; csum offload disabled thruput: 5.93Gbps CPU utilization: 9% 2.2.2 ethtool -K eth0 gro off; csum offload enabled thruput: 5.62Gbps CPU utilization: 8% 2.2.3 ethtool -K eth0 gro on; csum offload disabled thruput: 7.69Gbps CPU utilization: 8% 2.2.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.96Gbps CPU utilization: 5-6% Signed-off-by: H.K. Jerry Chu <hkchu@google.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent cdb3f4a commit bf5a755

File tree

6 files changed

+216
-7
lines changed

6 files changed

+216
-7
lines changed

include/linux/netdevice.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1632,7 +1632,10 @@ struct napi_gro_cb {
16321632
int data_offset;
16331633

16341634
/* This is non-zero if the packet cannot be merged with the new skb. */
1635-
int flush;
1635+
u16 flush;
1636+
1637+
/* Save the IP ID here and check when we get to the transport layer */
1638+
u16 flush_id;
16361639

16371640
/* Number of segments aggregated. */
16381641
u16 count;
@@ -1651,6 +1654,9 @@ struct napi_gro_cb {
16511654
/* Used in ipv6_gro_receive() */
16521655
int proto;
16531656

1657+
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
1658+
__wsum csum;
1659+
16541660
/* used in skb_gro_receive() slow path */
16551661
struct sk_buff *last;
16561662
};
@@ -1900,6 +1906,14 @@ static inline void *skb_gro_network_header(struct sk_buff *skb)
19001906
skb_network_offset(skb);
19011907
}
19021908

1909+
static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
1910+
const void *start, unsigned int len)
1911+
{
1912+
if (skb->ip_summed == CHECKSUM_COMPLETE)
1913+
NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum,
1914+
csum_partial(start, len, 0));
1915+
}
1916+
19031917
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
19041918
unsigned short type,
19051919
const void *daddr, const void *saddr,
@@ -2440,6 +2454,8 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
24402454
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
24412455
struct sk_buff *napi_get_frags(struct napi_struct *napi);
24422456
gro_result_t napi_gro_frags(struct napi_struct *napi);
2457+
struct packet_offload *gro_find_receive_by_type(__be16 type);
2458+
struct packet_offload *gro_find_complete_by_type(__be16 type);
24432459

24442460
static inline void napi_free_frags(struct napi_struct *napi)
24452461
{

net/core/dev.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3846,6 +3846,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
38463846

38473847
skb_gro_reset_offset(skb);
38483848
gro_list_prepare(napi, skb);
3849+
NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
38493850

38503851
rcu_read_lock();
38513852
list_for_each_entry_rcu(ptype, head, list) {
@@ -3922,6 +3923,31 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
39223923
goto pull;
39233924
}
39243925

3926+
struct packet_offload *gro_find_receive_by_type(__be16 type)
3927+
{
3928+
struct list_head *offload_head = &offload_base;
3929+
struct packet_offload *ptype;
3930+
3931+
list_for_each_entry_rcu(ptype, offload_head, list) {
3932+
if (ptype->type != type || !ptype->callbacks.gro_receive)
3933+
continue;
3934+
return ptype;
3935+
}
3936+
return NULL;
3937+
}
3938+
3939+
struct packet_offload *gro_find_complete_by_type(__be16 type)
3940+
{
3941+
struct list_head *offload_head = &offload_base;
3942+
struct packet_offload *ptype;
3943+
3944+
list_for_each_entry_rcu(ptype, offload_head, list) {
3945+
if (ptype->type != type || !ptype->callbacks.gro_complete)
3946+
continue;
3947+
return ptype;
3948+
}
3949+
return NULL;
3950+
}
39253951

39263952
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
39273953
{

net/ipv4/af_inet.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1391,9 +1391,15 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
13911391
NAPI_GRO_CB(p)->flush |=
13921392
(iph->ttl ^ iph2->ttl) |
13931393
(iph->tos ^ iph2->tos) |
1394-
(__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) |
1395-
((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
1394+
((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
13961395

1396+
/* Save the IP ID check to be included later when we get to
1397+
* the transport layer so only the inner most IP ID is checked.
1398+
* This is because some GSO/TSO implementations do not
1399+
* correctly increment the IP ID for the outer hdrs.
1400+
*/
1401+
NAPI_GRO_CB(p)->flush_id =
1402+
((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
13971403
NAPI_GRO_CB(p)->flush |= flush;
13981404
}
13991405

net/ipv4/gre_offload.c

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,170 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
116116
return segs;
117117
}
118118

119+
/* Compute the whole skb csum in s/w and store it, then verify GRO csum
120+
* starting from gro_offset.
121+
*/
122+
static __sum16 gro_skb_checksum(struct sk_buff *skb)
123+
{
124+
__sum16 sum;
125+
126+
skb->csum = skb_checksum(skb, 0, skb->len, 0);
127+
NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum,
128+
csum_partial(skb->data, skb_gro_offset(skb), 0));
129+
sum = csum_fold(NAPI_GRO_CB(skb)->csum);
130+
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
131+
if (unlikely(!sum))
132+
netdev_rx_csum_fault(skb->dev);
133+
} else
134+
skb->ip_summed = CHECKSUM_COMPLETE;
135+
136+
return sum;
137+
}
138+
139+
static struct sk_buff **gre_gro_receive(struct sk_buff **head,
140+
struct sk_buff *skb)
141+
{
142+
struct sk_buff **pp = NULL;
143+
struct sk_buff *p;
144+
const struct gre_base_hdr *greh;
145+
unsigned int hlen, grehlen;
146+
unsigned int off;
147+
int flush = 1;
148+
struct packet_offload *ptype;
149+
__be16 type;
150+
151+
off = skb_gro_offset(skb);
152+
hlen = off + sizeof(*greh);
153+
greh = skb_gro_header_fast(skb, off);
154+
if (skb_gro_header_hard(skb, hlen)) {
155+
greh = skb_gro_header_slow(skb, hlen, off);
156+
if (unlikely(!greh))
157+
goto out;
158+
}
159+
160+
/* Only support version 0 and K (key), C (csum) flags. Note that
161+
* although the support for the S (seq#) flag can be added easily
162+
* for GRO, this is problematic for GSO hence can not be enabled
163+
* here because a GRO pkt may end up in the forwarding path, thus
164+
* requiring GSO support to break it up correctly.
165+
*/
166+
if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
167+
goto out;
168+
169+
type = greh->protocol;
170+
171+
rcu_read_lock();
172+
ptype = gro_find_receive_by_type(type);
173+
if (ptype == NULL)
174+
goto out_unlock;
175+
176+
grehlen = GRE_HEADER_SECTION;
177+
178+
if (greh->flags & GRE_KEY)
179+
grehlen += GRE_HEADER_SECTION;
180+
181+
if (greh->flags & GRE_CSUM)
182+
grehlen += GRE_HEADER_SECTION;
183+
184+
hlen = off + grehlen;
185+
if (skb_gro_header_hard(skb, hlen)) {
186+
greh = skb_gro_header_slow(skb, hlen, off);
187+
if (unlikely(!greh))
188+
goto out_unlock;
189+
}
190+
if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */
191+
__sum16 csum = 0;
192+
193+
if (skb->ip_summed == CHECKSUM_COMPLETE)
194+
csum = csum_fold(NAPI_GRO_CB(skb)->csum);
195+
/* Don't trust csum error calculated/reported by h/w */
196+
if (skb->ip_summed == CHECKSUM_NONE || csum != 0)
197+
csum = gro_skb_checksum(skb);
198+
199+
/* GRE CSUM is the 1's complement of the 1's complement sum
200+
* of the GRE hdr plus payload so it should add up to 0xffff
201+
* (and 0 after csum_fold()) just like the IPv4 hdr csum.
202+
*/
203+
if (csum)
204+
goto out_unlock;
205+
}
206+
flush = 0;
207+
208+
for (p = *head; p; p = p->next) {
209+
const struct gre_base_hdr *greh2;
210+
211+
if (!NAPI_GRO_CB(p)->same_flow)
212+
continue;
213+
214+
/* The following checks are needed to ensure only pkts
215+
* from the same tunnel are considered for aggregation.
216+
* The criteria for "the same tunnel" includes:
217+
* 1) same version (we only support version 0 here)
218+
* 2) same protocol (we only support ETH_P_IP for now)
219+
* 3) same set of flags
220+
* 4) same key if the key field is present.
221+
*/
222+
greh2 = (struct gre_base_hdr *)(p->data + off);
223+
224+
if (greh2->flags != greh->flags ||
225+
greh2->protocol != greh->protocol) {
226+
NAPI_GRO_CB(p)->same_flow = 0;
227+
continue;
228+
}
229+
if (greh->flags & GRE_KEY) {
230+
/* compare keys */
231+
if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
232+
NAPI_GRO_CB(p)->same_flow = 0;
233+
continue;
234+
}
235+
}
236+
}
237+
238+
skb_gro_pull(skb, grehlen);
239+
240+
/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
241+
skb_gro_postpull_rcsum(skb, greh, grehlen);
242+
243+
pp = ptype->callbacks.gro_receive(head, skb);
244+
245+
out_unlock:
246+
rcu_read_unlock();
247+
out:
248+
NAPI_GRO_CB(skb)->flush |= flush;
249+
250+
return pp;
251+
}
252+
253+
int gre_gro_complete(struct sk_buff *skb, int nhoff)
254+
{
255+
struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
256+
struct packet_offload *ptype;
257+
unsigned int grehlen = sizeof(*greh);
258+
int err = -ENOENT;
259+
__be16 type;
260+
261+
type = greh->protocol;
262+
if (greh->flags & GRE_KEY)
263+
grehlen += GRE_HEADER_SECTION;
264+
265+
if (greh->flags & GRE_CSUM)
266+
grehlen += GRE_HEADER_SECTION;
267+
268+
rcu_read_lock();
269+
ptype = gro_find_complete_by_type(type);
270+
if (ptype != NULL)
271+
err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
272+
273+
rcu_read_unlock();
274+
return err;
275+
}
276+
119277
static const struct net_offload gre_offload = {
120278
.callbacks = {
121279
.gso_send_check = gre_gso_send_check,
122280
.gso_segment = gre_gso_segment,
281+
.gro_receive = gre_gro_receive,
282+
.gro_complete = gre_gro_complete,
123283
},
124284
};
125285

net/ipv4/tcp_offload.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,8 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
197197
goto out_check_final;
198198

199199
found:
200-
flush = NAPI_GRO_CB(p)->flush;
200+
/* Include the IP ID check below from the inner most IP hdr */
201+
flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
201202
flush |= (__force int)(flags & TCP_FLAG_CWR);
202203
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
203204
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
@@ -230,7 +231,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
230231
pp = head;
231232

232233
out:
233-
NAPI_GRO_CB(skb)->flush |= flush;
234+
NAPI_GRO_CB(skb)->flush |= (flush != 0);
234235

235236
return pp;
236237
}
@@ -280,7 +281,7 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *
280281
if (NAPI_GRO_CB(skb)->flush)
281282
goto skip_csum;
282283

283-
wsum = skb->csum;
284+
wsum = NAPI_GRO_CB(skb)->csum;
284285

285286
switch (skb->ip_summed) {
286287
case CHECKSUM_NONE:

net/ipv6/ip6_offload.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
190190
unsigned int nlen;
191191
unsigned int hlen;
192192
unsigned int off;
193-
int flush = 1;
193+
u16 flush = 1;
194194
int proto;
195195
__wsum csum;
196196

0 commit comments

Comments
 (0)