
Commit 841e360

Suresh Siddha authored and H. Peter Anvin committed
x86, fpu: always use kernel_fpu_begin/end() for in-kernel FPU usage
Use kernel_fpu_begin/end() instead of unconditionally accessing cr0 and
saving/restoring just the few xmm/ymm registers that are used. This has
several advantages:

* If the task's FPU state is already active, kernel_fpu_begin() will
  just save the user state and avoid the read/write of cr0. In general,
  cr0 accesses are much slower.

* Manual save/restore of xmm/ymm registers interferes with the
  'modified' and 'init' optimizations brought in by the xsaveopt/xrstor
  infrastructure.

* Forward compatibility with future vector register extensions will be
  a problem if the xmm/ymm registers are manually saved and restored
  (corrupting the extended state of those vector registers).

With this patch, there was no significant difference in xor throughput
using AVX, measured during boot.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/1345842782-24175-5-git-send-email-suresh.b.siddha@intel.com
Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: NeilBrown <neilb@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
1 parent 9c1c3fa commit 841e360

3 files changed: 29 additions & 142 deletions
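For orientation before the per-file diffs: every converted routine follows the same shape. The sketch below is a minimal illustration of that pattern, not code from the patch; the function name xor_sse_example() and the empty asm body are hypothetical placeholders, and the real callers are the xor_sse_*()/xor_avx_*() routines changed below. kernel_fpu_begin() disables preemption and saves the user FPU state only if it is live, so the old cr0/clts handling and the on-stack xmm_save/ymm_save scratch buffers are no longer needed; kernel_fpu_end() undoes it.

/*
 * Minimal sketch of the pattern this commit converges on (hypothetical
 * function name and empty asm body; see the real conversions below).
 */
#include <asm/i387.h>   /* kernel_fpu_begin()/kernel_fpu_end() in this era */

static void xor_sse_example(unsigned long bytes, unsigned long *p1,
                            unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        /* Replaces XMMS_SAVE: no read_cr0()/clts(), no xmm_save buffer. */
        kernel_fpu_begin();

        asm volatile(
                /* ... SSE xor loop over 'lines' 256-byte blocks ... */
                ""
                : "+r" (lines), "+r" (p1), "+r" (p2)
                :
                : "memory");

        /* Replaces XMMS_RESTORE: no manual register reload or write_cr0(). */
        kernel_fpu_end();
}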

arch/x86/include/asm/xor_32.h

Lines changed: 8 additions & 48 deletions
@@ -534,38 +534,6 @@ static struct xor_block_template xor_block_p5_mmx = {
  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  */
 
-#define XMMS_SAVE \
-do { \
-        preempt_disable(); \
-        cr0 = read_cr0(); \
-        clts(); \
-        asm volatile( \
-                "movups %%xmm0,(%0) ;\n\t" \
-                "movups %%xmm1,0x10(%0) ;\n\t" \
-                "movups %%xmm2,0x20(%0) ;\n\t" \
-                "movups %%xmm3,0x30(%0) ;\n\t" \
-                : \
-                : "r" (xmm_save) \
-                : "memory"); \
-} while (0)
-
-#define XMMS_RESTORE \
-do { \
-        asm volatile( \
-                "sfence ;\n\t" \
-                "movups (%0),%%xmm0 ;\n\t" \
-                "movups 0x10(%0),%%xmm1 ;\n\t" \
-                "movups 0x20(%0),%%xmm2 ;\n\t" \
-                "movups 0x30(%0),%%xmm3 ;\n\t" \
-                : \
-                : "r" (xmm_save) \
-                : "memory"); \
-        write_cr0(cr0); \
-        preempt_enable(); \
-} while (0)
-
-#define ALIGN16 __attribute__((aligned(16)))
-
 #define OFFS(x) "16*("#x")"
 #define PF_OFFS(x) "256+16*("#x")"
 #define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
@@ -587,10 +555,8 @@ static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
         unsigned long lines = bytes >> 8;
-        char xmm_save[16*4] ALIGN16;
-        int cr0;
 
-        XMMS_SAVE;
+        kernel_fpu_begin();
 
         asm volatile(
 #undef BLOCK
@@ -633,18 +599,16 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
         :
         : "memory");
 
-        XMMS_RESTORE;
+        kernel_fpu_end();
 }
 
 static void
 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
           unsigned long *p3)
 {
         unsigned long lines = bytes >> 8;
-        char xmm_save[16*4] ALIGN16;
-        int cr0;
 
-        XMMS_SAVE;
+        kernel_fpu_begin();
 
         asm volatile(
 #undef BLOCK
@@ -694,18 +658,16 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
         :
         : "memory" );
 
-        XMMS_RESTORE;
+        kernel_fpu_end();
 }
 
 static void
 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
           unsigned long *p3, unsigned long *p4)
 {
         unsigned long lines = bytes >> 8;
-        char xmm_save[16*4] ALIGN16;
-        int cr0;
 
-        XMMS_SAVE;
+        kernel_fpu_begin();
 
         asm volatile(
 #undef BLOCK
@@ -762,18 +724,16 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
         :
         : "memory" );
 
-        XMMS_RESTORE;
+        kernel_fpu_end();
 }
 
 static void
 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
           unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
         unsigned long lines = bytes >> 8;
-        char xmm_save[16*4] ALIGN16;
-        int cr0;
 
-        XMMS_SAVE;
+        kernel_fpu_begin();
 
         /* Make sure GCC forgets anything it knows about p4 or p5,
            such that it won't pass to the asm volatile below a
@@ -850,7 +810,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
            like assuming they have some legal value. */
         asm("" : "=r" (p4), "=r" (p5));
 
-        XMMS_RESTORE;
+        kernel_fpu_end();
 }
 
 static struct xor_block_template xor_block_pIII_sse = {

arch/x86/include/asm/xor_64.h

Lines changed: 9 additions & 52 deletions
@@ -34,41 +34,7 @@
  * no advantages to be gotten from x86-64 here anyways.
  */
 
-typedef struct {
-        unsigned long a, b;
-} __attribute__((aligned(16))) xmm_store_t;
-
-/* Doesn't use gcc to save the XMM registers, because there is no easy way to
-   tell it to do a clts before the register saving. */
-#define XMMS_SAVE \
-do { \
-        preempt_disable(); \
-        asm volatile( \
-                "movq %%cr0,%0 ;\n\t" \
-                "clts ;\n\t" \
-                "movups %%xmm0,(%1) ;\n\t" \
-                "movups %%xmm1,0x10(%1) ;\n\t" \
-                "movups %%xmm2,0x20(%1) ;\n\t" \
-                "movups %%xmm3,0x30(%1) ;\n\t" \
-                : "=&r" (cr0) \
-                : "r" (xmm_save) \
-                : "memory"); \
-} while (0)
-
-#define XMMS_RESTORE \
-do { \
-        asm volatile( \
-                "sfence ;\n\t" \
-                "movups (%1),%%xmm0 ;\n\t" \
-                "movups 0x10(%1),%%xmm1 ;\n\t" \
-                "movups 0x20(%1),%%xmm2 ;\n\t" \
-                "movups 0x30(%1),%%xmm3 ;\n\t" \
-                "movq %0,%%cr0 ;\n\t" \
-                : \
-                : "r" (cr0), "r" (xmm_save) \
-                : "memory"); \
-        preempt_enable(); \
-} while (0)
+#include <asm/i387.h>
 
 #define OFFS(x) "16*("#x")"
 #define PF_OFFS(x) "256+16*("#x")"
@@ -91,10 +57,8 @@ static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
         unsigned int lines = bytes >> 8;
-        unsigned long cr0;
-        xmm_store_t xmm_save[4];
 
-        XMMS_SAVE;
+        kernel_fpu_begin();
 
         asm volatile(
 #undef BLOCK
@@ -135,19 +99,16 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
         : [inc] "r" (256UL)
         : "memory");
 
-        XMMS_RESTORE;
+        kernel_fpu_end();
 }
 
 static void
 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
           unsigned long *p3)
 {
         unsigned int lines = bytes >> 8;
-        xmm_store_t xmm_save[4];
-        unsigned long cr0;
-
-        XMMS_SAVE;
 
+        kernel_fpu_begin();
         asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
@@ -194,18 +155,16 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
         : [inc] "r" (256UL)
         : "memory");
-        XMMS_RESTORE;
+        kernel_fpu_end();
 }
 
 static void
 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
           unsigned long *p3, unsigned long *p4)
 {
         unsigned int lines = bytes >> 8;
-        xmm_store_t xmm_save[4];
-        unsigned long cr0;
 
-        XMMS_SAVE;
+        kernel_fpu_begin();
 
         asm volatile(
 #undef BLOCK
@@ -261,18 +220,16 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
         : [inc] "r" (256UL)
         : "memory" );
 
-        XMMS_RESTORE;
+        kernel_fpu_end();
 }
 
 static void
 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
           unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
         unsigned int lines = bytes >> 8;
-        xmm_store_t xmm_save[4];
-        unsigned long cr0;
 
-        XMMS_SAVE;
+        kernel_fpu_begin();
 
         asm volatile(
 #undef BLOCK
@@ -336,7 +293,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
         : [inc] "r" (256UL)
         : "memory");
 
-        XMMS_RESTORE;
+        kernel_fpu_end();
 }
 
 static struct xor_block_template xor_block_sse = {

arch/x86/include/asm/xor_avx.h

Lines changed: 12 additions & 42 deletions
@@ -20,32 +20,6 @@
 #include <linux/compiler.h>
 #include <asm/i387.h>
 
-#define ALIGN32 __aligned(32)
-
-#define YMM_SAVED_REGS 4
-
-#define YMMS_SAVE \
-do { \
-        preempt_disable(); \
-        cr0 = read_cr0(); \
-        clts(); \
-        asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
-        asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
-        asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
-        asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
-} while (0);
-
-#define YMMS_RESTORE \
-do { \
-        asm volatile("sfence" : : : "memory"); \
-        asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
-        asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
-        asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
-        asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
-        write_cr0(cr0); \
-        preempt_enable(); \
-} while (0);
-
 #define BLOCK4(i) \
                 BLOCK(32 * i, 0) \
                 BLOCK(32 * (i + 1), 1) \
@@ -60,10 +34,9 @@ do { \
 
 static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
 {
-        unsigned long cr0, lines = bytes >> 9;
-        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+        unsigned long lines = bytes >> 9;
 
-        YMMS_SAVE
+        kernel_fpu_begin();
 
         while (lines--) {
 #undef BLOCK
@@ -82,16 +55,15 @@ do { \
                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
         }
 
-        YMMS_RESTORE
+        kernel_fpu_end();
 }
 
 static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
         unsigned long *p2)
 {
-        unsigned long cr0, lines = bytes >> 9;
-        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+        unsigned long lines = bytes >> 9;
 
-        YMMS_SAVE
+        kernel_fpu_begin();
 
         while (lines--) {
 #undef BLOCK
@@ -113,16 +85,15 @@ do { \
                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
         }
 
-        YMMS_RESTORE
+        kernel_fpu_end();
 }
 
 static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
         unsigned long *p2, unsigned long *p3)
 {
-        unsigned long cr0, lines = bytes >> 9;
-        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+        unsigned long lines = bytes >> 9;
 
-        YMMS_SAVE
+        kernel_fpu_begin();
 
         while (lines--) {
 #undef BLOCK
@@ -147,16 +118,15 @@ do { \
                 p3 = (unsigned long *)((uintptr_t)p3 + 512);
         }
 
-        YMMS_RESTORE
+        kernel_fpu_end();
 }
 
 static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
         unsigned long *p2, unsigned long *p3, unsigned long *p4)
 {
-        unsigned long cr0, lines = bytes >> 9;
-        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+        unsigned long lines = bytes >> 9;
 
-        YMMS_SAVE
+        kernel_fpu_begin();
 
         while (lines--) {
 #undef BLOCK
@@ -184,7 +154,7 @@ do { \
                 p4 = (unsigned long *)((uintptr_t)p4 + 512);
         }
 
-        YMMS_RESTORE
+        kernel_fpu_end();
 }
 
 static struct xor_block_template xor_block_avx = {
