Skip to content

Commit f97fc81

Browse files
Steve Capper (steve.capper@linaro.org) authored and wildea01 (Will Deacon) committed
arm64: percpu: Implement this_cpu operations
The generic this_cpu operations disable interrupts to ensure that the requested operation is protected from pre-emption. For arm64, this is overkill and can hurt throughput and latency. This patch provides arm64 specific implementations for the this_cpu operations. Rather than disable interrupts, we use the exclusive monitor or atomic operations as appropriate. The following operations are implemented: add, add_return, and, or, read, write, xchg. We also wire up a cmpxchg implementation from cmpxchg.h. Testing was performed using the percpu_test module and hackbench on a Juno board running 3.18-rc4. Signed-off-by: Steve Capper <steve.capper@linaro.org> Reviewed-by: Will Deacon <will.deacon@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
1 parent 15670ef commit f97fc81

File tree

2 files changed

+219
-2
lines changed

2 files changed

+219
-2
lines changed

arch/arm64/include/asm/cmpxchg.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,8 +246,10 @@ static inline unsigned long __cmpxchg_mb(volatile void *ptr, unsigned long old,
246246
__ret; \
247247
})
248248

249-
#define this_cpu_cmpxchg_8(ptr, o, n) \
250-
cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n)
249+
/*
 * this_cpu cmpxchg for all four operand sizes, each delegating to
 * cmpxchg_local() on the current CPU's copy of the variable.
 * NOTE(review): raw_cpu_ptr() is evaluated without disabling
 * pre-emption here — a migration between taking the per-cpu address
 * and the cmpxchg would target the wrong CPU's data; confirm callers
 * run with pre-emption disabled.
 */
#define this_cpu_cmpxchg_1(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n)
#define this_cpu_cmpxchg_2(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n)
#define this_cpu_cmpxchg_4(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n)
#define this_cpu_cmpxchg_8(ptr, o, n) cmpxchg_local(raw_cpu_ptr(&(ptr)), o, n)
251253

252254
#define this_cpu_cmpxchg_double_8(ptr1, ptr2, o1, o2, n1, n2) \
253255
cmpxchg_double_local(raw_cpu_ptr(&(ptr1)), raw_cpu_ptr(&(ptr2)), \

arch/arm64/include/asm/percpu.h

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,221 @@ static inline unsigned long __my_cpu_offset(void)
4444

4545
#endif /* CONFIG_SMP */
4646

47+
#define PERCPU_OP(op, asm_op) \
48+
static inline unsigned long __percpu_##op(void *ptr, \
49+
unsigned long val, int size) \
50+
{ \
51+
unsigned long loop, ret; \
52+
\
53+
switch (size) { \
54+
case 1: \
55+
do { \
56+
asm ("//__per_cpu_" #op "_1\n" \
57+
"ldxrb %w[ret], %[ptr]\n" \
58+
#asm_op " %w[ret], %w[ret], %w[val]\n" \
59+
"stxrb %w[loop], %w[ret], %[ptr]\n" \
60+
: [loop] "=&r" (loop), [ret] "=&r" (ret), \
61+
[ptr] "+Q"(*(u8 *)ptr) \
62+
: [val] "Ir" (val)); \
63+
} while (loop); \
64+
break; \
65+
case 2: \
66+
do { \
67+
asm ("//__per_cpu_" #op "_2\n" \
68+
"ldxrh %w[ret], %[ptr]\n" \
69+
#asm_op " %w[ret], %w[ret], %w[val]\n" \
70+
"stxrh %w[loop], %w[ret], %[ptr]\n" \
71+
: [loop] "=&r" (loop), [ret] "=&r" (ret), \
72+
[ptr] "+Q"(*(u16 *)ptr) \
73+
: [val] "Ir" (val)); \
74+
} while (loop); \
75+
break; \
76+
case 4: \
77+
do { \
78+
asm ("//__per_cpu_" #op "_4\n" \
79+
"ldxr %w[ret], %[ptr]\n" \
80+
#asm_op " %w[ret], %w[ret], %w[val]\n" \
81+
"stxr %w[loop], %w[ret], %[ptr]\n" \
82+
: [loop] "=&r" (loop), [ret] "=&r" (ret), \
83+
[ptr] "+Q"(*(u32 *)ptr) \
84+
: [val] "Ir" (val)); \
85+
} while (loop); \
86+
break; \
87+
case 8: \
88+
do { \
89+
asm ("//__per_cpu_" #op "_8\n" \
90+
"ldxr %[ret], %[ptr]\n" \
91+
#asm_op " %[ret], %[ret], %[val]\n" \
92+
"stxr %w[loop], %[ret], %[ptr]\n" \
93+
: [loop] "=&r" (loop), [ret] "=&r" (ret), \
94+
[ptr] "+Q"(*(u64 *)ptr) \
95+
: [val] "Ir" (val)); \
96+
} while (loop); \
97+
break; \
98+
default: \
99+
BUILD_BUG(); \
100+
} \
101+
\
102+
return ret; \
103+
}
104+
105+
PERCPU_OP(add, add)
106+
PERCPU_OP(and, and)
107+
PERCPU_OP(or, orr)
108+
#undef PERCPU_OP
109+
110+
static inline unsigned long __percpu_read(void *ptr, int size)
111+
{
112+
unsigned long ret;
113+
114+
switch (size) {
115+
case 1:
116+
ret = ACCESS_ONCE(*(u8 *)ptr);
117+
break;
118+
case 2:
119+
ret = ACCESS_ONCE(*(u16 *)ptr);
120+
break;
121+
case 4:
122+
ret = ACCESS_ONCE(*(u32 *)ptr);
123+
break;
124+
case 8:
125+
ret = ACCESS_ONCE(*(u64 *)ptr);
126+
break;
127+
default:
128+
BUILD_BUG();
129+
}
130+
131+
return ret;
132+
}
133+
134+
/*
 * __percpu_write - store to a per-cpu variable exactly once.
 * @ptr:  address of the variable.
 * @val:  value to store, truncated to the variable's width.
 * @size: width of the variable in bytes (1, 2, 4 or 8).
 *
 * ACCESS_ONCE() forces a single, non-torn store of the given width;
 * an unsupported width is rejected at compile time via BUILD_BUG().
 */
static inline void __percpu_write(void *ptr, unsigned long val, int size)
{
	switch (size) {
	case 1:
		ACCESS_ONCE(*(u8 *)ptr) = (u8)val;
		break;
	case 2:
		ACCESS_ONCE(*(u16 *)ptr) = (u16)val;
		break;
	case 4:
		ACCESS_ONCE(*(u32 *)ptr) = (u32)val;
		break;
	case 8:
		ACCESS_ONCE(*(u64 *)ptr) = (u64)val;
		break;
	default:
		BUILD_BUG();
	}
}
153+
154+
static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
155+
int size)
156+
{
157+
unsigned long ret, loop;
158+
159+
switch (size) {
160+
case 1:
161+
do {
162+
asm ("//__percpu_xchg_1\n"
163+
"ldxrb %w[ret], %[ptr]\n"
164+
"stxrb %w[loop], %w[val], %[ptr]\n"
165+
: [loop] "=&r"(loop), [ret] "=&r"(ret),
166+
[ptr] "+Q"(*(u8 *)ptr)
167+
: [val] "r" (val));
168+
} while (loop);
169+
break;
170+
case 2:
171+
do {
172+
asm ("//__percpu_xchg_2\n"
173+
"ldxrh %w[ret], %[ptr]\n"
174+
"stxrh %w[loop], %w[val], %[ptr]\n"
175+
: [loop] "=&r"(loop), [ret] "=&r"(ret),
176+
[ptr] "+Q"(*(u16 *)ptr)
177+
: [val] "r" (val));
178+
} while (loop);
179+
break;
180+
case 4:
181+
do {
182+
asm ("//__percpu_xchg_4\n"
183+
"ldxr %w[ret], %[ptr]\n"
184+
"stxr %w[loop], %w[val], %[ptr]\n"
185+
: [loop] "=&r"(loop), [ret] "=&r"(ret),
186+
[ptr] "+Q"(*(u32 *)ptr)
187+
: [val] "r" (val));
188+
} while (loop);
189+
break;
190+
case 8:
191+
do {
192+
asm ("//__percpu_xchg_8\n"
193+
"ldxr %[ret], %[ptr]\n"
194+
"stxr %w[loop], %[val], %[ptr]\n"
195+
: [loop] "=&r"(loop), [ret] "=&r"(ret),
196+
[ptr] "+Q"(*(u64 *)ptr)
197+
: [val] "r" (val));
198+
} while (loop);
199+
break;
200+
default:
201+
BUILD_BUG();
202+
}
203+
204+
return ret;
205+
}
206+
207+
/*
 * The this_cpu operations must be pre-empt safe: pre-emption has to be
 * disabled from the moment raw_cpu_ptr() computes the per-cpu address
 * until the operation on it completes, otherwise the task can migrate
 * in between and read/corrupt another CPU's copy of the variable.
 * The original accessors called raw_cpu_ptr() with pre-emption enabled;
 * _pcp_protect() (and the explicit wrappers for read/write) close that
 * window.
 */
#define _pcp_protect(operation, pcp, val)			\
({								\
	typeof(pcp) __retval;					\
	preempt_disable();					\
	__retval = (typeof(pcp))operation(raw_cpu_ptr(&(pcp)),	\
					  (val), sizeof(pcp));	\
	preempt_enable();					\
	__retval;						\
})

#define _percpu_add(pcp, val) \
	_pcp_protect(__percpu_add, pcp, val)

/* __percpu_add already returns the new value, so add_return is an alias. */
#define _percpu_add_return(pcp, val) _percpu_add(pcp, val)

#define _percpu_and(pcp, val) \
	_pcp_protect(__percpu_and, pcp, val)

#define _percpu_or(pcp, val) \
	_pcp_protect(__percpu_or, pcp, val)

#define _percpu_read(pcp)						\
({									\
	typeof(pcp) __retval;						\
	preempt_disable();						\
	__retval = (typeof(pcp))__percpu_read(raw_cpu_ptr(&(pcp)),	\
					      sizeof(pcp));		\
	preempt_enable();						\
	__retval;							\
})

#define _percpu_write(pcp, val)						\
do {									\
	preempt_disable();						\
	__percpu_write(raw_cpu_ptr(&(pcp)), (unsigned long)(val),	\
				sizeof(pcp));				\
	preempt_enable();						\
} while(0)

#define _percpu_xchg(pcp, val) (typeof(pcp)) \
	_pcp_protect(__percpu_xchg, pcp, (unsigned long)(val))

/* Size-suffixed this_cpu hooks consumed by asm-generic/percpu.h. */
#define this_cpu_add_1(pcp, val) _percpu_add(pcp, val)
#define this_cpu_add_2(pcp, val) _percpu_add(pcp, val)
#define this_cpu_add_4(pcp, val) _percpu_add(pcp, val)
#define this_cpu_add_8(pcp, val) _percpu_add(pcp, val)

#define this_cpu_add_return_1(pcp, val) _percpu_add_return(pcp, val)
#define this_cpu_add_return_2(pcp, val) _percpu_add_return(pcp, val)
#define this_cpu_add_return_4(pcp, val) _percpu_add_return(pcp, val)
#define this_cpu_add_return_8(pcp, val) _percpu_add_return(pcp, val)

#define this_cpu_and_1(pcp, val) _percpu_and(pcp, val)
#define this_cpu_and_2(pcp, val) _percpu_and(pcp, val)
#define this_cpu_and_4(pcp, val) _percpu_and(pcp, val)
#define this_cpu_and_8(pcp, val) _percpu_and(pcp, val)

#define this_cpu_or_1(pcp, val) _percpu_or(pcp, val)
#define this_cpu_or_2(pcp, val) _percpu_or(pcp, val)
#define this_cpu_or_4(pcp, val) _percpu_or(pcp, val)
#define this_cpu_or_8(pcp, val) _percpu_or(pcp, val)

#define this_cpu_read_1(pcp) _percpu_read(pcp)
#define this_cpu_read_2(pcp) _percpu_read(pcp)
#define this_cpu_read_4(pcp) _percpu_read(pcp)
#define this_cpu_read_8(pcp) _percpu_read(pcp)

#define this_cpu_write_1(pcp, val) _percpu_write(pcp, val)
#define this_cpu_write_2(pcp, val) _percpu_write(pcp, val)
#define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
#define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)

#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
#define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
#define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
261+
47262
#include <asm-generic/percpu.h>
48263

49264
#endif /* __ASM_PERCPU_H */

0 commit comments

Comments
 (0)