Skip to content

Commit 2aae950

Browse files
Andi KleenLinus Torvalds
authored andcommitted
x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu
This implements a new vDSO for x86-64. The concept is similar to the existing vDSOs on i386 and PPC. x86-64 has had static vsyscalls before, but these are not flexible enough anymore. A vDSO is an ELF shared library supplied by the kernel that is mapped into user address space. The vDSO mapping is randomized for each process for security reasons. Doing this was needed for clock_gettime, because clock_gettime always needs a syscall fallback and having one at a fixed address would have made buffer overflow exploits too easy to write. The vdso can be disabled with vdso=0. It currently includes a new gettimeofday implementation and an optimized clock_gettime(). The gettimeofday implementation is slightly faster than the one in the old vsyscall. clock_gettime is significantly faster than the syscall for CLOCK_MONOTONIC and CLOCK_REALTIME. The new calls are generally faster than the old vsyscall. Advantages over the old x86-64 vsyscalls: - Extensible - Randomized - Cleaner - Easier to virtualize (the old static address range previously caused overhead e.g. for Xen because it has to create special page tables for it) Weak points: - glibc support still to be written. The VM interface is partly based on Ingo Molnar's i386 version. Includes compile fix from Joachim Deguara. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent a586df0 commit 2aae950

File tree

23 files changed

+554
-21
lines changed

23 files changed

+554
-21
lines changed

Documentation/kernel-parameters.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1882,7 +1882,7 @@ and is between 256 and 4096 characters. It is defined in the file
18821882
usbhid.mousepoll=
18831883
[USBHID] The interval which mice are to be polled at.
18841884

1885-
vdso= [IA-32,SH]
1885+
vdso= [IA-32,SH,x86-64]
18861886
vdso=2: enable compat VDSO (default with COMPAT_VDSO)
18871887
vdso=1: enable VDSO (default)
18881888
vdso=0: disable VDSO mapping

arch/x86_64/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kern
7676
libs-y += arch/x86_64/lib/
7777
core-y += arch/x86_64/kernel/ \
7878
arch/x86_64/mm/ \
79-
arch/x86_64/crypto/
79+
arch/x86_64/crypto/ \
80+
arch/x86_64/vdso/
8081
core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/
8182
drivers-$(CONFIG_PCI) += arch/x86_64/pci/
8283
drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/

arch/x86_64/ia32/ia32_binfmt.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838

3939
int sysctl_vsyscall32 = 1;
4040

41+
#undef ARCH_DLINFO
4142
#define ARCH_DLINFO do { \
4243
if (sysctl_vsyscall32) { \
4344
NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \

arch/x86_64/kernel/time.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include <asm/hpet.h>
4545
#include <asm/mpspec.h>
4646
#include <asm/nmi.h>
47+
#include <asm/vgtod.h>
4748

4849
static char *timename = NULL;
4950

arch/x86_64/kernel/vmlinux.lds.S

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ SECTIONS
9393
.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
9494
{ *(.vsyscall_gtod_data) }
9595
vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
96+
.vsyscall_clock : AT(VLOAD(.vsyscall_clock))
97+
{ *(.vsyscall_clock) }
98+
vsyscall_clock = VVIRT(.vsyscall_clock);
9699

97100

98101
.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
@@ -189,6 +192,12 @@ SECTIONS
189192
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
190193
.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
191194

195+
/* vdso blob that is mapped into user space */
196+
vdso_start = . ;
197+
.vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
198+
. = ALIGN(4096);
199+
vdso_end = .;
200+
192201
#ifdef CONFIG_BLK_DEV_INITRD
193202
. = ALIGN(4096);
194203
__initramfs_start = .;

arch/x86_64/kernel/vsyscall.c

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include <asm/segment.h>
4343
#include <asm/desc.h>
4444
#include <asm/topology.h>
45+
#include <asm/vgtod.h>
4546

4647
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
4748
#define __syscall_clobber "r11","rcx","memory"
@@ -57,26 +58,9 @@
5758
* - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
5859
* Try to keep this structure as small as possible to avoid cache line ping pongs
5960
*/
60-
struct vsyscall_gtod_data_t {
61-
seqlock_t lock;
62-
63-
/* open coded 'struct timespec' */
64-
time_t wall_time_sec;
65-
u32 wall_time_nsec;
66-
67-
int sysctl_enabled;
68-
struct timezone sys_tz;
69-
struct { /* extract of a clocksource struct */
70-
cycle_t (*vread)(void);
71-
cycle_t cycle_last;
72-
cycle_t mask;
73-
u32 mult;
74-
u32 shift;
75-
} clock;
76-
};
7761
int __vgetcpu_mode __section_vgetcpu_mode;
7862

79-
struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data =
63+
struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
8064
{
8165
.lock = SEQLOCK_UNLOCKED,
8266
.sysctl_enabled = 1,
@@ -96,6 +80,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
9680
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
9781
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
9882
vsyscall_gtod_data.sys_tz = sys_tz;
83+
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
84+
vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
9985
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
10086
}
10187

arch/x86_64/mm/init.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -774,3 +774,12 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
774774
return __alloc_bootmem_core(pgdat->bdata, size,
775775
SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
776776
}
777+
778+
const char *arch_vma_name(struct vm_area_struct *vma)
779+
{
780+
if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
781+
return "[vdso]";
782+
if (vma == &gate_vma)
783+
return "[vsyscall]";
784+
return NULL;
785+
}

arch/x86_64/vdso/Makefile

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#
2+
# x86-64 vDSO.
3+
#
4+
5+
# files to link into the vdso
6+
# vdso-start.o has to be first
7+
vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
8+
9+
# files to link into kernel
10+
obj-y := vma.o vdso.o vdso-syms.o
11+
12+
vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
13+
14+
$(obj)/vdso.o: $(obj)/vdso.so
15+
16+
targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o
17+
18+
# The DSO images are built using a special linker script.
19+
quiet_cmd_syscall = SYSCALL $@
20+
cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \
21+
-Wl,-T,$(filter-out FORCE,$^) -o $@
22+
23+
export CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
24+
25+
vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \
26+
$(call ld-option, -Wl$(comma)--hash-style=sysv) \
27+
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
28+
SYSCFLAGS_vdso.so = $(vdso-flags)
29+
30+
$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
31+
32+
$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE
33+
$(call if_changed,syscall)
34+
35+
CF := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
36+
37+
$(obj)/vclock_gettime.o: CFLAGS = $(CF)
38+
$(obj)/vgetcpu.o: CFLAGS = $(CF)
39+
40+
# We also create a special relocatable object that should mirror the symbol
41+
# table and layout of the linked DSO. With ld -R we can then refer to
42+
# these symbols in the kernel code rather than hand-coded addresses.
43+
extra-y += vdso-syms.o
44+
$(obj)/built-in.o: $(obj)/vdso-syms.o
45+
$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
46+
47+
SYSCFLAGS_vdso-syms.o = -r -d
48+
$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE
49+
$(call if_changed,syscall)

arch/x86_64/vdso/vclock_gettime.c

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Copyright 2006 Andi Kleen, SUSE Labs.
3+
* Subject to the GNU Public License, v.2
4+
*
5+
* Fast user context implementation of clock_gettime and gettimeofday.
6+
*
7+
* The code should have no internal unresolved relocations.
8+
* Check with readelf after changing.
9+
* Also alternative() doesn't work.
10+
*/
11+
12+
#include <linux/kernel.h>
13+
#include <linux/posix-timers.h>
14+
#include <linux/time.h>
15+
#include <linux/string.h>
16+
#include <asm/vsyscall.h>
17+
#include <asm/vgtod.h>
18+
#include <asm/timex.h>
19+
#include <asm/hpet.h>
20+
#include <asm/unistd.h>
21+
#include <asm/io.h>
22+
#include <asm/vgtod.h>
23+
#include "vextern.h"
24+
25+
#define gtod vdso_vsyscall_gtod_data
26+
27+
static long vdso_fallback_gettime(long clock, struct timespec *ts)
28+
{
29+
long ret;
30+
asm("syscall" : "=a" (ret) :
31+
"0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
32+
return ret;
33+
}
34+
35+
static inline long vgetns(void)
36+
{
37+
cycles_t (*vread)(void);
38+
vread = gtod->clock.vread;
39+
return ((vread() - gtod->clock.cycle_last) * gtod->clock.mult) >>
40+
gtod->clock.shift;
41+
}
42+
43+
static noinline int do_realtime(struct timespec *ts)
44+
{
45+
unsigned long seq, ns;
46+
do {
47+
seq = read_seqbegin(&gtod->lock);
48+
ts->tv_sec = gtod->wall_time_sec;
49+
ts->tv_nsec = gtod->wall_time_nsec;
50+
ns = vgetns();
51+
} while (unlikely(read_seqretry(&gtod->lock, seq)));
52+
timespec_add_ns(ts, ns);
53+
return 0;
54+
}
55+
56+
/* Copy of the version in kernel/time.c which we cannot directly access */
57+
static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
58+
{
59+
while (nsec >= NSEC_PER_SEC) {
60+
nsec -= NSEC_PER_SEC;
61+
++sec;
62+
}
63+
while (nsec < 0) {
64+
nsec += NSEC_PER_SEC;
65+
--sec;
66+
}
67+
ts->tv_sec = sec;
68+
ts->tv_nsec = nsec;
69+
}
70+
71+
static noinline int do_monotonic(struct timespec *ts)
72+
{
73+
unsigned long seq, ns, secs;
74+
do {
75+
seq = read_seqbegin(&gtod->lock);
76+
secs = gtod->wall_time_sec;
77+
ns = gtod->wall_time_nsec + vgetns();
78+
secs += gtod->wall_to_monotonic.tv_sec;
79+
ns += gtod->wall_to_monotonic.tv_nsec;
80+
} while (unlikely(read_seqretry(&gtod->lock, seq)));
81+
vset_normalized_timespec(ts, secs, ns);
82+
return 0;
83+
}
84+
85+
int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
86+
{
87+
if (likely(gtod->sysctl_enabled && gtod->clock.vread))
88+
switch (clock) {
89+
case CLOCK_REALTIME:
90+
return do_realtime(ts);
91+
case CLOCK_MONOTONIC:
92+
return do_monotonic(ts);
93+
}
94+
return vdso_fallback_gettime(clock, ts);
95+
}
96+
int clock_gettime(clockid_t, struct timespec *)
97+
__attribute__((weak, alias("__vdso_clock_gettime")));
98+
99+
int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
100+
{
101+
long ret;
102+
if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
103+
BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
104+
offsetof(struct timespec, tv_nsec) ||
105+
sizeof(*tv) != sizeof(struct timespec));
106+
do_realtime((struct timespec *)tv);
107+
tv->tv_usec /= 1000;
108+
if (unlikely(tz != NULL)) {
109+
/* This relies on gcc inlining the memcpy. We'll notice
110+
if it ever fails to do so. */
111+
memcpy(tz, &gtod->sys_tz, sizeof(struct timezone));
112+
}
113+
return 0;
114+
}
115+
asm("syscall" : "=a" (ret) :
116+
"0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
117+
return ret;
118+
}
119+
int gettimeofday(struct timeval *, struct timezone *)
120+
__attribute__((weak, alias("__vdso_gettimeofday")));

arch/x86_64/vdso/vdso-note.S

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
/*
2+
* This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
3+
* Here we can supply some information useful to userland.
4+
*/
5+
6+
#include <linux/uts.h>
7+
#include <linux/version.h>
8+
#include <linux/elfnote.h>
9+
10+
ELFNOTE_START(Linux, 0, "a")
11+
.long LINUX_VERSION_CODE
12+
ELFNOTE_END

arch/x86_64/vdso/vdso-start.S

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.globl vdso_kernel_start
2+
vdso_kernel_start:

arch/x86_64/vdso/vdso.S

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.section ".vdso","a"
2+
.incbin "arch/x86_64/vdso/vdso.so"

arch/x86_64/vdso/vdso.lds.S

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3+
* object prelinked to its virtual address, and with only one read-only
4+
* segment (that fits in one page). This script controls its layout.
5+
*/
6+
#include <asm/asm-offsets.h>
7+
#include "voffset.h"
8+
9+
#define VDSO_PRELINK 0xffffffffff700000
10+
11+
SECTIONS
12+
{
13+
. = VDSO_PRELINK + SIZEOF_HEADERS;
14+
15+
.hash : { *(.hash) } :text
16+
.gnu.hash : { *(.gnu.hash) }
17+
.dynsym : { *(.dynsym) }
18+
.dynstr : { *(.dynstr) }
19+
.gnu.version : { *(.gnu.version) }
20+
.gnu.version_d : { *(.gnu.version_d) }
21+
.gnu.version_r : { *(.gnu.version_r) }
22+
23+
/* This linker script is used both with -r and with -shared.
24+
For the layouts to match, we need to skip more than enough
25+
space for the dynamic symbol table et al. If this amount
26+
is insufficient, ld -shared will barf. Just increase it here. */
27+
. = VDSO_PRELINK + VDSO_TEXT_OFFSET;
28+
29+
.text : { *(.text) } :text
30+
.text.ptr : { *(.text.ptr) } :text
31+
. = VDSO_PRELINK + 0x900;
32+
.data : { *(.data) } :text
33+
.bss : { *(.bss) } :text
34+
35+
.altinstructions : { *(.altinstructions) } :text
36+
.altinstr_replacement : { *(.altinstr_replacement) } :text
37+
38+
.note : { *(.note.*) } :text :note
39+
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
40+
.eh_frame : { KEEP (*(.eh_frame)) } :text
41+
.dynamic : { *(.dynamic) } :text :dynamic
42+
.useless : {
43+
*(.got.plt) *(.got)
44+
*(.gnu.linkonce.d.*)
45+
*(.dynbss)
46+
*(.gnu.linkonce.b.*)
47+
} :text
48+
}
49+
50+
/*
51+
* We must supply the ELF program headers explicitly to get just one
52+
* PT_LOAD segment, and set the flags explicitly to make segments read-only.
53+
*/
54+
PHDRS
55+
{
56+
text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
57+
dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
58+
note PT_NOTE FLAGS(4); /* PF_R */
59+
eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
60+
}
61+
62+
/*
63+
* This controls what symbols we export from the DSO.
64+
*/
65+
VERSION
66+
{
67+
LINUX_2.6 {
68+
global:
69+
clock_gettime;
70+
__vdso_clock_gettime;
71+
gettimeofday;
72+
__vdso_gettimeofday;
73+
getcpu;
74+
__vdso_getcpu;
75+
local: *;
76+
};
77+
}

0 commit comments

Comments
 (0)