Skip to content

Commit 9c225f2

Browse files
torvaldsAl Viro
authored andcommitted
vfs: atomic f_pos accesses as per POSIX
Our write() system call has always been atomic in the sense that you get the expected thread-safe contiguous write, but we haven't actually guaranteed that concurrent writes are serialized wrt f_pos accesses, so threads (or processes) that share a file descriptor and use "write()" concurrently would quite likely overwrite each others data. This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says: "2.9.7 Thread Interactions with Regular File Operations All of the following functions shall be atomic with respect to each other in the effects specified in POSIX.1-2008 when they operate on regular files or symbolic links: [...]" and one of the effects is the file position update. This unprotected file position behavior is not new behavior, and nobody has ever cared. Until now. Yongzhi Pan reported unexpected behavior to Michael Kerrisk that was due to this. This resolves the issue with a f_pos-specific lock that is taken by read/write/lseek on file descriptors that may be shared across threads or processes. Reported-by: Yongzhi Pan <panyongzhi@gmail.com> Reported-by: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
1 parent 1b56e98 commit 9c225f2

File tree

6 files changed

+55
-18
lines changed

6 files changed

+55
-18
lines changed

fs/file_table.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
135135
atomic_long_set(&f->f_count, 1);
136136
rwlock_init(&f->f_owner.lock);
137137
spin_lock_init(&f->f_lock);
138+
mutex_init(&f->f_pos_lock);
138139
eventpoll_init_file(f);
139140
/* f->f_version: 0 */
140141
return f;

fs/namei.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1884,7 +1884,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
18841884

18851885
nd->path = f.file->f_path;
18861886
if (flags & LOOKUP_RCU) {
1887-
if (f.need_put)
1887+
if (f.flags & FDPUT_FPUT)
18881888
*fp = f.file;
18891889
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
18901890
rcu_read_lock();

fs/open.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
705705
return 0;
706706
}
707707

708+
/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
709+
if (S_ISREG(inode->i_mode))
710+
f->f_mode |= FMODE_ATOMIC_POS;
711+
708712
f->f_op = fops_get(inode->i_fop);
709713
if (unlikely(WARN_ON(!f->f_op))) {
710714
error = -ENODEV;

fs/read_write.c

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
264264
}
265265
EXPORT_SYMBOL(vfs_llseek);
266266

267+
/*
268+
* We only lock f_pos if we have threads or if the file might be
269+
* shared with another process. In both cases we'll have an elevated
270+
* file count (done either by fdget() or by fork()).
271+
*/
272+
static inline struct fd fdget_pos(int fd)
273+
{
274+
struct fd f = fdget(fd);
275+
struct file *file = f.file;
276+
277+
if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
278+
if (file_count(file) > 1) {
279+
f.flags |= FDPUT_POS_UNLOCK;
280+
mutex_lock(&file->f_pos_lock);
281+
}
282+
}
283+
return f;
284+
}
285+
286+
static inline void fdput_pos(struct fd f)
287+
{
288+
if (f.flags & FDPUT_POS_UNLOCK)
289+
mutex_unlock(&f.file->f_pos_lock);
290+
fdput(f);
291+
}
292+
267293
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
268294
{
269295
off_t retval;
270-
struct fd f = fdget(fd);
296+
struct fd f = fdget_pos(fd);
271297
if (!f.file)
272298
return -EBADF;
273299

@@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
278304
if (res != (loff_t)retval)
279305
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
280306
}
281-
fdput(f);
307+
fdput_pos(f);
282308
return retval;
283309
}
284310

@@ -498,31 +524,31 @@ static inline void file_pos_write(struct file *file, loff_t pos)
498524

499525
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
500526
{
501-
struct fd f = fdget(fd);
527+
struct fd f = fdget_pos(fd);
502528
ssize_t ret = -EBADF;
503529

504530
if (f.file) {
505531
loff_t pos = file_pos_read(f.file);
506532
ret = vfs_read(f.file, buf, count, &pos);
507533
if (ret >= 0)
508534
file_pos_write(f.file, pos);
509-
fdput(f);
535+
fdput_pos(f);
510536
}
511537
return ret;
512538
}
513539

514540
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
515541
size_t, count)
516542
{
517-
struct fd f = fdget(fd);
543+
struct fd f = fdget_pos(fd);
518544
ssize_t ret = -EBADF;
519545

520546
if (f.file) {
521547
loff_t pos = file_pos_read(f.file);
522548
ret = vfs_write(f.file, buf, count, &pos);
523549
if (ret >= 0)
524550
file_pos_write(f.file, pos);
525-
fdput(f);
551+
fdput_pos(f);
526552
}
527553

528554
return ret;
@@ -797,15 +823,15 @@ EXPORT_SYMBOL(vfs_writev);
797823
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
798824
unsigned long, vlen)
799825
{
800-
struct fd f = fdget(fd);
826+
struct fd f = fdget_pos(fd);
801827
ssize_t ret = -EBADF;
802828

803829
if (f.file) {
804830
loff_t pos = file_pos_read(f.file);
805831
ret = vfs_readv(f.file, vec, vlen, &pos);
806832
if (ret >= 0)
807833
file_pos_write(f.file, pos);
808-
fdput(f);
834+
fdput_pos(f);
809835
}
810836

811837
if (ret > 0)
@@ -817,15 +843,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
817843
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
818844
unsigned long, vlen)
819845
{
820-
struct fd f = fdget(fd);
846+
struct fd f = fdget_pos(fd);
821847
ssize_t ret = -EBADF;
822848

823849
if (f.file) {
824850
loff_t pos = file_pos_read(f.file);
825851
ret = vfs_writev(f.file, vec, vlen, &pos);
826852
if (ret >= 0)
827853
file_pos_write(f.file, pos);
828-
fdput(f);
854+
fdput_pos(f);
829855
}
830856

831857
if (ret > 0)
@@ -968,7 +994,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
968994
const struct compat_iovec __user *,vec,
969995
compat_ulong_t, vlen)
970996
{
971-
struct fd f = fdget(fd);
997+
struct fd f = fdget_pos(fd);
972998
ssize_t ret;
973999
loff_t pos;
9741000

@@ -978,7 +1004,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
9781004
ret = compat_readv(f.file, vec, vlen, &pos);
9791005
if (ret >= 0)
9801006
f.file->f_pos = pos;
981-
fdput(f);
1007+
fdput_pos(f);
9821008
return ret;
9831009
}
9841010

@@ -1035,7 +1061,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
10351061
const struct compat_iovec __user *, vec,
10361062
compat_ulong_t, vlen)
10371063
{
1038-
struct fd f = fdget(fd);
1064+
struct fd f = fdget_pos(fd);
10391065
ssize_t ret;
10401066
loff_t pos;
10411067

@@ -1045,7 +1071,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
10451071
ret = compat_writev(f.file, vec, vlen, &pos);
10461072
if (ret >= 0)
10471073
f.file->f_pos = pos;
1048-
fdput(f);
1074+
fdput_pos(f);
10491075
return ret;
10501076
}
10511077

include/linux/file.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,14 @@ static inline void fput_light(struct file *file, int fput_needed)
2828

2929
struct fd {
3030
struct file *file;
31-
int need_put;
31+
unsigned int flags;
3232
};
33+
#define FDPUT_FPUT 1
34+
#define FDPUT_POS_UNLOCK 2
3335

3436
static inline void fdput(struct fd fd)
3537
{
36-
if (fd.need_put)
38+
if (fd.flags & FDPUT_FPUT)
3739
fput(fd.file);
3840
}
3941

include/linux/fs.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
123123
/* File is opened with O_PATH; almost nothing can be done with it */
124124
#define FMODE_PATH ((__force fmode_t)0x4000)
125125

126+
/* File needs atomic accesses to f_pos */
127+
#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000)
128+
126129
/* File was opened by fanotify and shouldn't generate fanotify events */
127130
#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
128131

@@ -780,13 +783,14 @@ struct file {
780783
const struct file_operations *f_op;
781784

782785
/*
783-
* Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
786+
* Protects f_ep_links, f_flags.
784787
* Must not be taken from IRQ context.
785788
*/
786789
spinlock_t f_lock;
787790
atomic_long_t f_count;
788791
unsigned int f_flags;
789792
fmode_t f_mode;
793+
struct mutex f_pos_lock;
790794
loff_t f_pos;
791795
struct fown_struct f_owner;
792796
const struct cred *f_cred;

0 commit comments

Comments
 (0)