Skip to content

Commit 5aa15b5

Browse files
Paul Jackson authored and Linus Torvalds committed
[PATCH] cpusets: remove depth counted locking hack
Remove a rather hackish depth counter on cpuset locking. The depth counter was avoiding a possible double trip on the global cpuset_sem semaphore. It worked, but now an improved version of cpuset locking is available, to come in the next patch, using two global semaphores.

This patch reverses "cpuset semaphore depth check deadlock fix".

The kernel still works, even after this patch, except for some rare and difficult to reproduce race conditions when aggressively creating and destroying cpusets marked with the notify_on_release option, on very large systems.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
1 parent f35f31d commit 5aa15b5

File tree

1 file changed

+40
-65
lines changed

1 file changed

+40
-65
lines changed

kernel/cpuset.c

Lines changed: 40 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -180,42 +180,6 @@ static struct super_block *cpuset_sb = NULL;
180180
*/
181181

182182
static DECLARE_MUTEX(cpuset_sem);
183-
static struct task_struct *cpuset_sem_owner;
184-
static int cpuset_sem_depth;
185-
186-
/*
187-
* The global cpuset semaphore cpuset_sem can be needed by the
188-
* memory allocator to update a tasks mems_allowed (see the calls
189-
* to cpuset_update_current_mems_allowed()) or to walk up the
190-
* cpuset hierarchy to find a mem_exclusive cpuset see the calls
191-
* to cpuset_excl_nodes_overlap()).
192-
*
193-
* But if the memory allocation is being done by cpuset.c code, it
194-
* usually already holds cpuset_sem. Double tripping on a kernel
195-
* semaphore deadlocks the current task, and any other task that
196-
* subsequently tries to obtain the lock.
197-
*
198-
* Run all up's and down's on cpuset_sem through the following
199-
* wrappers, which will detect this nested locking, and avoid
200-
* deadlocking.
201-
*/
202-
203-
static inline void cpuset_down(struct semaphore *psem)
204-
{
205-
if (cpuset_sem_owner != current) {
206-
down(psem);
207-
cpuset_sem_owner = current;
208-
}
209-
cpuset_sem_depth++;
210-
}
211-
212-
static inline void cpuset_up(struct semaphore *psem)
213-
{
214-
if (--cpuset_sem_depth == 0) {
215-
cpuset_sem_owner = NULL;
216-
up(psem);
217-
}
218-
}
219183

220184
/*
221185
* A couple of forward declarations required, due to cyclic reference loop:
@@ -558,10 +522,19 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
558522
* Refresh current tasks mems_allowed and mems_generation from
559523
* current tasks cpuset. Call with cpuset_sem held.
560524
*
561-
* This routine is needed to update the per-task mems_allowed
562-
* data, within the tasks context, when it is trying to allocate
563-
* memory (in various mm/mempolicy.c routines) and notices
564-
* that some other task has been modifying its cpuset.
525+
* Be sure to call refresh_mems() on any cpuset operation which
526+
* (1) holds cpuset_sem, and (2) might possibly alloc memory.
527+
* Call after obtaining cpuset_sem lock, before any possible
528+
* allocation. Otherwise one risks trying to allocate memory
529+
* while the task cpuset_mems_generation is not the same as
530+
* the mems_generation in its cpuset, which would deadlock on
531+
* cpuset_sem in cpuset_update_current_mems_allowed().
532+
*
533+
* Since we hold cpuset_sem, once refresh_mems() is called, the
534+
* test (current->cpuset_mems_generation != cs->mems_generation)
535+
* in cpuset_update_current_mems_allowed() will remain false,
536+
* until we drop cpuset_sem. Anyone else who would change our
537+
* cpusets mems_generation needs to lock cpuset_sem first.
565538
*/
566539

567540
static void refresh_mems(void)
@@ -867,7 +840,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
867840
}
868841
buffer[nbytes] = 0; /* nul-terminate */
869842

870-
cpuset_down(&cpuset_sem);
843+
down(&cpuset_sem);
871844

872845
if (is_removed(cs)) {
873846
retval = -ENODEV;
@@ -901,7 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
901874
if (retval == 0)
902875
retval = nbytes;
903876
out2:
904-
cpuset_up(&cpuset_sem);
877+
up(&cpuset_sem);
905878
cpuset_release_agent(pathbuf);
906879
out1:
907880
kfree(buffer);
@@ -941,9 +914,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
941914
{
942915
cpumask_t mask;
943916

944-
cpuset_down(&cpuset_sem);
917+
down(&cpuset_sem);
945918
mask = cs->cpus_allowed;
946-
cpuset_up(&cpuset_sem);
919+
up(&cpuset_sem);
947920

948921
return cpulist_scnprintf(page, PAGE_SIZE, mask);
949922
}
@@ -952,9 +925,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
952925
{
953926
nodemask_t mask;
954927

955-
cpuset_down(&cpuset_sem);
928+
down(&cpuset_sem);
956929
mask = cs->mems_allowed;
957-
cpuset_up(&cpuset_sem);
930+
up(&cpuset_sem);
958931

959932
return nodelist_scnprintf(page, PAGE_SIZE, mask);
960933
}
@@ -1351,7 +1324,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
13511324
if (!cs)
13521325
return -ENOMEM;
13531326

1354-
cpuset_down(&cpuset_sem);
1327+
down(&cpuset_sem);
1328+
refresh_mems();
13551329
cs->flags = 0;
13561330
if (notify_on_release(parent))
13571331
set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1376,14 +1350,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
13761350
* will down() this new directory's i_sem and if we race with
13771351
* another mkdir, we might deadlock.
13781352
*/
1379-
cpuset_up(&cpuset_sem);
1353+
up(&cpuset_sem);
13801354

13811355
err = cpuset_populate_dir(cs->dentry);
13821356
/* If err < 0, we have a half-filled directory - oh well ;) */
13831357
return 0;
13841358
err:
13851359
list_del(&cs->sibling);
1386-
cpuset_up(&cpuset_sem);
1360+
up(&cpuset_sem);
13871361
kfree(cs);
13881362
return err;
13891363
}
@@ -1405,13 +1379,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
14051379

14061380
/* the vfs holds both inode->i_sem already */
14071381

1408-
cpuset_down(&cpuset_sem);
1382+
down(&cpuset_sem);
1383+
refresh_mems();
14091384
if (atomic_read(&cs->count) > 0) {
1410-
cpuset_up(&cpuset_sem);
1385+
up(&cpuset_sem);
14111386
return -EBUSY;
14121387
}
14131388
if (!list_empty(&cs->children)) {
1414-
cpuset_up(&cpuset_sem);
1389+
up(&cpuset_sem);
14151390
return -EBUSY;
14161391
}
14171392
parent = cs->parent;
@@ -1427,7 +1402,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
14271402
spin_unlock(&d->d_lock);
14281403
cpuset_d_remove_dir(d);
14291404
dput(d);
1430-
cpuset_up(&cpuset_sem);
1405+
up(&cpuset_sem);
14311406
cpuset_release_agent(pathbuf);
14321407
return 0;
14331408
}
@@ -1530,10 +1505,10 @@ void cpuset_exit(struct task_struct *tsk)
15301505
if (notify_on_release(cs)) {
15311506
char *pathbuf = NULL;
15321507

1533-
cpuset_down(&cpuset_sem);
1508+
down(&cpuset_sem);
15341509
if (atomic_dec_and_test(&cs->count))
15351510
check_for_release(cs, &pathbuf);
1536-
cpuset_up(&cpuset_sem);
1511+
up(&cpuset_sem);
15371512
cpuset_release_agent(pathbuf);
15381513
} else {
15391514
atomic_dec(&cs->count);
@@ -1554,11 +1529,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
15541529
{
15551530
cpumask_t mask;
15561531

1557-
cpuset_down(&cpuset_sem);
1532+
down(&cpuset_sem);
15581533
task_lock((struct task_struct *)tsk);
15591534
guarantee_online_cpus(tsk->cpuset, &mask);
15601535
task_unlock((struct task_struct *)tsk);
1561-
cpuset_up(&cpuset_sem);
1536+
up(&cpuset_sem);
15621537

15631538
return mask;
15641539
}
@@ -1583,9 +1558,9 @@ void cpuset_update_current_mems_allowed(void)
15831558
if (!cs)
15841559
return; /* task is exiting */
15851560
if (current->cpuset_mems_generation != cs->mems_generation) {
1586-
cpuset_down(&cpuset_sem);
1561+
down(&cpuset_sem);
15871562
refresh_mems();
1588-
cpuset_up(&cpuset_sem);
1563+
up(&cpuset_sem);
15891564
}
15901565
}
15911566

@@ -1684,14 +1659,14 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
16841659
return 0;
16851660

16861661
/* Not hardwall and node outside mems_allowed: scan up cpusets */
1687-
cpuset_down(&cpuset_sem);
1662+
down(&cpuset_sem);
16881663
cs = current->cpuset;
16891664
if (!cs)
16901665
goto done; /* current task exiting */
16911666
cs = nearest_exclusive_ancestor(cs);
16921667
allowed = node_isset(node, cs->mems_allowed);
16931668
done:
1694-
cpuset_up(&cpuset_sem);
1669+
up(&cpuset_sem);
16951670
return allowed;
16961671
}
16971672

@@ -1712,7 +1687,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
17121687
const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
17131688
int overlap = 0; /* do cpusets overlap? */
17141689

1715-
cpuset_down(&cpuset_sem);
1690+
down(&cpuset_sem);
17161691
cs1 = current->cpuset;
17171692
if (!cs1)
17181693
goto done; /* current task exiting */
@@ -1723,7 +1698,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
17231698
cs2 = nearest_exclusive_ancestor(cs2);
17241699
overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
17251700
done:
1726-
cpuset_up(&cpuset_sem);
1701+
up(&cpuset_sem);
17271702

17281703
return overlap;
17291704
}
@@ -1746,7 +1721,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
17461721
return -ENOMEM;
17471722

17481723
tsk = m->private;
1749-
cpuset_down(&cpuset_sem);
1724+
down(&cpuset_sem);
17501725
task_lock(tsk);
17511726
cs = tsk->cpuset;
17521727
task_unlock(tsk);
@@ -1761,7 +1736,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
17611736
seq_puts(m, buf);
17621737
seq_putc(m, '\n');
17631738
out:
1764-
cpuset_up(&cpuset_sem);
1739+
up(&cpuset_sem);
17651740
kfree(buf);
17661741
return retval;
17671742
}

0 commit comments

Comments
 (0)