CSL 730: Parallel Programming: OpenMP
Hello OpenMP

#pragma omp parallel
{
    // I am now thread i of n
    switch (omp_get_thread_num()) {
        case 0: blah1.. break;
        case 1: blah2.. break;
    }
}
// Back to normal
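
For reference, a complete, compilable version of this fragment; the printf bodies stand in for the slide's blah1/blah2:

#include <omp.h>
#include <stdio.h>

int main(void) {
    #pragma omp parallel
    {
        // I am now thread i of n
        switch (omp_get_thread_num()) {
            case 0: printf("blah1: hello from thread 0\n"); break;
            case 1: printf("blah2: hello from thread 1\n"); break;
            default: break;
        }
    }
    // Back to normal
    return 0;
}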
Parallel Construct: Execution Model

- The encountering thread creates a team: itself (master) + zero or more additional threads
Memory Model

- Notion of a temporary view of memory
  - Allows local caching
  - Requires relaxing the consistency model
Relaxed Consistency

Unsynchronized access:
- If two threads write to the same shared variable, the result is undefined
- If one thread reads while another writes, the value read is undefined

A write by T1 becomes visible to a read by T2 only through flushes:
    T1 writes -> T1 flushes -> T2 flushes -> T2 reads
The same order of flushes is seen by all threads.
Example: unsynchronized access (initially a = b = 0)

    thread 1                      thread 2
    b = 1;                        a = 1;
    if (a == 0) {                 if (b == 0) {
        critical section              critical section
    }                             }

With no flushes, each thread may read a stale 0, so both threads may enter the critical section.
With separate flushes (initially a = b = 0):

    thread 1                      thread 2
    b = 1;                        a = 1;
    flush(b); flush(a);           flush(a); flush(b);
    if (a == 0) {                 if (b == 0) {
        critical section              critical section
    }                             }

Still broken: the two flushes have non-overlapping variable lists, so the compiler may reorder them, and both threads may still enter.
With a combined flush (initially a = b = 0):

    thread 1                      thread 2
    b = 1;                        a = 1;
    flush(b,a);                   flush(a,b);
    if (a == 0) {                 if (b == 0) {
        critical section              critical section
    }                             }

Now the write and the read cannot be reordered past the flush, so at most one thread enters the critical section.
Thread Control

Environment variable    Set / query routines                          Initial value
OMP_NUM_THREADS         omp_set_num_threads / omp_get_max_threads    Implementation defined
OMP_DYNAMIC             omp_set_dynamic / omp_get_dynamic            Implementation defined
OMP_NESTED              omp_set_nested / omp_get_nested              false
OMP_SCHEDULE            *                                            Implementation defined
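
A minimal sketch of these controls in use (the thread count of 4 is an arbitrary choice):

#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_set_num_threads(4);       /* overrides OMP_NUM_THREADS */
    omp_set_dynamic(0);           /* do not let the runtime shrink the team */
    printf("max threads = %d\n", omp_get_max_threads());
    #pragma omp parallel
    {
        #pragma omp master
        printf("team size = %d\n", omp_get_num_threads());
    }
    return 0;
}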
Parallel Construct

#pragma omp parallel                  \
    if(boolean)                       \
    private(var1, var2, var3)         \
    firstprivate(var1, var2, var3)    \
    default(private | shared | none)  \
    shared(var1, var2)                \
    copyin(var1, var2)                \
    reduction(operator : list)        \
    num_threads(n)
{
}
Parallel Construct: Restrictions

- Cannot branch into or out of the region
- Clauses must be side-effect free: the result must not depend on any ordering of their evaluations
- Up to one if clause
- Up to one num_threads clause
- num_threads must be a positive integer
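
A sketch combining several of these clauses; the variable names and thread count are illustrative:

#include <omp.h>
#include <stdio.h>

int main(void) {
    int n = 2000, base = 10, tid;
    #pragma omp parallel if(n > 1000) num_threads(4) \
            default(none) shared(n) firstprivate(base) private(tid)
    {
        tid = omp_get_thread_num();   /* private: no race on tid */
        printf("thread %d: base=%d n=%d\n", tid, base, n);
    }
    return 0;
}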
What's wrong?

int tid, size;
int numProcs = omp_get_num_procs();
#pragma omp parallel num_threads(numProcs)
{
    size = getProblemSize()/numProcs;   // assume divisible
    tid  = omp_get_thread_num();
    doTask(tid*size, size);
}

doTask(int start, int count)
{
    // Each thread's instance has its own activation record
    for (int i = 0, t = start; i < count; i++, t += 1)
        doit(t);
}
Private clause

int tid, size;
int numProcs = omp_get_num_procs();
#pragma omp parallel num_threads(numProcs) private(tid)
{
    size = getProblemSize()/numProcs;
    tid  = omp_get_thread_num();
    doTask(tid*size, size);
}

doTask(int start, int count)
{
    // Each thread's instance has its own activation record
    for (int i = 0, t = start; i < count; i++, t += 1)
        doit(t);
}

tid is now private, so the race on it is gone; size is still shared, but every thread writes the same value to it.
Parallel Loop

#pragma omp parallel for
for (i = 0; i < N; ++i) {
    blah
}

The number of iterations must be known when the construct is encountered, and must be the same for each encountering thread.

Restrictions:
- Same loop control expressions for all threads in the team
- At most one schedule, nowait, or ordered clause
- chunk_size must be a loop/construct-invariant positive integer
- The ordered clause is required if any ordered region appears inside
Private clause, revisited

Remember this code? (the private-clause example above, where each thread computes its own contiguous block)
Schedule Clause

schedule(kind[, chunk_size])

- Divides iterations into contiguous sets, called chunks
- Chunks are assigned transparently to threads

(The remaining clauses are as in the Parallel Construct slide above.)
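
A small sketch of the clause on a parallel loop; the dynamic kind and the chunk size of 4 are illustrative choices:

#include <omp.h>
#include <stdio.h>

int main(void) {
    #pragma omp parallel for schedule(dynamic, 4)  /* chunks of 4 iterations, handed out on demand */
    for (int i = 0; i < 32; i++)
        printf("iteration %2d on thread %d\n", i, omp_get_thread_num());
    return 0;
}

schedule(static) would instead pre-assign fixed contiguous blocks of iterations to each thread.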
Reductions

Reductions are common: scalar f(v1 .. vn)

reduction clause: reduction(<op> : <variable>)

    +     Sum
    *     Product
    &     Bitwise and
    |     Bitwise or
    ^     Bitwise exclusive or
    &&    Logical and
    ||    Logical or
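
A minimal sketch of a sum reduction (the array contents are illustrative):

#include <omp.h>
#include <stdio.h>

int main(void) {
    double a[1000], sum = 0.0;
    for (int i = 0; i < 1000; i++) a[i] = 1.0;

    /* Each thread accumulates into a private copy of sum;
       the partial sums are combined with + at the end. */
    #pragma omp parallel for reduction(+ : sum)
    for (int i = 0; i < 1000; i++)
        sum += a[i];

    printf("sum = %f\n", sum);   /* 1000.0 */
    return 0;
}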
Single Construct

#pragma omp parallel
{
    #pragma omp for
    for (int i = 0; i < N; i++)
        a[i] = f0(i);

    #pragma omp single
    x = f1(a);

    #pragma omp for
    for (int i = 0; i < N; i++)
        b[i] = x * f2(i);
}

Hidden complexity: the implicit barriers at the end of the for and single constructs are what make this correct; the single region sees a completed a[], and the last loop sees the final x.
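
If x were private instead of shared, the copyprivate clause offers an alternative: the one thread that executes the single region computes x, and the value is broadcast to every thread's private copy. A minimal sketch (the literal 3.14 stands in for f1(a)):

#include <omp.h>
#include <stdio.h>

int main(void) {
    double x;
    #pragma omp parallel private(x)
    {
        #pragma omp single copyprivate(x)  /* one thread executes; x is copied to all */
        x = 3.14;
        printf("thread %d: x = %f\n", omp_get_thread_num(), x);
    }
    return 0;
}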
Sections Construct

#pragma omp sections
{
    #pragma omp section
    {
        // do this
    }
    #pragma omp section
    {
        // do that
    }
    // ...
}
Master Directive

#pragma omp parallel
{
    #pragma omp for
    for (int i = 0; i < 100; i++)
        a[i] = f0(i);

    #pragma omp master
    x = f1(a);
}

Only the master thread executes. No synchronization.
Critical Section

#pragma omp critical (accessBankBalance)
{
}

A single thread at a time passes through all regions of the same name.
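
A minimal sketch of the named critical section guarding a shared update (the balance variable is illustrative):

#include <omp.h>
#include <stdio.h>

int main(void) {
    double balance = 0.0;
    #pragma omp parallel for
    for (int i = 0; i < 1000; i++) {
        #pragma omp critical (accessBankBalance)  /* serializes all regions with this name */
        balance += 1.0;
    }
    printf("balance = %f\n", balance);  /* always 1000.0 */
    return 0;
}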
Barrier Directive

#pragma omp barrier

- Stand-alone
- Binds to the inner-most parallel region
- All threads in the team must execute it

Illegal, since the barrier may not be reached by every thread:

    if (!ready)
        #pragma omp barrier

- The same sequence of work-sharing and barrier regions must hold for the entire team
Ordered Directive

#pragma omp ordered
{
}

- Binds to the inner-most enclosing loop
- The structured block is executed in sequential loop order
- The loop must declare the ordered clause
- Each iteration may execute at most one ordered region
Flush Directive

#pragma omp flush (var1, var2)

- Stand-alone, like barrier
- Only directly affects the encountering thread
- The list of vars ensures that any compiler re-ordering moves all flushes together
- Implicit flushes: barrier, atomic, critical, locks
Atomic Directive

#pragma omp atomic
i++;

- Light-weight critical section
- Only for certain simple expressions
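
A minimal sketch contrasting atomic with a critical section for a single scalar update (the histogram is illustrative):

#include <omp.h>
#include <stdio.h>

int main(void) {
    int hist[8] = {0};
    #pragma omp parallel for
    for (int i = 0; i < 1000; i++) {
        #pragma omp atomic   /* cheaper than a critical section for one update */
        hist[i % 8]++;
    }
    for (int k = 0; k < 8; k++)
        printf("hist[%d] = %d\n", k, hist[k]);
    return 0;
}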
Runtime Library Routines

int omp_get_num_procs();
int omp_get_num_threads();
int omp_get_thread_num();
int omp_get_ancestor_thread_num(int level);
double omp_get_wtime();
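
A small sketch timing a loop with omp_get_wtime (the workload is illustrative):

#include <omp.h>
#include <stdio.h>

int main(void) {
    double t0 = omp_get_wtime();
    double s = 0.0;
    #pragma omp parallel for reduction(+ : s)
    for (int i = 1; i <= 10000000; i++)
        s += 1.0 / i;
    double t1 = omp_get_wtime();
    printf("sum = %f, elapsed = %.3f s\n", s, t1 - t0);
    return 0;
}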
Nesting Restrictions

- A critical region may never be nested inside a critical region with the same name
  - Not sufficient to prevent deadlock
- Not allowed without an intervening parallel region:
  - A work-sharing region or barrier inside a work-sharing, critical, ordered, or master region
  - A master region inside a work-sharing region
  - An ordered region inside a critical region
EXAMPLES
Ordered Construct

int i;
#pragma omp for ordered
for (i = 0; i < n; i++) {
    if (isGroupA(i)) {
        #pragma omp ordered
        doit(i);
    }
    else {
        #pragma omp ordered
        doit(partner(i));
    }
}

Each iteration executes exactly one ordered region, satisfying the nesting rule above.
a, b, c are shared
i, j, k are private
OpenMP Jacobi

for some number of timesteps/iterations {
    #pragma omp parallel for
    for (int i = 1; i < n-1; i++)        // interior points; boundary rows/cols stay fixed
        for (int j = 1; j < n-1; j++)
            temp[i][j] = 0.25 * (grid[i-1][j] + grid[i+1][j]
                               + grid[i][j-1] + grid[i][j+1]);

    #pragma omp parallel for
    for (int i = 1; i < n-1; i++)
        for (int j = 1; j < n-1; j++)
            grid[i][j] = temp[i][j];
}
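
The version above forks and joins a team twice per timestep and pays two extra barriers. A hedged sketch of the common refinement, merging both loops into one parallel region (per the "keep parallel regions large" advice below): the first loop keeps its barrier, because the copy loop overwrites grid rows that neighboring threads still read, while the copy loop can take nowait, since the parallel region's own closing barrier makes its barrier redundant:

for some number of timesteps/iterations {
    #pragma omp parallel
    {
        #pragma omp for          /* barrier needed: loop 2 overwrites grid rows loop 1 reads */
        for (int i = 1; i < n-1; i++)
            for (int j = 1; j < n-1; j++)
                temp[i][j] = 0.25 * (grid[i-1][j] + grid[i+1][j]
                                   + grid[i][j-1] + grid[i][j+1]);

        #pragma omp for nowait   /* the region's closing barrier suffices */
        for (int i = 1; i < n-1; i++)
            for (int j = 1; j < n-1; j++)
                grid[i][j] = temp[i][j];
    }
}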
Efficiency Issues

- Minimize synchronization
  - Avoid BARRIER, CRITICAL, ORDERED, and locks
  - Use NOWAIT
  - Use named CRITICAL sections for fine-grained locking
  - Use MASTER (instead of SINGLE)
- Parallelize at the highest level possible
  - such as outer FOR loops; keep parallel regions large
- FLUSH is expensive
- LASTPRIVATE has synchronization overhead
- Thread-safe malloc/free are expensive
- Reduce false sharing (see the sketch after this list)
  - Design of data structures
  - Use PRIVATE
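
A sketch of false sharing and the padding fix; the 64-byte line size is a common assumption, not universal:

#include <omp.h>
#include <stdio.h>

#define NT   8
#define LINE 64   /* assumed cache-line size */

/* Bad: adjacent longs share cache lines, so independent
   per-thread updates ping-pong lines between cores. */
long counts_bad[NT];

/* Better: pad each counter onto its own line. */
struct padded { long v; char pad[LINE - sizeof(long)]; };
struct padded counts_ok[NT];

int main(void) {
    #pragma omp parallel num_threads(NT)
    {
        int t = omp_get_thread_num();
        for (int i = 0; i < 1000000; i++)
            counts_ok[t].v++;   /* counts_bad[t]++ here would false-share */
    }
    printf("count[0] = %ld\n", counts_ok[0].v);
    return 0;
}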
Synchronization

- Race condition: the outcome depends on timing
- Deadlock: waiting for a condition that will never occur
- Livelock: continuously adjusting, but task progress is stalled

Try to:
- Avoid nested locks
- Release locks religiously
- Avoid while(true) loops (especially during testing)

Be careful with:
- Non thread-safe libraries
- Concurrent access to shared data
- IO inside parallel regions
- Differing views of shared memory (FLUSH)