CSL 730: Parallel Programming: OpenMP



Hello OpenMP
#pragma omp parallel
{
    // I am now thread i of n
    switch (omp_get_thread_num()) {
        case 0: blah1(); break;
        case 1: blah2(); break;
    }
}
// Back to normal
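For reference, a complete compilable version of this sketch might look as follows (the printf calls stand in for blah1/blah2, which the slide leaves abstract); build with an OpenMP flag such as gcc -fopenmp hello.c.

#include <omp.h>
#include <stdio.h>

int main(void) {
    #pragma omp parallel
    {
        // Every thread in the team executes this block.
        switch (omp_get_thread_num()) {
            case 0:  printf("Hello from thread 0 (master)\n"); break;
            case 1:  printf("Hello from thread 1\n"); break;
            default: printf("Hello from thread %d\n", omp_get_thread_num()); break;
        }
    }
    // Back to normal: only the initial thread continues here.
    return 0;
}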

Parallel Construct

Extremely simple to use and incredibly powerful

Fork-Join model
Every thread has its own execution context
Variables can be declared shared or private

Execution Model
Encountering thread creates a team:
Itself (master) + zero or more additional threads.

Applies to the structured block immediately following

Each thread executes the code in {} separately
But, also see: Work-sharing constructs

There's an implicit barrier at the end of the block

Only the master continues beyond the barrier
May be nested
Sometimes disabled by default
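A small sketch of nesting, which is often disabled by default (API calls are standard OpenMP; the print format is illustrative):

#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_set_nested(1);                        // nesting is often off by default
    #pragma omp parallel num_threads(2)
    {
        int outer = omp_get_thread_num();
        #pragma omp parallel num_threads(2)   // each outer thread forks its own team
        {
            printf("outer %d, inner %d, level %d\n",
                   outer, omp_get_thread_num(), omp_get_level());
        }
        // implicit barrier here; only the master of each team continues
    }
    return 0;
}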

Memory Model
Notion of a temporary view of memory
Allows local caching
Needs a relaxed consistency model

Supports threadprivate memory

global scope
Variables declared before the parallel construct:
Shared by default
May be designated as private
n-1 copies of the original variable are created
May not be initialized by the system
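A sketch of threadprivate storage with global scope (the variable name counter is illustrative; persistence of values between regions also requires, among other things, the same team size and dynamic threads disabled):

#include <omp.h>
#include <stdio.h>

int counter = 0;                       // global scope
#pragma omp threadprivate(counter)     // each thread keeps its own persistent copy

int main(void) {
    omp_set_dynamic(0);                // helps values persist across regions

    #pragma omp parallel
    counter = omp_get_thread_num();    // writes this thread's own copy

    #pragma omp parallel
    printf("thread %d still sees %d\n", omp_get_thread_num(), counter);
    return 0;
}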

Variable Sharing among Threads

Shared:
Heap-allocated storage
Static data members
const variables (with no mutable members)
Private:
auto variables declared in a scope inside the construct
Loop variable in a for construct
private to the construct

Others are shared unless declared private

You can change the default

Arguments passed by reference inherit from the original
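A sketch illustrating these defaults, using default(none) so that every variable's sharing must be stated explicitly (heap, n, tid are illustrative names):

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    int n = 8;
    double *heap = malloc(n * sizeof *heap);   // heap storage: shared
    #pragma omp parallel default(none) shared(heap, n)
    {
        int tid = omp_get_thread_num();        // declared inside: private
        #pragma omp for
        for (int i = 0; i < n; i++)            // loop variable: private
            heap[i] = tid;
    }
    for (int i = 0; i < n; i++) printf("%g ", heap[i]);
    printf("\n");
    free(heap);
    return 0;
}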

Relaxed Consistency
Unsynchronized access:
If two threads write to the same shared variable, the result is undefined
If a thread reads and another writes, the read value is undefined

Memory atom size is implementation dependent

Flush x, y, z .. enforces consistency. The spec says:
If the intersection of the flush-sets of two flushes performed by
two different threads is non-empty, then the two flushes must be
completed as if in some sequential order, seen by all threads.
If the intersection of the flush-sets of two flushes performed by
one thread is non-empty, then the two flushes must appear to be
completed in that thread's program order.

Example: T1 writes -> T1 flushes -> T2 flushes -> T2 reads; the two flushes are seen in the same order by all threads, so T2 reads the value T1 wrote.

Beware of Compiler Re-ordering

// initially a = b = 0

thread 1:
b = 1;
if (a == 0) {
    critical section
}

thread 2:
a = 1;
if (b == 0) {
    critical section
}


Beware of Compiler Re-ordering

// initially a = b = 0

thread 1:
b = 1;
flush(b); flush(a);
if (a == 0) {
    critical section
}

thread 2:
a = 1;
flush(a); flush(b);
if (b == 0) {
    critical section
}

Two separate flushes have disjoint flush-sets, so the compiler may still re-order them; both threads can still enter the critical section.


Beware of Compiler Re-ordering

// initially a = b = 0

thread 1:
b = 1;
flush(b,a);
if (a == 0) {
    critical section
}

thread 2:
a = 1;
flush(a,b);
if (b == 0) {
    critical section
}

A single flush of both variables cannot be split or re-ordered, so at most one thread can now enter the critical section.

Thread Control

Environment Variable    Ways to modify value    Way to retrieve value    Initial value
OMP_NUM_THREADS *       omp_set_num_threads     omp_get_max_threads      Implementation defined
OMP_DYNAMIC             omp_set_dynamic         omp_get_dynamic          Implementation defined
OMP_NESTED              omp_set_nested          omp_get_nested           false
OMP_SCHEDULE *                                                           Implementation defined

* Also see construct clauses: num_threads, schedule
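A small sketch of reading and adjusting these controls from code (the printed values depend on the implementation and environment):

#include <omp.h>
#include <stdio.h>

int main(void) {
    printf("max threads: %d\n", omp_get_max_threads());
    printf("dynamic:     %d\n", omp_get_dynamic());
    printf("nested:      %d\n", omp_get_nested());

    omp_set_num_threads(4);      // same effect as OMP_NUM_THREADS=4
    omp_set_dynamic(0);          // ask for exactly the requested team size

    #pragma omp parallel
    {
        #pragma omp single
        printf("team size:   %d\n", omp_get_num_threads());
    }
    return 0;
}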


Parallel Construct

#pragma omp parallel \
    if(boolean) \
    private(var1, var2, var3) \
    firstprivate(var1, var2, var3) \
    default(private | shared | none) \
    shared(var1, var2) \
    copyin(var1, var2) \
    reduction(operator: list) \
    num_threads(n)
{
}

Restrictions:
Cannot branch in or out
No side effects from clauses: must not depend on any ordering of the evaluations
At most one if clause
At most one num_threads clause
num_threads must be a positive integer
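A hedged sketch combining several of these clauses (work() is a placeholder function, not part of the slides):

#include <omp.h>

extern double work(int i);   // placeholder

void run(int n, double *a) {
    double scale = 2.0;
    // parallelize only when the problem is big enough
    #pragma omp parallel if(n > 1000) num_threads(4) \
            default(none) shared(a, n) firstprivate(scale)
    {
        int tid = omp_get_thread_num();
        int nth = omp_get_num_threads();
        for (int i = tid; i < n; i += nth)   // manual cyclic partition
            a[i] = scale * work(i);
    }
}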

What's wrong?

int tid, size;
int numProcs = omp_get_num_procs();
#pragma omp parallel num_threads(numProcs)
{
    size = getProblemSize()/numProcs;   // assume divisible
    tid = omp_get_thread_num();
    doTask(tid*size, size);
}

doTask(int start, int count)
{   // Each thread's instance has its own activation record
    for (int i = 0, t = start; i < count; i++, t += 1)
        doit(t);
}


Declare locally (private)

int size;
int numProcs = omp_get_num_procs();
#pragma omp parallel num_threads(numProcs)
{
    size = getProblemSize()/numProcs;
    int tid = omp_get_thread_num();
    doTask(tid*size, size);
}

doTask(int start, int count)
{   // Each thread's instance has its own activation record
    for (int i = 0, t = start; i < count; i++, t += 1)
        doit(t);
}

Private clause

int tid, size;
int numProcs = omp_get_num_procs();
#pragma omp parallel num_threads(numProcs) private(tid)
{
    size = getProblemSize()/numProcs;
    tid = omp_get_thread_num();
    doTask(tid*size, size);
}

doTask(int start, int count)
{   // Each thread's instance has its own activation record
    for (int i = 0, t = start; i < count; i++, t += 1)
        doit(t);
}

Parallel Loop

#pragma omp parallel for
for (i = 0; i < N; ++i) {
    blah
}

Number of iterations must be known when the construct is encountered
Must be the same for each encountering thread

Compiler puts a barrier at the end of parallel for

But see nowait
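A sketch of nowait: it applies to a work-sharing for inside a parallel region and removes the barrier after that loop, so the following loop must not depend on it (array names are illustrative):

#include <omp.h>

void twoLoops(int n, double *a, double *b) {
    #pragma omp parallel
    {
        #pragma omp for nowait      // no barrier after this loop
        for (int i = 0; i < n; i++)
            a[i] = 2.0 * i;

        #pragma omp for             // implicit barrier after this loop
        for (int i = 0; i < n; i++)
            b[i] = 3.0 * i;         // independent of a[], so nowait above is safe
    }
}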


Parallel For Construct

#pragma omp for \
    private(var1, var2, var3) \
    firstprivate(var1, var2, var3) \
    lastprivate(var1, var2) \
    reduction(operator: list) \
    ordered \
    schedule(kind[, chunk_size]) \
    nowait

Canonical for loop
No break

Restrictions:
Same loop control expression for all threads in the team
At most one schedule, nowait, ordered clause
chunk_size must be a loop/construct-invariant, positive integer
ordered clause required if any ordered region is inside

Firstprivate and Lastprivate

Initial value of a private variable is unspecified
firstprivate initializes the copies with the original
Once per thread (not once per iteration)
Original exists before the construct

The original copy lives on after the construct

lastprivate forces sequential-like behavior
The thread executing the sequentially last iteration (or last listed section) writes to the original copy

Firstprivate and Lastprivate

#pragma omp parallel for firstprivate(simple)
for (int i = 0; i < N; i++) {
    simple += a[f1(i, omp_get_thread_num())];
    f2(simple);
}

#pragma omp parallel for lastprivate(doneEarly)
for (i = 0; i < N; i++) {
    doneEarly = f0(i);
}

Private clause
Remember this code?

int tid, size;
int numProcs = omp_get_num_procs();
#pragma omp parallel num_threads(numProcs) private(tid)
{
    size = getProblemSize()/numProcs;
    tid = omp_get_thread_num();
    doTask(tid*size, size);
}

doTask(int start, int count)
{   // Each thread's instance has its own activation record
    for (int i = 0, t = start; i < count; i++, t += 1)
        doit(t);
}


Work Sharing for

#pragma omp parallel for
for (int i = 0; i < problemSize; i++)
    doit(i);

Works even if the number of tasks is not divisible by the number of threads


Schedule(kind[, chunk_size])
Divide iterations into contiguous sets, chunks
Chunks are assigned transparently to threads

static: chunks are assigned in a round-robin fashion

default chunk_size is roughly #iterations/num_threads

dynamic: chunks are assigned to threads as requested

default chunk_size is 1

guided: dynamic, with chunk size proportional to #unassigned iterations divided by num_threads
chunk size is at least chunk_size iterations (except the last)
default chunk_size is 1

runtime: taken directly from the environment variable
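A sketch contrasting static and dynamic scheduling on an imbalanced loop (heavy() is a placeholder whose cost grows with i, so dynamic with a small chunk usually balances better):

#include <omp.h>

extern double heavy(int i);   // placeholder: cost grows with i

void imbalanced(int n, double *out) {
    // static: iteration space split into ~equal contiguous chunks up front
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n; i++)
        out[i] = heavy(i);

    // dynamic: threads grab chunks of 8 iterations as they finish earlier ones
    #pragma omp parallel for schedule(dynamic, 8)
    for (int i = 0; i < n; i++)
        out[i] = heavy(i);
}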

Reductions
Reductions are common
scalar = f(v1 .. vn)

Specify the reduction operation and variable

OpenMP code combines results from the loop
Stores partial results in private variables

reduction Clause
reduction(<op> : <variable>)

  +     Sum
  *     Product
  &     Bitwise and
  |     Bitwise or
  ^     Bitwise exclusive or
  &&    Logical and
  ||    Logical or

Add to parallel for

OpenMP creates a loop to combine copies of the variable
The resulting loop may not be parallel
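A sketch of a sum reduction over an array:

#include <omp.h>
#include <stdio.h>

int main(void) {
    enum { N = 1000 };
    double a[N], sum = 0.0;
    for (int i = 0; i < N; i++) a[i] = i;

    // each thread accumulates into a private copy of sum;
    // the copies are combined with + at the end of the loop
    #pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < N; i++)
        sum += a[i];

    printf("sum = %g\n", sum);   // 499500
    return 0;
}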

Single Construct
#pragma omp parallel
{
    #pragma omp for
    for (int i = 0; i < N; i++) a[i] = f0(i);

    #pragma omp single
    x = f1(a);

    #pragma omp for
    for (int i = 0; i < N; i++) b[i] = x * f2(i);
}

Only one of the threads executes

Other threads wait for it
unless NOWAIT is specified

Hidden complexity

Threads may not hit single

Sections Construct
#pragma omp sections
{
    #pragma omp section
    {
        // do this
    }
    #pragma omp section
    {
        // do that
    }
    // ...
}

The omp section pragma must be closely nested in a sections construct, where no other work-sharing construct may appear.

Other Synchronization Directives

#pragma omp master
{
}
Binds to the innermost enclosing parallel region
Only the master executes
No implied barrier

Master Directive
#pragma omp parallel
{
    #pragma omp for
    for (int i = 0; i < 100; i++) a[i] = f0(i);

    #pragma omp master
    x = f1(a);
}
Only the master executes.
No synchronization.

Critical Section
#pragma omp critical (accessBankBalance)
{
}
A single thread at a time
through all regions of the same name

Applies to all threads

The name is optional
Anonymous = global critical region
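A sketch with a named critical region protecting a shared update (the balance variable is illustrative):

#include <omp.h>
#include <stdio.h>

int main(void) {
    double balance = 0.0;
    #pragma omp parallel for
    for (int i = 0; i < 1000; i++) {
        // all regions named accessBankBalance exclude one another
        #pragma omp critical (accessBankBalance)
        balance += 1.0;
    }
    printf("balance = %g\n", balance);   // 1000
    return 0;
}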

Barrier Directive
#pragma omp barrier

Stand-alone
Binds to the innermost parallel region
All threads in the team must execute it

They will all wait for each other at this instruction

Dangerous:

if (!ready)
    #pragma omp barrier

Same sequence of work-sharing and barriers for the entire team
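A sketch of legal use, in contrast to the dangerous pattern above: every thread reaches the same barrier, outside any conditional taken by only some threads (array names are illustrative):

#include <omp.h>
#include <stdio.h>

int main(void) {
    enum { N = 64 };
    static double stage1[N], stage2[N];
    #pragma omp parallel
    {
        #pragma omp for nowait
        for (int i = 0; i < N; i++) stage1[i] = i * i;

        #pragma omp barrier        // every thread in the team executes this

        #pragma omp for
        for (int i = 0; i < N; i++) stage2[i] = stage1[N - 1 - i];
    }
    printf("%g\n", stage2[0]);
    return 0;
}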

Ordered Directive
#pragma omp ordered
{
}
Binds to the innermost enclosing loop
The structured block is executed in sequential loop order
The loop must declare the ordered clause
Each iteration must encounter at most one ordered region

Flush Directive
#pragma omp flush (var1, var2)
Stand-alone, like barrier
Only directly affects the encountering thread
The list of vars ensures that any compiler re-ordering moves all flushes together
Implicit at:
barrier, atomic, critical, locks

Atomic Directive
#pragma omp atomic
i++;
Light-weight critical section
Only for some expressions

x binop= expr (no mutual exclusion on the evaluation of expr)

x++
++x
x--
--x
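A sketch using atomic for a shared counter update, which is cheaper than a critical region for this single-statement case:

#include <omp.h>
#include <stdio.h>

int main(void) {
    int hits = 0;
    #pragma omp parallel for
    for (int i = 0; i < 100000; i++) {
        if (i % 3 == 0) {
            #pragma omp atomic      // protects only the read-modify-write of hits
            hits++;
        }
    }
    printf("hits = %d\n", hits);    // 33334
    return 0;
}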

Helper Functions: General

void omp_set_dynamic(int);
int  omp_get_dynamic();

void omp_set_nested(int);
int  omp_get_nested();

int  omp_get_num_procs();
int  omp_get_num_threads();
int  omp_get_thread_num();
int  omp_get_ancestor_thread_num(int level);
double omp_get_wtime();
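A sketch of timing a parallel loop with omp_get_wtime:

#include <omp.h>
#include <stdio.h>

int main(void) {
    enum { N = 1 << 20 };
    static double a[N];

    double t0 = omp_get_wtime();
    #pragma omp parallel for
    for (int i = 0; i < N; i++)
        a[i] = (double)i / N;
    double t1 = omp_get_wtime();

    printf("loop took %f s on up to %d threads\n", t1 - t0, omp_get_max_threads());
    return 0;
}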

Helper Functions: Mutex

void omp_init_lock(omp_lock_t *);
void omp_destroy_lock(omp_lock_t *);

void omp_set_lock(omp_lock_t *);
void omp_unset_lock(omp_lock_t *);
int  omp_test_lock(omp_lock_t *);

Nested lock versions:
e.g., omp_set_nest_lock(omp_nest_lock_t *);
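A sketch of the lock lifecycle (init, set/unset, destroy):

#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_lock_t lock;
    int total = 0;
    omp_init_lock(&lock);

    #pragma omp parallel for
    for (int i = 0; i < 100; i++) {
        omp_set_lock(&lock);     // blocks until the lock is available
        total += i;              // protected update of shared total
        omp_unset_lock(&lock);   // release promptly
    }

    omp_destroy_lock(&lock);
    printf("total = %d\n", total);   // 4950
    return 0;
}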

Nesting Restrictions
A critical region may never be nested inside a critical region with the same name
Not sufficient to prevent deadlock
Not allowed without an intervening parallel region:
Work-sharing or barrier regions inside work-sharing, critical, ordered, or master regions
master regions inside a work-sharing region
ordered regions inside a critical region

EXAMPLES


Ordered Construct
int i;
#pragma omp for ordered
for (i = 0; i < n; i++) {
    if (isGroupA(i)) {
        #pragma omp ordered
        doit(i);
    }
    else {
        #pragma omp ordered
        doit(partner(i));
    }
}

Wrong Use of Multiple Orders

int i;
#pragma omp for ordered
for (i = 0; i < n; i++) {
    #pragma omp ordered
    doit(i);
    #pragma omp ordered      // error: second ordered region in the same iteration
    doit(partner(i));
}

OpenMP Matrix Multiply

#pragma omp parallel for
for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++) {
        c[i][j] = 0.0;
        for (int k = 0; k < n; k++)
            c[i][j] += a[i][k]*b[k][j];
    }

a, b, c are shared
i, j, k are private

OpenMP Matrix Multiply

#pragma omp parallel for
for (int i = 0; i < n; i++)
    #pragma omp parallel for
    for (int j = 0; j < n; j++) {
        c[i][j] = 0.0;
        for (int k = 0; k < n; k++)
            c[i][j] += a[i][k]*b[k][j];
    }

OpenMP Matrix Multiply

#pragma omp parallel for
for (int i = 0; i < n; i++)
    #pragma omp parallel for
    for (int j = 0; j < n; j++) {
        double sum = 0.0;
        #pragma omp parallel for reduction(+:sum)
        for (int k = 0; k < n; k++)
            sum += a[i][k]*b[k][j];
        c[i][j] = sum;
    }

OpenMP Matrix Multiply: Triangular

#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < n; i++)
    for (int j = i; j < n; j++) {
        c[i][j] = 0.0;
        for (int k = i; k < n; k++)
            c[i][j] += a[i][k]*b[k][j];
    }

This multiplies an upper-triangular matrix A with B

Unbalanced workload
The schedule improves this

OpenMP Jacobi
for some number of timesteps/iterations {
    #pragma omp parallel for
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            temp[i][j] = 0.25 *
                ( grid[i-1][j] + grid[i+1][j] +
                  grid[i][j-1] + grid[i][j+1] );

    #pragma omp parallel for
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            grid[i][j] = temp[i][j];
}

This could be improved by using just one parallel region

The implicit barrier after the loops eliminates the race on grid
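A sketch of the single-parallel-region variant the last bullet suggests: the two loops of one sweep share a team, and the implicit barrier after the first omp for keeps the copy-back from racing with the update. This sketch updates interior points only, assuming fixed boundary rows and columns.

// Sketch: merge the two parallel regions of one Jacobi sweep into one.
void jacobiSweeps(int n, int steps, double grid[n][n], double temp[n][n]) {
    for (int t = 0; t < steps; t++) {
        #pragma omp parallel
        {
            #pragma omp for
            for (int i = 1; i < n - 1; i++)
                for (int j = 1; j < n - 1; j++)
                    temp[i][j] = 0.25 * ( grid[i-1][j] + grid[i+1][j] +
                                          grid[i][j-1] + grid[i][j+1] );
            // implicit barrier here: all updates finish before the copy-back
            #pragma omp for
            for (int i = 1; i < n - 1; i++)
                for (int j = 1; j < n - 1; j++)
                    grid[i][j] = temp[i][j];
        }
    }
}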

Find the Error

Assume: variables declared, locks initialized

#pragma omp parallel shared(a, b, nthreads, locka, lockb) private(tid)
{
    #pragma omp sections nowait
    {
        #pragma omp section
        {
            omp_set_lock(&locka);
            for (i = 0; i < N; i++)
                a[i] = i * DELTA;
            omp_set_lock(&lockb);
            for (i = 0; i < N; i++)
                b[i] += a[i];
            omp_unset_lock(&lockb);
            omp_unset_lock(&locka);
        }
        #pragma omp section
        {
            omp_set_lock(&lockb);
            for (i = 0; i < N; i++)
                b[i] = i * PI;
            omp_set_lock(&locka);
            for (i = 0; i < N; i++)
                a[i] += b[i];
            omp_unset_lock(&locka);
            omp_unset_lock(&lockb);
        }
    } /* end of sections */
}


Find the Error

void workSum(float *x, float *y, int *index, int n) {
    int i;
    #pragma omp parallel for shared(x, y, index, n)
    for (i = 0; i < n; i++) {
        #pragma omp atomic
        x[index[i]] += work1(i);
        y[i] += work2(i);
    }
    int work0 = x[0];
}

nowait

Efficiency Issues
Minimize synchronization
Avoid BARRIER, CRITICAL, ORDERED, and locks
Use NOWAIT
Use named CRITICAL sections for fine-grained locking
Use MASTER (instead of SINGLE)
Parallelize at the highest level possible
such as outer FOR loops
keep parallel regions large
FLUSH is expensive
LASTPRIVATE has synchronization overhead
Thread-safe malloc/free are expensive
Reduce false sharing (see the sketch below)
Design of data structures
Use PRIVATE
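A sketch of the false-sharing point: per-thread counters packed into one cache line force that line to bounce between cores, while padding (or a private variable plus a reduction) avoids it. The 64-byte line size, the MAX_THREADS bound, and isEvent() are assumptions for illustration.

#include <omp.h>

#define MAX_THREADS 64   // assumed upper bound on team size

// Each counter padded to an assumed 64-byte cache line so that
// different threads' counters never share a line (no false sharing).
struct padded_counter {
    long value;
    char pad[64 - sizeof(long)];
};

long countEvents(int n, int (*isEvent)(int)) {
    struct padded_counter counts[MAX_THREADS] = {{0}};

    #pragma omp parallel
    {
        int tid = omp_get_thread_num();   // assumed < MAX_THREADS
        #pragma omp for
        for (int i = 0; i < n; i++)
            if (isEvent(i))
                counts[tid].value++;      // touches only this thread's line
    }

    long total = 0;
    for (int t = 0; t < MAX_THREADS; t++)
        total += counts[t].value;
    return total;
}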

Common SMP Errors

Synchronization
Race condition
depends on timing
Deadlock
waiting for a non-existent condition
Livelock
continuously adjusting, but task progress stalled
Try to
Avoid nested locks
Release locks religiously
Avoid "while true" (especially during testing)
Be careful with
Non thread-safe libraries
Concurrent access to shared data
IO inside parallel regions
Differing views of shared memory (FLUSH)
NOWAIT
