Parallel Processing1 Parallel Processing (CS 667) Lecture 4: Shared Memory Programming with Pthreads...
-
Upload
abraham-washington -
Category
Documents
-
view
217 -
download
0
Transcript of Parallel Processing1 Parallel Processing (CS 667) Lecture 4: Shared Memory Programming with Pthreads...
Parallel Processing 1
Parallel Processing (CS 667)
Lecture 4: Shared Memory Programming with Pthreads*
Jeremy R. Johnson
*Some of this lecture was derived from Pthreads Programming by Nichols, Buttlar, and Farrell and POSIX Threads Programming Tutorial (computing.llnl.gov/tutorials/pthreads) by Blaise Barney
Parallel Processing 2
Introduction• Objective: To learn how to write parallel programs using threads (using the
Pthreads library) and to understand the execution model of threads vs. processes.
• Topics– Concurrent programming with UNIX Processes
– Introduction to shared memory parallel programming with Pthreads• Threads• fork/join• race conditions• Synchronization• performance issues - synchronization overhead, contention and granularity, load balance, cache
coherency and false sharing.
– Introduction to parallel program design paradigms• Data parallelism (static scheduling)• Task parallelism with workers• Divide and conquer parallelism (fork/join)
Processes
• Processes contain information about program resources and program execution state
– Process ID, process group ID, user ID, and group ID– Environment– Working directory– Program instructions– Registers– Stack– Heap– File descriptors– Signal actions– Shared libraries– Inter-process communication tools (such as message queues, pipes,
semaphores, or shared memory).
Parallel Processing 3
UNIX Process
Parallel Processing 4
Threads
• An independent stream of instructions that can be scheduled to run
– Stack pointer– Registers (program counter)– Scheduling properties (such as policy or priority)– Set of pending and blocked signals– Thread specific data
• “lightweight process”– Cost of creating and managing threads much less than processes– Threads live within a process and share process resources such as
address space
• Pthreads – standard thread API (IEEE Std 1003.1)
Parallel Processing 5
Threads within a UNIX Process
Parallel Processing 6
Shared Memory Model
• All threads have access to the same global, shared memory
• All threads within a process share the same address space
• Threads also have their own private data
• Programmers are responsible for synchronizing access (protecting) globally shared data.
Parallel Processing 7
Simple Example
/* Routines defined elsewhere: two work steps, each handed the address of an
 * int, and a final step that receives both ints by value. */
void do_one_thing(int *);
void do_another_thing(int *);
void do_wrap_up(int, int);

/* Globals passed by address to the two work steps. */
int r1 = 0;
int r2 = 0;

/* Sequential version: run the two steps one after the other, then wrap up. */
int main(void)
{
    do_one_thing(&r1);
    do_another_thing(&r2);

    do_wrap_up(r1, r2);
    return 0;
}
Parallel Processing 8
Parallel Processing 9
do_another_thing() i j k--------------------------------------main()
main()--------do_one_thing() --------do_another_thing()---------
r1r2
SPPCGP0GP1…
PIDUIDGID
Open FilesLocksSockets…
Stack
Text
Data
Heap
Registers
Identity
Resources
Virtual Address Space
Simple Example (Processes)
int shared_mem_id, *shared_mem_ptr;
int *r1p, *r2p;

/*
 * Detach and remove the shared memory segment.
 *
 * A System V segment created with shmget() is not released when the
 * creating process exits; it stays allocated until it is removed with
 * IPC_RMID. Failures are reported but not fatal because this runs on the
 * way out of the program.
 */
static void release_shared_memory(void)
{
    if (shmdt(shared_mem_ptr) == -1)
        perror("shmdt");
    if (shmctl(shared_mem_id, IPC_RMID, NULL) == -1)
        perror("shmctl");
}

/*
 * Process version of the simple example.
 *
 * Creates a two-int shared memory segment, forks one child to run
 * do_one_thing() on the first int and another child to run
 * do_another_thing() on the second, waits for both, and then passes the
 * two values to do_wrap_up(). Exits with status 1 on any system call
 * failure; returns 0 otherwise.
 */
extern int main(void)
{
    pid_t child1_pid, child2_pid;
    int status;
    void *segment;

    /* initialize shared memory segment */
    shared_mem_id = shmget(IPC_PRIVATE, 2*sizeof(int), 0660);
    if (shared_mem_id == -1) {
        perror("shmget");
        exit(1);
    }
    segment = shmat(shared_mem_id, (void *)0, 0);
    if (segment == (void *)-1) {
        perror("shmat failed");
        /* Nothing is attached yet, so only the identifier has to go. */
        if (shmctl(shared_mem_id, IPC_RMID, NULL) == -1)
            perror("shmctl");
        exit(1);
    }
    shared_mem_ptr = (int *)segment;

    r1p = shared_mem_ptr;
    r2p = (shared_mem_ptr + 1);
    *r1p = 0;
    *r2p = 0;

    if ((child1_pid = fork()) == 0) {
        /* first child */
        do_one_thing(r1p);
        return 0;
    } else if (child1_pid == -1) {
        perror("fork");
        release_shared_memory();
        exit(1);
    }

    /* parent */
    if ((child2_pid = fork()) == 0) {
        /* second child */
        do_another_thing(r2p);
        return 0;
    } else if (child2_pid == -1) {
        perror("fork");
        /* Reap the first child before tearing the segment down. */
        if (waitpid(child1_pid, &status, 0) == -1)
            perror("waitpid");
        release_shared_memory();
        exit(1);
    }

    /* parent */
    if (waitpid(child1_pid, &status, 0) == -1) {
        perror("waitpid");
        release_shared_memory();
        exit(1);
    }
    if (waitpid(child2_pid, &status, 0) == -1) {
        perror("waitpid");
        release_shared_memory();
        exit(1);
    }

    /* Read the results before the segment is detached. */
    do_wrap_up(*r1p, *r2p);
    release_shared_memory();
    return 0;
}
Parallel Processing 12
do_one_thing() i j k---------------------------main()
main()--------do_one_thing() --------do_another_thing()---------
SPPCGP0GP1…
PIDUIDGID
Open FilesLocksSockets
…
Stack
Text
Data
Heap
Registers
Identity
Resources
Virtual Address Space
do_another_thing() i j k---------------------------main()
main()--------do_one_thing() --------do_another_thing()---------
SPPCGP0GP1…
PIDUIDGID
Open FilesLocksSockets
…
Stack
Text
Data
Heap
Registers
Identity
Resources
Virtual Address Space
Shared Memory
Simple Example (PThreads)
int r1 = 0, r2 = 0;

/*
 * Pthreads version of the simple example.
 *
 * Starts one thread running do_one_thing(&r1) and another running
 * do_another_thing(&r2), joins both, and then calls do_wrap_up(r1, r2).
 * Exits with status 1 if a thread cannot be created or joined.
 *
 * pthread_create() and pthread_join() return the error number directly and
 * do not set errno, so the returned code is printed instead of calling
 * perror().
 */
extern int
main(void)
{
    pthread_t thread1, thread2;
    int rc;

    /* The start routines are cast to the pthread start-routine type, as the
     * other examples in this file do. */
    rc = pthread_create(&thread1, NULL,
                        (void *(*)(void *)) do_one_thing, (void *) &r1);
    if (rc != 0) {
        fprintf(stderr, "pthread_create error %d\n", rc);
        exit(1);
    }

    rc = pthread_create(&thread2, NULL,
                        (void *(*)(void *)) do_another_thing, (void *) &r2);
    if (rc != 0) {
        fprintf(stderr, "pthread_create error %d\n", rc);
        exit(1);
    }

    rc = pthread_join(thread1, NULL);
    if (rc != 0) {
        fprintf(stderr, "pthread_join error %d\n", rc);
        exit(1);
    }

    rc = pthread_join(thread2, NULL);
    if (rc != 0) {
        fprintf(stderr, "pthread_join error %d\n", rc);
        exit(1);
    }

    /* Both threads have finished, so r1 and r2 are final here. */
    do_wrap_up(r1, r2);
    return 0;
}
Parallel Processing 14
do_another_thing() i j k--------------------------------------main()
main()--------do_one_thing() --------do_another_thing()-----------------r1r2
SPPCGP0GP1…
PIDUIDGID
Open FilesLocksSockets…
Stack
Text
Data
Heap
Registers
Identity
Resources
Virtual Address Space
do_another_thing() i j k--------------------------------------main()
Stack
SPPCGP0GP1…
Registers
Thread 1
Thread 2
Concurrency and Parallelism
Parallel Processing 15
Time
do_one_thing()do_another_thing() do_wrap_up()
do_one_thing() do_another_thing() do_wrap_up()
do_one_thing()
do_another_thing()
do_wrap_up()
Unix Fork
• The fork() call
– Creates a child process that is identical to the parent process
– The child has its own PID
– The fork() call provides different return values to the parent [child’s PID] and the child [0]
Parallel Processing 16
Parallel Processing 17
--------fork()-----------------
PID = 7274
--------fork()-----------------
PID = 7274
--------fork()-----------------
PID = 7275
fork
Parent
Child
Thread Creation
• pthread_create creates a new thread and makes it executable
– pthread_create (thread,attr,start_routine,arg) • thread - unique identifier
• attr – attribute
• start_routine – the routine the newly created thread will execute
• arg – a single argument passed to start_routine
Parallel Processing 18
Thread Creation
• Once created, threads are peers, and may create other threads
Parallel Processing 19
Thread Join
• "Joining" is one way to accomplish synchronization between threads.
• The pthread_join() subroutine blocks the calling thread until the specified threadid thread terminates.
Parallel Processing 20
Fork/Join Overhead
• Compare the overhead of procedure call, process fork/join, thread create/join
– Procedure call (no args)• $1.2 \times 10^{-8}$ sec (12 ns)
– Process• 0.0012 sec (1.2 ms)
– Thread• 0.000042 sec (42 µs)
Parallel Processing 21
Race Conditions
• When two or more threads access the same resource at the same time
Parallel Processing 22
Time
Thread 1 Thread 2 Balance
Withdraw $50 Withdraw $50
Read Balance $125 Read Balance $125
Set Balance $75 Set Balance $75
Bad Count
/* Shared counter incremented by every thread with no synchronization. */
int sum= 0;

/*
 * Thread body: increment the shared counter *arg times.
 *
 * The increment is deliberately unprotected. This is the race condition the
 * example demonstrates, so the final value of sum may be lower than
 * expected.
 */
void count(int *arg)
{
    int i;
    for (i=0;i<*arg;i++) {
        sum++;  /* unsynchronized read-modify-write on shared data */
    }
}

/*
 * Start NUMCOUNTERS threads that each run count() with a limit of LIMIT,
 * wait for all of them, then print the observed and the expected totals.
 * Exits with status 1 if a thread cannot be created or joined.
 */
int main(int argc, char **argv)
{
    int error,i;
    int numcounters = NUMCOUNTERS;
    int limit = LIMIT;
    pthread_t tid[NUMCOUNTERS];

    (void)argc;
    (void)argv;

    pthread_setconcurrency(numcounters);

    for (i=0;i<numcounters;i++)
    {
        error = pthread_create(&tid[i],NULL,(void *(*)(void *))count,&limit);
        if (error != 0) {
            /* Stop here: joining a tid that was never created is undefined. */
            fprintf(stderr, "pthread_create error %d\n", error);
            exit(1);
        }
    }
    for (i=0;i<numcounters;i++)
    {
        error = pthread_join(tid[i],NULL);
        if (error != 0) {
            fprintf(stderr, "pthread_join error %d\n", error);
            exit(1);
        }
    }

    printf("Counters finished with count = %d\n",sum);
    printf("Count should be %d X %d = %d\n",numcounters,limit,numcounters*limit);
    return 0;
}
Mutex
• Mutex variables are for protecting shared data when multiple writes occur.
• A mutex variable acts like a "lock" protecting access to a shared data resource. Only one thread can own (lock) a mutex at any given time
Parallel Processing 24
Mutex Operations
• pthread_mutex_lock (mutex) – The pthread_mutex_lock() routine is used by a thread to
acquire a lock on the specified mutex variable. If the mutex is already locked by another thread, this call will block the calling thread until the mutex is unlocked.
• pthread_mutex_unlock (mutex) – will unlock a mutex if called by the owning
thread. Calling this routine is required after a thread has completed its use of protected data if other threads are to acquire the mutex for their work with the protected data.
Parallel Processing 25
/* Good Count: the shared counter is protected by a mutex. */
int sum= 0;
pthread_mutex_t lock;

/*
 * Thread body: increment the shared counter *arg times, taking the mutex
 * around every single increment so no update is lost.
 */
void count(int *arg)
{
    int i;
    for (i=0;i<*arg;i++)
    {
        pthread_mutex_lock(&lock);
        sum++;
        pthread_mutex_unlock(&lock);
    }
}

/*
 * Start NUMCOUNTERS threads that each run count() with a limit of LIMIT,
 * wait for all of them, then print the observed and the expected totals.
 * Exits with status 1 if the thread count does not fit in tid[] or if a
 * pthread call fails.
 */
int main(int argc, char **argv)
{
    int error,i;
    int numcounters = NUMCOUNTERS;
    int limit = LIMIT;
    pthread_t tid[MAXCOUNTERS];

    (void)argc;
    (void)argv;

    if (numcounters > MAXCOUNTERS) {
        fprintf(stderr, "too many counters: %d (max %d)\n", numcounters, MAXCOUNTERS);
        exit(1);
    }

    pthread_setconcurrency(numcounters);

    error = pthread_mutex_init(&lock,NULL);
    if (error != 0) {
        fprintf(stderr, "pthread_mutex_init error %d\n", error);
        exit(1);
    }

    /* Index from 0: valid slots of tid[] are 0 .. MAXCOUNTERS-1. */
    for (i=0;i<numcounters;i++)
    {
        error = pthread_create(&tid[i],NULL,(void *(*)(void *))count, &limit);
        if (error != 0) {
            fprintf(stderr, "pthread_create error %d\n", error);
            exit(1);
        }
    }
    for (i=0;i<numcounters;i++)
    {
        error = pthread_join(tid[i],NULL);
        if (error != 0) {
            fprintf(stderr, "pthread_join error %d\n", error);
            exit(1);
        }
    }

    /* All threads are joined, so nobody can still hold the mutex. */
    pthread_mutex_destroy(&lock);

    printf("Counters finished with count = %d\n",sum);
    printf("Count should be %d X %d = %d\n",numcounters,limit,numcounters*limit);
    return 0;
}
Better Count
int sum = 0;
pthread_mutex_t lock;

/*
 * Thread body: count up to *arg in a private variable, then add that
 * private total to the shared sum in one locked update. The mutex is taken
 * once per call rather than once per increment.
 */
void count(int *arg)
{
    int tally = 0;
    int step = 0;

    /* Private work: no shared data is touched inside this loop. */
    while (step < *arg) {
        tally += 1;
        step += 1;
    }

    pthread_mutex_lock(&lock);
    sum += tally;
    pthread_mutex_unlock(&lock);
}
Parallel Processing 27
/* Linked List */
typedef struct llist_node {
    int index;                  /* sort key */
    void *datap;                /* payload; freed with free() when replaced */
    struct llist_node *nextp;   /* next node, or NULL at the end */
} llist_node_t;

typedef llist_node_t *llist_t;

/*
 * Store datap under key index in a list kept in ascending index order.
 *
 * If a node with this index already exists, its old payload is released
 * with free() and replaced by datap. Otherwise a new node is allocated and
 * linked in front of the first node whose index is larger, or at the end.
 *
 * This version takes no lock and is not safe for concurrent callers.
 *
 * Returns 0 on success, -1 if the new node cannot be allocated (the list is
 * left unchanged in that case).
 */
int llist_insert_data (int index, void *datap, llist_t *llistp)
{
    llist_node_t *cur, *prev, *node;

    for (cur = prev = *llistp; cur != NULL; prev = cur, cur = cur->nextp) {
        if (cur->index == index) {
            /* Re-storing the same pointer must not free it first. */
            if (cur->datap != datap)
                free(cur->datap);
            cur->datap = datap;
            return 0;
        }
        if (cur->index > index)
            break;
    }

    node = malloc(sizeof *node);
    if (node == NULL)
        return -1;

    node->index = index;
    node->datap = datap;
    node->nextp = cur;

    /* cur equals the head both for an empty list and for a front insert. */
    if (cur == *llistp)
        *llistp = node;
    else
        prev->nextp = node;

    return 0;
}
Race Conditions for Linked Lists
• When two or more threads insert at the same time, things can go awry
Parallel Processing 29
new 1 new 2
prev cur
Threadsafe Code
• Refers to an application's ability to execute multiple threads simultaneously without "clobbering" shared data or creating "race" conditions.
Parallel Processing 30
Threadsafe Linked List
/* List handle for the thread-safe list: a head pointer plus a mutex. */
typedef struct llist {
llist_node_t *first; /* head of the list; llist_init() sets it to NULL */
pthread_mutex_t mutex; /* initialized by llist_init() */
} llist_t;
/*
 * Set up an empty list: clear the head pointer and initialize the list's
 * mutex with default attributes. If the mutex cannot be initialized, the
 * error code is written to stderr and the program exits with status 1.
 * Returns 0 otherwise.
 */
int llist_init (llist_t *llistp)
{
    int status;

    llistp->first = NULL;

    status = pthread_mutex_init(&llistp->mutex, NULL);
    if (status != 0) {
        fprintf(stderr, "pthread_mutex_init error %d", status);
        exit(1);
    }

    return 0;
}
Parallel Processing 31
/*
 * Variant of llist_insert_data for the list type that carries a mutex: the
 * list's mutex is locked before the list is walked and unlocked before the
 * function returns. The body of the traversal is elided ("…") in this
 * excerpt.
 */
int llist_insert_data (int index, void *datap, llist_t *llistp)
{
llist_node_t *cur, *prev, *new;
int found = FALSE;
/* One lock for the whole list, held across the traversal. */
pthread_mutex_lock(&(llistp->mutex));
for (cur=prev=llistp->first; cur != NULL; prev=cur, cur=cur->nextp) {
… pthread_mutex_unlock(&(llistp->mutex));
return 0;
}
Access Patterns and Granularity
• Lock entire list (coarse grain) or lock individual nodes (fine grain)?
• Individual nodes allows more concurrency but incurs more overhead and is more difficult to program.
• Use readers/writers lock (allow multiple readers but exclusive writing)
Parallel Processing 32
Condition Variables
• While mutexes implement synchronization by controlling thread access to data, condition variables allow threads to synchronize based upon the actual value of data.
• Without condition variables, the programmer would need to have threads continually polling (possibly in a critical section), to check if the condition is met.
• A condition variable is a way to achieve the same goal without polling
• Always used with a mutex
Parallel Processing 33
Using Condition variables
Thread A
• Do work up to the point where a certain condition must occur (such as "count" must reach a specified value)
• Lock associated mutex and check value of a global variable
• Call pthread_cond_wait() to perform a blocking wait for signal from Thread-B. Note that a call to pthread_cond_wait() automatically and atomically unlocks the associated mutex variable so that it can be used by Thread-B.
• When signalled, wake up. Mutex is automatically and atomically locked.
• Explicitly unlock mutex
• Continue
Thread B
• Do work
• Lock associated mutex
• Change the value of the global variable that Thread-A is waiting upon.
• Check value of the global Thread-A wait variable. If it fulfills the desired condition, signal Thread-A.
• Unlock mutex.
• Continue
Parallel Processing 34
Condition Variable Example
/*
 * Thread body: block until the shared counter `count` has reached
 * COUNT_THRES.
 *
 * The predicate is tested with count_lock held. pthread_cond_wait()
 * releases the mutex while the thread sleeps and re-acquires it before
 * returning. The test sits in a while loop so the condition is checked
 * again after every wakeup.
 *
 * idp is accepted to match the thread start-routine signature and is not
 * used. Always returns NULL.
 */
void *watch_count(void *idp)
{
    (void)idp;

    pthread_mutex_lock(&count_lock);
    while (count < COUNT_THRES) {
        pthread_cond_wait(&count_hit_threshold, &count_lock);
    }
    pthread_mutex_unlock(&count_lock);

    return(NULL);
}
Parallel Processing 35
/*
 * Thread body: increment the shared counter `count` TCOUNT times.
 *
 * Each increment is done with count_lock held. When an increment makes the
 * counter exactly equal to COUNT_THRES, count_hit_threshold is signalled
 * while the mutex is still held.
 *
 * idp is accepted to match the thread start-routine signature and is not
 * used. Always returns NULL.
 */
void *inc_count(void *idp)
{
    int i;

    (void)idp;

    for (i=0; i<TCOUNT; i++) {
        pthread_mutex_lock(&count_lock);
        count++;
        if (count == COUNT_THRES) {
            pthread_cond_signal(&count_hit_threshold);
        }
        pthread_mutex_unlock(&count_lock);
    }

    return(NULL);
}
Reader/Writer Lock
/*
 * Reader/writer lock built from a mutex and a condition variable.
 * The counters are read and written only with `mutex` held.
 */
typedef struct rdwr_var {
    int readers_reading;        /* number of readers currently holding the lock */
    int writer_writing;         /* non-zero while a writer holds the lock */
    pthread_mutex_t mutex;      /* protects the two counters above */
    pthread_cond_t lock_free;   /* waited on until the lock can be taken */
} pthread_rdwr_t;

/* Attribute object type; the init routine accepts one but does not read it. */
typedef void * pthread_rdwrattr_t;

/* No trailing semicolon, so the macro can be used inside an expression or
 * as a function argument. */
#define pthread_rdwrattr_default NULL

/* Initialize a lock to the "no readers, no writer" state. */
int pthread_rdwr_init_np(pthread_rdwr_t *rdwrp, pthread_rdwrattr_t *attrp);
/* Acquire / release the lock for reading. */
int pthread_rdwr_rlock_np(pthread_rdwr_t *rdwrp);
int pthread_rdwr_runlock_np(pthread_rdwr_t *rdwrp);
/* Acquire / release the lock for writing. */
int pthread_rdwr_wlock_np(pthread_rdwr_t *rdwrp);
int pthread_rdwr_wunlock_np(pthread_rdwr_t *rdwrp);
Parallel Processing 36
Reader/Writer Lock
int llist_insert_data (int index, void *datap, llist_t *llistp)
{
…
pthread_rdwr_wlock_np(&(llistp->rwlock));
…
pthread_rdwr_wunlock_np(&(llistp->rwlock));
return 0;
}
/*
 * Reader-side list operation: the list's reader/writer lock is taken in
 * read mode around the work and released before returning. The work itself
 * is elided ("…") in this excerpt.
 */
int llist_find_data(int index, void **datapp, llist_t *llistp)
{
…
/* Take the lock for reading. */
pthread_rdwr_rlock_np(&(llistp->rwlock));
…
/* Drop the read lock before returning. */
pthread_rdwr_runlock_np(&(llistp->rwlock));
return 0;
}
Parallel Processing 37
Reader/Writer Lock Init
/*
 * Initialize a reader/writer lock to the unlocked state: no readers, no
 * writer, and a freshly initialized mutex and condition variable (both with
 * default attributes).
 *
 * attrp is accepted for interface symmetry and is not read.
 *
 * Returns 0 on success. If the mutex or condition variable cannot be
 * initialized, returns the non-zero error number from the failing pthread
 * call. A mutex that was already initialized is destroyed again so nothing
 * is left half set up.
 */
int pthread_rdwr_init_np(pthread_rdwr_t *rdwrp, pthread_rdwrattr_t *attrp)
{
    int rc;

    (void)attrp;

    rdwrp->readers_reading = 0;
    rdwrp->writer_writing = 0;

    rc = pthread_mutex_init(&(rdwrp->mutex), NULL);
    if (rc != 0)
        return rc;

    rc = pthread_cond_init(&(rdwrp->lock_free), NULL);
    if (rc != 0) {
        pthread_mutex_destroy(&(rdwrp->mutex));
        return rc;
    }

    return 0;
}
Parallel Processing 38
Read Lock
/*
 * Acquire the lock for reading.
 *
 * Waits on lock_free for as long as a writer holds the lock, then records
 * one more active reader. Other readers do not block this call.
 * Always returns 0.
 */
int pthread_rdwr_rlock_np(pthread_rdwr_t *rdwrp)
{
    pthread_mutex_t *guard = &rdwrp->mutex;

    pthread_mutex_lock(guard);

    /* Re-check the writer flag after every wakeup. */
    for (;;) {
        if (!rdwrp->writer_writing)
            break;
        pthread_cond_wait(&rdwrp->lock_free, guard);
    }

    rdwrp->readers_reading += 1;
    pthread_mutex_unlock(guard);

    return 0;
}
Parallel Processing 39
Read Unlockint pthread_rdwr_runlock_np(pthread_rdwr_t *rdwrp)
{
pthread_mutex_lock(&(rdwrp->mutex));
if (rdwrp->readers_reading == 0) {
pthread_mutex_unlock(&(rdwrp->mutex));
return -1;
}
else {
rdwrp->readers_reading--;
if (rdwrp->readers_reading == 0) {
pthread_cond_signal(&(rdwrp->lock_free));
}
pthread_mutex_unlock(&(rdwrp->mutex));
return 0;
}
}
Parallel Processing 40
Write Lock
/*
 * Acquire the lock for writing.
 *
 * Waits on lock_free until there is neither an active writer nor any active
 * reader, then marks the lock as held by a writer. Always returns 0.
 */
int pthread_rdwr_wlock_np(pthread_rdwr_t *rdwrp)
{
    pthread_mutex_t *guard = &rdwrp->mutex;

    pthread_mutex_lock(guard);

    /* Re-check both counters after every wakeup. */
    for (;;) {
        if (!rdwrp->writer_writing && !rdwrp->readers_reading)
            break;
        pthread_cond_wait(&rdwrp->lock_free, guard);
    }

    rdwrp->writer_writing += 1;
    pthread_mutex_unlock(guard);

    return 0;
}
Parallel Processing 41
Write Unlock
/*
 * Release the write lock.
 *
 * Returns -1 without changing anything if no writer holds the lock.
 * Otherwise clears the writer flag, broadcasts on lock_free so every
 * waiting thread re-checks its condition, and returns 0.
 */
int pthread_rdwr_wunlock_np(pthread_rdwr_t *rdwrp)
{
    int result = -1;

    pthread_mutex_lock(&rdwrp->mutex);

    if (rdwrp->writer_writing != 0) {
        rdwrp->writer_writing = 0;
        pthread_cond_broadcast(&rdwrp->lock_free);
        result = 0;
    }

    pthread_mutex_unlock(&rdwrp->mutex);
    return result;
}
Parallel Processing 42
Parallel Programming
• Task parallelism vs. data parallelism
• Fork/join parallelism (divide & conquer)
• Static scheduling
• Dynamic scheduling with workers
Parallel Processing 43
Sequential Count
int X[MAXSIZE];

/*
 * Sequential sum of X[l] through X[u], both ends included.
 * Returns 0 when l is greater than u.
 */
int icount(int l,int u)
{
    int total = 0;
    int pos = l;

    while (pos <= u) {
        total += X[pos];
        pos++;
    }
    return total;
}
Parallel Processing 44
/*
 * Recursive (divide and conquer) sum of X[l] through X[u], both ends
 * included.
 *
 * The range is split at its midpoint and the two halves are summed
 * recursively. An empty range (u < l) yields 0, the same result icount()
 * gives; without that guard the recursion would never reach its base case.
 */
int rcount(int l,int u)
{
    int m;
    int y1,y2;

    if (u < l)
        return 0;
    if (u == l)
        return X[l];

    /* l + (u-l)/2 always lies in [l, u-1] here and cannot overflow the way
     * (l+u)/2 can. */
    m = l + (u - l)/2;
    y1 = rcount(l,m);
    y2 = rcount(m+1,u);
    return (y1 + y2);
}
Counting with a Parallel Loop
/* Shared state for the parallel-loop count. */
int sum= 0; /* grand total; count() adds its partial sum to it under lock */
int numcounters; /* stride count() uses when stepping through X */
int size; /* exclusive upper bound of the indices count() visits */
pthread_mutex_t lock; /* held by count() while it updates sum */
Parallel Processing 45
void count(int *id)
{
int i,lsum;
lsum = 0;
for (i=*id;i<size;i+=numcounters)
{
lsum = lsum + X[i];
}
pthread_mutex_lock(&lock);
sum = sum + lsum;
pthread_mutex_unlock(&lock);
}
Counting with Workers
/*
 * Hand out the next chunk of work as the half-open range [*start, *stop).
 *
 * With task_lock held, the chunk begins at task_index and spans task_chunk
 * elements, clipped so it never extends past n. task_index is then advanced
 * to the end of the chunk.
 */
void get_task(int *start, int *stop)
{
    int first;
    int last;

    pthread_mutex_lock(&task_lock);

    first = task_index;
    last = (first + task_chunk > n) ? n : first + task_chunk;

    *start = first;
    *stop = last;
    task_index = last;

    pthread_mutex_unlock(&task_lock);
}
Parallel Processing 46
/*
 * Worker body for dynamic scheduling.
 *
 * Keeps requesting chunks from get_task() and summing the elements of X in
 * each chunk into a private total until get_task() returns an empty range
 * (start >= stop). The private total is then added to the shared sum with
 * sum_lock held.
 *
 * Looping until the tasks run out means every element is counted even when
 * there are more chunks than workers. A worker that takes only one chunk
 * would leave the remaining chunks unprocessed.
 */
void worker()
{
    int start,stop,i;
    int y = 0;

    for (;;) {
        get_task(&start,&stop);
        if (start >= stop)
            break;  /* no work left */
        for (i=start; i<stop;i++)
            y = y + X[i];
    }

    pthread_mutex_lock(&sum_lock);
    sum = sum + y;
    pthread_mutex_unlock(&sum_lock);
}
Parallel Divide & Conquerint pcount(int *arg)
{
int error,arg1[3],arg2[3];
int l,u,m;
int y,y1,y2;
pthread_t tid1,tid2;
l = arg[0];
u = arg[1];
if ( (u-l) <= cutoff)
y = count(l,u);
else
{
m = (l+u)/2;
arg1[0] = l;
arg1[1] = m;
Parallel Processing 47
error = pthread_create(&tid1,NULL,(void *(*)(void *))pcount,arg1);
/* y2 = count(m+1,u); */
arg2[0] = m+1;
arg2[1] = u;
error = pthread_create(&tid2,NULL,(void *(*)(void *))pcount,arg2);
error = pthread_join(tid1,NULL);
y1 = arg1[2];
error = pthread_join(tid2,NULL);
y2 = arg2[2];
y = y1 + y2;
}
/* thr_exit(&y); */
arg[2] = y;
}