Mpi Matrix

24
MPI Sample Program Parallel Matrix Multiplication Preeyakorn Tipwai RMUTL 1 st High Performance Computing Workshop Feb 2-3, 2009

description

MPI matrix multiplication

Transcript of Mpi Matrix

Page 1: Mpi Matrix

MPI Sample Program — Parallel Matrix Multiplication

Preeyakorn Tipwai, RMUTL 1st High Performance Computing Workshop, Feb 2–3, 2009

Page 2: Mpi Matrix

Matrix Multiplication

P1

P2

P3

P4

P0

P1

Pprocess_count-1

A B C

Page 3: Mpi Matrix

Implementation
1. Master (process 0) reads data
2. Master sends size of data to slaves
3. Slaves allocate memory
4. Master broadcasts second matrix to all other processes
5. Master sends respective parts of first matrix to all other processes
6. Every process performs its local multiplication
7. All slave processes send back their result.

Page 4: Mpi Matrix

Data partitioning

First, partition the data. Distribute the relevant pieces of data to each of the processors. The processors perform their operations on the data. Results are either sent to other processors or used for further operations.

Page 5: Mpi Matrix

Matrix Multiplication

Let $A$, $B$, and $C$ be $n \times n$ matrices with $C = A \cdot B$:

$$A = \begin{bmatrix}
A_{00} & A_{01} & \cdots & A_{0,n-1}\\
A_{10} & A_{11} & \cdots & A_{1,n-1}\\
\vdots & \vdots & \ddots & \vdots\\
A_{n-1,0} & A_{n-1,1} & \cdots & A_{n-1,n-1}
\end{bmatrix},\quad
B = \begin{bmatrix}
B_{00} & B_{01} & \cdots & B_{0,n-1}\\
B_{10} & B_{11} & \cdots & B_{1,n-1}\\
\vdots & \vdots & \ddots & \vdots\\
B_{n-1,0} & B_{n-1,1} & \cdots & B_{n-1,n-1}
\end{bmatrix},\quad
C = \begin{bmatrix}
C_{00} & C_{01} & \cdots & C_{0,n-1}\\
C_{10} & C_{11} & \cdots & C_{1,n-1}\\
\vdots & \vdots & \ddots & \vdots\\
C_{n-1,0} & C_{n-1,1} & \cdots & C_{n-1,n-1}
\end{bmatrix}$$

Each entry of $C$ is the dot product of a row of $A$ with a column of $B$:

$$C_{00} = A_{00} B_{00} + A_{01} B_{10} + A_{02} B_{20} + \cdots + A_{0,n-1} B_{n-1,0}$$

$$C_{01} = A_{00} B_{01} + A_{01} B_{11} + A_{02} B_{21} + \cdots + A_{0,n-1} B_{n-1,1}$$

$$\text{In general:}\qquad C_{ij} = \sum_{k=0}^{n-1} A_{ik}\, B_{kj}$$

- We have to multiply each row of A with all of B to produce the corresponding row of C.
- The number of columns in A must equal the number of rows in B.
- Let's try n×n matrices.

Page 6: Mpi Matrix

Matrix definition

#define TAG_MATRIX_PARTITION 0x4560   /* MPI message tag for the A/C row-block transfers */

/*
 * Row-major dense matrix.  `data` is a single contiguous m*n buffer;
 * `rows` holds m pointers into that buffer so entries can be read as
 * matrix.rows[i][j].  (In the slide transcription the `double **rows;`
 * declaration had been fused into the preceding comment; restored here.)
 */
typedef struct {
    unsigned int m, n;  // Rows, cols
    double *data;       // Data, ordered by row, then by col
    double **rows;      // Pointers to rows in data
} TMatrix;

The `rows` array holds one pointer per matrix row into the contiguous `data` buffer:
rows[0] → data, rows[1] → data + n, …, rows[i] → data + i·n,
so entry A_{ij} is accessed as rows[i][j].

Page 7: Mpi Matrix

Matrix operations

TMatrix createMatrix (const unsigned int rows, const unsigned int cols);TMatrix readMatrix (char filename[128]);void freeMatrix (TMatrix *matrix);int validMatrix (TMatrix matrix);TMatrix initMatrix (void);TMatrix matrixMultiply (TMatrix A, TMatrix B);void doMatrixMultiply (TMatrix A, TMatrix B, TMatrix C);void printMatrix (char name[128],TMatrix A);

Page 8: Mpi Matrix

createMatrix

/*
 * Allocate a rows x cols matrix: one contiguous data buffer plus an
 * array of per-row pointers into it.  On allocation failure the partial
 * allocation is released and an invalid (zeroed) matrix is returned;
 * callers must check the result with validMatrix().
 * Fixes vs. original: removed unused local `j`, removed the duplicated
 * matrix.m/matrix.n assignments, dropped the unnecessary malloc casts.
 */
TMatrix createMatrix(const unsigned int rows, const unsigned int cols)
{
    TMatrix matrix;
    unsigned long int m = rows, n = cols;  /* widen before m*n to reduce overflow risk */
    unsigned int i;

    matrix.m = rows;
    matrix.n = cols;
    matrix.data = malloc(sizeof(double) * m * n);
    matrix.rows = malloc(sizeof(double *) * m);

    if (validMatrix(matrix)) {
        /* Point each row at its slice of the contiguous buffer. */
        for (i = 0; i < rows; i++)
            matrix.rows[i] = matrix.data + (i * cols);
    } else {
        /* One of the mallocs failed: free the other and zero the struct. */
        freeMatrix(&matrix);
    }
    return matrix;
}

Page 9: Mpi Matrix

freeMatrix

/*
 * Release a matrix's buffers and reset it to the empty/invalid state
 * (m = n = 0, NULL pointers), so double-free is harmless.  A NULL
 * matrix pointer is ignored.
 * Fix vs. original: `free(NULL)` is a no-op, so the `if (ptr)` guards
 * before each free() were redundant and have been removed.
 */
void freeMatrix (TMatrix *matrix)
{
    if (matrix == NULL) return;
    free(matrix -> data);
    matrix -> data = NULL;
    free(matrix -> rows);
    matrix -> rows = NULL;
    matrix -> m = 0;
    matrix -> n = 0;
}

Page 10: Mpi Matrix

validMatrix

/* A matrix is usable iff both buffers are allocated and both
 * dimensions are nonzero.  Returns 1 if valid, 0 otherwise. */
int validMatrix (TMatrix matrix)
{
    return matrix.data != NULL
        && matrix.rows != NULL
        && matrix.m > 0
        && matrix.n > 0;
}

Page 11: Mpi Matrix

initMatrix

/*
 * Return an empty/invalid matrix with all fields zeroed.
 * Fix vs. original: the definition used an old-style empty parameter
 * list `()`; changed to `(void)` to match the prototype declared on the
 * "Matrix operations" slide.
 */
TMatrix initMatrix(void)
{
    TMatrix matrix;
    matrix.m = 0;
    matrix.n = 0;
    matrix.data = NULL;
    matrix.rows = NULL;
    return matrix;
}

Page 12: Mpi Matrix

matrixMultiply

/*
 * Compute and return the product A * B in a freshly allocated matrix.
 * Returns an invalid (empty) matrix when either input is invalid, the
 * inner dimensions disagree (A.n != B.m), or allocation fails; check
 * the result with validMatrix().  Caller owns the returned matrix.
 */
TMatrix matrixMultiply(TMatrix A, TMatrix B)
{
    TMatrix product = initMatrix();
    int compatible = validMatrix(A) && validMatrix(B) && (A.n == B.m);

    if (compatible) {
        product = createMatrix(A.m, B.n);
        if (validMatrix(product))
            doMatrixMultiply(A, B, product);
    }
    return product;
}

Page 13: Mpi Matrix

doMatrixMultiply

/*
 * Dense triple-loop product: C[i][j] = sum over k of A[i][k] * B[k][j].
 * C must already be allocated with A.m rows and B.n columns; A.n is
 * taken as the shared inner dimension.
 */
void doMatrixMultiply(TMatrix A, TMatrix B, TMatrix C)
{
    unsigned int row, col, k;

    for (row = 0; row < A.m; row++) {
        const double *a = A.rows[row];   /* current row of A */
        double *c = C.rows[row];         /* matching row of C */
        for (col = 0; col < B.n; col++) {
            double acc = 0;
            for (k = 0; k < A.n; k++)
                acc += a[k] * B.rows[k][col];
            c[col] = acc;
        }
    }
}

Page 14: Mpi Matrix

printMatrix

/*
 * Print "name:" followed by the matrix entries, %7.3f per cell, one
 * matrix row per output line.  An invalid matrix prints the name only.
 */
void printMatrix(char name[128], TMatrix A)
{
    unsigned int r, c;

    printf("%s:\n", name);
    if (!validMatrix(A))
        return;

    for (r = 0; r < A.m; r++) {
        for (c = 0; c < A.n; c++)
            printf ("%7.3f ", A.rows[r][c]);
        printf ("\n");
    }
}

Page 15: Mpi Matrix

/*
 * Read a matrix from a whitespace-separated text file: first "m n",
 * then m*n values in row-major order.  Allocates *A with createMatrix.
 * Returns 1 on success, 0 on any error; on failure a partially-created
 * matrix is released so the caller does not leak.
 * NOTE(review): the prototype list on the "Matrix operations" slide
 * shows `TMatrix readMatrix(char filename[128])`; this definition is
 * the one main() actually calls, so its signature is kept.
 * Fixes vs. original: %u instead of %d for the unsigned dimensions
 * (mismatched conversion specifier is undefined behavior), values read
 * directly into a double via %lf instead of round-tripping through
 * float, and the half-read matrix is freed on failure.
 */
int readMatrix(char *filename, TMatrix *A)
{
    FILE *fp;
    unsigned int m, n, i, j;
    double d;
    int result = 0;
    int created = 0;   /* did we allocate *A? (only then may we free it) */

    if ((fp = fopen (filename, "r")) == NULL) return 0;
    do {
        if (fscanf (fp, "%u%u", &m, &n) != 2) break;
        if ((m == 0) || (n == 0)) break;
        *A = createMatrix(m, n);
        created = 1;
        if (!validMatrix(*A)) break;
        for (i = 0; i < m; i++) {
            for (j = 0; j < n; j++) {
                if (fscanf (fp, "%lf", &d) != 1) break;
                A -> rows[i][j] = d;
            }
            if (j != n) break;   /* inner read failed: abort */
        }
        if (i != m) break;
        result = 1;
    } while (0);

    if (!result && created)
        freeMatrix(A);           /* don't leak a half-read matrix */
    fclose (fp);
    return result;
}

Page 16: Mpi Matrix

Write your main function

/*
 * Workshop skeleton main: sets up MPI, then frees the matrices and
 * shuts MPI down.  The per-rank multiplication code from the following
 * slides is meant to be inserted between MPI_Comm_rank and the cleanup.
 */
int main (int argc, char *argv[])
{
    int processor_rank  = 0;
    int processor_count = 1;
    MPI_Status status;
    TMatrix A, B, C, D;
    unsigned int m, n = 4, i, j, offset;  /* scratch variables for the exercise */
    double time0, time1;

    A = initMatrix();
    B = initMatrix();
    C = initMatrix();
    D = initMatrix();

    MPI_Init(&argc, &argv);
    MPI_Comm_size (MPI_COMM_WORLD, &processor_count);
    MPI_Comm_rank (MPI_COMM_WORLD, &processor_rank );

    // Free matrix data
    freeMatrix(&A);
    freeMatrix(&B);
    freeMatrix(&C);

    // Wait for everyone to stop
    MPI_Barrier(MPI_COMM_WORLD);

    // Always use MPI_Finalize as the last instruction of the program
    MPI_Finalize();
    return 0;
}

WORK HERE — insert the rank-0 and slave code from pages 20–22 at this point in main().

Page 17: Mpi Matrix

Processes

Rank = 0 others

Time stamp. Read the matrices A, B. Allocate memory for matrix C. Broadcast (send) size of matrix. Broadcast (send) matrix B. Split A into parts. Send each process a part of A. Multiply the first part here, result in C. Receive the other parts of C. Time stamp.

Broadcast (receive) size of matrix. Allocate memory for matrices. Broadcast (receive) matrix B. Receive a part of A. Multiply their part of the matrix, result in C. Send the result back to rank 0.

Page 18: Mpi Matrix

partitioning

P0

P1

Pprocessor_count-1

n rows

number of rows for each process (m) = n / processor_count

n columns

start row for each process i = m * i

amount of data for each process = m * n

Page 19: Mpi Matrix

For example: 8×8 matrices, 4 processors

P0

P1

P3

8 rows

m= 8 / 4 = 2

8 columns

start row for each process i = 2 * i

amount of data for each process = 2 * 8

Page 20: Mpi Matrix

when rank = 0if (processor_rank == 0){ time0 = MPI_Wtime();

readMatrix(argv[1], &A);readMatrix(argv[2], &B);n = A.n;m = n / processor_count;C = createMatrix(n,n);

// Broadcast (send) size of matrixMPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

// Broadcast (send) B matrixMPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

// Send each process it's own part of Afor (i = 1; i < processor_count; i++)

MPI_Send((void *)A.rows[m*i], m*n, MPI_DOUBLE,i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD);

...

}

Page 21: Mpi Matrix

when rank = 0if (processor_rank == 0){

...

// Multiply own part of matrix A with B into already existing matrix CA.m = m;doMatrixMultiply(A,B,C);

A.m = n;

// Receive part of C matrix from each processfor (i = 1; i < processor_count; i++)

MPI_Recv((void *)C.rows[m*i], m*n, MPI_DOUBLE,i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD, &status);

// Record finish timetime1 = MPI_Wtime();printMatrix("A",A);printMatrix("B",B);printMatrix("C",C);// Print time statisticsprintf ("Total time using [%2d] processors : [%f] seconds\n",

processor_count, time1 - time0);}

Page 22: Mpi Matrix

Other ranks

// other ranks (slaves) — receive B and a slice of A, multiply, send C back
else
{
    // Broadcast (receive) size of matrix
    /* Must match the master's MPI_Bcast exactly (same count and type). */
    MPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // Allocate memory for matrices
    m = n / processor_count;       /* rows handled by this rank */
    A = createMatrix(m, n);
    B = createMatrix(n, n);

    // Broadcast (receive) B matrix
    MPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Receive this rank's m-row slice of A from the master
    MPI_Recv((void *)A.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD, &status);

    // Multiply local matrices (allocates a new m x n matrix C)
    C = matrixMultiply(A, B);

    // Send back result
    MPI_Send((void *)C.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD);
}

Page 23: Mpi Matrix

Multiplication 1000 x 1000

1000 x 1000 Matrix multiplication

0

20

40

60

80

100

120

140

0 10 20 30 40 50 60

Processors

Time (s)

Tp T1 / p

Page 24: Mpi Matrix

Multiplication 5000 x 5000

5000 x 5000 Matrix multiplication

0

10000

20000

30000

40000

50000

60000

70000

80000

90000

0 5 10 15 20 25 30 35

Processors

Time (s)

Tp T1 / p