Mpi Matrix

MPI Sample Program: Parallel Matrix Multiplication

Preeyakorn Tipwai

RMUTL 1st High Performance Computing Workshop, Feb 2-3, 2009

Matrix Multiplication

P1

P2

P3

P4

P0

P1

…

Pprocess_count-1

A B C

Implementation

1. Master (process 0) reads data.
2. Master sends size of data to slaves.
3. Slaves allocate memory.
4. Master broadcasts second matrix to all other processes.
5. Master sends respective parts of first matrix to all other processes.
6. Every process performs its local multiplication.
7. All slave processes send back their result.

Data partitioning

First, partition the data. Distribute the relevant pieces of data to each of the processors. The processors perform their operations on the data. Results are either sent to other processors or used for further operations.

Matrix Multiplication

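$$
\begin{bmatrix}
A_{00} & A_{01} & \cdots & A_{0n} \\
A_{10} & A_{11} & \cdots & A_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
A_{n0} & A_{n1} & \cdots & A_{nn}
\end{bmatrix}
\cdot
\begin{bmatrix}
B_{00} & B_{01} & \cdots & B_{0n} \\
B_{10} & B_{11} & \cdots & B_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
B_{n0} & B_{n1} & \cdots & B_{nn}
\end{bmatrix}
=
\begin{bmatrix}
C_{00} & C_{01} & \cdots & C_{0n} \\
C_{10} & C_{11} & \cdots & C_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
C_{n0} & C_{n1} & \cdots & C_{nn}
\end{bmatrix}
$$

$$C_{00} = A_{00} B_{00} + A_{01} B_{10} + A_{02} B_{20} + A_{03} B_{30} + \dots + A_{0n} B_{n0}$$

$$C_{01} = A_{00} B_{01} + A_{01} B_{11} + A_{02} B_{21} + A_{03} B_{31} + \dots + A_{0n} B_{n1}$$

$$C_{0n} = A_{00} B_{0n} + A_{01} B_{1n} + A_{02} B_{2n} + A_{03} B_{3n} + \dots + A_{0n} B_{nn}$$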

- We have to multiply each row of A with all of B to obtain the same row of C.
- The number of columns in A must equal the number of rows in B.
- Let's try n*n matrices.

Matrix definition

#define TAG_MATRIX_PARTITION 0x4560

/*
 * TMatrix: an m x n matrix of doubles.
 *
 * The elements live in `data`; `rows` holds one pointer per row into `data`
 * so that an element can be addressed as rows[i][j].
 */
typedef struct
{
    unsigned int m, n; // Rows, cols
    double *data;      // Data, ordered by row, then by col
    double **rows;     // Pointers to rows in data
} TMatrix;

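$$
A =
\begin{bmatrix}
A_{00} & A_{01} & \cdots & A_{0n} \\
A_{10} & A_{11} & \cdots & A_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
A_{n0} & A_{n1} & \cdots & A_{nn}
\end{bmatrix}
\begin{matrix}
\leftarrow \texttt{rows[0]} \\
\leftarrow \texttt{rows[1]} \\
\vdots \\
\leftarrow \texttt{rows[n]}
\end{matrix}
$$

data: all elements stored contiguously, row after row; each rows[i] points to the first element of row i inside data.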

Matrix operations

/* Prototypes of the matrix helper functions defined below. */
TMatrix createMatrix (const unsigned int rows, const unsigned int cols);
/* Returns 1 on success, 0 on failure; the matrix is stored through A. */
int readMatrix (char *filename, TMatrix *A);
void freeMatrix (TMatrix *matrix);
int validMatrix (TMatrix matrix);
TMatrix initMatrix (void);
TMatrix matrixMultiply (TMatrix A, TMatrix B);
void doMatrixMultiply (TMatrix A, TMatrix B, TMatrix C);
void printMatrix (char name[128], TMatrix A);

createMatrix

/*
 * createMatrix: allocate a rows x cols matrix.
 *
 * Allocates the element storage and the row-pointer table, then points
 * rows[i] at the start of row i inside the element storage. The element
 * values are not initialised.
 *
 * Returns a matrix for which validMatrix() is true on success. If either
 * dimension is zero, the size does not fit in size_t, or an allocation
 * fails, everything is released via freeMatrix() and the returned matrix
 * has m == n == 0 and NULL pointers.
 */
TMatrix createMatrix(const unsigned int rows, const unsigned int cols)
{
    TMatrix matrix;
    unsigned int i;

    matrix.m = rows;
    matrix.n = cols;
    matrix.data = NULL;
    matrix.rows = NULL;

    /* Only allocate when rows * cols * sizeof(double) cannot overflow size_t. */
    if (rows != 0 && cols != 0
        && (size_t)cols <= ((size_t)-1) / sizeof(double) / rows)
    {
        matrix.data = malloc(sizeof(double) * rows * cols);
        matrix.rows = malloc(sizeof(double *) * rows);
    }

    if (validMatrix(matrix))
    {
        for (i = 0; i < rows; i++)
        {
            /* Compute the offset in size_t: i * cols can exceed UINT_MAX. */
            matrix.rows[i] = matrix.data + (size_t)i * cols;
        }
    }
    else
    {
        freeMatrix(&matrix);
    }
    return matrix;
}

freeMatrix

/*
 * freeMatrix: release the storage owned by *matrix and reset it to the
 * empty state (NULL pointers, zero dimensions).
 *
 * A NULL `matrix` is ignored.
 */
void freeMatrix (TMatrix *matrix)
{
    if (matrix != NULL)
    {
        /* free(NULL) is a no-op, so unallocated members need no guard. */
        free(matrix->data);
        matrix->data = NULL;

        free(matrix->rows);
        matrix->rows = NULL;

        matrix->m = 0;
        matrix->n = 0;
    }
}

validMatrix

/*
 * validMatrix: return 1 when the matrix has both storage pointers set and
 * non-zero dimensions, 0 otherwise.
 */
int validMatrix (TMatrix matrix)
{
    int has_storage = (matrix.data != NULL) && (matrix.rows != NULL);
    int has_extent = (matrix.m != 0) && (matrix.n != 0);

    return has_storage && has_extent;
}

initMatrix

TMatrix initMatrix(){

TMatrix matrix;matrix.m = 0;matrix.n = 0;matrix.data = NULL;matrix.rows = NULL;return matrix;

}

matrixMultiply

/*
 * matrixMultiply: allocate and return the product A * B.
 *
 * Returns an empty matrix (as produced by initMatrix) when either operand
 * is invalid or when the number of columns of A differs from the number of
 * rows of B. If allocating the result fails, the invalid matrix returned by
 * createMatrix is passed through unchanged.
 */
TMatrix matrixMultiply(TMatrix A, TMatrix B)
{
    TMatrix product = initMatrix();

    if (!validMatrix(A) || !validMatrix(B))
        return product;
    if (A.n != B.m)
        return product;

    product = createMatrix(A.m, B.n);
    if (validMatrix(product))
        doMatrixMultiply(A, B, product);

    return product;
}

doMatrixMultiply

/*
 * doMatrixMultiply: compute C = A * B into an already allocated C.
 *
 * For every row i of A and every column j of B, stores the dot product of
 * that row and column in C.rows[i][j]. No validation is done here: C must
 * provide at least A.m rows of B.n columns, and B at least A.n rows.
 */
void doMatrixMultiply(TMatrix A, TMatrix B, TMatrix C)
{
    unsigned int i, j, k;
    double sum;

    for (i = 0; i < A.m; i++) // Rows
    {
        for (j = 0; j < B.n; j++) // Cols
        {
            sum = 0;
            for (k = 0; k < A.n; k++)
                sum += A.rows[i][k] * B.rows[k][j];
            C.rows[i][j] = sum;
        }
    }
}

printMatrix

/*
 * printMatrix: print "<name>:" followed by the matrix, one row per line,
 * each element formatted as %7.3f. Only the name line is printed when the
 * matrix is not valid.
 */
void printMatrix(char name[128], TMatrix A)
{
    unsigned int row, col;

    printf("%s:\n", name);
    if (!validMatrix(A))
        return;

    for (row = 0; row < A.m; row++)
    {
        for (col = 0; col < A.n; col++)
            printf ("%7.3f ", A.rows[row][col]);
        printf ("\n");
    }
}

/*
 * readMatrix: read a matrix from a text file.
 *
 * File format: two unsigned integers (rows, cols) followed by rows * cols
 * floating-point values in row order, all whitespace separated.
 *
 * On success the new matrix is stored in *A and 1 is returned. Any previous
 * contents of *A are overwritten without being freed, so pass an empty
 * matrix. On failure (NULL argument, file cannot be opened, bad header,
 * zero dimension, allocation failure, or too few values) 0 is returned; a
 * matrix that was only partially read is released so *A never looks valid
 * while holding uninitialised elements.
 */
int readMatrix(char *filename, TMatrix *A)
{
    FILE *fp;
    unsigned int m, n, i, j;
    double d;
    int result = 0;

    if ((filename == NULL) || (A == NULL)) return 0;
    if ((fp = fopen (filename, "r")) == NULL) return 0;
    do
    {
        /* m and n are unsigned, so the conversion must be %u, not %d. */
        if (fscanf (fp, "%u%u", &m, &n) != 2) break;
        if ((m == 0) || (n == 0)) break;
        *A = createMatrix(m,n);
        if (!validMatrix(*A)) break;
        for (i = 0; i < m; i ++)
        {
            for (j = 0; j < n; j ++)
            {
                /* Read straight into a double to keep full precision. */
                if (fscanf (fp, "%lf", &d) != 1) break;
                A -> rows[i][j] = d;
            }
            if (j != n) break; /* inner loop stopped early: short row */
        }
        if (i != m)
        {
            freeMatrix(A); /* do not hand back a half-filled matrix */
            break;
        }
        result = 1;
    } while (0);
    fclose (fp);
    return result;
}

Write your main function

/*
 * main: skeleton of the parallel matrix multiplication program.
 *
 * Initialises MPI, queries the number of processes and this process's rank,
 * and after the rank-specific work releases the matrices, waits at a barrier
 * and finalises MPI. The rank-specific work itself is left to be filled in.
 */
int main (int argc, char *argv[])
{
    int processor_rank = 0;
    int processor_count = 1;
    MPI_Status status;
    TMatrix A,B,C,D;
    unsigned int m, n= 4, i, j, offset;
    double time0, time1;

    A = initMatrix(); B = initMatrix(); C = initMatrix(); D = initMatrix();
    MPI_Init(&argc, &argv);
    MPI_Comm_size (MPI_COMM_WORLD, &processor_count);
    MPI_Comm_rank (MPI_COMM_WORLD, &processor_rank );

    // Rank-specific work (distribute, multiply, collect) goes here

    // Free matrix data
    freeMatrix(&A); freeMatrix(&B); freeMatrix(&C); freeMatrix(&D);
    // Wait for everyone to stop
    MPI_Barrier(MPI_COMM_WORLD);

    // Always use MPI_Finalize as the last instruction of the program
    MPI_Finalize();
    return 0;
}

WORK HERE

Processes

Rank = 0 others

- Time stamp
- Read the matrices A, B
- Allocate memory for matrix C
- Broadcast (send) size of matrix
- Broadcast (send) matrix B
- Split A into parts
- Send each process a part of A
- Multiply first part here, result in C
- Receive other parts of C
- Time stamp

- Broadcast (receive) size of matrix
- Allocate memory for matrices
- Broadcast (receive) matrix B
- Receive a part of A
- Multiply their part of the matrix, result in C
- Send the result back to 0

partitioning

P0

P1

…

Pprocessor_count-1

n rows

number of rows for each process (m) = n / processor_count

n columns

start row for each process i = m * i

amount of data for each process = m * n

For example: 8*8 matrices, 4 processors

P0

P1

…

P3

8rows

m= 8 / 4 = 2

8 columns

start row for each process i = 2 * i

amount of data for each process = 2 * 8

when rank = 0if (processor_rank == 0){ time0 = MPI_Wtime();

readMatrix(argv[1], &A);readMatrix(argv[2], &B);n = A.n;m = n / processor_count;C = createMatrix(n,n);

// Broadcast (send) size of matrixMPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

// Broadcast (send) B matrixMPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

// Send each process it's own part of Afor (i = 1; i < processor_count; i++)

MPI_Send((void *)A.rows[m*i], m*n, MPI_DOUBLE,i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD);

...

}

when rank = 0if (processor_rank == 0){

...

// Multiply own part of matrix A with B into already existing matrix CA.m = m;doMatrixMultiply(A,B,C);

A.m = n;

// Receive part of C matrix from each processfor (i = 1; i < processor_count; i++)

MPI_Recv((void *)C.rows[m*i], m*n, MPI_DOUBLE,i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD, &status);

// Record finish timetime1 = MPI_Wtime();printMatrix("A",A);printMatrix("B",B);printMatrix("C",C);// Print time statisticsprintf ("Total time using [%2d] processors : [%f] seconds\n",

processor_count, time1 - time0);}

Other ranks

else
{
    // Broadcast (receive) size of matrix
    MPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // Allocate memory for matrices: A holds only this process's m rows
    m = n / processor_count;
    A = createMatrix(m, n);
    B = createMatrix(n ,n);

    // Broadcast (receive) B matrix
    MPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Receive this process's part of A
    MPI_Recv((void *)A.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD, &status);

    // Multiply local matrices
    C = matrixMultiply(A,B);

    // Send back result (C is m x n)
    MPI_Send((void *)C.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD);

}

Multiplication 1000 x 1000

1000 x 1000 Matrix multiplication

0

20

40

60

80

100

120

140

0 10 20 30 40 50 60

Processors

Time (s)

Tp T1 / p

Multiplication 5000 x 5000

5000 x 5000 Matrix multiplication

0

10000

20000

30000

40000

50000

60000

70000

80000

90000

0 5 10 15 20 25 30 35

Processors

Time (s)

Tp T1 / p

Mpi Matrix

Documents

Transcript of Mpi Matrix