Mpi Matrix
-
Upload
alishahabipoor -
Category
Documents
-
view
231 -
download
1
description
Transcript of Mpi Matrix
MPI Sample Program: Parallel Matrix Multiplication
Preeyakorn Tipwai
RMUTL 1st High Performance Computing Workshop, Feb 2-3, 2009
Matrix Multiplication
P1
P2
P3
P4
P0
P1
…
Pprocess_count-1
A B C
Implementation
1. Master (process 0) reads data
2. Master sends size of data to slaves
3. Slaves allocate memory
4. Master broadcasts second matrix to all other processes
5. Master sends respective parts of first matrix to all other processes
6. Every process performs its local multiplication
7. All slave processes send back their result.
Data partitioning
- First, partition the data.
- Distribute the relevant pieces of data to each of the processors.
- Processors perform their operations on the data.
- Results are either sent to other processors or used for further operations.
Matrix Multiplication
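$$
\begin{bmatrix}
A_{00} & A_{01} & \cdots & A_{0n} \\
A_{10} & A_{11} & \cdots & A_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
A_{n0} & A_{n1} & \cdots & A_{nn}
\end{bmatrix}
\cdot
\begin{bmatrix}
B_{00} & B_{01} & \cdots & B_{0n} \\
B_{10} & B_{11} & \cdots & B_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
B_{n0} & B_{n1} & \cdots & B_{nn}
\end{bmatrix}
=
\begin{bmatrix}
C_{00} & C_{01} & \cdots & C_{0n} \\
C_{10} & C_{11} & \cdots & C_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
C_{n0} & C_{n1} & \cdots & C_{nn}
\end{bmatrix}
$$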
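$$C_{00} = A_{00} B_{00} + A_{01} B_{10} + A_{02} B_{20} + A_{03} B_{30} + \dots + A_{0n} B_{n0}$$
$$C_{01} = A_{00} B_{01} + A_{01} B_{11} + A_{02} B_{21} + A_{03} B_{31} + \dots + A_{0n} B_{n1}$$
$$C_{nn} = A_{n0} B_{0n} + A_{n1} B_{1n} + A_{n2} B_{2n} + A_{n3} B_{3n} + \dots + A_{nn} B_{nn}$$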
- We have to multiply each row of A with all of B to produce the same row of C.
- The number of columns in A must equal the number of rows in B.
- Let's try n*n matrices.
Matrix definition
/* Message tag used for the point-to-point transfers of matrix row blocks. */
#define TAG_MATRIX_PARTITION 0x4560

/*
 * Dense matrix of doubles.
 *
 * All elements live in one contiguous buffer (`data`); `rows` is a separate
 * array of pointers into that buffer so elements can be addressed as
 * rows[i][j]. Because the element storage is contiguous, a block of
 * consecutive rows can be handed to MPI as a single buffer.
 */
typedef struct {
    unsigned int m, n;  // Rows, cols
    double *data;       // Data, ordered by row, then by col
    double **rows;      // Pointers to rows in data
} TMatrix;
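$$
\begin{bmatrix}
A_{00} & A_{01} & \cdots & A_{0n} \\
A_{10} & A_{11} & \cdots & A_{1n} \\
\vdots & \vdots & \ddots & \vdots \\
A_{n0} & A_{n1} & \cdots & A_{nn}
\end{bmatrix}
\quad
\begin{matrix}
\leftarrow \texttt{rows[0]} \\
\leftarrow \texttt{rows[1]} \\
\vdots \\
\leftarrow \texttt{rows[n]}
\end{matrix}
$$
`data` points to the first element ($A_{00}$) of the contiguous element buffer; each `rows[i]` points to the start of row i inside `data`.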
Matrix operations
/* Allocate a rows x cols matrix; the result fails validMatrix() on error. */
TMatrix createMatrix(const unsigned int rows, const unsigned int cols);
/* Read a matrix from a text file into *A; returns 1 on success, 0 on failure.
   (Signature matches the definition of readMatrix.) */
int readMatrix(char *filename, TMatrix *A);
/* Release the buffers owned by *matrix and reset it to the empty state. */
void freeMatrix(TMatrix *matrix);
/* Returns 1 if the matrix has both buffers and non-zero dimensions, else 0. */
int validMatrix(TMatrix matrix);
/* Returns an empty matrix (no buffers, 0 x 0). */
TMatrix initMatrix(void);
/* Returns a newly allocated A * B, or an empty matrix if that is impossible. */
TMatrix matrixMultiply(TMatrix A, TMatrix B);
/* Computes A * B into the already allocated matrix C. */
void doMatrixMultiply(TMatrix A, TMatrix B, TMatrix C);
/* Prints `name` followed by the elements of A, one matrix row per line. */
void printMatrix(char name[128], TMatrix A);
createMatrix
/*
 * Allocate a rows x cols matrix.
 *
 * The element buffer is a single contiguous allocation and rows[i] is set to
 * point at the start of row i inside it. Element values are NOT initialized.
 *
 * Returns the new matrix. If either dimension is zero, the requested size
 * does not fit in size_t, or an allocation fails, any partial allocation is
 * released and the returned matrix fails validMatrix() (NULL buffers,
 * m == n == 0). The caller owns the result and releases it with freeMatrix().
 */
TMatrix createMatrix(const unsigned int rows, const unsigned int cols)
{
    TMatrix matrix;
    unsigned int i;

    matrix.m = rows;
    matrix.n = cols;
    matrix.data = NULL;
    matrix.rows = NULL;

    /* Only allocate when rows * cols * sizeof(double) cannot wrap around. */
    if (rows != 0 && cols != 0 &&
        (size_t)rows <= ((size_t)-1) / sizeof(double) / cols &&
        (size_t)rows <= ((size_t)-1) / sizeof(double *)) {
        matrix.data = malloc(sizeof(double) * rows * cols);
        matrix.rows = malloc(sizeof(double *) * rows);
    }

    if (validMatrix(matrix)) {
        for (i = 0; i < rows; i++) {
            /* Widen before multiplying: i * cols can exceed UINT_MAX. */
            matrix.rows[i] = matrix.data + (size_t)i * cols;
        }
    } else {
        /* Also covers the case where only one of the two mallocs succeeded. */
        freeMatrix(&matrix);
    }
    return matrix;
}
freeMatrix
/*
 * Release the buffers owned by *matrix and reset it to the empty state
 * (NULL buffers, 0 x 0). A NULL `matrix` pointer is ignored, and calling
 * this on an already-empty matrix is harmless.
 */
void freeMatrix (TMatrix *matrix)
{
    if (matrix != NULL) {
        /* free(NULL) is a no-op, so the buffers need no individual guards. */
        free(matrix->data);
        free(matrix->rows);
        matrix->data = NULL;
        matrix->rows = NULL;
        matrix->m = 0;
        matrix->n = 0;
    }
}
validMatrix
/*
 * Returns 1 when the matrix is usable: both buffers are non-NULL and both
 * dimensions are non-zero. Returns 0 otherwise.
 */
int validMatrix (TMatrix matrix)
{
    return matrix.data != NULL
        && matrix.rows != NULL
        && matrix.m != 0
        && matrix.n != 0;
}
initMatrix
TMatrix initMatrix(){
TMatrix matrix;matrix.m = 0;matrix.n = 0;matrix.data = NULL;matrix.rows = NULL;return matrix;
}
matrixMultiply
/*
 * Returns a newly allocated matrix holding A * B.
 *
 * If A or B is not valid, if their inner dimensions differ (A.n != B.m), or
 * if the result cannot be allocated, the returned matrix fails validMatrix().
 * The caller releases the result with freeMatrix().
 */
TMatrix matrixMultiply(TMatrix A, TMatrix B)
{
    TMatrix product = initMatrix();

    /* Guard: nothing to compute unless both operands are usable and conform. */
    if (!validMatrix(A) || !validMatrix(B) || A.n != B.m)
        return product;

    product = createMatrix(A.m, B.n);
    if (validMatrix(product))
        doMatrixMultiply(A, B, product);

    return product;
}
doMatrixMultiply
/*
 * Computes C = A * B into the already allocated matrix C.
 *
 * No validation is done here: the caller must ensure that B has at least A.n
 * rows and that C has at least A.m rows of at least B.n columns. Only the
 * first A.m rows of C are written, so a caller can multiply just a leading
 * block of rows by passing a copy of A with a smaller `m`.
 */
void doMatrixMultiply(TMatrix A, TMatrix B, TMatrix C)
{
    unsigned int i, j, k;
    double sum;

    for (i = 0; i < A.m; i++) {         // Rows
        for (j = 0; j < B.n; j++) {     // Cols
            sum = 0;
            for (k = 0; k < A.n; k++)
                sum += A.rows[i][k] * B.rows[k][j];
            C.rows[i][j] = sum;
        }
    }
}
printMatrix
/*
 * Prints "<name>:" on its own line, then, if A is valid, every element of A
 * formatted as %7.3f with one matrix row per output line. For an invalid
 * matrix only the name line is printed.
 */
void printMatrix(char name[128], TMatrix A)
{
    unsigned int row, col;

    printf("%s:\n", name);
    if (!validMatrix(A))
        return;

    for (row = 0; row < A.m; row++) {
        for (col = 0; col < A.n; col++)
            printf ("%7.3f ", A.rows[row][col]);
        printf ("\n");
    }
}
/*
 * readMatrix: read a matrix from a text file.
 *
 * Expected file layout: two unsigned integers (rows, then cols) followed by
 * rows * cols floating-point values in row-major order, all separated by
 * whitespace.
 *
 * filename  path of the file to read
 * A         receives the newly created matrix
 *
 * Returns 1 on success and 0 on any failure (NULL argument, file cannot be
 * opened, malformed header, zero dimension, allocation failure, or too few
 * values). On a failure that happens after allocation, *A still holds the
 * partially filled matrix; the caller releases it with freeMatrix().
 */
int readMatrix(char *filename, TMatrix *A)
{
    FILE *fp;
    unsigned int m, n, i, j;
    double d;   /* parsed as double so no precision is lost through float */
    int result = 0;

    /* argv[1] is NULL when the program is started without arguments. */
    if (filename == NULL || A == NULL) return 0;
    if ((fp = fopen(filename, "r")) == NULL) return 0;

    do {
        /* %u matches the unsigned int destinations. */
        if (fscanf(fp, "%u%u", &m, &n) != 2) break;
        if ((m == 0) || (n == 0)) break;
        *A = createMatrix(m, n);
        if (!validMatrix(*A)) break;
        for (i = 0; i < m; i++) {
            for (j = 0; j < n; j++) {
                if (fscanf(fp, "%lf", &d) != 1) break;
                A->rows[i][j] = d;
            }
            if (j != n) break;  /* inner loop stopped early: row incomplete */
        }
        if (i != m) break;      /* outer loop stopped early: matrix incomplete */
        result = 1;
    } while (0);

    fclose(fp);
    return result;
}
Write your main function
/*
 * Program skeleton: initializes MPI, queries this process's rank and the
 * number of processes, and shuts everything down again. The rank-dependent
 * work (master vs. other ranks) belongs at the marked position.
 */
int main (int argc, char *argv[])
{
    int processor_rank = 0;
    int processor_count = 1;
    MPI_Status status;
    TMatrix A, B, C, D;
    unsigned int m, n = 4, i, j, offset;
    double time0, time1;

    /* Start from empty matrices so the freeMatrix() calls below are always safe. */
    A = initMatrix();
    B = initMatrix();
    C = initMatrix();
    D = initMatrix();

    MPI_Init(&argc, &argv);
    MPI_Comm_size (MPI_COMM_WORLD, &processor_count);
    MPI_Comm_rank (MPI_COMM_WORLD, &processor_rank );

    /* WORK HERE: the rank 0 / other-ranks code goes at this point. Some of
       the locals declared above are only used by that code. */

    // Free matrix data
    freeMatrix(&A);
    freeMatrix(&B);
    freeMatrix(&C);
    freeMatrix(&D);

    // Wait for everyone to stop
    MPI_Barrier(MPI_COMM_WORLD);

    // Always use MPI_Finalize as the last instruction of the program
    MPI_Finalize();
    return 0;
}
WORK HERE
Processes
Rank = 0 others
Rank 0:
1. Time stamp
2. Read the matrices A, B
3. Allocate memory for matrix C
4. Broadcast (send) size of matrix
5. Broadcast (send) matrix B
6. Split A into parts
7. Send each process a part of A
8. Multiply first part here, result in C
9. Receive other parts of C
10. Time stamp

Other ranks:
1. Broadcast (receive) size of matrix
2. Allocate memory for matrices
3. Broadcast (receive) matrix B
4. Receive a part of A
5. Multiply their part of matrix, result in C
6. Send the result back to 0
partitioning
P0
P1
…
Pprocessor_count-1
n rows
number of rows for each process (m) = n / processor_count
n columns
start row for each process i = m * i
amount of data for each process = m * n
For example: 8*8 matrices, 4 processors
P0
P1
…
P3
8rows
m= 8 / 4 = 2
8 columns
start row for each process i = 2 * i
amount of data for each process = 2 * 8
when rank = 0if (processor_rank == 0){ time0 = MPI_Wtime();
readMatrix(argv[1], &A);readMatrix(argv[2], &B);n = A.n;m = n / processor_count;C = createMatrix(n,n);
// Broadcast (send) size of matrixMPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
// Broadcast (send) B matrixMPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Send each process it's own part of Afor (i = 1; i < processor_count; i++)
MPI_Send((void *)A.rows[m*i], m*n, MPI_DOUBLE,i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD);
...
}
when rank = 0if (processor_rank == 0){
...
// Multiply own part of matrix A with B into already existing matrix CA.m = m;doMatrixMultiply(A,B,C);
A.m = n;
// Receive part of C matrix from each processfor (i = 1; i < processor_count; i++)
MPI_Recv((void *)C.rows[m*i], m*n, MPI_DOUBLE,i, TAG_MATRIX_PARTITION, MPI_COMM_WORLD, &status);
// Record finish timetime1 = MPI_Wtime();printMatrix("A",A);printMatrix("B",B);printMatrix("C",C);// Print time statisticsprintf ("Total time using [%2d] processors : [%f] seconds\n",
processor_count, time1 - time0);}
Other ranks
else
{
    /* Every rank other than 0: receive the inputs, multiply the local row
       block, and send the resulting rows back to rank 0. */

    // Broadcast (receive) size of matrix
    /* NOTE(review): n is an unsigned int, so MPI_UNSIGNED would be the
       exactly matching datatype; it must be the same on every rank. */
    MPI_Bcast((void *)&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // Allocate memory for matrices
    m = n / processor_count;    /* rows handled by this process */
    A = createMatrix(m, n);
    B = createMatrix(n ,n);

    /* Receiving into a failed allocation would write through NULL. A is
       legitimately empty when m == 0 (more processes than rows). */
    if (!validMatrix(B) || (m != 0 && !validMatrix(A)))
        MPI_Abort(MPI_COMM_WORLD, 1);

    // Broadcast (receive) B matrix
    MPI_Bcast((void *)B.data, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Receive this process's part of A
    MPI_Recv((void *)A.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD, &status);

    // Multiply local matrices
    C = matrixMultiply(A,B);

    // Send back result
    MPI_Send((void *)C.data, m*n, MPI_DOUBLE, 0, TAG_MATRIX_PARTITION,
             MPI_COMM_WORLD);
}
Multiplication 1000 x 1000
1000 x 1000 Matrix multiplication
0
20
40
60
80
100
120
140
0 10 20 30 40 50 60
Processors
Tim
e (s
)
Tp T1 / p
Multiplication 5000 x 5000
5000 x 5000 Matrix multiplication
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
0 5 10 15 20 25 30 35
Processors
Tim
e (s
)
Tp T1 / p