#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define GLOBALUB 3200
#define LB(p,P,n) ((p)*(n)/(P))
#define UB(p,P,n) (LB((p+1),P,n)-1)

// Allocate a distDimSize x nonDistSize array as a vector of row pointers
// into one contiguous block, so a whole local block can be handed to a
// single MPI call through a[0].
double** allocArray(int distDimSize, int nonDistSize) {
  double **a = malloc(distDimSize * sizeof(double *));
  if (a == NULL) {
    printf("bad alloc of a, %p\n", (void *) a); fflush(stdout);
    printf("distDimSize is %d, nonDistSize is %d\n", distDimSize, nonDistSize);
    fflush(stdout);
    exit(EXIT_FAILURE);
  }
  a[0] = malloc(distDimSize * nonDistSize * sizeof(double));
  if (a[0] == NULL) {
    printf("bad alloc of a[0], %p\n", (void *) a[0]); fflush(stdout);
    printf("distDimSize is %d, nonDistSize is %d\n", distDimSize, nonDistSize);
    fflush(stdout);
    exit(EXIT_FAILURE);
  }
  for (int i = 1; i < distDimSize; i++) {
    a[i] = a[0] + (nonDistSize * i);
  }
  return a;
}

void sequentialMM( ) {
  double t = MPI_Wtime( );
  double** a = allocArray(GLOBALUB, GLOBALUB);
  double** b = allocArray(GLOBALUB, GLOBALUB);
  double** c = allocArray(GLOBALUB, GLOBALUB);
  // initialize the operands; c must start at zero since we accumulate into it
  for (int i = 0; i < GLOBALUB; i++) {
    for (int j = 0; j < GLOBALUB; j++) {
      a[i][j] = 1.0;
      b[i][j] = 1.0;
      c[i][j] = 0.0;
    }
  }
  for (int i = 0; i < GLOBALUB; i++) {
    for (int j = 0; j < GLOBALUB; j++) {
      for (int k = 0; k < GLOBALUB; k++) {
        c[i][j] += a[i][k] * b[k][j];
      }
    }
  }
  t = MPI_Wtime( ) - t;
  printf("sequential time is: %f\n", t);
}

int main (int argc, char *argv[]) {
  // MPI bookkeeping variables
  int numP;
  int pid;
  MPI_Request sendR, recvR;
  MPI_Status sendS, recvS;

  // MPI Initialization
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numP);
  MPI_Comm_rank(MPI_COMM_WORLD, &pid);

  if (pid == 0) {
    sequentialMM( );
  }

  // Compute local array sizes.
  // We're making this easy and doing a square array.
  int localLB = LB(pid, numP, GLOBALUB);
  int localUB = UB(pid, numP, GLOBALUB);
  int distDimSize = localUB - localLB + 1;
  printf("pid: %d, distDimSize: %d\n", pid, distDimSize);

  // Allocate 2-dimensional arrays for A, B and C.
  // Process p owns the p'th set of rows of C and A and the p'th
  // set of columns of B. Note that B is transposed, i.e., elements
  // of columns are adjacent in memory. So, distDimSize rows for A and C,
  // distDimSize columns for B, GLOBALUB columns for A and C, and GLOBALUB
  // rows for B. But since B is transposed, the local shapes of A, B and
  // C are the same.
  double** a  = allocArray(distDimSize, GLOBALUB);
  double** b0 = allocArray(distDimSize, GLOBALUB);
  double** b1 = allocArray(distDimSize, GLOBALUB);
  double** c  = allocArray(distDimSize, GLOBALUB);
  double **swp;
  for (int i = 1; i < distDimSize; i++) {
    a[i]  = a[0]  + (GLOBALUB * i);
    b0[i] = b0[0] + (GLOBALUB * i);
    b1[i] = b1[0] + (GLOBALUB * i);
    c[i]  = c[0]  + (GLOBALUB * i);
  }

  // initialize the arrays
  for (int i = 0; i < distDimSize; i++) {
    for (int j = 0; j < GLOBALUB; j++) {
      a[i][j] = 1.0;
      b0[i][j] = 1.0;
      b1[i][j] = 1.0;
      c[i][j] = 0.0;
    }
  }

  // the MPI matrix multiply
  double t = MPI_Wtime( );
  // Each iteration of this loop computes with the block of B currently
  // resident on this process, then rotates the B blocks one process to
  // the right, so after numP iterations every process has used every
  // block of B.
  for (int p = 1; p < numP+1; p++) {
    // This loop nest computes the partial matrix multiply of the A rows
    // on this process times the B columns currently on this process. We
    // need to do this over all columns of B.
    for (int i = 0; i < distDimSize; i++) {
      for (int j = 0; j < distDimSize; j++) {
        // The resident block arrived from process (pid - p + 1) mod numP,
        // so offset is the global column index of its local column j.
        int offset = ((pid - p + 1 + numP) % numP) * distDimSize + j;
        for (int k = 0; k < GLOBALUB; k++) {
          c[i][offset] += a[i][k] * b0[j][k]; // note B indices due to B being
                                              // transposed
        }
      }
    }
    if (p != numP) { // no need to send on the last iteration of the p loop
      // send data to the right process; receive the next block from the left
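      // The exchange below is a nonblocking ring shift. The receive is
      // posted into the spare buffer b1 before the send of b0 starts, so
      // the arriving block can never overwrite the block that is still in
      // flight, and neither call blocks the other. (A blocking MPI_Send
      // around a ring can deadlock when the implementation does not buffer
      // the message, because every process would be stuck in its send.)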
      int sender = pid - 1;
      if (sender < 0) sender = numP - 1;
      MPI_Irecv(b1[0], distDimSize*GLOBALUB, MPI_DOUBLE, sender, 0,
                MPI_COMM_WORLD, &recvR);
      MPI_Isend(b0[0], distDimSize*GLOBALUB, MPI_DOUBLE, (pid+1)%numP, 0,
                MPI_COMM_WORLD, &sendR);
      MPI_Wait(&sendR, &sendS);
      MPI_Wait(&recvR, &recvS);
      // the freshly received block becomes the working block for the next p
      swp = b0; b0 = b1; b1 = swp;
    }
  }
  t = MPI_Wtime( ) - t;
  printf("time for MPI MM is %g\n", t);
  fflush(stdout);

  MPI_Finalize();
  return 0;
}
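// A minimal sanity check, offered as a sketch: it is not called from main
// above, and the name checkResult is ours, not part of the original program.
// With all inputs initialized to 1.0, every entry of the product C is a sum
// of GLOBALUB terms of 1.0 * 1.0, so it should equal exactly GLOBALUB
// (an integer value that doubles represent exactly).
int checkResult(double **c, int distDimSize) {
  for (int i = 0; i < distDimSize; i++) {
    for (int j = 0; j < GLOBALUB; j++) {
      if (c[i][j] != (double) GLOBALUB) {
        return 0; // found an incorrect entry
      }
    }
  }
  return 1; // every local entry matches the expected value
}

// Example build and run, assuming the file is saved as mm.c and the MPI
// installation provides the usual mpicc and mpiexec wrappers:
//   mpicc -O2 mm.c -o mm
//   mpiexec -n 4 ./mm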