#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define GLOBALUB 3200
#define LB(p,P,n) ((p)*(n)/(P))
#define UB(p,P,n) (LB((p+1),P,n)-1)

// Allocate a distDimSize x nonDistSize array as a vector of row pointers
// into one contiguous block, so a whole local block can be handed to a
// single MPI call through a[0].
double** allocArray(int distDimSize, int nonDistSize) {
  double **a = malloc(distDimSize * sizeof(double *));
  if (a == NULL) {
    printf("bad alloc of a, %p\n", (void *) a); fflush(stdout);
    printf("distDimSize is %d, nonDistSize is %d\n", distDimSize, nonDistSize);
    fflush(stdout);
    exit(EXIT_FAILURE);
  }
  a[0] = malloc(distDimSize * nonDistSize * sizeof(double));
  if (a[0] == NULL) {
    printf("bad alloc of a[0], %p\n", (void *) a[0]); fflush(stdout);
    printf("distDimSize is %d, nonDistSize is %d\n", distDimSize, nonDistSize);
    fflush(stdout);
    exit(EXIT_FAILURE);
  }
  for (int i = 1; i < distDimSize; i++) {
    a[i] = a[0] + (nonDistSize * i);
  }
  return a;
}

void sequentialMM( ) {
  double t = MPI_Wtime( );
  double** a = allocArray(GLOBALUB, GLOBALUB);
  double** b = allocArray(GLOBALUB, GLOBALUB);
  double** c = allocArray(GLOBALUB, GLOBALUB);
  // initialize the operands; c must start at zero since we accumulate into it
  for (int i = 0; i < GLOBALUB; i++) {
    for (int j = 0; j < GLOBALUB; j++) {
      a[i][j] = 1.0;
      b[i][j] = 1.0;
      c[i][j] = 0.0;
    }
  }
  for (int i = 0; i < GLOBALUB; i++) {
    for (int j = 0; j < GLOBALUB; j++) {
      for (int k = 0; k < GLOBALUB; k++) {
        c[i][j] += a[i][k] * b[k][j];
      }
    }
  }
  t = MPI_Wtime( ) - t;
  printf("sequential time is: %f\n", t);
}

int main (int argc, char *argv[]) {
  // MPI bookkeeping variables
  int numP;
  int pid;
  MPI_Request sendR, recvR;
  MPI_Status sendS, recvS;

  // MPI Initialization
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numP);
  MPI_Comm_rank(MPI_COMM_WORLD, &pid);

  if (pid == 0) {
    sequentialMM( );
  }

  // Compute local array sizes.
  // We're making this easy and doing a square array.
  int localLB = LB(pid, numP, GLOBALUB);
  int localUB = UB(pid, numP, GLOBALUB);
  int distDimSize = localUB - localLB + 1;
  printf("pid: %d, distDimSize: %d\n", pid, distDimSize);

  // Allocate 2-dimensional arrays for A, B and C.
  // Process p owns the p'th set of rows of C and A and the p'th
  // set of columns of B. Note that B is transposed, i.e., elements
  // of columns are adjacent in memory. So, distDimSize rows for A and C,
  // distDimSize columns for B, GLOBALUB columns for A and C, and GLOBALUB
  // rows for B. But since B is transposed, the local shapes of A, B and
  // C are the same.
  double** a  = allocArray(distDimSize, GLOBALUB);
  double** b0 = allocArray(distDimSize, GLOBALUB);
  double** b1 = allocArray(distDimSize, GLOBALUB);
  double** c  = allocArray(distDimSize, GLOBALUB);
  double **swp;
  for (int i = 1; i < distDimSize; i++) {
    a[i]  = a[0]  + (GLOBALUB * i);
    b0[i] = b0[0] + (GLOBALUB * i);
    b1[i] = b1[0] + (GLOBALUB * i);
    c[i]  = c[0]  + (GLOBALUB * i);
  }

  // initialize the arrays
  for (int i = 0; i < distDimSize; i++) {
    for (int j = 0; j < GLOBALUB; j++) {
      a[i][j] = 1.0;
      b0[i][j] = 1.0;
      b1[i][j] = 1.0;
      c[i][j] = 0.0;
    }
  }

  // the MPI matrix multiply
  double t = MPI_Wtime( );
  // Each iteration of this loop computes with the block of B currently
  // resident on this process, then rotates the B blocks one process to
  // the right, so after numP iterations every process has used every
  // block of B.
  for (int p = 1; p < numP+1; p++) {
    // This loop nest computes the partial matrix multiply of the A rows
    // on this process times the B columns currently on this process. We
    // need to do this over all columns of B.
    for (int i = 0; i < distDimSize; i++) {
      for (int j = 0; j < distDimSize; j++) {
        // The resident block arrived from process (pid - p + 1) mod numP,
        // so offset is the global column index of its local column j.
        int offset = ((pid - p + 1 + numP) % numP) * distDimSize + j;
        for (int k = 0; k < GLOBALUB; k++) {
          c[i][offset] += a[i][k] * b0[j][k]; // note B indices due to B being
                                              // transposed
        }
      }
    }
    if (p != numP) { // no need to send on the last iteration of the p loop
      // send data to the right process; receive the next block from the left
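      // The exchange below is a nonblocking ring shift. The receive is
      // posted into the spare buffer b1 before the send of b0 starts, so
      // the arriving block can never overwrite the block that is still in
      // flight, and neither call blocks the other. (A blocking MPI_Send
      // around a ring can deadlock when the implementation does not buffer
      // the message, because every process would be stuck in its send.)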
      int sender = pid - 1;
      if (sender < 0) sender = numP - 1;
      MPI_Irecv(b1[0], distDimSize*GLOBALUB, MPI_DOUBLE, sender, 0,
                MPI_COMM_WORLD, &recvR);
      MPI_Isend(b0[0], distDimSize*GLOBALUB, MPI_DOUBLE, (pid+1)%numP, 0,
                MPI_COMM_WORLD, &sendR);
      MPI_Wait(&sendR, &sendS);
      MPI_Wait(&recvR, &recvS);
      // the freshly received block becomes the working block for the next p
      swp = b0; b0 = b1; b1 = swp;
    }
  }
  t = MPI_Wtime( ) - t;
  printf("time for MPI MM is %g\n", t);
  fflush(stdout);

  MPI_Finalize();
  return 0;
}
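// A minimal sanity check, offered as a sketch: it is not called from main
// above, and the name checkResult is ours, not part of the original program.
// With all inputs initialized to 1.0, every entry of the product C is a sum
// of GLOBALUB terms of 1.0 * 1.0, so it should equal exactly GLOBALUB
// (an integer value that doubles represent exactly).
int checkResult(double **c, int distDimSize) {
  for (int i = 0; i < distDimSize; i++) {
    for (int j = 0; j < GLOBALUB; j++) {
      if (c[i][j] != (double) GLOBALUB) {
        return 0; // found an incorrect entry
      }
    }
  }
  return 1; // every local entry matches the expected value
}

// Example build and run, assuming the file is saved as mm.c and the MPI
// installation provides the usual mpicc and mpiexec wrappers:
//   mpicc -O2 mm.c -o mm
//   mpiexec -n 4 ./mm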