#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#include <stdio.h>
#include <math.h>

#define MATRIX_DIMENSION 36

int index(int row, int col, int sp) {
   return ( ((row+sp)%sp)*sp + (col+sp)%sp );
}

void main(int argc, char *argv[]) {

   float *a, *b, *c, *tmp_a, *tmp_b;    /* local matrices, help fields for matrices */
   float **A, **B, **C;                 /* global matrices */
   int i, j, k, l, p;                   /* loop indices; processor number */
   int p_imin, p_imax, p_jmin, p_jmax;  /* loop indices limits for local matrices calculation */
   int dg, dl, dl2, sp;                 /* dimensions global, local, local spare; square root of p */
   int my_rank, my_row, my_col;         /* ... */
   double t1, t2;                       /* time measurement */
   MPI_Status status;                   /* ... */
 
   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &p);
   MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
   
   dg = MATRIX_DIMENSION;   /* dimension global */

   sp = sqrt(p);            /* square root of p */
   if (sp*sp != p) { 
      if (my_rank == 0) printf("Number of processors is not a quadratic number.\n");
      MPI_Finalize();
      exit(0);
   }

   dl  = dg / sp;   /* dimension local */
   dl2 = dl * dl;   /* dimension local square */

   my_col =  my_rank % sp ;
   my_row = (my_rank-my_col) / sp ;

   a = (float *)malloc( dl2 * sizeof(float) );
   b = (float *)malloc( dl2 * sizeof(float) );
   c = (float *)malloc( dl2 * sizeof(float) );

   for(i=0; i<dl2 ; i++) 
      c[i] = 0.0;

   tmp_a = (float *)malloc( dl2 * sizeof(float) );
   tmp_b = (float *)malloc( dl2 * sizeof(float) );

   if(my_rank == 0) {
      A = (float **)malloc( dg  * sizeof(float*) );
      B = (float **)malloc( dg  * sizeof(float*) );
      C = (float **)malloc( dg  * sizeof(float*) );

      for(i=0; i<dg; i++) {
         A[i] = (float *)malloc( dg * sizeof(float) );
         B[i] = (float *)malloc( dg * sizeof(float) );
         C[i] = (float *)malloc( dg * sizeof(float) );
      }
      for(i=0; i<dg ; i++) {
         for(j=0; j<dg ; j++) { 
            A[i][j] = rand();
            B[i][j] = rand();
            C[i][j] = 0.0;
	 }
      }

      t1 = MPI_Wtime();
      for(k=0; k<p; k++) {
	 p_jmin = (k % sp    ) *dl; 
	 p_jmax = (k % sp + 1) *dl-1;
	 p_imin =  (k - (k % sp))/sp     *dl;
	 p_imax = ((k - (k % sp))/sp +1) *dl -1;
         l = 0; 
         for(i=p_imin; i<=p_imax; i++) {
	    for(j=p_jmin; j<=p_jmax; j++) {
               tmp_a[l] = A[i][j];
	       tmp_b[l] = B[i][j];
	       l++;
	    }
         }
	 if(k==0) {
            memcpy(a, tmp_a, dl2 * sizeof(float));
	    memcpy(b, tmp_b, dl2 * sizeof(float));
	 } else {
            MPI_Send(tmp_a, dl2, MPI_FLOAT, k, 1, MPI_COMM_WORLD);
	    MPI_Send(tmp_b, dl2, MPI_FLOAT, k, 2, MPI_COMM_WORLD);
         }
      }
   } else {
      MPI_Recv(a, dl2, MPI_FLOAT, 0 , 1, MPI_COMM_WORLD, &status);
      MPI_Recv(b, dl2, MPI_FLOAT, 0 , 2, MPI_COMM_WORLD, &status);
   }
   /*initialization procedure */
   MPI_Sendrecv(a,     dl2, MPI_FLOAT, index(my_row,my_col-my_row,sp), 1, 
		tmp_a, dl2, MPI_FLOAT, index(my_row,my_col+my_row,sp), 1, MPI_COMM_WORLD, &status);
   memcpy(a, tmp_a, dl2 * sizeof(float) );
   MPI_Sendrecv(b,     dl2, MPI_FLOAT, index(my_row-my_col,my_col,sp), 1, 
                tmp_b, dl2, MPI_FLOAT, index(my_row+my_col,my_col,sp), 1, MPI_COMM_WORLD, &status);
   memcpy(b, tmp_b, dl2 * sizeof(float) );

   /* main loop of Cannon's algorithm */
   for(l=0; l<sp; l++) {
    
      for(i=0; i<dl; i++)
         for(j=0; j<dl; j++)
            for(k=0; k<dl; k++)
               c[i*dl+j] += a[i*dl+k]*b[k*dl+j];

      /* shift "a" to the left */
      MPI_Send(a , dl2, MPI_FLOAT, index(my_row, my_col-1, sp), 1, MPI_COMM_WORLD);
      MPI_Recv(a , dl2, MPI_FLOAT, index(my_row, my_col+1, sp), 1, MPI_COMM_WORLD, &status);

      /* shift "b" upwards */
      MPI_Send(b , dl2, MPI_FLOAT, index(my_row-1, my_col, sp), 1, MPI_COMM_WORLD);
      MPI_Recv(b , dl2, MPI_FLOAT, index(my_row+1, my_col, sp), 1, MPI_COMM_WORLD, &status);
   }

   if(my_rank == 0) {
      t2 = MPI_Wtime();
      printf("Elapsed time: %6.2e seconds\n", (t2-t1));

      /* collect matrices "c" and compose matrix "C" */ 
      /* not yet implemented */
   } else {
      /* send matrix "c" to process 0 */
      /* not yet implemented */
   }  
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
}

