#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<math.h>
#include<mpi.h>


double random_number(double *rmin, double *rmax){
  return (*rmin) + (float)rand()/(float)RAND_MAX*((*rmax)-(*rmin));
}

/* The function cart calculates from the individual rank of
   the process the cartesian coordinates of the process.
   The argument icart is an array of two integers where the
   first one denotes the row, the second one the column of the
   process with respect to the global matrix. */
void cart(int rank, int nproz, int *icart){
  int ndim = sqrt(nproz);
  icart[0] = rank/ndim;  /* line */
  icart[1] = rank%ndim;  /* column */
}

/* The function rank() converts the cartesian coordinates
   of each process into the individual rank. */
int rank(int icart0, int icart1, int nproz){
  int ndim = sqrt(nproz);
  if ( icart0==-1 ) icart0 = ndim-1;
  if ( icart0==ndim ) icart0 = 0;
  if ( icart1==-1 ) icart1 = ndim-1;
  if ( icart1==ndim ) icart1 = 0;
  return icart0*ndim+icart1;
}

int main(int argc, char **argv){
  double **A, **B, **C;
  double *Af, *Bf, *Cf;
  double *buffer;
  int buffer_size;
  int i,j,k;
  double r_min,r_max;
  int n_ges;

  clock_t tcpu_1, tcpu_2;

  int i_lef,i_rig,i_top,i_bot;

  int myrank,nproz;
  int irank;
  int *icart;
  int ndim,l;
  MPI_Status status;

  srand(time(0));

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproz);

/* ndim is the number of submatrices in each row and column. */
  ndim = sqrt(nproz);

/* In icart the individual cartesian coordinates of each process are stored. */
  icart = (int*)malloc(2*sizeof(double));
  cart(myrank,nproz,icart);

  n_ges = 100;

  buffer_size = ndim*sizeof(double)*n_ges*n_ges;
  buffer = (double*)malloc(buffer_size);
  MPI_Buffer_attach(buffer,buffer_size);

  r_min = 0.0;
  r_max = 1.0;

/* Allocate the fields A,B and C as a coherent part of the memory:
   this makes it easier to pass the whole fields to other processes. */
  A = (double**)malloc(n_ges*sizeof(double*));
  B = (double**)malloc(n_ges*sizeof(double*));
  C = (double**)malloc(n_ges*sizeof(double*));

  Af = (double*)malloc(n_ges*n_ges*sizeof(double));
  Bf = (double*)malloc(n_ges*n_ges*sizeof(double));
  Cf = (double*)malloc(n_ges*n_ges*sizeof(double));

  for (i=0; i<n_ges; i++){
    A[i] = &Af[i*n_ges];
    B[i] = &Bf[i*n_ges];
    C[i] = &Cf[i*n_ges];
  }

/* Initialisation of the fields: */
  for(i=0; i<n_ges; i++){
    for(j=0; j<n_ges; j++){
      A[i][j] = random_number(&r_min,&r_max);
      B[i][j] = random_number(&r_min,&r_max);
      C[i][j] = 0.;
    }
  }

/* Get the numbers (ranks) of the neighbours */
  i_lef = rank(icart[0],icart[1]-1,nproz);
  i_rig = rank(icart[0],icart[1]+1,nproz);
  i_top = rank(icart[0]-1,icart[1],nproz);
  i_bot = rank(icart[0]+1,icart[1],nproz);

/* If desired: doublecheck the topology */
/*
  printf("%d: i=%d j=%d\n",myrank,icart[0],icart[1]);
  printf("%d: l:%d r:%d t:%d b:%d\n",myrank,i_lef,i_rig,i_top,i_bot);
*/

/* For correct initialisation first pass the blockmatrix A
   column-times to the left and B row-times upward! */
  for (i=0; i<ndim; i++){
    for (k=0; k<i; k++){
      for (j=0; j<ndim; j++){
        irank = rank(i,j,nproz);
        if ( myrank==irank ){
          MPI_Bsend(&A[0][0],n_ges*n_ges,MPI_DOUBLE,i_lef,
                   i,MPI_COMM_WORLD);
          MPI_Recv(&A[0][0],n_ges*n_ges,MPI_DOUBLE,i_rig,
                   i,MPI_COMM_WORLD,&status);
        }
      }
    }
  }
  for (j=0; j<ndim; j++){
    for (k=0; k<j; k++){
      for (i=0; i<ndim; i++){
        irank = rank(i,j,nproz);
        if ( myrank==irank ){
          MPI_Bsend(&B[0][0],n_ges*n_ges,MPI_DOUBLE,i_top,
                   j,MPI_COMM_WORLD);
          MPI_Recv(&B[0][0],n_ges*n_ges,MPI_DOUBLE,i_bot,
                   j,MPI_COMM_WORLD,&status);
        }
      }
    }
  }

  tcpu_1 = clock();

  for (l=0; l<ndim; l++){   /* l-loop counts the passing steps */

    for(i=0; i<n_ges; i++){   /* This is the local matrix multiplication. */
      for(j=0; j<n_ges; j++){
        for (k=0; k<n_ges; k++){
          C[i][j] += A[i][k]*B[k][j];
        }
      }
    }

    if ( l!=ndim-1 ){  /* Not necessary last time: */
/* Each process passes its A to the left and gets a fresh one from the right. */
      MPI_Bsend(&A[0][0],n_ges*n_ges,MPI_DOUBLE,i_lef,
               l,MPI_COMM_WORLD);
      MPI_Recv(&A[0][0],n_ges*n_ges,MPI_DOUBLE,i_rig,
               l,MPI_COMM_WORLD,&status);

/* Each process passes its B down and gets a fresh one from top. */
      MPI_Bsend(&B[0][0],n_ges*n_ges,MPI_DOUBLE,i_top,
               l+ndim,MPI_COMM_WORLD);
      MPI_Recv(&B[0][0],n_ges*n_ges,MPI_DOUBLE,i_bot,
               l+ndim,MPI_COMM_WORLD,&status);
    }

  }

/* This is an output for 2x2 submatrices. */
/*
  printf("%d: %6.3f  %6.3f\n",myrank,C[0][0],C[0][1]);
  printf("%d: %6.3f  %6.3f\n",myrank,C[1][0],C[1][1]);
*/

  tcpu_2 = clock();

  printf("%d: %.lf\n",myrank,(tcpu_2-tcpu_1)/(double)CLOCKS_PER_SEC);

  MPI_Buffer_detach(buffer,&buffer_size);
  MPI_Finalize();
  
  return 0;
}

