#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<math.h>
#include<mpi.h>


double random_number(double *rmin, double *rmax){
  return (*rmin) + (float)rand()/(float)RAND_MAX*((*rmax)-(*rmin));
}


int main(int argc, char **argv){
  double **A, **B, **C;
  double *Af, *Bf, *Cf;
  int i,j,k;
  double r_min,r_max;
  int n_ges;

  clock_t tcpu_1, tcpu_2;

  int i_lef,i_rig,i_top,i_bot;

  int myrank,nproz;
  int irank;
  int ndim,l;
  MPI_Status status;
  MPI_Comm comm_cart;
  int *idims, *iperiods, *icart;

  srand(time(0));

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproz);

/* ndim is the number of submatrices in each row and column. */
  ndim = sqrt(nproz);

  icart    = (int*)malloc(2*sizeof(int));
  idims    = (int*)malloc(2*sizeof(int));
  iperiods = (int*)malloc(2*sizeof(int));
  idims[0] = ndim; idims[1] = ndim;  /* dimensions in each direction */
  iperiods[0] = 1; iperiods[1] = 1;  /* periodic boundaries in each direction */

/* Create the new cartsian-based communicator */
  MPI_Cart_create(MPI_COMM_WORLD,2,idims,iperiods,0,&comm_cart);

  n_ges = 100;
  r_min = 0.0;
  r_max = 1.0;

/* Allocate the fields A,B and C as a coherent part of the memory:
   this makes it easier to pass the whole fields to other processes. */
  A = (double**)malloc(n_ges*sizeof(double*));
  B = (double**)malloc(n_ges*sizeof(double*));
  C = (double**)malloc(n_ges*sizeof(double*));

  Af = (double*)malloc(n_ges*n_ges*sizeof(double));
  Bf = (double*)malloc(n_ges*n_ges*sizeof(double));
  Cf = (double*)malloc(n_ges*n_ges*sizeof(double));

  for (i=0; i<n_ges; i++){
    A[i] = &Af[i*n_ges];
    B[i] = &Bf[i*n_ges];
    C[i] = &Cf[i*n_ges];
  }

/* Initialisation of the fields: */
  for(i=0; i<n_ges; i++){
    for(j=0; j<n_ges; j++){
      A[i][j] = random_number(&r_min,&r_max);
      B[i][j] = random_number(&r_min,&r_max);
      C[i][j] = 0.;
    }
  }

  if (myrank==0) A[0][0] = 2.;

/* Get the numbers (ranks) of the neighbours */
  MPI_Cart_shift(comm_cart,1,-1,&i_rig,&i_lef);
  MPI_Cart_shift(comm_cart,0,1,&i_top,&i_bot);

/* If desired: doublecheck the topology */
/*
  printf("%d: l:%d r:%d t:%d b:%d\n",myrank,i_lef,i_rig,i_top,i_bot);
*/

/* For correct initialisation first pass the blockmatrix A
   column-times to the left and B row-times upward! */
  for (j=0; j<ndim; j++){
    for (k=0; k<j; k++){
      for (i=0; i<ndim; i++){
        icart[0] = i; icart[1] = j;
        MPI_Cart_rank(comm_cart,icart,&irank);
        if ( myrank==irank ){
          MPI_Sendrecv_replace(&B[0][0],n_ges*n_ges,MPI_DOUBLE,
                        i_bot,l+ndim,i_top,l+ndim,comm_cart,&status);
        }
      }
    }
  }
  for (i=0; i<ndim; i++){
    for (k=0; k<i; k++){
      for (j=0; j<ndim; j++){
        icart[0] = i; icart[1] = j;
        MPI_Cart_rank(comm_cart,icart,&irank);
        if ( myrank==irank ){
          MPI_Sendrecv_replace(&A[0][0],n_ges*n_ges,MPI_DOUBLE,
                        i_lef,l,i_rig,l,comm_cart,&status);
        }
      }
    }
  }

  tcpu_1 = clock();

  for (l=0; l<ndim; l++){   /* l-loop counts the passing steps */

    for(i=0; i<n_ges; i++){   /* This is the local matrix multiplication. */
      for(j=0; j<n_ges; j++){
        for (k=0; k<n_ges; k++){
          C[i][j] += A[i][k]*B[k][j];
        }
      }
    }

    if ( l!=ndim-1 ){  /* Not necessary last time: */
/* Each process passes its A to the left and gets a fresh one from the right. */
/* Each process passes its B down and gets a fresh one from top. */
      MPI_Sendrecv_replace(&A[0][0],n_ges*n_ges,MPI_DOUBLE,
                    i_lef,l,i_rig,l,comm_cart,&status);
      MPI_Sendrecv_replace(&B[0][0],n_ges*n_ges,MPI_DOUBLE,
                    i_bot,l+ndim,i_top,l+ndim,comm_cart,&status);
    }

  }

/* This is an output for 2x2 submatrices. */
/*
  printf("%d: %6.3f  %6.3f\n",myrank,C[0][0],C[0][1]);
  printf("%d: %6.3f  %6.3f\n",myrank,C[1][0],C[1][1]);
*/

  tcpu_2 = clock();

  printf("%d: %.lf\n",myrank,(tcpu_2-tcpu_1)/(double)CLOCKS_PER_SEC);

  MPI_Finalize();

  return 0;
}

