#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <mpi.h>

#define DIMENSION 300





int main(int argc, char *argv[]) {
   MPI_Status status;
   int p, my_rank, i,j,k, dest;
   double **A, **A_ser;
   double *buffer;
   double l;
   double start_par, end_par, start_ser, end_ser, serial_time, parallel_time, speedup;
   double diff, error;

   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &p);
   MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

   if (DIMENSION % p != 0) {
       if (my_rank == 0) {
	   printf("The number of rows given (%d) cannot be divided evenly by the number of processes (%d)\n",
		  DIMENSION, p);
       }
       fflush(stdout);
       fflush(stderr);
       MPI_Barrier(MPI_COMM_WORLD);
       MPI_Finalize();
       exit(1);
   }

   
   A = (double**) malloc(DIMENSION * sizeof(double *));
   for (i=0;i<DIMENSION;i++) {
       A[i] = (double*) malloc(DIMENSION * sizeof(double));
   }
   buffer = (double*) malloc(DIMENSION * sizeof(double));

   for (i=0;i<DIMENSION;i++) {
       for (k=0;k<DIMENSION;k++) {
	   A[i][k] = 0;
       }
   }

   if (my_rank == 0) {
       A_ser = (double**) malloc(DIMENSION * sizeof(double *));
       for (i=0;i<DIMENSION;i++) {
	   A_ser[i] = (double*) malloc(DIMENSION * sizeof(double));
       }
       for (i=0;i<DIMENSION;i++) {
	   for (k=0;k<DIMENSION;k++) {
	       A[i][k] = ((double) rand()) / ((double) RAND_MAX);
	   }
	   A[i][i] = 1 + (((double) rand()) / ((double) RAND_MAX));
	   if (A[i][i] == 0.0) A[i][i] = 1.0;
       }
       for (i=0;i<DIMENSION;i++) {
	   for (k=0;k<DIMENSION;k++) {
	       A_ser[i][k] = A[i][k];
	   }
       }
   }

   if (my_rank == 0) {
       start_par = MPI_Wtime();
       for (i=0;i<DIMENSION;i++) {
	   dest = i % p;
	   if (dest != 0) {
	       for (j=0;j<DIMENSION;j++) 
		   buffer[j] = A[j][i];
	       MPI_Send(buffer,DIMENSION, MPI_DOUBLE, dest, i, MPI_COMM_WORLD);
	   }
       }
   } else {
       for (i=0;i<DIMENSION;i++) {
	   if (i % p == my_rank) {
	       MPI_Recv(buffer,DIMENSION, MPI_DOUBLE, 0, i, MPI_COMM_WORLD, &status);
	       for (j=0;j<DIMENSION;j++) 
		   A[j][i] = buffer[j];
	   }
       }
   }




   for (k=0;k<DIMENSION-1;k++) {
       if (my_rank == k%p) {
	   for (i=k+1;i<DIMENSION;i++) {
	       A[i][k] = -1.0 * A[i][k] / A[k][k];
	       buffer[i - k - 1] = A[i][k];
	   }
       }
       MPI_Bcast(buffer,DIMENSION-k-1,MPI_DOUBLE,k%p,MPI_COMM_WORLD);
       for (j=k+1;j<DIMENSION;j++) {
	   for (i=k+1;i<DIMENSION;i++) {
	       A[i][j] += buffer[i -k -1] * A[k][j];
	   }
       }
   }


   if (my_rank == 0) {
       for (i=0;i<DIMENSION;i++) {
	   if (i%p != 0) {
	       MPI_Recv(buffer,DIMENSION,MPI_DOUBLE,i%p,i,MPI_COMM_WORLD,&status);
	       for (j=0;j<DIMENSION;j++)
		   A[j][i] = buffer[j];
	   }
       }
       for (i=1;i<DIMENSION;i++) {
	   for (j=0;j<i;j++) {
	       A[i][j] = 0.0;
	   }
       }
       end_par = MPI_Wtime();
   } else {
       for (i=0;i<DIMENSION;i++) {
	   if (my_rank == i%p) {
	       for (j=0;j<DIMENSION;j++) 
		   buffer[j] = A[j][i];
	       MPI_Send(buffer,DIMENSION,MPI_DOUBLE,0,i,MPI_COMM_WORLD);
	   }
       }
   }
   


   if (my_rank == 0) {
       start_ser = MPI_Wtime();
       for (k=0;k<DIMENSION-1;k++) {
	   for (i=k+1;i<DIMENSION;i++) {
	       l = A_ser[i][k] / A_ser[k][k];
	       for (j=k;j<DIMENSION;j++) {
		   A_ser[i][j] -= l*A_ser[k][j];
	       }
	   }
       } 
       for (i=1;i<DIMENSION;i++) {
	   for (j=0;j<i;j++) {
	       A_ser[i][j] = 0.0;
	   }
       } 
       end_ser = MPI_Wtime();

       serial_time = end_ser - start_ser;
       parallel_time = end_par - start_par;
       speedup = serial_time / parallel_time;

       printf("Applied Gaussian elimination on a %dx%d-matrix\n",DIMENSION,DIMENSION);
       printf("Parallel version: %e seconds\n",parallel_time);
       printf("Serial version: %e seconds\n",serial_time);
       printf("This is a speedup of %e\n",speedup);


       error = 0;
       for (i=0;i<DIMENSION;i++) {
	   for (j=0;j<DIMENSION;j++) {
	       diff = A_ser[i][j] - A[i][j];
	       error += diff * diff;
	   }
       }
       
       printf("'Correctness' of the parallel version -\n \\sum_i \\sum_k (A[i][k] - A'[i][k])^2 = %e\n",error);
   }

   fflush(stdout);
   fflush(stderr);
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
   return 0;
}

