#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <mpi.h>

int main(int argc, char **argv) {

  double *a,*b,*c,*d,*x;
  int i,j,l,n,p,q;

  int myrank,nproz;
  int irank;
  MPI_Status status;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
  MPI_Comm_size(MPI_COMM_WORLD,&nproz);

  n = 12;      /* dimension of matrix A */
  p = nproz;   /* number of partitions (blocks in one direction) */

  if ( n%p!=0 ){
    if ( myrank==0 ){
      printf("Die Prozeßanzahl sollte ein Teiler von n sein.\n");
    }
    MPI_Finalize();
    exit(0);
  }

  q = n/p; 

  a = (double *) malloc( n * sizeof(double)  );
  b = (double *) malloc( n * sizeof(double)  );
  c = (double *) malloc( n * sizeof(double)  );
  d = (double *) malloc( n * sizeof(double)  );
  x = (double *) malloc( n * sizeof(double)  );

  if (myrank==0){
    for(i=0; i<n ;i++) {
      a[i] = 1.0; 
      b[i] = 4.0;
      c[i] = 5.0;
      d[i] = 3.0;
    }
    a[0]   = 0.0;
    c[n-1] = 0.0; 
  }

/* Distribute the raws to the processes: */
  if (myrank==0){
    for(irank=1; irank<nproz; irank++){
      MPI_Send(a,n,MPI_DOUBLE,irank,0,MPI_COMM_WORLD);
      MPI_Send(b,n,MPI_DOUBLE,irank,0,MPI_COMM_WORLD);
      MPI_Send(c,n,MPI_DOUBLE,irank,0,MPI_COMM_WORLD);
      MPI_Send(d,n,MPI_DOUBLE,irank,0,MPI_COMM_WORLD);
    } 
  } 
  if (myrank!=0){
    MPI_Recv(a,n,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&status);
    MPI_Recv(b,n,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&status);
    MPI_Recv(c,n,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&status);
    MPI_Recv(d,n,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&status);
  }

  l = myrank;

  /* step 1: Elimination of the subdiagonal elements */
  /*         -> no communication needed */
  for(j=1; j<q ;j++) {    
    b[l*q+j] += -a[l*q+j] / b[l*q+j-1] * c[l*q+j-1];
    d[l*q+j] += -a[l*q+j] / b[l*q+j-1] * d[l*q+j-1];
    if( l>0 ) {
      a[l*q+j] = -a[l*q+j] / b[l*q+j-1] * a[l*q+j-1];
    }
  }

  /* step 2: Elimination of the upper side diagonal */
  /*         -> for the last element of the partition the first 
                (modified) raw of the next partition is required   */
  /* first compute the raws but the last one */
  if ( myrank!=0 ){
    for(j=q-3; j>=0 ;j--){
      a[l*q+j] += -c[l*q+j] / b[l*q+j+1] * a[l*q+j+1];
    }
  }

  for(j=q-3; j>=0; j--){
    d[l*q+j] +=  -c[l*q+j] / b[l*q+j+1] * d[l*q+j+1];
    c[l*q+j]  =  -c[l*q+j] / b[l*q+j+1] * c[l*q+j+1];
  }

  /* pass the first raw of the partition to the upper neighbour */
  for(irank=1; irank<nproz; irank++){
    if ( myrank==irank ){
      MPI_Send(&a[l*q],1,MPI_DOUBLE,irank-1,1,MPI_COMM_WORLD);
      MPI_Send(&b[l*q],1,MPI_DOUBLE,irank-1,2,MPI_COMM_WORLD);
      MPI_Send(&c[l*q],1,MPI_DOUBLE,irank-1,3,MPI_COMM_WORLD);
      MPI_Send(&d[l*q],1,MPI_DOUBLE,irank-1,4,MPI_COMM_WORLD);
    } 
  } 
  for(irank=0; irank<nproz-1; irank++){
    if ( myrank==irank ){
      MPI_Recv(&a[(l+1)*q],1,MPI_DOUBLE,irank+1,1,MPI_COMM_WORLD,&status);
      MPI_Recv(&b[(l+1)*q],1,MPI_DOUBLE,irank+1,2,MPI_COMM_WORLD,&status);
      MPI_Recv(&c[(l+1)*q],1,MPI_DOUBLE,irank+1,3,MPI_COMM_WORLD,&status);
      MPI_Recv(&d[(l+1)*q],1,MPI_DOUBLE,irank+1,4,MPI_COMM_WORLD,&status);
    } 
  } 

  /* finally the last raw */
  if ( myrank!=nproz-1 ){
    b[(l+1)*q-1] += -c[(l+1)*q-1] / b[(l+1)*q] * a[(l+1)*q];
    d[(l+1)*q-1] += -c[(l+1)*q-1] / b[(l+1)*q] * d[(l+1)*q];
    c[(l+1)*q-1]  = -c[(l+1)*q-1] / b[(l+1)*q] * c[(l+1)*q];
  }


  /* step 3: transformation to an upper triangular matrix: eliminate
             a's by means of the last line of the upper neighbour */
  /*         -> first receive the upper neighbour's last modified raw,
                then compute own last raw and pass the modified elements;
                the rest can be done afterwards in parallel. */
  for(irank=0; irank<nproz-1; irank++){
    if ( myrank==irank+1 ){
      MPI_Recv(&b[l*q-1],1,MPI_DOUBLE,irank,1,MPI_COMM_WORLD,&status);
      MPI_Recv(&c[l*q-1],1,MPI_DOUBLE,irank,2,MPI_COMM_WORLD,&status);
      MPI_Recv(&d[l*q-1],1,MPI_DOUBLE,irank,3,MPI_COMM_WORLD,&status);
      b[(l+1)*q-1] += -a[(l+1)*q-1] / b[l*q-1] * c[l*q-1];
      d[(l+1)*q-1] += -a[(l+1)*q-1] / b[l*q-1] * d[l*q-1];
    } 
    if ( myrank==irank ){
      MPI_Send(&b[(l+1)*q-1],1,MPI_DOUBLE,irank+1,1,MPI_COMM_WORLD);
      MPI_Send(&c[(l+1)*q-1],1,MPI_DOUBLE,irank+1,2,MPI_COMM_WORLD);
      MPI_Send(&d[(l+1)*q-1],1,MPI_DOUBLE,irank+1,3,MPI_COMM_WORLD);
    } 
  } 

  /* computation of the raws 0 up to q-2 in parallel */
  if ( myrank!=0 ){
    for(j=0; j<q-1; j++) {
      c[l*q+j] += -a[l*q+j] / b[l*q-1] * c[l*q-1];
      d[l*q+j] += -a[l*q+j] / b[l*q-1] * d[l*q-1];
    }
  }

  /* step 4: transform matrix to diagonal form*/
  /*         -> first receive the elements of the last raw of the
                lower neighbour, then calculate the right hand side
                of the last raw of the own partition and pass it upwards;
                the remaining elements are treated in parallel */
  for(irank=nproz-1; irank>=0; irank--){
    if ( myrank==irank-1 ){
      MPI_Recv(&b[(l+2)*q-1],1,MPI_DOUBLE,irank,3,MPI_COMM_WORLD,&status);
      MPI_Recv(&d[(l+2)*q-1],1,MPI_DOUBLE,irank,3,MPI_COMM_WORLD,&status);
    }
    if ( myrank==irank ){
      if ( irank!=nproz-1 )
        d[(l+1)*q-1] += -c[(l+1)*q-1] / b[(l+2)*q-1] * d[(l+2)*q-1];
      if ( irank!=0 )
        MPI_Send(&b[(l+1)*q-1],1,MPI_DOUBLE,irank-1,3,MPI_COMM_WORLD);
        MPI_Send(&d[(l+1)*q-1],1,MPI_DOUBLE,irank-1,3,MPI_COMM_WORLD);
    } 
  }

  for(j=0; j<=q-2; j++){
    d[l*q+j] += -c[l*q+j] / b[(l+1)*q-1] * d[(l+1)*q-1];
  }


  /* calculate the solution: */ 
  for(j=0; j<q; j++){
    x[l*q+j] = d[l*q+j] / b[l*q+j];
  }

  if ( myrank!=0 ){
    MPI_Send(&x[l*q],q,MPI_DOUBLE,0,4,MPI_COMM_WORLD);
  } 
  if ( myrank==0 ){
    for(irank=1; irank<nproz; irank++){
      MPI_Recv(&x[irank*q],q,MPI_DOUBLE,irank,4,MPI_COMM_WORLD,&status);
    }
  }

  /* print the solution: */
  if (myrank==0){
    for(j=0; j<n; j++)
      printf(" x[%d] = %6.2f\n", j, x[j] );
  }

  fflush(stdout);
  MPI_Finalize();
}


