#include<stdlib.h>
#include<stdio.h>
#include<mpi.h>
#include<time.h>

/* The function random_number(double *rmin, double *rmax)
   returns a randum number between rmin and rmax */
double random_number(double *rmin, double *rmax){
  return (*rmin) + (float)rand()/(float)RAND_MAX*((*rmax)-(*rmin));
}

void main(int argc, char **argv){
  int n_ges;
  int myrank, nproz;
  int n_pro, n_pro_real, n_sent, n_loc;
  int dest,i,source;
  double z,z_loc;
  float end, start;
  double *rmin, *rmax;
  double *a, *b, *r;

/*  clock_t tcpu; */

  MPI_Status status;

  srand(time(0));

  rmin = (double*)malloc(sizeof(double));
  rmax = (double*)malloc(sizeof(double));

  *rmin = 0.;
  *rmax = 1.;

  n_ges=10000000;

  MPI_Init(&argc,&argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  MPI_Comm_size(MPI_COMM_WORLD, &nproz);

  if ( myrank==0 ){
    /* Only process 0 allocates the memory for the full vectors */
    a = (double*)malloc(n_ges*sizeof(double));
    b = (double*)malloc(n_ges*sizeof(double));
    r = (double*)malloc(n_ges*sizeof(double));
    /* Initialisation of the vector komponents */
    for(i=0; i<n_ges; i++){
      a[i] = random_number(rmin,rmax);
      b[i] = random_number(rmin,rmax);
    }

    n_pro = n_ges/nproz; /* n_pro is the rough number of data for each
                            process */

    if ( n_ges%nproz!=0 ) n_pro_real = n_pro+1;
    else n_pro_real = n_pro;  /* n_pro_real: usual number 
             of data per process. +1 if n_ges is not a multiple
             of p in order to avoid the last process to have only
             a large amount of data to process */

    n_loc = n_pro_real;   /* the local vector length */
    n_sent = n_loc;       /* counts the data sent to other processors */
            /* initialisation: the data processed by process 0 itself */

    for (dest=1; dest<nproz; dest++){
      if (dest==nproz-1) n_pro_real = n_ges - n_sent;
      printf("process %d's vector length: %d\n",dest,n_pro_real);
      /* before the data is send, tell the receiver the amount of data */
      MPI_Send(&n_pro_real,1,MPI_INT,dest,0,MPI_COMM_WORLD);
      /* process 0 is to send the appropriate data */
      MPI_Send(&(a[n_sent]),n_pro_real,MPI_DOUBLE,dest,1,MPI_COMM_WORLD);
      MPI_Send(&(b[n_sent]),n_pro_real,MPI_DOUBLE,dest,2,MPI_COMM_WORLD);
      n_sent += n_pro_real;
    }
  }

  else{
    /* get the local vector length first */
    MPI_Recv(&n_loc,1,MPI_INT,0,0,MPI_COMM_WORLD,&status);
    /* allocation of the memory individually needed */
    a = (double*)malloc(n_loc*sizeof(double));
    b = (double*)malloc(n_loc*sizeof(double));
    r = (double*)malloc(n_loc*sizeof(double));
    /* the other processes are receiving */
    MPI_Recv(a,n_loc,MPI_DOUBLE,0,1,MPI_COMM_WORLD,&status);
    MPI_Recv(b,n_loc,MPI_DOUBLE,0,2,MPI_COMM_WORLD,&status);
  }

/* the core calculation */
/*  tcpu = clock(); */
  start = MPI_Wtime();
  z_loc = 0.;
  for(i=0; i<n_loc; i++){
    r[i]=a[i]+b[i];
    z_loc += a[i]*b[i];
  }
  end = MPI_Wtime();

  printf("%d: elapsed time: %f\n",myrank,end-start);
/*  printf("%d: time by c-functions: %.4lf\n",
        myrank,(clock()-tcpu)/(double)CLOCKS_PER_SEC);  */

  fflush(stdout);
  MPI_Barrier(MPI_COMM_WORLD);

  if ( myrank!=0 ){
    /* each process sends back the results */
    MPI_Send(r,n_loc,MPI_DOUBLE,0,1,MPI_COMM_WORLD);
  }

  else{
    /* process 0 receives the data: for this purpose the
       local vectorlength are recalculated (they could
       also be passed from the other processes */
    if ( n_ges%nproz!=0 ) n_pro_real = n_pro+1;
    else n_pro_real = n_pro;

    n_loc = n_pro_real;
    n_sent = n_loc;
    for(source=1; source<nproz; source++){
      if ( source==nproz-1 ) n_pro_real = n_ges - n_sent;
      MPI_Recv(&(r[n_sent]),n_pro_real,MPI_DOUBLE,source,1,
           MPI_COMM_WORLD,&status);
      n_sent += n_pro_real;
    }
  }

  fflush(stdout);
  MPI_Barrier(MPI_COMM_WORLD);


  MPI_Reduce(&z_loc,&z,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
  if ( myrank==0 ){
    printf("z = %f\n",z);
  }
 
  MPI_Finalize();
}

