#include<stdlib.h>
#include<stdio.h>
#include<mpi.h>
#include<time.h>

double random_number(double *rmin, double *rmax){
  return (*rmin) + (float)rand()/(float)RAND_MAX*((*rmax)-(*rmin));
}

void main(int argc, char **argv){
  int n_ges;
  int myrank, nproz;
  int n_pro, n_pro_real, n_sent;
  int *n_loc, *n_pos;
  int dest,i,source;
  double z,z_loc;
  float end, start;
  double *rmin, *rmax;
  double *a, *b, *r;

/*  clock_t tcpu; */

  MPI_Status status;

  srand(time(0));

  rmin = (double*)malloc(sizeof(double));
  rmax = (double*)malloc(sizeof(double));

  *rmin = 0.;
  *rmax = 1.;

  n_ges=10000000;

  MPI_Init(&argc,&argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  MPI_Comm_size(MPI_COMM_WORLD, &nproz);

  n_loc = (int*)malloc(nproz*sizeof(int));
  n_pos = (int*)malloc(nproz*sizeof(int));

  if ( myrank==0 ){
    a = (double*)malloc(n_ges*sizeof(double));
    b = (double*)malloc(n_ges*sizeof(double));
    r = (double*)malloc(n_ges*sizeof(double));
    for(i=0; i<n_ges; i++){
      a[i] = random_number(rmin,rmax);
      b[i] = random_number(rmin,rmax);
    }
  }

  n_pro = n_ges/nproz;

  if ( n_ges%nproz!=0 ) n_pro_real = n_pro+1;
  else n_pro_real = n_pro;  /* n_pro_real: usual number 
                                    of data per processor */

  n_loc[0] = n_pro_real;   /* the local number of data */
  n_pos[0] = 0;

  for (dest=1; dest<nproz; dest++){
    n_pos[dest] = n_pos[dest-1] + n_loc[dest-1];
    if (dest==nproz-1) n_loc[dest] = n_ges - n_pos[nproz-1];
    else n_loc[dest] = n_pro_real;
    if ( myrank==dest ){
      a = (double*)malloc(n_loc[dest]*sizeof(double));
      b = (double*)malloc(n_loc[dest]*sizeof(double));
      r = (double*)malloc(n_loc[dest]*sizeof(double));
    }
  }

  if ( myrank==0 ){
    for (i=0; i<nproz; i++){
      printf("process %d's vector length: %d starting point: %d\n",
            i,n_loc[i],n_pos[i]);
    }
  }

  MPI_Scatterv(a,n_loc,n_pos,MPI_DOUBLE,a,n_loc[myrank],
               MPI_DOUBLE,0,MPI_COMM_WORLD);
  MPI_Scatterv(b,n_loc,n_pos,MPI_DOUBLE,b,n_loc[myrank],
               MPI_DOUBLE,0,MPI_COMM_WORLD);

/*  tcpu = clock(); */
  start = MPI_Wtime();
  z_loc = 0.;
  for(i=0; i<n_loc[myrank]; i++){
    r[i]=a[i]+b[i];
    z_loc += a[i]*b[i];
  }
  end = MPI_Wtime();

  printf("%d: elapsed time: %f\n",myrank,end-start);
/*  printf("%d: time by c-functions: %.4lf\n",
        myrank,(clock()-tcpu)/(double)CLOCKS_PER_SEC);  */

  MPI_Gatherv(r,n_loc[myrank],MPI_DOUBLE,r,n_loc,n_pos,
              MPI_DOUBLE,0,MPI_COMM_WORLD);
  
  MPI_Reduce(&z_loc,&z,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
  if ( myrank==0 ){
    printf("z = %f\n",z);
  }

  fflush(stdout);
  MPI_Barrier(MPI_COMM_WORLD);

  MPI_Finalize();
}

