#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <mpi.h>

#define N 500
#define EPSILON 1e-12
#define MAX_ITER 10000

double **make_matrix(int n) {
    int i;
    double **A;

    A = (double**) malloc(n * sizeof(double*));
    for (i=0;i<n;i++) 
	A[i] = (double*) malloc(n * sizeof(double));
    return A;
}

void free_matrix(double **A, int n) {
    int i;
    for (i=0;i<n;i++) 
	free(A[i]);
    free(A);
}

double **fill_random_matrix(double **A, int n) {
    int i, j;

    for (i=0;i<n;i++) {
	for (j=0;j<=i;j++) {
	    A[i][j] = ((double) rand()) / ((double) RAND_MAX);
	    A[j][i] = A[i][j];
	}
	A[i][i] += 12.0;
    }
    return A;
}

double *fill_random_vector(double *v, int n) {
    int i;

    for (i=0;i<n;i++) 
	v[i] = ((double) rand()) / ((double) RAND_MAX);
    return v;
}

void print_matrix(double **A, int n) {
    int i, j;

    for (i=0;i<n;i++) {
	for (j=0;j<n;j++) {
	    printf("%e ",A[i][j]);
	}
	printf("\n");
    }
    printf("\n");
}

void print_vector(double *v, int n) {
    int i;

    for (i=0;i<n;i++) 
	printf("%e\n",v[i]);
    printf("\n");
}

double **copy_matrix(double **A, int n) {
    double **B;

    int i,j;

    B = make_matrix(n);
    for (i=0;i<n;i++) 
	for (j=0;j<n;j++) 
	    B[i][j] = A[i][j];
    return B;
}

double *copy_vector(double *v, int n) {
    double *w;
    int i;
    w = (double*) malloc(n * sizeof(double));
    for (i=0;i<n;i++) 
	w[i] = v[i];
    return w;
}

double *solve_by_gaussian_elimination(double **A, double *d, int n) {
    double **A_;
    double *x, *d_;
    double l;
    int i,j,k;
    
    A_ = copy_matrix(A,n);
    d_ = copy_vector(d,n);
    for (k=0;k<n-1;k++) {
	for (i=k+1;i<n;i++) {
	    l = A_[i][k] / A_[k][k];
	    for (j=k;j<n;j++) {
		A_[i][j] -= l*A_[k][j];
	    }
	    d_[i] -= l*d_[k];
	}
    }
    x = (double*) malloc(n * sizeof(double));
    for (k=n-1;k>=0;k--) {
	l = 0;
	for (i=n-1;i>k;i--) {
	    l += x[i]*A_[k][i];
	}
	x[k] = (d_[k] - l) / A_[k][k];
    }
    free_matrix(A_,n);
    free(d_);
    return x;
}

double *serial_matrix_vector(double **A, double *x, int n) {
    int i, k;
    double *d;

    d = (double *) malloc(n * sizeof(double));
    for (i=0;i<n;i++) {
	d[i] = 0.0;
	for (k=0;k<n;k++) {
	    d[i] += A[i][k] * x[k];
	}
    }
    return d;
}


double *parallel_matrix_vector(double **A, double *x, int n, int p, int my_rank, int root) {
    int n_loc, n_loc_root;
    double *x_loc, *d_loc, *d;
    double **A_loc;
    int i, k, l;
    MPI_Status status;

    n_loc = n / p;
    n_loc_root = n_loc + (n % p);
    d = NULL;
    if (my_rank == root) {
	d = (double *) malloc(n * sizeof(double));
	x_loc = x;
	d_loc = d;
	A_loc = A;
	MPI_Bcast(x,n,MPI_DOUBLE,root,MPI_COMM_WORLD);
	k = n_loc_root;
	for (i=0;i<p;i++) {
	    if (i != root) {
		for (l=k;l<k+n_loc;l++) {
		    MPI_Send(A[l],n,MPI_DOUBLE,i,l,MPI_COMM_WORLD);
		}
		k += n_loc;
	    }
	}
	
	for (i=0;i<n_loc_root;i++) {
	    d_loc[i] = 0.0;
	    for (k=0;k<n;k++) {
		d_loc[i] += A_loc[i][k] * x_loc[k];
	    }
	}
	k = n_loc_root;
	for (i=0;i<p;i++) {
	    if (i != root) {
		MPI_Recv(&d[k],n_loc,MPI_DOUBLE,i,i,MPI_COMM_WORLD,&status);
		k += n_loc;
	    }
	    } 
    } else {
	d_loc = (double *) malloc(n_loc * sizeof(double));
	x_loc = (double *) malloc(n * sizeof(double));
	A_loc = (double **) malloc(n_loc * sizeof(double));
	for (i=0;i<n_loc;i++) {
	    A_loc[i] = (double *) malloc(n * sizeof(double));
	}
	MPI_Bcast(x_loc,n,MPI_DOUBLE,root,MPI_COMM_WORLD);
	k = n_loc_root;
	for (i=0;i<p;i++) {
	    if (i == my_rank) {
		for (l=0;l<n_loc;l++) {
		    MPI_Recv(A_loc[l],n,MPI_DOUBLE,root,l+k,MPI_COMM_WORLD,&status);
		}
	    }
	    if (i != root) {
		k += n_loc;
	    }
	}
	for (i=0;i<n_loc;i++) {
	    d_loc[i] = 0.0;
	    for (k=0;k<n;k++) {
		d_loc[i] += A_loc[i][k] * x_loc[k];
	    }
	}
	MPI_Send(d_loc,n_loc,MPI_DOUBLE,root,my_rank,MPI_COMM_WORLD); 
	for (i=0;i<n_loc;i++) {
	    free(A_loc[i]);
	}
	free(A_loc);
	free(d_loc);
	free(x_loc);
    }
    return d;
}

double serial_dot(double *a, double *b, int n) {
    int i;
    double sum;

    sum = 0.0;
    for (i=0;i<n;i++) 
	sum += a[i] * b[i];
    return sum;
}

double serial_euclidian_norm(double *a, int n) {
    return sqrt(serial_dot(a,a,n));
}

double parallel_dot(double *a, double *b, int n, int p, int my_rank, int root) {
    int n_loc, n_loc_root;
    double *a_loc, *b_loc;
    double sum, sum_loc;
    int i, k;
    MPI_Status status;

    sum = 0.0;
    n_loc = n / p;
    n_loc_root = n_loc + (n % p);
    if (my_rank == root) {
	a_loc = a;
	b_loc = b;
	k = n_loc_root;
	for (i=0;i<p;i++) {
	    if (i != root) {
		MPI_Send(&a[k],n_loc,MPI_DOUBLE,i,i,MPI_COMM_WORLD);
		MPI_Send(&b[k],n_loc,MPI_DOUBLE,i,i+p,MPI_COMM_WORLD);
		k += n_loc;
	    }
	}
	sum_loc = 0.0;
	for (i=0;i<n_loc_root;i++) {
	    sum_loc += a_loc[i] * b_loc[i];
	}
    } else {
	a_loc = (double *) malloc(n_loc * sizeof(double));
	b_loc = (double *) malloc(n_loc * sizeof(double));
	MPI_Recv(a_loc,n_loc,MPI_DOUBLE,root,my_rank,MPI_COMM_WORLD,&status);
	MPI_Recv(b_loc,n_loc,MPI_DOUBLE,root,my_rank+p,MPI_COMM_WORLD,&status);
	sum_loc = 0.0;
	for (i=0;i<n_loc;i++) {
	    sum_loc += a_loc[i] * b_loc[i];
	}
	free(a_loc);
	free(b_loc);
    }
    MPI_Reduce(&sum_loc,&sum,1,MPI_DOUBLE,MPI_SUM,root,MPI_COMM_WORLD);
    return sum;
}

double parallel_self_dot(double *a, int n, int p, int my_rank, int root) {
    int n_loc, n_loc_root;
    double *a_loc;
    double sum, sum_loc;
    int i, k;
    MPI_Status status;

    sum = 0.0;
    n_loc = n / p;
    n_loc_root = n_loc + (n % p);
    if (my_rank == root) {
	a_loc = a;
	k = n_loc_root;
	for (i=0;i<p;i++) {
	    if (i != root) {
		MPI_Send(&a[k],n_loc,MPI_DOUBLE,i,i,MPI_COMM_WORLD);
		k += n_loc;
	    }
	}
	sum_loc = 0.0;
	for (i=0;i<n_loc_root;i++) {
	    sum_loc += a_loc[i] * a_loc[i];
	}
    } else {
	a_loc = (double *) malloc(n_loc * sizeof(double));
	MPI_Recv(a_loc,n_loc,MPI_DOUBLE,root,my_rank,MPI_COMM_WORLD,&status);
	sum_loc = 0.0;
	for (i=0;i<n_loc;i++) {
	    sum_loc += a_loc[i] * a_loc[i];
	}
	free(a_loc);
    }
    MPI_Reduce(&sum_loc,&sum,1,MPI_DOUBLE,MPI_SUM,root,MPI_COMM_WORLD);
    return sum;
}

double parallel_euclidian_norm(double *a, int n, int p, int my_rank, int root) {
    return sqrt(parallel_self_dot(a,n,p,my_rank,root));
}

double *serial_scaled_vector_plus_vector(double s, double *a, double *b, int n) {
    int i;
    double *c;
    
    c = (double *) malloc(n * sizeof(double));
    for (i=0;i<n;i++) 
	c[i] = s * a[i] + b[i];
    return c;
}


double *parallel_scaled_vector_plus_vector(double s, double *a, double *b, int n, int p, int my_rank, int root) {
    int n_loc, n_loc_root;
    double *a_loc, *b_loc, *c_loc, *c;
    int i, k;
    MPI_Status status;

    n_loc = n / p;
    n_loc_root = n_loc + (n % p);
    c = NULL;
    MPI_Bcast(&s,1,MPI_DOUBLE,root,MPI_COMM_WORLD);
    if (my_rank == root) {
	a_loc = a;
	b_loc = b;
	k = n_loc_root;
	for (i=0;i<p;i++) {
	    if (i != root) {
		MPI_Send(&a[k],n_loc,MPI_DOUBLE,i,i,MPI_COMM_WORLD);
		MPI_Send(&b[k],n_loc,MPI_DOUBLE,i,i+p,MPI_COMM_WORLD);
		k += n_loc;
	    }
	}
	c = (double *) malloc(n * sizeof(double));
	c_loc = c;
	for (i=0;i<n_loc_root;i++) {
	    c_loc[i] = s * a_loc[i] + b_loc[i];
	}
	k = n_loc_root;
	for (i=0;i<p;i++) {
	    if (i != root) {
		MPI_Recv(&c[k],n_loc,MPI_DOUBLE,i,i,MPI_COMM_WORLD,&status);
		k += n_loc;
	    }
	} 
    } else {
	a_loc = (double *) malloc(n_loc * sizeof(double));
	b_loc = (double *) malloc(n_loc * sizeof(double));
	MPI_Recv(a_loc,n_loc,MPI_DOUBLE,root,my_rank,MPI_COMM_WORLD,&status);
	MPI_Recv(b_loc,n_loc,MPI_DOUBLE,root,my_rank+p,MPI_COMM_WORLD,&status);
	c_loc = (double *) malloc(n_loc * sizeof(double));
	for (i=0;i<n_loc;i++) {
	    c_loc[i] = s * a_loc[i] + b_loc[i];
	}
	MPI_Send(c_loc,n_loc,MPI_DOUBLE,root,my_rank,MPI_COMM_WORLD);
	free(a_loc);
	free(b_loc);
	free(c_loc);
    }
    return c;
}



int main(int argc, char *argv[]) {
    int proc, my_rank, i; 
    double **A;
    double *d, *x_p, *x_s, *p, *r, *temp, *A_p, *diff;
    double norm, alpha, beta, r_dot_r, p_dot_A_p, r_dot_r_old;
    
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &proc);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    r = NULL;
    d = NULL;
    x_p = NULL;
    p = NULL;
    temp = NULL;

    if (my_rank == 0) {
	A = fill_random_matrix(make_matrix(N),N);
	d = fill_random_vector((double *) malloc(N * sizeof(double)),N);
	printf("Working on a %dx%d matrix A and a %d component vector d\n",N,N,N);
	if (N < 12) {
	    print_matrix(A,N);
	    print_vector(d,N);
	} 
	x_s = solve_by_gaussian_elimination(A,d,N);
	x_p = (double *) malloc(N * sizeof(double));
	for (i=0;i<N;i++) 
	    x_p[i] = 0.0;
    }
    
    temp = parallel_matrix_vector(A,x_p,N,proc,my_rank,0);
    p = parallel_scaled_vector_plus_vector(-1.0,temp,d,N,proc,my_rank,0);
    if (my_rank == 0) {
	free(temp);
	r = copy_vector(p,N);
    }
    r_dot_r = parallel_self_dot(r,N,proc,my_rank,0);

    i = 1;
    while (i < MAX_ITER) {
	if (my_rank == 0) printf("Iteration %d: ",i);
	i++;

	A_p = parallel_matrix_vector(A,p,N,proc,my_rank,0);
	p_dot_A_p = parallel_dot(p,A_p,N,proc,my_rank,0);
	if (my_rank == 0) alpha = -1.0 * (r_dot_r / p_dot_A_p);
	temp = parallel_scaled_vector_plus_vector(-1.0 * alpha, p, x_p, N, proc, my_rank, 0);
	if (my_rank == 0) {
	    free(x_p);
	    x_p = temp;
	}
	temp = parallel_scaled_vector_plus_vector(alpha, A_p, r, N, proc, my_rank, 0);
	if (my_rank == 0) {
	    free(r);
	    free(A_p);
	    r = temp;
	}
	if (my_rank == 0) r_dot_r_old = r_dot_r;
	r_dot_r = parallel_self_dot(r,N,proc,my_rank,0);
	MPI_Bcast(&r_dot_r,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	if (my_rank == 0) printf("|| r ||^2 = r_dot_r = %e\n",r_dot_r);
	if (r_dot_r <= EPSILON) break;
	if (my_rank == 0) beta = r_dot_r / r_dot_r_old;
	temp = parallel_scaled_vector_plus_vector(beta,p,r,N,proc,my_rank,0);
	if (my_rank == 0) {
	    free(p);
	    p = temp;
	}
    }
    if (my_rank == 0) {
	free(r);
	free(p);
    }
    if (my_rank == 0) {
	if (N < 12) {
	    print_vector(x_s,N);
	    print_vector(x_p,N);
	}
	diff = serial_scaled_vector_plus_vector(-1.0,x_s,x_p,N);
	norm = serial_euclidian_norm(diff,N);
	printf("|| x_s - x_p || = %e\n",norm);
	free(diff);
	free(x_s);
	free(x_p);
	free(d);
	free_matrix(A,N);
    }
    
    fflush(stdout);
    fflush(stderr);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}


