/* * Copyright (c) 2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2006 The Technical University of Chemnitz. All * rights reserved. * * Authors: * Peter Gottschling * Torsten Hoefler * */ #include #include #include #include #include "nbc.h" /* ugly global variables */ double gt1, gt2, gt3, gt4; struct array_3d { double *array; int x_dim, y_dim, z_dim, z_inc; /* must be x_dim * y_dim */ /* correspondingly y_inc == x_dim and x_inc == 1 */ }; /* plane within a 3D array */ struct plane { double *first; /* pointer to first element in 3D array */ int major_dim, minor_dim, /* dimensions of plane */ major_inc, minor_inc, /* increments in both directions */ par_inc; /* increment parallel to the plane */ }; /* Information about Cartesian grid */ struct grid_info { /* neighbors */ int left, right, top, bottom, front, back; }; /* Set of buffers to send to or to receive from neighbors */ struct buffer_set { double *start; int left, right, top, bottom, front, back; int d_left, d_right, d_top, d_bottom, d_front, d_back; }; /* All communication related data */ struct comm_data_t { MPI_Comm processor_grid; struct grid_info info; struct buffer_set send_buffers, recv_buffers; int non_blocking; NBC_Handle *handle; int rank; }; /* ========= Functions ========= */ inline int vector_size(struct array_3d *a) { return a->x_dim * a->y_dim * a->z_dim; } inline int x_plane_size(struct array_3d *a) { return a->y_dim * a->z_dim; } inline int y_plane_size(struct array_3d *a) { return a->x_dim * a->z_dim; } inline int z_plane_size(struct array_3d *a) { return a->x_dim * a->y_dim; } /* block size of th processor out of num_proc with overall block size n in [0, num_proc-1] */ inline int nth_block_size(int n, int num_proc, int total) { return total / num_proc + (total % num_proc > n); } void alloc_buffer_set(struct buffer_set *bs, struct grid_info *gi, struct array_3d *a) { /* memory must be contiguous in buffers (would be easier otherwise) */ int total; /* all sizes and displacements initialized to 0 */ bs->left= bs->right= bs->front= bs->back= bs->top= bs->bottom= 0; bs->d_left= bs->d_right= bs->d_front= bs->d_back= bs->d_top= bs->d_bottom= 0; total= 0; if (gi->left != MPI_PROC_NULL) { bs->left= x_plane_size(a); bs->d_left= total; total+= x_plane_size(a); } if (gi->right != MPI_PROC_NULL) { bs->right= x_plane_size(a); bs->d_right= total; total+= x_plane_size(a); } if (gi->front != MPI_PROC_NULL) { bs->front= y_plane_size(a); bs->d_front= total; total+= y_plane_size(a); } if (gi->back != MPI_PROC_NULL) { bs->back= y_plane_size(a); bs->d_back= total; total+= y_plane_size(a); } if (gi->top != MPI_PROC_NULL) { bs->top= z_plane_size(a); bs->d_top= total; total+= z_plane_size(a); } if (gi->bottom != MPI_PROC_NULL) { bs->bottom= z_plane_size(a); bs->d_bottom= total; total+= z_plane_size(a); } bs->start= (double*) malloc(total * sizeof(double)); /* int my_rank; */ /* MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); */ /* printf("rank %i: left nb = %i\n", my_rank, bs->left); */ /* printf("rank %i: right nb = %i\n", my_rank, bs->right); */ } inline double av(struct array_3d *a, int x, int y, int z) { return a->array[z * a->z_inc + y * a->x_dim + x]; } /* same as pointer */ inline double* ap(struct array_3d *a, int x, int y, int z) { return &a->array[z * a->z_inc + y * a->x_dim + x]; } /* Returns the th plane in x direction */ void x_plane(struct array_3d *a, int x, struct plane *p) { assert(0 <= x && x < a->x_dim); p->first= a->array + x; p->major_dim= a->z_dim; p->minor_dim= a->y_dim; p->major_inc= a->z_inc; p->minor_inc= a->x_dim; p->par_inc= 1; } void left_plane(struct array_3d *a, struct plane *p) { x_plane(a, 0, p); } void right_plane(struct array_3d *a, struct plane *p) { x_plane(a, a->x_dim - 1, p); } /* Returns the th plane in y direction */ void y_plane(struct array_3d *a, int y, struct plane *p) { assert(0 <= y && y < a->y_dim); p->first= a->array + y * a->x_dim; p->major_dim= a->z_dim; p->minor_dim= a->x_dim; p->major_inc= a->z_inc; p->minor_inc= 1; p->par_inc= a->x_dim; } void front_plane(struct array_3d *a, struct plane *p) { y_plane(a, 0, p); } void back_plane(struct array_3d *a, struct plane *p) { y_plane(a, a->y_dim - 1, p); } /* Returns the th plane in z direction */ void z_plane(struct array_3d *a, int z, struct plane *p) { assert(0 <= z && z < a->z_dim); p->first= a->array + z * a->z_inc; p->major_dim= a->y_dim; p->minor_dim= a->x_dim; p->major_inc= a->x_dim; p->minor_inc= 1; p->par_inc= a->z_inc; } void bottom_plane(struct array_3d *a, struct plane *p) { z_plane(a, 0, p); } void top_plane(struct array_3d *a, struct plane *p) { z_plane(a, a->z_dim - 1, p); } void print_array_3d(struct array_3d *a) { int i, j, k; /* z planes */ for (i= 0; i < a->z_dim; i++) { for (j= 0; j < a->y_dim; j++) { for (k= 0; k < a->x_dim; k++) printf("%9f ", av(a, k, j, i)); printf("\n"); } printf("\n"); } } void parallel_print_array_3d(struct array_3d *a, struct comm_data_t *comm_data) { int num_proc, id, i, buf; MPI_Status status; MPI_Comm_rank(comm_data->processor_grid, &id); MPI_Comm_size(comm_data->processor_grid, &num_proc); for (i= 0; i < num_proc; i++) if (id == i) { if (id > 0) MPI_Recv(&buf, 1, MPI_INT, id - 1, 998, comm_data->processor_grid, &status); printf ("\nOn processor %i:\n", id); print_array_3d(a); if (id < num_proc-1) MPI_Send(&buf, 1, MPI_INT, id + 1, 998, comm_data->processor_grid); } } void init_array_3d(struct array_3d *a, int xd, int yd, int zd) { a->x_dim= xd; a->y_dim= yd; a->z_dim= zd; a->z_inc= xd * yd; a->array= (double*)malloc(xd * yd * zd * sizeof(double)); } /* Set array to values 1, 2, ... dimensions must be defined, array allocated */ void iota_array_3d(struct array_3d *a) { int array_size, n; array_size= a->x_dim * a->y_dim * a->z_dim; for (n= 0; n < array_size; n++) a->array[n]= (double) n; } /* Set array to dimensions must be defined, array allocated */ void set_array_3d(struct array_3d *a, double value) { int array_size, n; array_size= a->x_dim * a->y_dim * a->z_dim; for (n= 0; n < array_size; n++) a->array[n]= value; } void set_plane(struct plane *p, double value) { int i, j, major_index, minor_index; for (major_index= i= 0; i < p->major_dim; i++, major_index+= p->major_inc) for (minor_index= major_index, j= 0; j < p->minor_dim; j++, minor_index+= p->minor_inc) p->first[minor_index]= value; } inline check_same_dimensions(struct array_3d *v1, struct array_3d *v2) { assert(v1->x_dim == v2->x_dim && v1->y_dim == v2->y_dim && v1->z_dim == v2->z_dim); } /* Same semantic as volume_mult_simple but the vast majority of grid points has no branches */ void volume_mult(struct array_3d *v_in, struct array_3d *v_out, struct comm_data_t *comm_data) { int i, j, k, test_modulo; double tmp; test_modulo= v_in->z_dim / 100; if (test_modulo == 0) test_modulo= 1; check_same_dimensions(v_in, v_out); /* First plane if exist */ if (v_in->z_dim > 0) { i= 0; for (j= 0; j < v_in->y_dim; j++) for (k= 0; k < v_in->x_dim; k++) { tmp= 6.0 * av(v_in, k, j, i); if (v_in->z_dim > 1) tmp-= av(v_in, k, j, i+1); if (j > 0) tmp-= av(v_in, k, j-1, i); if (j < v_in->y_dim-1) tmp-= av(v_in, k, j+1, i); if (k > 0) tmp-= av(v_in, k-1, j, i); if (k < v_in->x_dim-1) tmp-= av(v_in, k+1, j, i); *ap(v_out, k, j, i)= tmp; } } /* Second to second-last plane */ for (i= 1; i < v_in->z_dim-1; i++) { if ((comm_data->non_blocking) && (i % test_modulo == 0)) NBC_Test(comm_data->handle); /* First line */ if (v_in->y_dim > 0) { j= 0; for (k= 0; k < v_in->x_dim; k++) { tmp= 6.0 * av(v_in, k, j, i) - av(v_in, k, j, i-1) - av(v_in, k, j, i+1); if (j < v_in->y_dim-1) tmp-= av(v_in, k, j+1, i); if (k > 0) tmp-= av(v_in, k-1, j, i); if (k < v_in->x_dim-1) tmp-= av(v_in, k+1, j, i); *ap(v_out, k, j, i)= tmp; } } /* Second to second-last line */ for (j= 1; j < v_in->y_dim-1; j++) { /* First point if exist */ if (v_in->x_dim > 0) { *ap(v_out, 0, j, i)= 6.0 * av(v_in, 0, j, i) - av(v_in, 0, j, i-1) - av(v_in, 0, j, i+1) - av(v_in, 0, j-1, i) - av(v_in, 0, j+1, i); if (0 < v_in->x_dim-1) *ap(v_out, 0, j, i)-= av(v_in, 1, j, i); } /* Second to second-last point */ for (k= 1; k < v_in->x_dim-1; k++) { *ap(v_out, k, j, i)= 6.0 * av(v_in, k, j, i) - av(v_in, k, j, i-1) - av(v_in, k, j, i+1) - av(v_in, k, j-1, i) - av(v_in, k, j+1, i) - av(v_in, k-1, j, i) - av(v_in, k+1, j, i); } /* Last point if exist, implies k-1 is valid */ if (v_in->x_dim > 1) { k= v_in->x_dim - 1; *ap(v_out, k, j, i)= 6.0 * av(v_in, k, j, i) - av(v_in, k, j, i-1) - av(v_in, k, j, i+1) - av(v_in, k, j-1, i) - av(v_in, k, j+1, i) - av(v_in, k-1, j, i); } } /* Last line if exist, implies j-1 is valid */ if (v_in->y_dim > 1) { j= v_in->y_dim - 1; for (k= 0; k < v_in->x_dim; k++) { tmp= 6.0 * av(v_in, k, j, i) - av(v_in, k, j, i-1) - av(v_in, k, j, i+1) - av(v_in, k, j-1, i); if (k > 0) tmp-= av(v_in, k-1, j, i); if (k < v_in->x_dim-1) tmp-= av(v_in, k+1, j, i); *ap(v_out, k, j, i)= tmp; } } } /* Last plane if there is more then one, implies that i-1 is valid */ if (v_in->z_dim > 1) { i= v_in->z_dim - 1; for (j= 0; j < v_in->y_dim; j++) for (k= 0; k < v_in->x_dim; k++) { tmp= 6.0 * av(v_in, k, j, i) - av(v_in, k, j, i-1); if (j > 0) tmp-= av(v_in, k, j-1, i); if (j < v_in->y_dim-1) tmp-= av(v_in, k, j+1, i); if (k > 0) tmp-= av(v_in, k-1, j, i); if (k < v_in->x_dim-1) tmp-= av(v_in, k+1, j, i); *ap(v_out, k, j, i)= tmp; } } } void volume_mult_simple(struct array_3d *v_in, struct array_3d *v_out) { int i, j, k; double tmp; check_same_dimensions(v_in, v_out); for (i= 0; i < v_in->z_dim; i++) for (j= 0; j < v_in->y_dim; j++) for (k= 0; k < v_in->x_dim; k++) { tmp= 6.0 * av(v_in, k, j, i); if (i > 0) tmp-= av(v_in, k, j, i-1); if (i < v_in->z_dim-1) tmp-= av(v_in, k, j, i+1); if (j > 0) tmp-= av(v_in, k, j-1, i); if (j < v_in->y_dim-1) tmp-= av(v_in, k, j+1, i); if (k > 0) tmp-= av(v_in, k-1, j, i); if (k < v_in->x_dim-1) tmp-= av(v_in, k+1, j, i); *ap(v_out, k, j, i)= tmp; } } void plane_buffer_mult(struct plane *p, double* buffer) { int i, j, major_index, minor_index; for (major_index= i= 0; i < p->major_dim; i++, major_index+= p->major_inc) for (minor_index= major_index, j= 0; j < p->minor_dim; j++, minor_index+= p->minor_inc) p->first[minor_index]-= *buffer++; } void plane_buffer_copy(struct plane *p, double* buffer) { int i, j, major_index, minor_index; for (major_index= i= 0; i < p->major_dim; i++, major_index+= p->major_inc) for (minor_index= major_index, j= 0; j < p->minor_dim; j++, minor_index+= p->minor_inc) *buffer++= p->first[minor_index]; } inline copy_dims(int d_in[3], int d_out[3]) { d_out[0]= d_in[0]; d_out[1]= d_in[1]; d_out[2]= d_in[2]; } inline int nb_rank(MPI_Comm grid, int rank, int dim, int dir, int dims[3], int my_coords[3]) { int nb_coords[3], my_nb; /* check for outer boundaries: first lower end then upper */ if (dir == -1 && my_coords[dim] == 0 || dir == 1 && my_coords[dim] == dims[dim]-1) return MPI_PROC_NULL; copy_dims(my_coords, nb_coords); nb_coords[dim]+= dir; MPI_Cart_rank(grid, nb_coords, &my_nb); return my_nb; } void init_processor_grid(MPI_Comm *processor_grid, struct grid_info *gi) { int num_proc, my_rank, dims[]= {0, 0, 0}, periods[]= {0, 0, 0}, my_coords[3], nb_coords[3]; MPI_Comm_size(MPI_COMM_WORLD, &num_proc); MPI_Dims_create(num_proc, 3, dims); MPI_Cart_create(MPI_COMM_WORLD, 3, dims, periods, 0, processor_grid); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); /* if (!my_rank) */ /* printf("dims = {%i, %i, %i}\n", dims[0], dims[1], dims[2]); */ MPI_Cart_coords(*processor_grid, my_rank, 3, my_coords); /* printf("rank %i: my_coords = {%i, %i, %i}\n", my_rank, my_coords[0], my_coords[1], my_coords[2]); */ gi->left= nb_rank(*processor_grid, my_rank, 0, -1, dims, my_coords); gi->right= nb_rank(*processor_grid, my_rank, 0, 1, dims, my_coords); gi->front= nb_rank(*processor_grid, my_rank, 1, -1, dims, my_coords); gi->back= nb_rank(*processor_grid, my_rank, 1, 1, dims, my_coords); gi->bottom= nb_rank(*processor_grid, my_rank, 2, -1, dims, my_coords); gi->top= nb_rank(*processor_grid, my_rank, 2, 1, dims, my_coords); /* printf("rank %i: left nb = %i\n", my_rank, gi->left); */ /* printf("rank %i: right nb = %i\n", my_rank, gi->right); */ } /* fill send buffer with boundary values of v */ void fill_buffers(struct array_3d *v, struct buffer_set *send_buffers) { struct plane v_left_plane, v_right_plane, v_front_plane, v_back_plane, v_bottom_plane, v_top_plane; left_plane(v, &v_left_plane); right_plane(v, &v_right_plane); front_plane(v, &v_front_plane); back_plane(v, &v_back_plane); top_plane(v, &v_top_plane); bottom_plane(v, &v_bottom_plane); /* single buffer sizes are set to 0 if there is no neighbor, we use this as test */ if (send_buffers->left) plane_buffer_copy(&v_left_plane, send_buffers->start + send_buffers->d_left); if (send_buffers->right) plane_buffer_copy(&v_right_plane, send_buffers->start + send_buffers->d_right); if (send_buffers->front) plane_buffer_copy(&v_front_plane, send_buffers->start + send_buffers->d_front); if (send_buffers->back) plane_buffer_copy(&v_back_plane, send_buffers->start + send_buffers->d_back); if (send_buffers->top) plane_buffer_copy(&v_top_plane, send_buffers->start + send_buffers->d_top); if (send_buffers->bottom) plane_buffer_copy(&v_bottom_plane, send_buffers->start + send_buffers->d_bottom); } void start_send_boundaries(struct comm_data_t *comm_data) { /* send and receive displacements and sizes are equal b/c we always send whole planes */ int *sendcounts, *sdispls, *recvcounts, *rdispls, num_proc, i; struct buffer_set send_buffers, recv_buffers; struct grid_info gi; MPI_Comm processor_grid; send_buffers= comm_data->send_buffers; recv_buffers= comm_data->recv_buffers; gi= comm_data->info; processor_grid= comm_data->processor_grid; MPI_Comm_size(processor_grid, &num_proc); sendcounts= (int*) malloc(num_proc * sizeof(int)); sdispls= (int*) malloc(num_proc * sizeof(int)); recvcounts= (int*) malloc(num_proc * sizeof(int)); rdispls= (int*) malloc(num_proc * sizeof(int)); for (i= 0; i < num_proc; i++) { sendcounts[i]= 0; sdispls[i]= 0; recvcounts[i]= 0; rdispls[0]= 0; } /* replace for the six neighbors */ if (send_buffers.left) { sendcounts[gi.left]= recvcounts[gi.left]= send_buffers.left; sdispls[gi.left]= rdispls[gi.left]= send_buffers.d_left; } if (send_buffers.right) { sendcounts[gi.right]= recvcounts[gi.right]= send_buffers.right; sdispls[gi.right]= rdispls[gi.right]= send_buffers.d_right; } if (send_buffers.front) { sendcounts[gi.front]= recvcounts[gi.front]= send_buffers.front; sdispls[gi.front]= rdispls[gi.front]= send_buffers.d_front; } if (send_buffers.back) { sendcounts[gi.back]= recvcounts[gi.back]= send_buffers.back; sdispls[gi.back]= rdispls[gi.back]= send_buffers.d_back; } if (send_buffers.top) { sendcounts[gi.top]= recvcounts[gi.top]= send_buffers.top; sdispls[gi.top]= rdispls[gi.top]= send_buffers.d_top; } if (send_buffers.bottom) { sendcounts[gi.bottom]= recvcounts[gi.bottom]= send_buffers.bottom; sdispls[gi.bottom]= rdispls[gi.bottom]= send_buffers.d_bottom; } gt1 = MPI_Wtime(); if (comm_data->non_blocking) NBC_Ialltoallv(send_buffers.start, sendcounts, sdispls, MPI_DOUBLE, recv_buffers.start, recvcounts, rdispls, MPI_DOUBLE, processor_grid, comm_data->handle); else { MPI_Alltoallv(send_buffers.start, sendcounts, sdispls, MPI_DOUBLE, recv_buffers.start, recvcounts, rdispls, MPI_DOUBLE, processor_grid); /*NBC_Ialltoallv(send_buffers.start, sendcounts, sdispls, MPI_DOUBLE, recv_buffers.start, recvcounts, rdispls, MPI_DOUBLE, processor_grid, comm_data->handle); NBC_Wait(comm_data->handle);*/ } gt4 = MPI_Wtime(); free(sendcounts); free(sdispls); free(recvcounts); free(rdispls); } void finish_send_boundaries(struct comm_data_t *comm_data) { if (comm_data->non_blocking) NBC_Wait(comm_data->handle); gt2 = MPI_Wtime(); } /* Multiply value of send_buffer with -1 and add this to corresponding values in outer planes */ void mult_boundaries(struct array_3d *v, struct buffer_set *recv_buffers) { struct plane v_left_plane, v_right_plane, v_front_plane, v_back_plane, v_bottom_plane, v_top_plane; left_plane(v, &v_left_plane); right_plane(v, &v_right_plane); front_plane(v, &v_front_plane); back_plane(v, &v_back_plane); top_plane(v, &v_top_plane); bottom_plane(v, &v_bottom_plane); /* single buffer sizes are set to 0 if there is no neighbor, we use this as test */ if (recv_buffers->left) plane_buffer_mult(&v_left_plane, recv_buffers->start + recv_buffers->d_left); if (recv_buffers->right) plane_buffer_mult(&v_right_plane, recv_buffers->start + recv_buffers->d_right); if (recv_buffers->front) plane_buffer_mult(&v_front_plane, recv_buffers->start + recv_buffers->d_front); if (recv_buffers->back) plane_buffer_mult(&v_back_plane, recv_buffers->start + recv_buffers->d_back); if (recv_buffers->top) plane_buffer_mult(&v_top_plane, recv_buffers->start + recv_buffers->d_top); if (recv_buffers->bottom) plane_buffer_mult(&v_bottom_plane, recv_buffers->start + recv_buffers->d_bottom); } void matrix_vector_mult(struct array_3d *v_in, struct array_3d *v_out, struct comm_data_t *comm_data) { fill_buffers(v_in, &comm_data->send_buffers); start_send_boundaries(comm_data); volume_mult(v_in, v_out, comm_data); /* printf("--vol in mvm\n"); */ /* print_array_3d(v_out); */ finish_send_boundaries(comm_data); mult_boundaries(v_out, &comm_data->recv_buffers); } /* Allocates send and receive buffers, needs 3D array for size of buffers */ void allocate_buffers(struct comm_data_t *comm_data, struct array_3d *x) { alloc_buffer_set(&comm_data->send_buffers, &comm_data->info, x); alloc_buffer_set(&comm_data->recv_buffers, &comm_data->info, x); comm_data->handle = malloc(sizeof(NBC_Handle)); } void allocate_vectors(struct array_3d *x, struct array_3d *b, int argc, char** argv, struct comm_data_t *comm_data) { int total_x, total_y, total_z, local_x, local_y, local_z, dims[3], periods[3], my_coords[3]; if (argc < 5) { printf("Usage: cg_poisson_3d \n"); printf(" Dimensions of the whole domain\n"); printf(" CG stops when r_k/r_0 < rel_error\n"); exit(1); } total_x= atoi(argv[1]); total_y= atoi(argv[2]); total_z= atoi(argv[3]); MPI_Cart_get(comm_data->processor_grid, 3, dims, periods, my_coords); local_x= nth_block_size(my_coords[0], dims[0], total_x); local_y= nth_block_size(my_coords[1], dims[1], total_y); local_z= nth_block_size(my_coords[2], dims[2], total_z); init_array_3d(x, local_x, local_y, local_z); init_array_3d(b, local_x, local_y, local_z); } /* Sets b such that x=1 is the solution, x initialized to 0 x= 1; b= Ax; x= 0; */ void init_vectors(struct array_3d *x, struct array_3d *b, struct comm_data_t *comm_data) { set_array_3d(x, 1.0); matrix_vector_mult(x, b, comm_data); set_array_3d(x, 0.0); } /* v+= w */ void vector_assign_add(struct array_3d *v, struct array_3d *w) { int size, i; size= vector_size(v); for (i= 0; i < size; i++) v->array[i] += w->array[i]; } double parallel_dot(struct array_3d *v, struct array_3d *w, struct comm_data_t *comm_data) { int size, size_b, i; double local_dot_0, local_dot_1, local_dot_2, local_dot_3, global_dot; local_dot_0= local_dot_1= local_dot_2= local_dot_3= 0.0; size= vector_size(v); size_b= (size / 4) * 4; for (i= 0; i < size_b; i+= 4) { local_dot_0+= v->array[i] * w->array[i]; local_dot_1+= v->array[i+1] * w->array[i+1]; local_dot_2+= v->array[i+2] * w->array[i+2]; local_dot_3+= v->array[i+3] * w->array[i+3]; } for (i= size_b; i < size; i++) local_dot_0+= v->array[i] * w->array[i]; local_dot_0+= local_dot_1 + local_dot_2 + local_dot_3; MPI_Allreduce(&local_dot_0, &global_dot, 1, MPI_DOUBLE, MPI_SUM, comm_data->processor_grid); return global_dot; } int conjugate_gradient(struct array_3d *b, struct array_3d *x, double rel_error, struct comm_data_t *comm_data) { int size, i, iter; struct array_3d v, q, r; double res_0, gamma, gamma_old, beta, delta, alpha; double t0,t1,t2,t3; init_array_3d(&v, x->x_dim, x->y_dim, x->z_dim); init_array_3d(&q, x->x_dim, x->y_dim, x->z_dim); init_array_3d(&r, x->x_dim, x->y_dim, x->z_dim); size= x->x_dim * x->y_dim * x->z_dim; /* since x == 0, q and r are b */ for (i= 0; i < size; i++) q.array[i]= r.array[i]= b->array[i]; res_0= gamma= parallel_dot(&r, &r, comm_data); rel_error= rel_error * rel_error; /* b/c we compare dot products not norms */ for(iter= 0; fabs(gamma/res_0) >= rel_error && iter < 10000; iter++) { if (iter > 0) { beta= gamma / gamma_old; for (i= 0; i < size; i++) q.array[i]= r.array[i] + beta * q.array[i]; } t1= MPI_Wtime(); matrix_vector_mult(&q, &v, comm_data); t2= MPI_Wtime(); delta= parallel_dot(&q, &v, comm_data); alpha= gamma / delta; for (i= 0; i < size; i++) { x->array[i]+= alpha * q.array[i]; r.array[i]-= alpha * v.array[i]; } gamma_old= gamma; gamma= parallel_dot(&r, &r, comm_data); t3= MPI_Wtime(); if((iter == 2) && (comm_data->rank == 0)) { printf("matrix_vector_mult: %.4lf, rest: %.4lf\n", t2-t1, t3-t2); printf("gt2-gt1: %lf\n", gt2-gt1); printf("gt4-gt1: %lf\n", gt4-gt1); } } return iter; } int main(int argc, char** argv) { struct array_3d x, b; struct comm_data_t comm_data; double rel_error; double t1, t2, t3; int rank, iter_block, iter_non_block; MPI_Init(&argc, &argv); comm_data.non_blocking= 0; init_processor_grid(&comm_data.processor_grid, &comm_data.info); allocate_vectors(&x, &b, argc, argv, &comm_data); allocate_buffers(&comm_data, &x); init_vectors(&x, &b, &comm_data); MPI_Comm_rank(comm_data.processor_grid, &comm_data.rank); /* matrix_vector_mult(b, x, comm_data); */ /* print_array_3d(x); */ rel_error= atof(argv[4]); t1= MPI_Wtime(); iter_block= conjugate_gradient(&b, &x, rel_error, &comm_data); t2= MPI_Wtime(); if (comm_data.rank == 0) printf("blocking run done ... \n"); comm_data.non_blocking= 1; iter_non_block= conjugate_gradient(&b, &x, rel_error, &comm_data); t3= MPI_Wtime(); if (comm_data.rank == 0) printf("non blocking run done ... \n"); if (comm_data.rank == 0) { printf("Blocking operation %.2lf, non-blocking operation %.2lf (%.2lf)\n", t2-t1, t3-t2, ((t2-t1)-(t3-t2))/(t2-t1)*100); printf("Time resolution: %lf\n", MPI_Wtick()); printf("Iterations blocking: %i, non-blocking: %i\n", iter_block, iter_non_block); } /* parallel_print_array_3d(x, comm_data); */ MPI_Finalize(); return 0; }