// cudafun.cu -- 
// CUDA memory allocation & shallow water kernel routines
// To compile (Cuda 3.2):
// 	nvcc -c --gpu_architecture sm_13 -I${CUDA_INSTALL_PATH}/include 
//       -Xcompiler -fpic
//
// ! Incomplete kernel call !
//
// Matthias Griessinger, University of Erlangen, 2011.


#include <stdio.h>
//#include <cuda_runtime_api.h>
#ifndef _CUDA_MACROS_H_
#define _CUDA_MACROS_H_

#define safecall(call) do{\
  cudaError_t err = call ;\
  if( cudaSuccess != err ){\
    fprintf(stdout, "cuda error at %s:%d, %s\n",\
      __FILE__, __LINE__, cudaGetErrorString(err));\
    fflush(stdout);\
  }\
  } while(0)

#define BLOCKDIM 96

#ifdef _SHARED_WRITEBACK_3_
#define SHARED_MEM_MULT 3
#else
#define SHARED_MEM_MULT 1
#endif

#endif


/* *********** DEVICE SELECTION ************************* */

extern "C" void getDeviceInfo( int rank, int size, const char* hostname) {
  /* Print device information */
  int deviceCount, device;
  cudaDeviceProp deviceProp;

  cudaGetDeviceCount(&deviceCount);

  if ( 0 == rank ) {
    printf("## rank %i/%i on %s --\t Device Test: No. Cards: %d\n",
      rank, size-1, hostname, deviceCount);
    for( device = 0; device < deviceCount; ++device) {
      cudaGetDeviceProperties(&deviceProp, device);
      printf("## rank %i/%i on %s --\t Device %d: %s\n",
        rank, size-1, hostname, device, deviceProp.name);
    }
  }
}


extern "C" int selectDevice( int rank, int size, const char* hostname ) {
  /* Select GPU device (for multiple cards);
     call before any GPU memory/kernel calls,
     otherwise no effect (default card selected)
  */
  int deviceCount, takedevice, device;
  cudaDeviceProp deviceProp;

  cudaGetDeviceCount(&deviceCount);

  takedevice = rank%deviceCount;
  cudaSetDevice(takedevice);
  cudaGetDevice(&device);
  cudaGetDeviceProperties(&deviceProp, device);

  printf("rank %i/%i on %s --\t Selecting Device %d: %s\n",
    rank, size, hostname, device, deviceProp.name);

  return device;
}


/* *********** KERNEL LAUNCH PARAMETERS ***************** */

typedef struct {
  int gridDim;
  int blockDim;
} KERNEL_LAUNCHER;

KERNEL_LAUNCHER _launcher_;

extern "C" void setKernelDims( const int gridDim, const int blockDim ) {
  _launcher_.gridDim  = gridDim;
  _launcher_.blockDim = blockDim;
}

extern "C" void printKernelDims() {
	printf(" kernel dims: %i x %i\n", 
		_launcher_.gridDim, _launcher_.blockDim);
   fflush(stdout);
}


/* *********** CUDA MEMORY **************************** */

extern "C" void* allocDeviceMemory( size_t bytesize ) {
    char* mem = NULL;
    safecall(cudaMalloc( (void**)&mem, bytesize ));
	 fprintf(stdout,"allocDevice: allocating %lu bytes at %p\n", bytesize, mem);fflush(stdout);

    return (void*)mem;
}

extern "C" void* allocHostMemory( size_t bytesize ) {
    /* returns aligned CPU memory for faster transfer */
    char* mem = NULL;
    safecall(cudaHostAlloc( (void**)&mem, bytesize, 0 ));

    return (void*)mem;
}


extern "C" void copyDeviceToHost( void* hostmem, void* devicemem, size_t bytesize ) {
  /* copy bytesize bytes from GPU to host */
  safecall(cudaMemcpy( hostmem, devicemem, bytesize, cudaMemcpyDeviceToHost ));
}

extern "C" void copyHostToDevice( void* devmem, void* hostmem, size_t bytesize ) {
  /* copy bytesize bytes from host to GPU */
  safecall(cudaMemcpy( devmem, hostmem, bytesize, cudaMemcpyHostToDevice ));
}


extern "C" void freeDeviceMemory( void* mem ) {
	 fprintf(stdout,"freeDevice: freeing at %p\n", mem);fflush(stdout);
    safecall(cudaFree( mem ));
}

extern "C" void freeHostMemory( void* mem ) {
    safecall(cudaFreeHost( mem ));
}


extern "C" void dummy( ) {
    fprintf(stdout, "dummy 1 at %s:%d\n",__FILE__, __LINE__);
    fflush(stdout);
	double* mem;
	//double* mem = (double*)allocDeviceMemory( 128*sizeof(double) );
	cudaMalloc( (void**)&mem, 128*sizeof(double));

    fprintf(stdout, "dummy 2 at %s:%d\n",__FILE__, __LINE__);
    fflush(stdout);
	cudaFree(mem);
    fprintf(stdout, "dummy 3 at %s:%d\n",__FILE__, __LINE__);
    fflush(stdout);
}

/* *********** GPU KERNELS ************************** */


typedef struct {
	double edgeflux_s;
	double edgeflux_x;
	double edgeflux_y;
	double max_speed;
} Fluxes;

typedef struct {
	double u;
	double uh;
	double h;
} Velocity;

__global__ void __set_to_default__(double* edge, 
												size_t N, double def) {
	/* set input array edge of length N to value def */

	size_t k;
	for( k = blockIdx.x * blockDim.x + threadIdx.x; k < N; k += gridDim.x * blockDim.x ) {
		edge[k] = def;
 	}
}

__global__ void __set_arrays_to_default__(double* edge, 
												double* xmom, 
												double* ymom, 
												size_t N, double def) {
	/* set input arrays edge, xmom, ymom of length N to value def */

	size_t k;
	for( k = blockIdx.x * blockDim.x + threadIdx.x; k < N; k += gridDim.x * blockDim.x ) {
		edge[k] = def;
		xmom[k] = def;
		ymom[k] = def;
 	}
}


__global__ void __compute_time_step__( const long* tri_full_flag, 
							const double* max_speed_array, 
							const double* radii,
							const size_t number_of_elements,
							const double epsilon,
							const double time0,
							double* times_out ) {
    /* Computes minimal timestep in each triangle k, and finds 
       minimal timestep in each block of threads.

       Output is written to times_out, final reduction must
       be performed on CPU.*/
	
    // shared memory size defined at kernel launch,
    // set according to blockDim
	 extern __shared__ double sdata[];
	
	 unsigned int tid = threadIdx.x;

    // initialize thread with default (previous) timestep
    double mytime = time0;
   
    // For all triangles 
	 for( size_t k = blockIdx.x * blockDim.x + threadIdx.x; k < number_of_elements; k += gridDim.x*blockDim.x) {
        if( 1 == tri_full_flag[k] && max_speed_array[k] > epsilon ) {
				mytime = fmin( mytime, radii[k] / max_speed_array[k] );
		  }
	 }
    
    // each thread in block writes to shared memory
    sdata[tid] = mytime;
    __syncthreads();

    // Reduce to one value per thread block by successively 
    // comparing value pairs; in first sweep, first half of 
    // threads compares first half of values to second half and
    // writes min to first half; 2nd sweep, first fourth compares
    // first and second fourth a.s.o.
    for(unsigned int s=blockDim.x/2; s>0; s>>=1) {
		if(tid < s) {
			sdata[tid] = fmin( sdata[tid + s], sdata[tid]);
		}
		__syncthreads();
	}

    // Lead thread writes min for this block to global mem
    if (tid == 0) {
		times_out[blockIdx.x] = sdata[0];
	}
}


__device__ double2 __rotate__(double q1, double q2, double n1, double n2) {
  /*Rotate the momentum component q (q1, q2)
    from x,y coordinates to coordinates based on normal vector (n1, n2).

    Result is returned in array 3x1 r
    To rotate in opposite direction, call rotate with (q, n1, -n2)

    Contents of q are changed by this function */

  double2 q;

  // Rotate
  q.x =  n1*q1 + n2*q2;
  q.y = -n2*q1 + n1*q2;

  return q;
}

__device__ Velocity __compute_speed__(
              double uh, 
              double h,
              const double epsilon,
              const double h0,
              const double limiting_threshold) {
  
  Velocity result;

  if (h < limiting_threshold) {   
    // Apply limiting of speeds according to the ANUGA manual
    if (h < epsilon) {
      h = 0.0;  // Could have been negative
      result.u = 0.0;
    } else {
      result.u = uh/(h + h0/ h);    
    }
  

    // Adjust momentum to be consistent with speed
    uh = result.u * h;
  } else {
    // We are in deep water - no need for limiting
    result.u = uh/ h;
  }
  
  result.uh = uh; 
  result.h = h; 
  return result;
}

__device__ Fluxes __flux_function_central__(double2 stage, double2 xmom,
					 double2 ymom, double2 z_lr,
					 double2 n,
					 const double epsilon, 
					 const double h0,
					 const double limiting_threshold,
					 const double g) {
					 //double *edgeflux,
					 //double *max_speed) {

  /*Compute fluxes between volumes for the shallow water wave equation
    cast in terms of the 'stage', w = h+z using
    the 'central scheme' as described in

    Kurganov, Noelle, Petrova. 'Semidiscrete Central-Upwind Schemes For
    Hyperbolic Conservation Laws and Hamilton-Jacobi Equations'.
    Siam J. Sci. Comput. Vol. 23, No. 3, pp. 707-740.

    The implemented formula is given in equation (3.15) on page 714
  */
  double2 tmp;
  double h_left, uh_left, vh_left, u_left;
  double h_right, uh_right, vh_right, u_right;
  double s_min, s_max, soundspeed_left, soundspeed_right;
  double denom, inverse_denominator, z;

  // Cuda doesn't do static arrays
  double fs_l, fs_r;
  double2 fv_l, fv_r;
  Velocity velos;
  Fluxes fluxes;

  // Align x- and y-momentum with x-axis
  // Do not be confused: xmom.x and xmom.y are 
  // left and right momenti in x direction
  tmp = __rotate__( xmom.x, ymom.x, n.x, n.y);
  xmom.x = tmp.x; ymom.x = tmp.y;
  tmp = __rotate__( xmom.y, ymom.y, n.x, n.y);
  xmom.y = tmp.x; ymom.y = tmp.y;

  z = 0.5*(z_lr.x + z_lr.y); // Average elevation values. 
                              // Even though this will nominally allow 
                  // for discontinuities in the elevation data,
                  // there is currently no numerical support for
                  // this so results may be strange near
                  // jumps in the bed.

  // Compute speeds in x-direction
  h_left = stage.x - z;

  uh_left = xmom.x; // q_left_rotated[1];
  velos = __compute_speed__(uh_left, h_left, 
              epsilon, h0, limiting_threshold);
  u_left = velos.u;
  uh_left = velos.uh;
  h_left = velos.h;

  h_right = stage.y - z;
  uh_right = xmom.y; // q_right_rotated[1];
  velos = __compute_speed__(uh_right, h_right, 
               epsilon, h0, limiting_threshold);
  u_right = velos.u;
  uh_right = velos.uh;
  h_right = velos.h;

  // Momentum in y-direction
  vh_left  = ymom.x; //q_left_rotated[2];
  vh_right = ymom.y; //q_right_rotated[2];

  // Limit y-momentum if necessary 
  // Leaving this out, improves speed significantly (Ole 27/5/2009)
  // All validation tests pass, so do we really need it anymore? 
  velos = __compute_speed__(vh_left, h_left, 
         epsilon, h0, limiting_threshold);
  vh_left = velos.uh;
  h_left = velos.h;
  velos = __compute_speed__(vh_right, h_right, 
         epsilon, h0, limiting_threshold);
  vh_right = velos.uh;
  h_right = velos.h;

  // Maximal and minimal wave speeds
  soundspeed_left  = sqrt(g*h_left);
  soundspeed_right = sqrt(g*h_right);  

  s_max = fmax(u_left + soundspeed_left, u_right + soundspeed_right);
  if (s_max < 0.0) 
  {
    s_max = 0.0;
  }

  s_min = fmin(u_left - soundspeed_left, u_right - soundspeed_right);
  if (s_min > 0.0)
  {
    s_min = 0.0;
  }
  
  // Flux formulas
  fs_l 	= u_left*h_left;
  fv_l.x = u_left*uh_left + 0.5*g*h_left*h_left;
  fv_l.y = u_left*vh_left;

  fs_r   = u_right*h_right;
  fv_r.x = u_right*uh_right + 0.5*g*h_right*h_right;
  fv_r.y = u_right*vh_right;

  // Flux computation
  denom = s_max - s_min;
  if (denom < epsilon) 
  { // FIXME (Ole): Try using h0 here
	 fluxes.edgeflux_s = 0.0;
	 fluxes.edgeflux_x = 0.0;
	 fluxes.edgeflux_y = 0.0;
    fluxes.max_speed = 0.0;
  } 
  else 
  {
    inverse_denominator = 1.0/denom;

      fluxes.edgeflux_s = s_max*fs_l - s_min*fs_r;
      fluxes.edgeflux_s += s_max*s_min*(stage.y - stage.x);
      fluxes.edgeflux_s *= inverse_denominator;
      
      fluxes.edgeflux_x = s_max*fv_l.x - s_min*fv_r.x;
      fluxes.edgeflux_x += s_max*s_min*(xmom.y - xmom.x);
      fluxes.edgeflux_x *= inverse_denominator;
      
      fluxes.edgeflux_y = s_max*fv_l.y - s_min*fv_r.y;
      fluxes.edgeflux_y += s_max*s_min*(ymom.y - ymom.x);
      fluxes.edgeflux_y *= inverse_denominator;

    	// Maximal wavespeed
    	fluxes.max_speed = fmax(fabs(s_max), fabs(s_min));

    // Rotate back
    tmp = __rotate__( fluxes.edgeflux_x, fluxes.edgeflux_y, n.x, -n.y);
	 fluxes.edgeflux_x = tmp.x; fluxes.edgeflux_y = tmp.y;
  }
 
  return fluxes; 
}


__global__ void __compute_fluxes_central_kernel__(
		const int number_of_elements,
        double timestep,
        const double epsilon,
        const double H0,
        const double g,
        const long* neighbours,
        const long* neighbour_edges,
        const double* normals,
        double* edgelengths,
        const double* areas,
        const double* stage_edge_values,
        const double* xmom_edge_values,
        const double* ymom_edge_values,
        const double* bed_edge_values,
        const double* stage_boundary_values,
        const double* xmom_boundary_values,
        const double* ymom_boundary_values,
        double* stage_explicit_update,
        double* xmom_explicit_update,
        double* ymom_explicit_update,
        double* max_speed_array,
        const int optimise_dry_cells) {
    /* __global__: called by CPU and executed on GPU;
                   must return void, scalar variables, structs (and scalar members)
                   are copied automatically, arrays must be allocated on device.
    */

    // Local variables
    double length; //, zl, zr;
    double h0 = H0*H0; // This ensures a good balance when h approaches H0.

    double limiting_threshold = 10 * H0; // Avoid applying limiter below this
    // threshold for performance reasons.
    // See ANUGA manual under flux limiting
    int i, k, m, n;
    int ki, nm = 0, ki2; // Index shorthands

    Fluxes fluxes;
	
    double2 stage, xmom, ymom, z_lr, normvec;
  
    // Shared memory for explicit update quantity reduction
    extern __shared__ double update_shared[]; // empty [] array:(byte)size defined by kernel call
	 //__shared__ double update_shared[BLOCKDIM*SHARED_MEM_MULT]; // OR static size (not both)

    // For all edges; 
    for( ki = blockIdx.x * blockDim.x + threadIdx.x; ki < 3*number_of_elements; ki += gridDim.x * blockDim.x ) {

            // Get left hand side values from triangle k, edge i
            stage.x = stage_edge_values[ki];
            xmom.x = xmom_edge_values[ki];
            ymom.x = ymom_edge_values[ki];
            z_lr.x = bed_edge_values[ki];

            // Get right hand side values either from neighbouring triangle
            // or from boundary array (Quantities at neighbour on nearest face).
            n = neighbours[ki];
            if (n < 0) {
                // Neighbour is a boundary condition
                m = -n - 1; // Convert negative flag to boundary index

                // Bad access order consider binding boundary_values to Texture cache
                stage.y = stage_boundary_values[m];
                xmom.y = xmom_boundary_values[m];
                ymom.y = ymom_boundary_values[m];
                z_lr.y = z_lr.x; // Extend bed elevation to boundary
            }
            else {
                // Neighbour is a real triangle
                m = neighbour_edges[ki];
                nm = n * 3 + m; // Linear index (triangle n, edge m)

                // Again, bind to Texture cache
                stage.y = stage_edge_values[nm];
                xmom.y = xmom_edge_values[nm];
                ymom.y = ymom_edge_values[nm];
                z_lr.y = bed_edge_values[nm];
            }

            // Now we have values for this edge - both from left and right side.

            /*if (optimise_dry_cells) {
                // Check if flux calculation is necessary across this edge
                // This check will exclude dry cells.
                // This will also optimise cases where zl != zr as
                // long as both are dry

                if (fabs(ql[0] - zl) < epsilon &&
                        fabs(qr[0] - zr) < epsilon) {
                    // Cell boundary is dry

                    already_computed_flux[ki] = call; // #k Done
                    if (n >= 0) {
                        already_computed_flux[nm] = call; // #n Done
                    }

                    max_speed = 0.0;
                    continue;
                }
            }*/


            /*if (fabs(zl-zr)>1.0e-10) {TODO:
                report_python_error(AT,"Discontinuous Elevation");
                return 0.0;
            }*/
            
            // Outward pointing normal vector (domain.normals[k, 2*i:2*i+2])
            ki2 = 2 * ki; //k*6 + i*2
            // Bad access order, use Texture or shared mem
				normvec.x = normals[ki2];
            normvec.y = normals[ki2+1];

            // Edge flux computation (triangle k, edge i);
				// TODO: subroutine causes unspecified launch failure
            //fluxes = __flux_function_central__( stage, xmom, ymom, z_lr, normvec,
            //        epsilon, h0, limiting_threshold, g);
  

            // Multiply edgeflux by edgelength
            length = edgelengths[ki];
            fluxes.edgeflux_s *= length;
            fluxes.edgeflux_x *= length;
            fluxes.edgeflux_y *= length;

            
				// Use shared memory to accumulate flux for one triangle;
				// requires blockDim.x to be multiple of 3 (should also be multiple of 32)

				#ifdef _SHARED_WRITEBACK_3_

					// Accumulate all update arrays in one sweep;
					// if shared memory is large enough

            	update_shared[threadIdx.x] = fluxes.edgeflux_s;
            	update_shared[threadIdx.x+blockDim.x] = fluxes.edgeflux_x;
            	update_shared[threadIdx.x+2*blockDim.x] = fluxes.edgeflux_y;
					__syncthreads();
					

					// Each third of the threads in block write back an update array
					// with contiguous access, each thread sums fluxes in a triangle

					if( threadIdx.x < blockDim.x/3) {
                  // Calculate contiguous index
						i = threadIdx.x;
						k = blockIdx.x*(blockDim.x/3) + i;
						stage_explicit_update[k] = ( update_shared[3*i]
															+ update_shared[3*i+1]
															+ update_shared[3*i+2]) / areas[k];
					} else if( threadIdx.x < 2*(blockDim.x/3) ) {
						i = threadIdx.x - (blockDim.x/3);
						k = blockIdx.x*(blockDim.x/3) + i;
						xmom_explicit_update[k] = ( update_shared[blockDim.x+3*i]
															+ update_shared[blockDim.x+3*i+1]
															+ update_shared[blockDim.x+3*i+2]) / areas[k];
					} else {
						i = threadIdx.x - 2*(blockDim.x/3);
						k = blockIdx.x*(blockDim.x/3) + i;
						ymom_explicit_update[k] = ( update_shared[2*blockDim.x+3*i]
															+ update_shared[2*blockDim.x+3*i+1]
															+ update_shared[2*blockDim.x+3*i+2] ) / areas[k];

					}
					__syncthreads();
            
            #else   
            
					// Write each update array back by itself;
					// only first third of threads busy
	            update_shared[threadIdx.x]= fluxes.edgeflux_s;
					__syncthreads();
               if( threadIdx.x < blockDim.x/3 ) {
						i = threadIdx.x;
						k = blockIdx.x*(blockDim.x/3) + i;
               	stage_explicit_update[k] = ( update_shared[3*i] 
												+ update_shared[3*i+1]
												+ update_shared[3*i+2] ) / areas[k];
					}
					__syncthreads();
            
				
               update_shared[threadIdx.x] = fluxes.edgeflux_x;
					__syncthreads();
               if( threadIdx.x < blockDim.x/3 ) {
						i = threadIdx.x;
						k = blockIdx.x*(blockDim.x/3) + i;
               	xmom_explicit_update[k] = ( update_shared[3*i] 
												+ update_shared[3*i+1]
												+ update_shared[3*i+2] ) / areas[k];
					}
					__syncthreads();
               
					update_shared[threadIdx.x] = fluxes.edgeflux_y;
					__syncthreads();
               if( threadIdx.x < blockDim.x/3 ) {
						i = threadIdx.x;
						k = blockIdx.x*(blockDim.x/3) + i;
               	ymom_explicit_update[k] = ( update_shared[3*i] 
												+ update_shared[3*i+1]
												+ update_shared[3*i+2] ) / areas[k];
					}
					__syncthreads();
				
				#endif
			   
				// Likewise, get and write maximum speed within triangle
            // update_shared[threadIdx.x] = fluxes.max_speed;
				__syncthreads();
            if( threadIdx.x < blockDim.x/3 ) {
					i = threadIdx.x;
					k = blockIdx.x*(blockDim.x/3) + i;
					fluxes.max_speed = fmax( update_shared[3*i], update_shared[3*i+1] );
					max_speed_array[k] = fmax( fluxes.max_speed, update_shared[3*i+2] );
				}
           
    } // End edge ki

    // computation of timestep in seperate routine because of triangle-wise access
}


/* *********** KERNEL WRAPPER FUNCTIONS ************************** */

extern "C" void _set_to_default( double* edge, double* xmom, double* ymom, size_t N, double def) {
	
	 __set_arrays_to_default__ <<< _launcher_.gridDim, _launcher_.blockDim >>> ( edge, xmom, ymom, N, def);
    safecall(cudaThreadSynchronize());
}

/*extern "C" void _set_to_default( double* edge, size_t N, double def) {
	
	 __set_to_default__ <<< _launcher_.gridDim, _launcher_.blockDim >>> ( edge, N, def);
    safecall(cudaThreadSynchronize());
}*/

extern "C" double _compute_fluxes_central(
		int number_of_elements,
        double timestep,
        double epsilon,
        double H0,
        double g,
        long* neighbours,
        long* neighbour_edges,
        double* normals,
        double* edgelengths,
        double* radii,
        double* areas,
        long* tri_full_flag,
        double* stage_edge_values,
        double* xmom_edge_values,
        double* ymom_edge_values,
        double* bed_edge_values,
        double* stage_boundary_values,
        double* xmom_boundary_values,
        double* ymom_boundary_values,
        double* stage_explicit_update,
        double* xmom_explicit_update,
        double* ymom_explicit_update,
        double* max_speed_array,
        int optimise_dry_cells) {

    static long call = 1; // Static local variable flagging already computed flux
    int i;
    // Start computation
    call++; // Flag 'id' of flux calculation for this timestep

    // prepare memory for timestep reduction (TODO: (de)allocate only once)
    const size_t reduction_size = _launcher_.gridDim*sizeof(double);
    double* times_out = (double*)allocHostMemory( reduction_size  );
    double* times_out_gpu = (double*)allocDeviceMemory( reduction_size );

    printf("shared mum mult: %i\n", SHARED_MEM_MULT);

    if( 0 != _launcher_.blockDim%3 ) {
		fprintf(stderr,"error: blockDim required to be multiple of 3!\n");
    }	
	 
    __compute_fluxes_central_kernel__ <<< _launcher_.gridDim, _launcher_.blockDim, 
	  											 		_launcher_.blockDim*sizeof(double)*SHARED_MEM_MULT >>> (
    //__compute_fluxes_central_kernel__ <<< _launcher_.gridDim, BLOCKDIM >>> ( // for static shared memory size
		number_of_elements,
        timestep,
        epsilon,
        H0,
        g,
        neighbours,
        neighbour_edges,
        normals,
        edgelengths,
        areas,
        stage_edge_values,
        xmom_edge_values,
        ymom_edge_values,
        bed_edge_values,
        stage_boundary_values,
        xmom_boundary_values,
        ymom_boundary_values,
        stage_explicit_update,
        xmom_explicit_update,
        ymom_explicit_update,
        max_speed_array,
        optimise_dry_cells);
	 

    safecall(cudaThreadSynchronize()); // prevents overlap of kernels
    
	 // Some timestepping debug: (timestep 1.0)
	 //printKernelDims();
	 //_set_to_default( max_speed_array, radii, areas, number_of_elements, 3.3); // 
	
    __compute_time_step__ <<< _launcher_.gridDim, _launcher_.blockDim, _launcher_.blockDim*sizeof(double) >>> (
	                  tri_full_flag, 
							max_speed_array, 
							radii,
							number_of_elements,
							epsilon,
							timestep,
							times_out_gpu );

    safecall(cudaThreadSynchronize());
    
    copyDeviceToHost( times_out, times_out_gpu, reduction_size );
    for( i=0; i < _launcher_.gridDim; ++i) {
		timestep = min( timestep, times_out[i] );
    }
	 //printf("\ntimestep = %f\n",timestep);
	 //fflush(stdout);

    freeDeviceMemory( times_out_gpu );
    freeHostMemory( times_out );

    return timestep;
}