// cudafun.cu --
// CUDA memory allocation & shallow water kernel routines
// To compile (CUDA 3.2):
//   nvcc -c --gpu-architecture=sm_13 -I${CUDA_INSTALL_PATH}/include
//        -Xcompiler -fpic
//
// ! Incomplete kernel call !
//
// Matthias Griessinger, University of Erlangen, 2011.
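//
// The matching link step might look like this (a sketch; the gcc call and
// the lib64 directory name are assumptions about the local setup):
//   gcc -shared -o libcudafun.so cudafun.o -L${CUDA_INSTALL_PATH}/lib64 -lcudart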


#include <stdio.h>
#include <math.h>  /* fmin for the host-side timestep reduction */
//#include <cuda_runtime_api.h>
#ifndef _CUDA_MACROS_H_
#define _CUDA_MACROS_H_

#define safecall(call) do{\
    cudaError_t err = call ;\
    if( cudaSuccess != err ){\
        fprintf(stdout, "cuda error at %s:%d, %s\n",\
                __FILE__, __LINE__, cudaGetErrorString(err));\
        fflush(stdout);\
    }\
} while(0)
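
// Typical use: wrap any CUDA runtime call whose error status would
// otherwise be silently discarded, e.g.
//   safecall(cudaMemset( devptr, 0, bytesize ));
// Note that the macro only reports errors; it does not abort.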

#define BLOCKDIM 96

#ifdef _SHARED_WRITEBACK_3_
#define SHARED_MEM_MULT 3
#else
#define SHARED_MEM_MULT 1
#endif

#endif


/* *********** DEVICE SELECTION ************************* */

extern "C" void getDeviceInfo( int rank, int size, const char* hostname) {
    /* Print device information */
    int deviceCount, device;
    cudaDeviceProp deviceProp;

    cudaGetDeviceCount(&deviceCount);

    if ( 0 == rank ) {
        printf("## rank %i/%i on %s --\t Device Test: No. Cards: %d\n",
               rank, size-1, hostname, deviceCount);
        for( device = 0; device < deviceCount; ++device) {
            cudaGetDeviceProperties(&deviceProp, device);
            printf("## rank %i/%i on %s --\t Device %d: %s\n",
                   rank, size-1, hostname, device, deviceProp.name);
        }
    }
}


extern "C" int selectDevice( int rank, int size, const char* hostname ) {
    /* Select the GPU device (for hosts with multiple cards);
       call before any GPU memory/kernel calls, otherwise it has
       no effect (the default card is selected).
    */
    int deviceCount, takedevice, device;
    cudaDeviceProp deviceProp;

    cudaGetDeviceCount(&deviceCount);

    takedevice = rank%deviceCount;
    cudaSetDevice(takedevice);
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&deviceProp, device);

    printf("rank %i/%i on %s --\t Selecting Device %d: %s\n",
           rank, size, hostname, device, deviceProp.name);

    return device;
}


/* *********** KERNEL LAUNCH PARAMETERS ***************** */

typedef struct {
    int gridDim;
    int blockDim;
} KERNEL_LAUNCHER;

KERNEL_LAUNCHER _launcher_;

extern "C" void setKernelDims( const int gridDim, const int blockDim ) {
    _launcher_.gridDim  = gridDim;
    _launcher_.blockDim = blockDim;
}

extern "C" void printKernelDims() {
    printf(" kernel dims: %i x %i\n",
           _launcher_.gridDim, _launcher_.blockDim);
    fflush(stdout);
}
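
// Example of setting launch dimensions before using the wrappers below
// (a sketch; number_of_elements is a placeholder): the flux kernel works
// edge-wise on 3*number_of_elements items and expects blockDim to be a
// multiple of 3, ideally also of 32; BLOCKDIM = 96 satisfies both.
//
//   setKernelDims( (3*number_of_elements + BLOCKDIM - 1) / BLOCKDIM, BLOCKDIM );
//   printKernelDims();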


/* *********** CUDA MEMORY **************************** */

extern "C" void* allocDeviceMemory( size_t bytesize ) {
    char* mem = NULL;
    safecall(cudaMalloc( (void**)&mem, bytesize ));
    fprintf(stdout,"allocDevice: allocating %zu bytes at %p\n", bytesize, (void*)mem);
    fflush(stdout);

    return (void*)mem;
}

extern "C" void* allocHostMemory( size_t bytesize ) {
    /* returns pinned (page-locked) CPU memory for faster transfers */
    char* mem = NULL;
    safecall(cudaHostAlloc( (void**)&mem, bytesize, 0 ));

    return (void*)mem;
}


extern "C" void copyDeviceToHost( void* hostmem, void* devicemem, size_t bytesize ) {
    /* copy bytesize bytes from GPU to host */
    safecall(cudaMemcpy( hostmem, devicemem, bytesize, cudaMemcpyDeviceToHost ));
}

extern "C" void copyHostToDevice( void* devmem, void* hostmem, size_t bytesize ) {
    /* copy bytesize bytes from host to GPU */
    safecall(cudaMemcpy( devmem, hostmem, bytesize, cudaMemcpyHostToDevice ));
}


extern "C" void freeDeviceMemory( void* mem ) {
    fprintf(stdout,"freeDevice: freeing at %p\n", mem);
    fflush(stdout);
    safecall(cudaFree( mem ));
}

extern "C" void freeHostMemory( void* mem ) {
    safecall(cudaFreeHost( mem ));
}
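
// Round-trip sketch using the wrappers above (the array length is an
// arbitrary example value):
//
//   size_t bytes = 1024*sizeof(double);
//   double* host_buf = (double*)allocHostMemory( bytes );
//   double* dev_buf  = (double*)allocDeviceMemory( bytes );
//   /* ... fill host_buf, then: */
//   copyHostToDevice( dev_buf, host_buf, bytes );
//   /* ... launch kernels on dev_buf, then: */
//   copyDeviceToHost( host_buf, dev_buf, bytes );
//   freeDeviceMemory( dev_buf );
//   freeHostMemory( host_buf );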


extern "C" void dummy( ) {
    fprintf(stdout, "dummy 1 at %s:%d\n",__FILE__, __LINE__);
    fflush(stdout);
    double* mem;
    //double* mem = (double*)allocDeviceMemory( 128*sizeof(double) );
    cudaMalloc( (void**)&mem, 128*sizeof(double));

    fprintf(stdout, "dummy 2 at %s:%d\n",__FILE__, __LINE__);
    fflush(stdout);
    cudaFree(mem);
    fprintf(stdout, "dummy 3 at %s:%d\n",__FILE__, __LINE__);
    fflush(stdout);
}

/* *********** GPU KERNELS ************************** */


typedef struct {
    double edgeflux_s;
    double edgeflux_x;
    double edgeflux_y;
    double max_speed;
} Fluxes;

typedef struct {
    double u;
    double uh;
    double h;
} Velocity;

__global__ void __set_to_default__(double* edge,
                                   size_t N, double def) {
    /* set input array edge of length N to value def */

    size_t k;
    for( k = blockIdx.x * blockDim.x + threadIdx.x; k < N; k += gridDim.x * blockDim.x ) {
        edge[k] = def;
    }
}

__global__ void __set_arrays_to_default__(double* edge,
                                          double* xmom,
                                          double* ymom,
                                          size_t N, double def) {
    /* set input arrays edge, xmom, ymom of length N to value def */

    size_t k;
    for( k = blockIdx.x * blockDim.x + threadIdx.x; k < N; k += gridDim.x * blockDim.x ) {
        edge[k] = def;
        xmom[k] = def;
        ymom[k] = def;
    }
}
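
// Both kernels above use a grid-stride loop: each thread starts at its
// global index and advances by the total number of launched threads, so
// any N is covered by any launch configuration. For example (values are
// illustrative only), with 2 blocks of 4 threads and N = 10, thread 0 of
// block 0 handles k = 0 and k = 8, thread 1 handles k = 1 and k = 9, and
// the remaining threads handle one element each.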


__global__ void __compute_time_step__( const long* tri_full_flag,
                                       const double* max_speed_array,
                                       const double* radii,
                                       const size_t number_of_elements,
                                       const double epsilon,
                                       const double time0,
                                       double* times_out ) {
    /* Computes the minimal timestep over the triangles k handled by
       each thread, then finds the minimal timestep per thread block.

       One value per block is written to times_out; the final reduction
       must be performed on the CPU. */

    // shared memory size defined at kernel launch,
    // set according to blockDim
    extern __shared__ double sdata[];

    unsigned int tid = threadIdx.x;

    // initialize thread with default (previous) timestep
    double mytime = time0;

    // For all triangles
    for( size_t k = blockIdx.x * blockDim.x + threadIdx.x; k < number_of_elements; k += gridDim.x*blockDim.x) {
        if( 1 == tri_full_flag[k] && max_speed_array[k] > epsilon ) {
            mytime = fmin( mytime, radii[k] / max_speed_array[k] );
        }
    }

    // each thread in block writes to shared memory
    sdata[tid] = mytime;
    __syncthreads();

    // Reduce to one value per thread block by successively comparing
    // value pairs: in each sweep the first half of the remaining values
    // is compared against the second half and the minimum written to
    // the first half, and so on. The half-size is rounded up so that
    // non-power-of-two block sizes (e.g. 96) are reduced correctly.
    unsigned int remaining = blockDim.x;
    while( remaining > 1 ) {
        unsigned int half = (remaining + 1) / 2;
        if( tid + half < remaining ) {
            sdata[tid] = fmin( sdata[tid + half], sdata[tid]);
        }
        __syncthreads();
        remaining = half;
    }

    // Lead thread writes min for this block to global mem
    if (tid == 0) {
        times_out[blockIdx.x] = sdata[0];
    }
}
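
// The host-side finish for the block-wise minima above might look like
// this (a sketch; gridDim blocks were launched and times_out was copied
// back to the host, as done in _compute_fluxes_central below):
//
//   double timestep = time0;
//   for( int i = 0; i < gridDim; ++i )
//       timestep = fmin( timestep, times_out[i] );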


__device__ double2 __rotate__(double q1, double q2, double n1, double n2) {
    /* Rotate the momentum component q = (q1, q2)
       from x,y coordinates to coordinates based on the normal vector (n1, n2).

       The rotated vector is returned as a double2.
       To rotate in the opposite direction, call with (q1, q2, n1, -n2).
       Arguments are passed by value and remain unchanged. */

    double2 q;

    // Rotate
    q.x =  n1*q1 + n2*q2;
    q.y = -n2*q1 + n1*q2;

    return q;
}
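
// Round-trip illustration (not part of the solver): for a unit normal
// (n1,n2), __rotate__ is orthogonal, so rotating with (n1,n2) and then
// with (n1,-n2) restores the input. E.g. q = (1,0) with n = (0,1)
// rotates to (0,-1) and back to (1,0).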

__device__ Velocity __compute_speed__(
                double uh,
                double h,
                const double epsilon,
                const double h0,
                const double limiting_threshold) {

    Velocity result;

    if (h < limiting_threshold) {
        // Apply limiting of speeds according to the ANUGA manual
        if (h < epsilon) {
            h = 0.0;  // Could have been negative
            result.u = 0.0;
        } else {
            result.u = uh/(h + h0/h);
        }

        // Adjust momentum to be consistent with speed
        uh = result.u * h;
    } else {
        // We are in deep water - no need for limiting
        result.u = uh/h;
    }

    result.uh = uh;
    result.h  = h;
    return result;
}
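
// Numerical illustration of the limiter (example values, not from the
// original source): with H0 = 1e-3, i.e. h0 = H0*H0 = 1e-6, a momentum
// uh = 1e-4 at depth h = 1e-2 gives u = 1e-4/(1e-2 + 1e-4) ~ 9.9e-3
// instead of the unlimited 1e-2; as h -> 0 the denominator h + h0/h
// grows without bound, so u tends smoothly to zero instead of diverging.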

__device__ Fluxes __flux_function_central__(double2 stage, double2 xmom,
                                            double2 ymom, double2 z_lr,
                                            double2 n,
                                            const double epsilon,
                                            const double h0,
                                            const double limiting_threshold,
                                            const double g) {
                                            //double *edgeflux,
                                            //double *max_speed) {

    /* Compute fluxes between volumes for the shallow water wave equation
       cast in terms of the 'stage', w = h+z, using
       the 'central scheme' as described in

       Kurganov, Noelle, Petrova. 'Semidiscrete Central-Upwind Schemes For
       Hyperbolic Conservation Laws and Hamilton-Jacobi Equations'.
       SIAM J. Sci. Comput. Vol. 23, No. 3, pp. 707-740.

       The implemented formula is given in equation (3.15) on page 714.
    */
    double2 tmp;
    double h_left, uh_left, vh_left, u_left;
    double h_right, uh_right, vh_right, u_right;
    double s_min, s_max, soundspeed_left, soundspeed_right;
    double denom, inverse_denominator, z;

    // CUDA doesn't do static arrays
    double fs_l, fs_r;
    double2 fv_l, fv_r;
    Velocity velos;
    Fluxes fluxes;

    // Align x- and y-momentum with the x-axis.
    // Do not be confused: xmom.x and xmom.y are the
    // left and right momenta in x direction.
    tmp = __rotate__( xmom.x, ymom.x, n.x, n.y);
    xmom.x = tmp.x; ymom.x = tmp.y;
    tmp = __rotate__( xmom.y, ymom.y, n.x, n.y);
    xmom.y = tmp.x; ymom.y = tmp.y;

    z = 0.5*(z_lr.x + z_lr.y); // Average elevation values.
                               // Even though this will nominally allow
                               // for discontinuities in the elevation data,
                               // there is currently no numerical support for
                               // this so results may be strange near
                               // jumps in the bed.

    // Compute speeds in x-direction
    h_left  = stage.x - z;
    uh_left = xmom.x; // q_left_rotated[1];
    velos = __compute_speed__(uh_left, h_left,
                              epsilon, h0, limiting_threshold);
    u_left  = velos.u;
    uh_left = velos.uh;
    h_left  = velos.h;

    h_right  = stage.y - z;
    uh_right = xmom.y; // q_right_rotated[1];
    velos = __compute_speed__(uh_right, h_right,
                              epsilon, h0, limiting_threshold);
    u_right  = velos.u;
    uh_right = velos.uh;
    h_right  = velos.h;

    // Momentum in y-direction
    vh_left  = ymom.x; // q_left_rotated[2];
    vh_right = ymom.y; // q_right_rotated[2];

    // Limit y-momentum if necessary
    // Leaving this out improves speed significantly (Ole 27/5/2009)
    // All validation tests pass, so do we really need it anymore?
    velos = __compute_speed__(vh_left, h_left,
                              epsilon, h0, limiting_threshold);
    vh_left = velos.uh;
    h_left  = velos.h;
    velos = __compute_speed__(vh_right, h_right,
                              epsilon, h0, limiting_threshold);
    vh_right = velos.uh;
    h_right  = velos.h;

    // Maximal and minimal wave speeds
    soundspeed_left  = sqrt(g*h_left);
    soundspeed_right = sqrt(g*h_right);

    s_max = fmax(u_left + soundspeed_left, u_right + soundspeed_right);
    if (s_max < 0.0)
    {
        s_max = 0.0;
    }

    s_min = fmin(u_left - soundspeed_left, u_right - soundspeed_right);
    if (s_min > 0.0)
    {
        s_min = 0.0;
    }

    // Flux formulas
    fs_l   = u_left*h_left;
    fv_l.x = u_left*uh_left + 0.5*g*h_left*h_left;
    fv_l.y = u_left*vh_left;

    fs_r   = u_right*h_right;
    fv_r.x = u_right*uh_right + 0.5*g*h_right*h_right;
    fv_r.y = u_right*vh_right;

    // Flux computation
    denom = s_max - s_min;
    if (denom < epsilon)
    { // FIXME (Ole): Try using h0 here
        fluxes.edgeflux_s = 0.0;
        fluxes.edgeflux_x = 0.0;
        fluxes.edgeflux_y = 0.0;
        fluxes.max_speed  = 0.0;
    }
    else
    {
        inverse_denominator = 1.0/denom;

        fluxes.edgeflux_s  = s_max*fs_l - s_min*fs_r;
        fluxes.edgeflux_s += s_max*s_min*(stage.y - stage.x);
        fluxes.edgeflux_s *= inverse_denominator;

        fluxes.edgeflux_x  = s_max*fv_l.x - s_min*fv_r.x;
        fluxes.edgeflux_x += s_max*s_min*(xmom.y - xmom.x);
        fluxes.edgeflux_x *= inverse_denominator;

        fluxes.edgeflux_y  = s_max*fv_l.y - s_min*fv_r.y;
        fluxes.edgeflux_y += s_max*s_min*(ymom.y - ymom.x);
        fluxes.edgeflux_y *= inverse_denominator;

        // Maximal wavespeed
        fluxes.max_speed = fmax(fabs(s_max), fabs(s_min));

        // Rotate back
        tmp = __rotate__( fluxes.edgeflux_x, fluxes.edgeflux_y, n.x, -n.y);
        fluxes.edgeflux_x = tmp.x; fluxes.edgeflux_y = tmp.y;
    }

    return fluxes;
}


__global__ void __compute_fluxes_central_kernel__(
        const int number_of_elements,
        double timestep,
        const double epsilon,
        const double H0,
        const double g,
        const long* neighbours,
        const long* neighbour_edges,
        const double* normals,
        double* edgelengths,
        const double* areas,
        const double* stage_edge_values,
        const double* xmom_edge_values,
        const double* ymom_edge_values,
        const double* bed_edge_values,
        const double* stage_boundary_values,
        const double* xmom_boundary_values,
        const double* ymom_boundary_values,
        double* stage_explicit_update,
        double* xmom_explicit_update,
        double* ymom_explicit_update,
        double* max_speed_array,
        const int optimise_dry_cells) {
    /* __global__: called by the CPU and executed on the GPU;
       must return void; scalar variables and structs (with scalar
       members) are copied automatically, arrays must be allocated
       on the device.
    */

    // Local variables
    double length; //, zl, zr;
    double h0 = H0*H0; // This ensures a good balance when h approaches H0.

    double limiting_threshold = 10 * H0; // Avoid applying limiter below this
                                         // threshold for performance reasons.
                                         // See ANUGA manual under flux limiting
    int i, k, m, n;
    int ki, nm = 0, ki2; // Index shorthands

    // Zero-initialized: the central flux call below is disabled (see TODO),
    // so without this the writeback would operate on undefined values.
    Fluxes fluxes = {0.0, 0.0, 0.0, 0.0};

    double2 stage, xmom, ymom, z_lr, normvec;

    // Shared memory for explicit update quantity reduction
    extern __shared__ double update_shared[]; // empty []: (byte)size defined by kernel call
    //__shared__ double update_shared[BLOCKDIM*SHARED_MEM_MULT]; // OR static size (not both)

    // For all edges. Note: the __syncthreads() below require all threads of
    // a block to take the same number of loop trips, and the triangle index
    // k used in the writeback assumes the grid covers all edges in a single
    // sweep, i.e. gridDim.x*blockDim.x >= 3*number_of_elements.
    for( ki = blockIdx.x * blockDim.x + threadIdx.x; ki < 3*number_of_elements; ki += gridDim.x * blockDim.x ) {

        // Get left hand side values from triangle k, edge i
        stage.x = stage_edge_values[ki];
        xmom.x  = xmom_edge_values[ki];
        ymom.x  = ymom_edge_values[ki];
        z_lr.x  = bed_edge_values[ki];

        // Get right hand side values either from neighbouring triangle
        // or from boundary array (quantities at neighbour on nearest face).
        n = neighbours[ki];
        if (n < 0) {
            // Neighbour is a boundary condition
            m = -n - 1; // Convert negative flag to boundary index

            // Bad access order; consider binding boundary_values to texture cache
            stage.y = stage_boundary_values[m];
            xmom.y  = xmom_boundary_values[m];
            ymom.y  = ymom_boundary_values[m];
            z_lr.y  = z_lr.x; // Extend bed elevation to boundary
        }
        else {
            // Neighbour is a real triangle
            m = neighbour_edges[ki];
            nm = n * 3 + m; // Linear index (triangle n, edge m)

            // Again, bind to texture cache
            stage.y = stage_edge_values[nm];
            xmom.y  = xmom_edge_values[nm];
            ymom.y  = ymom_edge_values[nm];
            z_lr.y  = bed_edge_values[nm];
        }

        // Now we have values for this edge - both from left and right side.

        /*if (optimise_dry_cells) {
            // Check if flux calculation is necessary across this edge.
            // This check will exclude dry cells.
            // This will also optimise cases where zl != zr as
            // long as both are dry.

            if (fabs(ql[0] - zl) < epsilon &&
                fabs(qr[0] - zr) < epsilon) {
                // Cell boundary is dry

                already_computed_flux[ki] = call; // #k Done
                if (n >= 0) {
                    already_computed_flux[nm] = call; // #n Done
                }

                max_speed = 0.0;
                continue;
            }
        }*/


        /*if (fabs(zl-zr)>1.0e-10) { TODO:
            report_python_error(AT,"Discontinuous Elevation");
            return 0.0;
        }*/

        // Outward pointing normal vector (domain.normals[k, 2*i:2*i+2])
        ki2 = 2 * ki; // k*6 + i*2
        // Bad access order; use texture or shared mem
        normvec.x = normals[ki2];
        normvec.y = normals[ki2+1];

        // Edge flux computation (triangle k, edge i);
        // TODO: subroutine causes unspecified launch failure
        //fluxes = __flux_function_central__( stage, xmom, ymom, z_lr, normvec,
        //                                    epsilon, h0, limiting_threshold, g);


        // Multiply edgeflux by edgelength
        length = edgelengths[ki];
        fluxes.edgeflux_s *= length;
        fluxes.edgeflux_x *= length;
        fluxes.edgeflux_y *= length;


        // Use shared memory to accumulate flux for one triangle; requires
        // blockDim.x to be a multiple of 3 (should also be a multiple of 32)

#ifdef _SHARED_WRITEBACK_3_

        // Accumulate all update arrays in one sweep,
        // if shared memory is large enough

        update_shared[threadIdx.x]              = fluxes.edgeflux_s;
        update_shared[threadIdx.x+blockDim.x]   = fluxes.edgeflux_x;
        update_shared[threadIdx.x+2*blockDim.x] = fluxes.edgeflux_y;
        __syncthreads();


        // Each third of the threads in the block writes back one update
        // array with contiguous access; each thread sums the three edge
        // fluxes of one triangle

        if( threadIdx.x < blockDim.x/3) {
            // Calculate contiguous index
            i = threadIdx.x;
            k = blockIdx.x*(blockDim.x/3) + i;
            stage_explicit_update[k] = ( update_shared[3*i]
                                       + update_shared[3*i+1]
                                       + update_shared[3*i+2]) / areas[k];
        } else if( threadIdx.x < 2*(blockDim.x/3) ) {
            i = threadIdx.x - (blockDim.x/3);
            k = blockIdx.x*(blockDim.x/3) + i;
            xmom_explicit_update[k] = ( update_shared[blockDim.x+3*i]
                                      + update_shared[blockDim.x+3*i+1]
                                      + update_shared[blockDim.x+3*i+2]) / areas[k];
        } else {
            i = threadIdx.x - 2*(blockDim.x/3);
            k = blockIdx.x*(blockDim.x/3) + i;
            ymom_explicit_update[k] = ( update_shared[2*blockDim.x+3*i]
                                      + update_shared[2*blockDim.x+3*i+1]
                                      + update_shared[2*blockDim.x+3*i+2] ) / areas[k];
        }
        __syncthreads();

#else

        // Write each update array back by itself;
        // only the first third of the threads is busy
        update_shared[threadIdx.x] = fluxes.edgeflux_s;
        __syncthreads();
        if( threadIdx.x < blockDim.x/3 ) {
            i = threadIdx.x;
            k = blockIdx.x*(blockDim.x/3) + i;
            stage_explicit_update[k] = ( update_shared[3*i]
                                       + update_shared[3*i+1]
                                       + update_shared[3*i+2] ) / areas[k];
        }
        __syncthreads();


        update_shared[threadIdx.x] = fluxes.edgeflux_x;
        __syncthreads();
        if( threadIdx.x < blockDim.x/3 ) {
            i = threadIdx.x;
            k = blockIdx.x*(blockDim.x/3) + i;
            xmom_explicit_update[k] = ( update_shared[3*i]
                                      + update_shared[3*i+1]
                                      + update_shared[3*i+2] ) / areas[k];
        }
        __syncthreads();

        update_shared[threadIdx.x] = fluxes.edgeflux_y;
        __syncthreads();
        if( threadIdx.x < blockDim.x/3 ) {
            i = threadIdx.x;
            k = blockIdx.x*(blockDim.x/3) + i;
            ymom_explicit_update[k] = ( update_shared[3*i]
                                      + update_shared[3*i+1]
                                      + update_shared[3*i+2] ) / areas[k];
        }
        __syncthreads();

#endif

        // Likewise, get and write the maximum speed within each triangle:
        // every thread stores the max speed of its edge, then one thread
        // per triangle reduces over the triangle's three edges.
        update_shared[threadIdx.x] = fluxes.max_speed;
        __syncthreads();
        if( threadIdx.x < blockDim.x/3 ) {
            i = threadIdx.x;
            k = blockIdx.x*(blockDim.x/3) + i;
            fluxes.max_speed   = fmax( update_shared[3*i], update_shared[3*i+1] );
            max_speed_array[k] = fmax( fluxes.max_speed, update_shared[3*i+2] );
        }
        __syncthreads(); // all reads done before the next iteration overwrites update_shared

    } // End edge ki

    // computation of timestep in a separate routine because of triangle-wise access
}


/* *********** KERNEL WRAPPER FUNCTIONS ************************** */

extern "C" void _set_to_default( double* edge, double* xmom, double* ymom, size_t N, double def) {

    __set_arrays_to_default__ <<< _launcher_.gridDim, _launcher_.blockDim >>> ( edge, xmom, ymom, N, def);
    safecall(cudaThreadSynchronize());
}

/*extern "C" void _set_to_default( double* edge, size_t N, double def) {

    __set_to_default__ <<< _launcher_.gridDim, _launcher_.blockDim >>> ( edge, N, def);
    safecall(cudaThreadSynchronize());
}*/
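
// Example call (a sketch; assumes the device arrays were allocated via
// allocDeviceMemory and launch dimensions were set via setKernelDims):
//
//   _set_to_default( stage_update_gpu, xmom_update_gpu, ymom_update_gpu,
//                    number_of_elements, 0.0 );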

extern "C" double _compute_fluxes_central(
        int number_of_elements,
        double timestep,
        double epsilon,
        double H0,
        double g,
        long* neighbours,
        long* neighbour_edges,
        double* normals,
        double* edgelengths,
        double* radii,
        double* areas,
        long* tri_full_flag,
        double* stage_edge_values,
        double* xmom_edge_values,
        double* ymom_edge_values,
        double* bed_edge_values,
        double* stage_boundary_values,
        double* xmom_boundary_values,
        double* ymom_boundary_values,
        double* stage_explicit_update,
        double* xmom_explicit_update,
        double* ymom_explicit_update,
        double* max_speed_array,
        int optimise_dry_cells) {

    static long call = 1; // Static local variable flagging already computed flux
    int i;
    // Start computation
    call++; // Flag 'id' of flux calculation for this timestep

    // prepare memory for timestep reduction (TODO: (de)allocate only once)
    const size_t reduction_size = _launcher_.gridDim*sizeof(double);
    double* times_out     = (double*)allocHostMemory( reduction_size );
    double* times_out_gpu = (double*)allocDeviceMemory( reduction_size );

    printf("shared mem mult: %i\n", SHARED_MEM_MULT);

    if( 0 != _launcher_.blockDim%3 ) {
        fprintf(stderr,"error: blockDim required to be a multiple of 3!\n");
    }

    __compute_fluxes_central_kernel__ <<< _launcher_.gridDim, _launcher_.blockDim,
                                          _launcher_.blockDim*sizeof(double)*SHARED_MEM_MULT >>> (
    //__compute_fluxes_central_kernel__ <<< _launcher_.gridDim, BLOCKDIM >>> ( // for static shared memory size
            number_of_elements,
            timestep,
            epsilon,
            H0,
            g,
            neighbours,
            neighbour_edges,
            normals,
            edgelengths,
            areas,
            stage_edge_values,
            xmom_edge_values,
            ymom_edge_values,
            bed_edge_values,
            stage_boundary_values,
            xmom_boundary_values,
            ymom_boundary_values,
            stage_explicit_update,
            xmom_explicit_update,
            ymom_explicit_update,
            max_speed_array,
            optimise_dry_cells);


    safecall(cudaThreadSynchronize()); // prevents overlap of kernels

    // Some timestepping debug: (timestep 1.0)
    //printKernelDims();
    //_set_to_default( max_speed_array, radii, areas, number_of_elements, 3.3); //

    __compute_time_step__ <<< _launcher_.gridDim, _launcher_.blockDim, _launcher_.blockDim*sizeof(double) >>> (
            tri_full_flag,
            max_speed_array,
            radii,
            number_of_elements,
            epsilon,
            timestep,
            times_out_gpu );

    safecall(cudaThreadSynchronize());

    // Finish the reduction on the CPU: one partial minimum per block
    copyDeviceToHost( times_out, times_out_gpu, reduction_size );
    for( i=0; i < _launcher_.gridDim; ++i) {
        timestep = fmin( timestep, times_out[i] );
    }
    //printf("\ntimestep = %f\n",timestep);
    //fflush(stdout);

    freeDeviceMemory( times_out_gpu );
    freeHostMemory( times_out );

    return timestep;
}