Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

compute_fluxes.c @ 9329

Last change on this file since 9329 was 9017, checked in by steve, 12 years ago
Adding in Zhe (John) Weng's anuga_cuda code as obtained from googlecode https://code.google.com/p/anuga-cuda
Property svn:executable set to ``*
File size: 33.6 KB

Line
1
2	// OpenHMPP ANUGA compute_fluxes
3	//
4	// Zhe Weng 2013
5
6
7	//#include<stdio.h>
8	//#include<stdlib.h>
9	//#include<string.h>
10	//#include<math.h>
11	//#include<assert.h>
12
13	#include "hmpp_fun.h"
14
15	#ifndef TOLERANCE
16	#define TOLERANCE 1e-8
17	#endif
18
19	#define USING_ORIGINAL_CUDA
20	//double max(double a, double b)
21	//{ return (a >= b) ? a : b; }
22	//
23	//double min(double a, double b)
24	//{ return (a <= b) ? a : b; }
25
26
27	/*
28	double sqrt(double x)
29	{
30	double mx = 32000;
31	double mn = 0;
32	while(mx - mn > 1e-9)
33	{
34	double md = (mx + mn) / 2;
35	if(md * md > x)
36	mx = md;
37	else
38	mn = md;
39	}
40	return mx;
41	}
42	*/
43
44
45	/*
46	void spe_bubble_sort(int* _list , long* neighbours, int k)
47	{
48	int temp;
49
50	if ( neighbours[_list[2]]>=0 and neighbours[ _list[2] ] < k and (neighbours[_list[1]]<0 or neighbours[_list[2]]<neighbours[_list[1]]) )
51	{
52	temp = _list[2];
53	_list[2] = _list[1];
54	_list[1] = temp;
55	}
56	if ( neighbours[_list[1]]>=0 and neighbours[ _list[1] ] < k and (neighbours[_list[0]]<0 or neighbours[_list[1]]<neighbours[_list[0]]) )
57	{
58	temp = _list[1];
59	_list[1] = _list[0];
60	_list[0] = temp;
61	}
62	if ( neighbours[_list[2]]>=0 and neighbours[ _list[2] ] < k and (neighbours[_list[1]]<0 or neighbours[_list[2]]<neighbours[_list[1]]) )
63	{
64	temp = _list[2];
65	_list[2] = _list[1];
66	_list[1] = temp;
67	}
68	}
69	*/
70
71	int _rotate(double *q, double n1, double n2)
72	{
73	/*Rotate the momentum component q (q[1], q[2])
74	from x,y coordinates to coordinates based on normal vector (n1, n2).
75
76	Result is returned in array 3x1 r
77	To rotate in opposite direction, call rotate with (q, n1, -n2)
78
79	Contents of q are changed by this function */
80
81
82	double q1, q2;
83
84	q1 = q[1]; // uh momentum
85	q2 = q[2]; // vh momentum
86
87	// Rotate
88	q[1] = n1 * q1 + n2*q2;
89	q[2] = -n2 * q1 + n1*q2;
90
91	return 0;
92	}
93
94
95	double _compute_speed(double *uh,
96	double *h,
97	double epsilon,
98	double h0,
99	double limiting_threshold)
100	{
101	double u;
102
103	if (*h < limiting_threshold) {
104	// Apply limiting of speeds according to the ANUGA manual
105	if (*h < epsilon) {
106	*h = 0.0; // Could have been negative
107	u = 0.0;
108	} else {
109	u = uh / (h + h0 / *h);
110	}
111
112
113	// Adjust momentum to be consistent with speed
114	uh = u *h;
115	} else {
116	// We are in deep water - no need for limiting
117	u = uh / h;
118	}
119
120	return u;
121	}
122
123
124	int _flux_function_central(
125	#ifdef USING_ORIGINAL_CUDA
126	double q_left, double q_right,
127	double z_left, double z_right,
128	double n1, double n2,
129	double epsilon,
130	double h0,
131	double limiting_threshold,
132	double g,
133	double *edgeflux,
134	double *max_speed)
135	#else
136	double q_left0,
137	double q_left1,
138	double q_left2,
139	double q_right0,
140	double q_right1,
141	double q_right2,
142	double z_left,
143	double z_right,
144	double n1,
145	double n2,
146	double epsilon,
147	double h0,
148	double limiting_threshold,
149	double g,
150	double *edgeflux0,
151	double *edgeflux1,
152	double *edgeflux2,
153	double *max_speed)
154	#endif
155	{
156	#ifdef USING_ORIGINAL_CUDA
157	int i;
158	#endif
159
160	double w_left, h_left, uh_left, vh_left, u_left;
161	double w_right, h_right, uh_right, vh_right, u_right;
162	double s_min, s_max, soundspeed_left, soundspeed_right;
163	double denom, inverse_denominator, z;
164	double temp;
165
166	// Workspace (allocate once, use many)
167	double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3];
168
169	// Copy conserved quantities to protect from modification
170	#ifdef USING_ORIGINAL_CUDA
171	q_left_rotated[0] = q_left[0];
172	q_right_rotated[0] = q_right[0];
173	q_left_rotated[1] = q_left[1];
174	q_right_rotated[1] = q_right[1];
175	q_left_rotated[2] = q_left[2];
176	q_right_rotated[2] = q_right[2];
177
178	// Align x- and y-momentum with x-axis
179	//_rotate(q_left_rotated, n1, n2);
180	q_left_rotated[1] = n1q_left[1] + n2q_left[2];
181	q_left_rotated[2] = -n2q_left[1] + n1q_left[2];
182
183	//_rotate(q_right_rotated, n1, n2);
184	q_right_rotated[1] = n1q_right[1] + n2q_right[2];
185	q_right_rotated[2] = -n2q_right[1] + n1q_right[2];
186	#else
187	q_left_rotated[0] = q_left0;
188	q_right_rotated[0] = q_right0;
189	q_left_rotated[1] = q_left1;
190	q_right_rotated[1] = q_right1;
191	q_left_rotated[2] = q_left2;
192	q_right_rotated[2] = q_right2;
193
194	// Align x- and y-momentum with x-axis
195	//_rotate(q_left_rotated, n1, n2);
196	q_left_rotated[1] = n1q_left1 + n2q_left2;
197	q_left_rotated[2] = -n2q_left1 + n1q_left2;
198
199	//_rotate(q_right_rotated, n1, n2);
200	q_right_rotated[1] = n1q_right1 + n2q_right2;
201	q_right_rotated[2] = -n2q_right1 + n1q_right2;
202	#endif
203
204
205	if (fabs(z_left - z_right) > 1.0e-10) {
206	//report_python_error(AT, "Discontinuous Elevation");
207	return 0.0;
208	}
209	z = 0.5 * (z_left + z_right); // Average elevation values.
210
211	// Compute speeds in x-direction
212	w_left = q_left_rotated[0];
213	h_left = w_left - z;
214	uh_left = q_left_rotated[1];
215	u_left = _compute_speed(&uh_left, &h_left,
216	epsilon, h0, limiting_threshold);
217	w_right = q_right_rotated[0];
218	h_right = w_right - z;
219	uh_right = q_right_rotated[1];
220	u_right = _compute_speed(&uh_right, &h_right,
221	epsilon, h0, limiting_threshold);
222	// Momentum in y-direction
223	vh_left = q_left_rotated[2];
224	vh_right = q_right_rotated[2];
225
226	// Limit y-momentum if necessary
227	// Leaving this out, improves speed significantly (Ole 27/5/2009)
228	// All validation tests pass, so do we really need it anymore?
229	_compute_speed(&vh_left, &h_left,
230	epsilon, h0, limiting_threshold);
231	_compute_speed(&vh_right, &h_right,
232	epsilon, h0, limiting_threshold);
233
234	// Maximal and minimal wave speeds
235	soundspeed_left = sqrt(g * h_left);
236	soundspeed_right = sqrt(g * h_right);
237
238	// Code to use fast square root optimisation if desired.
239	// Timings on AMD 64 for the Okushiri profile gave the following timings
240	//
241	// SQRT Total Flux
242	//=============================
243	//
244	// Ref 405s 152s
245	// Fast (dbl) 453s 173s
246	// Fast (sng) 437s 171s
247	//
248	// Consequently, there is currently (14/5/2009) no reason to use this
249	// approximation.
250
251	//soundspeed_left = fast_squareroot_approximation(g*h_left);
252	//soundspeed_right = fast_squareroot_approximation(g*h_right);
253
254	s_max = fmax(u_left + soundspeed_left, u_right + soundspeed_right);
255	if (s_max < 0.0) {
256	s_max = 0.0;
257	}
258
259	s_min = fmin(u_left - soundspeed_left, u_right - soundspeed_right);
260	if (s_min > 0.0) {
261	s_min = 0.0;
262	}
263
264	// Flux formulas
265	flux_left[0] = u_left*h_left;
266	flux_left[1] = u_left * uh_left + 0.5 * g * h_left*h_left;
267	flux_left[2] = u_left*vh_left;
268
269	flux_right[0] = u_right*h_right;
270	flux_right[1] = u_right * uh_right + 0.5 * g * h_right*h_right;
271	flux_right[2] = u_right*vh_right;
272
273	// Flux computation
274	denom = s_max - s_min;
275	if (denom < epsilon) { // FIXME (Ole): Try using h0 here
276	//memset(edgeflux, 0, 3 * sizeof (double));
277	edgeflux[0]=0.0;
278	edgeflux[1]=0.0;
279	edgeflux[2]=0.0;
280	*max_speed = 0.0;
281	}
282	else {
283	inverse_denominator = 1.0 / denom;
284	#ifdef USING_ORIGINAL_CUDA
285	//#pragma hmppcg noparallel
286	//for (i = 0; i < 3; i++) {
287	// edgeflux[i] = s_max * flux_left[i] - s_min * flux_right[i];
288	// edgeflux[i] += s_max s_min (q_right_rotated[i] -q_left_rotated[i]);
289	// edgeflux[i] *= inverse_denominator;
290	//}
291	edgeflux[0] = s_max * flux_left[0] - s_min * flux_right[0];
292	edgeflux[0]+=s_max s_min (q_right_rotated[0] -q_left_rotated[0]);
293	edgeflux[0] *= inverse_denominator;
294
295	edgeflux[1] = s_max * flux_left[1] - s_min * flux_right[1];
296	edgeflux[1]+=s_max s_min (q_right_rotated[1] -q_left_rotated[1]);
297	edgeflux[1] *= inverse_denominator;
298
299	edgeflux[2] = s_max * flux_left[2] - s_min * flux_right[2];
300	edgeflux[2]+=s_max s_min (q_right_rotated[2] -q_left_rotated[2]);
301	edgeflux[2] *= inverse_denominator;
302	#else
303	edgeflux0 = s_max flux_left[0] - s_min * flux_right[0];
304	edgeflux0 += s_max s_min *(q_right_rotated[0] -q_left_rotated[0]);
305	edgeflux0 = inverse_denominator;
306
307	edgeflux1 = s_max flux_left[1] - s_min * flux_right[1];
308	edgeflux1 += s_max s_min *(q_right_rotated[1] -q_left_rotated[1]);
309	edgeflux1 = inverse_denominator;
310
311	edgeflux2 = s_max flux_left[2] - s_min * flux_right[2];
312	edgeflux2 += s_max s_min *(q_right_rotated[2] -q_left_rotated[2]);
313	edgeflux2 = inverse_denominator;
314	#endif
315
316	// Maximal wavespeed
317	*max_speed = fmax(fabs(s_max), fabs(s_min));
318
319	// Rotate back
320	//_rotate(edgeflux, n1, -n2);
321	#ifdef USING_ORIGINAL_CUDA
322	temp = edgeflux[1];
323	edgeflux[1] = n1* temp -n2*edgeflux[2];
324	edgeflux[2] = n2temp + n1edgeflux[2];
325	#else
326	temp = *edgeflux1;
327	edgeflux1 = n1 temp -n2* *edgeflux2;
328	edgeflux2 = n2temp + n1* *edgeflux2;
329	#endif
330	}
331
332	return 0;
333	}
334
335
336
337
338
339
340
341	/*****************************************/
342	/* The CUDA compute fluex function */
343	/*****************************************/
344
345
346	#ifdef USING_LOCAL_DIRECTIVES
347	#pragma hmpp cf_central codelet, target=CUDA args[*].transfer=atcall
348	#endif
349	void compute_fluxes_central_structure_CUDA(
350	int N,
351	int N3,
352	int N6,
353	int N2,
354
355	double timestep[N],
356	long neighbours[N3],
357	long neighbour_edges[N3],
358	double normals[N6],
359	double edgelengths[N3],
360	double radii[N],
361	double areas[N],
362	long tri_full_flag[N],
363	double stage_edge_values[N3],
364	double xmom_edge_values[N3],
365	double ymom_edge_values[N3],
366	double bed_edge_values[N3],
367	double stage_boundary_values[N2],
368	double xmom_boundary_values[N2],
369	double ymom_boundary_values[N2],
370	double stage_explicit_update[N],
371	double xmom_explicit_update[N],
372	double ymom_explicit_update[N],
373	double max_speed_array[N],
374
375	double evolve_max_timestep,
376	double g,
377	double epsilon,
378	double h0,
379	double limiting_threshold,
380	int optimise_dry_cells)
381	{
382	int k;
383	int i, m, n;
384	int ki, nm = 0, ki2;
385
386	double max_speed, max_speed_total=0 , length, inv_area, zl, zr;
387
388	#ifdef USING_ORIGINAL_CUDA
389	double ql[3], qr[3], edgeflux[3];
390	#else
391	double ql0, ql1, ql2,
392	qr0, qr1, qr2,
393	edgeflux0, edgeflux1, edgeflux2;
394	#endif
395
396	#pragma hmppcg gridify(k), private(max_speed, max_speed_total, length,&
397	#pragma hmppcg & inv_area, zl, zr, i, m, n, ki, nm, ki2, ql, qr, &
398	#pragma hmppcg & edgeflux), global(timestep, neighbours, &
399	#pragma hmppcg & neighbour_edges, normals, edgelengths, radii, &
400	#pragma hmppcg & areas, tri_full_flag, stage_edge_values, &
401	#pragma hmppcg & xmom_boundary_values, ymom_boundary_values, &
402	#pragma hmppcg & stage_explicit_update, xmom_explicit_update, &
403	#pragma hmppcg & ymom_explicit_update, max_speed_array, &
404	#pragma hmppcg & evolve_max_timestep, g, epsilon, h0, &
405	#pragma hmppcg & limiting_threshold, optimise_dry_cells)
406	for(k=0; k<N; k++)
407	{
408	timestep[k] = evolve_max_timestep;
409
410	// Loop through neighbours and compute edge flux for each
411	for (i = 0; i < 3; i++)
412	{
413	ki = k * 3 + i; // Linear index to edge i of triangle k
414
415	n = neighbours[ki];
416
417	#ifdef USING_ORIGINAL_CUDA
418	ql[0] = stage_edge_values[ki];
419	ql[1] = xmom_edge_values[ki];
420	ql[2] = ymom_edge_values[ki];
421	#else
422	ql0 = stage_edge_values[ki];
423	ql1 = xmom_edge_values[ki];
424	ql2 = ymom_edge_values[ki];
425	#endif
426	zl = bed_edge_values[ki];
427
428	if (n < 0) {
429	m = -n - 1; // Convert negative flag to boundary index
430
431	#ifdef USING_ORIGINAL_CUDA
432	qr[0] = stage_boundary_values[m];
433	qr[1] = xmom_boundary_values[m];
434	qr[2] = ymom_boundary_values[m];
435	#else
436	qr0 = stage_boundary_values[m];
437	qr1 = xmom_boundary_values[m];
438	qr2 = ymom_boundary_values[m];
439	#endif
440	zr = zl; // Extend bed elevation to boundary
441	} else {
442	m = neighbour_edges[ki];
443	nm = n * 3 + m; // Linear index (triangle n, edge m)
444
445	#ifdef USING_ORIGINAL_CUDA
446	qr[0] = stage_edge_values[nm];
447	qr[1] = xmom_edge_values[nm];
448	qr[2] = ymom_edge_values[nm];
449	#else
450	qr0 = stage_edge_values[nm];
451	qr1 = xmom_edge_values[nm];
452	qr2 = ymom_edge_values[nm];
453	#endif
454	zr = bed_edge_values[nm];
455	}
456
457	if (optimise_dry_cells){
458	#ifdef USING_ORIGINAL_CUDA
459	if ( fabs(ql[0] - zl) < epsilon &&
460	fabs(qr[0] - zr) < epsilon) {
461	#else
462	if ( fabs(ql0 - zl) < epsilon &&
463	fabs(qr0 - zr) < epsilon) {
464	#endif
465	max_speed = 0.0;
466	continue;
467	}
468	}
469
470	ki2 = 2 * ki; //k6 + i2
471	#ifdef USING_ORIGINAL_CUDA
472	_flux_function_central(ql, qr, zl, zr,
473	normals[ki2], normals[ki2 + 1],
474	epsilon, h0, limiting_threshold, g,
475	edgeflux, &max_speed);
476
477	length = edgelengths[ki];
478	edgeflux[0] *= length;
479	edgeflux[1] *= length;
480	edgeflux[2] *= length;
481
482
483	stage_explicit_update[k] -= edgeflux[0];
484	xmom_explicit_update[k] -= edgeflux[1];
485	ymom_explicit_update[k] -= edgeflux[2];
486	#else
487	_flux_function_central(
488	ql0, ql1, ql2,
489	qr0, qr1, qr2,
490	zl,
491	zr,
492	normals[ki2], normals[ki2 + 1],
493	epsilon, h0, limiting_threshold, g,
494	&edgeflux0,
495	&edgeflux1,
496	&edgeflux2,
497	&max_speed);
498
499	length = edgelengths[ki];
500	edgeflux0 *= length;
501	edgeflux1 *= length;
502	edgeflux2 *= length;
503
504
505	stage_explicit_update[k] -= edgeflux0;
506	xmom_explicit_update[k] -= edgeflux1;
507	ymom_explicit_update[k] -= edgeflux2;
508	#endif
509	if (tri_full_flag[k] == 1) {
510	//if (max_speed > elements[Depsilon]) {
511	if ( max_speed > epsilon) {
512	timestep[k] = fmin(timestep[k], radii[k] / max_speed);
513	if (n >= 0) {
514	timestep[k] = fmin(timestep[k], radii[n] / max_speed);
515	}
516	}
517	}
518
519	if (n < 0 \|\| n > k){
520	max_speed_total = fmax(max_speed_total, max_speed);
521	}
522	} // End edge i (and neighbour n)
523
524	inv_area = 1.0 / areas[k];
525	stage_explicit_update[k] *= inv_area;
526	xmom_explicit_update[k] *= inv_area;
527	ymom_explicit_update[k] *= inv_area;
528
529	max_speed_array[k] = max_speed_total;
530	}
531	}
532
533
534
535	#ifdef USING_LOCAL_DIRECTIVES
536	#pragma hmpp cf_central_single codelet, target=CUDA args[*].transfer=atcall
537	#endif
538	void compute_fluxes_central_structure_cuda_single(
539	int N,
540	int N3,
541	int N6,
542	int N2,
543
544	double timestep[N],
545	int neighbours[N3],
546	int neighbour_edges[N3],
547	double normals[N6],
548	double edgelengths[N3],
549	double radii[N],
550	double areas[N],
551	int tri_full_flag[N],
552	double stage_edge_values[N3],
553	double xmom_edge_values[N3],
554	double ymom_edge_values[N3],
555	double bed_edge_values[N3],
556	double stage_boundary_values[N2],
557	double xmom_boundary_values[N2],
558	double ymom_boundary_values[N2],
559	double stage_explicit_update[N],
560	double xmom_explicit_update[N],
561	double ymom_explicit_update[N],
562	double max_speed_array[N],
563
564	double evolve_max_timestep,
565	double g,
566	double epsilon,
567	double h0,
568	double limiting_threshold,
569	int optimise_dry_cells)
570	{
571	int k;
572
573
574	for (k=0; k < N; k++)
575	{
576	double w_left, h_left, uh_left, vh_left, u_left;
577	double w_right, h_right, uh_right, vh_right, u_right;
578	double s_min, s_max, soundspeed_left, soundspeed_right;
579	double denom, inverse_denominator, z;
580	double temp;
581
582
583	double q_left_0, q_left_1, q_left_2;
584	double q_right_0, q_right_1, q_right_2;
585	double q_left_rotated_0, q_left_rotated_1, q_left_rotated_2;
586	double q_right_rotated_0, q_right_rotated_1, q_right_rotated_2;
587	double flux_left_0, flux_left_1, flux_left_2;
588	double flux_right_0, flux_right_1, flux_right_2;
589	double edgeflux_0, edgeflux_1, edgeflux_2;
590
591
592	double max_speed, max_speed_total;
593	double z_left, z_right;
594	double n1, n2;
595	double length, inv_area;
596
597
598	int i, m, ni;
599	int ki, nm;
600
601	max_speed_total = 0;
602	stage_explicit_update[k] = 0;
603	xmom_explicit_update[k] = 0;
604	ymom_explicit_update[k] = 0;
605
606	for (i=0; i< 3; i++)
607	{
608	ki= k*3 + i;
609
610	q_left_0 = stage_edge_values[ki];
611	q_left_1 = xmom_edge_values[ki];
612	q_left_2 = ymom_edge_values[ki];
613	z_left = bed_edge_values[ki];
614	ni = neighbours[ki];
615	if (ni<0) {
616	m= -ni -1;
617
618	q_right_0 = stage_boundary_values[m];
619	q_right_1 = xmom_boundary_values[m];
620	q_right_2 = ymom_boundary_values[m];
621	z_right = z_left;
622	} else {
623	m = neighbour_edges[ki];
624	nm = ni *3 + m;
625
626	q_right_0 = stage_edge_values[nm];
627	q_right_1 = xmom_edge_values[nm];
628	q_right_2 = ymom_edge_values[nm];
629	z_right = bed_edge_values[nm];
630	}
631
632	if(optimise_dry_cells) {
633	if(fabs(q_left_0 - z_left) < epsilon &&
634	fabs(q_right_0 - z_right) < epsilon ) {
635	max_speed = 0.0;
636	continue;
637	}
638	}
639	n1 = normals[2*ki];
640	n2 = normals[2*ki + 1];
641	/////////////////////////////////////////////////////////
642
643
644	// Copy conserved quantities to protect from modification
645	q_left_rotated_0 = q_left_0;
646	q_right_rotated_0 = q_right_0;
647	q_left_rotated_1 = q_left_1;
648	q_right_rotated_1 = q_right_1;
649	q_left_rotated_2 = q_left_2;
650	q_right_rotated_2 = q_right_2;
651
652	// Align x- and y-momentum with x-axis
653	//_rotate(q_left_rotated, n1, n2);
654	q_left_rotated_1 = n1q_left_1 + n2q_left_2;
655	q_left_rotated_2 = -n2q_left_1 + n1q_left_2;
656
657	//_rotate(q_right_rotated, n1, n2);
658	q_right_rotated_1 = n1q_right_1 + n2q_right_2;
659	q_right_rotated_2 = -n2q_right_1 + n1q_right_2;
660
661
662	if (fabs(z_left - z_right) > 1.0e-10) {
663	//report_python_error(AT, "Discontinuous Elevation");
664	//return 0.0;
665	}
666	z = 0.5 * (z_left + z_right); // Average elevation values.
667
668	// Compute speeds in x-direction
669	w_left = q_left_rotated_0;
670	h_left = w_left - z;
671	uh_left = q_left_rotated_1;
672	//u_left = _compute_speed(&uh_left, &h_left,
673	// epsilon, h0, limiting_threshold);
674
675	if (h_left < limiting_threshold) {
676
677	if (h_left < epsilon) {
678	h_left = 0.0; // Could have been negative
679	u_left = 0.0;
680	} else {
681	u_left = uh_left/(h_left + h0/ h_left);
682	}
683
684	uh_left = u_left * h_left;
685	} else {
686	u_left = uh_left/ h_left;
687	}
688
689	w_right = q_right_rotated_0;
690	h_right = w_right - z;
691	uh_right = q_right_rotated_1;
692	//u_right = _compute_speed(&uh_right, &h_right,
693	// epsilon, h0, limiting_threshold);
694
695	if (h_right < limiting_threshold) {
696
697	if (h_right < epsilon) {
698	h_right = 0.0; // Could have been negative
699	u_right = 0.0;
700	} else {
701	u_right = uh_right/(h_right + h0/ h_right);
702	}
703
704	uh_right = u_right * h_right;
705	} else {
706	u_right = uh_right/ h_right;
707	}
708
709
710
711
712	// Momentum in y-direction
713	vh_left = q_left_rotated_2;
714	vh_right = q_right_rotated_2;
715
716	soundspeed_left = sqrt(g * h_left);
717	soundspeed_right = sqrt(g * h_right);
718
719	s_max = u_left + soundspeed_left;
720	if(s_max < u_right + soundspeed_right)
721	s_max = u_right + soundspeed_right;
722
723
724	if (s_max < 0.0) {
725	s_max = 0.0;
726	}
727
728	s_min = u_left - soundspeed_left;
729	if (s_min > u_right - soundspeed_right)
730	s_min = u_right - soundspeed_right;
731
732	if (s_min > 0.0) {
733	s_min = 0.0;
734	}
735
736	// Flux formulas
737	flux_left_0 = u_left*h_left;
738	flux_left_1 = u_left * uh_left + 0.5 * g * h_left*h_left;
739	flux_left_2 = u_left*vh_left;
740
741	flux_right_0 = u_right*h_right;
742	flux_right_1 = u_right * uh_right + 0.5 * g * h_right*h_right;
743	flux_right_2 = u_right*vh_right;
744
745	// Flux computation
746	denom = s_max - s_min;
747	if (denom < epsilon) { // FIXME (Ole): Try using h0 here
748	edgeflux_0 = 0;
749	edgeflux_1 = 0;
750	edgeflux_2 = 0;
751
752	max_speed = 0.0;
753	}
754	else {
755	inverse_denominator = 1.0 / denom;
756	//for (j = 0; j < 3; j++) {
757	// edgeflux[j] = s_max flux_left[j] -s_min flux_right[j];
758	// edgeflux[j] += s_max * s_min * (
759	// q_right_rotated[j] - q_left_rotated[j]);
760	// edgeflux[j] *= inverse_denominator;
761	//}
762	edgeflux_0 = s_max flux_left_0 -s_min flux_right_0;
763	edgeflux_0 += s_max * s_min * (
764	q_right_rotated_0 - q_left_rotated_0);
765	edgeflux_0 *= inverse_denominator;
766
767	edgeflux_1 = s_max flux_left_1 -s_min flux_right_1;
768	edgeflux_1 += s_max * s_min * (
769	q_right_rotated_1 - q_left_rotated_1);
770	edgeflux_1 *= inverse_denominator;
771
772	edgeflux_2 = s_max flux_left_2 -s_min flux_right_2;
773	edgeflux_2 += s_max * s_min * (
774	q_right_rotated_2 - q_left_rotated_2);
775	edgeflux_2 *= inverse_denominator;
776
777
778
779	// Maximal wavespeed
780	//max_speed = max(fabs(s_max), fabs(s_min));
781	max_speed = fabs(s_max);
782	if (max_speed < fabs(s_min))
783	max_speed = fabs(s_min);
784
785	// Rotate back
786	//_rotate(edgeflux, n1, -n2);
787	temp = edgeflux_1;
788	edgeflux_1 = n1* temp -n2*edgeflux_2;
789	edgeflux_2 = n2temp + n1edgeflux_2;
790	}
791
792	//////////////////////////////////////////////////////
793
794	length = edgelengths[ki];
795	edgeflux_0 *= length;
796	edgeflux_1 *= length;
797	edgeflux_2 *= length;
798
799	stage_explicit_update[k]-= edgeflux_0;
800	xmom_explicit_update[k] -= edgeflux_1;
801	ymom_explicit_update[k] -= edgeflux_2;
802
803	if (tri_full_flag[k] == 1) {
804	if (max_speed > epsilon) {
805	//timestep[k] = min(timestep[k], radii[k]/ max_speed);
806	if (timestep[k] > radii[k]/max_speed)
807	timestep[k] = radii[k]/ max_speed;
808
809	if (ni>=0){
810	//timestep[k] =min(timestep[k], radii[ni]/max_speed);
811	if (timestep[k] > radii[ni]/max_speed)
812	timestep[k] = radii[ni]/ max_speed;
813
814	}
815	}
816	}
817	if (ni < 0 \|\| ni > k){
818	max_speed_total = fmax(max_speed_total, max_speed);
819	}
820	}
821
822	inv_area = 1.0 / areas[k];
823	stage_explicit_update[k] *= inv_area;
824	xmom_explicit_update[k] *= inv_area;
825	ymom_explicit_update[k] *= inv_area;
826
827	max_speed_array[k] = max_speed_total;
828	}
829	}
830
831
832	#ifdef USING_MAIN
833	int main(int argc, char *argv[])
834	{
835	int N, N2, optimise_dry_cells;
836	int cnt_s =0, cnt_x =0, cnt_y =0, cnt_m = 0;
837
838	double evolve_max_timestep,
839	g,
840	epsilon,
841	h0,
842	limiting_threshold;
843
844	double * sv, // stage_vertex_values
845	* sb, // stage_boundary_values
846	* se, // stage_edge_values,
847	* sc, // stage_centroid_values,
848	* su, // stage_explicit_update
849
850	* be, // bed_edge_values,
851	* bc, // bed_centroid_values,
852
853	* xb, // xmom_boundary_values
854	* xe, // xmom_edge_values
855	* xu, // xmom_explicit_update
856
857	* yb, // ymom_boundary_values
858	* ye, // ymom_edge_values
859	* yu, // ymom_explicit_update
860
861	* normals,
862	* el, // edgelengths;
863	* radii,
864	* a, // areas,
865	* vc; // vertex_coordinates,
866
867	int * tri; // tri_full_flag
868	int * n, // neighbours
869	* ne; // neighbour_edges
870
871	double * timestep, * max_speed_array;
872
873	// Testing group
874	double * test_stage_explicit_update,
875	* test_xmom_explicit_update,
876	* test_ymom_explicit_update,
877	* test_max_speed_array,
878	* test_timestep;
879	int i;
880
881	char file_name[]="merimbula_cf.dat";
882
883	freopen(file_name, "r", stdin);
884
885	scanf("%d\n", &N);
886	scanf("%d\n", &N2);
887	scanf("%d\n", &optimise_dry_cells);
888	scanf("%lf\n",&evolve_max_timestep);
889	scanf("%lf\n",&g);
890	scanf("%lf\n",&epsilon);
891	scanf("%lf\n",&h0);
892	scanf("%lf\n",&limiting_threshold);
893
894
895	printf("%d %lf\n", N, g);
896
897
898	sv = (double )malloc( N3*sizeof(double));
899	assert(sv != NULL);
900	sb = (double )malloc( N2sizeof(double));
901	assert(sb != NULL);
902	se = (double )malloc( N3*sizeof(double));
903	assert(se != NULL);
904	sc = (double )malloc( Nsizeof(double));
905	assert(sc != NULL);
906	su = (double )malloc( Nsizeof(double));
907	assert(su != NULL);
908
909
910	be = (double )malloc( N3*sizeof(double));
911	assert(be != NULL);
912	bc = (double )malloc( Nsizeof(double));
913	assert(bc != NULL);
914
915
916	xb = (double )malloc( N2sizeof(double));
917	assert(xb != NULL);
918	xe = (double )malloc( N3*sizeof(double));
919	assert(xe != NULL);
920	xu = (double )malloc( Nsizeof(double));
921	assert(xu != NULL);
922
923	yb = (double )malloc( N2sizeof(double));
924	assert(yb != NULL);
925	ye = (double )malloc( N3*sizeof(double));
926	assert(ye != NULL);
927	yu = (double )malloc( Nsizeof(double));
928	assert(yu != NULL);
929
930
931	n = (int )malloc( N3*sizeof(int));
932	assert(n != NULL);
933	ne = (int )malloc( N3*sizeof(int));
934	assert(ne != NULL);
935
936
937	normals = (double )malloc( N6*sizeof(double));
938	assert(normals != NULL);
939	el = (double )malloc( N3*sizeof(double));
940	assert(el != NULL);
941	radii = (double )malloc( Nsizeof(double));
942	assert(radii != NULL);
943	a = (double )malloc( Nsizeof(double));
944	assert(a != NULL);
945
946	tri = (int )malloc( Nsizeof(int));
947	assert(tri != NULL);
948
949	vc = (double )malloc( N6*sizeof(double));
950	assert(vc != NULL);
951
952	timestep = (double )malloc(Nsizeof(double));
953	assert(timestep != NULL);
954	max_speed_array = (double )malloc(Nsizeof(double));
955	assert(max_speed_array != NULL);
956
957	test_stage_explicit_update = (double )malloc(Nsizeof(double));
958	assert(test_stage_explicit_update != NULL);
959	test_xmom_explicit_update = (double )malloc(Nsizeof(double));
960	assert(test_xmom_explicit_update != NULL);
961	test_ymom_explicit_update = (double )malloc(Nsizeof(double));
962	assert(test_ymom_explicit_update != NULL);
963	test_max_speed_array = (double )malloc(Nsizeof(double));
964	assert(test_max_speed_array != NULL);
965	test_timestep = (double )malloc(Nsizeof(double));
966	assert(test_timestep != NULL);
967
968
969	for(i=0; i < N; i++)
970	{
971	scanf("%lf %lf %lf\n", sv+i3, sv+i3 +1, sv+i*3 +2);
972	scanf("%lf %lf %lf\n", se+i3, se+i3 +1, se+i*3 +2);
973	scanf("%lf\n", sc+i);
974	scanf("%lf\n", su+i);
975
976	scanf("%lf %lf %lf\n",be+i3, be+i3 +1, be+i*3 +2);
977	scanf("%lf\n", bc+i);
978
979	scanf("%lf %lf %lf\n", xe+i3, xe+i3 +1, xe+i*3 +2);
980	scanf("%lf\n", xu+i);
981	scanf("%lf %lf %lf\n", ye+i3, ye+i3 +1, ye+i*3 +2);
982	scanf("%lf\n", yu+i);
983
984	scanf("%d %d %d\n", n+i3, n+i3 +1, n+i*3 +2);
985
986	scanf("%d %d %d\n", ne+i3, ne+i3 +1, ne+i*3 +2);
987
988
989	scanf("%lf %lf %lf %lf %lf %lf\n", normals+i6, normals+i6+1, normals+i6+2,normals+i6+3, normals+i6+4, normals+i6+5);
990
991	scanf("%lf %lf %lf\n", el+i3, el+i3 +1, el+i*3 +2);
992
993	scanf("%lf\n", radii+i);
994	scanf("%lf\n", a+i);
995	scanf("%d\n", tri+i);
996
997	scanf("%lf %lf\n", vc+i6, vc+i6 +1);
998	scanf("%lf %lf\n", vc+i6+2, vc+i6 +3);
999	scanf("%lf %lf\n", vc+i6+4, vc+i6 +5);
1000	}
1001
1002	for(i=0; i < N2; i++)
1003	{
1004	scanf("%lf\n", sb+i);
1005	scanf("%lf\n", xb+i);
1006	scanf("%lf\n", yb+i);
1007	}
1008
1009	cnt_m = 0;
1010	for(i=0; i < N; i++)
1011	{
1012	if (max_speed_array[i] != test_max_speed_array[i])
1013	cnt_m ++;
1014	}
1015	printf("\n --> Test initial input %d\n", cnt_m);
1016
1017	printf(" --> Enter Kernel\n");
1018	#pragma hmpp cf_central_single callsite
1019	compute_fluxes_central_structure_cuda_single(
1020	N,
1021	N*3,
1022	N*6,
1023	N2,
1024
1025	timestep,
1026	n,
1027	ne,
1028	normals,
1029	el,
1030	radii,
1031	a,
1032	tri,
1033	se,
1034	xe,
1035	ye,
1036	be,
1037	sb,
1038	xb,
1039	yb,
1040	su,
1041	xu,
1042	yu,
1043	max_speed_array,
1044
1045	evolve_max_timestep,
1046	g,
1047	epsilon,
1048	h0,
1049	limiting_threshold,
1050	optimise_dry_cells);
1051
1052	/*
1053	printf(" --> Enter C\n");
1054	compute_fluxes_central_structure_cuda_single( N, N3, N6, N2, timestep, n, ne, normals, el, radii, a, tri, se, xe, ye, be, sb, xb, yb, su, xu, yu, max_speed_array, evolve_max_timestep, g, epsilon, h0, limiting_threshold, optimise_dry_cells);
1055
1056	*/
1057
1058
1059	printf(" --> Enter Kernel\n");
1060	#pragma hmpp cf_central callsite
1061	compute_fluxes_central_structure_CUDA(
1062	N,
1063	N*3,
1064	N*6,
1065	N2,
1066
1067	timestep,
1068	n,
1069	ne,
1070	normals,
1071	el,
1072	radii,
1073	a,
1074	tri,
1075	se,
1076	xe,
1077	ye,
1078	be,
1079	sb,
1080	xb,
1081	yb,
1082	su,
1083	xu,
1084	yu,
1085	max_speed_array,
1086
1087	evolve_max_timestep,
1088	g,
1089	epsilon,
1090	h0,
1091	limiting_threshold,
1092	optimise_dry_cells);
1093
1094
1095
1096	printf(" --> Enter original C\n");
1097	compute_fluxes_central_structure_CUDA(
1098	N, N3, N6, N2,
1099	test_timestep, n, ne, normals, el, radii, a, tri,
1100	se, xe, ye, be, sb, xb, yb,
1101	test_stage_explicit_update,
1102	test_xmom_explicit_update,
1103	test_ymom_explicit_update,
1104	test_max_speed_array,
1105	evolve_max_timestep, g, epsilon, h0, limiting_threshold,
1106	optimise_dry_cells);
1107
1108
1109
1110	for (cnt_s=cnt_x=cnt_y=cnt_m=i=0; i < N; i++)
1111	{
1112	if ( fabs(su[i] - test_stage_explicit_update[i]) >= TOLERANCE)
1113	{
1114	cnt_s++;
1115	if (cnt_s <= 10)
1116	printf(" sta %d %lf %lf\n", i, su[i], test_stage_explicit_update[i]);
1117	}
1118	if ( fabs(xu[i] - test_xmom_explicit_update[i]) >= TOLERANCE)
1119	{
1120	cnt_x++;
1121	if (cnt_x <= 10)
1122	printf(" xmom %d %lf %lf\n", i, xu[i], test_xmom_explicit_update[i]);
1123	}
1124	if ( fabs(yu[i] - test_ymom_explicit_update[i]) >= TOLERANCE)
1125	{
1126	cnt_y++;
1127	if (cnt_y <= 10)
1128	printf(" ymom %d %lf %lf\n", i, yu[i], test_ymom_explicit_update[i]);
1129	}
1130	if ( fabs(max_speed_array[i] -test_max_speed_array[i]) >=TOLERANCE)
1131	{
1132	cnt_m++;
1133	if (cnt_m <= 10)
1134	printf(" max %d %lf %lf\n", i, max_speed_array[i], test_max_speed_array[i]);
1135	}
1136	}
1137	printf("se:%d xe:%d ye:%d max:%d errors found\n", cnt_s, cnt_x, cnt_y, cnt_m);
1138	}
1139	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/anuga_work/anuga_cuda/src/anuga_HMPP/compute_fluxes.c @ 9329

Download in other formats: