1 | |
---|
2 | // OpenHMPP ANUGA compute_fluxes |
---|
3 | // |
---|
4 | // Zhe Weng 2013 |
---|
5 | |
---|
6 | |
---|
7 | //#include<stdio.h> |
---|
8 | //#include<stdlib.h> |
---|
9 | //#include<string.h> |
---|
10 | //#include<math.h> |
---|
11 | //#include<assert.h> |
---|
12 | |
---|
13 | #include "hmpp_fun.h" |
---|
14 | |
---|
15 | #ifndef TOLERANCE |
---|
16 | #define TOLERANCE 1e-8 |
---|
17 | #endif |
---|
18 | |
---|
19 | #define USING_ORIGINAL_CUDA |
---|
20 | //double max(double a, double b) |
---|
21 | //{ return (a >= b) ? a : b; } |
---|
22 | // |
---|
23 | //double min(double a, double b) |
---|
24 | //{ return (a <= b) ? a : b; } |
---|
25 | |
---|
26 | |
---|
27 | /* |
---|
28 | double sqrt(double x) |
---|
29 | { |
---|
30 | double mx = 32000; |
---|
31 | double mn = 0; |
---|
32 | while(mx - mn > 1e-9) |
---|
33 | { |
---|
34 | double md = (mx + mn) / 2; |
---|
35 | if(md * md > x) |
---|
36 | mx = md; |
---|
37 | else |
---|
38 | mn = md; |
---|
39 | } |
---|
40 | return mx; |
---|
41 | } |
---|
42 | */ |
---|
43 | |
---|
44 | |
---|
45 | /* |
---|
46 | void spe_bubble_sort(int* _list , long* neighbours, int k) |
---|
47 | { |
---|
48 | int temp; |
---|
49 | |
---|
50 | if ( neighbours[_list[2]]>=0 and neighbours[ _list[2] ] < k and (neighbours[_list[1]]<0 or neighbours[_list[2]]<neighbours[_list[1]]) ) |
---|
51 | { |
---|
52 | temp = _list[2]; |
---|
53 | _list[2] = _list[1]; |
---|
54 | _list[1] = temp; |
---|
55 | } |
---|
56 | if ( neighbours[_list[1]]>=0 and neighbours[ _list[1] ] < k and (neighbours[_list[0]]<0 or neighbours[_list[1]]<neighbours[_list[0]]) ) |
---|
57 | { |
---|
58 | temp = _list[1]; |
---|
59 | _list[1] = _list[0]; |
---|
60 | _list[0] = temp; |
---|
61 | } |
---|
62 | if ( neighbours[_list[2]]>=0 and neighbours[ _list[2] ] < k and (neighbours[_list[1]]<0 or neighbours[_list[2]]<neighbours[_list[1]]) ) |
---|
63 | { |
---|
64 | temp = _list[2]; |
---|
65 | _list[2] = _list[1]; |
---|
66 | _list[1] = temp; |
---|
67 | } |
---|
68 | } |
---|
69 | */ |
---|
70 | |
---|
71 | int _rotate(double *q, double n1, double n2) |
---|
72 | { |
---|
73 | /*Rotate the momentum component q (q[1], q[2]) |
---|
74 | from x,y coordinates to coordinates based on normal vector (n1, n2). |
---|
75 | |
---|
76 | Result is returned in array 3x1 r |
---|
77 | To rotate in opposite direction, call rotate with (q, n1, -n2) |
---|
78 | |
---|
79 | Contents of q are changed by this function */ |
---|
80 | |
---|
81 | |
---|
82 | double q1, q2; |
---|
83 | |
---|
84 | q1 = q[1]; // uh momentum |
---|
85 | q2 = q[2]; // vh momentum |
---|
86 | |
---|
87 | // Rotate |
---|
88 | q[1] = n1 * q1 + n2*q2; |
---|
89 | q[2] = -n2 * q1 + n1*q2; |
---|
90 | |
---|
91 | return 0; |
---|
92 | } |
---|
93 | |
---|
94 | |
---|
95 | double _compute_speed(double *uh, |
---|
96 | double *h, |
---|
97 | double epsilon, |
---|
98 | double h0, |
---|
99 | double limiting_threshold) |
---|
100 | { |
---|
101 | double u; |
---|
102 | |
---|
103 | if (*h < limiting_threshold) { |
---|
104 | // Apply limiting of speeds according to the ANUGA manual |
---|
105 | if (*h < epsilon) { |
---|
106 | *h = 0.0; // Could have been negative |
---|
107 | u = 0.0; |
---|
108 | } else { |
---|
109 | u = *uh / (*h + h0 / *h); |
---|
110 | } |
---|
111 | |
---|
112 | |
---|
113 | // Adjust momentum to be consistent with speed |
---|
114 | *uh = u * *h; |
---|
115 | } else { |
---|
116 | // We are in deep water - no need for limiting |
---|
117 | u = *uh / *h; |
---|
118 | } |
---|
119 | |
---|
120 | return u; |
---|
121 | } |
---|
122 | |
---|
123 | |
---|
124 | int _flux_function_central( |
---|
125 | #ifdef USING_ORIGINAL_CUDA |
---|
126 | double *q_left, double *q_right, |
---|
127 | double z_left, double z_right, |
---|
128 | double n1, double n2, |
---|
129 | double epsilon, |
---|
130 | double h0, |
---|
131 | double limiting_threshold, |
---|
132 | double g, |
---|
133 | double *edgeflux, |
---|
134 | double *max_speed) |
---|
135 | #else |
---|
136 | double q_left0, |
---|
137 | double q_left1, |
---|
138 | double q_left2, |
---|
139 | double q_right0, |
---|
140 | double q_right1, |
---|
141 | double q_right2, |
---|
142 | double z_left, |
---|
143 | double z_right, |
---|
144 | double n1, |
---|
145 | double n2, |
---|
146 | double epsilon, |
---|
147 | double h0, |
---|
148 | double limiting_threshold, |
---|
149 | double g, |
---|
150 | double *edgeflux0, |
---|
151 | double *edgeflux1, |
---|
152 | double *edgeflux2, |
---|
153 | double *max_speed) |
---|
154 | #endif |
---|
155 | { |
---|
156 | #ifdef USING_ORIGINAL_CUDA |
---|
157 | int i; |
---|
158 | #endif |
---|
159 | |
---|
160 | double w_left, h_left, uh_left, vh_left, u_left; |
---|
161 | double w_right, h_right, uh_right, vh_right, u_right; |
---|
162 | double s_min, s_max, soundspeed_left, soundspeed_right; |
---|
163 | double denom, inverse_denominator, z; |
---|
164 | double temp; |
---|
165 | |
---|
166 | // Workspace (allocate once, use many) |
---|
167 | double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3]; |
---|
168 | |
---|
169 | // Copy conserved quantities to protect from modification |
---|
170 | #ifdef USING_ORIGINAL_CUDA |
---|
171 | q_left_rotated[0] = q_left[0]; |
---|
172 | q_right_rotated[0] = q_right[0]; |
---|
173 | q_left_rotated[1] = q_left[1]; |
---|
174 | q_right_rotated[1] = q_right[1]; |
---|
175 | q_left_rotated[2] = q_left[2]; |
---|
176 | q_right_rotated[2] = q_right[2]; |
---|
177 | |
---|
178 | // Align x- and y-momentum with x-axis |
---|
179 | //_rotate(q_left_rotated, n1, n2); |
---|
180 | q_left_rotated[1] = n1*q_left[1] + n2*q_left[2]; |
---|
181 | q_left_rotated[2] = -n2*q_left[1] + n1*q_left[2]; |
---|
182 | |
---|
183 | //_rotate(q_right_rotated, n1, n2); |
---|
184 | q_right_rotated[1] = n1*q_right[1] + n2*q_right[2]; |
---|
185 | q_right_rotated[2] = -n2*q_right[1] + n1*q_right[2]; |
---|
186 | #else |
---|
187 | q_left_rotated[0] = q_left0; |
---|
188 | q_right_rotated[0] = q_right0; |
---|
189 | q_left_rotated[1] = q_left1; |
---|
190 | q_right_rotated[1] = q_right1; |
---|
191 | q_left_rotated[2] = q_left2; |
---|
192 | q_right_rotated[2] = q_right2; |
---|
193 | |
---|
194 | // Align x- and y-momentum with x-axis |
---|
195 | //_rotate(q_left_rotated, n1, n2); |
---|
196 | q_left_rotated[1] = n1*q_left1 + n2*q_left2; |
---|
197 | q_left_rotated[2] = -n2*q_left1 + n1*q_left2; |
---|
198 | |
---|
199 | //_rotate(q_right_rotated, n1, n2); |
---|
200 | q_right_rotated[1] = n1*q_right1 + n2*q_right2; |
---|
201 | q_right_rotated[2] = -n2*q_right1 + n1*q_right2; |
---|
202 | #endif |
---|
203 | |
---|
204 | |
---|
205 | if (fabs(z_left - z_right) > 1.0e-10) { |
---|
206 | //report_python_error(AT, "Discontinuous Elevation"); |
---|
207 | return 0.0; |
---|
208 | } |
---|
209 | z = 0.5 * (z_left + z_right); // Average elevation values. |
---|
210 | |
---|
211 | // Compute speeds in x-direction |
---|
212 | w_left = q_left_rotated[0]; |
---|
213 | h_left = w_left - z; |
---|
214 | uh_left = q_left_rotated[1]; |
---|
215 | u_left = _compute_speed(&uh_left, &h_left, |
---|
216 | epsilon, h0, limiting_threshold); |
---|
217 | w_right = q_right_rotated[0]; |
---|
218 | h_right = w_right - z; |
---|
219 | uh_right = q_right_rotated[1]; |
---|
220 | u_right = _compute_speed(&uh_right, &h_right, |
---|
221 | epsilon, h0, limiting_threshold); |
---|
222 | // Momentum in y-direction |
---|
223 | vh_left = q_left_rotated[2]; |
---|
224 | vh_right = q_right_rotated[2]; |
---|
225 | |
---|
226 | // Limit y-momentum if necessary |
---|
227 | // Leaving this out, improves speed significantly (Ole 27/5/2009) |
---|
228 | // All validation tests pass, so do we really need it anymore? |
---|
229 | _compute_speed(&vh_left, &h_left, |
---|
230 | epsilon, h0, limiting_threshold); |
---|
231 | _compute_speed(&vh_right, &h_right, |
---|
232 | epsilon, h0, limiting_threshold); |
---|
233 | |
---|
234 | // Maximal and minimal wave speeds |
---|
235 | soundspeed_left = sqrt(g * h_left); |
---|
236 | soundspeed_right = sqrt(g * h_right); |
---|
237 | |
---|
238 | // Code to use fast square root optimisation if desired. |
---|
239 | // Timings on AMD 64 for the Okushiri profile gave the following timings |
---|
240 | // |
---|
241 | // SQRT Total Flux |
---|
242 | //============================= |
---|
243 | // |
---|
244 | // Ref 405s 152s |
---|
245 | // Fast (dbl) 453s 173s |
---|
246 | // Fast (sng) 437s 171s |
---|
247 | // |
---|
248 | // Consequently, there is currently (14/5/2009) no reason to use this |
---|
249 | // approximation. |
---|
250 | |
---|
251 | //soundspeed_left = fast_squareroot_approximation(g*h_left); |
---|
252 | //soundspeed_right = fast_squareroot_approximation(g*h_right); |
---|
253 | |
---|
254 | s_max = fmax(u_left + soundspeed_left, u_right + soundspeed_right); |
---|
255 | if (s_max < 0.0) { |
---|
256 | s_max = 0.0; |
---|
257 | } |
---|
258 | |
---|
259 | s_min = fmin(u_left - soundspeed_left, u_right - soundspeed_right); |
---|
260 | if (s_min > 0.0) { |
---|
261 | s_min = 0.0; |
---|
262 | } |
---|
263 | |
---|
264 | // Flux formulas |
---|
265 | flux_left[0] = u_left*h_left; |
---|
266 | flux_left[1] = u_left * uh_left + 0.5 * g * h_left*h_left; |
---|
267 | flux_left[2] = u_left*vh_left; |
---|
268 | |
---|
269 | flux_right[0] = u_right*h_right; |
---|
270 | flux_right[1] = u_right * uh_right + 0.5 * g * h_right*h_right; |
---|
271 | flux_right[2] = u_right*vh_right; |
---|
272 | |
---|
273 | // Flux computation |
---|
274 | denom = s_max - s_min; |
---|
275 | if (denom < epsilon) { // FIXME (Ole): Try using h0 here |
---|
276 | //memset(edgeflux, 0, 3 * sizeof (double)); |
---|
277 | edgeflux[0]=0.0; |
---|
278 | edgeflux[1]=0.0; |
---|
279 | edgeflux[2]=0.0; |
---|
280 | *max_speed = 0.0; |
---|
281 | } |
---|
282 | else { |
---|
283 | inverse_denominator = 1.0 / denom; |
---|
284 | #ifdef USING_ORIGINAL_CUDA |
---|
285 | //#pragma hmppcg noparallel |
---|
286 | //for (i = 0; i < 3; i++) { |
---|
287 | // edgeflux[i] = s_max * flux_left[i] - s_min * flux_right[i]; |
---|
288 | // edgeflux[i] += s_max *s_min *(q_right_rotated[i] -q_left_rotated[i]); |
---|
289 | // edgeflux[i] *= inverse_denominator; |
---|
290 | //} |
---|
291 | edgeflux[0] = s_max * flux_left[0] - s_min * flux_right[0]; |
---|
292 | edgeflux[0]+=s_max *s_min *(q_right_rotated[0] -q_left_rotated[0]); |
---|
293 | edgeflux[0] *= inverse_denominator; |
---|
294 | |
---|
295 | edgeflux[1] = s_max * flux_left[1] - s_min * flux_right[1]; |
---|
296 | edgeflux[1]+=s_max *s_min *(q_right_rotated[1] -q_left_rotated[1]); |
---|
297 | edgeflux[1] *= inverse_denominator; |
---|
298 | |
---|
299 | edgeflux[2] = s_max * flux_left[2] - s_min * flux_right[2]; |
---|
300 | edgeflux[2]+=s_max *s_min *(q_right_rotated[2] -q_left_rotated[2]); |
---|
301 | edgeflux[2] *= inverse_denominator; |
---|
302 | #else |
---|
303 | *edgeflux0 = s_max * flux_left[0] - s_min * flux_right[0]; |
---|
304 | *edgeflux0 += s_max *s_min *(q_right_rotated[0] -q_left_rotated[0]); |
---|
305 | *edgeflux0 *= inverse_denominator; |
---|
306 | |
---|
307 | *edgeflux1 = s_max * flux_left[1] - s_min * flux_right[1]; |
---|
308 | *edgeflux1 += s_max *s_min *(q_right_rotated[1] -q_left_rotated[1]); |
---|
309 | *edgeflux1 *= inverse_denominator; |
---|
310 | |
---|
311 | *edgeflux2 = s_max * flux_left[2] - s_min * flux_right[2]; |
---|
312 | *edgeflux2 += s_max *s_min *(q_right_rotated[2] -q_left_rotated[2]); |
---|
313 | *edgeflux2 *= inverse_denominator; |
---|
314 | #endif |
---|
315 | |
---|
316 | // Maximal wavespeed |
---|
317 | *max_speed = fmax(fabs(s_max), fabs(s_min)); |
---|
318 | |
---|
319 | // Rotate back |
---|
320 | //_rotate(edgeflux, n1, -n2); |
---|
321 | #ifdef USING_ORIGINAL_CUDA |
---|
322 | temp = edgeflux[1]; |
---|
323 | edgeflux[1] = n1* temp -n2*edgeflux[2]; |
---|
324 | edgeflux[2] = n2*temp + n1*edgeflux[2]; |
---|
325 | #else |
---|
326 | temp = *edgeflux1; |
---|
327 | *edgeflux1 = n1* temp -n2* *edgeflux2; |
---|
328 | *edgeflux2 = n2*temp + n1* *edgeflux2; |
---|
329 | #endif |
---|
330 | } |
---|
331 | |
---|
332 | return 0; |
---|
333 | } |
---|
334 | |
---|
335 | |
---|
336 | |
---|
337 | |
---|
338 | |
---|
339 | |
---|
340 | |
---|
341 | /*****************************************/ |
---|
342 | /* The CUDA compute fluex function */ |
---|
343 | /*****************************************/ |
---|
344 | |
---|
345 | |
---|
346 | #ifdef USING_LOCAL_DIRECTIVES |
---|
347 | #pragma hmpp cf_central codelet, target=CUDA args[*].transfer=atcall |
---|
348 | #endif |
---|
349 | void compute_fluxes_central_structure_CUDA( |
---|
350 | int N, |
---|
351 | int N3, |
---|
352 | int N6, |
---|
353 | int N2, |
---|
354 | |
---|
355 | double timestep[N], |
---|
356 | long neighbours[N3], |
---|
357 | long neighbour_edges[N3], |
---|
358 | double normals[N6], |
---|
359 | double edgelengths[N3], |
---|
360 | double radii[N], |
---|
361 | double areas[N], |
---|
362 | long tri_full_flag[N], |
---|
363 | double stage_edge_values[N3], |
---|
364 | double xmom_edge_values[N3], |
---|
365 | double ymom_edge_values[N3], |
---|
366 | double bed_edge_values[N3], |
---|
367 | double stage_boundary_values[N2], |
---|
368 | double xmom_boundary_values[N2], |
---|
369 | double ymom_boundary_values[N2], |
---|
370 | double stage_explicit_update[N], |
---|
371 | double xmom_explicit_update[N], |
---|
372 | double ymom_explicit_update[N], |
---|
373 | double max_speed_array[N], |
---|
374 | |
---|
375 | double evolve_max_timestep, |
---|
376 | double g, |
---|
377 | double epsilon, |
---|
378 | double h0, |
---|
379 | double limiting_threshold, |
---|
380 | int optimise_dry_cells) |
---|
381 | { |
---|
382 | int k; |
---|
383 | int i, m, n; |
---|
384 | int ki, nm = 0, ki2; |
---|
385 | |
---|
386 | double max_speed, max_speed_total=0 , length, inv_area, zl, zr; |
---|
387 | |
---|
388 | #ifdef USING_ORIGINAL_CUDA |
---|
389 | double ql[3], qr[3], edgeflux[3]; |
---|
390 | #else |
---|
391 | double ql0, ql1, ql2, |
---|
392 | qr0, qr1, qr2, |
---|
393 | edgeflux0, edgeflux1, edgeflux2; |
---|
394 | #endif |
---|
395 | |
---|
396 | #pragma hmppcg gridify(k), private(max_speed, max_speed_total, length,& |
---|
397 | #pragma hmppcg & inv_area, zl, zr, i, m, n, ki, nm, ki2, ql, qr, & |
---|
398 | #pragma hmppcg & edgeflux), global(timestep, neighbours, & |
---|
399 | #pragma hmppcg & neighbour_edges, normals, edgelengths, radii, & |
---|
400 | #pragma hmppcg & areas, tri_full_flag, stage_edge_values, & |
---|
401 | #pragma hmppcg & xmom_boundary_values, ymom_boundary_values, & |
---|
402 | #pragma hmppcg & stage_explicit_update, xmom_explicit_update, & |
---|
403 | #pragma hmppcg & ymom_explicit_update, max_speed_array, & |
---|
404 | #pragma hmppcg & evolve_max_timestep, g, epsilon, h0, & |
---|
405 | #pragma hmppcg & limiting_threshold, optimise_dry_cells) |
---|
406 | for(k=0; k<N; k++) |
---|
407 | { |
---|
408 | timestep[k] = evolve_max_timestep; |
---|
409 | |
---|
410 | // Loop through neighbours and compute edge flux for each |
---|
411 | for (i = 0; i < 3; i++) |
---|
412 | { |
---|
413 | ki = k * 3 + i; // Linear index to edge i of triangle k |
---|
414 | |
---|
415 | n = neighbours[ki]; |
---|
416 | |
---|
417 | #ifdef USING_ORIGINAL_CUDA |
---|
418 | ql[0] = stage_edge_values[ki]; |
---|
419 | ql[1] = xmom_edge_values[ki]; |
---|
420 | ql[2] = ymom_edge_values[ki]; |
---|
421 | #else |
---|
422 | ql0 = stage_edge_values[ki]; |
---|
423 | ql1 = xmom_edge_values[ki]; |
---|
424 | ql2 = ymom_edge_values[ki]; |
---|
425 | #endif |
---|
426 | zl = bed_edge_values[ki]; |
---|
427 | |
---|
428 | if (n < 0) { |
---|
429 | m = -n - 1; // Convert negative flag to boundary index |
---|
430 | |
---|
431 | #ifdef USING_ORIGINAL_CUDA |
---|
432 | qr[0] = stage_boundary_values[m]; |
---|
433 | qr[1] = xmom_boundary_values[m]; |
---|
434 | qr[2] = ymom_boundary_values[m]; |
---|
435 | #else |
---|
436 | qr0 = stage_boundary_values[m]; |
---|
437 | qr1 = xmom_boundary_values[m]; |
---|
438 | qr2 = ymom_boundary_values[m]; |
---|
439 | #endif |
---|
440 | zr = zl; // Extend bed elevation to boundary |
---|
441 | } else { |
---|
442 | m = neighbour_edges[ki]; |
---|
443 | nm = n * 3 + m; // Linear index (triangle n, edge m) |
---|
444 | |
---|
445 | #ifdef USING_ORIGINAL_CUDA |
---|
446 | qr[0] = stage_edge_values[nm]; |
---|
447 | qr[1] = xmom_edge_values[nm]; |
---|
448 | qr[2] = ymom_edge_values[nm]; |
---|
449 | #else |
---|
450 | qr0 = stage_edge_values[nm]; |
---|
451 | qr1 = xmom_edge_values[nm]; |
---|
452 | qr2 = ymom_edge_values[nm]; |
---|
453 | #endif |
---|
454 | zr = bed_edge_values[nm]; |
---|
455 | } |
---|
456 | |
---|
457 | if (optimise_dry_cells){ |
---|
458 | #ifdef USING_ORIGINAL_CUDA |
---|
459 | if ( fabs(ql[0] - zl) < epsilon && |
---|
460 | fabs(qr[0] - zr) < epsilon) { |
---|
461 | #else |
---|
462 | if ( fabs(ql0 - zl) < epsilon && |
---|
463 | fabs(qr0 - zr) < epsilon) { |
---|
464 | #endif |
---|
465 | max_speed = 0.0; |
---|
466 | continue; |
---|
467 | } |
---|
468 | } |
---|
469 | |
---|
470 | ki2 = 2 * ki; //k*6 + i*2 |
---|
471 | #ifdef USING_ORIGINAL_CUDA |
---|
472 | _flux_function_central(ql, qr, zl, zr, |
---|
473 | normals[ki2], normals[ki2 + 1], |
---|
474 | epsilon, h0, limiting_threshold, g, |
---|
475 | edgeflux, &max_speed); |
---|
476 | |
---|
477 | length = edgelengths[ki]; |
---|
478 | edgeflux[0] *= length; |
---|
479 | edgeflux[1] *= length; |
---|
480 | edgeflux[2] *= length; |
---|
481 | |
---|
482 | |
---|
483 | stage_explicit_update[k] -= edgeflux[0]; |
---|
484 | xmom_explicit_update[k] -= edgeflux[1]; |
---|
485 | ymom_explicit_update[k] -= edgeflux[2]; |
---|
486 | #else |
---|
487 | _flux_function_central( |
---|
488 | ql0, ql1, ql2, |
---|
489 | qr0, qr1, qr2, |
---|
490 | zl, |
---|
491 | zr, |
---|
492 | normals[ki2], normals[ki2 + 1], |
---|
493 | epsilon, h0, limiting_threshold, g, |
---|
494 | &edgeflux0, |
---|
495 | &edgeflux1, |
---|
496 | &edgeflux2, |
---|
497 | &max_speed); |
---|
498 | |
---|
499 | length = edgelengths[ki]; |
---|
500 | edgeflux0 *= length; |
---|
501 | edgeflux1 *= length; |
---|
502 | edgeflux2 *= length; |
---|
503 | |
---|
504 | |
---|
505 | stage_explicit_update[k] -= edgeflux0; |
---|
506 | xmom_explicit_update[k] -= edgeflux1; |
---|
507 | ymom_explicit_update[k] -= edgeflux2; |
---|
508 | #endif |
---|
509 | if (tri_full_flag[k] == 1) { |
---|
510 | //if (max_speed > elements[Depsilon]) { |
---|
511 | if ( max_speed > epsilon) { |
---|
512 | timestep[k] = fmin(timestep[k], radii[k] / max_speed); |
---|
513 | if (n >= 0) { |
---|
514 | timestep[k] = fmin(timestep[k], radii[n] / max_speed); |
---|
515 | } |
---|
516 | } |
---|
517 | } |
---|
518 | |
---|
519 | if (n < 0 || n > k){ |
---|
520 | max_speed_total = fmax(max_speed_total, max_speed); |
---|
521 | } |
---|
522 | } // End edge i (and neighbour n) |
---|
523 | |
---|
524 | inv_area = 1.0 / areas[k]; |
---|
525 | stage_explicit_update[k] *= inv_area; |
---|
526 | xmom_explicit_update[k] *= inv_area; |
---|
527 | ymom_explicit_update[k] *= inv_area; |
---|
528 | |
---|
529 | max_speed_array[k] = max_speed_total; |
---|
530 | } |
---|
531 | } |
---|
532 | |
---|
533 | |
---|
534 | |
---|
535 | #ifdef USING_LOCAL_DIRECTIVES |
---|
536 | #pragma hmpp cf_central_single codelet, target=CUDA args[*].transfer=atcall |
---|
537 | #endif |
---|
538 | void compute_fluxes_central_structure_cuda_single( |
---|
539 | int N, |
---|
540 | int N3, |
---|
541 | int N6, |
---|
542 | int N2, |
---|
543 | |
---|
544 | double timestep[N], |
---|
545 | int neighbours[N3], |
---|
546 | int neighbour_edges[N3], |
---|
547 | double normals[N6], |
---|
548 | double edgelengths[N3], |
---|
549 | double radii[N], |
---|
550 | double areas[N], |
---|
551 | int tri_full_flag[N], |
---|
552 | double stage_edge_values[N3], |
---|
553 | double xmom_edge_values[N3], |
---|
554 | double ymom_edge_values[N3], |
---|
555 | double bed_edge_values[N3], |
---|
556 | double stage_boundary_values[N2], |
---|
557 | double xmom_boundary_values[N2], |
---|
558 | double ymom_boundary_values[N2], |
---|
559 | double stage_explicit_update[N], |
---|
560 | double xmom_explicit_update[N], |
---|
561 | double ymom_explicit_update[N], |
---|
562 | double max_speed_array[N], |
---|
563 | |
---|
564 | double evolve_max_timestep, |
---|
565 | double g, |
---|
566 | double epsilon, |
---|
567 | double h0, |
---|
568 | double limiting_threshold, |
---|
569 | int optimise_dry_cells) |
---|
570 | { |
---|
571 | int k; |
---|
572 | |
---|
573 | |
---|
574 | for (k=0; k < N; k++) |
---|
575 | { |
---|
576 | double w_left, h_left, uh_left, vh_left, u_left; |
---|
577 | double w_right, h_right, uh_right, vh_right, u_right; |
---|
578 | double s_min, s_max, soundspeed_left, soundspeed_right; |
---|
579 | double denom, inverse_denominator, z; |
---|
580 | double temp; |
---|
581 | |
---|
582 | |
---|
583 | double q_left_0, q_left_1, q_left_2; |
---|
584 | double q_right_0, q_right_1, q_right_2; |
---|
585 | double q_left_rotated_0, q_left_rotated_1, q_left_rotated_2; |
---|
586 | double q_right_rotated_0, q_right_rotated_1, q_right_rotated_2; |
---|
587 | double flux_left_0, flux_left_1, flux_left_2; |
---|
588 | double flux_right_0, flux_right_1, flux_right_2; |
---|
589 | double edgeflux_0, edgeflux_1, edgeflux_2; |
---|
590 | |
---|
591 | |
---|
592 | double max_speed, max_speed_total; |
---|
593 | double z_left, z_right; |
---|
594 | double n1, n2; |
---|
595 | double length, inv_area; |
---|
596 | |
---|
597 | |
---|
598 | int i, m, ni; |
---|
599 | int ki, nm; |
---|
600 | |
---|
601 | max_speed_total = 0; |
---|
602 | stage_explicit_update[k] = 0; |
---|
603 | xmom_explicit_update[k] = 0; |
---|
604 | ymom_explicit_update[k] = 0; |
---|
605 | |
---|
606 | for (i=0; i< 3; i++) |
---|
607 | { |
---|
608 | ki= k*3 + i; |
---|
609 | |
---|
610 | q_left_0 = stage_edge_values[ki]; |
---|
611 | q_left_1 = xmom_edge_values[ki]; |
---|
612 | q_left_2 = ymom_edge_values[ki]; |
---|
613 | z_left = bed_edge_values[ki]; |
---|
614 | ni = neighbours[ki]; |
---|
615 | if (ni<0) { |
---|
616 | m= -ni -1; |
---|
617 | |
---|
618 | q_right_0 = stage_boundary_values[m]; |
---|
619 | q_right_1 = xmom_boundary_values[m]; |
---|
620 | q_right_2 = ymom_boundary_values[m]; |
---|
621 | z_right = z_left; |
---|
622 | } else { |
---|
623 | m = neighbour_edges[ki]; |
---|
624 | nm = ni *3 + m; |
---|
625 | |
---|
626 | q_right_0 = stage_edge_values[nm]; |
---|
627 | q_right_1 = xmom_edge_values[nm]; |
---|
628 | q_right_2 = ymom_edge_values[nm]; |
---|
629 | z_right = bed_edge_values[nm]; |
---|
630 | } |
---|
631 | |
---|
632 | if(optimise_dry_cells) { |
---|
633 | if(fabs(q_left_0 - z_left) < epsilon && |
---|
634 | fabs(q_right_0 - z_right) < epsilon ) { |
---|
635 | max_speed = 0.0; |
---|
636 | continue; |
---|
637 | } |
---|
638 | } |
---|
639 | n1 = normals[2*ki]; |
---|
640 | n2 = normals[2*ki + 1]; |
---|
641 | ///////////////////////////////////////////////////////// |
---|
642 | |
---|
643 | |
---|
644 | // Copy conserved quantities to protect from modification |
---|
645 | q_left_rotated_0 = q_left_0; |
---|
646 | q_right_rotated_0 = q_right_0; |
---|
647 | q_left_rotated_1 = q_left_1; |
---|
648 | q_right_rotated_1 = q_right_1; |
---|
649 | q_left_rotated_2 = q_left_2; |
---|
650 | q_right_rotated_2 = q_right_2; |
---|
651 | |
---|
652 | // Align x- and y-momentum with x-axis |
---|
653 | //_rotate(q_left_rotated, n1, n2); |
---|
654 | q_left_rotated_1 = n1*q_left_1 + n2*q_left_2; |
---|
655 | q_left_rotated_2 = -n2*q_left_1 + n1*q_left_2; |
---|
656 | |
---|
657 | //_rotate(q_right_rotated, n1, n2); |
---|
658 | q_right_rotated_1 = n1*q_right_1 + n2*q_right_2; |
---|
659 | q_right_rotated_2 = -n2*q_right_1 + n1*q_right_2; |
---|
660 | |
---|
661 | |
---|
662 | if (fabs(z_left - z_right) > 1.0e-10) { |
---|
663 | //report_python_error(AT, "Discontinuous Elevation"); |
---|
664 | //return 0.0; |
---|
665 | } |
---|
666 | z = 0.5 * (z_left + z_right); // Average elevation values. |
---|
667 | |
---|
668 | // Compute speeds in x-direction |
---|
669 | w_left = q_left_rotated_0; |
---|
670 | h_left = w_left - z; |
---|
671 | uh_left = q_left_rotated_1; |
---|
672 | //u_left = _compute_speed(&uh_left, &h_left, |
---|
673 | // epsilon, h0, limiting_threshold); |
---|
674 | |
---|
675 | if (h_left < limiting_threshold) { |
---|
676 | |
---|
677 | if (h_left < epsilon) { |
---|
678 | h_left = 0.0; // Could have been negative |
---|
679 | u_left = 0.0; |
---|
680 | } else { |
---|
681 | u_left = uh_left/(h_left + h0/ h_left); |
---|
682 | } |
---|
683 | |
---|
684 | uh_left = u_left * h_left; |
---|
685 | } else { |
---|
686 | u_left = uh_left/ h_left; |
---|
687 | } |
---|
688 | |
---|
689 | w_right = q_right_rotated_0; |
---|
690 | h_right = w_right - z; |
---|
691 | uh_right = q_right_rotated_1; |
---|
692 | //u_right = _compute_speed(&uh_right, &h_right, |
---|
693 | // epsilon, h0, limiting_threshold); |
---|
694 | |
---|
695 | if (h_right < limiting_threshold) { |
---|
696 | |
---|
697 | if (h_right < epsilon) { |
---|
698 | h_right = 0.0; // Could have been negative |
---|
699 | u_right = 0.0; |
---|
700 | } else { |
---|
701 | u_right = uh_right/(h_right + h0/ h_right); |
---|
702 | } |
---|
703 | |
---|
704 | uh_right = u_right * h_right; |
---|
705 | } else { |
---|
706 | u_right = uh_right/ h_right; |
---|
707 | } |
---|
708 | |
---|
709 | |
---|
710 | |
---|
711 | |
---|
712 | // Momentum in y-direction |
---|
713 | vh_left = q_left_rotated_2; |
---|
714 | vh_right = q_right_rotated_2; |
---|
715 | |
---|
716 | soundspeed_left = sqrt(g * h_left); |
---|
717 | soundspeed_right = sqrt(g * h_right); |
---|
718 | |
---|
719 | s_max = u_left + soundspeed_left; |
---|
720 | if(s_max < u_right + soundspeed_right) |
---|
721 | s_max = u_right + soundspeed_right; |
---|
722 | |
---|
723 | |
---|
724 | if (s_max < 0.0) { |
---|
725 | s_max = 0.0; |
---|
726 | } |
---|
727 | |
---|
728 | s_min = u_left - soundspeed_left; |
---|
729 | if (s_min > u_right - soundspeed_right) |
---|
730 | s_min = u_right - soundspeed_right; |
---|
731 | |
---|
732 | if (s_min > 0.0) { |
---|
733 | s_min = 0.0; |
---|
734 | } |
---|
735 | |
---|
736 | // Flux formulas |
---|
737 | flux_left_0 = u_left*h_left; |
---|
738 | flux_left_1 = u_left * uh_left + 0.5 * g * h_left*h_left; |
---|
739 | flux_left_2 = u_left*vh_left; |
---|
740 | |
---|
741 | flux_right_0 = u_right*h_right; |
---|
742 | flux_right_1 = u_right * uh_right + 0.5 * g * h_right*h_right; |
---|
743 | flux_right_2 = u_right*vh_right; |
---|
744 | |
---|
745 | // Flux computation |
---|
746 | denom = s_max - s_min; |
---|
747 | if (denom < epsilon) { // FIXME (Ole): Try using h0 here |
---|
748 | edgeflux_0 = 0; |
---|
749 | edgeflux_1 = 0; |
---|
750 | edgeflux_2 = 0; |
---|
751 | |
---|
752 | max_speed = 0.0; |
---|
753 | } |
---|
754 | else { |
---|
755 | inverse_denominator = 1.0 / denom; |
---|
756 | //for (j = 0; j < 3; j++) { |
---|
757 | // edgeflux[j] = s_max *flux_left[j] -s_min *flux_right[j]; |
---|
758 | // edgeflux[j] += s_max * s_min * ( |
---|
759 | // q_right_rotated[j] - q_left_rotated[j]); |
---|
760 | // edgeflux[j] *= inverse_denominator; |
---|
761 | //} |
---|
762 | edgeflux_0 = s_max *flux_left_0 -s_min *flux_right_0; |
---|
763 | edgeflux_0 += s_max * s_min * ( |
---|
764 | q_right_rotated_0 - q_left_rotated_0); |
---|
765 | edgeflux_0 *= inverse_denominator; |
---|
766 | |
---|
767 | edgeflux_1 = s_max *flux_left_1 -s_min *flux_right_1; |
---|
768 | edgeflux_1 += s_max * s_min * ( |
---|
769 | q_right_rotated_1 - q_left_rotated_1); |
---|
770 | edgeflux_1 *= inverse_denominator; |
---|
771 | |
---|
772 | edgeflux_2 = s_max *flux_left_2 -s_min *flux_right_2; |
---|
773 | edgeflux_2 += s_max * s_min * ( |
---|
774 | q_right_rotated_2 - q_left_rotated_2); |
---|
775 | edgeflux_2 *= inverse_denominator; |
---|
776 | |
---|
777 | |
---|
778 | |
---|
779 | // Maximal wavespeed |
---|
780 | //max_speed = max(fabs(s_max), fabs(s_min)); |
---|
781 | max_speed = fabs(s_max); |
---|
782 | if (max_speed < fabs(s_min)) |
---|
783 | max_speed = fabs(s_min); |
---|
784 | |
---|
785 | // Rotate back |
---|
786 | //_rotate(edgeflux, n1, -n2); |
---|
787 | temp = edgeflux_1; |
---|
788 | edgeflux_1 = n1* temp -n2*edgeflux_2; |
---|
789 | edgeflux_2 = n2*temp + n1*edgeflux_2; |
---|
790 | } |
---|
791 | |
---|
792 | ////////////////////////////////////////////////////// |
---|
793 | |
---|
794 | length = edgelengths[ki]; |
---|
795 | edgeflux_0 *= length; |
---|
796 | edgeflux_1 *= length; |
---|
797 | edgeflux_2 *= length; |
---|
798 | |
---|
799 | stage_explicit_update[k]-= edgeflux_0; |
---|
800 | xmom_explicit_update[k] -= edgeflux_1; |
---|
801 | ymom_explicit_update[k] -= edgeflux_2; |
---|
802 | |
---|
803 | if (tri_full_flag[k] == 1) { |
---|
804 | if (max_speed > epsilon) { |
---|
805 | //timestep[k] = min(timestep[k], radii[k]/ max_speed); |
---|
806 | if (timestep[k] > radii[k]/max_speed) |
---|
807 | timestep[k] = radii[k]/ max_speed; |
---|
808 | |
---|
809 | if (ni>=0){ |
---|
810 | //timestep[k] =min(timestep[k], radii[ni]/max_speed); |
---|
811 | if (timestep[k] > radii[ni]/max_speed) |
---|
812 | timestep[k] = radii[ni]/ max_speed; |
---|
813 | |
---|
814 | } |
---|
815 | } |
---|
816 | } |
---|
817 | if (ni < 0 || ni > k){ |
---|
818 | max_speed_total = fmax(max_speed_total, max_speed); |
---|
819 | } |
---|
820 | } |
---|
821 | |
---|
822 | inv_area = 1.0 / areas[k]; |
---|
823 | stage_explicit_update[k] *= inv_area; |
---|
824 | xmom_explicit_update[k] *= inv_area; |
---|
825 | ymom_explicit_update[k] *= inv_area; |
---|
826 | |
---|
827 | max_speed_array[k] = max_speed_total; |
---|
828 | } |
---|
829 | } |
---|
830 | |
---|
831 | |
---|
832 | #ifdef USING_MAIN |
---|
833 | int main(int argc, char *argv[]) |
---|
834 | { |
---|
835 | int N, N2, optimise_dry_cells; |
---|
836 | int cnt_s =0, cnt_x =0, cnt_y =0, cnt_m = 0; |
---|
837 | |
---|
838 | double evolve_max_timestep, |
---|
839 | g, |
---|
840 | epsilon, |
---|
841 | h0, |
---|
842 | limiting_threshold; |
---|
843 | |
---|
844 | double * sv, // stage_vertex_values |
---|
845 | * sb, // stage_boundary_values |
---|
846 | * se, // stage_edge_values, |
---|
847 | * sc, // stage_centroid_values, |
---|
848 | * su, // stage_explicit_update |
---|
849 | |
---|
850 | * be, // bed_edge_values, |
---|
851 | * bc, // bed_centroid_values, |
---|
852 | |
---|
853 | * xb, // xmom_boundary_values |
---|
854 | * xe, // xmom_edge_values |
---|
855 | * xu, // xmom_explicit_update |
---|
856 | |
---|
857 | * yb, // ymom_boundary_values |
---|
858 | * ye, // ymom_edge_values |
---|
859 | * yu, // ymom_explicit_update |
---|
860 | |
---|
861 | * normals, |
---|
862 | * el, // edgelengths; |
---|
863 | * radii, |
---|
864 | * a, // areas, |
---|
865 | * vc; // vertex_coordinates, |
---|
866 | |
---|
867 | int * tri; // tri_full_flag |
---|
868 | int * n, // neighbours |
---|
869 | * ne; // neighbour_edges |
---|
870 | |
---|
871 | double * timestep, * max_speed_array; |
---|
872 | |
---|
873 | // Testing group |
---|
874 | double * test_stage_explicit_update, |
---|
875 | * test_xmom_explicit_update, |
---|
876 | * test_ymom_explicit_update, |
---|
877 | * test_max_speed_array, |
---|
878 | * test_timestep; |
---|
879 | int i; |
---|
880 | |
---|
881 | char file_name[]="merimbula_cf.dat"; |
---|
882 | |
---|
883 | freopen(file_name, "r", stdin); |
---|
884 | |
---|
885 | scanf("%d\n", &N); |
---|
886 | scanf("%d\n", &N2); |
---|
887 | scanf("%d\n", &optimise_dry_cells); |
---|
888 | scanf("%lf\n",&evolve_max_timestep); |
---|
889 | scanf("%lf\n",&g); |
---|
890 | scanf("%lf\n",&epsilon); |
---|
891 | scanf("%lf\n",&h0); |
---|
892 | scanf("%lf\n",&limiting_threshold); |
---|
893 | |
---|
894 | |
---|
895 | printf("%d %lf\n", N, g); |
---|
896 | |
---|
897 | |
---|
898 | sv = (double *)malloc( N*3*sizeof(double)); |
---|
899 | assert(sv != NULL); |
---|
900 | sb = (double *)malloc( N2*sizeof(double)); |
---|
901 | assert(sb != NULL); |
---|
902 | se = (double *)malloc( N*3*sizeof(double)); |
---|
903 | assert(se != NULL); |
---|
904 | sc = (double *)malloc( N*sizeof(double)); |
---|
905 | assert(sc != NULL); |
---|
906 | su = (double *)malloc( N*sizeof(double)); |
---|
907 | assert(su != NULL); |
---|
908 | |
---|
909 | |
---|
910 | be = (double *)malloc( N*3*sizeof(double)); |
---|
911 | assert(be != NULL); |
---|
912 | bc = (double *)malloc( N*sizeof(double)); |
---|
913 | assert(bc != NULL); |
---|
914 | |
---|
915 | |
---|
916 | xb = (double *)malloc( N2*sizeof(double)); |
---|
917 | assert(xb != NULL); |
---|
918 | xe = (double *)malloc( N*3*sizeof(double)); |
---|
919 | assert(xe != NULL); |
---|
920 | xu = (double *)malloc( N*sizeof(double)); |
---|
921 | assert(xu != NULL); |
---|
922 | |
---|
923 | yb = (double *)malloc( N2*sizeof(double)); |
---|
924 | assert(yb != NULL); |
---|
925 | ye = (double *)malloc( N*3*sizeof(double)); |
---|
926 | assert(ye != NULL); |
---|
927 | yu = (double *)malloc( N*sizeof(double)); |
---|
928 | assert(yu != NULL); |
---|
929 | |
---|
930 | |
---|
931 | n = (int *)malloc( N*3*sizeof(int)); |
---|
932 | assert(n != NULL); |
---|
933 | ne = (int *)malloc( N*3*sizeof(int)); |
---|
934 | assert(ne != NULL); |
---|
935 | |
---|
936 | |
---|
937 | normals = (double *)malloc( N*6*sizeof(double)); |
---|
938 | assert(normals != NULL); |
---|
939 | el = (double *)malloc( N*3*sizeof(double)); |
---|
940 | assert(el != NULL); |
---|
941 | radii = (double *)malloc( N*sizeof(double)); |
---|
942 | assert(radii != NULL); |
---|
943 | a = (double *)malloc( N*sizeof(double)); |
---|
944 | assert(a != NULL); |
---|
945 | |
---|
946 | tri = (int *)malloc( N*sizeof(int)); |
---|
947 | assert(tri != NULL); |
---|
948 | |
---|
949 | vc = (double *)malloc( N*6*sizeof(double)); |
---|
950 | assert(vc != NULL); |
---|
951 | |
---|
952 | timestep = (double *)malloc(N*sizeof(double)); |
---|
953 | assert(timestep != NULL); |
---|
954 | max_speed_array = (double *)malloc(N*sizeof(double)); |
---|
955 | assert(max_speed_array != NULL); |
---|
956 | |
---|
957 | test_stage_explicit_update = (double *)malloc(N*sizeof(double)); |
---|
958 | assert(test_stage_explicit_update != NULL); |
---|
959 | test_xmom_explicit_update = (double *)malloc(N*sizeof(double)); |
---|
960 | assert(test_xmom_explicit_update != NULL); |
---|
961 | test_ymom_explicit_update = (double *)malloc(N*sizeof(double)); |
---|
962 | assert(test_ymom_explicit_update != NULL); |
---|
963 | test_max_speed_array = (double *)malloc(N*sizeof(double)); |
---|
964 | assert(test_max_speed_array != NULL); |
---|
965 | test_timestep = (double *)malloc(N*sizeof(double)); |
---|
966 | assert(test_timestep != NULL); |
---|
967 | |
---|
968 | |
---|
969 | for(i=0; i < N; i++) |
---|
970 | { |
---|
971 | scanf("%lf %lf %lf\n", sv+i*3, sv+i*3 +1, sv+i*3 +2); |
---|
972 | scanf("%lf %lf %lf\n", se+i*3, se+i*3 +1, se+i*3 +2); |
---|
973 | scanf("%lf\n", sc+i); |
---|
974 | scanf("%lf\n", su+i); |
---|
975 | |
---|
976 | scanf("%lf %lf %lf\n",be+i*3, be+i*3 +1, be+i*3 +2); |
---|
977 | scanf("%lf\n", bc+i); |
---|
978 | |
---|
979 | scanf("%lf %lf %lf\n", xe+i*3, xe+i*3 +1, xe+i*3 +2); |
---|
980 | scanf("%lf\n", xu+i); |
---|
981 | scanf("%lf %lf %lf\n", ye+i*3, ye+i*3 +1, ye+i*3 +2); |
---|
982 | scanf("%lf\n", yu+i); |
---|
983 | |
---|
984 | scanf("%d %d %d\n", n+i*3, n+i*3 +1, n+i*3 +2); |
---|
985 | |
---|
986 | scanf("%d %d %d\n", ne+i*3, ne+i*3 +1, ne+i*3 +2); |
---|
987 | |
---|
988 | |
---|
989 | scanf("%lf %lf %lf %lf %lf %lf\n", normals+i*6, normals+i*6+1, normals+i*6+2,normals+i*6+3, normals+i*6+4, normals+i*6+5); |
---|
990 | |
---|
991 | scanf("%lf %lf %lf\n", el+i*3, el+i*3 +1, el+i*3 +2); |
---|
992 | |
---|
993 | scanf("%lf\n", radii+i); |
---|
994 | scanf("%lf\n", a+i); |
---|
995 | scanf("%d\n", tri+i); |
---|
996 | |
---|
997 | scanf("%lf %lf\n", vc+i*6, vc+i*6 +1); |
---|
998 | scanf("%lf %lf\n", vc+i*6+2, vc+i*6 +3); |
---|
999 | scanf("%lf %lf\n", vc+i*6+4, vc+i*6 +5); |
---|
1000 | } |
---|
1001 | |
---|
1002 | for(i=0; i < N2; i++) |
---|
1003 | { |
---|
1004 | scanf("%lf\n", sb+i); |
---|
1005 | scanf("%lf\n", xb+i); |
---|
1006 | scanf("%lf\n", yb+i); |
---|
1007 | } |
---|
1008 | |
---|
1009 | cnt_m = 0; |
---|
1010 | for(i=0; i < N; i++) |
---|
1011 | { |
---|
1012 | if (max_speed_array[i] != test_max_speed_array[i]) |
---|
1013 | cnt_m ++; |
---|
1014 | } |
---|
1015 | printf("\n --> Test initial input %d\n", cnt_m); |
---|
1016 | |
---|
1017 | printf(" --> Enter Kernel\n"); |
---|
1018 | #pragma hmpp cf_central_single callsite |
---|
1019 | compute_fluxes_central_structure_cuda_single( |
---|
1020 | N, |
---|
1021 | N*3, |
---|
1022 | N*6, |
---|
1023 | N2, |
---|
1024 | |
---|
1025 | timestep, |
---|
1026 | n, |
---|
1027 | ne, |
---|
1028 | normals, |
---|
1029 | el, |
---|
1030 | radii, |
---|
1031 | a, |
---|
1032 | tri, |
---|
1033 | se, |
---|
1034 | xe, |
---|
1035 | ye, |
---|
1036 | be, |
---|
1037 | sb, |
---|
1038 | xb, |
---|
1039 | yb, |
---|
1040 | su, |
---|
1041 | xu, |
---|
1042 | yu, |
---|
1043 | max_speed_array, |
---|
1044 | |
---|
1045 | evolve_max_timestep, |
---|
1046 | g, |
---|
1047 | epsilon, |
---|
1048 | h0, |
---|
1049 | limiting_threshold, |
---|
1050 | optimise_dry_cells); |
---|
1051 | |
---|
1052 | /* |
---|
1053 | printf(" --> Enter C\n"); |
---|
1054 | compute_fluxes_central_structure_cuda_single( N, N*3, N*6, N2, timestep, n, ne, normals, el, radii, a, tri, se, xe, ye, be, sb, xb, yb, su, xu, yu, max_speed_array, evolve_max_timestep, g, epsilon, h0, limiting_threshold, optimise_dry_cells); |
---|
1055 | |
---|
1056 | */ |
---|
1057 | |
---|
1058 | |
---|
1059 | printf(" --> Enter Kernel\n"); |
---|
1060 | #pragma hmpp cf_central callsite |
---|
1061 | compute_fluxes_central_structure_CUDA( |
---|
1062 | N, |
---|
1063 | N*3, |
---|
1064 | N*6, |
---|
1065 | N2, |
---|
1066 | |
---|
1067 | timestep, |
---|
1068 | n, |
---|
1069 | ne, |
---|
1070 | normals, |
---|
1071 | el, |
---|
1072 | radii, |
---|
1073 | a, |
---|
1074 | tri, |
---|
1075 | se, |
---|
1076 | xe, |
---|
1077 | ye, |
---|
1078 | be, |
---|
1079 | sb, |
---|
1080 | xb, |
---|
1081 | yb, |
---|
1082 | su, |
---|
1083 | xu, |
---|
1084 | yu, |
---|
1085 | max_speed_array, |
---|
1086 | |
---|
1087 | evolve_max_timestep, |
---|
1088 | g, |
---|
1089 | epsilon, |
---|
1090 | h0, |
---|
1091 | limiting_threshold, |
---|
1092 | optimise_dry_cells); |
---|
1093 | |
---|
1094 | |
---|
1095 | |
---|
1096 | printf(" --> Enter original C\n"); |
---|
1097 | compute_fluxes_central_structure_CUDA( |
---|
1098 | N, N*3, N*6, N2, |
---|
1099 | test_timestep, n, ne, normals, el, radii, a, tri, |
---|
1100 | se, xe, ye, be, sb, xb, yb, |
---|
1101 | test_stage_explicit_update, |
---|
1102 | test_xmom_explicit_update, |
---|
1103 | test_ymom_explicit_update, |
---|
1104 | test_max_speed_array, |
---|
1105 | evolve_max_timestep, g, epsilon, h0, limiting_threshold, |
---|
1106 | optimise_dry_cells); |
---|
1107 | |
---|
1108 | |
---|
1109 | |
---|
1110 | for (cnt_s=cnt_x=cnt_y=cnt_m=i=0; i < N; i++) |
---|
1111 | { |
---|
1112 | if ( fabs(su[i] - test_stage_explicit_update[i]) >= TOLERANCE) |
---|
1113 | { |
---|
1114 | cnt_s++; |
---|
1115 | if (cnt_s <= 10) |
---|
1116 | printf(" sta %d %lf %lf\n", i, su[i], test_stage_explicit_update[i]); |
---|
1117 | } |
---|
1118 | if ( fabs(xu[i] - test_xmom_explicit_update[i]) >= TOLERANCE) |
---|
1119 | { |
---|
1120 | cnt_x++; |
---|
1121 | if (cnt_x <= 10) |
---|
1122 | printf(" xmom %d %lf %lf\n", i, xu[i], test_xmom_explicit_update[i]); |
---|
1123 | } |
---|
1124 | if ( fabs(yu[i] - test_ymom_explicit_update[i]) >= TOLERANCE) |
---|
1125 | { |
---|
1126 | cnt_y++; |
---|
1127 | if (cnt_y <= 10) |
---|
1128 | printf(" ymom %d %lf %lf\n", i, yu[i], test_ymom_explicit_update[i]); |
---|
1129 | } |
---|
1130 | if ( fabs(max_speed_array[i] -test_max_speed_array[i]) >=TOLERANCE) |
---|
1131 | { |
---|
1132 | cnt_m++; |
---|
1133 | if (cnt_m <= 10) |
---|
1134 | printf(" max %d %lf %lf\n", i, max_speed_array[i], test_max_speed_array[i]); |
---|
1135 | } |
---|
1136 | } |
---|
1137 | printf("se:%d xe:%d ye:%d max:%d errors found\n", cnt_s, cnt_x, cnt_y, cnt_m); |
---|
1138 | } |
---|
1139 | #endif |
---|