1 | //#define UNSORTED_DOMAIN |
---|
2 | #define USING_MULTI_FUNCTION |
---|
3 | //#define REARRANGED_DOMAIN |
---|
4 | #include "hmpp_fun.h" |
---|
5 | |
---|
6 | |
---|
7 | #ifdef UNSORTED_DOMAIN |
---|
8 | __device__ void spe_bubble_sort(int* _list , long* neighbours, int k) |
---|
9 | { |
---|
10 | int temp; |
---|
11 | if ( neighbours[_list[2]]>=0 and neighbours[ _list[2] ] < k and (neighbours[_list[1]]<0 or neighbours[_list[2]]<neighbours[_list[1]]) ) |
---|
12 | { |
---|
13 | temp = _list[2]; |
---|
14 | _list[2] = _list[1]; |
---|
15 | _list[1] = temp; |
---|
16 | } |
---|
17 | if ( neighbours[_list[1]]>=0 and neighbours[ _list[1] ] < k and (neighbours[_list[0]]<0 or neighbours[_list[1]]<neighbours[_list[0]]) ) |
---|
18 | { |
---|
19 | temp = _list[1]; |
---|
20 | _list[1] = _list[0]; |
---|
21 | _list[0] = temp; |
---|
22 | } |
---|
23 | if ( neighbours[_list[2]]>=0 and neighbours[ _list[2] ] < k and (neighbours[_list[1]]<0 or neighbours[_list[2]]<neighbours[_list[1]]) ) |
---|
24 | { |
---|
25 | temp = _list[2]; |
---|
26 | _list[2] = _list[1]; |
---|
27 | _list[1] = temp; |
---|
28 | } |
---|
29 | } |
---|
30 | #endif |
---|
31 | |
---|
32 | |
---|
33 | #ifdef USING_MULTI_FUNCTION |
---|
34 | |
---|
35 | #define Dtimestep 0 |
---|
36 | #define Depsilon 1 |
---|
37 | #define DH0 2 |
---|
38 | #define Dg 3 |
---|
39 | #define Doptimise_dry_cells 4 |
---|
40 | #define Dcall 5 |
---|
41 | #define Dnumber_of_elements |
---|
42 | |
---|
43 | |
---|
44 | int _rotate(double *q, double n1, double n2) |
---|
45 | { |
---|
46 | /*Rotate the momentum component q (q[1], q[2]) |
---|
47 | from x,y coordinates to coordinates based on normal vector (n1, n2). |
---|
48 | |
---|
49 | Result is returned in array 3x1 r |
---|
50 | To rotate in opposite direction, call rotate with (q, n1, -n2) |
---|
51 | |
---|
52 | Contents of q are changed by this function */ |
---|
53 | |
---|
54 | |
---|
55 | double q1, q2; |
---|
56 | |
---|
57 | q1 = q[1]; // uh momentum |
---|
58 | q2 = q[2]; // vh momentum |
---|
59 | |
---|
60 | // Rotate |
---|
61 | q[1] = n1 * q1 + n2*q2; |
---|
62 | q[2] = -n2 * q1 + n1*q2; |
---|
63 | |
---|
64 | return 0; |
---|
65 | } |
---|
66 | |
---|
67 | |
---|
68 | |
---|
69 | double _compute_speed(double *uh, |
---|
70 | double *h, |
---|
71 | double epsilon, |
---|
72 | double h0, |
---|
73 | double limiting_threshold) |
---|
74 | { |
---|
75 | double u; |
---|
76 | |
---|
77 | if (*h < limiting_threshold) { |
---|
78 | // Apply limiting of speeds according to the ANUGA manual |
---|
79 | if (*h < epsilon) { |
---|
80 | *h = 0.0; // Could have been negative |
---|
81 | u = 0.0; |
---|
82 | } else { |
---|
83 | u = *uh / (*h + h0 / *h); |
---|
84 | } |
---|
85 | |
---|
86 | |
---|
87 | // Adjust momentum to be consistent with speed |
---|
88 | *uh = u * *h; |
---|
89 | } else { |
---|
90 | // We are in deep water - no need for limiting |
---|
91 | u = *uh / *h; |
---|
92 | } |
---|
93 | |
---|
94 | return u; |
---|
95 | } |
---|
96 | |
---|
97 | |
---|
98 | |
---|
99 | int _flux_function_central(double *q_left, double *q_right, |
---|
100 | double z_left, double z_right, |
---|
101 | double n1, double n2, |
---|
102 | double epsilon, |
---|
103 | double h0, |
---|
104 | double limiting_threshold, |
---|
105 | double g, |
---|
106 | double *edgeflux, double *max_speed) |
---|
107 | { |
---|
108 | /*Compute fluxes between volumes for the shallow water wave equation |
---|
109 | cast in terms of the 'stage', w = h+z using |
---|
110 | the 'central scheme' as described in |
---|
111 | |
---|
112 | Kurganov, Noelle, Petrova. 'Semidiscrete Central-Upwind Schemes For |
---|
113 | Hyperbolic Conservation Laws and Hamilton-Jacobi Equations'. |
---|
114 | Siam J. Sci. Comput. Vol. 23, No. 3, pp. 707-740. |
---|
115 | |
---|
116 | The implemented formula is given in equation (3.15) on page 714 |
---|
117 | */ |
---|
118 | |
---|
119 | int i; |
---|
120 | |
---|
121 | double w_left, h_left, uh_left, vh_left, u_left; |
---|
122 | double w_right, h_right, uh_right, vh_right, u_right; |
---|
123 | double s_min, s_max, soundspeed_left, soundspeed_right; |
---|
124 | double denom, inverse_denominator, z; |
---|
125 | double temp; |
---|
126 | |
---|
127 | // Workspace (allocate once, use many) |
---|
128 | double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3]; |
---|
129 | |
---|
130 | // Copy conserved quantities to protect from modification |
---|
131 | q_left_rotated[0] = q_left[0]; |
---|
132 | q_right_rotated[0] = q_right[0]; |
---|
133 | q_left_rotated[1] = q_left[1]; |
---|
134 | q_right_rotated[1] = q_right[1]; |
---|
135 | q_left_rotated[2] = q_left[2]; |
---|
136 | q_right_rotated[2] = q_right[2]; |
---|
137 | |
---|
138 | // Align x- and y-momentum with x-axis |
---|
139 | //_rotate(q_left_rotated, n1, n2); |
---|
140 | q_left_rotated[1] = n1*q_left[1] + n2*q_left[2]; |
---|
141 | q_left_rotated[2] = -n2*q_left[1] + n1*q_left[2]; |
---|
142 | |
---|
143 | //_rotate(q_right_rotated, n1, n2); |
---|
144 | q_right_rotated[1] = n1*q_right[1] + n2*q_right[2]; |
---|
145 | q_right_rotated[2] = -n2*q_right[1] + n1*q_right[2]; |
---|
146 | |
---|
147 | |
---|
148 | if (fabs(z_left - z_right) > 1.0e-10) { |
---|
149 | //report_python_error(AT, "Discontinuous Elevation"); |
---|
150 | return 0.0; |
---|
151 | } |
---|
152 | z = 0.5 * (z_left + z_right); // Average elevation values. |
---|
153 | |
---|
154 | // Compute speeds in x-direction |
---|
155 | w_left = q_left_rotated[0]; |
---|
156 | h_left = w_left - z; |
---|
157 | uh_left = q_left_rotated[1]; |
---|
158 | u_left = _compute_speed(&uh_left, &h_left, |
---|
159 | epsilon, h0, limiting_threshold); |
---|
160 | /* |
---|
161 | if (h_left < limiting_threshold) { |
---|
162 | if (h_left < epsilon) { |
---|
163 | h_left = 0.0; // Could have been negative |
---|
164 | u_left = 0.0; |
---|
165 | } else { |
---|
166 | u_left = uh_left/(h_left + h0/ h_left); |
---|
167 | } |
---|
168 | |
---|
169 | uh_left = u_left * h_left; |
---|
170 | } else { |
---|
171 | u_left = uh_left/ h_left; |
---|
172 | } |
---|
173 | */ |
---|
174 | w_right = q_right_rotated[0]; |
---|
175 | h_right = w_right - z; |
---|
176 | uh_right = q_right_rotated[1]; |
---|
177 | u_right = _compute_speed(&uh_right, &h_right, |
---|
178 | epsilon, h0, limiting_threshold); |
---|
179 | /* |
---|
180 | if (h_right < limiting_threshold) { |
---|
181 | |
---|
182 | if (h_right < epsilon) { |
---|
183 | h_right = 0.0; // Could have been negative |
---|
184 | u_right = 0.0; |
---|
185 | } else { |
---|
186 | u_right = uh_right/(h_right + h0/ h_right); |
---|
187 | } |
---|
188 | |
---|
189 | uh_right = u_right * h_right; |
---|
190 | } else { |
---|
191 | u_right = uh_right/ h_right; |
---|
192 | } |
---|
193 | */ |
---|
194 | // Momentum in y-direction |
---|
195 | vh_left = q_left_rotated[2]; |
---|
196 | vh_right = q_right_rotated[2]; |
---|
197 | |
---|
198 | // Limit y-momentum if necessary |
---|
199 | // Leaving this out, improves speed significantly (Ole 27/5/2009) |
---|
200 | // All validation tests pass, so do we really need it anymore? |
---|
201 | _compute_speed(&vh_left, &h_left, |
---|
202 | epsilon, h0, limiting_threshold); |
---|
203 | _compute_speed(&vh_right, &h_right, |
---|
204 | epsilon, h0, limiting_threshold); |
---|
205 | |
---|
206 | // Maximal and minimal wave speeds |
---|
207 | soundspeed_left = sqrt(g * h_left); |
---|
208 | soundspeed_right = sqrt(g * h_right); |
---|
209 | |
---|
210 | // Code to use fast square root optimisation if desired. |
---|
211 | // Timings on AMD 64 for the Okushiri profile gave the following timings |
---|
212 | // |
---|
213 | // SQRT Total Flux |
---|
214 | //============================= |
---|
215 | // |
---|
216 | // Ref 405s 152s |
---|
217 | // Fast (dbl) 453s 173s |
---|
218 | // Fast (sng) 437s 171s |
---|
219 | // |
---|
220 | // Consequently, there is currently (14/5/2009) no reason to use this |
---|
221 | // approximation. |
---|
222 | |
---|
223 | //soundspeed_left = fast_squareroot_approximation(g*h_left); |
---|
224 | //soundspeed_right = fast_squareroot_approximation(g*h_right); |
---|
225 | |
---|
226 | s_max = fmax(u_left + soundspeed_left, u_right + soundspeed_right); |
---|
227 | if (s_max < 0.0) { |
---|
228 | s_max = 0.0; |
---|
229 | } |
---|
230 | |
---|
231 | s_min = fmin(u_left - soundspeed_left, u_right - soundspeed_right); |
---|
232 | if (s_min > 0.0) { |
---|
233 | s_min = 0.0; |
---|
234 | } |
---|
235 | |
---|
236 | // Flux formulas |
---|
237 | flux_left[0] = u_left*h_left; |
---|
238 | flux_left[1] = u_left * uh_left + 0.5 * g * h_left*h_left; |
---|
239 | flux_left[2] = u_left*vh_left; |
---|
240 | |
---|
241 | flux_right[0] = u_right*h_right; |
---|
242 | flux_right[1] = u_right * uh_right + 0.5 * g * h_right*h_right; |
---|
243 | flux_right[2] = u_right*vh_right; |
---|
244 | |
---|
245 | // Flux computation |
---|
246 | denom = s_max - s_min; |
---|
247 | if (denom < epsilon) { // FIXME (Ole): Try using h0 here |
---|
248 | //memset(edgeflux, 0, 3 * sizeof (double)); |
---|
249 | edgeflux[0]=0; |
---|
250 | edgeflux[1]=0; |
---|
251 | edgeflux[2]=0; |
---|
252 | *max_speed = 0.0; |
---|
253 | } |
---|
254 | else { |
---|
255 | inverse_denominator = 1.0 / denom; |
---|
256 | #pragma hmppcg noparallel |
---|
257 | for (i = 0; i < 3; i++) { |
---|
258 | edgeflux[i] = s_max * flux_left[i] - s_min * flux_right[i]; |
---|
259 | edgeflux[i] += s_max * s_min * (q_right_rotated[i] - q_left_rotated[i]); |
---|
260 | edgeflux[i] *= inverse_denominator; |
---|
261 | } |
---|
262 | |
---|
263 | // Maximal wavespeed |
---|
264 | *max_speed = fmax(fabs(s_max), fabs(s_min)); |
---|
265 | |
---|
266 | // Rotate back |
---|
267 | //_rotate(edgeflux, n1, -n2); |
---|
268 | temp = edgeflux[1]; |
---|
269 | edgeflux[1] = n1* temp -n2*edgeflux[2]; |
---|
270 | edgeflux[2] = n2*temp + n1*edgeflux[2]; |
---|
271 | } |
---|
272 | |
---|
273 | return 0; |
---|
274 | } |
---|
275 | |
---|
276 | |
---|
277 | |
---|
278 | |
---|
279 | |
---|
280 | |
---|
281 | /*****************************************/ |
---|
282 | /* The CUDA compute fluex function */ |
---|
283 | /*****************************************/ |
---|
284 | |
---|
285 | |
---|
286 | void compute_fluxes_central_structure_CUDA( |
---|
287 | int N, |
---|
288 | int N3, |
---|
289 | int N6, |
---|
290 | int N2, |
---|
291 | |
---|
292 | double timestep[N], |
---|
293 | long neighbours[N3], |
---|
294 | long neighbour_edges[N3], |
---|
295 | double normals[N6], |
---|
296 | double edgelengths[N3], |
---|
297 | double radii[N], |
---|
298 | double areas[N], |
---|
299 | long tri_full_flag[N], |
---|
300 | double stage_edge_values[N3], |
---|
301 | double xmom_edge_values[N3], |
---|
302 | double ymom_edge_values[N3], |
---|
303 | double bed_edge_values[N3], |
---|
304 | double stage_boundary_values[N2], |
---|
305 | double xmom_boundary_values[N2], |
---|
306 | double ymom_boundary_values[N2], |
---|
307 | double stage_explicit_update[N], |
---|
308 | double xmom_explicit_update[N], |
---|
309 | double ymom_explicit_update[N], |
---|
310 | double max_speed_array[N], |
---|
311 | |
---|
312 | double evolve_max_timestep, |
---|
313 | double g, |
---|
314 | double epsilon, |
---|
315 | double h0, |
---|
316 | double limiting_threshold, |
---|
317 | int optimise_dry_cells) |
---|
318 | { |
---|
319 | double max_speed, max_speed_total=0 , length, inv_area, zl, zr; |
---|
320 | |
---|
321 | int k, i, m, n; |
---|
322 | int ki, nm = 0, ki2; // Index shorthands |
---|
323 | |
---|
324 | double ql[3], qr[3], edgeflux[3]; |
---|
325 | |
---|
326 | |
---|
327 | |
---|
328 | #pragma hmppcg gridify(k), private(max_speed, max_speed_total, length,& |
---|
329 | #pragma hmppcg & inv_area, zl, zr, i, m, n, ki, nm, ki2, ql, qr, & |
---|
330 | #pragma hmppcg & edgeflux), global(timestep, neighbours, & |
---|
331 | #pragma hmppcg & neighbour_edges, normals, edgelengths, radii, & |
---|
332 | #pragma hmppcg & areas, tri_full_flag, stage_edge_values, & |
---|
333 | #pragma hmppcg & xmom_boundary_values, ymom_boundary_values, & |
---|
334 | #pragma hmppcg & stage_explicit_update, xmom_explicit_update, & |
---|
335 | #pragma hmppcg & ymom_explicit_update, max_speed_array, & |
---|
336 | #pragma hmppcg & evolve_max_timestep, g, epsilon, h0, & |
---|
337 | #pragma hmppcg & limiting_threshold, optimise_dry_cells) |
---|
338 | for (k=0; k<N; k++) |
---|
339 | { |
---|
340 | for ( i = 0; i < 3; i++) { |
---|
341 | |
---|
342 | #ifndef REARRANGED_DOMAIN |
---|
343 | ki = k * 3 + i; // Linear index to edge i of triangle k |
---|
344 | #else |
---|
345 | ki = k + N *i; |
---|
346 | #endif |
---|
347 | timestep[k] = evolve_max_timestep; |
---|
348 | |
---|
349 | stage_explicit_update[k] = 0; |
---|
350 | xmom_explicit_update[k] = 0; |
---|
351 | ymom_explicit_update[k] = 0; |
---|
352 | |
---|
353 | n = neighbours[ki]; |
---|
354 | |
---|
355 | ql[0] = stage_edge_values[ki]; |
---|
356 | ql[1] = xmom_edge_values[ki]; |
---|
357 | ql[2] = ymom_edge_values[ki]; |
---|
358 | zl = bed_edge_values[ki]; |
---|
359 | |
---|
360 | if (n < 0) { |
---|
361 | m = -n - 1; // Convert negative flag to boundary index |
---|
362 | |
---|
363 | qr[0] = stage_boundary_values[m]; |
---|
364 | qr[1] = xmom_boundary_values[m]; |
---|
365 | qr[2] = ymom_boundary_values[m]; |
---|
366 | zr = zl; // Extend bed elevation to boundary |
---|
367 | } else { |
---|
368 | m = neighbour_edges[ki]; |
---|
369 | #ifndef REARRANGED_DOMAIN |
---|
370 | nm = n * 3 + m; // Linear index (triangle n, edge m) |
---|
371 | #else |
---|
372 | nm = n + N*m; |
---|
373 | #endif |
---|
374 | qr[0] = stage_edge_values[nm]; |
---|
375 | qr[1] = xmom_edge_values[nm]; |
---|
376 | qr[2] = ymom_edge_values[nm]; |
---|
377 | zr = bed_edge_values[nm]; |
---|
378 | } |
---|
379 | |
---|
380 | if (optimise_dry_cells){ |
---|
381 | if ( fabs(ql[0] - zl) < epsilon && |
---|
382 | fabs(qr[0] - zr) < epsilon) { |
---|
383 | max_speed = 0.0; |
---|
384 | continue; |
---|
385 | } |
---|
386 | } |
---|
387 | |
---|
388 | #ifndef REARRANGED_DOMAIN |
---|
389 | ki2 = 2 * ki; //k*6 + i*2 |
---|
390 | _flux_function_central(ql, qr, zl, zr, |
---|
391 | normals[ki2], normals[ki2 + 1], |
---|
392 | epsilon, h0, limiting_threshold, g, |
---|
393 | edgeflux, &max_speed); |
---|
394 | #else |
---|
395 | ki2 = k + N*i*2; |
---|
396 | _flux_function_central(ql, qr, zl, zr, |
---|
397 | normals[ki2], normals[ki2 + N], |
---|
398 | epsilon, h0, limiting_threshold, g, |
---|
399 | edgeflux, &max_speed); |
---|
400 | #endif |
---|
401 | |
---|
402 | |
---|
403 | length = edgelengths[ki]; |
---|
404 | edgeflux[0] *= length; |
---|
405 | edgeflux[1] *= length; |
---|
406 | edgeflux[2] *= length; |
---|
407 | |
---|
408 | |
---|
409 | |
---|
410 | stage_explicit_update[k] -= edgeflux[0]; |
---|
411 | xmom_explicit_update[k] -= edgeflux[1]; |
---|
412 | ymom_explicit_update[k] -= edgeflux[2]; |
---|
413 | |
---|
414 | if (tri_full_flag[k] == 1) { |
---|
415 | if ( max_speed > epsilon) { |
---|
416 | timestep[k] = fmin(timestep[k], radii[k] / max_speed); |
---|
417 | |
---|
418 | if (n >= 0) { |
---|
419 | timestep[k] = fmin(timestep[k], radii[n] / max_speed); |
---|
420 | } |
---|
421 | } |
---|
422 | } |
---|
423 | |
---|
424 | if (n < 0 || n > k) |
---|
425 | max_speed_total = fmax(max_speed_total, max_speed); |
---|
426 | } // End edge i (and neighbour n) |
---|
427 | |
---|
428 | |
---|
429 | |
---|
430 | inv_area = 1.0 / areas[k]; |
---|
431 | stage_explicit_update[k] *= inv_area; |
---|
432 | xmom_explicit_update[k] *= inv_area; |
---|
433 | ymom_explicit_update[k] *= inv_area; |
---|
434 | |
---|
435 | max_speed_array[k] = max_speed_total; |
---|
436 | } |
---|
437 | } |
---|
438 | |
---|
439 | |
---|
440 | #endif |
---|
441 | |
---|
442 | /*****************************************/ |
---|
443 | /* Rearrange the domain variable order */ |
---|
444 | /* so as to achieve memory corelasing */ |
---|
445 | /* also combination all subfunctions */ |
---|
446 | /*****************************************/ |
---|
447 | |
---|
448 | #ifndef USING_MULTI_FUNCTION |
---|
449 | //__global__ void _flux_function_central_2( |
---|
450 | void compute_fluxes_central_structure_cuda_single( |
---|
451 | int N, |
---|
452 | double evolve_max_timestep, |
---|
453 | double g, |
---|
454 | double epsilon, |
---|
455 | double h0, |
---|
456 | double limiting_threshold, |
---|
457 | int optimise_dry_cells, |
---|
458 | |
---|
459 | double * timestep, |
---|
460 | long * neighbours, |
---|
461 | long * neighbour_edges, |
---|
462 | double * normals, |
---|
463 | double * edgelengths, |
---|
464 | double * radii, |
---|
465 | double * areas, |
---|
466 | long * tri_full_flag, |
---|
467 | double * stage_edge_values, |
---|
468 | double * xmom_edge_values, |
---|
469 | double * ymom_edge_values, |
---|
470 | double * bed_edge_values, |
---|
471 | double * stage_boundary_values, |
---|
472 | double * xmom_boundary_values, |
---|
473 | double * ymom_boundary_values, |
---|
474 | double * stage_explicit_update, |
---|
475 | double * xmom_explicit_update, |
---|
476 | double * ymom_explicit_update, |
---|
477 | double * max_speed_array) |
---|
478 | { |
---|
479 | int j; |
---|
480 | |
---|
481 | double w_left, h_left, uh_left, vh_left, u_left; |
---|
482 | double w_right, h_right, uh_right, vh_right, u_right; |
---|
483 | double s_min, s_max, soundspeed_left, soundspeed_right; |
---|
484 | double denom, inverse_denominator, z; |
---|
485 | double temp; |
---|
486 | |
---|
487 | // Workspace (allocate once, use many) |
---|
488 | double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3]; |
---|
489 | |
---|
490 | |
---|
491 | const long k = |
---|
492 | threadIdx.x+threadIdx.y*blockDim.x+ |
---|
493 | (blockIdx.x+blockIdx.y*gridDim.x)*blockDim.x*blockDim.y; |
---|
494 | |
---|
495 | double q_left[3], q_right[3]; |
---|
496 | double z_left, z_right; |
---|
497 | double n1, n2; |
---|
498 | double edgeflux[3], max_speed; |
---|
499 | double length, inv_area; |
---|
500 | |
---|
501 | int i, m, n; |
---|
502 | int ki, nm; |
---|
503 | |
---|
504 | __shared__ double sh_data[32*9]; |
---|
505 | |
---|
506 | if ( k >= N) |
---|
507 | return; |
---|
508 | |
---|
509 | timestep[k] = evolve_max_timestep; |
---|
510 | |
---|
511 | |
---|
512 | #ifdef UNSORTED_DOMAIN |
---|
513 | int b[3]={0,1,2}, l; |
---|
514 | spe_bubble_sort( b, neighbours+k*3, k); |
---|
515 | for (l = 0; l < 3; l++) { |
---|
516 | i = b[l]; |
---|
517 | #else |
---|
518 | for (i=0; i<3; i++) { |
---|
519 | #endif |
---|
520 | |
---|
521 | #ifdef REARRANGED_DOMAIN |
---|
522 | ki = k + i*N; |
---|
523 | #else |
---|
524 | ki = k * 3 + i; |
---|
525 | #endif |
---|
526 | n = neighbours[ki]; |
---|
527 | |
---|
528 | q_left[0] = stage_edge_values[ki]; |
---|
529 | q_left[1] = xmom_edge_values[ki]; |
---|
530 | q_left[2] = ymom_edge_values[ki]; |
---|
531 | z_left = bed_edge_values[ki]; |
---|
532 | |
---|
533 | if (n<0) { |
---|
534 | m= -n -1; |
---|
535 | |
---|
536 | q_right[0] = stage_boundary_values[m]; |
---|
537 | q_right[1] = xmom_boundary_values[m]; |
---|
538 | q_right[2] = ymom_boundary_values[m]; |
---|
539 | z_right = z_left; |
---|
540 | } else { |
---|
541 | m = neighbour_edges[ki]; |
---|
542 | #ifdef REARRANGED_DOMAIN |
---|
543 | nm = n + m*N; |
---|
544 | #else |
---|
545 | nm = n *3 + m; |
---|
546 | #endif |
---|
547 | |
---|
548 | q_right[0] = stage_edge_values[nm]; |
---|
549 | q_right[1] = xmom_edge_values[nm]; |
---|
550 | q_right[2] = ymom_edge_values[nm]; |
---|
551 | z_right = bed_edge_values[nm]; |
---|
552 | } |
---|
553 | |
---|
554 | if(optimise_dry_cells) { |
---|
555 | if(fabs(q_left[0] - z_left) < epsilon && |
---|
556 | fabs(q_right[0] - z_right) < epsilon ) { |
---|
557 | max_speed = 0.0; |
---|
558 | continue; |
---|
559 | } |
---|
560 | } |
---|
561 | |
---|
562 | |
---|
563 | #ifdef REARRANGED_DOMAIN |
---|
564 | n1 = normals[k + 2*i*N]; |
---|
565 | n2 = normals[k + (2*i+1)*N]; |
---|
566 | #else |
---|
567 | n1 = normals[2*ki]; |
---|
568 | n2 = normals[2*ki + 1]; |
---|
569 | #endif |
---|
570 | |
---|
571 | ///////////////////////////////////////////////////////// |
---|
572 | |
---|
573 | |
---|
574 | // Copy conserved quantities to protect from modification |
---|
575 | q_left_rotated[0] = q_left[0]; |
---|
576 | q_right_rotated[0] = q_right[0]; |
---|
577 | q_left_rotated[1] = q_left[1]; |
---|
578 | q_right_rotated[1] = q_right[1]; |
---|
579 | q_left_rotated[2] = q_left[2]; |
---|
580 | q_right_rotated[2] = q_right[2]; |
---|
581 | |
---|
582 | // Align x- and y-momentum with x-axis |
---|
583 | //_rotate(q_left_rotated, n1, n2); |
---|
584 | q_left_rotated[1] = n1*q_left[1] + n2*q_left[2]; |
---|
585 | q_left_rotated[2] = -n2*q_left[1] + n1*q_left[2]; |
---|
586 | |
---|
587 | //_rotate(q_right_rotated, n1, n2); |
---|
588 | q_right_rotated[1] = n1*q_right[1] + n2*q_right[2]; |
---|
589 | q_right_rotated[2] = -n2*q_right[1] + n1*q_right[2]; |
---|
590 | |
---|
591 | |
---|
592 | if (fabs(z_left - z_right) > 1.0e-10) { |
---|
593 | //report_python_error(AT, "Discontinuous Elevation"); |
---|
594 | //return 0.0; |
---|
595 | } |
---|
596 | z = 0.5 * (z_left + z_right); // Average elevation values. |
---|
597 | |
---|
598 | // Compute speeds in x-direction |
---|
599 | w_left = q_left_rotated[0]; |
---|
600 | h_left = w_left - z; |
---|
601 | uh_left = q_left_rotated[1]; |
---|
602 | //u_left = _compute_speed(&uh_left, &h_left, |
---|
603 | // epsilon, h0, limiting_threshold); |
---|
604 | |
---|
605 | if (h_left < limiting_threshold) { |
---|
606 | |
---|
607 | if (h_left < epsilon) { |
---|
608 | h_left = 0.0; // Could have been negative |
---|
609 | u_left = 0.0; |
---|
610 | } else { |
---|
611 | u_left = uh_left/(h_left + h0/ h_left); |
---|
612 | } |
---|
613 | |
---|
614 | uh_left = u_left * h_left; |
---|
615 | } else { |
---|
616 | u_left = uh_left/ h_left; |
---|
617 | } |
---|
618 | |
---|
619 | w_right = q_right_rotated[0]; |
---|
620 | h_right = w_right - z; |
---|
621 | uh_right = q_right_rotated[1]; |
---|
622 | //u_right = _compute_speed(&uh_right, &h_right, |
---|
623 | // epsilon, h0, limiting_threshold); |
---|
624 | |
---|
625 | if (h_right < limiting_threshold) { |
---|
626 | |
---|
627 | if (h_right < epsilon) { |
---|
628 | h_right = 0.0; // Could have been negative |
---|
629 | u_right = 0.0; |
---|
630 | } else { |
---|
631 | u_right = uh_right/(h_right + h0/ h_right); |
---|
632 | } |
---|
633 | |
---|
634 | uh_right = u_right * h_right; |
---|
635 | } else { |
---|
636 | u_right = uh_right/ h_right; |
---|
637 | } |
---|
638 | |
---|
639 | |
---|
640 | |
---|
641 | |
---|
642 | // Momentum in y-direction |
---|
643 | vh_left = q_left_rotated[2]; |
---|
644 | vh_right = q_right_rotated[2]; |
---|
645 | |
---|
646 | soundspeed_left = sqrt(g * h_left); |
---|
647 | soundspeed_right = sqrt(g * h_right); |
---|
648 | |
---|
649 | s_max = max(u_left + soundspeed_left, u_right + soundspeed_right); |
---|
650 | if (s_max < 0.0) { |
---|
651 | s_max = 0.0; |
---|
652 | } |
---|
653 | |
---|
654 | s_min = min(u_left - soundspeed_left, u_right - soundspeed_right); |
---|
655 | if (s_min > 0.0) { |
---|
656 | s_min = 0.0; |
---|
657 | } |
---|
658 | |
---|
659 | // Flux formulas |
---|
660 | flux_left[0] = u_left*h_left; |
---|
661 | flux_left[1] = u_left * uh_left + 0.5 * g * h_left*h_left; |
---|
662 | flux_left[2] = u_left*vh_left; |
---|
663 | |
---|
664 | flux_right[0] = u_right*h_right; |
---|
665 | flux_right[1] = u_right * uh_right + 0.5 * g * h_right*h_right; |
---|
666 | flux_right[2] = u_right*vh_right; |
---|
667 | |
---|
668 | // Flux computation |
---|
669 | denom = s_max - s_min; |
---|
670 | if (denom < epsilon) { // FIXME (Ole): Try using h0 here |
---|
671 | memset(edgeflux, 0, 3 * sizeof (double)); |
---|
672 | max_speed = 0.0; |
---|
673 | } |
---|
674 | else { |
---|
675 | inverse_denominator = 1.0 / denom; |
---|
676 | for (j = 0; j < 3; j++) { |
---|
677 | edgeflux[j] = s_max * flux_left[j] - s_min * flux_right[j]; |
---|
678 | edgeflux[j] += s_max * s_min * (q_right_rotated[j] - q_left_rotated[j]); |
---|
679 | edgeflux[j] *= inverse_denominator; |
---|
680 | } |
---|
681 | |
---|
682 | // Maximal wavespeed |
---|
683 | max_speed = max(fabs(s_max), fabs(s_min)); |
---|
684 | |
---|
685 | // Rotate back |
---|
686 | //_rotate(edgeflux, n1, -n2); |
---|
687 | temp = edgeflux[1]; |
---|
688 | edgeflux[1] = n1* temp -n2*edgeflux[2]; |
---|
689 | edgeflux[2] = n2*temp + n1*edgeflux[2]; |
---|
690 | } |
---|
691 | |
---|
692 | ////////////////////////////////////////////////////// |
---|
693 | |
---|
694 | length = edgelengths[ki]; |
---|
695 | edgeflux[0] *= length; |
---|
696 | edgeflux[1] *= length; |
---|
697 | edgeflux[2] *= length; |
---|
698 | |
---|
699 | sh_data[T + i*B]= -edgeflux[0]; |
---|
700 | sh_data[T + (i+3)*B]= -edgeflux[1]; |
---|
701 | sh_data[T + (i+6)*B]= -edgeflux[2]; |
---|
702 | __syncthreads(); |
---|
703 | |
---|
704 | |
---|
705 | //stage_explicit_update[k] -= edgeflux[0]; |
---|
706 | //xmom_explicit_update[k] -= edgeflux[1]; |
---|
707 | //ymom_explicit_update[k] -= edgeflux[2]; |
---|
708 | |
---|
709 | if (tri_full_flag[k] == 1) { |
---|
710 | if (max_speed > epsilon) { |
---|
711 | timestep[k] = min(timestep[k], radii[k]/ max_speed); |
---|
712 | |
---|
713 | if (n>=0){ |
---|
714 | timestep[k] = min(timestep[k], radii[n]/max_speed); |
---|
715 | } |
---|
716 | } |
---|
717 | } |
---|
718 | |
---|
719 | } |
---|
720 | |
---|
721 | |
---|
722 | inv_area = 1.0 / areas[k]; |
---|
723 | //stage_explicit_update[k] *= inv_area; |
---|
724 | //xmom_explicit_update[k] *= inv_area; |
---|
725 | //ymom_explicit_update[k] *= inv_area; |
---|
726 | |
---|
727 | stage_explicit_update[k] = (sh_data[T] + sh_data[T+B]+sh_data[T+B*2]) * inv_area; |
---|
728 | |
---|
729 | xmom_explicit_update[k] = (sh_data[T+B*3] + sh_data[T+B*4]+sh_data[T+B*5]) * inv_area; |
---|
730 | |
---|
731 | ymom_explicit_update[k] = (sh_data[T+B*6] + sh_data[T+B*7]+sh_data[T+B*8]) * inv_area; |
---|
732 | |
---|
733 | max_speed_array[k] = max_speed; |
---|
734 | } |
---|
735 | #endif |
---|