Context Navigation

← Previous Changeset
Next Changeset →

Changeset 7164

Timestamp:

Jun 9, 2009, 9:30:09 PM (16 years ago)

Author:

ole

Message:

Ported optimisation of shallow_water_ext.c
from changeset:7143 into numpy branch.

Changes to test_shallow_water.py and test_data_manager.py
implemented in that changeset (7143) still
need to be ported.

File:

: 1 edited

branches/numpy/anuga/shallow_water/shallow_water_ext.c (modified) (30 diffs)

Legend:

: Unmodified
: Added
: Removed

branches/numpy/anuga/shallow_water/shallow_water_ext.c

-                      r6902
+                      r7164
 #include "math.h"
 #include <stdio.h>
 #include "numpy_shim.h"
 …
 // Computational function for rotation
+// FIXME: Perhaps inline this and profile
+// Tried to inline, but no speedup was achieved 27th May 2009 (Ole)
+// static inline int _rotate(double *q, double n1, double n2) {
 int _rotate(double *q, double n1, double n2) {
   /*Rotate the momentum component q (q[1], q[2])
 …
     if (dq1>=dq2){
       if (dq1>=0.0)
     *qmax=dq0+dq1;
+        *qmax=dq0+dq1;
       else
     *qmax=dq0;
+        *qmax=dq0;
       *qmin=dq0+dq2;
       if (*qmin>=0.0) *qmin = 0.0;
 …
     else{// dq1<dq2
       if (dq2>0)
     *qmax=dq0+dq2;
+        *qmax=dq0+dq2;
       else
     *qmax=dq0;
+        *qmax=dq0;
       *qmin=dq0+dq1;
       if (*qmin>=0.0) *qmin=0.0;
 …
     if (dq1<=dq2){
       if (dq1<0.0)
     *qmin=dq0+dq1;
+        *qmin=dq0+dq1;
       else
     *qmin=dq0;
+        *qmin=dq0;
       *qmax=dq0+dq2;
       if (*qmax<=0.0) *qmax=0.0;
 …
     else{// dq1>dq2
       if (dq2<0.0)
     *qmin=dq0+dq2;
+        *qmin=dq0+dq2;
       else
     *qmin=dq0;
+        *qmin=dq0;
       *qmax=dq0+dq1;
       if (*qmax<=0.0) *qmax=0.0;
 …
 // This is used by flux functions
 // Input parameters uh and h may be modified by this function.
 // FIXME: Perhaps inline this and profile
+// Tried to inline, but no speedup was achieved 27th May 2009 (Ole)
+//static inline double _compute_speed(double *uh,
 double _compute_speed(double *uh,
+              double *h,
+              double epsilon,
+              double h0) {
+                      double *h,
+                      double epsilon,
+                      double h0,
+                      double limiting_threshold) {
   double u;
+  if (*h < epsilon) {
+    *h = 0.0;  //Could have been negative
+    u = 0.0;
+  if (*h < limiting_threshold) {
+    // Apply limiting of speeds according to the ANUGA manual
+    if (*h < epsilon) {
+      *h = 0.0;  // Could have been negative
+      u = 0.0;
+    } else {
+      u = *uh/(*h + h0/ *h);
+    }
+    // Adjust momentum to be consistent with speed
+    *uh = u * *h;
   } else {
+    u = *uh/(*h + h0/ *h);
+    // We are in deep water - no need for limiting
+    u = *uh/ *h;
+  }
-  // Adjust momentum to be consistent with speed
-  *uh = u * *h;
   return u;
 …
 // Optimised squareroot computation (double version, slower)
+// Optimised squareroot computation (double version)
 double Xfast_squareroot_approximation(double number) {
   double x;
 …
                            double z_left, double z_right,
                            double n1, double n2,
+                           double epsilon, double H0, double g,
+                           double epsilon,
+                           double h0,
+                           double limiting_threshold,
+                           double g,
                            double *edgeflux, double *max_speed)
+{
 …
   double w_left, h_left, uh_left, vh_left, u_left;
   double w_right, h_right, uh_right, vh_right, u_right;
-  double v_left, v_right;
   double s_min, s_max, soundspeed_left, soundspeed_right;
   double denom, inverse_denominator, z;
 …
   static double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3];
-  double h0 = H0*H0; // This ensures a good balance when h approaches H0.
-                     // But evidence suggests that h0 can be as little as
-             // epsilon!
   // Copy conserved quantities to protect from modification
   q_left_rotated[0] = q_left[0];
 …
   z = 0.5*(z_left + z_right); // Average elevation values.
+                            // Even though this will nominally allow for discontinuities
+                            // in the elevation data, there is currently no numerical
+                            // support for this so results may be strange near jumps in the bed.
+                              // Even though this will nominally allow
+                              // for discontinuities in the elevation data,
+                              // there is currently no numerical support for
+                              // this so results may be strange near
+                              // jumps in the bed.
   // Compute speeds in x-direction
 …
   h_left = w_left - z;
   uh_left = q_left_rotated[1];
+  u_left = _compute_speed(&uh_left, &h_left, epsilon, h0);
+  u_left = _compute_speed(&uh_left, &h_left,
+                          epsilon, h0, limiting_threshold);
   w_right = q_right_rotated[0];
   h_right = w_right - z;
   uh_right = q_right_rotated[1];
+  u_right = _compute_speed(&uh_right, &h_right, epsilon, h0);
+  u_right = _compute_speed(&uh_right, &h_right,
+                           epsilon, h0, limiting_threshold);
   // Momentum in y-direction
 …
   vh_right = q_right_rotated[2];
+  // Limit y-momentum if necessary
+  v_left = _compute_speed(&vh_left, &h_left, epsilon, h0);
+  v_right = _compute_speed(&vh_right, &h_right, epsilon, h0);
+  // Limit y-momentum if necessary
+  // Leaving this out, improves speed significantly (Ole 27/5/2009)
+  // All validation tests pass, so do we really need it anymore?
+  _compute_speed(&vh_left, &h_left,
+                 epsilon, h0, limiting_threshold);
+  _compute_speed(&vh_right, &h_right,
+                 epsilon, h0, limiting_threshold);
   // Maximal and minimal wave speeds
 …
   // Code to use fast square root optimisation if desired.
+  // Timings on AMD 64 for the Okushiri profile gave the following results
+  //
+  // SQRT           Total    Flux
+  //=============================
+  //
+  // Ref            405s     152s
+  // Fast (dbl)     453s     173s
+  // Fast (sng)     437s     171s
+  //
+  // Consequently, there is currently (14/5/2009) no reason to use this
+  // approximation.
   //soundspeed_left  = fast_squareroot_approximation(g*h_left);
   //soundspeed_right = fast_squareroot_approximation(g*h_right);
 …
   denom = s_max - s_min;
   if (denom < epsilon)
   { // FIXME (Ole): Try using H0 here
+  { // FIXME (Ole): Try using h0 here
     memset(edgeflux, 0, 3*sizeof(double));
     *max_speed = 0.0;
 …
   double h0 = H0*H0; //This ensures a good balance when h approaches H0.
+  double limiting_threshold = 10*H0; // Avoid applying limiter above this
   //Copy conserved quantities to protect from modification
   for (i=0; i<3; i++) {
 …
   h_left = w_left-z;
   uh_left = q_left_rotated[1];
+  u_left =_compute_speed(&uh_left, &h_left, epsilon, h0);
+  u_left =_compute_speed(&uh_left, &h_left,
+                         epsilon, h0, limiting_threshold);
   w_right = q_right_rotated[0];
   h_right = w_right-z;
   uh_right = q_right_rotated[1];
+  u_right =_compute_speed(&uh_right, &h_right, epsilon, h0);
+  u_right =_compute_speed(&uh_right, &h_right,
+                          epsilon, h0, limiting_threshold);
 …
       // FIXME: Try with this one precomputed
       for (i=0; i<3; i++) {
     dz = max(dz, fabs(zv[k3+i]-zc[k]));
+        dz = max(dz, fabs(zv[k3+i]-zc[k]));
+      }
+    }
 …
       if (dz > 0.0) {
     alpha = max( min( alpha_balance*hmin/dz, 1.0), 0.0 );
+        alpha = max( min( alpha_balance*hmin/dz, 1.0), 0.0 );
       } else {
     alpha = 1.0;  // Flat bed
+        alpha = 1.0;  // Flat bed
+      }
       //printf("Using old style limiter\n");
 …
       if (hmin < H0) {
+    alpha = 1.0;
+    for (i=0; i<3; i++) {
+      h_diff = hc_k - hv[i];
+      if (h_diff <= 0) {
+        // Deep water triangle is further away from bed than
+        // shallow water (hbar < h). Any alpha will do
+      } else {
+        // Denominator is positive which means that we need some of the
+        // h-limited stage.
+        alpha = min(alpha, (hc_k - H0)/h_diff);
+        alpha = 1.0;
+        for (i=0; i<3; i++) {
+          h_diff = hc_k - hv[i];
+          if (h_diff <= 0) {
+            // Deep water triangle is further away from bed than
+            // shallow water (hbar < h). Any alpha will do
+          } else {
+            // Denominator is positive which means that we need some of the
+            // h-limited stage.
+            alpha = min(alpha, (hc_k - H0)/h_diff);
+          }
+        }
+        // Ensure alpha in [0,1]
+        if (alpha>1.0) alpha=1.0;
+        if (alpha<0.0) alpha=0.0;
+      } else {
+        // Use w-limited stage exclusively in deeper water.
+        alpha = 1.0;
+      }
+    }
+    // Ensure alpha in [0,1]
+    if (alpha>1.0) alpha=1.0;
+    if (alpha<0.0) alpha=0.0;
+      } else {
+    // Use w-limited stage exclusively in deeper water.
+    alpha = 1.0;
+      }
+    }
     //  Let
     //
 …
     //   Momentum is balanced between constant and limited
     if (alpha < 1) {
       for (i=0; i<3; i++) {
     wv[k3+i] = zv[k3+i] + (1-alpha)*hc_k + alpha*hv[i];
     // Update momentum at vertices
     if (use_centroid_velocities == 1) {
       // This is a simple, efficient and robust option
       // It uses first order approximation of velocities, but retains
       // the order used by stage.
       // Speeds at centroids
       if (hc_k > epsilon) {
         uc = xmomc[k]/hc_k;
         vc = ymomc[k]/hc_k;
       } else {
         uc = 0.0;
         vc = 0.0;
+      }
       // Vertex momenta guaranteed to be consistent with depth guaranteeing
       // controlled speed
       hv[i] = wv[k3+i] - zv[k3+i]; // Recompute (balanced) vertex depth
       xmomv[k3+i] = uc*hv[i];
       ymomv[k3+i] = vc*hv[i];
     } else {
       // Update momentum as a linear combination of
       // xmomc and ymomc (shallow) and momentum
       // from extrapolator xmomv and ymomv (deep).
       // This assumes that values from xmomv and ymomv have
       // been established e.g. by the gradient limiter.
       // FIXME (Ole): I think this should be used with vertex momenta
       // computed above using centroid_velocities instead of xmomc
       // and ymomc as they'll be more representative first order
       // values.
       xmomv[k3+i] = (1-alpha)*xmomc[k] + alpha*xmomv[k3+i];
       ymomv[k3+i] = (1-alpha)*ymomc[k] + alpha*ymomv[k3+i];
+    }
+        wv[k3+i] = zv[k3+i] + (1-alpha)*hc_k + alpha*hv[i];
+        // Update momentum at vertices
+        if (use_centroid_velocities == 1) {
+          // This is a simple, efficient and robust option
+          // It uses first order approximation of velocities, but retains
+          // the order used by stage.
+          // Speeds at centroids
+          if (hc_k > epsilon) {
+            uc = xmomc[k]/hc_k;
+            vc = ymomc[k]/hc_k;
+          } else {
+            uc = 0.0;
+            vc = 0.0;
+          }
+          // Vertex momenta guaranteed to be consistent with depth guaranteeing
+          // controlled speed
+          hv[i] = wv[k3+i] - zv[k3+i]; // Recompute (balanced) vertex depth
+          xmomv[k3+i] = uc*hv[i];
+          ymomv[k3+i] = vc*hv[i];
+        } else {
+          // Update momentum as a linear combination of
+          // xmomc and ymomc (shallow) and momentum
+          // from extrapolator xmomv and ymomv (deep).
+          // This assumes that values from xmomv and ymomv have
+          // been established e.g. by the gradient limiter.
+          // FIXME (Ole): I think this should be used with vertex momenta
+          // computed above using centroid_velocities instead of xmomc
+          // and ymomc as they'll be more representative first order
+          // values.
+          xmomv[k3+i] = (1-alpha)*xmomc[k] + alpha*xmomv[k3+i];
+          ymomv[k3+i] = (1-alpha)*ymomc[k] + alpha*ymomv[k3+i];
+        }
+      }
+    }
 …
       if (hc < minimum_allowed_height) {
     // Set momentum to zero and ensure h is non negative
     xmomc[k] = 0.0;
     ymomc[k] = 0.0;
     if (hc <= 0.0) wc[k] = zc[k];
+        // Set momentum to zero and ensure h is non negative
+        xmomc[k] = 0.0;
+        ymomc[k] = 0.0;
+        if (hc <= 0.0) wc[k] = zc[k];
+      }
+    }
 …
     for (k=0; k<N; k++) {
       hc = wc[k] - zc[k];
       if (hc < minimum_allowed_height) {
 …
         } else {
           //Reduce excessive speeds derived from division by small hc
         //FIXME (Ole): This may be unnecessary with new slope limiters
         //in effect.
+          //FIXME (Ole): This may be unnecessary with new slope limiters
+          //in effect.
           u = xmomc[k]/hc;
       if (fabs(u) > maximum_allowed_speed) {
         reduced_speed = maximum_allowed_speed * u/fabs(u);
         //printf("Speed (u) has been reduced from %.3f to %.3f\n",
         //   u, reduced_speed);
         xmomc[k] = reduced_speed * hc;
+      }
+          if (fabs(u) > maximum_allowed_speed) {
+            reduced_speed = maximum_allowed_speed * u/fabs(u);
+            //printf("Speed (u) has been reduced from %.3f to %.3f\n",
+            //   u, reduced_speed);
+            xmomc[k] = reduced_speed * hc;
+          }
           v = ymomc[k]/hc;
       if (fabs(v) > maximum_allowed_speed) {
         reduced_speed = maximum_allowed_speed * v/fabs(v);
         //printf("Speed (v) has been reduced from %.3f to %.3f\n",
         //   v, reduced_speed);
         ymomc[k] = reduced_speed * hc;
+      }
+          if (fabs(v) > maximum_allowed_speed) {
+            reduced_speed = maximum_allowed_speed * v/fabs(v);
+            //printf("Speed (v) has been reduced from %.3f to %.3f\n",
+            //   v, reduced_speed);
+            ymomc[k] = reduced_speed * hc;
+          }
+        }
+      }
 …
   PyArrayObject *normal, *ql, *qr,  *edgeflux;
   double g, epsilon, max_speed, H0, zl, zr;
+  double h0, limiting_threshold;
   if (!PyArg_ParseTuple(args, "OOOddOddd",
 …
+  h0 = H0*H0; // This ensures a good balance when h approaches H0.
+              // But evidence suggests that h0 can be as little as
+              // epsilon!
+  limiting_threshold = 10*H0; // Avoid applying limiter above this
+                              // threshold for performance reasons.
+                              // See ANUGA manual under flux limiting
   _flux_function_central((double*) ql -> data,
+             (double*) qr -> data,
+             zl,
+             zr,
+             ((double*) normal -> data)[0],
+             ((double*) normal -> data)[1],
+             epsilon, H0, g,
+             (double*) edgeflux -> data,
+             &max_speed);
+                         (double*) qr -> data,
+                         zl,
+                         zr,
+                         ((double*) normal -> data)[0],
+                         ((double*) normal -> data)[1],
+                         epsilon, h0, limiting_threshold,
+                         g,
+                         (double*) edgeflux -> data,
+                         &max_speed);
   return Py_BuildValue("d", max_speed);
 …
   // Local variables
   double max_speed, length, inv_area, zl, zr;
+  double h0 = H0*H0; // This ensures a good balance when h approaches H0.
+  double limiting_threshold = 10*H0; // Avoid applying limiter above this
+                                     // threshold for performance reasons.
+                                     // See ANUGA manual under flux limiting
   int k, i, m, n;
   int ki, nm=0, ki2; // Index shorthands
   // Workspace (making them static actually made function slightly slower (Ole))
 …
   static long call = 1; // Static local variable flagging already computed flux
   // Start computation
   call++; // Flag 'id' of flux calculation for this timestep
 …
       _flux_function_central(ql, qr, zl, zr,
                              normals[ki2], normals[ki2+1],
                              epsilon, H0, g,
+                             epsilon, h0, limiting_threshold, g,
                              edgeflux, &max_speed);

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 7164

Legend:

branches/numpy/anuga/shallow_water/shallow_water_ext.c

Download in other formats: