Changeset 7164


Ignore:
Timestamp:
Jun 9, 2009, 9:30:09 PM (15 years ago)
Author:
ole
Message:

Ported optimisation of shallow_water_ext.c
from changeset:7143 into numpy branch.

Changes to test_shallow_water.py and test_data_manager.py
implemented in this changeset still
need to be ported.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/numpy/anuga/shallow_water/shallow_water_ext.c

    r6902 r7164  
    1818#include "math.h"
    1919#include <stdio.h>
    20 
    2120#include "numpy_shim.h"
    2221
     
    2827
    2928// Computational function for rotation
    30 // FIXME: Perhaps inline this and profile
     29// Tried to inline, but no speedup was achieved 27th May 2009 (Ole)
     30// static inline int _rotate(double *q, double n1, double n2) {
    3131int _rotate(double *q, double n1, double n2) {
    3232  /*Rotate the momentum component q (q[1], q[2])
     
    6868    if (dq1>=dq2){
    6969      if (dq1>=0.0)
    70     *qmax=dq0+dq1;
     70        *qmax=dq0+dq1;
    7171      else
    72     *qmax=dq0;
    73    
     72        *qmax=dq0;
     73     
    7474      *qmin=dq0+dq2;
    7575      if (*qmin>=0.0) *qmin = 0.0;
     
    7777    else{// dq1<dq2
    7878      if (dq2>0)
    79     *qmax=dq0+dq2;
     79        *qmax=dq0+dq2;
    8080      else
    81     *qmax=dq0;
    82    
     81        *qmax=dq0;
     82     
    8383      *qmin=dq0+dq1;   
    8484      if (*qmin>=0.0) *qmin=0.0;
     
    8888    if (dq1<=dq2){
    8989      if (dq1<0.0)
    90     *qmin=dq0+dq1;
     90        *qmin=dq0+dq1;
    9191      else
    92     *qmin=dq0;
    93    
     92        *qmin=dq0;
     93     
    9494      *qmax=dq0+dq2;   
    9595      if (*qmax<=0.0) *qmax=0.0;
     
    9797    else{// dq1>dq2
    9898      if (dq2<0.0)
    99     *qmin=dq0+dq2;
     99        *qmin=dq0+dq2;
    100100      else
    101     *qmin=dq0;
    102    
     101        *qmin=dq0;
     102     
    103103      *qmax=dq0+dq1;
    104104      if (*qmax<=0.0) *qmax=0.0;
     
    169169// This is used by flux functions
    170170// Input parameters uh and h may be modified by this function.
    171 
    172 // FIXME: Perhaps inline this and profile
     171// Tried to inline, but no speedup was achieved 27th May 2009 (Ole)
     172//static inline double _compute_speed(double *uh,
    173173double _compute_speed(double *uh,
    174               double *h,
    175               double epsilon,
    176               double h0) {
     174                      double *h,
     175                      double epsilon,
     176                      double h0,
     177                      double limiting_threshold) {
    177178 
    178179  double u;
    179180
    180  
    181   if (*h < epsilon) {
    182     *h = 0.0;  //Could have been negative
    183     u = 0.0;
     181  if (*h < limiting_threshold) {   
     182    // Apply limiting of speeds according to the ANUGA manual
     183    if (*h < epsilon) {
     184      *h = 0.0;  // Could have been negative
     185      u = 0.0;
     186    } else {
     187      u = *uh/(*h + h0/ *h);   
     188    }
     189 
     190
     191    // Adjust momentum to be consistent with speed
     192    *uh = u * *h;
    184193  } else {
    185     u = *uh/(*h + h0/ *h);   
     194    // We are in deep water - no need for limiting
     195    u = *uh/ *h;
    186196  }
    187  
    188 
    189   // Adjust momentum to be consistent with speed
    190   *uh = u * *h;
    191197 
    192198  return u;
     
    224230
    225231
    226 // Optimised squareroot computation (double version, slower)
     232// Optimised squareroot computation (double version)
    227233double Xfast_squareroot_approximation(double number) {
    228234  double x;
     
    253259                           double z_left, double z_right,
    254260                           double n1, double n2,
    255                            double epsilon, double H0, double g,
     261                           double epsilon,
     262                           double h0,
     263                           double limiting_threshold,
     264                           double g,
    256265                           double *edgeflux, double *max_speed)
    257266{
     
    272281  double w_left, h_left, uh_left, vh_left, u_left;
    273282  double w_right, h_right, uh_right, vh_right, u_right;
    274   double v_left, v_right; 
    275283  double s_min, s_max, soundspeed_left, soundspeed_right;
    276284  double denom, inverse_denominator, z;
     
    279287  static double q_left_rotated[3], q_right_rotated[3], flux_right[3], flux_left[3];
    280288
    281   double h0 = H0*H0; // This ensures a good balance when h approaches H0.
    282                      // But evidence suggests that h0 can be as little as
    283              // epsilon!
    284  
    285289  // Copy conserved quantities to protect from modification
    286290  q_left_rotated[0] = q_left[0];
     
    296300
    297301  z = 0.5*(z_left + z_right); // Average elevation values.
    298                             // Even though this will nominally allow for discontinuities
    299                             // in the elevation data, there is currently no numerical
    300                             // support for this so results may be strange near jumps in the bed.
     302                              // Even though this will nominally allow
     303                              // for discontinuities in the elevation data,
     304                              // there is currently no numerical support for
     305                              // this so results may be strange near
     306                              // jumps in the bed.
    301307
    302308  // Compute speeds in x-direction
     
    304310  h_left = w_left - z;
    305311  uh_left = q_left_rotated[1];
    306   u_left = _compute_speed(&uh_left, &h_left, epsilon, h0);
     312  u_left = _compute_speed(&uh_left, &h_left,
     313                          epsilon, h0, limiting_threshold);
    307314
    308315  w_right = q_right_rotated[0];
    309316  h_right = w_right - z;
    310317  uh_right = q_right_rotated[1];
    311   u_right = _compute_speed(&uh_right, &h_right, epsilon, h0); 
     318  u_right = _compute_speed(&uh_right, &h_right,
     319                           epsilon, h0, limiting_threshold);
    312320
    313321  // Momentum in y-direction
     
    315323  vh_right = q_right_rotated[2];
    316324
    317   // Limit y-momentum if necessary 
    318   v_left = _compute_speed(&vh_left, &h_left, epsilon, h0);
    319   v_right = _compute_speed(&vh_right, &h_right, epsilon, h0);
     325  // Limit y-momentum if necessary
     326  // Leaving this out, improves speed significantly (Ole 27/5/2009)
     327  // All validation tests pass, so do we really need it anymore?
     328  _compute_speed(&vh_left, &h_left,
     329                 epsilon, h0, limiting_threshold);
     330  _compute_speed(&vh_right, &h_right,
     331                 epsilon, h0, limiting_threshold);
    320332
    321333  // Maximal and minimal wave speeds
     
    324336 
    325337  // Code to use fast square root optimisation if desired.
     338  // Timings on AMD 64 for the Okushiri profile gave the following timings
     339  //
     340  // SQRT           Total    Flux
     341  //=============================
     342  //
     343  // Ref            405s     152s
     344  // Fast (dbl)     453s     173s
     345  // Fast (sng)     437s     171s
     346  //
     347  // Consequently, there is currently (14/5/2009) no reason to use this
     348  // approximation.
     349 
    326350  //soundspeed_left  = fast_squareroot_approximation(g*h_left);
    327351  //soundspeed_right = fast_squareroot_approximation(g*h_right);
     
    351375  denom = s_max - s_min;
    352376  if (denom < epsilon)
    353   { // FIXME (Ole): Try using H0 here
     377  { // FIXME (Ole): Try using h0 here
    354378    memset(edgeflux, 0, 3*sizeof(double));
    355379    *max_speed = 0.0;
     
    413437
    414438  double h0 = H0*H0; //This ensures a good balance when h approaches H0.
    415 
     439  double limiting_threshold = 10*H0; // Avoid applying limiter below this 
     440 
    416441  //Copy conserved quantities to protect from modification
    417442  for (i=0; i<3; i++) {
     
    430455  h_left = w_left-z;
    431456  uh_left = q_left_rotated[1];
    432   u_left =_compute_speed(&uh_left, &h_left, epsilon, h0);
     457  u_left =_compute_speed(&uh_left, &h_left,
     458                         epsilon, h0, limiting_threshold);
    433459
    434460  w_right = q_right_rotated[0];
    435461  h_right = w_right-z;
    436462  uh_right = q_right_rotated[1];
    437   u_right =_compute_speed(&uh_right, &h_right, epsilon, h0); 
     463  u_right =_compute_speed(&uh_right, &h_right,
     464                          epsilon, h0, limiting_threshold);
    438465
    439466
     
    656683      // FIXME: Try with this one precomputed
    657684      for (i=0; i<3; i++) {
    658     dz = max(dz, fabs(zv[k3+i]-zc[k]));
     685        dz = max(dz, fabs(zv[k3+i]-zc[k]));
    659686      }
    660687    }
     
    684711     
    685712      if (dz > 0.0) {
    686     alpha = max( min( alpha_balance*hmin/dz, 1.0), 0.0 );     
     713        alpha = max( min( alpha_balance*hmin/dz, 1.0), 0.0 );     
    687714      } else {
    688     alpha = 1.0;  // Flat bed
     715        alpha = 1.0;  // Flat bed
    689716      }
    690717      //printf("Using old style limiter\n");
     
    699726   
    700727      if (hmin < H0) {
    701     alpha = 1.0;
    702     for (i=0; i<3; i++) {
    703      
    704       h_diff = hc_k - hv[i];     
    705       if (h_diff <= 0) {
    706         // Deep water triangle is further away from bed than
    707         // shallow water (hbar < h). Any alpha will do
    708      
    709       } else { 
    710         // Denominator is positive which means that we need some of the
    711         // h-limited stage.
    712        
    713         alpha = min(alpha, (hc_k - H0)/h_diff);     
     728        alpha = 1.0;
     729        for (i=0; i<3; i++) {
     730         
     731          h_diff = hc_k - hv[i];     
     732          if (h_diff <= 0) {
     733            // Deep water triangle is further away from bed than
     734            // shallow water (hbar < h). Any alpha will do
     735     
     736          } else { 
     737            // Denominator is positive which means that we need some of the
     738            // h-limited stage.
     739           
     740            alpha = min(alpha, (hc_k - H0)/h_diff);     
     741          }
     742        }
     743
     744        // Ensure alpha in [0,1]
     745        if (alpha>1.0) alpha=1.0;
     746        if (alpha<0.0) alpha=0.0;
     747       
     748      } else {
     749        // Use w-limited stage exclusively in deeper water.
     750        alpha = 1.0;       
    714751      }
    715752    }
    716 
    717     // Ensure alpha in [0,1]
    718     if (alpha>1.0) alpha=1.0;
    719     if (alpha<0.0) alpha=0.0;
    720    
    721       } else {
    722     // Use w-limited stage exclusively in deeper water.
    723     alpha = 1.0;       
    724       }
    725     }
    726 
    727 
     753   
     754   
    728755    //  Let
    729756    //
     
    743770    //   Momentum is balanced between constant and limited
    744771
    745 
     772   
    746773    if (alpha < 1) {     
    747774      for (i=0; i<3; i++) {
    748      
    749     wv[k3+i] = zv[k3+i] + (1-alpha)*hc_k + alpha*hv[i];
    750 
    751     // Update momentum at vertices
    752     if (use_centroid_velocities == 1) {
    753       // This is a simple, efficient and robust option
    754       // It uses first order approximation of velocities, but retains
    755       // the order used by stage.
    756    
    757       // Speeds at centroids
    758       if (hc_k > epsilon) {
    759         uc = xmomc[k]/hc_k;
    760         vc = ymomc[k]/hc_k;
    761       } else {
    762         uc = 0.0;
    763         vc = 0.0;
    764       }
    765      
    766       // Vertex momenta guaranteed to be consistent with depth guaranteeing
    767       // controlled speed
    768       hv[i] = wv[k3+i] - zv[k3+i]; // Recompute (balanced) vertex depth
    769       xmomv[k3+i] = uc*hv[i];
    770       ymomv[k3+i] = vc*hv[i];
    771      
    772     } else {
    773       // Update momentum as a linear combination of
    774       // xmomc and ymomc (shallow) and momentum
    775       // from extrapolator xmomv and ymomv (deep).
    776       // This assumes that values from xmomv and ymomv have
    777       // been established e.g. by the gradient limiter.
    778 
    779       // FIXME (Ole): I think this should be used with vertex momenta
    780       // computed above using centroid_velocities instead of xmomc
    781       // and ymomc as they'll be more representative first order
    782       // values.
    783      
    784       xmomv[k3+i] = (1-alpha)*xmomc[k] + alpha*xmomv[k3+i];
    785       ymomv[k3+i] = (1-alpha)*ymomc[k] + alpha*ymomv[k3+i];
    786    
    787     }
     775       
     776        wv[k3+i] = zv[k3+i] + (1-alpha)*hc_k + alpha*hv[i];
     777
     778        // Update momentum at vertices
     779        if (use_centroid_velocities == 1) {
     780          // This is a simple, efficient and robust option
     781          // It uses first order approximation of velocities, but retains
     782          // the order used by stage.
     783   
     784          // Speeds at centroids
     785          if (hc_k > epsilon) {
     786            uc = xmomc[k]/hc_k;
     787            vc = ymomc[k]/hc_k;
     788          } else {
     789            uc = 0.0;
     790            vc = 0.0;
     791          }
     792         
     793          // Vertex momenta guaranteed to be consistent with depth guaranteeing
     794          // controlled speed
     795          hv[i] = wv[k3+i] - zv[k3+i]; // Recompute (balanced) vertex depth
     796          xmomv[k3+i] = uc*hv[i];
     797          ymomv[k3+i] = vc*hv[i];
     798     
     799        } else {
     800          // Update momentum as a linear combination of
     801          // xmomc and ymomc (shallow) and momentum
     802          // from extrapolator xmomv and ymomv (deep).
     803          // This assumes that values from xmomv and ymomv have
     804          // been established e.g. by the gradient limiter.
     805         
     806          // FIXME (Ole): I think this should be used with vertex momenta
     807          // computed above using centroid_velocities instead of xmomc
     808          // and ymomc as they'll be more representative first order
     809          // values.
     810         
     811          xmomv[k3+i] = (1-alpha)*xmomc[k] + alpha*xmomv[k3+i];
     812          ymomv[k3+i] = (1-alpha)*ymomc[k] + alpha*ymomv[k3+i];
     813         
     814        }
    788815      }
    789816    }
     
    816843      if (hc < minimum_allowed_height) {
    817844       
    818     // Set momentum to zero and ensure h is non negative
    819     xmomc[k] = 0.0;
    820     ymomc[k] = 0.0;
    821     if (hc <= 0.0) wc[k] = zc[k];
     845        // Set momentum to zero and ensure h is non negative
     846        xmomc[k] = 0.0;
     847        ymomc[k] = 0.0;
     848        if (hc <= 0.0) wc[k] = zc[k];
    822849      }
    823850    }
     
    827854    for (k=0; k<N; k++) {
    828855      hc = wc[k] - zc[k];
    829 
     856     
    830857      if (hc < minimum_allowed_height) {
    831858
     
    839866        } else {
    840867          //Reduce excessive speeds derived from division by small hc
    841         //FIXME (Ole): This may be unnecessary with new slope limiters
    842         //in effect.
     868          //FIXME (Ole): This may be unnecessary with new slope limiters
     869          //in effect.
    843870         
    844871          u = xmomc[k]/hc;
    845       if (fabs(u) > maximum_allowed_speed) {
    846         reduced_speed = maximum_allowed_speed * u/fabs(u);
    847         //printf("Speed (u) has been reduced from %.3f to %.3f\n",
    848         //   u, reduced_speed);
    849         xmomc[k] = reduced_speed * hc;
    850       }
    851 
     872          if (fabs(u) > maximum_allowed_speed) {
     873            reduced_speed = maximum_allowed_speed * u/fabs(u);
     874            //printf("Speed (u) has been reduced from %.3f to %.3f\n",
     875            //   u, reduced_speed);
     876            xmomc[k] = reduced_speed * hc;
     877          }
     878         
    852879          v = ymomc[k]/hc;
    853       if (fabs(v) > maximum_allowed_speed) {
    854         reduced_speed = maximum_allowed_speed * v/fabs(v);
    855         //printf("Speed (v) has been reduced from %.3f to %.3f\n",
    856         //   v, reduced_speed);
    857         ymomc[k] = reduced_speed * hc;
    858       }
     880          if (fabs(v) > maximum_allowed_speed) {
     881            reduced_speed = maximum_allowed_speed * v/fabs(v);
     882            //printf("Speed (v) has been reduced from %.3f to %.3f\n",
     883            //   v, reduced_speed);
     884            ymomc[k] = reduced_speed * hc;
     885          }
    859886        }
    860887      }
     
    914941  PyArrayObject *normal, *ql, *qr,  *edgeflux;
    915942  double g, epsilon, max_speed, H0, zl, zr;
     943  double h0, limiting_threshold;
    916944
    917945  if (!PyArg_ParseTuple(args, "OOOddOddd",
     
    923951
    924952 
     953  h0 = H0*H0; // This ensures a good balance when h approaches H0.
     954              // But evidence suggests that h0 can be as little as
     955              // epsilon!
     956             
     957  limiting_threshold = 10*H0; // Avoid applying limiter below this
     958                              // threshold for performance reasons.
     959                              // See ANUGA manual under flux limiting 
     960 
    925961  _flux_function_central((double*) ql -> data,
    926              (double*) qr -> data,
    927              zl,
    928              zr,                         
    929              ((double*) normal -> data)[0],
    930              ((double*) normal -> data)[1],         
    931              epsilon, H0, g,
    932              (double*) edgeflux -> data,
    933              &max_speed);
     962                         (double*) qr -> data,
     963                         zl,
     964                         zr,                         
     965                         ((double*) normal -> data)[0],
     966                         ((double*) normal -> data)[1],         
     967                         epsilon, h0, limiting_threshold,
     968                         g,
     969                         (double*) edgeflux -> data,
     970                         &max_speed);
    934971 
    935972  return Py_BuildValue("d", max_speed); 
     
    18081845  // Local variables
    18091846  double max_speed, length, inv_area, zl, zr;
     1847  double h0 = H0*H0; // This ensures a good balance when h approaches H0.
     1848
     1849  double limiting_threshold = 10*H0; // Avoid applying limiter below this
     1850                                     // threshold for performance reasons.
     1851                                     // See ANUGA manual under flux limiting
    18101852  int k, i, m, n;
    18111853  int ki, nm=0, ki2; // Index shorthands
     1854 
    18121855 
    18131856  // Workspace (making them static actually made function slightly slower (Ole)) 
     
    18151858
    18161859  static long call = 1; // Static local variable flagging already computed flux
    1817                   
     1860 
    18181861  // Start computation
    18191862  call++; // Flag 'id' of flux calculation for this timestep
     
    19011944      _flux_function_central(ql, qr, zl, zr,
    19021945                             normals[ki2], normals[ki2+1],
    1903                              epsilon, H0, g,
     1946                             epsilon, h0, limiting_threshold, g,
    19041947                             edgeflux, &max_speed);
    19051948     
Note: See TracChangeset for help on using the changeset viewer.