% Use the standard \LaTeXe\ article style in 12pt Computer Modern font
% on A4 paper by
\documentclass[12pt,a4paper]{article}
% Do \emph{not} change the width nor the height of the text from the
% defaults set by this document class.
%
% The alternative which is closer to what we actually use is
% \documentclass[11pt,a5paper]{article}
% \usepackage[a5paper]{geometry}
% because it is a great size for on-screen reading
% and prints nicely on a4paper either 2up or booklet.

% The preamble is to contain your own \LaTeX\ commands and to say
% what packages to use.  Three useful packages are the following:
\usepackage{graphicx} % avoid epsfig or earlier such packages
\usepackage{url}      % for URLs and DOIs
\newcommand{\doi}[1]{\url{http://dx.doi.org/#1}}
\usepackage{amsmath}  % many want amsmath extensions
\usepackage{amsfonts}
\usepackage{underscore}
% Avoid loading unused packages (as done by some \LaTeX\ editors).

% Create title and authors using \verb|\maketitle|.  Separate authors by
% \verb|\and| and put addresses in \verb|\thanks| commands with
% \verb|\url| commands \verb|\protect|ed.
\title{Parallelisation of a finite volume method for hydrodynamic
inundation modelling}
\author{
S.~G.~Roberts\thanks{Dept.\ of Maths, Australian National University,
Canberra, \textsc{Australia}.
\protect\url{mailto:[stephen.roberts, linda.stals]@anu.edu.au}}
\and
L.~Stals\footnotemark[1]
\and
O.~M.~Nielsen\thanks{Risk Assessment Methods Project, Geospatial and
Earth Monitoring Division, Geoscience Australia, Symonston,
\textsc{Australia}. \protect\url{mailto:Ole.Nielsen@ga.gov.au}}
}
\date{24 August 2006}

\newcommand{\AnuGA}{\textsc{anuga}}
\newcommand{\Python}{\textsc{python}}
\newcommand{\VPython}{\textsc{vpython}}
\newcommand{\pypar}{\textsc{mpi}}
\newcommand{\Metis}{\textsc{metis}}
\newcommand{\mpi}{\textsc{mpi}}

\newcommand{\UU}{\mathbf{U}}
\newcommand{\VV}{\mathbf{V}}
\newcommand{\EE}{\mathbf{E}}
\newcommand{\GG}{\mathbf{G}}
\newcommand{\FF}{\mathbf{F}}
\newcommand{\HH}{\mathbf{H}}
\newcommand{\SSS}{\mathbf{S}}
\newcommand{\nn}{\mathbf{n}}

\newcommand{\code}[1]{\texttt{#1}}

\begin{document}

% Use default \verb|\maketitle|.
\maketitle

% Use the \verb|abstract| environment.
\begin{abstract}
Geoscience Australia and the Australian National University are
developing a hydrodynamic inundation modelling tool called \AnuGA{}
to help simulate the impact of natural inundation hazards such as
riverine flooding, storm surges and tsunamis.  The core of \AnuGA{}
is a \Python{} implementation of a finite volume method, based on
triangular meshes, for solving the conservative form of the shallow
water wave equations.  We describe the parallelisation of the code
using a domain decomposition strategy in which the \Metis{} graph
partitioning library is used to decompose the finite volume meshes.
The parallel efficiency of the code is tested on a number of mesh
partitionings, and we verify that the partitions produced by
\Metis{} are well balanced and give good parallel performance.
\end{abstract}

% By default we include a table of contents in each paper.
\tableofcontents

% Use \verb|\section|, \verb|\subsection|, \verb|\subsubsection| and
% possibly \verb|\paragraph| to structure your document.  Make sure
% you \verb|\label| them for cross referencing with \verb|\ref|\,.

\section{Introduction}
\label{sec:intro}

%Floods are the single greatest cause of death due to natural hazards
%in Australia, causing almost 40{\%} of the fatalities recorded
%between 1788 and 2003~\cite{Blong-2005}.
%Analysis of buildings
%damaged between 1900 and 2003 suggests that 93.6{\%} of damage is
%the result of meteorological hazards, of which almost 25{\%} is
%directly attributable to flooding~\cite{Blong-2005}.

%Flooding of coastal communities may result from surges of near shore
%waters caused by severe storms. The extent of inundation is
%critically linked to tidal conditions, bathymetry and topography; as
%recently exemplified in the United States by Hurricane Katrina.
%While events of this scale are rare, the extensive development of
%Australia's coastal corridor means that storm surge inundation of
%even a few hundred metres beyond the shoreline has the potential to
%cause significant disruption and loss.
%
%Coastal communities also face the small but real risk of tsunami.
%Fortunately, catastrophic tsunami of the scale of the 26 December
%2004 event are exceedingly rare. However, smaller scale tsunami are
%more common and regularly threaten coastal communities around the
%world. Earthquakes which occur in the Java Trench near Indonesia
%(e.g.~\cite{TsuMIS1995}) and along the Puysegur Ridge to the south
%of New Zealand (e.g.~\cite{LebKC1998}) have potential to generate
%tsunami that may threaten Australia's northwestern and southeastern
%coastlines. This was particularly evident in the aftermath of the
%Java tsunami on the 17th July 2006 where Western Australia did
%experience a localised tsunami run up.

Hydrodynamic modelling allows flooding, storm surge and tsunami
hazards to be better understood, their impacts to be anticipated
and, with appropriate planning, their effects to be mitigated.
Geoscience Australia, in collaboration with the Mathematical
Sciences Institute, Australian National University, is developing a
software application called \AnuGA{} to model the hydrodynamics of
floods, storm surges and tsunamis.  These hazards are modelled using
the conservative shallow water equations, which are described in
section~\ref{sec:model}.  In \AnuGA{} the solution of these
equations is approximated using a finite volume method based on
triangular meshes, as described in section~\ref{sec:fvm}.  Nielsen
\textit{et al.}~\cite{modsim2005} provide a more complete discussion
of the method, together with a validation of the model on a standard
tsunami benchmark data set.  Section~\ref{sec:parallel} describes
the parallelisation of the code using a domain decomposition
strategy, and section~\ref{sec:analysis} presents preliminary timing
results which verify the scalability of the parallel code.

\section{Model}
\label{sec:model}

The shallow water wave equations are a system of differential
conservation equations which describe the flow of a thin layer of
fluid over terrain.  They take the form
\[
\frac{\partial \UU}{\partial t}+\frac{\partial \EE}{\partial x}
+\frac{\partial \GG}{\partial y}=\SSS \,,
\]
where $\UU=\begin{bmatrix} h & uh & vh \end{bmatrix}^T$ is the
vector of conserved quantities: water depth $h$, $x$ horizontal
momentum $uh$ and $y$ horizontal momentum $vh$.  Other quantities
entering the system are the bed elevation $z$ and the stage $w$
(absolute water level); these variables are related via $w = z + h$.
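In an implementation, the velocities $u$ and $v$ must be recovered
from the conserved quantities $uh$ and $vh$, which requires some
care as $h \to 0$ in wetting and drying regions.  The following
\Python{} sketch illustrates one common regularisation; the names
and the dry cell threshold \code{h\_min} are illustrative
assumptions, not \AnuGA{}'s actual treatment.
\begin{verbatim}
import numpy as np

def depth_and_velocities(w, z, uh, vh, h_min=1.0e-6):
    """Recover depth h = w - z and velocities u, v from the
    conserved quantities.  Cells with depth below the assumed
    threshold h_min are treated as dry (sketch only)."""
    h = np.maximum(w - z, 0.0)
    wet = h > h_min
    u = np.where(wet, uh / np.where(wet, h, 1.0), 0.0)
    v = np.where(wet, vh / np.where(wet, h, 1.0), 0.0)
    return h, u, v
\end{verbatim}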
The horizontal fluxes in the $x$ and $y$ directions, $\EE$ and
$\GG$, are
\[
\EE=\begin{bmatrix}
uh \\ u^2h+gh^2/2 \\ uvh
\end{bmatrix}
\quad \mbox{and} \quad
\GG=\begin{bmatrix}
vh \\ vuh \\ v^2h+gh^2/2
\end{bmatrix},
\]
and the source term (which includes gravity and friction) is
\[
\SSS=\begin{bmatrix}
0 \\ -gh(z_{x} + S_{fx}) \\ -gh(z_{y} + S_{fy})
\end{bmatrix} \,,
\]
where $S_{fx}$ and $S_{fy}$ are the bed friction terms.  We model
the friction using Manning's resistance law,
\[
S_{fx} =\frac{u\eta^2\sqrt{u^2+v^2}}{h^{4/3}}
\quad \mbox{and} \quad
S_{fy} =\frac{v\eta^2\sqrt{u^2+v^2}}{h^{4/3}} \,,
\]
in which $\eta$ is the Manning resistance coefficient.

As demonstrated in Zoppou and Roberts~\cite{Rob99l} and Nielsen
\textit{et al.}~\cite{modsim2005}, these equations provide an
excellent model of flows associated with inundation, such as dam
breaks and tsunamis.
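The flux and source vectors translate directly into code.  The
following \Python{} sketch (function and variable names are
illustrative, not \AnuGA{}'s internals) evaluates $\EE$, $\GG$ and
$\SSS$ for a single state; the small depth guard \code{h\_min} is
again an assumed regularisation.
\begin{verbatim}
import numpy as np

g = 9.81  # gravitational acceleration

def flux_E(h, u, v):
    # x-direction flux E(U)
    return np.array([u*h, u*u*h + 0.5*g*h*h, u*v*h])

def flux_G(h, u, v):
    # y-direction flux G(U)
    return np.array([v*h, v*u*h, v*v*h + 0.5*g*h*h])

def source(h, u, v, zx, zy, eta, h_min=1.0e-6):
    # Source term with bed slopes (zx, zy) and Manning friction.
    speed = (u*u + v*v) ** 0.5
    hh = max(h, h_min)  # assumed guard against division by zero
    sfx = u * eta**2 * speed / hh**(4.0/3.0)
    sfy = v * eta**2 * speed / hh**(4.0/3.0)
    return np.array([0.0, -g*h*(zx + sfx), -g*h*(zy + sfy)])
\end{verbatim}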
\section{Finite volume method}
\label{sec:fvm}

We use a finite volume method for approximating the solution of the
shallow water wave equations~\cite{Rob99l}.  The study area is
represented by a mesh of triangular cells, as in
Figure~\ref{fig:mesh:reconstruct}, in which the conserved quantities
$h$, $uh$ and $vh$ in each cell are to be determined.  The size of
the triangular cells is varied within the mesh to allow greater
resolution in regions of particular interest.

%\begin{figure}
%\begin{center}
%\includegraphics[width=5.0cm,keepaspectratio=true]{step-five}
%\caption{Our finite volume method uses triangular meshes. Conserved
%quantities $h$, $uh$ and $vh$ are associated with the centroid of
%each triangular cell.} \label{fig:mesh}
%\end{center}
%\end{figure}

\begin{figure}
\begin{center}
\includegraphics[width=5.0cm,keepaspectratio=true]{step-reconstruct}
\caption{Conserved quantities $h$, $uh$ and $vh$ are associated with
the centroid of each triangular cell.  From the values of the
conserved quantities at the centroid of a cell and its neighbouring
cells, a discontinuous piecewise linear reconstruction of the
conserved quantities is obtained.}
\label{fig:mesh:reconstruct}
\end{center}
\end{figure}

The equations constituting the finite volume method are obtained by
integrating the differential conservation equations over each
triangular cell of the mesh.  Introducing some notation: we use $i$
to refer to the index of the $i$th triangular cell $T_i$, and
${\cal N}(i)$ to denote the set of indices of the cells neighbouring
the $i$th cell.  The area of the $i$th triangular cell is $A_i$, and
$l_{ij}$ is the length of the edge between the $i$th and $j$th
cells.

By applying the divergence theorem we obtain, for each cell, an
equation which describes the rate of change of the average of the
conserved quantities within the cell in terms of the fluxes across
the edges of the cell and the effect of the source term.  In
particular, for each cell
$$
\frac{d\UU_i }{dt}+ \frac1{A_i}\sum\limits_{j\in{\cal N}(i)}
\HH_{ij} l_{ij} = \SSS_i \,,
$$
where
%\begin{itemize}
%\item $\UU_i$ is the vector of conserved quantities averaged over
%the $i$th cell,
%\item $\SSS_i$ is the source term associated with the $i$th cell,
%and
%\item $\HH_{ij}$ is the outward normal flux of material across the
%\textit{ij}-th edge.
%\end{itemize}
%
$\UU_i$ is the vector of conserved quantities averaged over the
$i$th cell, $\SSS_i$~is the source term associated with the~$i$th
cell and $\HH_{ij}$~is the outward normal flux of material across
the \textit{ij}-th edge.

%\item $l_{ij}$ is the length of the edge between the $i$th and
%$j$th cells,
%\item $m_{ij}$ is the midpoint of the \textit{ij}th edge,
%\item $\mathbf{n}_{ij} = (n_{ij,1} , n_{ij,2})$ is the outward
%pointing normal along the \textit{ij}th edge, and

The flux $\HH_{ij}$ is evaluated using a numerical flux function
$\HH(\cdot, \cdot\,; \cdot)$ which is consistent with the shallow
water flux in the sense that for all conserved quantity vectors
$\UU$ and spatial vectors $\nn$
$$
\HH(\UU,\UU;\nn) = \EE(\UU) n_1 + \GG(\UU) n_2 \,.
$$
Then the flux across the \textit{ij}th edge is
$$
\HH_{ij} = \HH(\bar{\UU}_i(m_{ij}), \bar{\UU}_j(m_{ij});
\mathbf{n}_{ij}) \,,
$$
where $m_{ij}$ is the midpoint of the \textit{ij}th edge and
$\mathbf{n}_{ij}$ is the outward pointing normal of the
\textit{ij}th edge.  The function $\bar{\UU}_i(x,y)$ for $(x,y) \in
T_i$ is obtained from the average conserved quantity values,
$\UU_k$, for $k \in \{ i \} \cup {\cal N}(i)$ (the \textit{i}th cell
and its neighbours).  We use a second order reconstruction to
produce a piecewise linear function, $\bar{\UU}_i(x,y)$,
approximating the conserved quantities for all $(x,y) \in T_i$ in
each cell (see Figure~\ref{fig:mesh:reconstruct}).  This function is
allowed to be discontinuous across the edges of the cells, but its
slope is limited to avoid artificially introduced oscillations.

The numerical flux function $\HH(\cdot, \cdot\,; \cdot)$ is
evaluated by rotating the coordinate system so that the outward
normal lies in the $x$ direction, which reduces the normal flux
calculation to a one dimensional flux calculation.  The central
upwind scheme described by Kurganov \textit{et al.}~\cite{KurNP2001}
is then used to approximate the fluxes of the resulting one
dimensional problem.

In the computations presented we use an explicit Euler time stepping
method with variable time stepping adapted to the
Courant--Friedrichs--Lewy (CFL) condition.
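The structure of one first order update sweep is sketched below in
\Python{}.  The mesh arrays (\code{neighbours},
\code{edge\_lengths}, \code{areas}, \code{normals}) and the
numerical flux function \code{H} are assumed precomputed; the names
are illustrative, and \AnuGA{}'s actual evolution loop (with second
order reconstruction) is implemented in \texttt{C}.
\begin{verbatim}
import numpy as np

def evolve_one_step(U, S, dt, neighbours, edge_lengths, areas,
                    normals, H):
    """One explicit Euler step of
       dU_i/dt = S_i - (1/A_i) sum_j H_ij l_ij
    using cell centred values (first order, no reconstruction)."""
    dUdt = S.copy()
    for i in range(U.shape[0]):
        for k in range(3):         # three edges per triangle
            j = neighbours[i, k]   # neighbouring cell index
            if j < 0:
                continue           # boundary edge handled elsewhere
            flux = H(U[i], U[j], normals[i, k])
            dUdt[i] -= edge_lengths[i, k] * flux / areas[i]
    return U + dt * dUdt
\end{verbatim}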
%\section{Validation}
%\label{sec:validation}
%The process of validating the \AnuGA{} application is in its early
%stages, however initial indications are encouraging.
%
%As part of the Third International Workshop on Long wave Runup
%Models in 2004 (\url{http://www.cee.cornell.edu/longwave}), four
%benchmark problems were specified to allow the comparison of
%numerical, analytical and physical models with laboratory and field
%data.  One of these problems describes a wave tank simulation of the
%1993 Okushiri Island tsunami off Hokkaido, Japan \cite{MatH2001}.  A
%significant feature of this tsunami was a maximum run up of 32~m
%observed at the head of the Monai Valley.  This run up was not
%uniform along the coast and is thought to have resulted from a
%particular topographic effect.  Among other features, simulations of
%the Hokkaido tsunami should capture this run up phenomenon.
%
%\begin{figure}[htbp]
%\centerline{\includegraphics[width=4in]{tsunami-fig-3.eps}}
%\caption{Comparison of wave tank and \AnuGA{} water stages at gauge
%5.}\label{fig:val}
%\end{figure}
%
%\begin{figure}[htbp]
%\centerline{\includegraphics[width=4in]{tsunami-fig-4.eps}}
%\caption{Complex reflection patterns and run up into Monai Valley
%simulated by \AnuGA{} and visualised using our netcdf OSG
%viewer.}\label{fig:run}
%\end{figure}
%
%The wave tank simulation of the Hokkaido tsunami was used as the
%first scenario for validating \AnuGA{}.  The dataset provided
%bathymetry and topography along with initial water depth and the
%wave specifications.  The dataset also contained water depth time
%series from three wave gauges situated offshore from the simulated
%inundation area.
%
%Figure~\ref{fig:val} compares the observed wave tank and modelled
%\AnuGA{} water depth (stage height) at one of the gauges.  The plots
%show good agreement between the two time series, with \AnuGA{}
%closely modelling the initial draw down, the wave shoulder and the
%subsequent reflections.  The discrepancy between modelled and
%simulated data in the first 10 seconds is due to the initial
%condition in the physical tank not being uniformly zero.  Similarly
%good comparisons are evident with data from the other two gauges.
%Additionally, \AnuGA{} replicates exceptionally well the 32~m Monai
%Valley run up, and demonstrates its occurrence to be due to the
%interaction of the tsunami wave with two juxtaposed valleys above
%the coastline.  The run up is depicted in Figure~\ref{fig:run}.
%
%This successful replication of the tsunami wave tank simulation on a
%complex 3D beach is a positive first step in validating the \AnuGA{}
%modelling capability.  Subsequent validation will be conducted as
%additional datasets become available.

\section{Parallel implementation}
\label{sec:parallel}

To parallelise our algorithm we use a domain decomposition strategy.
We start with a global mesh which defines the domain of the problem
and first subdivide it into a set of submeshes.  This partitioning
is done using the \Metis{} partitioning library of
Karypis~\cite{metis}; we demonstrate the efficiency of this library
in the following subsections.  Once the mesh has been partitioned, a
layer of ``ghost'' cells is constructed around each submesh and the
corresponding communication patterns are set.  Then we start to
evolve the solution.  The computation progresses independently on
each submesh until the end of the time step, when the appropriate
information is communicated among processing elements.

%\begin{figure}[hbtp]
%  \centerline{ \includegraphics[scale = 0.6]{domain.eps}}
%  \caption{The first step of the parallel algorithm is to divide
%  the mesh over the processors.}
%  \label{fig:subpart}
%\end{figure}

\begin{figure}[h!]
\centerline{ \includegraphics[width=6.5cm]{mermesh.eps}}
\caption{The global Lake Merimbula mesh.}
\label{fig:mergrid}
\end{figure}

\begin{figure}[h!]
\centerline{ \includegraphics[width=6.5cm]{mermesh4c.eps}
\includegraphics[width=6.5cm]{mermesh4a.eps}}
\centerline{ \includegraphics[width=6.5cm]{mermesh4d.eps}
\includegraphics[width=6.5cm]{mermesh4b.eps}}
\caption{The global Lake Merimbula mesh from
Figure~\ref{fig:mergrid}, partitioned into 4 submeshes using
\Metis.}
\label{fig:mergrid4}
\end{figure}
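The partitioning step described above can be sketched as follows,
here using the \code{pymetis} bindings to \Metis{} (an assumption
for illustration; \AnuGA{} calls \Metis{} through its own
interface).  We partition the dual graph of the triangulation, in
which each triangle is a vertex joined to its edge neighbours.
\begin{verbatim}
import pymetis  # assumed METIS bindings

def partition_mesh(neighbours, n_submeshes):
    """Assign each triangle to a submesh by partitioning the dual
    graph of the mesh.  neighbours[i] lists the cells sharing an
    edge with cell i; negative entries mark boundary edges."""
    adjacency = [[j for j in nbrs if j >= 0] for nbrs in neighbours]
    n_cuts, membership = pymetis.part_graph(n_submeshes,
                                            adjacency=adjacency)
    return membership  # membership[i] is the submesh of triangle i

# Example: four triangles in a strip, split over two submeshes.
nbrs = [[1, -1, -1], [0, 2, -1], [1, 3, -1], [2, -1, -1]]
print(partition_mesh(nbrs, 2))
\end{verbatim}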
\caption{4-way and 8-way partition tests of the Merimbula mesh,
showing the percentage distribution of cells among the
submeshes.}\label{tbl:mer}
\begin{center}
\begin{tabular}{|c|c c c c|}
\hline
CPU & 0 & 1 & 2 & 3\\
\hline
Cells & 2757 & 2713 & 2761 & 2554\\
\% & 25.6\% & 25.2\% & 25.6\% & 23.7\% \\
\hline
\end{tabular}

\medskip

\begin{tabular}{|c|c c c c c c c c|}
\hline
CPU & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7\\
\hline
Cells & 1229 & 1293 & 1352 & 1341 & 1349 & 1401 & 1413 & 1407\\
\% & 11.4\% & 12.0\% & 12.5\% & 12.4\% & 12.5\% & 13.0\% & 13.1\% &
13.1\%\\
\hline
\end{tabular}
\end{center}
\end{table}

\subsection{Subdividing the global mesh}
\label{sec:part1}

The first step in parallelising the code is to subdivide the mesh
into roughly equally sized partitions to ensure good load balancing.
On a rectangular mesh this may be done by a simple coordinate based
dissection method, but on a complicated domain, such as the mesh
shown in Figure~\ref{fig:mergrid}, a more sophisticated approach
must be used.  We use the \Metis{} partitioning library, following
the recommendations of Karypis and Kumar~\cite{gk:metis}.
Figure~\ref{fig:mergrid4} shows the global mesh partitioned into
four submeshes.  Note that one submesh may comprise several
unconnected mesh partitions.  Table~\ref{tbl:mer} gives the cell
distribution over four submeshes and over eight submeshes.  These
results show that \Metis{} gives a reasonably well balanced
partition of the mesh.

\begin{figure}[thp]
\centerline{ \includegraphics[scale = 0.55]{subdomainfinal.eps}}
\caption{An example subpartitioning of a global mesh into two
submeshes, showing the ghost cells used in the two submeshes.}
\label{fig:subdomainf}
\end{figure}

\paragraph{Ghost cells}

%\begin{figure}[h!]
%  \centerline{ \includegraphics[height=8cm]{subdomain.eps}}
%  \caption{An example subpartitioning of a global mesh into two
%  submeshes.}
%  \label{fig:subdomain}
%\end{figure}
%
%\begin{figure}[b!]
%  \centerline{ \includegraphics[height=8cm]{subdomainghost.eps}}
%  \caption{An example subpartitioning with ghost cells. The numbers
%in brackets shows the local numbering scheme that is calculated and
%stored with the mesh, but not implemented until the local mesh is
%built.}
%  \label{fig:subdomaing}
%\end{figure}

Consider the example subpartitioning given in
Figure~\ref{fig:subdomainf}.  The top mesh represents the global
mesh, whereas the two lower meshes display the partitioning of the
global mesh (together with extra ghost cells to facilitate
communication) onto two processors.  As an example, during the
evolution calculations, cell~2 (residing on processor~0) will need
to access information from its global neighbour, cell~3 (residing on
processor~1).  A standard approach to this problem is to add an
extra layer of cells, which we call ghost cells, to each submesh on
each processor.  The ghost cells hold the information that an
ordinary cell in a submesh needs to complete its calculations.  The
values associated with the ghost cells need to be updated through
communication calls, usually only once every time step (at least for
first order time stepping).  These communication patterns are
determined and recorded when the mesh is subdivided into submeshes.

%Referring to Figure~\ref{fig:subdomainf} we see that during each
%communication call submesh~0 will have to send the updated values
%for global cell~2 and cell~4 to submesh~1, and similarly submesh~1
%will have to send the updated values for global cell~3 and cell~5 to
%submesh~0.

Each processor records its own communication pattern, i.e.\ the
pattern of data it expects to receive from the other processors in
order to update its ghost cells, and the pattern of sends required
to update the ghost cells of the other processors.
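The resulting per time step ghost update can be sketched as follows,
here with \code{mpi4py} and precomputed index arrays (both the
library choice and the names are illustrative assumptions; \AnuGA{}
uses its own \mpi{} wrapper).
\begin{verbatim}
from mpi4py import MPI
import numpy as np

def update_ghosts(U, send_ids, recv_ids, comm=MPI.COMM_WORLD):
    """Exchange ghost cell values once per time step.
    send_ids[p]: local cells whose values processor p needs;
    recv_ids[p]: local ghost cells updated with data from p.
    Each processor is assumed to visit its communication
    partners in the same (sorted) order."""
    for p in sorted(send_ids):
        sendbuf = np.ascontiguousarray(U[send_ids[p]])
        recvbuf = np.empty((len(recv_ids[p]),) + U.shape[1:],
                           dtype=U.dtype)
        comm.Sendrecv(sendbuf, dest=p, recvbuf=recvbuf, source=p)
        U[recv_ids[p]] = recvbuf
\end{verbatim}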
The ghost cells in each submesh are treated exactly the same as any
other cell; the only way to differentiate them is to look at the
communication pattern.  This means that the code is essentially the
same whether it is run in serial or in parallel; the only
differences are a communication call at the end of each time step
and an extra \texttt{if} statement in the local calculation of the
time step constraint, to ensure that the ghost cells are not used in
that calculation.

\section{Performance analysis}
\label{sec:analysis}

To evaluate the performance of the code on a parallel machine we ran
some examples on a cluster of four nodes connected with PathScale
InfiniPath \textsc{htx}.  Each node has two \textsc{amd} Opteron 275
processors (dual core, 2.2~GHz) and 4~GB of main memory.  The system
achieves 60~Gigaflops on the Linpack benchmark, which is about 85\%
of peak performance.

For each test run we evaluate the parallel efficiency as
\[
E_p = \frac{T_1}{pT_p} \times 100 \,,
\]
where $T_p = \max_{0\le i < p}\{t_i\}$, $p$ is the total number of
processors and $t_i$ is the time required to run the evolution code
on processor $i$.  Note that $t_i$ includes only the time required
to do the finite volume evolution calculations, not the time
required to build and subpartition the mesh.

\subsection{Advection, rectangular mesh}

The first example solves an advection equation on a rectangular mesh
of size $n$ by $m$.  Table~\ref{tbl:rpa4080160} shows the efficiency
results for different values of $n$ and $m$.  The examples where
$p \le 4$ were run on one Opteron node containing four processors;
the $p = 8$ example was run on two nodes (giving a total of eight
processors).  The communication within a node is faster than the
communication across nodes, so we would expect to see a decrease in
efficiency when we jump from four to eight processors.  On the other
hand, the efficiency is likely to increase with $n$ and $m$, due to
an increased ratio of interior to exterior triangles and hence an
increased ratio of computation to communication.  The results
displayed in Table~\ref{tbl:rpa4080160} verify this, except perhaps
for the slightly higher than expected efficiency of the $p=2$,
$n=80$, $m=80$ case.  Generally the efficiency results shown here
are consistent and competitive.
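As a concrete check, the tabulated efficiencies follow directly from
the timings via the definition of $E_p$; for example:
\begin{verbatim}
def efficiency(T1, Tp, p):
    """Parallel efficiency E_p = T1/(p*Tp) * 100."""
    return 100.0 * T1 / (p * Tp)

# 40 by 40 advection mesh with p = 8 (timings from the table below):
print(round(efficiency(36.6, 6.39, 8)))  # -> 72
\end{verbatim}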
\begin{table}[h!]
\caption{Parallel efficiency results for the advection problem on
$n$ by $m$ rectangular meshes using $p$
processors.\label{tbl:rpa4080160}}
\begin{center}
\begin{tabular}{|c|c c| c c | c c |}\hline
 & \multicolumn{2}{c|}{40 by 40 mesh} & \multicolumn{2}{|c|}{80 by
80 mesh} & \multicolumn{2}{|c|}{160 by 160 mesh} \\ \hline
$p$ & $T_p$ (sec) & $E_p (\%)$ & $T_p$ (sec) & $E_p (\%)$ & $T_p$
(sec) & $E_p (\%)$ \\\hline
1 & 36.6 &    & 282  &    & 2200 &    \\
2 & 18.8 & 98 & 143  & 99 & 1126 & 98 \\
4 & 10.2 & 90 & 75.1 & 94 & 569  & 97 \\
8 & 6.39 & 72 & 41.7 & 85 & 304  & 90 \\\hline
\end{tabular}
\end{center}
\end{table}

\subsection{Advection, Lake Merimbula mesh}

We now look at another advection example; this time the mesh comes
from a study of water flow in Lake Merimbula, New South Wales.  The
mesh is shown in Figure~\ref{fig:mergrid}.  The results are given in
Table~\ref{tbl:rpm}.  These are good efficiency results, especially
considering the irregular structure of this mesh.

%Note that since we are solving an advection problem the amount of
%calculation done on each triangle is relatively low; when we move
%to other problems that involve more calculations we would expect
%the computation to communication ratio to increase and thus get an
%increase in efficiency.
\begin{table}
\caption{Parallel efficiency results for the advection equation and
the shallow water equation on the Lake Merimbula mesh for $p$
processors.\label{tbl:rpm}}
\begin{center}
\begin{tabular}{|c|c c| c c |}\hline
 & \multicolumn{2}{c|}{Advection} & \multicolumn{2}{|c|}{Shallow
water eqn} \\ \hline
$p$ & $T_p$ (sec) & $E_p (\%)$ & $T_p$ (sec) & $E_p (\%)$\\\hline
1 & 145.0 &    & 7.04 &    \\
2 & 77.5  & 94 & 3.62 & 97 \\
4 & 41.2  & 88 & 1.94 & 91 \\
8 & 23.0  & 79 & 1.15 & 76 \\\hline
\end{tabular}
\end{center}
\end{table}

\subsection{Shallow water, Lake Merimbula mesh}

The final example we look at is the shallow water equation on the
Lake Merimbula mesh.  The results for this case are also listed in
Table~\ref{tbl:rpm}.  The efficiency results for two and four
processors are again good.  For eight processors the efficiency
falls off rather quickly.  On profiling the code we found that this
loss of efficiency is due to the boundary update routine.  To allow
maximum flexibility in experimenting with different boundary
conditions, the boundary routines are written in \Python{} (as
opposed to most of the other computationally intensive kernels,
which are written in \texttt{C}).  When running the code on one
processor the boundary routine accounts for about 72\% of the total
computation time.  The \Metis{} subpartition of the mesh produced an
imbalance in the number of active boundary edges in each
subpartition: the profiler indicated that when running the problem
on eight processors, processor~0 spent about 3.8 times as long on
the boundary calculation as processor~7, indicating that it held
about 3.8 times as many active boundary edges.  This load imbalance
reduced the parallel efficiency.  In the future the boundary routine
will be rewritten in \texttt{C} to reduce its overall contribution
to the computation time and so reduce the effect of this active
boundary imbalance.

\section{Conclusions}
\label{sec:6}

\AnuGA{} is a flexible and robust modelling system that simulates
hydrodynamics by solving the shallow water wave equations on a
triangular mesh.  It models the process of wetting and drying as
water enters and leaves an area, and it is capable of capturing
hydraulic shocks owing to the ability of the finite volume method to
accommodate discontinuities in the solution.  It has been used to
simulate the behaviour of hydrodynamic natural hazards such as
riverine flooding, storm surge and tsunami.
The parallel code will enhance the modelling capability of \AnuGA{}
and will form part of Geoscience Australia's ongoing research effort
to model and understand the potential impact of natural hazards, in
order to reduce their effects on Australian communities.

%\paragraph{Acknowledgements:}
%The authors are grateful to Belinda Barnes, National Centre for
%Epidemiology and Population Health, Australian National University,
%and Matt Hayne and Augusto Sanabria, Risk Research Group, Geoscience
%Australia, for helpful reviews of a previous version of this paper.
%Author Nielsen publishes with the permission of the CEO, Geoscience
%Australia.

% Preferably provide your bibliography as a separate .bbl file.
% Include URLs, DOIs, Math Review numbers or Zentralblatt numbers in
% your bibliography so we help connect the web of science and ensure
% maximum visibility for your article.

\bibliographystyle{plain}
\bibliography{database1}

\end{document}