source: branches/numpy/anuga/geospatial_data/geospatial_data.py

Last change on this file was 6902, checked in by rwilson, 16 years ago

Back-merge from Numeric trunk to numpy branch.

File size: 69.4 KB
Line 
1"""Class Geospatial_data
2
3Manipulation of locations on the planet and associated attributes.
4"""
5
6from sys import maxint
7from os import access, F_OK, R_OK,remove
8from types import DictType
9from warnings import warn
10from string import lower
11from copy import deepcopy
12import copy
13
14from Scientific.IO.NetCDF import NetCDFFile
15import numpy as num
16from numpy.random import randint, seed
17
18from anuga.coordinate_transforms.lat_long_UTM_conversion import UTMtoLL
19from anuga.utilities.numerical_tools import ensure_numeric
20from anuga.coordinate_transforms.geo_reference import Geo_reference, \
21     TitleError, DEFAULT_ZONE, ensure_geo_reference, write_NetCDF_georeference
22from anuga.coordinate_transforms.redfearn import convert_from_latlon_to_utm
23from anuga.utilities.system_tools import clean_line
24from anuga.utilities.anuga_exceptions import ANUGAError
25from anuga.config import points_file_block_line_size as MAX_READ_LINES
26from anuga.config import netcdf_mode_r, netcdf_mode_w, netcdf_mode_a
27from anuga.config import netcdf_float
28
29
30DEFAULT_ATTRIBUTE = 'elevation'
31
32
33##
34# @brief ??
35class Geospatial_data:
36
37    ##
38    # @brief
39    # @param data_points Mx2 iterable of tuples or array of x,y coordinates.
40    # @param attributes Associated values for each data point.
41    # @param geo_reference ??
42    # @param default_attribute_name ??
43    # @param file_name
44    # @param latitudes ??
45    # @param longitudes ??
46    # @param points_are_lats_longs True if points are lat/long, not UTM.
47    # @param max_read_lines Size of block to read, if blocking.
    # @param load_file_now True if blocking but we want to read file now.
49    # @param verbose True if this class instance is verbose.
50    def __init__(self,
51                 data_points=None, # this can also be a points file name
52                 attributes=None,
53                 geo_reference=None,
54                 default_attribute_name=None,
55                 file_name=None,
56                 latitudes=None,
57                 longitudes=None,
58                 points_are_lats_longs=False,
59                 max_read_lines=None,
60                 load_file_now=True,
61                 verbose=False):
62        """Create instance from data points and associated attributes
63
64        data_points: x,y coordinates in meters. Type must be either a
65        sequence of 2-tuples or an Mx2 numeric array of floats.  A file name
66        with extension .txt, .cvs or .pts can also be passed in here.
67
68        attributes: Associated values for each data point. The type
69        must be either a list or an array of length M or a dictionary
70        of lists (or arrays) of length M. In the latter case the keys
71        in the dictionary represent the attribute names, in the former
72        the attribute will get the default name "elevation".
73
74        geo_reference: Object representing the origin of the data
75        points. It contains UTM zone, easting and northing and data
76        points are assumed to be relative to this origin.
77        If geo_reference is None, the default geo ref object is used.
78
79        default_attribute_name: Name of default attribute to be used with
80        get_attribute_values. The idea is that the dataset can be
81        equipped with information about which attribute to return.
82        If None, the default is the "first"
83
84        latitudes, longitudes: Vectors of latitudes and longitudes,
85        used to specify location instead of points.
86
87        points_are_lats_longs: Set this as true if the points are actually
88        lats and longs, not UTM
89
90        max_read_lines: The number of rows read into memory when using
91        blocking to read a file.
92
93        load_file_now:  If true the file is automatically loaded
94        into the geospatial instance. Used when blocking.
95
96        file_name: Name of input netCDF file or .txt file. netCDF file must
97        have dimensions "points" etc.
98        .txt file is a comma seperated file with x, y and attribute
99        data.
100
101        The first line has the titles of the columns.  The first two
102        column titles are checked to see if they start with lat or
103        long (not case sensitive).  If so the data is assumed to be
104        latitude and longitude, in decimal format and converted to
105        UTM.  Otherwise the first two columns are assumed to be the x
106        and y, and the title names acually used are ignored.
107
108
109        The format for a .txt file is:
110            1st line:     [column names]
111            other lines:  x y [attributes]
112
113            for example:
114            x, y, elevation, friction
115            0.6, 0.7, 4.9, 0.3
116            1.9, 2.8, 5, 0.3
117            2.7, 2.4, 5.2, 0.3
118
119        The first two columns have to be x, y or lat, long
120        coordinates.
121
122
123        The format for a Points dictionary is:
124          ['pointlist'] a 2 column array describing points. 1st column x,
125          2nd column y.
126          ['attributelist'], a dictionary of 1D arrays, representing
127          attribute values at the point.  The dictionary key is the attribute
128          header.
129          ['geo_reference'] a Geo_refernece object. Use if the point
130          information is relative. This is optional.
131            eg
132            dic['pointlist'] = [[1.0,2.0],[3.0,5.0]]
133            dic['attributelist']['elevation'] = [[7.0,5.0]
134
135        verbose:
136        """
137
138        if isinstance(data_points, basestring):
139            # assume data_points is really a file name
140            file_name = data_points
141
142        self.set_verbose(verbose)
143        self.geo_reference = None
144        self.file_name = file_name
145
146        if max_read_lines is None:
147            self.max_read_lines = MAX_READ_LINES
148        else:
149            self.max_read_lines = max_read_lines
150
151        if file_name is None:
152            if (latitudes is not None or longitudes is not None
153                    or points_are_lats_longs):
154                data_points, geo_reference = \
155                    _set_using_lat_long(latitudes=latitudes,
156                                        longitudes=longitudes,
157                                        geo_reference=geo_reference,
158                                        data_points=data_points,
159                                        points_are_lats_longs=
160                                            points_are_lats_longs)
161            self.check_data_points(data_points)
162            self.set_attributes(attributes)
163            self.set_geo_reference(geo_reference)
164            self.set_default_attribute_name(default_attribute_name)
165        elif load_file_now is True:
166            # watch for case where file name and points,
167            # attributes etc are provided!!
168            # if file name then all provided info will be removed!
169
170            if verbose is True:
171                if file_name is not None:
172                    print 'Loading Geospatial data from file: %s' % file_name
173
174            self.import_points_file(file_name, verbose=verbose)
175
176            self.check_data_points(self.data_points)
177            self.set_attributes(self.attributes)
178            self.set_geo_reference(self.geo_reference)
179            self.set_default_attribute_name(default_attribute_name)
180
181        if verbose is True:
182            if file_name is not None:
183                print 'Geospatial data created from file: %s' % file_name
184                if load_file_now is False:
185                    print 'Data will be loaded blockwise on demand'
186
187                    if file_name.endswith('csv') or file_name.endswith('txt'):
188                        pass
189                        # This message was misleading.
190                        # FIXME (Ole): Are we blocking here or not?
191                        # print 'ASCII formats are not that great for '
192                        # print 'blockwise reading. Consider storing this'
193                        # print 'data as a pts NetCDF format'
194
195    ##
196    # @brief Return length of the points set.
197    def __len__(self):
198        return len(self.data_points)
199
200    ##
201    # @brief Return a string representation of the points set.
202    def __repr__(self):
203        return str(self.get_data_points(absolute=True))
204
205    ##
206    # @brief Check data points.
207    # @param data_points Points data to check and store in instance.
208    # @note Throws ValueError exception if no data.
209    def check_data_points(self, data_points):
210        """Checks data points"""
211
212        if data_points is None:
213            self.data_points = None
214            msg = 'There is no data or file provided!'
215            raise ValueError, msg
216        else:
217            self.data_points = ensure_numeric(data_points)
218            if not (0,) == self.data_points.shape:
219                assert len(self.data_points.shape) == 2
220                assert self.data_points.shape[1] == 2
221
222    ##
223    # @brief Check and assign attributes data.
224    # @param attributes Dictionary or scalar to save as .attributes.
225    # @note Throws exception if unable to convert dict keys to numeric.
226    def set_attributes(self, attributes):
227        """Check and assign attributes dictionary"""
228
229        if attributes is None:
230            self.attributes = None
231            return
232
233        if not isinstance(attributes, DictType):
234            # Convert single attribute into dictionary
235            attributes = {DEFAULT_ATTRIBUTE: attributes}
236
237        # Check input attributes
238        for key in attributes.keys():
239            try:
240                attributes[key] = ensure_numeric(attributes[key])
241            except:
242                msg = ("Attribute '%s' (%s) could not be converted to a"
243                       "numeric vector" % (str(key), str(attributes[key])))
244                raise Exception, msg
245
246        self.attributes = attributes
247
248    ##
249    # @brief Set the georeference of geospatial data.
250    # @param geo_reference The georeference data to set.
251    # @note Will raise exception if param not instance of Geo_reference.
252    def set_geo_reference(self, geo_reference):
253        """Set the georeference of geospatial data.
254
255        It can also be used to change the georeference and will ensure that
256        the absolute coordinate values are unchanged.
257        """
258
259        if geo_reference is None:
260            # Use default - points are in absolute coordinates
261            geo_reference = Geo_reference()
262
263        # Allow for tuple (zone, xllcorner, yllcorner)
264        geo_reference = ensure_geo_reference(geo_reference)
265
266        if not isinstance(geo_reference, Geo_reference):
267            # FIXME (Ole): This exception will be raised even
268            # if geo_reference is None. Is that the intent Duncan?
269            msg = ('Argument geo_reference must be a valid Geo_reference '
270                   'object or None.')
271            raise Expection, msg
272
273        # If a geo_reference already exists, change the point data according to
274        # the new geo reference
275        if  self.geo_reference is not None:
276            self.data_points = self.get_data_points(geo_reference=geo_reference)
277
278        self.geo_reference = geo_reference
279
280    ##
281    # @brief Set default attribute name.
282    # @param default_attribute_name The default to save.
283    def set_default_attribute_name(self, default_attribute_name):
284        self.default_attribute_name = default_attribute_name
285
286    ##
287    # @brief Set the instance verbose flag.
288    # @param verbose The value to save.
289    # @note Will raise exception if param is not True or False.
290    def set_verbose(self, verbose=False):
291        if verbose in [False, True]:
292            self.verbose = verbose
293        else:
294            msg = 'Illegal value: %s' % str(verbose)
295            raise Exception, msg
296
297    ##
298    # @brief Clip geospatial data by a given polygon.
299    # @param polygon The polygon to clip with.
    # @param closed True if points on clip boundary are included in result.
301    # @param verbose True if this function is verbose.
302    def clip(self, polygon, closed=True, verbose=False):
303        """Clip geospatial data by a polygon
304
305        Input
306          polygon - Either a list of points, an Nx2 array or
307                    a Geospatial data object.
308          closed - (optional) determine whether points on boundary should be
309          regarded as belonging to the polygon (closed = True)
310          or not (closed = False). Default is True.
311
312        Output
313          New geospatial data object representing points inside
314          specified polygon.
315
316
317        Note - this method is non-destructive and leaves the data in 'self'
318        unchanged
319        """
320
321        from anuga.utilities.polygon import inside_polygon
322
323        if isinstance(polygon, Geospatial_data):
324            # Polygon is an object - extract points
325            polygon = polygon.get_data_points()
326
327        points = self.get_data_points()
328        inside_indices = inside_polygon(points, polygon, closed, verbose)
329
330        clipped_G = self.get_sample(inside_indices)
331
332        return clipped_G
333
334    ##
335    # @brief Clip points data by polygon, return points outside polygon.
336    # @param polygon The polygon to clip with.
337    # @param closed True if points on clip boundary are not included in result.
338    # @param verbose True if this function is verbose.
339    def clip_outside(self, polygon, closed=True, verbose=False):
340        """Clip geospatial date by a polygon, keeping data OUTSIDE of polygon
341
342        Input
343          polygon - Either a list of points, an Nx2 array or
344                    a Geospatial data object.
345          closed - (optional) determine whether points on boundary should be
346          regarded as belonging to the polygon (closed = True)
347          or not (closed = False). Default is True.
348
349        Output
350          Geospatial data object representing point OUTSIDE specified polygon
351        """
352
353        from anuga.utilities.polygon import outside_polygon
354
355        if isinstance(polygon, Geospatial_data):
356            # Polygon is an object - extract points
357            polygon = polygon.get_data_points()
358
359        points = self.get_data_points()
360        outside_indices = outside_polygon(points, polygon, closed,verbose)
361
362        clipped_G = self.get_sample(outside_indices)
363
364        return clipped_G
365
366    ##
367    # @brief Get instance geo_reference data.
368    def get_geo_reference(self):
369        return self.geo_reference
370
371    ##
372    # @brief Get coordinates for all data points as an Nx2 array.
373    # @param absolute If True, return UTM, else relative to xll/yll corners.
374    # @param geo_reference If supplied, points are relative to it.
375    # @param as_lat_long If True, return points as lat/lon.
376    # @param isSouthHemisphere If True, return lat/lon points in S.Hemi.
377    # @return A set of data points, in appropriate form.
378    def get_data_points(self,
379                        absolute=True,
380                        geo_reference=None,
381                        as_lat_long=False,
382                        isSouthHemisphere=True):
383        """Get coordinates for all data points as an Nx2 array
384
385        If absolute is False returned coordinates are relative to the
386        internal georeference's xll and yll corners, otherwise
387        absolute UTM coordinates are returned.
388
389        If a geo_reference is passed the points are returned relative
390        to that geo_reference.
391
392        isSH (isSouthHemisphere) is only used when getting data
393        points "as_lat_long" is True and if FALSE will return lats and
394        longs valid for the Northern Hemisphere.
395
396        Default: absolute is True.
397        """
398
399        if as_lat_long is True:
400            msg = "Points need a zone to be converted into lats and longs"
401            assert self.geo_reference is not None, msg
402            zone = self.geo_reference.get_zone()
403            assert self.geo_reference.get_zone() is not DEFAULT_ZONE, msg
404            lats_longs = []
405            for point in self.get_data_points(True):
406                # UTMtoLL(northing, easting, zone,
407                lat_calced, long_calced = UTMtoLL(point[1], point[0],
408                                                  zone, isSouthHemisphere)
409                lats_longs.append((lat_calced, long_calced)) # to hash
410            return lats_longs
411
412        if absolute is True and geo_reference is None:
413            return self.geo_reference.get_absolute(self.data_points)
414        elif geo_reference is not None:
415            return geo_reference.change_points_geo_ref(self.data_points,
416                                                       self.geo_reference)
417        else:
418            # If absolute is False
419            return self.data_points
420
421    ##
422    # @brief Get value for attribute name.
423    # @param attribute_name Name to get value for.
424    # @note If name passed is None, return default attribute value.
425    def get_attributes(self, attribute_name=None):
426        """Return values for one named attribute.
427
428        If attribute_name is None, default_attribute_name is used
429        """
430
431        if attribute_name is None:
432            if self.default_attribute_name is not None:
433                attribute_name = self.default_attribute_name
434            else:
435                attribute_name = self.attributes.keys()[0]
436                # above line takes the first one from keys
437
438        if self.verbose is True:
439            print 'Using attribute %s' %attribute_name
440            print 'Available attributes: %s' %(self.attributes.keys())
441
442        msg = 'Attribute name %s does not exist in data set' % attribute_name
443        assert self.attributes.has_key(attribute_name), msg
444
445        return self.attributes[attribute_name]
446
447    ##
448    # @brief Get all instance attributes.
449    # @return The instance attribute dictionary, or None if no attributes.
450    def get_all_attributes(self):
451        """Return values for all attributes.
452        The return value is either None or a dictionary (possibly empty).
453        """
454
455        return self.attributes
456
457    ##
458    # @brief Override __add__() to allow addition of geospatial objects.
459    # @param self This object.
460    # @param other The second object.
461    # @return The new geospatial object.
462    def __add__(self, other):
463        """Returns the addition of 2 geospatial objects,
464        objects are concatencated to the end of each other
465
466        NOTE: doesn't add if objects contain different
467        attributes
468
469        Always return absolute points!
470        This also means, that if you add None to the object,
471        it will be turned into absolute coordinates
472
473        other can be None in which case nothing is added to self.
474        """
475
476        # find objects zone and checks if the same
477        geo_ref1 = self.get_geo_reference()
478        zone1 = geo_ref1.get_zone()
479
480        if other is not None:
481            geo_ref2 = other.get_geo_reference()
482            zone2 = geo_ref2.get_zone()
483            geo_ref1.reconcile_zones(geo_ref2)
484            new_points = num.concatenate((self.get_data_points(absolute=True),
485                                          other.get_data_points(absolute=True)),
486                                         axis = 0)
487
488            # Concatenate attributes if any
489            if self.attributes is None:
490                if other.attributes is not None:
491                    msg = ('Geospatial data must have the same '
492                           'attributes to allow addition.')
493                    raise Exception, msg
494
495                new_attributes = None
496            else:
497                new_attributes = {}
498                for x in self.attributes.keys():
499                    if other.attributes.has_key(x):
500                        attrib1 = self.attributes[x]
501                        attrib2 = other.attributes[x]
502                        new_attributes[x] = num.concatenate((attrib1, attrib2),
503                                                            axis=0) #??default#
504                    else:
505                        msg = ('Geospatial data must have the same '
506                               'attributes to allow addition.')
507                        raise Exception, msg
508        else:
509            # other is None:
510            new_points = self.get_data_points(absolute=True)
511            new_attributes = self.attributes
512
513        # Instantiate new data object and return absolute coordinates
514        new_geo_ref = Geo_reference(geo_ref1.get_zone(), 0.0, 0.0)
515        return Geospatial_data(new_points, new_attributes, new_geo_ref)
516
517    ##
518    # @brief Override the addition case where LHS isn't geospatial object.
519    # @param self This object.
520    # @param other The second object.
521    # @return The new geospatial object.
522    def __radd__(self, other):
523        """Handle cases like None + Geospatial_data(...)"""
524
525        return self + other
526
527################################################################################
528#  IMPORT/EXPORT POINTS FILES
529################################################################################
530
531    ##
532    # @brief Import a .txt, .csv or .pts points data file.
533    # @param file_name
534    # @param delimiter
535    # @param verbose True if this function is to be verbose.
536    # @note Will throw IOError or SyntaxError if there is a problem.
537    def import_points_file(self, file_name, delimiter=None, verbose=False):
538        """ load an .txt, .csv or .pts file
539
540        Note: will throw an IOError/SyntaxError if it can't load the file.
541        Catch these!
542
543        Post condition: self.attributes dictionary has been set
544        """
545
546        if access(file_name, F_OK) == 0 :
547            msg = 'File %s does not exist or is not accessible' % file_name
548            raise IOError, msg
549
550        attributes = {}
551        if file_name[-4:] == ".pts":
552            try:
553                data_points, attributes, geo_reference = \
554                             _read_pts_file(file_name, verbose)
555            except IOError, e:
556                msg = 'Could not open file %s ' % file_name
557                raise IOError, msg
558        elif file_name[-4:] == ".txt" or file_name[-4:]== ".csv":
559            try:
560                data_points, attributes, geo_reference = \
561                             _read_csv_file(file_name, verbose)
562            except IOError, e:
563                # This should only be if a file is not found
564                msg = ('Could not open file %s. Check the file location.'
565                       % file_name)
566                raise IOError, msg
567            except SyntaxError, e:
568                # This should only be if there is a format error
569                msg = ('Problem with format of file %s.\n%s'
570                       % (file_name, Error_message['IOError']))
571                raise SyntaxError, msg
572        else:
573            msg = 'Extension %s is unknown' % file_name[-4:]
574            raise IOError, msg
575
576        self.data_points = data_points
577        self.attributes = attributes
578        self.geo_reference = geo_reference
579
580    ##
581    # @brief Write points data to a file (.csv or .pts).
582    # @param file_name Path to file to write.
583    # @param absolute ??
584    # @param as_lat_long ??
585    # @param isSouthHemisphere ??
586    def export_points_file(self, file_name, absolute=True,
587                           as_lat_long=False, isSouthHemisphere=True):
588        """write a points file as a text (.csv) or binary (.pts) file
589
590        file_name is the file name, including the extension
591        The point_dict is defined at the top of this file.
592
593        If absolute is True data the xll and yll are added to the points value
594        and the xll and yll of the geo_reference are set to 0.
595
596        If absolute is False data points at returned as relative to the xll
597        and yll and geo_reference remains uneffected
598
599        isSouthHemisphere: is only used when getting data
600        points "as_lat_long" is True and if FALSE will return lats and
601        longs valid for the Northern Hemisphere.
602        """
603
604        if (file_name[-4:] == ".pts"):
605            if absolute is True:
606                geo_ref = deepcopy(self.geo_reference)
607                geo_ref.xllcorner = 0
608                geo_ref.yllcorner = 0
609                _write_pts_file(file_name,
610                                self.get_data_points(absolute),
611                                self.get_all_attributes(),
612                                geo_ref)
613            else:
614                _write_pts_file(file_name,
615                                self.get_data_points(absolute),
616                                self.get_all_attributes(),
617                                self.get_geo_reference())
618        elif file_name[-4:] == ".txt" or file_name[-4:] == ".csv":
619            msg = "ERROR: trying to write a .txt file with relative data."
620            assert absolute, msg
621            _write_csv_file(file_name,
622                            self.get_data_points(absolute=True,
623                                                 as_lat_long=as_lat_long,
624                                           isSouthHemisphere=isSouthHemisphere),
625                            self.get_all_attributes(),
626                            as_lat_long=as_lat_long)
627        elif file_name[-4:] == ".urs" :
628            msg = "ERROR: Can not write a .urs file as a relative file."
629            assert absolute, msg
630            _write_urs_file(file_name,
631                            self.get_data_points(as_lat_long=True,
632                                           isSouthHemisphere=isSouthHemisphere))
633        else:
634            msg = 'Unknown file type %s ' %file_name
635            raise IOError, msg
636
637    ##
638    # @brief Get a subset of data that is referred to by 'indices'.
639    # @param indices A list of indices to select data subset with.
640    # @return A geospatial object containing data subset.
641    def get_sample(self, indices):
642        """ Returns a object which is a subset of the original
643        and the data points and attributes in this new object refer to
644        the indices provided
645
646        Input
647            indices- a list of integers that represent the new object
648        Output
649            New geospatial data object representing points specified by
650            the indices
651        """
652
653        # FIXME: add the geo_reference to this
654        points = self.get_data_points()
655        sampled_points = num.take(points, indices, axis=0)
656
657        attributes = self.get_all_attributes()
658
659        sampled_attributes = {}
660        if attributes is not None:
661            for key, att in attributes.items():
662                sampled_attributes[key] = num.take(att, indices, axis=0)
663
664        return Geospatial_data(sampled_points, sampled_attributes)
665
666    ##
667    # @brief Split one geospatial object into two.
668    # @param factor Relative size to make first result object.
669    # @param seed_num Random 'seed' - used only for unit test.
670    # @param verbose True if this function is to be verbose.
671    # @note Points in each result object are selected randomly.
672    def split(self, factor=0.5, seed_num=None, verbose=False):
673        """Returns two geospatial_data object, first is the size of the 'factor'
674        smaller the original and the second is the remainder. The two
675        new objects are disjoint sets of each other.
676
677        Points of the two new object have selected RANDOMLY.
678
679        This method create two lists of indices which are passed into
680        get_sample.  The lists are created using random numbers, and
681        they are unique sets eg.  total_list(1,2,3,4,5,6,7,8,9)
682        random_list(1,3,6,7,9) and remainder_list(0,2,4,5,8)
683
684        Input -  the factor which to split the object, if 0.1 then 10% of the
685                 together object will be returned
686
687        Output - two geospatial_data objects that are disjoint sets of the
688                 original
689        """
690
691        i = 0
692        self_size = len(self)
693        random_list = []
694        remainder_list = []
695        new_size = round(factor * self_size)
696
697        # Find unique random numbers
698        if verbose: print "make unique random number list and get indices"
699
700        total = num.array(range(self_size), num.int)    #array default#
701        total_list = total.tolist()
702
703        if verbose: print "total list len", len(total_list)
704
705        # There will be repeated random numbers however will not be a
706        # problem as they are being 'pop'ed out of array so if there
707        # are two numbers the same they will pop different indicies,
708        # still basically random
709        ## create list of non-unquie random numbers
710        if verbose: print "create random numbers list %s long" %new_size
711
712        # Set seed if provided, mainly important for unit test!
713        # plus recalcule seed when no seed provided.
714        if seed_num is not None:
715            seed(seed_num)
716        else:
717            seed()
718
719        #if verbose: print "seed:", get_seed()
720
721        random_num = randint(0, self_size-1, (int(new_size),))
722        random_num = random_num.tolist()
723
724        # need to sort and reverse so the pop() works correctly
725        random_num.sort()
726        random_num.reverse()
727
728        if verbose: print "make random number list and get indices"
729
730        j = 0
731        k = 1
732        remainder_list = total_list[:]
733
734        # pops array index (random_num) from remainder_list
735        # (which starts as the total_list and appends to random_list)
736        random_num_len = len(random_num)
737        for i in random_num:
738            random_list.append(remainder_list.pop(i))
739            j += 1
740            # prints progress
741            if verbose and round(random_num_len/10*k) == j:
742                print '(%s/%s)' % (j, random_num_len)
743                k += 1
744
745        # FIXME: move to tests, it might take a long time
746        # then create an array of random length between 500 and 1000,
747        # and use a random factor between 0 and 1
748        # setup for assertion
749        test_total = random_list[:]
750        test_total.extend(remainder_list)
751        test_total.sort()
752        msg = ('The two random lists made from the original list when added '
753               'together DO NOT equal the original list')
754        assert total_list == test_total, msg
755
756        # Get new samples
757        if verbose: print "get values of indices for random list"
758        G1 = self.get_sample(random_list)
759        if verbose: print "get values of indices for opposite of random list"
760        G2 = self.get_sample(remainder_list)
761
762        return G1, G2
763
764    ##
765    # @brief Allow iteration over this object.
766    def __iter__(self):
767        """Read in the header, number_of_points and save the
768        file pointer position
769        """
770
771        # FIXME - what to do if the file isn't there
772
773        # FIXME (Ole): Shouldn't this go into the constructor?
774        # This method acts like the constructor when blocking.
775        # ... and shouldn't it be called block_size?
776        #
777        if self.max_read_lines is None:
778            self.max_read_lines = MAX_READ_LINES
779
780        if self.file_name[-4:] == ".pts":
781            # See if the file is there.  Throw a QUIET IO error if it isn't
782            fd = open(self.file_name,'r')
783            fd.close()
784
785            # Throws prints to screen if file not present
786            self.fid = NetCDFFile(self.file_name, netcdf_mode_r)
787
788            (self.blocking_georef,
789             self.blocking_keys,
790             self.number_of_points) = _read_pts_file_header(self.fid,
791                                                            self.verbose)
792            self.start_row = 0
793            self.last_row = self.number_of_points
794            self.show_verbose = 0
795            self.verbose_block_size = (self.last_row + 10)/10
796            self.block_number = 0
797            self.number_of_blocks = self.number_of_points/self.max_read_lines
798            # This computes the number of full blocks. The last block may be
799            # smaller and won't be included in this estimate.
800
801            if self.verbose is True:
802                print ('Reading %d points (in ~%d blocks) from file %s. '
803                       % (self.number_of_points, self.number_of_blocks,
804                          self.file_name)),
805                print ('Each block consists of %d data points'
806                       % self.max_read_lines)
807        else:
808            # Assume the file is a csv file
809            file_pointer = open(self.file_name)
810            self.header, self.file_pointer = _read_csv_file_header(file_pointer)
811            self.blocking_georef = None # Used for reconciling zones
812
813        return self
814
815    ##
816    # @brief Read another block into the instance.
817    def next(self):
818        """read a block, instanciate a new geospatial and return it"""
819
820        if self.file_name[-4:] == ".pts":
821            if self.start_row == self.last_row:
822                # Read the end of the file last iteration
823                # Remove blocking attributes
824                self.fid.close()
825                del self.max_read_lines
826                del self.blocking_georef
827                del self.last_row
828                del self.start_row
829                del self.blocking_keys
830                del self.fid
831                raise StopIteration
832            fin_row = self.start_row + self.max_read_lines
833            if fin_row > self.last_row:
834                fin_row = self.last_row
835
836            if self.verbose is True:
837                if (self.show_verbose >= self.start_row
838                    and self.show_verbose < fin_row):
839                    print ('Reading block %d (points %d to %d) out of %d'
840                           % (self.block_number, self.start_row,
841                              fin_row, self.number_of_blocks))
842
843                    self.show_verbose += max(self.max_read_lines,
844                                             self.verbose_block_size)
845
846            # Read next block
847            pointlist, att_dict, = _read_pts_file_blocking(self.fid,
848                                                           self.start_row,
849                                                           fin_row,
850                                                           self.blocking_keys)
851
852            geo = Geospatial_data(pointlist, att_dict, self.blocking_georef)
853            self.start_row = fin_row
854
855            self.block_number += 1
856        else:
857            # Assume the file is a csv file
858            try:
859                (pointlist,
860                 att_dict,
861                 geo_ref,
862                 self.file_pointer) = _read_csv_file_blocking(self.file_pointer,
863                                                              self.header[:],
864                                                              max_read_lines=
865                                                           self.max_read_lines,
866                                                              verbose=
867                                                                  self.verbose)
868
869                # Check that the zones haven't changed.
870                if geo_ref is not None:
871                    geo_ref.reconcile_zones(self.blocking_georef)
872                    self.blocking_georef = geo_ref
873                elif self.blocking_georef is not None:
874                    msg = ('Geo reference given, then not given.'
875                           ' This should not happen.')
876                    raise ValueError, msg
877                geo = Geospatial_data(pointlist, att_dict, geo_ref)
878            except StopIteration:
879                self.file_pointer.close()
880                del self.header
881                del self.file_pointer
882                raise StopIteration
883            except ANUGAError:
884                self.file_pointer.close()
885                del self.header
886                del self.file_pointer
887                raise
888            except SyntaxError:
889                self.file_pointer.close()
890                del self.header
891                del self.file_pointer
892                # This should only be if there is a format error
893                msg = ('Could not open file %s.\n%s'
894                       % (self.file_name, Error_message['IOError']))
895                raise SyntaxError, msg
896        return geo
897
898##################### Error messages ###########
# Help texts appended to error messages, keyed by the kind of error.
# 'Em' is a short alias for the same dictionary.
Error_message = {
    'IOError': ('NOTE: The format for a comma separated .txt/.csv file is:\n'
                '        1st line:     [column names]\n'
                '        other lines:  [x value], [y value], [attributes]\n'
                '\n'
                '           for example:\n'
                '           x, y, elevation, friction\n'
                '           0.6, 0.7, 4.9, 0.3\n'
                '           1.9, 2.8, 5, 0.3\n'
                '           2.7, 2.4, 5.2, 0.3\n'
                '\n'
                'The first two columns are assumed to be x, y coordinates.\n'
                'The attribute values must be numeric.\n'),
}
Em = Error_message
913
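##
# @brief Convert latitude/longitude input to UTM points and a georeference.
# @param latitudes Latitudes of the points (replaced by column 0 of
#                  data_points if points_are_lats_longs is True).
# @param longitudes Longitudes of the points (replaced by the remaining
#                   column(s) of data_points if points_are_lats_longs is True).
# @param geo_reference Must be None, otherwise ValueError is raised.
# @param data_points Points in (lat, long) order; only allowed when
#                    points_are_lats_longs is True.
# @param points_are_lats_longs True if data_points holds the lat/long values.
# @note Called by _read_csv_file_blocking().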
922def _set_using_lat_long(latitudes,
923                        longitudes,
924                        geo_reference,
925                        data_points,
926                        points_are_lats_longs):
927    """If the points has lat long info, assume it is in (lat, long) order."""
928
929    if geo_reference is not None:
930        msg = ('A georeference is specified yet latitude and longitude '
931               'are also specified!')
932        raise ValueError, msg
933
934    if data_points is not None and not points_are_lats_longs:
935        msg = ('Data points are specified yet latitude and longitude are '
936               'also specified.')
937        raise ValueError, msg
938
939    if points_are_lats_longs:
940        if data_points is None:
941            msg = "Data points are not specified."
942            raise ValueError, msg
943        lats_longs = ensure_numeric(data_points)
944        latitudes = num.ravel(lats_longs[:,0:1])
945        longitudes = num.ravel(lats_longs[:,1:])
946
947    if latitudes is None and longitudes is None:
948        msg = "Latitudes and Longitudes are not specified."
949        raise ValueError, msg
950
951    if latitudes is None:
952        msg = "Longitudes are specified yet latitudes aren't."
953        raise ValueError, msg
954
955    if longitudes is None:
956        msg = "Latitudes are specified yet longitudes aren't."
957        raise ValueError, msg
958
959    data_points, zone  = convert_from_latlon_to_utm(latitudes=latitudes,
960                                                    longitudes=longitudes)
961    return data_points, Geo_reference(zone=zone)
962
963
964##
965# @brief Read a .pts data file.
966# @param file_name Path to file to read.
967# @param verbose True if this function is to be verbose.
968# @return (pointlist, attributes, geo_reference)
969def _read_pts_file(file_name, verbose=False):
970    """Read .pts NetCDF file
971
972    Return a (dict_points, dict_attribute, geo_ref)
973    eg
974    dict['points'] = [[1.0,2.0],[3.0,5.0]]
975    dict['attributelist']['elevation'] = [[7.0,5.0]]
976    """
977
978    if verbose: print 'Reading ', file_name
979
980    # See if the file is there.  Throw a QUIET IO error if it isn't
981    fd = open(file_name,'r')
982    fd.close()
983
984    # Throws prints to screen if file not present
985    fid = NetCDFFile(file_name, netcdf_mode_r)
986
987    pointlist = num.array(fid.variables['points'])
988    keys = fid.variables.keys()
989
990    if verbose: print 'Got %d variables: %s' % (len(keys), keys)
991
992    try:
993        keys.remove('points')
994    except IOError, e:
995        fid.close()
996        msg = "Expected keyword 'points' but could not find it"
997        raise IOError, msg
998
999    attributes = {}
1000    for key in keys:
1001        if verbose: print "reading attribute '%s'" % key
1002
1003        attributes[key] = num.array(fid.variables[key])
1004
1005    try:
1006        geo_reference = Geo_reference(NetCDFObject=fid)
1007    except AttributeError, e:
1008        geo_reference = None
1009
1010    fid.close()
1011
1012    return pointlist, attributes, geo_reference
1013
1014
1015##
1016# @brief Read a .csv data file.
1017# @param file_name Path to the .csv file to read.
1018# @param verbose True if this function is to be verbose.
def _read_csv_file(file_name, verbose=False):
    """Read a whole .csv file in one go.

    Returns the tuple (pointlist, att_dict, geo_ref) produced by
    _read_csv_file_blocking(), eg
      pointlist = [[1.0,2.0],[3.0,5.0]]
      att_dict['elevation'] = [7.0,5.0]

    Exceptions raised while reading (eg ANUGAError, SyntaxError,
    StopIteration for a file without data rows) are passed on to the
    caller; the file is closed in every case.
    """

    file_pointer = open(file_name)
    try:
        header, file_pointer = _read_csv_file_header(file_pointer)
        (pointlist,
         att_dict,
         geo_ref,
         file_pointer) = _read_csv_file_blocking(file_pointer,
                                                 header,
                                                 max_read_lines=1e30)
                                    # If the file is bigger that this, block..
                                    # FIXME (Ole) What's up here?
    finally:
        # Close on success and on every kind of failure
        file_pointer.close()

    return pointlist, att_dict, geo_ref
1046
1047
1048##
1049# @brief Read a .csv file header.
1050# @param file_pointer Open descriptor of the file to read.
1051# @param delimiter Header line delimiter string, split on this string.
1052# @param verbose True if this function is to be verbose.
1053# @return A tuple of (<cleaned header string>, <input file_pointer>)
1054
CSV_DELIMITER = ','

def _read_csv_file_header(file_pointer,
                          delimiter=CSV_DELIMITER,
                          verbose=False):
    """Consume the first line of an open .csv file and split it up.

    Returns (header, file_pointer) where header is the result of
    clean_line() applied to the first line and 'delimiter'.
    'verbose' is accepted but not used.
    """

    first_line = file_pointer.readline()
    return clean_line(first_line, delimiter), file_pointer
1068
1069##
1070# @brief Read a .csv file, with blocking.
1071# @param file_pointer Open descriptor of the file to read.
1072# @param header List of already read .csv header fields.
1073# @param delimiter Delimiter string header was split on.
1074# @param max_read_lines The max number of lines to read before blocking.
1075# @param verbose True if this function is to be verbose.
1076# @note Will throw IndexError, SyntaxError exceptions.
def _read_csv_file_blocking(file_pointer,
                            header,
                            delimiter=CSV_DELIMITER,
                            max_read_lines=MAX_READ_LINES,
                            verbose=False):
    """Read up to 'max_read_lines' data rows from the body of a .csv file.

    file_pointer: open file, positioned after the header line.
    header: The list header of the csv file, with the x and y labels.
            The caller's list is not modified.
    delimiter: string the rows are split on.
    max_read_lines: maximum number of data rows to read in this call.
    verbose: accepted but not used.

    Lines starting with '#' are skipped.  Reading stops at the first
    other line that yields fewer than two fields (eg end of file).

    Returns (pointlist, att_dict, geo_ref, file_pointer).  If the first
    two header names start with 'lat'/'lon' (any case) the points are
    converted with _set_using_lat_long() and geo_ref is the georeference
    it returns; otherwise geo_ref is None.

    Raises SyntaxError if the header has fewer than two columns, if a row
    does not match the header (the file is closed in that case) or if a
    value is not numeric.  Raises StopIteration if no data row was read.
    """

    points = []
    att_dict = {}

    # This is to remove the x and y headers.
    header = header[:]
    try:
        x_header = header.pop(0)
        y_header = header.pop(0)
    except IndexError:
        # if there are not two columns this will occur.
        # eg if it is a space seperated file
        raise SyntaxError

    read_lines = 0
    while read_lines < max_read_lines:
        line = file_pointer.readline()

        # Skip comments before counting fields, so that a comment without
        # any delimiter in it does not end the block prematurely.
        if line.startswith('#'):
            continue

        numbers = clean_line(line, delimiter)
        if len(numbers) <= 1:
            break

        read_lines += 1

        try:
            x = float(numbers[0])
            y = float(numbers[1])
            points.append([x,y])
            numbers.pop(0)
            numbers.pop(0)
            if len(header) != len(numbers):
                file_pointer.close()
                msg = ('File load error. '
                       'There might be a problem with the file header.')
                raise SyntaxError(msg)
            for i,n in enumerate(numbers):
                # str.strip() returns a new string, so keep the result
                n = n.strip()
                if n != '':
                    att_dict.setdefault(header[i],[]).append(float(n))
        except ValueError:
            msg = ('File load error. '
                   'Could not convert a value to a number in line: %s'
                   % line)
            raise SyntaxError(msg)

    if points == []:
        raise StopIteration

    pointlist = num.array(points, float)
    for key in att_dict.keys():
        att_dict[key] = num.array(att_dict[key], float)

    # Do stuff here so the info is in lat's and longs
    geo_ref = None
    x_header = x_header[:3].lower()
    y_header = y_header[:3].lower()
    if x_header in ('lon', 'lat') and y_header in ('lon', 'lat'):
        if x_header == 'lon':
            longitudes = num.ravel(pointlist[:,0:1])
            latitudes = num.ravel(pointlist[:,1:])
        else:
            latitudes = num.ravel(pointlist[:,0:1])
            longitudes = num.ravel(pointlist[:,1:])

        pointlist, geo_ref = _set_using_lat_long(latitudes,
                                                 longitudes,
                                                 geo_reference=None,
                                                 data_points=None,
                                                 points_are_lats_longs=False)

    return pointlist, att_dict, geo_ref, file_pointer
1156
1157
1158##
1159# @brief Read a .pts file header.
1160# @param fid Handle to the open .pts file.
1161# @param verbose True if the function is to be verbose.
1162# @return (geo_reference, keys, fid.dimensions['number_of_points'])
1163# @note Will throw IOError and AttributeError exceptions.
1164def _read_pts_file_header(fid, verbose=False):
1165    '''Read the geo_reference and number_of_points from a .pts file'''
1166
1167    keys = fid.variables.keys()
1168    try:
1169        keys.remove('points')
1170    except IOError, e:
1171        fid.close()
1172        msg = "Expected keyword 'points' but could not find it."
1173        raise IOError, msg
1174
1175    if verbose: print 'Got %d variables: %s' % (len(keys), keys)
1176
1177    try:
1178        geo_reference = Geo_reference(NetCDFObject=fid)
1179    except AttributeError, e:
1180        geo_reference = None
1181
1182    return geo_reference, keys, fid.dimensions['number_of_points']
1183
1184
1185##
1186# @brief Read the body of a .pts file, with blocking.
1187# @param fid Handle to already open file.
1188# @param start_row Start row index of points to return.
1189# @param fin_row End row index of points to return.
1190# @param keys Iterable of keys to return.
1191# @return Tuple of (pointlist, attributes).
1192def _read_pts_file_blocking(fid, start_row, fin_row, keys):
1193    '''Read the body of a .pts file.'''
1194
1195    pointlist = num.array(fid.variables['points'][start_row:fin_row])
1196
1197    attributes = {}
1198    for key in keys:
1199        attributes[key] = num.array(fid.variables[key][start_row:fin_row])
1200
1201    return pointlist, attributes
1202
1203
1204##
1205# @brief Write a .pts data file.
1206# @param file_name Path to the file to write.
1207# @param write_data_points Data points to write.
1208# @param write_attributes Attributes to write.
1209# @param write_geo_reference Georef to write.
def _write_pts_file(file_name,
                    write_data_points,
                    write_attributes=None,
                    write_geo_reference=None):
    """Write a .pts NetCDF file.

    file_name: path of the file to write.
    write_data_points: array of points; shape[0] is used as the number
        of points and two values are stored per point.
    write_attributes: optional dictionary mapping an attribute name to
        one value per point.
    write_geo_reference: optional georeference, stored with
        write_NetCDF_georeference().

    NOTE: every attribute is stored as a NetCDF variable next to the
    'points' variable, so there can't be an attribute called 'points'.
    """

    # NetCDF file definition
    outfile = NetCDFFile(file_name, netcdf_mode_w)

    try:
        # Create new file
        outfile.institution = 'Geoscience Australia'
        outfile.description = ('NetCDF format for compact and portable '
                               'storage of spatial point data')

        # Dimension definitions
        shape = write_data_points.shape[0]
        outfile.createDimension('number_of_points', shape)
        outfile.createDimension('number_of_dimensions', 2) # This is 2d data

        # Variable definition
        outfile.createVariable('points', netcdf_float,
                               ('number_of_points', 'number_of_dimensions'))

        # create variables
        outfile.variables['points'][:] = write_data_points

        if write_attributes is not None:
            for key in write_attributes.keys():
                outfile.createVariable(key, netcdf_float,
                                       ('number_of_points',))
                outfile.variables[key][:] = write_attributes[key]

        if write_geo_reference is not None:
            write_NetCDF_georeference(write_geo_reference, outfile)
    finally:
        # Close the file even if writing fails part way through
        outfile.close()
1260
1261
1262##
1263# @brief Write a .csv data file.
1264# @param file_name Path to the file to write.
1265# @param write_data_points Data points to write.
1266# @param write_attributes Attributes to write.
1267# @param as_lat_long True if points are lat/lon, else x/y.
1268# @param delimiter The CSV delimiter to use.
1269def _write_csv_file(file_name,
1270                    write_data_points,
1271                    write_attributes=None,
1272                    as_lat_long=False,
1273                    delimiter=','):
1274    """Write a .csv file."""
1275
1276    points = write_data_points
1277    pointattributes = write_attributes
1278
1279    fd = open(file_name, 'w')
1280
1281    if as_lat_long:
1282        titlelist = "latitude" + delimiter + "longitude"  + delimiter
1283    else:
1284        titlelist = "x" + delimiter + "y"  + delimiter
1285
1286    if pointattributes is not None:
1287        for title in pointattributes.keys():
1288            titlelist = titlelist + title + delimiter
1289        titlelist = titlelist[0:-len(delimiter)] # remove the last delimiter
1290
1291    fd.write(titlelist + "\n")
1292
1293    # <x/lat> <y/long> [attributes]
1294    for i, vert in enumerate( points):
1295        if pointattributes is not None:
1296            attlist = ","
1297            for att in pointattributes.keys():
1298                attlist = attlist + str(pointattributes[att][i]) + delimiter
1299            attlist = attlist[0:-len(delimiter)] # remove the last delimiter
1300            attlist.strip()
1301        else:
1302            attlist = ''
1303
1304        fd.write(str(vert[0]) + delimiter + str(vert[1]) + attlist + "\n")
1305
1306    fd.close()
1307
1308
1309##
1310# @brief Write a URS file.
1311# @param file_name The path of the file to write.
1312# @param points
1313# @param delimiter
1314def _write_urs_file(file_name, points, delimiter=' '):
1315    """Write a URS format file.
1316    export a file, file_name, with the urs format
1317    the data points are in lats and longs
1318    """
1319
1320    fd = open(file_name, 'w')
1321
1322    # first line is # points
1323    fd.write(str(len(points)) + "\n")
1324
1325    # <lat> <long> <id#>
1326    for i, vert in enumerate( points):
1327        fd.write(str(round(vert[0],7)) + delimiter +
1328                 str(round(vert[1],7)) + delimiter + str(i) + "\n")
1329
1330    fd.close()
1331
1332
1333##
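# @brief Convert the point and attribute lists of a points dictionary to
#        float arrays, in place.
# @param point_atts Dictionary with the keys 'pointlist' and 'attributelist'.
# @return The same dictionary, with its values replaced by arrays.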
1337def _point_atts2array(point_atts):
1338    point_atts['pointlist'] = num.array(point_atts['pointlist'], num.float)
1339
1340    for key in point_atts['attributelist'].keys():
1341        point_atts['attributelist'][key] = \
1342                num.array(point_atts['attributelist'][key], num.float)
1343
1344    return point_atts
1345
1346
1347##
1348# @brief Convert geospatial object to a points dictionary.
1349# @param geospatial_data The geospatial object to convert.
1350# @return A points dictionary.
def geospatial_data2points_dictionary(geospatial_data):
    """Build a points dictionary from a geospatial data object.

    The result has the keys 'pointlist' (the object's data_points),
    'attributelist' (a new dictionary holding the object's attributes)
    and 'geo_reference' (the object's geo_reference).
    """

    return {
        'pointlist': geospatial_data.data_points,
        'attributelist': dict(
            (name, geospatial_data.attributes[name])
            for name in geospatial_data.attributes.keys()),
        'geo_reference': geospatial_data.geo_reference,
    }
1366
1367
1368##
1369# @brief Convert a points dictionary to a geospatial object.
1370# @param points_dictionary A points dictionary to convert.
def points_dictionary2geospatial_data(points_dictionary):
    """Convert points_dictionary to geospatial data object.

    points_dictionary: dictionary with the keys 'pointlist' and
        'attributelist' and, optionally, 'geo_reference'.

    Returns a Geospatial_data object built from these three values; the
    georeference is None if the key 'geo_reference' is missing.

    Raises AssertionError if one of the two required keys is missing.
    """

    msg = "Points dictionary must have key 'pointlist'"
    assert 'pointlist' in points_dictionary, msg

    msg = "Points dictionary must have key 'attributelist'"
    assert 'attributelist' in points_dictionary, msg

    geo = points_dictionary.get('geo_reference')

    return Geospatial_data(points_dictionary['pointlist'],
                           points_dictionary['attributelist'],
                           geo_reference=geo)
1388
1389
1390##
1391# @brief Ensure that points are in absolute coordinates.
1392# @param points A list or array of points to check, or geospatial object.
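# @param geo_reference If supplied, a Geo_reference object or a sequence of
#                      three values passed to Geo_reference(); the points
#                      are converted with its get_absolute() method.
# @return The points in absolute coordinates.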
def ensure_absolute(points, geo_reference=None):
    """Ensure that points are in absolute coordinates.

    This function inputs several formats and
    outputs one format. - a numeric array of absolute points.

    Input formats are;
      points: List or numeric array of coordinate pairs [xi, eta] of
              points or geospatial object or points file name

      geo_reference: A geo_reference object or 3-tuples consisting of
                     UTM zone, easting and northing.
                     If specified the points are assumed to be
                     relative to it.  It must not be given together
                     with a geospatial object or a points file name.
    """

    # Input check
    if isinstance(points, basestring):
        # It's a string - assume it is a point file
        points = Geospatial_data(file_name=points)

    if isinstance(points, Geospatial_data):
        points = points.get_data_points(absolute=True)
        msg = 'Use a Geospatial_data object or a mesh origin, not both.'
        # Identity test: does not invoke any comparison method of
        # the geo_reference object
        assert geo_reference is None, msg
    else:
        points = ensure_numeric(copy.copy(points), float)

    # Sort out geo_reference and convert points
    if geo_reference is not None:
        if isinstance(geo_reference, Geo_reference):
            geo = geo_reference
        else:
            geo = Geo_reference(geo_reference[0],
                                geo_reference[1],
                                geo_reference[2])
        points = geo.get_absolute(points)

    return points
1436
1437
1438##
1439# @brief
1440# @param points
1441# @param geo_reference
1442# @return A geospatial object.
def ensure_geospatial(points, geo_reference=None):
    """Convert various data formats to a geospatial_data instance.

    Inputed formats are;
    points:        List or numeric array of coordinate pairs [xi, eta] of
                   points or geospatial object

    geo_reference: A geo_reference object or 3-tuples consisting of
                   UTM zone, easting and northing.
                   It is handed to the new geospatial object together
                   with the points.  It must not be given together
                   with a geospatial object.

    A geospatial object passed in as 'points' is returned unchanged.
    """

    # Input check
    if isinstance(points, Geospatial_data):
        msg = "Use a Geospatial_data object or a mesh origin, not both."
        assert geo_reference is None, msg
        return points

    # List or numeric array of points
    points = ensure_numeric(points, float)

    # Sort out geo reference
    if geo_reference is None or isinstance(geo_reference, Geo_reference):
        geo = geo_reference
    else:
        geo = Geo_reference(geo_reference[0],
                            geo_reference[1],
                            geo_reference[2])

    # Create Geospatial_data object with appropriate geo reference and return
    return Geospatial_data(data_points=points, geo_reference=geo)
1480
1481
1482##
1483# @brief
1484# @param data_file
1485# @param alpha_list
1486# @param mesh_file
1487# @param boundary_poly
1488# @param mesh_resolution
1489# @param north_boundary
1490# @param south_boundary
1491# @param east_boundary
1492# @param west_boundary
1493# @param plot_name
1494# @param split_factor
1495# @param seed_num
1496# @param cache
1497# @param verbose
1498def find_optimal_smoothing_parameter(data_file,
1499                                     alpha_list=None,
1500                                     mesh_file=None,
1501                                     boundary_poly=None,
1502                                     mesh_resolution=100000,
1503                                     north_boundary=None,
1504                                     south_boundary=None,
1505                                     east_boundary=None,
1506                                     west_boundary=None,
1507                                     plot_name='all_alphas',
1508                                     split_factor=0.1,
1509                                     seed_num=None,
1510                                     cache=False,
1511                                     verbose=False):
1512    """Removes a small random sample of points from 'data_file'.
1513    Then creates models with different alpha values from 'alpha_list' and
1514    cross validates the predicted value to the previously removed point data.
1515    Returns the alpha value which has the smallest covariance.
1516
1517    data_file: must not contain points outside the boundaries defined
1518               and it must be either a pts, txt or csv file.
1519
1520    alpha_list: the alpha values to test in a single list
1521
1522    mesh_file: name of the created mesh file or if passed in will read it.
1523               NOTE, if there is a mesh file mesh_resolution,
1524               north_boundary, south... etc will be ignored.
1525
1526    mesh_resolution: the maximum area size for a triangle
1527
1528    north_boundary... west_boundary: the value of the boundary
1529
1530    plot_name: the name for the plot contain the results
1531
1532    seed_num: the seed to the random number generator
1533
1534    USAGE:
1535        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1536                                             alpha_list=[0.0001, 0.01, 1],
1537                                             mesh_file=None,
1538                                             mesh_resolution=3,
1539                                             north_boundary=5,
1540                                             south_boundary=-5,
1541                                             east_boundary=5,
1542                                             west_boundary=-5,
1543                                             plot_name='all_alphas',
1544                                             seed_num=100000,
1545                                             verbose=False)
1546
1547    OUTPUT: returns the minumum normalised covalance calculate AND the
1548           alpha that created it. PLUS writes a plot of the results
1549
1550    NOTE: code will not work if the data_file extent is greater than the
1551    boundary_polygon or any of the boundaries, eg north_boundary...west_boundary
1552    """
1553
1554    from anuga.shallow_water import Domain
1555    from anuga.geospatial_data.geospatial_data import Geospatial_data
1556    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1557    from anuga.utilities.numerical_tools import cov
1558    from anuga.utilities.polygon import is_inside_polygon
1559    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1560
1561    attribute_smoothed = 'elevation'
1562
1563    if mesh_file is None:
1564        if verbose: print "building mesh"
1565        mesh_file = 'temp.msh'
1566
1567        if (north_boundary is None or south_boundary is None
1568            or east_boundary is None or west_boundary is None):
1569            no_boundary = True
1570        else:
1571            no_boundary = False
1572
1573        if no_boundary is True:
1574            msg = 'All boundaries must be defined'
1575            raise Expection, msg
1576
1577        poly_topo = [[east_boundary, south_boundary],
1578                     [east_boundary, north_boundary],
1579                     [west_boundary, north_boundary],
1580                     [west_boundary, south_boundary]]
1581
1582        create_mesh_from_regions(poly_topo,
1583                                 boundary_tags={'back': [2],
1584                                                'side': [1,3],
1585                                                'ocean': [0]},
1586                                 maximum_triangle_area=mesh_resolution,
1587                                 filename=mesh_file,
1588                                 use_cache=cache,
1589                                 verbose=verbose)
1590
1591    else: # if mesh file provided
1592        # test mesh file exists?
1593        if verbose: "reading from file: %s" % mesh_file
1594        if access(mesh_file,F_OK) == 0:
1595            msg = "file %s doesn't exist!" % mesh_file
1596            raise IOError, msg
1597
1598    # split topo data
1599    if verbose: print 'Reading elevation file: %s' % data_file
1600    G = Geospatial_data(file_name = data_file)
1601    if verbose: print 'Start split'
1602    G_small, G_other = G.split(split_factor, seed_num, verbose=verbose)
1603    if verbose: print 'Finish split'
1604    points = G_small.get_data_points()
1605
1606    if verbose: print "Number of points in sample to compare: ", len(points)
1607
1608    if alpha_list == None:
1609        alphas = [0.001,0.01,100]
1610        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,
1611        #          0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1612    else:
1613        alphas = alpha_list
1614
1615    # creates array with columns 1 and 2 are x, y. column 3 is elevation
1616    # 4 onwards is the elevation_predicted using the alpha, which will
1617    # be compared later against the real removed data
1618    data = num.array([], dtype=num.float)
1619
1620    data = num.resize(data, (len(points), 3+len(alphas)))
1621
1622    # gets relative point from sample
1623    data[:,0] = points[:,0]
1624    data[:,1] = points[:,1]
1625    elevation_sample = G_small.get_attributes(attribute_name=attribute_smoothed)
1626    data[:,2] = elevation_sample
1627
1628    normal_cov = num.array(num.zeros([len(alphas), 2]), dtype=num.float)
1629
1630    if verbose: print 'Setup computational domains with different alphas'
1631
1632    for i, alpha in enumerate(alphas):
1633        # add G_other data to domains with different alphas
1634        if verbose:
1635            print '\nCalculating domain and mesh for Alpha =', alpha, '\n'
1636        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1637        if verbose: print domain.statistics()
1638        domain.set_quantity(attribute_smoothed,
1639                            geospatial_data=G_other,
1640                            use_cache=cache,
1641                            verbose=verbose,
1642                            alpha=alpha)
1643
1644        # Convert points to geospatial data for use with get_values below
1645        points_geo = Geospatial_data(points, domain.geo_reference)
1646
1647        # returns the predicted elevation of the points that were "split" out
1648        # of the original data set for one particular alpha
1649        if verbose: print 'Get predicted elevation for location to be compared'
1650        elevation_predicted = \
1651                domain.quantities[attribute_smoothed].\
1652                    get_values(interpolation_points=points_geo)
1653
1654        # add predicted elevation to array that starts with x, y, z...
1655        data[:,i+3] = elevation_predicted
1656
1657        sample_cov = cov(elevation_sample)
1658        ele_cov = cov(elevation_sample - elevation_predicted)
1659        normal_cov[i,:] = [alpha, ele_cov / sample_cov]
1660
1661        if verbose:
1662            print 'Covariance for alpha ', normal_cov[i][0], '= ', \
1663                      normal_cov[i][1]
1664            print '-------------------------------------------- \n'
1665
1666    normal_cov0 = normal_cov[:,0]
1667    normal_cov_new = num.take(normal_cov, num.argsort(normal_cov0), axis=0)
1668
1669    if plot_name is not None:
1670        from pylab import savefig, semilogx, loglog
1671
1672        semilogx(normal_cov_new[:,0], normal_cov_new[:,1])
1673        loglog(normal_cov_new[:,0], normal_cov_new[:,1])
1674        savefig(plot_name, dpi=300)
1675
1676    if mesh_file == 'temp.msh':
1677        remove(mesh_file)
1678
1679    if verbose:
1680        print 'Final results:'
1681        for i, alpha in enumerate(alphas):
1682            print ('covariance for alpha %s = %s '
1683                   % (normal_cov[i][0], normal_cov[i][1]))
1684        print ('\nOptimal alpha is: %s '
1685               % normal_cov_new[(num.argmin(normal_cov_new, axis=0))[1], 0])
1686
1687    # covariance and optimal alpha
1688    return (min(normal_cov_new[:,1]),
1689            normal_cov_new[(num.argmin(normal_cov_new,axis=0))[1],0])
1690
1691
##
# @brief Older search for the smoothing parameter (alpha) that best
#        reproduces a sample split out of the elevation data.
# @param data_file Points file (pts, txt or csv) holding the elevation data.
# @param alpha_list Alpha values to test, None means [0.001, 0.01, 100].
# @param mesh_file Existing mesh file to use, None means build a temporary one.
# @param boundary_poly Not used by this function.
# @param mesh_resolution Maximum triangle area when building the mesh.
# @param north_boundary Northern edge of the mesh to build.
# @param south_boundary Southern edge of the mesh to build.
# @param east_boundary Eastern edge of the mesh to build.
# @param west_boundary Western edge of the mesh to build.
# @param plot_name Name of the plot file to write, None means no plot.
# @param split_factor Passed to Geospatial_data.split() to size the sample.
# @param seed_num Seed passed to Geospatial_data.split().
# @param cache Passed as use_cache to the mesh, domain and fitting calls.
# @param verbose True if this function is to be verbose.
# @return Tuple (smallest normalised covariance, alpha that produced it).
def old_find_optimal_smoothing_parameter(data_file,
                                         alpha_list=None,
                                         mesh_file=None,
                                         boundary_poly=None,
                                         mesh_resolution=100000,
                                         north_boundary=None,
                                         south_boundary=None,
                                         east_boundary=None,
                                         west_boundary=None,
                                         plot_name='all_alphas',
                                         split_factor=0.1,
                                         seed_num=None,
                                         cache=False,
                                         verbose=False):
    """Find the alpha that best predicts a sample removed from the data.

    The points in data_file are split into a sample and a remainder.  For
    every alpha a Domain is built on the mesh and its 'elevation' quantity
    is set from the remainder using that alpha.  The elevation predicted at
    the sample locations is then compared with the sample itself.

    data_file: must not contain points outside the boundaries defined
               and is either a pts, txt or csv file.

    alpha_list: the alpha values to test in a single list.  Defaults to
                [0.001, 0.01, 100] when None.

    mesh_file: name of an existing mesh file to read.  If None a mesh
               called 'temp.msh' is built from the four boundaries and
               removed again before returning.
               NOTE, if there is a mesh file mesh_resolution,
               north_boundary, south... etc will be ignored.

    boundary_poly: not used, kept so existing callers keep working.

    mesh_resolution: the maximum area size for a triangle

    north_boundary... west_boundary: the value of the boundary

    plot_name: the name for the plot containing the results, None to
               skip plotting

    split_factor, seed_num: handed to Geospatial_data.split(); seed_num is
                            the seed to the random number generator

    USAGE:
        value, alpha = old_find_optimal_smoothing_parameter(
                                             data_file=fileName,
                                             alpha_list=[0.0001, 0.01, 1],
                                             mesh_file=None,
                                             mesh_resolution=3,
                                             north_boundary=5,
                                             south_boundary=-5,
                                             east_boundary=5,
                                             west_boundary=-5,
                                             plot_name='all_alphas',
                                             seed_num=100000,
                                             verbose=False)

    OUTPUT: returns the minimum normalised covariance calculated AND the
            alpha that created it. PLUS writes a plot of the results

    RAISES: Exception if no mesh_file is given and any boundary is None,
            IOError if the given mesh_file does not exist.

    NOTE: code will not work if the data_file extent is greater than the
          boundary_polygon or the north_boundary...west_boundary
    """

    # Check the arguments before the project imports and before any data is
    # read, so that a bad call fails straight away.
    build_temp_mesh = mesh_file is None
    if build_temp_mesh:
        if (north_boundary is None or south_boundary is None
            or east_boundary is None or west_boundary is None):
            msg = 'All boundaries must be defined'
            raise Exception(msg)
    elif not access(mesh_file, F_OK):
        msg = "file %s doesn't exist!" % mesh_file
        raise IOError(msg)

    from anuga.shallow_water import Domain
    from anuga.geospatial_data.geospatial_data import Geospatial_data
    from anuga.pmesh.mesh_interface import create_mesh_from_regions
    from anuga.utilities.numerical_tools import cov
    from anuga.fit_interpolate.benchmark_least_squares import mem_usage

    attribute_smoothed = 'elevation'

    if build_temp_mesh:
        mesh_file = 'temp.msh'

    try:
        if build_temp_mesh:
            poly_topo = [[east_boundary, south_boundary],
                         [east_boundary, north_boundary],
                         [west_boundary, north_boundary],
                         [west_boundary, south_boundary]]

            create_mesh_from_regions(poly_topo,
                                     boundary_tags={'back': [2],
                                                    'side': [1,3],
                                                    'ocean': [0]},
                                     maximum_triangle_area=mesh_resolution,
                                     filename=mesh_file,
                                     use_cache=cache,
                                     verbose=verbose)

        # split topo data
        G = Geospatial_data(file_name=data_file)
        if verbose: print('start split')
        G_small, G_other = G.split(split_factor, seed_num, verbose=verbose)
        if verbose: print('finish split')
        points = G_small.get_data_points()

        if verbose:
            print('Number of points in sample to compare:  %s' % len(points))

        if alpha_list is None:
            alphas = [0.001, 0.01, 100]
        else:
            alphas = alpha_list

        if verbose: print('Setup computational domains with different alphas')

        # Keep (alpha, domain) pairs in a list rather than a dictionary so
        # that repeated alphas each keep their own row in the result table
        # and the rows follow the order of alphas.
        fitted_domains = []
        for alpha in alphas:
            # add G_other data to domains with different alphas
            if verbose:
                print('\nCalculating domain and mesh for Alpha = %s \n'
                      % (alpha,))
            domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
            if verbose: print(domain.statistics())
            domain.set_quantity(attribute_smoothed,
                                geospatial_data=G_other,
                                use_cache=cache,
                                verbose=verbose,
                                alpha=alpha)
            fitted_domains.append((alpha, domain))

        elevation_sample = \
            G_small.get_attributes(attribute_name=attribute_smoothed)

        # The normalising term only depends on the sample, so work it out once.
        sample_cov = cov(elevation_sample)

        # One row per alpha: column 0 is alpha, column 1 is the covariance of
        # the prediction error divided by sample_cov.
        normal_cov = num.zeros((len(fitted_domains), 2), dtype=float)

        if verbose:
            print('Determine difference between predicted results '
                  'and actual data')

        for i, (alpha, domain) in enumerate(fitted_domains):
            if verbose: print('Alpha = %s' % (alpha,))

            points_geo = domain.geo_reference.change_points_geo_ref(points)
            # returns the predicted elevation of the points that were "split"
            # out of the original data set for one particular alpha
            elevation_predicted = \
                domain.quantities[attribute_smoothed].\
                    get_values(interpolation_points=points_geo)

            ele_cov = cov(elevation_sample - elevation_predicted)
            normal_cov[i,:] = [alpha, ele_cov / sample_cov]
            if verbose:
                print('memory usage during compare %s' % (mem_usage(),))
                print('cov %s =  %s' % (normal_cov[i][0], normal_cov[i][1]))

        # Order the rows by alpha so the plot is drawn left to right.
        alpha_order = num.argsort(normal_cov[:,0])
        normal_cov_new = num.take(normal_cov, alpha_order, axis=0)

        if plot_name is not None:
            from pylab import savefig, semilogx, loglog

            semilogx(normal_cov_new[:,0], normal_cov_new[:,1])
            loglog(normal_cov_new[:,0], normal_cov_new[:,1])
            savefig(plot_name, dpi=300)

        # covariance and optimal alpha
        best_row = num.argmin(normal_cov_new[:,1])
        return (min(normal_cov_new[:,1]), normal_cov_new[best_row, 0])
    finally:
        # Only remove the mesh this call created, never one supplied by the
        # caller, and do so even when the run fails part way through.
        if build_temp_mesh and access(mesh_file, F_OK):
            remove(mesh_file)
1886
1887
# Nothing is done when this module is run as a script.
if __name__ == "__main__":
    pass
1890
Note: See TracBrowser for help on using the repository browser.