source: branches/numpy/anuga/geospatial_data/geospatial_data.py @ 6441

Last change on this file since 6441 was 6441, checked in by rwilson, 16 years ago

After changes to get_absolute, ensure_numeric, etc.

File size: 69.4 KB
Line 
1"""Class Geospatial_data
2
3Manipulation of locations on the planet and associated attributes.
4"""
5
6from sys import maxint
7from os import access, F_OK, R_OK,remove
8from types import DictType
9from warnings import warn
10from string import lower
11from RandomArray import randint, seed, get_seed
12from copy import deepcopy
13
14from Scientific.IO.NetCDF import NetCDFFile
15import numpy as num
16
17from anuga.coordinate_transforms.lat_long_UTM_conversion import UTMtoLL
18from anuga.utilities.numerical_tools import ensure_numeric
19from anuga.coordinate_transforms.geo_reference import Geo_reference, \
20     TitleError, DEFAULT_ZONE, ensure_geo_reference, write_NetCDF_georeference
21from anuga.coordinate_transforms.redfearn import convert_from_latlon_to_utm
22from anuga.utilities.system_tools import clean_line
23from anuga.utilities.anuga_exceptions import ANUGAError
24from anuga.config import points_file_block_line_size as MAX_READ_LINES
25from anuga.config import netcdf_mode_r, netcdf_mode_w, netcdf_mode_a
26from anuga.config import netcdf_float
27
28
29DEFAULT_ATTRIBUTE = 'elevation'
30
31
32##
33# @brief ??
34class Geospatial_data:
35
36    ##
37    # @brief
38    # @param data_points Mx2 iterable of tuples or array of x,y coordinates.
39    # @param attributes Associated values for each data point.
40    # @param geo_reference ??
41    # @param default_attribute_name ??
42    # @param file_name
43    # @param latitudes ??
44    # @param longitudes ??
45    # @param points_are_lats_longs True if points are lat/long, not UTM.
46    # @param max_read_lines Size of block to read, if blocking.
47    # @param load_file_now True if blocking but we eant to read file now.
48    # @param verbose True if this class instance is verbose.
49    def __init__(self,
50                 data_points=None, # this can also be a points file name
51                 attributes=None,
52                 geo_reference=None,
53                 default_attribute_name=None,
54                 file_name=None,
55                 latitudes=None,
56                 longitudes=None,
57                 points_are_lats_longs=False,
58                 max_read_lines=None,
59                 load_file_now=True,
60                 verbose=False):
61        """Create instance from data points and associated attributes
62
63        data_points: x,y coordinates in meters. Type must be either a
64        sequence of 2-tuples or an Mx2 numeric array of floats.  A file name
65        with extension .txt, .cvs or .pts can also be passed in here.
66
67        attributes: Associated values for each data point. The type
68        must be either a list or an array of length M or a dictionary
69        of lists (or arrays) of length M. In the latter case the keys
70        in the dictionary represent the attribute names, in the former
71        the attribute will get the default name "elevation".
72
73        geo_reference: Object representing the origin of the data
74        points. It contains UTM zone, easting and northing and data
75        points are assumed to be relative to this origin.
76        If geo_reference is None, the default geo ref object is used.
77
78        default_attribute_name: Name of default attribute to be used with
79        get_attribute_values. The idea is that the dataset can be
80        equipped with information about which attribute to return.
81        If None, the default is the "first"
82
83        latitudes, longitudes: Vectors of latitudes and longitudes,
84        used to specify location instead of points.
85
86        points_are_lats_longs: Set this as true if the points are actually
87        lats and longs, not UTM
88
89        max_read_lines: The number of rows read into memory when using
90        blocking to read a file.
91
92        load_file_now:  If true the file is automatically loaded
93        into the geospatial instance. Used when blocking.
94
95        file_name: Name of input netCDF file or .txt file. netCDF file must
96        have dimensions "points" etc.
97        .txt file is a comma seperated file with x, y and attribute
98        data.
99
100        The first line has the titles of the columns.  The first two
101        column titles are checked to see if they start with lat or
102        long (not case sensitive).  If so the data is assumed to be
103        latitude and longitude, in decimal format and converted to
104        UTM.  Otherwise the first two columns are assumed to be the x
105        and y, and the title names acually used are ignored.
106
107
108        The format for a .txt file is:
109            1st line:     [column names]
110            other lines:  x y [attributes]
111
112            for example:
113            x, y, elevation, friction
114            0.6, 0.7, 4.9, 0.3
115            1.9, 2.8, 5, 0.3
116            2.7, 2.4, 5.2, 0.3
117
118        The first two columns have to be x, y or lat, long
119        coordinates.
120
121
122        The format for a Points dictionary is:
123          ['pointlist'] a 2 column array describing points. 1st column x,
124          2nd column y.
125          ['attributelist'], a dictionary of 1D arrays, representing
126          attribute values at the point.  The dictionary key is the attribute
127          header.
128          ['geo_reference'] a Geo_refernece object. Use if the point
129          information is relative. This is optional.
130            eg
131            dic['pointlist'] = [[1.0,2.0],[3.0,5.0]]
132            dic['attributelist']['elevation'] = [[7.0,5.0]
133
134        verbose:
135        """
136
137        if isinstance(data_points, basestring):
138            # assume data_points is really a file name
139            file_name = data_points
140
141        self.set_verbose(verbose)
142        self.geo_reference = None
143        self.file_name = file_name
144
145        if max_read_lines is None:
146            self.max_read_lines = MAX_READ_LINES
147        else:
148            self.max_read_lines = max_read_lines
149
150        if file_name is None:
151            if (latitudes is not None or longitudes is not None
152                    or points_are_lats_longs):
153                data_points, geo_reference = \
154                    _set_using_lat_long(latitudes=latitudes,
155                                        longitudes=longitudes,
156                                        geo_reference=geo_reference,
157                                        data_points=data_points,
158                                        points_are_lats_longs=
159                                            points_are_lats_longs)
160            self.check_data_points(data_points)
161            self.set_attributes(attributes)
162            self.set_geo_reference(geo_reference)
163            self.set_default_attribute_name(default_attribute_name)
164        elif load_file_now is True:
165            # watch for case where file name and points,
166            # attributes etc are provided!!
167            # if file name then all provided info will be removed!
168
169            if verbose is True:
170                if file_name is not None:
171                    print 'Loading Geospatial data from file: %s' % file_name
172
173            self.import_points_file(file_name, verbose=verbose)
174
175            self.check_data_points(self.data_points)
176            self.set_attributes(self.attributes)
177            self.set_geo_reference(self.geo_reference)
178            self.set_default_attribute_name(default_attribute_name)
179
180        if verbose is True:
181            if file_name is not None:
182                print 'Geospatial data created from file: %s' % file_name
183                if load_file_now is False:
184                    print 'Data will be loaded blockwise on demand'
185
186                    if file_name.endswith('csv') or file_name.endswith('txt'):
187                        pass
188                        # This message was misleading.
189                        # FIXME (Ole): Are we blocking here or not?
190                        # print 'ASCII formats are not that great for '
191                        # print 'blockwise reading. Consider storing this'
192                        # print 'data as a pts NetCDF format'
193
194    ##
195    # @brief Return length of the points set.
196    def __len__(self):
197        return len(self.data_points)
198
199    ##
200    # @brief Return a string representation of the points set.
201    def __repr__(self):
202        return str(self.get_data_points(absolute=True))
203
204    ##
205    # @brief Check data points.
206    # @param data_points Points data to check and store in instance.
207    # @note Throws ValueError exception if no data.
208    def check_data_points(self, data_points):
209        """Checks data points"""
210
211        if data_points is None:
212            self.data_points = None
213            msg = 'There is no data or file provided!'
214            raise ValueError, msg
215        else:
216            self.data_points = ensure_numeric(data_points)
217            if not (0,) == self.data_points.shape:
218                assert len(self.data_points.shape) == 2
219                assert self.data_points.shape[1] == 2
220
221    ##
222    # @brief Check and assign attributes data.
223    # @param attributes Dictionary or scalar to save as .attributes.
224    # @note Throws exception if unable to convert dict keys to numeric.
225    def set_attributes(self, attributes):
226        """Check and assign attributes dictionary"""
227
228        if attributes is None:
229            self.attributes = None
230            return
231
232        if not isinstance(attributes, DictType):
233            # Convert single attribute into dictionary
234            attributes = {DEFAULT_ATTRIBUTE: attributes}
235
236        # Check input attributes
237        for key in attributes.keys():
238            try:
239                attributes[key] = ensure_numeric(attributes[key])
240            except:
241                msg = ("Attribute '%s' (%s) could not be converted to a"
242                       "numeric vector" % (str(key), str(attributes[key])))
243                raise Exception, msg
244
245        self.attributes = attributes
246
247    ##
248    # @brief Set the georeference of geospatial data.
249    # @param geo_reference The georeference data to set.
250    # @note Will raise exception if param not instance of Geo_reference.
251    def set_geo_reference(self, geo_reference):
252        """Set the georeference of geospatial data.
253
254        It can also be used to change the georeference and will ensure that
255        the absolute coordinate values are unchanged.
256        """
257
258        if geo_reference is None:
259            # Use default - points are in absolute coordinates
260            geo_reference = Geo_reference()
261
262        # Allow for tuple (zone, xllcorner, yllcorner)
263        geo_reference = ensure_geo_reference(geo_reference)
264
265        if not isinstance(geo_reference, Geo_reference):
266            # FIXME (Ole): This exception will be raised even
267            # if geo_reference is None. Is that the intent Duncan?
268            msg = ('Argument geo_reference must be a valid Geo_reference '
269                   'object or None.')
270            raise Expection, msg
271
272        # If a geo_reference already exists, change the point data according to
273        # the new geo reference
274        if  self.geo_reference is not None:
275            self.data_points = self.get_data_points(geo_reference=geo_reference)
276
277        self.geo_reference = geo_reference
278
279    ##
280    # @brief Set default attribute name.
281    # @param default_attribute_name The default to save.
282    def set_default_attribute_name(self, default_attribute_name):
283        self.default_attribute_name = default_attribute_name
284
285    ##
286    # @brief Set the instance verbose flag.
287    # @param verbose The value to save.
288    # @note Will raise exception if param is not True or False.
289    def set_verbose(self, verbose=False):
290        if verbose in [False, True]:
291            self.verbose = verbose
292        else:
293            msg = 'Illegal value: %s' % str(verbose)
294            raise Exception, msg
295
296    ##
297    # @brief Clip geospatial data by a given polygon.
298    # @param polygon The polygon to clip with.
299    # @param closed True if points on clip boundary are not included in result.
300    # @param verbose True if this function is verbose.
301    def clip(self, polygon, closed=True, verbose=False):
302        """Clip geospatial data by a polygon
303
304        Input
305          polygon - Either a list of points, an Nx2 array or
306                    a Geospatial data object.
307          closed - (optional) determine whether points on boundary should be
308          regarded as belonging to the polygon (closed = True)
309          or not (closed = False). Default is True.
310
311        Output
312          New geospatial data object representing points inside
313          specified polygon.
314
315
316        Note - this method is non-destructive and leaves the data in 'self'
317        unchanged
318        """
319
320        from anuga.utilities.polygon import inside_polygon
321
322        if isinstance(polygon, Geospatial_data):
323            # Polygon is an object - extract points
324            polygon = polygon.get_data_points()
325
326        points = self.get_data_points()
327        inside_indices = inside_polygon(points, polygon, closed, verbose)
328
329        clipped_G = self.get_sample(inside_indices)
330
331        return clipped_G
332
333    ##
334    # @brief Clip points data by polygon, return points outside polygon.
335    # @param polygon The polygon to clip with.
336    # @param closed True if points on clip boundary are not included in result.
337    # @param verbose True if this function is verbose.
338    def clip_outside(self, polygon, closed=True, verbose=False):
339        """Clip geospatial date by a polygon, keeping data OUTSIDE of polygon
340
341        Input
342          polygon - Either a list of points, an Nx2 array or
343                    a Geospatial data object.
344          closed - (optional) determine whether points on boundary should be
345          regarded as belonging to the polygon (closed = True)
346          or not (closed = False). Default is True.
347
348        Output
349          Geospatial data object representing point OUTSIDE specified polygon
350        """
351
352        from anuga.utilities.polygon import outside_polygon
353
354        if isinstance(polygon, Geospatial_data):
355            # Polygon is an object - extract points
356            polygon = polygon.get_data_points()
357
358        points = self.get_data_points()
359        outside_indices = outside_polygon(points, polygon, closed,verbose)
360
361        clipped_G = self.get_sample(outside_indices)
362
363        return clipped_G
364
365    ##
366    # @brief Get instance geo_reference data.
367    def get_geo_reference(self):
368        return self.geo_reference
369
370    ##
371    # @brief Get coordinates for all data points as an Nx2 array.
372    # @param absolute If True, return UTM, else relative to xll/yll corners.
373    # @param geo_reference If supplied, points are relative to it.
374    # @param as_lat_long If True, return points as lat/lon.
375    # @param isSouthHemisphere If True, return lat/lon points in S.Hemi.
376    # @return A set of data points, in appropriate form.
377    def get_data_points(self,
378                        absolute=True,
379                        geo_reference=None,
380                        as_lat_long=False,
381                        isSouthHemisphere=True):
382        """Get coordinates for all data points as an Nx2 array
383
384        If absolute is False returned coordinates are relative to the
385        internal georeference's xll and yll corners, otherwise
386        absolute UTM coordinates are returned.
387
388        If a geo_reference is passed the points are returned relative
389        to that geo_reference.
390
391        isSH (isSouthHemisphere) is only used when getting data
392        points "as_lat_long" is True and if FALSE will return lats and
393        longs valid for the Northern Hemisphere.
394
395        Default: absolute is True.
396        """
397
398        if as_lat_long is True:
399            msg = "Points need a zone to be converted into lats and longs"
400            assert self.geo_reference is not None, msg
401            zone = self.geo_reference.get_zone()
402            assert self.geo_reference.get_zone() is not DEFAULT_ZONE, msg
403            lats_longs = []
404            for point in self.get_data_points(True):
405                # UTMtoLL(northing, easting, zone,
406                lat_calced, long_calced = UTMtoLL(point[1], point[0],
407                                                  zone, isSouthHemisphere)
408                lats_longs.append((lat_calced, long_calced)) # to hash
409            return lats_longs
410
411        if absolute is True and geo_reference is None:
412            return self.geo_reference.get_absolute(self.data_points)
413        elif geo_reference is not None:
414            return geo_reference.change_points_geo_ref(self.data_points,
415                                                       self.geo_reference)
416        else:
417            # If absolute is False
418            return self.data_points
419
420    ##
421    # @brief Get value for attribute name.
422    # @param attribute_name Name to get value for.
423    # @note If name passed is None, return default attribute value.
424    def get_attributes(self, attribute_name=None):
425        """Return values for one named attribute.
426
427        If attribute_name is None, default_attribute_name is used
428        """
429
430        if attribute_name is None:
431            if self.default_attribute_name is not None:
432                attribute_name = self.default_attribute_name
433            else:
434                attribute_name = self.attributes.keys()[0]
435                # above line takes the first one from keys
436
437        if self.verbose is True:
438            print 'Using attribute %s' %attribute_name
439            print 'Available attributes: %s' %(self.attributes.keys())
440
441        msg = 'Attribute name %s does not exist in data set' % attribute_name
442        assert self.attributes.has_key(attribute_name), msg
443
444        return self.attributes[attribute_name]
445
446    ##
447    # @brief Get all instance attributes.
448    # @return The instance attribute dictionary, or None if no attributes.
449    def get_all_attributes(self):
450        """Return values for all attributes.
451        The return value is either None or a dictionary (possibly empty).
452        """
453
454        return self.attributes
455
456    ##
457    # @brief Override __add__() to allow addition of geospatial objects.
458    # @param self This object.
459    # @param other The second object.
460    # @return The new geospatial object.
461    def __add__(self, other):
462        """Returns the addition of 2 geospatial objects,
463        objects are concatencated to the end of each other
464
465        NOTE: doesn't add if objects contain different
466        attributes
467
468        Always return absolute points!
469        This also means, that if you add None to the object,
470        it will be turned into absolute coordinates
471
472        other can be None in which case nothing is added to self.
473        """
474
475        # find objects zone and checks if the same
476        geo_ref1 = self.get_geo_reference()
477        zone1 = geo_ref1.get_zone()
478
479        if other is not None:
480            geo_ref2 = other.get_geo_reference()
481            zone2 = geo_ref2.get_zone()
482            geo_ref1.reconcile_zones(geo_ref2)
483            new_points = num.concatenate((self.get_data_points(absolute=True),
484                                          other.get_data_points(absolute=True)),
485                                         axis = 0)
486
487            # Concatenate attributes if any
488            if self.attributes is None:
489                if other.attributes is not None:
490                    msg = ('Geospatial data must have the same '
491                           'attributes to allow addition.')
492                    raise Exception, msg
493
494                new_attributes = None
495            else:
496                new_attributes = {}
497                for x in self.attributes.keys():
498                    if other.attributes.has_key(x):
499                        attrib1 = self.attributes[x]
500                        attrib2 = other.attributes[x]
501                        new_attributes[x] = num.concatenate((attrib1, attrib2),
502                                                            axis=0) #??default#
503                    else:
504                        msg = ('Geospatial data must have the same '
505                               'attributes to allow addition.')
506                        raise Exception, msg
507        else:
508            # other is None:
509            new_points = self.get_data_points(absolute=True)
510            new_attributes = self.attributes
511
512        # Instantiate new data object and return absolute coordinates
513        new_geo_ref = Geo_reference(geo_ref1.get_zone(), 0.0, 0.0)
514        return Geospatial_data(new_points, new_attributes, new_geo_ref)
515
516    ##
517    # @brief Override the addition case where LHS isn't geospatial object.
518    # @param self This object.
519    # @param other The second object.
520    # @return The new geospatial object.
521    def __radd__(self, other):
522        """Handle cases like None + Geospatial_data(...)"""
523
524        return self + other
525
526################################################################################
527#  IMPORT/EXPORT POINTS FILES
528################################################################################
529
530    ##
531    # @brief Import a .txt, .csv or .pts points data file.
532    # @param file_name
533    # @param delimiter
534    # @param verbose True if this function is to be verbose.
535    # @note Will throw IOError or SyntaxError if there is a problem.
536    def import_points_file(self, file_name, delimiter=None, verbose=False):
537        """ load an .txt, .csv or .pts file
538
539        Note: will throw an IOError/SyntaxError if it can't load the file.
540        Catch these!
541
542        Post condition: self.attributes dictionary has been set
543        """
544
545        if access(file_name, F_OK) == 0 :
546            msg = 'File %s does not exist or is not accessible' % file_name
547            raise IOError, msg
548
549        attributes = {}
550        if file_name[-4:] == ".pts":
551            try:
552                data_points, attributes, geo_reference = \
553                             _read_pts_file(file_name, verbose)
554            except IOError, e:
555                msg = 'Could not open file %s ' % file_name
556                raise IOError, msg
557        elif file_name[-4:] == ".txt" or file_name[-4:]== ".csv":
558            try:
559                data_points, attributes, geo_reference = \
560                             _read_csv_file(file_name, verbose)
561            except IOError, e:
562                # This should only be if a file is not found
563                msg = ('Could not open file %s. Check the file location.'
564                       % file_name)
565                raise IOError, msg
566            except SyntaxError, e:
567                # This should only be if there is a format error
568                msg = ('Problem with format of file %s.\n%s'
569                       % (file_name, Error_message['IOError']))
570                raise SyntaxError, msg
571        else:
572            msg = 'Extension %s is unknown' % file_name[-4:]
573            raise IOError, msg
574
575        self.data_points = data_points
576        self.attributes = attributes
577        self.geo_reference = geo_reference
578
579    ##
580    # @brief Write points data to a file (.csv or .pts).
581    # @param file_name Path to file to write.
582    # @param absolute ??
583    # @param as_lat_long ??
584    # @param isSouthHemisphere ??
585    def export_points_file(self, file_name, absolute=True,
586                           as_lat_long=False, isSouthHemisphere=True):
587        """write a points file as a text (.csv) or binary (.pts) file
588
589        file_name is the file name, including the extension
590        The point_dict is defined at the top of this file.
591
592        If absolute is True data the xll and yll are added to the points value
593        and the xll and yll of the geo_reference are set to 0.
594
595        If absolute is False data points at returned as relative to the xll
596        and yll and geo_reference remains uneffected
597
598        isSouthHemisphere: is only used when getting data
599        points "as_lat_long" is True and if FALSE will return lats and
600        longs valid for the Northern Hemisphere.
601        """
602
603        if (file_name[-4:] == ".pts"):
604            if absolute is True:
605                geo_ref = deepcopy(self.geo_reference)
606                geo_ref.xllcorner = 0
607                geo_ref.yllcorner = 0
608                _write_pts_file(file_name,
609                                self.get_data_points(absolute),
610                                self.get_all_attributes(),
611                                geo_ref)
612            else:
613                _write_pts_file(file_name,
614                                self.get_data_points(absolute),
615                                self.get_all_attributes(),
616                                self.get_geo_reference())
617        elif file_name[-4:] == ".txt" or file_name[-4:] == ".csv":
618            msg = "ERROR: trying to write a .txt file with relative data."
619            assert absolute, msg
620            _write_csv_file(file_name,
621                            self.get_data_points(absolute=True,
622                                                 as_lat_long=as_lat_long,
623                                           isSouthHemisphere=isSouthHemisphere),
624                            self.get_all_attributes(),
625                            as_lat_long=as_lat_long)
626        elif file_name[-4:] == ".urs" :
627            msg = "ERROR: Can not write a .urs file as a relative file."
628            assert absolute, msg
629            _write_urs_file(file_name,
630                            self.get_data_points(as_lat_long=True,
631                                           isSouthHemisphere=isSouthHemisphere))
632        else:
633            msg = 'Unknown file type %s ' %file_name
634            raise IOError, msg
635
636    ##
637    # @brief Get a subset of data that is referred to by 'indices'.
638    # @param indices A list of indices to select data subset with.
639    # @return A geospatial object containing data subset.
640    def get_sample(self, indices):
641        """ Returns a object which is a subset of the original
642        and the data points and attributes in this new object refer to
643        the indices provided
644
645        Input
646            indices- a list of integers that represent the new object
647        Output
648            New geospatial data object representing points specified by
649            the indices
650        """
651
652        # FIXME: add the geo_reference to this
653        points = self.get_data_points()
654        sampled_points = num.take(points, indices, axis=0)
655
656        attributes = self.get_all_attributes()
657
658        sampled_attributes = {}
659        if attributes is not None:
660            for key, att in attributes.items():
661                sampled_attributes[key] = num.take(att, indices, axis=0)
662
663        return Geospatial_data(sampled_points, sampled_attributes)
664
665    ##
666    # @brief Split one geospatial object into two.
667    # @param factor Relative size to make first result object.
668    # @param seed_num Random 'seed' - used only for unit test.
669    # @param verbose True if this function is to be verbose.
670    # @note Points in each result object are selected randomly.
671    def split(self, factor=0.5, seed_num=None, verbose=False):
672        """Returns two geospatial_data object, first is the size of the 'factor'
673        smaller the original and the second is the remainder. The two
674        new objects are disjoint sets of each other.
675
676        Points of the two new object have selected RANDOMLY.
677
678        This method create two lists of indices which are passed into
679        get_sample.  The lists are created using random numbers, and
680        they are unique sets eg.  total_list(1,2,3,4,5,6,7,8,9)
681        random_list(1,3,6,7,9) and remainder_list(0,2,4,5,8)
682
683        Input -  the factor which to split the object, if 0.1 then 10% of the
684                 together object will be returned
685
686        Output - two geospatial_data objects that are disjoint sets of the
687                 original
688        """
689
690        i = 0
691        self_size = len(self)
692        random_list = []
693        remainder_list = []
694        new_size = round(factor * self_size)
695
696        # Find unique random numbers
697        if verbose: print "make unique random number list and get indices"
698
699        total = num.array(range(self_size), num.int)    #array default#
700        total_list = total.tolist()
701
702        if verbose: print "total list len", len(total_list)
703
704        # There will be repeated random numbers however will not be a
705        # problem as they are being 'pop'ed out of array so if there
706        # are two numbers the same they will pop different indicies,
707        # still basically random
708        ## create list of non-unquie random numbers
709        if verbose: print "create random numbers list %s long" %new_size
710
711        # Set seed if provided, mainly important for unit test!
712        # plus recalcule seed when no seed provided.
713        if seed_num is not None:
714            seed(seed_num, seed_num)
715        else:
716            seed()
717
718        if verbose: print "seed:", get_seed()
719
720        random_num = randint(0, self_size-1, (int(new_size),))
721        random_num = random_num.tolist()
722
723        # need to sort and reverse so the pop() works correctly
724        random_num.sort()
725        random_num.reverse()
726
727        if verbose: print "make random number list and get indices"
728
729        j = 0
730        k = 1
731        remainder_list = total_list[:]
732
733        # pops array index (random_num) from remainder_list
734        # (which starts as the total_list and appends to random_list)
735        random_num_len = len(random_num)
736        for i in random_num:
737            random_list.append(remainder_list.pop(i))
738            j += 1
739            # prints progress
740            if verbose and round(random_num_len/10*k) == j:
741                print '(%s/%s)' % (j, random_num_len)
742                k += 1
743
744        # FIXME: move to tests, it might take a long time
745        # then create an array of random length between 500 and 1000,
746        # and use a random factor between 0 and 1
747        # setup for assertion
748        test_total = random_list[:]
749        test_total.extend(remainder_list)
750        test_total.sort()
751        msg = ('The two random lists made from the original list when added '
752               'together DO NOT equal the original list')
753        assert total_list == test_total, msg
754
755        # Get new samples
756        if verbose: print "get values of indices for random list"
757        G1 = self.get_sample(random_list)
758        if verbose: print "get values of indices for opposite of random list"
759        G2 = self.get_sample(remainder_list)
760
761        return G1, G2
762
763    ##
764    # @brief Allow iteration over this object.
765    def __iter__(self):
766        """Read in the header, number_of_points and save the
767        file pointer position
768        """
769
770        # FIXME - what to do if the file isn't there
771
772        # FIXME (Ole): Shouldn't this go into the constructor?
773        # This method acts like the constructor when blocking.
774        # ... and shouldn't it be called block_size?
775        #
776        if self.max_read_lines is None:
777            self.max_read_lines = MAX_READ_LINES
778
779        if self.file_name[-4:] == ".pts":
780            # See if the file is there.  Throw a QUIET IO error if it isn't
781            fd = open(self.file_name,'r')
782            fd.close()
783
784            # Throws prints to screen if file not present
785            self.fid = NetCDFFile(self.file_name, netcdf_mode_r)
786
787            (self.blocking_georef,
788             self.blocking_keys,
789             self.number_of_points) = _read_pts_file_header(self.fid,
790                                                            self.verbose)
791            self.start_row = 0
792            self.last_row = self.number_of_points
793            self.show_verbose = 0
794            self.verbose_block_size = (self.last_row + 10)/10
795            self.block_number = 0
796            self.number_of_blocks = self.number_of_points/self.max_read_lines
797            # This computes the number of full blocks. The last block may be
798            # smaller and won't be included in this estimate.
799
800            if self.verbose is True:
801                print ('Reading %d points (in ~%d blocks) from file %s. '
802                       % (self.number_of_points, self.number_of_blocks,
803                          self.file_name)),
804                print ('Each block consists of %d data points'
805                       % self.max_read_lines)
806        else:
807            # Assume the file is a csv file
808            file_pointer = open(self.file_name)
809            self.header, self.file_pointer = _read_csv_file_header(file_pointer)
810            self.blocking_georef = None # Used for reconciling zones
811
812        return self
813
814    ##
815    # @brief Read another block into the instance.
816    def next(self):
817        """read a block, instanciate a new geospatial and return it"""
818
819        if self.file_name[-4:] == ".pts":
820            if self.start_row == self.last_row:
821                # Read the end of the file last iteration
822                # Remove blocking attributes
823                self.fid.close()
824                del self.max_read_lines
825                del self.blocking_georef
826                del self.last_row
827                del self.start_row
828                del self.blocking_keys
829                del self.fid
830                raise StopIteration
831            fin_row = self.start_row + self.max_read_lines
832            if fin_row > self.last_row:
833                fin_row = self.last_row
834
835            if self.verbose is True:
836                if (self.show_verbose >= self.start_row
837                    and self.show_verbose < fin_row):
838                    print ('Reading block %d (points %d to %d) out of %d'
839                           % (self.block_number, self.start_row,
840                              fin_row, self.number_of_blocks))
841
842                    self.show_verbose += max(self.max_read_lines,
843                                             self.verbose_block_size)
844
845            # Read next block
846            pointlist, att_dict, = _read_pts_file_blocking(self.fid,
847                                                           self.start_row,
848                                                           fin_row,
849                                                           self.blocking_keys)
850
851            geo = Geospatial_data(pointlist, att_dict, self.blocking_georef)
852            self.start_row = fin_row
853
854            self.block_number += 1
855        else:
856            # Assume the file is a csv file
857            try:
858                (pointlist,
859                 att_dict,
860                 geo_ref,
861                 self.file_pointer) = _read_csv_file_blocking(self.file_pointer,
862                                                              self.header[:],
863                                                              max_read_lines= \
864                                                           self.max_read_lines,
865                                                              verbose= \
866                                                                  self.verbose)
867
868                # Check that the zones haven't changed.
869                if geo_ref is not None:
870                    geo_ref.reconcile_zones(self.blocking_georef)
871                    self.blocking_georef = geo_ref
872                elif self.blocking_georef is not None:
873                    msg = ('Geo reference given, then not given.'
874                           ' This should not happen.')
875                    raise ValueError, msg
876                geo = Geospatial_data(pointlist, att_dict, geo_ref)
877            except StopIteration:
878                self.file_pointer.close()
879                del self.header
880                del self.file_pointer
881                raise StopIteration
882            except ANUGAError:
883                self.file_pointer.close()
884                del self.header
885                del self.file_pointer
886                raise
887            except SyntaxError:
888                self.file_pointer.close()
889                del self.header
890                del self.file_pointer
891                # This should only be if there is a format error
892                msg = ('Could not open file %s.\n%s'
893                       % (self.file_name, Error_message['IOError']))
894                raise SyntaxError, msg
895        return geo
896
897##################### Error messages ###########
898Error_message = {}
899Em = Error_message
900Em['IOError'] = ('NOTE: The format for a comma separated .txt/.csv file is:\n'
901                 '        1st line:     [column names]\n'
902                 '        other lines:  [x value], [y value], [attributes]\n'
903                 '\n'
904                 '           for example:\n'
905                 '           x, y, elevation, friction\n'
906                 '           0.6, 0.7, 4.9, 0.3\n'
907                 '           1.9, 2.8, 5, 0.3\n'
908                 '           2.7, 2.4, 5.2, 0.3\n'
909                 '\n'
910                 'The first two columns are assumed to be x, y coordinates.\n'
911                 'The attribute values must be numeric.\n')
912
913##
914# @brief ??
915# @param latitudes ??
916# @param longitudes ??
917# @param geo_reference ??
918# @param data_points ??
919# @param points_are_lats_longs ??
920# @note IS THIS USED???
921def _set_using_lat_long(latitudes,
922                        longitudes,
923                        geo_reference,
924                        data_points,
925                        points_are_lats_longs):
926    """If the points has lat long info, assume it is in (lat, long) order."""
927
928    if geo_reference is not None:
929        msg = ('A georeference is specified yet latitude and longitude '
930               'are also specified!')
931        raise ValueError, msg
932
933    if data_points is not None and not points_are_lats_longs:
934        msg = ('Data points are specified yet latitude and longitude are '
935               'also specified.')
936        raise ValueError, msg
937
938    if points_are_lats_longs:
939        if data_points is None:
940            msg = "Data points are not specified."
941            raise ValueError, msg
942        lats_longs = ensure_numeric(data_points)
943        latitudes = num.ravel(lats_longs[:,0:1])
944        longitudes = num.ravel(lats_longs[:,1:])
945
946    if latitudes is None and longitudes is None:
947        msg = "Latitudes and Longitudes are not specified."
948        raise ValueError, msg
949
950    if latitudes is None:
951        msg = "Longitudes are specified yet latitudes aren't."
952        raise ValueError, msg
953
954    if longitudes is None:
955        msg = "Latitudes are specified yet longitudes aren't."
956        raise ValueError, msg
957
958    data_points, zone  = convert_from_latlon_to_utm(latitudes=latitudes,
959                                                    longitudes=longitudes)
960    return data_points, Geo_reference(zone=zone)
961
962
963##
964# @brief Read a .pts data file.
965# @param file_name Path to file to read.
966# @param verbose True if this function is to be verbose.
967# @return (pointlist, attributes, geo_reference)
968def _read_pts_file(file_name, verbose=False):
969    """Read .pts NetCDF file
970
971    Return a (dict_points, dict_attribute, geo_ref)
972    eg
973    dict['points'] = [[1.0,2.0],[3.0,5.0]]
974    dict['attributelist']['elevation'] = [[7.0,5.0]]
975    """
976
977    if verbose: print 'Reading ', file_name
978
979    # See if the file is there.  Throw a QUIET IO error if it isn't
980    fd = open(file_name,'r')
981    fd.close()
982
983    # Throws prints to screen if file not present
984    fid = NetCDFFile(file_name, netcdf_mode_r)
985
986    pointlist = num.array(fid.variables['points'])
987    keys = fid.variables.keys()
988
989    if verbose: print 'Got %d variables: %s' % (len(keys), keys)
990
991    try:
992        keys.remove('points')
993    except IOError, e:
994        fid.close()
995        msg = "Expected keyword 'points' but could not find it"
996        raise IOError, msg
997
998    attributes = {}
999    for key in keys:
1000        if verbose: print "reading attribute '%s'" % key
1001
1002        attributes[key] = num.array(fid.variables[key])
1003
1004    try:
1005        geo_reference = Geo_reference(NetCDFObject=fid)
1006    except AttributeError, e:
1007        geo_reference = None
1008
1009    fid.close()
1010
1011    return pointlist, attributes, geo_reference
1012
1013
1014##
1015# @brief Read a .csv data file.
1016# @param file_name Path to the .csv file to read.
1017# @param verbose True if this function is to be verbose.
1018def _read_csv_file(file_name, verbose=False):
1019    """Read .csv file
1020
1021    Return a dic of array of points, and dic of array of attribute
1022    eg
1023    dic['points'] = [[1.0,2.0],[3.0,5.0]]
1024    dic['attributelist']['elevation'] = [[7.0,5.0]]
1025    """
1026
1027    file_pointer = open(file_name)
1028    header, file_pointer = _read_csv_file_header(file_pointer)
1029    try:
1030        (pointlist,
1031         att_dict,
1032         geo_ref,
1033         file_pointer) = _read_csv_file_blocking(file_pointer,
1034                                                 header,
1035                                                 max_read_lines=1e30)
1036                                    # If the file is bigger that this, block..
1037                                    # FIXME (Ole) What's up here?
1038    except ANUGAError:
1039        file_pointer.close()
1040        raise
1041
1042    file_pointer.close()
1043
1044    return pointlist, att_dict, geo_ref
1045
1046
1047##
1048# @brief Read a .csv file header.
1049# @param file_pointer Open descriptor of the file to read.
1050# @param delimiter Header line delimiter string, split on this string.
1051# @param verbose True if this function is to be verbose.
1052# @return A tuple of (<cleaned header string>, <input file_pointer>)
1053
1054CSV_DELIMITER = ','
1055
1056def _read_csv_file_header(file_pointer,
1057                          delimiter=CSV_DELIMITER,
1058                          verbose=False):
1059    """Read the header of a .csv file
1060    Return a list of the header names
1061    """
1062
1063    line = file_pointer.readline()
1064    header = clean_line(line, delimiter)
1065
1066    return header, file_pointer
1067
1068##
1069# @brief Read a .csv file, with blocking.
1070# @param file_pointer Open descriptor of the file to read.
1071# @param header List of already read .csv header fields.
1072# @param delimiter Delimiter string header was split on.
1073# @param max_read_lines The max number of lines to read before blocking.
1074# @param verbose True if this function is to be verbose.
1075# @note Will throw IndexError, SyntaxError exceptions.
1076def _read_csv_file_blocking(file_pointer,
1077                            header,
1078                            delimiter=CSV_DELIMITER,
1079                            max_read_lines=MAX_READ_LINES,
1080                            verbose=False):
1081    """Read the body of a .csv file.
1082    header: The list header of the csv file, with the x and y labels.
1083    """
1084
1085    points = []
1086    pointattributes = []
1087    att_dict = {}
1088
1089    # This is to remove the x and y headers.
1090    header = header[:]
1091    try:
1092        x_header = header.pop(0)
1093        y_header = header.pop(0)
1094    except IndexError:
1095        # if there are not two columns this will occur.
1096        # eg if it is a space seperated file
1097        raise SyntaxError
1098
1099    read_lines = 0
1100    while read_lines < max_read_lines:
1101        line = file_pointer.readline()
1102        numbers = clean_line(line, delimiter)
1103        if len(numbers) <= 1:
1104            break
1105        if line[0] == '#':
1106            continue
1107
1108        read_lines += 1
1109
1110        try:
1111            x = float(numbers[0])
1112            y = float(numbers[1])
1113            points.append([x,y])
1114            numbers.pop(0)
1115            numbers.pop(0)
1116            if len(header) != len(numbers):
1117                file_pointer.close()
1118                msg = ('File load error. '
1119                       'There might be a problem with the file header.')
1120                raise SyntaxError, msg
1121            for i,n in enumerate(numbers):
1122                n.strip()
1123                if n != '\n' and n != '':
1124                    att_dict.setdefault(header[i],[]).append(float(n))
1125        except ValueError:
1126            raise SyntaxError
1127
1128    if points == []:
1129        raise StopIteration
1130
1131    pointlist = num.array(points, num.float)
1132    for key in att_dict.keys():
1133        att_dict[key] = num.array(att_dict[key], num.float)
1134
1135    # Do stuff here so the info is in lat's and longs
1136    geo_ref = None
1137    x_header = lower(x_header[:3])
1138    y_header = lower(y_header[:3])
1139    if (x_header == 'lon' or  x_header == 'lat') \
1140       and (y_header == 'lon' or  y_header == 'lat'):
1141        if x_header == 'lon':
1142            longitudes = num.ravel(pointlist[:,0:1])
1143            latitudes = num.ravel(pointlist[:,1:])
1144        else:
1145            latitudes = num.ravel(pointlist[:,0:1])
1146            longitudes = num.ravel(pointlist[:,1:])
1147
1148        pointlist, geo_ref = _set_using_lat_long(latitudes,
1149                                                 longitudes,
1150                                                 geo_reference=None,
1151                                                 data_points=None,
1152                                                 points_are_lats_longs=False)
1153
1154    return pointlist, att_dict, geo_ref, file_pointer
1155
1156
1157##
1158# @brief Read a .pts file header.
1159# @param fid Handle to the open .pts file.
1160# @param verbose True if the function is to be verbose.
1161# @return (geo_reference, keys, fid.dimensions['number_of_points'])
1162# @note Will throw IOError and AttributeError exceptions.
1163def _read_pts_file_header(fid, verbose=False):
1164    '''Read the geo_reference and number_of_points from a .pts file'''
1165
1166    keys = fid.variables.keys()
1167    try:
1168        keys.remove('points')
1169    except IOError, e:
1170        fid.close()
1171        msg = "Expected keyword 'points' but could not find it."
1172        raise IOError, msg
1173
1174    if verbose: print 'Got %d variables: %s' % (len(keys), keys)
1175
1176    try:
1177        geo_reference = Geo_reference(NetCDFObject=fid)
1178    except AttributeError, e:
1179        geo_reference = None
1180
1181    return geo_reference, keys, fid.dimensions['number_of_points']
1182
1183
1184##
1185# @brief Read the body of a .pts file, with blocking.
1186# @param fid Handle to already open file.
1187# @param start_row Start row index of points to return.
1188# @param fin_row End row index of points to return.
1189# @param keys Iterable of keys to return.
1190# @return Tuple of (pointlist, attributes).
1191def _read_pts_file_blocking(fid, start_row, fin_row, keys):
1192    '''Read the body of a .pts file.'''
1193
1194    pointlist = num.array(fid.variables['points'][start_row:fin_row])
1195
1196    attributes = {}
1197    for key in keys:
1198        attributes[key] = num.array(fid.variables[key][start_row:fin_row])
1199
1200    return pointlist, attributes
1201
1202
1203##
1204# @brief Write a .pts data file.
1205# @param file_name Path to the file to write.
1206# @param write_data_points Data points to write.
1207# @param write_attributes Attributes to write.
1208# @param write_geo_reference Georef to write.
1209def _write_pts_file(file_name,
1210                    write_data_points,
1211                    write_attributes=None,
1212                    write_geo_reference=None):
1213    """Write .pts NetCDF file
1214
1215    NOTE: Below might not be valid ask Duncan : NB 5/2006
1216
1217    WARNING: This function mangles the point_atts data structure
1218    # F??ME: (DSG)This format has issues.
1219    # There can't be an attribute called points
1220    # consider format change
1221    # method changed by NB not sure if above statement is correct
1222
1223    should create new test for this
1224    legal_keys = ['pointlist', 'attributelist', 'geo_reference']
1225    for key in point_atts.keys():
1226        msg = 'Key %s is illegal. Valid keys are %s' %(key, legal_keys)
1227        assert key in legal_keys, msg
1228    """
1229
1230    # NetCDF file definition
1231    outfile = NetCDFFile(file_name, netcdf_mode_w)
1232
1233    # Create new file
1234    outfile.institution = 'Geoscience Australia'
1235    outfile.description = ('NetCDF format for compact and portable storage '
1236                           'of spatial point data')
1237
1238    # Dimension definitions
1239    shape = write_data_points.shape[0]
1240    outfile.createDimension('number_of_points', shape)
1241    outfile.createDimension('number_of_dimensions', 2) # This is 2d data
1242
1243    # Variable definition
1244    outfile.createVariable('points', netcdf_float,
1245                           ('number_of_points', 'number_of_dimensions'))
1246
1247    # create variables
1248    outfile.variables['points'][:] = write_data_points
1249
1250    if write_attributes is not None:
1251        for key in write_attributes.keys():
1252            outfile.createVariable(key, netcdf_float, ('number_of_points',))
1253            outfile.variables[key][:] = write_attributes[key]
1254
1255    if write_geo_reference is not None:
1256        write_NetCDF_georeference(write_geo_reference, outfile)
1257
1258    outfile.close()
1259
1260
1261##
1262# @brief Write a .csv data file.
1263# @param file_name Path to the file to write.
1264# @param write_data_points Data points to write.
1265# @param write_attributes Attributes to write.
1266# @param as_lat_long True if points are lat/lon, else x/y.
1267# @param delimiter The CSV delimiter to use.
1268def _write_csv_file(file_name,
1269                    write_data_points,
1270                    write_attributes=None,
1271                    as_lat_long=False,
1272                    delimiter=','):
1273    """Write a .csv file."""
1274
1275    points = write_data_points
1276    pointattributes = write_attributes
1277
1278    fd = open(file_name, 'w')
1279
1280    if as_lat_long:
1281        titlelist = "latitude" + delimiter + "longitude"  + delimiter
1282    else:
1283        titlelist = "x" + delimiter + "y"  + delimiter
1284
1285    if pointattributes is not None:
1286        for title in pointattributes.keys():
1287            titlelist = titlelist + title + delimiter
1288        titlelist = titlelist[0:-len(delimiter)] # remove the last delimiter
1289
1290    fd.write(titlelist + "\n")
1291
1292    # <x/lat> <y/long> [attributes]
1293    for i, vert in enumerate( points):
1294        if pointattributes is not None:
1295            attlist = ","
1296            for att in pointattributes.keys():
1297                attlist = attlist + str(pointattributes[att][i]) + delimiter
1298            attlist = attlist[0:-len(delimiter)] # remove the last delimiter
1299            attlist.strip()
1300        else:
1301            attlist = ''
1302
1303        fd.write(str(vert[0]) + delimiter + str(vert[1]) + attlist + "\n")
1304
1305    fd.close()
1306
1307
1308##
1309# @brief Write a URS file.
1310# @param file_name The path of the file to write.
1311# @param points
1312# @param delimiter
1313def _write_urs_file(file_name, points, delimiter=' '):
1314    """Write a URS format file.
1315    export a file, file_name, with the urs format
1316    the data points are in lats and longs
1317    """
1318
1319    fd = open(file_name, 'w')
1320
1321    # first line is # points
1322    fd.write(str(len(points)) + "\n")
1323
1324    # <lat> <long> <id#>
1325    for i, vert in enumerate( points):
1326        fd.write(str(round(vert[0],7)) + delimiter +
1327                 str(round(vert[1],7)) + delimiter + str(i) + "\n")
1328
1329    fd.close()
1330
1331
1332##
1333# @brief ??
1334# @param point_atts ??
1335# @return ??
1336def _point_atts2array(point_atts):
1337    point_atts['pointlist'] = num.array(point_atts['pointlist'], num.float)
1338
1339    for key in point_atts['attributelist'].keys():
1340        point_atts['attributelist'][key] = \
1341                num.array(point_atts['attributelist'][key], num.float)
1342
1343    return point_atts
1344
1345
1346##
1347# @brief Convert geospatial object to a points dictionary.
1348# @param geospatial_data The geospatial object to convert.
1349# @return A points dictionary.
1350def geospatial_data2points_dictionary(geospatial_data):
1351    """Convert geospatial data to points_dictionary"""
1352
1353    points_dictionary = {}
1354    points_dictionary['pointlist'] = geospatial_data.data_points
1355
1356    points_dictionary['attributelist'] = {}
1357
1358    for attribute_name in geospatial_data.attributes.keys():
1359        val = geospatial_data.attributes[attribute_name]
1360        points_dictionary['attributelist'][attribute_name] = val
1361
1362    points_dictionary['geo_reference'] = geospatial_data.geo_reference
1363
1364    return points_dictionary
1365
1366
1367##
1368# @brief Convert a points dictionary to a geospatial object.
1369# @param points_dictionary A points dictionary to convert.
1370def points_dictionary2geospatial_data(points_dictionary):
1371    """Convert points_dictionary to geospatial data object"""
1372
1373    msg = "Points dictionary must have key 'pointlist'"
1374    assert points_dictionary.has_key('pointlist'), msg
1375
1376    msg = "Points dictionary must have key 'attributelist'"
1377    assert points_dictionary.has_key('attributelist'), msg
1378
1379    if points_dictionary.has_key('geo_reference'):
1380        geo = points_dictionary['geo_reference']
1381    else:
1382        geo = None
1383
1384    return Geospatial_data(points_dictionary['pointlist'],
1385                           points_dictionary['attributelist'],
1386                           geo_reference=geo)
1387
1388
1389##
1390# @brief Ensure that points are in absolute coordinates.
1391# @param points A list or array of points to check, or geospatial object.
1392# @param geo_reference If supplied,
1393# @return ??
1394def ensure_absolute(points, geo_reference=None):
1395    """Ensure that points are in absolute coordinates.
1396
1397    This function inputs several formats and
1398    outputs one format. - a numeric array of absolute points.
1399
1400    Input formats are;
1401      points: List or numeric array of coordinate pairs [xi, eta] of
1402              points or geospatial object or points file name
1403
1404    mesh_origin: A geo_reference object or 3-tuples consisting of
1405                 UTM zone, easting and northing.
1406                 If specified vertex coordinates are assumed to be
1407                 relative to their respective origins.
1408    """
1409
1410    # Input check
1411    if isinstance(points, basestring):
1412        # It's a string - assume it is a point file
1413        points = Geospatial_data(file_name=points)
1414
1415    if isinstance(points, Geospatial_data):
1416        points = points.get_data_points(absolute=True)
1417        msg = 'Use a Geospatial_data object or a mesh origin, not both.'
1418        assert geo_reference == None, msg
1419    else:
1420        points = ensure_numeric(points, num.float)
1421
1422    # Sort of geo_reference and convert points
1423    if geo_reference is None:
1424        geo = None    # Geo_reference()
1425    else:
1426        if isinstance(geo_reference, Geo_reference):
1427            geo = geo_reference
1428        else:
1429            geo = Geo_reference(geo_reference[0],
1430                                geo_reference[1],
1431                                geo_reference[2])
1432        points = geo.get_absolute(points)
1433
1434    return points
1435
1436
1437##
1438# @brief
1439# @param points
1440# @param geo_reference
1441# @return A geospatial object.
1442def ensure_geospatial(points, geo_reference=None):
1443    """Convert various data formats to a geospatial_data instance.
1444
1445    Inputed formats are;
1446    points:      List or numeric array of coordinate pairs [xi, eta] of
1447                 points or geospatial object
1448
1449    mesh_origin: A geo_reference object or 3-tuples consisting of
1450                 UTM zone, easting and northing.
1451                 If specified vertex coordinates are assumed to be
1452                 relative to their respective origins.
1453    """
1454
1455    # Input check
1456    if isinstance(points, Geospatial_data):
1457        msg = "Use a Geospatial_data object or a mesh origin, not both."
1458        assert geo_reference is None, msg
1459        return points
1460    else:
1461        # List or numeric array of absolute points
1462        points = ensure_numeric(points, num.float)
1463
1464    # Sort out geo reference
1465    if geo_reference is None:
1466        geo = None
1467    else:
1468        if isinstance(geo_reference, Geo_reference):
1469            geo = geo_reference
1470        else:
1471            geo = Geo_reference(geo_reference[0],
1472                                geo_reference[1],
1473                                geo_reference[2])
1474
1475    # Create Geospatial_data object with appropriate geo reference and return
1476    points = Geospatial_data(data_points=points, geo_reference=geo)
1477
1478    return points
1479
1480
1481##
1482# @brief
1483# @param data_file
1484# @param alpha_list
1485# @param mesh_file
1486# @param boundary_poly
1487# @param mesh_resolution
1488# @param north_boundary
1489# @param south_boundary
1490# @param east_boundary
1491# @param west_boundary
1492# @param plot_name
1493# @param split_factor
1494# @param seed_num
1495# @param cache
1496# @param verbose
1497def find_optimal_smoothing_parameter(data_file,
1498                                     alpha_list=None,
1499                                     mesh_file=None,
1500                                     boundary_poly=None,
1501                                     mesh_resolution=100000,
1502                                     north_boundary=None,
1503                                     south_boundary=None,
1504                                     east_boundary=None,
1505                                     west_boundary=None,
1506                                     plot_name='all_alphas',
1507                                     split_factor=0.1,
1508                                     seed_num=None,
1509                                     cache=False,
1510                                     verbose=False):
1511    """Removes a small random sample of points from 'data_file'.
1512    Then creates models with different alpha values from 'alpha_list' and
1513    cross validates the predicted value to the previously removed point data.
1514    Returns the alpha value which has the smallest covariance.
1515
1516    data_file: must not contain points outside the boundaries defined
1517               and it must be either a pts, txt or csv file.
1518
1519    alpha_list: the alpha values to test in a single list
1520
1521    mesh_file: name of the created mesh file or if passed in will read it.
1522               NOTE, if there is a mesh file mesh_resolution,
1523               north_boundary, south... etc will be ignored.
1524
1525    mesh_resolution: the maximum area size for a triangle
1526
1527    north_boundary... west_boundary: the value of the boundary
1528
1529    plot_name: the name for the plot contain the results
1530
1531    seed_num: the seed to the random number generator
1532
1533    USAGE:
1534        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1535                                             alpha_list=[0.0001, 0.01, 1],
1536                                             mesh_file=None,
1537                                             mesh_resolution=3,
1538                                             north_boundary=5,
1539                                             south_boundary=-5,
1540                                             east_boundary=5,
1541                                             west_boundary=-5,
1542                                             plot_name='all_alphas',
1543                                             seed_num=100000,
1544                                             verbose=False)
1545
1546    OUTPUT: returns the minumum normalised covalance calculate AND the
1547           alpha that created it. PLUS writes a plot of the results
1548
1549    NOTE: code will not work if the data_file extent is greater than the
1550    boundary_polygon or any of the boundaries, eg north_boundary...west_boundary
1551    """
1552
1553    from anuga.shallow_water import Domain
1554    from anuga.geospatial_data.geospatial_data import Geospatial_data
1555    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1556    from anuga.utilities.numerical_tools import cov
1557    from anuga.utilities.polygon import is_inside_polygon
1558    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1559
1560    attribute_smoothed = 'elevation'
1561
1562    if mesh_file is None:
1563        if verbose: print "building mesh"
1564        mesh_file = 'temp.msh'
1565
1566        if (north_boundary is None or south_boundary is None
1567            or east_boundary is None or west_boundary is None):
1568            no_boundary = True
1569        else:
1570            no_boundary = False
1571
1572        if no_boundary is True:
1573            msg = 'All boundaries must be defined'
1574            raise Expection, msg
1575
1576        poly_topo = [[east_boundary, south_boundary],
1577                     [east_boundary, north_boundary],
1578                     [west_boundary, north_boundary],
1579                     [west_boundary, south_boundary]]
1580
1581        create_mesh_from_regions(poly_topo,
1582                                 boundary_tags={'back': [2],
1583                                                'side': [1,3],
1584                                                'ocean': [0]},
1585                                 maximum_triangle_area=mesh_resolution,
1586                                 filename=mesh_file,
1587                                 use_cache=cache,
1588                                 verbose=verbose)
1589
1590    else: # if mesh file provided
1591        # test mesh file exists?
1592        if verbose: "reading from file: %s" % mesh_file
1593        if access(mesh_file,F_OK) == 0:
1594            msg = "file %s doesn't exist!" % mesh_file
1595            raise IOError, msg
1596
1597    # split topo data
1598    if verbose: print 'Reading elevation file: %s' % data_file
1599    G = Geospatial_data(file_name = data_file)
1600    if verbose: print 'Start split'
1601    G_small, G_other = G.split(split_factor, seed_num, verbose=verbose)
1602    if verbose: print 'Finish split'
1603    points = G_small.get_data_points()
1604
1605    if verbose: print "Number of points in sample to compare: ", len(points)
1606
1607    if alpha_list == None:
1608        alphas = [0.001,0.01,100]
1609        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,
1610        #          0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1611    else:
1612        alphas = alpha_list
1613
1614    # creates array with columns 1 and 2 are x, y. column 3 is elevation
1615    # 4 onwards is the elevation_predicted using the alpha, which will
1616    # be compared later against the real removed data
1617    data = num.array([], dtype=num.float)
1618
1619    data = num.resize(data, (len(points), 3+len(alphas)))
1620
1621    # gets relative point from sample
1622    data[:,0] = points[:,0]
1623    data[:,1] = points[:,1]
1624    elevation_sample = G_small.get_attributes(attribute_name=attribute_smoothed)
1625    data[:,2] = elevation_sample
1626
1627    normal_cov = num.array(num.zeros([len(alphas), 2]), dtype=num.float)
1628
1629    if verbose: print 'Setup computational domains with different alphas'
1630
1631    for i, alpha in enumerate(alphas):
1632        # add G_other data to domains with different alphas
1633        if verbose:
1634            print '\nCalculating domain and mesh for Alpha =', alpha, '\n'
1635        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1636        if verbose: print domain.statistics()
1637        domain.set_quantity(attribute_smoothed,
1638                            geospatial_data=G_other,
1639                            use_cache=cache,
1640                            verbose=verbose,
1641                            alpha=alpha)
1642
1643        # Convert points to geospatial data for use with get_values below
1644        points_geo = Geospatial_data(points, domain.geo_reference)
1645
1646        # returns the predicted elevation of the points that were "split" out
1647        # of the original data set for one particular alpha
1648        if verbose: print 'Get predicted elevation for location to be compared'
1649        elevation_predicted = \
1650                domain.quantities[attribute_smoothed].\
1651                    get_values(interpolation_points=points_geo)
1652
1653        # add predicted elevation to array that starts with x, y, z...
1654        data[:,i+3] = elevation_predicted
1655
1656        sample_cov = cov(elevation_sample)
1657        ele_cov = cov(elevation_sample - elevation_predicted)
1658        normal_cov[i,:] = [alpha, ele_cov / sample_cov]
1659
1660        if verbose:
1661            print 'Covariance for alpha ', normal_cov[i][0], '= ', \
1662                      normal_cov[i][1]
1663            print '-------------------------------------------- \n'
1664
1665    normal_cov0 = normal_cov[:,0]
1666    normal_cov_new = num.take(normal_cov, num.argsort(normal_cov0), axis=0)
1667
1668    if plot_name is not None:
1669        from pylab import savefig, semilogx, loglog
1670
1671        semilogx(normal_cov_new[:,0], normal_cov_new[:,1])
1672        loglog(normal_cov_new[:,0], normal_cov_new[:,1])
1673        savefig(plot_name, dpi=300)
1674
1675    if mesh_file == 'temp.msh':
1676        remove(mesh_file)
1677
1678    if verbose:
1679        print 'Final results:'
1680        for i, alpha in enumerate(alphas):
1681            print ('covariance for alpha %s = %s '
1682                   % (normal_cov[i][0], normal_cov[i][1]))
1683        print ('\nOptimal alpha is: %s '
1684               % normal_cov_new[(num.argmin(normal_cov_new, axis=0))[1], 0])
1685
1686    # covariance and optimal alpha
1687    return (min(normal_cov_new[:,1]),
1688            normal_cov_new[(num.argmin(normal_cov_new,axis=0))[1],0])
1689
1690
1691##
1692# @brief
1693# @param data_file
1694# @param alpha_list
1695# @param mesh_file
1696# @param boundary_poly
1697# @param mesh_resolution
1698# @param north_boundary
1699# @param south_boundary
1700# @param east_boundary
1701# @param west_boundary
1702# @param plot_name
1703# @param split_factor
1704# @param seed_num
1705# @param cache
1706# @param verbose
1707def old_find_optimal_smoothing_parameter(data_file,
1708                                         alpha_list=None,
1709                                         mesh_file=None,
1710                                         boundary_poly=None,
1711                                         mesh_resolution=100000,
1712                                         north_boundary=None,
1713                                         south_boundary=None,
1714                                         east_boundary=None,
1715                                         west_boundary=None,
1716                                         plot_name='all_alphas',
1717                                         split_factor=0.1,
1718                                         seed_num=None,
1719                                         cache=False,
1720                                         verbose=False):
1721    """
1722    data_file: must not contain points outside the boundaries defined
1723               and it either a pts, txt or csv file.
1724
1725    alpha_list: the alpha values to test in a single list
1726
1727    mesh_file: name of the created mesh file or if passed in will read it.
1728               NOTE, if there is a mesh file mesh_resolution,
1729               north_boundary, south... etc will be ignored.
1730
1731    mesh_resolution: the maximum area size for a triangle
1732
1733    north_boundary... west_boundary: the value of the boundary
1734
1735    plot_name: the name for the plot contain the results
1736
1737    seed_num: the seed to the random number generator
1738
1739    USAGE:
1740        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1741                                             alpha_list=[0.0001, 0.01, 1],
1742                                             mesh_file=None,
1743                                             mesh_resolution=3,
1744                                             north_boundary=5,
1745                                             south_boundary=-5,
1746                                             east_boundary=5,
1747                                             west_boundary=-5,
1748                                             plot_name='all_alphas',
1749                                             seed_num=100000,
1750                                             verbose=False)
1751
1752    OUTPUT: returns the minumum normalised covalance calculate AND the
1753            alpha that created it. PLUS writes a plot of the results
1754
1755    NOTE: code will not work if the data_file extend is greater than the
1756          boundary_polygon or the north_boundary...west_boundary
1757    """
1758
1759    from anuga.shallow_water import Domain
1760    from anuga.geospatial_data.geospatial_data import Geospatial_data
1761    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1762    from anuga.utilities.numerical_tools import cov
1763    from anuga.utilities.polygon import is_inside_polygon
1764    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1765
1766    attribute_smoothed = 'elevation'
1767
1768    if mesh_file is None:
1769        mesh_file = 'temp.msh'
1770
1771        if (north_boundary is None or south_boundary is None
1772            or east_boundary is None or west_boundary is None):
1773            no_boundary = True
1774        else:
1775            no_boundary = False
1776
1777        if no_boundary is True:
1778            msg = 'All boundaries must be defined'
1779            raise Expection, msg
1780
1781        poly_topo = [[east_boundary, south_boundary],
1782                     [east_boundary, north_boundary],
1783                     [west_boundary, north_boundary],
1784                     [west_boundary, south_boundary]]
1785
1786        create_mesh_from_regions(poly_topo,
1787                                 boundary_tags={'back': [2],
1788                                                'side': [1,3],
1789                                                'ocean': [0]},
1790                                 maximum_triangle_area=mesh_resolution,
1791                                 filename=mesh_file,
1792                                 use_cache=cache,
1793                                 verbose=verbose)
1794
1795    else: # if mesh file provided
1796        # test mesh file exists?
1797        if access(mesh_file,F_OK) == 0:
1798            msg = "file %s doesn't exist!" % mesh_file
1799            raise IOError, msg
1800
1801    # split topo data
1802    G = Geospatial_data(file_name=data_file)
1803    if verbose: print 'start split'
1804    G_small, G_other = G.split(split_factor, seed_num, verbose=verbose)
1805    if verbose: print 'finish split'
1806    points = G_small.get_data_points()
1807
1808    if verbose: print "Number of points in sample to compare: ", len(points)
1809
1810    if alpha_list == None:
1811        alphas = [0.001,0.01,100]
1812        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,
1813        #          0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1814    else:
1815        alphas = alpha_list
1816
1817    domains = {}
1818
1819    if verbose: print 'Setup computational domains with different alphas'
1820
1821    for alpha in alphas:
1822        # add G_other data to domains with different alphas
1823        if verbose:
1824            print '\nCalculating domain and mesh for Alpha =', alpha, '\n'
1825        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1826        if verbose: print domain.statistics()
1827        domain.set_quantity(attribute_smoothed,
1828                            geospatial_data=G_other,
1829                            use_cache=cache,
1830                            verbose=verbose,
1831                            alpha=alpha)
1832        domains[alpha] = domain
1833
1834    # creates array with columns 1 and 2 are x, y. column 3 is elevation
1835    # 4 onwards is the elevation_predicted using the alpha, which will
1836    # be compared later against the real removed data
1837    data = num.array([], dtype=num.float)
1838
1839    data = num.resize(data, (len(points), 3+len(alphas)))
1840
1841    # gets relative point from sample
1842    data[:,0] = points[:,0]
1843    data[:,1] = points[:,1]
1844    elevation_sample = G_small.get_attributes(attribute_name=attribute_smoothed)
1845    data[:,2] = elevation_sample
1846
1847    normal_cov = num.array(num.zeros([len(alphas), 2]), dtype=num.float)
1848
1849    if verbose:
1850        print 'Determine difference between predicted results and actual data'
1851
1852    for i, alpha in enumerate(domains):
1853        if verbose: print'Alpha =', alpha
1854
1855        points_geo = domains[alpha].geo_reference.change_points_geo_ref(points)
1856        # returns the predicted elevation of the points that were "split" out
1857        # of the original data set for one particular alpha
1858        elevation_predicted = \
1859                domains[alpha].quantities[attribute_smoothed].\
1860                        get_values(interpolation_points=points_geo)
1861
1862        # add predicted elevation to array that starts with x, y, z...
1863        data[:,i+3] = elevation_predicted
1864
1865        sample_cov = cov(elevation_sample)
1866        ele_cov = cov(elevation_sample - elevation_predicted)
1867        normal_cov[i,:] = [alpha,ele_cov / sample_cov]
1868        print 'memory usage during compare', mem_usage()
1869        if verbose: print 'cov', normal_cov[i][0], '= ', normal_cov[i][1]
1870
1871    normal_cov0 = normal_cov[:,0]
1872    normal_cov_new = num.take(normal_cov, num.argsort(normal_cov0), axis=0)
1873
1874    if plot_name is not None:
1875        from pylab import savefig,semilogx,loglog
1876
1877        semilogx(normal_cov_new[:,0], normal_cov_new[:,1])
1878        loglog(normal_cov_new[:,0], normal_cov_new[:,1])
1879        savefig(plot_name, dpi=300)
1880    if mesh_file == 'temp.msh':
1881        remove(mesh_file)
1882
1883    return (min(normal_cov_new[:,1]),
1884            normal_cov_new[(num.argmin(normal_cov_new, axis=0))[1],0])
1885
1886
1887if __name__ == "__main__":
1888    pass
1889
Note: See TracBrowser for help on using the repository browser.