1"""Class Geospatial_data - Manipulation of locations on the planet and
2associated attributes.
3
4"""
5from sys import maxint
6from os import access, F_OK, R_OK,remove
7from types import DictType
8from warnings import warn
9from string import lower
10import numpy
11import numpy.random
12#from Array import tolist
13from copy import deepcopy
14
15
16from Scientific.IO.NetCDF import NetCDFFile   
17from anuga.coordinate_transforms.lat_long_UTM_conversion import UTMtoLL   
18from anuga.utilities.numerical_tools import ensure_numeric
19from anuga.coordinate_transforms.geo_reference import Geo_reference, \
20     TitleError, DEFAULT_ZONE, ensure_geo_reference, write_NetCDF_georeference
21from anuga.coordinate_transforms.redfearn import convert_from_latlon_to_utm
22from anuga.utilities.anuga_exceptions import ANUGAError
23from anuga.config import points_file_block_line_size as MAX_READ_LINES 
24#from anuga.fit_interpolate.benchmark_least_squares import mem_usage
25 
26
27DEFAULT_ATTRIBUTE = 'elevation'
28

class Geospatial_data:

    def __init__(self,
                 data_points=None, # this can also be a points file name
                 attributes=None,
                 geo_reference=None,
                 default_attribute_name=None,
                 file_name=None,
                 latitudes=None,
                 longitudes=None,
                 points_are_lats_longs=False,
                 max_read_lines=None,
                 load_file_now=True,
                 verbose=False):

        """
        Create instance from data points and associated attributes

        data_points: x,y coordinates in meters. Type must be either a
        sequence of 2-tuples or an Mx2 numeric array of floats.  A file name
        with extension .txt, .csv or .pts can also be passed in here.

        attributes: Associated values for each data point. The type
        must be either a list or an array of length M or a dictionary
        of lists (or arrays) of length M. In the latter case the keys
        in the dictionary represent the attribute names, in the former
        the attribute will get the default name "elevation".

        geo_reference: Object representing the origin of the data
        points. It contains UTM zone, easting and northing, and data
        points are assumed to be relative to this origin.
        If geo_reference is None, the default geo ref object is used.

        default_attribute_name: Name of default attribute to be used with
        get_attribute_values. The idea is that the dataset can be
        equipped with information about which attribute to return.
        If None, the "first" attribute in the dictionary is used.

        latitudes, longitudes: Vectors of latitudes and longitudes,
        used to specify location instead of points.

        points_are_lats_longs: Set this to True if the points are actually
        lats and longs, not UTM.

        max_read_lines: The number of rows read into memory when using
        blocking to read a file.

        load_file_now: If True the file is automatically loaded
        into the geospatial instance. Used when blocking.

        file_name: Name of input netCDF file or .txt/.csv file. The netCDF
        file must have dimensions "points" etc.
        A .txt/.csv file is a comma separated file with x, y and attribute
        data.

        The first line has the titles of the columns.  The first two
        column titles are checked to see if they start with lat or
        long (not case sensitive).  If so the data is assumed to be
        latitude and longitude, in decimal format, and converted to
        UTM.  Otherwise the first two columns are assumed to be x
        and y, and the title names actually used are ignored.


        The format for a .txt file is:
            1st line:     [column names]
            other lines:  x y [attributes]

            for example:
            x, y, elevation, friction
            0.6, 0.7, 4.9, 0.3
            1.9, 2.8, 5, 0.3
            2.7, 2.4, 5.2, 0.3

        The first two columns have to be x, y or lat, long
        coordinates.


        The format for a Points dictionary is:

          ['pointlist'] a 2 column array describing points. 1st column x,
          2nd column y.
          ['attributelist'], a dictionary of 1D arrays, representing
          attribute values at the point.  The dictionary key is the attribute
          header.
          ['geo_reference'] a Geo_reference object. Use if the point
          information is relative. This is optional.
            eg
            dic['pointlist'] = [[1.0,2.0],[3.0,5.0]]
            dic['attributelist']['elevation'] = [7.0, 5.0]

        verbose: If True, print information while working.

        """

        if isinstance(data_points, basestring):
            # assume data points is really a file name
            file_name = data_points

        self.set_verbose(verbose)
        self.geo_reference = None # create the attribute
        self.file_name = file_name

        if max_read_lines is None:
            self.max_read_lines = MAX_READ_LINES
        else:
            self.max_read_lines = max_read_lines

        if file_name is None:
            if latitudes is not None or longitudes is not None or \
                   points_are_lats_longs:
                data_points, geo_reference =  \
                             _set_using_lat_long(latitudes=latitudes,
                                  longitudes=longitudes,
                                  geo_reference=geo_reference,
                                  data_points=data_points,
                                  points_are_lats_longs=points_are_lats_longs)
            self.check_data_points(data_points)
            self.set_attributes(attributes)
            self.set_geo_reference(geo_reference)
            self.set_default_attribute_name(default_attribute_name)

        elif load_file_now is True:
            # Watch for case where file name and points,
            # attributes etc are provided!!
            # If file name then all provided info will be removed!

            if verbose is True:
                if file_name is not None:
                    print 'Loading Geospatial data from file: %s' %file_name

            self.import_points_file(file_name, verbose=verbose)

            self.check_data_points(self.data_points)
            self.set_attributes(self.attributes)
            self.set_geo_reference(self.geo_reference)
            self.set_default_attribute_name(default_attribute_name)

        if verbose is True:
            if file_name is not None:
                print 'Geospatial data created from file: %s' %file_name
                if load_file_now is False:
                    print 'Data will be loaded blockwise on demand'

                    if file_name.endswith('csv') or file_name.endswith('txt'):
                        pass
                        # This message was misleading.
                        # FIXME (Ole): Are we blocking here or not?
                        #print 'ASCII formats are not that great for '
                        #print 'blockwise reading. Consider storing this'
                        #print 'data as a pts NetCDF format'
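
    # Illustrative constructor usage (a sketch; the point coordinates,
    # attribute values and file name below are made up):
    #
    #     from anuga.geospatial_data.geospatial_data import Geospatial_data
    #     G = Geospatial_data(data_points=[[1.0, 2.1], [3.0, 5.3]],
    #                         attributes={'elevation': [2.0, 4.0]})
    #     # or, from a points file:
    #     G = Geospatial_data(file_name='points.csv')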

    def __len__(self):
        return len(self.data_points)

    def __repr__(self):
        return str(self.get_data_points(absolute=True))


    def check_data_points(self, data_points):
        """Checks data points
        """

        if data_points is None:
            self.data_points = None
            msg = 'There is no data or file provided!'
            raise ValueError, msg

        else:
            self.data_points = ensure_numeric(data_points)
            if self.data_points.shape != (0,):
                assert len(self.data_points.shape) == 2
                assert self.data_points.shape[1] == 2

    def set_attributes(self, attributes):
        """Check and assign attributes dictionary
        """

        if attributes is None:
            self.attributes = None
            return

        if not isinstance(attributes, DictType):
            # Convert single attribute into dictionary
            attributes = {DEFAULT_ATTRIBUTE: attributes}

        # Check input attributes
        for key in attributes.keys():
            try:
                attributes[key] = ensure_numeric(attributes[key])
            except:
                msg = 'Attribute %s could not be converted ' %key
                msg += 'to a numeric vector'
                raise Exception, msg

        self.attributes = attributes

    #def set_geo_reference(self, geo_reference):
    #    # FIXME (Ole): Backwards compatibility - deprecate
    #    self.setgeo_reference(geo_reference)

    def set_geo_reference(self, geo_reference):
        """Set the georeference of geospatial data.

        It can also be used to change the georeference and will ensure that
        the absolute coordinate values are unchanged.
        """
        from anuga.coordinate_transforms.geo_reference import Geo_reference

        if geo_reference is None:
            # Use default - points are in absolute coordinates
            geo_reference = Geo_reference()

        # Allow for tuple (zone, xllcorner, yllcorner)
        geo_reference = ensure_geo_reference(geo_reference)

        if not isinstance(geo_reference, Geo_reference):
            # FIXME (Ole): This exception will be raised even
            # if geo_reference is None. Is that the intent Duncan?
            msg = 'Argument geo_reference must be a valid Geo_reference \n'
            msg += 'object or None.'
            raise Exception, msg

        # If a geo_reference already exists, change the point data according to
        # the new geo reference
        if self.geo_reference is not None:
            self.data_points = self.get_data_points(geo_reference=geo_reference)

        self.geo_reference = geo_reference
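
    # Changing the georeference in place (illustrative; values are made up;
    # a tuple (zone, xllcorner, yllcorner) is also accepted):
    #
    #     G.set_geo_reference(Geo_reference(56, 100.0, 200.0))
    #     # the absolute coordinates of the points are preserved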


    def set_default_attribute_name(self, default_attribute_name):
        self.default_attribute_name = default_attribute_name

    def set_verbose(self, verbose=False):
        if verbose in [False, True]:
            self.verbose = verbose
        else:
            msg = 'Illegal value: %s' %str(verbose)
            raise Exception, msg

    def clip(self, polygon, closed=True, verbose=False):
        """Clip geospatial data by a polygon

        Input
          polygon - Either a list of points, an Nx2 array or
                    a Geospatial data object.
          closed - (optional) determine whether points on the boundary should
          be regarded as belonging to the polygon (closed = True)
          or not (closed = False). Default is True.

        Output
          New geospatial data object representing points inside the
          specified polygon.


        Note - this method is non-destructive and leaves the data in 'self'
        unchanged
        """

        from anuga.utilities.polygon import inside_polygon

        if isinstance(polygon, Geospatial_data):
            # Polygon is an object - extract points
            polygon = polygon.get_data_points()

        points = self.get_data_points()
        inside_indices = inside_polygon(points, polygon, closed, verbose)

        clipped_G = self.get_sample(inside_indices)

        return clipped_G
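
    # Illustrative clipping sketch (coordinates are made up): points of G
    # falling inside the unit square are kept by clip, the rest by
    # clip_outside:
    #
    #     polygon = [[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]]
    #     inside_G = G.clip(polygon)
    #     outside_G = G.clip_outside(polygon)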


    def clip_outside(self, polygon, closed=True, verbose=False):
        """Clip geospatial data by a polygon, keeping data OUTSIDE of polygon

        Input
          polygon - Either a list of points, an Nx2 array or
                    a Geospatial data object.
          closed - (optional) determine whether points on the boundary should
          be regarded as belonging to the polygon (closed = True)
          or not (closed = False). Default is True.

        Output
          Geospatial data object representing points OUTSIDE the specified
          polygon

        """

        from anuga.utilities.polygon import outside_polygon

        if isinstance(polygon, Geospatial_data):
            # Polygon is an object - extract points
            polygon = polygon.get_data_points()

        points = self.get_data_points()
        outside_indices = outside_polygon(points, polygon, closed, verbose)

        clipped_G = self.get_sample(outside_indices)

        return clipped_G


    def get_geo_reference(self):
        return self.geo_reference

    def get_data_points(self, absolute=True, geo_reference=None,
                        as_lat_long=False, isSouthHemisphere=True):
        """Get coordinates for all data points as an Nx2 array

        If absolute is False returned coordinates are relative to the
        internal georeference's xll and yll corners, otherwise
        absolute UTM coordinates are returned.

        If a geo_reference is passed the points are returned relative
        to that geo_reference.

        isSouthHemisphere is only used when getting data points
        "as_lat_long" and if False will return lats and
        longs valid for the Northern Hemisphere.

        Default: absolute is True.
        """
        if as_lat_long is True:
            msg = "Points need a zone to be converted into lats and longs"
            assert self.geo_reference is not None, msg
            zone = self.geo_reference.get_zone()
            assert zone != DEFAULT_ZONE, msg
            lats_longs = []
            for point in self.get_data_points(True):
                # UTMtoLL(northing, easting, zone)
                lat_calced, long_calced = UTMtoLL(point[1], point[0],
                                                  zone, isSouthHemisphere)
                lats_longs.append((lat_calced, long_calced))
            return lats_longs

        if absolute is True and geo_reference is None:
            return self.geo_reference.get_absolute(self.data_points)
        elif geo_reference is not None:
            return geo_reference.change_points_geo_ref(self.data_points,
                                                       self.geo_reference)
        else:
            # If absolute is False
            return self.data_points
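
    # Coordinate retrieval sketch (the georeference values are made up):
    #
    #     G = Geospatial_data([[1.0, 2.0]], geo_reference=(56, 100.0, 200.0))
    #     G.get_data_points(absolute=True)   # -> [[101.0, 202.0]]
    #     G.get_data_points(absolute=False)  # -> [[1.0, 2.0]]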

    def get_attributes(self, attribute_name=None):
        """Return values for one named attribute.

        If attribute_name is None, default_attribute_name is used
        """

        if attribute_name is None:
            if self.default_attribute_name is not None:
                attribute_name = self.default_attribute_name
            else:
                attribute_name = self.attributes.keys()[0]
                # above line takes the first one from keys

        if self.verbose is True:
            print 'Using attribute %s' %attribute_name
            print 'Available attributes: %s' %(self.attributes.keys())

        msg = 'Attribute name %s does not exist in data set' %attribute_name
        assert self.attributes.has_key(attribute_name), msg

        return self.attributes[attribute_name]

    def get_all_attributes(self):
        """
        Return values for all attributes.
        The return value is either None or a dictionary (possibly empty).
        """

        return self.attributes

    def __add__(self, other):
        """
        Returns the addition of 2 geospatial objects;
        objects are concatenated to the end of each other

        NOTE: doesn't add if objects contain different
        attributes

        Always return absolute points!
        This also means that if you add None to the object,
        it will be turned into absolute coordinates

        other can be None in which case nothing is added to self.
        """

        # Find objects zone and check if the same
        geo_ref1 = self.get_geo_reference()
        zone1 = geo_ref1.get_zone()

        if other is not None:
            geo_ref2 = other.get_geo_reference()
            zone2 = geo_ref2.get_zone()

            geo_ref1.reconcile_zones(geo_ref2)

            new_points = numpy.concatenate((self.get_data_points(absolute=True),
                                            other.get_data_points(absolute=True)),
                                           axis=0)

            # Concatenate attributes if any
            if self.attributes is None:
                if other.attributes is not None:
                    msg = 'Geospatial data must have the same \n'
                    msg += 'attributes to allow addition.'
                    raise Exception, msg

                new_attributes = None
            else:
                new_attributes = {}
                for x in self.attributes.keys():
                    if other.attributes.has_key(x):
                        attrib1 = self.attributes[x]
                        attrib2 = other.attributes[x]
                        new_attributes[x] = numpy.concatenate((attrib1, attrib2))
                    else:
                        msg = 'Geospatial data must have the same \n'
                        msg += 'attributes to allow addition.'
                        raise Exception, msg

        else:
            # other is None:
            new_points = self.get_data_points(absolute=True)
            new_attributes = self.attributes

        # Instantiate new data object and return absolute coordinates
        new_geo_ref = Geo_reference(geo_ref1.get_zone(), 0.0, 0.0)
        return Geospatial_data(new_points,
                               new_attributes,
                               new_geo_ref)


    def __radd__(self, other):
        """Handle cases like None + Geospatial_data(...)
        """

        return self + other
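
    # Addition sketch (made-up values): both operands must carry the same
    # attribute names; the result is always in absolute coordinates:
    #
    #     G1 = Geospatial_data([[1.0, 1.0]], {'elevation': [4.0]})
    #     G2 = Geospatial_data([[2.0, 2.0]], {'elevation': [5.0]})
    #     G3 = G1 + G2        # 2 points, 'elevation' = [4.0, 5.0]
    #     G3 = None + G1      # works too, via __radd__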

    ###
    #  IMPORT/EXPORT POINTS FILES
    ###

    def import_points_file(self, file_name, delimiter=None, verbose=False):
        """Load a .txt, .csv or .pts file.
        Note: will throw an IOError/SyntaxError if it can't load the file.
        Catch these!

        Post condition: self.attributes dictionary has been set
        """

        if access(file_name, F_OK) == 0:
            msg = 'File %s does not exist or is not accessible' %file_name
            raise IOError, msg

        attributes = {}
        if file_name[-4:] == ".pts":
            try:
                data_points, attributes, geo_reference =\
                             _read_pts_file(file_name, verbose)
            except IOError, e:
                msg = 'Could not open file %s ' %file_name
                raise IOError, msg

        elif file_name[-4:] == ".txt" or file_name[-4:] == ".csv":
            try:
                data_points, attributes, geo_reference =\
                             _read_csv_file(file_name, verbose)
            except IOError, e:
                # This should only be if a file is not found
                msg = 'Could not open file %s. ' %file_name
                msg += 'Check the file location.'
                raise IOError, msg
            except SyntaxError, e:
                # This should only be if there is a format error
                msg = 'Could not open file %s. \n' %file_name
                msg += Error_message['IOError']
                raise SyntaxError, msg
        else:
            msg = 'Extension %s is unknown' %file_name[-4:]
            raise IOError, msg

        self.data_points = data_points
        self.attributes = attributes
        self.geo_reference = geo_reference

    def export_points_file(self, file_name, absolute=True,
                           as_lat_long=False, isSouthHemisphere=True):

        """
        Write a points file, file_name, as a text (.csv) or binary (.pts) file.
        file_name is the file name, including the extension.
        The point_dict is defined at the top of this file.

        If absolute is True, the xll and yll corners are added to the point
        values and the xll and yll of the geo_reference are set to 0.

        If absolute is False, data points are written relative to the xll
        and yll corners and the geo_reference remains unaffected.

        isSouthHemisphere: is only used when getting data points
        "as_lat_long" and if False will return lats and
        longs valid for the Northern Hemisphere.

        """

        if file_name[-4:] == ".pts":
            if absolute is True:
                geo_ref = deepcopy(self.geo_reference)
                geo_ref.xllcorner = 0
                geo_ref.yllcorner = 0
                _write_pts_file(file_name,
                                self.get_data_points(absolute),
                                self.get_all_attributes(),
                                geo_ref)
            else:
                _write_pts_file(file_name,
                                self.get_data_points(absolute),
                                self.get_all_attributes(),
                                self.get_geo_reference())

        elif file_name[-4:] == ".txt" or file_name[-4:] == ".csv":
            msg = "ERROR: trying to write a .txt file with relative data."
            assert absolute, msg
            _write_csv_file(file_name,
                            self.get_data_points(absolute=True,
                                                 as_lat_long=as_lat_long,
                                  isSouthHemisphere=isSouthHemisphere),
                            self.get_all_attributes(),
                            as_lat_long=as_lat_long)

        elif file_name[-4:] == ".urs":
            msg = "ERROR: Can not write a .urs file as a relative file."
            assert absolute, msg
            _write_urs_file(file_name,
                            self.get_data_points(as_lat_long=True,
                                  isSouthHemisphere=isSouthHemisphere))

        else:
            msg = 'Unknown file type %s ' %file_name
            raise IOError, msg
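
    # Export/import round trip sketch (file names are hypothetical):
    #
    #     G.export_points_file('points.pts')   # binary NetCDF
    #     G.export_points_file('points.csv')   # text; requires absolute data
    #     H = Geospatial_data(file_name='points.pts')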

    def get_sample(self, indices):
        """Return an object which is a subset of the original;
        the data points and attributes in this new object refer to
        the indices provided

        Input
            indices - a list of integers that represent the new object
        Output
            New geospatial data object representing points specified by
            the indices
        """
        # FIXME: add the geo_reference to this
        points = self.get_data_points()
        sampled_points = numpy.take(points, indices, axis=0)

        attributes = self.get_all_attributes()

        sampled_attributes = {}
        if attributes is not None:
            for key, att in attributes.items():
                sampled_attributes[key] = numpy.take(att, indices)

        return Geospatial_data(sampled_points, sampled_attributes)


    def split(self, factor=0.5, seed_num=None, verbose=False):
        """Returns two geospatial_data objects; the first is 'factor' times
        the size of the original and the second is the remainder. The two
        new objects are disjoint sets of each other.

        Points of the two new objects are selected RANDOMLY.

        This method creates two lists of indices which are passed into
        get_sample.  The lists are created using random numbers, and
        they are unique sets eg.  total_list(1,2,3,4,5,6,7,8,9)
        random_list(1,3,6,7,9) and remainder_list(0,2,4,5,8)

        Input - the factor by which to split the object; if 0.1 then 10% of
            the original object will be returned

        Output - two geospatial_data objects that are disjoint sets of the
            original
        """

        i = 0
        self_size = len(self)
        random_list = []
        remainder_list = []
        new_size = round(factor*self_size)

        # Find unique random numbers
        if verbose: print "make unique random number list and get indices"

        total = numpy.array(range(self_size))
        total_list = total.tolist()
        if verbose: print "total list len", len(total_list)

        # There will be repeated random numbers; however, this will not be a
        # problem as they are being 'pop'ed out of the array, so if there
        # are two numbers the same they will pop different indices,
        # still basically random
        ## create list of non-unique random numbers
        if verbose: print "create random numbers list %s long" %new_size

        # Set seed if provided, mainly important for unit test!
        # Plus reseed from the system when no seed is provided.
        if seed_num is not None:
            numpy.random.seed(seed_num)
        else:
            numpy.random.seed()
        if verbose: print "seed:", seed_num

        random_num = numpy.random.randint(0, self_size-1, (int(new_size),))
        random_num = random_num.tolist()

        # Need to sort and reverse so the pop() works correctly
        random_num.sort()
        random_num.reverse()

        if verbose: print "make random number list and get indices"
        j = 0
        k = 1
        remainder_list = total_list[:]
        # Pops array index (random_num) from remainder_list
        # (which starts as the total_list) and appends to random_list
        random_num_len = len(random_num)
        for i in random_num:
            random_list.append(remainder_list.pop(i))
            j += 1
            # Prints progress
            if verbose and round(random_num_len/10*k) == j:
                print '(%s/%s)' %(j, random_num_len)
                k += 1

        # FIXME: move to tests, it might take a long time
        # then create an array of random length between 500 and 1000,
        # and use a random factor between 0 and 1
        # setup for assertion
        test_total = random_list[:]
        test_total.extend(remainder_list)
        test_total.sort()
        msg = 'The two random lists made from the original list when added '\
              'together DO NOT equal the original list'
        assert (total_list == test_total), msg

        # Get new samples
        if verbose: print "get values of indices for random list"
        G1 = self.get_sample(random_list)
        if verbose: print "get values of indices for opposite of random list"
        G2 = self.get_sample(remainder_list)

        return G1, G2
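
    # Split sketch: a fixed seed makes the random selection reproducible
    # (the 0.25 factor is arbitrary):
    #
    #     G_fit, G_test = G.split(factor=0.25, seed_num=100)
    #     assert len(G_fit) + len(G_test) == len(G)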

    def __iter__(self):
        """Read in the header, number_of_points and save the
        file pointer position
        """
        from Scientific.IO.NetCDF import NetCDFFile
        # FIXME - what to do if the file isn't there

        # FIXME (Ole): Shouldn't this go into the constructor?
        # This method acts like the constructor when blocking.
        # ... and shouldn't it be called block_size?
        #
        if self.max_read_lines is None:
            self.max_read_lines = MAX_READ_LINES
        if self.file_name[-4:] == ".pts":
            # See if the file is there.  Throw a QUIET IO error if it isn't
            fd = open(self.file_name, 'r')
            fd.close()

            # Throws prints to screen if file not present
            self.fid = NetCDFFile(self.file_name, 'r')

            self.blocking_georef, self.blocking_keys, self.number_of_points =\
                                  _read_pts_file_header(self.fid,
                                                        self.verbose)
            self.start_row = 0
            self.last_row = self.number_of_points
            self.show_verbose = 0
            self.verbose_block_size = (self.last_row + 10)/10
            self.block_number = 0
            self.number_of_blocks = self.number_of_points/self.max_read_lines
            # This computes the number of full blocks. The last block may be
            # smaller and won't be included in this estimate.

            if self.verbose is True:
                print 'Reading %d points (in ~%d blocks) from file %s. '\
                      %(self.number_of_points,
                        self.number_of_blocks,
                        self.file_name),
                print 'Each block consists of %d data points'\
                      %self.max_read_lines

        else:
            # Assume the file is a csv file
            file_pointer = open(self.file_name)
            self.header, self.file_pointer = \
                         _read_csv_file_header(file_pointer)
            self.blocking_georef = None # Used for reconciling zones

        return self


    def next(self):
        """Read a block, instantiate a new geospatial object and return it"""

        if self.file_name[-4:] == ".pts":
            if self.start_row == self.last_row:
                # Read the end of the file last iteration
                # Remove blocking attributes
                self.fid.close()
                del self.max_read_lines
                del self.blocking_georef
                del self.last_row
                del self.start_row
                del self.blocking_keys
                del self.fid
                raise StopIteration
            fin_row = self.start_row + self.max_read_lines
            if fin_row > self.last_row:
                fin_row = self.last_row

            if self.verbose is True:
                if self.show_verbose >= self.start_row and \
                       self.show_verbose < fin_row:
                    print 'Reading block %d (points %d to %d) out of %d'\
                          %(self.block_number,
                            self.start_row,
                            fin_row,
                            self.number_of_blocks)

                    self.show_verbose += max(self.max_read_lines,
                                             self.verbose_block_size)

            # Read next block
            pointlist, att_dict = \
                       _read_pts_file_blocking(self.fid,
                                               self.start_row,
                                               fin_row,
                                               self.blocking_keys)

            geo = Geospatial_data(pointlist, att_dict, self.blocking_georef)
            self.start_row = fin_row

            self.block_number += 1

        else:
            # Assume the file is a csv file
            try:
                pointlist, att_dict, geo_ref, self.file_pointer = \
                   _read_csv_file_blocking( self.file_pointer,
                                            self.header[:],
                                            max_read_lines=self.max_read_lines,
                                            verbose=self.verbose)

                # Check that the zones haven't changed.
                if geo_ref is not None:
                    geo_ref.reconcile_zones(self.blocking_georef)
                    self.blocking_georef = geo_ref
                elif self.blocking_georef is not None:
                    msg = 'Geo reference given, then not given.'
                    msg += ' This should not happen.'
                    raise ValueError, msg
                geo = Geospatial_data(pointlist, att_dict, geo_ref)
            except StopIteration:
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                raise StopIteration
            except ANUGAError:
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                raise
            except SyntaxError:
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                # This should only be if there is a format error
                msg = 'Could not open file %s. \n' %self.file_name
                msg += Error_message['IOError']
                raise SyntaxError, msg
        return geo
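
# Blockwise iteration sketch (the file name and block size are made up):
# with load_file_now=False the data is not read up front; iterating over
# the object then yields one Geospatial_data block at a time:
#
#     G = Geospatial_data(file_name='big_points.pts',
#                         load_file_now=False, max_read_lines=1000)
#     for block in G:
#         points = block.get_data_points()  # process each block here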

##################### Error messages ###########
Error_message = {}
Em = Error_message
Em['IOError'] = "NOTE: The format for a comma separated .txt/.csv file is:\n"
Em['IOError'] += "        1st line:     [column names]\n"
Em['IOError'] += "        other lines:  [x value], [y value], [attributes]\n"
Em['IOError'] += "\n"
Em['IOError'] += "           for example:\n"
Em['IOError'] += "           x, y, elevation, friction\n"
Em['IOError'] += "           0.6, 0.7, 4.9, 0.3\n"
Em['IOError'] += "           1.9, 2.8, 5, 0.3\n"
Em['IOError'] += "           2.7, 2.4, 5.2, 0.3\n"
Em['IOError'] += "\n"
Em['IOError'] += "The first two columns are assumed to be x, y coordinates.\n"
Em['IOError'] += "The attribute values must be numeric.\n"

def _set_using_lat_long(latitudes,
                        longitudes,
                        geo_reference,
                        data_points,
                        points_are_lats_longs):
    """
    If the points have lat/long info, assume it is in (lat, long) order.
    """

    if geo_reference is not None:
        msg = """A georeference is specified yet latitude and longitude
        are also specified!"""
        raise ValueError, msg

    if data_points is not None and not points_are_lats_longs:
        msg = """Data points are specified yet latitude and
        longitude are also specified."""
        raise ValueError, msg

    if points_are_lats_longs:
        if data_points is None:
            msg = """Data points are not specified."""
            raise ValueError, msg
        lats_longs = ensure_numeric(data_points)
        latitudes = numpy.ravel(lats_longs[:,0:1])
        longitudes = numpy.ravel(lats_longs[:,1:])

    if latitudes is None and longitudes is None:
        msg = """Latitudes and Longitudes are not specified."""
        raise ValueError, msg

    if latitudes is None:
        msg = """Longitudes are specified yet latitudes aren't."""
        raise ValueError, msg

    if longitudes is None:
        msg = """Latitudes are specified yet longitudes aren't."""
        raise ValueError, msg

    data_points, zone = convert_from_latlon_to_utm(latitudes=latitudes,
                                                   longitudes=longitudes)
    return data_points, Geo_reference(zone=zone)
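
# Lat/long construction sketch (coordinates are made up); the public
# constructor routes through _set_using_lat_long:
#
#     G = Geospatial_data(latitudes=[-33.9, -34.0],
#                         longitudes=[151.2, 151.3])
#     # or equivalently, points given in (lat, long) order:
#     G = Geospatial_data(data_points=[[-33.9, 151.2], [-34.0, 151.3]],
#                         points_are_lats_longs=True)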


def _read_pts_file(file_name, verbose=False):
    """Read .pts NetCDF file

    Return an array of points, a dictionary of attribute arrays and a
    geo_reference, eg
    points = [[1.0,2.0],[3.0,5.0]]
    attributes['elevation'] = [7.0, 5.0]
    """

    from Scientific.IO.NetCDF import NetCDFFile

    if verbose: print 'Reading ', file_name

    # See if the file is there.  Throw a QUIET IO error if it isn't
    fd = open(file_name, 'r')
    fd.close()

    # Throws prints to screen if file not present
    fid = NetCDFFile(file_name, 'r')

    pointlist = numpy.array(fid.variables['points'])
    keys = fid.variables.keys()
    if verbose: print 'Got %d variables: %s' %(len(keys), keys)
    try:
        keys.remove('points')
    except ValueError, e:
        fid.close()
        msg = 'Expected keyword "points" but could not find it'
        raise IOError, msg

    attributes = {}
    for key in keys:
        if verbose: print "reading attribute '%s'" %key

        attributes[key] = numpy.array(fid.variables[key])

    try:
        geo_reference = Geo_reference(NetCDFObject=fid)
    except AttributeError, e:
        geo_reference = None

    fid.close()

    return pointlist, attributes, geo_reference


def _read_csv_file(file_name, verbose=False):
    """Read .csv file

    Return an array of points, a dictionary of attribute arrays and a
    geo_reference, eg
    points = [[1.0,2.0],[3.0,5.0]]
    attributes['elevation'] = [7.0, 5.0]
    """

    file_pointer = open(file_name)
    header, file_pointer = _read_csv_file_header(file_pointer)
    try:
        pointlist, att_dict, geo_ref, file_pointer = \
                   _read_csv_file_blocking( \
                file_pointer,
                header,
                max_read_lines=1e30) # If the file is bigger than this, block..
                                     # FIXME (Ole) What's up here?
    except ANUGAError:
        file_pointer.close()
        raise
    file_pointer.close()
    return pointlist, att_dict, geo_ref

CSV_DELIMITER = ','

def _read_csv_file_header(file_pointer,
                          delimiter=CSV_DELIMITER,
                          verbose=False):
    """Read the header of a .csv file
    Return a list of the header names
    """
    line = file_pointer.readline()
    header = clean_line(line, delimiter)
    return header, file_pointer

def _read_csv_file_blocking(file_pointer, header,
                            delimiter=CSV_DELIMITER,
                            max_read_lines=MAX_READ_LINES,
                            verbose=False):
    """
    Read the body of a .csv file.
    header: The list header of the csv file, with the x and y labels.
    """
    points = []
    pointattributes = []
    att_dict = {}

    # This is to remove the x and y headers.
    header = header[:]
    try:
        x_header = header.pop(0)
        y_header = header.pop(0)
    except IndexError:
        # If there are not two columns this will occur,
        # eg if it is a space separated file
        raise SyntaxError

    read_lines = 0
    while read_lines < max_read_lines:
        line = file_pointer.readline()
        numbers = clean_line(line, delimiter)
        if len(numbers) <= 1:
            break
        if line[0] == '#':
            continue
        read_lines += 1
        try:
            x = float(numbers[0])
            y = float(numbers[1])
            points.append([x, y])
            numbers.pop(0)
            numbers.pop(0)
            if len(header) != len(numbers):
                file_pointer.close()
                msg = "File load error. "\
                      "There might be a problem with the file header"
                raise SyntaxError, msg
            for i, num in enumerate(numbers):
                num = num.strip()
                if num != '\n' and num != '':
                    att_dict.setdefault(header[i], []).append(float(num))
        except ValueError:
            raise SyntaxError
    if points == []:
        raise StopIteration

    pointlist = numpy.array(points).astype(numpy.float)
    for key in att_dict.keys():
        att_dict[key] = numpy.array(att_dict[key]).astype(numpy.float)

    # Convert to UTM if the header indicates lats and longs
    geo_ref = None
    x_header = lower(x_header[:3])
    y_header = lower(y_header[:3])
    if (x_header == 'lon' or x_header == 'lat') and \
       (y_header == 'lon' or y_header == 'lat'):
        if x_header == 'lon':
            longitudes = numpy.ravel(pointlist[:,0:1])
            latitudes = numpy.ravel(pointlist[:,1:])
        else:
            latitudes = numpy.ravel(pointlist[:,0:1])
            longitudes = numpy.ravel(pointlist[:,1:])

        pointlist, geo_ref = _set_using_lat_long(latitudes,
                                                 longitudes,
                                                 geo_reference=None,
                                                 data_points=None,
                                                 points_are_lats_longs=False)
    return pointlist, att_dict, geo_ref, file_pointer

def _read_pts_file_header(fid, verbose=False):
    """Read the geo_reference and number_of_points from a .pts file
    """

    keys = fid.variables.keys()
    try:
        keys.remove('points')
    except ValueError, e:
        fid.close()
        msg = 'Expected keyword "points" but could not find it'
        raise IOError, msg
    if verbose: print 'Got %d variables: %s' %(len(keys), keys)

    try:
        geo_reference = Geo_reference(NetCDFObject=fid)
    except AttributeError, e:
        geo_reference = None

    return geo_reference, keys, fid.dimensions['number_of_points']

def _read_pts_file_blocking(fid, start_row, fin_row, keys):
    """
    Read a block of points and attributes from an open .pts (NetCDF) file.
    keys: The list of attribute names to read.
    """

    pointlist = numpy.array(fid.variables['points'][start_row:fin_row])

    attributes = {}
    for key in keys:
        attributes[key] = numpy.array(fid.variables[key][start_row:fin_row])

    return pointlist, attributes

def _write_pts_file(file_name,
                    write_data_points,
                    write_attributes=None,
                    write_geo_reference=None):
    """
    Write .pts NetCDF file

    NOTE: Below might not be valid, ask Duncan : NB 5/2006

    WARNING: This function mangles the point_atts data structure
    # FIXME: (DSG) This format has issues.
    # There can't be an attribute called points
    # consider format change
    # method changed by NB not sure if above statement is correct

    should create new test for this
    legal_keys = ['pointlist', 'attributelist', 'geo_reference']
    for key in point_atts.keys():
        msg = 'Key %s is illegal. Valid keys are %s' %(key, legal_keys)
        assert key in legal_keys, msg
    """
    from Scientific.IO.NetCDF import NetCDFFile
    # NetCDF file definition
    outfile = NetCDFFile(file_name, 'w')

    # Create new file
    outfile.institution = 'Geoscience Australia'
    outfile.description = 'NetCDF format for compact and portable storage ' +\
                          'of spatial point data'

    # Dimension definitions
    shape = write_data_points.shape[0]
    outfile.createDimension('number_of_points', shape)
    outfile.createDimension('number_of_dimensions', 2) # This is 2d data

    # Variable definition
    outfile.createVariable('points', numpy.float, ('number_of_points',
                                                   'number_of_dimensions'))

    # Create variables
    outfile.variables['points'][:] = write_data_points

    if write_attributes is not None:
        for key in write_attributes.keys():
            outfile.createVariable(key, numpy.float, ('number_of_points',))
            outfile.variables[key][:] = write_attributes[key]

    if write_geo_reference is not None:
        write_NetCDF_georeference(write_geo_reference, outfile)

    outfile.close()

def _write_csv_file(file_name,
                    write_data_points,
                    write_attributes=None,
                    as_lat_long=False,
                    delimiter=','):
    """
    Export a file, file_name, in the csv format
    """
    points = write_data_points
    pointattributes = write_attributes

    fd = open(file_name, 'w')
    if as_lat_long:
        titlelist = "latitude" + delimiter + "longitude" + delimiter
    else:
        titlelist = "x" + delimiter + "y" + delimiter
    if pointattributes is not None:
        for title in pointattributes.keys():
            titlelist = titlelist + title + delimiter
        titlelist = titlelist[0:-len(delimiter)] # remove the last delimiter
    fd.write(titlelist + "\n")

    # <x/lat> <y/long> [attributes]
    for i, vert in enumerate(points):
        if pointattributes is not None:
            attlist = delimiter
            for att in pointattributes.keys():
                attlist = attlist + str(pointattributes[att][i]) + delimiter
            attlist = attlist[0:-len(delimiter)] # remove the last delimiter
            attlist = attlist.strip()
        else:
            attlist = ''

        fd.write(str(vert[0]) + delimiter +
                 str(vert[1]) + attlist + "\n")

    fd.close()

def _write_urs_file(file_name,
                    points,
                    delimiter=' '):
    """
    Export a file, file_name, in the urs format;
    the data points are in lats and longs
    """
    fd = open(file_name, 'w')
    fd.write(str(len(points)) + "\n")
    # <lat> <long> <id#>
    for i, vert in enumerate(points):
        fd.write(str(round(vert[0], 7)) + delimiter + \
                 str(round(vert[1], 7)) + delimiter + str(i) + "\n")
    fd.close()

def _point_atts2array(point_atts):
    point_atts['pointlist'] = numpy.array(point_atts['pointlist']).astype(numpy.float)

    for key in point_atts['attributelist'].keys():
        point_atts['attributelist'][key] =\
                numpy.array(point_atts['attributelist'][key]).astype(numpy.float)
    return point_atts


def geospatial_data2points_dictionary(geospatial_data):
    """Convert geospatial data to points_dictionary
    """

    points_dictionary = {}
    points_dictionary['pointlist'] = geospatial_data.data_points

    points_dictionary['attributelist'] = {}

    for attribute_name in geospatial_data.attributes.keys():
        val = geospatial_data.attributes[attribute_name]
        points_dictionary['attributelist'][attribute_name] = val

    points_dictionary['geo_reference'] = geospatial_data.geo_reference

    return points_dictionary


def points_dictionary2geospatial_data(points_dictionary):
    """Convert points_dictionary to geospatial data object
    """

    msg = 'Points dictionary must have key pointlist'
    assert points_dictionary.has_key('pointlist'), msg

    msg = 'Points dictionary must have key attributelist'
    assert points_dictionary.has_key('attributelist'), msg

    if points_dictionary.has_key('geo_reference'):
        geo = points_dictionary['geo_reference']
    else:
        geo = None

    return Geospatial_data(points_dictionary['pointlist'],
                           points_dictionary['attributelist'],
                           geo_reference=geo)

def clean_line(line, delimiter):
    """Remove whitespace
    """
    line = line.strip()
    numbers = line.split(delimiter)
    i = len(numbers) - 1
    while i >= 0:
        if numbers[i] == '':
            numbers.pop(i)
        else:
            numbers[i] = numbers[i].strip()

        i -= 1
    return numbers
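
# clean_line sketch: split on the delimiter, strip whitespace and drop
# empty fields, e.g.
#
#     clean_line(' 0.6, 0.7 , 4.9,\n', ',')  # -> ['0.6', '0.7', '4.9']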

def ensure_absolute(points, geo_reference=None):
    """Ensure that points are in absolute coordinates.

    This function accepts several input formats and
    outputs one format - a numeric array of absolute points.

    Input formats are:
      points: List or numeric array of coordinate pairs [xi, eta] of
              points or geospatial object or points file name

    geo_reference: A geo_reference object or 3-tuple consisting of
                 UTM zone, easting and northing.
                 If specified, vertex coordinates are assumed to be
                 relative to their respective origins.
    """

    # Input check
    if isinstance(points, basestring):
        # It's a string - assume it is a point file
        points = Geospatial_data(file_name=points)

    if isinstance(points, Geospatial_data):
        points = points.get_data_points(absolute=True)
        msg = 'Use a Geospatial_data object or a mesh origin. Not both.'
        assert geo_reference is None, msg
    else:
        points = ensure_numeric(points, numpy.float)

    # Sort out geo_reference and convert points
    if geo_reference is None:
        geo = None # Geo_reference()
    else:
        if isinstance(geo_reference, Geo_reference):
            geo = geo_reference
        else:
            geo = Geo_reference(geo_reference[0],
                                geo_reference[1],
                                geo_reference[2])
        points = geo.get_absolute(points)

    # Return
    return points
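
# ensure_absolute sketch (values are made up): relative points plus an
# origin tuple (zone, xllcorner, yllcorner) become absolute coordinates:
#
#     ensure_absolute([[1.0, 2.0]], geo_reference=(56, 100.0, 200.0))
#     # -> array([[101.0, 202.0]])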


def ensure_geospatial(points, geo_reference=None):
    """
    This function accepts several input formats and
    outputs one format - a Geospatial_data instance.

    Input formats are:
    points: List or numeric array of coordinate pairs [xi, eta] of
              points or geospatial object

    geo_reference: A geo_reference object or 3-tuple consisting of
                 UTM zone, easting and northing.
                 If specified, vertex coordinates are assumed to be
                 relative to their respective origins.
    """

    # Input check
    if isinstance(points, Geospatial_data):
        msg = "Use a Geospatial_data object or a mesh origin. Not both."
        assert geo_reference is None, msg
        return points
    else:
        # List or numeric array of absolute points
        points = ensure_numeric(points, numpy.float)

    # Sort out geo reference
    if geo_reference is None:
        geo = None
    else:
        if isinstance(geo_reference, Geo_reference):
            geo = geo_reference
        else:
            geo = Geo_reference(geo_reference[0],
                                geo_reference[1],
                                geo_reference[2])

    # Create Geospatial_data object with appropriate geo reference and return
    points = Geospatial_data(data_points=points, geo_reference=geo)
    return points
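
# ensure_geospatial sketch (values are made up): a plain list of points
# is wrapped in a Geospatial_data instance with the given origin:
#
#     G = ensure_geospatial([[1.0, 2.0]], geo_reference=(56, 100.0, 200.0))
#     G.get_data_points(absolute=True)  # -> [[101.0, 202.0]]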
1406
1407
1408
1409def find_optimal_smoothing_parameter(data_file, 
1410                                     alpha_list=None,
1411                                     mesh_file=None,
1412                                     boundary_poly=None,
1413                                     mesh_resolution=100000,
1414                                     north_boundary=None,
1415                                     south_boundary=None,
1416                                     east_boundary=None,
1417                                     west_boundary=None,
1418                                     plot_name='all_alphas',
1419                                     split_factor=0.1,
1420                                     seed_num=None,
1421                                     cache=False,
1422                                     verbose=False
1423                                     ):
    """
    Removes a small random sample of points from 'data_file', then creates
    models with different alpha values from 'alpha_list' and cross-validates
    the predicted values against the previously removed point data. Returns
    the alpha value which has the smallest covariance.

    data_file: must not contain points outside the boundaries defined
           and must be either a pts, txt or csv file.

    alpha_list: the alpha values to test, in a single list

    mesh_file: name of the mesh file to create, or, if a file of that name
            is passed in, the mesh file to read.
            NOTE: if a mesh file is given, mesh_resolution,
            north_boundary, south_boundary etc. will be ignored.

    mesh_resolution: the maximum area for a mesh triangle

    north_boundary ... west_boundary: the values of the boundaries

    plot_name: the name for the plot containing the results

    seed_num: the seed for the random number generator

    USAGE:
        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
                                             alpha_list=[0.0001, 0.01, 1],
                                             mesh_file=None,
                                             mesh_resolution=3,
                                             north_boundary=5,
                                             south_boundary=-5,
                                             east_boundary=5,
                                             west_boundary=-5,
                                             plot_name='all_alphas',
                                             seed_num=100000,
                                             verbose=False)

    OUTPUT: returns the minimum normalised covariance calculated AND the
           alpha that created it, and writes a plot of the results.

    NOTE: the code will not work if the data_file extent is greater than the
    boundary_polygon or any of the boundaries,
    e.g. north_boundary ... west_boundary.
    """

    from anuga.shallow_water import Domain
    from anuga.geospatial_data.geospatial_data import Geospatial_data
    from anuga.pmesh.mesh_interface import create_mesh_from_regions
    from anuga.utilities.numerical_tools import cov
    from anuga.utilities.polygon import is_inside_polygon
    from anuga.fit_interpolate.benchmark_least_squares import mem_usage

    attribute_smoothed = 'elevation'

    if mesh_file is None:
        if verbose: print 'Building mesh'
        mesh_file = 'temp.msh'

        if north_boundary is None or south_boundary is None or \
           east_boundary is None or west_boundary is None:
            msg = 'All boundaries must be defined'
            raise ANUGAError(msg)

        poly_topo = [[east_boundary, south_boundary],
                     [east_boundary, north_boundary],
                     [west_boundary, north_boundary],
                     [west_boundary, south_boundary]]

        create_mesh_from_regions(poly_topo,
                                 boundary_tags={'back': [2],
                                                'side': [1, 3],
                                                'ocean': [0]},
                                 maximum_triangle_area=mesh_resolution,
                                 filename=mesh_file,
                                 use_cache=cache,
                                 verbose=verbose)

    else: # A mesh file was provided; check that it exists
        if verbose: print 'Reading from file: %s' % mesh_file
        if access(mesh_file, F_OK) == 0:
            msg = "file %s doesn't exist!" % mesh_file
            raise IOError, msg

    # Split topo data
    if verbose: print 'Reading elevation file: %s' % data_file
    G = Geospatial_data(file_name=data_file)
    if verbose: print 'Start split'
    G_small, G_other = G.split(split_factor, seed_num, verbose=verbose)
    if verbose: print 'Finish split'
    points = G_small.get_data_points()

    if verbose: print "Number of points in sample to compare: ", len(points)

    if alpha_list is None:
        alphas = [0.001, 0.01, 100]
        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,
        #          0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
    else:
        alphas = alpha_list

    # Create array where columns 0 and 1 are x, y; column 2 is the sampled
    # elevation; columns 3 onwards hold the elevation predicted for each
    # alpha, to be compared later against the withheld sample data
    data = numpy.array([], dtype=numpy.float)
    data = numpy.resize(data, (len(points), 3 + len(alphas)))

    # Get relative points from sample
    data[:, 0] = points[:, 0]
    data[:, 1] = points[:, 1]
    elevation_sample = G_small.get_attributes(attribute_name=attribute_smoothed)
    data[:, 2] = elevation_sample

    # Rows of normal_cov will hold (alpha, normalised covariance) pairs
    normal_cov = numpy.array(numpy.zeros([len(alphas), 2]), dtype=numpy.float)

    if verbose: print 'Setup computational domains with different alphas'

    #print 'memory usage before domains', mem_usage()

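    # For each alpha: fit a smoothed surface to G_other on the mesh, then
    # predict the elevation at the withheld sample points and score the fit
    # by the covariance of the residuals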
    for i, alpha in enumerate(alphas):
        # Add G_other data to domains with different alphas
        if verbose: print '\n Calculating domain and mesh for Alpha = ', alpha, '\n'
        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
        if verbose: print domain.statistics()
        domain.set_quantity(attribute_smoothed,
                            geospatial_data=G_other,
                            use_cache=cache,
                            verbose=verbose,
                            alpha=alpha)

        # Convert points to geospatial data for use with get_values below
        points_geo = Geospatial_data(points, domain.geo_reference)

        # Get the predicted elevation of the points that were 'split' out
        # of the original data set, for this particular alpha
        if verbose: print 'Get predicted elevation for location to be compared'
        elevation_predicted = domain.quantities[attribute_smoothed].\
                              get_values(interpolation_points=points_geo)

        # Add predicted elevation to the array that starts with x, y, z...
        data[:, i+3] = elevation_predicted

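        # Normalise the covariance of the residuals by the covariance of
        # the sample itself: a value near 0 means the fitted surface
        # reproduces the withheld points well, while a value near 1 is no
        # better than predicting the sample mean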
        sample_cov = cov(elevation_sample)
        ele_cov = cov(elevation_sample - elevation_predicted)
        normal_cov[i, :] = [alpha, ele_cov / sample_cov]
        #print 'memory usage during compare', mem_usage()

        if verbose: print 'Covariance for alpha ', normal_cov[i][0], '= ', normal_cov[i][1]
        if verbose: print '--------------------------------------------\n'

    # Sort the rows of normal_cov by alpha
    normal_cov0 = normal_cov[:, 0]
    normal_cov_new = numpy.take(normal_cov, numpy.argsort(normal_cov0), axis=0)

    if plot_name is not None:
        from pylab import savefig, semilogx, loglog
        semilogx(normal_cov_new[:, 0], normal_cov_new[:, 1])
        loglog(normal_cov_new[:, 0], normal_cov_new[:, 1])
        savefig(plot_name, dpi=300)
    if mesh_file == 'temp.msh':
        remove(mesh_file)

    if verbose:
        print 'Final results:'
        for i, alpha in enumerate(alphas):
            print 'covariance for alpha %s = %s ' % (normal_cov[i][0], normal_cov[i][1])
        print '\n Optimal alpha is: %s ' \
              % normal_cov_new[(numpy.argmin(normal_cov_new, axis=0))[1], 0]

    # Return the minimum normalised covariance and the optimal alpha
    return min(normal_cov_new[:, 1]), normal_cov_new[(numpy.argmin(normal_cov_new, axis=0))[1], 0]

def old_find_optimal_smoothing_parameter(data_file,
                                     alpha_list=None,
                                     mesh_file=None,
                                     boundary_poly=None,
                                     mesh_resolution=100000,
                                     north_boundary=None,
                                     south_boundary=None,
                                     east_boundary=None,
                                     west_boundary=None,
                                     plot_name='all_alphas',
                                     split_factor=0.1,
                                     seed_num=None,
                                     cache=False,
                                     verbose=False):
    """
    Old version of find_optimal_smoothing_parameter (see above).

    data_file: must not contain points outside the boundaries defined
           and must be either a pts, txt or csv file.

    alpha_list: the alpha values to test, in a single list

    mesh_file: name of the mesh file to create, or, if a file of that name
            is passed in, the mesh file to read.
            NOTE: if a mesh file is given, mesh_resolution,
            north_boundary, south_boundary etc. will be ignored.

    mesh_resolution: the maximum area for a mesh triangle

    north_boundary ... west_boundary: the values of the boundaries

    plot_name: the name for the plot containing the results

    seed_num: the seed for the random number generator

    USAGE:
        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
                                             alpha_list=[0.0001, 0.01, 1],
                                             mesh_file=None,
                                             mesh_resolution=3,
                                             north_boundary=5,
                                             south_boundary=-5,
                                             east_boundary=5,
                                             west_boundary=-5,
                                             plot_name='all_alphas',
                                             seed_num=100000,
                                             verbose=False)

    OUTPUT: returns the minimum normalised covariance calculated AND the
           alpha that created it, and writes a plot of the results.

    NOTE: the code will not work if the data_file extent is greater than the
    boundary_polygon or the north_boundary ... west_boundary.
    """

    from anuga.shallow_water import Domain
    from anuga.geospatial_data.geospatial_data import Geospatial_data
    from anuga.pmesh.mesh_interface import create_mesh_from_regions
    from anuga.utilities.numerical_tools import cov
    from anuga.utilities.polygon import is_inside_polygon
    from anuga.fit_interpolate.benchmark_least_squares import mem_usage

    attribute_smoothed = 'elevation'

    if mesh_file is None:
        mesh_file = 'temp.msh'

        if north_boundary is None or south_boundary is None or \
           east_boundary is None or west_boundary is None:
            msg = 'All boundaries must be defined'
            raise ANUGAError(msg)

        poly_topo = [[east_boundary, south_boundary],
                     [east_boundary, north_boundary],
                     [west_boundary, north_boundary],
                     [west_boundary, south_boundary]]

        create_mesh_from_regions(poly_topo,
                                 boundary_tags={'back': [2],
                                                'side': [1, 3],
                                                'ocean': [0]},
                                 maximum_triangle_area=mesh_resolution,
                                 filename=mesh_file,
                                 use_cache=cache,
                                 verbose=verbose)

    else: # A mesh file was provided; check that it exists
        if access(mesh_file, F_OK) == 0:
            msg = "file %s doesn't exist!" % mesh_file
            raise IOError, msg

    # Split topo data
    G = Geospatial_data(file_name=data_file)
    if verbose: print 'Start split'
    G_small, G_other = G.split(split_factor, seed_num, verbose=verbose)
    if verbose: print 'Finish split'
    points = G_small.get_data_points()

    #FIXME: Remove points outside boundary polygon
#    print 'new point', len(points)
#
#    new_points = []
#    new_points = numpy.array([], dtype=numpy.float)
#    new_points = numpy.resize(new_points, (len(points), 2))
#    print "BOUNDARY", boundary_poly
#    for i, point in enumerate(points):
#        if is_inside_polygon(point, boundary_poly, verbose=True):
#            new_points[i] = point
#            print "WOW", i, new_points[i]
#    points = new_points

    if verbose: print "Number of points in sample to compare: ", len(points)

    if alpha_list is None:
        alphas = [0.001, 0.01, 100]
        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,
        #          0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
    else:
        alphas = alpha_list
    domains = {}

    if verbose: print 'Setup computational domains with different alphas'

    print 'memory usage before domains', mem_usage()

    for alpha in alphas:
        # Add G_other data to domains with different alphas
        if verbose: print '\n Calculating domain and mesh for Alpha = ', alpha, '\n'
        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
        if verbose: print domain.statistics()
        domain.set_quantity(attribute_smoothed,
                            geospatial_data=G_other,
                            use_cache=cache,
                            verbose=verbose,
                            alpha=alpha)
        domains[alpha] = domain

    print 'memory usage after domains', mem_usage()

    # Create array where columns 0 and 1 are x, y; column 2 is the sampled
    # elevation; columns 3 onwards hold the elevation predicted for each
    # alpha, to be compared later against the withheld sample data
    data = numpy.array([], dtype=numpy.float)
    data = numpy.resize(data, (len(points), 3 + len(alphas)))

    # Get relative points from sample
    data[:, 0] = points[:, 0]
    data[:, 1] = points[:, 1]
    elevation_sample = G_small.get_attributes(attribute_name=attribute_smoothed)
    data[:, 2] = elevation_sample

    # Rows of normal_cov will hold (alpha, normalised covariance) pairs
    normal_cov = numpy.array(numpy.zeros([len(alphas), 2]), dtype=numpy.float)

    if verbose: print 'Determine difference between predicted results and actual data'
    for i, alpha in enumerate(domains):
        if verbose: print 'Alpha =', alpha

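        # Convert the sample points into coordinates relative to the
        # domain's geo_reference so get_values can interpolate at them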
        points_geo = domains[alpha].geo_reference.change_points_geo_ref(points)

        # Get the predicted elevation of the points that were 'split' out
        # of the original data set, for this particular alpha
        elevation_predicted = domains[alpha].quantities[attribute_smoothed].\
                              get_values(interpolation_points=points_geo)

        # Add predicted elevation to the array that starts with x, y, z...
        data[:, i+3] = elevation_predicted

        sample_cov = cov(elevation_sample)
        ele_cov = cov(elevation_sample - elevation_predicted)
        normal_cov[i, :] = [alpha, ele_cov / sample_cov]
        print 'memory usage during compare', mem_usage()

        if verbose: print 'cov', normal_cov[i][0], '= ', normal_cov[i][1]

    # Sort the rows of normal_cov by alpha
    normal_cov0 = normal_cov[:, 0]
    normal_cov_new = numpy.take(normal_cov, numpy.argsort(normal_cov0), axis=0)

    if plot_name is not None:
        from pylab import savefig, semilogx, loglog
        semilogx(normal_cov_new[:, 0], normal_cov_new[:, 1])
        loglog(normal_cov_new[:, 0], normal_cov_new[:, 1])
        savefig(plot_name, dpi=300)
    if mesh_file == 'temp.msh':
        remove(mesh_file)

    # Return the minimum normalised covariance and the optimal alpha
    return min(normal_cov_new[:, 1]), normal_cov_new[(numpy.argmin(normal_cov_new, axis=0))[1], 0]


if __name__ == "__main__":
    pass