source: anuga_core/source_numpy_conversion/anuga/geospatial_data/geospatial_data.py @ 5945

Last change on this file since 5945 was 5945, checked in by rwilson, 15 years ago

More NumPy? changes.

File size: 67.7 KB
Line 
1"""Class Geospatial_data - Manipulation of locations on the planet and
2associated attributes.
3
4"""
5from sys import maxint
6from os import access, F_OK, R_OK,remove
7from types import DictType
8from warnings import warn
9from string import lower
10import numpy
11import numpy.random
12#from Array import tolist
13from copy import deepcopy
14
15
16from Scientific.IO.NetCDF import NetCDFFile
17from anuga.config import Float
18from anuga.coordinate_transforms.lat_long_UTM_conversion import UTMtoLL   
19from anuga.utilities.numerical_tools import ensure_numeric
20from anuga.coordinate_transforms.geo_reference import Geo_reference, \
21     TitleError, DEFAULT_ZONE, ensure_geo_reference, write_NetCDF_georeference
22from anuga.coordinate_transforms.redfearn import convert_from_latlon_to_utm
23from anuga.utilities.anuga_exceptions import ANUGAError
24from anuga.config import points_file_block_line_size as MAX_READ_LINES 
25#from anuga.fit_interpolate.benchmark_least_squares import mem_usage
26 
27
28DEFAULT_ATTRIBUTE = 'elevation'
29
30class Geospatial_data:
31
32    def __init__(self,
33                 data_points=None, # this can also be a points file name
34                 attributes=None,
35                 geo_reference=None,
36                 default_attribute_name=None,
37                 file_name=None,
38                 latitudes=None,
39                 longitudes=None,
40                 points_are_lats_longs=False,
41                 max_read_lines=None,
42                 load_file_now=True,
43                 verbose=False):
44
45       
46        """
47        Create instance from data points and associated attributes
48
49        data_points: x,y coordinates in meters. Type must be either a
50        sequence of 2-tuples or an Mx2 Numeric array of floats.  A file name
51        with extension .txt, .cvs or .pts can also be passed in here.
52
53        attributes: Associated values for each data point. The type
54        must be either a list or an array of length M or a dictionary
55        of lists (or arrays) of length M. In the latter case the keys
56        in the dictionary represent the attribute names, in the former
57        the attribute will get the default name "elevation".
58       
59        geo_reference: Object representing the origin of the data
60        points. It contains UTM zone, easting and northing and data
61        points are assumed to be relative to this origin.
62        If geo_reference is None, the default geo ref object is used.
63
64        default_attribute_name: Name of default attribute to be used with
65        get_attribute_values. The idea is that the dataset can be
66        equipped with information about which attribute to return.
67        If None, the default is the "first"
68
69        latitudes, longitudes: Vectors of latitudes and longitudes,
70        used to specify location instead of points.
71       
72        points_are_lats_longs: Set this as true if the points are actually
73        lats and longs, not UTM
74
75        max_read_lines: The number of rows read into memory when using
76        blocking to read a file.
77
78        load_file_now:  If true the file is automatically loaded
79        into the geospatial instance. Used when blocking.
80       
81        file_name: Name of input netCDF file or .txt file. netCDF file must
82        have dimensions "points" etc.
83        .txt file is a comma seperated file with x, y and attribute
84        data.
85       
86        The first line has the titles of the columns.  The first two
87        column titles are checked to see if they start with lat or
88        long (not case sensitive).  If so the data is assumed to be
89        latitude and longitude, in decimal format and converted to
90        UTM.  Otherwise the first two columns are assumed to be the x
91        and y, and the title names acually used are ignored.
92 
93       
94        The format for a .txt file is:
95            1st line:     [column names]
96            other lines:  x y [attributes]
97
98            for example:
99            x, y, elevation, friction
100            0.6, 0.7, 4.9, 0.3
101            1.9, 2.8, 5, 0.3
102            2.7, 2.4, 5.2, 0.3
103
104        The first two columns have to be x, y or lat, long
105        coordinates.
106     
107       
108        The format for a Points dictionary is:
109
110          ['pointlist'] a 2 column array describing points. 1st column x,
111          2nd column y.
112          ['attributelist'], a dictionary of 1D arrays, representing
113          attribute values at the point.  The dictionary key is the attribute
114          header.
115          ['geo_reference'] a Geo_refernece object. Use if the point
116          information is relative. This is optional.
117            eg
118            dic['pointlist'] = [[1.0,2.0],[3.0,5.0]]
119            dic['attributelist']['elevation'] = [[7.0,5.0]
120               
121        verbose:
122
123         
124        """
125
126        if isinstance(data_points, basestring):
127            # assume data point is really a file name
128            file_name = data_points
129
130        self.set_verbose(verbose)
131        self.geo_reference=None #create the attribute
132        self.file_name = file_name
133       
134        if max_read_lines is None:
135            self.max_read_lines = MAX_READ_LINES
136        else:
137            self.max_read_lines = max_read_lines
138
139        if file_name is None:
140            if latitudes is not None or longitudes is not None or \
141                   points_are_lats_longs:
142                data_points, geo_reference =  \
143                             _set_using_lat_long(latitudes=latitudes,
144                                  longitudes=longitudes,
145                                  geo_reference=geo_reference,
146                                  data_points=data_points,
147                                  points_are_lats_longs=points_are_lats_longs)
148            self.check_data_points(data_points)
149            self.set_attributes(attributes)
150            self.set_geo_reference(geo_reference)
151            self.set_default_attribute_name(default_attribute_name)
152
153        elif load_file_now is True:
154            # watch for case where file name and points,
155            # attributes etc are provided!!
156            # if file name then all provided info will be removed!
157
158            if verbose is True:
159                if file_name is not None:
160                    print 'Loading Geospatial data from file: %s' %file_name
161           
162            self.import_points_file(file_name, verbose=verbose)
163               
164            self.check_data_points(self.data_points)
165            self.set_attributes(self.attributes) 
166            self.set_geo_reference(self.geo_reference)
167            self.set_default_attribute_name(default_attribute_name)
168
169        if verbose is True:
170            if file_name is not None:
171                print 'Geospatial data created from file: %s' %file_name
172                if load_file_now is False:
173                    print 'Data will be loaded blockwise on demand'
174
175                    if file_name.endswith('csv') or file_name.endswith('txt'):
176                        pass
177                        # This message was misleading.
178                        # FIXME (Ole): Are we blocking here or not?
179                        #print 'ASCII formats are not that great for '
180                        #print 'blockwise reading. Consider storing this'
181                        #print 'data as a pts NetCDF format'
182
183    def __len__(self):
184        return len(self.data_points)
185
186    def __repr__(self):
187        return str(self.get_data_points(absolute=True))
188   
189   
190    def check_data_points(self, data_points):
191        """Checks data points
192        """
193       
194        if data_points is None:
195            self.data_points = None
196            msg = 'There is no data or file provided!'
197            raise ValueError, msg
198           
199        else:
200            self.data_points = ensure_numeric(data_points)
201            #print "self.data_points.shape",self.data_points.shape
202            if not (0,) == self.data_points.shape:
203                assert len(self.data_points.shape) == 2
204                assert self.data_points.shape[1] == 2
205
206    def set_attributes(self, attributes):
207        """Check and assign attributes dictionary
208        """
209       
210        if attributes is None:
211            self.attributes = None
212            return
213       
214        if not isinstance(attributes, DictType):
215            #Convert single attribute into dictionary
216            attributes = {DEFAULT_ATTRIBUTE: attributes}
217
218        #Check input attributes   
219        for key in attributes.keys():
220            try:
221                attributes[key] = ensure_numeric(attributes[key])
222            except:
223                msg = 'Attribute %s could not be converted' %key
224                msg += 'to a numeric vector'
225                raise msg
226
227        self.attributes = attributes   
228
229    #def set_geo_reference(self, geo_reference):
230    #    # FIXME (Ole): Backwards compatibility - deprecate
231    #    self.setgeo_reference(geo_reference)
232
233    def set_geo_reference(self, geo_reference):
234        """Set the georeference of geospatial data.
235       
236        It can also be used to change the georeference and will ensure that
237        the absolute coordinate values are unchanged.
238        """
239        from anuga.coordinate_transforms.geo_reference import Geo_reference
240
241        if geo_reference is None:
242            # Use default - points are in absolute coordinates
243            geo_reference = Geo_reference()
244           
245        # Allow for tuple (zone, xllcorner, yllcorner)   
246        geo_reference = ensure_geo_reference(geo_reference)
247       
248        if not isinstance(geo_reference, Geo_reference):
249            # FIXME (Ole): This exception will be raised even
250            # if geo_reference is None. Is that the intent Duncan?
251            msg = 'Argument geo_reference must be a valid Geo_reference \n'
252            msg += 'object or None.'
253            raise msg
254
255        # If a geo_reference already exists, change the point data according to
256        # the new geo reference
257        if  self.geo_reference is not None:
258            self.data_points = self.get_data_points(geo_reference=geo_reference)
259           
260        self.geo_reference = geo_reference
261
262
263    def set_default_attribute_name(self, default_attribute_name):
264        self.default_attribute_name = default_attribute_name
265
266    def set_verbose(self, verbose=False):
267        if verbose in [False, True]:
268            self.verbose = verbose
269        else:
270            msg = 'Illegal value: %s' %str(verbose)
271            raise Exception, msg
272
273    def clip(self, polygon, closed=True, verbose=False):
274        """Clip geospatial data by a polygon
275
276        Input
277          polygon - Either a list of points, an Nx2 array or
278                    a Geospatial data object.
279          closed - (optional) determine whether points on boundary should be
280          regarded as belonging to the polygon (closed = True)
281          or not (closed = False). Default is True.
282           
283        Output
284          New geospatial data object representing points inside
285          specified polygon.
286       
287       
288        Note - this method is non-destructive and leaves the data in 'self'
289        unchanged
290        """
291
292        from anuga.utilities.polygon import inside_polygon
293
294        if isinstance(polygon, Geospatial_data):
295            # Polygon is an object - extract points
296            polygon = polygon.get_data_points()
297
298        points = self.get_data_points()   
299#        if verbose: print '%s points:%s' %(verbose,points)
300        inside_indices = inside_polygon(points, polygon, closed, verbose)
301
302        clipped_G = self.get_sample(inside_indices)
303
304#        clipped_points = take(points, inside_indices)
305
306        # Clip all attributes
307#        attributes = self.get_all_attributes()
308
309#        clipped_attributes = {}
310#        if attributes is not None:
311#            for key, att in attributes.items():
312#                clipped_attributes[key] = take(att, inside_indices)
313
314#        return Geospatial_data(clipped_points, clipped_attributes)
315        return clipped_G
316       
317       
318    def clip_outside(self, polygon, closed=True,verbose=False):
319        """Clip geospatial date by a polygon, keeping data OUTSIDE of polygon
320
321        Input
322          polygon - Either a list of points, an Nx2 array or
323                    a Geospatial data object.
324          closed - (optional) determine whether points on boundary should be
325          regarded as belonging to the polygon (closed = True)
326          or not (closed = False). Default is True.
327         
328        Output
329          Geospatial data object representing point OUTSIDE specified polygon
330       
331        """
332
333        from anuga.utilities.polygon import outside_polygon
334
335        if isinstance(polygon, Geospatial_data):
336            # Polygon is an object - extract points
337            polygon = polygon.get_data_points()
338
339        points = self.get_data_points()   
340        outside_indices = outside_polygon(points, polygon, closed,verbose)
341
342        clipped_G = self.get_sample(outside_indices)
343
344#        clipped_points = take(points, outside_indices)
345
346        # Clip all attributes
347#        attributes = self.get_all_attributes()
348
349#        clipped_attributes = {}
350#        if attributes is not None:
351#            for key, att in attributes.items():
352#                clipped_attributes[key] = take(att, outside_indices)
353
354#        return Geospatial_data(clipped_points, clipped_attributes)
355        return clipped_G
356
357   
358    def get_geo_reference(self):
359        return self.geo_reference
360       
361    def get_data_points(self, absolute=True, geo_reference=None,
362                        as_lat_long=False, isSouthHemisphere=True):
363        """Get coordinates for all data points as an Nx2 array
364
365        If absolute is False returned coordinates are relative to the
366        internal georeference's xll and yll corners, otherwise
367        absolute UTM coordinates are returned.
368
369        If a geo_reference is passed the points are returned relative
370        to that geo_reference.
371       
372        isSH (isSouthHemisphere) is only used when getting data
373        points "as_lat_long" is True and if FALSE will return lats and
374        longs valid for the Northern Hemisphere.
375
376        Default: absolute is True.
377        """
378        if as_lat_long is True:
379            msg = "Points need a zone to be converted into lats and longs"
380            assert self.geo_reference is not None, msg
381            zone = self.geo_reference.get_zone()
382            assert self.geo_reference.get_zone() is not DEFAULT_ZONE, msg
383            lats_longs = []
384            for point in self.get_data_points(True):
385           
386                # UTMtoLL(northing, easting, zone,
387                lat_calced, long_calced = UTMtoLL(point[1],point[0], 
388                                                  zone, isSouthHemisphere)
389                lats_longs.append((lat_calced, long_calced)) # to hash
390            return lats_longs
391           
392        if absolute is True and geo_reference is None:
393            return self.geo_reference.get_absolute(self.data_points)
394        elif geo_reference is not None:
395            return geo_reference.change_points_geo_ref(self.data_points, 
396                                                       self.geo_reference)
397        else:
398            # If absolute is False
399            return self.data_points
400
401   
402    def get_attributes(self, attribute_name=None):
403        """Return values for one named attribute.
404
405        If attribute_name is None, default_attribute_name is used
406        """
407
408        if attribute_name is None:
409            if self.default_attribute_name is not None:
410                attribute_name = self.default_attribute_name
411            else:
412                attribute_name = self.attributes.keys()[0] 
413                # above line takes the first one from keys
414       
415        if self.verbose is True:
416            print 'Using attribute %s' %attribute_name
417            print 'Available attributes: %s' %(self.attributes.keys())       
418
419        msg = 'Attribute name %s does not exist in data set' %attribute_name
420        assert self.attributes.has_key(attribute_name), msg
421
422        return self.attributes[attribute_name]
423
424    def get_all_attributes(self):
425        """
426        Return values for all attributes.
427        The return value is either None or a dictionary (possibly empty).
428        """
429
430        return self.attributes
431
432    def __add__(self, other):
433        """
434        Returns the addition of 2 geospatical objects,
435        objects are concatencated to the end of each other
436           
437        NOTE: doesn't add if objects contain different
438        attributes 
439       
440        Always return absolute points!
441        This alse means, that if you add None to the object,
442        it will be turned into absolute coordinates
443
444        other can be None in which case nothing is added to self.
445        """
446
447
448        # find objects zone and checks if the same
449        geo_ref1 = self.get_geo_reference()
450        zone1 = geo_ref1.get_zone()
451
452
453        if other is not None:
454
455            geo_ref2 = other.get_geo_reference()
456            zone2 = geo_ref2.get_zone()
457
458            geo_ref1.reconcile_zones(geo_ref2)
459
460       
461            new_points = numpy.concatenate((self.get_data_points(absolute=True),
462                                      other.get_data_points(absolute=True)),
463                                      axis = 0)       
464
465       
466     
467            # Concatenate attributes if any
468            if self.attributes is None:
469                if other.attributes is not None:
470                    msg = 'Geospatial data must have the same \n'
471                    msg += 'attributes to allow addition.'
472                    raise Exception, msg
473               
474                new_attributes = None
475            else:   
476                new_attributes = {}
477                for x in self.attributes.keys():
478                    if other.attributes.has_key(x):
479
480                        attrib1 = self.attributes[x]
481                        attrib2 = other.attributes[x]
482                        new_attributes[x] = numpy.concatenate((attrib1, attrib2))
483
484                    else:
485                        msg = 'Geospatial data must have the same \n'
486                        msg += 'attributes to allow addition.'
487                        raise Exception, msg
488
489
490        else:
491            #other is None:
492           
493            new_points = self.get_data_points(absolute=True)
494            new_attributes = self.attributes
495
496                   
497
498        # Instantiate new data object and return absolute coordinates
499        new_geo_ref = Geo_reference(geo_ref1.get_zone(), 0.0, 0.0)
500        return Geospatial_data(new_points,
501                               new_attributes,
502                               new_geo_ref)
503
504
505    def __radd__(self, other):
506        """Handle cases like None + Geospatial_data(...)
507        """
508
509        return self + other
510
511   
512   
513    ###
514    #  IMPORT/EXPORT POINTS FILES
515    ###
516
517    def import_points_file(self, file_name, delimiter=None, verbose=False):
518        """ load an .txt, .csv or .pts file
519        Note: will throw an IOError/SyntaxError if it can't load the file.
520        Catch these!
521
522        Post condition: self.attributes dictionary has been set
523        """
524       
525        if access(file_name, F_OK) == 0 :
526            msg = 'File %s does not exist or is not accessible' %file_name
527            raise IOError, msg
528       
529        attributes = {}
530        if file_name[-4:]== ".pts":
531            try:
532                data_points, attributes, geo_reference =\
533                             _read_pts_file(file_name, verbose)
534            except IOError, e:   
535                msg = 'Could not open file %s ' %file_name
536                raise IOError, msg 
537       
538        elif file_name[-4:]== ".txt" or file_name[-4:]== ".csv":
539            try:
540                data_points, attributes, geo_reference =\
541                             _read_csv_file(file_name, verbose)
542            except IOError, e:
543                # This should only be if a file is not found
544                msg = 'Could not open file %s. ' %file_name
545                msg += 'Check the file location.'
546                raise IOError, msg
547            except SyntaxError, e:
548                # This should only be if there is a format error
549                msg = 'Could not open file %s. \n' %file_name
550                msg += Error_message['IOError']
551                #print "msg", msg
552                raise SyntaxError, msg
553        else:     
554            msg = 'Extension %s is unknown' %file_name[-4:]
555            raise IOError, msg
556#        print'in import data_points', data_points
557#        print'in import attributes', attributes
558#        print'in import data_points', geo_reference
559        self.data_points = data_points
560        self.attributes = attributes
561        self.geo_reference = geo_reference
562   
563#        return all_data
564   
565    def export_points_file(self, file_name, absolute=True, 
566                           as_lat_long=False, isSouthHemisphere=True):
567       
568        """
569        write a points file, file_name, as a text (.csv) or binary (.pts) file
570        file_name is the file name, including the extension
571        The point_dict is defined at the top of this file.
572       
573        If absolute is True data the xll and yll are added to the points value
574        and the xll and yll of the geo_reference are set to 0.
575       
576        If absolute is False data points at returned as relative to the xll
577        and yll and geo_reference remains uneffected
578       
579        isSouthHemisphere: is only used when getting data
580        points "as_lat_long" is True and if FALSE will return lats and
581        longs valid for the Northern Hemisphere.
582       
583        """
584
585        if (file_name[-4:] == ".pts"):
586            if absolute is True:
587                geo_ref = deepcopy(self.geo_reference)
588                geo_ref.xllcorner = 0
589                geo_ref.yllcorner = 0
590                _write_pts_file(file_name,
591                                self.get_data_points(absolute), 
592                                self.get_all_attributes(),
593                                geo_ref)
594            else:
595                _write_pts_file(file_name,
596                                self.get_data_points(absolute), 
597                                self.get_all_attributes(),
598                                self.get_geo_reference())
599               
600        elif file_name[-4:] == ".txt" or file_name[-4:] == ".csv":
601            msg = "ERROR: trying to write a .txt file with relative data."
602            assert absolute, msg
603            _write_csv_file(file_name,
604                            self.get_data_points(absolute=True,
605                                                 as_lat_long=as_lat_long,
606                                  isSouthHemisphere=isSouthHemisphere), 
607                            self.get_all_attributes(),
608                            as_lat_long=as_lat_long)
609             
610        elif file_name[-4:] == ".urs" :
611            msg = "ERROR: Can not write a .urs file as a relative file."
612            assert absolute, msg
613            _write_urs_file(file_name,
614                            self.get_data_points(as_lat_long=True,
615                                  isSouthHemisphere=isSouthHemisphere))
616                                                         
617        else:
618            msg = 'Unknown file type %s ' %file_name
619            raise IOError, msg
620       
621    def get_sample(self, indices):
622        """ Returns a object which is a subset of the original
623        and the data points and attributes in this new object refer to
624        the indices provided
625       
626        Input
627            indices- a list of integers that represent the new object
628        Output
629            New geospatial data object representing points specified by
630            the indices
631            """
632        #FIXME: add the geo_reference to this
633#        print 'hello from get_sample'
634        points = self.get_data_points()
635        sampled_points = numpy.take(points, indices)        ##FIX?
636
637        attributes = self.get_all_attributes()
638
639        sampled_attributes = {}
640        if attributes is not None:
641            for key, att in attributes.items():
642                sampled_attributes[key] = numpy.take(att, indices)      ##FIX?
643
644#        print 'goodbye from get_sample'
645
646        return Geospatial_data(sampled_points, sampled_attributes)
647   
648   
649    def split(self, factor=0.5,seed_num=None, verbose=False):
650        """Returns two       
651        geospatial_data object, first is the size of the 'factor'
652        smaller the original and the second is the remainder. The two
653        new object are disjoin set of each other.
654       
655        Points of the two new object have selected RANDOMLY.
656       
657        This method create two lists of indices which are passed into
658        get_sample.  The lists are created using random numbers, and
659        they are unique sets eg.  total_list(1,2,3,4,5,6,7,8,9)
660        random_list(1,3,6,7,9) and remainder_list(0,2,4,5,8)
661               
662        Input - the factor which to split the object, if 0.1 then 10% of the
663            together object will be returned
664       
665        Output - two geospatial_data objects that are disjoint sets of the
666            original
667            """
668       
669        i=0
670        self_size = len(self)
671        random_list = []
672        remainder_list = []
673        new_size = round(factor*self_size)
674       
675        # Find unique random numbers
676        if verbose: print "make unique random number list and get indices"
677
678        total=numpy.array(range(self_size))
679        total_list = total.tolist()
680        if verbose: print "total list len",len(total_list)
681               
682        # There will be repeated random numbers however will not be a
683        # problem as they are being 'pop'ed out of array so if there
684        # are two numbers the same they will pop different indicies,
685        # still basically random
686        ## create list of non-unquie random numbers
687        if verbose: print "create random numbers list %s long" %new_size
688       
689        # Set seed if provided, mainly important for unit test!
690        # plus recalcule seed when no seed provided.
691        if seed_num != None:
692            numpy.random.seed(seed_num)
693        else:
694            numpy.random.seed()
695        if verbose: print "seed:", numpy.random.get_seed()
696       
697        #print 'size',self_size, new_size
698        random_num = numpy.random.randint(0,self_size-1,(int(new_size),))
699        #print 'random num',random_num
700        random_num = random_num.tolist()
701
702        #need to sort and reverse so the pop() works correctly
703        random_num.sort()
704        random_num.reverse()       
705       
706        if verbose: print "make random number list and get indices"
707        j=0
708        k=1
709        remainder_list = total_list[:]
710        # pops array index (random_num) from remainder_list
711        # (which starts as the
712        # total_list and appends to random_list 
713        random_num_len = len(random_num)
714        for i in random_num:
715            random_list.append(remainder_list.pop(i))
716            j+=1
717            #prints progress
718            if verbose and round(random_num_len/10*k)==j:
719                print '(%s/%s)' %(j, random_num_len)
720                k+=1
721       
722        # FIXME: move to tests, it might take a long time
723        # then create an array of random lenght between 500 and 1000,
724        # and use a random factor between 0 and 1
725        # setup for assertion
726        test_total = random_list[:]
727        test_total.extend(remainder_list)
728        test_total.sort() 
729        msg = 'The two random lists made from the original list when added '\
730         'together DO NOT equal the original list'
731        assert (total_list==test_total),msg
732
733        # Get new samples
734        if verbose: print "get values of indices for random list"
735        G1 = self.get_sample(random_list)
736        if verbose: print "get values of indices for opposite of random list"
737        G2 = self.get_sample(remainder_list)
738
739        return G1, G2
740
741    def __iter__(self):
742        """Read in the header, number_of_points and save the
743        file pointer position
744        """
745        from Scientific.IO.NetCDF import NetCDFFile
746        #print "Starting to block"
747        #FIXME - what to do if the file isn't there
748
749        # FIXME (Ole): Shouldn't this go into the constructor?
750        # This method acts like the constructor when blcoking.
751        # ... and shouldn't it be called block_size?
752        #
753        if self.max_read_lines is None:
754            self.max_read_lines = MAX_READ_LINES
755        if self.file_name[-4:] == ".pts":
756           
757            # See if the file is there.  Throw a QUIET IO error if it isn't
758            fd = open(self.file_name,'r')
759            fd.close()
760   
761            # Throws prints to screen if file not present
762            self.fid = NetCDFFile(self.file_name, 'r')
763           
764            self.blocking_georef, self.blocking_keys, self.number_of_points =\
765                                  _read_pts_file_header(self.fid,
766                                                        self.verbose)
767            self.start_row = 0
768            self.last_row = self.number_of_points           
769            self.show_verbose = 0
770            self.verbose_block_size = (self.last_row + 10)/10
771            self.block_number = 0
772            self.number_of_blocks = self.number_of_points/self.max_read_lines
773            # This computes the number of full blocks. The last block may be
774            # smaller and won't be ircluded in this estimate.
775           
776            if self.verbose is True:
777                print 'Reading %d points (in ~%d blocks) from file %s. '\
778                      %(self.number_of_points,
779                        self.number_of_blocks,
780                        self.file_name),
781                print 'Each block consists of %d data points'\
782                      %self.max_read_lines
783           
784        else:
785            # Assume the file is a csv file
786            file_pointer = open(self.file_name)
787            self.header, self.file_pointer = \
788                         _read_csv_file_header(file_pointer)
789            self.blocking_georef = None # Used for reconciling zones
790
791        return self
792   
793   
794    def next(self):
795        """ read a block, instanciate a new geospatial and return it"""
796       
797        if self.file_name[-4:] == ".pts":
798            if self.start_row == self.last_row:
799                # Read the end of the file last iteration
800                # Remove blocking attributes
801                self.fid.close()
802                del self.max_read_lines
803                del self.blocking_georef
804                del self.last_row
805                del self.start_row
806                del self.blocking_keys
807                del self.fid
808                raise StopIteration
809            fin_row = self.start_row + self.max_read_lines
810            if fin_row > self.last_row:
811                fin_row = self.last_row
812
813               
814           
815            if self.verbose is True:
816                if self.show_verbose >= self.start_row and \
817                       self.show_verbose < fin_row:
818
819                   
820                    #print 'Doing %d of %d' %(self.start_row, self.last_row+10)
821
822                    print 'Reading block %d (points %d to %d) out of %d'\
823                          %(self.block_number,
824                            self.start_row,
825                            fin_row,
826                            self.number_of_blocks)
827
828                   
829                    self.show_verbose += max(self.max_read_lines,
830                                             self.verbose_block_size)
831
832                 
833            # Read next block
834            pointlist, att_dict, = \
835                       _read_pts_file_blocking(self.fid,
836                                               self.start_row,
837                                               fin_row,
838                                               self.blocking_keys)
839           
840            geo = Geospatial_data(pointlist, att_dict, self.blocking_georef)
841            self.start_row = fin_row
842           
843            self.block_number += 1           
844           
845        else:
846            # Assume the file is a csv file
847            try:
848                #print "self.max_read_lines", self.max_read_lines
849                pointlist, att_dict, geo_ref, self.file_pointer = \
850                   _read_csv_file_blocking( self.file_pointer,
851                                            self.header[:],
852                                            max_read_lines=self.max_read_lines,
853                                            verbose=self.verbose)
854
855                # Check that the zones haven't changed.
856                if geo_ref is not None:
857                    geo_ref.reconcile_zones(self.blocking_georef)
858                    self.blocking_georef = geo_ref
859                elif self.blocking_georef is not None:
860                   
861                    msg = 'Geo reference given, then not given.'
862                    msg += ' This should not happen.' 
863                    raise ValueError, msg
864                geo = Geospatial_data(pointlist, att_dict, geo_ref)
865            except StopIteration:
866                self.file_pointer.close()
867                del self.header
868                del self.file_pointer
869                raise StopIteration
870            except ANUGAError:
871                self.file_pointer.close()
872                del self.header
873                del self.file_pointer
874                raise
875            except SyntaxError:
876                self.file_pointer.close()
877                del self.header
878                del self.file_pointer
879                # This should only be if there is a format error
880                msg = 'Could not open file %s. \n' %self.file_name
881                msg += Error_message['IOError']
882                raise SyntaxError, msg
883        return geo
884
885   
886##################### Error messages ###########
887Error_message = {}
888Em = Error_message
889Em['IOError'] = "NOTE: The format for a comma separated .txt/.csv file is:\n"
890Em['IOError'] += "        1st line:     [column names]\n"
891Em['IOError'] += "        other lines:  [x value], [y value], [attributes]\n"
892Em['IOError'] += "\n"
893Em['IOError'] += "           for example:\n"
894Em['IOError'] += "           x, y, elevation, friction\n"
895Em['IOError'] += "           0.6, 0.7, 4.9, 0.3\n"
896Em['IOError'] += "           1.9, 2.8, 5, 0.3\n"
897Em['IOError'] += "           2.7, 2.4, 5.2, 0.3\n"
898Em['IOError'] += "\n"
899Em['IOError'] += "The first two columns are assumed to be x, y coordinates.\n"
900Em['IOError'] += "The attribute values must be numeric.\n"
901
902def _set_using_lat_long(latitudes,
903                        longitudes,
904                        geo_reference,
905                        data_points,
906                        points_are_lats_longs):
907    """
908    if the points has lat long info, assume it is in (lat, long) order.
909    """
910   
911    if geo_reference is not None:
912        msg = """A georeference is specified yet latitude and longitude
913        are also specified!"""
914        raise ValueError, msg
915   
916    if data_points is not None and not points_are_lats_longs:
917        msg = """Data points are specified yet latitude and
918        longitude are also specified."""
919        raise ValueError, msg
920   
921    if points_are_lats_longs:
922        if data_points is None:
923            msg = """Data points are not specified."""
924            raise ValueError, msg
925        lats_longs = ensure_numeric(data_points)
926        latitudes = numpy.ravel(lats_longs[:,0:1])
927        longitudes = numpy.ravel(lats_longs[:,1:])
928       
929    if latitudes is None and longitudes is None:
930        msg = """Latitudes and Longitudes are not specified."""
931        raise ValueError, msg
932   
933    if latitudes is None:
934        msg = """Longitudes are specified yet latitudes aren't."""
935        raise ValueError, msg
936   
937    if longitudes is None:
938        msg = """Latitudes are specified yet longitudes aren't."""
939        raise ValueError, msg
940   
941    data_points, zone  = convert_from_latlon_to_utm(latitudes=latitudes,
942                                                    longitudes=longitudes)
943    return data_points, Geo_reference(zone=zone)
944
945   
946def _read_pts_file(file_name, verbose=False):
947    """Read .pts NetCDF file
948   
949    Return a dic of array of points, and dic of array of attribute
950    eg
951    dic['points'] = [[1.0,2.0],[3.0,5.0]]
952    dic['attributelist']['elevation'] = [[7.0,5.0]
953    """   
954
955    from Scientific.IO.NetCDF import NetCDFFile
956   
957    if verbose: print 'Reading ', file_name
958   
959       
960    # See if the file is there.  Throw a QUIET IO error if it isn't
961    fd = open(file_name,'r')
962    fd.close()
963   
964    # Throws prints to screen if file not present
965    fid = NetCDFFile(file_name, 'r') 
966   
967    pointlist = numpy.array(fid.variables['points'])
968    keys = fid.variables.keys()
969    if verbose: print 'Got %d variables: %s' %(len(keys), keys)
970    try:
971        keys.remove('points')
972    except IOError, e:       
973        fid.close()   
974        msg = 'Expected keyword "points" but could not find it'
975        raise IOError, msg
976   
977    attributes = {}
978    for key in keys:
979        if verbose: print "reading attribute '%s'" %key
980           
981        attributes[key] = numpy.array(fid.variables[key])
982   
983   
984    try:
985        geo_reference = Geo_reference(NetCDFObject=fid)
986    except AttributeError, e:
987        geo_reference = None
988   
989    fid.close()
990   
991    return pointlist, attributes, geo_reference
992
993
994def _read_csv_file(file_name, verbose=False):
995    """Read .csv file
996   
997    Return a dic of array of points, and dic of array of attribute
998    eg
999    dic['points'] = [[1.0,2.0],[3.0,5.0]]
1000    dic['attributelist']['elevation'] = [[7.0,5.0]
1001    """
1002   
1003    file_pointer = open(file_name)
1004    header, file_pointer = _read_csv_file_header(file_pointer)
1005    try:
1006        pointlist, att_dict, geo_ref, file_pointer = \
1007                   _read_csv_file_blocking( \
1008                file_pointer,
1009                header,
1010                max_read_lines=1e30) #If the file is bigger that this, block..  # FIXME (Ole) What's up here?
1011               
1012    except ANUGAError:
1013        file_pointer.close()
1014        raise 
1015    file_pointer.close()
1016    return pointlist, att_dict, geo_ref   
1017
1018CSV_DELIMITER = ','
1019def _read_csv_file_header(file_pointer,
1020                          delimiter=CSV_DELIMITER,
1021                          verbose=False):
1022
1023    """Read the header of a .csv file
1024    Return a list of the header names
1025    """
1026    line = file_pointer.readline()
1027    header = clean_line(line, delimiter)
1028    return header, file_pointer
1029
1030def _read_csv_file_blocking(file_pointer, header,
1031                            delimiter=CSV_DELIMITER,
1032                            max_read_lines=MAX_READ_LINES,
1033                            verbose=False):
1034   
1035
1036    """
1037    Read the body of a .csv file.
1038    header: The list header of the csv file, with the x and y labels.
1039    """
1040    points = []
1041    pointattributes = []
1042    att_dict = {}
1043
1044    # This is to remove the x and y headers.
1045    header = header[:]
1046    try:
1047        x_header = header.pop(0)
1048        y_header = header.pop(0)
1049    except IndexError:
1050        # if there are not two columns this will occur.
1051        # eg if it is a space seperated file
1052        raise SyntaxError
1053   
1054    read_lines = 0
1055    while read_lines<max_read_lines:
1056        line = file_pointer.readline()
1057        #print "line",line
1058        numbers = clean_line(line,delimiter)
1059        if len(numbers) <= 1:
1060            break
1061        if line[0] == '#':
1062            continue
1063        read_lines += 1
1064        try:
1065            x = float(numbers[0])
1066            y = float(numbers[1])
1067            points.append([x,y])
1068            numbers.pop(0)
1069            numbers.pop(0)
1070            if len(header) != len(numbers):
1071                file_pointer.close() 
1072                msg = "File load error.  \
1073                There might be a problem with the file header"
1074                raise SyntaxError, msg
1075            for i,num in enumerate(numbers):
1076                num.strip()
1077                if num != '\n' and num != '':
1078                    #attributes.append(float(num))
1079                    att_dict.setdefault(header[i],[]).append(float(num))
1080        #except IOError:           
1081        except ValueError:
1082            raise SyntaxError
1083    if points == []:
1084        raise StopIteration
1085       
1086   
1087    pointlist = numpy.array(points).astype(numpy.float)
1088    for key in att_dict.keys():
1089        att_dict[key] = numpy.array(att_dict[key]).astype(numpy.float)
1090
1091    # Do stuff here so the info is in lat's and longs
1092    geo_ref = None
1093    x_header = lower(x_header[:3])
1094    y_header = lower(y_header[:3])
1095    if (x_header == 'lon' or  x_header == 'lat') and \
1096       (y_header == 'lon' or  y_header == 'lat'):
1097        if x_header == 'lon':
1098            longitudes = numpy.ravel(pointlist[:,0:1])
1099            latitudes = numpy.ravel(pointlist[:,1:])
1100        else:
1101            latitudes = numpy.ravel(pointlist[:,0:1])
1102            longitudes = numpy.ravel(pointlist[:,1:])
1103       
1104        pointlist, geo_ref = _set_using_lat_long(latitudes,
1105                                                 longitudes,
1106                                                 geo_reference=None,
1107                                                 data_points=None,
1108                                                 points_are_lats_longs=False)
1109    return pointlist, att_dict, geo_ref, file_pointer
1110
1111def _read_pts_file_header(fid, verbose=False):
1112
1113    """Read the geo_reference and number_of_points from a .pts file
1114    """
1115   
1116    keys = fid.variables.keys()
1117    try:
1118        keys.remove('points')
1119    except IOError, e:       
1120        fid.close()   
1121        msg = 'Expected keyword "points" but could not find it'
1122        raise IOError, msg
1123    if verbose: print 'Got %d variables: %s' %(len(keys), keys)
1124   
1125    try:
1126        geo_reference = Geo_reference(NetCDFObject=fid)
1127    except AttributeError, e:
1128        geo_reference = None
1129
1130    return geo_reference, keys, fid.dimensions['number_of_points']
1131
1132def _read_pts_file_blocking(fid, start_row, fin_row, keys):
1133                            #verbose=False):
1134   
1135
1136    """
1137    Read the body of a .csv file.
1138    header: The list header of the csv file, with the x and y labels.
1139    """
1140   
1141    pointlist = numpy.array(fid.variables['points'][start_row:fin_row])
1142   
1143    attributes = {}
1144    for key in keys:
1145        attributes[key] = numpy.array(fid.variables[key][start_row:fin_row])
1146
1147    return pointlist, attributes
1148   
1149def _write_pts_file(file_name,
1150                    write_data_points,
1151                    write_attributes=None, 
1152                    write_geo_reference=None):
1153    """
1154    Write .pts NetCDF file   
1155
1156    NOTE: Below might not be valid ask Duncan : NB 5/2006
1157   
1158    WARNING: This function mangles the point_atts data structure
1159    #F??ME: (DSG)This format has issues.
1160    # There can't be an attribute called points
1161    # consider format change
1162    # method changed by NB not sure if above statement is correct
1163   
1164    should create new test for this
1165    legal_keys = ['pointlist', 'attributelist', 'geo_reference']
1166    for key in point_atts.keys():
1167        msg = 'Key %s is illegal. Valid keys are %s' %(key, legal_keys)
1168        assert key in legal_keys, msg
1169    """   
1170    from Scientific.IO.NetCDF import NetCDFFile
1171    # NetCDF file definition
1172    outfile = NetCDFFile(file_name, 'w')
1173   
1174    # Create new file
1175    outfile.institution = 'Geoscience Australia'
1176    outfile.description = 'NetCDF format for compact and portable storage ' +\
1177                          'of spatial point data'
1178   
1179    # Dimension definitions
1180    shape = write_data_points.shape[0]
1181    outfile.createDimension('number_of_points', shape) 
1182    outfile.createDimension('number_of_dimensions', 2) #This is 2d data
1183   
1184    # Variable definition
1185    outfile.createVariable('points', Float, ('number_of_points',
1186                                             'number_of_dimensions'))
1187
1188    #create variables 
1189    outfile.variables['points'][:] = write_data_points #.astype(Float32)
1190
1191    if write_attributes is not None:
1192        for key in write_attributes.keys():
1193            outfile.createVariable(key, Float, ('number_of_points',))
1194            outfile.variables[key][:] = write_attributes[key] #.astype(Float32)
1195       
1196    if write_geo_reference is not None:
1197        write_NetCDF_georeference(write_geo_reference, outfile)
1198       
1199    outfile.close() 
1200 
1201def _write_csv_file(file_name,
1202                    write_data_points,
1203                    write_attributes=None,
1204                    as_lat_long=False,
1205                    delimiter=','):
1206    """
1207    export a file, file_name, with the csv format
1208   
1209    """
1210    points = write_data_points
1211    pointattributes = write_attributes
1212   
1213    fd = open(file_name,'w')
1214    if as_lat_long:
1215        titlelist = "latitude" + delimiter + "longitude"  + delimiter
1216    else:
1217        titlelist = "x" + delimiter + "y"  + delimiter
1218    if pointattributes is not None:   
1219        for title in pointattributes.keys():
1220            titlelist = titlelist + title + delimiter
1221        titlelist = titlelist[0:-len(delimiter)] # remove the last delimiter
1222    fd.write(titlelist+"\n")
1223   
1224    # <x/lat> <y/long> [attributes]
1225    for i, vert in enumerate( points):
1226
1227
1228        if pointattributes is not None:           
1229            attlist = ","
1230            for att in pointattributes.keys():
1231                attlist = attlist + str(pointattributes[att][i])+ delimiter
1232            attlist = attlist[0:-len(delimiter)] # remove the last delimiter
1233            attlist.strip()
1234        else:
1235            attlist = ''
1236
1237        fd.write(str(vert[0]) + delimiter +
1238                 str(vert[1]) + attlist + "\n")
1239
1240    fd.close()
1241 
1242def _write_urs_file(file_name,
1243                    points,
1244                    delimiter=' '):
1245    """
1246    export a file, file_name, with the urs format
1247    the data points are in lats and longs
1248   
1249    """   
1250    fd = open(file_name,'w')   
1251    fd.write(str(len(points))+"\n")   
1252    # <lat> <long> <id#>
1253    for i, vert in enumerate( points):
1254        fd.write(str(round(vert[0],7)) + delimiter + \
1255                 str(round(vert[1],7)) + delimiter +str(i)+ "\n")
1256    fd.close()
1257   
1258def _point_atts2array(point_atts):
1259    point_atts['pointlist'] = numpy.array(point_atts['pointlist']).astype(numpy.float)
1260   
1261    for key in point_atts['attributelist'].keys():
1262        point_atts['attributelist'][key]=\
1263                numpy.array(point_atts['attributelist'][key]).astype(numpy.float)
1264    return point_atts
1265
1266   
1267
1268
1269def geospatial_data2points_dictionary(geospatial_data):
1270    """Convert geospatial data to points_dictionary
1271    """
1272
1273    points_dictionary = {}
1274    points_dictionary['pointlist'] = geospatial_data.data_points
1275
1276    points_dictionary['attributelist'] = {}
1277
1278    for attribute_name in geospatial_data.attributes.keys():
1279        val = geospatial_data.attributes[attribute_name]
1280        points_dictionary['attributelist'][attribute_name] = val
1281
1282    points_dictionary['geo_reference'] = geospatial_data.geo_reference
1283
1284    return points_dictionary
1285
1286   
1287def points_dictionary2geospatial_data(points_dictionary):
1288    """Convert points_dictionary to geospatial data object
1289    """
1290
1291    msg = 'Points dictionary must have key pointlist' 
1292    assert points_dictionary.has_key('pointlist'), msg
1293
1294    msg = 'Points dictionary must have key attributelist'     
1295    assert points_dictionary.has_key('attributelist'), msg       
1296
1297    if points_dictionary.has_key('geo_reference'):
1298        geo = points_dictionary['geo_reference']
1299    else:
1300        geo = None
1301   
1302    return Geospatial_data(points_dictionary['pointlist'],
1303                           points_dictionary['attributelist'],
1304                           geo_reference = geo)
1305
1306def clean_line(line,delimiter):     
1307    """Remove whitespace
1308    """
1309    #print ">%s" %line
1310    line = line.strip()
1311    #print "stripped>%s" %line
1312    numbers = line.split(delimiter)
1313    i = len(numbers) - 1
1314    while i >= 0:
1315        if numbers[i] == '':
1316            numbers.pop(i)
1317        else:
1318            numbers[i] = numbers[i].strip()
1319       
1320        i += -1
1321    #for num in numbers:
1322    #    print "num>%s<" %num
1323    return numbers
1324           
1325def ensure_absolute(points, geo_reference=None):
1326    """Ensure that points are in absolute coordinates.
1327   
1328    This function inputs several formats and
1329    outputs one format. - a numeric array of absolute points.
1330
1331    Input formats are;
1332      points: List or numeric array of coordinate pairs [xi, eta] of
1333              points or geospatial object or points file name
1334
1335    mesh_origin: A geo_reference object or 3-tuples consisting of
1336                 UTM zone, easting and northing.
1337                 If specified vertex coordinates are assumed to be
1338                 relative to their respective origins.
1339    """
1340
1341    # Input check
1342    if isinstance(points, basestring):
1343        #It's a string - assume it is a point file
1344        points = Geospatial_data(file_name=points)
1345       
1346    if isinstance(points, Geospatial_data):
1347        points = points.get_data_points(absolute=True)
1348        msg = 'Use a Geospatial_data object or a mesh origin. Not both.'
1349        assert geo_reference == None, msg
1350    else:
1351        points = ensure_numeric(points, numpy.float)
1352       
1353    # Sort of geo_reference and convert points
1354    if geo_reference is None:
1355        geo = None #Geo_reference()
1356    else:
1357        if isinstance(geo_reference, Geo_reference):
1358            geo = geo_reference
1359        else:
1360            geo = Geo_reference(geo_reference[0],
1361                                geo_reference[1],
1362                                geo_reference[2])
1363        points = geo.get_absolute(points)
1364       
1365    # Return   
1366    return points
1367     
1368
1369def ensure_geospatial(points, geo_reference=None):
1370    """
1371    This function inputs several formats and
1372    outputs one format. - a geospatial_data instance.
1373
1374    Inputed formats are;
1375    points: List or numeric array of coordinate pairs [xi, eta] of
1376              points or geospatial object
1377
1378    mesh_origin: A geo_reference object or 3-tuples consisting of
1379                 UTM zone, easting and northing.
1380                 If specified vertex coordinates are assumed to be
1381                 relative to their respective origins.
1382    """
1383   
1384    # Input check
1385    if isinstance(points, Geospatial_data):
1386        msg = "Use a Geospatial_data object or a mesh origin. Not both."
1387        assert geo_reference is None, msg
1388        return points   
1389    else:
1390        # List or numeric array of absolute points
1391        points = ensure_numeric(points, numpy.float)
1392
1393    # Sort out geo reference   
1394    if geo_reference is None:
1395        geo = None
1396    else:
1397        if isinstance(geo_reference, Geo_reference):
1398            geo = geo_reference
1399        else:
1400            geo = Geo_reference(geo_reference[0],
1401                                geo_reference[1],
1402                                geo_reference[2])
1403                               
1404    # Create Geospatial_data object with appropriate geo reference and return                           
1405    points = Geospatial_data(data_points=points, geo_reference=geo)       
1406    return points
1407
1408
1409
1410def find_optimal_smoothing_parameter(data_file, 
1411                                     alpha_list=None,
1412                                     mesh_file=None,
1413                                     boundary_poly=None,
1414                                     mesh_resolution=100000,
1415                                     north_boundary=None,
1416                                     south_boundary=None,
1417                                     east_boundary=None,
1418                                     west_boundary=None,
1419                                     plot_name='all_alphas',
1420                                     split_factor=0.1,
1421                                     seed_num=None,
1422                                     cache=False,
1423                                     verbose=False
1424                                     ):
1425   
1426    """
1427    Removes a small random sample of points from 'data_file'. Then creates
1428    models with different alpha values from 'alpha_list' and cross validates
1429    the predicted value to the previously removed point data. Returns the
1430    alpha value which has the smallest covariance.
1431   
1432    data_file: must not contain points outside the boundaries defined
1433           and it either a pts, txt or csv file.
1434   
1435    alpha_list: the alpha values to test in a single list
1436   
1437    mesh_file: name of the created mesh file or if passed in will read it.
1438            NOTE, if there is a mesh file mesh_resolution,
1439            north_boundary, south... etc will be ignored.
1440   
1441    mesh_resolution: the maximum area size for a triangle
1442   
1443    north_boundary... west_boundary: the value of the boundary
1444   
1445    plot_name: the name for the plot contain the results
1446   
1447    seed_num: the seed to the random number generator
1448   
1449    USAGE:
1450        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1451                                             alpha_list=[0.0001, 0.01, 1],
1452                                             mesh_file=None,
1453                                             mesh_resolution=3,
1454                                             north_boundary=5,
1455                                             south_boundary=-5,
1456                                             east_boundary=5,
1457                                             west_boundary=-5,
1458                                             plot_name='all_alphas',
1459                                             seed_num=100000,
1460                                             verbose=False)
1461   
1462    OUTPUT: returns the minumum normalised covalance calculate AND the
1463           alpha that created it. PLUS writes a plot of the results
1464           
1465    NOTE: code will not work if the data_file extent is greater than the
1466    boundary_polygon or any of the boundaries, eg north_boundary...west_boundary
1467       
1468    """
1469
1470    from anuga.shallow_water import Domain
1471    from anuga.geospatial_data.geospatial_data import Geospatial_data
1472    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1473   
1474    from anuga.utilities.numerical_tools import cov
1475    from anuga.utilities.polygon import is_inside_polygon
1476    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1477   
1478
1479    attribute_smoothed='elevation'
1480
1481    if mesh_file is None:
1482        if verbose: print "building mesh"
1483        mesh_file='temp.msh'
1484
1485        if north_boundary is None or south_boundary is None or \
1486           east_boundary is None or west_boundary is None:
1487            no_boundary=True
1488        else:
1489            no_boundary=False
1490       
1491        if no_boundary is True:
1492            msg= 'All boundaries must be defined'
1493            raise msg
1494
1495        poly_topo = [[east_boundary,south_boundary],
1496                     [east_boundary,north_boundary],
1497                     [west_boundary,north_boundary],
1498                     [west_boundary,south_boundary]]
1499                     
1500        create_mesh_from_regions(poly_topo,
1501                                 boundary_tags={'back': [2],
1502                                                'side': [1,3],
1503                                                'ocean': [0]},
1504                             maximum_triangle_area=mesh_resolution,
1505                             filename=mesh_file,
1506                             use_cache=cache,
1507                             verbose=verbose)
1508
1509    else: # if mesh file provided
1510        #test mesh file exists?
1511        if verbose: "reading from file: %s" %mesh_file
1512        if access(mesh_file,F_OK) == 0:
1513            msg="file %s doesn't exist!" %mesh_file
1514            raise IOError, msg
1515
1516    #split topo data
1517    if verbose: print 'Reading elevation file: %s' %data_file
1518    G = Geospatial_data(file_name = data_file)
1519    if verbose: print 'Start split'
1520    G_small, G_other = G.split(split_factor,seed_num, verbose=verbose)
1521    if verbose: print 'Finish split'
1522    points=G_small.get_data_points()
1523
1524    if verbose: print "Number of points in sample to compare: ", len(points)
1525   
1526    if alpha_list==None:
1527        alphas = [0.001,0.01,100]
1528        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,\
1529         #         0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1530       
1531    else:
1532        alphas=alpha_list
1533
1534    #creates array with columns 1 and 2 are x, y. column 3 is elevation
1535    #4 onwards is the elevation_predicted using the alpha, which will
1536    #be compared later against the real removed data
1537    data=numpy.array([],typecode=numpy.float)
1538
1539    data=numpy.resize(data,(len(points),3+len(alphas)))
1540
1541    #gets relative point from sample
1542    data[:,0]=points[:,0]
1543    data[:,1]=points[:,1]
1544    elevation_sample=G_small.get_attributes(attribute_name=attribute_smoothed)
1545    data[:,2]=elevation_sample
1546
1547    normal_cov=numpy.array(numpy.zeros([len(alphas),2]),typecode=numpy.float)
1548
1549    if verbose: print 'Setup computational domains with different alphas'
1550
1551    #print 'memory usage before domains',mem_usage()
1552
1553    for i,alpha in enumerate(alphas):
1554        #add G_other data to domains with different alphas
1555        if verbose:print '\n Calculating domain and mesh for Alpha = ',alpha,'\n'
1556        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1557        if verbose:print domain.statistics()
1558        domain.set_quantity(attribute_smoothed, 
1559                    geospatial_data = G_other,
1560                    use_cache = cache,
1561                    verbose = verbose,
1562                    alpha = alpha)
1563
1564       
1565        # Convert points to geospatial data for use with get_values below
1566        points_geo = Geospatial_data(points, domain.geo_reference)
1567       
1568        #returns the predicted elevation of the points that were "split" out
1569        #of the original data set for one particular alpha
1570        if verbose: print 'Get predicted elevation for location to be compared'
1571        elevation_predicted=domain.quantities[attribute_smoothed].\
1572                            get_values(interpolation_points=points_geo)
1573 
1574        #add predicted elevation to array that starts with x, y, z...
1575        data[:,i+3]=elevation_predicted
1576
1577        sample_cov= cov(elevation_sample)
1578        #print elevation_predicted
1579        ele_cov= cov(elevation_sample-elevation_predicted)
1580        normal_cov[i,:]= [alpha,ele_cov/sample_cov]
1581        #print 'memory usage during compare',mem_usage()
1582       
1583        if verbose: print'Covariance for alpha ',normal_cov[i][0],'= ',normal_cov[i][1]
1584        if verbose: print'-------------------------------------------- \n'
1585
1586    normal_cov0=normal_cov[:,0]
1587    normal_cov_new=numpy.take(normal_cov,numpy.argsort(normal_cov0))        ##FIX?
1588
1589    if plot_name is not None:
1590        from pylab import savefig,semilogx,loglog
1591        semilogx(normal_cov_new[:,0],normal_cov_new[:,1])
1592        loglog(normal_cov_new[:,0],normal_cov_new[:,1])
1593        savefig(plot_name,dpi=300)
1594    if mesh_file == 'temp.msh':
1595        remove(mesh_file)
1596   
1597    if verbose: 
1598        print 'Final results:'
1599        for i,alpha in enumerate(alphas):
1600            print'covariance for alpha %s = %s ' %(normal_cov[i][0],normal_cov[i][1])
1601        print '\n Optimal alpha is: %s ' % normal_cov_new[(numpy.argmin(normal_cov_new,axis=0))[1],0]
1602
1603    # covariance and optimal alpha
1604    return min(normal_cov_new[:,1]) , normal_cov_new[(numpy.argmin(normal_cov_new,axis=0))[1],0]
1605
1606def old_find_optimal_smoothing_parameter(data_file, 
1607                                     alpha_list=None,
1608                                     mesh_file=None,
1609                                     boundary_poly=None,
1610                                     mesh_resolution=100000,
1611                                     north_boundary=None,
1612                                     south_boundary=None,
1613                                     east_boundary=None,
1614                                     west_boundary=None,
1615                                     plot_name='all_alphas',
1616                                     split_factor=0.1,
1617                                     seed_num=None,
1618                                     cache=False,
1619                                     verbose=False
1620                                     ):
1621   
1622    """
1623    data_file: must not contain points outside the boundaries defined
1624           and it either a pts, txt or csv file.
1625   
1626    alpha_list: the alpha values to test in a single list
1627   
1628    mesh_file: name of the created mesh file or if passed in will read it.
1629            NOTE, if there is a mesh file mesh_resolution,
1630            north_boundary, south... etc will be ignored.
1631   
1632    mesh_resolution: the maximum area size for a triangle
1633   
1634    north_boundary... west_boundary: the value of the boundary
1635   
1636    plot_name: the name for the plot contain the results
1637   
1638    seed_num: the seed to the random number generator
1639   
1640    USAGE:
1641        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1642                                             alpha_list=[0.0001, 0.01, 1],
1643                                             mesh_file=None,
1644                                             mesh_resolution=3,
1645                                             north_boundary=5,
1646                                             south_boundary=-5,
1647                                             east_boundary=5,
1648                                             west_boundary=-5,
1649                                             plot_name='all_alphas',
1650                                             seed_num=100000,
1651                                             verbose=False)
1652   
1653    OUTPUT: returns the minumum normalised covalance calculate AND the
1654           alpha that created it. PLUS writes a plot of the results
1655           
1656    NOTE: code will not work if the data_file extend is greater than the
1657    boundary_polygon or the north_boundary...west_boundary
1658       
1659    """
1660
1661    from anuga.shallow_water import Domain
1662    from anuga.geospatial_data.geospatial_data import Geospatial_data
1663    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1664   
1665    from anuga.utilities.numerical_tools import cov
1666    from anuga.utilities.polygon import is_inside_polygon
1667    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1668   
1669
1670    attribute_smoothed='elevation'
1671
1672    if mesh_file is None:
1673        mesh_file='temp.msh'
1674
1675        if north_boundary is None or south_boundary is None or \
1676           east_boundary is None or west_boundary is None:
1677            no_boundary=True
1678        else:
1679            no_boundary=False
1680       
1681        if no_boundary is True:
1682            msg= 'All boundaries must be defined'
1683            raise msg
1684
1685        poly_topo = [[east_boundary,south_boundary],
1686                     [east_boundary,north_boundary],
1687                     [west_boundary,north_boundary],
1688                     [west_boundary,south_boundary]]
1689                     
1690        create_mesh_from_regions(poly_topo,
1691                                 boundary_tags={'back': [2],
1692                                                'side': [1,3],
1693                                                'ocean': [0]},
1694                             maximum_triangle_area=mesh_resolution,
1695                             filename=mesh_file,
1696                             use_cache=cache,
1697                             verbose=verbose)
1698
1699    else: # if mesh file provided
1700        #test mesh file exists?
1701        if access(mesh_file,F_OK) == 0:
1702            msg="file %s doesn't exist!" %mesh_file
1703            raise IOError, msg
1704
1705    #split topo data
1706    G = Geospatial_data(file_name = data_file)
1707    if verbose: print 'start split'
1708    G_small, G_other = G.split(split_factor,seed_num, verbose=verbose)
1709    if verbose: print 'finish split'
1710    points=G_small.get_data_points()
1711
1712    #FIXME: Remove points outside boundary polygon
1713#    print 'new point',len(points)
1714#   
1715#    new_points=[]
1716#    new_points=array([],typecode=Float)
1717#    new_points=resize(new_points,(len(points),2))
1718#    print "BOUNDARY", boundary_poly
1719#    for i,point in enumerate(points):
1720#        if is_inside_polygon(point,boundary_poly, verbose=True):
1721#            new_points[i] = point
1722#            print"WOW",i,new_points[i]
1723#    points = new_points
1724
1725   
1726    if verbose: print "Number of points in sample to compare: ", len(points)
1727   
1728    if alpha_list==None:
1729        alphas = [0.001,0.01,100]
1730        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,\
1731         #         0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1732       
1733    else:
1734        alphas=alpha_list
1735    domains = {}
1736
1737    if verbose: print 'Setup computational domains with different alphas'
1738
1739    print 'memory usage before domains',mem_usage()
1740
1741    for alpha in alphas:
1742        #add G_other data to domains with different alphas
1743        if verbose:print '\n Calculating domain and mesh for Alpha = ',alpha,'\n'
1744        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1745        if verbose:print domain.statistics()
1746        domain.set_quantity(attribute_smoothed, 
1747                    geospatial_data = G_other,
1748                    use_cache = cache,
1749                    verbose = verbose,
1750                    alpha = alpha)
1751        domains[alpha]=domain
1752
1753    print 'memory usage after domains',mem_usage()
1754
1755    #creates array with columns 1 and 2 are x, y. column 3 is elevation
1756    #4 onwards is the elevation_predicted using the alpha, which will
1757    #be compared later against the real removed data
1758    data=numpy.array([],typecode=numpy.float)
1759
1760    data=numpy.resize(data,(len(points),3+len(alphas)))
1761
1762    #gets relative point from sample
1763    data[:,0]=points[:,0]
1764    data[:,1]=points[:,1]
1765    elevation_sample=G_small.get_attributes(attribute_name=attribute_smoothed)
1766    data[:,2]=elevation_sample
1767
1768    normal_cov=numpy.array(numpy.zeros([len(alphas),2]),typecode=numpy.float)
1769
1770    if verbose: print 'Determine difference between predicted results and actual data'
1771    for i,alpha in enumerate(domains):
1772        if verbose: print'Alpha =',alpha
1773       
1774        points_geo=domains[alpha].geo_reference.change_points_geo_ref(points)
1775        #returns the predicted elevation of the points that were "split" out
1776        #of the original data set for one particular alpha
1777        elevation_predicted=domains[alpha].quantities[attribute_smoothed].\
1778                            get_values(interpolation_points=points_geo)
1779 
1780        #add predicted elevation to array that starts with x, y, z...
1781        data[:,i+3]=elevation_predicted
1782
1783        sample_cov= cov(elevation_sample)
1784        #print elevation_predicted
1785        ele_cov= cov(elevation_sample-elevation_predicted)
1786        normal_cov[i,:]= [alpha,ele_cov/sample_cov]
1787        print 'memory usage during compare',mem_usage()
1788       
1789
1790        if verbose: print'cov',normal_cov[i][0],'= ',normal_cov[i][1]
1791
1792    normal_cov0=normal_cov[:,0]
1793    normal_cov_new=numpy.take(normal_cov,numpy.argsort(normal_cov0))        ##FIX?
1794
1795    if plot_name is not None:
1796        from pylab import savefig,semilogx,loglog
1797        semilogx(normal_cov_new[:,0],normal_cov_new[:,1])
1798        loglog(normal_cov_new[:,0],normal_cov_new[:,1])
1799        savefig(plot_name,dpi=300)
1800    if mesh_file == 'temp.msh':
1801        remove(mesh_file)
1802   
1803    return min(normal_cov_new[:,1]) , normal_cov_new[(numpy.argmin(normal_cov_new,axis=0))[1],0]
1804
1805         
1806if __name__ == "__main__":
1807    pass
1808   
Note: See TracBrowser for help on using the repository browser.