source: anuga_core/source/anuga/geospatial_data/geospatial_data.py @ 5571

Last change on this file since 5571 was 5421, checked in by ole, 16 years ago

Changed timestepping_statistics to output real time seconds as well
Also comments and formatting

File size: 65.0 KB
Line 
1"""Class Geospatial_data - Manipulation of locations on the planet and
2associated attributes.
3
4"""
5from sys import maxint
6from os import access, F_OK, R_OK,remove
7from types import DictType
8from warnings import warn
9from string import lower
10from Numeric import concatenate, array, Float, shape, reshape, ravel, take, \
11                        size, shape
12#from Array import tolist
13from RandomArray import randint, seed, get_seed
14from copy import deepcopy
15
16#from MA import tolist
17
18from Scientific.IO.NetCDF import NetCDFFile   
19from anuga.coordinate_transforms.lat_long_UTM_conversion import UTMtoLL   
20from anuga.utilities.numerical_tools import ensure_numeric
21from anuga.coordinate_transforms.geo_reference import Geo_reference, \
22     TitleError, DEFAULT_ZONE, ensure_geo_reference, write_NetCDF_georeference
23from anuga.coordinate_transforms.redfearn import convert_from_latlon_to_utm
24from anuga.utilities.anuga_exceptions import ANUGAError
25from anuga.config import points_file_block_line_size as MAX_READ_LINES 
26#from anuga.fit_interpolate.benchmark_least_squares import mem_usage
27 
28
29DEFAULT_ATTRIBUTE = 'elevation'
30
31class Geospatial_data:
32
    def __init__(self,
                 data_points=None, # this can also be a points file name
                 attributes=None,
                 geo_reference=None,
                 default_attribute_name=None,
                 file_name=None,
                 latitudes=None,
                 longitudes=None,
                 points_are_lats_longs=False,
                 max_read_lines=None,
                 load_file_now=True,
                 verbose=False):

       
        """
        Create instance from data points and associated attributes

        data_points: x,y coordinates in meters. Type must be either a
        sequence of 2-tuples or an Mx2 Numeric array of floats.  A file name
        with extension .txt, .csv or .pts can also be passed in here, in
        which case it is treated as file_name.

        attributes: Associated values for each data point. The type
        must be either a list or an array of length M or a dictionary
        of lists (or arrays) of length M. In the latter case the keys
        in the dictionary represent the attribute names, in the former
        the attribute will get the default name "elevation".
       
        geo_reference: Object representing the origin of the data
        points. It contains UTM zone, easting and northing and data
        points are assumed to be relative to this origin.
        If geo_reference is None, the default geo ref object is used.

        default_attribute_name: Name of default attribute to be used with
        get_attribute_values. The idea is that the dataset can be
        equipped with information about which attribute to return.
        If None, the default is the "first"

        latitudes, longitudes: Vectors of latitudes and longitudes,
        used to specify location instead of points.
       
        points_are_lats_longs: Set this as true if the points are actually
        lats and longs, not UTM

        max_read_lines: The number of rows read into memory when using
        blocking to read a file.

        load_file_now:  If true the file is automatically loaded
        into the geospatial instance. Used when blocking.
       
        file_name: Name of input netCDF file or .txt file. netCDF file must
        have dimensions "points" etc.
        .txt file is a comma separated file with x, y and attribute
        data.
       
        The first line has the titles of the columns.  The first two
        column titles are checked to see if they start with lat or
        long (not case sensitive).  If so the data is assumed to be
        latitude and longitude, in decimal format and converted to
        UTM.  Otherwise the first two columns are assumed to be the x
        and y, and the title names actually used are ignored.
 
       
        The format for a .txt file is:
            1st line:     [column names]
            other lines:  x y [attributes]

            for example:
            x, y, elevation, friction
            0.6, 0.7, 4.9, 0.3
            1.9, 2.8, 5, 0.3
            2.7, 2.4, 5.2, 0.3

        The first two columns have to be x, y or lat, long
        coordinates.
     
       
        The format for a Points dictionary is:

          ['pointlist'] a 2 column array describing points. 1st column x,
          2nd column y.
          ['attributelist'], a dictionary of 1D arrays, representing
          attribute values at the point.  The dictionary key is the attribute
          header.
          ['geo_reference'] a Geo_reference object. Use if the point
          information is relative. This is optional.
            eg
            dic['pointlist'] = [[1.0,2.0],[3.0,5.0]]
            dic['attributelist']['elevation'] = [[7.0,5.0]
               
        verbose: if True, print progress information.

         
        """

        # A string as data_points is taken to be a points file name
        if isinstance(data_points, basestring):
            # assume data point is really a file name
            file_name = data_points

        self.set_verbose(verbose)
        self.geo_reference=None #create the attribute (set properly below)
        self.file_name = file_name
       
        # Fall back to the configured block size when none is given
        if max_read_lines is None:
            self.max_read_lines = MAX_READ_LINES
        else:
            self.max_read_lines = max_read_lines

        if file_name is None:
            # In-memory construction; possibly converting lat/longs to UTM
            if latitudes is not None or longitudes is not None or \
                   points_are_lats_longs:
                data_points, geo_reference =  \
                             _set_using_lat_long(latitudes=latitudes,
                                  longitudes=longitudes,
                                  geo_reference=geo_reference,
                                  data_points=data_points,
                                  points_are_lats_longs=points_are_lats_longs)
            self.check_data_points(data_points)
            self.set_attributes(attributes)
            self.set_geo_reference(geo_reference)
            self.set_default_attribute_name(default_attribute_name)

        elif load_file_now is True:
            # watch for case where file name and points,
            # attributes etc are provided!!
            # if file name then all provided info will be removed!

            if verbose is True:
                if file_name is not None:
                    print 'Loading Geospatial data from file: %s' %file_name
           
            self.import_points_file(file_name, verbose=verbose)
               
            # Validate/normalise what the file reader produced
            self.check_data_points(self.data_points)
            self.set_attributes(self.attributes) 
            self.set_geo_reference(self.geo_reference)
            self.set_default_attribute_name(default_attribute_name)

        # If load_file_now is False, the file will be read blockwise
        # via the iterator protocol (__iter__/next) on demand.
        if verbose is True:
            if file_name is not None:
                print 'Geospatial data created from file: %s' %file_name
                if load_file_now is False:
                    print 'Data will be loaded blockwise on demand'

                    if file_name.endswith('csv') or file_name.endswith('txt'):
                        pass
                        # This message was misleading.
                        # FIXME (Ole): Are we blocking here or not?
                        #print 'ASCII formats are not that great for '
                        #print 'blockwise reading. Consider storing this'
                        #print 'data as a pts NetCDF format'
183
184    def __len__(self):
185        return len(self.data_points)
186
187    def __repr__(self):
188        return str(self.get_data_points(absolute=True))
189   
190   
191    def check_data_points(self, data_points):
192        """Checks data points
193        """
194       
195        if data_points is None:
196            self.data_points = None
197            msg = 'There is no data or file provided!'
198            raise ValueError, msg
199           
200        else:
201            self.data_points = ensure_numeric(data_points)
202            #print "self.data_points.shape",self.data_points.shape
203            if not (0,) == self.data_points.shape:
204                assert len(self.data_points.shape) == 2
205                assert self.data_points.shape[1] == 2
206
207    def set_attributes(self, attributes):
208        """Check and assign attributes dictionary
209        """
210       
211        if attributes is None:
212            self.attributes = None
213            return
214       
215        if not isinstance(attributes, DictType):
216            #Convert single attribute into dictionary
217            attributes = {DEFAULT_ATTRIBUTE: attributes}
218
219        #Check input attributes   
220        for key in attributes.keys():
221            try:
222                attributes[key] = ensure_numeric(attributes[key])
223            except:
224                msg = 'Attribute %s could not be converted' %key
225                msg += 'to a numeric vector'
226                raise msg
227
228        self.attributes = attributes   
229
230
231    def set_geo_reference(self, geo_reference):
232        """
233        Set's the georeference of geospatial.
234        It can also be used to change the georeference
235        """
236        from anuga.coordinate_transforms.geo_reference import Geo_reference
237
238        if geo_reference is None:
239            geo_reference = Geo_reference() # Use default
240        geo_reference = ensure_geo_reference(geo_reference)
241        if not isinstance(geo_reference, Geo_reference):
242            msg = 'Argument geo_reference must be a valid Geo_reference \n'
243            msg += 'object or None.'
244            raise msg
245
246        # if a geo ref already exists, change the point data to
247        # represent the new geo-ref
248        if  self.geo_reference is not None:
249            #FIXME: Maybe put out a warning here...
250            self.data_points = self.get_data_points \
251                               (geo_reference=geo_reference)
252           
253        self.geo_reference = geo_reference
254
255
256    def set_default_attribute_name(self, default_attribute_name):
257        self.default_attribute_name = default_attribute_name
258
259    def set_verbose(self, verbose=False):
260        if verbose in [False, True]:
261            self.verbose = verbose
262        else:
263            msg = 'Illegal value: %s' %str(verbose)
264            raise Exception, msg
265
266    def clip(self, polygon, closed=True, verbose=False):
267        """Clip geospatial data by a polygon
268
269        Input
270          polygon - Either a list of points, an Nx2 array or
271                    a Geospatial data object.
272          closed - (optional) determine whether points on boundary should be
273          regarded as belonging to the polygon (closed = True)
274          or not (closed = False). Default is True.
275           
276        Output
277          New geospatial data object representing points inside
278          specified polygon.
279       
280       
281        Note - this method is non-destructive and leaves the data in 'self'
282        unchanged
283        """
284
285        from anuga.utilities.polygon import inside_polygon
286
287        if isinstance(polygon, Geospatial_data):
288            # Polygon is an object - extract points
289            polygon = polygon.get_data_points()
290
291        points = self.get_data_points()   
292#        if verbose: print '%s points:%s' %(verbose,points)
293        inside_indices = inside_polygon(points, polygon, closed, verbose)
294
295        clipped_G = self.get_sample(inside_indices)
296
297#        clipped_points = take(points, inside_indices)
298
299        # Clip all attributes
300#        attributes = self.get_all_attributes()
301
302#        clipped_attributes = {}
303#        if attributes is not None:
304#            for key, att in attributes.items():
305#                clipped_attributes[key] = take(att, inside_indices)
306
307#        return Geospatial_data(clipped_points, clipped_attributes)
308        return clipped_G
309       
310       
311    def clip_outside(self, polygon, closed=True,verbose=False):
312        """Clip geospatial date by a polygon, keeping data OUTSIDE of polygon
313
314        Input
315          polygon - Either a list of points, an Nx2 array or
316                    a Geospatial data object.
317          closed - (optional) determine whether points on boundary should be
318          regarded as belonging to the polygon (closed = True)
319          or not (closed = False). Default is True.
320         
321        Output
322          Geospatial data object representing point OUTSIDE specified polygon
323       
324        """
325
326        from anuga.utilities.polygon import outside_polygon
327
328        if isinstance(polygon, Geospatial_data):
329            # Polygon is an object - extract points
330            polygon = polygon.get_data_points()
331
332        points = self.get_data_points()   
333        outside_indices = outside_polygon(points, polygon, closed,verbose)
334
335        clipped_G = self.get_sample(outside_indices)
336
337#        clipped_points = take(points, outside_indices)
338
339        # Clip all attributes
340#        attributes = self.get_all_attributes()
341
342#        clipped_attributes = {}
343#        if attributes is not None:
344#            for key, att in attributes.items():
345#                clipped_attributes[key] = take(att, outside_indices)
346
347#        return Geospatial_data(clipped_points, clipped_attributes)
348        return clipped_G
349
350   
351    def get_geo_reference(self):
352        return self.geo_reference
353       
    def get_data_points(self, absolute=True, geo_reference=None,
                        as_lat_long=False, isSouthHemisphere=True):
        """Get coordinates for all data points as an Nx2 array

        If absolute is False returned coordinates are relative to the
        internal georeference's xll and yll corners, otherwise
        absolute UTM coordinates are returned.

        If a geo_reference is passed the points are returned relative
        to that geo_reference (this takes precedence over absolute).

        If as_lat_long is True a list of (lat, long) tuples is
        returned instead; this requires the internal georeference to
        exist and to have a real (non-default) UTM zone.

        isSouthHemisphere is only used when as_lat_long is True; if
        False the returned lats and longs are valid for the Northern
        Hemisphere.

        Default: absolute is True.
        """
        if as_lat_long is True:
            msg = "Points need a zone to be converted into lats and longs"
            assert self.geo_reference is not None, msg
            zone = self.geo_reference.get_zone()
            assert self.geo_reference.get_zone() is not DEFAULT_ZONE, msg
            lats_longs = []
            # Convert each absolute UTM point to latitude/longitude
            for point in self.get_data_points(True):
                # UTMtoLL expects (northing, easting, zone, ...)
                lat_calced, long_calced = UTMtoLL(point[1],point[0], 
                                                  zone, isSouthHemisphere)
                lats_longs.append((lat_calced, long_calced)) # to hash
            return lats_longs
        if absolute is True and geo_reference is None:
            # Add xll/yll offsets to yield absolute UTM coordinates
            return self.geo_reference.get_absolute(self.data_points)
        elif geo_reference is not None:
            # Re-express points relative to the supplied georeference
            return geo_reference.change_points_geo_ref \
                               (self.data_points, 
                                self.geo_reference)
        else:
            # Relative to the internal georeference - return as stored
            return self.data_points
391
392   
393    def get_attributes(self, attribute_name=None):
394        """Return values for one named attribute.
395
396        If attribute_name is None, default_attribute_name is used
397        """
398
399        if attribute_name is None:
400            if self.default_attribute_name is not None:
401                attribute_name = self.default_attribute_name
402            else:
403                attribute_name = self.attributes.keys()[0] 
404                # above line takes the first one from keys
405       
406        if self.verbose is True:
407            print 'Using attribute %s' %attribute_name
408            print 'Available attributes: %s' %(self.attributes.keys())       
409
410        msg = 'Attribute name %s does not exist in data set' %attribute_name
411        assert self.attributes.has_key(attribute_name), msg
412
413        return self.attributes[attribute_name]
414
415    def get_all_attributes(self):
416        """
417        Return values for all attributes.
418        The return value is either None or a dictionary (possibly empty).
419        """
420
421        return self.attributes
422
    def __add__(self, other):
        """
        Returns the addition of 2 geospatial objects;
        their points are concatenated end to end.

        NOTE: raises an Exception if the objects do not carry the
        same attribute names.

        Always returns absolute points!
        This also means that if you add None to the object,
        it will be turned into absolute coordinates.

        other can be None in which case nothing is added to self.
        """


        # find objects zone and checks if the same
        geo_ref1 = self.get_geo_reference()
        zone1 = geo_ref1.get_zone()


        if other is not None:

            geo_ref2 = other.get_geo_reference()
            zone2 = geo_ref2.get_zone()

            # Raises if the two zones cannot be reconciled
            geo_ref1.reconcile_zones(geo_ref2)

       
            new_points = concatenate((self.get_data_points(absolute=True),
                                      other.get_data_points(absolute=True)),
                                      axis = 0)       

       
     
            # Concatenate attributes if any
            if self.attributes is None:
                # Both must be attribute-less for the addition to work
                if other.attributes is not None:
                    msg = 'Geospatial data must have the same \n'
                    msg += 'attributes to allow addition.'
                    raise Exception, msg
               
                new_attributes = None
            else:   
                new_attributes = {}
                for x in self.attributes.keys():
                    if other.attributes.has_key(x):

                        attrib1 = self.attributes[x]
                        attrib2 = other.attributes[x]
                        new_attributes[x] = concatenate((attrib1, attrib2))

                    else:
                        msg = 'Geospatial data must have the same \n'
                        msg += 'attributes to allow addition.'
                        raise Exception, msg


        else:
            #other is None: nothing to append, but still return
            #absolute coordinates (see docstring)
           
            new_points = self.get_data_points(absolute=True)
            new_attributes = self.attributes

                   

        # Instantiate new data object and return absolute coordinates
        # (georeference keeps the zone but has zero offsets)
        new_geo_ref = Geo_reference(geo_ref1.get_zone(), 0.0, 0.0)
        return Geospatial_data(new_points,
                               new_attributes,
                               new_geo_ref)
494
495
496    def __radd__(self, other):
497        """Handle cases like None + Geospatial_data(...)
498        """
499
500        return self + other
501
502   
503   
504    ###
505    #  IMPORT/EXPORT POINTS FILES
506    ###
507
    def import_points_file(self, file_name, delimiter=None, verbose=False):
        """Load a .txt, .csv or .pts file into this instance.

        The extension of file_name selects the reader. Note: will
        throw an IOError/SyntaxError if it can't load the file -
        catch these!

        delimiter is accepted for interface compatibility but is not
        used here.

        Post condition: self.data_points, self.attributes and
        self.geo_reference have been set from the file contents.
        """
       
        # access() returning 0 means the file is missing/unreadable
        if access(file_name, F_OK) == 0 :
            msg = 'File %s does not exist or is not accessible' %file_name
            raise IOError, msg
       
        attributes = {}
        if file_name[-4:]== ".pts":
            # Binary NetCDF points file
            try:
                data_points, attributes, geo_reference =\
                             _read_pts_file(file_name, verbose)
            except IOError, e:   
                msg = 'Could not open file %s ' %file_name
                raise IOError, msg 
       
        elif file_name[-4:]== ".txt" or file_name[-4:]== ".csv":
            # Comma separated text file
            try:
                data_points, attributes, geo_reference =\
                             _read_csv_file(file_name, verbose)
            except IOError, e:
                # This should only be if a file is not found
                msg = 'Could not open file %s. ' %file_name
                msg += 'Check the file location.'
                raise IOError, msg
            except SyntaxError, e:
                # This should only be if there is a format error
                # NOTE(review): Error_message is presumably a
                # module-level dict defined elsewhere in this file.
                msg = 'Could not open file %s. \n' %file_name
                msg += Error_message['IOError']
                raise SyntaxError, msg
        else:     
            msg = 'Extension %s is unknown' %file_name[-4:]
            raise IOError, msg

        self.data_points = data_points
        self.attributes = attributes
        self.geo_reference = geo_reference
555   
    def export_points_file(self, file_name, absolute=True, 
                           as_lat_long=False, isSouthHemisphere=True):
       
        """
        Write a points file as text (.csv/.txt), binary (.pts) or .urs.
        file_name is the file name, including the extension, which
        selects the output format.
       
        If absolute is True the xll and yll offsets are added to the
        point values and the xll and yll of the written geo_reference
        are set to 0.
       
        If absolute is False data points are written relative to the
        xll and yll and the geo_reference remains unaffected
        (.pts only; .txt/.csv and .urs require absolute=True).
       
        isSouthHemisphere: only used when as_lat_long is True; if
        FALSE, lats and longs valid for the Northern Hemisphere are
        written.
       
        """

        if (file_name[-4:] == ".pts"):
            if absolute is True:
                # Zero the georeference offsets since the written
                # points already carry them
                geo_ref = deepcopy(self.geo_reference)
                geo_ref.xllcorner = 0
                geo_ref.yllcorner = 0
                _write_pts_file(file_name,
                                self.get_data_points(absolute), 
                                self.get_all_attributes(),
                                geo_ref)
            else:
                _write_pts_file(file_name,
                                self.get_data_points(absolute), 
                                self.get_all_attributes(),
                                self.get_geo_reference())
               
        elif file_name[-4:] == ".txt" or file_name[-4:] == ".csv":
            # Text formats have nowhere to store a georeference,
            # so relative output is not allowed
            msg = "ERROR: trying to write a .txt file with relative data."
            assert absolute, msg
            _write_csv_file(file_name,
                            self.get_data_points(absolute=True,
                                                 as_lat_long=as_lat_long,
                                  isSouthHemisphere=isSouthHemisphere), 
                            self.get_all_attributes(),
                            as_lat_long=as_lat_long)
             
        elif file_name[-4:] == ".urs" :
            # .urs files always contain lat/long coordinates
            msg = "ERROR: Can not write a .urs file as a relative file."
            assert absolute, msg
            _write_urs_file(file_name,
                            self.get_data_points(as_lat_long=True,
                                  isSouthHemisphere=isSouthHemisphere))
                                                         
        else:
            msg = 'Unknown file type %s ' %file_name
            raise IOError, msg
611       
612    def get_sample(self, indices):
613        """ Returns a object which is a subset of the original
614        and the data points and attributes in this new object refer to
615        the indices provided
616       
617        Input
618            indices- a list of integers that represent the new object
619        Output
620            New geospatial data object representing points specified by
621            the indices
622            """
623        #FIXME: add the geo_reference to this
624#        print 'hello from get_sample'
625        points = self.get_data_points()
626        sampled_points = take(points, indices)
627
628        attributes = self.get_all_attributes()
629
630        sampled_attributes = {}
631        if attributes is not None:
632            for key, att in attributes.items():
633                sampled_attributes[key] = take(att, indices)
634
635#        print 'goodbye from get_sample'
636
637        return Geospatial_data(sampled_points, sampled_attributes)
638   
639   
    def split(self, factor=0.5,seed_num=None, verbose=False):
        """Randomly split this object into two disjoint subsets.

        Returns two geospatial_data objects: the first holds
        'factor' of the points (rounded), the second holds the
        remainder. The two new objects are disjoint sets of each
        other.

        Points for the first object are selected RANDOMLY.

        This method creates two lists of indices which are passed into
        get_sample.  The lists are created using random numbers, and
        they are unique sets eg.  total_list(1,2,3,4,5,6,7,8,9)
        random_list(1,3,6,7,9) and remainder_list(0,2,4,5,8)

        Input - factor: fraction of the object to place in the first
            result, e.g. 0.1 returns 10% of the points first.
            seed_num: optional seed for the random generator,
            mainly for reproducible unit tests.

        Output - two geospatial_data objects that are disjoint sets of
            the original.
            """
       
        i=0
        self_size = len(self)
        random_list = []
        remainder_list = []
        new_size = round(factor*self_size)
       
        # Find unique random numbers
        if verbose: print "make unique random number list and get indices"

        total=array(range(self_size))
        total_list = total.tolist()
        if verbose: print "total list len",len(total_list)
               
        # There will be repeated random numbers however will not be a
        # problem as they are being 'pop'ed out of array so if there
        # are two numbers the same they will pop different indices,
        # still basically random
        ## create list of non-unique random numbers
        if verbose: print "create random numbers list %s long" %new_size
       
        # Set seed if provided, mainly important for unit test!
        # plus recalculate seed when no seed provided.
        if seed_num != None:
            seed(seed_num,seed_num)
        else:
            seed()
        if verbose: print "seed:", get_seed()
       
        random_num = randint(0,self_size-1,(int(new_size),))
        random_num = random_num.tolist()

        #need to sort and reverse so the pop() works correctly
        random_num.sort()
        random_num.reverse()       
       
        if verbose: print "make random number list and get indices"
        j=0
        k=1
        remainder_list = total_list[:]
        # pops array index (random_num) from remainder_list
        # (which starts as the
        # total_list and appends to random_list 
        random_num_len = len(random_num)
        for i in random_num:
            random_list.append(remainder_list.pop(i))
            j+=1
            #prints progress
            if verbose and round(random_num_len/10*k)==j:
                print '(%s/%s)' %(j, random_num_len)
                k+=1
       
        # FIXME: move to tests, it might take a long time
        # then create an array of random length between 500 and 1000,
        # and use a random factor between 0 and 1
        # setup for assertion
        test_total = random_list[:]
        test_total.extend(remainder_list)
        test_total.sort() 
        msg = 'The two random lists made from the original list when added '\
         'together DO NOT equal the original list'
        assert (total_list==test_total),msg

        # Get new samples
        if verbose: print "get values of indices for random list"
        G1 = self.get_sample(random_list)
        if verbose: print "get values of indices for opposite of random list"
        G2 = self.get_sample(remainder_list)

        return G1, G2
731
    def __iter__(self):
        """Initialise blockwise reading of the underlying file.

        Reads the header and number_of_points and saves the file
        pointer position, so that subsequent calls to next() can
        return the data one block at a time.
        """
        from Scientific.IO.NetCDF import NetCDFFile
        #FIXME - what to do if the file isn't there

        # FIXME (Ole): Shouldn't this go into the constructor?
        # This method acts like the constructor when blocking.
        # ... and shouldn't it be called block_size?
        #
        if self.max_read_lines is None:
            self.max_read_lines = MAX_READ_LINES
        if self.file_name[-4:] == ".pts":
           
            # See if the file is there.  Throw a QUIET IO error if it isn't
            fd = open(self.file_name,'r')
            fd.close()
   
            # Throws prints to screen if file not present
            self.fid = NetCDFFile(self.file_name, 'r')
           
            self.blocking_georef, self.blocking_keys, self.number_of_points =\
                                  _read_pts_file_header(self.fid,
                                                        self.verbose)
            self.start_row = 0
            self.last_row = self.number_of_points           
            # Bookkeeping for progress reporting in next()
            self.show_verbose = 0
            self.verbose_block_size = (self.last_row + 10)/10
            self.block_number = 0
            self.number_of_blocks = self.number_of_points/self.max_read_lines
            # This computes the number of full blocks. The last block may be
            # smaller and won't be included in this estimate.
           
            if self.verbose is True:
                print 'Reading %d points (in ~%d blocks) from file %s. '\
                      %(self.number_of_points,
                        self.number_of_blocks,
                        self.file_name),
                print 'Each block consists of %d data points'\
                      %self.max_read_lines
           
        else:
            # Assume the file is a csv file
            file_pointer = open(self.file_name)
            self.header, self.file_pointer = \
                         _read_csv_file_header(file_pointer)
            self.blocking_georef = None # Used for reconciling zones

        return self
783   
784   
    def next(self):
        """Read the next block of points and return it as a new
        Geospatial_data instance.

        Part of the blocking (iterator) protocol: the companion set-up
        method opens the file and initialises the blocking attributes;
        this method consumes one block per call and raises StopIteration
        (removing those attributes again) once the file is exhausted.
        """

        if self.file_name[-4:] == ".pts":
            if self.start_row == self.last_row:
                # Read the end of the file last iteration
                # Remove blocking attributes
                self.fid.close()
                del self.max_read_lines
                del self.blocking_georef
                del self.last_row
                del self.start_row
                del self.blocking_keys
                del self.fid
                raise StopIteration
            fin_row = self.start_row + self.max_read_lines
            if fin_row > self.last_row:
                # Final block may be smaller than max_read_lines
                fin_row = self.last_row



            if self.verbose is True:
                # Only report roughly every verbose_block_size points,
                # not on every block
                if self.show_verbose >= self.start_row and \
                       self.show_verbose < fin_row:


                    #print 'Doing %d of %d' %(self.start_row, self.last_row+10)

                    print 'Reading block %d (points %d to %d) out of %d'\
                          %(self.block_number,
                            self.start_row,
                            fin_row,
                            self.number_of_blocks)


                    self.show_verbose += max(self.max_read_lines,
                                             self.verbose_block_size)


            # Read next block
            pointlist, att_dict, = \
                       _read_pts_file_blocking(self.fid,
                                               self.start_row,
                                               fin_row,
                                               self.blocking_keys)

            geo = Geospatial_data(pointlist, att_dict, self.blocking_georef)
            self.start_row = fin_row

            self.block_number += 1

        else:
            # Assume the file is a csv file
            try:
                #print "self.max_read_lines", self.max_read_lines
                pointlist, att_dict, geo_ref, self.file_pointer = \
                   _read_csv_file_blocking( self.file_pointer,
                                            self.header[:],
                                            max_read_lines=self.max_read_lines,
                                            verbose=self.verbose)

                # Check that the zones haven't changed.
                if geo_ref is not None:
                    geo_ref.reconcile_zones(self.blocking_georef)
                    self.blocking_georef = geo_ref
                elif self.blocking_georef is not None:

                    msg = 'Geo reference given, then not given.'
                    msg += ' This should not happen.'
                    raise ValueError, msg
                geo = Geospatial_data(pointlist, att_dict, geo_ref)
            except StopIteration:
                # End of file: tidy up blocking attributes and re-signal
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                raise StopIteration
            except ANUGAError:
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                raise
            except SyntaxError:
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                # This should only be if there is a format error
                msg = 'Could not open file %s. \n' %self.file_name
                msg += Error_message['IOError']
                raise SyntaxError, msg
        return geo
875
876   
##################### Error messages ###########
# Canned error texts keyed by error category, used when re-raising
# file-format problems with a helpful explanation.
Error_message = {}
Em = Error_message
Em['IOError'] = (
    "NOTE: The format for a comma separated .txt/.csv file is:\n"
    "        1st line:     [column names]\n"
    "        other lines:  [x value], [y value], [attributes]\n"
    "\n"
    "           for example:\n"
    "           x, y, elevation, friction\n"
    "           0.6, 0.7, 4.9, 0.3\n"
    "           1.9, 2.8, 5, 0.3\n"
    "           2.7, 2.4, 5.2, 0.3\n"
    "\n"
    "The first two columns are assumed to be x, y coordinates.\n"
    "The attribute values must be numeric.\n"
)
892
def _set_using_lat_long(latitudes,
                        longitudes,
                        geo_reference,
                        data_points,
                        points_are_lats_longs):
    """Convert latitude/longitude input into UTM data points.

    if the points has lat long info, assume it is in (lat, long) order.

    Exactly one source of lat/long information may be given: either the
    latitudes and longitudes sequences, or data_points together with
    points_are_lats_longs=True.  A geo_reference must NOT be supplied as
    well, since the returned Geo_reference is derived from the computed
    UTM zone.

    Returns (data_points, geo_reference): UTM coordinates and a
    Geo_reference carrying the zone they fall in.

    Raises ValueError on any inconsistent combination of arguments.
    """

    if geo_reference is not None:
        msg = """A georeference is specified yet latitude and longitude
        are also specified!"""
        raise ValueError, msg

    if data_points is not None and not points_are_lats_longs:
        msg = """Data points are specified yet latitude and
        longitude are also specified."""
        raise ValueError, msg

    if points_are_lats_longs:
        if data_points is None:
            msg = """Data points are not specified."""
            raise ValueError, msg
        # Split (lat, long) pairs into separate vectors
        lats_longs = ensure_numeric(data_points)
        latitudes = ravel(lats_longs[:,0:1])
        longitudes = ravel(lats_longs[:,1:])

    if latitudes is None and longitudes is None:
        msg = """Latitudes and Longitudes are not specified."""
        raise ValueError, msg

    if latitudes is None:
        msg = """Longitudes are specified yet latitudes aren't."""
        raise ValueError, msg

    if longitudes is None:
        msg = """Latitudes are specified yet longitudes aren't."""
        raise ValueError, msg

    data_points, zone  = convert_from_latlon_to_utm(latitudes=latitudes,
                                                    longitudes=longitudes)
    return data_points, Geo_reference(zone=zone)
935
936   
937def _read_pts_file(file_name, verbose=False):
938    """Read .pts NetCDF file
939   
940    Return a dic of array of points, and dic of array of attribute
941    eg
942    dic['points'] = [[1.0,2.0],[3.0,5.0]]
943    dic['attributelist']['elevation'] = [[7.0,5.0]
944    """   
945
946    from Scientific.IO.NetCDF import NetCDFFile
947   
948    if verbose: print 'Reading ', file_name
949   
950       
951    # See if the file is there.  Throw a QUIET IO error if it isn't
952    fd = open(file_name,'r')
953    fd.close()
954   
955    # Throws prints to screen if file not present
956    fid = NetCDFFile(file_name, 'r') 
957   
958    pointlist = array(fid.variables['points'])
959    keys = fid.variables.keys()
960    if verbose: print 'Got %d variables: %s' %(len(keys), keys)
961    try:
962        keys.remove('points')
963    except IOError, e:       
964        fid.close()   
965        msg = 'Expected keyword "points" but could not find it'
966        raise IOError, msg
967   
968    attributes = {}
969    for key in keys:
970        if verbose: print "reading attribute '%s'" %key
971           
972        attributes[key] = array(fid.variables[key])
973   
974   
975    try:
976        geo_reference = Geo_reference(NetCDFObject=fid)
977    except AttributeError, e:
978        geo_reference = None
979   
980    fid.close()
981   
982    return pointlist, attributes, geo_reference
983
984
def _read_csv_file(file_name, verbose=False):
    """Read .csv file

    Return a tuple (pointlist, att_dict, geo_ref) where pointlist is an
    array of points, att_dict maps each attribute name to its array of
    values and geo_ref is a Geo_reference derived from lat/long columns
    (or None).
    eg
    pointlist = [[1.0,2.0],[3.0,5.0]]
    att_dict['elevation'] = [7.0,5.0]
    """

    file_pointer = open(file_name)
    header, file_pointer = _read_csv_file_header(file_pointer)
    try:
        pointlist, att_dict, geo_ref, file_pointer = \
                   _read_csv_file_blocking( \
                file_pointer,
                header,
                max_read_lines=1e30) #If the file is bigger that this, block..  # FIXME (Ole) What's up here?
    finally:
        # FIX: close the file on success AND on any exception.  The old
        # code only closed it for ANUGAError, leaking the handle when
        # e.g. SyntaxError or StopIteration propagated.
        file_pointer.close()
    return pointlist, att_dict, geo_ref
1008
CSV_DELIMITER = ','

def _read_csv_file_header(file_pointer,
                          delimiter=CSV_DELIMITER,
                          verbose=False):
    """Read the header line of a .csv points file.

    Returns (header, file_pointer) where header is the list of cleaned
    column titles taken from the first line of the file.
    """
    first_line = file_pointer.readline()
    return clean_line(first_line, delimiter), file_pointer
1020
def _read_csv_file_blocking(file_pointer, header,
                            delimiter=CSV_DELIMITER,
                            max_read_lines=MAX_READ_LINES,
                            verbose=False):


    """
    Read the body of a .csv file, at most max_read_lines rows at a time.
    header: The list header of the csv file, with the x and y labels.

    Returns (pointlist, att_dict, geo_ref, file_pointer).  If the first
    two column titles start with 'lat'/'lon' the points are converted to
    UTM and geo_ref carries the zone; otherwise geo_ref is None.

    Raises StopIteration when no further points can be read and
    SyntaxError on malformed rows or headers.
    """
    points = []
    pointattributes = []
    att_dict = {}

    # This is to remove the x and y headers.
    header = header[:]
    try:
        x_header = header.pop(0)
        y_header = header.pop(0)
    except IndexError:
        # if there are not two columns this will occur.
        # eg if it is a space seperated file
        raise SyntaxError

    read_lines = 0
    while read_lines<max_read_lines:
        line = file_pointer.readline()
        #print "line",line
        numbers = clean_line(line,delimiter)
        # NOTE(review): a short line (or EOF) ends the block here, so a
        # '#' comment line with fewer than two fields also terminates
        # reading rather than being skipped -- confirm this is intended.
        if len(numbers) <= 1:
            break
        if line[0] == '#':
            continue
        read_lines += 1
        try:
            x = float(numbers[0])
            y = float(numbers[1])
            points.append([x,y])
            numbers.pop(0)
            numbers.pop(0)
            if len(header) != len(numbers):
                file_pointer.close()
                msg = "File load error.  \
                There might be a problem with the file header"
                raise SyntaxError, msg
            for i,num in enumerate(numbers):
                # NOTE(review): the result of strip() is discarded here;
                # harmless only because clean_line already stripped each
                # field.
                num.strip()
                if num != '\n' and num != '':
                    #attributes.append(float(num))
                    att_dict.setdefault(header[i],[]).append(float(num))
        #except IOError:
        except ValueError:
            raise SyntaxError
    if points == []:
        # Nothing left to read: signal end of iteration to the caller
        raise StopIteration


    pointlist = array(points).astype(Float)
    for key in att_dict.keys():
        att_dict[key] = array(att_dict[key]).astype(Float)

    # Do stuff here so the info is in lat's and longs
    geo_ref = None
    x_header = lower(x_header[:3])
    y_header = lower(y_header[:3])
    if (x_header == 'lon' or  x_header == 'lat') and \
       (y_header == 'lon' or  y_header == 'lat'):
        if x_header == 'lon':
            longitudes = ravel(pointlist[:,0:1])
            latitudes = ravel(pointlist[:,1:])
        else:
            latitudes = ravel(pointlist[:,0:1])
            longitudes = ravel(pointlist[:,1:])

        pointlist, geo_ref = _set_using_lat_long(latitudes,
                                                 longitudes,
                                                 geo_reference=None,
                                                 data_points=None,
                                                 points_are_lats_longs=False)
    return pointlist, att_dict, geo_ref, file_pointer
1101
1102def _read_pts_file_header(fid, verbose=False):
1103
1104    """Read the geo_reference and number_of_points from a .pts file
1105    """
1106   
1107    keys = fid.variables.keys()
1108    try:
1109        keys.remove('points')
1110    except IOError, e:       
1111        fid.close()   
1112        msg = 'Expected keyword "points" but could not find it'
1113        raise IOError, msg
1114    if verbose: print 'Got %d variables: %s' %(len(keys), keys)
1115   
1116    try:
1117        geo_reference = Geo_reference(NetCDFObject=fid)
1118    except AttributeError, e:
1119        geo_reference = None
1120
1121    return geo_reference, keys, fid.dimensions['number_of_points']
1122
def _read_pts_file_blocking(fid, start_row, fin_row, keys):
    """Read one block of an open .pts (NetCDF) file.

    Returns (pointlist, attributes): pointlist holds rows
    [start_row, fin_row) of the 'points' variable and attributes maps
    each name in keys to the matching slice of its variable.
    """

    variables = fid.variables
    pointlist = array(variables['points'][start_row:fin_row])

    attributes = {}
    for name in keys:
        attributes[name] = array(variables[name][start_row:fin_row])

    return pointlist, attributes
1139   
def _write_pts_file(file_name,
                    write_data_points,
                    write_attributes=None,
                    write_geo_reference=None):
    """
    Write .pts NetCDF file

    write_data_points: Nx2 array of point coordinates.
    write_attributes: optional dict mapping attribute names to value
                      arrays (one value per point).
    write_geo_reference: optional Geo_reference written into the file.

    NOTE: Below might not be valid ask Duncan : NB 5/2006

    WARNING: This function mangles the point_atts data structure
    #F??ME: (DSG)This format has issues.
    # There can't be an attribute called points
    # consider format change
    # method changed by NB not sure if above statement is correct

    should create new test for this
    legal_keys = ['pointlist', 'attributelist', 'geo_reference']
    for key in point_atts.keys():
        msg = 'Key %s is illegal. Valid keys are %s' %(key, legal_keys)
        assert key in legal_keys, msg
    """
    from Scientific.IO.NetCDF import NetCDFFile
    # NetCDF file definition
    outfile = NetCDFFile(file_name, 'w')

    # Create new file
    outfile.institution = 'Geoscience Australia'
    outfile.description = 'NetCDF format for compact and portable storage ' +\
                          'of spatial point data'

    # Dimension definitions
    # FIX: renamed local from 'shape', which shadowed the shape() function
    # imported from Numeric at module level.
    number_of_points = write_data_points.shape[0]
    outfile.createDimension('number_of_points', number_of_points)
    outfile.createDimension('number_of_dimensions', 2) #This is 2d data

    # Variable definition
    outfile.createVariable('points', Float, ('number_of_points',
                                             'number_of_dimensions'))

    #create variables
    outfile.variables['points'][:] = write_data_points #.astype(Float32)

    # One NetCDF variable per attribute, over the same points dimension
    if write_attributes is not None:
        for key in write_attributes.keys():
            outfile.createVariable(key, Float, ('number_of_points',))
            outfile.variables[key][:] = write_attributes[key] #.astype(Float32)

    if write_geo_reference is not None:
        write_NetCDF_georeference(write_geo_reference, outfile)

    outfile.close()
1191 
1192def _write_csv_file(file_name,
1193                    write_data_points,
1194                    write_attributes=None,
1195                    as_lat_long=False,
1196                    delimiter=','):
1197    """
1198    export a file, file_name, with the csv format
1199   
1200    """
1201    points = write_data_points
1202    pointattributes = write_attributes
1203   
1204    fd = open(file_name,'w')
1205    if as_lat_long:
1206        titlelist = "latitude" + delimiter + "longitude"  + delimiter
1207    else:
1208        titlelist = "x" + delimiter + "y"  + delimiter
1209    if pointattributes is not None:   
1210        for title in pointattributes.keys():
1211            titlelist = titlelist + title + delimiter
1212        titlelist = titlelist[0:-len(delimiter)] # remove the last delimiter
1213    fd.write(titlelist+"\n")
1214   
1215    # <x/lat> <y/long> [attributes]
1216    for i, vert in enumerate( points):
1217
1218
1219        if pointattributes is not None:           
1220            attlist = ","
1221            for att in pointattributes.keys():
1222                attlist = attlist + str(pointattributes[att][i])+ delimiter
1223            attlist = attlist[0:-len(delimiter)] # remove the last delimiter
1224            attlist.strip()
1225        else:
1226            attlist = ''
1227
1228        fd.write(str(vert[0]) + delimiter +
1229                 str(vert[1]) + attlist + "\n")
1230
1231    fd.close()
1232 
1233def _write_urs_file(file_name,
1234                    points,
1235                    delimiter=' '):
1236    """
1237    export a file, file_name, with the urs format
1238    the data points are in lats and longs
1239   
1240    """   
1241    fd = open(file_name,'w')   
1242    fd.write(str(len(points))+"\n")   
1243    # <lat> <long> <id#>
1244    for i, vert in enumerate( points):
1245        fd.write(str(round(vert[0],7)) + delimiter + \
1246                 str(round(vert[1],7)) + delimiter +str(i)+ "\n")
1247    fd.close()
1248   
def _point_atts2array(point_atts):
    """Coerce 'pointlist' and every entry of 'attributelist' in
    point_atts to Float arrays, in place, and return the dictionary.
    """
    point_atts['pointlist'] = array(point_atts['pointlist']).astype(Float)

    attribute_dict = point_atts['attributelist']
    for name in attribute_dict.keys():
        attribute_dict[name] = array(attribute_dict[name]).astype(Float)

    return point_atts
1256
1257   
1258
1259
def geospatial_data2points_dictionary(geospatial_data):
    """Convert geospatial data to points_dictionary

    The result has keys 'pointlist', 'attributelist' (a fresh dict
    mapping each attribute name to its values) and 'geo_reference'.
    """

    attribute_copy = {}
    for name in geospatial_data.attributes.keys():
        attribute_copy[name] = geospatial_data.attributes[name]

    return {'pointlist': geospatial_data.data_points,
            'attributelist': attribute_copy,
            'geo_reference': geospatial_data.geo_reference}
1276
1277   
def points_dictionary2geospatial_data(points_dictionary):
    """Convert points_dictionary to geospatial data object

    The dictionary must supply 'pointlist' and 'attributelist';
    'geo_reference' is optional and defaults to None.
    """

    msg = 'Points dictionary must have key pointlist'
    assert 'pointlist' in points_dictionary, msg

    msg = 'Points dictionary must have key attributelist'
    assert 'attributelist' in points_dictionary, msg

    geo = points_dictionary.get('geo_reference', None)

    return Geospatial_data(points_dictionary['pointlist'],
                           points_dictionary['attributelist'],
                           geo_reference=geo)
1296
def clean_line(line,delimiter):
    """Split line on delimiter and tidy the fields.

    Fields that are empty BEFORE stripping are dropped; the remaining
    fields are returned with surrounding whitespace stripped (so a
    whitespace-only field survives as an empty string, matching the
    historical behaviour).
    """
    trimmed = line.strip()
    return [field.strip() for field in trimmed.split(delimiter)
            if field != '']
1315           
def ensure_absolute(points, geo_reference=None):
    """
    Accept several input formats and return one: a numeric array of
    absolute points.

    points: List or numeric array of coordinate pairs [xi, eta] of
              points or geospatial object or points file name

    geo_reference: A Geo_reference object or 3-tuple consisting of
                 UTM zone, easting and northing.
                 If specified vertex coordinates are assumed to be
                 relative to their respective origins.  Must not be
                 combined with a Geospatial_data object, which carries
                 its own georeference.
    """
    if isinstance(points, type('')):
        # A string is taken to be the name of a points file
        points = Geospatial_data(file_name=points)

    if isinstance(points, Geospatial_data):
        points = points.get_data_points(absolute=True)
        msg = "Use a Geospatial_data object or a mesh origin. Not both."
        assert geo_reference == None, msg
    else:
        points = ensure_numeric(points, Float)

    if geo_reference is not None:
        if isinstance(geo_reference, Geo_reference):
            origin = geo_reference
        else:
            origin = Geo_reference(geo_reference[0],
                                   geo_reference[1],
                                   geo_reference[2])
        points = origin.get_absolute(points)

    return points
1354     
1355
def ensure_geospatial(points, geo_reference=None):
    """
    Accept several input formats and return one: a Geospatial_data
    instance.

    points: List or numeric array of coordinate pairs [xi, eta] of
              points or geospatial object

    geo_reference: A Geo_reference object or 3-tuple consisting of
                 UTM zone, easting and northing.
                 If specified vertex coordinates are assumed to be
                 relative to their respective origins.  Must not be
                 combined with a Geospatial_data object, which carries
                 its own georeference.
    """
    if isinstance(points, Geospatial_data):
        msg = "Use a Geospatial_data object or a mesh origin. Not both."
        assert geo_reference == None, msg
        return points

    points = ensure_numeric(points, Float)

    if geo_reference is None:
        origin = None
    elif isinstance(geo_reference, Geo_reference):
        origin = geo_reference
    else:
        origin = Geo_reference(geo_reference[0],
                               geo_reference[1],
                               geo_reference[2])

    return Geospatial_data(data_points=points, geo_reference=origin)
1387
1388
1389
1390def find_optimal_smoothing_parameter(data_file, 
1391                                     alpha_list=None,
1392                                     mesh_file=None,
1393                                     boundary_poly=None,
1394                                     mesh_resolution=100000,
1395                                     north_boundary=None,
1396                                     south_boundary=None,
1397                                     east_boundary=None,
1398                                     west_boundary=None,
1399                                     plot_name='all_alphas',
1400                                     split_factor=0.1,
1401                                     seed_num=None,
1402                                     cache=False,
1403                                     verbose=False
1404                                     ):
1405   
1406    """
1407    Removes a small random sample of points from 'data_file'. Then creates
1408    models with different alpha values from 'alpha_list' and cross validates
1409    the predicted value to the previously removed point data. Returns the
1410    alpha value which has the smallest covariance.
1411   
1412    data_file: must not contain points outside the boundaries defined
1413           and it either a pts, txt or csv file.
1414   
1415    alpha_list: the alpha values to test in a single list
1416   
1417    mesh_file: name of the created mesh file or if passed in will read it.
1418            NOTE, if there is a mesh file mesh_resolution,
1419            north_boundary, south... etc will be ignored.
1420   
1421    mesh_resolution: the maximum area size for a triangle
1422   
1423    north_boundary... west_boundary: the value of the boundary
1424   
1425    plot_name: the name for the plot contain the results
1426   
1427    seed_num: the seed to the random number generator
1428   
1429    USAGE:
1430        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1431                                             alpha_list=[0.0001, 0.01, 1],
1432                                             mesh_file=None,
1433                                             mesh_resolution=3,
1434                                             north_boundary=5,
1435                                             south_boundary=-5,
1436                                             east_boundary=5,
1437                                             west_boundary=-5,
1438                                             plot_name='all_alphas',
1439                                             seed_num=100000,
1440                                             verbose=False)
1441   
1442    OUTPUT: returns the minumum normalised covalance calculate AND the
1443           alpha that created it. PLUS writes a plot of the results
1444           
1445    NOTE: code will not work if the data_file extent is greater than the
1446    boundary_polygon or any of the boundaries, eg north_boundary...west_boundary
1447       
1448    """
1449
1450    from anuga.shallow_water import Domain
1451    from anuga.geospatial_data.geospatial_data import Geospatial_data
1452    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1453   
1454    from anuga.utilities.numerical_tools import cov
1455    from Numeric import array, resize,shape,Float,zeros,take,argsort,argmin
1456    from anuga.utilities.polygon import is_inside_polygon
1457    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1458   
1459
1460    attribute_smoothed='elevation'
1461
1462    if mesh_file is None:
1463        if verbose: print "building mesh"
1464        mesh_file='temp.msh'
1465
1466        if north_boundary is None or south_boundary is None or \
1467           east_boundary is None or west_boundary is None:
1468            no_boundary=True
1469        else:
1470            no_boundary=False
1471       
1472        if no_boundary is True:
1473            msg= 'All boundaries must be defined'
1474            raise msg
1475
1476        poly_topo = [[east_boundary,south_boundary],
1477                     [east_boundary,north_boundary],
1478                     [west_boundary,north_boundary],
1479                     [west_boundary,south_boundary]]
1480                     
1481        create_mesh_from_regions(poly_topo,
1482                                 boundary_tags={'back': [2],
1483                                                'side': [1,3],
1484                                                'ocean': [0]},
1485                             maximum_triangle_area=mesh_resolution,
1486                             filename=mesh_file,
1487                             use_cache=cache,
1488                             verbose=verbose)
1489
1490    else: # if mesh file provided
1491        #test mesh file exists?
1492        if verbose: "reading from file: %s" %mesh_file
1493        if access(mesh_file,F_OK) == 0:
1494            msg="file %s doesn't exist!" %mesh_file
1495            raise IOError, msg
1496
1497    #split topo data
1498    if verbose: print 'Reading elevation file: %s' %data_file
1499    G = Geospatial_data(file_name = data_file)
1500    if verbose: print 'Start split'
1501    G_small, G_other = G.split(split_factor,seed_num, verbose=verbose)
1502    if verbose: print 'Finish split'
1503    points=G_small.get_data_points()
1504
1505    if verbose: print "Number of points in sample to compare: ", len(points)
1506   
1507    if alpha_list==None:
1508        alphas = [0.001,0.01,100]
1509        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,\
1510         #         0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1511       
1512    else:
1513        alphas=alpha_list
1514
1515    #creates array with columns 1 and 2 are x, y. column 3 is elevation
1516    #4 onwards is the elevation_predicted using the alpha, which will
1517    #be compared later against the real removed data
1518    data=array([],typecode=Float)
1519
1520    data=resize(data,(len(points),3+len(alphas)))
1521
1522    #gets relative point from sample
1523    data[:,0]=points[:,0]
1524    data[:,1]=points[:,1]
1525    elevation_sample=G_small.get_attributes(attribute_name=attribute_smoothed)
1526    data[:,2]=elevation_sample
1527
1528    normal_cov=array(zeros([len(alphas),2]),typecode=Float)
1529
1530    if verbose: print 'Setup computational domains with different alphas'
1531
1532    #print 'memory usage before domains',mem_usage()
1533
1534    for i,alpha in enumerate(alphas):
1535        #add G_other data to domains with different alphas
1536        if verbose:print '\n Calculating domain and mesh for Alpha = ',alpha,'\n'
1537        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1538        if verbose:print domain.statistics()
1539        domain.set_quantity(attribute_smoothed, 
1540                    geospatial_data = G_other,
1541                    use_cache = cache,
1542                    verbose = verbose,
1543                    alpha = alpha)
1544
1545        points_geo=domain.geo_reference.change_points_geo_ref(points)
1546        #returns the predicted elevation of the points that were "split" out
1547        #of the original data set for one particular alpha
1548        if verbose: print 'Get predicted elevation for location to be compared'
1549        elevation_predicted=domain.quantities[attribute_smoothed].\
1550                            get_values(interpolation_points=points_geo)
1551 
1552        #add predicted elevation to array that starts with x, y, z...
1553        data[:,i+3]=elevation_predicted
1554
1555        sample_cov= cov(elevation_sample)
1556        #print elevation_predicted
1557        ele_cov= cov(elevation_sample-elevation_predicted)
1558        normal_cov[i,:]= [alpha,ele_cov/sample_cov]
1559        #print 'memory usage during compare',mem_usage()
1560       
1561        if verbose: print'Covariance for alpha ',normal_cov[i][0],'= ',normal_cov[i][1]
1562        if verbose: print'-------------------------------------------- \n'
1563
1564    normal_cov0=normal_cov[:,0]
1565    normal_cov_new=take(normal_cov,argsort(normal_cov0))
1566
1567    if plot_name is not None:
1568        from pylab import savefig,semilogx,loglog
1569        semilogx(normal_cov_new[:,0],normal_cov_new[:,1])
1570        loglog(normal_cov_new[:,0],normal_cov_new[:,1])
1571        savefig(plot_name,dpi=300)
1572    if mesh_file == 'temp.msh':
1573        remove(mesh_file)
1574   
1575    if verbose: 
1576        print 'Final results:'
1577        for i,alpha in enumerate(alphas):
1578            print'covariance for alpha %s = %s ' %(normal_cov[i][0],normal_cov[i][1])
1579        print '\n Optimal alpha is: %s ' % normal_cov_new[(argmin(normal_cov_new,axis=0))[1],0]
1580
1581    # covariance and optimal alpha
1582    return min(normal_cov_new[:,1]) , normal_cov_new[(argmin(normal_cov_new,axis=0))[1],0]
1583
1584def old_find_optimal_smoothing_parameter(data_file, 
1585                                     alpha_list=None,
1586                                     mesh_file=None,
1587                                     boundary_poly=None,
1588                                     mesh_resolution=100000,
1589                                     north_boundary=None,
1590                                     south_boundary=None,
1591                                     east_boundary=None,
1592                                     west_boundary=None,
1593                                     plot_name='all_alphas',
1594                                     split_factor=0.1,
1595                                     seed_num=None,
1596                                     cache=False,
1597                                     verbose=False
1598                                     ):
1599   
1600    """
1601    data_file: must not contain points outside the boundaries defined
1602           and it either a pts, txt or csv file.
1603   
1604    alpha_list: the alpha values to test in a single list
1605   
1606    mesh_file: name of the created mesh file or if passed in will read it.
1607            NOTE, if there is a mesh file mesh_resolution,
1608            north_boundary, south... etc will be ignored.
1609   
1610    mesh_resolution: the maximum area size for a triangle
1611   
1612    north_boundary... west_boundary: the value of the boundary
1613   
1614    plot_name: the name for the plot contain the results
1615   
1616    seed_num: the seed to the random number generator
1617   
1618    USAGE:
1619        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1620                                             alpha_list=[0.0001, 0.01, 1],
1621                                             mesh_file=None,
1622                                             mesh_resolution=3,
1623                                             north_boundary=5,
1624                                             south_boundary=-5,
1625                                             east_boundary=5,
1626                                             west_boundary=-5,
1627                                             plot_name='all_alphas',
1628                                             seed_num=100000,
1629                                             verbose=False)
1630   
1631    OUTPUT: returns the minumum normalised covalance calculate AND the
1632           alpha that created it. PLUS writes a plot of the results
1633           
1634    NOTE: code will not work if the data_file extend is greater than the
1635    boundary_polygon or the north_boundary...west_boundary
1636       
1637    """
1638
1639    from anuga.shallow_water import Domain
1640    from anuga.geospatial_data.geospatial_data import Geospatial_data
1641    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1642   
1643    from anuga.utilities.numerical_tools import cov
1644    from Numeric import array, resize,shape,Float,zeros,take,argsort,argmin
1645    from anuga.utilities.polygon import is_inside_polygon
1646    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1647   
1648
1649    attribute_smoothed='elevation'
1650
1651    if mesh_file is None:
1652        mesh_file='temp.msh'
1653
1654        if north_boundary is None or south_boundary is None or \
1655           east_boundary is None or west_boundary is None:
1656            no_boundary=True
1657        else:
1658            no_boundary=False
1659       
1660        if no_boundary is True:
1661            msg= 'All boundaries must be defined'
1662            raise msg
1663
1664        poly_topo = [[east_boundary,south_boundary],
1665                     [east_boundary,north_boundary],
1666                     [west_boundary,north_boundary],
1667                     [west_boundary,south_boundary]]
1668                     
1669        create_mesh_from_regions(poly_topo,
1670                                 boundary_tags={'back': [2],
1671                                                'side': [1,3],
1672                                                'ocean': [0]},
1673                             maximum_triangle_area=mesh_resolution,
1674                             filename=mesh_file,
1675                             use_cache=cache,
1676                             verbose=verbose)
1677
1678    else: # if mesh file provided
1679        #test mesh file exists?
1680        if access(mesh_file,F_OK) == 0:
1681            msg="file %s doesn't exist!" %mesh_file
1682            raise IOError, msg
1683
1684    #split topo data
1685    G = Geospatial_data(file_name = data_file)
1686    if verbose: print 'start split'
1687    G_small, G_other = G.split(split_factor,seed_num, verbose=verbose)
1688    if verbose: print 'finish split'
1689    points=G_small.get_data_points()
1690
1691    #FIXME: Remove points outside boundary polygon
1692#    print 'new point',len(points)
1693#   
1694#    new_points=[]
1695#    new_points=array([],typecode=Float)
1696#    new_points=resize(new_points,(len(points),2))
1697#    print "BOUNDARY", boundary_poly
1698#    for i,point in enumerate(points):
1699#        if is_inside_polygon(point,boundary_poly, verbose=True):
1700#            new_points[i] = point
1701#            print"WOW",i,new_points[i]
1702#    points = new_points
1703
1704   
1705    if verbose: print "Number of points in sample to compare: ", len(points)
1706   
1707    if alpha_list==None:
1708        alphas = [0.001,0.01,100]
1709        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,\
1710         #         0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1711       
1712    else:
1713        alphas=alpha_list
1714    domains = {}
1715
1716    if verbose: print 'Setup computational domains with different alphas'
1717
1718    print 'memory usage before domains',mem_usage()
1719
1720    for alpha in alphas:
1721        #add G_other data to domains with different alphas
1722        if verbose:print '\n Calculating domain and mesh for Alpha = ',alpha,'\n'
1723        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1724        if verbose:print domain.statistics()
1725        domain.set_quantity(attribute_smoothed, 
1726                    geospatial_data = G_other,
1727                    use_cache = cache,
1728                    verbose = verbose,
1729                    alpha = alpha)
1730        domains[alpha]=domain
1731
1732    print 'memory usage after domains',mem_usage()
1733
1734    #creates array with columns 1 and 2 are x, y. column 3 is elevation
1735    #4 onwards is the elevation_predicted using the alpha, which will
1736    #be compared later against the real removed data
1737    data=array([],typecode=Float)
1738
1739    data=resize(data,(len(points),3+len(alphas)))
1740
1741    #gets relative point from sample
1742    data[:,0]=points[:,0]
1743    data[:,1]=points[:,1]
1744    elevation_sample=G_small.get_attributes(attribute_name=attribute_smoothed)
1745    data[:,2]=elevation_sample
1746
1747    normal_cov=array(zeros([len(alphas),2]),typecode=Float)
1748
1749    if verbose: print 'Determine difference between predicted results and actual data'
1750    for i,alpha in enumerate(domains):
1751        if verbose: print'Alpha =',alpha
1752       
1753        points_geo=domains[alpha].geo_reference.change_points_geo_ref(points)
1754        #returns the predicted elevation of the points that were "split" out
1755        #of the original data set for one particular alpha
1756        elevation_predicted=domains[alpha].quantities[attribute_smoothed].\
1757                            get_values(interpolation_points=points_geo)
1758 
1759        #add predicted elevation to array that starts with x, y, z...
1760        data[:,i+3]=elevation_predicted
1761
1762        sample_cov= cov(elevation_sample)
1763        #print elevation_predicted
1764        ele_cov= cov(elevation_sample-elevation_predicted)
1765        normal_cov[i,:]= [alpha,ele_cov/sample_cov]
1766        print 'memory usage during compare',mem_usage()
1767       
1768
1769        if verbose: print'cov',normal_cov[i][0],'= ',normal_cov[i][1]
1770
1771    normal_cov0=normal_cov[:,0]
1772    normal_cov_new=take(normal_cov,argsort(normal_cov0))
1773
1774    if plot_name is not None:
1775        from pylab import savefig,semilogx,loglog
1776        semilogx(normal_cov_new[:,0],normal_cov_new[:,1])
1777        loglog(normal_cov_new[:,0],normal_cov_new[:,1])
1778        savefig(plot_name,dpi=300)
1779    if mesh_file == 'temp.msh':
1780        remove(mesh_file)
1781   
1782    return min(normal_cov_new[:,1]) , normal_cov_new[(argmin(normal_cov_new,axis=0))[1],0]
1783
1784         
if __name__ == "__main__":
    # This module is intended to be imported; it has no stand-alone
    # command-line behaviour.
    pass
   
Note: See TracBrowser for help on using the repository browser.