source: anuga_core/source/anuga/geospatial_data/geospatial_data.py @ 5079

Last change on this file since 5079 was 5010, checked in by nick, 17 years ago

update 'clip' and 'clip_outside' with verbose and added some comment statements if verbose

File size: 65.7 KB
Line 
1"""Class Geospatial_data - Manipulation of locations on the planet and
2associated attributes.
3
4"""
5from sys import maxint
6from os import access, F_OK, R_OK,remove
7from types import DictType
8from warnings import warn
9from string import lower
10from Numeric import concatenate, array, Float, shape, reshape, ravel, take, \
11                        size, shape
12#from Array import tolist
13from RandomArray import randint, seed, get_seed
14from copy import deepcopy
15
16#from MA import tolist
17
18from Scientific.IO.NetCDF import NetCDFFile   
19from anuga.coordinate_transforms.lat_long_UTM_conversion import UTMtoLL   
20from anuga.utilities.numerical_tools import ensure_numeric
21from anuga.coordinate_transforms.geo_reference import Geo_reference, \
22     TitleError, DEFAULT_ZONE, ensure_geo_reference, write_NetCDF_georeference
23from anuga.coordinate_transforms.redfearn import convert_from_latlon_to_utm
24from anuga.utilities.anuga_exceptions import ANUGAError
25from anuga.config import points_file_block_line_size as MAX_READ_LINES 
26#from anuga.fit_interpolate.benchmark_least_squares import mem_usage
27 
28
# Attribute name used when a single, unnamed attribute vector is supplied
DEFAULT_ATTRIBUTE = 'elevation'
30
31class Geospatial_data:
32
    def __init__(self,
                 data_points=None, # this can also be a points file name
                 attributes=None,
                 geo_reference=None,
                 default_attribute_name=None,
                 file_name=None,
                 latitudes=None,
                 longitudes=None,
                 points_are_lats_longs=False,
                 max_read_lines=None,
                 load_file_now=True,
                 verbose=False):

        """Create instance from data points and associated attributes.

        data_points: x,y coordinates in meters. Type must be either a
        sequence of 2-tuples or an Mx2 Numeric array of floats.  A file name
        with extension .txt, .csv or .pts can also be passed in here
        (it is then treated as file_name).

        attributes: Associated values for each data point. The type
        must be either a list or an array of length M or a dictionary
        of lists (or arrays) of length M. In the latter case the keys
        in the dictionary represent the attribute names, in the former
        the attribute will get the default name "elevation".

        geo_reference: Object representing the origin of the data
        points. It contains UTM zone, easting and northing and data
        points are assumed to be relative to this origin.
        If geo_reference is None, the default geo ref object is used.

        default_attribute_name: Name of default attribute to be used with
        get_attribute_values. The idea is that the dataset can be
        equipped with information about which attribute to return.
        If None, the default is the "first".

        latitudes, longitudes: Vectors of latitudes and longitudes,
        used to specify location instead of points.

        points_are_lats_longs: Set this as true if the points are actually
        lats and longs, not UTM.

        max_read_lines: The number of rows read into memory when using
        blocking to read a file.

        load_file_now: If true the file is automatically loaded
        into the geospatial instance. Used when blocking.

        file_name: Name of input netCDF file or .txt/.csv file. netCDF file
        must have dimensions "points" etc.  A .txt/.csv file is comma
        separated with x, y and attribute data; its first line holds the
        column titles.  If the first two column titles start with lat or
        long (not case sensitive) the data is assumed to be latitude and
        longitude, in decimal format, and is converted to UTM. Otherwise
        the first two columns are taken as x and y and the titles ignored.

        The format for a .txt file is:
            1st line:     [column names]
            other lines:  x y [attributes]

            for example:
            x, y, elevation, friction
            0.6, 0.7, 4.9, 0.3
            1.9, 2.8, 5, 0.3
            2.7, 2.4, 5.2, 0.3

        The format for a Points dictionary is:

          ['pointlist'] a 2 column array describing points. 1st column x,
          2nd column y.
          ['attributelist'], a dictionary of 1D arrays, representing
          attribute values at the point.  The dictionary key is the attribute
          header.
          ['geo_reference'] a Geo_reference object. Use if the point
          information is relative. This is optional.
            eg
            dic['pointlist'] = [[1.0,2.0],[3.0,5.0]]
            dic['attributelist']['elevation'] = [[7.0,5.0]

        verbose: If True progress information is printed.
        """

        if isinstance(data_points, basestring):
            # assume data point is really a file name
            file_name = data_points

        self.set_verbose(verbose)
        self.geo_reference=None #create the attribute
        self.file_name = file_name

        if max_read_lines is None:
            self.max_read_lines = MAX_READ_LINES
        else:
            self.max_read_lines = max_read_lines

        if file_name is None:
            if latitudes is not None or longitudes is not None or \
                   points_are_lats_longs:
                # Convert lat/long input into UTM points + georeference
                data_points, geo_reference =  \
                             _set_using_lat_long(latitudes=latitudes,
                                  longitudes=longitudes,
                                  geo_reference=geo_reference,
                                  data_points=data_points,
                                  points_are_lats_longs=points_are_lats_longs)
            self.check_data_points(data_points)
            self.set_attributes(attributes)
            self.set_geo_reference(geo_reference)
            self.set_default_attribute_name(default_attribute_name)

        elif load_file_now is True:
            # watch for case where file name and points,
            # attributes etc are provided!!
            # if file name then all provided info will be removed!

            if verbose is True:
                if file_name is not None:
                    print 'Loading Geospatial data from file: %s' %file_name

            self.import_points_file(file_name, verbose=verbose)

            self.check_data_points(self.data_points)
            self.set_attributes(self.attributes)
            self.set_geo_reference(self.geo_reference)
            self.set_default_attribute_name(default_attribute_name)

        if verbose is True:
            if file_name is not None:
                print 'Geospatial data created from file: %s' %file_name
                if load_file_now is False:
                    print 'Data will be loaded blockwise on demand'

                    if file_name.endswith('csv') or file_name.endswith('txt'):
                        print 'ASCII formats are not that great for '
                        print 'blockwise reading. Consider storing this'
                        print 'data as a pts NetCDF format'
    def __len__(self):
        """Return the number of data points held by this object."""
        return len(self.data_points)
    def __repr__(self):
        """String form: the absolute coordinates of all data points."""
        return str(self.get_data_points(absolute=True))
188    def check_data_points(self, data_points):
189        """Checks data points
190        """
191       
192        if data_points is None:
193            self.data_points = None
194            msg = 'There is no data or file provided!'
195            raise ValueError, msg
196           
197        else:
198            self.data_points = ensure_numeric(data_points)
199            #print "self.data_points.shape",self.data_points.shape
200            if not (0,) == self.data_points.shape:
201                assert len(self.data_points.shape) == 2
202                assert self.data_points.shape[1] == 2
203
204    def set_attributes(self, attributes):
205        """Check and assign attributes dictionary
206        """
207       
208        if attributes is None:
209            self.attributes = None
210            return
211       
212        if not isinstance(attributes, DictType):
213            #Convert single attribute into dictionary
214            attributes = {DEFAULT_ATTRIBUTE: attributes}
215
216        #Check input attributes   
217        for key in attributes.keys():
218            try:
219                attributes[key] = ensure_numeric(attributes[key])
220            except:
221                msg = 'Attribute %s could not be converted' %key
222                msg += 'to a numeric vector'
223                raise msg
224
225        self.attributes = attributes   
226
227
228    def set_geo_reference(self, geo_reference):
229        """
230        Set's the georeference of geospatial.
231        It can also be used to change the georeference
232        """
233        from anuga.coordinate_transforms.geo_reference import Geo_reference
234
235        if geo_reference is None:
236            geo_reference = Geo_reference() # Use default
237        geo_reference = ensure_geo_reference(geo_reference)
238        if not isinstance(geo_reference, Geo_reference):
239            msg = 'Argument geo_reference must be a valid Geo_reference \n'
240            msg += 'object or None.'
241            raise msg
242
243        # if a geo ref already exists, change the point data to
244        # represent the new geo-ref
245        if  self.geo_reference is not None:
246            #FIXME: Maybe put out a warning here...
247            self.data_points = self.get_data_points \
248                               (geo_reference=geo_reference)
249           
250        self.geo_reference = geo_reference
251
252
    def set_default_attribute_name(self, default_attribute_name):
        """Record which attribute get_attributes() returns by default."""
        self.default_attribute_name = default_attribute_name
256    def set_verbose(self, verbose=False):
257        if verbose in [False, True]:
258            self.verbose = verbose
259        else:
260            msg = 'Illegal value: %s' %str(verbose)
261            raise Exception, msg
262
263    def clip(self, polygon, closed=True, verbose=False):
264        """Clip geospatial data by a polygon
265
266        Input
267          polygon - Either a list of points, an Nx2 array or
268                    a Geospatial data object.
269          closed - (optional) determine whether points on boundary should be
270          regarded as belonging to the polygon (closed = True)
271          or not (closed = False). Default is True.
272           
273        Output
274          New geospatial data object representing points inside
275          specified polygon.
276       
277        """
278
279        from anuga.utilities.polygon import inside_polygon
280
281        if isinstance(polygon, Geospatial_data):
282            # Polygon is an object - extract points
283            polygon = polygon.get_data_points()
284            #print 'polygon',polygon
285
286        points = self.get_data_points()   
287        #print '%s points:%s' %(verbose,points)
288        inside_indices = inside_polygon(points, polygon, closed, verbose)
289
290        clipped_G = self.get_sample(inside_indices)
291#        clipped_points = take(points, inside_indices)
292
293        # Clip all attributes
294#        attributes = self.get_all_attributes()
295
296#        clipped_attributes = {}
297#        if attributes is not None:
298#            for key, att in attributes.items():
299#                clipped_attributes[key] = take(att, inside_indices)
300
301#        return Geospatial_data(clipped_points, clipped_attributes)
302        return clipped_G
303       
304       
305    def clip_outside(self, polygon, closed=True,verbose=False):
306        """Clip geospatial date by a polygon, keeping data OUTSIDE of polygon
307
308        Input
309          polygon - Either a list of points, an Nx2 array or
310                    a Geospatial data object.
311          closed - (optional) determine whether points on boundary should be
312          regarded as belonging to the polygon (closed = True)
313          or not (closed = False). Default is True.
314         
315        Output
316          Geospatial data object representing point OUTSIDE specified polygon
317       
318        """
319
320        from anuga.utilities.polygon import outside_polygon
321
322        if isinstance(polygon, Geospatial_data):
323            # Polygon is an object - extract points
324            polygon = polygon.get_data_points()
325
326        points = self.get_data_points()   
327        outside_indices = outside_polygon(points, polygon, closed,verbose)
328
329        clipped_G = self.get_sample(outside_indices)
330
331#        clipped_points = take(points, outside_indices)
332
333        # Clip all attributes
334#        attributes = self.get_all_attributes()
335
336#        clipped_attributes = {}
337#        if attributes is not None:
338#            for key, att in attributes.items():
339#                clipped_attributes[key] = take(att, outside_indices)
340
341#        return Geospatial_data(clipped_points, clipped_attributes)
342        return clipped_G
343
344   
    def get_geo_reference(self):
        """Return the Geo_reference object associated with this data."""
        return self.geo_reference
348    def get_data_points(self, absolute=True, geo_reference=None,
349                        as_lat_long=False,isSouthHemisphere=True):
350        """Get coordinates for all data points as an Nx2 array
351
352        If absolute is False returned coordinates are relative to the
353        internal georeference's xll and yll corners, otherwise
354        absolute UTM coordinates are returned.
355
356        If a geo_reference is passed the points are returned relative
357        to that geo_reference.
358       
359        isSH (isSouthHemisphere) is only used when getting data
360        points "as_lat_long" is True and if FALSE will return lats and
361        longs valid for the Northern Hemisphere.
362
363        Default: absolute is True.
364        """
365        if as_lat_long is True:
366            msg = "Points need a zone to be converted into lats and longs"
367            assert self.geo_reference is not None, msg
368            zone = self.geo_reference.get_zone()
369            assert self.geo_reference.get_zone() is not DEFAULT_ZONE, msg
370            lats_longs = []
371            for point in self.get_data_points(True):
372                ### UTMtoLL(northing, easting, zone,
373                lat_calced, long_calced = UTMtoLL(point[1],point[0], 
374                                                  zone, isSouthHemisphere)
375                lats_longs.append((lat_calced, long_calced)) # to hash
376            return lats_longs
377           
378        if absolute is True and geo_reference is None:
379            return self.geo_reference.get_absolute(self.data_points)
380        elif geo_reference is not None:
381            return geo_reference.change_points_geo_ref \
382                               (self.data_points, 
383                                self.geo_reference)
384        else:
385            return self.data_points
386       
387   
388    def get_attributes(self, attribute_name=None):
389        """Return values for one named attribute.
390
391        If attribute_name is None, default_attribute_name is used
392        """
393
394        if attribute_name is None:
395            if self.default_attribute_name is not None:
396                attribute_name = self.default_attribute_name
397            else:
398                attribute_name = self.attributes.keys()[0] 
399                # above line takes the first one from keys
400       
401        if self.verbose is True:
402            print 'Using attribute %s' %attribute_name
403            print 'Available attributes: %s' %(self.attributes.keys())       
404
405        msg = 'Attribute name %s does not exist in data set' %attribute_name
406        assert self.attributes.has_key(attribute_name), msg
407
408        return self.attributes[attribute_name]
409
    def get_all_attributes(self):
        """Return values for all attributes.

        The return value is either None or a dictionary (possibly empty).
        """
        return self.attributes
    def __add__(self, other):
        """Return the concatenation of two Geospatial_data objects.

        The points of 'other' are appended after the points of self,
        likewise for each attribute vector.

        NOTE: raises Exception if the two objects do not carry the same
        set of attribute names.

        Always returns absolute points!  This also means that if you add
        None to the object, it will be turned into absolute coordinates.

        other can be None, in which case nothing is added to self.
        """


        # find objects zone and checks if the same
        geo_ref1 = self.get_geo_reference()
        zone1 = geo_ref1.get_zone()


        if other is not None:

            geo_ref2 = other.get_geo_reference()
            zone2 = geo_ref2.get_zone()

            # Raises an exception if the zones cannot be reconciled
            geo_ref1.reconcile_zones(geo_ref2)


            new_points = concatenate((self.get_data_points(absolute=True),
                                      other.get_data_points(absolute=True)),
                                      axis = 0)



            # Concatenate attributes if any
            if self.attributes is None:
                if other.attributes is not None:
                    msg = 'Geospatial data must have the same \n'
                    msg += 'attributes to allow addition.'
                    raise Exception, msg

                new_attributes = None
            else:
                new_attributes = {}
                for x in self.attributes.keys():
                    if other.attributes.has_key(x):

                        attrib1 = self.attributes[x]
                        attrib2 = other.attributes[x]
                        new_attributes[x] = concatenate((attrib1, attrib2))

                    else:
                        msg = 'Geospatial data must have the same \n'
                        msg += 'attributes to allow addition.'
                        raise Exception, msg


        else:
            # other is None: keep self's (absolute) points and attributes
            new_points = self.get_data_points(absolute=True)
            new_attributes = self.attributes



        # Instantiate new data object and return absolute coordinates
        # (xll/yll set to 0 since the points are already absolute)
        new_geo_ref = Geo_reference(geo_ref1.get_zone(), 0.0, 0.0)
        return Geospatial_data(new_points,
                               new_attributes,
                               new_geo_ref)
491    def __radd__(self, other):
492        """Handle cases like None + Geospatial_data(...)
493        """
494
495        return self + other
496
497   
498   
499    ###
500    #  IMPORT/EXPORT POINTS FILES
501    ###
502
    def import_points_file(self, file_name, delimiter=None, verbose=False):
        """Load a .txt, .csv or .pts file into this object.

        Note: will throw an IOError/SyntaxError if it can't load the file.
        Catch these!

        delimiter appears unused here; presumably retained for backward
        compatibility.

        Post condition: self.data_points, self.attributes and
        self.geo_reference have been set from the file contents.
        """

        if access(file_name, F_OK) == 0 :
            msg = 'File %s does not exist or is not accessible' %file_name
            raise IOError, msg

        attributes = {}
        if file_name[-4:]== ".pts":
            try:
                data_points, attributes, geo_reference =\
                             _read_pts_file(file_name, verbose)
            except IOError, e:
                msg = 'Could not open file %s ' %file_name
                raise IOError, msg

        elif file_name[-4:]== ".txt" or file_name[-4:]== ".csv":
            try:
                data_points, attributes, geo_reference =\
                             _read_csv_file(file_name, verbose)
            except IOError, e:
                # This should only be if a file is not found
                msg = 'Could not open file %s. ' %file_name
                msg += 'Check the file location.'
                raise IOError, msg
            except SyntaxError, e:
                # This should only be if there is a format error
                msg = 'Could not open file %s. \n' %file_name
                msg += Error_message['IOError']
                # NOTE(review): Error_message is not defined in this part of
                # the file - presumably a module-level dict defined further
                # down; confirm it exists or this line raises NameError.
                raise SyntaxError, msg
        else:
            msg = 'Extension %s is unknown' %file_name[-4:]
            raise IOError, msg

        self.data_points = data_points
        self.attributes = attributes
        self.geo_reference = geo_reference
    def export_points_file(self, file_name, absolute=True,
                           as_lat_long=False, isSouthHemisphere=True):

        """Write a points file as text (.csv/.txt), binary (.pts) or .urs.

        file_name is the file name, including the extension, which selects
        the output format.

        If absolute is True the xll and yll are added to the point values
        and the xll and yll of the written geo_reference are set to 0.

        If absolute is False data points are written relative to the xll
        and yll and the geo_reference remains unaffected (only valid for
        .pts output; text and .urs formats require absolute).

        isSouthHemisphere is only used when as_lat_long is True; if False
        the written lats and longs are valid for the Northern Hemisphere.

        Raises IOError for an unknown file extension.
        """

        if (file_name[-4:] == ".pts"):
            if absolute is True:
                # Zero the offsets in a copy so the stored points + georef
                # remain consistent
                geo_ref = deepcopy(self.geo_reference)
                geo_ref.xllcorner = 0
                geo_ref.yllcorner = 0
                _write_pts_file(file_name,
                                self.get_data_points(absolute),
                                self.get_all_attributes(),
                                geo_ref)
            else:
                _write_pts_file(file_name,
                                self.get_data_points(absolute),
                                self.get_all_attributes(),
                                self.get_geo_reference())

        elif file_name[-4:] == ".txt" or file_name[-4:] == ".csv":
            msg = "ERROR: trying to write a .txt file with relative data."
            assert absolute, msg
            _write_csv_file(file_name,
                            self.get_data_points(absolute=True,
                                                 as_lat_long=as_lat_long,
                                  isSouthHemisphere=isSouthHemisphere),
                            self.get_all_attributes(),
                            as_lat_long=as_lat_long)

        elif file_name[-4:] == ".urs" :
            msg = "ERROR: Can not write a .urs file as a relative file."
            assert absolute, msg
            # .urs files are always written as lat/long
            _write_urs_file(file_name,
                            self.get_data_points(as_lat_long=True,
                                  isSouthHemisphere=isSouthHemisphere))

        else:
            msg = 'Unknown file type %s ' %file_name
            raise IOError, msg
607    def get_sample(self, indices):
608        """ Returns a object which is a subset of the original
609        and the data points and attributes in this new object refer to
610        the indices provided
611       
612        Input
613            indices- a list of integers that represent the new object
614        Output
615            New geospatial data object representing points specified by
616            the indices
617            """
618        #FIXME: add the geo_reference to this
619       
620        points = self.get_data_points()
621        sampled_points = take(points, indices)
622
623        attributes = self.get_all_attributes()
624
625        sampled_attributes = {}
626        if attributes is not None:
627            for key, att in attributes.items():
628                sampled_attributes[key] = take(att, indices)
629
630        return Geospatial_data(sampled_points, sampled_attributes)
631   
632   
    def split(self, factor=0.5, seed_num=None, verbose=False):
        """Randomly partition this object into two disjoint subsets.

        Returns two Geospatial_data objects: the first holds roughly
        'factor' of the points, the second holds the remainder.  The two
        new objects are disjoint sets of each other.

        Points for the two new objects are selected RANDOMLY.

        This method creates two lists of indices which are passed into
        get_sample.  The lists are created using random numbers, and
        they are unique sets eg.  total_list(1,2,3,4,5,6,7,8,9)
        random_list(1,3,6,7,9) and remainder_list(0,2,4,5,8)

        Input - factor: fraction of points to place in the first object
                        (if 0.1 then ~10% of the points are returned first)
                seed_num: optional random seed, mainly for unit tests

        Output - two Geospatial_data objects that are disjoint sets of the
            original
        """

        i=0
        self_size = len(self)
        random_list = []
        remainder_list = []
        new_size = round(factor*self_size)

        # Find unique random numbers
        if verbose: print "make unique random number list and get indices"

        total=array(range(self_size))
        total_list = total.tolist()
        if verbose: print "total list len",len(total_list)

        # There will be repeated random numbers however will not be a
        # problem as they are being 'pop'ed out of array so if there
        # are two numbers the same they will pop different indicies,
        # still basically random
        ## create list of non-unique random numbers
        if verbose: print "create random numbers list %s long" %new_size

        # Set seed if provided, mainly important for unit test!
        # plus recalculate seed when no seed provided.
        if seed_num != None:
            seed(seed_num,seed_num)
        else:
            seed()
        if verbose: print "seed:", get_seed()

        random_num = randint(0,self_size-1,(int(new_size),))
        random_num = random_num.tolist()

        # need to sort and reverse so the pop() works correctly:
        # popping from highest index downwards keeps lower indices valid
        random_num.sort()
        random_num.reverse()

        if verbose: print "make random number list and get indices"
        j=0
        k=1
        remainder_list = total_list[:]
        # pops array index (random_num) from remainder_list
        # (which starts as the
        # total_list and appends to random_list
        random_num_len = len(random_num)
        for i in random_num:
            random_list.append(remainder_list.pop(i))
            j+=1
            # prints progress roughly every 10%
            if verbose and round(random_num_len/10*k)==j:
                print '(%s/%s)' %(j, random_num_len)
                k+=1

        # FIXME: move to tests, it might take a long time
        # then create an array of random length between 500 and 1000,
        # and use a random factor between 0 and 1
        # setup for assertion
        test_total = random_list[:]
        test_total.extend(remainder_list)
        test_total.sort()
        msg = 'The two random lists made from the original list when added '\
         'together DO NOT equal the original list'
        assert (total_list==test_total),msg

        # Get new samples
        if verbose: print "get values of indices for random list"
        G1 = self.get_sample(random_list)
        if verbose: print "get values of indices for opposite of random list"
        G2 = self.get_sample(remainder_list)

        return G1, G2
    def __iter__(self):
        """Prepare blockwise iteration over the underlying points file.

        Reads the header and number_of_points and saves the file pointer
        position so that next() can deliver one block at a time.
        """

        from Scientific.IO.NetCDF import NetCDFFile

        #FIXME - what to do if the file isn't there

        # FIXME (Ole): Shouldn't this go into the constructor?
        # This method acts like the constructor when blocking.
        # ... and shouldn't it be called block_size?
        #
        if self.max_read_lines is None:
            self.max_read_lines = MAX_READ_LINES

        if self.file_name[-4:] == ".pts":

            # See if the file is there.  Throw a QUIET IO error if it isn't
            fd = open(self.file_name,'r')
            fd.close()

            # Throws prints to screen if file not present
            self.fid = NetCDFFile(self.file_name, 'r')

            self.blocking_georef, self.blocking_keys, self.number_of_points =\
                                  _read_pts_file_header(self.fid,
                                                        self.verbose)
            self.start_row = 0
            self.last_row = self.number_of_points
            self.show_verbose = 0
            self.verbose_block_size = (self.last_row + 10)/10
            self.block_number = 0
            self.number_of_blocks = self.number_of_points/self.max_read_lines
            # This computes the number of full blocks. The last block may be
            # smaller and won't be included in this estimate.

            if self.verbose is True:
                print 'Reading %d points (in ~%d blocks) from file %s. '\
                      %(self.number_of_points,
                        self.number_of_blocks,
                        self.file_name),
                print 'Each block consists of %d data points'\
                      %self.max_read_lines

        else:
            # Assume the file is a csv file
            file_pointer = open(self.file_name)
            self.header, self.file_pointer = \
                         _read_csv_file_header(file_pointer)
            self.blocking_georef = None # Used for reconciling zones

        return self
    def next(self):
        """ read a block, instanciate a new geospatial and return it"""
        
        # Python 2 iterator protocol: return the next block of points as a
        # new Geospatial_data instance, or raise StopIteration when the
        # file prepared by the iterator setup is exhausted.  Blocking state
        # (fid/file_pointer, row counters, georef) lives on self and is
        # deleted again on the final iteration.
        if self.file_name[-4:] == ".pts":
            if self.start_row == self.last_row:
                # Read the end of the file last iteration
                # Remove blocking attributes
                self.fid.close()
                del self.max_read_lines
                del self.blocking_georef
                del self.last_row
                del self.start_row
                del self.blocking_keys
                del self.fid
                raise StopIteration
            # Clamp the final (possibly partial) block to the end of file
            fin_row = self.start_row + self.max_read_lines
            if fin_row > self.last_row:
                fin_row = self.last_row

               
            
            if self.verbose is True:
                # Report progress only about every verbose_block_size
                # points, so huge files don't flood the screen
                if self.show_verbose >= self.start_row and \
                       self.show_verbose < fin_row:

                    print 'Reading block %d (points %d to %d) out of %d'\
                          %(self.block_number,
                            self.start_row,
                            fin_row,
                            self.number_of_blocks)

                    
                    self.show_verbose += max(self.max_read_lines,
                                             self.verbose_block_size)

                 
            # Read next block
            pointlist, att_dict, = \
                       _read_pts_file_blocking(self.fid,
                                               self.start_row,
                                               fin_row,
                                               self.blocking_keys)
            
            geo = Geospatial_data(pointlist, att_dict, self.blocking_georef)
            self.start_row = fin_row
            
            self.block_number += 1            
            
        else:
            # Assume the file is a csv file
            try:
                pointlist, att_dict, geo_ref, self.file_pointer = \
                   _read_csv_file_blocking( self.file_pointer,
                                            self.header[:],
                                            max_read_lines=self.max_read_lines,
                                            verbose=self.verbose)

                # Check that the zones haven't changed.
                if geo_ref is not None:
                    geo_ref.reconcile_zones(self.blocking_georef)
                    self.blocking_georef = geo_ref
                elif self.blocking_georef is not None:
                    
                    msg = 'Geo reference given, then not given.'
                    msg += ' This should not happen.' 
                    raise ValueError, msg
                geo = Geospatial_data(pointlist, att_dict, geo_ref)
            except StopIteration:
                # End of file: close and drop blocking state before
                # propagating StopIteration to the caller
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                raise StopIteration
            except ANUGAError:
                # e.g. irreconcilable UTM zones; close cleanly and re-raise
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                raise
            except SyntaxError:
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                # This should only be if there is a format error
                msg = 'Could not open file %s. \n' %self.file_name
                msg += Error_message['IOError']
                raise SyntaxError, msg
        return geo
869
870   
##################### Error messages ###########
# Human readable description of the expected csv/txt point file layout.
# It is appended to format errors raised while reading such files.
Error_message = {}
Em = Error_message  # short alias, kept for code elsewhere that uses it
Em['IOError'] = "\n".join([
    "NOTE: The format for a comma separated .txt/.csv file is:",
    "        1st line:     [column names]",
    "        other lines:  [x value], [y value], [attributes]",
    "",
    "           for example:",
    "           x, y, elevation, friction",
    "           0.6, 0.7, 4.9, 0.3",
    "           1.9, 2.8, 5, 0.3",
    "           2.7, 2.4, 5.2, 0.3",
    "",
    "The first two columns are assumed to be x, y coordinates.",
    "The attribute values must be numeric.",
    ""])
886
887def _set_using_lat_long(latitudes,
888                        longitudes,
889                        geo_reference,
890                        data_points,
891                        points_are_lats_longs):
892    """
893    if the points has lat long info, assume it is in (lat, long) order.
894    """
895   
896    if geo_reference is not None:
897        msg = """A georeference is specified yet latitude and longitude
898        are also specified!"""
899        raise ValueError, msg
900   
901    if data_points is not None and not points_are_lats_longs:
902        msg = """Data points are specified yet latitude and
903        longitude are also specified!"""
904        raise ValueError, msg
905   
906    if points_are_lats_longs:
907        if data_points is None:
908            msg = """Data points are not specified !"""
909            raise ValueError, msg
910        lats_longs = ensure_numeric(data_points)
911        latitudes = ravel(lats_longs[:,0:1])
912        longitudes = ravel(lats_longs[:,1:])
913       
914    if latitudes is None and longitudes is None:
915        msg = """Latitudes and Longitudes are not."""
916        raise ValueError, msg
917   
918    if latitudes is None:
919        msg = """Longitudes are specified yet latitudes aren't!"""
920        raise ValueError, msg
921   
922    if longitudes is None:
923        msg = """Latitudes are specified yet longitudes aren't!"""
924        raise ValueError, msg
925   
926    data_points, zone  = convert_from_latlon_to_utm(latitudes=latitudes,
927                                                    longitudes=longitudes)
928    return data_points, Geo_reference(zone=zone)
929
930   
931def _read_pts_file(file_name, verbose=False):
932    """Read .pts NetCDF file
933   
934    Return a dic of array of points, and dic of array of attribute
935    eg
936    dic['points'] = [[1.0,2.0],[3.0,5.0]]
937    dic['attributelist']['elevation'] = [[7.0,5.0]
938    """   
939
940    from Scientific.IO.NetCDF import NetCDFFile
941   
942    if verbose: print 'Reading ', file_name
943   
944       
945    # See if the file is there.  Throw a QUIET IO error if it isn't
946    fd = open(file_name,'r')
947    fd.close()
948   
949    # Throws prints to screen if file not present
950    fid = NetCDFFile(file_name, 'r') 
951   
952    pointlist = array(fid.variables['points'])
953    keys = fid.variables.keys()
954    if verbose: print 'Got %d variables: %s' %(len(keys), keys)
955    try:
956        keys.remove('points')
957    except IOError, e:       
958        fid.close()   
959        msg = 'Expected keyword "points" but could not find it'
960        raise IOError, msg
961   
962    attributes = {}
963    for key in keys:
964        if verbose: print "reading attribute '%s'" %key
965           
966        attributes[key] = array(fid.variables[key])
967   
968   
969    try:
970        geo_reference = Geo_reference(NetCDFObject=fid)
971    except AttributeError, e:
972        geo_reference = None
973   
974    fid.close()
975   
976    return pointlist, attributes, geo_reference
977
978
def _read_csv_file(file_name, verbose=False):
    """Read a .csv point file in one go.

    Returns (pointlist, attributes, geo_reference); the enormous
    max_read_lines value effectively disables blocking, so the whole
    file is read as a single block.

    Propagates StopIteration (empty file), SyntaxError (bad format)
    and ANUGAError from the blocking reader.
    """
    
    file_pointer = open(file_name)
    header, file_pointer = _read_csv_file_header(file_pointer)
    try:
        pointlist, att_dict, geo_ref, file_pointer = \
                   _read_csv_file_blocking( \
                file_pointer,
                header,
                max_read_lines=1e30) #If the file is bigger that this, block..
    finally:
        # BUG FIX: the original only closed the file on success or on
        # ANUGAError, leaking the handle when StopIteration or
        # SyntaxError was raised.  finally covers every exit path
        # (closing an already-closed file is harmless).
        file_pointer.close()
    return pointlist, att_dict, geo_ref    
1001
CSV_DELIMITER = ','

def _read_csv_file_header(file_pointer,
                          delimiter=CSV_DELIMITER,
                          verbose=False):

    """Consume and parse the first line of an open csv file.

    Returns (header, file_pointer) where header is the list of
    cleaned-up column names.
    """
    first_line = file_pointer.readline()
    column_names = clean_line(first_line, delimiter)
    return column_names, file_pointer
1013
def _read_csv_file_blocking(file_pointer, header,
                            delimiter=CSV_DELIMITER,
                            max_read_lines=MAX_READ_LINES,
                            verbose=False):
    

    """
    Read the body of a .csv file.
    header: The list header of the csv file, with the x and y labels.

    Reads at most max_read_lines data rows from file_pointer and
    returns (pointlist, att_dict, geo_ref, file_pointer).  Raises
    StopIteration when no points were read (end of file) and
    SyntaxError for malformed rows or a header/row column mismatch.
    If the two leading columns are named lat/long (first three header
    characters 'lat'/'lon'), the points are converted to UTM and a
    Geo_reference is returned; otherwise geo_ref is None.
    """
    points = []
    pointattributes = []
    att_dict = {}

    # This is to remove the x and y headers.
    header = header[:]  # work on a copy so the caller's header survives
    try:
        x_header = header.pop(0)
        y_header = header.pop(0)
    except IndexError:
        # if there are not two columns this will occur.
        # eg if it is a space seperated file
        raise SyntaxError
    
    read_lines = 0
    while read_lines<max_read_lines:
        line = file_pointer.readline()
        numbers = clean_line(line,delimiter)
        if len(numbers) <= 1:
            # blank line or end of file terminates this block
            break
        if line[0] == '#':
            # comment line: skip without counting it against the block
            continue
        read_lines += 1
        try:
            x = float(numbers[0])
            y = float(numbers[1])
            points.append([x,y])
            # drop the two coordinates; the rest are attribute values
            numbers.pop(0)
            numbers.pop(0)
            if len(header) != len(numbers):
                file_pointer.close() 
                msg = "File load error.  \
                There might be a problem with the file header"
                raise SyntaxError, msg
            for i,num in enumerate(numbers):
                # NOTE(review): the strip() result below is discarded;
                # clean_line has already stripped each field, so this
                # line is a harmless no-op.
                num.strip()
                if num != '\n' and num != '':
                    att_dict.setdefault(header[i],[]).append(float(num))
        except ValueError:
            # non-numeric coordinate or attribute value
            raise SyntaxError
    if points == []:
        # nothing read: signal end of iteration to the caller
        raise StopIteration
        
    
    pointlist = array(points).astype(Float)
    for key in att_dict.keys():
        att_dict[key] = array(att_dict[key]).astype(Float)

    # Do stuff here so the info is in lat's and longs
    geo_ref = None
    x_header = lower(x_header[:3])
    y_header = lower(y_header[:3])
    if (x_header == 'lon' or  x_header == 'lat') and \
       (y_header == 'lon' or  y_header == 'lat'):
        if x_header == 'lon':
            longitudes = ravel(pointlist[:,0:1])
            latitudes = ravel(pointlist[:,1:])
        else:
            latitudes = ravel(pointlist[:,0:1])
            longitudes = ravel(pointlist[:,1:])
        
        pointlist, geo_ref = _set_using_lat_long(latitudes,
                                                 longitudes,
                                                 geo_reference=None,
                                                 data_points=None,
                                                 points_are_lats_longs=False)
    return pointlist, att_dict, geo_ref, file_pointer
1094
1095def _read_pts_file_header(fid, verbose=False):
1096
1097    """Read the geo_reference and number_of_points from a .pts file
1098    """
1099   
1100    keys = fid.variables.keys()
1101    try:
1102        keys.remove('points')
1103    except IOError, e:       
1104        fid.close()   
1105        msg = 'Expected keyword "points" but could not find it'
1106        raise IOError, msg
1107    if verbose: print 'Got %d variables: %s' %(len(keys), keys)
1108   
1109    try:
1110        geo_reference = Geo_reference(NetCDFObject=fid)
1111    except AttributeError, e:
1112        geo_reference = None
1113
1114    return geo_reference, keys, fid.dimensions['number_of_points']
1115
def _read_pts_file_blocking(fid, start_row, fin_row, keys):
    """Read rows [start_row, fin_row) from an open .pts NetCDF file.

    fid: open NetCDF file object
    keys: names of the attribute variables to read

    Returns (pointlist, attributes) covering just that slice of rows.
    """
    
    read_slice = lambda name: array(fid.variables[name][start_row:fin_row])

    pointlist = read_slice('points')
    attributes = {}
    for name in keys:
        attributes[name] = read_slice(name)

    return pointlist, attributes
1132   
def _write_pts_file(file_name,
                    write_data_points,
                    write_attributes=None, 
                    write_geo_reference=None):
    """
    Write a .pts NetCDF file.

    write_data_points: Nx2 array of point coordinates.
    write_attributes: optional dict mapping attribute name to N values.
                      No attribute may be called 'points' (that name is
                      taken by the coordinate variable).
    write_geo_reference: optional Geo_reference stored with the file.

    NOTE: Below might not be valid ask Duncan : NB 5/2006
    
    WARNING: This function mangles the point_atts data structure
    #F??ME: (DSG)This format has issues.
    # There can't be an attribute called points
    # consider format change
    # method changed by NB not sure if above statement is correct
    """   
    from Scientific.IO.NetCDF import NetCDFFile
    # NetCDF file definition
    outfile = NetCDFFile(file_name, 'w')
    
    # Create new file
    outfile.institution = 'Geoscience Australia'
    outfile.description = 'NetCDF format for compact and portable storage ' +\
                          'of spatial point data'
    
    # Dimension definitions.
    # Renamed from 'shape', which shadowed Numeric's shape() imported
    # at the top of this module.
    number_of_points = write_data_points.shape[0]
    outfile.createDimension('number_of_points', number_of_points) 
    outfile.createDimension('number_of_dimensions', 2) #This is 2d data
    
    # Variable definition
    outfile.createVariable('points', Float, ('number_of_points',
                                             'number_of_dimensions'))

    #create variables 
    outfile.variables['points'][:] = write_data_points #.astype(Float32)

    if write_attributes is not None:
        # One variable per attribute, one value per point
        for key in write_attributes.keys():
            outfile.createVariable(key, Float, ('number_of_points',))
            outfile.variables[key][:] = write_attributes[key] #.astype(Float32)
        
    if write_geo_reference is not None:
        write_NetCDF_georeference(write_geo_reference, outfile)
        
    outfile.close() 
1184 
1185def _write_csv_file(file_name,
1186                    write_data_points,
1187                    write_attributes=None,
1188                    as_lat_long=False,
1189                    delimiter=','):
1190    """
1191    export a file, file_name, with the csv format
1192   
1193    """
1194    points = write_data_points
1195    pointattributes = write_attributes
1196   
1197    fd = open(file_name,'w')
1198    if as_lat_long:
1199        titlelist = "latitude" + delimiter + "longitude"  + delimiter
1200    else:
1201        titlelist = "x" + delimiter + "y"  + delimiter
1202    if pointattributes is not None:   
1203        for title in pointattributes.keys():
1204            titlelist = titlelist + title + delimiter
1205        titlelist = titlelist[0:-len(delimiter)] # remove the last delimiter
1206    fd.write(titlelist+"\n")
1207   
1208    # <x/lat> <y/long> [attributes]
1209    for i, vert in enumerate( points):
1210
1211
1212        if pointattributes is not None:           
1213            attlist = ","
1214            for att in pointattributes.keys():
1215                attlist = attlist + str(pointattributes[att][i])+ delimiter
1216            attlist = attlist[0:-len(delimiter)] # remove the last delimiter
1217            attlist.strip()
1218        else:
1219            attlist = ''
1220
1221        fd.write(str(vert[0]) + delimiter +
1222                 str(vert[1]) + attlist + "\n")
1223
1224    fd.close()
1225 
def _write_urs_file(file_name,
                    points,
                    delimiter=' '):
    """
    Export points to file_name in urs format.

    points are (lat, long) pairs; the first line holds the number of
    points and each following line is '<lat> <long> <index>' with the
    coordinates rounded to 7 decimal places.
    """    
    out = open(file_name,'w')    
    out.write("%d\n" % len(points))
    # <lat> <long> <id#>
    for index, vert in enumerate(points):
        lat = str(round(vert[0], 7))
        lon = str(round(vert[1], 7))
        out.write(lat + delimiter + lon + delimiter + str(index) + "\n")
    out.close()
1241   
def _point_atts2array(point_atts):
    """Coerce 'pointlist' and every entry of 'attributelist' in
    point_atts to Float arrays, in place, and return the dictionary.
    """
    point_atts['pointlist'] = array(point_atts['pointlist']).astype(Float)
    
    attributes = point_atts['attributelist']
    for name in attributes.keys():
        attributes[name] = array(attributes[name]).astype(Float)
    return point_atts
1249
1250   
1251
1252
def geospatial_data2points_dictionary(geospatial_data):
    """Convert a Geospatial_data object to a points dictionary with the
    keys 'pointlist', 'attributelist' and 'geo_reference'.
    """

    attributes = {}
    for attribute_name in geospatial_data.attributes.keys():
        attributes[attribute_name] = geospatial_data.attributes[attribute_name]

    points_dictionary = {'pointlist': geospatial_data.data_points,
                         'attributelist': attributes,
                         'geo_reference': geospatial_data.geo_reference}

    return points_dictionary
1269
1270   
def points_dictionary2geospatial_data(points_dictionary):
    """Build a Geospatial_data object from a points dictionary.

    The dictionary must contain 'pointlist' and 'attributelist';
    'geo_reference' is optional (None when absent).
    """

    assert 'pointlist' in points_dictionary, \
           'Points dictionary must have key pointlist'
    assert 'attributelist' in points_dictionary, \
           'Points dictionary must have key attributelist'

    geo = points_dictionary.get('geo_reference', None)

    return Geospatial_data(points_dictionary['pointlist'],
                           points_dictionary['attributelist'],
                           geo_reference=geo)
1289
def clean_line(line,delimiter):      
    """Split line on delimiter, strip surrounding whitespace from each
    field and drop fields that were empty before stripping.

    Note: a field containing only whitespace is kept (as ''), matching
    the historical behaviour of this helper.
    """
    return [field.strip()
            for field in line.strip().split(delimiter)
            if field != '']
1308           
def ensure_absolute(points, geo_reference=None):
    """
    Accept points in several formats and return one: a numeric array
    of absolute coordinates.

    points: list/array of [x, y] pairs, a Geospatial_data object, or
            the name of a points file.
    geo_reference: Geo_reference object or (zone, easting, northing)
                   3-tuple; when given, the input coordinates are taken
                   to be relative to that origin.  Must not be combined
                   with a Geospatial_data input.
    """
    if isinstance(points, str):
        # Treat a string as the name of a point file
        points = Geospatial_data(file_name = points)
        
    if isinstance(points, Geospatial_data):
        points = points.get_data_points( \
                absolute = True)
        msg = "Use a Geospatial_data object or a mesh origin. Not both."
        assert geo_reference == None, msg
    else:
        points = ensure_numeric(points, Float)

    if geo_reference is not None:
        if isinstance(geo_reference, Geo_reference):
            origin = geo_reference
        else:
            origin = Geo_reference(geo_reference[0],
                                   geo_reference[1],
                                   geo_reference[2])
        points = origin.get_absolute(points)

    return points
1347     
1348
def ensure_geospatial(points, geo_reference=None):
    """
    Accept points in several formats and return one: a
    Geospatial_data instance.

    points: list/array of [x, y] pairs or an existing Geospatial_data
            object (returned unchanged).
    geo_reference: Geo_reference object or (zone, easting, northing)
                   3-tuple for relative coordinates.  Must not be
                   combined with a Geospatial_data input.
    """
    if isinstance(points, Geospatial_data):
        msg = "Use a Geospatial_data object or a mesh origin. Not both."
        assert geo_reference == None, msg
        return points

    points = ensure_numeric(points, Float)

    if geo_reference is None:
        geo = None
    elif isinstance(geo_reference, Geo_reference):
        geo = geo_reference
    else:
        geo = Geo_reference(geo_reference[0],
                            geo_reference[1],
                            geo_reference[2])

    return Geospatial_data(data_points=points, geo_reference=geo)
1380
1381
1382
1383def find_optimal_smoothing_parameter(data_file, 
1384                                     alpha_list=None,
1385                                     mesh_file=None,
1386                                     boundary_poly=None,
1387                                     mesh_resolution=100000,
1388                                     north_boundary=None,
1389                                     south_boundary=None,
1390                                     east_boundary=None,
1391                                     west_boundary=None,
1392                                     plot_name='all_alphas',
1393                                     split_factor=0.1,
1394                                     seed_num=None,
1395                                     cache=False,
1396                                     verbose=False
1397                                     ):
1398   
1399    """
1400    data_file: must not contain points outside the boundaries defined
1401           and it either a pts, txt or csv file.
1402   
1403    alpha_list: the alpha values to test in a single list
1404   
1405    mesh_file: name of the created mesh file or if passed in will read it.
1406            NOTE, if there is a mesh file mesh_resolution,
1407            north_boundary, south... etc will be ignored.
1408   
1409    mesh_resolution: the maximum area size for a triangle
1410   
1411    north_boundary... west_boundary: the value of the boundary
1412   
1413    plot_name: the name for the plot contain the results
1414   
1415    seed_num: the seed to the random number generator
1416   
1417    USAGE:
1418        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1419                                             alpha_list=[0.0001, 0.01, 1],
1420                                             mesh_file=None,
1421                                             mesh_resolution=3,
1422                                             north_boundary=5,
1423                                             south_boundary=-5,
1424                                             east_boundary=5,
1425                                             west_boundary=-5,
1426                                             plot_name='all_alphas',
1427                                             seed_num=100000,
1428                                             verbose=False)
1429   
1430    OUTPUT: returns the minumum normalised covalance calculate AND the
1431           alpha that created it. PLUS writes a plot of the results
1432           
1433    NOTE: code will not work if the data_file extend is greater than the
1434    boundary_polygon or the north_boundary...west_boundary
1435       
1436    """
1437
1438    from anuga.shallow_water import Domain
1439    from anuga.geospatial_data.geospatial_data import Geospatial_data
1440    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1441   
1442    from anuga.utilities.numerical_tools import cov
1443    from Numeric import array, resize,shape,Float,zeros,take,argsort,argmin
1444    from anuga.utilities.polygon import is_inside_polygon
1445    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1446   
1447
1448    attribute_smoothed='elevation'
1449
1450    if mesh_file is None:
1451        if verbose: print "building mesh"
1452        mesh_file='temp.msh'
1453
1454        if north_boundary is None or south_boundary is None or \
1455           east_boundary is None or west_boundary is None:
1456            no_boundary=True
1457        else:
1458            no_boundary=False
1459       
1460        if no_boundary is True:
1461            msg= 'All boundaries must be defined'
1462            raise msg
1463
1464        poly_topo = [[east_boundary,south_boundary],
1465                     [east_boundary,north_boundary],
1466                     [west_boundary,north_boundary],
1467                     [west_boundary,south_boundary]]
1468                     
1469        create_mesh_from_regions(poly_topo,
1470                                 boundary_tags={'back': [2],
1471                                                'side': [1,3],
1472                                                'ocean': [0]},
1473                             maximum_triangle_area=mesh_resolution,
1474                             filename=mesh_file,
1475                             use_cache=cache,
1476                             verbose=verbose)
1477
1478    else: # if mesh file provided
1479        #test mesh file exists?
1480        if verbose: "reading from file: %s" %mesh_file
1481        if access(mesh_file,F_OK) == 0:
1482            msg="file %s doesn't exist!" %mesh_file
1483            raise IOError, msg
1484
1485    #split topo data
1486    if verbose: print 'Reading elevation file: %s' %data_file
1487    G = Geospatial_data(file_name = data_file)
1488    if verbose: print 'Start split'
1489    G_small, G_other = G.split(split_factor,seed_num, verbose=verbose)
1490    if verbose: print 'Finish split'
1491    points=G_small.get_data_points()
1492
1493
1494
1495
1496    #FIXME: Remove points outside boundary polygon
1497#    print 'new point',len(points)
1498#   
1499#    new_points=[]
1500#    new_points=array([],typecode=Float)
1501#    new_points=resize(new_points,(len(points),2))
1502#    print "BOUNDARY", boundary_poly
1503#    for i,point in enumerate(points):
1504#        if is_inside_polygon(point,boundary_poly, verbose=True):
1505#            new_points[i] = point
1506#            print"WOW",i,new_points[i]
1507#    points = new_points
1508
1509   
1510    if verbose: print "Number of points in sample to compare: ", len(points)
1511   
1512    if alpha_list==None:
1513        alphas = [0.001,0.01,100]
1514        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,\
1515         #         0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1516       
1517    else:
1518        alphas=alpha_list
1519#    domains = {}
1520
1521    #creates array with columns 1 and 2 are x, y. column 3 is elevation
1522    #4 onwards is the elevation_predicted using the alpha, which will
1523    #be compared later against the real removed data
1524    data=array([],typecode=Float)
1525
1526    data=resize(data,(len(points),3+len(alphas)))
1527
1528    #gets relative point from sample
1529    data[:,0]=points[:,0]
1530    data[:,1]=points[:,1]
1531    elevation_sample=G_small.get_attributes(attribute_name=attribute_smoothed)
1532    data[:,2]=elevation_sample
1533
1534    normal_cov=array(zeros([len(alphas),2]),typecode=Float)
1535
1536
1537
1538
1539
1540    if verbose: print 'Setup computational domains with different alphas'
1541
1542    #print 'memory usage before domains',mem_usage()
1543
1544    for i,alpha in enumerate(alphas):
1545        #add G_other data to domains with different alphas
1546        if verbose:print '\n Calculating domain and mesh for Alpha = ',alpha,'\n'
1547        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1548        if verbose:print domain.statistics()
1549        domain.set_quantity(attribute_smoothed, 
1550                    geospatial_data = G_other,
1551                    use_cache = cache,
1552                    verbose = verbose,
1553                    alpha = alpha)
1554#        domains[alpha]=domain
1555
1556        points_geo=domain.geo_reference.change_points_geo_ref(points)
1557        #returns the predicted elevation of the points that were "split" out
1558        #of the original data set for one particular alpha
1559        if verbose: print 'Get predicted elevation for location to be compared'
1560        elevation_predicted=domain.quantities[attribute_smoothed].\
1561                            get_values(interpolation_points=points_geo)
1562 
1563        #add predicted elevation to array that starts with x, y, z...
1564        data[:,i+3]=elevation_predicted
1565
1566        sample_cov= cov(elevation_sample)
1567        #print elevation_predicted
1568        ele_cov= cov(elevation_sample-elevation_predicted)
1569        normal_cov[i,:]= [alpha,ele_cov/sample_cov]
1570        #print 'memory usage during compare',mem_usage()
1571       
1572        if verbose: print'Covariance for alpha ',normal_cov[i][0],'= ',normal_cov[i][1]
1573        if verbose: print'-------------------------------------------- \n'
1574#    if verbose: print 'Determine difference between predicted results and actual data'
1575#    for i,alpha in enumerate(domains):
1576#        if verbose: print'Alpha =',alpha
1577#       
1578#        points_geo=domains[alpha].geo_reference.change_points_geo_ref(points)
1579#        #returns the predicted elevation of the points that were "split" out
1580#        #of the original data set for one particular alpha
1581#        elevation_predicted=domains[alpha].quantities[attribute_smoothed].\
1582#                            get_values(interpolation_points=points_geo)
1583#
1584#        #add predicted elevation to array that starts with x, y, z...
1585#        data[:,i+3]=elevation_predicted
1586#
1587#        sample_cov= cov(elevation_sample)
1588#        #print elevation_predicted
1589#        ele_cov= cov(elevation_sample-elevation_predicted)
1590#        normal_cov[i,:]= [alpha,ele_cov/sample_cov]
1591#        print 'memory usage during compare',mem_usage()
1592#       
1593#
1594#        if verbose: print'cov',normal_cov[i][0],'= ',normal_cov[i][1]
1595
1596    normal_cov0=normal_cov[:,0]
1597    normal_cov_new=take(normal_cov,argsort(normal_cov0))
1598
1599    if plot_name is not None:
1600        from pylab import savefig,semilogx,loglog
1601        semilogx(normal_cov_new[:,0],normal_cov_new[:,1])
1602        loglog(normal_cov_new[:,0],normal_cov_new[:,1])
1603        savefig(plot_name,dpi=300)
1604    if mesh_file == 'temp.msh':
1605        remove(mesh_file)
1606   
1607    if verbose: 
1608        print 'Final results:'
1609        for i,alpha in enumerate(alphas):
1610            print'covariance for alpha %s = %s ' %(normal_cov[i][0],normal_cov[i][1])
1611        print '\n Optimal alpha is: %s ' % normal_cov_new[(argmin(normal_cov_new,axis=0))[1],0]
1612
1613    return min(normal_cov_new[:,1]) , normal_cov_new[(argmin(normal_cov_new,axis=0))[1],0]
1614
1615def old_find_optimal_smoothing_parameter(data_file, 
1616                                     alpha_list=None,
1617                                     mesh_file=None,
1618                                     boundary_poly=None,
1619                                     mesh_resolution=100000,
1620                                     north_boundary=None,
1621                                     south_boundary=None,
1622                                     east_boundary=None,
1623                                     west_boundary=None,
1624                                     plot_name='all_alphas',
1625                                     split_factor=0.1,
1626                                     seed_num=None,
1627                                     cache=False,
1628                                     verbose=False
1629                                     ):
1630   
1631    """
1632    data_file: must not contain points outside the boundaries defined
1633           and it either a pts, txt or csv file.
1634   
1635    alpha_list: the alpha values to test in a single list
1636   
1637    mesh_file: name of the created mesh file or if passed in will read it.
1638            NOTE, if there is a mesh file mesh_resolution,
1639            north_boundary, south... etc will be ignored.
1640   
1641    mesh_resolution: the maximum area size for a triangle
1642   
1643    north_boundary... west_boundary: the value of the boundary
1644   
1645    plot_name: the name for the plot contain the results
1646   
1647    seed_num: the seed to the random number generator
1648   
1649    USAGE:
1650        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1651                                             alpha_list=[0.0001, 0.01, 1],
1652                                             mesh_file=None,
1653                                             mesh_resolution=3,
1654                                             north_boundary=5,
1655                                             south_boundary=-5,
1656                                             east_boundary=5,
1657                                             west_boundary=-5,
1658                                             plot_name='all_alphas',
1659                                             seed_num=100000,
1660                                             verbose=False)
1661   
1662    OUTPUT: returns the minumum normalised covalance calculate AND the
1663           alpha that created it. PLUS writes a plot of the results
1664           
1665    NOTE: code will not work if the data_file extend is greater than the
1666    boundary_polygon or the north_boundary...west_boundary
1667       
1668    """
1669
1670    from anuga.shallow_water import Domain
1671    from anuga.geospatial_data.geospatial_data import Geospatial_data
1672    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1673   
1674    from anuga.utilities.numerical_tools import cov
1675    from Numeric import array, resize,shape,Float,zeros,take,argsort,argmin
1676    from anuga.utilities.polygon import is_inside_polygon
1677    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1678   
1679
1680    attribute_smoothed='elevation'
1681
1682    if mesh_file is None:
1683        mesh_file='temp.msh'
1684
1685        if north_boundary is None or south_boundary is None or \
1686           east_boundary is None or west_boundary is None:
1687            no_boundary=True
1688        else:
1689            no_boundary=False
1690       
1691        if no_boundary is True:
1692            msg= 'All boundaries must be defined'
1693            raise msg
1694
1695        poly_topo = [[east_boundary,south_boundary],
1696                     [east_boundary,north_boundary],
1697                     [west_boundary,north_boundary],
1698                     [west_boundary,south_boundary]]
1699                     
1700        create_mesh_from_regions(poly_topo,
1701                                 boundary_tags={'back': [2],
1702                                                'side': [1,3],
1703                                                'ocean': [0]},
1704                             maximum_triangle_area=mesh_resolution,
1705                             filename=mesh_file,
1706                             use_cache=cache,
1707                             verbose=verbose)
1708
1709    else: # if mesh file provided
1710        #test mesh file exists?
1711        if access(mesh_file,F_OK) == 0:
1712            msg="file %s doesn't exist!" %mesh_file
1713            raise IOError, msg
1714
1715    #split topo data
1716    G = Geospatial_data(file_name = data_file)
1717    if verbose: print 'start split'
1718    G_small, G_other = G.split(split_factor,seed_num, verbose=verbose)
1719    if verbose: print 'finish split'
1720    points=G_small.get_data_points()
1721
1722    #FIXME: Remove points outside boundary polygon
1723#    print 'new point',len(points)
1724#   
1725#    new_points=[]
1726#    new_points=array([],typecode=Float)
1727#    new_points=resize(new_points,(len(points),2))
1728#    print "BOUNDARY", boundary_poly
1729#    for i,point in enumerate(points):
1730#        if is_inside_polygon(point,boundary_poly, verbose=True):
1731#            new_points[i] = point
1732#            print"WOW",i,new_points[i]
1733#    points = new_points
1734
1735   
1736    if verbose: print "Number of points in sample to compare: ", len(points)
1737   
1738    if alpha_list==None:
1739        alphas = [0.001,0.01,100]
1740        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,\
1741         #         0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1742       
1743    else:
1744        alphas=alpha_list
1745    domains = {}
1746
1747    if verbose: print 'Setup computational domains with different alphas'
1748
1749    print 'memory usage before domains',mem_usage()
1750
1751    for alpha in alphas:
1752        #add G_other data to domains with different alphas
1753        if verbose:print '\n Calculating domain and mesh for Alpha = ',alpha,'\n'
1754        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1755        if verbose:print domain.statistics()
1756        domain.set_quantity(attribute_smoothed, 
1757                    geospatial_data = G_other,
1758                    use_cache = cache,
1759                    verbose = verbose,
1760                    alpha = alpha)
1761        domains[alpha]=domain
1762
1763    print 'memory usage after domains',mem_usage()
1764
1765    #creates array with columns 1 and 2 are x, y. column 3 is elevation
1766    #4 onwards is the elevation_predicted using the alpha, which will
1767    #be compared later against the real removed data
1768    data=array([],typecode=Float)
1769
1770    data=resize(data,(len(points),3+len(alphas)))
1771
1772    #gets relative point from sample
1773    data[:,0]=points[:,0]
1774    data[:,1]=points[:,1]
1775    elevation_sample=G_small.get_attributes(attribute_name=attribute_smoothed)
1776    data[:,2]=elevation_sample
1777
1778    normal_cov=array(zeros([len(alphas),2]),typecode=Float)
1779
1780    if verbose: print 'Determine difference between predicted results and actual data'
1781    for i,alpha in enumerate(domains):
1782        if verbose: print'Alpha =',alpha
1783       
1784        points_geo=domains[alpha].geo_reference.change_points_geo_ref(points)
1785        #returns the predicted elevation of the points that were "split" out
1786        #of the original data set for one particular alpha
1787        elevation_predicted=domains[alpha].quantities[attribute_smoothed].\
1788                            get_values(interpolation_points=points_geo)
1789 
1790        #add predicted elevation to array that starts with x, y, z...
1791        data[:,i+3]=elevation_predicted
1792
1793        sample_cov= cov(elevation_sample)
1794        #print elevation_predicted
1795        ele_cov= cov(elevation_sample-elevation_predicted)
1796        normal_cov[i,:]= [alpha,ele_cov/sample_cov]
1797        print 'memory usage during compare',mem_usage()
1798       
1799
1800        if verbose: print'cov',normal_cov[i][0],'= ',normal_cov[i][1]
1801
1802    normal_cov0=normal_cov[:,0]
1803    normal_cov_new=take(normal_cov,argsort(normal_cov0))
1804
1805    if plot_name is not None:
1806        from pylab import savefig,semilogx,loglog
1807        semilogx(normal_cov_new[:,0],normal_cov_new[:,1])
1808        loglog(normal_cov_new[:,0],normal_cov_new[:,1])
1809        savefig(plot_name,dpi=300)
1810    if mesh_file == 'temp.msh':
1811        remove(mesh_file)
1812   
1813    return min(normal_cov_new[:,1]) , normal_cov_new[(argmin(normal_cov_new,axis=0))[1],0]
1814
1815         
# This module is a library; it provides no command-line behaviour.
if __name__ == "__main__":
    pass
   
Note: See TracBrowser for help on using the repository browser.