source: anuga_core/source/anuga/geospatial_data/geospatial_data.py @ 5253

Last change on this file since 5253 was 5253, checked in by duncan, 16 years ago

Addition to URS_points_needed_to_file so it can do the Northern hemisphere.

File size: 64.6 KB
RevLine 
[4126]1"""Class Geospatial_data - Manipulation of locations on the planet and
2associated attributes.
3
4"""
[4178]5from sys import maxint
[4744]6from os import access, F_OK, R_OK,remove
[4126]7from types import DictType
[4150]8from warnings import warn
[4180]9from string import lower
[4126]10from Numeric import concatenate, array, Float, shape, reshape, ravel, take, \
11                        size, shape
[4501]12#from Array import tolist
[4816]13from RandomArray import randint, seed, get_seed
[4452]14from copy import deepcopy
15
[4126]16#from MA import tolist
17
[4175]18from Scientific.IO.NetCDF import NetCDFFile   
[4223]19from anuga.coordinate_transforms.lat_long_UTM_conversion import UTMtoLL   
[4126]20from anuga.utilities.numerical_tools import ensure_numeric
[4223]21from anuga.coordinate_transforms.geo_reference import Geo_reference, \
[4452]22     TitleError, DEFAULT_ZONE, ensure_geo_reference, write_NetCDF_georeference
[4126]23from anuga.coordinate_transforms.redfearn import convert_from_latlon_to_utm
[4180]24from anuga.utilities.anuga_exceptions import ANUGAError
[5003]25from anuga.config import points_file_block_line_size as MAX_READ_LINES 
26#from anuga.fit_interpolate.benchmark_least_squares import mem_usage
27 
[4126]28
[4535]29DEFAULT_ATTRIBUTE = 'elevation'
30
[4126]31class Geospatial_data:
32
    def __init__(self,
                 data_points=None, # this can also be a points file name
                 attributes=None,
                 geo_reference=None,
                 default_attribute_name=None,
                 file_name=None,
                 latitudes=None,
                 longitudes=None,
                 points_are_lats_longs=False,
                 max_read_lines=None,
                 load_file_now=True,
                 verbose=False):

       
        """
        Create instance from data points and associated attributes

        data_points: x,y coordinates in meters. Type must be either a
        sequence of 2-tuples or an Mx2 Numeric array of floats.  A file name
        with extension .txt, .csv or .pts can also be passed in here.

        attributes: Associated values for each data point. The type
        must be either a list or an array of length M or a dictionary
        of lists (or arrays) of length M. In the latter case the keys
        in the dictionary represent the attribute names, in the former
        the attribute will get the default name "elevation".
       
        geo_reference: Object representing the origin of the data
        points. It contains UTM zone, easting and northing and data
        points are assumed to be relative to this origin.
        If geo_reference is None, the default geo ref object is used.

        default_attribute_name: Name of default attribute to be used with
        get_attribute_values. The idea is that the dataset can be
        equipped with information about which attribute to return.
        If None, the default is the "first"

        latitudes, longitudes: Vectors of latitudes and longitudes,
        used to specify location instead of points.
       
        points_are_lats_longs: Set this as true if the points are actually
        lats and longs, not UTM

        max_read_lines: The number of rows read into memory when using
        blocking to read a file.

        load_file_now:  If true the file is automatically loaded
        into the geospatial instance. Used when blocking.
       
        file_name: Name of input netCDF file or .txt file. netCDF file must
        have dimensions "points" etc.
        .txt file is a comma separated file with x, y and attribute
        data.
       
        The first line has the titles of the columns.  The first two
        column titles are checked to see if they start with lat or
        long (not case sensitive).  If so the data is assumed to be
        latitude and longitude, in decimal format and converted to
        UTM.  Otherwise the first two columns are assumed to be the x
        and y, and the title names actually used are ignored.
 
       
        The format for a .txt file is:
            1st line:     [column names]
            other lines:  x y [attributes]

            for example:
            x, y, elevation, friction
            0.6, 0.7, 4.9, 0.3
            1.9, 2.8, 5, 0.3
            2.7, 2.4, 5.2, 0.3

        The first two columns have to be x, y or lat, long
        coordinates.
     
       
        The format for a Points dictionary is:

          ['pointlist'] a 2 column array describing points. 1st column x,
          2nd column y.
          ['attributelist'], a dictionary of 1D arrays, representing
          attribute values at the point.  The dictionary key is the attribute
          header.
          ['geo_reference'] a Geo_reference object. Use if the point
          information is relative. This is optional.
            eg
            dic['pointlist'] = [[1.0,2.0],[3.0,5.0]]
            dic['attributelist']['elevation'] = [[7.0,5.0]
               
        verbose: If True, progress messages are printed to stdout.
        """

        # A string as first argument is taken to be a points file name.
        if isinstance(data_points, basestring):
            # assume data point is really a file name
            file_name = data_points

        self.set_verbose(verbose)
        self.geo_reference=None #create the attribute
        self.file_name = file_name
       
        # Fall back on the configured block size when none is given.
        if max_read_lines is None:
            self.max_read_lines = MAX_READ_LINES
        else:
            self.max_read_lines = max_read_lines

        if file_name is None:
            # In-memory construction: optionally convert lat/long input
            # to UTM points plus a georeference first.
            if latitudes is not None or longitudes is not None or \
                   points_are_lats_longs:
                data_points, geo_reference =  \
                             _set_using_lat_long(latitudes=latitudes,
                                  longitudes=longitudes,
                                  geo_reference=geo_reference,
                                  data_points=data_points,
                                  points_are_lats_longs=points_are_lats_longs)
            self.check_data_points(data_points)
            self.set_attributes(attributes)
            self.set_geo_reference(geo_reference)
            self.set_default_attribute_name(default_attribute_name)

        elif load_file_now is True:
            # watch for case where file name and points,
            # attributes etc are provided!!
            # if file name then all provided info will be removed!

            if verbose is True:
                if file_name is not None:
                    print 'Loading Geospatial data from file: %s' %file_name
           
            self.import_points_file(file_name, verbose=verbose)
               
            self.check_data_points(self.data_points)
            self.set_attributes(self.attributes) 
            self.set_geo_reference(self.geo_reference)
            self.set_default_attribute_name(default_attribute_name)

        if verbose is True:
            if file_name is not None:
                print 'Geospatial data created from file: %s' %file_name
                if load_file_now is False:
                    print 'Data will be loaded blockwise on demand'

                    if file_name.endswith('csv') or file_name.endswith('txt'):
                        print 'ASCII formats are not that great for '
                        print 'blockwise reading. Consider storing this'
                        print 'data as a pts NetCDF format'
    def __len__(self):
        """Return the number of data points."""
        return len(self.data_points)
    def __repr__(self):
        """String form: the absolute coordinates of all data points."""
        return str(self.get_data_points(absolute=True))
   
   
    def check_data_points(self, data_points):
        """Check and store data points.

        Stores ensure_numeric(data_points) in self.data_points and
        verifies the result is an Mx2 array (an empty point set with
        shape (0,) is also accepted).

        Raises ValueError if data_points is None.
        """
       
        if data_points is None:
            self.data_points = None
            msg = 'There is no data or file provided!'
            raise ValueError, msg
           
        else:
            self.data_points = ensure_numeric(data_points)
            # An empty point set has shape (0,); otherwise insist on
            # a two-dimensional array with exactly two columns (x, y).
            if not (0,) == self.data_points.shape:
                assert len(self.data_points.shape) == 2
                assert self.data_points.shape[1] == 2
204    def set_attributes(self, attributes):
205        """Check and assign attributes dictionary
206        """
207       
208        if attributes is None:
209            self.attributes = None
210            return
211       
212        if not isinstance(attributes, DictType):
213            #Convert single attribute into dictionary
[4535]214            attributes = {DEFAULT_ATTRIBUTE: attributes}
[4126]215
216        #Check input attributes   
217        for key in attributes.keys():
218            try:
219                attributes[key] = ensure_numeric(attributes[key])
220            except:
221                msg = 'Attribute %s could not be converted' %key
222                msg += 'to a numeric vector'
223                raise msg
224
225        self.attributes = attributes   
226
227
228    def set_geo_reference(self, geo_reference):
[4452]229        """
230        Set's the georeference of geospatial.
231        It can also be used to change the georeference
232        """
[4126]233        from anuga.coordinate_transforms.geo_reference import Geo_reference
234
235        if geo_reference is None:
236            geo_reference = Geo_reference() # Use default
[4452]237        geo_reference = ensure_geo_reference(geo_reference)
[4126]238        if not isinstance(geo_reference, Geo_reference):
239            msg = 'Argument geo_reference must be a valid Geo_reference \n'
240            msg += 'object or None.'
241            raise msg
242
243        # if a geo ref already exists, change the point data to
244        # represent the new geo-ref
245        if  self.geo_reference is not None:
246            #FIXME: Maybe put out a warning here...
247            self.data_points = self.get_data_points \
248                               (geo_reference=geo_reference)
249           
250        self.geo_reference = geo_reference
251
252
    def set_default_attribute_name(self, default_attribute_name):
        # Record which attribute get_attributes() should return when it
        # is called without an explicit attribute name (may be None).
        self.default_attribute_name = default_attribute_name
256    def set_verbose(self, verbose=False):
257        if verbose in [False, True]:
258            self.verbose = verbose
259        else:
260            msg = 'Illegal value: %s' %str(verbose)
[4255]261            raise Exception, msg
[4126]262
    def clip(self, polygon, closed=True, verbose=False):
        """Clip geospatial data by a polygon

        Input
          polygon - Either a list of points, an Nx2 array or
                    a Geospatial data object.
          closed - (optional) determine whether points on boundary should be
          regarded as belonging to the polygon (closed = True)
          or not (closed = False). Default is True.
           
        Output
          New geospatial data object representing points inside
          specified polygon.
       
        """

        from anuga.utilities.polygon import inside_polygon

        if isinstance(polygon, Geospatial_data):
            # Polygon is an object - extract points
            polygon = polygon.get_data_points()

        points = self.get_data_points()   
        inside_indices = inside_polygon(points, polygon, closed, verbose)

        # get_sample carries both the selected points and their
        # attribute values over into a new Geospatial_data object.
        clipped_G = self.get_sample(inside_indices)

        return clipped_G
       
       
    def clip_outside(self, polygon, closed=True,verbose=False):
        """Clip geospatial data by a polygon, keeping data OUTSIDE of polygon

        Input
          polygon - Either a list of points, an Nx2 array or
                    a Geospatial data object.
          closed - (optional) determine whether points on boundary should be
          regarded as belonging to the polygon (closed = True)
          or not (closed = False). Default is True.
         
        Output
          Geospatial data object representing points OUTSIDE specified polygon
       
        """

        from anuga.utilities.polygon import outside_polygon

        if isinstance(polygon, Geospatial_data):
            # Polygon is an object - extract points
            polygon = polygon.get_data_points()

        points = self.get_data_points()   
        outside_indices = outside_polygon(points, polygon, closed,verbose)

        # get_sample carries both the selected points and their
        # attribute values over into a new Geospatial_data object.
        clipped_G = self.get_sample(outside_indices)

        return clipped_G

   
    def get_geo_reference(self):
        """Return the Geo_reference object associated with this data."""
        return self.geo_reference
       
[4223]348    def get_data_points(self, absolute=True, geo_reference=None,
[4641]349                        as_lat_long=False,isSouthHemisphere=True):
[4202]350        """Get coordinates for all data points as an Nx2 array
[4126]351
[4194]352        If absolute is False returned coordinates are relative to the
353        internal georeference's xll and yll corners, otherwise
354        absolute UTM coordinates are returned.
[4126]355
356        If a geo_reference is passed the points are returned relative
357        to that geo_reference.
[4640]358       
359        isSH (isSouthHemisphere) is only used when getting data
360        points "as_lat_long" is True and if FALSE will return lats and
361        longs valid for the Northern Hemisphere.
[4126]362
363        Default: absolute is True.
364        """
[4223]365        if as_lat_long is True:
366            msg = "Points need a zone to be converted into lats and longs"
367            assert self.geo_reference is not None, msg
368            zone = self.geo_reference.get_zone()
369            assert self.geo_reference.get_zone() is not DEFAULT_ZONE, msg
370            lats_longs = []
371            for point in self.get_data_points(True):
372                ### UTMtoLL(northing, easting, zone,
[4640]373                lat_calced, long_calced = UTMtoLL(point[1],point[0], 
[4641]374                                                  zone, isSouthHemisphere)
[4225]375                lats_longs.append((lat_calced, long_calced)) # to hash
[4223]376            return lats_longs
[4126]377        if absolute is True and geo_reference is None:
378            return self.geo_reference.get_absolute(self.data_points)
379        elif geo_reference is not None:
380            return geo_reference.change_points_geo_ref \
381                               (self.data_points, 
382                                self.geo_reference)
383        else:
384            return self.data_points
[5146]385
[4126]386   
    def get_attributes(self, attribute_name=None):
        """Return values for one named attribute.

        If attribute_name is None, default_attribute_name is used;
        if that is also None, the first key of the attributes
        dictionary is used.

        Raises AssertionError if the name is not a known attribute.
        """

        if attribute_name is None:
            if self.default_attribute_name is not None:
                attribute_name = self.default_attribute_name
            else:
                # Fall back on an arbitrary ("first") attribute.
                attribute_name = self.attributes.keys()[0] 
       
        if self.verbose is True:
            print 'Using attribute %s' %attribute_name
            print 'Available attributes: %s' %(self.attributes.keys())       

        msg = 'Attribute name %s does not exist in data set' %attribute_name
        assert self.attributes.has_key(attribute_name), msg

        return self.attributes[attribute_name]
    def get_all_attributes(self):
        """
        Return values for all attributes.
        The return value is either None or a dictionary (possibly empty)
        mapping attribute names to numeric vectors.
        """

        return self.attributes
    def __add__(self, other):
        """
        Returns the addition of 2 geospatial objects,
        objects are concatenated to the end of each other
           
        NOTE: doesn't add if objects contain different
        attributes 
       
        Always return absolute points!
        This also means, that if you add None to the object,
        it will be turned into absolute coordinates

        other can be None in which case nothing is added to self.
        """


        # find objects zone and checks if the same
        geo_ref1 = self.get_geo_reference()
        zone1 = geo_ref1.get_zone()


        if other is not None:

            geo_ref2 = other.get_geo_reference()
            zone2 = geo_ref2.get_zone()

            # Make the zones agree (raises if they cannot be reconciled).
            geo_ref1.reconcile_zones(geo_ref2)

       
            new_points = concatenate((self.get_data_points(absolute=True),
                                      other.get_data_points(absolute=True)),
                                      axis = 0)       

       
     
            # Concatenate attributes if any
            if self.attributes is None:
                if other.attributes is not None:
                    msg = 'Geospatial data must have the same \n'
                    msg += 'attributes to allow addition.'
                    raise Exception, msg
               
                new_attributes = None
            else:   
                new_attributes = {}
                for x in self.attributes.keys():
                    if other.attributes.has_key(x):

                        attrib1 = self.attributes[x]
                        attrib2 = other.attributes[x]
                        new_attributes[x] = concatenate((attrib1, attrib2))

                    else:
                        # Both operands must carry identical attribute keys.
                        msg = 'Geospatial data must have the same \n'
                        msg += 'attributes to allow addition.'
                        raise Exception, msg


        else:
            #other is None:
           
            new_points = self.get_data_points(absolute=True)
            new_attributes = self.attributes

                   

        # Instantiate new data object and return absolute coordinates
        new_geo_ref = Geo_reference(geo_ref1.get_zone(), 0.0, 0.0)
        return Geospatial_data(new_points,
                               new_attributes,
                               new_geo_ref)


490    def __radd__(self, other):
491        """Handle cases like None + Geospatial_data(...)
492        """
493
494        return self + other
495
[4126]496   
[4549]497   
[4126]498    ###
499    #  IMPORT/EXPORT POINTS FILES
500    ###
501
    def import_points_file(self, file_name, delimiter=None, verbose=False):
        """Load a .txt, .csv or .pts file into this instance.

        Note: will throw an IOError/SyntaxError if it can't load the file.
        Catch these!

        Post condition: self.data_points, self.attributes and
        self.geo_reference have been set from the file contents.
        """
       
        if access(file_name, F_OK) == 0 :
            msg = 'File %s does not exist or is not accessible' %file_name
            raise IOError, msg
       
        attributes = {}
        if file_name[-4:]== ".pts":
            # Binary NetCDF points file.
            try:
                data_points, attributes, geo_reference =\
                             _read_pts_file(file_name, verbose)
            except IOError, e:   
                msg = 'Could not open file %s ' %file_name
                raise IOError, msg 
       
        elif file_name[-4:]== ".txt" or file_name[-4:]== ".csv":
            # ASCII comma separated points file.
            try:
                data_points, attributes, geo_reference =\
                             _read_csv_file(file_name, verbose)
            except IOError, e:
                # This should only be if a file is not found
                msg = 'Could not open file %s. ' %file_name
                msg += 'Check the file location.'
                raise IOError, msg
            except SyntaxError, e:
                # This should only be if there is a format error
                # NOTE(review): Error_message appears to be a module
                # level dictionary defined elsewhere in this file.
                msg = 'Could not open file %s. \n' %file_name
                msg += Error_message['IOError']
                raise SyntaxError, msg
        else:     
            msg = 'Extension %s is unknown' %file_name[-4:]
            raise IOError, msg

        self.data_points = data_points
        self.attributes = attributes
        self.geo_reference = geo_reference
   
   
    def export_points_file(self, file_name, absolute=True, 
                           as_lat_long=False, isSouthHemisphere=True):
       
        """
        write a points file, file_name, as a text (.csv) or binary (.pts) file
        file_name is the file name, including the extension
        The point_dict is defined at the top of this file.
       
        If absolute is True data the xll and yll are added to the points value
        and the xll and yll of the geo_reference are set to 0.
       
        If absolute is False data points are returned relative to the xll
        and yll and geo_reference remains unaffected
       
        isSouthHemisphere: is only used when getting data
        points "as_lat_long" is True and if FALSE will return lats and
        longs valid for the Northern Hemisphere.
       
        """

        if (file_name[-4:] == ".pts"):
            if absolute is True:
                # Fold the georeference offsets into the point values
                # and write a georeference with a zero origin.
                geo_ref = deepcopy(self.geo_reference)
                geo_ref.xllcorner = 0
                geo_ref.yllcorner = 0
                _write_pts_file(file_name,
                                self.get_data_points(absolute), 
                                self.get_all_attributes(),
                                geo_ref)
            else:
                _write_pts_file(file_name,
                                self.get_data_points(absolute), 
                                self.get_all_attributes(),
                                self.get_geo_reference())
               
        elif file_name[-4:] == ".txt" or file_name[-4:] == ".csv":
            msg = "ERROR: trying to write a .txt file with relative data."
            assert absolute, msg
            _write_csv_file(file_name,
                            self.get_data_points(absolute=True,
                                                 as_lat_long=as_lat_long,
                                  isSouthHemisphere=isSouthHemisphere), 
                            self.get_all_attributes(),
                            as_lat_long=as_lat_long)
             
        elif file_name[-4:] == ".urs" :
            msg = "ERROR: Can not write a .urs file as a relative file."
            assert absolute, msg
            # .urs files always hold lat/long coordinates.
            _write_urs_file(file_name,
                            self.get_data_points(as_lat_long=True,
                                  isSouthHemisphere=isSouthHemisphere))
                                                         
        else:
            msg = 'Unknown file type %s ' %file_name
            raise IOError, msg
       
    def get_sample(self, indices):
        """ Returns an object which is a subset of the original
        and the data points and attributes in this new object refer to
        the indices provided
       
        Input
            indices - a list of integers that represent the new object
        Output
            New geospatial data object representing points specified by
            the indices
            """
        #FIXME: add the geo_reference to this
        points = self.get_data_points()
        sampled_points = take(points, indices)

        attributes = self.get_all_attributes()

        # Subset every attribute vector with the same indices so point
        # rows and attribute values stay aligned.
        sampled_attributes = {}
        if attributes is not None:
            for key, att in attributes.items():
                sampled_attributes[key] = take(att, indices)

        return Geospatial_data(sampled_points, sampled_attributes)
   
   
    def split(self, factor=0.5,seed_num=None, verbose=False):
        """Returns two
        geospatial_data objects; the first is 'factor' times the size
        of the original and the second is the remainder. The two
        new objects are disjoint sets of each other.
       
        Points of the two new objects are selected RANDOMLY.
       
        This method creates two lists of indices which are passed into
        get_sample.  The lists are created using random numbers, and
        they are unique sets eg.  total_list(1,2,3,4,5,6,7,8,9)
        random_list(1,3,6,7,9) and remainder_list(0,2,4,5,8)
               
        Input - the factor by which to split the object; if 0.1 then 10% of
            the object will be returned in the first result

        seed_num - optional random seed, mainly for reproducible unit tests.
       
        Output - two geospatial_data objects that are disjoint sets of the
            original
            """
       
        i=0
        self_size = len(self)
        random_list = []
        remainder_list = []
        new_size = round(factor*self_size)
       
        # Find unique random numbers
        if verbose: print "make unique random number list and get indices"

        total=array(range(self_size))
        total_list = total.tolist()
        if verbose: print "total list len",len(total_list)
               
        # There will be repeated random numbers, however this will not be
        # a problem as they are being 'pop'ed out of the array, so if
        # there are two numbers the same they will pop different indices,
        # still basically random
        ## create list of non-unique random numbers
        if verbose: print "create random numbers list %s long" %new_size
       
        # Set seed if provided, mainly important for unit test!
        # plus recalculate seed when no seed provided.
        if seed_num != None:
            seed(seed_num,seed_num)
        else:
            seed()
        if verbose: print "seed:", get_seed()
       
        random_num = randint(0,self_size-1,(int(new_size),))
        random_num = random_num.tolist()

        # need to sort and reverse so the pop() works correctly
        random_num.sort()
        random_num.reverse()       
       
        if verbose: print "make random number list and get indices"
        j=0
        k=1
        remainder_list = total_list[:]
        # pops array index (random_num) from remainder_list
        # (which starts as the total_list) and appends to random_list
        random_num_len = len(random_num)
        for i in random_num:
            random_list.append(remainder_list.pop(i))
            j+=1
            # prints progress
            if verbose and round(random_num_len/10*k)==j:
                print '(%s/%s)' %(j, random_num_len)
                k+=1
       
        # FIXME: move to tests, it might take a long time
        # then create an array of random length between 500 and 1000,
        # and use a random factor between 0 and 1
        # setup for assertion
        test_total = random_list[:]
        test_total.extend(remainder_list)
        test_total.sort() 
        msg = 'The two random lists made from the original list when added '\
         'together DO NOT equal the original list'
        assert (total_list==test_total),msg

        # Get new samples
        if verbose: print "get values of indices for random list"
        G1 = self.get_sample(random_list)
        if verbose: print "get values of indices for opposite of random list"
        G2 = self.get_sample(remainder_list)

        return G1, G2

    def __iter__(self):
        """Read in the header, number_of_points and save the
        file pointer position.

        Sets up the blocking state (block size, row counters, open file
        handle) so that next() can yield one Geospatial_data block per
        call.
        """

        from Scientific.IO.NetCDF import NetCDFFile
       
        #FIXME - what to do if the file isn't there

        # FIXME (Ole): Shouldn't this go into the constructor?
        # This method acts like the constructor when blocking.
        # ... and shouldn't it be called block_size?
        #
        if self.max_read_lines is None:
            self.max_read_lines = MAX_READ_LINES
       
        if self.file_name[-4:] == ".pts":
           
            # See if the file is there.  Throw a QUIET IO error if it isn't
            fd = open(self.file_name,'r')
            fd.close()
   
            # Throws prints to screen if file not present
            self.fid = NetCDFFile(self.file_name, 'r')
           
            self.blocking_georef, self.blocking_keys, self.number_of_points =\
                                  _read_pts_file_header(self.fid,
                                                        self.verbose)
            self.start_row = 0
            self.last_row = self.number_of_points           
            self.show_verbose = 0
            self.verbose_block_size = (self.last_row + 10)/10
            self.block_number = 0
            self.number_of_blocks = self.number_of_points/self.max_read_lines
            # This computes the number of full blocks. The last block may be
            # smaller and won't be included in this estimate.
           
            if self.verbose is True:
                print 'Reading %d points (in ~%d blocks) from file %s. '\
                      %(self.number_of_points,
                        self.number_of_blocks,
                        self.file_name),
                print 'Each block consists of %d data points'\
                      %self.max_read_lines
           
        else:
            # Assume the file is a csv file
            file_pointer = open(self.file_name)
            self.header, self.file_pointer = \
                         _read_csv_file_header(file_pointer)
            self.blocking_georef = None # Used for reconciling zones

        return self
   
   
    def next(self):
        """Read the next block of points and return it as a new
        Geospatial_data instance.

        Raises StopIteration when the file is exhausted, after closing
        the file and deleting the blocking attributes set up by
        __iter__().
        """
        
        if self.file_name[-4:] == ".pts":
            if self.start_row == self.last_row:
                # Read the end of the file last iteration
                # Remove blocking attributes
                self.fid.close()
                del self.max_read_lines
                del self.blocking_georef
                del self.last_row
                del self.start_row
                del self.blocking_keys
                del self.fid
                raise StopIteration
            # Next block is [start_row, fin_row), clipped to end of file.
            fin_row = self.start_row + self.max_read_lines
            if fin_row > self.last_row:
                fin_row = self.last_row

            if self.verbose is True:
                # Only print when the next progress milestone falls
                # inside this block.
                if self.show_verbose >= self.start_row and \
                       self.show_verbose < fin_row:

                    print 'Reading block %d (points %d to %d) out of %d'\
                          %(self.block_number,
                            self.start_row,
                            fin_row,
                            self.number_of_blocks)

                    # Schedule the next progress message.
                    self.show_verbose += max(self.max_read_lines,
                                             self.verbose_block_size)

                 
            # Read next block
            pointlist, att_dict, = \
                       _read_pts_file_blocking(self.fid,
                                               self.start_row,
                                               fin_row,
                                               self.blocking_keys)
            
            geo = Geospatial_data(pointlist, att_dict, self.blocking_georef)
            self.start_row = fin_row
            
            self.block_number += 1           
            
        else:
            # Assume the file is a csv file
            try:
                pointlist, att_dict, geo_ref, self.file_pointer = \
                   _read_csv_file_blocking( self.file_pointer,
                                            self.header[:],
                                            max_read_lines=self.max_read_lines,
                                            verbose=self.verbose)

                # Check that the zones haven't changed.
                if geo_ref is not None:
                    geo_ref.reconcile_zones(self.blocking_georef)
                    self.blocking_georef = geo_ref
                elif self.blocking_georef is not None:
                    
                    msg = 'Geo reference given, then not given.'
                    msg += ' This should not happen.' 
                    raise ValueError, msg
                geo = Geospatial_data(pointlist, att_dict, geo_ref)
            except StopIteration:
                # End of file: release blocking state before propagating.
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                raise StopIteration
            except ANUGAError:
                # e.g. inconsistent zones; clean up, then re-raise.
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                raise
            except SyntaxError:
                self.file_pointer.close()
                del self.header
                del self.file_pointer
                # This should only be if there is a format error
                msg = 'Could not open file %s. \n' %self.file_name
                msg += Error_message['IOError']
                raise SyntaxError, msg
        return geo
[4776]870
871   
##################### Error messages ###########
# Canned error text keyed by error category, used when reporting
# malformed input files to the user.
Error_message = {}
Em = Error_message  # Shorthand alias kept for backwards compatibility
Em['IOError'] = (
    "NOTE: The format for a comma separated .txt/.csv file is:\n"
    "        1st line:     [column names]\n"
    "        other lines:  [x value], [y value], [attributes]\n"
    "\n"
    "           for example:\n"
    "           x, y, elevation, friction\n"
    "           0.6, 0.7, 4.9, 0.3\n"
    "           1.9, 2.8, 5, 0.3\n"
    "           2.7, 2.4, 5.2, 0.3\n"
    "\n"
    "The first two columns are assumed to be x, y coordinates.\n"
    "The attribute values must be numeric.\n"
)
[4126]887
[4180]888def _set_using_lat_long(latitudes,
889                        longitudes,
890                        geo_reference,
891                        data_points,
892                        points_are_lats_longs):
893    """
894    if the points has lat long info, assume it is in (lat, long) order.
895    """
896   
897    if geo_reference is not None:
898        msg = """A georeference is specified yet latitude and longitude
899        are also specified!"""
900        raise ValueError, msg
901   
902    if data_points is not None and not points_are_lats_longs:
903        msg = """Data points are specified yet latitude and
[5253]904        longitude are also specified."""
[4180]905        raise ValueError, msg
906   
907    if points_are_lats_longs:
908        if data_points is None:
[5253]909            msg = """Data points are not specified."""
[4180]910            raise ValueError, msg
911        lats_longs = ensure_numeric(data_points)
912        latitudes = ravel(lats_longs[:,0:1])
913        longitudes = ravel(lats_longs[:,1:])
914       
915    if latitudes is None and longitudes is None:
[5253]916        msg = """Latitudes and Longitudes are not specified."""
[4180]917        raise ValueError, msg
918   
919    if latitudes is None:
[5253]920        msg = """Longitudes are specified yet latitudes aren't."""
[4180]921        raise ValueError, msg
922   
923    if longitudes is None:
[5253]924        msg = """Latitudes are specified yet longitudes aren't."""
[4180]925        raise ValueError, msg
926   
927    data_points, zone  = convert_from_latlon_to_utm(latitudes=latitudes,
928                                                    longitudes=longitudes)
929    return data_points, Geo_reference(zone=zone)
[4569]930
[4180]931   
[4126]932def _read_pts_file(file_name, verbose=False):
933    """Read .pts NetCDF file
934   
935    Return a dic of array of points, and dic of array of attribute
936    eg
937    dic['points'] = [[1.0,2.0],[3.0,5.0]]
938    dic['attributelist']['elevation'] = [[7.0,5.0]
939    """   
940
941    from Scientific.IO.NetCDF import NetCDFFile
942   
943    if verbose: print 'Reading ', file_name
944   
945       
[4776]946    # See if the file is there.  Throw a QUIET IO error if it isn't
[4126]947    fd = open(file_name,'r')
948    fd.close()
949   
[4776]950    # Throws prints to screen if file not present
[4126]951    fid = NetCDFFile(file_name, 'r') 
952   
953    pointlist = array(fid.variables['points'])
954    keys = fid.variables.keys()
955    if verbose: print 'Got %d variables: %s' %(len(keys), keys)
956    try:
957        keys.remove('points')
958    except IOError, e:       
959        fid.close()   
960        msg = 'Expected keyword "points" but could not find it'
961        raise IOError, msg
962   
963    attributes = {}
964    for key in keys:
965        if verbose: print "reading attribute '%s'" %key
966           
967        attributes[key] = array(fid.variables[key])
968   
969   
970    try:
971        geo_reference = Geo_reference(NetCDFObject=fid)
972    except AttributeError, e:
973        geo_reference = None
974   
975    fid.close()
976   
977    return pointlist, attributes, geo_reference
978
979
def _read_csv_file(file_name, verbose=False):
    """Read .csv file
    
    Return (pointlist, att_dict, geo_ref) where pointlist is an Nx2
    array of points, att_dict maps attribute names to arrays of values
    and geo_ref is a Geo_reference or None.
    """
    
    file_pointer = open(file_name)
    header, file_pointer = _read_csv_file_header(file_pointer)
    try:
        # Read the whole file as a single huge "block".
        # FIXME (Ole) What's up here? (max_read_lines=1e30 effectively
        # disables blocking)
        pointlist, att_dict, geo_ref, file_pointer = \
                   _read_csv_file_blocking( \
                file_pointer,
                header,
                max_read_lines=1e30)
    finally:
        # Bug fix: previously the file was only closed on success or
        # ANUGAError; other exceptions (e.g. SyntaxError, StopIteration)
        # leaked the file handle.
        file_pointer.close()
    return pointlist, att_dict, geo_ref   
[4126]1003
CSV_DELIMITER = ','

def _read_csv_file_header(file_pointer,
                          delimiter=CSV_DELIMITER,
                          verbose=False):
    """Read the header (first) line of an open .csv file.

    Returns (header, file_pointer) where header is the list of cleaned
    column titles.
    """
    first_line = file_pointer.readline()
    header = clean_line(first_line, delimiter)
    return header, file_pointer
1015
1016def _read_csv_file_blocking(file_pointer, header,
1017                            delimiter=CSV_DELIMITER,
[4129]1018                            max_read_lines=MAX_READ_LINES,
[4126]1019                            verbose=False):
1020   
1021
1022    """
1023    Read the body of a .csv file.
1024    header: The list header of the csv file, with the x and y labels.
1025    """
1026    points = []
1027    pointattributes = []
1028    att_dict = {}
1029
[4776]1030    # This is to remove the x and y headers.
[4129]1031    header = header[:]
[4470]1032    try:
1033        x_header = header.pop(0)
1034        y_header = header.pop(0)
1035    except IndexError:
1036        # if there are not two columns this will occur.
1037        # eg if it is a space seperated file
1038        raise SyntaxError
[4126]1039   
1040    read_lines = 0
1041    while read_lines<max_read_lines:
1042        line = file_pointer.readline()
1043        #print "line",line
1044        numbers = clean_line(line,delimiter)
1045        if len(numbers) <= 1:
1046            break
1047        if line[0] == '#':
1048            continue
1049        read_lines += 1
[4470]1050        try:
1051            x = float(numbers[0])
1052            y = float(numbers[1])
1053            points.append([x,y])
1054            numbers.pop(0)
1055            numbers.pop(0)
1056            if len(header) != len(numbers):
1057                file_pointer.close() 
[4661]1058                msg = "File load error.  \
1059                There might be a problem with the file header"
[4470]1060                raise SyntaxError, msg
1061            for i,num in enumerate(numbers):
1062                num.strip()
1063                if num != '\n' and num != '':
1064                    #attributes.append(float(num))
1065                    att_dict.setdefault(header[i],[]).append(float(num))
1066        #except IOError:           
1067        except ValueError:
1068            raise SyntaxError
[4126]1069    if points == []:
1070        raise StopIteration
1071       
1072   
1073    pointlist = array(points).astype(Float)
1074    for key in att_dict.keys():
1075        att_dict[key] = array(att_dict[key]).astype(Float)
[4180]1076
[4776]1077    # Do stuff here so the info is in lat's and longs
[4180]1078    geo_ref = None
1079    x_header = lower(x_header[:3])
1080    y_header = lower(y_header[:3])
1081    if (x_header == 'lon' or  x_header == 'lat') and \
1082       (y_header == 'lon' or  y_header == 'lat'):
1083        if x_header == 'lon':
1084            longitudes = ravel(pointlist[:,0:1])
1085            latitudes = ravel(pointlist[:,1:])
1086        else:
1087            latitudes = ravel(pointlist[:,0:1])
1088            longitudes = ravel(pointlist[:,1:])
[4126]1089       
[4180]1090        pointlist, geo_ref = _set_using_lat_long(latitudes,
1091                                                 longitudes,
1092                                                 geo_reference=None,
1093                                                 data_points=None,
1094                                                 points_are_lats_longs=False)
1095    return pointlist, att_dict, geo_ref, file_pointer
[4126]1096
[4175]1097def _read_pts_file_header(fid, verbose=False):
1098
[4569]1099    """Read the geo_reference and number_of_points from a .pts file
[4175]1100    """
1101   
1102    keys = fid.variables.keys()
1103    try:
1104        keys.remove('points')
1105    except IOError, e:       
1106        fid.close()   
1107        msg = 'Expected keyword "points" but could not find it'
1108        raise IOError, msg
1109    if verbose: print 'Got %d variables: %s' %(len(keys), keys)
1110   
1111    try:
1112        geo_reference = Geo_reference(NetCDFObject=fid)
1113    except AttributeError, e:
1114        geo_reference = None
1115
1116    return geo_reference, keys, fid.dimensions['number_of_points']
1117
def _read_pts_file_blocking(fid, start_row, fin_row, keys):
    """Read rows [start_row, fin_row) from an open .pts NetCDF file.

    keys: the attribute variable names, as returned by
          _read_pts_file_header.

    Returns (pointlist, attributes) for just that slice of the file.
    """
    
    variables = fid.variables
    pointlist = array(variables['points'][start_row:fin_row])
    
    # One array per attribute, restricted to the same row range.
    attributes = dict((name, array(variables[name][start_row:fin_row]))
                      for name in keys)

    return pointlist, attributes
1134   
def _write_pts_file(file_name,
                    write_data_points,
                    write_attributes=None, 
                    write_geo_reference=None):
    """Write a .pts NetCDF file.

    write_data_points: Nx2 array of point coordinates.
    write_attributes: optional dict {name: N values}.  Each attribute is
        stored as its own NetCDF variable, so 'points' is not a legal
        attribute name.
    write_geo_reference: optional Geo_reference written into the file.

    NOTE: Below might not be valid ask Duncan : NB 5/2006
    WARNING: This function mangles the point_atts data structure
    #F??ME: (DSG)This format has issues.
    # There can't be an attribute called points
    # consider format change
    """   
    from Scientific.IO.NetCDF import NetCDFFile

    # NetCDF file definition
    outfile = NetCDFFile(file_name, 'w')

    # File metadata
    outfile.institution = 'Geoscience Australia'
    outfile.description = 'NetCDF format for compact and portable storage ' +\
                          'of spatial point data'

    # Dimensions: one entry per point, two coordinates (2d data)
    number_of_points = write_data_points.shape[0]
    outfile.createDimension('number_of_points', number_of_points)
    outfile.createDimension('number_of_dimensions', 2)

    # The points themselves
    outfile.createVariable('points', Float, ('number_of_points',
                                             'number_of_dimensions'))
    outfile.variables['points'][:] = write_data_points

    # One variable per attribute
    if write_attributes is not None:
        for name in write_attributes.keys():
            outfile.createVariable(name, Float, ('number_of_points',))
            outfile.variables[name][:] = write_attributes[name]

    if write_geo_reference is not None:
        write_NetCDF_georeference(write_geo_reference, outfile)

    outfile.close() 
1186 
[4154]1187def _write_csv_file(file_name,
1188                    write_data_points,
[4225]1189                    write_attributes=None,
1190                    as_lat_long=False,
[4154]1191                    delimiter=','):
1192    """
[4513]1193    export a file, file_name, with the csv format
[4126]1194   
[4154]1195    """
1196    points = write_data_points
1197    pointattributes = write_attributes
1198   
1199    fd = open(file_name,'w')
[4225]1200    if as_lat_long:
1201        titlelist = "latitude" + delimiter + "longitude"  + delimiter
1202    else:
1203        titlelist = "x" + delimiter + "y"  + delimiter
[4154]1204    if pointattributes is not None:   
1205        for title in pointattributes.keys():
1206            titlelist = titlelist + title + delimiter
1207        titlelist = titlelist[0:-len(delimiter)] # remove the last delimiter
1208    fd.write(titlelist+"\n")
1209   
[4225]1210    # <x/lat> <y/long> [attributes]
[4154]1211    for i, vert in enumerate( points):
1212
1213
1214        if pointattributes is not None:           
1215            attlist = ","
1216            for att in pointattributes.keys():
1217                attlist = attlist + str(pointattributes[att][i])+ delimiter
1218            attlist = attlist[0:-len(delimiter)] # remove the last delimiter
1219            attlist.strip()
1220        else:
1221            attlist = ''
1222
1223        fd.write(str(vert[0]) + delimiter +
1224                 str(vert[1]) + attlist + "\n")
1225
1226    fd.close()
[4238]1227 
1228def _write_urs_file(file_name,
1229                    points,
1230                    delimiter=' '):
1231    """
1232    export a file, file_name, with the urs format
1233    the data points are in lats and longs
[4154]1234   
[4238]1235    """   
1236    fd = open(file_name,'w')   
1237    fd.write(str(len(points))+"\n")   
1238    # <lat> <long> <id#>
1239    for i, vert in enumerate( points):
1240        fd.write(str(round(vert[0],7)) + delimiter + \
[4349]1241                 str(round(vert[1],7)) + delimiter +str(i)+ "\n")
[4238]1242    fd.close()
1243   
def _point_atts2array(point_atts):
    """Coerce 'pointlist' and every 'attributelist' entry of a points
    dictionary to Float arrays, in place, and return the dictionary.
    """
    point_atts['pointlist'] = array(point_atts['pointlist']).astype(Float)

    attributes = point_atts['attributelist']
    for name in attributes.keys():
        attributes[name] = array(attributes[name]).astype(Float)
    return point_atts
1251
1252   
1253
1254
def geospatial_data2points_dictionary(geospatial_data):
    """Convert a Geospatial_data instance to a points dictionary with
    keys 'pointlist', 'attributelist' and 'geo_reference'.
    """

    # Copy the attributes into a fresh dictionary.
    attributes = {}
    for name, values in geospatial_data.attributes.items():
        attributes[name] = values

    return {'pointlist': geospatial_data.data_points,
            'attributelist': attributes,
            'geo_reference': geospatial_data.geo_reference}
1271
1272   
def points_dictionary2geospatial_data(points_dictionary):
    """Convert a points dictionary (keys 'pointlist', 'attributelist'
    and optionally 'geo_reference') to a Geospatial_data object.
    """

    # Both mandatory keys must be present.
    msg = 'Points dictionary must have key pointlist' 
    assert 'pointlist' in points_dictionary, msg

    msg = 'Points dictionary must have key attributelist'     
    assert 'attributelist' in points_dictionary, msg       

    # The geo reference is optional.
    geo = points_dictionary.get('geo_reference', None)

    return Geospatial_data(points_dictionary['pointlist'],
                           points_dictionary['attributelist'],
                           geo_reference = geo)
1291
def clean_line(line,delimiter):     
    """Split 'line' on 'delimiter' and strip whitespace from each field.

    Fields that are empty *before* stripping are dropped; a field that
    is only whitespace is kept and becomes ''.  Returns the list of
    cleaned fields.
    """
    fields = line.strip().split(delimiter)
    return [field.strip() for field in fields if field != '']
1310           
def ensure_absolute(points, geo_reference=None):
    """Return the input as a numeric array of absolute coordinates.

    points: list/array of [x, eta] coordinate pairs, a Geospatial_data
            object, or the name of a points file.
    geo_reference: Geo_reference object or (zone, easting, northing)
            3-tuple the points are relative to.  Must not be combined
            with a Geospatial_data object.
    """
    if isinstance(points, type('')):
        # It's a string - assume it is a point file name.
        points = Geospatial_data(file_name = points)

    if isinstance(points, Geospatial_data):
        points = points.get_data_points(absolute = True)
        msg = "Use a Geospatial_data object or a mesh origin. Not both."
        assert geo_reference == None, msg
    else:
        points = ensure_numeric(points, Float)

    if geo_reference is not None:
        if isinstance(geo_reference, Geo_reference):
            geo = geo_reference
        else:
            # Assume a (zone, easting, northing) 3-tuple.
            geo = Geo_reference(geo_reference[0],
                                geo_reference[1],
                                geo_reference[2])
        points = geo.get_absolute(points)
    return points
1349     
1350
def ensure_geospatial(points, geo_reference=None):
    """Return the input as a Geospatial_data instance.

    points: list/array of [x, eta] coordinate pairs, or an existing
            Geospatial_data object (in which case geo_reference must be
            None and the object is returned unchanged).
    geo_reference: Geo_reference object or (zone, easting, northing)
            3-tuple the points are relative to.
    """
    if isinstance(points, Geospatial_data):
        msg = "Use a Geospatial_data object or a mesh origin. Not both."
        assert geo_reference == None, msg
        return points

    points = ensure_numeric(points, Float)
    if geo_reference is None:
        geo = None
    elif isinstance(geo_reference, Geo_reference):
        geo = geo_reference
    else:
        # Assume a (zone, easting, northing) 3-tuple.
        geo = Geo_reference(geo_reference[0],
                            geo_reference[1],
                            geo_reference[2])
    points = Geospatial_data(data_points=points, geo_reference=geo)
    return points
1382
[4642]1383
1384
1385def find_optimal_smoothing_parameter(data_file, 
[4744]1386                                     alpha_list=None,
[4642]1387                                     mesh_file=None,
[4847]1388                                     boundary_poly=None,
[4642]1389                                     mesh_resolution=100000,
1390                                     north_boundary=None,
1391                                     south_boundary=None,
1392                                     east_boundary=None,
1393                                     west_boundary=None,
[4745]1394                                     plot_name='all_alphas',
[4879]1395                                     split_factor=0.1,
[4744]1396                                     seed_num=None,
1397                                     cache=False,
1398                                     verbose=False
[4642]1399                                     ):
[4744]1400   
1401    """
[5146]1402    Removes a small random sample of points from 'data_file'. Then creates
1403    models with different alpha values from 'alpha_list' and cross validates
1404    the predicted value to the previously removed point data. Returns the
1405    alpha value which has the smallest covariance.
1406   
[4744]1407    data_file: must not contain points outside the boundaries defined
1408           and it either a pts, txt or csv file.
1409   
1410    alpha_list: the alpha values to test in a single list
1411   
[4816]1412    mesh_file: name of the created mesh file or if passed in will read it.
1413            NOTE, if there is a mesh file mesh_resolution,
1414            north_boundary, south... etc will be ignored.
[4744]1415   
1416    mesh_resolution: the maximum area size for a triangle
1417   
1418    north_boundary... west_boundary: the value of the boundary
1419   
1420    plot_name: the name for the plot contain the results
1421   
1422    seed_num: the seed to the random number generator
1423   
1424    USAGE:
[4816]1425        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
[4744]1426                                             alpha_list=[0.0001, 0.01, 1],
1427                                             mesh_file=None,
1428                                             mesh_resolution=3,
1429                                             north_boundary=5,
1430                                             south_boundary=-5,
1431                                             east_boundary=5,
1432                                             west_boundary=-5,
1433                                             plot_name='all_alphas',
1434                                             seed_num=100000,
1435                                             verbose=False)
1436   
1437    OUTPUT: returns the minumum normalised covalance calculate AND the
1438           alpha that created it. PLUS writes a plot of the results
[4847]1439           
[5146]1440    NOTE: code will not work if the data_file extent is greater than the
1441    boundary_polygon or any of the boundaries, eg north_boundary...west_boundary
[4744]1442       
1443    """
[4642]1444
1445    from anuga.shallow_water import Domain
1446    from anuga.geospatial_data.geospatial_data import Geospatial_data
1447    from anuga.pmesh.mesh_interface import create_mesh_from_regions
[4126]1448   
[4642]1449    from anuga.utilities.numerical_tools import cov
[4744]1450    from Numeric import array, resize,shape,Float,zeros,take,argsort,argmin
[4847]1451    from anuga.utilities.polygon import is_inside_polygon
[5003]1452    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1453   
[4126]1454
[4744]1455    attribute_smoothed='elevation'
1456
[4642]1457    if mesh_file is None:
[5010]1458        if verbose: print "building mesh"
[4642]1459        mesh_file='temp.msh'
[4744]1460
[4816]1461        if north_boundary is None or south_boundary is None or \
1462           east_boundary is None or west_boundary is None:
1463            no_boundary=True
1464        else:
1465            no_boundary=False
[4642]1466       
[4816]1467        if no_boundary is True:
1468            msg= 'All boundaries must be defined'
1469            raise msg
[4642]1470
[4816]1471        poly_topo = [[east_boundary,south_boundary],
1472                     [east_boundary,north_boundary],
1473                     [west_boundary,north_boundary],
1474                     [west_boundary,south_boundary]]
1475                     
1476        create_mesh_from_regions(poly_topo,
1477                                 boundary_tags={'back': [2],
1478                                                'side': [1,3],
1479                                                'ocean': [0]},
1480                             maximum_triangle_area=mesh_resolution,
1481                             filename=mesh_file,
1482                             use_cache=cache,
1483                             verbose=verbose)
1484
1485    else: # if mesh file provided
1486        #test mesh file exists?
[5010]1487        if verbose: "reading from file: %s" %mesh_file
[4816]1488        if access(mesh_file,F_OK) == 0:
1489            msg="file %s doesn't exist!" %mesh_file
1490            raise IOError, msg
1491
[4642]1492    #split topo data
[5010]1493    if verbose: print 'Reading elevation file: %s' %data_file
[4744]1494    G = Geospatial_data(file_name = data_file)
[5010]1495    if verbose: print 'Start split'
[4879]1496    G_small, G_other = G.split(split_factor,seed_num, verbose=verbose)
[5010]1497    if verbose: print 'Finish split'
[4744]1498    points=G_small.get_data_points()
[4847]1499
[4744]1500    if verbose: print "Number of points in sample to compare: ", len(points)
1501   
1502    if alpha_list==None:
1503        alphas = [0.001,0.01,100]
1504        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,\
1505         #         0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1506       
1507    else:
1508        alphas=alpha_list
[5003]1509
1510    #creates array with columns 1 and 2 are x, y. column 3 is elevation
1511    #4 onwards is the elevation_predicted using the alpha, which will
1512    #be compared later against the real removed data
1513    data=array([],typecode=Float)
1514
1515    data=resize(data,(len(points),3+len(alphas)))
1516
1517    #gets relative point from sample
1518    data[:,0]=points[:,0]
1519    data[:,1]=points[:,1]
1520    elevation_sample=G_small.get_attributes(attribute_name=attribute_smoothed)
1521    data[:,2]=elevation_sample
1522
1523    normal_cov=array(zeros([len(alphas),2]),typecode=Float)
1524
1525    if verbose: print 'Setup computational domains with different alphas'
1526
1527    #print 'memory usage before domains',mem_usage()
1528
1529    for i,alpha in enumerate(alphas):
1530        #add G_other data to domains with different alphas
1531        if verbose:print '\n Calculating domain and mesh for Alpha = ',alpha,'\n'
1532        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
1533        if verbose:print domain.statistics()
1534        domain.set_quantity(attribute_smoothed, 
1535                    geospatial_data = G_other,
1536                    use_cache = cache,
1537                    verbose = verbose,
1538                    alpha = alpha)
1539
1540        points_geo=domain.geo_reference.change_points_geo_ref(points)
1541        #returns the predicted elevation of the points that were "split" out
1542        #of the original data set for one particular alpha
[5010]1543        if verbose: print 'Get predicted elevation for location to be compared'
[5003]1544        elevation_predicted=domain.quantities[attribute_smoothed].\
1545                            get_values(interpolation_points=points_geo)
1546 
1547        #add predicted elevation to array that starts with x, y, z...
1548        data[:,i+3]=elevation_predicted
1549
1550        sample_cov= cov(elevation_sample)
1551        #print elevation_predicted
1552        ele_cov= cov(elevation_sample-elevation_predicted)
1553        normal_cov[i,:]= [alpha,ele_cov/sample_cov]
1554        #print 'memory usage during compare',mem_usage()
1555       
[5010]1556        if verbose: print'Covariance for alpha ',normal_cov[i][0],'= ',normal_cov[i][1]
1557        if verbose: print'-------------------------------------------- \n'
[5003]1558
1559    normal_cov0=normal_cov[:,0]
1560    normal_cov_new=take(normal_cov,argsort(normal_cov0))
1561
1562    if plot_name is not None:
1563        from pylab import savefig,semilogx,loglog
1564        semilogx(normal_cov_new[:,0],normal_cov_new[:,1])
1565        loglog(normal_cov_new[:,0],normal_cov_new[:,1])
1566        savefig(plot_name,dpi=300)
1567    if mesh_file == 'temp.msh':
1568        remove(mesh_file)
1569   
[5004]1570    if verbose: 
1571        print 'Final results:'
[5010]1572        for i,alpha in enumerate(alphas):
[5004]1573            print'covariance for alpha %s = %s ' %(normal_cov[i][0],normal_cov[i][1])
1574        print '\n Optimal alpha is: %s ' % normal_cov_new[(argmin(normal_cov_new,axis=0))[1],0]
1575
[5146]1576    # covariance and optimal alpha
[5003]1577    return min(normal_cov_new[:,1]) , normal_cov_new[(argmin(normal_cov_new,axis=0))[1],0]
1578
1579def old_find_optimal_smoothing_parameter(data_file, 
1580                                     alpha_list=None,
1581                                     mesh_file=None,
1582                                     boundary_poly=None,
1583                                     mesh_resolution=100000,
1584                                     north_boundary=None,
1585                                     south_boundary=None,
1586                                     east_boundary=None,
1587                                     west_boundary=None,
1588                                     plot_name='all_alphas',
1589                                     split_factor=0.1,
1590                                     seed_num=None,
1591                                     cache=False,
1592                                     verbose=False
1593                                     ):
1594   
1595    """
1596    data_file: must not contain points outside the boundaries defined
1597           and it either a pts, txt or csv file.
1598   
1599    alpha_list: the alpha values to test in a single list
1600   
1601    mesh_file: name of the created mesh file or if passed in will read it.
1602            NOTE, if there is a mesh file mesh_resolution,
1603            north_boundary, south... etc will be ignored.
1604   
1605    mesh_resolution: the maximum area size for a triangle
1606   
1607    north_boundary... west_boundary: the value of the boundary
1608   
1609    plot_name: the name for the plot contain the results
1610   
1611    seed_num: the seed to the random number generator
1612   
1613    USAGE:
1614        value, alpha = find_optimal_smoothing_parameter(data_file=fileName,
1615                                             alpha_list=[0.0001, 0.01, 1],
1616                                             mesh_file=None,
1617                                             mesh_resolution=3,
1618                                             north_boundary=5,
1619                                             south_boundary=-5,
1620                                             east_boundary=5,
1621                                             west_boundary=-5,
1622                                             plot_name='all_alphas',
1623                                             seed_num=100000,
1624                                             verbose=False)
1625   
1626    OUTPUT: returns the minumum normalised covalance calculate AND the
1627           alpha that created it. PLUS writes a plot of the results
1628           
1629    NOTE: code will not work if the data_file extend is greater than the
1630    boundary_polygon or the north_boundary...west_boundary
1631       
1632    """
1633
1634    from anuga.shallow_water import Domain
1635    from anuga.geospatial_data.geospatial_data import Geospatial_data
1636    from anuga.pmesh.mesh_interface import create_mesh_from_regions
1637   
1638    from anuga.utilities.numerical_tools import cov
1639    from Numeric import array, resize,shape,Float,zeros,take,argsort,argmin
1640    from anuga.utilities.polygon import is_inside_polygon
1641    from anuga.fit_interpolate.benchmark_least_squares import mem_usage
1642   
1643
1644    attribute_smoothed='elevation'
1645
1646    if mesh_file is None:
1647        mesh_file='temp.msh'
1648
1649        if north_boundary is None or south_boundary is None or \
1650           east_boundary is None or west_boundary is None:
1651            no_boundary=True
1652        else:
1653            no_boundary=False
1654       
1655        if no_boundary is True:
1656            msg= 'All boundaries must be defined'
1657            raise msg
1658
1659        poly_topo = [[east_boundary,south_boundary],
1660                     [east_boundary,north_boundary],
1661                     [west_boundary,north_boundary],
1662                     [west_boundary,south_boundary]]
1663                     
1664        create_mesh_from_regions(poly_topo,
1665                                 boundary_tags={'back': [2],
1666                                                'side': [1,3],
1667                                                'ocean': [0]},
1668                             maximum_triangle_area=mesh_resolution,
1669                             filename=mesh_file,
1670                             use_cache=cache,
1671                             verbose=verbose)
1672
1673    else: # if mesh file provided
1674        #test mesh file exists?
1675        if access(mesh_file,F_OK) == 0:
1676            msg="file %s doesn't exist!" %mesh_file
1677            raise IOError, msg
1678
1679    #split topo data
1680    G = Geospatial_data(file_name = data_file)
1681    if verbose: print 'start split'
1682    G_small, G_other = G.split(split_factor,seed_num, verbose=verbose)
1683    if verbose: print 'finish split'
1684    points=G_small.get_data_points()
1685
1686    #FIXME: Remove points outside boundary polygon
1687#    print 'new point',len(points)
1688#   
1689#    new_points=[]
1690#    new_points=array([],typecode=Float)
1691#    new_points=resize(new_points,(len(points),2))
1692#    print "BOUNDARY", boundary_poly
1693#    for i,point in enumerate(points):
1694#        if is_inside_polygon(point,boundary_poly, verbose=True):
1695#            new_points[i] = point
1696#            print"WOW",i,new_points[i]
1697#    points = new_points
1698
1699   
1700    if verbose: print "Number of points in sample to compare: ", len(points)
1701   
1702    if alpha_list==None:
1703        alphas = [0.001,0.01,100]
1704        #alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01,\
1705         #         0.1, 1.0, 10.0, 100.0,1000.0,10000.0]
1706       
1707    else:
1708        alphas=alpha_list
[4642]1709    domains = {}
1710
[4744]1711    if verbose: print 'Setup computational domains with different alphas'
[5003]1712
1713    print 'memory usage before domains',mem_usage()
1714
[4642]1715    for alpha in alphas:
[4744]1716        #add G_other data to domains with different alphas
[4847]1717        if verbose:print '\n Calculating domain and mesh for Alpha = ',alpha,'\n'
1718        domain = Domain(mesh_file, use_cache=cache, verbose=verbose)
[4744]1719        if verbose:print domain.statistics()
[4642]1720        domain.set_quantity(attribute_smoothed, 
1721                    geospatial_data = G_other,
[4847]1722                    use_cache = cache,
[4744]1723                    verbose = verbose,
[4642]1724                    alpha = alpha)
1725        domains[alpha]=domain
1726
[5003]1727    print 'memory usage after domains',mem_usage()
1728
[4744]1729    #creates array with columns 1 and 2 are x, y. column 3 is elevation
1730    #4 onwards is the elevation_predicted using the alpha, which will
1731    #be compared later against the real removed data
[4642]1732    data=array([],typecode=Float)
[4744]1733
[4642]1734    data=resize(data,(len(points),3+len(alphas)))
[4744]1735
[4642]1736    #gets relative point from sample
1737    data[:,0]=points[:,0]
1738    data[:,1]=points[:,1]
1739    elevation_sample=G_small.get_attributes(attribute_name=attribute_smoothed)
1740    data[:,2]=elevation_sample
1741
1742    normal_cov=array(zeros([len(alphas),2]),typecode=Float)
1743
[4847]1744    if verbose: print 'Determine difference between predicted results and actual data'
[4744]1745    for i,alpha in enumerate(domains):
[4847]1746        if verbose: print'Alpha =',alpha
1747       
[4744]1748        points_geo=domains[alpha].geo_reference.change_points_geo_ref(points)
1749        #returns the predicted elevation of the points that were "split" out
1750        #of the original data set for one particular alpha
[4847]1751        elevation_predicted=domains[alpha].quantities[attribute_smoothed].\
1752                            get_values(interpolation_points=points_geo)
1753 
1754        #add predicted elevation to array that starts with x, y, z...
[4744]1755        data[:,i+3]=elevation_predicted
[4642]1756
1757        sample_cov= cov(elevation_sample)
[4744]1758        #print elevation_predicted
[4642]1759        ele_cov= cov(elevation_sample-elevation_predicted)
[4744]1760        normal_cov[i,:]= [alpha,ele_cov/sample_cov]
[5003]1761        print 'memory usage during compare',mem_usage()
1762       
[4642]1763
[4744]1764        if verbose: print'cov',normal_cov[i][0],'= ',normal_cov[i][1]
[4745]1765
[4642]1766    normal_cov0=normal_cov[:,0]
1767    normal_cov_new=take(normal_cov,argsort(normal_cov0))
[4745]1768
1769    if plot_name is not None:
1770        from pylab import savefig,semilogx,loglog
1771        semilogx(normal_cov_new[:,0],normal_cov_new[:,1])
1772        loglog(normal_cov_new[:,0],normal_cov_new[:,1])
1773        savefig(plot_name,dpi=300)
[4816]1774    if mesh_file == 'temp.msh':
1775        remove(mesh_file)
[4642]1776   
[4744]1777    return min(normal_cov_new[:,1]) , normal_cov_new[(argmin(normal_cov_new,axis=0))[1],0]
[4642]1778
[4126]1779         
if __name__ == "__main__":
    # This module is a library; it has no command-line behaviour.
    pass
[4126]1782   
Note: See TracBrowser for help on using the repository browser.