Changeset 4963


Ignore:
Timestamp:
Jan 21, 2008, 6:58:15 PM (17 years ago)
Author:
ole
Message:

Work on IP tracking:
Checksums, removed DOCTYPE for the time being, formatted, and more checks

Location:
anuga_core/source/anuga/utilities
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • anuga_core/source/anuga/utilities/data_audit.py

    r4954 r4963  
    55from os.path import join, splitext
    66
    7 from anuga.utilities.xml_tools import parse, print_tree, get_elements, get_text
     7from anuga.utilities.xml_tools import parse, pretty_print_tree, get_elements, get_text
     8from anuga.utilities.system_tools import compute_checksum
    89
    910# Audit exceptions
    1011class NotPublishable(Exception): pass
     12class FilenameMismatch(Exception): pass
     13class CRCMismatch(Exception): pass
    1114class Invalid(Exception): pass
    1215class WrongTags(Exception): pass
    1316
     17audit_exceptions = (NotPublishable, FilenameMismatch, CRCMismatch, Invalid, WrongTags)
    1418
    1519def IP_verified(directory):
     
    4246    all_files_accounted_for = True
    4347    for dirpath, datafile in identify_datafiles(directory):
    44         print join(dirpath, datafile) + ': ',
     48        filename = join(dirpath, datafile)
     49       
     50        print filename + ' (Checksum=%s): ' %str(compute_checksum(filename)),
    4551
    4652        basename, ext = splitext(datafile)
     
    5359            all_files_accounted_for = False
    5460        else:
    55             if license_file_is_valid(fid):
     61            try:
     62                license_file_is_valid(fid, dirpath, verbose=False)
     63            except audit_exceptions, e:
     64                all_files_accounted_for = False                               
     65                print 'LICENSE FILE NOT VALID'
     66                print 'REASON:', e
     67
     68                #doc = parse(fid)
     69                #pretty_print_tree(doc)
     70                fid.seek(0)
     71                print fid.read()
     72
     73            else:       
    5674                print 'OK'
    57             else:
    58                 print 'LICENSE FILE NOT VALID'
    59                 all_files_accounted_for = False
     75
    6076            fid.close()
    6177
     
    115131
    116132
    117 def license_file_is_valid(fid):
     133def license_file_is_valid(fid, dirpath, verbose=False):
    118134    """Check that XML license file is valid
    119135    """
    120136
     137    license_filename = fid.name
    121138    doc = parse(fid)
    122139    #print_tree(doc)
     
    127144
    128145    if doc.nodeName != '#document':
    129         msg = 'License file %s does not appear' %fid.name
     146        msg = 'License file %s does not appear' %license_filename
    130147        msg += 'to be a valid XML document'
    131148        msg += 'The root node has name %s' %doc.nodeName
     
    133150        raise Invalid, msg       
    134151
    135     if len(doc.childNodes) != 2:
    136         msg = 'License file %s must have two elements' %fid.name
    137         msg += ' at the root level. They are\n '
    138         msg += '<?xml version="1.0" encoding="iso-8859-1"?>\n'
     152    if len(doc.childNodes) != 1:
     153        msg = 'License file %s must have only one element' %license_filename
     154        msg += ' at the root level. It is\n '
    139155        msg += '<ga_license_file>'
    140156        raise Invalid, msg
     
    142158
    143159    # Start looking at document in earnest
    144     root_node = doc.childNodes[1]
     160    root_node = doc.childNodes[0]
    145161    if root_node.nodeName != 'ga_license_file':
    146         msg = 'License file %s must have two elements' %fid.name
     162        msg = 'License file %s must have two elements' %license_filename
    147163        msg += ' at the root level. They are\n '
    148164        msg += '<?xml version="1.0" encoding="iso-8859-1"?>\n'
     
    168184            raise WrongTags, msg       
    169185
    170     print
     186    if verbose: print   
    171187    # Extract information for source section
    172188    for node in get_elements(elements[0].childNodes):
    173189        if node.nodeName == 'author':
    174190            # Do something
    175             print 'Author is', get_text(node.childNodes)
     191            if verbose: print 'Author:   ', get_text(node.childNodes)
    176192
    177193        if node.nodeName == 'svn_keywords':
     
    181197    # Extract information for datafile sections
    182198    for datanode in elements[1:]:
    183         print   
     199        if verbose: print
    184200   
    185201        for node in get_elements(datanode.childNodes):
     
    189205            if node.nodeName == 'filename':
    190206                # FIXME Check correctness
    191                 print 'Filename is "%s"' %get_text(node.childNodes)
     207                filename = join(dirpath, get_text(node.childNodes))
     208                if verbose: print 'Filename: "%s"' %filename
     209                try:
     210                    fid = open(filename, 'r')
     211                except:
     212                    msg = 'Specified filename %s could not be opened'\
     213                          %filename
     214                    raise FilenameMismatch, msg
     215
     216            if node.nodeName == 'checksum':
     217                # FIXME (Ole): This relies on crc being preceded by filename
     218                reported_crc = get_text(node.childNodes)
     219                if verbose: print 'Checksum: "%s"' %reported_crc
     220
     221                file_crc = str(compute_checksum(filename))
     222
     223                if reported_crc != file_crc:
     224                    msg = 'Bad checksum (CRC).\n'
     225                    msg += '  The CRC reported in license file "%s" is "%s"\n'\
     226                          %(license_filename, reported_crc)
     227                    msg += '  The CRC computed from file "%s" is "%s"'\
     228                           %(filename, file_crc)
     229                    raise CRCMismatch, msg
     230               
    192231
    193232            if node.nodeName == 'accountable':
    194                 print 'Accountable is "%s"' %get_text(node.childNodes)
     233                accountable = get_text(node.childNodes)
     234                if verbose: print 'Accountable: "%s"' %accountable
     235                if accountable == "":
     236                    msg = 'No accountable person specified'
     237                    raise Exception, msg
    195238
    196239            if node.nodeName == 'source':
    197                 print 'Source is "%s"' %get_text(node.childNodes)
     240                source = get_text(node.childNodes)
     241                if verbose: print 'Source: "%s"' %source
     242                if source == "":
     243                    msg = 'No source specified'
     244                    raise Exception, msg               
    198245
    199246            if node.nodeName == 'IP_owner':
    200                 print 'IP owner is "%s"' %get_text(node.childNodes)
     247                ip_owner = get_text(node.childNodes)
     248                if verbose: print 'IP owner: "%s"' %ip_owner
     249                if ip_owner == "":
     250                    msg = 'No IP owner specified'
     251                    raise Exception, msg                               
     252               
    201253
    202254            if node.nodeName == 'IP_info':
    203                 print 'IP info is "%s"' %get_text(node.childNodes)                               
     255                if verbose: print 'IP info: "%s"' %get_text(node.childNodes) 
    204256               
    205257
    206258            if node.nodeName == 'publishable':
     259               
     260                if verbose: print 'Publishable: %s' %fid.name               
    207261                value = get_text(node.childNodes)
    208262                if value.upper() != 'YES':
    209263                    msg = 'Data file %s is not flagged as publishable'\
    210264                          %fid.name
    211                     print msg
    212                     #raise NotPublishable, msg
    213                 else:
    214                     print 'Data file %s is flagged publishable' %fid.name               
    215 
    216     #FIXME (Ole): Use hash code for original datafile as an XML element
    217     # USE CRC32 in zlib or hash
    218    
    219     #for node in elements:
    220     #    print node
    221     #print
    222 
    223 
    224 
    225     # Check that file is deemed publishable
    226     items = doc.getElementsByTagName('publishable')
    227     for i in items:
    228         print i
    229         #i.getAttribute()
     265                    raise NotPublishable, msg
     266
     267
     268
     269    # If we get this far, the license file is OK
     270    return True
  • anuga_core/source/anuga/utilities/mainland_only.lic

    r4954 r4963  
    11<?xml version="1.0" encoding="iso-8859-1"?>
    2 
    3   <!DOCTYPE ga_license_file [
    4       <!ELEMENT ga_license_file (source, datafile+)>
    5       <!ELEMENT metadata (author, svn_keywords)>
    6       <!ELEMENT svn_keywords (author, date, revision, url, id)>   
    7       <!ELEMENT datafile (filename, publishable, accountable,
    8                         owner, location, IP_info)>   
    9       <!ELEMENT filename (#PCDATA)>
    10       <!ELEMENT publishable (#PCDATA)>   
    11       <!ELEMENT accountable (#PCDATA)>       
    12       <!ELEMENT source (#PCDATA)>                               
    13       <!ELEMENT IP_owner (#PCDATA)>           
    14       <!ELEMENT IP_info (#PCDATA)>               
    15   ]>
    162
    173  <ga_license_file>
     
    2814    <datafile>
    2915      <filename>mainland_only.csv</filename>
     16      <checksum>-1661725548</checksum>
    3017      <publishable>No</publishable>
    3118      <accountable>Jane Sexton</accountable>
    3219      <source>Unknown</source>
    3320      <IP_owner>Geoscience Australia</IP_owner>
    34       <IP_info>This is a polygon tracing the coastline at Dampier WA. The origin and license issues are undecided</IP_info>
     21      <IP_info>This is a polygon comprising easting and northing locations
     22      tracing parts of the coastline at Dampier WA as well as a rectangular area inland.
     23      This is used to specifically set the onshore initial condition in a tsunami scenario
     24      and here, it is used with a unit test in test_polygon.py.
     25     
     26      The coastline was derived from Maritime Boundaries which is a public dataset. However,
     27      rumour has it that some of it was digitised from a Landgate supplied image.
     28     
     29      The origin and license issues are still undecided</IP_info>
    3530    </datafile>
    3631
    3732  </ga_license_file>
    38  
  • anuga_core/source/anuga/utilities/parse.py

    r4944 r4963  
    2222
    2323from data_audit import license_file_is_valid
    24 license_file_is_valid(fid)
     24license_file_is_valid(fid, '.')
    2525
    2626
  • anuga_core/source/anuga/utilities/system_tools.py

    r4952 r4963  
    172172           
    173173
    174 def compute_checksum(filename):
     174def compute_checksum(filename, max_length=2**20):
    175175    """Compute the CRC32 checksum for specified file
     176
     177    Optional parameter max_length sets the maximum number
     178    of bytes used to limit time used with large files.
     179    Default = 2**20 (1MB)
    176180    """
    177181
    178     import zlib
    179     #FIXME(Ole): Do we need to limit the size?
    180 
     182    #from zlib import crc32
     183    from binascii import crc32 #(works as well)
     184   
    181185    fid = open(filename)
    182 
    183     crcval = zlib.crc32(fid.read())
    184    
     186    crcval = crc32(fid.read(max_length))
    185187    return crcval
  • anuga_core/source/anuga/utilities/xml_tools.py

    r4944 r4963  
    2222
    2323
     24def pretty_print_tree(n, indent=0):
     25    print n
     26
    2427def parse(fid):
    2528    """Parse XML file descriptor and return DOM object.
     
    3033    #doc = minidom.parse(fid, make_parser())
    3134
     35    fid.seek(0)
    3236    doc = minidom.parse(fid)   
    3337    return doc
Note: See TracChangeset for help on using the changeset viewer.