Context Navigation

← Previous Changeset
Next Changeset →

Changeset 5014

Timestamp:

Feb 8, 2008, 5:29:54 PM (17 years ago)

Author:

ole

Message:

More work on data audit of IP issues - first cut almost there

Location:

anuga_core/source/anuga/utilities

Files:

: 1 added
: 5 edited

data_audit.py (modified) (6 diffs)
data_audit_config.py (added)
mainland_only.lic (modified) (1 diff)
test_data_audit.py (modified) (7 diffs)
test_xml_tools.py (modified) (2 diffs)
xml_tools.py (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

anuga_core/source/anuga/utilities/data_audit.py

-                      r4970
+                      r5014
 from os.path import join, splitext
 from anuga.utilities.xml_tools import parse, pretty_print_tree, get_elements, get_text
+from anuga.utilities.xml_tools import xml2object, XML_element
 from anuga.utilities.system_tools import compute_checksum
+from data_audit_config import extensions_to_ignore, directories_to_ignore, files_to_ignore
 # Audit exceptions
 …
 class WrongTags(Exception): pass
+audit_exceptions = (NotPublishable, FilenameMismatch, CRCMismatch, Invalid, WrongTags)
+audit_exceptions = (NotPublishable,
+                    FilenameMismatch,
+                    CRCMismatch,
+                    Invalid,
+                    WrongTags)
 def IP_verified(directory, verbose=False):
 …
     """
-    print '---------------------------------------------'
-    print 'Files that need to be assessed for IP issues:'
-    print '---------------------------------------------'
     # Print header
     dirwidth = 72
-    print '---------------------------------------------'
-    print 'File'.ljust(dirwidth), 'Status'
-    print '---------------------------------------------'
     # Identify data files
+    first_time = True
     all_files_accounted_for = True
     for dirpath, datafile in identify_datafiles(directory):
 …
             except audit_exceptions, e:
                 all_files_accounted_for = False
+                status = 'LICENSE FILE NOT VALID'
+                status += 'REASON: %s' %e
+                #doc = parse(fid)
+                #pretty_print_tree(doc)
+                fid.seek(0)
+                status += fid.read()
+            #else:
+            #    if verbose: print 'OK'
+                status = 'LICENSE FILE NOT VALID\n'
+                status += 'REASON: %s\n' %e
+                try:
+                    doc = xml2object(fid)
+                except:
+                    status += 'XML file could not be read:'
+                    fid.seek(0)
+                    status += fid.read()
+                else:
+                    status += str(doc)
             fid.close()
         if status != 'OK' or verbose is True:
+            if first_time is True:
+                # Print header
+                print '---------------------------------------------'
+                print 'Files that need to be assessed for IP issuses'.ljust(dirwidth), 'Status'
+                print '---------------------------------------------'
+                first_time = False
             print filename + ' (Checksum=%s): '\
                   %str(compute_checksum(filename)), status
 …
     """ Identify files that might contain data
     """
-    # Ignore source code files
-    extensions_to_ignore = ['.py','.c','.h', '.f'] #, '.gif', '.jpg', '.png']
-    # Ignore generated stuff
-    extensions_to_ignore += ['.pyc', '.o', '.so', '~']
-    extensions_to_ignore += ['.aux', '.log', '.idx', 'ilg', '.ind',
-                             '.bbl', '.blg']
-    # Ignore license files themselves
-    extensions_to_ignore += ['.lic']
-    # Ignore certain other files
-    files_to_ignore = ['README.txt']
-    # Ignore directories
-    directories_to_ignore = ['anuga_work', 'pymetis', 'obsolete_code',
-                             'anuga_parallel', 'anuga_viewer',
-                             'planning', 'coding_standards',
-                             'experimentation',
-                             '.svn', 'misc', '.metadata']
     for dirpath, dirnames, filenames in walk(root):
 …
     license_filename = fid.name
+    doc = parse(fid)
+    #print_tree(doc)
+    doc = xml2object(fid)
+    #print doc
     # Check that file is valid (e.g. all elements there)
+    # FIXME (Ole): Todo
+    if doc.nodeName != '#document':
+        msg = 'License file %s does not appear' %license_filename
+        msg += 'to be a valid XML document'
+        msg += 'The root node has name %s' %doc.nodeName
+        msg += 'but it should be %s' %'#document'
+        raise Invalid, msg
+    if len(doc.childNodes) != 1:
+        msg = 'License file %s must have only one element' %license_filename
+        msg += ' at the root level. It is\n '
+        msg += '<ga_license_file>'
+        raise Invalid, msg
+    # Start looking at document in earnest
+    root_node = doc.childNodes[0]
+    if root_node.nodeName != 'ga_license_file':
+    if not doc.has_key('ga_license_file'):
         msg = 'License file %s must have two elements' %license_filename
         msg += ' at the root level. They are\n '
         msg += '<?xml version="1.0" encoding="iso-8859-1"?>\n'
         msg += '<ga_license_file>\n'
         msg += 'The second element was found to be %s' %root_node.nodeName
+        msg += ' at the root level. They are\n'
+        msg += '  <?xml version="1.0" encoding="iso-8859-1"?>\n'
+        msg += '  <ga_license_file>\n'
+        msg += 'The second element was found to be %s' %doc.keys()
         raise WrongTags, msg
     # Validate elements: metadata, datafile, datafile, ...
     elements = get_elements(root_node.childNodes)
     if elements[0].nodeName != 'metadata':
         msg = 'The first element under %s must be "metadata"'\
               %root_node.nodeName
+    elements = doc['ga_license_file']
+    if not elements.has_key('metadata'):
+        msg = 'Tag %s must have the element "metadata"'\
+              %doc.keys()[0]
         msg += 'The element found was %s' %elements[0].nodeName
         raise WrongTags, msg
+    for node in elements[1:]:
+        if node.nodeName != 'datafile':
+            msg = 'All elements, except the first, under %s must '\
+                  %root_node.nodeName
+            msg += 'be "datafile"'
+            msg += 'The element found was %s' %node.nodeName
+            raise WrongTags, msg
+    if verbose: print
+    # Extract information for source section
+    for node in get_elements(elements[0].childNodes):
+        if node.nodeName == 'author':
+            # Do something
+            if verbose: print 'Author:   ', get_text(node.childNodes)
+        if node.nodeName == 'svn_keywords':
+            # Do nothing
+            pass
+    if not elements.has_key('datafile'):
+        msg = 'Tag %s must have the element "datafile"'\
+              %doc.keys()[0]
+        msg += 'The element found was %s' %elements[0].nodeName
+        raise WrongTags, msg
+    for key in elements.keys():
+        msg = 'Invalid tag: %s' %key
+        if not key in ['metadata', 'datafile']:
+            raise WrongTags, msg
+    # Extract information for metadata section
+    if verbose: print
+    metadata = elements['metadata']
+    author = metadata['author']
+    if verbose: print 'Author:   ', author
+    #svn_keywords = metadata['svn_keywords']
+    #if verbose: print 'SVN keywords:   ', svn_keywords
     # Extract information for datafile sections
+    for datanode in elements[1:]:
+    datafile = elements['datafile']
+    if isinstance(datafile, XML_element):
+        datafile = [datafile]
+    for data in datafile:
         if verbose: print
+        for node in get_elements(datanode.childNodes):
+            #print 'Node', node.nodeName, node.childNodes
+            #continue
+            if node.nodeName == 'filename':
+                # FIXME Check correctness
+                filename = join(dirpath, get_text(node.childNodes))
+                if verbose: print 'Filename: "%s"' %filename
+                try:
+                    fid = open(filename, 'r')
+                except:
+                    msg = 'Specified filename %s could not be opened'\
+                          %filename
+                    raise FilenameMismatch, msg
+            if node.nodeName == 'checksum':
+                # FIXME (Ole): This relies on crc being preceded by filename
+                reported_crc = get_text(node.childNodes)
+                if verbose: print 'Checksum: "%s"' %reported_crc
+                file_crc = str(compute_checksum(filename))
+                if reported_crc != file_crc:
+                    msg = 'Bad checksum (CRC).\n'
+                    msg += '  The CRC reported in license file "%s" is "%s"\n'\
+                          %(license_filename, reported_crc)
+                    msg += '  The CRC computed from file "%s" is "%s"'\
+                           %(filename, file_crc)
+                    raise CRCMismatch, msg
+        # Filename
+        if data['filename'] == '':
+            msg = 'Missing filename'
+            raise FilenameMismatch, msg
+        else:
+            filename = join(dirpath, data['filename'])
+            if verbose: print 'Filename: "%s"' %filename
+            try:
+                fid = open(filename, 'r')
+            except:
+                msg = 'Specified filename %s could not be opened'\
+                      %filename
+                raise FilenameMismatch, msg
+        # CRC
+        reported_crc = data['checksum']
+        if verbose: print 'Checksum: "%s"' %reported_crc
+        file_crc = str(compute_checksum(filename))
+        if reported_crc != file_crc:
+            msg = 'Bad checksum (CRC).\n'
+            msg += '  The CRC reported in license file "%s" is "%s"\n'\
+                   %(license_filename, reported_crc)
+            msg += '  The CRC computed from file "%s" is "%s"'\
+                   %(filename, file_crc)
+            raise CRCMismatch, msg
+            if node.nodeName == 'accountable':
+                accountable = get_text(node.childNodes)
+                if verbose: print 'Accountable: "%s"' %accountable
+                if accountable == "":
+                    msg = 'No accountable person specified'
+                    raise Exception, msg
+            if node.nodeName == 'source':
+                source = get_text(node.childNodes)
+                if verbose: print 'Source: "%s"' %source
+                if source == "":
+                    msg = 'No source specified'
+                    raise Exception, msg
+            if node.nodeName == 'IP_owner':
+                ip_owner = get_text(node.childNodes)
+                if verbose: print 'IP owner: "%s"' %ip_owner
+                if ip_owner == "":
+                    msg = 'No IP owner specified'
+                    raise Exception, msg
+        # Accountable
+        accountable = data['accountable']
+        if verbose: print 'Accountable: "%s"' %accountable
+        if accountable == '':
+            msg = 'No accountable person specified'
+            raise Exception, msg
+        # Source
+        source = data['source']
+        if verbose: print 'Source: "%s"' %source
+        if source == '':
+            msg = 'No source specified'
+            raise Exception, msg
+        # IP owner
+        ip_owner = data['IP_owner']
+        if verbose: print 'IP owner: "%s"' %ip_owner
+        if ip_owner == '':
+            msg = 'No IP owner specified'
+            raise Exception, msg
+            if node.nodeName == 'IP_info':
+                if verbose: print 'IP info: "%s"' %get_text(node.childNodes)
+            if node.nodeName == 'publishable':
+                if verbose: print 'Publishable: %s' %fid.name
+                value = get_text(node.childNodes)
+                if value.upper() != 'YES':
+                    msg = 'Data file %s is not flagged as publishable'\
+                          %fid.name
+                    raise NotPublishable, msg
+        # IP info
+        ip_info = data['IP_info']
+        if verbose: print 'IP info: "%s"' %ip_info
+        if ip_info == '':
+            msg = 'No IP info specified'
+            raise Exception, msg
+        # Publishable
+        publishable = data['publishable'].upper()
+        if verbose: print 'Publishable: "%s"' %publishable
+        if publishable != 'YES':
+            msg = 'Data file %s is not flagged as publishable'\
+                  %fid.name
+            raise NotPublishable, msg

anuga_core/source/anuga/utilities/mainland_only.lic

-                      r4976
+                      r5014
   <metadata>
     <author>Ole Nielsen</author>
-    <svn_keywords>
-      <author>$Author$</author>
-      <date>$Date$</date>
-      <revision>$Revision$</revision>
-      <url>$URL$</url>
-      <id>$Id$</id>
-    </svn_keywords>
   </metadata>
   <datafile>

anuga_core/source/anuga/utilities/test_data_audit.py

-                      r4971
+                      r5014
 import unittest
 from Numeric import zeros, array, allclose, Float
 from tempfile import NamedTemporaryFile
+from tempfile import mkstemp
 import os
 …
         pass
+    def NOtest_license_file_is_not_valid(self):
+        """Basic test using an invalid XML file
+        """
+        # FIXME(OLE): Needs work to ensure that the order of
+        # problems is deterministic. Currently we check for checksum
+        # but on some systems file or publishable may come first
+        # Generate invalid example
+        fid = NamedTemporaryFile(mode='w',
+                                 suffix='.asc',
+                                 dir='.')
+        string = 'Example data file with textual content. AAAABBBBCCCC1234'
+        fid.write(string)
+        fid.flush()
+    def test_license_file_is_not_valid1(self):
+        """Basic test using an invalid XML file. This one
+        should fail on bad CRC checksum
+        """
+        # Generate invalid checksum example
+        tmp_fd , tmp_name = mkstemp(suffix='.asc', dir='.')
+        fid = os.fdopen(tmp_fd, 'w')
+        string = 'Example data file with textual content. AAAABBBBCCCC1234'
+        fid.write(string)
+        fid.close()
         # Create associated license file
         basename, ext = os.path.splitext(fid.name)
+        basename, ext = os.path.splitext(tmp_name)
         license_filename = basename + '.lic'
-        #print fid.name, license_filename
         licfid = open(license_filename, 'w')
         xml_string = """<?xml version="1.0" encoding="iso-8859-1"?>
 …
     </metadata>
     <datafile>
       <filename>mainland_only.csv</filename>
       <checksum>-1661725548</checksum>
       <publishable>No</publishable>
+      <filename>%s</filename>
+      <checksum>-111111</checksum>
+      <publishable>Yes</publishable>
       <accountable>Jane Sexton</accountable>
       <source>Unknown</source>
 …
   </ga_license_file>
+"""
+""" %tmp_name
         licfid.write(xml_string)
         licfid.close()
 …
         licfid = open(license_filename)
         #print licfid.read()
         try:
             license_file_is_valid(licfid)
 …
         fid.close()
         os.remove(license_filename)
+    def NOtest_license_file_is_valid(self):
+        os.remove(tmp_name)
+    def test_license_file_is_not_valid2(self):
+        """Basic test using an invalid license file. This one
+        should fail on Not Publishable: the license file written
+        below flags its datafile with <publishable>no</publishable>,
+        so license_file_is_valid is expected to raise NotPublishable.
+        """
+        # Generate data file that the license file will refer to
+        tmp_fd , tmp_name = mkstemp(suffix='.asc', dir='.')
+        fid = os.fdopen(tmp_fd, 'w')
+        string = 'Example data file with textual content. AAAABBBBCCCC1234'
+        fid.write(string)
+        fid.close()
+        # Create associated license file
+        basename, ext = os.path.splitext(tmp_name)
+        license_filename = basename + '.lic'
+        licfid = open(license_filename, 'w')
+        xml_string = """<?xml version="1.0" encoding="iso-8859-1"?>
+  <ga_license_file>
+    <metadata>
+      <author>Ole Nielsen</author>
+      <svn_keywords>
+        <author>$Author: ole $</author>
+        <date>$Date: 2008-01-21 18:58:15 +1100 (Mon, 21 Jan 2008) $</date>
+        <revision>$Revision$</revision>
+        <url>$URL: https://datamining.anu.edu.au/svn/ga/anuga_core/source/anuga/utilities/mainland_only.lic $</url>
+        <id>$Id: mainland_only.lic 4963 2008-01-21 07:58:15Z ole $</id>
+      </svn_keywords>
+    </metadata>
+    <datafile>
+      <filename>%s</filename>
+      <checksum>-1484449438</checksum>
+      <publishable>no</publishable>
+      <accountable>Jane Sexton</accountable>
+      <source>Unknown</source>
+      <IP_owner>Geoscience Australia</IP_owner>
+      <IP_info>This is a polygon comprising easting and northing locations</IP_info>
+    </datafile>
+  </ga_license_file>
+""" %tmp_name
+        licfid.write(xml_string)
+        licfid.close()
+        licfid = open(license_filename)
+        try:
+            try:
+                license_file_is_valid(licfid)
+            except NotPublishable:
+                pass
+            else:
+                msg = 'Should have raised NotPublishable exception'
+                raise Exception(msg)
+        finally:
+            # Clean up even when the check above fails, so that
+            # temporary files are not left behind in the source tree
+            licfid.close()
+            os.remove(license_filename)
+            os.remove(tmp_name)
+    def test_license_file_is_not_valid3(self):
+        """Basic test using an invalid license file. This one
+        should fail on Filename Mismatch: the license file written
+        below names a datafile (basename + '.no_exist') that is never
+        created, so license_file_is_valid is expected to raise
+        FilenameMismatch.
+        """
+        # Generate a data file; the license deliberately does not name it
+        tmp_fd , tmp_name = mkstemp(suffix='.asc', dir='.')
+        fid = os.fdopen(tmp_fd, 'w')
+        string = 'Example data file with textual content. AAAABBBBCCCC1234'
+        fid.write(string)
+        fid.close()
+        # Create associated license file
+        basename, ext = os.path.splitext(tmp_name)
+        license_filename = basename + '.lic'
+        licfid = open(license_filename, 'w')
+        xml_string = """<?xml version="1.0" encoding="iso-8859-1"?>
+  <ga_license_file>
+    <metadata>
+      <author>Ole Nielsen</author>
+      <svn_keywords>
+        <author>$Author: ole $</author>
+        <date>$Date: 2008-01-21 18:58:15 +1100 (Mon, 21 Jan 2008) $</date>
+        <revision>$Revision$</revision>
+        <url>$URL:$</url>
+        <id>$Id:$</id>
+      </svn_keywords>
+    </metadata>
+    <datafile>
+      <filename>%s</filename>
+      <checksum>-1484449438</checksum>
+      <publishable>Yes</publishable>
+      <accountable>Jane Sexton</accountable>
+      <source>Unknown</source>
+      <IP_owner>Geoscience Australia</IP_owner>
+      <IP_info>This is a polygon comprising easting and northing locations</IP_info>
+    </datafile>
+  </ga_license_file>
+""" %(basename + '.no_exist')
+        licfid.write(xml_string)
+        licfid.close()
+        licfid = open(license_filename)
+        try:
+            try:
+                license_file_is_valid(licfid)
+            except FilenameMismatch:
+                pass
+            else:
+                msg = 'Should have raised FilenameMismatch exception'
+                raise Exception(msg)
+        finally:
+            # Clean up even when the check above fails, so that
+            # temporary files are not left behind in the source tree
+            licfid.close()
+            os.remove(license_filename)
+            os.remove(tmp_name)
+    def test_license_file_is_valid(self):
         """Basic test using an valid XML file
         """
-        # FIXME(Ole): NOT FINISHED
         # Generate valid example
+        fid = NamedTemporaryFile(mode='w',
+                                 suffix='.asc',
+                                 dir='.')
+        string = 'Example data file with textual content. AAAABBBBCCCC1234'
+        fid.write(string)
+        fid.flush()
+        tmp_fd , tmp_name = mkstemp(suffix='.asc', dir='.')
+        fid = os.fdopen(tmp_fd, 'w')
+        string = 'Example data file with textual content. AAAABBBBCCCC1234'
+        fid.write(string)
+        fid.close()
         # Strip leading dir (./)
         data_filename = os.path.split(fid.name)[1]
         print 'Name', data_filename
+        data_filename = os.path.split(tmp_name)[1]
+        #print 'Name', data_filename
         # Create associated license file
         basename, ext = os.path.splitext(fid.name)
+        basename, ext = os.path.splitext(tmp_name)
         license_filename = basename + '.lic'
 …
   </ga_license_file>
+""" %(data_filename, '000')
+        licfid.write(xml_string)
+        licfid.close()
+        licfid = open(license_filename)
+        #print licfid.read()
+        #print fid.name, license_filename
+        print os.listdir('.')
+        license_file_is_valid(licfid, verbose=True)
+        # Clean up
+        licfid.close()
+        fid.close()
+        os.remove(license_filename)
+""" %(data_filename, '-1484449438')
+        licfid.write(xml_string)
+        licfid.close()
+        licfid = open(license_filename)
+        license_file_is_valid(licfid)#, verbose=True)
+        # Clean up
+        os.remove(license_filename)
+        os.remove(tmp_name)
+    def test_valid_license_file_with_multiple_files(self):
+        """Test of XML file with more than one datafile element.
+        Two data files sharing one basename (.asc and .prj) are written
+        together with a license file listing both, each with a
+        hard-coded checksum. license_file_is_valid must accept it
+        without raising.
+        """
+        # Generate first example file
+        tmp_fd , tmp_name = mkstemp(suffix='.asc', dir='.')
+        fid = os.fdopen(tmp_fd, 'w')
+        string = 'Example data file with textual content. AAAABBBBCCCC1234'
+        fid.write(string)
+        fid.close()
+        # Derive filenames
+        basename, ext = os.path.splitext(tmp_name)
+        data_filename1 = basename + '.asc'
+        data_filename2 = basename + '.prj'
+        license_filename = basename + '.lic'
+        # Write data to second data file
+        fid = open(data_filename2, 'w')
+        string = 'Another example data file with text in it'
+        fid.write(string)
+        fid.close()
+        # Create license file
+        licfid = open(license_filename, 'w')
+        xml_string = """<?xml version="1.0" encoding="iso-8859-1"?>
+  <ga_license_file>
+    <metadata>
+      <author>Ole Nielsen</author>
+      <svn_keywords>
+        <author>$Author$</author>
+        <date>$Date$</date>
+        <revision>$Revision$</revision>
+        <url>$URL:$</url>
+        <id>$Id$</id>
+      </svn_keywords>
+    </metadata>
+    <datafile>
+      <filename>%s</filename>
+      <checksum>%s</checksum>
+      <publishable>Yes</publishable>
+      <accountable>Jane Sexton</accountable>
+      <source>Generated on the fly</source>
+      <IP_owner>Geoscience Australia</IP_owner>
+      <IP_info>This is a test</IP_info>
+    </datafile>
+    <datafile>
+      <filename>%s</filename>
+      <checksum>%s</checksum>
+      <publishable>Yes</publishable>
+      <accountable>Ole Nielsen</accountable>
+      <source>Generated on the fly</source>
+      <IP_owner>Geoscience Australia</IP_owner>
+      <IP_info>This is another test</IP_info>
+    </datafile>
+  </ga_license_file>
+""" %(data_filename1, '-1484449438', data_filename2, '-1322430740')
+        licfid.write(xml_string)
+        licfid.close()
+        licfid = open(license_filename)
+        try:
+            license_file_is_valid(licfid)#, verbose=True)
+        finally:
+            # Close the license file before deleting it and clean up
+            # even if validation raised
+            licfid.close()
+            os.remove(license_filename)
+            os.remove(data_filename1)
+            os.remove(data_filename2)

anuga_core/source/anuga/utilities/test_xml_tools.py

-                      r5009
+                      r5014
         assert doc['second element']['texts']['title 4'] == 'example text 4'
+        assert doc.has_key('first element')
 …
         os.remove(tmp_name)
+    def test_duplicate_tags(self):
+        """Test handling of duplicate tags.
+        An XML document whose main element holds two 'datafile'
+        children is written to a temporary file and read back with
+        xml2object. Looking up 'datafile' must then yield both entries.
+        """
+        X1 = XML_element(tag='datafile',
+                         value=XML_element(tag='some_text',
+                                           value='hello world'))
+        X2 = XML_element(tag='second_element',
+                         value=XML_element(tag='texts',
+                                           value='egg and spam'))
+        X3 = XML_element(tag='datafile',
+                         value='42')
+        # Need to have one main element according to minidom
+        main = XML_element(tag='all', value=[X1, X2, X3])
+        xmldoc = XML_element(value=main)
+        tmp_fd , tmp_name = mkstemp(suffix='.xml', dir='.')
+        fid = os.fdopen(tmp_fd, 'w')
+        try:
+            fid.write(str(xmldoc))
+            fid.close()
+            # Now read it back
+            xmlobject = xml2object(tmp_name, verbose=True)
+            assert str(xmldoc) == str(xmlobject)
+            assert xmlobject['all'].has_key('datafile')
+            assert len(xmlobject['all']['datafile']) == 2
+        finally:
+            # Remove the temporary file even when an assertion fails;
+            # closing again is harmless if the file is already closed
+            fid.close()
+            os.remove(tmp_name)
 #-------------------------------------------------------------

anuga_core/source/anuga/utilities/xml_tools.py

-                      r5009
+                      r5014
+def remove_whitespace(s):
+    """Remove excess whitespace including newlines from string.
+    Leading and trailing whitespace is dropped and every internal run
+    of whitespace (spaces, tabs, newlines) becomes a single space.
+    """
+    # split() without arguments splits on any run of whitespace and
+    # discards empty fields. The str.join method replaces the deprecated
+    # string.join function, which no longer exists in Python 3.
+    return ' '.join(s.split())
 …
         s += '<%s>' %self.tag
         if isinstance(self.value, basestring):
             s += self.value
+            s += remove_whitespace(self.value)
         else:
             s += '\n'
 …
         This will allow statements such as
+        assert xmlobject['datafile']['accountable'] == 'Jane Sexton'
+        """
+        assert xmlobject['datafile']['accountable'] == 'Jane Sexton'
+        If more than one element matches the given key a list of all
+        matches will be returned
+        """
+        result = []
         for node in self.value:
             if node.tag == key:
                 if isinstance(node.value, basestring):
+                    return node.value
+                    result.append(str(node.value))
+                    #return node.value
                 else:
+                    return node
+                    result.append(node)
+                    #return node
+        if len(result) == 0:
+            return None
+        if len(result) == 1:
+            return result[0]
+        if len(result) > 1:
+            return result
+    def has_key(self, key):
+        """Return True if a direct child element has the tag key,
+        False otherwise. Only self.value is searched, not nested levels.
+        """
+        for node in self.value:
+            # A leaf element stores its text directly in self.value, so
+            # iterating yields characters that have no tag attribute.
+            # Treat those as non-matching instead of raising.
+            if hasattr(node, 'tag') and node.tag == key:
+                return True
+        return False
     def keys(self):
 …
         fid = xml
-    #print fid.read()
     dom = parse(fid)

Note: See TracChangeset for help on using the changeset viewer.