source: trunk/anuga_core/source/anuga/utilities/data_audit.py @ 7876

Last change on this file since 7876 was 5628, checked in by ole, 16 years ago

Added error check in xml_tools to catch text nodes followed by non-text nodes.
This was in response to an error in one of the license files (not yet fixed).

File size: 12.3 KB
Line 
"""Track intellectual property (IP) of data files in an entire directory tree.

See the docstring of the public function IP_verified()
for details on the algorithm.

An example of the XML format expected by this module is


<?xml version="1.0" encoding="iso-8859-1"?>

<ga_license_file>
  <metadata>
    <author>Ole Nielsen</author>
  </metadata>

  <datafile>
    <filename>channel1.png</filename>
    <checksum>1339122967</checksum>
    <publishable>Yes</publishable>
    <accountable>Ole Nielsen</accountable>
    <source>Generated by ANUGA development team</source>
    <IP_owner>Geoscience Australia</IP_owner>
    <IP_info>For use with ANUGA manual</IP_info>
  </datafile>

</ga_license_file>


A license file is named after the base name of the data file it covers
(e.g. channel1.lic for channel1.png). It can contain more than one
<datafile> element in order to cover files that share that base name
but have different extensions.


Here is a DTD format that we might implement one day. It is not
enforced by this module.

   <!DOCTYPE ga_license_file [
      <!ELEMENT ga_license_file (metadata, datafile+)>
      <!ELEMENT metadata (author, svn_keywords?)>
      <!ELEMENT author (#PCDATA)>
      <!ELEMENT svn_keywords (author, date, revision, url, id)>
      <!ELEMENT datafile (filename, checksum, publishable, accountable,
                        source, IP_owner, IP_info)>
      <!ELEMENT filename (#PCDATA)>
      <!ELEMENT checksum (#PCDATA)>
      <!ELEMENT publishable (#PCDATA)>
      <!ELEMENT accountable (#PCDATA)>
      <!ELEMENT source (#PCDATA)>
      <!ELEMENT IP_owner (#PCDATA)>
      <!ELEMENT IP_info (#PCDATA)>
  ]>



"""
51
52from os import remove, walk, sep
53from os.path import join, splitext
54
55# Don't add anuga.utilities to these imports
56# EQRM also uses this file, but has a different directory structure
57from xml_tools import xml2object, XML_element
58from system_tools import compute_checksum
59
60
# Audit exceptions: one class per way a license audit can fail
class NotPublishable(Exception):
    """The data file is not flagged as publishable."""


class FilenameMismatch(Exception):
    """The data file is not listed in the license file or cannot be opened."""


class CRCMismatch(Exception):
    """The recorded checksum differs from the one computed from the file."""


class Invalid(Exception):
    """Generic audit failure (not raised by the code visible in this module)."""


class WrongTags(Exception):
    """The license file has missing or unexpected XML tags."""


class Empty(Exception):
    """A mandatory field in the license file is blank."""


# Everything an audit may raise, in a form usable directly in 'except'
audit_exceptions = (
    NotPublishable,
    FilenameMismatch,
    CRCMismatch,
    Invalid,
    WrongTags,
    Empty,
)
75
76
def _license_status(dirpath, filename):
    """Audit one data file and return 'OK' or a diagnostic string.

    The license file is expected next to the data file and is named
    after the data file's base name with the extension '.lic'.
    """

    basename, _ = splitext(filename)
    license_filename = join(dirpath, basename + '.lic')

    # A license file that cannot be opened is treated as missing
    try:
        fid = open(license_filename)
    except IOError:
        return 'NO LICENSE FILE'
    fid.close()

    try:
        license_file_is_valid(license_filename,
                              filename,
                              dirpath,
                              verbose=False)
    except audit_exceptions as e:
        status = 'LICENSE FILE NOT VALID\n'
        status += 'REASON: %s\n' % e

        # If the XML cannot even be parsed, append the raw file contents
        # so the problem can be diagnosed from the audit output alone.
        try:
            xml2object(license_filename)
        except Exception:
            status += 'XML file %s could not be read:' % license_filename
            fid = open(license_filename)
            try:
                status += fid.read()
            finally:
                fid.close()

        return status

    return 'OK'


def IP_verified(directory,
                extensions_to_ignore=None,
                directories_to_ignore=None,
                files_to_ignore=None,
                verbose=False):
    """Find and audit potential data files that might violate IP

    This is the public function to be used to ascertain that
    all data in the specified directory tree has been audited according
    to the GA data IP tracking process.

    Typical use:

    if not IP_verified(directory):
        # Stop and take remedial action
        ...
    else:
        # Proceed boldly with confidence

    Input:
        directory: Root of the directory tree to audit
        extensions_to_ignore: File name endings to skip
        directories_to_ignore: Directory names that are not visited
        files_to_ignore: File names to skip
        verbose: If True, a summary of the audit is printed at the end.

    Diagnostics about failed audits are always printed, irrespective of
    verbose. All files that check OK will pass silently.

    Examples are:
    extensions_to_ignore = ['.py','.c','.h', '.f'] # Ignore source code
    files_to_ignore = ['README.txt']
    directories_to_ignore = ['.svn', 'misc']

    The three ignore arguments are passed unchanged to identify_datafiles().

    Output:
        True if every data file found has a valid license file,
        False otherwise.

    Exceptions other than those in audit_exceptions that are raised while
    validating a license file are not caught here.
    """

    oldpath = None
    first_time_this_dir = True
    all_files = 0
    ok_files = 0
    all_files_accounted_for = True

    for dirpath, filename in identify_datafiles(directory,
                                                extensions_to_ignore,
                                                directories_to_ignore,
                                                files_to_ignore):

        if oldpath != dirpath:
            # New directory: its header has not been printed yet
            oldpath = dirpath
            first_time_this_dir = True

        all_files += 1

        status = _license_status(dirpath, filename)
        if status == 'OK':
            ok_files += 1
            continue

        all_files_accounted_for = False

        # Only print status if there is a problem (no news is good news)
        if first_time_this_dir:
            print('')
            print('------------------------------------')
            print('Files without licensing info in dir: %s' % dirpath)
            print('------------------------------------')
            first_time_this_dir = False

        # The checksum is shown so it can be copied into a license file
        checksum = compute_checksum(join(dirpath, filename))
        print('%s (Checksum=%s):  %s' % (filename, checksum, status))

    if verbose:
        print('')
        print('---------------------')
        print('Audit result for dir: %s:' % directory)
        print('---------------------')
        print('Number of files audited:  %d' % all_files)
        print('Number of files verified: %d' % ok_files)
        print('')

    return all_files_accounted_for
198
199
200
201#------------------
202# Private functions
203#------------------
def identify_datafiles(root,
                       extensions_to_ignore=None,
                       directories_to_ignore=None,
                       files_to_ignore=None):
    """Identify files that might contain data

    Walk the directory tree below root and yield a tuple
    (dirpath, filename) for every file that is not excluded.

    Input:
        root: Top of the directory tree to search
        extensions_to_ignore: File name endings to skip, e.g. ['.py', '.c']
        directories_to_ignore: Directory names that are not visited
        files_to_ignore: File names to skip

    None (the default) for any of the optional parameters means that
    nothing is excluded on that criterion.
    """

    # None means 'ignore nothing'. endswith() needs a tuple of suffixes.
    suffixes = tuple(extensions_to_ignore or ())
    skipped_dirs = directories_to_ignore or ()
    skipped_files = files_to_ignore or ()

    for dirpath, dirnames, filenames in walk(root):

        # Prune in place so that walk() doesn't visit ignored directories
        dirnames[:] = [name for name in dirnames
                       if name not in skipped_dirs]

        for filename in filenames:
            if filename.endswith(suffixes):
                # Extension needs no IP check
                continue

            if filename in skipped_files:
                continue

            yield dirpath, filename
234
235
def license_file_is_valid(license_filename, data_filename,
                          dirpath='.', verbose=False):
    """Check that XML license file for given data_filename is valid.

    Input:
        license_filename: Path of the XML license file
        data_filename: The data filename that is being audited
        dirpath: Where the data file lives
        verbose: Optional verbosity

    Check for the datafile entry matching data_filename that

    * Datafile tags are there and match the one specified
    * Fields are non empty (except IP_info which can be left blank)
    * Datafile exists
    * Checksum is correct
    * Datafile is flagged as publishable

    If anything is violated an appropriate exception is raised:

        WrongTags: Root, metadata or datafile tag missing, or unknown tag
        Empty: Author, accountable, source or IP owner is blank
        FilenameMismatch: data_filename is not listed or cannot be opened
        CRCMismatch: Recorded checksum differs from the computed one
        NotPublishable: publishable is blank or not 'Yes' (any case)

    If everything is hunky-dory the function will return True.
    """

    if verbose:
        print('Parsing %s' % license_filename)

    doc = xml2object(license_filename)

    # Check that file is valid (e.g. all elements there)
    # NOTE(review): has_key() is kept on purpose. doc is a project-defined
    # XML_element and whether it supports the 'in' operator is not visible
    # from this module.
    if not doc.has_key('ga_license_file'):
        msg = 'License file %s must have two elements' % license_filename
        msg += ' at the root level. They are\n'
        msg += '  <?xml version="1.0" encoding="iso-8859-1"?>\n'
        msg += '  <ga_license_file>\n'
        msg += 'The second element was found to be %s' % doc.keys()
        raise WrongTags(msg)

    # Validate elements: metadata, datafile, datafile, ...
    # FIXME (Ole): I'd like this to be verified by the parser
    # using a proper DTD template one day....
    # For now, let's check the main ones.
    elements = doc['ga_license_file']
    for required in ('metadata', 'datafile'):
        if not elements.has_key(required):
            # Report the tags actually present. elements is indexed by
            # tag name, so it cannot be asked for its first DOM node.
            msg = 'Tag ga_license_file must have the element "%s". '\
                  % required
            msg += 'The elements found were %s' % elements.keys()
            raise WrongTags(msg)

    for key in elements.keys():
        if key not in ('metadata', 'datafile'):
            raise WrongTags('Invalid tag: %s' % key)

    # Extract information for metadata section
    if verbose:
        print('')
    metadata = elements['metadata']

    author = metadata['author']
    if verbose:
        print('Author:    %s' % author)
    if author == '':
        # Raised as Empty (an audit exception) so that callers catching
        # audit_exceptions report it like any other blank field.
        raise Empty('Missing author')

    # Extract information for datafile sections.
    # A single <datafile> comes back as one element, several as a sequence.
    datafile = elements['datafile']
    if isinstance(datafile, XML_element):
        datafile = [datafile]

    # Check that filename to verify is listed in license file
    data = None
    for entry in datafile:
        if entry['filename'] == data_filename:
            data = entry
            break

    if data is None:
        msg = 'Specified filename to verify %s ' % data_filename
        msg += 'did not appear in license file %s' % license_filename
        raise FilenameMismatch(msg)

    # Filename
    if data['filename'] == '':
        raise FilenameMismatch('Missing filename')

    filename = join(dirpath, data['filename'])
    if verbose:
        print('Filename: "%s"' % filename)

    # The file is opened only to prove that it exists and is readable
    try:
        fid = open(filename, 'r')
    except EnvironmentError:
        msg = 'Specified filename %s could not be opened'\
              % filename
        raise FilenameMismatch(msg)
    else:
        fid.close()

    # CRC
    reported_crc = data['checksum']
    if verbose:
        print('Checksum: "%s"' % reported_crc)

    file_crc = str(compute_checksum(filename))
    if reported_crc != file_crc:
        msg = 'Bad checksum (CRC).\n'
        msg += '  The CRC reported in license file "%s" is "%s"\n'\
               % (license_filename, reported_crc)
        msg += '  The CRC computed from file "%s" is "%s"'\
               % (filename, file_crc)
        raise CRCMismatch(msg)

    # Mandatory text fields: accountable, source and IP owner
    mandatory_fields = (
        ('accountable', 'Accountable', 'No accountable person specified'),
        ('source', 'Source', 'No source specified'),
        ('IP_owner', 'IP owner', 'No IP owner specified'),
    )
    for tag, label, complaint in mandatory_fields:
        value = data[tag]
        if verbose:
            print('%s: "%s"' % (label, value))
        if value == '':
            raise Empty(complaint)

    # IP info (may be left blank, so it is only displayed)
    ip_info = data['IP_info']
    if verbose:
        print('IP info: "%s"' % ip_info)

    # Publishable
    publishable = data['publishable']
    if verbose:
        print('Publishable: "%s"' % publishable)
    if publishable == '':
        raise NotPublishable('No publishable value specified')

    if publishable.upper() != 'YES':
        msg = 'Data file %s is not flagged as publishable'\
              % filename
        raise NotPublishable(msg)

    # If we get this far, the license file is OK
    return True
Note: See TracBrowser for help on using the repository browser.