source: anuga_core/source/anuga/utilities/data_audit.py @ 5195

Last change on this file since 5195 was 5195, checked in by duncan, 16 years ago

Generalising to work in EQRM

File size: 12.2 KB
Line 
1"""Track IP of data files in an entire directory tree.
2See docstring for the public function IP_verified()
3for details on algorithm.
4
5An example of the XML format expected by this module is
6
7
8<?xml version="1.0" encoding="iso-8859-1"?>
9
10<ga_license_file>
11  <metadata>
12    <author>Ole Nielsen</author>
13  </metadata>
14 
15  <datafile>
16    <filename>channel1.png</filename>
17    <checksum>1339122967</checksum>
18    <publishable>Yes</publishable>
19    <accountable>Ole Nielsen</accountable>
20    <source>Generated by ANUGA development team</source>
21    <IP_owner>Geoscience Australia</IP_owner>
22    <IP_info>For use with ANUGA manual</IP_info>   
23  </datafile>
24
25</ga_license_file>
26
27
28There can be more than one <datafile> element to cover files
29with different extensions.
30
31
Here is a DTD for this format, which we might enforce one day

   <!DOCTYPE ga_license_file [
      <!ELEMENT ga_license_file (metadata, datafile+)>
      <!ELEMENT metadata (author, svn_keywords)>
      <!ELEMENT svn_keywords (author, date, revision, url, id)>
      <!ELEMENT datafile (filename, checksum, publishable, accountable,
                        source, IP_owner, IP_info)>
      <!ELEMENT filename (#PCDATA)>
      <!ELEMENT checksum (#PCDATA)>
      <!ELEMENT publishable (#PCDATA)>
      <!ELEMENT accountable (#PCDATA)>
      <!ELEMENT source (#PCDATA)>
      <!ELEMENT IP_owner (#PCDATA)>
      <!ELEMENT IP_info (#PCDATA)>
  ]>
47
48
49
50"""
51
52from os import remove, walk, sep
53from os.path import join, splitext
54
55# Don't add anuga.utilities to these imports
56# EQRM also uses this file, but has a different directory structure
57from xml_tools import xml2object, XML_element
58from system_tools import compute_checksum
59
60
# Audit exceptions
#
# Each failure mode of the license audit has its own exception type so
# that callers can tell them apart.

class NotPublishable(Exception):
    """The publishable field is empty or is not 'Yes'."""


class FilenameMismatch(Exception):
    """The data file is not listed in the license file, has an empty
    name there, or could not be opened.
    """


class CRCMismatch(Exception):
    """The recorded checksum differs from the one computed from the file."""


class Invalid(Exception):
    """Generic 'license file is invalid' error.

    NOTE: not raised by any code visible in this module.
    """


class WrongTags(Exception):
    """The license file has missing or unexpected XML elements."""


class Empty(Exception):
    """A mandatory field in the license file was left blank."""


# Tuple form so it can be used directly in an except clause
audit_exceptions = (
    NotPublishable,
    FilenameMismatch,
    CRCMismatch,
    Invalid,
    WrongTags,
    Empty,
)
75
76
def IP_verified(directory,
                extensions_to_ignore=None,
                directories_to_ignore=None,
                files_to_ignore=None,
                verbose=False):
    """Find and audit potential data files that might violate IP

    This is the public function to be used to ascertain that
    all data in the specified directory tree has been audited according
    to the GA data IP tracking process.

    if IP_verified(directory) is False:
        # Stop and take remedial action
        ...
    else:
        # Proceed boldly with confidence

    For every file reported by identify_datafiles() a license file with
    the same base name and extension '.lic' is looked up in the same
    directory and checked with license_file_is_valid().

    verbose controls standard output.
    If verbose is False, only diagnostics about failed audits will appear.
    All files that check OK will pass silently.
    If verbose is True, a summary with the number of audited and
    verified files is printed as well.

    Optional arguments extensions_to_ignore, directories_to_ignore, and
    files_to_ignore are lists of things to skip.

    Examples are:
    extensions_to_ignore = ['.py','.c','.h', '.f'] # Ignore source code
    files_to_ignore = ['README.txt']
    directories_to_ignore = ['.svn', 'misc']

    None is also OK for these parameters.

    Returns True if every identified file has a valid license file,
    otherwise False.
    """

    # None means "ignore nothing". Normalise here so the file walker
    # always receives something it can iterate over.
    if extensions_to_ignore is None:
        extensions_to_ignore = []
    if directories_to_ignore is None:
        directories_to_ignore = []
    if files_to_ignore is None:
        files_to_ignore = []

    # Identify data files
    oldpath = None
    all_files = 0
    ok_files = 0
    all_files_accounted_for = True
    for dirpath, filename in identify_datafiles(directory,
                                                extensions_to_ignore,
                                                directories_to_ignore,
                                                files_to_ignore):

        if oldpath != dirpath:
            # New directory: its header has not been printed yet
            oldpath = dirpath
            first_time_this_dir = True

        all_files += 1

        basename = splitext(filename)[0]
        license_filename = join(dirpath, basename + '.lic')

        # Look for a XML license file with the .lic
        status = 'OK'
        try:
            fid = open(license_filename)
        except IOError:
            status = 'NO LICENSE FILE'
            all_files_accounted_for = False
        else:
            fid.close()

            try:
                license_file_is_valid(license_filename,
                                      filename,
                                      dirpath,
                                      verbose=False)
            except Exception as e:
                # Any failure (audit exception, unparsable XML, ...) is
                # reported for this file only, so that one bad license
                # file does not abort the audit of the remaining files.
                all_files_accounted_for = False
                status = 'LICENSE FILE NOT VALID\n'
                status += 'REASON: %s\n' % e

                # If the XML cannot even be parsed, show its raw contents
                try:
                    xml2object(license_filename)
                except Exception:
                    status += 'XML file %s could not be read:'\
                              % license_filename
                    fid = open(license_filename)
                    try:
                        status += fid.read()
                    finally:
                        fid.close()

        if status == 'OK':
            ok_files += 1
        else:
            # Only print status if there is a problem (no news is good news)
            if first_time_this_dir is True:
                print('')
                print('------------------------------------')
                msg = 'Files without licensing info in dir:'
                print('%s %s' % (msg, dirpath))
                print('------------------------------------')
                first_time_this_dir = False

            # The checksum is shown so it can be copied into a license file
            checksum = compute_checksum(join(dirpath, filename))
            print('%s (Checksum=%s):  %s' % (filename, str(checksum), status))

    if verbose is True:
        print('')
        print('---------------------')
        print('Audit result for dir: %s:' % directory)
        print('---------------------')
        print('Number of files audited:  %d' % all_files)
        print('Number of files verified: %d' % ok_files)
        print('')

    # Return result
    return all_files_accounted_for
198
199
200
201#------------------
202# Private functions
203#------------------
def identify_datafiles(root,
                       extensions_to_ignore=None,
                       directories_to_ignore=None,
                       files_to_ignore=None):
    """Identify files that might contain data

    Walk the directory tree below root and yield a (dirpath, filename)
    pair for every file that is not excluded by one of the ignore lists.

    root: Top directory of the tree to search
    extensions_to_ignore: Filename endings to skip, e.g. ['.py', '.c']
    directories_to_ignore: Directory names that are not descended into
    files_to_ignore: Exact filenames to skip

    Each of the optional parameters may be None, meaning that nothing
    is ignored. See function IP_verified() for further details about
    the optional parameters.
    """

    # None means "ignore nothing"
    if extensions_to_ignore is None:
        extensions_to_ignore = []
    if directories_to_ignore is None:
        directories_to_ignore = []
    if files_to_ignore is None:
        files_to_ignore = []

    # endswith() accepts a tuple of candidate endings
    endings = tuple(extensions_to_ignore)

    for dirpath, dirnames, filenames in walk(root):

        # Prune dirnames in place so walk() doesn't visit ignored directories
        for ignore in directories_to_ignore:
            if ignore in dirnames:
                dirnames.remove(ignore)

        for filename in filenames:
            # Ignore extensions that need no IP check
            if filename.endswith(endings):
                continue

            if filename in files_to_ignore:
                continue

            yield dirpath, filename
234
235
def license_file_is_valid(license_filename, data_filename,
                          dirpath='.', verbose=False):
    """Check that XML license file for given data_filename is valid.

    Input:
        license_filename: Path of the XML license file to parse
        data_filename: The data filename that is being audited
        dirpath: Where the data file lives; joined with the filename
                 listed in the license file
        verbose: Optional verbosity (prints each field as it is read)

    The datafile entry whose filename equals data_filename is checked:

    * Root tag is ga_license_file holding only metadata and datafile
      elements, both of which must be present
    * Fields are non empty (except IP_info which can be left blank)
    * Datafile exists and can be opened
    * Checksum is correct
    * Datafile is flagged as publishable

    If anything is violated an appropriate exception is raised:

        WrongTags: missing or unexpected XML elements
        Empty: author, accountable, source or IP_owner is blank
        FilenameMismatch: data_filename is not listed, is blank, or
                          the file could not be opened
        CRCMismatch: recorded and computed checksums differ
        NotPublishable: publishable is blank or not 'Yes'

    Errors raised by xml2object while parsing propagate unchanged.

    If everything is hunky-dory the function will return True.
    """

    doc = xml2object(license_filename)

    # Check that file is valid (e.g. all elements there)
    if not doc.has_key('ga_license_file'):
        msg = 'License file %s must have two elements' % license_filename
        msg += ' at the root level. They are\n'
        msg += '  <?xml version="1.0" encoding="iso-8859-1"?>\n'
        msg += '  <ga_license_file>\n'
        msg += 'The second element was found to be %s' % doc.keys()
        raise WrongTags(msg)

    # Validate elements: metadata, datafile, datafile, ...
    # FIXME (Ole): I'd like this to be verified by the parser
    # using a proper DTD template one day....
    # For now, let's check the main ones.
    elements = doc['ga_license_file']
    for required in ('metadata', 'datafile'):
        if not elements.has_key(required):
            msg = 'Tag %s must have the element "%s". '\
                  % (list(doc.keys())[0], required)
            # Report what is actually there; elements is accessed by
            # tag name, so it cannot be indexed by position here.
            msg += 'The elements found were %s' % list(elements.keys())
            raise WrongTags(msg)

    for key in elements.keys():
        if key not in ('metadata', 'datafile'):
            raise WrongTags('Invalid tag: %s' % key)

    # Extract information for metadata section
    if verbose: print('')
    metadata = elements['metadata']

    author = metadata['author']
    if verbose: print('Author:    %s' % (author,))
    if author == '':
        # Empty is one of the audit exceptions, so callers that catch
        # audit_exceptions will see this failure too.
        raise Empty('Missing author')

    # Extract information for datafile sections
    datafile = elements['datafile']
    if isinstance(datafile, XML_element):
        # A single <datafile> element is treated as a one-entry list
        datafile = [datafile]

    # Check that filename to verify is listed in license file.
    # On success, data is left bound to the matching entry.
    for data in datafile:
        if data['filename'] == data_filename:
            break
    else:
        msg = 'Specified filename to verify %s ' % data_filename
        msg += 'did not appear in license file %s' % license_filename
        raise FilenameMismatch(msg)

    # Filename
    if data['filename'] == '':
        raise FilenameMismatch('Missing filename')

    filename = join(dirpath, data['filename'])
    if verbose: print('Filename: "%s"' % filename)
    try:
        fid = open(filename, 'r')
    except IOError:
        msg = 'Specified filename %s could not be opened'\
              % filename
        raise FilenameMismatch(msg)
    else:
        # Only existence/readability is tested here; don't leak the handle
        fid.close()

    # CRC
    reported_crc = data['checksum']
    if verbose: print('Checksum: "%s"' % reported_crc)

    file_crc = str(compute_checksum(filename))
    if reported_crc != file_crc:
        msg = 'Bad checksum (CRC).\n'
        msg += '  The CRC reported in license file "%s" is "%s"\n'\
               % (license_filename, reported_crc)
        msg += '  The CRC computed from file "%s" is "%s"'\
               % (filename, file_crc)
        raise CRCMismatch(msg)

    # Accountable, Source and IP owner must all be filled in
    mandatory = (('accountable', 'Accountable',
                  'No accountable person specified'),
                 ('source', 'Source',
                  'No source specified'),
                 ('IP_owner', 'IP owner',
                  'No IP owner specified'))
    for tag, label, complaint in mandatory:
        value = data[tag]
        if verbose: print('%s: "%s"' % (label, value))
        if value == '':
            raise Empty(complaint)

    # IP info (may be left blank, so it is only reported)
    ip_info = data['IP_info']
    if verbose: print('IP info: "%s"' % ip_info)

    # Publishable
    publishable = data['publishable']
    if verbose: print('Publishable: "%s"' % publishable)
    if publishable == '':
        raise NotPublishable('No publishable value specified')

    if publishable.upper() != 'YES':
        msg = 'Data file %s is not flagged as publishable'\
              % filename
        raise NotPublishable(msg)

    # If we get this far, the license file is OK
    return True
Note: See TracBrowser for help on using the repository browser.