source: anuga_core/source/anuga/utilities/data_audit.py @ 5120

Last change on this file since 5120 was 5057, checked in by ole, 17 years ago

Simplified things

File size: 12.2 KB
Line 
1"""Track IP of data files in an entire directory tree.
2See docstring for the public function IP_verified()
3for details on algorithm.
4
5An example of the XML format expected by this module is
6
7
8<?xml version="1.0" encoding="iso-8859-1"?>
9
10<ga_license_file>
11  <metadata>
12    <author>Ole Nielsen</author>
13  </metadata>
14 
15  <datafile>
16    <filename>channel1.png</filename>
17    <checksum>1339122967</checksum>
18    <publishable>Yes</publishable>
19    <accountable>Ole Nielsen</accountable>
20    <source>Generated by ANUGA development team</source>
21    <IP_owner>Geoscience Australia</IP_owner>
22    <IP_info>For use with ANUGA manual</IP_info>   
23  </datafile>
24
25</ga_license_file>
26
27
28There can be more than one <datafile> element to cover files
29with different extensions.
30
31
32Here's a DTD format, we might implement one day
33
34   <!DOCTYPE ga_license_file [
35      <!ELEMENT ga_license_file (metadata, datafile+)>
36      <!ELEMENT metadata (author, svn_keywords)>
37      <!ELEMENT svn_keywords (author, date, revision, url, id)>   
38      <!ELEMENT datafile (filename, publishable, accountable,
39                        source, IP_owner, IP_info)>   
40      <!ELEMENT filename (#PCDATA)>
41      <!ELEMENT publishable (#PCDATA)>   
42      <!ELEMENT accountable (#PCDATA)>       
43      <!ELEMENT source (#PCDATA)>                               
44      <!ELEMENT IP_owner (#PCDATA)>           
45      <!ELEMENT IP_info (#PCDATA)>               
46  ]>
47
48
49
50"""
51
52from os import remove, walk, sep
53from os.path import join, splitext
54
55from anuga.utilities.xml_tools import xml2object, XML_element
56from anuga.utilities.system_tools import compute_checksum
57
58
# Exceptions used to signal that the audit of a license file failed
class NotPublishable(Exception):
    """Publishable flag is blank or is something other than 'yes'."""


class FilenameMismatch(Exception):
    """Data file is not listed in the license file or cannot be opened."""


class CRCMismatch(Exception):
    """Checksum in the license file differs from the computed checksum."""


class Invalid(Exception):
    """Generic audit failure (not raised by the functions in this module)."""


class WrongTags(Exception):
    """License file has missing or unexpected XML elements."""


class Empty(Exception):
    """A mandatory field in the license file was left blank."""


# Tuple form so that it can be used directly in an except clause
audit_exceptions = (NotPublishable, FilenameMismatch, CRCMismatch,
                    Invalid, WrongTags, Empty)
73
74
def IP_verified(directory,
                extensions_to_ignore=None,
                directories_to_ignore=None,
                files_to_ignore=None,
                verbose=False):
    """Find and audit potential data files that might violate IP

    This is the public function to be used to ascertain that
    all data in the specified directory tree has been audited according
    to the GA data IP tracking process.

    if IP_verified(directory) is False:
        # Stop and take remedial action
        ...
    else:
        # Proceed boldly with confidence

    For every file found by identify_datafiles() a license file with the
    same base name and the extension .lic is expected in the same
    directory. That license file is checked with license_file_is_valid().

    verbose controls standard output.
    If verbose is False, only diagnostics about failed audits will appear.
    All files that check OK will pass silently.

    Optional arguments extensions_to_ignore, directories_to_ignore, and
    files_to_ignore are lists of things to skip.

    Examples are:
    extensions_to_ignore = ['.py','.c','.h', '.f'] # Ignore source code
    files_to_ignore = ['README.txt']
    directories_to_ignore = ['.svn', 'misc']

    None is also OK for these parameters.

    Returns True if every file audited had a valid license file,
    False otherwise.
    """

    # None means 'ignore nothing'. Normalise here so that the documented
    # default really works regardless of how the file walker treats None.
    if extensions_to_ignore is None:
        extensions_to_ignore = []
    if directories_to_ignore is None:
        directories_to_ignore = []
    if files_to_ignore is None:
        files_to_ignore = []

    # Identify data files
    oldpath = None
    all_files = 0
    ok_files = 0
    all_files_accounted_for = True
    for dirpath, filename in identify_datafiles(directory,
                                                extensions_to_ignore,
                                                directories_to_ignore,
                                                files_to_ignore):

        if oldpath != dirpath:
            # New directory: its header has not been printed yet
            oldpath = dirpath
            first_time_this_dir = True

        all_files += 1

        basename, ext = splitext(filename)
        license_filename = join(dirpath, basename + '.lic')

        # Look for a XML license file with the .lic extension
        status = 'OK'
        try:
            fid = open(license_filename)
        except IOError:
            status = 'NO LICENSE FILE'
            all_files_accounted_for = False
        else:
            # File was only opened to establish that it is readable
            fid.close()

            try:
                license_file_is_valid(license_filename,
                                      filename,
                                      dirpath,
                                      verbose=False)
            except audit_exceptions as e:
                all_files_accounted_for = False
                status = 'LICENSE FILE NOT VALID\n'
                status += 'REASON: %s\n' % e

                # If the license file cannot even be parsed, show its
                # raw contents to help diagnose the problem.
                try:
                    xml2object(license_filename)
                except Exception:
                    status += 'XML file %s could not be read:'\
                              % license_filename
                    fid = open(license_filename)
                    try:
                        status += fid.read()
                    finally:
                        fid.close()

        if status == 'OK':
            ok_files += 1
        else:
            # Only print status if there is a problem (no news is good news)
            if first_time_this_dir:
                print('')
                print('------------------------------------')
                print('Files without licensing info in dir: %s' % dirpath)
                print('------------------------------------')
                first_time_this_dir = False

            checksum = str(compute_checksum(join(dirpath, filename)))
            print('%s (Checksum=%s):  %s' % (filename, checksum, status))

    if verbose:
        print('')
        print('---------------------')
        print('Audit result for dir: %s:' % directory)
        print('---------------------')
        print('Number of files audited:  %d' % all_files)
        print('Number of files verified: %d' % ok_files)
        print('')

    # Return result
    return all_files_accounted_for
196
197
198
199#------------------
200# Private functions
201#------------------
def identify_datafiles(root,
                       extensions_to_ignore=None,
                       directories_to_ignore=None,
                       files_to_ignore=None):
    """Identify files that might contain data

    Generator that walks the directory tree below root and yields a
    (dirpath, filename) tuple for every file that is not excluded.

    Input:
        root: Top directory of the tree to walk
        extensions_to_ignore: Filename endings to skip, e.g. ['.py', '.c']
        directories_to_ignore: Directory names that are not descended into
        files_to_ignore: Filenames to skip

    Any of the optional arguments may be None, meaning ignore nothing.
    See function IP_verified() for details about optional parameters.
    """

    # Treat None as 'nothing to ignore' so that the defaults are usable
    if extensions_to_ignore is None:
        extensions_to_ignore = ()
    if directories_to_ignore is None:
        directories_to_ignore = ()
    if files_to_ignore is None:
        files_to_ignore = ()

    # str.endswith accepts a tuple of candidate suffixes
    extensions = tuple(extensions_to_ignore)
    skip_dirs = tuple(directories_to_ignore)

    for dirpath, dirnames, filenames in walk(root):

        # Prune in place: walk() only skips directories that are removed
        # from the very list object it handed out.
        dirnames[:] = [name for name in dirnames if name not in skip_dirs]

        for filename in filenames:
            # Ignore extensions that need no IP check
            if filename.endswith(extensions):
                continue

            if filename in files_to_ignore:
                continue

            yield dirpath, filename
232
233
def license_file_is_valid(license_filename, data_filename,
                          dirpath='.', verbose=False):
    """Check that XML license file for given data_filename is valid.

    Input:
        license_filename: XML license file (must be an absolute path name)
        data_filename: The data filename that is being audited
        dirpath: Where the files live
        verbose: Optional verbosity

    The license file must contain the root element ga_license_file holding
    exactly the elements metadata and datafile (one or more of the latter).

    For the datafile entry whose filename matches data_filename, check that

    * Datafile tags are there and match the one specified
    * Fields are non empty (except IP_info which can be left blank)
    * Datafile exists
    * Checksum is correct
    * Datafile is flagged as publishable

    If anything is violated an appropriate exception is raised:

        WrongTags: Required elements are missing or unknown ones present
        Empty: Author, accountable, source or IP_owner is blank
        FilenameMismatch: data_filename is not listed or cannot be opened
        CRCMismatch: Stored checksum differs from the computed one
        NotPublishable: Publishable is blank or not 'yes' (any case)

    If everything is hunky-dory the function will return True.
    """

    doc = xml2object(license_filename)

    # Check that file is valid (e.g. all elements there)
    if 'ga_license_file' not in doc.keys():
        msg = 'License file %s must have two elements' % license_filename
        msg += ' at the root level. They are\n'
        msg += '  <?xml version="1.0" encoding="iso-8859-1"?>\n'
        msg += '  <ga_license_file>\n'
        msg += 'The second element was found to be %s' % list(doc.keys())
        raise WrongTags(msg)

    # Validate elements: metadata, datafile, datafile, ...
    # FIXME (Ole): I'd like this to verified by the parser
    # using a proper DTD template one day....
    # For now, let's check the main ones.
    elements = doc['ga_license_file']
    tags = list(elements.keys())
    for required in ('metadata', 'datafile'):
        if required not in tags:
            msg = 'Tag ga_license_file must have the element "%s". '\
                  % required
            msg += 'The elements found were %s' % tags
            raise WrongTags(msg)

    for key in tags:
        if key not in ('metadata', 'datafile'):
            raise WrongTags('Invalid tag: %s' % key)

    # Extract information for metadata section
    if verbose: print('')
    metadata = elements['metadata']

    author = metadata['author']
    if verbose: print('Author:    %s' % (author,))
    if author == '':
        # Raised as Empty (not a bare Exception) so that it is covered
        # by audit_exceptions like every other blank-field failure.
        raise Empty('Missing author')

    # Extract information for datafile sections.
    # A single entry comes back as one element rather than a list.
    datafile = elements['datafile']
    if isinstance(datafile, XML_element):
        datafile = [datafile]

    # Check that filename to verify is listed in license file
    for data in datafile:
        if data['filename'] == data_filename:
            break
    else:
        msg = 'Specified filename to verify %s ' % data_filename
        msg += 'did not appear in license file %s' % license_filename
        raise FilenameMismatch(msg)

    # From here on 'data' is the entry matching data_filename

    # Filename
    if data['filename'] == '':
        raise FilenameMismatch('Missing filename')

    filename = join(dirpath, data['filename'])
    if verbose: print('Filename: "%s"' % filename)
    try:
        fid = open(filename, 'r')
    except (IOError, OSError):
        msg = 'Specified filename %s could not be opened'\
              % filename
        raise FilenameMismatch(msg)
    else:
        # Only readability is being tested, so release the handle at once
        fid.close()

    # CRC
    reported_crc = data['checksum']
    if verbose: print('Checksum: "%s"' % reported_crc)

    file_crc = str(compute_checksum(filename))
    if reported_crc != file_crc:
        msg = 'Bad checksum (CRC).\n'
        msg += '  The CRC reported in license file "%s" is "%s"\n'\
               % (license_filename, reported_crc)
        msg += '  The CRC computed from file "%s" is "%s"'\
               % (filename, file_crc)
        raise CRCMismatch(msg)

    # Accountable, source and IP owner must all be filled in
    mandatory_fields = (
        ('accountable', 'Accountable', 'No accountable person specified'),
        ('source', 'Source', 'No source specified'),
        ('IP_owner', 'IP owner', 'No IP owner specified'),
    )
    for tag, label, msg in mandatory_fields:
        value = data[tag]
        if verbose: print('%s: "%s"' % (label, value))
        if value == '':
            raise Empty(msg)

    # IP info (must be present but is allowed to be blank)
    ip_info = data['IP_info']
    if verbose: print('IP info: "%s"' % ip_info)

    # Publishable
    publishable = data['publishable']
    if verbose: print('Publishable: "%s"' % publishable)
    if publishable == '':
        raise NotPublishable('No publishable value specified')

    if publishable.upper() != 'YES':
        msg = 'Data file %s is not flagged as publishable'\
              % filename
        raise NotPublishable(msg)

    # If we get this far, the license file is OK
    return True
Note: See TracBrowser for help on using the repository browser.