source: anuga_core/source/anuga/utilities/data_audit.py @ 5052

Last change on this file since 5052 was 5052, checked in by ole, 17 years ago

Nicer output of data_audit.

File size: 11.1 KB
Line 
1"""Track IP of data files in an entire directory tree.
2See docstring for the public function IP_verified()
3for details.
4"""
5
6from os import remove, walk, sep
7from os.path import join, splitext
8
9from anuga.utilities.xml_tools import xml2object, XML_element
10from anuga.utilities.system_tools import compute_checksum
11
12
# Audit exceptions
#
# Each class names one way in which a license (.lic) file can fail the audit.
# They are grouped in the tuple audit_exceptions so that a caller can catch
# all of them with a single except clause.
class NotPublishable(Exception):
    """The publishable flag of a data file is empty or is not 'YES'."""


class FilenameMismatch(Exception):
    """The audited data file is not listed in the license file,
    or the listed file is missing or could not be opened.
    """


class CRCMismatch(Exception):
    """The checksum in the license file differs from the computed one."""


class Invalid(Exception):
    """The license file is invalid.

    Not raised by the code in this module itself; it is part of
    audit_exceptions so it is treated like the other audit failures.
    """


class WrongTags(Exception):
    """The license file lacks a required XML tag or has an unexpected one."""


class Empty(Exception):
    """A mandatory field in the license file was left empty."""


audit_exceptions = (NotPublishable,
                    FilenameMismatch,
                    CRCMismatch,
                    Invalid,
                    WrongTags,
                    Empty)
27
28
def IP_verified(directory,
                extensions_to_ignore=None,
                directories_to_ignore=None,
                files_to_ignore=None,
                verbose=False):
    """Find and audit potential data files that might violate IP

    This is the public function to be used to ascertain that
    all data in the specified directory tree has been audited according
    to the GA data IP tracking process.

    if not IP_verified(directory):
        # Stop and take remedial action
        ...
    else:
        # Proceed boldly with confidence

    For every file yielded by identify_datafiles() an XML license file
    with the same base name and the extension .lic is looked for in the
    same directory and checked with license_file_is_valid().

    verbose controls standard output.
    If verbose is False, only diagnostics about failed audits will appear.
    All files that check OK will pass silently.
    If verbose is True, a summary with the number of files audited and
    verified is printed as well.

    Optional arguments extensions_to_ignore, directories_to_ignore, and
    files_to_ignore are lists of things to skip.

    Examples are:
    extensions_to_ignore = ['.py','.c','.h', '.f'] # Ignore source code
    files_to_ignore = ['README.txt']
    directories_to_ignore = ['.svn', 'misc']

    None is also OK for these parameters.

    Returns True if every audited file has a valid license file,
    False otherwise.
    """

    # None means "ignore nothing". Normalise here so that the iteration
    # over the data files never has to deal with None.
    if extensions_to_ignore is None:
        extensions_to_ignore = []
    if directories_to_ignore is None:
        directories_to_ignore = []
    if files_to_ignore is None:
        files_to_ignore = []

    all_files = 0
    ok_files = 0
    all_files_accounted_for = True

    # Directory for which the header has already been printed. Tracking
    # this (rather than "is this the first file of the directory") makes
    # sure the header also appears when the first failure in a directory
    # is not the first file visited there.
    header_printed_for = None

    for dirpath, filename in identify_datafiles(directory,
                                                extensions_to_ignore,
                                                directories_to_ignore,
                                                files_to_ignore):
        all_files += 1

        basename = splitext(filename)[0]
        license_filename = join(dirpath, basename + '.lic')

        # Look for a XML license file with the .lic
        status = 'OK'
        try:
            fid = open(license_filename)
        except IOError:
            status = 'NO LICENSE FILE'
        else:
            fid.close()

            # NOTE(review): only the audit exceptions are caught here; any
            # other exception raised by license_file_is_valid() propagates
            # to the caller, as it did before.
            try:
                license_file_is_valid(license_filename,
                                      filename,
                                      dirpath,
                                      verbose=False)
            except audit_exceptions as e:
                status = 'LICENSE FILE NOT VALID\n'
                status += 'REASON: %s\n' % e

                # If the license file cannot even be parsed, append its
                # raw contents to the diagnostics.
                try:
                    xml2object(license_filename)
                except Exception:
                    status += 'XML file %s could not be read:'\
                              % license_filename
                    fid = open(license_filename)
                    try:
                        status += fid.read()
                    finally:
                        fid.close()

        if status == 'OK':
            # No news is good news
            ok_files += 1
            continue

        all_files_accounted_for = False

        # Print the directory header once, before its first failing file
        if header_printed_for != dirpath:
            header_printed_for = dirpath
            print('')
            print('------------------------------------')
            print('Files without licensing info in dir: %s' % dirpath)
            print('------------------------------------')

        checksum = str(compute_checksum(join(dirpath, filename)))
        print('%s (Checksum=%s):  %s' % (filename, checksum, status))

    if verbose is True:
        print('')
        print('---------------------')
        print('Audit result for dir: %s:' % directory)
        print('---------------------')
        print('Number of files audited:  %d' % all_files)
        print('Number of files verified: %d' % ok_files)
        print('')

    # Return result
    return all_files_accounted_for
155
156
157
158#------------------
159# Private functions
160#------------------
def identify_datafiles(root,
                       extensions_to_ignore=None,
                       directories_to_ignore=None,
                       files_to_ignore=None):
    """Identify files that might contain data

    Walk the directory tree below root and yield a tuple
    (dirpath, filename) for every file that is not excluded.

    A file is excluded if its name ends with one of extensions_to_ignore,
    if its name is listed in files_to_ignore, or if it lives in or below
    a directory whose name is listed in directories_to_ignore.

    Each of the three optional parameters may be None, meaning that
    nothing is ignored on that account.
    See function IP_verified() for details about optional parameters
    """

    # None (the default) means "ignore nothing"
    if extensions_to_ignore is None:
        extensions_to_ignore = []
    if directories_to_ignore is None:
        directories_to_ignore = []
    if files_to_ignore is None:
        files_to_ignore = []

    # str.endswith accepts a tuple of suffixes; build it once
    ignored_extensions = tuple(extensions_to_ignore)

    for dirpath, dirnames, filenames in walk(root):

        # Prune dirnames in place: walk() only descends into the
        # directories that are still listed after this loop.
        for ignore in directories_to_ignore:
            if ignore in dirnames:
                dirnames.remove(ignore)  # don't visit ignored directories

        for filename in filenames:
            if filename in files_to_ignore:
                continue

            # Ignore extensions that need no IP check
            if filename.endswith(ignored_extensions):
                continue

            yield dirpath, filename
191
192
def license_file_is_valid(license_filename, data_filename,
                          dirpath='.', verbose=False):
    """Check that XML license file for given data_filename is valid.

    Input:
        license_filename: XML license file (must be an absolute path name)
        data_filename: The data filename that is being audited
        dirpath: Where the files live
        verbose: Optional verbosity. If True the extracted fields are
                 printed as they are checked.

    Check for the datafile entry matching data_filename that

    * Datafile tags are there and match the one specified
    * Fields are non empty (except IP_info which can be left blank)
    * Datafile exists
    * Checksum is correct
    * Datafile is flagged as publishable

    If anything is violated an appropriate exception is raised:

    * WrongTags: required tags are missing or unexpected tags are present
    * Empty: author, accountable, source or IP_owner is empty
    * FilenameMismatch: data_filename is not listed in the license file,
      or the listed file cannot be opened
    * CRCMismatch: the reported checksum differs from the computed one
    * NotPublishable: the publishable field is empty or is not 'YES'

    If everything is hunky-dory the function will return True.
    """

    doc = xml2object(license_filename)

    # Check that file is valid (e.g. all elements there)
    if not doc.has_key('ga_license_file'):
        msg = 'License file %s must have two elements' % license_filename
        msg += ' at the root level. They are\n'
        msg += '  <?xml version="1.0" encoding="iso-8859-1"?>\n'
        msg += '  <ga_license_file>\n'
        msg += 'The second element was found to be %s' % doc.keys()
        raise WrongTags(msg)

    # Validate elements: metadata, datafile, datafile, ...
    # FIXME (Ole): I'd like this to be verified by the parser
    # using a proper DTD template one day....
    # For now, let's check the main ones.
    elements = doc['ga_license_file']
    for required in ('metadata', 'datafile'):
        if not elements.has_key(required):
            # Report the tags that were actually found via keys().
            # NOTE(review): the previous message was built from
            # elements[0].nodeName, i.e. integer indexing on an object
            # that is otherwise accessed by tag name - confirm that no
            # caller depends on the old wording.
            msg = 'Tag %s must have the element "%s". '\
                  % (list(doc.keys())[0], required)
            msg += 'The elements found were %s' % elements.keys()
            raise WrongTags(msg)

    for key in elements.keys():
        if key not in ('metadata', 'datafile'):
            raise WrongTags('Invalid tag: %s' % key)

    # Extract information for metadata section
    if verbose: print('')
    metadata = elements['metadata']

    author = metadata['author']
    if verbose: print('Author:    %s' % author)
    if author == '':
        # Raised as Empty (an audit exception) so that callers catching
        # audit_exceptions report it like any other empty field.
        raise Empty('Missing author')

    # Extract information for datafile sections.
    # A single entry is wrapped in a list so one or many look the same.
    datafile = elements['datafile']
    if isinstance(datafile, XML_element):
        datafile = [datafile]

    # Check that filename to verify is listed in license file
    data = None
    for candidate in datafile:
        if candidate['filename'] == data_filename:
            data = candidate
            break

    if data is None:
        msg = 'Specified filename to verify %s ' % data_filename
        msg += 'did not appear in license file %s' % license_filename
        raise FilenameMismatch(msg)

    # Check contents for selected data_filename

    # Filename
    if data['filename'] == '':
        raise FilenameMismatch('Missing filename')

    filename = join(dirpath, data['filename'])
    if verbose: print('Filename: "%s"' % filename)
    try:
        fid = open(filename, 'r')
    except Exception:
        # Any failure to open the file counts as a filename mismatch
        msg = 'Specified filename %s could not be opened'\
              % filename
        raise FilenameMismatch(msg)
    else:
        # The file was opened only to prove it is readable
        fid.close()

    # CRC
    reported_crc = data['checksum']
    if verbose: print('Checksum: "%s"' % reported_crc)

    file_crc = str(compute_checksum(filename))
    if reported_crc != file_crc:
        msg = 'Bad checksum (CRC).\n'
        msg += '  The CRC reported in license file "%s" is "%s"\n'\
               % (license_filename, reported_crc)
        msg += '  The CRC computed from file "%s" is "%s"'\
               % (filename, file_crc)
        raise CRCMismatch(msg)

    # Accountable
    accountable = data['accountable']
    if verbose: print('Accountable: "%s"' % accountable)
    if accountable == '':
        raise Empty('No accountable person specified')

    # Source
    source = data['source']
    if verbose: print('Source: "%s"' % source)
    if source == '':
        raise Empty('No source specified')

    # IP owner
    ip_owner = data['IP_owner']
    if verbose: print('IP owner: "%s"' % ip_owner)
    if ip_owner == '':
        raise Empty('No IP owner specified')

    # IP info (may legitimately be left blank, so it is only displayed)
    ip_info = data['IP_info']
    if verbose: print('IP info: "%s"' % ip_info)

    # Publishable
    publishable = data['publishable']
    if verbose: print('Publishable: "%s"' % publishable)
    if publishable == '':
        raise NotPublishable('No publishable value specified')

    if publishable.upper() != 'YES':
        msg = 'Data file %s is not flagged as publishable'\
              % filename
        raise NotPublishable(msg)

    # If we get this far, the license file is OK
    return True
Note: See TracBrowser for help on using the repository browser.