1 | """Basic XML utilities based on minidom - the built in Document Object Model |
---|
2 | """ |
---|
3 | |
---|
4 | import sys |
---|
5 | from xml.dom import minidom, Node |
---|
6 | #from xml.sax import make_parser, parse as validate, handler |
---|
7 | |
---|
def print_tree(n, indent=0):
    """Print a DOM node, its following siblings and all their descendants.

    One line is written to standard output per node, giving the node name,
    the numeric node type and the whitespace-stripped node value.  The
    children of a node are indented four columns further than the node.

    n: DOM node to start from.  A false value (e.g. None) prints nothing.
    indent: number of leading spaces used for the nodes at this level.
    """

    while n:
        # Build the complete line first: print() with a single argument
        # behaves the same as a Python 2 statement and a Python 3 function.
        # Joining with ' ' reproduces the comma separated print statement,
        # including the space that follows the (possibly empty) indent.
        fields = [' ' * indent,
                  'Node name: "%s",' % n.nodeName,
                  'Node type: "%s",' % n.nodeType,
                  'Node value: "%s"' % str(n.nodeValue).strip()]
        print(' '.join(fields))

        # Descend into the children; siblings are handled by this loop.
        print_tree(n.firstChild, indent + 4)
        n = n.nextSibling
---|
22 | |
---|
23 | |
---|
def pretty_print_tree(n, indent=0):
    """Print n to standard output using its default string conversion.

    n: object to print.
    indent: accepted for signature compatibility; currently unused.
    """

    # Single-argument print() is valid and equivalent on Python 2 and 3.
    print(n)
---|
26 | |
---|
def parse(fid):
    """Parse XML file descriptor and return DOM object.

    fid: open file or file-like object providing read().  If it can be
         rewound it is repositioned to its start before parsing, so a
         stream that has already been partly read is parsed in full.

    Returns the minidom Document built from the stream contents.
    Errors raised by minidom while parsing are propagated unchanged.
    """

    # FIXME (OLE): XML code should be validated against the DTD
    #validate(fid, handler)
    #doc = minidom.parse(fid, make_parser())

    # Rewinding is best effort: pipes, sockets and minimal file-like
    # objects either lack seek() or refuse it, and can still be parsed
    # from their current position.
    seek = getattr(fid, 'seek', None)
    if seek is not None:
        try:
            seek(0)
        except (IOError, OSError):
            pass

    return minidom.parse(fid)
---|
38 | |
---|
39 | |
---|
def get_elements(nodelist):
    """Filter nodelist down to its element nodes.

    nodelist: iterable of DOM nodes.

    Returns a new list, in the original order, holding only the nodes
    whose nodeType is Node.ELEMENT_NODE.
    """

    return [candidate for candidate in nodelist
            if candidate.nodeType == Node.ELEMENT_NODE]
---|
50 | |
---|
51 | |
---|
def get_text(nodelist):
    """Return a concatenation of text fields from list of nodes

    nodelist: iterable of DOM nodes.

    The values of all nodes whose nodeType is Node.TEXT_NODE are joined
    with ', ' in their original order; other nodes are ignored.  An empty
    string is returned when there are no text nodes.
    """

    # join() inserts the separator only between items, so no trailing
    # ', ' has to be trimmed afterwards.
    return ', '.join([node.nodeValue for node in nodelist
                      if node.nodeType == Node.TEXT_NODE])
---|
63 | |
---|
64 | |
---|
65 | |
---|
def remove_whitespace(s):
    """Remove excess whitespace including newlines from string

    s: string to normalise.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.
    """

    # split() without arguments splits on any run of whitespace and
    # discards empty fields.  The str.join method replaces the
    # string.join function, which is not available on Python 3.
    return ' '.join(s.split())
---|
76 | |
---|
77 | |
---|
78 | |
---|
79 | #---------------------------- |
---|
80 | # XML object model |
---|
81 | #---------------------------- |
---|
82 | |
---|
try:
    _STRING_TYPES = basestring  # Python 2: matches both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3: basestring no longer exists


class XML_element(dict):
    """Node of a simple, ordered XML object model.

    An element has a tag and a value.  The value is either a text string
    (terminal element) or a list of child XML_element instances.  The
    dict base class only provides the mapping-style interface; the
    inherited dict storage is never filled - children live in self.value
    so that their order is preserved.
    """

    def __init__(self,
                 tag=None,
                 value=None,
                 version='1.0',
                 encoding='iso-8859-1'):
        """
        value can be either
          * An XML_element
          * a list of XML_element
          * a text string

        tag is the element name.  If it is None the element becomes the
        document root: its tag is the XML declaration built from version
        and encoding, and no closing tag is written for it.
        """

        # A single child is stored as a one-element list so the rest of
        # the class only has to deal with "text" or "list of children".
        if isinstance(value, XML_element):
            value = [value]

        self.value = value

        if tag is None:
            tag = '?xml version="%s" encoding="%s"?' % (version, encoding)
            self.root_element = True
        else:
            self.root_element = False

        self.tag = tag

        # FIXME: It might be better to represent these objects
        # in a proper dictionary format with
        # {tag: value, ...}
        # No, tried that - it removes any notion of ordering.

    def _children(self):
        """Return the list of child elements ([] for text or missing value)."""

        if self.value is None or isinstance(self.value, _STRING_TYPES):
            # Iterating a text value would yield single characters,
            # which have no tag attribute.
            return []
        return self.value

    def __add__(self, other):
        return str(self) + str(other)

    def __radd__(self, other):
        return str(other) + str(self)  # Python swaps self and other

    def __repr__(self):
        return str(self)

    def __str__(self, indent=0):
        """String representation of XML element

        indent: number of spaces placed in front of the opening tag.
        Children of a non-root element are indented four columns more;
        children of the root element keep the indentation of the root.
        """

        if self.root_element:
            increment = 0
        else:
            increment = 4

        tab = ' ' * indent
        parts = [tab, '<%s>' % self.tag]

        if isinstance(self.value, _STRING_TYPES):
            # NOTE(review): text is written verbatim - no escaping of
            # '&', '<' or '>' is applied here; confirm callers only pass
            # text that is safe to embed.
            parts.append(remove_whitespace(self.value))
        else:
            parts.append('\n')
            for e in self._children():
                parts.append(e.__str__(indent + increment))
            parts.append(tab)

        # The root "tag" is the XML declaration, which is never closed.
        if not self.root_element:
            parts.append('</%s>\n' % self.tag)

        return ''.join(parts)

    def __getitem__(self, key):
        """Return sub-tree starting at element with tag equal to specified key
        If node is terminal, its text value will be returned instead of itself.
        This will allow statements such as

        assert xmlobject['datafile']['accountable'] == 'Jane Sexton'

        If more than one element matches the given key a list of all
        matches will be returned.  None is returned if nothing matches
        (no KeyError is raised), including when this element is terminal.
        """

        result = []
        for node in self._children():
            if node.tag == key:
                if isinstance(node.value, _STRING_TYPES):
                    result.append(str(node.value))
                else:
                    result.append(node)

        if not result:
            return None
        if len(result) == 1:
            return result[0]
        return result

    def has_key(self, key):
        """Return True if at least one direct child has tag equal to key."""

        return any(node.tag == key for node in self._children())

    def __contains__(self, key):
        """Support "key in element" with the same meaning as has_key."""

        # Without this, "in" would consult the always empty dict storage.
        return self.has_key(key)

    def keys(self):
        """Return the tags of all direct children, in order, with repeats."""

        return [str(node.tag) for node in self._children()]

    def pretty_print(self, indent=0):
        """Print the document without tags using indentation

        Despite the name nothing is written to standard output: the
        formatted text is returned as a string.
        """

        s = ' ' * indent
        s += '%s: ' % self.tag
        if isinstance(self.value, _STRING_TYPES):
            s += self.value
        else:
            s += '\n'
            for e in self._children():
                s += e.pretty_print(indent + 4)
        s += '\n'

        return s
---|
217 | |
---|
218 | |
---|
def xml2object(xml, verbose=False):
    """Generate XML object model from XML file or XML text

    This is the inverse operation to the __str__ representation
    (up to whitespace).

    Input xml can be either an
      * xml file name
      * open xml file object

    verbose is accepted for interface compatibility; it is not used.

    Return the XML_element instance produced by dom2object for the
    parsed document.

    Raises Exception with a message naming the input if the file cannot
    be parsed or the parsed document cannot be converted.
    """

    # FIXME - can we allow xml to be string?
    # This would depend on minidom's parse function

    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3

    # Input tests
    opened_here = isinstance(xml, string_types)
    if opened_here:
        # Binary mode lets the XML parser apply the encoding named in
        # the XML declaration instead of the platform default.
        fid = open(xml, 'rb')
    else:
        fid = xml

    # File-like objects such as StringIO have no name attribute; fall
    # back to a placeholder so the real error is still reported.
    name = getattr(fid, 'name', '<unnamed stream>')

    try:
        try:
            dom = parse(fid)
        except Exception as e:
            # Throw filename into dom exception
            msg = 'XML file "%s" could not be parsed.\n' % name
            msg += 'Error message from parser: "%s"' % str(e)
            raise Exception(msg)

        try:
            xml_object = dom2object(dom)
        except Exception as e:
            msg = 'Could not convert %s into XML object.\n' % name
            msg += str(e)
            raise Exception(msg)
    finally:
        # Only close what was opened here; a caller supplied file object
        # remains under the caller's control.
        if opened_here:
            fid.close()

    return xml_object
---|
257 | |
---|
258 | |
---|
259 | |
---|
def dom2object(node):
    """Convert DOM representation to XML_object hierarchy.

    node: DOM node (document or element) to convert.

    Returns an XML_element whose value is
      * the stripped text, if the node has non-blank text content,
      * a list of converted child nodes, if it has non-text children,
      * the empty string, if it has neither.
    A document node (Node.DOCUMENT_NODE) is converted with tag None,
    which makes the result the root element.

    Raises Exception if non-blank text is followed by a non-text node.
    """

    value = []
    textnode_encountered = None
    for n in node.childNodes:

        if n.nodeType == Node.TEXT_NODE:
            # Child is a text element - omit the dom tag #text and
            # go straight to the text value.

            # Note - only the last text value will be recorded
            # NOTE(review): assigning the text replaces the list, so any
            # elements converted before this text node are discarded
            # without an error - confirm this is intended.

            msg = 'Text element has child nodes - this shouldn\'t happen'
            assert len(n.childNodes) == 0, msg

            x = n.nodeValue.strip()
            if len(x) == 0:
                # Skip empty text children
                continue

            textnode_encountered = value = x
        else:
            # Any other node type (elements, but also e.g. comments)
            # is converted recursively.

            if textnode_encountered is not None:
                msg = 'A text node was followed by a non-text tag. This is not allowed.\n'
                msg += 'Offending text node: "%s" ' % str(textnode_encountered)
                msg += 'was followed by node named: "<%s>"' % str(n.nodeName)
                raise Exception(msg)

            value.append(dom2object(n))

    # Deal with empty elements
    if len(value) == 0:
        value = ''

    if node.nodeType == Node.DOCUMENT_NODE:
        # Root node (document)
        tag = None
    else:
        # Normal XML node
        tag = node.nodeName

    return XML_element(tag=tag,
                       value=value)
---|
314 | |
---|
315 | |
---|
316 | |
---|
317 | |
---|
318 | |
---|
319 | #=================== Useful print statement |
---|
320 | #if n.nodeType == 3 and str(n.nodeValue).strip() == '': |
---|
321 | # pass |
---|
322 | #else: |
---|
323 | # print 'Node name: "%s",' %n.nodeName,\ |
---|
324 | # 'Node type: "%s",' %n.nodeType,\ |
---|
325 | # 'Node value: "%s",' %str(n.nodeValue).strip(),\ |
---|
326 | # 'Node children: %d' %len(n.childNodes) |
---|