Context Navigation

source: trunk/anuga_core/source/anuga/utilities/xml_tools.py @ 8968

Last change on this file since 8968 was 5628, checked in by ole, 16 years ago
Added error check in xml_tools to catch text nodes followed by non-text nodes. This was in response to an error in one of the license files (not yet fixed).
File size: 8.0 KB

Line
1	"""Basic XML utilities based on minidom - the built in Document Object Model
2	"""
3
4	import sys
5	from xml.dom import minidom, Node
6	#from xml.sax import make_parser, parse as validate, handler
7
def print_tree(n, indent=0):
    """Print a DOM node, its following siblings and all their descendants.

    One line is written to standard output per node, showing the node's
    name, type and (stripped) value.  Children are printed directly after
    their parent with the indentation increased by four spaces.

    Parameters:
        n: DOM node to start from, or None (in which case nothing is printed).
        indent: Number of spaces the current level is indented by.
    """

    while n:
        # Join the fields with single spaces into one string so that the
        # call form of print yields the same line on Python 2 and Python 3.
        fields = [' ' * indent,
                  'Node name: "%s",' % n.nodeName,
                  'Node type: "%s",' % n.nodeType,
                  'Node value: "%s"' % str(n.nodeValue).strip()]
        print(' '.join(fields))

        # Depth first: descend into the children before moving on to the
        # next sibling at this level.
        print_tree(n.firstChild, indent + 4)
        n = n.nextSibling
22
23
def pretty_print_tree(n, indent=0):
    """Print the default string representation of n to standard output.

    Parameters:
        n: Object to print.
        indent: Accepted for symmetry with print_tree but currently unused.
    """

    # Call form of print works on both Python 2 and Python 3.
    print(n)
26
def parse(fid):
    """Read XML from an open, seekable file object and return its DOM.

    The stream is rewound to its start first, so the whole content is
    parsed regardless of the current read position.
    """

    # FIXME (OLE): XML code should be validated against the DTD

    fid.seek(0)
    return minidom.parse(fid)
38
39
def get_elements(nodelist):
    """Filter nodelist down to its element nodes.

    Returns a new list holding, in their original order, those nodes whose
    nodeType is Node.ELEMENT_NODE.
    """

    return [candidate for candidate in nodelist
            if candidate.nodeType == Node.ELEMENT_NODE]
50
51
def get_text(nodelist):
    """Return a concatenation of text fields from list of nodes

    The values of all nodes of type Node.TEXT_NODE are joined, in order,
    with ', ' as separator.  Nodes of any other type are ignored.  An empty
    string is returned if the list contains no text nodes.
    """

    # join places the separator between items only, so there is no trailing
    # ', ' to slice off afterwards.
    return ', '.join([node.nodeValue for node in nodelist
                      if node.nodeType == Node.TEXT_NODE])
63
64
65
def remove_whitespace(s):
    """Remove excess whitespace including newlines from string

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines) is replaced by a single space.
    """

    # str.join is used rather than string.join, which no longer exists in
    # Python 3; the result is the same.
    return ' '.join(s.split())  # split() without arguments splits on whitespace
73
74	#return s.replace('\n', '')
75	#s.translate(string.maketrans)
76
77
78
79	#----------------------------
80	# XML object model
81	#----------------------------
82
class XML_element(dict):
    """Node in a simple, ordered XML object model.

    Every element has a tag and a value.  The value is either a text string
    (terminal node) or a list of child XML_element instances.  An element
    created without a tag represents the document root: its tag is the XML
    declaration and it is rendered without a closing tag.

    Although the class derives from dict, tag and children are kept in the
    attributes 'tag' and 'value' rather than in the dictionary itself.
    Representing children as {tag: value, ...} was tried and abandoned
    because it removes any notion of ordering.
    """

    # Base type of text values: basestring on Python 2, str on Python 3.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self,
                 tag=None,
                 value=None,
                 version='1.0',
                 encoding='iso-8859-1'):
        """
        value can be either
          * An XML_element
          * a list of XML_value
          * a text string

        If tag is None the element becomes the root element and its tag is
        set to the XML declaration built from version and encoding.  These
        two arguments are ignored when a tag is given.
        """

        # A single child is stored as a one-element list so that the value
        # is always either text or a list of elements.
        if isinstance(value, XML_element):
            value = [value]

        self.value = value

        if tag is None:
            tag = '?xml version="%s" encoding="%s"?' % (version, encoding)
            self.root_element = True
        else:
            self.root_element = False

        self.tag = tag

    def _is_text(self):
        """Return True if this element holds text rather than children."""

        return isinstance(self.value, self._string_types)

    def _children(self):
        """Return the child elements; a text element has none."""

        if self._is_text():
            # Iterating over the text itself would yield single characters,
            # which have no tag attribute.
            return []
        return self.value

    def __add__(self, other):
        return str(self) + str(other)

    def __radd__(self, other):
        return str(other) + str(self)  # Python swaps self and other

    def __repr__(self):
        return str(self)

    def __str__(self, indent=0):
        """String representation of XML element

        indent is the number of spaces this element is indented by.
        Children of the root element stay at the root's indentation,
        children of any other element are indented four spaces further.
        """

        if self.root_element:
            increment = 0
        else:
            increment = 4

        tab = ' ' * indent

        # Collect the pieces and join once at the end.
        parts = [tab, '<%s>' % self.tag]
        if self._is_text():
            parts.append(remove_whitespace(self.value))
        else:
            parts.append('\n')
            for e in self.value:
                parts.append(e.__str__(indent + increment))
            parts.append(tab)

        # The XML declaration of the root element has no closing tag
        if not self.root_element:
            parts.append('</%s>\n' % self.tag)

        return ''.join(parts)

    def __getitem__(self, key):
        """Return sub-tree starting at element with tag equal to specified key
        If node is terminal, its text value will be returned instead of itself.
        This will allow statements such as

        assert xmlobject['datafile']['accountable'] == 'Jane Sexton'

        If more than one element matches the given key a list of all
        matches will be returned.  If nothing matches (which includes the
        case where this element itself is a text node) None is returned.
        """

        result = []
        for node in self._children():
            if node.tag == key:
                if isinstance(node.value, self._string_types):
                    result.append(str(node.value))
                else:
                    result.append(node)

        if len(result) == 0:
            return None
        if len(result) == 1:
            return result[0]
        return result

    def has_key(self, key):
        """Return True if at least one direct child has the given tag."""

        return any(node.tag == key for node in self._children())

    def keys(self):
        """Return the tags of all direct children, in document order."""

        return [str(node.tag) for node in self._children()]

    def pretty_print(self, indent=0):
        """Print the document without tags using indentation

        The text is returned as a string, not written to standard output.
        Each nesting level is indented four spaces further than its parent.
        """

        s = tab = ' ' * indent
        s += '%s: ' % self.tag
        if self._is_text():
            s += self.value
        else:
            s += '\n'
            for e in self.value:
                s += e.pretty_print(indent + 4)
        s += '\n'

        return s
217
218
def xml2object(xml, verbose=False):
    """Generate XML object model from XML file

    This is the inverse operation to the __str__ representation
    (up to whitespace).

    Input xml can be either an
      * xml file name
      * open xml file object

    verbose is accepted but currently unused.

    Return the object produced by dom2object for the parsed document.

    Raise Exception if the input cannot be parsed or if the parsed
    document cannot be converted.  The message includes the name of the
    input where one is available.
    """

    # FIXME - can we allow xml to be string?
    # This would depend on minidom's parse function

    # Base type of strings: basestring on Python 2, str on Python 3
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Input tests
    opened_here = isinstance(xml, string_types)
    if opened_here:
        # Binary mode hands the raw bytes to the parser, which can then
        # honour the encoding named in the XML declaration.
        fid = open(xml, 'rb')
    else:
        fid = xml

    # File-like objects need not have a name attribute.  Look it up once
    # here so that building an error message cannot itself fail.
    name = getattr(fid, 'name', '<unnamed stream>')

    try:
        try:
            dom = parse(fid)
        except Exception as e:
            # Throw filename into dom exception
            msg = 'XML file "%s" could not be parsed.\n' % name
            msg += 'Error message from parser: "%s"' % str(e)
            raise Exception(msg)

        try:
            xml_object = dom2object(dom)
        except Exception as e:
            msg = 'Could not convert %s into XML object.\n' % name
            msg += str(e)
            raise Exception(msg)
    finally:
        # Only close what was opened here; a file object passed in by the
        # caller remains the caller's responsibility.
        if opened_here:
            fid.close()

    return xml_object
257
258
259
def dom2object(node):
    """Convert DOM representation to XML_object hierarchy.

    The children of node are converted recursively.  The resulting
    XML_element has as value either
      * the stripped text of node's text child,
      * a list of XML_element instances, one per non-text child, or
      * '' if node has neither text nor other children.

    Text children consisting of whitespace only are ignored.  The document
    node itself is converted into an XML_element without tag.

    Raise Exception if a non-empty text node is followed by a non-text node.
    """

    value = []
    textnode_encountered = None
    for n in node.childNodes:

        if n.nodeType == Node.TEXT_NODE:
            # Child is a text element - omit the dom tag #text and
            # go straight to the text value.

            # Note - only the last text value will be recorded

            msg = 'Text element has child nodes - this shouldn\'t happen'
            assert len(n.childNodes) == 0, msg

            x = n.nodeValue.strip()
            if len(x) == 0:
                # Skip empty text children
                continue

            # NOTE(review): this replaces whatever has been collected so
            # far, so child elements that precede a non-empty text node are
            # silently discarded.  Only the opposite order (text before an
            # element) is reported as an error below - confirm whether both
            # orders should be rejected.
            textnode_encountered = value = x
        else:
            # XML element (or any other non-text node)

            if textnode_encountered is not None:
                msg = 'A text node was followed by a non-text tag. This is not allowed.\n'
                msg += 'Offending text node: "%s" ' % str(textnode_encountered)
                msg += 'was followed by node named: "<%s>"' % str(n.nodeName)
                raise Exception(msg)

            value.append(dom2object(n))

    # Deal with empty elements
    if len(value) == 0:
        value = ''

    if node.nodeType == Node.DOCUMENT_NODE:
        # Root node (document)
        tag = None
    else:
        # Normal XML node
        tag = node.nodeName

    return XML_element(tag=tag,
                       value=value)
314
315
316
317
318
319	#=================== Useful print statement
320	#if n.nodeType == 3 and str(n.nodeValue).strip() == '':
321	# pass
322	#else:
323	# print 'Node name: "%s",' %n.nodeName,\
324	# 'Node type: "%s",' %n.nodeType,\
325	# 'Node value: "%s",' %str(n.nodeValue).strip(),\
326	# 'Node children: %d' %len(n.childNodes)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: