"""Basic XML utilities based on minidom - the built in Document Object Model
"""

import sys
from xml.dom import minidom, Node
#from xml.sax import make_parser, parse as validate, handler

def print_tree(n, indent=0):
    """Print a DOM node, its following siblings and all their descendants.

    One line is written to standard output per node, giving the node's
    name, numeric node type and whitespace-stripped value.  Children are
    printed four columns further to the right than their parent.

    n:      DOM node to start from (None prints nothing)
    indent: number of spaces used to indent the current level
    """

    while n:
        # Build the whole line first and print it as one string: a single
        # parenthesised argument behaves identically as a Python 2 print
        # statement and as a Python 3 print() call.
        fields = [' '*indent,
                  'Node name: "%s",' % n.nodeName,
                  'Node type: "%s",' % n.nodeType,
                  'Node value: "%s"' % str(n.nodeValue).strip()]
        print(' '.join(fields))

        # Descend into the children, then continue along the sibling chain
        print_tree(n.firstChild, indent+4)
        n = n.nextSibling


def pretty_print_tree(n, indent=0):
    """Print the default string representation of n to standard output.

    n:      object (typically a DOM node) to print
    indent: currently unused
    """

    # Single parenthesised argument: valid under both Python 2 and Python 3
    print(n)

def parse(fid):
    """Build a minidom DOM document from an open XML file object.

    The file object is rewound to its beginning before it is handed to
    minidom, so it may already have been read from.

    fid: open, seekable file object containing XML

    Returns the DOM document produced by minidom.parse.
    """

    # TODO (OLE): the XML should be validated against its DTD before parsing

    fid.seek(0)
    return minidom.parse(fid)


def get_elements(nodelist):
    """Filter a sequence of DOM nodes down to its element nodes.

    nodelist: iterable of DOM nodes

    Returns a new list holding, in their original order, the nodes whose
    nodeType is Node.ELEMENT_NODE.
    """

    return [candidate for candidate in nodelist
            if candidate.nodeType == Node.ELEMENT_NODE]


def get_text(nodelist):
    """Join the values of all text nodes in nodelist with ', '.

    nodelist: iterable of DOM nodes

    Nodes whose nodeType is not Node.TEXT_NODE are ignored.  An empty
    string is returned when there are no text nodes.
    """

    text_values = [item.nodeValue for item in nodelist
                   if item.nodeType == Node.TEXT_NODE]
    return ', '.join(text_values)


def remove_whitespace(s):
    """Remove excess whitespace including newlines from string

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines) is replaced by a single space.
    """

    # str.split() without arguments splits on any run of whitespace.
    # ' '.join replaces string.join, which no longer exists in Python 3.
    return ' '.join(s.split())
    

#----------------------------
# XML object model
#----------------------------

class XML_element(dict):
    """Node of a simple, ordered XML object model.

    Every element has a tag and a value.  The value is either a text
    string (terminal node) or a list of child XML_element instances.
    The element created without a tag represents the document itself and
    carries the XML declaration as its tag.

    Children are kept in a list rather than in the underlying dict so that
    document order is preserved; this class never stores items in the dict
    itself.  The mapping-style methods (__getitem__, has_key, keys,
    __contains__) search the children by tag.
    """

    # Types regarded as text values.  basestring (str and unicode) only
    # exists on Python 2; on Python 3 plain str is used instead.
    try:
        _text_types = basestring
    except NameError:
        _text_types = str

    def __init__(self,
                 tag=None,
                 value=None,
                 version='1.0',
                 encoding='iso-8859-1'):
        """
        tag is the element name.  If it is None the element becomes the
        document root and its tag is the XML declaration built from
        version and encoding.

        value can be either
          * An XML_element (stored as a one-element list)
          * a list of XML_element
          * a text string

        """

        if isinstance(value, XML_element):
            value = [value]

        self.value = value

        if tag is None:
            tag = '?xml version="%s" encoding="%s"?' %(version, encoding)
            self.root_element = True
        else:
            self.root_element = False

        self.tag = tag

        # NOTE: Representing the children as a proper dictionary
        # {tag: value, ...} was tried, but it removes any notion of
        # ordering, hence the list.

    def _children(self):
        """Return the child elements; empty for text or None values.
        """

        if self.value is None or isinstance(self.value, self._text_types):
            return []
        return self.value

    def __add__(self, other):
        """Return the string form of self followed by that of other.
        """
        return str(self) + str(other)

    def __radd__(self, other):
        """Return the string form of other followed by that of self.
        """
        return str(other) + str(self)    #Python swaps self and other

    def __repr__(self):
        return str(self)

    def __str__(self, indent=0):
        """String representation of XML element

        indent is the number of spaces put in front of the opening tag.
        Text values are written on the same line as the tags with excess
        whitespace collapsed.  Child elements are written one per line,
        indented by four more spaces, except below the document root which
        adds no indentation and has no closing tag.
        """

        tab = ' '*indent
        parts = [tab, '<%s>' %self.tag]

        if isinstance(self.value, self._text_types):
            # Same whitespace collapsing as remove_whitespace(): split on
            # any whitespace run and rejoin with single spaces.
            parts.append(' '.join(self.value.split()))
        else:
            if self.root_element:
                child_indent = indent
            else:
                child_indent = indent + 4

            parts.append('\n')
            for e in self._children():
                parts.append(e.__str__(child_indent))
            parts.append(tab)

        if not self.root_element:
            parts.append('</%s>\n' %self.tag)

        return ''.join(parts)

    def __getitem__(self, key):
        """Return sub-tree starting at element with tag equal to specified key
        If node is terminal, its text value will be returned instead of itself.
        This will allow statements such as

        assert xmlobject['datafile']['accountable'] == 'Jane Sexton'

        If more than one element matches the given key a list of all
        matches will be returned.  If no element matches (including when
        this element itself holds text and has no children) None is
        returned.
        """

        matches = []
        for node in self._children():
            if node.tag == key:
                if isinstance(node.value, self._text_types):
                    matches.append(str(node.value))
                else:
                    matches.append(node)

        if not matches:
            return None
        if len(matches) == 1:
            return matches[0]
        return matches

    def has_key(self, key):
        """Return True if at least one child element has tag equal to key.
        """

        for node in self._children():
            if node.tag == key:
                return True

        return False

    def __contains__(self, key):
        """Support 'key in element' with the same meaning as has_key.

        Without this the inherited dict test would consult the (always
        empty) dict storage and report False for every key.
        """

        return self.has_key(key)

    def keys(self):
        """Return the tags of all child elements in document order.

        Tags occurring more than once are repeated.
        """

        return [str(node.tag) for node in self._children()]

    def pretty_print(self, indent=0):
        """Return the document as text without tags using indentation

        Each element is written as 'tag: ' followed by its text value, or
        by its children on the following lines indented by four more
        spaces.  Nothing is printed; the formatted string is returned.
        """

        parts = [' '*indent, '%s: ' %self.tag]

        if isinstance(self.value, self._text_types):
            parts.append(self.value)
        else:
            parts.append('\n')
            for e in self._children():
                parts.append(e.pretty_print(indent+4))
        parts.append('\n')

        return ''.join(parts)
    
    
def xml2object(xml, verbose=False):
    """Generate XML object model from XML file or open XML file object

    This is the inverse operation to the __str__ representation
    (up to whitespace).

    Input xml can be either an
    * xml file name
    * open xml file object

    verbose is currently unused.

    Return the XML_element instance produced by dom2object.

    Raises Exception with a message naming the input if the XML cannot be
    parsed or cannot be converted into the object model.  A file opened
    here from a file name is always closed again; a file object supplied
    by the caller is left open.
    """

    # FIXME - can we allow xml to be string?
    # This would depend on minidom's parse function

    # basestring (str and unicode) only exists on Python 2
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Input tests
    opened_here = isinstance(xml, string_types)
    if opened_here:
        # Binary mode hands the raw bytes to the parser, which decodes
        # them according to the XML declaration.
        fid = open(xml, 'rb')
        name = xml
    else:
        fid = xml
        # File-like objects need not have a name; fall back to their repr
        # so that a missing attribute cannot mask the real error below.
        name = getattr(fid, 'name', repr(fid))

    try:
        try:
            dom = parse(fid)
        except Exception as e:
            # Throw filename into dom exception
            msg = 'XML file "%s" could not be parsed.\n' %name
            msg += 'Error message from parser: "%s"' %str(e)
            raise Exception(msg)

        try:
            xml_object = dom2object(dom)
        except Exception as e:
            msg = 'Could not convert %s into XML object.\n' %name
            msg += str(e)
            raise Exception(msg)
    finally:
        # Only close what was opened in this function
        if opened_here:
            fid.close()

    return xml_object


def dom2object(node):
    """Convert DOM representation to XML_object hierarchy.

    node: DOM node (normally the document returned by parse) to convert

    Returns an XML_element whose value is
      * the stripped text, if node has a non-empty text (or CDATA) child,
      * a list of converted child elements, if node has element children,
      * the empty string, if node has neither.
    The document node itself is converted with tag None, which makes the
    resulting XML_element the document root.

    Comments, processing instructions and the DOCTYPE declaration carry
    no element content and are skipped.

    Raises Exception if a non-empty text node is followed by an element,
    since mixed content cannot be represented in the object model.
    """

    # Child node types that are not part of the element content
    ignored_types = (Node.COMMENT_NODE,
                     Node.PROCESSING_INSTRUCTION_NODE,
                     Node.DOCUMENT_TYPE_NODE)

    # Child node types that hold character data
    text_types = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)

    value = []
    textnode_encountered = None
    for n in node.childNodes:

        if n.nodeType in ignored_types:
            # Previously these were turned into bogus empty elements such
            # as '#comment', and wrongly triggered the mixed content error
            # when they followed text.
            continue

        if n.nodeType in text_types:
            # Child is a text element - omit the dom tag #text and
            # go straight to the text value.

            # Note - only the last text value will be recorded

            msg = 'Text element has child nodes - this shouldn\'t happen'
            assert len(n.childNodes) == 0, msg

            x = n.nodeValue.strip()
            if len(x) == 0:
                # Skip empty text children
                continue

            # NOTE(review): value is rebound to the text here, so any
            # child elements collected before this text node are silently
            # dropped.  Only text followed by an element is reported as an
            # error below - confirm whether this asymmetry is intended.
            textnode_encountered = value = x
        else:
            # XML element

            if textnode_encountered is not None:
                msg = 'A text node was followed by a non-text tag. This is not allowed.\n'
                msg += 'Offending text node: "%s" ' %str(textnode_encountered)
                msg += 'was followed by node named: "<%s>"' %str(n.nodeName)
                raise Exception(msg)

            value.append(dom2object(n))

    # Deal with empty elements
    if len(value) == 0:
        value = ''

    if node.nodeType == Node.DOCUMENT_NODE:
        # Root node (document)
        tag = None
    else:
        # Normal XML node
        tag = node.nodeName

    return XML_element(tag=tag,
                       value=value)


    #=================== Useful print statement
    #if n.nodeType == 3 and str(n.nodeValue).strip() == '':
    #    pass
    #else:
    #    print 'Node name: "%s",' %n.nodeName,\
    #          'Node type: "%s",' %n.nodeType,\
    #          'Node value: "%s",' %str(n.nodeValue).strip(),\
    #          'Node children: %d' %len(n.childNodes)