## simplexml.py based on Matthew Allum's xmlstream.py
3 ## Copyright (C) 2003-2005 Alexey "Snake" Nezhdanov
5 ## This program is free software; you can redistribute it and/or modify
6 ## it under the terms of the GNU General Public License as published by
7 ## the Free Software Foundation; either version 2, or (at your option)
10 ## This program is distributed in the hope that it will be useful,
11 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ## GNU General Public License for more details.
15 # $Id: simplexml.py,v 1.34 2009/03/03 10:24:02 normanr Exp $
17 """Simplexml module provides xmpppy library with all needed tools to handle XML nodes and XML streams.
18 I'm personally using it in many other separate projects. It is designed to be as standalone as possible."""
20 import xml.parsers.expat
def XMLescape(txt):
    """Return "txt" with the symbols & < > " replaced by their respective XML entities.

    FORM FEED (0x0C) and ESC (0x1B) are removed altogether because they are
    not valid XML characters."""
    # "&" has to be handled first: otherwise the ampersands introduced by the
    # other entities would be escaped a second time.
    return (txt.replace("&", "&amp;")
               .replace("<", "&lt;")
               .replace(">", "&gt;")
               .replace('"', "&quot;")
               .replace(u'\x0C', "")
               .replace(u'\x1B', ""))
def ustr(what):
    """Convert object "what" to a unicode string.

    A unicode object is returned untouched. Otherwise the object's own
    __str__ method is used if accessible, with str() as the fallback, and a
    non-unicode result is decoded using the module-level ENCODING."""
    if isinstance(what, unicode):
        return what
    try:
        r = what.__str__()
    except AttributeError:
        # no usable __str__ on the object itself: let str() do the conversion
        r = str(what)
    if not isinstance(r, unicode):
        return unicode(r, ENCODING)
    return r
""" Node class describes the syntax of a separate XML node. It has a constructor that permits node creation
from a set of "namespace name", attributes and a payload of text strings and other nodes.
It does not natively support building a node from a text string and uses the NodeBuilder class for that purpose.
After creation the node can be mangled in many ways so it can be completely changed.
The node can also be serialised into a string in one of two modes: default (where the textual representation
of the node describes it exactly) and "fancy" - with whitespace added to make indentation and thus make
the result more readable by a human.

The Node class has an attribute FORCE_NODE_RECREATION that defaults to False, thus enabling fast node
replication from some other node. The drawback of the fast way is that the new node shares some
info with the "original" node, so changing one node may influence the other. Though full recreation is
rarely needed (in xmpppy it is never needed at all, since I usually never use the original node after
replication and use replication only to move upwards on the classes tree).
"""
# Consulted by __init__ when it is handed an existing Node instance; per the class
# docstring a false value (the default) selects the fast replication path.
51 FORCE_NODE_RECREATION=0
# Node constructor: optionally starts from "node" (text or another Node), then applies
# parent / namespace map / attributes / tag / payload on top of that starting state.
# NOTE(review): several branch lines of this method are not visible in this view, so the
# comments below describe only the statements that are shown.
# NOTE(review): attrs={} and payload=[] are mutable defaults; the visible statements only
# read them, but confirm nothing in the missing lines mutates them.
52 def __init__(self, tag=None, attrs={}, payload=[], parent=None, nsp=None, node_built=False, node=None):
53 """ Takes "tag" argument as the name of node (prepended by namespace, if needed and separated from it
54 by a space), attrs dictionary as the set of arguments, payload list as the set of textual strings
55 and child nodes that this node carries within itself and "parent" argument that is another node
56 that this one will be the child of. Also the __init__ can be provided with "node" argument that is
57 either a text string containing exactly one node or another Node instance to begin with. If both
58 "node" and other arguments is provided then the node initially created as replica of "node"
59 provided and then modified to be compliant with other arguments."""
# an existing Node was supplied while forced recreation is switched on
# (NOTE(review): the statement executed in this case is not visible here)
61 if self.FORCE_NODE_RECREATION and isinstance(node, Node):
# anything that is not a Node is handed to NodeBuilder together with this instance
63 if not isinstance(node, Node):
64 node=NodeBuilder(node,self)
# replicate an existing Node: name/namespace/parent are taken over as they are, while
# attrs, data, kids and nsd get fresh containers filled with the same elements
67 self.name,self.namespace,self.attrs,self.data,self.kids,self.parent,self.nsd = node.name,node.namespace,{},[],[],node.parent,{}
68 for key in node.attrs.keys(): self.attrs[key]=node.attrs[key]
69 for data in node.data: self.data.append(data)
70 for kid in node.kids: self.kids.append(kid)
71 for k,v in node.nsd.items(): self.nsd[k] = v
# no source node: start from an empty node named 'tag'
72 else: self.name,self.namespace,self.attrs,self.data,self.kids,self.parent,self.nsd = 'tag','',{},[],[],None,{}
# copy the supplied prefix -> namespace map into this node's lookup cache
77 for k,v in nsp.items(): self.nsp_cache[k] = v
# attributes: "xmlns:<prefix>" declarations are recorded in nsd under <prefix>
78 for attr,val in attrs.items():
81 elif attr.startswith('xmlns:'):
82 self.nsd[attr[6:]] = val
83 self.attrs[attr]=attrs[attr]
# "prefix:name" form: the padding with [''] yields an empty prefix when tag has no colon;
# the prefix is then resolved through lookup_nsp
86 pfx,self.name = (['']+tag.split(':'))[-2:]
87 self.namespace = self.lookup_nsp(pfx)
# "namespace name" form: split on whitespace into exactly two parts
90 self.namespace,self.name = tag.split()
# a bare string payload is treated as a one-element list
93 if isinstance(payload, basestring): payload=[payload]
# Node items become children, everything else is stored as text via ustr()
95 if isinstance(i, Node): self.addChild(node=i)
96 else: self.data.append(ustr(i))
def lookup_nsp(self, pfx=''):
    """ Resolve namespace prefix "pfx" to a namespace URI.

    Lookup order: declarations made on this node (nsd), then this node's
    cache (nsp_cache), then the parent chain. A result obtained from the
    parent is remembered in nsp_cache. When the prefix is unknown and there
    is no parent to ask, a fixed "undeclared" placeholder URI is returned."""
    ns = self.nsd.get(pfx, None)
    if ns is None:
        ns = self.nsp_cache.get(pfx, None)
    if ns is None:
        if self.parent:
            ns = self.parent.lookup_nsp(pfx)
            # remember the answer so the parent chain is walked only once per prefix
            self.nsp_cache[pfx] = ns
        else:
            return 'http://www.gajim.org/xmlns/undeclared'
    return ns
# Serialises the node to XML text. "fancy" doubles as the nesting level: 0 means plain
# output, a positive value means indented output at that depth.
# NOTE(review): several statements of this method (loop header, counters, the return)
# are not visible in this view; comments cover only the visible lines.
110 def __str__(self,fancy=0):
111 """ Method used to dump node into textual representation.
112 if "fancy" argument is set to True produces indented output for readability."""
# opening tag, indented by two spaces per level (a non-positive multiplier gives '')
113 s = (fancy-1) * 2 * ' ' + "<" + self.name
# emit an xmlns attribute only when there is no parent or the parent's namespace differs,
# and only if attrs does not already carry an explicit 'xmlns'
115 if not self.parent or self.parent.namespace!=self.namespace:
116 if 'xmlns' not in self.attrs:
117 s = s + ' xmlns="%s"'%self.namespace
# attributes, with values converted by ustr() and escaped
118 for key in self.attrs.keys():
119 val = ustr(self.attrs[key])
120 s = s + ' %s="%s"' % ( key, XMLescape(val) )
124 if fancy: s = s + "\n"
# text chunk at index cnt: emitted as is in plain mode, stripped in fancy mode
126 if not fancy and (len(self.data)-1)>=cnt: s=s+XMLescape(self.data[cnt])
127 elif (len(self.data)-1)>=cnt: s=s+XMLescape(self.data[cnt].strip())
# child nodes are serialised recursively; "fancy and fancy+1" passes 0 in plain mode
# and the next nesting level otherwise
128 if isinstance(a, Node):
129 s = s + a.__str__(fancy and fancy+1)
# text chunk at index cnt again, same plain/fancy handling as above
133 if not fancy and (len(self.data)-1) >= cnt: s = s + XMLescape(self.data[cnt])
134 elif (len(self.data)-1) >= cnt: s = s + XMLescape(self.data[cnt].strip())
# no children and nothing written after the opening tag
# (NOTE(review): the statement taken in this case is not visible here)
135 if not self.kids and s.endswith('>'):
137 if fancy: s = s + "\n"
# closing tag; in fancy mode it is indented only when the node holds no text
139 if fancy and not self.data: s = s + (fancy-1) * 2 * ' '
140 s = s + "</" + self.name + ">"
141 if fancy: s = s + "\n"
# Fragment of the text-only serialiser that child nodes are asked for via a.getCDATA().
# The visible statements append a child's own getCDATA() output (skipping falsy entries)
# and the text chunk stored at index cnt, unescaped.
# NOTE(review): the def line, the end of the docstring, the loop and the return are not
# visible in this view.
144 """ Serialise node, dropping all tags and leaving CDATA intact.
145 That is effectively kills all formatiing, leaving only text were contained in XML.
152 if a: s = s + a.getCDATA()
154 if (len(self.data)-1) >= cnt: s = s + self.data[cnt]
# Adds a child node, either the supplied "node" or a new Node built from the other arguments.
# NOTE(review): the conditions guarding the raise, the "node" branch and the namespace
# override, as well as any return statement, are not visible in this view.
156 def addChild(self, name=None, attrs={}, payload=[], namespace=None, node=None):
157 """ If "node" argument is provided, adds it as child node. Else creates new node from
158 the other arguments' values and adds it as well."""
# the namespace has to come through the "namespace" argument, not an 'xmlns' attribute
160 raise AttributeError("Use namespace=x instead of attrs={'xmlns':x}")
# no ready-made node: build one that has this node as its parent
164 else: newnode=Node(tag=name, parent=self, attrs=attrs, payload=payload)
166 newnode.setNamespace(namespace)
# every child gets a matching (empty) text slot so kids and data grow together
167 self.kids.append(newnode)
168 self.data.append(u'')
def addData(self, data):
    """ Append a chunk of CDATA to the node.

    The value is converted with ustr() first; a None placeholder is pushed
    onto the kids list together with it. """
    text = ustr(data)
    self.data.append(text)
    self.kids.append(None)
175 """ Removes all CDATA from the node. """
def delAttr(self, key):
    """ Deletes an attribute "key".

    Raises KeyError if the node has no such attribute. """
    del self.attrs[key]
def delChild(self, node, attrs={}):
    """ Deletes the "node" from the node's childs list, if "node" is an instance.
        Else deletes the first node that have specified name and (optionally) attributes.

        Returns the node that was removed. list.index raises ValueError when
        the node is not present in the childs list. """
    if not isinstance(node, Node):
        node = self.getTag(node, attrs)
    # blank the slot rather than removing it: addChild/addData grow "kids" and "data"
    # together, so the remaining entries must keep their positions
    self.kids[self.kids.index(node)] = None
    return node
187 """ Returns all node's attributes as dictionary. """
def getAttr(self, key):
    """ Returns value of specified attribute, or None if the attribute is not set. """
    try:
        return self.attrs[key]
    except KeyError:
        return None
def getChildren(self):
    """ Returns all node's child nodes as list.

    The node's own list object is handed out, not a copy. """
    return self.kids
def getData(self):
    """ Returns all node CDATA as string (concatenated). """
    return ''.join(self.data)
def getName(self):
    """ Returns the name of node """
    return self.name
def getNamespace(self):
    """ Gives back the namespace this node belongs to. """
    namespace = self.namespace
    return namespace
206 """ Returns the parent of node (if present). """
def getPayload(self):
    """ Return the payload of node i.e. list of child nodes and CDATA entries.
        F.e. for "<node>text1<nodea/><nodeb/> text2</node>" will be returned list:
        ['text1', <nodea instance>, <nodeb instance>, ' text2'].

        Text and children are interleaved by position; empty text chunks and
        empty child slots are left out. """
    ret = []
    for i in range(max(len(self.data), len(self.kids))):
        # the two lists may differ in length, hence the bounds checks
        if i < len(self.data) and self.data[i]:
            ret.append(self.data[i])
        if i < len(self.kids) and self.kids[i]:
            ret.append(self.kids[i])
    return ret
def getTag(self, name, attrs={}, namespace=None):
    """ Looks through the child nodes with the given name/attributes/namespace filter
        and hands back the first match, or None when nothing matches.
        Thin wrapper around getTags() in single-result mode. """
    first_match = self.getTags(name, attrs, namespace, one=1)
    return first_match
def getTagAttr(self, tag, attr):
    """ Returns attribute value of the child with specified name (or None if no such attribute)."""
    try:
        return self.getTag(tag).attrs[attr]
    except (AttributeError, KeyError):
        # AttributeError: no such child (getTag gave None); KeyError: child lacks the attribute
        return None
def getTagData(self, tag):
    """ Returns concatenated CDATA of the child with specified name (or None if there is no such child)."""
    try:
        return self.getTag(tag).getData()
    except AttributeError:
        # getTag gave None: no child with that name
        return None
def getTags(self, name, attrs={}, namespace=None, one=0):
    """ Filters all child nodes using specified arguments as filter.
        Returns the list of nodes found.

        A child matches when its name equals "name", every key/value pair of
        "attrs" is present on it and, if "namespace" is given (non-empty), its
        namespace equals it. With a true "one" the first match is returned
        instead of a list, or None when nothing matches. """
    nodes = []
    for node in self.kids:
        if not node:
            continue    # emptied child slot
        if namespace and namespace != node.getNamespace():
            continue
        if node.getName() == name:
            for key in attrs.keys():
                if key not in node.attrs or node.attrs[key] != attrs[key]:
                    break
            else:
                # loop ran to completion: every requested attribute matched
                nodes.append(node)
        if one and nodes:
            return nodes[0]
    if not one:
        return nodes
def iterTags(self, name, attrs={}, namespace=None):
    """ Iterate over all children using specified arguments as filter.

    Generator counterpart of getTags(). Unlike getTags(), the namespace
    filter is applied whenever "namespace" is not None, so an empty string
    is a real filter value here. """
    for node in self.kids:
        if not node:
            continue    # emptied child slot
        if namespace is not None and namespace != node.getNamespace():
            continue
        if node.getName() == name:
            for key in attrs.keys():
                if key not in node.attrs or \
                   node.attrs[key] != attrs[key]:
                    break
            else:
                # loop ran to completion: every requested attribute matched
                yield node
def setAttr(self, key, val):
    """ Sets attribute "key" with the value "val". """
    self.attrs[key] = val
def setData(self, data):
    """ Replaces the node's CDATA with the single string given.
        Whatever CDATA the node held before is discarded! """
    text = ustr(data)
    self.data = [text]
def setName(self, val):
    """ Changes the node name. """
    self.name = val
def setNamespace(self, namespace):
    """ Assigns a new namespace to the node. """
    self.namespace = namespace
def setParent(self, node):
    """ Sets node's parent to "node". WARNING: do not checks if the parent already present
        and not removes the node from the list of childs of previous parent. """
    self.parent = node
def setPayload(self, payload, add=0):
    """ Installs "payload" as the node's list of children. A bare string is wrapped into a
        one-element list first. WARNING: unless "add" is true this completely replaces the
        previous list. To just add a child or CDATA use the addData or addChild methods. """
    if isinstance(payload, basestring):
        payload = [payload]
    if not add:
        self.kids = payload
    else:
        self.kids += payload
def setTag(self, name, attrs={}, namespace=None):
    """ Same as getTag but if the node with specified namespace/attributes not found, creates such
        node and returns it. """
    node = self.getTags(name, attrs, namespace=namespace, one=1)
    if node:
        return node
    else:
        return self.addChild(name, attrs, namespace=namespace)
def setTagAttr(self, tag, attr, val):
    """ Creates new node (if not already present) with name "tag"
        and sets it's attribute "attr" to value "val". """
    node = self.getTag(tag)
    if node is None:
        # no such child yet: create it with the attribute already in place
        self.addChild(tag, attrs={attr: val})
    else:
        node.attrs[attr] = val
def setTagData(self, tag, val, attrs={}):
    """ Creates new node (if not already present) with name "tag" and (optionally) attributes "attrs"
        and sets it's CDATA to string "val". """
    text = ustr(val)
    node = self.getTag(tag, attrs)
    if node is None:
        # no such child yet: create it with the text as its payload
        self.addChild(tag, attrs, payload=[text])
    else:
        node.setData(text)
def has_attr(self, key):
    """ Tells whether the node carries an attribute named "key". """
    present = key in self.attrs
    return present
def __getitem__(self, item):
    """ Subscript read access: node[item] gives the value of attribute "item". """
    value = self.getAttr(item)
    return value
def __setitem__(self, item, val):
    """ Subscript write access: node[item] = val stores attribute "item". """
    result = self.setAttr(item, val)
    return result
def __delitem__(self, item):
    """ Subscript deletion: del node[item] removes attribute "item". """
    result = self.delAttr(item)
    return result
# Fallback attribute hook; per its docstring it exists so the T/NT helper objects are
# only created when they are first asked for.
# NOTE(review): no statements are visible after the docstring in this view. As shown,
# every lookup that reaches this hook would yield None instead of raising AttributeError;
# confirm that the T/NT handling and the final raise are present in the real file.
305 def __getattr__(self,attr):
306 """ Reduce memory usage caused by T/NT classes - use memory only when needed. """
class T:
    """ Auxiliary class used to quick access to node's child nodes. """
    def __init__(self, node):
        # stored through __dict__ directly: a plain "self.node = node" would be
        # routed to __setattr__ below and end up as a child-tag operation
        self.__dict__['node'] = node

    def __getattr__(self, attr):
        """ obj.name -> the wrapped node's first child called "name" (via getTag). """
        return self.node.getTag(attr)

    def __setattr__(self, attr, val):
        """ obj.name = Node  -> re-initialise the child called "name" from that node;
            obj.name = other -> set the child's CDATA (via setTagData). """
        if isinstance(val, Node):
            Node.__init__(self.node.setTag(attr), node=val)
        else:
            return self.node.setTagData(attr, val)

    def __delattr__(self, attr):
        """ del obj.name -> remove the child called "name" (via delChild). """
        return self.node.delChild(attr)


class NT(T):
    """ Auxiliary class used to quick create node's child nodes. """
    def __getattr__(self, attr):
        """ obj.name -> add a new child called "name" and return addChild's result. """
        return self.node.addChild(attr)

    def __setattr__(self, attr, val):
        """ obj.name = Node  -> add that node as a child;
            obj.name = other -> add a new child called "name" with the value as payload. """
        if isinstance(val, Node):
            self.node.addChild(attr, node=val)
        else:
            return self.node.addChild(attr, payload=[val])
# Value passed as the "level" argument of every NodeBuilder.DEBUG() call in this module.
331 DBG_NODEBUILDER = 'nodebuilder'
""" Builds a Node class minidom from data parsed to it. This class is used for two purposes:
1. Creation of an XML Node from a textual representation. F.e. reading a config file. See the XML2Node method.
2. Handling an incoming XML stream. This is done by mangling
the _dispatch_depth parameter and redefining the dispatch method.
You do not need to use this class directly if you are not designing your own XML handler."""
# NodeBuilder constructor: creates the expat parser, wires this instance's callbacks into
# it and resets the builder state.
# NOTE(review): the end of the docstring, some state initialisers and the condition in
# front of the final Parse() call are not visible in this view.
338 def __init__(self,data=None,initial_node=None):
339 """ Takes two optional parameters: "data" and "initial_node".
340 By default class initialised with empty Node class instance.
341 Though, if "initial_node" is provided it used as "starting point".
342 You can think about it as of "node upgrade".
343 "data" (if provided) feeded to parser immidiatedly after instance init.
345 self.DEBUG(DBG_NODEBUILDER, "Preparing to handle incoming XML stream.", 'start')
# parser callbacks are bound methods of this instance
346 self._parser = xml.parsers.expat.ParserCreate()
347 self._parser.StartElementHandler = self.starttag
348 self._parser.EndElementHandler = self.endtag
349 self._parser.CharacterDataHandler = self.handle_cdata
350 self._parser.StartNamespaceDeclHandler = self.handle_namespace_start
351 self._parser.buffer_text = True
# expose the parser's Parse method so callers can feed data through the builder
352 self.Parse = self._parser.Parse
355 self.__last_depth = 0
# depth at which starttag() starts a new node and endtag() hands it to dispatch()
357 self._dispatch_depth = 1
358 self._document_attrs = None
359 self._document_nsp = None
# node to build into; starttag() creates a fresh Node when this is empty
360 self._mini_dom=initial_node
361 self.last_is_data = 1
# character data collected by handle_cdata() until check_data_buffer() flushes it
363 self.data_buffer = None
364 self.streamError = ''
# second argument 1: the data is passed as the final chunk of the document
366 self._parser.Parse(data,1)
def check_data_buffer(self):
    """ Flush buffered character data into the node currently being built.

    The collected pieces are joined into one string and appended to the
    current node's data list; the buffer is then emptied and reset to None.
    Does nothing when there is no buffered text. """
    if self.data_buffer:
        self._ptr.data.append(''.join(self.data_buffer))
        del self.data_buffer[:]
        self.data_buffer = None
# Teardown fragment: flushes any pending text, then clears the four expat handlers that
# __init__ pointed at this instance's bound methods, so the parser no longer refers back
# to the builder.
# NOTE(review): the def line of this method is not visible in this view.
375 """ Method used to allow class instance to be garbage-collected. """
376 self.check_data_buffer()
377 self._parser.StartElementHandler = None
378 self._parser.EndElementHandler = None
379 self._parser.CharacterDataHandler = None
380 self._parser.StartNamespaceDeclHandler = None
# expat start-element callback: creates the Node for the element that just opened and,
# for the document root, records the stream's namespace declarations and attributes.
# NOTE(review): the depth bookkeeping call and several branch keywords (else/if/try) are
# not visible in this view; comments cover only the visible lines.
382 def starttag(self, tag, attrs):
383 """XML Parser callback. Used internally"""
# text collected so far belongs to the previous position in the tree
384 self.check_data_buffer()
386 self.DEBUG(DBG_NODEBUILDER, "DEPTH -> %i , tag -> %s, attrs -> %s" % (self.__depth, tag, `attrs`), 'down')
# element at dispatch depth: it becomes the node under construction
387 if self.__depth == self._dispatch_depth:
388 if not self._mini_dom :
389 self._mini_dom = Node(tag=tag, attrs=attrs, nsp = self._document_nsp, node_built=True)
# re-initialise an already present _mini_dom in place
391 Node.__init__(self._mini_dom,tag=tag, attrs=attrs, nsp = self._document_nsp, node_built=True)
392 self._ptr = self._mini_dom
# deeper element: append a child to the current node and descend into it
393 elif self.__depth > self._dispatch_depth:
394 self._ptr.kids.append(Node(tag=tag,parent=self._ptr,attrs=attrs, node_built=True))
395 self._ptr = self._ptr.kids[-1]
# document root: split its attributes into namespace declarations and plain attributes
396 if self.__depth == 1:
397 self._document_attrs = {}
398 self._document_nsp = {}
# "prefix:name" -> (prefix, name); the prefix is '' when tag has no colon
399 nsp, name = (['']+tag.split(':'))[-2:]
400 for attr,val in attrs.items():
402 self._document_nsp[u''] = val
403 elif attr.startswith('xmlns:'):
404 self._document_nsp[attr[6:]] = val
406 self._document_attrs[attr] = val
# namespace of the root element, with a fixed placeholder for an undeclared prefix
407 ns = self._document_nsp.get(nsp, 'http://www.gajim.org/xmlns/undeclared-root')
409 self.stream_header_received(ns, name, attrs)
# a ValueError from the header hook resets the document attributes and is re-raised
# as a new ValueError carrying the same message
410 except ValueError, e:
411 self._document_attrs = None
412 raise ValueError(str(e))
# no text preceded this element: give the parent an empty text entry
# (presumably to keep the parent's data list in step with its kids list)
413 if not self.last_is_data and self._ptr.parent:
414 self._ptr.parent.data.append('')
415 self.last_is_data = 0
# expat end-element callback: finishes the element that just closed; a node closing at
# dispatch depth is handed to dispatch().
# NOTE(review): the branch keyword in front of the "Got higher than dispatch level"
# message and the depth bookkeeping call are not visible in this view.
417 def endtag(self, tag ):
418 """XML Parser callback. Used internally"""
419 self.DEBUG(DBG_NODEBUILDER, "DEPTH -> %i , tag -> %s" % (self.__depth, tag), 'up')
# attach pending text to the element being closed
420 self.check_data_buffer()
421 if self.__depth == self._dispatch_depth:
# for a node named 'error' the name of its first child is kept as streamError
422 if self._mini_dom.getName() == 'error':
423 self.streamError = self._mini_dom.getChildren()[0].getName()
424 self.dispatch(self._mini_dom)
# nested element closed: step back up to its parent
425 elif self.__depth > self._dispatch_depth:
426 self._ptr = self._ptr.parent
428 self.DEBUG(DBG_NODEBUILDER, "Got higher than dispatch level. Stream terminated?", 'stop')
430 self.last_is_data = 0
# depth 0 means the document element itself has been closed
431 if self.__depth == 0: self.stream_footer_received()
# expat character-data callback: collects text pieces in data_buffer until
# check_data_buffer() flushes them into the current node.
# NOTE(review): the conditions separating the "append to existing buffer" case from the
# "start a new buffer" case are not visible in this view.
433 def handle_cdata(self, data):
434 """XML Parser callback. Used internally"""
435 self.DEBUG(DBG_NODEBUILDER, data, 'data')
436 if self.last_is_data:
# continue the buffer that is already being filled
438 self.data_buffer.append(data)
# start a fresh buffer and remember that the last event was character data
440 self.data_buffer = [data]
441 self.last_is_data = 1
def handle_namespace_start(self, prefix, uri):
    """XML Parser callback. Used internally.
    Only flushes the buffered character data; "prefix" and "uri" are not used."""
    flush = self.check_data_buffer
    flush()
def DEBUG(self, level, text, comment=None):
    """ Receives every NodeBuilder walking event. Does nothing by itself;
        redefine it to get debugging output. """
    return None
def getDom(self):
    """ Returns just built Node. """
    # make sure text that is still buffered ends up in the node before handing it out
    self.check_data_buffer()
    return self._mini_dom
def dispatch(self, stanza):
    """ Called with the finished node each time the NodeBuilder, on its way up, gets back to
        the dispatch depth. Does nothing by itself; redefine it to turn incoming XML stanzas
        into program events. """
    return None
def stream_header_received(self, ns, tag, attrs):
    """ Hook invoked once the stream has just been opened.
        This implementation only flushes the buffered character data. """
    flush = self.check_data_buffer
    flush()
def stream_footer_received(self):
    """ Hook invoked once the stream has just been closed.
        This implementation only flushes the buffered character data. """
    flush = self.check_data_buffer
    flush()
def has_received_endtag(self, level=0):
    """ Tell whether at least one end tag was seen (at level): the parser must be at or
        above "level" right now and must have been deeper than "level" before. """
    if self.__depth > level:
        return False
    return self.__max_depth > level
def _inc_depth(self):
    """ Go one level deeper: remember the previous depth, increase the current one and
        keep track of the deepest level reached so far. """
    self.__last_depth = self.__depth
    self.__depth += 1
    self.__max_depth = max(self.__depth, self.__max_depth)
def _dec_depth(self):
    """ Go one level up: remember the previous depth and decrease the current one. """
    self.__last_depth = self.__depth
    self.__depth -= 1
def XML2Node(xml):
    """ Converts supplied textual string into XML node. Handy f.e. for reading configuration file.
        Raises xml.parsers.expat.ExpatError if provided string is not well-formed XML. """
    return NodeBuilder(xml).getDom()
def BadXML2Node(xml):
    """ Converts supplied textual string into XML node. Survives if xml data is cutted half way round.
        I.e. "<html>some text <br>some more text". Will raise xml.parsers.expat.ExpatError on misplaced
        tags though. F.e. "<b>some text <br>some more text</b>" will not work."""
    builder = NodeBuilder()
    if xml:
        # Feed the text as a non-final chunk: passing it to the NodeBuilder constructor
        # would parse it as a complete document, and expat rejects a document whose
        # elements are still open at that point.
        # NOTE(review): text after the last complete tag may not be delivered by the
        # parser in non-final mode - confirm against the expat version in use.
        builder.Parse(xml, 0)
    return builder.getDom()