Coverage for grm\lib\xmltodict.py: 11%
241 statements
« prev ^ index » next coverage.py v7.2.3, created at 2023-04-12 14:21 +0900
« prev ^ index » next coverage.py v7.2.3, created at 2023-04-12 14:21 +0900
1#!/usr/bin/env python
2"Makes working with XML feel like you are working with JSON"
4try:
5 from defusedexpat import pyexpat as expat
6except ImportError:
7 from xml.parsers import expat
9from xml.sax.saxutils import XMLGenerator
10from xml.sax.xmlreader import AttributesImpl
12try: # pragma no cover
13 from cStringIO import StringIO
14except ImportError: # pragma no cover
15 try:
16 from StringIO import StringIO
17 except ImportError:
18 from io import StringIO
20from collections import OrderedDict
# Package metadata.
__author__ = "Martin Blech"
__version__ = "0.12.0"
__license__ = "MIT"

# Text-type aliases: use the Python 2 builtins when they exist, otherwise
# fall back to ``str`` (the names raise NameError on Python 3).
try:  # pragma no cover
    _unicode = unicode
except NameError:  # pragma no cover
    _unicode = str
try:  # pragma no cover
    _basestring = basestring
except NameError:  # pragma no cover
    _basestring = str
class ParsingInterrupted(Exception):
    """Raised to abort parsing when the streaming item callback returns a false value."""
class _DictSAXHandler(object):
    """Assemble a nested mapping from expat-style parser events.

    The parser calls ``startNamespaceDecl``, ``startElement``, ``characters``
    and ``endElement``; once the root element is closed the finished mapping
    is available as ``self.item``.

    State kept while parsing:

    * ``path``  -- list of ``(name, attrs or None)`` for every open element.
    * ``stack`` -- saved ``(item, data)`` pairs of the enclosing elements.
    * ``data``  -- character chunks collected for the current element.
    * ``item``  -- mapping being built for the current element, or ``None``.

    Constructor options:

    * ``item_depth`` / ``item_callback`` -- when an element at depth
      ``item_depth`` is closed, ``item_callback(path, item)`` is called; a
      false return value raises ``ParsingInterrupted``.
    * ``xml_attribs`` -- include attributes (keys get ``attr_prefix``).
    * ``cdata_key`` / ``force_cdata`` / ``cdata_separator`` -- key used for
      text stored next to attributes/children, whether text-only elements
      also become mappings, and the string used to join text chunks.
    * ``postprocessor`` -- ``postprocessor(path, key, value)`` returning a
      ``(key, value)`` pair, or a false value / ``None`` to drop the entry.
    * ``dict_constructor`` -- factory for every mapping that is created.
    * ``strip_whitespace`` -- strip element text; empty text becomes ``None``.
    * ``namespace_separator`` / ``namespaces`` -- separator between namespace
      and local name, and an optional namespace -> short-name mapping.
    * ``force_list`` -- bool, container of keys, or callable deciding whether
      a first occurrence of a key is stored as a one-element list.
    """

    def __init__(
        self,
        item_depth=0,
        item_callback=lambda *args: True,
        xml_attribs=True,
        attr_prefix="@",
        cdata_key="#text",
        force_cdata=False,
        cdata_separator="",
        postprocessor=None,
        dict_constructor=OrderedDict,
        strip_whitespace=True,
        namespace_separator=":",
        namespaces=None,
        force_list=None,
    ):
        self.path = []
        self.stack = []
        self.data = []
        self.item = None
        self.item_depth = item_depth
        self.xml_attribs = xml_attribs
        self.item_callback = item_callback
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces
        # Declarations reported by the parser that have not yet been attached
        # to the element that declares them.
        self.namespace_declarations = OrderedDict()
        self.force_list = force_list

    def _build_name(self, full_name):
        """Return ``full_name`` with its namespace mapped through ``self.namespaces``.

        Names are returned unchanged when no mapping is configured or when the
        name contains no separator.  A namespace that maps to a false value is
        dropped, leaving only the local name.
        """
        if not self.namespaces:
            return full_name
        separator = self.namespace_separator
        i = full_name.rfind(separator)
        if i == -1:
            return full_name
        # Skip the whole separator, not just one character.
        namespace, name = full_name[:i], full_name[i + len(separator):]
        short_namespace = self.namespaces.get(namespace, namespace)
        if not short_namespace:
            return name
        return separator.join((short_namespace, name))

    def _attrs_to_dict(self, attrs):
        """Return ``attrs`` as a mapping.

        A dict is passed through as-is; anything else is treated as a flat
        ``[name, value, name, value, ...]`` sequence.
        """
        if isinstance(attrs, dict):
            return attrs
        return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))

    def startNamespaceDecl(self, prefix, uri):
        """Record a namespace declaration; a missing prefix is stored as ``""``."""
        self.namespace_declarations[prefix or ""] = uri

    def startElement(self, full_name, attrs):
        """Open an element: extend ``path`` and start collecting its content."""
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        if self.namespace_declarations:
            # Always attach pending declarations to the element that declares
            # them.  Waiting for an element that happens to have ordinary
            # attributes would hand them to a later, unrelated element.
            attrs["xmlns"] = self.namespace_declarations
            self.namespace_declarations = OrderedDict()
        self.path.append((name, attrs or None))
        if len(self.path) > self.item_depth:
            self.stack.append((self.item, self.data))
            if self.xml_attribs:
                attr_entries = []
                for key, value in attrs.items():
                    key = self.attr_prefix + self._build_name(key)
                    if self.postprocessor:
                        entry = self.postprocessor(self.path, key, value)
                    else:
                        entry = (key, value)
                    # A false result from the postprocessor drops the attribute.
                    if entry:
                        attr_entries.append(entry)
                attrs = self.dict_constructor(attr_entries)
            else:
                attrs = None
            self.item = attrs or None
            self.data = []

    def endElement(self, full_name):
        """Close an element: fire the streaming callback and merge into the parent.

        Raises ``ParsingInterrupted`` when the element sits at ``item_depth``
        and ``item_callback`` returns a false value.
        """
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            item = self.item
            if item is None:
                item = None if not self.data else self.cdata_separator.join(self.data)

            should_continue = self.item_callback(self.path, item)
            if not should_continue:
                raise ParsingInterrupted()
        if self.stack:
            data = None if not self.data else self.cdata_separator.join(self.data)
            item = self.item
            # Restore the parent's state before merging this element into it.
            self.item, self.data = self.stack.pop()
            if self.strip_whitespace and data:
                data = data.strip() or None
            if data and self.force_cdata and item is None:
                item = self.dict_constructor()
            if item is not None:
                if data:
                    self.push_data(item, self.cdata_key, data)
                self.item = self.push_data(self.item, name, item)
            else:
                self.item = self.push_data(self.item, name, data)
        else:
            self.item = None
            self.data = []
        self.path.pop()

    def characters(self, data):
        """Collect one chunk of character data for the current element."""
        self.data.append(data)

    def push_data(self, item, key, data):
        """Store ``data`` under ``key`` in ``item`` and return the mapping.

        The postprocessor, when set, may rewrite the pair or return ``None``
        to skip it.  ``item`` is created on demand when it is ``None``.  A
        repeated key turns the stored value into a list.
        """
        if self.postprocessor is not None:
            result = self.postprocessor(self.path, key, data)
            if result is None:
                return item
            key, data = result
        if item is None:
            item = self.dict_constructor()
        try:
            value = item[key]
            if isinstance(value, list):
                value.append(data)
            else:
                item[key] = [value, data]
        except KeyError:
            if self._should_force_list(key, data):
                item[key] = [data]
            else:
                item[key] = data
        return item

    def _should_force_list(self, key, value):
        """Return whether a first value for ``key`` must be wrapped in a list."""
        if not self.force_list:
            return False
        if isinstance(self.force_list, bool):
            return self.force_list
        try:
            return key in self.force_list
        except TypeError:
            # Not a container: treat ``force_list`` as a callable.
            return self.force_list(self.path[:-1], key, value)
def parse(
    xml_input,
    encoding=None,
    expat=expat,
    process_namespaces=False,
    namespace_separator=":",
    disable_entities=True,
    **kwargs
):
    """Parse XML and return it as a dictionary.

    `xml_input` is either a string or a file-like object (anything with a
    `read` attribute). Text input is encoded with `encoding` (UTF-8 when no
    encoding is given) before it is handed to the parser.

    Remaining keyword arguments are forwarded to the internal handler.

    With `xml_attribs=True` (the default) element attributes are stored next
    to child elements, prefixed with `@` to avoid collisions; with
    `xml_attribs=False` they are ignored.

    Simple example::

        >>> import xmltodict
        >>> doc = xmltodict.parse('''
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>
        ... ''')
        >>> doc['a']['@prop']
        'x'
        >>> doc['a']['b']
        ['1', '2']

    When `item_depth` is `0` (the default) the dictionary for the root
    element is returned. Otherwise `item_callback` is called for every item
    found at that depth and `None` is returned at the end (streaming mode).

    The callback receives the `path` from the document root to the item (a
    list of name/attributes pairs) and the `item` itself. If it returns a
    false value, parsing stops with :class:`ParsingInterrupted`.

    Streaming example::

        >>> def handle(path, item):
        ...     print('path:%s item:%s' % (path, item))
        ...     return True
        ...
        >>> xmltodict.parse('''
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>''', item_depth=2, item_callback=handle)
        path:[('a', {'prop': 'x'}), ('b', None)] item:1
        path:[('a', {'prop': 'x'}), ('b', None)] item:2

    `postprocessor` is an optional function taking `path`, `key` and `value`
    and returning a new `(key, value)` pair, where either part may have been
    changed::

        >>> def postprocessor(path, key, value):
        ...     try:
        ...         return key + ':int', int(value)
        ...     except (ValueError, TypeError):
        ...         return key, value
        >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
        ...                 postprocessor=postprocessor)
        OrderedDict([('a', OrderedDict([('b:int', [1, 2]), ('b', 'x')]))])

    An alternative `expat` implementation (such as `defusedexpat`) can be
    supplied through the `expat` parameter::

        >>> import defusedexpat
        >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
        OrderedDict([('a', 'hello')])

    `force_list` forces a list to be created even when a key occurs only
    once at a given level. It may be `True`, a container of keys, or a
    callable receiving `path`, `key` and `value` for more complex decisions.
    It is applied after any user-supplied postprocessor has run.

    For example, given this input::

        <servers>
          <server>
            <name>host1</name>
            <os>Linux</os>
            <interfaces>
              <interface>
                <name>em0</name>
                <ip_address>10.0.0.1</ip_address>
              </interface>
            </interfaces>
          </server>
        </servers>

    calling with `force_list=('interface',)` produces::

        {'servers':
          {'server':
            {'name': 'host1',
             'os': 'Linux',
             'interfaces':
              {'interface':
                [{'name': 'em0', 'ip_address': '10.0.0.1'}]}}}}
    """
    # The handler always gets the separator; the parser only gets it when
    # namespace processing is requested.
    handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs)
    if isinstance(xml_input, _unicode):
        encoding = encoding or "utf-8"
        xml_input = xml_input.encode(encoding)
    parser = expat.ParserCreate(
        encoding, namespace_separator if process_namespaces else None
    )
    try:
        parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    callbacks = (
        ("StartNamespaceDeclHandler", handler.startNamespaceDecl),
        ("StartElementHandler", handler.startElement),
        ("EndElementHandler", handler.endElement),
        ("CharacterDataHandler", handler.characters),
    )
    for hook_name, callback in callbacks:
        setattr(parser, hook_name, callback)
    parser.buffer_text = True
    if disable_entities:
        try:
            # Attempt to disable DTD in Jython's expat parser (Xerces-J).
            parser._reader.setFeature(
                "http://apache.org/xml/features/disallow-doctype-decl", True
            )
        except AttributeError:
            # For CPython / expat parser.
            # Anything not handled ends up here and entities aren't expanded.
            parser.DefaultHandler = lambda x: None
            # Expects an integer return; zero means failure -> expat.ExpatError.
            parser.ExternalEntityRefHandler = lambda *x: 1
    if hasattr(xml_input, "read"):
        parser.ParseFile(xml_input)
    else:
        parser.Parse(xml_input, True)
    return handler.item
def _process_namespace(name, namespaces, ns_sep=":", attr_prefix="@"):
    """Replace the namespace part of ``name`` with its short form.

    ``name`` is split at the last ``ns_sep``; the part before it (minus a
    leading ``attr_prefix`` for attribute keys) is looked up in
    ``namespaces``.

    * No ``namespaces`` mapping, or no separator in ``name``: ``name`` is
      returned unchanged.
    * Lookup gives a non-empty value: returns
      ``[attr_prefix]<short><ns_sep><local name>``.
    * Lookup gives an empty value or nothing: returns
      ``[attr_prefix]<local name>``.

    The attribute prefix is kept in every case so that callers can still
    recognise (and strip) it afterwards.
    """
    if not namespaces:
        return name
    try:
        ns, local_name = name.rsplit(ns_sep, 1)
    except ValueError:
        # No separator present (or an empty separator): nothing to resolve.
        return name
    is_attr = bool(attr_prefix) and ns.startswith(attr_prefix)
    prefix = attr_prefix if is_attr else ""
    # Remove the prefix as a whole, and only from the front; ``str.strip``
    # would treat it as a set of characters and also eat the tail.
    ns_res = namespaces.get(ns[len(prefix):])
    if ns_res:
        return "{}{}{}{}".format(prefix, ns_res, ns_sep, local_name)
    return prefix + local_name
def _emit(
    key,
    value,
    content_handler,
    attr_prefix="@",
    cdata_key="#text",
    depth=0,
    preprocessor=None,
    pretty=False,
    newl="\n",
    indent="\t",
    namespace_separator=":",
    namespaces=None,
    full_document=True,
):
    """Write the element(s) described by ``key``/``value`` to ``content_handler``.

    ``value`` may be a scalar, a mapping, or an iterable of those; each entry
    of an iterable produces one element named ``key``.  Inside a mapping,
    keys starting with ``attr_prefix`` become attributes, ``cdata_key`` holds
    the element text and every other key is emitted recursively as a child.

    ``preprocessor(key, value)``, when given, may return a replacement
    ``(key, value)`` pair or ``None`` to skip the entry altogether.
    ``pretty``, ``newl`` and ``indent`` control the whitespace written around
    elements.  ``namespaces`` and ``namespace_separator`` are used to shorten
    namespaced keys.

    Raises ``ValueError`` if ``full_document`` is true and more than one
    element would be written at depth 0.
    """
    key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            return
        key, value = result
    # Scalars, strings and mappings describe a single element.
    if not hasattr(value, "__iter__") or isinstance(value, (_basestring, dict)):
        value = [value]
    for index, v in enumerate(value):
        if full_document and depth == 0 and index > 0:
            raise ValueError("document with multiple roots")
        if v is None:
            v = OrderedDict()
        elif isinstance(v, bool):
            v = _unicode("true") if v else _unicode("false")
        elif not isinstance(v, dict):
            v = _unicode(v)
        if isinstance(v, _basestring):
            v = OrderedDict(((cdata_key, v),))
        cdata = None
        attrs = OrderedDict()
        children = []
        for ik, iv in v.items():
            if ik == cdata_key:
                cdata = iv
                continue
            if ik.startswith(attr_prefix):
                ik = _process_namespace(
                    ik, namespaces, namespace_separator, attr_prefix
                )
                # Accept the literal "@xmlns" as before, and also the key a
                # custom attr_prefix would produce.
                if ik in ("@xmlns", attr_prefix + "xmlns") and isinstance(iv, dict):
                    # Distinct loop names: do not shadow the outer ``v``.
                    for ns_prefix, ns_uri in iv.items():
                        attr = "xmlns{}".format(
                            ":{}".format(ns_prefix) if ns_prefix else ""
                        )
                        attrs[attr] = _unicode(ns_uri)
                    continue
                if not isinstance(iv, _unicode):
                    iv = _unicode(iv)
                attrs[ik[len(attr_prefix):]] = iv
                continue
            children.append((ik, iv))
        # XMLGenerator.characters() only accepts text (or bytes) and skips
        # falsy content, so convert other values such as numbers up front.
        if cdata is not None and not isinstance(cdata, (_basestring, bytes)):
            if isinstance(cdata, bool):
                cdata = _unicode("true") if cdata else _unicode("false")
            else:
                cdata = _unicode(cdata)
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            _emit(
                child_key,
                child_value,
                content_handler,
                attr_prefix,
                cdata_key,
                depth + 1,
                preprocessor,
                pretty,
                newl,
                indent,
                namespaces=namespaces,
                namespace_separator=namespace_separator,
            )
        if cdata is not None:
            content_handler.characters(cdata)
        if pretty and children:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)
def unparse(
    input_dict,
    output=None,
    encoding="utf-8",
    full_document=True,
    short_empty_elements=False,
    **kwargs
):
    """Serialise `input_dict` as XML (the reverse of `parse`).

    The document is returned as a string unless `output` (a file-like
    object) is given, in which case it is written there and `None` is
    returned.

    Keys starting with `attr_prefix` (default `'@'`) become XML attributes
    and the key equal to `cdata_key` (default `'#text'`) holds character
    data.

    `pretty` (default `False`) turns on pretty-printing, which terminates
    lines with `'\n'` and indents with `'\t'` unless `newl` / `indent` say
    otherwise.

    With `full_document` set, an XML declaration is written and `input_dict`
    must contain exactly one root, otherwise `ValueError` is raised.
    """
    if full_document and len(input_dict) != 1:
        raise ValueError("Document must have exactly one root.")
    return_text = output is None
    if return_text:
        output = StringIO()
    # Only pass the third positional argument when it was asked for.
    generator_args = (output, encoding)
    if short_empty_elements:
        generator_args += (True,)
    content_handler = XMLGenerator(*generator_args)
    if full_document:
        content_handler.startDocument()
    for key, value in input_dict.items():
        _emit(key, value, content_handler, full_document=full_document, **kwargs)
    if full_document:
        content_handler.endDocument()
    if not return_text:
        return None
    text = output.getvalue()
    try:  # pragma no cover
        return text.decode(encoding)
    except AttributeError:  # pragma no cover
        # Already text: nothing to decode.
        return text
if __name__ == "__main__":  # pragma: no cover
    # Command-line use: read XML from stdin and marshal every item found at
    # the depth given as the single argument to stdout.
    import marshal
    import sys

    # Use the binary streams when both are available, else the plain ones.
    try:
        stdin, stdout = sys.stdin.buffer, sys.stdout.buffer
    except AttributeError:
        stdin, stdout = sys.stdin, sys.stdout

    # Exactly one argument is expected; unpacking fails otherwise.
    (depth_arg,) = sys.argv[1:]
    item_depth = int(depth_arg)

    def handle_item(path, item):
        """Write one ``(path, item)`` pair and ask the parser to continue."""
        marshal.dump((path, item), stdout)
        return True

    try:
        root = parse(
            stdin,
            item_depth=item_depth,
            item_callback=handle_item,
            dict_constructor=dict,
        )
        # Depth 0 means no streaming: emit the whole document once.
        if item_depth == 0:
            handle_item([], root)
    except KeyboardInterrupt:
        pass
521 pass