Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1#!/usr/bin/env python 

2"Makes working with XML feel like you are working with JSON" 

3 

4try: 

5 from defusedexpat import pyexpat as expat 

6except ImportError: 

7 from xml.parsers import expat 

8from xml.sax.saxutils import XMLGenerator 

9from xml.sax.xmlreader import AttributesImpl 

10try: # pragma no cover 

11 from cStringIO import StringIO 

12except ImportError: # pragma no cover 

13 try: 

14 from StringIO import StringIO 

15 except ImportError: 

16 from io import StringIO 

17try: # pragma no cover 

18 from collections import OrderedDict 

19except ImportError: # pragma no cover 

20 try: 

21 from ordereddict import OrderedDict 

22 except ImportError: 

23 OrderedDict = dict 

24 

25try: # pragma no cover 

26 _basestring = basestring 

27except NameError: # pragma no cover 

28 _basestring = str 

29try: # pragma no cover 

30 _unicode = unicode 

31except NameError: # pragma no cover 

32 _unicode = str 

33 

34__author__ = 'Martin Blech' 

35__version__ = '0.11.0' 

36__license__ = 'MIT' 

37 

38 

class ParsingInterrupted(Exception):
    """Raised to abort parsing when the item callback returns a false value."""

41 

42 

43class _DictSAXHandler(object): 

44 def __init__(self, 

45 item_depth=0, 

46 item_callback=lambda *args: True, 

47 xml_attribs=True, 

48 attr_prefix='@', 

49 cdata_key='#text', 

50 force_cdata=False, 

51 cdata_separator='', 

52 postprocessor=None, 

53 dict_constructor=OrderedDict, 

54 strip_whitespace=True, 

55 namespace_separator=':', 

56 namespaces=None, 

57 force_list=None): 

58 self.path = [] 

59 self.stack = [] 

60 self.data = [] 

61 self.item = None 

62 self.item_depth = item_depth 

63 self.xml_attribs = xml_attribs 

64 self.item_callback = item_callback 

65 self.attr_prefix = attr_prefix 

66 self.cdata_key = cdata_key 

67 self.force_cdata = force_cdata 

68 self.cdata_separator = cdata_separator 

69 self.postprocessor = postprocessor 

70 self.dict_constructor = dict_constructor 

71 self.strip_whitespace = strip_whitespace 

72 self.namespace_separator = namespace_separator 

73 self.namespaces = namespaces 

74 self.namespace_declarations = OrderedDict() 

75 self.force_list = force_list 

76 

77 def _build_name(self, full_name): 

78 if not self.namespaces: 

79 return full_name 

80 i = full_name.rfind(self.namespace_separator) 

81 if i == -1: 

82 return full_name 

83 namespace, name = full_name[:i], full_name[i+1:] 

84 short_namespace = self.namespaces.get(namespace, namespace) 

85 if not short_namespace: 

86 return name 

87 else: 

88 return self.namespace_separator.join((short_namespace, name)) 

89 

90 def _attrs_to_dict(self, attrs): 

91 if isinstance(attrs, dict): 

92 return attrs 

93 return self.dict_constructor(zip(attrs[0::2], attrs[1::2])) 

94 

95 def startNamespaceDecl(self, prefix, uri): 

96 self.namespace_declarations[prefix or ''] = uri 

97 

98 def startElement(self, full_name, attrs): 

99 name = self._build_name(full_name) 

100 attrs = self._attrs_to_dict(attrs) 

101 if attrs and self.namespace_declarations: 

102 attrs['xmlns'] = self.namespace_declarations 

103 self.namespace_declarations = OrderedDict() 

104 self.path.append((name, attrs or None)) 

105 if len(self.path) > self.item_depth: 

106 self.stack.append((self.item, self.data)) 

107 if self.xml_attribs: 

108 attr_entries = [] 

109 for key, value in attrs.items(): 

110 key = self.attr_prefix+self._build_name(key) 

111 if self.postprocessor: 

112 entry = self.postprocessor(self.path, key, value) 

113 else: 

114 entry = (key, value) 

115 if entry: 

116 attr_entries.append(entry) 

117 attrs = self.dict_constructor(attr_entries) 

118 else: 

119 attrs = None 

120 self.item = attrs or None 

121 self.data = [] 

122 

123 def endElement(self, full_name): 

124 name = self._build_name(full_name) 

125 if len(self.path) == self.item_depth: 

126 item = self.item 

127 if item is None: 

128 item = (None if not self.data 

129 else self.cdata_separator.join(self.data)) 

130 

131 should_continue = self.item_callback(self.path, item) 

132 if not should_continue: 

133 raise ParsingInterrupted() 

134 if len(self.stack): 

135 data = (None if not self.data 

136 else self.cdata_separator.join(self.data)) 

137 item = self.item 

138 self.item, self.data = self.stack.pop() 

139 if self.strip_whitespace and data: 

140 data = data.strip() or None 

141 if data and self.force_cdata and item is None: 

142 item = self.dict_constructor() 

143 if item is not None: 

144 if data: 

145 self.push_data(item, self.cdata_key, data) 

146 self.item = self.push_data(self.item, name, item) 

147 else: 

148 self.item = self.push_data(self.item, name, data) 

149 else: 

150 self.item = None 

151 self.data = [] 

152 self.path.pop() 

153 

154 def characters(self, data): 

155 if not self.data: 

156 self.data = [data] 

157 else: 

158 self.data.append(data) 

159 

160 def push_data(self, item, key, data): 

161 if self.postprocessor is not None: 

162 result = self.postprocessor(self.path, key, data) 

163 if result is None: 

164 return item 

165 key, data = result 

166 if item is None: 

167 item = self.dict_constructor() 

168 try: 

169 value = item[key] 

170 if isinstance(value, list): 

171 value.append(data) 

172 else: 

173 item[key] = [value, data] 

174 except KeyError: 

175 if self._should_force_list(key, data): 

176 item[key] = [data] 

177 else: 

178 item[key] = data 

179 return item 

180 

181 def _should_force_list(self, key, value): 

182 if not self.force_list: 

183 return False 

184 try: 

185 return key in self.force_list 

186 except TypeError: 

187 return self.force_list(self.path[:-1], key, value) 

188 

189 

def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
          namespace_separator=':', disable_entities=True, **kwargs):
    """Parse the given XML input and convert it into a dictionary.

    `xml_input` can either be a `string` or a file-like object (anything
    with a `read` attribute).  Text input is encoded with `encoding`
    (`utf-8` when none is given) before it is handed to the parser.

    If `xml_attribs` is `True`, element attributes are put in the dictionary
    among regular child elements, using `@` as a prefix to avoid collisions. If
    set to `False`, they are just ignored.

    Simple example::

        >>> import xmltodict
        >>> doc = xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>
        ... \"\"\")
        >>> doc['a']['@prop']
        u'x'
        >>> doc['a']['b']
        [u'1', u'2']

    If `item_depth` is `0`, the function returns a dictionary for the root
    element (default behavior). Otherwise, it calls `item_callback` every time
    an item at the specified depth is found and returns `None` in the end
    (streaming mode).

    The callback function receives two parameters: the `path` from the document
    root to the item (name-attribs pairs), and the `item` (dict). If the
    callback's return value is false-ish, parsing will be stopped with the
    :class:`ParsingInterrupted` exception.

    Streaming example::

        >>> def handle(path, item):
        ...     print('path:%s item:%s' % (path, item))
        ...     return True
        ...
        >>> xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>\"\"\", item_depth=2, item_callback=handle)
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2

    The optional argument `postprocessor` is a function that takes `path`,
    `key` and `value` as positional arguments and returns a new `(key, value)`
    pair where both `key` and `value` may have changed. Usage example::

        >>> def postprocessor(path, key, value):
        ...     try:
        ...         return key + ':int', int(value)
        ...     except (ValueError, TypeError):
        ...         return key, value
        >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
        ...                 postprocessor=postprocessor)
        OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])

    You can pass an alternate version of `expat` (such as `defusedexpat`) by
    using the `expat` parameter. E.g:

        >>> import defusedexpat
        >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
        OrderedDict([(u'a', u'hello')])

    You can use the force_list argument to force lists to be created even
    when there is only a single child of a given level of hierarchy. The
    force_list argument is a tuple of keys. If the key for a given level
    of hierarchy is in the force_list argument, that level of hierarchy
    will have a list as a child (even if there is only one sub-element).
    This is applied after any user-supplied postprocessor has already run.

        For example, given this input:
        <servers>
          <server>
            <name>host1</name>
            <os>Linux</os>
            <interfaces>
              <interface>
                <name>em0</name>
                <ip_address>10.0.0.1</ip_address>
              </interface>
            </interfaces>
          </server>
        </servers>

        If called with force_list=('interface',), it will produce
        this dictionary:
        {'servers':
          {'server':
            {'name': 'host1',
             'os': 'Linux',
             'interfaces':
              {'interface':
                [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } }

        `force_list` can also be a callable that receives `path`, `key` and
        `value`. This is helpful in cases where the logic that decides whether
        a list should be forced is more complex.

    When `process_namespaces` is true, `namespace_separator` is handed to the
    expat parser so that it performs namespace processing.  With
    `disable_entities` (the default) the parser is configured so that DTD
    entities are not expanded.  Any remaining keyword arguments are passed to
    the internal handler that builds the dictionaries.
    """
    sax_handler = _DictSAXHandler(namespace_separator=namespace_separator,
                                  **kwargs)
    if isinstance(xml_input, _unicode):
        # expat is fed bytes; fall back to UTF-8 when no encoding was given.
        encoding = encoding or 'utf-8'
        xml_input = xml_input.encode(encoding)
    # A separator of None tells expat not to do namespace processing.
    parser_separator = namespace_separator if process_namespaces else None
    xml_parser = expat.ParserCreate(encoding, parser_separator)
    try:
        xml_parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    xml_parser.StartNamespaceDeclHandler = sax_handler.startNamespaceDecl
    xml_parser.StartElementHandler = sax_handler.startElement
    xml_parser.EndElementHandler = sax_handler.endElement
    xml_parser.CharacterDataHandler = sax_handler.characters
    xml_parser.buffer_text = True
    if disable_entities:
        try:
            # Attempt to disable DTD in Jython's expat parser (Xerces-J).
            feature = "http://apache.org/xml/features/disallow-doctype-decl"
            xml_parser._reader.setFeature(feature, True)
        except AttributeError:
            # For CPython / expat parser.
            # Anything not handled ends up here and entities aren't expanded.
            xml_parser.DefaultHandler = lambda x: None
            # Expects an integer return; zero means failure -> expat.ExpatError.
            xml_parser.ExternalEntityRefHandler = lambda *x: 1
    if not hasattr(xml_input, 'read'):
        xml_parser.Parse(xml_input, True)
    else:
        xml_parser.ParseFile(xml_input)
    return sax_handler.item

332 

333 

334def _process_namespace(name, namespaces, ns_sep=':', attr_prefix='@'): 

335 if not namespaces: 

336 return name 

337 try: 

338 ns, name = name.rsplit(ns_sep, 1) 

339 except ValueError: 

340 pass 

341 else: 

342 ns_res = namespaces.get(ns.strip(attr_prefix)) 

343 name = '{0}{1}{2}{3}'.format( 

344 attr_prefix if ns.startswith(attr_prefix) else '', 

345 ns_res, ns_sep, name) if ns_res else name 

346 return name 

347 

348 

def _emit(key, value, content_handler,
          attr_prefix='@',
          cdata_key='#text',
          depth=0,
          preprocessor=None,
          pretty=False,
          newl='\n',
          indent='\t',
          namespace_separator=':',
          namespaces=None,
          full_document=True):
    """Write the element(s) described by `key`/`value` to `content_handler`.

    `value` may be a single item or an iterable of items (strings and dicts
    count as single items); one element named `key` is written per item.
    An item is either a dict or something converted to text; ``None``
    produces an empty element.  Inside a dict:

    * the `cdata_key` entry is the element's text;
    * keys starting with `attr_prefix` become attributes; the
      ``attr_prefix + 'xmlns'`` entry, when it is a dict, is expanded into
      ``xmlns`` / ``xmlns:prefix`` declarations;
    * every other entry is emitted recursively as a child element.

    `preprocessor`, if given, is called as ``preprocessor(key, value)`` and
    returns the pair to emit, or ``None`` to skip the entry altogether.
    With `pretty`, `indent` (repeated `depth` times) and `newl` are written
    around elements as ignorable whitespace.  `namespaces` and
    `namespace_separator` are used to translate namespaces in keys.

    Raises ``ValueError`` when `full_document` is true and more than one
    element would be written at depth 0.
    """
    key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            return
        key, value = result
    if (not hasattr(value, '__iter__')
            or isinstance(value, _basestring)
            or isinstance(value, dict)):
        value = [value]
    # The parser stores namespace declarations under attr_prefix + 'xmlns',
    # so honour a custom prefix instead of assuming '@'.
    xmlns_key = attr_prefix + 'xmlns'
    for index, v in enumerate(value):
        if full_document and depth == 0 and index > 0:
            raise ValueError('document with multiple roots')
        if v is None:
            v = OrderedDict()
        elif not isinstance(v, dict):
            v = _unicode(v)
        if isinstance(v, _basestring):
            v = OrderedDict(((cdata_key, v),))
        cdata = None
        attrs = OrderedDict()
        children = []
        for ik, iv in v.items():
            if ik == cdata_key:
                cdata = iv
                continue
            if ik.startswith(attr_prefix):
                ik = _process_namespace(ik, namespaces, namespace_separator,
                                        attr_prefix)
                if ik == xmlns_key and isinstance(iv, dict):
                    # Distinct loop names: do not clobber `v`, the element
                    # currently being emitted.
                    for ns_prefix, ns_uri in iv.items():
                        attr = 'xmlns{0}'.format(
                            ':{0}'.format(ns_prefix) if ns_prefix else '')
                        attrs[attr] = _unicode(ns_uri)
                    continue
                if not isinstance(iv, _unicode):
                    iv = _unicode(iv)
                attrs[ik[len(attr_prefix):]] = iv
                continue
            children.append((ik, iv))
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            _emit(child_key, child_value, content_handler,
                  attr_prefix, cdata_key, depth+1, preprocessor,
                  pretty, newl, indent, namespaces=namespaces,
                  namespace_separator=namespace_separator)
        if cdata is not None:
            content_handler.characters(cdata)
        if pretty and children:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)

416 

417 

def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
            short_empty_elements=False,
            **kwargs):
    """Emit an XML document for the given `input_dict` (reverse of `parse`).

    The resulting XML document is returned as a string, but if `output` (a
    file-like object) is specified, it is written there instead and nothing
    is returned.

    Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted
    as XML node attributes, whereas keys equal to `cdata_key`
    (default=`'#text'`) are treated as character data.

    The `pretty` parameter (default=`False`) enables pretty-printing. In this
    mode, lines are terminated with `'\n'` and indented with `'\t'`, but this
    can be customized with the `newl` and `indent` parameters.

    With `full_document` (the default) `input_dict` must contain exactly one
    root entry, otherwise `ValueError` is raised, and the output is wrapped
    in start/end document events.  `short_empty_elements` is forwarded to
    the XML generator.
    """
    if full_document and len(input_dict) != 1:
        raise ValueError('Document must have exactly one root.')
    return_text = output is None
    if return_text:
        output = StringIO()
    # Only pass the third positional argument when it was asked for, so
    # generators that do not accept it keep working.
    generator_args = (output, encoding)
    if short_empty_elements:
        generator_args += (True,)
    content_handler = XMLGenerator(*generator_args)
    if full_document:
        content_handler.startDocument()
    for root_key, root_value in input_dict.items():
        _emit(root_key, root_value, content_handler,
              full_document=full_document, **kwargs)
    if full_document:
        content_handler.endDocument()
    if not return_text:
        return None
    xml_text = output.getvalue()
    try:  # pragma no cover
        xml_text = xml_text.decode(encoding)
    except AttributeError:  # pragma no cover
        # Already text: nothing to decode.
        pass
    return xml_text

459 

460if __name__ == '__main__': # pragma: no cover 

461 import sys 

462 import marshal 

463 try: 

464 stdin = sys.stdin.buffer 

465 stdout = sys.stdout.buffer 

466 except AttributeError: 

467 stdin = sys.stdin 

468 stdout = sys.stdout 

469 

470 (item_depth,) = sys.argv[1:] 

471 item_depth = int(item_depth) 

472 

473 

474 def handle_item(path, item): 

475 marshal.dump((path, item), stdout) 

476 return True 

477 

478 try: 

479 root = parse(stdin, 

480 item_depth=item_depth, 

481 item_callback=handle_item, 

482 dict_constructor=dict) 

483 if item_depth == 0: 

484 handle_item([], root) 

485 except KeyboardInterrupt: 

486 pass