Coverage for grm\lib\xmltodict.py: 11%

241 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2023-04-12 14:21 +0900

1#!/usr/bin/env python 

2"Makes working with XML feel like you are working with JSON" 

3 

4try: 

5 from defusedexpat import pyexpat as expat 

6except ImportError: 

7 from xml.parsers import expat 

8 

9from xml.sax.saxutils import XMLGenerator 

10from xml.sax.xmlreader import AttributesImpl 

11 

12try: # pragma no cover 

13 from cStringIO import StringIO 

14except ImportError: # pragma no cover 

15 try: 

16 from StringIO import StringIO 

17 except ImportError: 

18 from io import StringIO 

19 

20from collections import OrderedDict 

21 

# Python 2/3 compatibility aliases: on Python 3 the names ``basestring`` and
# ``unicode`` do not exist, so both aliases fall back to ``str``.
try:  # pragma no cover
    _basestring = basestring
except NameError:  # pragma no cover
    _basestring = str
try:  # pragma no cover
    _unicode = unicode
except NameError:  # pragma no cover
    _unicode = str

# Module metadata.
__author__ = "Martin Blech"
__version__ = "0.12.0"
__license__ = "MIT"

34 

35 

class ParsingInterrupted(Exception):
    """Raised to abort parsing when the item callback returns a false value."""

38 

39 

class _DictSAXHandler(object):
    """Event handler that folds start/end/character events into nested dicts.

    The handler keeps three pieces of running state:

    * ``path``  - list of ``(name, attrs-or-None)`` pairs from the root down
      to the element currently open;
    * ``stack`` - saved ``(item, data)`` pairs of the enclosing elements;
    * ``item`` / ``data`` - the dict being built for the current element and
      the list of character chunks seen directly inside it.

    After the last ``endElement`` call, ``item`` holds the finished result
    (or ``None`` when streaming through ``item_callback``).
    """

    def __init__(
        self,
        item_depth=0,
        item_callback=lambda *args: True,
        xml_attribs=True,
        attr_prefix="@",
        cdata_key="#text",
        force_cdata=False,
        cdata_separator="",
        postprocessor=None,
        dict_constructor=OrderedDict,
        strip_whitespace=True,
        namespace_separator=":",
        namespaces=None,
        force_list=None,
    ):
        """Store the conversion options and reset the running parse state."""
        self.path = []
        self.stack = []
        self.data = []
        self.item = None
        self.item_depth = item_depth
        self.xml_attribs = xml_attribs
        self.item_callback = item_callback
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces
        # Declarations collected by startNamespaceDecl until the next
        # startElement consumes them.
        self.namespace_declarations = OrderedDict()
        self.force_list = force_list

    def _build_name(self, full_name):
        """Return ``full_name`` with its namespace replaced via ``namespaces``.

        Names are returned untouched when no namespace mapping is configured
        or when the name contains no separator.  A namespace that maps to an
        empty value is dropped, leaving only the local name; a namespace
        missing from the mapping is kept as-is.
        """
        if not self.namespaces:
            return full_name
        i = full_name.rfind(self.namespace_separator)
        if i == -1:
            return full_name
        namespace, name = full_name[:i], full_name[i + 1:]
        short_namespace = self.namespaces.get(namespace, namespace)
        if not short_namespace:
            return name
        return self.namespace_separator.join((short_namespace, name))

    def _attrs_to_dict(self, attrs):
        """Convert a flat ``[k1, v1, k2, v2, ...]`` sequence into a mapping.

        A dict argument is returned unchanged (not copied).
        """
        if isinstance(attrs, dict):
            return attrs
        return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))

    def startNamespaceDecl(self, prefix, uri):
        """Remember a namespace declaration; a missing prefix is stored as ''."""
        self.namespace_declarations[prefix or ""] = uri

    def startElement(self, full_name, attrs):
        """Open a new element: extend ``path`` and start a fresh item for it."""
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        # Pending namespace declarations are attached only when the element
        # has at least one attribute; otherwise they stay pending.
        if attrs and self.namespace_declarations:
            attrs["xmlns"] = self.namespace_declarations
            self.namespace_declarations = OrderedDict()
        self.path.append((name, attrs or None))
        if len(self.path) > self.item_depth:
            # Save the enclosing element's state so endElement can restore it.
            self.stack.append((self.item, self.data))
            if self.xml_attribs:
                attr_entries = []
                for key, value in attrs.items():
                    key = self.attr_prefix + self._build_name(key)
                    if self.postprocessor:
                        entry = self.postprocessor(self.path, key, value)
                    else:
                        entry = (key, value)
                    # A falsy postprocessor result drops the attribute.
                    if entry:
                        attr_entries.append(entry)
                attrs = self.dict_constructor(attr_entries)
            else:
                attrs = None
            self.item = attrs or None
            self.data = []

    def endElement(self, full_name):
        """Close the current element and merge it into its parent item.

        Raises:
            ParsingInterrupted: if the element sits at ``item_depth`` and
                ``item_callback`` returns a false value.
        """
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            item = self.item
            if item is None:
                item = self.cdata_separator.join(self.data) if self.data else None

            should_continue = self.item_callback(self.path, item)
            if not should_continue:
                raise ParsingInterrupted()
        if self.stack:
            data = self.cdata_separator.join(self.data) if self.data else None
            item = self.item
            # Restore the parent's state before pushing the finished child.
            self.item, self.data = self.stack.pop()
            if self.strip_whitespace and data:
                data = data.strip() or None
            if data and self.force_cdata and item is None:
                item = self.dict_constructor()
            if item is not None:
                if data:
                    self.push_data(item, self.cdata_key, data)
                self.item = self.push_data(self.item, name, item)
            else:
                self.item = self.push_data(self.item, name, data)
        else:
            self.item = None
            self.data = []
        self.path.pop()

    def characters(self, data):
        """Accumulate a chunk of character data for the current element."""
        if not self.data:
            self.data = [data]
        else:
            self.data.append(data)

    def push_data(self, item, key, data):
        """Store ``data`` under ``key`` in ``item`` and return ``item``.

        The postprocessor, when set, may rewrite ``(key, data)`` or return
        ``None`` to drop the entry.  ``item`` is created on demand.  A key
        that already exists is promoted to (or extended as) a list.
        """
        if self.postprocessor is not None:
            result = self.postprocessor(self.path, key, data)
            if result is None:
                return item
            key, data = result
        if item is None:
            item = self.dict_constructor()
        try:
            value = item[key]
        except KeyError:
            # First occurrence of this key.
            if self._should_force_list(key, data):
                item[key] = [data]
            else:
                item[key] = data
        else:
            if isinstance(value, list):
                value.append(data)
            else:
                item[key] = [value, data]
        return item

    def _should_force_list(self, key, value):
        """Decide whether a first-seen ``key`` must be stored as a list.

        ``force_list`` may be falsy (never), a bool, a container of keys, or
        a callable taking ``(path_without_current_element, key, value)``.
        """
        if not self.force_list:
            return False
        if isinstance(self.force_list, bool):
            return self.force_list
        try:
            return key in self.force_list
        except TypeError:
            # Not a container: treat force_list as a callable.
            return self.force_list(self.path[:-1], key, value)

187 

188 

def parse(
    xml_input,
    encoding=None,
    expat=expat,
    process_namespaces=False,
    namespace_separator=":",
    disable_entities=True,
    **kwargs
):
    """Parse the given XML input and convert it into a dictionary.

    `xml_input` can either be a `string` or a file-like object.

    `encoding` is handed to the expat parser; text input is encoded with it
    before parsing (UTF-8 when no encoding is given).

    If `process_namespaces` is false (the default), the parser is created
    without a namespace separator; otherwise `namespace_separator` is used.

    If `disable_entities` is true (the default), handlers are installed so
    that entities are not expanded.

    Any remaining keyword arguments (`item_depth`, `item_callback`,
    `xml_attribs`, `postprocessor`, `force_list`, ...) are forwarded to the
    internal handler.

    If `xml_attribs` is `True`, element attributes are put in the dictionary
    among regular child elements, using `@` as a prefix to avoid collisions. If
    set to `False`, they are just ignored.

    Simple example::

        >>> import xmltodict
        >>> doc = xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>
        ... \"\"\")
        >>> doc['a']['@prop']
        u'x'
        >>> doc['a']['b']
        [u'1', u'2']

    If `item_depth` is `0`, the function returns a dictionary for the root
    element (default behavior). Otherwise, it calls `item_callback` every time
    an item at the specified depth is found and returns `None` in the end
    (streaming mode).

    The callback function receives two parameters: the `path` from the document
    root to the item (name-attribs pairs), and the `item` (dict). If the
    callback's return value is false-ish, parsing will be stopped with the
    :class:`ParsingInterrupted` exception.

    Streaming example::

        >>> def handle(path, item):
        ...     print('path:%s item:%s' % (path, item))
        ...     return True
        ...
        >>> xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>\"\"\", item_depth=2, item_callback=handle)
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2

    The optional argument `postprocessor` is a function that takes `path`,
    `key` and `value` as positional arguments and returns a new `(key, value)`
    pair where both `key` and `value` may have changed. Usage example::

        >>> def postprocessor(path, key, value):
        ...     try:
        ...         return key + ':int', int(value)
        ...     except (ValueError, TypeError):
        ...         return key, value
        >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
        ...                 postprocessor=postprocessor)
        OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])

    You can pass an alternate version of `expat` (such as `defusedexpat`) by
    using the `expat` parameter. E.g:

        >>> import defusedexpat
        >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
        OrderedDict([(u'a', u'hello')])

    You can use the force_list argument to force lists to be created even
    when there is only a single child of a given level of hierarchy. The
    force_list argument is a tuple of keys. If the key for a given level
    of hierarchy is in the force_list argument, that level of hierarchy
    will have a list as a child (even if there is only one sub-element).
    The index_keys operation takes precedence over this. This is applied
    after any user-supplied postprocessor has already run.

        For example, given this input:
        <servers>
          <server>
            <name>host1</name>
            <os>Linux</os>
            <interfaces>
              <interface>
                <name>em0</name>
                <ip_address>10.0.0.1</ip_address>
              </interface>
            </interfaces>
          </server>
        </servers>

        If called with force_list=('interface',), it will produce
        this dictionary:
        {'servers':
          {'server':
            {'name': 'host1',
             'os': 'Linux',
             'interfaces':
              {'interface':
                [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } }

    `force_list` can also be a callable that receives `path`, `key` and
    `value`. This is helpful in cases where the logic that decides whether
    a list should be forced is more complex.
    """
    handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs)
    if isinstance(xml_input, _unicode):
        # Text input is encoded to bytes before being handed to the parser.
        if not encoding:
            encoding = "utf-8"
        xml_input = xml_input.encode(encoding)
    if not process_namespaces:
        namespace_separator = None
    parser = expat.ParserCreate(encoding, namespace_separator)
    try:
        parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    parser.StartNamespaceDeclHandler = handler.startNamespaceDecl
    parser.StartElementHandler = handler.startElement
    parser.EndElementHandler = handler.endElement
    parser.CharacterDataHandler = handler.characters
    parser.buffer_text = True
    if disable_entities:
        try:
            # Attempt to disable DTD in Jython's expat parser (Xerces-J).
            feature = "http://apache.org/xml/features/disallow-doctype-decl"
            parser._reader.setFeature(feature, True)
        except AttributeError:
            # For CPython / expat parser.
            # Anything not handled ends up here and entities aren't expanded.
            parser.DefaultHandler = lambda x: None
            # Expects an integer return; zero means failure -> expat.ExpatError.
            parser.ExternalEntityRefHandler = lambda *x: 1
    if hasattr(xml_input, "read"):
        parser.ParseFile(xml_input)
    else:
        parser.Parse(xml_input, True)
    return handler.item

334 

335 

def _process_namespace(name, namespaces, ns_sep=":", attr_prefix="@"):
    """Replace the namespace part of ``name`` using the ``namespaces`` mapping.

    ``name`` is split at the last ``ns_sep``.  The part before it (minus a
    leading ``attr_prefix``, if present) is looked up in ``namespaces``:

    * a non-empty result replaces the namespace, giving
      ``[attr_prefix]<mapped><ns_sep><local name>``;
    * a missing or empty result drops the namespace, giving
      ``[attr_prefix]<local name>``.

    ``name`` is returned unchanged when ``namespaces`` is empty/``None`` or
    when it contains no ``ns_sep``.
    """
    if not namespaces:
        return name
    try:
        ns, local_name = name.rsplit(ns_sep, 1)
    except ValueError:
        # No separator: nothing to translate.
        return name
    is_attr = ns.startswith(attr_prefix)
    # Remove the attribute prefix exactly once from the front.  str.strip()
    # would treat the prefix as a *set of characters* and also eat matching
    # characters at the end of the namespace.
    ns_key = ns[len(attr_prefix):] if is_attr else ns
    # Keep the attribute marker even when the namespace is dropped, so the
    # caller can still strip it off as a prefix.
    prefix = attr_prefix if is_attr else ""
    ns_res = namespaces.get(ns_key)
    if not ns_res:
        return prefix + local_name
    return "{}{}{}{}".format(prefix, ns_res, ns_sep, local_name)

353 

354 

def _emit(
    key,
    value,
    content_handler,
    attr_prefix="@",
    cdata_key="#text",
    depth=0,
    preprocessor=None,
    pretty=False,
    newl="\n",
    indent="\t",
    namespace_separator=":",
    namespaces=None,
    full_document=True,
):
    """Write ``key``/``value`` as XML element(s) through ``content_handler``.

    ``value`` may be a scalar, a mapping, or an iterable of those; each entry
    of an iterable produces one element named ``key``.  Inside a mapping,
    keys starting with ``attr_prefix`` become attributes, ``cdata_key`` holds
    the character data, and every other key is emitted recursively as a
    child element.

    ``preprocessor(key, value)`` may rewrite the pair or return ``None`` to
    skip it.  With ``pretty`` set, ``newl`` and ``indent`` are written as
    ignorable whitespace around nested elements.

    Raises:
        ValueError: if ``full_document`` is true and more than one element
            would be written at depth 0.
    """
    key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            return
        key, value = result
    # Normalise to a list of entries so scalars, strings and single mappings
    # share the loop below.
    if (
        not hasattr(value, "__iter__")
        or isinstance(value, _basestring)
        or isinstance(value, dict)
    ):
        value = [value]
    for index, v in enumerate(value):
        if full_document and depth == 0 and index > 0:
            raise ValueError("document with multiple roots")
        if v is None:
            v = OrderedDict()
        elif isinstance(v, bool):
            if v:
                v = _unicode("true")
            else:
                v = _unicode("false")
        elif not isinstance(v, dict):
            v = _unicode(v)
        if isinstance(v, _basestring):
            v = OrderedDict(((cdata_key, v),))
        cdata = None
        attrs = OrderedDict()
        children = []
        for ik, iv in v.items():
            if ik == cdata_key:
                cdata = iv
                continue
            if ik.startswith(attr_prefix):
                ik = _process_namespace(
                    ik, namespaces, namespace_separator, attr_prefix
                )
                # Namespace declarations arrive as a {prefix: uri} mapping.
                # Accept the key under the configured attribute prefix as
                # well as the historical literal "@xmlns".
                if ik in ("@xmlns", attr_prefix + "xmlns") and isinstance(iv, dict):
                    for ns_prefix, ns_uri in iv.items():
                        attr = "xmlns{}".format(
                            ":{}".format(ns_prefix) if ns_prefix else ""
                        )
                        attrs[attr] = _unicode(ns_uri)
                    continue
                if not isinstance(iv, _unicode):
                    iv = _unicode(iv)
                attrs[ik[len(attr_prefix):]] = iv
                continue
            children.append((ik, iv))
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            _emit(
                child_key,
                child_value,
                content_handler,
                attr_prefix,
                cdata_key,
                depth + 1,
                preprocessor,
                pretty,
                newl,
                indent,
                namespaces=namespaces,
                namespace_separator=namespace_separator,
            )
        if cdata is not None:
            content_handler.characters(cdata)
        if pretty and children:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)

444 

445 

def unparse(
    input_dict,
    output=None,
    encoding="utf-8",
    full_document=True,
    short_empty_elements=False,
    **kwargs
):
    r"""Emit an XML document for the given `input_dict` (reverse of `parse`).

    The resulting XML document is returned as a string, but if `output` (a
    file-like object) is specified, it is written there instead and `None`
    is returned.

    Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted
    as XML node attributes, whereas keys equal to `cdata_key`
    (default=`'#text'`) are treated as character data.

    The `pretty` parameter (default=`False`) enables pretty-printing. In this
    mode, lines are terminated with `'\n'` and indented with `'\t'`, but this
    can be customized with the `newl` and `indent` parameters.

    With `full_document` set (the default), the document start/end events are
    emitted and `input_dict` must contain exactly one root key; otherwise a
    `ValueError` is raised.  `short_empty_elements` is passed on to the XML
    generator.  Remaining keyword arguments are forwarded to the emitter.
    """
    if full_document and len(input_dict) != 1:
        raise ValueError("Document must have exactly one root.")
    must_return = False
    if output is None:
        # No sink supplied: collect the document in memory and return it.
        output = StringIO()
        must_return = True
    if short_empty_elements:
        content_handler = XMLGenerator(output, encoding, True)
    else:
        # The third argument is only passed when requested, so generators
        # that do not accept it keep working.
        content_handler = XMLGenerator(output, encoding)
    if full_document:
        content_handler.startDocument()
    for key, value in input_dict.items():
        _emit(key, value, content_handler, full_document=full_document, **kwargs)
    if full_document:
        content_handler.endDocument()
    if must_return:
        value = output.getvalue()
        try:  # pragma no cover
            # Byte buffers are decoded; text values have no .decode().
            value = value.decode(encoding)
        except AttributeError:  # pragma no cover
            pass
        return value
    return None

491 

492 

if __name__ == "__main__":  # pragma: no cover
    # Command-line mode: read XML from stdin and write marshalled
    # (path, item) tuples to stdout for every item at the requested depth.
    import marshal
    import sys

    try:
        # Use the binary streams where they exist.
        stdin = sys.stdin.buffer
        stdout = sys.stdout.buffer
    except AttributeError:
        stdin = sys.stdin
        stdout = sys.stdout

    # Exactly one argument is expected: the item depth.
    if len(sys.argv) != 2:
        sys.exit("usage: {} ITEM_DEPTH".format(sys.argv[0]))
    item_depth = int(sys.argv[1])

    def handle_item(path, item):
        """Dump one (path, item) pair and ask the parser to continue."""
        marshal.dump((path, item), stdout)
        return True

    try:
        root = parse(
            stdin,
            item_depth=item_depth,
            item_callback=handle_item,
            dict_constructor=dict,
        )
        # At depth 0 the callback is never invoked, so dump the whole
        # document once parsing is finished.
        if item_depth == 0:
            handle_item([], root)
    except KeyboardInterrupt:
        pass