The xmllib Module
The xmllib module provides a simple XML parser, using regular expressions to pull the XML data apart, as shown in Example 5-1. The parser does basic checks on the document, such as a check to see that there is only one top-level element and a check to see that all tags are balanced.
You feed XML data to this parser piece by piece (as data arrives over a network, for example). The parser calls methods in itself for start tags, data sections, end tags, and entities, among other things.
If you're only interested in a few tags, you can define special start_tag and end_tag methods, where tag is the tag name. The start functions are called with the attributes given as a dictionary.
Example 5-1. Using the xmllib Module to Extract Information from an Element
# File: xmllib-example-1.py

import xmllib


class Parser(xmllib.XMLParser):
    """Print the ``id`` attribute of the first <quotation> element.

    xmllib dispatches a start tag named ``quotation`` to the
    ``start_quotation`` method, passing the attributes as a dictionary.
    """

    def __init__(self, file=None):
        """Set up the parser and, if *file* is given, parse it at once."""
        xmllib.XMLParser.__init__(self)
        if file:
            self.load(file)

    def load(self, file):
        """Feed *file* to the parser in 512-byte pieces, then close the parser."""
        while 1:
            s = file.read(512)
            if not s:
                break
            self.feed(s)
        self.close()

    def start_quotation(self, attrs):
        """Report the quotation id, then abort parsing by raising EOFError."""
        # Single-argument form prints "id => <value>" on Python 2 and 3 alike.
        print("id => %s" % (attrs.get("id"),))
        # Nothing else in the document is of interest; the caller catches this.
        raise EOFError


source = open("samples/sample.xml")
try:
    try:
        c = Parser()
        c.load(source)
    except EOFError:
        pass  # raised by start_quotation to stop at the first match
finally:
    # close the sample file even though parsing was cut short
    source.close()

# Output:
# id => 031
Example 5-2 contains a simple (and incomplete) rendering engine. The parser maintains an element stack (__tags), which it passes to the renderer, together with text fragments. The renderer looks up the current tag hierarchy in a style dictionary, and if it isn't already there, it creates a new style descriptor by combining bits and pieces from the stylesheet.
Example 5-2. Using the xmllib Module
File: xmllib-example-2.py
import xmllib
import string, sys
# Style fragments keyed by element name.  Each element can contribute one
# or more style properties; the renderer merges the fragments of every
# element on the current tag stack into one combined style.
STYLESHEET = {
    "quotation": {"style": "italic"},
    "lang": {"weight": "bold"},
    "name": {"weight": "medium"},
}
class Parser(xmllib.XMLParser):
    """A simple styling engine.

    Collects character data between tags and hands each text fragment to
    the renderer together with the stack of currently open elements.
    """

    def __init__(self, renderer):
        """Create a parser that reports text fragments to *renderer*.

        *renderer* must provide a ``text(tags, text)`` method.
        """
        xmllib.XMLParser.__init__(self)
        self.__data = []  # text pieces collected since the last tag
        self.__tags = []  # names of the elements that are currently open
        self.__renderer = renderer

    def load(self, file):
        """Feed *file* to the parser in 8192-byte pieces, then close the parser."""
        while True:
            s = file.read(8192)
            if not s:
                break
            self.feed(s)
        self.close()

    def handle_data(self, data):
        """Buffer character data until the next start or end tag."""
        self.__data.append(data)

    def unknown_starttag(self, tag, attrs):
        """Render text seen so far in the enclosing style, then open *tag*."""
        self.__flush()
        self.__tags.append(tag)

    def unknown_endtag(self, tag):
        """Render the text of the element being closed, then close it."""
        # Flush BEFORE popping: the pending text belongs to the element
        # that is being closed, so it must be styled with that element
        # still on the stack (e.g. 'Cobol' inside <lang> gets the lang
        # style as well as the enclosing quotation style).
        self.__flush()
        self.__tags.pop()

    def __flush(self):
        """Send any buffered text to the renderer and empty the buffer."""
        if self.__data:
            self.__renderer.text(self.__tags, "".join(self.__data))
        self.__data = []
class DumbRenderer:
    """Renderer that writes each text fragment and its style to stdout.

    The style for a tag stack is built by merging the stylesheet entries
    of every tag on the stack, outermost first, and is cached per stack.
    """

    def __init__(self, stylesheet=None):
        """Create a renderer.

        *stylesheet* maps element names to dictionaries of style
        properties.  When omitted, the module-level STYLESHEET is used.
        """
        if stylesheet is None:
            stylesheet = STYLESHEET
        self.stylesheet = stylesheet
        self.cache = {}  # tuple of tag names -> combined style dictionary

    def text(self, tags, text):
        """Render *text* in the style given by the tag stack *tags*.

        Writes two lines to standard output: the combined style followed
        by " =>", then the repr of *text* preceded by one space.
        """
        tags = tuple(tags)  # a list cannot be used as a dictionary key
        style = self.cache.get(tags)
        if style is None:
            # figure out a combined style; later (inner) tags override
            # properties set by earlier (outer) ones
            style = {}
            for tag in tags:
                s = self.stylesheet.get(tag)
                if s:
                    style.update(s)
            self.cache[tags] = style  # update cache
        # write to standard output
        sys.stdout.write("%s =>\n" % (style,))
        sys.stdout.write(" " + repr(text) + "\n")
#
# try it out

r = DumbRenderer()
c = Parser(r)

# close the sample file even if parsing fails part-way through
sample = open("samples/sample.xml")
try:
    c.load(sample)
finally:
    sample.close()
{'style': 'italic'} =>
'I\'ve had a lot of developers come up to me and \012say,
"I haven\'t had this much fun in a long time. It sure
beats \012writing '
{'style': 'italic', 'weight': 'bold'} =>
'Cobol'
{'style': 'italic'} =>
'" -- '
{'style': 'italic', 'weight': 'medium'} =>
'James Gosling'
{'style': 'italic'} =>
', on \012'
{'weight': 'bold'} =>
'Java'
{'style': 'italic'} =>
'.'
Категории