14
|
"""xmltramp: Make XML documents easily accessible."""

# Module metadata; plain string constants, not referenced by the code below.
__version__ = "2.16"
__author__ = "Aaron Swartz"
__credits__ = "Many thanks to pjz, bitsko, and DanC."
__copyright__ = "(C) 2003 Aaron Swartz. GNU GPL 2."
|
|
7
|
|
# Compatibility shim for interpreters that predate the True/False builtins.
#
# The names are probed directly instead of via hasattr(__builtins__, ...):
# __builtins__ may be either the builtins module or its dict depending on how
# this module is loaded, so the hasattr() test is unreliable and could rebind
# True/False to plain ints even when the real booleans exist.  The fallback
# goes through globals() so that no assignment to the names True/False
# appears in the source (such an assignment is rejected at compile time by
# interpreters where they are keywords).
try:
    True
except NameError:
    globals().update({'True': 1, 'False': 0})
|
|
def isstr(f):
    """Tell whether *f* is a string (plain or unicode string type)."""
    string_types = (type(''), type(u''))
    return isinstance(f, string_types)
|
|
def islst(f):
    """Tell whether *f* is a tuple or a list (or an instance of a subclass)."""
    sequence_types = (type(()), type([]))
    return isinstance(f, sequence_types)
|
|
11
|
|
# Maps a namespace URI to the local names of elements that are written as
# self-closing tags ("<br />") when they have no children; consulted by
# Element.__repr__.
empty = {
    'http://www.w3.org/1999/xhtml': [
        'img', 'br', 'hr', 'meta', 'link',
        'base', 'param', 'input', 'col', 'area',
    ],
}
|
|
13
|
|
def quote(x, elt=True):
    """Escape the string *x* so it can be embedded in serialised XML.

    Parameters:
        x   -- the text to escape.
        elt -- true (the default) when *x* is element content, false when it
               is an attribute value that will be wrapped in double quotes.

    Returns the escaped string.

    Element content that contains '<', is longer than 24 characters and does
    not contain ']]>' is wrapped verbatim in a CDATA section.  Everything
    else has '&' and '<' replaced by their entities and ']]>' rewritten as
    ']]&gt;'.  Attribute values additionally get '"' replaced by '&quot;'.
    """
    if elt and '<' in x and len(x) > 24 and x.find(']]>') == -1:
        return "<![CDATA[" + x + "]]>"
    # '&' must be handled first, otherwise the ampersands introduced by the
    # later replacements would themselves be escaped again.
    x = x.replace('&', '&amp;').replace('<', '&lt;').replace(']]>', ']]&gt;')
    if not elt:
        x = x.replace('"', '&quot;')
    return x
|
|
19
|
|
class Element:
    """One XML element: a name, attributes and an ordered list of children.

    Instance state (every name starts with '_' so that __setattr__ stores it
    on the instance instead of treating it as a child element):

        _name     -- a plain string, or a (namespace-uri, local-name) pair.
        _attrs    -- dict mapping attribute names (same two forms) to values.
        _dir      -- list of children: strings and Element objects.
        _prefixes -- dict mapping namespace URI to prefix.
        _dNS      -- the namespace URI registered under the None prefix
                     (the default namespace), or None.

    Access patterns:

        e.foo / e['foo']   first child element named foo
        e['foo':]          list of all child elements named foo
        e[0], e[1:3]       positional access into the child list
        e.foo = v          replace the foo children with one <foo>v</foo>
        e('attr')          read an attribute; e() returns the attribute dict
        e('attr', 'v'), e(attr='v')   set attributes

    Plain-string names are qualified with the default namespace (when there
    is one) before they are compared with child names.
    """

    def __init__(self, name, attrs=None, children=None, prefixes=None):
        """Create an element.

        name     -- string or (namespace-uri, local-name); a pair whose
                    namespace is None is reduced to the bare local name.
        attrs    -- optional dict of attributes; keys are normalised the
                    same way as *name*.
        children -- optional list of strings / Elements (used as-is when
                    non-empty).
        prefixes -- optional dict mapping prefix -> namespace URI; the None
                    key denotes the default namespace.
        """
        if islst(name) and name[0] is None:
            name = name[1]
        if attrs:
            flat = {}
            for k in attrs.keys():
                if islst(k) and k[0] is None:
                    flat[k[1]] = attrs[k]
                else:
                    flat[k] = attrs[k]
            attrs = flat
        self._name = name
        self._attrs = attrs or {}
        self._dir = children or []
        prefixes = prefixes or {}
        # Stored inverted (URI -> prefix) because serialisation looks
        # prefixes up by namespace URI.
        self._prefixes = dict(zip(prefixes.values(), prefixes.keys()))
        self._dNS = prefixes.get(None, None)

    def __repr__(self, recursive=0, multiline=0, inprefixes=None):
        """Serialise the element as XML text.

        recursive  -- 0: children are abbreviated as '...' and no xmlns
                      declarations are written; non-zero: children are
                      serialised too, and the value doubles as the nesting
                      depth used for indentation.
        multiline  -- when true (and the element has element children) each
                      child is placed on its own tab-indented line.
        inprefixes -- dict of namespace URI -> prefix already declared by
                      ancestors; it is updated in place.

        Raises TypeError if a child is neither a string nor an Element.
        """
        def qname(name, inprefixes):
            # Turn a (uri, local) pair into 'prefix:local' (or 'local' when
            # the namespace is bound to the None prefix).
            if islst(name):
                prefix = inprefixes[name[0]]
                if prefix is not None:
                    return prefix + ':' + name[1]
                return name[1]
            return name

        def arep(a, inprefixes, addns=1):
            # Build the attribute text.  Namespaces not yet known are always
            # recorded in inprefixes; their xmlns declarations are only
            # written when addns is true.
            out = ''
            for uri in self._prefixes.keys():
                if uri in inprefixes:
                    continue
                prefix = self._prefixes[uri]
                if addns:
                    out += ' xmlns'
                    if prefix:
                        out += ':' + prefix
                    out += '="' + quote(uri, False) + '"'
                inprefixes[uri] = prefix
            for k in a.keys():
                out += ' ' + qname(k, inprefixes) + '="' + quote(a[k], False) + '"'
            return out

        inprefixes = inprefixes or {u'http://www.w3.org/XML/1998/namespace': 'xml'}

        # need to call first to set inprefixes:
        attributes = arep(self._attrs, inprefixes, recursive)
        out = '<' + qname(self._name, inprefixes) + attributes

        # Childless elements listed in the module-level `empty` table are
        # written as self-closing tags; only namespaced names can match.
        if (not self._dir and islst(self._name)
                and self._name[0] in empty
                and self._name[1] in empty[self._name[0]]):
            return out + ' />'

        out += '>'
        if recursive:
            content = 0
            for x in self._dir:
                if isinstance(x, Element):
                    content = 1
            pad = '\n' + ('\t' * recursive)
            for x in self._dir:
                if multiline and content:
                    out += pad
                if isstr(x):
                    out += quote(x)
                elif isinstance(x, Element):
                    # Children get a copy so sibling subtrees do not see
                    # each other's namespace declarations.
                    out += x.__repr__(recursive + 1, multiline, inprefixes.copy())
                else:
                    raise TypeError("I wasn't expecting " + repr(x) + ".")
            if multiline and content:
                out += '\n' + ('\t' * (recursive - 1))
        else:
            if self._dir:
                out += '...'
        out += '</' + qname(self._name, inprefixes) + '>'
        return out

    def __unicode__(self):
        """Return the concatenated text of all descendants.

        Runs of whitespace are collapsed to single spaces and leading and
        trailing whitespace is removed.
        """
        # type(u'') is the interpreter's text type; spelling it this way
        # avoids depending on a builtin that only some interpreters define.
        text_type = type(u'')
        text = ''
        for x in self._dir:
            text += text_type(x)
        return ' '.join(text.split())

    def __str__(self):
        """Return the text content as the interpreter's native ``str``.

        Where ``str`` is a byte string the text is encoded as UTF-8; where
        ``str`` already is the text type it is returned unchanged.
        """
        text = self.__unicode__()
        if type('') is type(u''):
            return text
        return text.encode('utf-8')

    def __getattr__(self, n):
        """Return the first child element named *n* (``e.foo``).

        Raises AttributeError for names starting with '_' and when no such
        child exists.
        """
        if n[:1] == '_':
            raise AttributeError("Use foo['" + n + "'] to access the child element.")
        if self._dNS:
            n = (self._dNS, n)
        for x in self._dir:
            if isinstance(x, Element) and x._name == n:
                return x
        # n may be a (uri, local) pair here, so it is formatted rather than
        # concatenated; for plain strings the message is unchanged.
        raise AttributeError("No child element named '%s'" % (n,))

    def __hasattr__(self, n):
        """Return True if a child element named *n* exists.

        This is an ordinary method (the hasattr() builtin goes through
        __getattr__).  Plain names are qualified with the default namespace,
        as in __getitem__.
        """
        if self._dNS and not islst(n):
            n = (self._dNS, n)
        for x in self._dir:
            if isinstance(x, Element) and x._name == n:
                return True
        return False

    def __setattr__(self, n, v):
        """Store '_'-prefixed names on the instance; treat the rest as ``self[n] = v``."""
        if n[0] == '_':
            self.__dict__[n] = v
        else:
            self[n] = v

    def __getitem__(self, n):
        """Look up children by position or by name.

        e[i]        -- the i-th child (string or Element).
        e[i:j]      -- positional slice of the child list (the step is
                       ignored); a missing start, as in e[:], is positional.
        e['foo':]   -- list of all child elements named foo.
        e['foo']    -- first child element named foo; KeyError if none.
        """
        if isinstance(n, type(0)):  # d[1] == d._dir[1]
            return self._dir[n]
        elif isinstance(n, slice(0).__class__):
            # numerical slices
            if n.start is None or isinstance(n.start, type(0)):
                return self._dir[n.start:n.stop]
            # d['foo':] == all <foo>s
            n = n.start
            if self._dNS and not islst(n):
                n = (self._dNS, n)
            return [x for x in self._dir
                    if isinstance(x, Element) and x._name == n]
        else:  # d['foo'] == first <foo>
            if self._dNS and not islst(n):
                n = (self._dNS, n)
            for x in self._dir:
                if isinstance(x, Element) and x._name == n:
                    return x
            raise KeyError(n)

    def __setitem__(self, n, v):
        """Assign children by position or by name.

        e[i] = v        -- replace the i-th child with *v*.
        e['foo':] = v   -- append a new, empty <foo> element.
        e['foo'] = v    -- put a new <foo> containing *v* where the first
                           <foo> child was (or at the end if there was none)
                           and remove every other <foo> child.
        """
        if isinstance(n, type(0)):  # d[1]
            self._dir[n] = v
        elif isinstance(n, slice(0).__class__):
            # d['foo':] adds a new foo
            n = n.start
            if self._dNS and not islst(n):
                n = (self._dNS, n)
            # NOTE(review): the assigned value v is not stored in the new
            # element; kept as-is because callers may rely on it -- confirm
            # whether it should become the element's content.
            self._dir.append(Element(n))
        else:  # d["foo"] replaces first <foo> and dels rest
            if self._dNS and not islst(n):
                n = (self._dNS, n)
            nv = Element(n)
            nv._dir.append(v)
            # Rebuild the child list in one pass: text children are skipped
            # safely and duplicates are dropped without index bookkeeping.
            kept = []
            replaced = False
            for child in self._dir:
                if isinstance(child, Element) and child._name == n:
                    if not replaced:
                        kept.append(nv)
                        replaced = True
                else:
                    kept.append(child)
            if not replaced:
                kept.append(nv)
            self._dir[:] = kept

    def __delitem__(self, n):
        """Delete children by position or by name.

        del e[i]       -- remove the i-th child.
        del e[i:j]     -- remove a positional slice (the step is ignored).
        del e['foo':]  -- remove every child element named foo.
        del e['foo']   -- remove the first child element named foo; nothing
                          happens when there is none.
        """
        if isinstance(n, type(0)):
            del self._dir[n]
        elif isinstance(n, slice(0).__class__):
            if n.start is None or isinstance(n.start, type(0)):
                del self._dir[n.start:n.stop]
                return
            # delete all <foo>s
            n = n.start
            if self._dNS and not islst(n):
                n = (self._dNS, n)
            self._dir[:] = [x for x in self._dir
                            if not (isinstance(x, Element) and x._name == n)]
        else:
            # delete first foo
            if self._dNS and not islst(n):
                n = (self._dNS, n)
            for i in range(len(self._dir)):
                child = self._dir[i]
                if isinstance(child, Element) and child._name == n:
                    del self._dir[i]
                    break

    def __call__(self, *_pos, **_set):
        """Read or write attributes.

        e(name=value, ...)      -- set attributes from keywords.
        e(k1, v1, k2, v2, ...)  -- set attributes from alternating keys and
                                   values.
        e(key)                  -- return that attribute's value (KeyError
                                   if it is missing).
        e()                     -- return the attribute dict itself; this is
                                   also what a keyword-only call returns.

        A call with two or more positional arguments returns None.
        """
        if _set:
            self._attrs.update(_set)
        if len(_pos) > 1:
            for i in range(0, len(_pos), 2):
                self._attrs[_pos[i]] = _pos[i + 1]
        if len(_pos) == 1:
            return self._attrs[_pos[0]]
        if len(_pos) == 0:
            return self._attrs

    def __len__(self):
        """Return the number of children (strings and elements)."""
        return len(self._dir)
|
|
172
|
|
class Namespace:
    """Factory for namespace-qualified names.

    ``ns.local`` and ``ns['local']`` both return the pair ``(uri, 'local')``,
    the form Element uses for namespaced element and attribute names.  The
    subscript form works for every local name; attribute access is a
    shorthand that does not cover ``__dunder__`` names (see __getattr__).
    """

    def __init__(self, uri):
        """Remember *uri*, the namespace URI every produced name is paired with."""
        self.__uri = uri

    def __getattr__(self, n):
        """Return ``(uri, n)`` for ordinary attribute names.

        Raises AttributeError for ``__dunder__`` names, so protocol probes
        (copying, pickling, repr, truth testing, ...) see a missing attribute
        instead of a tuple, and for the private URI slot itself, so an
        instance whose __init__ has not run cannot recurse without bound.
        """
        if n == '_Namespace__uri':
            raise AttributeError(n)
        if len(n) > 4 and n[:2] == '__' and n[-2:] == '__':
            raise AttributeError(n)
        return (self.__uri, n)

    def __getitem__(self, n):
        """Return ``(uri, n)``."""
        return (self.__uri, n)
|
|
177
|
|
178 from xml.sax.handler import EntityResolver, DTDHandler, ContentHandler, ErrorHandler
|
|
179
|
|
class Seeder(EntityResolver, DTDHandler, ContentHandler, ErrorHandler):
    """SAX handler that builds a tree of Element objects.

    State:
        stack    -- the Elements that are currently open, outermost first.
        ch       -- character data collected since the last start/end tag.
        prefixes -- dict mapping each prefix to the list of namespace URIs
                    bound to it, innermost binding last.
        result   -- the root Element once its end tag has been seen,
                    None before that.
    """

    def __init__(self):
        self.stack = []
        self.ch = ''
        self.prefixes = {}
        self.result = None
        ContentHandler.__init__(self)

    def startPrefixMapping(self, prefix, uri):
        """Push *uri* as the innermost binding of *prefix*."""
        if prefix not in self.prefixes:
            self.prefixes[prefix] = []
        self.prefixes[prefix].append(uri)

    def endPrefixMapping(self, prefix):
        """Drop the innermost binding of *prefix*."""
        self.prefixes[prefix].pop()

    def _active_prefixes(self):
        """Return a new dict mapping each bound prefix to its innermost URI."""
        active = {}
        for prefix in self.prefixes.keys():
            uris = self.prefixes[prefix]
            # A prefix whose bindings have all ended leaves an empty list
            # behind; it is no longer in scope and must be skipped.
            if uris:
                active[prefix] = uris[-1]
        return active

    def _flush_text(self):
        """Attach pending character data to the innermost open element.

        Text consisting only of whitespace is discarded.
        """
        ch = self.ch
        self.ch = ''
        if ch and not ch.isspace():
            self.stack[-1]._dir.append(ch)

    def startElementNS(self, name, qname, attrs):
        """Open a new Element carrying the prefix bindings currently in scope."""
        self._flush_text()
        self.stack.append(
            Element(name, dict(attrs), prefixes=self._active_prefixes()))

    def characters(self, ch):
        """Accumulate character data until the next tag."""
        self.ch += ch

    def endElementNS(self, name, qname):
        """Close the innermost element and attach it to its parent.

        The outermost element has no parent and becomes ``self.result``.
        """
        self._flush_text()
        element = self.stack.pop()
        if self.stack:
            self.stack[-1]._dir.append(element)
        else:
            self.result = element
|
|
210
|
|
211 from xml.sax import make_parser
|
|
212 from xml.sax.handler import feature_namespaces
|
|
213
|
|
def seed(fileobj):
    """Parse *fileobj* with a namespace-aware SAX parser and return the root Element.

    *fileobj* is handed unchanged to the parser's parse() method.  The tree
    is built by a Seeder content handler; its ``result`` is returned.
    """
    handler = Seeder()
    reader = make_parser()
    reader.setContentHandler(handler)
    # Namespace mode makes the parser report names as (uri, local-name)
    # pairs through the *NS callbacks that Seeder implements.
    reader.setFeature(feature_namespaces, 1)
    reader.parse(fileobj)
    return handler.result
|
|
221
|
|
def parse(text):
    """Parse the XML document held in the string *text*; return the root Element.

    The string is wrapped in an in-memory file object and passed to seed().
    """
    try:
        from StringIO import StringIO as make_stream
    except ImportError:
        # No StringIO module on this interpreter: fall back to io, which
        # needs the stream class to match the kind of string it is given.
        import io
        if isinstance(text, type(u'')):
            make_stream = io.StringIO
        else:
            make_stream = io.BytesIO
    return seed(make_stream(text))
|
|
225
|
|
def load(url):
    """Fetch the XML document at *url* and return its root Element.

    The handle returned by urlopen() is passed to seed() and is closed
    afterwards, whether or not parsing succeeds.  Errors raised while
    opening the URL or while parsing propagate to the caller.
    """
    try:
        from urllib import urlopen
    except ImportError:
        # Interpreters whose urllib is a package keep urlopen in
        # urllib.request.
        from urllib.request import urlopen
    stream = urlopen(url)
    try:
        return seed(stream)
    finally:
        stream.close()
|
|
229
|
|
def unittest():
    """Self-test: exercise parse(), Element, Namespace and quote().

    Raises AssertionError on the first check that fails and returns None
    when every check passes.  Expected serialisations spell markup
    characters in text and attribute values as XML entities (&lt;, &quot;,
    ]]&gt;), because serialised output has to be well-formed XML.
    """
    # NOTE(review): the result of this comparison is discarded (there is no
    # assert), and the expected literal does not appear to match the padding
    # __repr__(1, 1) inserts around mixed content -- confirm the intended
    # output before turning it into an assertion.
    parse('<doc>a<baz>f<b>o</b>ob<b>a</b>r</baz>a</doc>').__repr__(1,1) == \
      '<doc>\n\ta<baz>\n\t\tf<b>o</b>ob<b>a</b>r\n\t</baz>a\n</doc>'

    # --- text extraction ---
    assert str(parse("<doc />")) == ""
    assert str(parse("<doc>I <b>love</b> you.</doc>")) == "I love you."
    assert parse("<doc>\nmom\nwow\n</doc>")[0].strip() == "mom\nwow"
    assert str(parse('<bing>  <bang> <bong>center</bong> </bang>  </bing>')) == "center"
    assert str(parse('<doc>\xcf\x80</doc>')) == '\xcf\x80'

    # --- attribute-style and item-style access on a hand-built element ---
    d = Element('foo', attrs={'foo':'bar'}, children=['hit with a', Element('bar'), Element('bar')])

    # Underscore-prefixed names are never child lookups.
    try:
        d._doesnotexist
    except AttributeError:
        pass
    else:
        raise AssertionError("ExpectedError: but found success. Damn.")
    assert d.bar._name == 'bar'
    try:
        d.doesnotexist
    except AttributeError:
        pass
    else:
        raise AssertionError("ExpectedError: but found success. Damn.")
    assert hasattr(d, 'bar') == True

    assert d('foo') == 'bar'
    d(silly='yes')
    assert d('silly') == 'yes'
    assert d() == d._attrs

    assert d[0] == 'hit with a'
    d[0] = 'ice cream'
    assert d[0] == 'ice cream'
    del d[0]
    assert d[0]._name == "bar"
    assert len(d[:]) == len(d._dir)
    assert len(d[1:]) == len(d._dir) - 1
    assert len(d['bar':]) == 2
    d['bar':] = 'baz'
    assert len(d['bar':]) == 3
    assert d['bar']._name == 'bar'

    # --- namespaces ---
    d = Element('foo')
    doc = Namespace("http://example.org/bar")
    bbc = Namespace("http://example.org/bbc")
    dc = Namespace("http://purl.org/dc/elements/1.1/")
    d = parse("""<doc version="2.7182818284590451"
      xmlns="http://example.org/bar"
      xmlns:dc="http://purl.org/dc/elements/1.1/"
      xmlns:bbc="http://example.org/bbc">
        <author>John Polk and John Palfrey</author>
        <dc:creator>John Polk</dc:creator>
        <dc:creator>John Palfrey</dc:creator>
        <bbc:show bbc:station="4">Buffy</bbc:show>
    </doc>""")

    assert repr(d) == '<doc version="2.7182818284590451">...</doc>'
    assert d.__repr__(1) == '<doc xmlns:bbc="http://example.org/bbc" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns="http://example.org/bar" version="2.7182818284590451"><author>John Polk and John Palfrey</author><dc:creator>John Polk</dc:creator><dc:creator>John Palfrey</dc:creator><bbc:show bbc:station="4">Buffy</bbc:show></doc>'
    assert d.__repr__(1,1) == '<doc xmlns:bbc="http://example.org/bbc" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns="http://example.org/bar" version="2.7182818284590451">\n\t<author>John Polk and John Palfrey</author>\n\t<dc:creator>John Polk</dc:creator>\n\t<dc:creator>John Palfrey</dc:creator>\n\t<bbc:show bbc:station="4">Buffy</bbc:show>\n</doc>'

    assert repr(parse("<doc xml:lang='en' />")) == '<doc xml:lang="en"></doc>'

    assert str(d.author) == str(d['author']) == "John Polk and John Palfrey"
    assert d.author._name == doc.author
    assert str(d[dc.creator]) == "John Polk"
    assert d[dc.creator]._name == dc.creator
    assert str(d[dc.creator:][1]) == "John Palfrey"
    d[dc.creator] = "Me!!!"
    assert str(d[dc.creator]) == "Me!!!"
    assert len(d[dc.creator:]) == 1
    d[dc.creator:] = "You!!!"
    assert len(d[dc.creator:]) == 2

    assert d[bbc.show](bbc.station) == "4"
    d[bbc.show](bbc.station, "5")
    assert d[bbc.show](bbc.station) == "5"

    # --- escaping of text content and attribute values ---
    e = Element('e')
    e.c = '<img src="foo">'
    assert e.__repr__(1) == '<e><c>&lt;img src="foo"></c></e>'
    e.c = '2 > 4'
    assert e.__repr__(1) == '<e><c>2 > 4</c></e>'
    e.c = 'CDATA sections are <em>closed</em> with ]]>.'
    assert e.__repr__(1) == '<e><c>CDATA sections are &lt;em>closed&lt;/em> with ]]&gt;.</c></e>'
    e.c = parse('<div xmlns="http://www.w3.org/1999/xhtml">i<br /><span></span>love<br />you</div>')
    assert e.__repr__(1) == '<e><c><div xmlns="http://www.w3.org/1999/xhtml">i<br /><span></span>love<br />you</div></c></e>'

    e = Element('e')
    e('c', 'that "sucks"')
    assert e.__repr__(1) == '<e c="that &quot;sucks&quot;"></e>'

    assert quote("]]>") == "]]&gt;"
    assert quote('< dkdkdsd dkd sksdksdfsd fsdfdsf]]> kfdfkg >') == '&lt; dkdkdsd dkd sksdksdfsd fsdfdsf]]&gt; kfdfkg >'

    assert parse('<x a="&lt;"></x>').__repr__(1) == '<x a="&lt;"></x>'
    assert parse('<a xmlns="http://a"><b xmlns="http://b"/></a>').__repr__(1) == '<a xmlns="http://a"><b xmlns="http://b"></b></a>'
|
|
311
|
|
# Executing the file as a script runs the built-in self-test.
if __name__ == '__main__':
    unittest()
|