annotate pyikb/Parser.py @ 160:7551342718b6

Refactor pyikriam with patterns. - Use dyna_prog, a dynamic programming decorator, to cache city objects. - Use fake_moz to emulate a Mozilla browser.
author Thinker K.F. Li <thinker@branda.to>
date Sat, 01 Nov 2008 21:29:51 +0800
parents 1c42ae140ad3
children
rev   line source
63
1c42ae140ad3 add Parser.py and lconf.py.
kevin@localhost.localdomain
parents:
diff changeset
1 #!/usr/bin/python
1c42ae140ad3 add Parser.py and lconf.py.
kevin@localhost.localdomain
parents:
diff changeset
2 # -*- coding: utf-8 -*-
1c42ae140ad3 add Parser.py and lconf.py.
kevin@localhost.localdomain
parents:
diff changeset
3 import re,string
1c42ae140ad3 add Parser.py and lconf.py.
kevin@localhost.localdomain
parents:
diff changeset
4 from sgmllib import SGMLParser
1c42ae140ad3 add Parser.py and lconf.py.
kevin@localhost.localdomain
parents:
diff changeset
5
1c42ae140ad3 add Parser.py and lconf.py.
kevin@localhost.localdomain
parents:
diff changeset
class ContentParser(SGMLParser):
    """Collect anchors and classed ``<li>`` text from parsed markup.

    State built up by the tag handlers:

    * ``anchorlist`` -- one ``{'link': ..., 'title': ...}`` dict for every
      closed ``<a>`` element.
    * ``liattr`` -- maps the ``class`` value of an ``<li>`` ('' when the tag
      has no class) to the list of tokens obtained by splitting the text
      seen directly inside such items on single spaces.
    * ``inside_elements`` -- stack of the elements currently open, as
      tracked by this class ('anchor' and 'li' only).
    """

    def __init__(self):
        """Initialise the underlying SGML parser and the collected state."""
        SGMLParser.__init__(self)
        self.anchor = {'link': '', 'title': ''}
        self.anchorlist = []
        self.liattr = {}
        # Class of the <li> being read; start_li() overwrites it.  Created
        # here so the attribute exists before any <li> has been seen.
        self.liattcl = ''
        # 'site' is the bottom entry, so inside_elements[-1] is valid even
        # before any tag has been opened.
        self.inside_elements = ['site']
        # CR, TAB and LF are each turned into a single space by handle_data().
        self.pat = re.compile(r'[\r\t\n]')

    def start_a(self, attributes):
        """For each anchor tag, pay attention to the href and title attributes.

        ``attributes`` is a sequence of ``(name, value)`` pairs; names are
        compared case-insensitively and the last occurrence wins.
        """
        href, title = '', ''
        for name, value in attributes:
            key = name.lower()
            if key == 'href':
                href = value
            elif key == 'title':
                title = value
        self.anchor['link'] = href
        self.anchor['title'] = title
        self.inside_elements.append('anchor')

    def end_a(self):
        """Store the finished anchor and start a fresh, empty one."""
        self.anchorlist.append(self.anchor)
        # A new dict is required: the stored one must not be mutated later.
        self.anchor = {'link': '', 'title': ''}
        self.inside_elements.pop()

    def handle_data(self, text):
        """Route character data according to the innermost open element.

        Inside an anchor the text replaces the anchor's title.  Inside an
        ``<li>`` the text is normalised (CR/TAB/LF -> space), split on single
        spaces (empty tokens are kept) and appended to the token list of the
        current ``<li>`` class.  Any other data is ignored.
        """
        current = self.inside_elements[-1]
        if current == 'anchor':
            self.anchor['title'] = text
        elif current == 'li':
            tokens = self.pat.sub(' ', text).split(' ')
            # Concatenate rather than extend in place, so a list handed out
            # earlier is never modified behind the caller's back.
            self.liattr[self.liattcl] = (
                self.liattr.get(self.liattcl, []) + tokens)

    def start_li(self, attributes):
        """Remember the ``class`` of the opening ``<li>`` ('' if absent)."""
        self.liattcl = dict(attributes).get('class', '')
        self.inside_elements.append('li')

    def end_li(self):
        """Leave the ``<li>`` if it is the innermost tracked element."""
        if self.inside_elements[-1] == 'li':
            self.inside_elements.pop()
1c42ae140ad3 add Parser.py and lconf.py.
kevin@localhost.localdomain
parents:
diff changeset
51