Mercurial > eagle-eye
comparison pyikb/Parser.py @ 68:4ba1e981716d
merged kevint and hychen's work.
author | "Rex Tsai <chihchun@kalug.linux.org.tw>" |
---|---|
date | Wed, 22 Oct 2008 06:24:39 +0800 |
parents | 1c42ae140ad3 |
children |
comparison
equal
deleted
inserted
replaced
67:6eccb3a95df5 | 68:4ba1e981716d |
---|---|
1 #!/usr/bin/python | |
2 # -*- coding: utf-8 -*- | |
3 import re,string | |
4 from sgmllib import SGMLParser | |
5 | |
class ContentParser(SGMLParser):
    """SGML/HTML parser that collects anchors and the text of ``<li>`` items.

    After a document has been fed to the parser:

    * ``anchorlist`` holds one ``{'link': ..., 'title': ...}`` dict for every
      closed ``<a>`` element, in document order.
    * ``liattr`` maps the ``class`` attribute of ``<li>`` elements (``''``
      when the attribute is absent) to the list of space-separated text
      pieces seen while such an item was the innermost tracked element.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Anchor currently being built; replaced by a fresh dict in end_a().
        self.anchor = {'link': '', 'title': ''}
        self.anchorlist = []
        # class value of an <li> -> accumulated list of text pieces.
        self.liattr = {}
        # class value of the most recently opened <li>.
        self.liattcl = ''
        # Stack of tracked open elements; 'site' is the bottom sentinel and
        # is never popped, so inside_elements[-1] is always valid.
        self.inside_elements = ['site']
        # Whitespace characters that are turned into a plain space in <li>
        # text before it is split.
        self.pat = re.compile('\r|\t|\n')

    def start_a(self, attributes):
        """For each anchor tag, pay attention to the href and title attributes.

        ``attributes`` is the list of ``(name, value)`` pairs supplied by the
        parser for the opening tag.
        """
        href, title = '', ''
        for name, value in attributes:
            lowered = name.lower()
            if lowered == 'href':
                href = value
            if lowered == 'title':
                title = value
        self.anchor['link'] = href
        self.anchor['title'] = title
        self.inside_elements.append('anchor')

    def end_a(self):
        """Store the finished anchor and leave the anchor context.

        A closing tag with no open anchor is ignored, so it can neither
        record an empty anchor nor pop the 'site' sentinel (which would make
        the next handle_data() call fail with IndexError).
        """
        if 'anchor' not in self.inside_elements:
            return
        self.anchorlist.append(self.anchor)
        # Start from a fresh dict so the stored anchor is not mutated later.
        self.anchor = {'link': '', 'title': ''}
        # Unwind to the innermost anchor marker; this also discards markers
        # of <li> elements that were opened inside the anchor but not closed.
        while self.inside_elements.pop() != 'anchor':
            pass

    def handle_data(self, text):
        """Route character data to the anchor or <li> currently open."""
        current = self.inside_elements[-1]
        if current == 'anchor':
            # NOTE(review): the anchor text replaces a title taken from the
            # title attribute, and when the text arrives in several chunks
            # only the last chunk is kept -- confirm this is intended.
            self.anchor['title'] = text
        elif current == 'li':
            words = self.pat.sub(' ', text).split(" ")
            self.liattr[self.liattcl] = self.liattr.get(self.liattcl, []) + words

    def start_li(self, attributes):
        """Remember the class of the opened <li> and enter the li context."""
        self.liattcl = dict(attributes).get('class', '')
        self.inside_elements.append('li')

    def end_li(self):
        """Leave the li context if an <li> is the innermost tracked element."""
        if self.inside_elements[-1] == 'li':
            self.inside_elements.pop()
51 |