comparison pyikb/Parser.py @ 68:4ba1e981716d

merged kevint and hychen's work.
author "Rex Tsai <chihchun@kalug.linux.org.tw>"
date Wed, 22 Oct 2008 06:24:39 +0800
parents 1c42ae140ad3
children
comparison
equal deleted inserted replaced
67:6eccb3a95df5 68:4ba1e981716d
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3 import re,string
4 from sgmllib import SGMLParser
5
class ContentParser(SGMLParser):
    """Collect anchors and per-class ``<li>`` text from the markup fed to it.

    Results are exposed as plain attributes:

    * ``anchorlist`` -- one ``{'link': ..., 'title': ...}`` dict per closed
      ``<a>`` element, in the order the elements were closed.
    * ``liattr`` -- maps the ``class`` attribute of a ``<li>`` ('' when the
      item has none) to the list of space-separated tokens taken from text
      seen while that item is the innermost tracked element.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Anchor currently being assembled; replaced by a fresh dict on </a>.
        self.anchor = {'link': '', 'title': ''}
        self.anchorlist = []
        self.liattr = {}
        # Class of the innermost open <li>; '' when no item is open or the
        # item carries no class attribute.
        self.liattcl = ''
        # Classes of every open <li>, innermost last, so the enclosing
        # item's class can be restored when a nested item closes.
        self._li_classes = []
        # Stack of tracked elements; 'site' is a bottom sentinel that keeps
        # inside_elements[-1] valid at all times.
        self.inside_elements = ['site']
        # Control characters that are replaced by spaces in <li> text.
        self.pat = re.compile('\r|\t|\n')

    def start_a(self, attributes):
        """Open an anchor, remembering its ``href`` and ``title`` attributes.

        ``attributes`` is an iterable of ``(name, value)`` pairs; names are
        compared case-insensitively.  Missing attributes default to ''.
        """
        href, title = '', ''
        for name, value in attributes:
            lowered = name.lower()
            if lowered == 'href':
                href = value
            elif lowered == 'title':
                title = value
        self.anchor['link'] = href
        self.anchor['title'] = title
        self.inside_elements.append('anchor')

    def end_a(self):
        """Store the finished anchor and start a fresh one.

        Does nothing unless an anchor is the current element, so a call
        without a matching ``start_a`` can never remove the 'site' sentinel
        (which would make the next ``handle_data`` fail on an empty stack).
        """
        if self.inside_elements[-1] != 'anchor':
            return
        self.anchorlist.append(self.anchor)
        # A new dict is required: the stored one now belongs to anchorlist.
        self.anchor = {'link': '', 'title': ''}
        self.inside_elements.pop()

    def handle_data(self, text):
        """Route character data to the innermost tracked element.

        Inside an anchor the text replaces the anchor's title (overriding
        any ``title`` attribute).  Inside a list item, CR/TAB/LF become
        spaces and the text is split on single spaces; the resulting tokens
        (empty strings included) are appended under the item's class.
        """
        current = self.inside_elements[-1]
        if current == 'anchor':
            self.anchor['title'] = text
        elif current == 'li':
            tokens = self.pat.sub(' ', text).split(' ')
            # Concatenate instead of extending so that a list already
            # stored in liattr is never mutated in place.
            self.liattr[self.liattcl] = (
                self.liattr.get(self.liattcl, []) + tokens)

    def start_li(self, attributes):
        """Open a list item and make its ``class`` ('' if absent) current."""
        self.liattcl = dict(attributes).get('class', '')
        self._li_classes.append(self.liattcl)
        self.inside_elements.append('li')

    def end_li(self):
        """Close the innermost list item, if one is the current element."""
        if self.inside_elements[-1] != 'li':
            return
        self.inside_elements.pop()
        if self._li_classes:
            self._li_classes.pop()
        # Restore the enclosing item's class so that text following a nested
        # <li> is filed under the outer item's key, not the inner one's.
        self.liattcl = self._li_classes[-1] if self._li_classes else ''
51