63
|
1 #!/usr/bin/python
|
|
2 # -*- coding: utf-8 -*-
|
|
3 import re,string
|
|
4 from sgmllib import SGMLParser
|
|
5
|
|
6 class ContentParser(SGMLParser):
|
|
7 def __init__(self):
|
|
8 SGMLParser.__init__(self)
|
|
9 self.anchor = {'link':'', 'title':''}
|
|
10 self.anchorlist = []
|
|
11 self.liattr={}
|
|
12 self.inside_elements=['site']
|
|
13 self.pat=re.compile('\r|\t|\n')
|
|
14
|
|
15 def start_a(self, attributes):
|
|
16 """For each anchor tag, pay attention to the href and title attributes."""
|
|
17 href, title = '', ''
|
|
18 for name, value in attributes:
|
|
19 if name.lower() == 'href': href = value
|
|
20 if name.lower() == 'title': title = value
|
|
21 self.anchor['link'] = href
|
|
22 self.anchor['title'] = title
|
|
23 self.inside_elements.append('anchor')
|
|
24
|
|
25 def end_a(self):
|
|
26 self.anchorlist.append(self.anchor) # store the anchor in a list
|
|
27 self.anchor = {'link':'', 'title':''} # reset the dictionary,
|
|
28 self.inside_elements.pop()
|
|
29
|
|
30 def handle_data(self, text):
|
|
31 if self.inside_elements[-1]=='anchor':
|
|
32 self.anchor['title'] = text
|
|
33 if self.inside_elements[-1]=='li':
|
|
34 text=self.pat.sub(' ',text)
|
|
35 text=string.split(text," ")
|
|
36 if self.liattcl in self.liattr:
|
|
37 self.liattr[self.liattcl]=self.liattr[self.liattcl]+text
|
|
38 else:
|
|
39 self.liattr[self.liattcl]=text
|
|
40
|
|
41 def start_li(self,attributes):
|
|
42 self.liattcl=''
|
|
43 attrs = dict(attributes)
|
|
44 if attrs.has_key('class'):
|
|
45 self.liattcl=attrs['class']
|
|
46 self.inside_elements.append('li')
|
|
47
|
|
48 def end_li(self):
|
|
49 if self.inside_elements[-1]=='li':
|
|
50 self.inside_elements.pop()
|
|
51
|