Mercurial > eagle-eye
diff pyikb/Parser.py @ 63:1c42ae140ad3
add Parser.py and lconf.py.
author | kevin@localhost.localdomain |
---|---|
date | Wed, 22 Oct 2008 05:40:57 +0800 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Collect anchors and class-keyed <li> text from an HTML document."""
import re
import string  # not used below any more; kept so the module namespace is unchanged

try:
    from sgmllib import SGMLParser
except ImportError:
    # sgmllib is not available on Python 3.  Provide the small part of its
    # interface this module relies on (start_<tag>/end_<tag> dispatch) on
    # top of html.parser so the module still imports and works there.
    from html.parser import HTMLParser

    class SGMLParser(HTMLParser):
        """Minimal stand-in for sgmllib.SGMLParser built on HTMLParser.

        Only the dispatch of start tags to ``start_<tag>(attributes)`` and
        end tags to ``end_<tag>()`` is provided; tags without a matching
        handler method are ignored.
        """

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, 'start_' + tag, None)
            if handler is not None:
                # HTMLParser reports a valueless attribute as None;
                # substitute the attribute name so handlers always
                # receive strings.
                handler([(name, name if value is None else value)
                         for name, value in attrs])

        def handle_endtag(self, tag):
            handler = getattr(self, 'end_' + tag, None)
            if handler is not None:
                handler()


class ContentParser(SGMLParser):
    """Parser that gathers <a> anchors and the text found inside <li> items.

    After feeding a document:

    * ``anchorlist`` is a list of ``{'link': ..., 'title': ...}`` dicts, one
      per closed ``<a>`` element.
    * ``liattr`` maps the ``class`` attribute of each ``<li>`` ('' when the
      item has none) to the list of space-separated pieces of its text.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.anchor = {'link': '', 'title': ''}   # anchor currently being built
        self.anchorlist = []                      # completed anchors
        self.liattr = {}                          # li class -> list of text pieces
        self.liattcl = ''                         # class of the <li> being read
        # Stack of the elements we are inside; 'site' is the bottom sentinel
        # so that inside_elements[-1] is always valid.
        self.inside_elements = ['site']
        # Line breaks and tabs are turned into spaces before splitting.
        self.pat = re.compile(r'[\r\t\n]')

    def start_a(self, attributes):
        """For each anchor tag, pay attention to the href and title attributes."""
        href, title = '', ''
        for name, value in attributes:
            key = name.lower()
            if key == 'href':
                href = value
            elif key == 'title':
                title = value
        self.anchor['link'] = href
        self.anchor['title'] = title
        self.inside_elements.append('anchor')

    def end_a(self):
        """Store the finished anchor and start a fresh one.

        A closing tag that does not match an open anchor is ignored, so a
        stray ``</a>`` can neither record an empty anchor nor pop another
        element (or the 'site' sentinel) off the stack.
        """
        if self.inside_elements[-1] != 'anchor':
            return
        self.anchorlist.append(self.anchor)        # store the anchor in a list
        self.anchor = {'link': '', 'title': ''}    # reset the dictionary
        self.inside_elements.pop()

    def handle_data(self, text):
        """Route character data to the anchor or list item being read."""
        current = self.inside_elements[-1]
        if current == 'anchor':
            # The link text replaces whatever the title attribute supplied.
            self.anchor['title'] = text
        elif current == 'li':
            pieces = self.pat.sub(' ', text).split(" ")
            if self.liattcl in self.liattr:
                self.liattr[self.liattcl] = self.liattr[self.liattcl] + pieces
            else:
                self.liattr[self.liattcl] = pieces

    def start_li(self, attributes):
        """Remember the class of the list item ('' if absent) and enter it."""
        attrs = dict(attributes)
        self.liattcl = attrs.get('class', '')
        self.inside_elements.append('li')

    def end_li(self):
        """Leave the list item, but only if it is the innermost open element."""
        if self.inside_elements[-1] == 'li':
            self.inside_elements.pop()