Mercurial > eagle-eye
view pyikb/Parser.py @ 160:7551342718b6
Refactor pyikriam with patterns.
- Use dyna_prog, a dynamic programming decorator, to cache city objects.
- Use fake_moz to emulate a Mozilla browser.
author | Thinker K.F. Li <thinker@branda.to> |
---|---|
date | Sat, 01 Nov 2008 21:29:51 +0800 |
parents | 1c42ae140ad3 |
children |
line wrap: on
line source
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""SGML/HTML scraper that collects anchors and ``<li>`` text from a page."""
import re
import string  # noqa: F401 -- unused below; kept so the module namespace is unchanged

try:
    from sgmllib import SGMLParser
except ImportError:
    # sgmllib was removed in Python 3.  Emulate the small part of its
    # interface that ContentParser relies on (start_<tag>/end_<tag>
    # dispatch driven by a stack of open tags) on top of html.parser.
    from html.parser import HTMLParser

    class SGMLParser(HTMLParser):
        """Minimal stand-in for ``sgmllib.SGMLParser`` on Python 3."""

        def reset(self):
            """Reset the parser state, including the stack of open tags."""
            HTMLParser.reset(self)
            self.stack = []

        def handle_starttag(self, tag, attrs):
            """Dispatch to ``start_<tag>(attributes)`` when it is defined."""
            handler = getattr(self, 'start_' + tag, None)
            if handler is None:
                return
            self.stack.append(tag)
            # sgmllib reports a value-less attribute with its own name as
            # the value, whereas html.parser reports None.
            handler([(name, name if value is None else value)
                     for name, value in attrs])

        def handle_endtag(self, tag):
            """Close ``tag`` and every tag opened after it, innermost first."""
            if tag not in self.stack:
                # Unbalanced end tag: no end_<tag> call.
                return
            while self.stack:
                open_tag = self.stack.pop()
                handler = getattr(self, 'end_' + open_tag, None)
                if handler is not None:
                    handler()
                if open_tag == tag:
                    break


class ContentParser(SGMLParser):
    """Collect ``<a>`` anchors and the text of ``<li>`` elements.

    After feeding markup to the parser the results are available as:

    * ``anchorlist`` -- list of ``{'link': ..., 'title': ...}`` dicts, one
      per closed ``<a>`` element, in document order.
    * ``liattr`` -- dict mapping the ``class`` attribute of ``<li>``
      elements ('' when the attribute is absent) to the list of
      space-separated text fragments found directly inside them.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.anchor = {'link': '', 'title': ''}  # anchor being assembled
        self.anchorlist = []
        self.liattr = {}
        # Class of the <li> currently being read; start_li overwrites it.
        self.liattcl = ''
        # Stack of the tracked elements we are inside of.  The 'site'
        # sentinel keeps the ``[-1]`` lookup in handle_data valid.
        self.inside_elements = ['site']
        self.pat = re.compile('\r|\t|\n')

    def start_a(self, attributes):
        """For each anchor tag, pay attention to the href and title attributes."""
        href, title = '', ''
        for name, value in attributes:
            lowered = name.lower()
            if lowered == 'href':
                href = value
            if lowered == 'title':
                title = value
        self.anchor['link'] = href
        self.anchor['title'] = title
        self.inside_elements.append('anchor')

    def end_a(self):
        """Store the finished anchor and start a fresh one."""
        # Guarded like end_li: a stray call must neither record an empty
        # anchor nor pop the 'site' sentinel (or an enclosing 'li').
        if self.inside_elements[-1] != 'anchor':
            return
        self.anchorlist.append(self.anchor)
        self.anchor = {'link': '', 'title': ''}
        self.inside_elements.pop()

    def handle_data(self, text):
        """Route character data to the innermost tracked element."""
        current = self.inside_elements[-1]
        if current == 'anchor':
            # The link text replaces the title attribute; when the text
            # arrives in several chunks the last chunk wins.
            self.anchor['title'] = text
        elif current == 'li':
            # Turn CR/TAB/LF into spaces, then split on single spaces
            # (consecutive spaces therefore yield empty fragments).
            words = self.pat.sub(' ', text).split(' ')
            self.liattr[self.liattcl] = self.liattr.get(self.liattcl, []) + words

    def start_li(self, attributes):
        """Remember the ``class`` of the ``<li>`` being entered."""
        attrs = dict(attributes)
        self.liattcl = attrs['class'] if 'class' in attrs else ''
        self.inside_elements.append('li')

    def end_li(self):
        """Leave the current ``<li>``, if one is the innermost element."""
        if self.inside_elements[-1] == 'li':
            self.inside_elements.pop()