view pyikb/Parser.py @ 160:7551342718b6

Refactor pyikriam with patterns. - Use dyna_prog, a dynamic programming decorator, to cache city objects. - fake_moz to emulate a mozilla browser.
author Thinker K.F. Li <thinker@branda.to>
date Sat, 01 Nov 2008 21:29:51 +0800
parents 1c42ae140ad3
children
line wrap: on
line source

#!/usr/bin/python
# -*- coding: utf-8 -*-
import re,string
from sgmllib import SGMLParser  

class ContentParser(SGMLParser):
    """SGML parser collecting anchors and the text of classed ``<li>`` tags.

    Results accumulate on the instance while markup is parsed:

    anchorlist
        One ``{'link': href, 'title': text}`` dict per closed ``<a>``
        element.  ``title`` starts as the tag's ``title`` attribute and is
        overwritten by every text chunk seen while the anchor is the
        innermost tracked element.
    liattr
        Maps the ``class`` value of ``<li class="...">`` elements to the
        list of tokens obtained by splitting their text on single spaces.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.anchor = {'link': '', 'title': ''}
        self.anchorlist = []
        self.liattr = {}
        # Stack of tracked elements; 'site' is the bottom sentinel so that
        # ``inside_elements[-1]`` is always valid.
        self.inside_elements = ['site']
        self.pat = re.compile('\r|\t|\n')
        # Key under which text of the current <li> is stored in ``liattr``.
        self.liattcl = ''
        # One (pushed_marker, enclosing_liattcl) entry per open <li>, so
        # that end_li() undoes exactly what the matching start_li() did.
        self._li_stack = []

    def start_a(self, attributes):
        """For each anchor tag, pay attention to the href and title attributes."""
        href, title = '', ''
        for name, value in attributes:
            key = name.lower()
            if key == 'href':
                href = value
            elif key == 'title':
                title = value
        self.anchor['link'] = href
        self.anchor['title'] = title
        self.inside_elements.append('anchor')

    def end_a(self):
        """Store the finished anchor and start over with an empty one."""
        self.anchorlist.append(self.anchor)
        # Bind a new dict rather than clearing the old one: the old dict is
        # now owned by ``anchorlist``.
        self.anchor = {'link': '', 'title': ''}
        self.inside_elements.pop()

    def handle_data(self, text):
        """Attribute *text* to the innermost tracked element, if any.

        Inside an anchor the text replaces the anchor's title.  Inside a
        classed ``<li>`` the text has CR/TAB/LF replaced by spaces, is split
        on single spaces (so consecutive spaces yield empty tokens) and the
        tokens are appended to ``liattr[<current class>]``.
        """
        innermost = self.inside_elements[-1]
        if innermost == 'anchor':
            self.anchor['title'] = text
        elif innermost == 'li':
            tokens = self.pat.sub(' ', text).split(' ')
            if self.liattcl in self.liattr:
                self.liattr[self.liattcl] = self.liattr[self.liattcl] + tokens
            else:
                self.liattr[self.liattcl] = tokens

    def start_li(self, attributes):
        """Begin an ``<li>``; only items with a ``class`` push an 'li' marker."""
        attrs = dict(attributes)
        has_class = 'class' in attrs
        # Remember what to undo when this <li> is closed.
        self._li_stack.append((has_class, self.liattcl))
        self.liattcl = ''
        if has_class:
            self.liattcl = attrs['class']
            self.inside_elements.append('li')

    def end_li(self):
        """Close an ``<li>``, undoing what its ``start_li`` call did.

        The 'li' marker is popped only if this item pushed one, so a
        class-less ``<li>`` nested in a classed one no longer discards the
        outer item's marker, and the enclosing item's class key is restored.
        """
        if self._li_stack:
            pushed_marker, enclosing_class = self._li_stack.pop()
            self.liattcl = enclosing_class
            if not pushed_marker:
                return
        # Also reached when end_li() is called without a recorded
        # start_li(): fall back to the defensive top-of-stack check.
        if self.inside_elements[-1] == 'li':
            self.inside_elements.pop()