view pyikb/Parser.py @ 113:82eff1aaf0ec

billy3321 <billy3321 AT msn.com> 4HG: branch default Add chihchunloop to usebot.sh
author billy3321@f3svr.f3.csu.edu.tw.f3.csu.edu.tw
date Thu, 30 Oct 2008 12:07:13 +0800
parents 1c42ae140ad3
children
line wrap: on
line source

#!/usr/bin/python
# -*- coding: utf-8 -*-
import re,string
from sgmllib import SGMLParser  

class ContentParser(SGMLParser):
    """Collect anchors and class-tagged ``<li>`` text from parsed markup.

    The parser relies on the ``start_<tag>`` / ``end_<tag>`` / ``handle_data``
    callbacks of ``SGMLParser`` and accumulates two results:

    * ``anchorlist`` -- one ``{'link': ..., 'title': ...}`` dict per
      ``end_a`` call, in call order.
    * ``liattr`` -- maps the ``class`` attribute value of an ``<li>`` to the
      list of tokens collected from the text seen while that ``<li>`` marker
      is on top of ``inside_elements``.

    ``inside_elements`` is a stack of context markers (``'site'`` at the
    bottom, then ``'anchor'`` / ``'li'``) used by ``handle_data`` to decide
    where incoming text belongs.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Anchor currently being assembled; replaced by a fresh dict in end_a.
        self.anchor = {'link': '', 'title': ''}
        # Completed anchors.
        self.anchorlist = []
        # class-attribute value -> list of text tokens.
        self.liattr = {}
        # Class of the most recently opened <li> ('' when it had none).
        # Initialised here so the attribute exists before any start_li call.
        self.liattcl = ''
        # Context stack; 'site' is the sentinel bottom entry.
        self.inside_elements = ['site']
        # Matches the whitespace control characters that handle_data turns
        # into plain spaces before tokenising <li> text.
        self.pat = re.compile('\r|\t|\n')

    def start_a(self, attributes):
        """For each anchor tag, pay attention to the href and title attributes.

        ``attributes`` is an iterable of ``(name, value)`` pairs.  Attribute
        names are compared case-insensitively; if a name occurs more than
        once the last value wins.  Missing attributes default to ``''``.
        """
        href, title = '', ''
        for name, value in attributes:
            lowered = name.lower()
            if lowered == 'href':
                href = value
            elif lowered == 'title':
                title = value
        self.anchor['link'] = href
        self.anchor['title'] = title
        self.inside_elements.append('anchor')

    def end_a(self):
        """Store the current anchor and start a fresh one."""
        self.anchorlist.append(self.anchor)
        # A new dict is required: the one just appended must not be mutated
        # by the next start_a call.
        self.anchor = {'link': '', 'title': ''}
        self.inside_elements.pop()

    def handle_data(self, text):
        """Route a chunk of character data according to the current context.

        * Inside an anchor, the text overwrites the anchor's ``'title'``
          (including a title taken from the ``title`` attribute, and any
          earlier text chunk of the same anchor).
        * Inside a class-tagged ``<li>``, CR/TAB/LF are replaced by spaces,
          the text is split on single spaces (so runs of whitespace yield
          empty-string tokens) and the tokens are appended to
          ``liattr[self.liattcl]``.
        * In any other context the text is ignored.
        """
        context = self.inside_elements[-1]
        if context == 'anchor':
            self.anchor['title'] = text
        elif context == 'li':
            tokens = self.pat.sub(' ', text).split(" ")
            self.liattr.setdefault(self.liattcl, []).extend(tokens)

    def start_li(self, attributes):
        """Open an ``<li>``; only items with a ``class`` attribute are tracked.

        ``liattcl`` is always reset to ``''`` first.  When a ``class``
        attribute is present its value becomes ``liattcl`` and an ``'li'``
        marker is pushed on ``inside_elements``.
        """
        # NOTE(review): a class-less <li> nested inside a classed one resets
        # liattcl without pushing a marker, so its text is filed under '' and
        # its end_li pops the outer marker.  Behaviour kept as-is; confirm
        # whether the parsed pages ever nest list items.
        self.liattcl = ''
        attrs = dict(attributes)
        if 'class' in attrs:
            self.liattcl = attrs['class']
            self.inside_elements.append('li')

    def end_li(self):
        """Close an ``<li>``, popping its marker if one is on top of the stack."""
        if self.inside_elements[-1] == 'li':
            self.inside_elements.pop()