view pyikb/Parser.py @ 108:d3bece9b06b2

fixed the counting *again*
author "Rex Tsai <chihchun@kalug.linux.org.tw>"
date Wed, 29 Oct 2008 15:56:04 +0800
parents 1c42ae140ad3
children
line wrap: on
line source

#!/usr/bin/python
# -*- coding: utf-8 -*-
import re,string
from sgmllib import SGMLParser  

class ContentParser(SGMLParser):
    """Collect anchors and classed ``<li>`` text while parsing markup.

    Results are exposed as instance attributes:

    * ``anchorlist`` -- one ``{'link': ..., 'title': ...}`` dict per closed
      ``<a>`` element, in document order.
    * ``liattr`` -- maps the ``class`` value of ``<li>`` elements to the list
      of space-separated tokens found in their text.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Anchor currently being built; replaced by a fresh dict in end_a().
        self.anchor = {'link': '', 'title': ''}
        self.anchorlist = []
        self.liattr = {}
        # Class of the most recently opened <li>; set by start_li().
        self.liattcl = ''
        # Stack of the tracked elements we are inside.  'site' is a bottom
        # marker so that inside_elements[-1] has a value before any tag opens.
        self.inside_elements = ['site']
        # Line breaks and tabs are turned into spaces before tokenising.
        self.pat = re.compile('\r|\t|\n')

    def start_a(self, attributes):
        """For each anchor tag, pay attention to the href and title attributes."""
        href, title = '', ''
        for name, value in attributes:
            key = name.lower()
            if key == 'href':
                href = value
            if key == 'title':
                title = value
        self.anchor['link'] = href
        self.anchor['title'] = title
        self.inside_elements.append('anchor')

    def end_a(self):
        """Store the finished anchor and start a fresh one."""
        self.anchorlist.append(self.anchor)
        # Rebind instead of clearing: the stored dict must stay untouched.
        self.anchor = {'link': '', 'title': ''}
        self.inside_elements.pop()

    def handle_data(self, text):
        """Route character data to the anchor or ``<li>`` being read.

        Only the innermost tracked element (top of ``inside_elements``)
        receives the text; data outside anchors and classed ``<li>`` elements
        is ignored.
        """
        current = self.inside_elements[-1]
        if current == 'anchor':
            # NOTE(review): every call overwrites the title, including the
            # value taken from the title attribute in start_a().  If link text
            # is delivered in several calls only the last piece is kept.
            self.anchor['title'] = text
        elif current == 'li':
            # str.split replaces the deprecated string.split(); splitting on a
            # literal space keeps empty tokens exactly as before.
            words = self.pat.sub(' ', text).split(' ')
            self.liattr[self.liattcl] = self.liattr.get(self.liattcl, []) + words

    def start_li(self, attributes):
        """Begin tracking a ``<li>``, but only when it has a class attribute."""
        self.liattcl = ''
        attrs = dict(attributes)
        if 'class' in attrs:
            self.liattcl = attrs['class']
            self.inside_elements.append('li')

    def end_li(self):
        """Stop tracking the ``<li>`` if one is on top of the stack."""
        # A <li> without a class was never pushed, so pop only when the top
        # of the stack really is an 'li' entry.
        if self.inside_elements[-1] == 'li':
            self.inside_elements.pop()