diff pyikb/Parser.py @ 63:1c42ae140ad3

add Parser.py and lconf.py.
author kevin@localhost.localdomain
date Wed, 22 Oct 2008 05:40:57 +0800
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyikb/Parser.py	Wed Oct 22 05:40:57 2008 +0800
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import re,string
+from sgmllib import SGMLParser  
+
+class ContentParser(SGMLParser):
+    def __init__(self):
+        SGMLParser.__init__(self)
+        self.anchor =  {'link':'', 'title':''}
+        self.anchorlist = []
+	self.liattr={}
+        self.inside_elements=['site']
+	self.pat=re.compile('\r|\t|\n')
+
+    def start_a(self, attributes):
+        """For each anchor tag, pay attention to the href and title attributes."""
+        href, title = '', ''
+        for name, value in attributes:
+            if name.lower() == 'href': href = value
+            if name.lower() == 'title': title = value
+        self.anchor['link'] = href
+        self.anchor['title'] = title
+        self.inside_elements.append('anchor')
+
+    def end_a(self):
+        self.anchorlist.append(self.anchor) # store the anchor in a list 
+        self.anchor = {'link':'', 'title':''}   # reset the dictionary,  
+        self.inside_elements.pop()
+
+    def handle_data(self, text):
+        if self.inside_elements[-1]=='anchor':
+            self.anchor['title'] = text
+	if self.inside_elements[-1]=='li':
+	    text=self.pat.sub(' ',text)
+	    text=string.split(text," ")
+	    if self.liattcl in self.liattr:
+	    	self.liattr[self.liattcl]=self.liattr[self.liattcl]+text
+	    else:
+	        self.liattr[self.liattcl]=text
+
+    def start_li(self,attributes):
+	self.liattcl=''
+        attrs = dict(attributes)
+	if attrs.has_key('class'):
+	     	self.liattcl=attrs['class']
+		self.inside_elements.append('li')
+
+    def end_li(self):
+	if self.inside_elements[-1]=='li':
+	    self.inside_elements.pop()
+