#!/usr/bin/env python
"""
SimpleStripper: VERY simple HTML stripper class.

Usage: SimpleStripper.strip(html)
Returns: a list of stripped paragraphs.

Markup is discarded and the remaining text is collected paragraph by
paragraph, where a paragraph is delimited by <p> tags.
"""

__version__ = '$Revision: 1.1 $'
__author__ = "Garth T Kidd "

try:
    from html.parser import HTMLParser  # Python 3 location
except ImportError:
    from HTMLParser import HTMLParser  # Python 2 location

# Separator inserted by concat(), keyed by
# (left piece ends with a letter, right piece starts with a letter).
_betweenMatrix = {
    (1, 0): '',   # no space between word and punctuation
    (1, 1): ' ',  # space between words
    (0, 1): '',   # no space between punctuation and word
    (0, 0): '',   # no space between punctuation
}

# Punctuation after which a following word is still set off by a space.
# Kept as a tuple: an empty tail must NOT match (it would in a str).
_SPACE_AFTER = ('.', ',', ')', ']', '}')


def concat(left, right):
    """Quasi-intelligent string concatenation.

    Joins ``left`` and ``right`` with a single space when ``right``
    starts with a letter and ``left`` ends either with a letter or with
    one of ``. , ) ] }``.  In every other case the two pieces are joined
    directly, with nothing in between.
    """
    # Slices rather than indexes, so empty strings yield '' instead of
    # raising IndexError.
    tail = left[-1:]
    head = right[0:1]
    if head.isalpha() and tail in _SPACE_AFTER:
        return left + ' ' + right
    # bool keys hash and compare equal to the 0/1 keys of the table.
    return left + _betweenMatrix[(tail.isalpha(), head.isalpha())] + right


def truncate(str, length):
    """Truncate a string at a particular length.

    Returns ``str`` unchanged when it is at most ``length`` characters
    long.  Otherwise returns its first ``length - 3`` characters followed
    by ``'...'``, i.e. a result of exactly ``length`` characters.

    When ``length`` is below 3 there is no room for the ellipsis, so the
    string is simply cut to ``length`` characters (a negative ``length``
    is treated as 0).
    """
    if len(str) <= length:
        return str
    if length < 3:
        # Without this guard the slice end (length - 3) goes negative and
        # the result comes out longer than requested.
        return str[:max(length, 0)]
    return str[:length - 3] + '...'


class StrippingParser(HTMLParser):
    """HTML parser that gathers the text of each paragraph.

    Feed it markup with feed(), then call close(); the collected
    paragraphs are available in ``paras``.
    """

    def __init__(self):
        self.paras = []  # finished paragraphs, in the order they ended
        self.inp = 0     # 1 while a <p> is open, 0 otherwise
        self.para = ''   # text gathered so far for the current paragraph
        HTMLParser.__init__(self)

    def handle_data(self, data):
        """Append a run of text to the current paragraph."""
        data = data.strip()
        if not data:
            return  # whitespace-only runs contribute nothing
        if self.para:
            self.para = concat(self.para, data)
        else:
            self.para = data

    def handle_starttag(self, tag, atts):
        """Open a paragraph on <p>; every other tag is ignored."""
        if tag in ('p', 'P'):
            if self.inp:
                # A new <p> before the previous one was closed ends the
                # previous paragraph.
                self.endPara()
            self.inp = 1

    def handle_endtag(self, tag):
        """Finish the current paragraph on </p>."""
        if tag in ('p', 'P'):
            self.endPara()

    def endPara(self):
        """Store the current paragraph, if one is open, and reset state."""
        if self.inp:
            self.paras.append(self.para)
            self.para = ''
            self.inp = 0

    def close(self):
        """Finish parsing and store any text still pending."""
        # Close the base parser first: it may still be holding back text
        # (for example after an unterminated '&' reference) that is only
        # delivered to handle_data() here.  Flushing the paragraph before
        # this call would drop that text from the result.
        HTMLParser.close(self)
        if self.para:
            self.inp = 1  # just in case: also keep text seen outside <p>
            self.endPara()


def strip(html):
    """Return the list of stripped paragraphs found in ``html``."""
    sp = StrippingParser()
    sp.feed(html)
    sp.close()
    return sp.paras


if __name__ == '__main__':
    import sys
    from pprint import pprint

    # NOTE(review): the markup inside these samples was missing from the
    # file as received; the <p>/<br> tags below are a reconstruction --
    # confirm against the original source.
    tests = [
        'This is a test.',
        '<p>This is a test.</p>',
        '<p>This<br>is a test.</p>',
        '<p>This is a test',
    ]
    line = '-' * 79
    for num, test in enumerate(tests, 1):
        print(line)
        # write() instead of print so the result stays on the same line.
        sys.stdout.write("Test #%d\n%s => " % (num, repr(test)))
        pprint(strip(test))