src/FukuiNoNamari/barehtmlparser.py

   1 # Copyright (C) 2006 by Aiwota Programmer
   2 # aiwotaprog@tetteke.tk
   3 #
   4 # This program is free software; you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation; either version 2 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program; if not, write to the Free Software
  16 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 from HTMLParserEx import HTMLParserEx
  19 import htmlentitydefs
  20
  21
  22 class BareHTMLParser(HTMLParserEx):
  23     """Parses html by the minimal necessity
  24
  25     to_out_func format is:
  26     def some_func(untied_data, is_bold, href):
  27     where untied_data is non markuped string
  28     and is_bold is whether untied_data is bold or not
  29     and href is url anchor if exists
  30
  31     strip spaces at the head and end of line, but first line's head is unable.
  32     """
  33
  34     def __init__(self, to_out_func):
  35         HTMLParserEx.__init__(self)
  36         self.to_out_func = to_out_func
  37         self.bold = False
  38         self.href = None
  39         self.buffer = ""
  40
  41     def reset_func(self, to_out_func):
  42         self.flush()
  43         self.to_out_func = to_out_func
  44
  45     def to_out(self, data):
  46         n = len(self.buffer)
  47         if n > 0 and self.buffer[n-1] == "\n":
  48             data = data.lstrip(" ")
  49         self.buffer = self.buffer + data
  50
  51     def flush(self):
  52         if self.buffer:
  53             self.to_out_func(self.buffer, self.bold, self.href)
  54             self.buffer = ""
  55
  56     def newline(self):
  57         self.buffer = self.buffer.rstrip(" ")
  58         self.to_out("\n")
  59
  60     # override
  61     # flush after closing
  62     def close(self):
  63         HTMLParserEx.close(self)
  64         self.flush()
  65
  66     # override handle_*
  67
  68     def handle_starttag(self, tag, attr):
  69         if tag == "b":
  70             self.flush()
  71             self.bold = True
  72         elif tag == "br":
  73             self.newline()
  74         elif tag == "a":
  75             self.flush()
  76             for item in attr:
  77                 if item[0] == "href":
  78                     self.href = item[1]
  79
  80     def handle_endtag(self, tag):
  81         if tag == "b":
  82             self.flush()
  83             self.bold = False
  84         elif tag == "a":
  85             self.flush()
  86             self.href = None
  87
  88     def handle_data(self, data):
  89         self.to_out(data)
  90
  91     def handle_charref(self, ref):
  92         data = None
  93         try:
  94             data = unichr(int(ref))
  95         except:
  96             data = "&#"+ref+";"
  97         self.to_out(data)
  98
  99     def handle_entityref(self, name):
 100         if name in htmlentitydefs.name2codepoint:
 101             codepoint = htmlentitydefs.name2codepoint[name]
 102             self.to_out(unichr(codepoint))
 103         else:
 104             self.to_out("&"+name+";")