src/FukuiNoNamari/barehtmlparser.py

   1 # Copyright (C) 2006 by Aiwota Programmer
   2 # aiwotaprog@tetteke.tk
   3 #
   4 # This program is free software; you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation; either version 2 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program; if not, write to the Free Software
  16 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 from HTMLParserEx import HTMLParserEx
  19 import htmlentitydefs
  20
  21
  22 class BareHTMLParser(HTMLParserEx):
  23     """Parses html by the minimal necessity
  24
  25     to_out_func format is:
  26     def some_func(untied_data, is_bold, href):
  27     where untied_data is non markuped string
  28     and is_bold is whether untied_data is bold or not
  29     and href is url anchor if exists
  30
  31     on_new_line format is:
  32     def on_new_line():
  33
  34     strip spaces at the head and end of line, but first line's head is unable.
  35     """
  36
  37     def __init__(self, to_out_func, on_new_line):
  38         HTMLParserEx.__init__(self)
  39         self.to_out_func = to_out_func
  40         self.on_new_line = on_new_line
  41         self.bold = False
  42         self.href = None
  43         self.buffer = ""
  44
  45     def reset_func(self, to_out_func):
  46         self.flush()
  47         self.to_out_func = to_out_func
  48
  49     def to_out(self, data):
  50         n = len(self.buffer)
  51         if n == 0:
  52             data = data.lstrip(" ")
  53         self.buffer = self.buffer + data
  54
  55     def flush(self):
  56         if self.buffer:
  57             self.to_out_func(self.buffer, self.bold, self.href)
  58             self.buffer = ""
  59
  60     def newline(self):
  61         self.buffer = self.buffer.rstrip(" ")
  62         self.flush()
  63         self.on_new_line()
  64
  65     # override
  66     # flush after closing
  67     def close(self):
  68         HTMLParserEx.close(self)
  69         self.flush()
  70
  71     # override handle_*
  72
  73     def handle_starttag(self, tag, attr):
  74         if tag == "b":
  75             self.flush()
  76             self.bold = True
  77         elif tag == "br":
  78             self.newline()
  79         elif tag == "a":
  80             self.flush()
  81             for item in attr:
  82                 if item[0] == "href":
  83                     self.href = item[1]
  84
  85     def handle_endtag(self, tag):
  86         if tag == "b":
  87             self.flush()
  88             self.bold = False
  89         elif tag == "a":
  90             self.flush()
  91             self.href = None
  92
  93     def handle_data(self, data):
  94         self.to_out(data)
  95
  96     def handle_charref(self, ref):
  97         data = None
  98         try:
  99             data = unichr(int(ref))
 100         except:
 101             data = "&#"+ref+";"
 102         self.to_out(data)
 103
 104     def handle_entityref(self, name):
 105         if name in htmlentitydefs.name2codepoint:
 106             codepoint = htmlentitydefs.name2codepoint[name]
 107             self.to_out(unichr(codepoint))
 108         else:
 109             self.to_out("&"+name+";")