src/FukuiNoNamari/HTMLParserEx.py

   1 # HTMLParserEx.py
   2 # This is a derivative work of HTMLParser.py from Python.
   3 # under the terms of Python Software Foundation License.
   4
   5 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
   6 # All rights reserved.
   7 #
   8 # Copyright (c) 2000 BeOpen.com.
   9 # All rights reserved.
  10 #
  11 # Copyright (c) 1995-2001 Corporation for National Research Initiatives.
  12 # All rights reserved.
  13 #
  14 # Copyright (c) 1991-1995 Stichting Mathematisch Centrum.
  15 # All rights reserved.
  16
  17
  18 # ChangeLog
  19 #
  20 # feed is always complete.
  21 # character and entity references must be strict.
  22 # support handling a uri (starting with http://).
  23 #
  24 # Copyright (C) 2006 by Aiwota Programmer
  25 # aiwotaprog@tetteke.tk
  26
  27 from HTMLParser import starttagopen
  28 import HTMLParser
  29 import re
  30
  31 interesting_normal = re.compile("[<&h]")
  32
  33 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+);')
  34 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*);')
  35
  36 urireg = re.compile("http://[a-zA-z0-9/.&=%\-?~]*")
  37
  38
  39 class HTMLParserEx(HTMLParser.HTMLParser):
  40
  41     def reset(self):
  42         HTMLParser.HTMLParser.reset(self)
  43         self.interesting = interesting_normal
  44
  45     def clear_cdata_mode(self):
  46         HTMLParser.HTMLParser.clear_cdata_mode(self)
  47         self.interesting = interesting_normal
  48
  49     # Internal -- handle data as far as reasonable.  May leave state
  50     # and data to be processed by a subsequent call.  If 'end' is
  51     # true, force handling all data as if followed by EOF marker.
  52     def goahead(self, end):
  53         rawdata = self.rawdata
  54         i = 0
  55         n = len(rawdata)
  56         while i < n:
  57             match = self.interesting.search(rawdata, i) # < or & or h
  58             if match:
  59                 j = match.start()
  60             else:
  61                 j = n
  62             if i < j: self.handle_data(rawdata[i:j])
  63             i = self.updatepos(i, j)
  64             if i == n: break
  65             startswith = rawdata.startswith
  66             if startswith('<', i):
  67                 if starttagopen.match(rawdata, i): # < + letter
  68                     k = self.parse_starttag(i)
  69                 elif startswith("</", i):
  70                     k = self.parse_endtag(i)
  71                 elif startswith("<!--", i):
  72                     k = self.parse_comment(i)
  73                 elif startswith("<?", i):
  74                     k = self.parse_pi(i)
  75                 elif startswith("<!", i):
  76                     k = self.parse_declaration(i)
  77
  78                 if k < 0: # not terminated, out simply.
  79                     self.handle_data("<")
  80                     k = i + 1
  81
  82                 i = self.updatepos(i, k)
  83             elif startswith("&#", i):
  84
  85                 match = charref.match(rawdata, i)
  86                 if match:
  87                     name = match.group()[2:-1]
  88                     self.handle_charref(name)
  89                     k = match.end()
  90                     i = self.updatepos(i, k)
  91                 else:
  92                     self.handle_data("&#")
  93                     i = self.updatepos(i, i + 2)
  94
  95             elif startswith('&', i):
  96
  97                 match = entityref.match(rawdata, i)
  98                 if match:
  99                     name = match.group(1)
 100                     self.handle_entityref(name)
 101                     k = match.end()
 102                     i = self.updatepos(i, k)
 103                 else:
 104                     self.handle_data("&")
 105                     i = self.updatepos(i, i + 1)
 106
 107             elif startswith("h", i):
 108
 109                 match = urireg.match(rawdata, i)
 110                 if match:
 111                     k = match.end()
 112                     attr = [("href", match.group())]
 113                     self.handle_starttag("a", attr)
 114                     self.handle_data(match.group())
 115                     self.handle_endtag("a")
 116                     i = self.updatepos(i, k)
 117                 else:
 118                     self.handle_data("h")
 119                     i = self.updatepos(i, i + 1)
 120
 121             else:
 122                 assert 0, "interesting.search() lied"
 123         # end while
 124         if i < n: # always complete
 125             self.handle_data(rawdata[i:n])
 126             i = self.updatepos(i, n)
 127         self.rawdata = rawdata[i:]