2 # This is a derivative work of HTMLParser.py from Python.
3 # under the terms of Python Software Foundation License.
5 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
8 # Copyright (c) 2000 BeOpen.com.
11 # Copyright (c) 1995-2001 Corporation for National Research Initiatives.
12 # All rights reserved.
14 # Copyright (c) 1991-1995 Stichting Mathematisch Centrum.
15 # All rights reserved.
20 # feed is always complete.
21 # character and entity references must be strict.
22 # support handling a uri (starting with http://).
24 # Copyright (C) 2006 by Aiwota Programmer
25 # aiwotaprog@tetteke.tk
27 from HTMLParser import starttagopen
# Characters at which the scanner stops: '<' (markup), '&' (character or
# entity reference) and 'h' (possible start of an "http://" URI).
interesting_normal = re.compile("[<&h]")

# Strict reference syntax: the terminating ';' is mandatory in both patterns.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+);')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*);')

# Bare "http://" URI followed by any run of the listed URI characters.
# The character class previously used the range ``A-z``, which also admitted
# '[', '\\', ']', '^', '_' and '`', so a URI directly followed by a bracket
# or caret swallowed that character.  The range is now ``A-Z``; '_' is listed
# explicitly because it is a legal URI character that the old range accepted.
# A raw string keeps ``\-`` a regex escape rather than a string escape.
urireg = re.compile(r"http://[a-zA-Z0-9_/.&=%\-?~]*")
# HTMLParserEx -- subclass of HTMLParser.HTMLParser that installs
# interesting_normal as the scanning pattern and overrides goahead() so that
# text starting with "h" is also tested against urireg.
# NOTE(review): this listing carries residual line numbers, has lost its
# indentation, and skips some source lines (gaps in the numbering).  The
# comments below describe only the statements that are visible; anything
# that depends on an absent line is marked as an assumption.
39 class HTMLParserEx(HTMLParser.HTMLParser):
# Presumably the body of reset() (its "def" line is not in this listing --
# confirm): run the base-class reset, then select interesting_normal as the
# pattern used to find the next point of interest.
42 HTMLParser.HTMLParser.reset(self)
43 self.interesting = interesting_normal
# Run the base-class clear_cdata_mode(), then set self.interesting back to
# interesting_normal.
# NOTE(review): presumably the base method installs its own pattern, which
# is why it is overwritten afterwards -- confirm against the base class.
45 def clear_cdata_mode(self):
46 HTMLParser.HTMLParser.clear_cdata_mode(self)
47 self.interesting = interesting_normal
49 # Internal -- handle data as far as reasonable. May leave state
50 # and data to be processed by a subsequent call. If 'end' is
51 # true, force handling all data as if followed by EOF marker.
# NOTE(review): no statement visible in this listing reads 'end'; the
# "always complete" remark near the bottom suggests remaining input is
# flushed regardless -- confirm against the full source.
52 def goahead(self, end):
53 rawdata = self.rawdata
# The initialisation of i and n and the loop header are not in this listing
# (presumably i = 0, n = len(rawdata) and a loop while i < n -- confirm).
# Find the next '<', '&' or 'h' at or after position i.
57 match = self.interesting.search(rawdata, i) # < or & or h
# j is assigned on lines not shown (presumably the match start, or n when
# nothing matched).  Text in front of j is reported as plain data and the
# position is advanced to j through updatepos().
62 if i < j: self.handle_data(rawdata[i:j])
63 i = self.updatepos(i, j)
65 startswith = rawdata.startswith
# Markup: choose a parse_* method from the characters that follow '<'.
# Each call's result is stored in k and later passed to updatepos().
66 if startswith('<', i):
67 if starttagopen.match(rawdata, i): # < + letter
68 k = self.parse_starttag(i)
69 elif startswith("</", i):
70 k = self.parse_endtag(i)
71 elif startswith("<!--", i):
72 k = self.parse_comment(i)
# The statement handling "<?" is not in this listing.
73 elif startswith("<?", i):
75 elif startswith("<!", i):
76 k = self.parse_declaration(i)
# A negative k is treated as an unterminated construct; the statements that
# handle that case are not in this listing.
78 if k < 0: # not terminated, out simply.
82 i = self.updatepos(i, k)
# Numeric character reference: tested before the plain '&' branch because
# "&#" also starts with '&'.
83 elif startswith("&#", i):
85 match = charref.match(rawdata, i)
# name is the matched text without the leading "&#" and the trailing ";".
87 name = match.group()[2:-1]
88 self.handle_charref(name)
90 i = self.updatepos(i, k)
# Otherwise (presumably when charref did not match -- the condition is not
# shown) the two characters "&#" are reported as literal data and skipped.
92 self.handle_data("&#")
93 i = self.updatepos(i, i + 2)
# Named entity reference.
95 elif startswith('&', i):
97 match = entityref.match(rawdata, i)
# name is assigned on a line not shown (presumably match.group(1)).
100 self.handle_entityref(name)
102 i = self.updatepos(i, k)
# Otherwise (condition not shown) the '&' is reported as literal data and
# skipped.
104 self.handle_data("&")
105 i = self.updatepos(i, i + 1)
# Possible bare URI: the text at i is tested against urireg.
107 elif startswith("h", i):
109 match = urireg.match(rawdata, i)
# A matched URI is reported as a synthesised anchor: a start tag "a" whose
# href attribute is the URI, the URI itself as data, then the end tag "a".
112 attr = [("href", match.group())]
113 self.handle_starttag("a", attr)
114 self.handle_data(match.group())
115 self.handle_endtag("a")
116 i = self.updatepos(i, k)
# Otherwise (presumably when urireg did not match) the single 'h' is
# reported as data and skipped.
118 self.handle_data("h")
119 i = self.updatepos(i, i + 1)
# Presumably the final else of the chain: the search pattern stopped at a
# character that none of the branches above recognises.
# NOTE(review): assert statements are removed when Python runs with -O.
122 assert 0, "interesting.search() lied"
# Presumably after the loop: any input that is still unconsumed is reported
# as data, and self.rawdata keeps whatever lies beyond the final position.
124 if i < n: # always complete
125 self.handle_data(rawdata[i:n])
126 i = self.updatepos(i, n)
127 self.rawdata = rawdata[i:]