1 /* Parser.java -- HTML parser.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
39 package gnu.javax.swing.text.html.parser.support;
41 import gnu.javax.swing.text.html.parser.htmlAttributeSet;
42 import gnu.javax.swing.text.html.parser.htmlValidator;
43 import gnu.javax.swing.text.html.parser.support.low.Constants;
44 import gnu.javax.swing.text.html.parser.support.low.ParseException;
45 import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer;
46 import gnu.javax.swing.text.html.parser.support.low.Token;
47 import gnu.javax.swing.text.html.parser.support.low.node;
48 import gnu.javax.swing.text.html.parser.support.low.pattern;
50 import java.io.IOException;
51 import java.io.Reader;
53 import java.util.Comparator;
55 import java.util.TreeSet;
56 import java.util.Vector;
58 import javax.swing.text.ChangedCharSetException;
59 import javax.swing.text.SimpleAttributeSet;
60 import javax.swing.text.html.HTML;
61 import javax.swing.text.html.parser.AttributeList;
62 import javax.swing.text.html.parser.DTD;
63 import javax.swing.text.html.parser.DTDConstants;
64 import javax.swing.text.html.parser.Element;
65 import javax.swing.text.html.parser.Entity;
66 import javax.swing.text.html.parser.TagElement;
69 * <p>A simple error-tolerant HTML parser that uses a DTD document
70 * to access data on the possible tokens, arguments and syntax.</p>
71 * <p> The parser reads an HTML content from a Reader and calls various
72 * notifying methods (which should be overridden in a subclass)
73 * when tags or data are encountered.</p>
74 * <p>Some HTML elements need no opening or closing tags. The
75 * task of this parser is to invoke the tag handling methods also when
76 * the tags are not explicitly specified and must be supposed using
77 * information, stored in the DTD.
78 * For example, parsing the document
79 * <p><table><tr><td>a<td>b<td>c</tr> <br>
80 * will invoke the handling methods in exactly the same order
81 * (and with the same parameters) as if parsing the document: <br>
82 * <em><html><head></head><body><table><
83 * tbody></em><tr><td>a<em></td></em><td>b<em>
84 * </td></em><td>c<em></td></tr></em><
85 * <em>/tbody></table></body></html></em></p>
86 * (supposed tags are given in italics). The parser also supports
87 * obsolete elements of HTML syntax.<p>
89 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
92 extends ReaderTokenizer
93 implements DTDConstants
96 * The current html tag.
98 public Token hTag = new Token();
101 * The document template description that will be used to parse the documents.
106 * The value of this field determines whether or not the Parser will be
107 * strict in enforcing SGML compatibility. The default value is false,
108 * stating that the parser should do everything to parse and get at least
109 * some information even from the incorrectly written HTML input.
111 protected boolean strict;
114 * This field has positive values in preformatted tags.
116 protected int preformatted = 0;
119 * The set of the document tags. This field is used for supporting
122 private Set documentTags =
123 new TreeSet(new Comparator()
125 public int compare(Object a, Object b)
127 return ((String) a).compareToIgnoreCase((String) b);
133 * The buffer to collect the incremental output like text or comment.
135 private StringBuffer buffer = new StringBuffer();
138 * The buffer to store the document title.
140 private StringBuffer title = new StringBuffer();
148 * True means that the 'title' tag of this document has
149 * already been handled.
151 private boolean titleHandled;
154 * True means that the 'title' tag is currently open and all
155 * text is also added to the title buffer.
157 private boolean titleOpen;
160 * The attributes of the current HTML element.
161 * Package-private to avoid an accessor method.
163 htmlAttributeSet attributes =
164 htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
167 * The validator, controlling the forcible closing of the tags that
168 * (in accordance to dtd) are not allowed in the current context.
170 private htmlValidator validator;
173 * Provides the default values for parameters in the case when these
174 * values are defined in the DTD.
176 private parameterDefaulter defaulter;
179 * The text pre-processor for handling line ends and tabs.
181 private textPreProcessor textProcessor = new textPreProcessor();
184 * Creates a new Parser that uses the given
185 * {@link javax.swing.text.html.parser.DTD }. The only standard way
186 * to get an instance of DTD is to construct it manually, filling in
187 * all required fields.
188 * @param a_dtd The DTD to use. The parser behaviour after passing null
189 * as an argument is not documented and may vary between implementations.
191 public Parser(DTD a_dtd)
194 dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance();
198 defaulter = new parameterDefaulter(dtd);
201 new htmlValidator(dtd)
204 * Handles the error message. This method must be overridden to pass
205 * the message where required.
206 * @param msg The message text.
208 protected void s_error(String msg)
214 * The method is called when the tag validator decides to close the
215 * tag on its own initiative. After reaching the end of stream,
216 * the tag validator closes all unclosed elements that are required
217 * to have the end (closing) tag.
219 * @param tElement The tag being fictionally (forcibly) closed.
221 protected void handleSupposedEndTag(Element tElement)
223 // The tag is cloned as the original tElement is the
224 // element from the starting tag - may be accidentally used
226 TagElement tag = makeTag(tElement, true);
227 _handleEndTag_remaining(tag);
231 * The method is called when the tag validator decides to open
232 * the new tag on its own initiative. The tags, opened in this
233 * way, are HTML, HEAD and BODY. The attribute set is temporarily
234 * assigned to the empty one, the previous value is
235 * restored before return.
237 * @param tElement The tag being fictionally (forcibly) opened.
239 protected void handleSupposedStartTag(Element tElement)
241 TagElement tag = makeTag(tElement, true);
242 htmlAttributeSet were = attributes;
243 attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
244 _handleStartTag(tag);
251 * Get the attributes of the current tag.
252 * @return The attribute set, representing the attributes of the current tag.
254 public SimpleAttributeSet getAttributes()
256 return new SimpleAttributeSet(attributes);
260 * Invokes the error handler. The default method in this implementation
261 * delegates the call to handleError, also providing the current line.
263 public void error(String msg)
265 error(msg, getTokenAhead());
268 public void error(String msg, Token atToken)
271 handleError(atToken.where.beginLine,
272 msg + ": line " + atToken.where.beginLine +
273 ", absolute pos " + atToken.where.startPosition
280 * Invokes the error handler. The default method in this implementation
281 * delegates the call to error (parm1+": '"+parm2+"'").
283 public void error(String msg, String invalid)
285 error(msg + ": '" + invalid + "'");
289 * Invokes the error handler. The default method in this implementation
290 * delegates the call to error (parm1+" "+ parm2+" "+ parm3).
292 public void error(String parm1, String parm2, String parm3)
294 error(parm1 + " " + parm2 + " " + parm3);
298 * Invokes the error handler. The default method in this implementation
299 * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
301 public void error(String parm1, String parm2, String parm3, String parm4)
303 error(parm1 + " " + parm2 + " " + parm3 + " " + parm4);
306 public void flushAttributes()
311 * Parse the HTML text, calling various methods in response to the
312 * occurrence of the corresponding HTML constructions.
313 * @param reader The reader to read the source HTML from.
314 * @throws IOException If the reader throws one.
316 public synchronized void parse(Reader reader)
324 validator.closeAll();
326 catch (ParseException ex)
330 error("Unable to continue parsing the document", ex.getMessage());
332 Throwable cause = ex.getCause();
333 if (cause instanceof IOException)
334 throw (IOException) cause;
340 * Parses DTD markup declaration. Currently returns null without action.
342 * @throws IOException
344 public String parseDTDMarkup()
351 * Parse SGML insertion ( <! ... > ). When
352 * the SGML insertion is found, this method is called, passing
353 * SGML in the string buffer as a parameter. The default method
354 * returns false without action and can be overridden to
355 * implement user - defined SGML support.
357 * If you need more information about SGML insertions in HTML documents,
358 * the author suggests reading the SGML tutorial at
359 * <a href="http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html">http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html</a>.
360 * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>,
361 * Oxford University Press, 688 p, ISBN: 0198537379.
364 * @return true if this is a valid DTD markup declaration.
365 * @throws IOException
367 public boolean parseMarkupDeclarations(StringBuffer strBuff)
374 * Get the first line of the last parsed token.
376 protected int getCurrentLine()
378 return hTag.where.beginLine;
382 * Read parseable character data, add to buffer.
383 * @param clearBuffer If true, buffer is filled by CDATA section,
384 * otherwise the section is appended to the existing content of the
387 * @throws ParseException
389 protected void CDATA(boolean clearBuffer)
390 throws ParseException
392 Token start = hTag = getTokenAhead();
397 // Handle expected EOF.
398 if (start.kind == EOF)
407 error("unexpected eof", t);
410 else if (t.kind == BEGIN)
412 else if (t.kind == Constants.ENTITY)
414 resolveAndAppendEntity(t);
423 hTag = new Token(start, getTokenAhead(0));
424 if (buffer.length() != 0)
429 * Process Comment. This method skips till --> without
430 * taking SGML constructs into consideration. The supported SGML
431 * constructs are handled separately.
433 protected void Comment()
434 throws ParseException
438 Token start = hTag = mustBe(BEGIN);
453 handleEOFInComment();
457 else if (COMMENT_END.matches(this))
464 else if (COMMENT_TRIPLEDASH_END.matches(this))
467 t = mustBe(NUMTOKEN);
468 if (t.getImage().equals("-"))
482 /* The lllll-- can match as NUMTOKEN */
483 if ((t.getImage().endsWith("--")) &&
485 getTokenAhead(1).kind == END ||
486 (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END)
490 buffer.append(t.getImage().substring(0, t.getImage().length() - 2));
492 /* Skip the closing > that we have already checked. */
493 last = mustBe(t.kind);
500 hTag = new Token(start, last);
502 // Consume any whitespace immediately following a comment.
508 * Read a script. The text, returned without any changes,
509 * is terminated only by the closing tag SCRIPT.
511 protected void Script()
512 throws ParseException
516 Token start = hTag = mustBe(BEGIN);
519 name = mustBe(SCRIPT);
523 restOfTag(false, name, start);
527 while (!SCRIPT_CLOSE.matches(this))
529 append(getNextToken());
532 consume(SCRIPT_CLOSE);
537 _handleEndTag(makeTagElement(name.getImage(), false));
541 * Process SGML insertion that is not a comment.
543 protected void Sgml()
544 throws ParseException
546 if (COMMENT_OPEN.matches(this))
548 else // skip till ">"
550 Token start = hTag = mustBe(BEGIN);
559 if (t.kind == Constants.ENTITY)
561 resolveAndAppendEntity(t);
563 else if (t.kind == EOF)
565 error("unexpected eof", t);
568 else if (t.kind == END)
576 parseMarkupDeclarations(buffer);
578 catch (IOException ex)
580 error("Unable to parse SGML insertion: '" + buffer + "'",
585 // Consume any whitespace that follows the Sgml insertion.
590 * Read a style definition. The text, returned without any changes,
591 * is terminated only by the closing tag STYLE.
593 protected void Style()
594 throws ParseException
598 Token start = hTag = mustBe(BEGIN);
601 name = mustBe(STYLE);
605 restOfTag(false, name, start);
609 while (!STYLE_CLOSE.matches(this))
611 append(getNextToken());
614 consume(STYLE_CLOSE);
619 _handleEndTag(makeTagElement(name.getImage(), false));
626 throws ParseException
630 boolean closing = false;
632 Token start = hTag = mustBe(BEGIN);
635 name = getNextToken();
638 if (name.kind == SLASH)
641 name = getNextToken();
644 restOfTag(closing, name, start);
648 * A hook for operations preceding the call to handleText.
649 * Handle text in a string buffer.
650 * In non-preformatted mode, all line breaks immediately following the
651 * start tag and immediately before an end tag are discarded,
652 * \r, \n and \t are replaced by spaces, multiple spaces are replaced
653 * by a single one and the result is moved into an array,
654 * passing it to handleText().
656 protected void _handleText()
660 if (preformatted > 0)
661 text = textProcessor.preprocessPreformatted(buffer);
663 text = textProcessor.preprocess(buffer);
665 if (text != null && text.length > 0
666 // According to the specs we need to discard whitespace immediately
667 // before a closing tag.
668 && (text.length > 1 || text[0] != ' ' || ! TAG_CLOSE.matches(this)))
670 TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
671 attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
672 _handleEmptyTag(pcdata);
681 * Add the image of this token to the buffer.
682 * @param t A token to append.
684 protected final void append(Token t)
691 * Consume pattern that must match.
692 * @param p A pattern to consume.
694 protected final void consume(pattern p)
697 for (int i = 0; i < p.nodes.length; i++)
708 * The method is called when the HTML end (closing) tag is found or if
709 * the parser concludes that the one should be present in the
710 * current position. The method is called immediately
711 * before calling the handleEndTag().
712 * @param omitted True if the tag is not actually present in the document,
713 * but is supposed by the parser (like </html> at the end of the
716 protected void endTag(boolean omitted)
721 * Handle HTML comment. The default method returns without action.
724 protected void handleComment(char[] comment)
729 * This is additionally called when the HTML content terminates
730 * without closing the HTML comment. This can only happen if the
731 * HTML document contains errors (for example, the closing --> is
734 protected void handleEOFInComment()
736 error("Unclosed comment");
740 * Handle the tag with no content, like <br>. The method is
741 * called for the elements that, in accordance with the current DTD,
742 * have empty content.
743 * @param tag The tag being handled.
744 * @throws javax.swing.text.ChangedCharSetException
746 protected void handleEmptyTag(TagElement tag)
747 throws javax.swing.text.ChangedCharSetException
752 * The method is called when the HTML closing tag (like </table>)
753 * is found or if the parser concludes that the one should be present
754 * in the current position.
757 protected void handleEndTag(TagElement tag)
761 /* Handle error that has occurred in the given line. */
762 protected void handleError(int line, String message)
767 * The method is called when the HTML opening tag (like <table>)
768 * is found or if the parser concludes that the one should be present
769 * in the current position.
772 protected void handleStartTag(TagElement tag)
777 * Handle the text section.
778 * <p> For non-preformatted section, the parser replaces
779 * \t, \r and \n by spaces and then multiple spaces
780 * by a single space. Additionally, all whitespace around
783 * <p> For pre-formatted text (inside TEXTAREA and PRE), the parser preserves
784 * all tabs and spaces, but removes <b>one</b> bounding \r, \n or \r\n,
785 * if it is present. Additionally, it replaces each occurrence of \r or \r\n
786 * by a single \n.</p>
788 * @param text A section text.
790 protected void handleText(char[] text)
795 * Handle HTML <title> tag. This method is invoked when
796 * both title starting and closing tags are already behind.
797 * The passed argument contains the concatenation of all
798 * title text sections.
799 * @param title The title text.
801 protected void handleTitle(char[] title)
806 * Constructs the tag from the given element. In this implementation,
807 * this is defined, but never called.
810 protected TagElement makeTag(Element element)
812 return makeTag(element, false);
816 * Constructs the tag from the given element.
817 * @param element the tag base {@link javax.swing.text.html.parser.Element}
818 * @param isSupposed true if the tag is not actually present in the
819 * html input, but the parser supposes that it should occur in
820 * the current location.
823 protected TagElement makeTag(Element element, boolean isSupposed)
825 return new TagElement(element, isSupposed);
829 * This is called when the tag, representing the given element,
830 * occurs for the first time in the document.
833 protected void markFirstTime(Element element)
838 * Consume the token that was checked before and hence MUST be present.
839 * @param kind The kind of token to consume.
841 protected Token mustBe(int kind)
843 if (getTokenAhead().kind == kind)
844 return getNextToken();
849 ei = " ('" + (char) kind + "') ";
850 throw new AssertionError("The token of kind " + kind + ei +
857 * Handle attribute without value. The default method uses
858 * the only allowed attribute value from DTD.
859 * If the attribute is unknown or allows several values,
860 * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with
861 * this value is added to the attribute set.
862 * @param element The name of element.
863 * @param attribute The name of attribute without value.
865 protected void noValueAttribute(String element, String attribute)
867 Object value = HTML.NULL_ATTRIBUTE_VALUE;
869 Element e = dtd.elementHash.get(element.toLowerCase());
872 AttributeList attr = e.getAttribute(attribute);
875 Vector values = attr.values;
876 if (values != null && values.size() == 1)
877 value = values.get(0);
880 attributes.addAttribute(attribute, value);
884 * Consume the optional token, if present.
885 * @param kind The kind of token to consume.
887 protected Token optional(int kind)
889 if (getTokenAhead().kind == kind)
890 return getNextToken();
895 /** Parse the html document. */
896 protected void parseDocument()
897 throws ParseException
899 // Read up any initial whitespace.
901 while (getTokenAhead().kind != EOF)
904 if (TAG.matches(this))
906 else if (COMMENT_OPEN.matches(this))
908 else if (STYLE_OPEN.matches(this))
910 else if (SCRIPT_OPEN.matches(this))
912 else if (SGML.matches(this))
917 // Surely HTML error, treat as a text.
920 Token wrong = getNextToken();
921 error("unexpected '" + wrong.getImage() + "'", wrong);
923 buffer.append(wrong.getImage());
930 * Read the element attributes, adding them into attribute set.
931 * @param element The element name (needed to access attribute
932 * information in dtd).
934 protected void readAttributes(String element)
941 attributes = new htmlAttributeSet();
946 while (getTokenAhead().kind == NUMTOKEN)
948 name = getNextToken();
951 next = getTokenAhead();
957 next = getNextToken();
963 // read "quoted" attribute.
965 readTillTokenE(QUOT);
966 attrValue = buffer.toString();
971 // read 'quoted' attribute.
974 attrValue = buffer.toString();
977 // read unquoted attribute.
982 // Check maybe the opening quote is missing.
983 next = getTokenAhead();
984 if (bQUOTING.get(next.kind))
987 error("The value without opening quote is closed with '"
988 + next.getImage() + "'");
989 attrValue = value.getImage();
991 else if (next.kind == SLASH || next.kind == OTHER)
992 // The slash and other characters (like %) in this context is
993 // treated as the ordinary
994 // character, not as a token. The character may be part of
997 StringBuffer image = new StringBuffer(value.getImage());
998 while (next.kind == NUMTOKEN || next.kind == SLASH
999 || next.kind == OTHER)
1001 image.append(getNextToken().getImage());
1002 next = getTokenAhead();
1004 attrValue = image.toString();
1007 attrValue = value.getImage();
1014 // Check maybe the opening quote is missing.
1015 next = getTokenAhead();
1016 if (bQUOTING.get(next.kind))
1019 error("The value without opening quote is closed with '"
1020 + next.getImage() + "'");
1021 attrValue = value.getImage();
1023 else if (next.kind == NUMTOKEN || next.kind == SLASH)
1024 // The slash in this context is treated as the ordinary
1025 // character, not as a token. The slash may be part of
1026 // the unquoted URL.
1028 StringBuffer image = new StringBuffer(value.getImage());
1029 while (next.kind == NUMTOKEN || next.kind == SLASH)
1031 image.append(getNextToken().getImage());
1032 next = getTokenAhead();
1034 attrValue = image.toString();
1037 attrValue = value.getImage();
1040 break attributeReading;
1042 attributes.addAttribute(name.getImage(), attrValue);
1046 // The '=' is missing: attribute without value.
1048 noValueAttribute(element, name.getImage());
1054 * Return the string corresponding to the given named entity. The name is passed
1055 * with the preceding &, but without the ending semicolon.
1057 protected String resolveNamedEntity(final String a_tag)
1060 if (!a_tag.startsWith("&"))
1061 throw new AssertionError("Named entity " + a_tag +
1062 " must start witn '&'."
1065 String tag = a_tag.substring(1);
1069 Entity entity = dtd.getEntity(tag);
1071 return entity.getString();
1073 entity = dtd.getEntity(tag.toLowerCase());
1077 error("The name of this entity should be in lowercase", a_tag);
1078 return entity.getString();
1081 catch (IndexOutOfBoundsException ibx)
1083 /* The error will be reported. */
1086 error("Unknown named entity", a_tag);
1091 * Return the char corresponding to the given numeric entity.
1092 * The name is passed with the preceding &#, but without
1093 * the ending semicolon.
1095 protected char resolveNumericEntity(final String a_tag)
1098 if (!a_tag.startsWith("&#"))
1099 throw new AssertionError("Numeric entity " + a_tag +
1100 " must start witn '&#'."
1103 String tag = a_tag.substring(2);
1107 // Determine the encoding type:
1108 char cx = tag.charAt(0);
1109 if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn;
1111 return (char) Integer.parseInt(tag.substring(1), 16);
1113 return (char) Integer.parseInt(tag);
1116 /* The error will be reported. */
1117 catch (NumberFormatException nex)
1120 catch (IndexOutOfBoundsException ix)
1124 error("Invalid numeric entity", a_tag);
1129 * Reset all fields into the initial default state, preparing the
1130 * parser for parsing the next document.
1132 protected void restart()
1134 documentTags.clear();
1135 titleHandled = false;
1137 buffer.setLength(0);
1139 validator.restart();
1143 * The method is called when the HTML opening tag (like <table>)
1144 * is found or if the parser concludes that the one should be present
1145 * in the current position. The method is called immediately before
1146 * calling the handleStartTag.
1147 * @param tag The tag
1149 protected void startTag(TagElement tag)
1150 throws ChangedCharSetException
1155 * Handle a complete element, when the tag content is already present in the
1156 * buffer and both starting and closing tags are behind. This is called
1157 * in the case when the tag text must not be parsed for the nested
1158 * elements (elements STYLE and SCRIPT).
1160 private void _handleCompleteElement(TagElement tag)
1162 _handleStartTag(tag);
1164 // Suppress inclusion of the SCRIPT and STYLE texts into the title.
1165 HTML.Tag h = tag.getHTMLTag();
1166 if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE)
1168 boolean tmp = titleOpen;
1180 * A hook for operations preceding the call to handleEmptyTag().
1181 * Handle the tag with no content, like <br>. As no
1182 * nested tags are expected, the tag validator is not involved.
1183 * @param tag The tag being handled.
1185 private void _handleEmptyTag(TagElement tag)
1189 validator.validateTag(tag, attributes);
1190 handleEmptyTag(tag);
1191 HTML.Tag h = tag.getHTMLTag();
1192 // When a block tag is closed, consume whitespace that follows after
1194 // For some unknown reason a FRAME tag is not treated as block element.
1195 // However in this case it should be treated as such.
1199 catch (ChangedCharSetException ex)
1201 error("Changed charset exception:", ex.getMessage());
1206 * A hook for operations preceding the call to handleEndTag().
1207 * The method is called when the HTML closing tag
1208 * is found. Calls handleTitle after closing the 'title' tag.
1209 * @param tag The tag
1211 private void _handleEndTag(TagElement tag)
1213 if (validator.closeTag(tag))
1214 _handleEndTag_remaining(tag);
1218 * Actions that are also required if the closing action was
1219 * initiated by the tag validator.
1220 * Package-private to avoid an accessor method.
1222 void _handleEndTag_remaining(TagElement tag)
1224 HTML.Tag h = tag.getHTMLTag();
1227 endTag(tag.fictional());
1229 if (h.isPreformatted())
1231 if (preformatted < 0)
1234 // When a block tag is closed, consume whitespace that follows after
1239 if (h == HTML.Tag.TITLE)
1242 titleHandled = true;
1244 char[] a = new char[ title.length() ];
1245 title.getChars(0, a.length, a, 0);
1251 * A hook for operations preceding the call to handleStartTag().
1252 * The method is called when the HTML opening tag (like <table>)
1254 * Package-private to avoid an accessor method.
1255 * @param tag The tag
1257 void _handleStartTag(TagElement tag)
1259 validator.openTag(tag, attributes);
1261 handleStartTag(tag);
1263 HTML.Tag h = tag.getHTMLTag();
1268 if (h.isPreformatted())
1271 if (h == HTML.Tag.TITLE)
1274 error("Repetetive <TITLE> tag");
1276 titleHandled = false;
1281 * Resume parsing after heavy errors in HTML tag structure.
1282 * @throws ParseException
1284 private void forciblyCloseTheTag()
1285 throws ParseException
1288 buffer.setLength(0);
1291 for (int i = 1; i < 100; i++)
1293 t = getTokenAhead(i - 1);
1294 if (t.kind == EOF || t.kind == BEGIN)
1298 /* Closing '>' found. */
1305 buffer.append("Ignoring '");
1306 for (int i = 1; i <= closeAt; i++)
1311 buffer.append('\'');
1312 error(buffer.toString());
1317 * Handle comment in string buffer. You can avoid allocating a char
1318 * array each time by processing your comment directly here.
1320 private void handleComment()
1322 char[] a = new char[ buffer.length() ];
1323 buffer.getChars(0, a.length, a, 0);
1327 private TagElement makeTagElement(String name, boolean isSupposed)
1329 Element e = dtd.elementHash.get(name.toLowerCase());
1332 error("Unknown tag <" + name + ">");
1333 e = dtd.getElement(name);
1334 e.name = name.toUpperCase();
1338 if (!documentTags.contains(e.name))
1341 documentTags.add(e.name);
1344 return makeTag(e, isSupposed);
1348 * Read till the given token, resolving entities. Consume the given
1349 * token without adding it to buffer.
1350 * @param till The token to read till
1351 * @throws ParseException
1353 private void readTillTokenE(int till)
1354 throws ParseException
1356 buffer.setLength(0);
1361 if (t.kind == Constants.ENTITY)
1363 resolveAndAppendEntity(t);
1365 else if (t.kind == EOF)
1367 error("unexpected eof", t);
1370 else if (t.kind == till)
1372 else if (t.kind == WS)
1374 // Processing whitespace in accordance with CDATA rules:
1375 String s = t.getImage();
1377 for (int i = 0; i < s.length(); i++)
1381 buffer.append(' '); // CR replaced by space
1383 { /* LF ignored */ }
1385 buffer.append(' '); // Tab replaced by space
1396 * Resolve the entity and append it to the end of buffer.
1399 private void resolveAndAppendEntity(Token entity)
1401 switch (entity.category)
1404 buffer.append(resolveNamedEntity(entity.getImage()));
1407 case ENTITY_NUMERIC :
1408 buffer.append(resolveNumericEntity(entity.getImage()));
1412 throw new AssertionError("Invalid entity category " +
1419 * Handle the remainder of HTML tags. This is a common end for
1420 * TAG, SCRIPT and STYLE.
1421 * @param closing True for closing tags ( </TAG> ).
1422 * @param name Name of element
1423 * @param start Token where element has started
1424 * @throws ParseException
1426 private void restOfTag(boolean closing, Token name, Token start)
1427 throws ParseException
1429 boolean end = false;
1434 readAttributes(name.getImage());
1438 next = getTokenAhead();
1439 if (next.kind == END)
1445 hTag = new Token(start, next);
1449 // The tag body contains errors. If additionally the tag
1450 // name is not valid, this construction is treated as text.
1451 if (dtd.elementHash.get(name.getImage().toLowerCase()) == null &&
1455 error("Errors in tag body and unknown tag name. " +
1456 "Treating the tag as a text."
1460 hTag = mustBe(BEGIN);
1461 buffer.setLength(0);
1462 buffer.append(hTag.getImage());
1468 error("Forcibly closing invalid parameter list");
1469 forciblyCloseTheTag();
1476 _handleEndTag(makeTagElement(name.getImage(), false));
1480 TagElement te = makeTagElement(name.getImage(), false);
1481 if (te.getElement().type == DTDConstants.EMPTY)
1482 _handleEmptyTag(te);
1485 // According to the specs we need to consume whitespace following
1486 // immediately after an opening tag.
1488 _handleStartTag(te);
1494 * This should fire additional actions in response to the
1495 * ChangedCharSetException. The current implementation
1499 private void startingTag(TagElement tag)
1505 catch (ChangedCharSetException cax)
1507 error("Invalid change of charset");
1511 private void ws_error()
1513 error("Whitespace here is not permitted");
1517 * Returns true when the specified tag should be considered a block tag
1518 * wrt whitespace handling. We need this special handling, since there
1519 * are a couple of tags that we must treat as block tags but which aren't
1520 * officially block tags.
1522 * @param tag the tag to check
1523 * @return true when the specified tag should be considered a block tag
1524 * wrt whitespace handling
1526 private boolean isBlock(HTML.Tag tag)
1528 return tag.isBlock() || tag == HTML.Tag.STYLE || tag == HTML.Tag.FRAME;