1 /* CSSScanner.java -- A parser for CSS stylesheets
2 Copyright (C) 2006 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
39 package gnu.javax.swing.text.html.css;
41 import java.io.BufferedInputStream;
42 import java.io.IOException;
43 import java.io.InputStream;
44 import java.io.InputStreamReader;
45 import java.io.Reader;
48 * A tokenizer for CSS stylesheets. This is based on the scanner definition
51 * http://www.w3.org/TR/CSS21/syndata.html#tokenization
53 * @author Roman Kennke (kennke@aicas.com)
55 // TODO: Maybe implement more restrictive scanner:
56 // http://www.w3.org/TR/CSS21/grammar.html#q2
60 // The tokens. This list is taken from:
61 // http://www.w3.org/TR/CSS21/syndata.html#tokenization
62 static final int IDENT = 1;
63 static final int ATKEYWORD = 2;
64 static final int STRING = 3;
65 static final int INVALID = 4;
66 static final int HASH = 5;
67 static final int NUMBER = 6;
68 static final int PERCENTAGE = 7;
69 static final int DIMENSION = 8;
70 static final int URI = 9;
71 static final int UNICODE_RANGE = 10;
72 static final int CDO = 11;
73 static final int CDC = 12;
74 static final int SEMICOLON = 13;
75 static final int CURLY_LEFT = 14;
76 static final int CURLY_RIGHT = 15;
77 static final int PAREN_LEFT = 16;
78 static final int PAREN_RIGHT = 17;
79 static final int BRACE_LEFT = 16;
80 static final int BRACE_RIGHT = 17;
81 static final int S = 18;
82 static final int COMMENT = 19;
83 static final int FUNCTION = 20;
84 static final int INCLUDES = 21;
85 static final int DASHMATCH = 22;
86 static final int DELIM = 23;
88 // Additional tokens defined for convenience.
89 static final int EOF = -1;
102 * The end index in the parseBuffer of the current token.
107 * The lookahead 'buffer'.
109 private int[] lookahead;
113 lookahead = new int[2];
116 parseBuffer = new char[2048];
121 * Fetches the next token. The actual character data is in the parseBuffer
122 * afterwards with the tokenStart at index 0 and the tokenEnd field
123 * pointing to the end of the token.
125 * @return the next token
138 parseBuffer[0] = (char) next;
143 parseBuffer[0] = (char) next;
148 parseBuffer[0] = (char) next;
153 parseBuffer[0] = (char) next;
158 parseBuffer[0] = (char) next;
163 parseBuffer[0] = (char) next;
168 parseBuffer[0] = (char) next;
173 parseBuffer[0] = (char) next;
179 parseBuffer[0] = (char) next;
199 // FIXME: Detecting a URI involves several characters of lookahead.
201 // lookahead[0] = ch;
206 parseBuffer[0] = (char) next;
207 parseBuffer[1] = (char) read();
208 parseBuffer[2] = (char) read();
209 parseBuffer[3] = (char) read();
210 if (parseBuffer[1] == '!' && parseBuffer[2] == '-'
211 && parseBuffer[3] == '-')
217 throw new CSSLexicalException("expected CDO token");
225 parseBuffer[0] = (char) next;
226 parseBuffer[1] = (char) read();
227 if (parseBuffer[1] == '=')
230 throw new CSSLexicalException("expected INCLUDES token");
233 parseBuffer[0] = (char) next;
234 parseBuffer[1] = (char) read();
235 if (parseBuffer[1] == '=')
238 throw new CSSLexicalException("expected DASHMATCH token");
247 parseBuffer[0] = (char) next;
248 parseBuffer[1] = (char) ch2;
249 parseBuffer[2] = (char) ch3;
254 throw new CSSLexicalException("expected CDC token");
262 if (ch3 == -1 || ch3 != '(')
269 parseBuffer[tokenEnd] = (char) ch3;
290 parseBuffer[tokenEnd] = (char) ch3;
294 else if (ch3 == -1 || (! (ch3 == '_'
295 || (ch3 >= 'a' && ch3 <= 'z')
296 || (ch3 >= 'A' && ch3 <= 'Z')
297 || ch3 == '\\' || ch3 > 177)))
310 // Handle IDENT that don't begin with '-'.
311 if (next == '_' || (next >= 'a' && next <= 'z')
312 || (next >= 'A' && next <= 'Z') || next == '\\' || next > 177)
317 if (ch4 == -1 || ch4 != '(')
324 parseBuffer[tokenEnd] = (char) ch4;
331 parseBuffer[0] = (char) next;
341 String currentTokenString()
343 return new String(parseBuffer, 0, tokenEnd);
347 * Reads one character from the input stream or from the lookahead
348 * buffer, if it contains one character.
350 * @return the next character
352 * @throws IOException if problems occur on the input source
358 if (lookahead[0] != -1)
363 else if (lookahead[1] != -1)
376 * Reads an identifier.
378 * @throws IOException if something goes wrong in the input source or if
379 * the lexical analyser fails to read an identifier
381 private void readIdent()
385 // Read possibly leading '-'.
388 parseBuffer[tokenEnd] = (char) ch1;
392 // What follows must be '_' or a-z or A-Z or nonascii (>177) or an
394 if (ch1 == '_' || (ch1 >= 'a' && ch1 <= 'z')
395 || (ch1 >= 'A' && ch1 <= 'Z') || ch1 > 177)
397 parseBuffer[tokenEnd] = (char) ch1;
400 else if (ch1 == '\\')
402 // Try to read an escape.
407 throw new CSSLexicalException("First character of identifier incorrect");
409 // Read any number of [_a-zA-Z0-9-] chars.
411 while (ch != -1 && (ch == '_' || ch == '-' || (ch >= 'a' && ch <= 'z')
412 || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9')))
414 parseBuffer[tokenEnd] = (char) ch;
419 // Push back last read character since it doesn't belong to the IDENT.
426 * @throws IOException if something goes wrong in the input source or if
427 * the lexical analyser fails to read an escape
429 private void readEscape()
433 if (ch != -1 && ch == '\\')
435 parseBuffer[tokenEnd] = (char) ch;
438 if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f'))
440 // Read unicode escape.
441 // Zero to five 0-9a-f chars can follow.
444 while (((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f'))
447 parseBuffer[tokenEnd] = (char) ch;
452 // Now we can have a \r\n or any whitespace character following.
455 parseBuffer[tokenEnd] = (char) ch;
460 parseBuffer[tokenEnd] = (char) ch;
468 else if (ch == ' ' || ch == '\n' || ch == '\f' || ch == '\t')
470 parseBuffer[tokenEnd] = (char) ch;
478 else if (ch != '\n' && ch != '\r' && ch != '\f')
480 parseBuffer[tokenEnd] = (char) ch;
484 throw new CSSLexicalException("Can't read escape");
487 throw new CSSLexicalException("Escape must start with '\\'");
491 private void readName()
494 // Read first name character.
496 if (ch != -1 && (ch == '_' || ch == '-' || (ch >= 'a' && ch <= 'z')
497 || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9')))
499 parseBuffer[tokenEnd] = (char) ch;
503 throw new CSSLexicalException("Invalid name");
505 // Read any number (at least one) of [_a-zA-Z0-9-] chars.
507 while (ch != -1 && (ch == '_' || ch == '-' || (ch >= 'a' && ch <= 'z')
508 || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9')))
510 parseBuffer[tokenEnd] = (char) ch;
515 // Push back last read character since it doesn't belong to the IDENT.
522 * @throws IOException
524 private void readString()
528 if (ch1 != -1 && (ch1 == '\'' || ch1 == '\"'))
530 parseBuffer[tokenEnd] = (char) ch1;
533 // Read any number of chars until we hit another ch1 char.
534 // Reject newlines, except if prefixed with \.
536 while (ch != -1 && ch != ch1)
538 // Every non-newline and non-\ char should be ok.
539 if (ch != '\n' && ch != '\r' && ch != '\f' && ch != '\\')
541 parseBuffer[tokenEnd] = (char) ch;
544 // Ok when followed by newline or as part of escape.
548 if (ch2 == '\n' || ch2 == '\r')
550 parseBuffer[tokenEnd] = (char) ch;
551 parseBuffer[tokenEnd + 1] = (char) ch2;
556 // Try to parse an escape.
563 throw new CSSLexicalException("Invalid string");
569 // Push the final char on the buffer.
570 parseBuffer[tokenEnd] = (char) ch;
574 throw new CSSLexicalException("Unterminated string");
577 throw new CSSLexicalException("Invalid string");
581 * Reads a chunk of whitespace.
583 * @throws IOException
585 private void readWhitespace()
589 while (ch != -1 && (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'
592 parseBuffer[tokenEnd] = (char) ch;
596 // Push back last character read.
601 private void readURI()
608 * Reads a comment block.
610 * @throws IOException
612 private void readComment()
615 // First we need a / and a *
617 if (ch != -1 && ch == '/')
619 parseBuffer[tokenEnd] = (char) ch;
622 if (ch != -1 && ch == '*')
624 parseBuffer[tokenEnd] = (char) ch;
627 parseBuffer[tokenEnd] = (char) ch;
629 boolean finished = false;
632 while (! finished && ch != -1)
634 if (lastChar == '*' && ch == '/')
636 parseBuffer[tokenEnd] = (char) ch;
644 throw new CSSLexicalException("Unterminated comment");
646 // Push back last character read.
653 * @throws IOException
655 private void readNum()
658 boolean hadDot = false;
659 // First char must be number or .
661 if (ch != -1 && ((ch >= '0' && ch <= '9') || ch == '.'))
665 parseBuffer[tokenEnd] = (char) ch;
667 // Now read in any number of digits afterwards, and maybe one dot,
668 // if we hadn't one already.
670 while (ch != -1 && ((ch >= '0' && ch <= '9')
671 || (ch == '.' && ! hadDot)))
675 parseBuffer[tokenEnd] = (char) ch;
681 throw new CSSLexicalException("Invalid number");
683 // Check if we haven't accidentally finished with a dot.
684 if (parseBuffer[tokenEnd - 1] == '.')
685 throw new CSSLexicalException("Invalid number");
687 // Push back last character read.
692 * For testing, we read in the default.css in javax/swing/text/html
696 public static void main(String[] args)
700 String name = "/javax/swing/text/html/default.css";
701 InputStream in = CSSScanner.class.getResourceAsStream(name);
702 BufferedInputStream bin = new BufferedInputStream(in);
703 InputStreamReader r = new InputStreamReader(bin);
704 CSSScanner s = new CSSScanner(r);
708 token = s.nextToken();
709 System.out.println("token: " + token + ": "
710 + s.currentTokenString());
711 } while (token != -1);
713 catch (IOException ex)
715 ex.printStackTrace();