libjava/classpath/gnu/javax/swing/text/html/css/CSSScanner.java

   1 /* CSSScanner.java -- A parser for CSS stylesheets
   2    Copyright (C) 2006 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19 02110-1301 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38
  39 package gnu.javax.swing.text.html.css;
  40
  41 import java.io.BufferedInputStream;
  42 import java.io.IOException;
  43 import java.io.InputStream;
  44 import java.io.InputStreamReader;
  45 import java.io.Reader;
  46
  47 /**
  48  * A tokenizer for CSS stylesheets. This is based on the scanner definition
  49  * from:
  50  *
  51  * http://www.w3.org/TR/CSS21/syndata.html#tokenization
  52  *
  53  * @author Roman Kennke (kennke@aicas.com)
  54  */
  55 // TODO: Maybe implement more restrictive scanner:
  56 // http://www.w3.org/TR/CSS21/grammar.html#q2
  57 class CSSScanner
  58 {
  59
  60   // The tokens. This list is taken from:
  61   // http://www.w3.org/TR/CSS21/syndata.html#tokenization
  62   static final int IDENT = 1;
  63   static final int ATKEYWORD = 2;
  64   static final int STRING = 3;
  65   static final int INVALID = 4;
  66   static final int HASH = 5;
  67   static final int NUMBER = 6;
  68   static final int PERCENTAGE = 7;
  69   static final int DIMENSION = 8;
  70   static final int URI = 9;
  71   static final int UNICODE_RANGE = 10;
  72   static final int CDO = 11;
  73   static final int CDC = 12;
  74   static final int SEMICOLON = 13;
  75   static final int CURLY_LEFT = 14;
  76   static final int CURLY_RIGHT = 15;
  77   static final int PAREN_LEFT = 16;
  78   static final int PAREN_RIGHT = 17;
  79   static final int BRACE_LEFT = 16;
  80   static final int BRACE_RIGHT = 17;
  81   static final int S = 18;
  82   static final int COMMENT = 19;
  83   static final int FUNCTION = 20;
  84   static final int INCLUDES = 21;
  85   static final int DASHMATCH = 22;
  86   static final int DELIM = 23;
  87
  88   // Additional tokens defined for convenience.
  89   static final int EOF = -1;
  90
  91   /**
  92    * The input source.
  93    */
  94   private Reader in;
  95
  96   /**
  97    * The parse buffer.
  98    */
  99   char[] parseBuffer;
 100
 101   /**
 102    * The end index in the parseBuffer of the current token.
 103    */
 104   int tokenEnd;
 105
 106   /**
 107    * The lookahead 'buffer'.
 108    */
 109   private int[] lookahead;
 110
 111   CSSScanner(Reader r)
 112   {
 113     lookahead = new int[2];
 114     lookahead[0] = -1;
 115     lookahead[1] = -1;
 116     parseBuffer = new char[2048];
 117     in = r;
 118   }
 119
 120   /**
 121    * Fetches the next token. The actual character data is in the parseBuffer
 122    * afterwards with the tokenStart at index 0 and the tokenEnd field
 123    * pointing to the end of the token.
 124    *
 125    * @return the next token
 126    */
 127   int nextToken()
 128     throws IOException
 129   {
 130     tokenEnd = 0;
 131     int token = -1;
 132     int next = read();
 133     if (next != -1)
 134       {
 135         switch (next)
 136         {
 137           case ';':
 138             parseBuffer[0] = (char) next;
 139             tokenEnd = 1;
 140             token = SEMICOLON;
 141             break;
 142           case '{':
 143             parseBuffer[0] = (char) next;
 144             tokenEnd = 1;
 145             token = CURLY_LEFT;
 146             break;
 147           case '}':
 148             parseBuffer[0] = (char) next;
 149             tokenEnd = 1;
 150             token = CURLY_RIGHT;
 151             break;
 152           case '(':
 153             parseBuffer[0] = (char) next;
 154             tokenEnd = 1;
 155             token = PAREN_LEFT;
 156             break;
 157           case ')':
 158             parseBuffer[0] = (char) next;
 159             tokenEnd = 1;
 160             token = PAREN_RIGHT;
 161             break;
 162           case '[':
 163             parseBuffer[0] = (char) next;
 164             tokenEnd = 1;
 165             token = BRACE_LEFT;
 166             break;
 167           case ']':
 168             parseBuffer[0] = (char) next;
 169             tokenEnd = 1;
 170             token = BRACE_RIGHT;
 171             break;
 172           case '@':
 173             parseBuffer[0] = (char) next;
 174             tokenEnd = 1;
 175             readIdent();
 176             token = ATKEYWORD;
 177             break;
 178           case '#':
 179             parseBuffer[0] = (char) next;
 180             tokenEnd = 1;
 181             readName();
 182             token = HASH;
 183             break;
 184           case '\'':
 185           case '"':
 186             lookahead[0] = next;
 187             readString();
 188             token = STRING;
 189             break;
 190           case ' ':
 191           case '\t':
 192           case '\r':
 193           case '\n':
 194           case '\f':
 195             lookahead[0] = next;
 196             readWhitespace();
 197             token = S;
 198             break;
 199             // FIXME: Detecting an URI involves several characters lookahead.
 200 //          case 'u':
 201 //            lookahead[0] = ch;
 202 //            readURI();
 203 //            token = URI;
 204 //            break;
 205           case '<':
 206             parseBuffer[0] = (char) next;
 207             parseBuffer[1] = (char) read();
 208             parseBuffer[2] = (char) read();
 209             parseBuffer[3] = (char) read();
 210             if (parseBuffer[1] == '!' && parseBuffer[2] == '-'
 211               && parseBuffer[3] == '-')
 212               {
 213                 token = CDO;
 214                 tokenEnd = 4;
 215               }
 216             else
 217               throw new CSSLexicalException("expected CDO token");
 218             break;
 219           case '/':
 220             lookahead[0] = next;
 221             readComment();
 222             token = COMMENT;
 223             break;
 224           case '~':
 225             parseBuffer[0] = (char) next;
 226             parseBuffer[1] = (char) read();
 227             if (parseBuffer[1] == '=')
 228               token = INCLUDES;
 229             else
 230               throw new CSSLexicalException("expected INCLUDES token");
 231             break;
 232           case '|':
 233             parseBuffer[0] = (char) next;
 234             parseBuffer[1] = (char) read();
 235             if (parseBuffer[1] == '=')
 236               token = DASHMATCH;
 237             else
 238               throw new CSSLexicalException("expected DASHMATCH token");
 239             break;
 240           case '-':
 241             int ch2 = read();
 242             if (ch2 == '-')
 243               {
 244                 int ch3 = read();
 245                 if (ch3 == '>')
 246                   {
 247                     parseBuffer[0] = (char) next;
 248                     parseBuffer[1] = (char) ch2;
 249                     parseBuffer[2] = (char) ch3;
 250                     tokenEnd = 3;
 251                     token = CDC;
 252                   }
 253                 else
 254                   throw new CSSLexicalException("expected CDC token");
 255               }
 256             else
 257               {
 258                 lookahead[0] = next;
 259                 lookahead[1] = ch2;
 260                 readIdent();
 261                 int ch3 = read();
 262                 if (ch3 == -1 || ch3 != '(')
 263                   {
 264                     lookahead[0] = ch3;
 265                     token = IDENT;
 266                   }
 267                 else
 268                   {
 269                     parseBuffer[tokenEnd] = (char) ch3;
 270                     tokenEnd++;
 271                     token = FUNCTION;
 272                   }
 273               }
 274             break;
 275           case '0':
 276           case '1':
 277           case '2':
 278           case '3':
 279           case '4':
 280           case '5':
 281           case '6':
 282           case '7':
 283           case '8':
 284           case '9':
 285             lookahead[0] = next;
 286             readNum();
 287             int ch3 = read();
 288             if (ch3 == '%')
 289               {
 290                 parseBuffer[tokenEnd] = (char) ch3;
 291                 tokenEnd++;
 292                 token = PERCENTAGE;
 293               }
 294             else if (ch3 == -1 || (! (ch3 == '_'
 295                                       || (ch3 >= 'a' && ch3 <= 'z')
 296                                       || (ch3 >= 'A' && ch3 <= 'Z')
 297                                       || ch3 == '\\' || ch3 > 177)))
 298               {
 299                 lookahead[0] = ch3;
 300                 token = NUMBER;
 301               }
 302             else
 303               {
 304                 lookahead[0] = ch3;
 305                 readIdent();
 306                 token = DIMENSION;
 307               }
 308             break;
 309           default:
 310             // Handle IDENT that don't begin with '-'.
 311             if (next == '_' || (next >= 'a' && next <= 'z')
 312                 || (next >= 'A' && next <= 'Z') || next == '\\' || next > 177)
 313               {
 314                 lookahead[0] = next;
 315                 readIdent();
 316                 int ch4 = read();
 317                 if (ch4 == -1 || ch4 != '(')
 318                   {
 319                     lookahead[0] = ch4;
 320                     token = IDENT;
 321                   }
 322                 else
 323                   {
 324                     parseBuffer[tokenEnd] = (char) ch4;
 325                     tokenEnd++;
 326                     token = FUNCTION;
 327                   }
 328               }
 329             else
 330               {
 331                 parseBuffer[0] = (char) next;
 332                 tokenEnd = 1;
 333                 token = DELIM;
 334               }
 335           break;
 336         }
 337       }
 338     return token;
 339   }
 340
 341   String currentTokenString()
 342   {
 343     return new String(parseBuffer, 0, tokenEnd);
 344   }
 345
 346   /**
 347    * Reads one character from the input stream or from the lookahead
 348    * buffer, if it contains one character.
 349    *
 350    * @return the next character
 351    *
 352    * @throws IOException if problems occur on the input source
 353    */
 354   private int read()
 355     throws IOException
 356   {
 357     int ret;
 358     if (lookahead[0] != -1)
 359       {
 360         ret = lookahead[0];
 361         lookahead[0] = -1;
 362       }
 363     else if (lookahead[1] != -1)
 364       {
 365         ret = lookahead[1];
 366         lookahead[1] = -1;
 367       }
 368     else
 369       {
 370         ret = in.read();
 371       }
 372     return ret;
 373   }
 374
 375   /**
 376    * Reads and identifier.
 377    *
 378    * @throws IOException if something goes wrong in the input source or if
 379    *         the lexical analyser fails to read an identifier
 380    */
 381   private void readIdent()
 382     throws IOException
 383   {
 384     int ch1 = read();
 385     // Read possibly leading '-'.
 386     if (ch1 == '-')
 387       {
 388         parseBuffer[tokenEnd] = (char) ch1;
 389         tokenEnd++;
 390         ch1 = read();
 391       }
 392     // What follows must be '_' or a-z or A-Z or nonascii (>177) or an
 393     // escape.
 394     if (ch1 == '_' || (ch1 >= 'a' && ch1 <= 'z')
 395         || (ch1 >= 'A' && ch1 <= 'Z') || ch1 > 177)
 396       {
 397         parseBuffer[tokenEnd] = (char) ch1;
 398         tokenEnd++;
 399       }
 400     else if (ch1 == '\\')
 401       {
 402         // Try to read an escape.
 403         lookahead[0] = ch1;
 404         readEscape();
 405       }
 406     else
 407       throw new CSSLexicalException("First character of identifier incorrect");
 408
 409     // Read any number of [_a-zA-Z0-9-] chars.
 410     int ch = read();
 411     while (ch != -1 && (ch == '_' || ch == '-' || (ch >= 'a' && ch <= 'z')
 412            || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9')))
 413       {
 414         parseBuffer[tokenEnd] = (char) ch;
 415         tokenEnd++;
 416         ch = read();
 417       }
 418
 419     // Push back last read character since it doesn't belong to the IDENT.
 420     lookahead[0] = ch;
 421   }
 422
 423   /**
 424    * Reads an escape.
 425    *
 426    * @throws IOException if something goes wrong in the input source or if
 427    *         the lexical analyser fails to read an escape
 428    */
 429   private void readEscape()
 430     throws IOException
 431   {
 432     int ch = read();
 433     if (ch != -1 && ch == '\\')
 434       {
 435         parseBuffer[tokenEnd] = (char) ch;
 436         tokenEnd++;
 437         ch = read();
 438         if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f'))
 439           {
 440             // Read unicode escape.
 441             // Zero to five 0-9a-f chars can follow.
 442             int hexcount = 0;
 443             ch = read();
 444             while (((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f'))
 445                    && hexcount < 5)
 446               {
 447                 parseBuffer[tokenEnd] = (char) ch;
 448                 tokenEnd++;
 449                 hexcount++;
 450                 ch = read();
 451               }
 452             // Now we can have a \r\n or any whitespace character following.
 453             if (ch == '\r')
 454               {
 455                 parseBuffer[tokenEnd] = (char) ch;
 456                 tokenEnd++;
 457                 ch = read();
 458                 if (ch == '\n')
 459                   {
 460                     parseBuffer[tokenEnd] = (char) ch;
 461                     tokenEnd++;
 462                   }
 463                 else
 464                   {
 465                     lookahead[0] = ch;
 466                   }
 467               }
 468             else if (ch == ' ' || ch == '\n' || ch == '\f' || ch == '\t')
 469               {
 470                 parseBuffer[tokenEnd] = (char) ch;
 471                 tokenEnd++;
 472               }
 473             else
 474               {
 475                 lookahead[0] = ch;
 476               }
 477           }
 478         else if (ch != '\n' && ch != '\r' && ch != '\f')
 479           {
 480             parseBuffer[tokenEnd] = (char) ch;
 481             tokenEnd++;
 482           }
 483         else
 484           throw new CSSLexicalException("Can't read escape");
 485       }
 486     else
 487       throw new CSSLexicalException("Escape must start with '\\'");
 488
 489   }
 490
 491   private void readName()
 492     throws IOException
 493   {
 494     // Read first name character.
 495     int ch = read();
 496     if (ch != -1 && (ch == '_' || ch == '-' || (ch >= 'a' && ch <= 'z')
 497            || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9')))
 498       {
 499         parseBuffer[tokenEnd] = (char) ch;
 500         tokenEnd++;
 501       }
 502     else
 503       throw new CSSLexicalException("Invalid name");
 504
 505     // Read any number (at least one) of [_a-zA-Z0-9-] chars.
 506     ch = read();
 507     while (ch != -1 && (ch == '_' || ch == '-' || (ch >= 'a' && ch <= 'z')
 508            || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9')))
 509       {
 510         parseBuffer[tokenEnd] = (char) ch;
 511         tokenEnd++;
 512         ch = read();
 513       }
 514
 515     // Push back last read character since it doesn't belong to the IDENT.
 516     lookahead[0] = ch;
 517   }
 518
 519   /**
 520    * Reads in a string.
 521    *
 522    * @throws IOException
 523    */
 524   private void readString()
 525     throws IOException
 526   {
 527     int ch1 = read();
 528     if (ch1 != -1 && (ch1 == '\'' || ch1 == '\"'))
 529       {
 530         parseBuffer[tokenEnd] = (char) ch1;
 531         tokenEnd++;
 532
 533         // Read any number of chars until we hit another chc1 char.
 534         // Reject newlines, except if prefixed with \.
 535         int ch = read();
 536         while (ch != -1 && ch != ch1)
 537           {
 538             // Every non-newline and non-\ char should be ok.
 539             if (ch != '\n' && ch != '\r' && ch != '\f' && ch != '\\')
 540               {
 541                 parseBuffer[tokenEnd] = (char) ch;
 542                 tokenEnd++;
 543               }
 544             // Ok when followed by newline or as part of escape.
 545             else if (ch == '\\')
 546               {
 547                 int ch2 = read();
 548                 if (ch2 == '\n' || ch2 == '\r')
 549                   {
 550                     parseBuffer[tokenEnd] = (char) ch;
 551                     parseBuffer[tokenEnd + 1] = (char) ch2;
 552                     tokenEnd += 2;
 553                   }
 554                 else
 555                   {
 556                     // Try to parse an escape.
 557                     lookahead[0] = ch;
 558                     lookahead[1] = ch2;
 559                     readEscape();
 560                   }
 561               }
 562             else
 563               throw new CSSLexicalException("Invalid string");
 564
 565             ch = read();
 566           }
 567         if (ch != -1)
 568           {
 569             // Push the final char on the buffer.
 570             parseBuffer[tokenEnd] = (char) ch;
 571             tokenEnd++;
 572           }
 573         else
 574           throw new CSSLexicalException("Unterminated string");
 575       }
 576     else
 577       throw new CSSLexicalException("Invalid string");
 578   }
 579
 580   /**
 581    * Reads a chunk of whitespace.
 582    *
 583    * @throws IOException
 584    */
 585   private void readWhitespace()
 586     throws IOException
 587   {
 588     int ch = read();
 589     while (ch != -1 && (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'
 590            || ch == '\f'))
 591       {
 592         parseBuffer[tokenEnd] = (char) ch;
 593         tokenEnd++;
 594         ch = read();
 595       }
 596     // Push back last character read.
 597     lookahead[0] = ch;
 598
 599   }
 600
 601   private void readURI()
 602     throws IOException
 603   {
 604     // FIXME: Implement.
 605   }
 606
 607   /**
 608    * Reads a comment block.
 609    *
 610    * @throws IOException
 611    */
 612   private void readComment()
 613     throws IOException
 614   {
 615     // First we need a / and a *
 616     int ch = read();
 617     if (ch != -1 && ch == '/')
 618       {
 619         parseBuffer[tokenEnd] = (char) ch;
 620         tokenEnd++;
 621         ch = read();
 622         if (ch != -1 && ch == '*')
 623           {
 624             parseBuffer[tokenEnd] = (char) ch;
 625             tokenEnd++;
 626             ch = read();
 627             parseBuffer[tokenEnd] = (char) ch;
 628             tokenEnd++;
 629             boolean finished = false;
 630             int lastChar = ch;
 631             ch = read();
 632             while (! finished && ch != -1)
 633               {
 634                 if (lastChar == '*' && ch == '/')
 635                   finished = true;
 636                 parseBuffer[tokenEnd] = (char) ch;
 637                 tokenEnd++;
 638                 lastChar = ch;
 639                 ch = read();
 640               }
 641           }
 642       }
 643     if (ch == -1)
 644       throw new CSSLexicalException("Unterminated comment");
 645
 646     // Push back last character read.
 647     lookahead[0] = ch;
 648   }
 649
 650   /**
 651    * Reads a number.
 652    *
 653    * @throws IOException
 654    */
 655   private void readNum()
 656     throws IOException
 657   {
 658     boolean hadDot = false;
 659     // First char must be number or .
 660     int ch = read();
 661     if (ch != -1 && ((ch >= '0' && ch <= '9') || ch == '.'))
 662       {
 663         if (ch == '.')
 664           hadDot = true;
 665         parseBuffer[tokenEnd] = (char) ch;
 666         tokenEnd++;
 667         // Now read in any number of digits afterwards, and maybe one dot,
 668         // if we hadn't one already.
 669         ch = read();
 670         while (ch != -1 && ((ch >= '0' && ch <= '9')
 671                             || (ch == '.' && ! hadDot)))
 672           {
 673             if (ch == '.')
 674               hadDot = true;
 675             parseBuffer[tokenEnd] = (char) ch;
 676             tokenEnd++;
 677             ch = read();
 678           }
 679       }
 680     else
 681       throw new CSSLexicalException("Invalid number");
 682
 683     // Check if we haven't accidentally finished with a dot.
 684     if (parseBuffer[tokenEnd - 1] == '.')
 685       throw new CSSLexicalException("Invalid number");
 686
 687     // Push back last character read.
 688     lookahead[0] = ch;
 689   }
 690
 691   /**
 692    * For testing, we read in the default.css in javax/swing/text/html
 693    *
 694    * @param args
 695    */
 696   public static void main(String[] args)
 697   {
 698     try
 699       {
 700         String name = "/javax/swing/text/html/default.css";
 701         InputStream in = CSSScanner.class.getResourceAsStream(name);
 702         BufferedInputStream bin = new BufferedInputStream(in);
 703         InputStreamReader r = new InputStreamReader(bin);
 704         CSSScanner s = new CSSScanner(r);
 705         int token;
 706         do
 707           {
 708             token = s.nextToken();
 709             System.out.println("token: " + token + ": "
 710                                + s.currentTokenString());
 711           } while (token != -1);
 712       }
 713     catch (IOException ex)
 714       {
 715         ex.printStackTrace();
 716       }
 717   }
 718 }