gcc/java/lex.c

   1 /* Language lexer for the GNU compiler for the Java(TM) language.
   2    Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
   3    Free Software Foundation, Inc.
   4    Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GCC is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING.  If not, write to
  20 the Free Software Foundation, 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.
  22
  23 Java and all Java-based marks are trademarks or registered trademarks
  24 of Sun Microsystems, Inc. in the United States and other countries.
  25 The Free Software Foundation is independent of Sun Microsystems, Inc.  */
  26
/* This file defines java_lex (yylex), which reads a Java source file
   that may contain Unicode escape sequences or UTF-8 encoded
   characters and returns a token for everything it finds except
   comments, white space and line terminators.  When necessary, it
   also fills in the java_lval (yylval) union.  It is written to be
   called by a re-entrant parser generated by Bison.
  33
  34    The lexical analysis conforms to the Java grammar described in "The
  35    Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
  36    Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
  37
  38 #include "keyword.h"
  39 #include "flags.h"
  40 #include "chartables.h"
  41 #ifndef JC1_LITE
  42 #include "timevar.h"
  43 #endif
  44
  45 /* Function declarations.  */
  46 static char *java_sprint_unicode (int);
  47 static void java_unicode_2_utf8 (unicode_t);
  48 static void java_lex_error (const char *, int);
  49 #ifndef JC1_LITE
  50 static int do_java_lex (YYSTYPE *);
  51 static int java_lex (YYSTYPE *);
  52 static int java_is_eol (FILE *, int);
  53 static tree build_wfl_node (tree);
  54 #endif
  55 static int java_parse_escape_sequence (void);
  56 static int java_start_char_p (unicode_t);
  57 static int java_part_char_p (unicode_t);
  58 static int java_space_char_p (unicode_t);
  59 static void java_parse_doc_section (int);
  60 static void java_parse_end_comment (int);
  61 static int java_read_char (java_lexer *);
  62 static int java_get_unicode (void);
  63 static int java_peek_unicode (void);
  64 static void java_next_unicode (void);
  65 static int java_read_unicode (java_lexer *, int *);
  66 #ifndef JC1_LITE
  67 static int utf8_cmp (const unsigned char *, int, const char *);
  68 #endif
  69
  70 java_lexer *java_new_lexer (FILE *, const char *);
  71 #ifndef JC1_LITE
  72 static void error_if_numeric_overflow (tree);
  73 #endif
  74
  75 #ifdef HAVE_ICONV
  76 /* This is nonzero if we have initialized `need_byteswap'.  */
  77 static int byteswap_init = 0;
  78
  79 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
  80    big-endian order -- not native endian order.  We handle this by
  81    doing a conversion once at startup and seeing what happens.  This
  82    flag holds the results of this determination.  */
  83 static int need_byteswap = 0;
  84 #endif
  85
  86 void
  87 java_init_lex (FILE *finput, const char *encoding)
  88 {
  89 #ifndef JC1_LITE
  90   int java_lang_imported = 0;
  91
  92   if (!java_lang_id)
  93     java_lang_id = get_identifier ("java.lang");
  94   if (!inst_id)
  95     inst_id = get_identifier ("inst$");
  96   if (!wpv_id)
  97     wpv_id = get_identifier ("write_parm_value$");
  98
  99   if (!java_lang_imported)
 100     {
 101       tree node = build_tree_list (build_unknown_wfl (java_lang_id),
 102                                    NULL_TREE);
 103       read_import_dir (TREE_PURPOSE (node));
 104       TREE_CHAIN (node) = ctxp->import_demand_list;
 105       ctxp->import_demand_list = node;
 106       java_lang_imported = 1;
 107     }
 108
 109   if (!wfl_operator)
 110     {
 111 #ifdef USE_MAPPED_LOCATION
 112       wfl_operator = build_expr_wfl (NULL_TREE, input_location);
 113 #else
 114       wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
 115 #endif
 116     }
 117   if (!label_id)
 118     label_id = get_identifier ("$L");
 119   if (!wfl_append)
 120     wfl_append = build_unknown_wfl (get_identifier ("append"));
 121   if (!wfl_string_buffer)
 122     wfl_string_buffer =
 123       build_unknown_wfl (get_identifier (flag_emit_class_files
 124                                       ? "java.lang.StringBuffer"
 125                                          : "gnu.gcj.runtime.StringBuffer"));
 126   if (!wfl_to_string)
 127     wfl_to_string = build_unknown_wfl (get_identifier ("toString"));
 128
 129   CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
 130     CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
 131
 132   memset (ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
 133   ctxp->current_parsed_class = NULL;
 134   ctxp->package = NULL_TREE;
 135 #endif
 136
 137   ctxp->save_location = input_location;
 138   ctxp->java_error_flag = 0;
 139   ctxp->lexer = java_new_lexer (finput, encoding);
 140 }
 141
 142 static char *
 143 java_sprint_unicode (int c)
 144 {
 145   static char buffer [10];
 146   if (c < ' ' || c >= 127)
 147     sprintf (buffer, "\\u%04x", c);
 148   else
 149     {
 150       buffer [0] = c;
 151       buffer [1] = '\0';
 152     }
 153   return buffer;
 154 }
 155
 156 /* Create a new lexer object.  */
 157
 158 java_lexer *
 159 java_new_lexer (FILE *finput, const char *encoding)
 160 {
 161   java_lexer *lex = xmalloc (sizeof (java_lexer));
 162   int enc_error = 0;
 163
 164   lex->finput = finput;
 165   lex->bs_count = 0;
 166   lex->unget_value = 0;
 167   lex->next_unicode = 0;
 168   lex->avail_unicode = 0;
 169   lex->next_columns = 1;
 170   lex->encoding = encoding;
 171   lex->position.line = 1;
 172   lex->position.col = 1;
 173 #ifndef JC1_LITE
 174 #ifdef USE_MAPPED_LOCATION
 175       input_location
 176         = linemap_line_start (&line_table, 1, 120);
 177 #else
 178       input_line = 1;
 179 #endif
 180 #endif
 181
 182 #ifdef HAVE_ICONV
 183   lex->handle = iconv_open ("UCS-2", encoding);
 184   if (lex->handle != (iconv_t) -1)
 185     {
 186       lex->first = -1;
 187       lex->last = -1;
 188       lex->out_first = -1;
 189       lex->out_last = -1;
 190       lex->read_anything = 0;
 191       lex->use_fallback = 0;
 192
 193       /* Work around broken iconv() implementations by doing checking at
 194          runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
 195          then all UCS-2 encoders will be broken.  Perhaps not a valid
 196          assumption.  */
 197       if (! byteswap_init)
 198         {
 199           iconv_t handle;
 200
 201           byteswap_init = 1;
 202
 203           handle = iconv_open ("UCS-2", "UTF-8");
 204           if (handle != (iconv_t) -1)
 205             {
 206               unicode_t result;
 207               unsigned char in[3];
 208               char *inp, *outp;
 209               size_t inc, outc, r;
 210
 211               /* This is the UTF-8 encoding of \ufeff.  */
 212               in[0] = 0xef;
 213               in[1] = 0xbb;
 214               in[2] = 0xbf;
 215
 216               inp = (char *) in;
 217               inc = 3;
 218               outp = (char *) &result;
 219               outc = 2;
 220
 221               r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
 222                          &outp, &outc);
 223               iconv_close (handle);
 224               /* Conversion must be complete for us to use the result.  */
 225               if (r != (size_t) -1 && inc == 0 && outc == 0)
 226                 need_byteswap = (result != 0xfeff);
 227             }
 228         }
 229
 230       lex->byte_swap = need_byteswap;
 231     }
 232   else
 233 #endif /* HAVE_ICONV */
 234     {
 235       /* If iconv failed, use the internal decoder if the default
 236          encoding was requested.  This code is used on platforms where
 237          iconv exists but is insufficient for our needs.  For
 238          instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
 239
 240          On Solaris the default encoding, as returned by nl_langinfo(),
 241          is `646' (aka ASCII), but the Solaris iconv_open() doesn't
 242          understand that.  We work around that by pretending
 243          `646' to be the same as UTF-8.   */
 244       if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
 245         enc_error = 1;
 246 #ifdef HAVE_ICONV
 247       else
 248         {
 249           lex->use_fallback = 1;
 250           lex->encoding = "UTF-8";
 251         }
 252 #endif /* HAVE_ICONV */
 253     }
 254
 255   if (enc_error)
 256     fatal_error ("unknown encoding: %qs\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation.  If you aren't trying\nto use a particular encoding for your input file, try the\n%<--encoding=UTF-8%> option", encoding);
 257
 258   return lex;
 259 }
 260
 261 void
 262 java_destroy_lexer (java_lexer *lex)
 263 {
 264 #ifdef HAVE_ICONV
 265   if (! lex->use_fallback)
 266     iconv_close (lex->handle);
 267 #endif
 268   free (lex);
 269 }
 270
 271 static int
 272 java_read_char (java_lexer *lex)
 273 {
 274 #ifdef HAVE_ICONV
 275   if (! lex->use_fallback)
 276     {
 277       size_t ir, inbytesleft, in_save, out_count, out_save;
 278       char *inp, *outp;
 279       unicode_t result;
 280
 281       /* If there is data which has already been converted, use it.  */
 282       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 283         {
 284           lex->out_first = 0;
 285           lex->out_last = 0;
 286
 287           while (1)
 288             {
 289               /* See if we need to read more data.  If FIRST == 0 then
 290                  the previous conversion attempt ended in the middle of
 291                  a character at the end of the buffer.  Otherwise we
 292                  only have to read if the buffer is empty.  */
 293               if (lex->first == 0 || lex->first >= lex->last)
 294                 {
 295                   int r;
 296
 297                   if (lex->first >= lex->last)
 298                     {
 299                       lex->first = 0;
 300                       lex->last = 0;
 301                     }
 302                   if (feof (lex->finput))
 303                     return UEOF;
 304                   r = fread (&lex->buffer[lex->last], 1,
 305                              sizeof (lex->buffer) - lex->last,
 306                              lex->finput);
 307                   lex->last += r;
 308                 }
 309
 310               inbytesleft = lex->last - lex->first;
 311               out_count = sizeof (lex->out_buffer) - lex->out_last;
 312
 313               if (inbytesleft == 0)
 314                 {
 315                   /* We've tried to read and there is nothing left.  */
 316                   return UEOF;
 317                 }
 318
 319               in_save = inbytesleft;
 320               out_save = out_count;
 321               inp = &lex->buffer[lex->first];
 322               outp = (char *) &lex->out_buffer[lex->out_last];
 323               ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
 324                           &inbytesleft, &outp, &out_count);
 325
 326               /* If we haven't read any bytes, then look to see if we
 327                  have read a BOM.  */
 328               if (! lex->read_anything && out_save - out_count >= 2)
 329                 {
 330                   unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
 331                   if (uc == 0xfeff)
 332                     {
 333                       lex->byte_swap = 0;
 334                       lex->out_first += 2;
 335                     }
 336                   else if (uc == 0xfffe)
 337                     {
 338                       lex->byte_swap = 1;
 339                       lex->out_first += 2;
 340                     }
 341                   lex->read_anything = 1;
 342                 }
 343
 344               if (lex->byte_swap)
 345                 {
 346                   unsigned int i;
 347                   for (i = 0; i < out_save - out_count; i += 2)
 348                     {
 349                       char t = lex->out_buffer[lex->out_last + i];
 350                       lex->out_buffer[lex->out_last + i]
 351                         = lex->out_buffer[lex->out_last + i + 1];
 352                       lex->out_buffer[lex->out_last + i + 1] = t;
 353                     }
 354                 }
 355
 356               lex->first += in_save - inbytesleft;
 357               lex->out_last += out_save - out_count;
 358
 359               /* If we converted anything at all, move along.  */
 360               if (out_count != out_save)
 361                 break;
 362
 363               if (ir == (size_t) -1)
 364                 {
 365                   if (errno == EINVAL)
 366                     {
 367                       /* This is ok.  This means that the end of our buffer
 368                          is in the middle of a character sequence.  We just
 369                          move the valid part of the buffer to the beginning
 370                          to force a read.  */
 371                       memmove (&lex->buffer[0], &lex->buffer[lex->first],
 372                                lex->last - lex->first);
 373                       lex->last -= lex->first;
 374                       lex->first = 0;
 375                     }
 376                   else
 377                     {
 378                       /* A more serious error.  */
 379                       char buffer[128];
 380                       sprintf (buffer,
 381                                "Unrecognized character for encoding '%s'",
 382                                lex->encoding);
 383                       java_lex_error (buffer, 0);
 384                       return UEOF;
 385                     }
 386                 }
 387             }
 388         }
 389
 390       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 391         {
 392           /* Don't have any data.  */
 393           return UEOF;
 394         }
 395
 396       /* Success.  */
 397       result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
 398       lex->out_first += 2;
 399       return result;
 400     }
 401   else
 402 #endif /* HAVE_ICONV */
 403     {
 404       int c, c1, c2;
 405       c = getc (lex->finput);
 406
 407       if (c == EOF)
 408         return UEOF;
 409       if (c < 128)
 410         return (unicode_t) c;
 411       else
 412         {
 413           if ((c & 0xe0) == 0xc0)
 414             {
 415               c1 = getc (lex->finput);
 416               if ((c1 & 0xc0) == 0x80)
 417                 {
 418                   unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
 419                   /* Check for valid 2-byte characters.  We explicitly
 420                      allow \0 because this encoding is common in the
 421                      Java world.  */
 422                   if (r == 0 || (r >= 0x80 && r <= 0x7ff))
 423                     return r;
 424                 }
 425             }
 426           else if ((c & 0xf0) == 0xe0)
 427             {
 428               c1 = getc (lex->finput);
 429               if ((c1 & 0xc0) == 0x80)
 430                 {
 431                   c2 = getc (lex->finput);
 432                   if ((c2 & 0xc0) == 0x80)
 433                     {
 434                       unicode_t r =  (unicode_t)(((c & 0xf) << 12) +
 435                                                  (( c1 & 0x3f) << 6)
 436                                                  + (c2 & 0x3f));
 437                       /* Check for valid 3-byte characters.
 438                          Don't allow surrogate, \ufffe or \uffff.  */
 439                       if (IN_RANGE (r, 0x800, 0xffff)
 440                           && ! IN_RANGE (r, 0xd800, 0xdfff)
 441                           && r != 0xfffe && r != 0xffff)
 442                         return r;
 443                     }
 444                 }
 445             }
 446
 447           /* We simply don't support invalid characters.  We also
 448              don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
 449              cannot be valid Java characters.  */
 450           java_lex_error ("malformed UTF-8 character", 0);
 451         }
 452     }
 453
 454   /* We only get here on error.  */
 455   return UEOF;
 456 }
 457
 458 static int
 459 java_read_unicode (java_lexer *lex, int *unicode_escape_p)
 460 {
 461   int c;
 462
 463   if (lex->unget_value)
 464     {
 465       c = lex->unget_value;
 466       lex->unget_value = 0;
 467     }
 468   else
 469     c = java_read_char (lex);
 470
 471   *unicode_escape_p = 0;
 472
 473   if (c != '\\')
 474     {
 475       lex->bs_count = 0;
 476       return c;
 477     }
 478
 479   ++lex->bs_count;
 480   if ((lex->bs_count) % 2 == 1)
 481     {
 482       /* Odd number of \ seen.  */
 483       c = java_read_char (lex);
 484       if (c == 'u')
 485         {
 486           unicode_t unicode = 0;
 487           int shift = 12;
 488
 489           /* Recognize any number of `u's in \u.  */
 490           while ((c = java_read_char (lex)) == 'u')
 491             ;
 492
 493           shift = 12;
 494           do
 495             {
 496               if (c == UEOF)
 497                 {
 498                   java_lex_error ("prematurely terminated \\u sequence", 0);
 499                   return UEOF;
 500                 }
 501
 502               if (hex_p (c))
 503                 unicode |= (unicode_t)(hex_value (c) << shift);
 504               else
 505                 {
 506                   java_lex_error ("non-hex digit in \\u sequence", 0);
 507                   break;
 508                 }
 509
 510               c = java_read_char (lex);
 511               shift -= 4;
 512             }
 513           while (shift >= 0);
 514
 515           if (c != UEOF)
 516             lex->unget_value = c;
 517
 518           lex->bs_count = 0;
 519           *unicode_escape_p = 1;
 520           return unicode;
 521         }
 522       lex->unget_value = c;
 523     }
 524   return (unicode_t) '\\';
 525 }
 526
 527 /* Get the next Unicode character (post-Unicode-escape-handling).
 528    Move the current position to just after returned character. */
 529
 530 static int
 531 java_get_unicode (void)
 532 {
 533   int next = java_peek_unicode ();
 534   java_next_unicode ();
 535   return next;
 536 }
 537
 538 /* Return the next Unicode character (post-Unicode-escape-handling).
 539    Do not move the current position, which remains just before
 540    the returned character. */
 541
 542 static int
 543 java_peek_unicode (void)
 544 {
 545   int unicode_escape_p;
 546   java_lexer *lex = ctxp->lexer;
 547   int next;
 548
 549   if (lex->avail_unicode)
 550     return lex->next_unicode;
 551
 552   next = java_read_unicode (lex, &unicode_escape_p);
 553
 554   if (next == '\r')
 555     {
 556       /* We have to read ahead to see if we got \r\n.
 557          In that case we return a single line terminator.  */
 558       int dummy;
 559       next = java_read_unicode (lex, &dummy);
 560       if (next != '\n' && next != UEOF)
 561         lex->unget_value = next;
 562       /* In either case we must return a newline.  */
 563       next = '\n';
 564     }
 565
 566   lex->next_unicode = next;
 567   lex->avail_unicode = 1;
 568
 569   if (next == UEOF)
 570     {
 571       lex->next_columns = 0;
 572       return next;
 573     }
 574
 575   if (next == '\n')
 576     {
 577       lex->next_columns = 1 - lex->position.col;
 578     }
 579   else if (next == '\t')
 580     {
 581       int cur_col = lex->position.col;
 582       lex->next_columns = ((cur_col + 7) & ~7) + 1 - cur_col;
 583
 584     }
 585   else
 586     {
 587       lex->next_columns = 1;
 588     }
 589   if (unicode_escape_p)
 590     lex->next_columns = 6;
 591   return next;
 592 }
 593
 594 /* Move forward one Unicode character (post-Unicode-escape-handling).
 595    Only allowed after java_peek_unicode.  The combination java_peek_unicode
 596    followed by java_next_unicode is equivalent to java_get_unicode.  */
 597
 598 static void java_next_unicode (void)
 599 {
 600   struct java_lexer *lex = ctxp->lexer;
 601   lex->position.col += lex->next_columns;
 602   if (lex->next_unicode == '\n')
 603     {
 604       lex->position.line++;
 605 #ifndef JC1_LITE
 606 #ifdef USE_MAPPED_LOCATION
 607       input_location
 608         = linemap_line_start (&line_table, lex->position.line, 120);
 609 #else
 610       input_line = lex->position.line;
 611 #endif
 612 #endif
 613     }
 614   lex->avail_unicode = 0;
 615 }
 616
 617 #if 0
 618 /* The inverse of java_next_unicode.
 619    Not currently used, but could be if it would be cleaner or faster.
 620    java_peek_unicode == java_get_unicode + java_unget_unicode.
 621    java_get_unicode == java_peek_unicode + java_next_unicode.
 622 */
 623 static void java_unget_unicode ()
 624 {
 625   struct java_lexer *lex = ctxp->lexer;
 626   if (lex->avail_unicode)
 627     fatal_error ("internal error - bad unget");
 628   lex->avail_unicode = 1;
 629   lex->position.col -= lex->next_columns;
 630 }
 631 #endif
 632
 633 /* Parse the end of a C style comment.
 634  * C is the first character following the '/' and '*'.  */
 635 static void
 636 java_parse_end_comment (int c)
 637 {
 638   for ( ;; c = java_get_unicode ())
 639     {
 640       switch (c)
 641         {
 642         case UEOF:
 643           java_lex_error ("Comment not terminated at end of input", 0);
 644           return;
 645         case '*':
 646           switch (c = java_peek_unicode ())
 647             {
 648             case UEOF:
 649               java_lex_error ("Comment not terminated at end of input", 0);
 650               return;
 651             case '/':
 652               java_next_unicode ();
 653               return;
 654             case '*':   /* Reparse only '*'.  */
 655               ;
 656             }
 657         }
 658     }
 659 }
 660
 661 /* Parse the documentation section. Keywords must be at the beginning
 662    of a documentation comment line (ignoring white space and any `*'
 663    character). Parsed keyword(s): @DEPRECATED.  */
 664
 665 static void
 666 java_parse_doc_section (int c)
 667 {
 668   int last_was_star;
 669
 670   /* We reset this here, because only the most recent doc comment
 671      applies to the following declaration.  */
 672   ctxp->deprecated = 0;
 673
 674   /* We loop over all the lines of the comment.  We'll eventually exit
 675      if we hit EOF prematurely, or when we see the comment
 676      terminator.  */
 677   while (1)
 678     {
 679       /* These first steps need only be done if we're still looking
 680          for the deprecated tag.  If we've already seen it, we might
 681          as well skip looking for it again.  */
 682       if (! ctxp->deprecated)
 683         {
 684           /* Skip whitespace and '*'s.  We must also check for the end
 685              of the comment here.  */
 686           while (JAVA_WHITE_SPACE_P (c) || c == '*')
 687             {
 688               last_was_star = (c == '*');
 689               c = java_get_unicode ();
 690               if (last_was_star && c == '/')
 691                 {
 692                   /* We just saw the comment terminator.  */
 693                   return;
 694                 }
 695             }
 696
 697           if (c == UEOF)
 698             goto eof;
 699
 700           if (c == '@')
 701             {
 702               const char *deprecated = "@deprecated";
 703               int i;
 704
 705               for (i = 0; deprecated[i]; ++i)
 706                 {
 707                   if (c != deprecated[i])
 708                     break;
 709                   /* We write the code in this way, with the
 710                      update at the end, so that after the loop
 711                      we're left with the next character in C.  */
 712                   c = java_get_unicode ();
 713                 }
 714
 715               if (c == UEOF)
 716                 goto eof;
 717
 718               /* @deprecated must be followed by a space or newline.
 719                  We also allow a '*' in case it appears just before
 720                  the end of a comment.  In this position only we also
 721                  must allow any Unicode space character.  */
 722               if (c == ' ' || c == '\n' || c == '*' || java_space_char_p (c))
 723                 {
 724                   if (! deprecated[i])
 725                     ctxp->deprecated = 1;
 726                 }
 727             }
 728         }
 729
 730       /* We've examined the relevant content from this line.  Now we
 731          skip the remaining characters and start over with the next
 732          line.  We also check for end of comment here.  */
 733       while (c != '\n' && c != UEOF)
 734         {
 735           last_was_star = (c == '*');
 736           c = java_get_unicode ();
 737           if (last_was_star && c == '/')
 738             return;
 739         }
 740
 741       if (c == UEOF)
 742         goto eof;
 743       /* We have to advance past the \n.  */
 744       c = java_get_unicode ();
 745       if (c == UEOF)
 746         goto eof;
 747     }
 748
 749  eof:
 750   java_lex_error ("Comment not terminated at end of input", 0);
 751 }
 752
 753 /* Return true if C is a valid start character for a Java identifier.
 754    This is only called if C >= 128 -- smaller values are handled
 755    inline.  However, this function handles all values anyway.  */
 756 static int
 757 java_start_char_p (unicode_t c)
 758 {
 759   unsigned int hi = c / 256;
 760   const char *const page = type_table[hi];
 761   unsigned long val = (unsigned long) page;
 762   int flags;
 763
 764   if ((val & ~ LETTER_MASK) != 0)
 765     flags = page[c & 255];
 766   else
 767     flags = val;
 768
 769   return flags & LETTER_START;
 770 }
 771
 772 /* Return true if C is a valid part character for a Java identifier.
 773    This is only called if C >= 128 -- smaller values are handled
 774    inline.  However, this function handles all values anyway.  */
 775 static int
 776 java_part_char_p (unicode_t c)
 777 {
 778   unsigned int hi = c / 256;
 779   const char *const page = type_table[hi];
 780   unsigned long val = (unsigned long) page;
 781   int flags;
 782
 783   if ((val & ~ LETTER_MASK) != 0)
 784     flags = page[c & 255];
 785   else
 786     flags = val;
 787
 788   return flags & LETTER_PART;
 789 }
 790
 791 /* Return true if C is whitespace.  */
 792 static int
 793 java_space_char_p (unicode_t c)
 794 {
 795   unsigned int hi = c / 256;
 796   const char *const page = type_table[hi];
 797   unsigned long val = (unsigned long) page;
 798   int flags;
 799
 800   if ((val & ~ LETTER_MASK) != 0)
 801     flags = page[c & 255];
 802   else
 803     flags = val;
 804
 805   return flags & LETTER_SPACE;
 806 }
 807
 808 static int
 809 java_parse_escape_sequence (void)
 810 {
 811   int c;
 812
 813   switch (c = java_get_unicode ())
 814     {
 815     case 'b':
 816       return (unicode_t)0x8;
 817     case 't':
 818       return (unicode_t)0x9;
 819     case 'n':
 820       return (unicode_t)0xa;
 821     case 'f':
 822       return (unicode_t)0xc;
 823     case 'r':
 824       return (unicode_t)0xd;
 825     case '"':
 826       return (unicode_t)0x22;
 827     case '\'':
 828       return (unicode_t)0x27;
 829     case '\\':
 830       return (unicode_t)0x5c;
 831     case '0': case '1': case '2': case '3': case '4':
 832     case '5': case '6': case '7':
 833       {
 834         int more = 3;
 835         unicode_t char_lit = 0;
 836
 837         if (c > '3')
 838           {
 839             /* According to the grammar, `\477' has a well-defined
 840                meaning -- it is `\47' followed by `7'.  */
 841             --more;
 842           }
 843         char_lit = 0;
 844         for (;;)
 845           {
 846             char_lit = 8 * char_lit + c - '0';
 847             if (--more == 0)
 848               break;
 849             c = java_peek_unicode ();
 850             if (! RANGE (c, '0', '7'))
 851               break;
 852             java_next_unicode ();
 853           }
 854
 855         return char_lit;
 856       }
 857     default:
 858       java_lex_error ("Invalid character in escape sequence", -1);
 859       return JAVA_CHAR_ERROR;
 860     }
 861 }
 862
 863 #ifndef JC1_LITE
 864 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
 865
 866 /* Subroutine of java_lex: converts floating-point literals to tree
 867    nodes.  LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
 868    store the result.  FFLAG indicates whether the literal was tagged
 869    with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
 870    is the line number on which to report any error.  */
 871
 872 static void java_perform_atof (YYSTYPE *, char *, int, int);
 873
 874 static void
 875 java_perform_atof (YYSTYPE *java_lval, char *literal_token, int fflag,
 876                    int number_beginning)
 877 {
 878   REAL_VALUE_TYPE value;
 879   tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
 880
 881   SET_REAL_VALUE_ATOF (value,
 882                        REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
 883
 884   if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
 885     {
 886       JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
 887       value = DCONST0;
 888     }
 889   else if (IS_ZERO (value))
 890     {
 891       /* We check to see if the value is really 0 or if we've found an
 892          underflow.  We do this in the most primitive imaginable way.  */
 893       int really_zero = 1;
 894       char *p = literal_token;
 895       if (*p == '-')
 896         ++p;
 897       while (*p && *p != 'e' && *p != 'E')
 898         {
 899           if (*p != '0' && *p != '.')
 900             {
 901               really_zero = 0;
 902               break;
 903             }
 904           ++p;
 905         }
 906       if (! really_zero)
 907         {
 908           int save_col = ctxp->lexer->position.col;
 909           ctxp->lexer->position.col = number_beginning;
 910           java_lex_error ("Floating point literal underflow", 0);
 911           ctxp->lexer->position.col = save_col;
 912         }
 913     }
 914
 915   SET_LVAL_NODE (build_real (type, value));
 916 }
 917 #endif
 918
 919 static int yylex (YYSTYPE *);
 920
 921 static int
 922 #ifdef JC1_LITE
 923 yylex (YYSTYPE *java_lval)
 924 #else
 925 do_java_lex (YYSTYPE *java_lval)
 926 #endif
 927 {
 928   int c;
 929   char *string;
 930
 931   /* Translation of the Unicode escape in the raw stream of Unicode
 932      characters. Takes care of line terminator.  */
 933  step1:
 934   /* Skip white spaces: SP, TAB and FF or ULT.  */
 935   for (;;)
 936     {
 937       c = java_peek_unicode ();
 938       if (c != '\n' && ! JAVA_WHITE_SPACE_P (c))
 939         break;
 940       java_next_unicode ();
 941     }
 942
 943   /* Handle EOF here.  */
 944   if (c == UEOF)        /* Should probably do something here...  */
 945     return 0;
 946
 947 #ifndef JC1_LITE
 948 #ifdef USE_MAPPED_LOCATION
 949   LINEMAP_POSITION_FOR_COLUMN (input_location, &line_table,
 950                                ctxp->lexer->position.col);
 951 #else
 952   ctxp->lexer->token_start = ctxp->lexer->position;
 953 #endif
 954 #endif
 955
 956   /* Numeric literals.  */
 957   if (JAVA_ASCII_DIGIT (c) || (c == '.'))
 958     {
 959       /* This section of code is borrowed from gcc/c-lex.c.  */
 960 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
 961       int parts[TOTAL_PARTS];
 962       HOST_WIDE_INT high, low;
 963       /* End borrowed section.  */
 964       char literal_token [256];
 965       int  literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
 966       int  found_hex_digits = 0, found_non_octal_digits = -1;
 967       int  i;
 968 #ifndef JC1_LITE
 969       int  number_beginning = ctxp->lexer->position.col;
 970       tree value;
 971 #endif
 972
 973       for (i = 0; i < TOTAL_PARTS; i++)
 974         parts [i] = 0;
 975
 976       if (c == '0')
 977         {
 978           java_next_unicode ();
 979           c = java_peek_unicode ();
 980           if (c == 'x' || c == 'X')
 981             {
 982               radix = 16;
 983               java_next_unicode ();
 984               c = java_peek_unicode ();
 985             }
 986           else if (JAVA_ASCII_DIGIT (c))
 987             {
 988               literal_token [literal_index++] = '0';
 989               radix = 8;
 990             }
 991           else if (c == '.' || c == 'e' || c =='E')
 992             {
 993               literal_token [literal_index++] = '0';
 994               /* Handle C during floating-point parsing.  */
 995             }
 996           else
 997             {
 998               /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}.  */
 999               switch (c)
1000                 {
1001                 case 'L': case 'l':
1002                   java_next_unicode ();
1003                   SET_LVAL_NODE (long_zero_node);
1004                   return (INT_LIT_TK);
1005                 case 'f': case 'F':
1006                   java_next_unicode ();
1007                   SET_LVAL_NODE (float_zero_node);
1008                   return (FP_LIT_TK);
1009                 case 'd': case 'D':
1010                   java_next_unicode ();
1011                   SET_LVAL_NODE (double_zero_node);
1012                   return (FP_LIT_TK);
1013                 default:
1014                   SET_LVAL_NODE (integer_zero_node);
1015                   return (INT_LIT_TK);
1016                 }
1017             }
1018         }
1019       /* Parse the first part of the literal, until we find something
1020          which is not a number.  */
1021       while (radix == 16 ? JAVA_ASCII_HEXDIGIT (c) : JAVA_ASCII_DIGIT (c))
1022         {
1023           /* We store in a string (in case it turns out to be a FP) and in
1024              PARTS if we have to process a integer literal.  */
1025           int numeric = hex_value (c);
1026           int count;
1027
1028           /* Remember when we find a valid hexadecimal digit.  */
1029           if (radix == 16)
1030             found_hex_digits = 1;
1031           /* Remember when we find an invalid octal digit.  */
1032           else if (radix == 8 && numeric >= 8 && found_non_octal_digits < 0)
1033             found_non_octal_digits = literal_index;
1034
1035           literal_token [literal_index++] = c;
1036           /* This section of code if borrowed from gcc/c-lex.c.  */
1037           for (count = 0; count < TOTAL_PARTS; count++)
1038             {
1039               parts[count] *= radix;
1040               if (count)
1041                 {
1042                   parts[count]   += (parts[count-1] >> HOST_BITS_PER_CHAR);
1043                   parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1044                 }
1045               else
1046                 parts[0] += numeric;
1047             }
1048           if (parts [TOTAL_PARTS-1] != 0)
1049             overflow = 1;
1050           /* End borrowed section.  */
1051           java_next_unicode ();
1052           c = java_peek_unicode ();
1053         }
1054
1055       /* If we have something from the FP char set but not a digit, parse
1056          a FP literal.  */
1057       if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1058         {
1059           /* stage==0: seen digits only
1060            * stage==1: seen '.'
1061            * stage==2: seen 'e' or 'E'.
1062            * stage==3: seen '+' or '-' after 'e' or 'E'.
1063            * stage==4: seen type suffix ('f'/'F'/'d'/'D')
1064            */
1065           int stage = 0;
1066           int seen_digit = (literal_index ? 1 : 0);
1067           int seen_exponent = 0;
1068           int fflag = 0;        /* 1 for {f,F}, 0 for {d,D}. FP literal are
1069                                    double unless specified.  */
1070
1071           /* It is ok if the radix is 8 because this just means we've
1072              seen a leading `0'.  However, radix==16 is invalid.  */
1073           if (radix == 16)
1074             java_lex_error ("Can't express non-decimal FP literal", 0);
1075           radix = 10;
1076
1077           for (;;)
1078             {
1079               if (c == '.')
1080                 {
1081                   if (stage < 1)
1082                     {
1083                       stage = 1;
1084                       literal_token [literal_index++ ] = c;
1085                       java_next_unicode ();
1086                       c = java_peek_unicode ();
1087                       if (literal_index == 1 && !JAVA_ASCII_DIGIT (c))
1088                         BUILD_OPERATOR (DOT_TK);
1089                     }
1090                   else
1091                     java_lex_error ("Invalid character in FP literal", 0);
1092                 }
1093
1094               if (c == 'e' || c == 'E')
1095                 {
1096                   if (stage < 2)
1097                     {
1098                       /* {E,e} must have seen at least a digit.  */
1099                       if (!seen_digit)
1100                         java_lex_error
1101                           ("Invalid FP literal, mantissa must have digit", 0);
1102                       seen_digit = 0;
1103                       seen_exponent = 1;
1104                       stage = 2;
1105                       literal_token [literal_index++] = c;
1106                       java_next_unicode ();
1107                       c = java_peek_unicode ();
1108                     }
1109                   else
1110                     java_lex_error ("Invalid character in FP literal", 0);
1111                 }
1112               if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1113                 {
1114                   fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1115                   stage = 4;    /* So we fall through.  */
1116                 }
1117
1118               if ((c=='-' || c =='+') && stage == 2)
1119                 {
1120                   stage = 3;
1121                   literal_token [literal_index++] = c;
1122                   java_next_unicode ();
1123                   c = java_peek_unicode ();
1124                 }
1125
1126               if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1127                   (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1128                   (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1129                   (stage == 3 && JAVA_ASCII_DIGIT (c)))
1130                 {
1131                   if (JAVA_ASCII_DIGIT (c))
1132                     seen_digit = 1;
1133                   if (stage == 2)
1134                     stage = 3;
1135                   literal_token [literal_index++ ] = c;
1136                   java_next_unicode ();
1137                   c = java_peek_unicode ();
1138                 }
1139               else
1140                 {
1141                   if (stage == 4) /* Don't push back fF/dD.  */
1142                     java_next_unicode ();
1143
1144                   /* An exponent (if any) must have seen a digit.  */
1145                   if (seen_exponent && !seen_digit)
1146                     java_lex_error
1147                       ("Invalid FP literal, exponent must have digit", 0);
1148
1149                   literal_token [literal_index] = '\0';
1150
1151 #ifndef JC1_LITE
1152                   java_perform_atof (java_lval, literal_token,
1153                                      fflag, number_beginning);
1154 #endif
1155                   return FP_LIT_TK;
1156                 }
1157             }
1158         } /* JAVA_ASCII_FPCHAR (c) */
1159
1160       /* Here we get back to converting the integral literal.  */
1161       if (radix == 16 && ! found_hex_digits)
1162         java_lex_error
1163           ("0x must be followed by at least one hexadecimal digit", 0);
1164       else if (radix == 8 && found_non_octal_digits >= 0)
1165         {
1166           int back = literal_index - found_non_octal_digits;
1167           ctxp->lexer->position.col -= back;
1168           java_lex_error ("Octal literal contains digit out of range", 0);
1169           ctxp->lexer->position.col += back;
1170         }
1171       else if (c == 'L' || c == 'l')
1172         {
1173           java_next_unicode ();
1174           long_suffix = 1;
1175         }
1176
1177       /* This section of code is borrowed from gcc/c-lex.c.  */
1178       if (!overflow)
1179         {
1180           bytes = GET_TYPE_PRECISION (long_type_node);
1181           for (i = bytes; i < TOTAL_PARTS; i++)
1182             if (parts [i])
1183               {
1184                 overflow = 1;
1185                 break;
1186               }
1187         }
1188       high = low = 0;
1189       for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1190         {
1191           high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1192                                               / HOST_BITS_PER_CHAR)]
1193                    << (i * HOST_BITS_PER_CHAR));
1194           low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1195         }
1196       /* End borrowed section.  */
1197
1198 #ifndef JC1_LITE
1199       /* Range checking.  */
1200       /* Temporarily set type to unsigned.  */
1201       value = build_int_cst_wide (long_suffix
1202                                   ? unsigned_long_type_node
1203                                   : unsigned_int_type_node, low, high);
1204       SET_LVAL_NODE (value);
1205
1206       /* For base 10 numbers, only values up to the highest value
1207          (plus one) can be written.  For instance, only ints up to
1208          2147483648 can be written.  The special case of the largest
1209          negative value is handled elsewhere.  For other bases, any
1210          number can be represented.  */
1211       if (overflow || (radix == 10
1212                        && tree_int_cst_lt (long_suffix
1213                                            ? decimal_long_max
1214                                            : decimal_int_max,
1215                                            value)))
1216         {
1217           if (long_suffix)
1218             JAVA_RANGE_ERROR ("Numeric overflow for 'long' literal");
1219           else
1220             JAVA_RANGE_ERROR ("Numeric overflow for 'int' literal");
1221         }
1222
1223       /* Sign extend the value.  */
1224       value = build_int_cst_wide (long_suffix ? long_type_node : int_type_node,
1225                                   low, high);
1226       value = force_fit_type (value, 0, false, false);
1227
1228       if (radix != 10)
1229         {
1230           value = copy_node (value);
1231           JAVA_NOT_RADIX10_FLAG (value) = 1;
1232         }
1233
1234       SET_LVAL_NODE (value);
1235 #endif
1236       return INT_LIT_TK;
1237     }
1238
1239   /* We may have an ID here.  */
1240   if (JAVA_START_CHAR_P (c))
1241     {
1242       int ascii_index = 0, all_ascii = 1;
1243
1244       /* Keyword, boolean literal or null literal.  */
1245       while (c != UEOF && JAVA_PART_CHAR_P (c))
1246         {
1247           java_unicode_2_utf8 (c);
1248           if (c >= 128)
1249             all_ascii = 0;
1250           java_next_unicode ();
1251           ascii_index++;
1252           c = java_peek_unicode ();
1253         }
1254
1255       obstack_1grow (&temporary_obstack, '\0');
1256       string = obstack_finish (&temporary_obstack);
1257
1258       /* If we have something all ascii, we consider a keyword, a boolean
1259          literal, a null literal or an all ASCII identifier.  Otherwise,
1260          this is an identifier (possibly not respecting formation rule).  */
1261       if (all_ascii)
1262         {
1263           const struct java_keyword *kw;
1264           if ((kw=java_keyword (string, ascii_index)))
1265             {
1266               switch (kw->token)
1267                 {
1268                 case PUBLIC_TK:       case PROTECTED_TK: case STATIC_TK:
1269                 case ABSTRACT_TK:     case FINAL_TK:     case NATIVE_TK:
1270                 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1271                 case PRIVATE_TK:      case STRICT_TK:
1272                   SET_MODIFIER_CTX (kw->token);
1273                   return MODIFIER_TK;
1274                 case FLOAT_TK:
1275                   SET_LVAL_NODE (float_type_node);
1276                   return FP_TK;
1277                 case DOUBLE_TK:
1278                   SET_LVAL_NODE (double_type_node);
1279                   return FP_TK;
1280                 case BOOLEAN_TK:
1281                   SET_LVAL_NODE (boolean_type_node);
1282                   return BOOLEAN_TK;
1283                 case BYTE_TK:
1284                   SET_LVAL_NODE (byte_type_node);
1285                   return INTEGRAL_TK;
1286                 case SHORT_TK:
1287                   SET_LVAL_NODE (short_type_node);
1288                   return INTEGRAL_TK;
1289                 case INT_TK:
1290                   SET_LVAL_NODE (int_type_node);
1291                   return INTEGRAL_TK;
1292                 case LONG_TK:
1293                   SET_LVAL_NODE (long_type_node);
1294                   return INTEGRAL_TK;
1295                 case CHAR_TK:
1296                   SET_LVAL_NODE (char_type_node);
1297                   return INTEGRAL_TK;
1298
1299                   /* Keyword based literals.  */
1300                 case TRUE_TK:
1301                 case FALSE_TK:
1302                   SET_LVAL_NODE ((kw->token == TRUE_TK ?
1303                                   boolean_true_node : boolean_false_node));
1304                   return BOOL_LIT_TK;
1305                 case NULL_TK:
1306                   SET_LVAL_NODE (null_pointer_node);
1307                   return NULL_TK;
1308
1309                 case ASSERT_TK:
1310                   if (flag_assert)
1311                     {
1312                       BUILD_OPERATOR (kw->token);
1313                       return kw->token;
1314                     }
1315                   else
1316                     break;
1317
1318                   /* Some keyword we want to retain information on the location
1319                      they where found.  */
1320                 case CASE_TK:
1321                 case DEFAULT_TK:
1322                 case SUPER_TK:
1323                 case THIS_TK:
1324                 case RETURN_TK:
1325                 case BREAK_TK:
1326                 case CONTINUE_TK:
1327                 case TRY_TK:
1328                 case CATCH_TK:
1329                 case THROW_TK:
1330                 case INSTANCEOF_TK:
1331                   BUILD_OPERATOR (kw->token);
1332
1333                 default:
1334                   return kw->token;
1335                 }
1336             }
1337         }
1338
1339       java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1340       return ID_TK;
1341     }
1342
1343   java_next_unicode ();
1344
1345   /* Character literals.  */
1346   if (c == '\'')
1347     {
1348       int char_lit;
1349
1350       if ((c = java_get_unicode ()) == '\\')
1351         char_lit = java_parse_escape_sequence ();
1352       else
1353         {
1354           if (c == '\n' || c == '\'')
1355             java_lex_error ("Invalid character literal", 0);
1356           char_lit = c;
1357         }
1358
1359       c = java_get_unicode ();
1360
1361       if ((c == '\n') || (c == UEOF))
1362         java_lex_error ("Character literal not terminated at end of line", 0);
1363       if (c != '\'')
1364         java_lex_error ("Syntax error in character literal", 0);
1365
1366       if (char_lit == JAVA_CHAR_ERROR)
1367         char_lit = 0;           /* We silently convert it to zero.  */
1368
1369       SET_LVAL_NODE (build_int_cst (char_type_node, char_lit));
1370       return CHAR_LIT_TK;
1371     }
1372
1373   /* String literals.  */
1374   if (c == '"')
1375     {
1376       int no_error = 1;
1377       char *string;
1378
1379       for (;;)
1380         {
1381           c = java_peek_unicode ();
1382           if (c == '\n' || c == UEOF) /* ULT.  */
1383             {
1384               java_lex_error ("String not terminated at end of line", 0);
1385               break;
1386             }
1387           java_next_unicode ();
1388           if (c == '"')
1389             break;
1390           if (c == '\\')
1391             c = java_parse_escape_sequence ();
1392           if (c == JAVA_CHAR_ERROR)
1393             {
1394               no_error = 0;
1395               c = 0;            /* We silently convert it to zero.  */
1396             }
1397           java_unicode_2_utf8 (c);
1398         }
1399
1400       obstack_1grow (&temporary_obstack, '\0');
1401       string = obstack_finish (&temporary_obstack);
1402 #ifndef JC1_LITE
1403       if (!no_error || (c != '"'))
1404         java_lval->node = error_mark_node; /* FIXME: Requires further
1405                                               testing.  */
1406       else
1407         java_lval->node = build_string (strlen (string), string);
1408 #endif
1409       obstack_free (&temporary_obstack, string);
1410       return STRING_LIT_TK;
1411     }
1412
1413   switch (c)
1414     {
1415     case '/':
1416       /* Check for comment.  */
1417       switch (c = java_peek_unicode ())
1418         {
1419         case '/':
1420           java_next_unicode ();
1421           for (;;)
1422             {
1423               c = java_get_unicode ();
1424               if (c == UEOF)
1425                 {
1426                   /* It is ok to end a `//' comment with EOF, unless
1427                      we're being pedantic.  */
1428                   if (pedantic)
1429                     java_lex_error ("Comment not terminated at end of input",
1430                                     0);
1431                   return 0;
1432                 }
1433               if (c == '\n')    /* ULT */
1434                 goto step1;
1435             }
1436           break;
1437
1438         case '*':
1439           java_next_unicode ();
1440           if ((c = java_get_unicode ()) == '*')
1441             {
1442               c = java_get_unicode ();
1443               if (c == '/')
1444                 {
1445                   /* Empty documentation comment.  We have to reset
1446                      the deprecation marker as only the most recent
1447                      doc comment applies.  */
1448                   ctxp->deprecated = 0;
1449                 }
1450               else
1451                 java_parse_doc_section (c);
1452             }
1453           else
1454             java_parse_end_comment ((c = java_get_unicode ()));
1455           goto step1;
1456           break;
1457
1458         case '=':
1459           java_next_unicode ();
1460           BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1461
1462         default:
1463           BUILD_OPERATOR (DIV_TK);
1464         }
1465
1466     case '(':
1467       BUILD_OPERATOR (OP_TK);
1468     case ')':
1469       return CP_TK;
1470     case '{':
1471 #ifndef JC1_LITE
1472       java_lval->operator.token = OCB_TK;
1473       java_lval->operator.location = BUILD_LOCATION();
1474 #endif
1475 #ifdef USE_MAPPED_LOCATION
1476       if (ctxp->ccb_indent == 1)
1477         ctxp->first_ccb_indent1 = input_location;
1478 #else
1479       if (ctxp->ccb_indent == 1)
1480         ctxp->first_ccb_indent1 = input_line;
1481 #endif
1482       ctxp->ccb_indent++;
1483       return OCB_TK;
1484     case '}':
1485 #ifndef JC1_LITE
1486       java_lval->operator.token = CCB_TK;
1487       java_lval->operator.location = BUILD_LOCATION();
1488 #endif
1489       ctxp->ccb_indent--;
1490 #ifdef USE_MAPPED_LOCATION
1491       if (ctxp->ccb_indent == 1)
1492         ctxp->last_ccb_indent1 = input_location;
1493 #else
1494       if (ctxp->ccb_indent == 1)
1495         ctxp->last_ccb_indent1 = input_line;
1496 #endif
1497       return CCB_TK;
1498     case '[':
1499       BUILD_OPERATOR (OSB_TK);
1500     case ']':
1501       return CSB_TK;
1502     case ';':
1503       return SC_TK;
1504     case ',':
1505       return C_TK;
1506     case '.':
1507       BUILD_OPERATOR (DOT_TK);
1508
1509       /* Operators.  */
1510     case '=':
1511       c = java_peek_unicode ();
1512       if (c == '=')
1513         {
1514           java_next_unicode ();
1515           BUILD_OPERATOR (EQ_TK);
1516         }
1517       else
1518         {
1519           /* Equals is used in two different locations. In the
1520              variable_declarator: rule, it has to be seen as '=' as opposed
1521              to being seen as an ordinary assignment operator in
1522              assignment_operators: rule.  */
1523           BUILD_OPERATOR (ASSIGN_TK);
1524         }
1525
1526     case '>':
1527       switch ((c = java_peek_unicode ()))
1528         {
1529         case '=':
1530           java_next_unicode ();
1531           BUILD_OPERATOR (GTE_TK);
1532         case '>':
1533           java_next_unicode ();
1534           switch ((c = java_peek_unicode ()))
1535             {
1536             case '>':
1537               java_next_unicode ();
1538               c = java_peek_unicode ();
1539               if (c == '=')
1540                 {
1541                   java_next_unicode ();
1542                   BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1543                 }
1544               else
1545                 {
1546                   BUILD_OPERATOR (ZRS_TK);
1547                 }
1548             case '=':
1549               java_next_unicode ();
1550               BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1551             default:
1552               BUILD_OPERATOR (SRS_TK);
1553             }
1554         default:
1555           BUILD_OPERATOR (GT_TK);
1556         }
1557
1558     case '<':
1559       switch ((c = java_peek_unicode ()))
1560         {
1561         case '=':
1562           java_next_unicode ();
1563           BUILD_OPERATOR (LTE_TK);
1564         case '<':
1565           java_next_unicode ();
1566           if ((c = java_peek_unicode ()) == '=')
1567             {
1568               java_next_unicode ();
1569               BUILD_OPERATOR2 (LS_ASSIGN_TK);
1570             }
1571           else
1572             {
1573               BUILD_OPERATOR (LS_TK);
1574             }
1575         default:
1576           BUILD_OPERATOR (LT_TK);
1577         }
1578
1579     case '&':
1580       switch ((c = java_peek_unicode ()))
1581         {
1582         case '&':
1583           java_next_unicode ();
1584           BUILD_OPERATOR (BOOL_AND_TK);
1585         case '=':
1586           java_next_unicode ();
1587           BUILD_OPERATOR2 (AND_ASSIGN_TK);
1588         default:
1589           BUILD_OPERATOR (AND_TK);
1590         }
1591
1592     case '|':
1593       switch ((c = java_peek_unicode ()))
1594         {
1595         case '|':
1596           java_next_unicode ();
1597           BUILD_OPERATOR (BOOL_OR_TK);
1598         case '=':
1599           java_next_unicode ();
1600           BUILD_OPERATOR2 (OR_ASSIGN_TK);
1601         default:
1602           BUILD_OPERATOR (OR_TK);
1603         }
1604
1605     case '+':
1606       switch ((c = java_peek_unicode ()))
1607         {
1608         case '+':
1609           java_next_unicode ();
1610           BUILD_OPERATOR (INCR_TK);
1611         case '=':
1612           java_next_unicode ();
1613           BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1614         default:
1615           BUILD_OPERATOR (PLUS_TK);
1616         }
1617
1618     case '-':
1619       switch ((c = java_peek_unicode ()))
1620         {
1621         case '-':
1622           java_next_unicode ();
1623           BUILD_OPERATOR (DECR_TK);
1624         case '=':
1625           java_next_unicode ();
1626           BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1627         default:
1628           BUILD_OPERATOR (MINUS_TK);
1629         }
1630
1631     case '*':
1632       if ((c = java_peek_unicode ()) == '=')
1633         {
1634           java_next_unicode ();
1635           BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1636         }
1637       else
1638         {
1639           BUILD_OPERATOR (MULT_TK);
1640         }
1641
1642     case '^':
1643       if ((c = java_peek_unicode ()) == '=')
1644         {
1645           java_next_unicode ();
1646           BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1647         }
1648       else
1649         {
1650           BUILD_OPERATOR (XOR_TK);
1651         }
1652
1653     case '%':
1654       if ((c = java_peek_unicode ()) == '=')
1655         {
1656           java_next_unicode ();
1657           BUILD_OPERATOR2 (REM_ASSIGN_TK);
1658         }
1659       else
1660         {
1661           BUILD_OPERATOR (REM_TK);
1662         }
1663
1664     case '!':
1665       if ((c = java_peek_unicode()) == '=')
1666         {
1667           java_next_unicode ();
1668           BUILD_OPERATOR (NEQ_TK);
1669         }
1670       else
1671         {
1672           BUILD_OPERATOR (NEG_TK);
1673         }
1674
1675     case '?':
1676       BUILD_OPERATOR (REL_QM_TK);
1677     case ':':
1678       BUILD_OPERATOR (REL_CL_TK);
1679     case '~':
1680       BUILD_OPERATOR (NOT_TK);
1681     }
1682
1683   if (c == 0x1a)                /* CTRL-Z.  */
1684     {
1685       if ((c = java_peek_unicode ()) == UEOF)
1686         return 0;               /* Ok here.  */
1687     }
1688
1689   /* Everything else is an invalid character in the input.  */
1690   {
1691     char lex_error_buffer [128];
1692     sprintf (lex_error_buffer, "Invalid character '%s' in input",
1693              java_sprint_unicode (c));
1694     java_lex_error (lex_error_buffer, -1);
1695   }
1696   return 0;
1697 }
1698
1699 #ifndef JC1_LITE
1700
1701 /* The exported interface to the lexer.  */
1702 static int
1703 java_lex (YYSTYPE *java_lval)
1704 {
1705   int r;
1706
1707   timevar_push (TV_LEX);
1708   r = do_java_lex (java_lval);
1709   timevar_pop (TV_LEX);
1710   return r;
1711 }
1712
1713 /* This is called by the parser to see if an error should be generated
1714    due to numeric overflow.  This function only handles the particular
1715    case of the largest negative value, and is only called in the case
1716    where this value is not preceded by `-'.  */
1717 static void
1718 error_if_numeric_overflow (tree value)
1719 {
1720   if (TREE_CODE (value) == INTEGER_CST
1721       && !JAVA_NOT_RADIX10_FLAG (value)
1722       && tree_int_cst_sgn (value) < 0)
1723     {
1724       if (TREE_TYPE (value) == long_type_node)
1725         java_lex_error ("Numeric overflow for 'long' literal", 0);
1726       else
1727         java_lex_error ("Numeric overflow for 'int' literal", 0);
1728     }
1729 }
1730
1731 #endif /* JC1_LITE */
1732
1733 static void
1734 java_unicode_2_utf8 (unicode_t unicode)
1735 {
1736   if (RANGE (unicode, 0x01, 0x7f))
1737     obstack_1grow (&temporary_obstack, (char)unicode);
1738   else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
1739     {
1740       obstack_1grow (&temporary_obstack,
1741                      (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1742       obstack_1grow (&temporary_obstack,
1743                      (unsigned char)(0x80 | (unicode & 0x3f)));
1744     }
1745   else                          /* Range 0x800-0xffff.  */
1746     {
1747       obstack_1grow (&temporary_obstack,
1748                      (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1749       obstack_1grow (&temporary_obstack,
1750                      (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1751       obstack_1grow (&temporary_obstack,
1752                      (unsigned char)(0x80 | (unicode & 0x003f)));
1753     }
1754 }
1755
1756 #ifndef JC1_LITE
1757 static tree
1758 build_wfl_node (tree node)
1759 {
1760 #ifdef USE_MAPPED_LOCATION
1761   node = build_expr_wfl (node, input_location);
1762 #else
1763   node = build_expr_wfl (node, ctxp->filename,
1764                          ctxp->lexer->token_start.line,
1765                          ctxp->lexer->token_start.col);
1766 #endif
1767   /* Prevent java_complete_lhs from short-circuiting node (if constant).  */
1768   TREE_TYPE (node) = NULL_TREE;
1769   return node;
1770 }
1771 #endif
1772
1773 static void
1774 java_lex_error (const char *msg ATTRIBUTE_UNUSED, int forward ATTRIBUTE_UNUSED)
1775 {
1776 #ifndef JC1_LITE
1777   int col = (ctxp->lexer->position.col
1778              + forward * ctxp->lexer->next_columns);
1779 #if USE_MAPPED_LOCATION
1780   source_location save_location = input_location;
1781   LINEMAP_POSITION_FOR_COLUMN (input_location, &line_table, col);
1782
1783   /* Might be caught in the middle of some error report.  */
1784   ctxp->java_error_flag = 0;
1785   java_error (NULL);
1786   java_error (msg);
1787   input_location = save_location;
1788 #else
1789   java_lc save = ctxp->lexer->token_start;
1790   ctxp->lexer->token_start.line = ctxp->lexer->position.line;
1791   ctxp->lexer->token_start.col = col;
1792
1793   /* Might be caught in the middle of some error report.  */
1794   ctxp->java_error_flag = 0;
1795   java_error (NULL);
1796   java_error (msg);
1797   ctxp->lexer->token_start = save;
1798 #endif
1799 #endif
1800 }
1801
1802 #ifndef JC1_LITE
/* Return 1 if C, a character just read from FP, ends a line, and 0
   otherwise.  Both CR and LF end a line.  After a CR one more
   character is read: an LF is consumed as part of the terminator,
   EOF is dropped, and anything else is pushed back onto FP.  */
static int
java_is_eol (FILE *fp, int c)
{
  if (c == '\n')
    return 1;

  if (c == '\r')
    {
      int following = getc (fp);

      if (following != '\n' && following != EOF)
        ungetc (following, fp);
      return 1;
    }

  return 0;
}
1820 #endif
1821
1822 char *
1823 java_get_line_col (const char *filename ATTRIBUTE_UNUSED,
1824                    int line ATTRIBUTE_UNUSED, int col ATTRIBUTE_UNUSED)
1825 {
1826 #ifdef JC1_LITE
1827   return 0;
1828 #else
1829   /* Dumb implementation. Doesn't try to cache or optimize things.  */
1830   /* First line of the file is line 1, first column is 1.  */
1831
1832   /* COL == -1 means, at the CR/LF in LINE.  */
1833   /* COL == -2 means, at the first non space char in LINE.  */
1834
1835   FILE *fp;
1836   int c, ccol, cline = 1;
1837   int current_line_col = 0;
1838   int first_non_space = 0;
1839   char *base;
1840
1841   if (!(fp = fopen (filename, "r")))
1842     fatal_error ("can't open %s: %m", filename);
1843
1844   while (cline != line)
1845     {
1846       c = getc (fp);
1847       if (c == EOF)
1848         {
1849           static const char msg[] = "<<file too short - unexpected EOF>>";
1850           obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1851           goto have_line;
1852         }
1853       if (java_is_eol (fp, c))
1854         cline++;
1855     }
1856
1857   /* Gather the chars of the current line in a buffer.  */
1858   for (;;)
1859     {
1860       c = getc (fp);
1861       if (c < 0 || java_is_eol (fp, c))
1862         break;
1863       if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1864         first_non_space = current_line_col;
1865       obstack_1grow (&temporary_obstack, c);
1866       current_line_col++;
1867     }
1868  have_line:
1869
1870   obstack_1grow (&temporary_obstack, '\n');
1871
1872   if (col == -1)
1873     {
1874       col = current_line_col;
1875       first_non_space = 0;
1876     }
1877   else if (col == -2)
1878     col = first_non_space;
1879   else
1880     first_non_space = 0;
1881
1882   /* Place the '^' a the right position.  */
1883   base = obstack_base (&temporary_obstack);
1884   for (col += 2, ccol = 0; ccol < col; ccol++)
1885     {
1886       /* Compute \t when reaching first_non_space.  */
1887       char c = (first_non_space ?
1888                 (base [ccol] == '\t' ? '\t' : ' ') : ' ');
1889       obstack_1grow (&temporary_obstack, c);
1890     }
1891   obstack_grow0 (&temporary_obstack, "^", 1);
1892
1893   fclose (fp);
1894   return obstack_finish (&temporary_obstack);
1895 #endif
1896 }
1897
1898 #ifndef JC1_LITE
/* Compare the LENGTH bytes at STR, decoded one character at a time
   with UTF8_GET, against the NUL-terminated string NAME.  Return the
   difference between the first pair of characters that differ, 0 when
   every character of NAME matched and STR was consumed entirely, and 1
   when NAME matched but bytes of STR remain.  */
static int
utf8_cmp (const unsigned char *str, int length, const char *name)
{
  const unsigned char *end = str + length;
  const char *p;

  for (p = name; *p != '\0'; p++)
    {
      int decoded = UTF8_GET (str, end);
      int diff = decoded - *p;

      if (diff != 0)
        return diff;
    }

  /* NAME is exhausted; anything left in STR makes STR the greater.  */
  if (str != end)
    return 1;
  return 0;
}
1914
/* The C++ keywords, including the GNU `__'-prefixed spellings and the
   alternative operator names.  The entries are kept in ascending byte
   order because cxx_keyword_p looks names up with a binary search; any
   new entry must be inserted at its sorted position.  */

static const char *const cxx_keywords[] =
{
  "_Complex",
  "__alignof", "__alignof__", "__asm", "__asm__",
  "__attribute", "__attribute__", "__builtin_va_arg",
  "__complex", "__complex__", "__const", "__const__",
  "__extension__",
  "__imag", "__imag__", "__inline", "__inline__",
  "__label__", "__null",
  "__real", "__real__", "__restrict", "__restrict__",
  "__signed", "__signed__",
  "__typeof", "__typeof__",
  "__volatile", "__volatile__",
  "and", "and_eq", "asm", "auto",
  "bitand", "bitor", "bool", "break",
  "case", "catch", "char", "class", "compl", "const", "const_cast", "continue",
  "default", "delete", "do", "double", "dynamic_cast",
  "else", "enum", "explicit", "export", "extern",
  "false", "float", "for", "friend",
  "goto",
  "if", "inline", "int",
  "long",
  "mutable",
  "namespace", "new", "not", "not_eq",
  "operator", "or", "or_eq",
  "private", "protected", "public",
  "register", "reinterpret_cast", "return",
  "short", "signed", "sizeof", "static", "static_cast", "struct", "switch",
  "template", "this", "throw", "true", "try",
  "typedef", "typeid", "typename", "typeof",
  "union", "unsigned", "using",
  "virtual", "void", "volatile",
  "wchar_t", "while",
  "xor", "xor_eq"
};
2024
2025 /* Return true if NAME is a C++ keyword.  */
2026
2027 int
2028 cxx_keyword_p (const char *name, int length)
2029 {
2030   int last = ARRAY_SIZE (cxx_keywords);
2031   int first = 0;
2032   int mid = (last + first) / 2;
2033   int old = -1;
2034
2035   for (mid = (last + first) / 2;
2036        mid != old;
2037        old = mid, mid = (last + first) / 2)
2038     {
2039       int kwl = strlen (cxx_keywords[mid]);
2040       int min_length = kwl > length ? length : kwl;
2041       int r = utf8_cmp ((const unsigned char *) name, min_length, cxx_keywords[mid]);
2042
2043       if (r == 0)
2044         {
2045           int i;
2046           /* We've found a match if all the remaining characters are `$'.  */
2047           for (i = min_length; i < length && name[i] == '$'; ++i)
2048             ;
2049           if (i == length)
2050             return 1;
2051           r = 1;
2052         }
2053
2054       if (r < 0)
2055         last = mid;
2056       else
2057         first = mid;
2058     }
2059   return 0;
2060 }
2061 #endif /* JC1_LITE */