gcc/java/lex.c

   1 /* Language lexer for the GNU compiler for the Java(TM) language.
   2    Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   3    Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
   4
   5 This file is part of GNU CC.
   6
   7 GNU CC is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU CC is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU CC; see the file COPYING.  If not, write to
  19 the Free Software Foundation, 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.
  21
  22 Java and all Java-based marks are trademarks or registered trademarks
  23 of Sun Microsystems, Inc. in the United States and other countries.
  24 The Free Software Foundation is independent of Sun Microsystems, Inc.  */
  25
  26 /* It defines java_lex (yylex) that reads a Java ASCII source file
  27    possibly containing Unicode escape sequence or utf8 encoded
  28    characters and returns a token for everything found but comments,
  29    white spaces and line terminators. When necessary, it also fills
  30    the java_lval (yylval) union. It's implemented to be called by a
  31    re-entrant parser generated by Bison.
  32
  33    The lexical analysis conforms to the Java grammar described in "The
  34    Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
  35    Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
  36
  37 #include "keyword.h"
  38 #include "flags.h"
  39 #include "chartables.h"
  40
  41 /* Function declarations.  */
  42 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
  43 static void java_unicode_2_utf8 PARAMS ((unicode_t));
  44 static void java_lex_error PARAMS ((const char *, int));
  45 #ifndef JC1_LITE
  46 static int java_is_eol PARAMS ((FILE *, int));
  47 static tree build_wfl_node PARAMS ((tree));
  48 #endif
  49 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
  50 static int java_parse_escape_sequence PARAMS ((void));
  51 static int java_start_char_p PARAMS ((unicode_t));
  52 static int java_part_char_p PARAMS ((unicode_t));
  53 static int java_parse_doc_section PARAMS ((int));
  54 static void java_parse_end_comment PARAMS ((int));
  55 static int java_get_unicode PARAMS ((void));
  56 static int java_read_unicode PARAMS ((java_lexer *, int *));
  57 static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
  58                                                              int *));
  59 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
  60 static int java_read_char PARAMS ((java_lexer *));
  61 static void java_allocate_new_line PARAMS ((void));
  62 static void java_unget_unicode PARAMS ((void));
  63 static unicode_t java_sneak_unicode PARAMS ((void));
  64 #ifndef JC1_LITE
  65 static int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
  66 #endif
  67
  68 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
  69 #ifndef JC1_LITE
  70 static void error_if_numeric_overflow PARAMS ((tree));
  71 #endif
  72
  73 #ifdef HAVE_ICONV
  74 /* This is nonzero if we have initialized `need_byteswap'.  */
  75 static int byteswap_init = 0;
  76
  77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
  78    big-endian order -- not native endian order.  We handle this by
  79    doing a conversion once at startup and seeing what happens.  This
  80    flag holds the results of this determination.  */
  81 static int need_byteswap = 0;
  82 #endif
  83
  84 void
  85 java_init_lex (finput, encoding)
  86      FILE *finput;
  87      const char *encoding;
  88 {
  89 #ifndef JC1_LITE
  90   int java_lang_imported = 0;
  91
  92   if (!java_lang_id)
  93     java_lang_id = get_identifier ("java.lang");
  94   if (!java_lang_cloneable)
  95     java_lang_cloneable = get_identifier ("java.lang.Cloneable");
  96   if (!java_io_serializable)
  97     java_io_serializable = get_identifier ("java.io.Serializable");
  98   if (!inst_id)
  99     inst_id = get_identifier ("inst$");
 100   if (!wpv_id)
 101     wpv_id = get_identifier ("write_parm_value$");
 102
 103   if (!java_lang_imported)
 104     {
 105       tree node = build_tree_list
 106         (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
 107       read_import_dir (TREE_PURPOSE (node));
 108       TREE_CHAIN (node) = ctxp->import_demand_list;
 109       ctxp->import_demand_list = node;
 110       java_lang_imported = 1;
 111     }
 112
 113   if (!wfl_operator)
 114     wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
 115   if (!label_id)
 116     label_id = get_identifier ("$L");
 117   if (!wfl_append)
 118     wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
 119   if (!wfl_string_buffer)
 120     wfl_string_buffer =
 121       build_expr_wfl (get_identifier (flag_emit_class_files
 122                                       ? "java.lang.StringBuffer"
 123                                       : "gnu.gcj.runtime.StringBuffer"),
 124                       NULL, 0, 0);
 125   if (!wfl_to_string)
 126     wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
 127
 128   CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
 129     CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
 130
 131   memset ((PTR) ctxp->modifier_ctx, 0, 11*sizeof (ctxp->modifier_ctx[0]));
 132   memset ((PTR) current_jcf, 0, sizeof (JCF));
 133   ctxp->current_parsed_class = NULL;
 134   ctxp->package = NULL_TREE;
 135 #endif
 136
 137   ctxp->filename = input_filename;
 138   ctxp->lineno = lineno = 0;
 139   ctxp->p_line = NULL;
 140   ctxp->c_line = NULL;
 141   ctxp->java_error_flag = 0;
 142   ctxp->lexer = java_new_lexer (finput, encoding);
 143 }
 144
 145 static char *
 146 java_sprint_unicode (line, i)
 147     struct java_line *line;
 148     int i;
 149 {
 150   static char buffer [10];
 151   if (line->unicode_escape_p [i] || line->line [i] > 128)
 152     sprintf (buffer, "\\u%04x", line->line [i]);
 153   else
 154     {
 155       buffer [0] = line->line [i];
 156       buffer [1] = '\0';
 157     }
 158   return buffer;
 159 }
 160
 161 static unicode_t
 162 java_sneak_unicode ()
 163 {
 164   return (ctxp->c_line->line [ctxp->c_line->current]);
 165 }
 166
 167 static void
 168 java_unget_unicode ()
 169 {
 170   if (!ctxp->c_line->current)
 171     /* Can't unget unicode.  */
 172     abort ();
 173
 174   ctxp->c_line->current--;
 175   ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
 176 }
 177
 178 static void
 179 java_allocate_new_line ()
 180 {
 181   unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
 182   char ahead_escape_p = (ctxp->c_line ?
 183                          ctxp->c_line->unicode_escape_ahead_p : 0);
 184
 185   if (ctxp->c_line && !ctxp->c_line->white_space_only)
 186     {
 187       if (ctxp->p_line)
 188         {
 189           free (ctxp->p_line->unicode_escape_p);
 190           free (ctxp->p_line->line);
 191           free (ctxp->p_line);
 192         }
 193       ctxp->p_line = ctxp->c_line;
 194       ctxp->c_line = NULL;              /* Reallocated.  */
 195     }
 196
 197   if (!ctxp->c_line)
 198     {
 199       ctxp->c_line = (struct java_line *)xmalloc (sizeof (struct java_line));
 200       ctxp->c_line->max = JAVA_LINE_MAX;
 201       ctxp->c_line->line = (unicode_t *)xmalloc
 202         (sizeof (unicode_t)*ctxp->c_line->max);
 203       ctxp->c_line->unicode_escape_p =
 204           (char *)xmalloc (sizeof (char)*ctxp->c_line->max);
 205       ctxp->c_line->white_space_only = 0;
 206     }
 207
 208   ctxp->c_line->line [0] = ctxp->c_line->size = 0;
 209   ctxp->c_line->char_col = ctxp->c_line->current = 0;
 210   if (ahead)
 211     {
 212       ctxp->c_line->line [ctxp->c_line->size] = ahead;
 213       ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
 214       ctxp->c_line->size++;
 215     }
 216   ctxp->c_line->ahead [0] = 0;
 217   ctxp->c_line->unicode_escape_ahead_p = 0;
 218   ctxp->c_line->lineno = ++lineno;
 219   ctxp->c_line->white_space_only = 1;
 220 }
 221
 222 /* Create a new lexer object.  */
 223
 224 java_lexer *
 225 java_new_lexer (finput, encoding)
 226      FILE *finput;
 227      const char *encoding;
 228 {
 229   java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
 230   int enc_error = 0;
 231
 232   lex->finput = finput;
 233   lex->bs_count = 0;
 234   lex->unget_value = 0;
 235   lex->hit_eof = 0;
 236
 237 #ifdef HAVE_ICONV
 238   lex->handle = iconv_open ("UCS-2", encoding);
 239   if (lex->handle != (iconv_t) -1)
 240     {
 241       lex->first = -1;
 242       lex->last = -1;
 243       lex->out_first = -1;
 244       lex->out_last = -1;
 245       lex->read_anything = 0;
 246       lex->use_fallback = 0;
 247
 248       /* Work around broken iconv() implementations by doing checking at
 249          runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
 250          then all UCS-2 encoders will be broken.  Perhaps not a valid
 251          assumption.  */
 252       if (! byteswap_init)
 253         {
 254           iconv_t handle;
 255
 256           byteswap_init = 1;
 257
 258           handle = iconv_open ("UCS-2", "UTF-8");
 259           if (handle != (iconv_t) -1)
 260             {
 261               unicode_t result;
 262               unsigned char in[3];
 263               char *inp, *outp;
 264               size_t inc, outc, r;
 265
 266               /* This is the UTF-8 encoding of \ufeff.  */
 267               in[0] = 0xef;
 268               in[1] = 0xbb;
 269               in[2] = 0xbf;
 270
 271               inp = in;
 272               inc = 3;
 273               outp = (char *) &result;
 274               outc = 2;
 275
 276               r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
 277                          &outp, &outc);
 278               iconv_close (handle);
 279               /* Conversion must be complete for us to use the result.  */
 280               if (r != (size_t) -1 && inc == 0 && outc == 0)
 281                 need_byteswap = (result != 0xfeff);
 282             }
 283         }
 284
 285       lex->byte_swap = need_byteswap;
 286     }
 287   else
 288 #endif /* HAVE_ICONV */
 289     {
 290       /* If iconv failed, use the internal decoder if the default
 291          encoding was requested.  This code is used on platforms where
 292          iconv exists but is insufficient for our needs.  For
 293          instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.  */
 294       if (strcmp (encoding, DEFAULT_ENCODING))
 295         enc_error = 1;
 296 #ifdef HAVE_ICONV
 297       else
 298         lex->use_fallback = 1;
 299 #endif /* HAVE_ICONV */
 300     }
 301
 302   if (enc_error)
 303     fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation.  If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
 304
 305   return lex;
 306 }
 307
 308 void
 309 java_destroy_lexer (lex)
 310      java_lexer *lex;
 311 {
 312 #ifdef HAVE_ICONV
 313   if (! lex->use_fallback)
 314     iconv_close (lex->handle);
 315 #endif
 316   free (lex);
 317 }
 318
 319 static int
 320 java_read_char (lex)
 321      java_lexer *lex;
 322 {
 323   if (lex->unget_value)
 324     {
 325       unicode_t r = lex->unget_value;
 326       lex->unget_value = 0;
 327       return r;
 328     }
 329
 330 #ifdef HAVE_ICONV
 331   if (! lex->use_fallback)
 332     {
 333       size_t ir, inbytesleft, in_save, out_count, out_save;
 334       char *inp, *outp;
 335       unicode_t result;
 336
 337       /* If there is data which has already been converted, use it.  */
 338       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 339         {
 340           lex->out_first = 0;
 341           lex->out_last = 0;
 342
 343           while (1)
 344             {
 345               /* See if we need to read more data.  If FIRST == 0 then
 346                  the previous conversion attempt ended in the middle of
 347                  a character at the end of the buffer.  Otherwise we
 348                  only have to read if the buffer is empty.  */
 349               if (lex->first == 0 || lex->first >= lex->last)
 350                 {
 351                   int r;
 352
 353                   if (lex->first >= lex->last)
 354                     {
 355                       lex->first = 0;
 356                       lex->last = 0;
 357                     }
 358                   if (feof (lex->finput))
 359                     return UEOF;
 360                   r = fread (&lex->buffer[lex->last], 1,
 361                              sizeof (lex->buffer) - lex->last,
 362                              lex->finput);
 363                   lex->last += r;
 364                 }
 365
 366               inbytesleft = lex->last - lex->first;
 367               out_count = sizeof (lex->out_buffer) - lex->out_last;
 368
 369               if (inbytesleft == 0)
 370                 {
 371                   /* We've tried to read and there is nothing left.  */
 372                   return UEOF;
 373                 }
 374
 375               in_save = inbytesleft;
 376               out_save = out_count;
 377               inp = &lex->buffer[lex->first];
 378               outp = &lex->out_buffer[lex->out_last];
 379               ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
 380                           &inbytesleft, &outp, &out_count);
 381
 382               /* If we haven't read any bytes, then look to see if we
 383                  have read a BOM.  */
 384               if (! lex->read_anything && out_save - out_count >= 2)
 385                 {
 386                   unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
 387                   if (uc == 0xfeff)
 388                     {
 389                       lex->byte_swap = 0;
 390                       lex->out_first += 2;
 391                     }
 392                   else if (uc == 0xfffe)
 393                     {
 394                       lex->byte_swap = 1;
 395                       lex->out_first += 2;
 396                     }
 397                   lex->read_anything = 1;
 398                 }
 399
 400               if (lex->byte_swap)
 401                 {
 402                   unsigned int i;
 403                   for (i = 0; i < out_save - out_count; i += 2)
 404                     {
 405                       char t = lex->out_buffer[lex->out_last + i];
 406                       lex->out_buffer[lex->out_last + i]
 407                         = lex->out_buffer[lex->out_last + i + 1];
 408                       lex->out_buffer[lex->out_last + i + 1] = t;
 409                     }
 410                 }
 411
 412               lex->first += in_save - inbytesleft;
 413               lex->out_last += out_save - out_count;
 414
 415               /* If we converted anything at all, move along.  */
 416               if (out_count != out_save)
 417                 break;
 418
 419               if (ir == (size_t) -1)
 420                 {
 421                   if (errno == EINVAL)
 422                     {
 423                       /* This is ok.  This means that the end of our buffer
 424                          is in the middle of a character sequence.  We just
 425                          move the valid part of the buffer to the beginning
 426                          to force a read.  */
 427                       memmove (&lex->buffer[0], &lex->buffer[lex->first],
 428                                lex->last - lex->first);
 429                       lex->last -= lex->first;
 430                       lex->first = 0;
 431                     }
 432                   else
 433                     {
 434                       /* A more serious error.  */
 435                       java_lex_error ("unrecognized character in input stream",
 436                                       0);
 437                       return UEOF;
 438                     }
 439                 }
 440             }
 441         }
 442
 443       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 444         {
 445           /* Don't have any data.  */
 446           return UEOF;
 447         }
 448
 449       /* Success.  */
 450       result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
 451       lex->out_first += 2;
 452       return result;
 453     }
 454   else
 455 #endif /* HAVE_ICONV */
 456     {
 457       int c, c1, c2;
 458       c = getc (lex->finput);
 459
 460       if (c == EOF)
 461         return UEOF;
 462       if (c < 128)
 463         return (unicode_t) c;
 464       else
 465         {
 466           if ((c & 0xe0) == 0xc0)
 467             {
 468               c1 = getc (lex->finput);
 469               if ((c1 & 0xc0) == 0x80)
 470                 {
 471                   unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
 472                   /* Check for valid 2-byte characters.  We explicitly
 473                      allow \0 because this encoding is common in the
 474                      Java world.  */
 475                   if (r == 0 || (r >= 0x80 && r <= 0x7ff))
 476                     return r;
 477                 }
 478             }
 479           else if ((c & 0xf0) == 0xe0)
 480             {
 481               c1 = getc (lex->finput);
 482               if ((c1 & 0xc0) == 0x80)
 483                 {
 484                   c2 = getc (lex->finput);
 485                   if ((c2 & 0xc0) == 0x80)
 486                     {
 487                       unicode_t r =  (unicode_t)(((c & 0xf) << 12) +
 488                                                  (( c1 & 0x3f) << 6)
 489                                                  + (c2 & 0x3f));
 490                       /* Check for valid 3-byte characters.
 491                          Don't allow surrogate, \ufffe or \uffff.  */
 492                       if (r >= 0x800 && r <= 0xffff
 493                           && ! (r >= 0xd800 && r <= 0xdfff)
 494                           && r != 0xfffe && r != 0xffff)
 495                         return r;
 496                     }
 497                 }
 498             }
 499
 500           /* We simply don't support invalid characters.  We also
 501              don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
 502              cannot be valid Java characters.  */
 503           java_lex_error ("malformed UTF-8 character", 0);
 504         }
 505     }
 506
 507   /* We only get here on error.  */
 508   return UEOF;
 509 }
 510
 511 static void
 512 java_store_unicode (l, c, unicode_escape_p)
 513     struct java_line *l;
 514     unicode_t c;
 515     int unicode_escape_p;
 516 {
 517   if (l->size == l->max)
 518     {
 519       l->max += JAVA_LINE_MAX;
 520       l->line = (unicode_t *) xrealloc (l->line, sizeof (unicode_t)*l->max);
 521       l->unicode_escape_p = (char *) xrealloc (l->unicode_escape_p,
 522                                                sizeof (char)*l->max);
 523     }
 524   l->line [l->size] = c;
 525   l->unicode_escape_p [l->size++] = unicode_escape_p;
 526 }
 527
 528 static int
 529 java_read_unicode (lex, unicode_escape_p)
 530      java_lexer *lex;
 531      int *unicode_escape_p;
 532 {
 533   int c;
 534
 535   c = java_read_char (lex);
 536   *unicode_escape_p = 0;
 537
 538   if (c != '\\')
 539     {
 540       lex->bs_count = 0;
 541       return c;
 542     }
 543
 544   ++lex->bs_count;
 545   if ((lex->bs_count) % 2 == 1)
 546     {
 547       /* Odd number of \ seen.  */
 548       c = java_read_char (lex);
 549       if (c == 'u')
 550         {
 551           unicode_t unicode = 0;
 552           int shift = 12;
 553
 554           /* Recognize any number of `u's in \u.  */
 555           while ((c = java_read_char (lex)) == 'u')
 556             ;
 557
 558           /* Unget the most recent character as it is not a `u'.  */
 559           if (c == UEOF)
 560             return UEOF;
 561           lex->unget_value = c;
 562
 563           /* Next should be 4 hex digits, otherwise it's an error.
 564              The hex value is converted into the unicode, pushed into
 565              the Unicode stream.  */
 566           for (shift = 12; shift >= 0; shift -= 4)
 567             {
 568               if ((c = java_read_char (lex)) == UEOF)
 569                 return UEOF;
 570               if (hex_p (c))
 571                 unicode |= (unicode_t)(hex_value (c) << shift);
 572               else
 573                 java_lex_error ("Non hex digit in Unicode escape sequence", 0);
 574             }
 575           lex->bs_count = 0;
 576           *unicode_escape_p = 1;
 577           return unicode;
 578         }
 579       lex->unget_value = c;
 580     }
 581   return (unicode_t) '\\';
 582 }
 583
 584 static int
 585 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
 586      java_lexer *lex;
 587      int *unicode_escape_p;
 588 {
 589   int c = java_read_unicode (lex, unicode_escape_p);
 590
 591   if (c == '\r')
 592     {
 593       /* We have to read ahead to see if we got \r\n.  In that case we
 594          return a single line terminator.  */
 595       int dummy;
 596       c = java_read_unicode (lex, &dummy);
 597       if (c != '\n')
 598         lex->unget_value = c;
 599       /* In either case we must return a newline.  */
 600       c = '\n';
 601     }
 602
 603   return c;
 604 }
 605
 606 static int
 607 java_get_unicode ()
 608 {
 609   /* It's time to read a line when...  */
 610   if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
 611     {
 612       int c;
 613       int found_chars = 0;
 614
 615       if (ctxp->lexer->hit_eof)
 616         return UEOF;
 617
 618       java_allocate_new_line ();
 619       if (ctxp->c_line->line[0] != '\n')
 620         {
 621           for (;;)
 622             {
 623               int unicode_escape_p;
 624               c = java_read_unicode_collapsing_terminators (ctxp->lexer,
 625                                                             &unicode_escape_p);
 626               if (c != UEOF)
 627                 {
 628                   found_chars = 1;
 629                   java_store_unicode (ctxp->c_line, c, unicode_escape_p);
 630                   if (ctxp->c_line->white_space_only
 631                       && !JAVA_WHITE_SPACE_P (c)
 632                       && c != '\n')
 633                     ctxp->c_line->white_space_only = 0;
 634                 }
 635               if ((c == '\n') || (c == UEOF))
 636                 break;
 637             }
 638
 639           if (c == UEOF && ! found_chars)
 640             {
 641               ctxp->lexer->hit_eof = 1;
 642               return UEOF;
 643             }
 644         }
 645     }
 646   ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
 647   JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
 648   return ctxp->c_line->line [ctxp->c_line->current++];
 649 }
 650
 651 /* Parse the end of a C style comment.
 652  * C is the first character following the '/' and '*'.  */
 653 static void
 654 java_parse_end_comment (c)
 655      int c;
 656 {
 657   for ( ;; c = java_get_unicode ())
 658     {
 659       switch (c)
 660         {
 661         case UEOF:
 662           java_lex_error ("Comment not terminated at end of input", 0);
 663           return;
 664         case '*':
 665           switch (c = java_get_unicode ())
 666             {
 667             case UEOF:
 668               java_lex_error ("Comment not terminated at end of input", 0);
 669               return;
 670             case '/':
 671               return;
 672             case '*':   /* Reparse only '*'.  */
 673               java_unget_unicode ();
 674             }
 675         }
 676     }
 677 }
 678
 679 /* Parse the documentation section. Keywords must be at the beginning
 680    of a documentation comment line (ignoring white space and any `*'
 681    character). Parsed keyword(s): @DEPRECATED.  */
 682
 683 static int
 684 java_parse_doc_section (c)
 685      int c;
 686 {
 687   int valid_tag = 0, seen_star = 0;
 688
 689   while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
 690     {
 691       switch (c)
 692         {
 693         case '*':
 694           seen_star = 1;
 695           break;
 696         case '\n': /* ULT */
 697           valid_tag = 1;
 698         default:
 699           seen_star = 0;
 700         }
 701       c = java_get_unicode();
 702     }
 703
 704   if (c == UEOF)
 705     java_lex_error ("Comment not terminated at end of input", 0);
 706
 707   if (seen_star && (c == '/'))
 708     return 1;                   /* Goto step1 in caller.  */
 709
 710   /* We're parsing `@deprecated'.  */
 711   if (valid_tag && (c == '@'))
 712     {
 713       char tag [11];
 714       int  tag_index = 0;
 715
 716       while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
 717         {
 718           c = java_get_unicode ();
 719           tag [tag_index++] = c;
 720         }
 721
 722       if (c == UEOF)
 723         java_lex_error ("Comment not terminated at end of input", 0);
 724       tag [tag_index] = '\0';
 725
 726       if (!strcmp (tag, "deprecated"))
 727         ctxp->deprecated = 1;
 728     }
 729   java_unget_unicode ();
 730   return 0;
 731 }
 732
 733 /* Return true if C is a valid start character for a Java identifier.
 734    This is only called if C >= 128 -- smaller values are handled
 735    inline.  However, this function handles all values anyway.  */
 736 static int
 737 java_start_char_p (c)
 738      unicode_t c;
 739 {
 740   unsigned int hi = c / 256;
 741   const char *const page = type_table[hi];
 742   unsigned long val = (unsigned long) page;
 743   int flags;
 744
 745   if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
 746     flags = page[c & 255];
 747   else
 748     flags = val;
 749
 750   return flags & LETTER_START;
 751 }
 752
 753 /* Return true if C is a valid part character for a Java identifier.
 754    This is only called if C >= 128 -- smaller values are handled
 755    inline.  However, this function handles all values anyway.  */
 756 static int
 757 java_part_char_p (c)
 758      unicode_t c;
 759 {
 760   unsigned int hi = c / 256;
 761   const char *const page = type_table[hi];
 762   unsigned long val = (unsigned long) page;
 763   int flags;
 764
 765   if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
 766     flags = page[c & 255];
 767   else
 768     flags = val;
 769
 770   return flags & LETTER_PART;
 771 }
 772
 773 static int
 774 java_parse_escape_sequence ()
 775 {
 776   unicode_t char_lit;
 777   int c;
 778
 779   switch (c = java_get_unicode ())
 780     {
 781     case 'b':
 782       return (unicode_t)0x8;
 783     case 't':
 784       return (unicode_t)0x9;
 785     case 'n':
 786       return (unicode_t)0xa;
 787     case 'f':
 788       return (unicode_t)0xc;
 789     case 'r':
 790       return (unicode_t)0xd;
 791     case '"':
 792       return (unicode_t)0x22;
 793     case '\'':
 794       return (unicode_t)0x27;
 795     case '\\':
 796       return (unicode_t)0x5c;
 797     case '0': case '1': case '2': case '3': case '4':
 798     case '5': case '6': case '7':
 799       {
 800         int octal_escape[3];
 801         int octal_escape_index = 0;
 802         int max = 3;
 803         int i, shift;
 804
 805         for (; octal_escape_index < max && RANGE (c, '0', '7');
 806              c = java_get_unicode ())
 807           {
 808             if (octal_escape_index == 0 && c > '3')
 809               {
 810                 /* According to the grammar, `\477' has a well-defined
 811                    meaning -- it is `\47' followed by `7'.  */
 812                 --max;
 813               }
 814             octal_escape [octal_escape_index++] = c;
 815           }
 816
 817         java_unget_unicode ();
 818
 819         for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
 820              i < octal_escape_index; i++, shift -= 3)
 821           char_lit |= (octal_escape [i] - '0') << shift;
 822
 823         return char_lit;
 824       }
 825     default:
 826       java_lex_error ("Invalid character in escape sequence", 0);
 827       return JAVA_CHAR_ERROR;
 828     }
 829 }
 830
 831 /* Isolate the code which may raise an arithmetic exception in its
 832    own function.  */
 833
 834 #ifndef JC1_LITE
 835 struct jpa_args
 836 {
 837   YYSTYPE *java_lval;
 838   char *literal_token;
 839   int fflag;
 840   int number_beginning;
 841 };
 842
 843 #define IS_ZERO(X) (ereal_cmp (X, dconst0) == 0)
 844
 845 static void java_perform_atof   PARAMS ((PTR));
 846
 847 static void
 848 java_perform_atof (av)
 849      PTR av;
 850 {
 851   struct jpa_args *a = (struct jpa_args *)av;
 852   YYSTYPE *java_lval = a->java_lval;
 853   int number_beginning = a->number_beginning;
 854   REAL_VALUE_TYPE value;
 855   tree type = (a->fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
 856
 857   SET_REAL_VALUE_ATOF (value,
 858                        REAL_VALUE_ATOF (a->literal_token, TYPE_MODE (type)));
 859
 860   if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
 861     {
 862       JAVA_FLOAT_RANGE_ERROR ((a->fflag ? "float" : "double"));
 863       value = DCONST0;
 864     }
 865   else if (IS_ZERO (value))
 866     {
 867       /* We check to see if the value is really 0 or if we've found an
 868          underflow.  We do this in the most primitive imaginable way.  */
 869       int really_zero = 1;
 870       char *p = a->literal_token;
 871       if (*p == '-')
 872         ++p;
 873       while (*p && *p != 'e' && *p != 'E')
 874         {
 875           if (*p != '0' && *p != '.')
 876             {
 877               really_zero = 0;
 878               break;
 879             }
 880           ++p;
 881         }
 882       if (! really_zero)
 883         {
 884           int i = ctxp->c_line->current;
 885           ctxp->c_line->current = number_beginning;
 886           java_lex_error ("Floating point literal underflow", 0);
 887           ctxp->c_line->current = i;
 888         }
 889     }
 890
 891   SET_LVAL_NODE_TYPE (build_real (type, value), type);
 892 }
 893 #endif
 894
 895 static int yylex                PARAMS ((YYSTYPE *));
 896
 897 static int
 898 #ifdef JC1_LITE
 899 yylex (java_lval)
 900 #else
 901 java_lex (java_lval)
 902 #endif
 903      YYSTYPE *java_lval;
 904 {
 905   int c;
 906   unicode_t first_unicode;
 907   int ascii_index, all_ascii;
 908   char *string;
 909
 910   /* Translation of the Unicode escape in the raw stream of Unicode
 911      characters. Takes care of line terminator.  */
 912  step1:
 913   /* Skip white spaces: SP, TAB and FF or ULT.  */
 914   for (c = java_get_unicode ();
 915        c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
 916     if (c == '\n')
 917       {
 918         ctxp->elc.line = ctxp->c_line->lineno;
 919         ctxp->elc.col  = ctxp->c_line->char_col-2;
 920       }
 921
 922   ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
 923
 924   if (c == 0x1a)                /* CTRL-Z.  */
 925     {
 926       if ((c = java_get_unicode ()) == UEOF)
 927         return 0;               /* Ok here.  */
 928       else
 929         java_unget_unicode ();  /* Caught later, at the end of the
 930                                    function.  */
 931     }
 932   /* Handle EOF here.  */
 933   if (c == UEOF)        /* Should probably do something here...  */
 934     return 0;
 935
 936   /* Take care of eventual comments.  */
 937   if (c == '/')
 938     {
 939       switch (c = java_get_unicode ())
 940         {
 941         case '/':
 942           for (;;)
 943             {
 944               c = java_get_unicode ();
 945               if (c == UEOF)
 946                 {
 947                   /* It is ok to end a `//' comment with EOF, unless
 948                      we're being pedantic.  */
 949                   if (pedantic)
 950                     java_lex_error ("Comment not terminated at end of input",
 951                                     0);
 952                   return 0;
 953                 }
 954               if (c == '\n')    /* ULT */
 955                 goto step1;
 956             }
 957           break;
 958
 959         case '*':
 960           if ((c = java_get_unicode ()) == '*')
 961             {
 962               if ((c = java_get_unicode ()) == '/')
 963                 goto step1;     /* Empty documentation comment.  */
 964               else if (java_parse_doc_section (c))
 965                 goto step1;
 966             }
 967
 968           java_parse_end_comment ((c = java_get_unicode ()));
 969           goto step1;
 970           break;
 971         default:
 972           java_unget_unicode ();
 973           c = '/';
 974           break;
 975         }
 976     }
 977
 978   ctxp->elc.line = ctxp->c_line->lineno;
 979   ctxp->elc.prev_col = ctxp->elc.col;
 980   ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
 981   if (ctxp->elc.col < 0)
 982     abort ();
 983
 984   /* Numeric literals.  */
 985   if (JAVA_ASCII_DIGIT (c) || (c == '.'))
 986     {
 987       /* This section of code is borrowed from gcc/c-lex.c.  */
 988 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
 989       int parts[TOTAL_PARTS];
 990       HOST_WIDE_INT high, low;
 991       /* End borrowed section.  */
 992       char literal_token [256];
 993       int  literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
 994       int  found_hex_digits = 0, found_non_octal_digits = 0;
 995       int  i;
 996 #ifndef JC1_LITE
 997       int  number_beginning = ctxp->c_line->current;
 998       tree value;
 999 #endif
1000
1001       /* We might have a . separator instead of a FP like .[0-9]*.  */
1002       if (c == '.')
1003         {
1004           unicode_t peep = java_sneak_unicode ();
1005
1006           if (!JAVA_ASCII_DIGIT (peep))
1007             {
1008               JAVA_LEX_SEP('.');
1009               BUILD_OPERATOR (DOT_TK);
1010             }
1011         }
1012
1013       for (i = 0; i < TOTAL_PARTS; i++)
1014         parts [i] = 0;
1015
1016       if (c == '0')
1017         {
1018           c = java_get_unicode ();
1019           if (c == 'x' || c == 'X')
1020             {
1021               radix = 16;
1022               c = java_get_unicode ();
1023             }
1024           else if (JAVA_ASCII_DIGIT (c))
1025             radix = 8;
1026           else if (c == '.')
1027             {
1028               /* Push the '.' back and prepare for a FP parsing...  */
1029               java_unget_unicode ();
1030               c = '0';
1031             }
1032           else
1033             {
1034               /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}.  */
1035               JAVA_LEX_LIT ("0", 10);
1036               switch (c)
1037                 {
1038                 case 'L': case 'l':
1039                   SET_LVAL_NODE (long_zero_node);
1040                   return (INT_LIT_TK);
1041                 case 'f': case 'F':
1042                   SET_LVAL_NODE (float_zero_node);
1043                   return (FP_LIT_TK);
1044                 case 'd': case 'D':
1045                   SET_LVAL_NODE (double_zero_node);
1046                   return (FP_LIT_TK);
1047                 default:
1048                   java_unget_unicode ();
1049                   SET_LVAL_NODE (integer_zero_node);
1050                   return (INT_LIT_TK);
1051                 }
1052             }
1053         }
1054       /* Parse the first part of the literal, until we find something
1055          which is not a number.  */
1056       while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1057              JAVA_ASCII_DIGIT (c))
1058         {
1059           /* We store in a string (in case it turns out to be a FP) and in
1060              PARTS if we have to process a integer literal.  */
1061           int numeric = hex_value (c);
1062           int count;
1063
1064           /* Remember when we find a valid hexadecimal digit.  */
1065           if (radix == 16)
1066             found_hex_digits = 1;
1067           /* Remember when we find an invalid octal digit.  */
1068           else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c))
1069             found_non_octal_digits = 1;
1070
1071           literal_token [literal_index++] = c;
1072           /* This section of code if borrowed from gcc/c-lex.c.  */
1073           for (count = 0; count < TOTAL_PARTS; count++)
1074             {
1075               parts[count] *= radix;
1076               if (count)
1077                 {
1078                   parts[count]   += (parts[count-1] >> HOST_BITS_PER_CHAR);
1079                   parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1080                 }
1081               else
1082                 parts[0] += numeric;
1083             }
1084           if (parts [TOTAL_PARTS-1] != 0)
1085             overflow = 1;
1086           /* End borrowed section.  */
1087           c = java_get_unicode ();
1088         }
1089
1090       /* If we have something from the FP char set but not a digit, parse
1091          a FP literal.  */
1092       if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1093         {
1094           int stage = 0;
1095           int seen_digit = (literal_index ? 1 : 0);
1096           int seen_exponent = 0;
1097           int fflag = 0;        /* 1 for {f,F}, 0 for {d,D}. FP literal are
1098                                    double unless specified.  */
1099
1100           /* It is ok if the radix is 8 because this just means we've
1101              seen a leading `0'.  However, radix==16 is invalid.  */
1102           if (radix == 16)
1103             java_lex_error ("Can't express non-decimal FP literal", 0);
1104           radix = 10;
1105
1106           for (;;)
1107             {
1108               if (c == '.')
1109                 {
1110                   if (stage < 1)
1111                     {
1112                       stage = 1;
1113                       literal_token [literal_index++ ] = c;
1114                       c = java_get_unicode ();
1115                     }
1116                   else
1117                     java_lex_error ("Invalid character in FP literal", 0);
1118                 }
1119
1120               if (c == 'e' || c == 'E')
1121                 {
1122                   if (stage < 2)
1123                     {
1124                       /* {E,e} must have seen at least a digit.  */
1125                       if (!seen_digit)
1126                         java_lex_error
1127                           ("Invalid FP literal, mantissa must have digit", 0);
1128                       seen_digit = 0;
1129                       seen_exponent = 1;
1130                       stage = 2;
1131                       literal_token [literal_index++] = c;
1132                       c = java_get_unicode ();
1133                     }
1134                   else
1135                     java_lex_error ("Invalid character in FP literal", 0);
1136                 }
1137               if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1138                 {
1139                   fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1140                   stage = 4;    /* So we fall through.  */
1141                 }
1142
1143               if ((c=='-' || c =='+') && stage == 2)
1144                 {
1145                   stage = 3;
1146                   literal_token [literal_index++] = c;
1147                   c = java_get_unicode ();
1148                 }
1149
1150               if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1151                   (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1152                   (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1153                   (stage == 3 && JAVA_ASCII_DIGIT (c)))
1154                 {
1155                   if (JAVA_ASCII_DIGIT (c))
1156                     seen_digit = 1;
1157                   literal_token [literal_index++ ] = c;
1158                   c = java_get_unicode ();
1159                 }
1160               else
1161                 {
1162 #ifndef JC1_LITE
1163                   struct jpa_args a;
1164 #endif
1165                   if (stage != 4) /* Don't push back fF/dD.  */
1166                     java_unget_unicode ();
1167
1168                   /* An exponent (if any) must have seen a digit.  */
1169                   if (seen_exponent && !seen_digit)
1170                     java_lex_error
1171                       ("Invalid FP literal, exponent must have digit", 0);
1172
1173                   literal_token [literal_index] = '\0';
1174                   JAVA_LEX_LIT (literal_token, radix);
1175
1176 #ifndef JC1_LITE
1177                   a.literal_token = literal_token;
1178                   a.fflag = fflag;
1179                   a.java_lval = java_lval;
1180                   a.number_beginning = number_beginning;
1181                   if (do_float_handler (java_perform_atof, (PTR) &a))
1182                     return FP_LIT_TK;
1183
1184                   JAVA_FLOAT_RANGE_ERROR ((fflag ? "float" : "double"));
1185 #else
1186                   return FP_LIT_TK;
1187 #endif
1188                 }
1189             }
1190         } /* JAVA_ASCII_FPCHAR (c) */
1191
1192       /* Here we get back to converting the integral literal.  */
1193       if (radix == 16 && ! found_hex_digits)
1194         java_lex_error
1195           ("0x must be followed by at least one hexadecimal digit", 0);
1196       else if (radix == 8 && found_non_octal_digits)
1197         java_lex_error ("Octal literal contains digit out of range", 0);
1198       else if (c == 'L' || c == 'l')
1199         long_suffix = 1;
1200       else
1201         java_unget_unicode ();
1202
1203 #ifdef JAVA_LEX_DEBUG
1204       literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe.  */
1205       JAVA_LEX_LIT (literal_token, radix);
1206 #endif
1207       /* This section of code is borrowed from gcc/c-lex.c.  */
1208       if (!overflow)
1209         {
1210           bytes = GET_TYPE_PRECISION (long_type_node);
1211           for (i = bytes; i < TOTAL_PARTS; i++)
1212             if (parts [i])
1213               {
1214                 overflow = 1;
1215                 break;
1216               }
1217         }
1218       high = low = 0;
1219       for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1220         {
1221           high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1222                                               / HOST_BITS_PER_CHAR)]
1223                    << (i * HOST_BITS_PER_CHAR));
1224           low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1225         }
1226       /* End borrowed section.  */
1227
1228       /* Range checking.  */
1229       if (long_suffix)
1230         {
1231           /* 9223372036854775808L is valid if operand of a '-'. Otherwise
1232              9223372036854775807L is the biggest `long' literal that can be
1233              expressed using a 10 radix. For other radices, everything that
1234              fits withing 64 bits is OK.  */
1235           int hb = (high >> 31);
1236           if (overflow || (hb && low && radix == 10)
1237               || (hb && high & 0x7fffffff && radix == 10))
1238             JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1239         }
1240       else
1241         {
1242           /* 2147483648 is valid if operand of a '-'. Otherwise,
1243              2147483647 is the biggest `int' literal that can be
1244              expressed using a 10 radix. For other radices, everything
1245              that fits within 32 bits is OK.  As all literals are
1246              signed, we sign extend here.  */
1247           int hb = (low >> 31) & 0x1;
1248           if (overflow || high || (hb && low & 0x7fffffff && radix == 10))
1249             JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1250           high = -hb;
1251         }
1252 #ifndef JC1_LITE
1253       value = build_int_2 (low, high);
1254       JAVA_RADIX10_FLAG (value) = radix == 10;
1255       SET_LVAL_NODE_TYPE (value, long_suffix ? long_type_node : int_type_node);
1256 #else
1257       SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1258                           long_suffix ? long_type_node : int_type_node);
1259 #endif
1260       return INT_LIT_TK;
1261     }
1262
1263   /* Character literals.  */
1264   if (c == '\'')
1265     {
1266       int char_lit;
1267       if ((c = java_get_unicode ()) == '\\')
1268         char_lit = java_parse_escape_sequence ();
1269       else
1270         {
1271           if (c == '\n' || c == '\'')
1272             java_lex_error ("Invalid character literal", 0);
1273           char_lit = c;
1274         }
1275
1276       c = java_get_unicode ();
1277
1278       if ((c == '\n') || (c == UEOF))
1279         java_lex_error ("Character literal not terminated at end of line", 0);
1280       if (c != '\'')
1281         java_lex_error ("Syntax error in character literal", 0);
1282
1283       if (char_lit == JAVA_CHAR_ERROR)
1284         char_lit = 0;           /* We silently convert it to zero.  */
1285
1286       JAVA_LEX_CHAR_LIT (char_lit);
1287       SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1288       return CHAR_LIT_TK;
1289     }
1290
1291   /* String literals.  */
1292   if (c == '"')
1293     {
1294       int no_error;
1295       char *string;
1296
1297       for (no_error = 1, c = java_get_unicode ();
1298            c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1299         {
1300           if (c == '\\')
1301             c = java_parse_escape_sequence ();
1302           if (c == JAVA_CHAR_ERROR)
1303             {
1304               no_error = 0;
1305               c = 0;            /* We silently convert it to zero.  */
1306             }
1307           java_unicode_2_utf8 (c);
1308         }
1309       if (c == '\n' || c == UEOF) /* ULT.  */
1310         {
1311           lineno--;     /* Refer to the line where the terminator was seen.  */
1312           java_lex_error ("String not terminated at end of line", 0);
1313           lineno++;
1314         }
1315
1316       obstack_1grow (&temporary_obstack, '\0');
1317       string = obstack_finish (&temporary_obstack);
1318 #ifndef JC1_LITE
1319       if (!no_error || (c != '"'))
1320         java_lval->node = error_mark_node; /* FIXME: Requires futher
1321                                               testing.  */
1322       else
1323         java_lval->node = build_string (strlen (string), string);
1324 #endif
1325       obstack_free (&temporary_obstack, string);
1326       return STRING_LIT_TK;
1327     }
1328
1329   /* Separator.  */
1330   switch (c)
1331     {
1332     case '(':
1333       JAVA_LEX_SEP (c);
1334       BUILD_OPERATOR (OP_TK);
1335     case ')':
1336       JAVA_LEX_SEP (c);
1337       return CP_TK;
1338     case '{':
1339       JAVA_LEX_SEP (c);
1340       if (ctxp->ccb_indent == 1)
1341         ctxp->first_ccb_indent1 = lineno;
1342       ctxp->ccb_indent++;
1343       BUILD_OPERATOR (OCB_TK);
1344     case '}':
1345       JAVA_LEX_SEP (c);
1346       ctxp->ccb_indent--;
1347       if (ctxp->ccb_indent == 1)
1348         ctxp->last_ccb_indent1 = lineno;
1349       BUILD_OPERATOR (CCB_TK);
1350     case '[':
1351       JAVA_LEX_SEP (c);
1352       BUILD_OPERATOR (OSB_TK);
1353     case ']':
1354       JAVA_LEX_SEP (c);
1355       return CSB_TK;
1356     case ';':
1357       JAVA_LEX_SEP (c);
1358       return SC_TK;
1359     case ',':
1360       JAVA_LEX_SEP (c);
1361       return C_TK;
1362     case '.':
1363       JAVA_LEX_SEP (c);
1364       BUILD_OPERATOR (DOT_TK);
1365       /*      return DOT_TK; */
1366     }
1367
1368   /* Operators.  */
1369   switch (c)
1370     {
1371     case '=':
1372       if ((c = java_get_unicode ()) == '=')
1373         {
1374           BUILD_OPERATOR (EQ_TK);
1375         }
1376       else
1377         {
1378           /* Equals is used in two different locations. In the
1379              variable_declarator: rule, it has to be seen as '=' as opposed
1380              to being seen as an ordinary assignment operator in
1381              assignment_operators: rule.  */
1382           java_unget_unicode ();
1383           BUILD_OPERATOR (ASSIGN_TK);
1384         }
1385
1386     case '>':
1387       switch ((c = java_get_unicode ()))
1388         {
1389         case '=':
1390           BUILD_OPERATOR (GTE_TK);
1391         case '>':
1392           switch ((c = java_get_unicode ()))
1393             {
1394             case '>':
1395               if ((c = java_get_unicode ()) == '=')
1396                 {
1397                   BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1398                 }
1399               else
1400                 {
1401                   java_unget_unicode ();
1402                   BUILD_OPERATOR (ZRS_TK);
1403                 }
1404             case '=':
1405               BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1406             default:
1407               java_unget_unicode ();
1408               BUILD_OPERATOR (SRS_TK);
1409             }
1410         default:
1411           java_unget_unicode ();
1412           BUILD_OPERATOR (GT_TK);
1413         }
1414
1415     case '<':
1416       switch ((c = java_get_unicode ()))
1417         {
1418         case '=':
1419           BUILD_OPERATOR (LTE_TK);
1420         case '<':
1421           if ((c = java_get_unicode ()) == '=')
1422             {
1423               BUILD_OPERATOR2 (LS_ASSIGN_TK);
1424             }
1425           else
1426             {
1427               java_unget_unicode ();
1428               BUILD_OPERATOR (LS_TK);
1429             }
1430         default:
1431           java_unget_unicode ();
1432           BUILD_OPERATOR (LT_TK);
1433         }
1434
1435     case '&':
1436       switch ((c = java_get_unicode ()))
1437         {
1438         case '&':
1439           BUILD_OPERATOR (BOOL_AND_TK);
1440         case '=':
1441           BUILD_OPERATOR2 (AND_ASSIGN_TK);
1442         default:
1443           java_unget_unicode ();
1444           BUILD_OPERATOR (AND_TK);
1445         }
1446
1447     case '|':
1448       switch ((c = java_get_unicode ()))
1449         {
1450         case '|':
1451           BUILD_OPERATOR (BOOL_OR_TK);
1452         case '=':
1453           BUILD_OPERATOR2 (OR_ASSIGN_TK);
1454         default:
1455           java_unget_unicode ();
1456           BUILD_OPERATOR (OR_TK);
1457         }
1458
1459     case '+':
1460       switch ((c = java_get_unicode ()))
1461         {
1462         case '+':
1463           BUILD_OPERATOR (INCR_TK);
1464         case '=':
1465           BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1466         default:
1467           java_unget_unicode ();
1468           BUILD_OPERATOR (PLUS_TK);
1469         }
1470
1471     case '-':
1472       switch ((c = java_get_unicode ()))
1473         {
1474         case '-':
1475           BUILD_OPERATOR (DECR_TK);
1476         case '=':
1477           BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1478         default:
1479           java_unget_unicode ();
1480           BUILD_OPERATOR (MINUS_TK);
1481         }
1482
1483     case '*':
1484       if ((c = java_get_unicode ()) == '=')
1485         {
1486           BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1487         }
1488       else
1489         {
1490           java_unget_unicode ();
1491           BUILD_OPERATOR (MULT_TK);
1492         }
1493
1494     case '/':
1495       if ((c = java_get_unicode ()) == '=')
1496         {
1497           BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1498         }
1499       else
1500         {
1501           java_unget_unicode ();
1502           BUILD_OPERATOR (DIV_TK);
1503         }
1504
1505     case '^':
1506       if ((c = java_get_unicode ()) == '=')
1507         {
1508           BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1509         }
1510       else
1511         {
1512           java_unget_unicode ();
1513           BUILD_OPERATOR (XOR_TK);
1514         }
1515
1516     case '%':
1517       if ((c = java_get_unicode ()) == '=')
1518         {
1519           BUILD_OPERATOR2 (REM_ASSIGN_TK);
1520         }
1521       else
1522         {
1523           java_unget_unicode ();
1524           BUILD_OPERATOR (REM_TK);
1525         }
1526
1527     case '!':
1528       if ((c = java_get_unicode()) == '=')
1529         {
1530           BUILD_OPERATOR (NEQ_TK);
1531         }
1532       else
1533         {
1534           java_unget_unicode ();
1535           BUILD_OPERATOR (NEG_TK);
1536         }
1537
1538     case '?':
1539       JAVA_LEX_OP ("?");
1540       BUILD_OPERATOR (REL_QM_TK);
1541     case ':':
1542       JAVA_LEX_OP (":");
1543       BUILD_OPERATOR (REL_CL_TK);
1544     case '~':
1545       BUILD_OPERATOR (NOT_TK);
1546     }
1547
1548   /* Keyword, boolean literal or null literal.  */
1549   for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1550        JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1551     {
1552       java_unicode_2_utf8 (c);
1553       if (all_ascii && c >= 128)
1554         all_ascii = 0;
1555       ascii_index++;
1556     }
1557
1558   obstack_1grow (&temporary_obstack, '\0');
1559   string = obstack_finish (&temporary_obstack);
1560   java_unget_unicode ();
1561
1562   /* If we have something all ascii, we consider a keyword, a boolean
1563      literal, a null literal or an all ASCII identifier.  Otherwise,
1564      this is an identifier (possibly not respecting formation rule).  */
1565   if (all_ascii)
1566     {
1567       const struct java_keyword *kw;
1568       if ((kw=java_keyword (string, ascii_index)))
1569         {
1570           JAVA_LEX_KW (string);
1571           switch (kw->token)
1572             {
1573             case PUBLIC_TK:       case PROTECTED_TK: case STATIC_TK:
1574             case ABSTRACT_TK:     case FINAL_TK:     case NATIVE_TK:
1575             case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1576             case PRIVATE_TK:      case STRICT_TK:
1577               SET_MODIFIER_CTX (kw->token);
1578               return MODIFIER_TK;
1579             case FLOAT_TK:
1580               SET_LVAL_NODE (float_type_node);
1581               return FP_TK;
1582             case DOUBLE_TK:
1583               SET_LVAL_NODE (double_type_node);
1584               return FP_TK;
1585             case BOOLEAN_TK:
1586               SET_LVAL_NODE (boolean_type_node);
1587               return BOOLEAN_TK;
1588             case BYTE_TK:
1589               SET_LVAL_NODE (byte_type_node);
1590               return INTEGRAL_TK;
1591             case SHORT_TK:
1592               SET_LVAL_NODE (short_type_node);
1593               return INTEGRAL_TK;
1594             case INT_TK:
1595               SET_LVAL_NODE (int_type_node);
1596               return INTEGRAL_TK;
1597             case LONG_TK:
1598               SET_LVAL_NODE (long_type_node);
1599               return INTEGRAL_TK;
1600             case CHAR_TK:
1601               SET_LVAL_NODE (char_type_node);
1602               return INTEGRAL_TK;
1603
1604               /* Keyword based literals.  */
1605             case TRUE_TK:
1606             case FALSE_TK:
1607               SET_LVAL_NODE ((kw->token == TRUE_TK ?
1608                               boolean_true_node : boolean_false_node));
1609               return BOOL_LIT_TK;
1610             case NULL_TK:
1611               SET_LVAL_NODE (null_pointer_node);
1612               return NULL_TK;
1613
1614               /* Some keyword we want to retain information on the location
1615                  they where found.  */
1616             case CASE_TK:
1617             case DEFAULT_TK:
1618             case SUPER_TK:
1619             case THIS_TK:
1620             case RETURN_TK:
1621             case BREAK_TK:
1622             case CONTINUE_TK:
1623             case TRY_TK:
1624             case CATCH_TK:
1625             case THROW_TK:
1626             case INSTANCEOF_TK:
1627               BUILD_OPERATOR (kw->token);
1628
1629             default:
1630               return kw->token;
1631             }
1632         }
1633     }
1634
1635   /* We may have an ID here.  */
1636   if (JAVA_START_CHAR_P (first_unicode))
1637     {
1638       JAVA_LEX_ID (string);
1639       java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1640       return ID_TK;
1641     }
1642
1643   /* Everything else is an invalid character in the input.  */
1644   {
1645     char lex_error_buffer [128];
1646     sprintf (lex_error_buffer, "Invalid character `%s' in input",
1647              java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1648     java_lex_error (lex_error_buffer, 1);
1649   }
1650   return 0;
1651 }
1652
1653 #ifndef JC1_LITE
1654 /* This is called by the parser to see if an error should be generated
1655    due to numeric overflow.  This function only handles the particular
1656    case of the largest negative value, and is only called in the case
1657    where this value is not preceded by `-'.  */
1658 static void
1659 error_if_numeric_overflow (value)
1660      tree value;
1661 {
1662   if (TREE_CODE (value) == INTEGER_CST && JAVA_RADIX10_FLAG (value))
1663     {
1664       unsigned HOST_WIDE_INT lo, hi;
1665
1666       lo = TREE_INT_CST_LOW (value);
1667       hi = TREE_INT_CST_HIGH (value);
1668       if (TREE_TYPE (value) == long_type_node)
1669         {
1670           int hb = (hi >> 31);
1671           if (hb && !(hi & 0x7fffffff))
1672             java_lex_error ("Numeric overflow for `long' literal", 0);
1673         }
1674       else
1675         {
1676           int hb = (lo >> 31) & 0x1;
1677           if (hb && !(lo & 0x7fffffff))
1678             java_lex_error ("Numeric overflow for `int' literal", 0);
1679         }
1680     }
1681 }
1682 #endif /* JC1_LITE */
1683
1684 static void
1685 java_unicode_2_utf8 (unicode)
1686     unicode_t unicode;
1687 {
1688   if (RANGE (unicode, 0x01, 0x7f))
1689     obstack_1grow (&temporary_obstack, (char)unicode);
1690   else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
1691     {
1692       obstack_1grow (&temporary_obstack,
1693                      (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1694       obstack_1grow (&temporary_obstack,
1695                      (unsigned char)(0x80 | (unicode & 0x3f)));
1696     }
1697   else                          /* Range 0x800-0xffff.  */
1698     {
1699       obstack_1grow (&temporary_obstack,
1700                      (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1701       obstack_1grow (&temporary_obstack,
1702                      (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1703       obstack_1grow (&temporary_obstack,
1704                      (unsigned char)(0x80 | (unicode & 0x003f)));
1705     }
1706 }
1707
1708 #ifndef JC1_LITE
1709 static tree
1710 build_wfl_node (node)
1711      tree node;
1712 {
1713   node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1714   /* Prevent java_complete_lhs from short-circuiting node (if constant).  */
1715   TREE_TYPE (node) = NULL_TREE;
1716   return node;
1717 }
1718 #endif
1719
1720 static void
1721 java_lex_error (msg, forward)
1722      const char *msg ATTRIBUTE_UNUSED;
1723      int forward ATTRIBUTE_UNUSED;
1724 {
1725 #ifndef JC1_LITE
1726   ctxp->elc.line = ctxp->c_line->lineno;
1727   ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1728
1729   /* Might be caught in the middle of some error report.  */
1730   ctxp->java_error_flag = 0;
1731   java_error (NULL);
1732   java_error (msg);
1733 #endif
1734 }
1735
1736 #ifndef JC1_LITE
/* Return 1 when C, a character just read from FP, terminates a line
   (CR or LF) and 0 otherwise.  For a CR the next character of FP is
   examined as well: an LF is consumed so that a CR LF pair counts as
   one terminator, anything else but EOF is pushed back.  */
static int
java_is_eol (fp, c)
  FILE *fp;
  int c;
{
  if (c == '\n')
    return 1;

  if (c == '\r')
    {
      int lookahead = getc (fp);

      if (lookahead != EOF && lookahead != '\n')
        ungetc (lookahead, fp);
      return 1;
    }

  return 0;
}
1756 #endif
1757
1758 char *
1759 java_get_line_col (filename, line, col)
1760      const char *filename ATTRIBUTE_UNUSED;
1761      int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
1762 {
1763 #ifdef JC1_LITE
1764   return 0;
1765 #else
1766   /* Dumb implementation. Doesn't try to cache or optimize things.  */
1767   /* First line of the file is line 1, first column is 1.  */
1768
1769   /* COL == -1 means, at the CR/LF in LINE.  */
1770   /* COL == -2 means, at the first non space char in LINE.  */
1771
1772   FILE *fp;
1773   int c, ccol, cline = 1;
1774   int current_line_col = 0;
1775   int first_non_space = 0;
1776   char *base;
1777
1778   if (!(fp = fopen (filename, "r")))
1779     fatal_io_error ("can't open %s", filename);
1780
1781   while (cline != line)
1782     {
1783       c = getc (fp);
1784       if (c == EOF)
1785         {
1786           static const char msg[] = "<<file too short - unexpected EOF>>";
1787           obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1788           goto have_line;
1789         }
1790       if (java_is_eol (fp, c))
1791         cline++;
1792     }
1793
1794   /* Gather the chars of the current line in a buffer.  */
1795   for (;;)
1796     {
1797       c = getc (fp);
1798       if (c < 0 || java_is_eol (fp, c))
1799         break;
1800       if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1801         first_non_space = current_line_col;
1802       obstack_1grow (&temporary_obstack, c);
1803       current_line_col++;
1804     }
1805  have_line:
1806
1807   obstack_1grow (&temporary_obstack, '\n');
1808
1809   if (col == -1)
1810     {
1811       col = current_line_col;
1812       first_non_space = 0;
1813     }
1814   else if (col == -2)
1815     col = first_non_space;
1816   else
1817     first_non_space = 0;
1818
1819   /* Place the '^' a the right position.  */
1820   base = obstack_base (&temporary_obstack);
1821   for (ccol = 1; ccol <= col+3; ccol++)
1822     {
1823       /* Compute \t when reaching first_non_space.  */
1824       char c = (first_non_space ?
1825                 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1826       obstack_1grow (&temporary_obstack, c);
1827     }
1828   obstack_grow0 (&temporary_obstack, "^", 1);
1829
1830   fclose (fp);
1831   return obstack_finish (&temporary_obstack);
1832 #endif
1833 }
1834
1835 #ifndef JC1_LITE
/* Compare the LENGTH bytes starting at STR, read one character at a
   time through UTF8_GET, with the NUL-terminated string NAME.

   At the first character that differs, the difference between the
   character read from STR and the one in NAME is returned.  If NAME
   is exhausted first, the result is 0 when STR has been consumed up
   to its limit as well and 1 otherwise.  */
static int
utf8_cmp (str, length, name)
     const unsigned char *str;
     int length;
     const char *name;
{
  const unsigned char *limit = str + length;
  const char *np;

  for (np = name; *np != '\0'; np++)
    {
      int ch = UTF8_GET (str, limit);
      int delta = ch - *np;

      if (delta != 0)
        return delta;
    }

  return (str != limit) ? 1 : 0;
}
1854
/* All C++ keywords, including the `__' spellings, sorted in ascending
   order.  Entries are grouped by leading character below; keep the
   overall order intact when adding one.  */

static const char *const cxx_keywords[] =
{
  /* Leading underscore.  */
  "_Complex",
  "__alignof", "__alignof__",
  "__asm", "__asm__",
  "__attribute", "__attribute__",
  "__builtin_va_arg",
  "__complex", "__complex__",
  "__const", "__const__",
  "__extension__",
  "__imag", "__imag__",
  "__inline", "__inline__",
  "__label__",
  "__null",
  "__real", "__real__",
  "__restrict", "__restrict__",
  "__signed", "__signed__",
  "__typeof", "__typeof__",
  "__volatile", "__volatile__",

  /* a - c.  */
  "and", "and_eq", "asm", "auto",
  "bitand", "bitor", "bool", "break",
  "case", "catch", "char", "class", "compl",
  "const", "const_cast", "continue",

  /* d - i.  */
  "default", "delete", "do", "double", "dynamic_cast",
  "else", "enum", "explicit", "export", "extern",
  "false", "float", "for", "friend",
  "goto",
  "if", "inline", "int",

  /* l - r.  */
  "long",
  "mutable",
  "namespace", "new", "not", "not_eq",
  "operator", "or", "or_eq",
  "private", "protected", "public",
  "register", "reinterpret_cast", "return",

  /* s - u.  */
  "short", "signed", "sizeof", "static", "static_cast",
  "struct", "switch",
  "template", "this", "throw", "true", "try",
  "typedef", "typeid", "typename", "typeof",
  "union", "unsigned", "using",

  /* v - x.  */
  "virtual", "void", "volatile",
  "wchar_t", "while",
  "xor", "xor_eq"
};
1964
1965 /* Return true if NAME is a C++ keyword.  */
1966
1967 int
1968 cxx_keyword_p (name, length)
1969      const char *name;
1970      int length;
1971 {
1972   int last = ARRAY_SIZE (cxx_keywords);
1973   int first = 0;
1974   int mid = (last + first) / 2;
1975   int old = -1;
1976
1977   for (mid = (last + first) / 2;
1978        mid != old;
1979        old = mid, mid = (last + first) / 2)
1980     {
1981       int kwl = strlen (cxx_keywords[mid]);
1982       int min_length = kwl > length ? length : kwl;
1983       int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
1984
1985       if (r == 0)
1986         {
1987           int i;
1988           /* We've found a match if all the remaining characters are `$'.  */
1989           for (i = min_length; i < length && name[i] == '$'; ++i)
1990             ;
1991           if (i == length)
1992             return 1;
1993           r = 1;
1994         }
1995
1996       if (r < 0)
1997         last = mid;
1998       else
1999         first = mid;
2000     }
2001   return 0;
2002 }
2003 #endif /* JC1_LITE */