gcc/java/lex.c

   1 /* Language lexer for the GNU compiler for the Java(TM) language.
   2    Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   3    Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
   4
   5 This file is part of GNU CC.
   6
   7 GNU CC is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU CC is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU CC; see the file COPYING.  If not, write to
  19 the Free Software Foundation, 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.
  21
  22 Java and all Java-based marks are trademarks or registered trademarks
  23 of Sun Microsystems, Inc. in the United States and other countries.
  24 The Free Software Foundation is independent of Sun Microsystems, Inc.  */
  25
  26 /* It defines java_lex (yylex) that reads a Java ASCII source file
  27    possibly containing Unicode escape sequence or utf8 encoded
  28    characters and returns a token for everything found but comments,
  29    white spaces and line terminators. When necessary, it also fills
  30    the java_lval (yylval) union. It's implemented to be called by a
  31    re-entrant parser generated by Bison.
  32
  33    The lexical analysis conforms to the Java grammar described in "The
  34    Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
  35    Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
  36
  37 #include "keyword.h"
  38 #include "flags.h"
  39 #include "chartables.h"
  40
  41 /* Function declarations.  */
  42 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
  43 static void java_unicode_2_utf8 PARAMS ((unicode_t));
  44 static void java_lex_error PARAMS ((const char *, int));
  45 #ifndef JC1_LITE
  46 static int java_is_eol PARAMS ((FILE *, int));
  47 static tree build_wfl_node PARAMS ((tree));
  48 #endif
  49 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
  50 static int java_parse_escape_sequence PARAMS ((void));
  51 static int java_start_char_p PARAMS ((unicode_t));
  52 static int java_part_char_p PARAMS ((unicode_t));
  53 static int java_parse_doc_section PARAMS ((int));
  54 static void java_parse_end_comment PARAMS ((int));
  55 static int java_get_unicode PARAMS ((void));
  56 static int java_read_unicode PARAMS ((java_lexer *, int *));
  57 static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
  58                                                              int *));
  59 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
  60 static int java_read_char PARAMS ((java_lexer *));
  61 static void java_allocate_new_line PARAMS ((void));
  62 static void java_unget_unicode PARAMS ((void));
  63 static unicode_t java_sneak_unicode PARAMS ((void));
  64 #ifndef JC1_LITE
  65 static int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
  66 #endif
  67
  68 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
  69 #ifndef JC1_LITE
  70 static void error_if_numeric_overflow PARAMS ((tree));
  71 #endif
  72
  73 #ifdef HAVE_ICONV
  74 /* This is nonzero if we have initialized `need_byteswap'.  */
  75 static int byteswap_init = 0;
  76
  77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
  78    big-endian order -- not native endian order.  We handle this by
  79    doing a conversion once at startup and seeing what happens.  This
  80    flag holds the results of this determination.  */
  81 static int need_byteswap = 0;
  82 #endif
  83
  84 void
  85 java_init_lex (finput, encoding)
  86      FILE *finput;
  87      const char *encoding;
  88 {
  89 #ifndef JC1_LITE
  90   int java_lang_imported = 0;
  91
  92   if (!java_lang_id)
  93     java_lang_id = get_identifier ("java.lang");
  94   if (!java_lang_cloneable)
  95     java_lang_cloneable = get_identifier ("java.lang.Cloneable");
  96   if (!java_io_serializable)
  97     java_io_serializable = get_identifier ("java.io.Serializable");
  98   if (!inst_id)
  99     inst_id = get_identifier ("inst$");
 100   if (!wpv_id)
 101     wpv_id = get_identifier ("write_parm_value$");
 102
 103   if (!java_lang_imported)
 104     {
 105       tree node = build_tree_list
 106         (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
 107       read_import_dir (TREE_PURPOSE (node));
 108       TREE_CHAIN (node) = ctxp->import_demand_list;
 109       ctxp->import_demand_list = node;
 110       java_lang_imported = 1;
 111     }
 112
 113   if (!wfl_operator)
 114     wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
 115   if (!label_id)
 116     label_id = get_identifier ("$L");
 117   if (!wfl_append)
 118     wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
 119   if (!wfl_string_buffer)
 120     wfl_string_buffer =
 121       build_expr_wfl (get_identifier (flag_emit_class_files
 122                                       ? "java.lang.StringBuffer"
 123                                       : "gnu.gcj.runtime.StringBuffer"),
 124                       NULL, 0, 0);
 125   if (!wfl_to_string)
 126     wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
 127
 128   CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
 129     CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
 130
 131   memset ((PTR) ctxp->modifier_ctx, 0, 11*sizeof (ctxp->modifier_ctx[0]));
 132   memset ((PTR) current_jcf, 0, sizeof (JCF));
 133   ctxp->current_parsed_class = NULL;
 134   ctxp->package = NULL_TREE;
 135 #endif
 136
 137   ctxp->filename = input_filename;
 138   ctxp->lineno = lineno = 0;
 139   ctxp->p_line = NULL;
 140   ctxp->c_line = NULL;
 141   ctxp->java_error_flag = 0;
 142   ctxp->lexer = java_new_lexer (finput, encoding);
 143 }
 144
 145 static char *
 146 java_sprint_unicode (line, i)
 147     struct java_line *line;
 148     int i;
 149 {
 150   static char buffer [10];
 151   if (line->unicode_escape_p [i] || line->line [i] > 128)
 152     sprintf (buffer, "\\u%04x", line->line [i]);
 153   else
 154     {
 155       buffer [0] = line->line [i];
 156       buffer [1] = '\0';
 157     }
 158   return buffer;
 159 }
 160
 161 static unicode_t
 162 java_sneak_unicode ()
 163 {
 164   return (ctxp->c_line->line [ctxp->c_line->current]);
 165 }
 166
 167 static void
 168 java_unget_unicode ()
 169 {
 170   if (!ctxp->c_line->current)
 171     /* Can't unget unicode.  */
 172     abort ();
 173
 174   ctxp->c_line->current--;
 175   ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
 176 }
 177
 178 static void
 179 java_allocate_new_line ()
 180 {
 181   unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
 182   char ahead_escape_p = (ctxp->c_line ?
 183                          ctxp->c_line->unicode_escape_ahead_p : 0);
 184
 185   if (ctxp->c_line && !ctxp->c_line->white_space_only)
 186     {
 187       if (ctxp->p_line)
 188         {
 189           free (ctxp->p_line->unicode_escape_p);
 190           free (ctxp->p_line->line);
 191           free (ctxp->p_line);
 192         }
 193       ctxp->p_line = ctxp->c_line;
 194       ctxp->c_line = NULL;              /* Reallocated.  */
 195     }
 196
 197   if (!ctxp->c_line)
 198     {
 199       ctxp->c_line = (struct java_line *)xmalloc (sizeof (struct java_line));
 200       ctxp->c_line->max = JAVA_LINE_MAX;
 201       ctxp->c_line->line = (unicode_t *)xmalloc
 202         (sizeof (unicode_t)*ctxp->c_line->max);
 203       ctxp->c_line->unicode_escape_p =
 204           (char *)xmalloc (sizeof (char)*ctxp->c_line->max);
 205       ctxp->c_line->white_space_only = 0;
 206     }
 207
 208   ctxp->c_line->line [0] = ctxp->c_line->size = 0;
 209   ctxp->c_line->char_col = ctxp->c_line->current = 0;
 210   if (ahead)
 211     {
 212       ctxp->c_line->line [ctxp->c_line->size] = ahead;
 213       ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
 214       ctxp->c_line->size++;
 215     }
 216   ctxp->c_line->ahead [0] = 0;
 217   ctxp->c_line->unicode_escape_ahead_p = 0;
 218   ctxp->c_line->lineno = ++lineno;
 219   ctxp->c_line->white_space_only = 1;
 220 }
 221
 222 /* Create a new lexer object.  */
 223
 224 java_lexer *
 225 java_new_lexer (finput, encoding)
 226      FILE *finput;
 227      const char *encoding;
 228 {
 229   java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
 230   int enc_error = 0;
 231
 232   lex->finput = finput;
 233   lex->bs_count = 0;
 234   lex->unget_value = 0;
 235   lex->hit_eof = 0;
 236
 237 #ifdef HAVE_ICONV
 238   lex->handle = iconv_open ("UCS-2", encoding);
 239   if (lex->handle != (iconv_t) -1)
 240     {
 241       lex->first = -1;
 242       lex->last = -1;
 243       lex->out_first = -1;
 244       lex->out_last = -1;
 245       lex->read_anything = 0;
 246       lex->use_fallback = 0;
 247
 248       /* Work around broken iconv() implementations by doing checking at
 249          runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
 250          then all UCS-2 encoders will be broken.  Perhaps not a valid
 251          assumption.  */
 252       if (! byteswap_init)
 253         {
 254           iconv_t handle;
 255
 256           byteswap_init = 1;
 257
 258           handle = iconv_open ("UCS-2", "UTF-8");
 259           if (handle != (iconv_t) -1)
 260             {
 261               unicode_t result;
 262               unsigned char in[3];
 263               char *inp, *outp;
 264               size_t inc, outc, r;
 265
 266               /* This is the UTF-8 encoding of \ufeff.  */
 267               in[0] = 0xef;
 268               in[1] = 0xbb;
 269               in[2] = 0xbf;
 270
 271               inp = in;
 272               inc = 3;
 273               outp = (char *) &result;
 274               outc = 2;
 275
 276               r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
 277                          &outp, &outc);
 278               iconv_close (handle);
 279               /* Conversion must be complete for us to use the result.  */
 280               if (r != (size_t) -1 && inc == 0 && outc == 0)
 281                 need_byteswap = (result != 0xfeff);
 282             }
 283         }
 284
 285       lex->byte_swap = need_byteswap;
 286     }
 287   else
 288 #endif /* HAVE_ICONV */
 289     {
 290       /* If iconv failed, use the internal decoder if the default
 291          encoding was requested.  This code is used on platforms where
 292          iconv exists but is insufficient for our needs.  For
 293          instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.  */
 294       if (strcmp (encoding, DEFAULT_ENCODING))
 295         enc_error = 1;
 296 #ifdef HAVE_ICONV
 297       else
 298         lex->use_fallback = 1;
 299 #endif /* HAVE_ICONV */
 300     }
 301
 302   if (enc_error)
 303     fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation.  If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
 304
 305   return lex;
 306 }
 307
 308 void
 309 java_destroy_lexer (lex)
 310      java_lexer *lex;
 311 {
 312 #ifdef HAVE_ICONV
 313   if (! lex->use_fallback)
 314     iconv_close (lex->handle);
 315 #endif
 316   free (lex);
 317 }
 318
 319 static int
 320 java_read_char (lex)
 321      java_lexer *lex;
 322 {
 323   if (lex->unget_value)
 324     {
 325       unicode_t r = lex->unget_value;
 326       lex->unget_value = 0;
 327       return r;
 328     }
 329
 330 #ifdef HAVE_ICONV
 331   if (! lex->use_fallback)
 332     {
 333       size_t ir, inbytesleft, in_save, out_count, out_save;
 334       char *inp, *outp;
 335       unicode_t result;
 336
 337       /* If there is data which has already been converted, use it.  */
 338       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 339         {
 340           lex->out_first = 0;
 341           lex->out_last = 0;
 342
 343           while (1)
 344             {
 345               /* See if we need to read more data.  If FIRST == 0 then
 346                  the previous conversion attempt ended in the middle of
 347                  a character at the end of the buffer.  Otherwise we
 348                  only have to read if the buffer is empty.  */
 349               if (lex->first == 0 || lex->first >= lex->last)
 350                 {
 351                   int r;
 352
 353                   if (lex->first >= lex->last)
 354                     {
 355                       lex->first = 0;
 356                       lex->last = 0;
 357                     }
 358                   if (feof (lex->finput))
 359                     return UEOF;
 360                   r = fread (&lex->buffer[lex->last], 1,
 361                              sizeof (lex->buffer) - lex->last,
 362                              lex->finput);
 363                   lex->last += r;
 364                 }
 365
 366               inbytesleft = lex->last - lex->first;
 367               out_count = sizeof (lex->out_buffer) - lex->out_last;
 368
 369               if (inbytesleft == 0)
 370                 {
 371                   /* We've tried to read and there is nothing left.  */
 372                   return UEOF;
 373                 }
 374
 375               in_save = inbytesleft;
 376               out_save = out_count;
 377               inp = &lex->buffer[lex->first];
 378               outp = &lex->out_buffer[lex->out_last];
 379               ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
 380                           &inbytesleft, &outp, &out_count);
 381
 382               /* If we haven't read any bytes, then look to see if we
 383                  have read a BOM.  */
 384               if (! lex->read_anything && out_save - out_count >= 2)
 385                 {
 386                   unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
 387                   if (uc == 0xfeff)
 388                     {
 389                       lex->byte_swap = 0;
 390                       lex->out_first += 2;
 391                     }
 392                   else if (uc == 0xfffe)
 393                     {
 394                       lex->byte_swap = 1;
 395                       lex->out_first += 2;
 396                     }
 397                   lex->read_anything = 1;
 398                 }
 399
 400               if (lex->byte_swap)
 401                 {
 402                   unsigned int i;
 403                   for (i = 0; i < out_save - out_count; i += 2)
 404                     {
 405                       char t = lex->out_buffer[lex->out_last + i];
 406                       lex->out_buffer[lex->out_last + i]
 407                         = lex->out_buffer[lex->out_last + i + 1];
 408                       lex->out_buffer[lex->out_last + i + 1] = t;
 409                     }
 410                 }
 411
 412               lex->first += in_save - inbytesleft;
 413               lex->out_last += out_save - out_count;
 414
 415               /* If we converted anything at all, move along.  */
 416               if (out_count != out_save)
 417                 break;
 418
 419               if (ir == (size_t) -1)
 420                 {
 421                   if (errno == EINVAL)
 422                     {
 423                       /* This is ok.  This means that the end of our buffer
 424                          is in the middle of a character sequence.  We just
 425                          move the valid part of the buffer to the beginning
 426                          to force a read.  */
 427                       memmove (&lex->buffer[0], &lex->buffer[lex->first],
 428                                lex->last - lex->first);
 429                       lex->last -= lex->first;
 430                       lex->first = 0;
 431                     }
 432                   else
 433                     {
 434                       /* A more serious error.  */
 435                       java_lex_error ("unrecognized character in input stream",
 436                                       0);
 437                       return UEOF;
 438                     }
 439                 }
 440             }
 441         }
 442
 443       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 444         {
 445           /* Don't have any data.  */
 446           return UEOF;
 447         }
 448
 449       /* Success.  */
 450       result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
 451       lex->out_first += 2;
 452       return result;
 453     }
 454   else
 455 #endif /* HAVE_ICONV */
 456     {
 457       int c, c1, c2;
 458       c = getc (lex->finput);
 459
 460       if (c == EOF)
 461         return UEOF;
 462       if (c < 128)
 463         return (unicode_t) c;
 464       else
 465         {
 466           if ((c & 0xe0) == 0xc0)
 467             {
 468               c1 = getc (lex->finput);
 469               if ((c1 & 0xc0) == 0x80)
 470                 {
 471                   unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
 472                   /* Check for valid 2-byte characters.  We explicitly
 473                      allow \0 because this encoding is common in the
 474                      Java world.  */
 475                   if (r == 0 || (r >= 0x80 && r <= 0x7ff))
 476                     return r;
 477                 }
 478             }
 479           else if ((c & 0xf0) == 0xe0)
 480             {
 481               c1 = getc (lex->finput);
 482               if ((c1 & 0xc0) == 0x80)
 483                 {
 484                   c2 = getc (lex->finput);
 485                   if ((c2 & 0xc0) == 0x80)
 486                     {
 487                       unicode_t r =  (unicode_t)(((c & 0xf) << 12) +
 488                                                  (( c1 & 0x3f) << 6)
 489                                                  + (c2 & 0x3f));
 490                       /* Check for valid 3-byte characters.
 491                          Don't allow surrogate, \ufffe or \uffff.  */
 492                       if (r >= 0x800 && r <= 0xffff
 493                           && ! (r >= 0xd800 && r <= 0xdfff)
 494                           && r != 0xfffe && r != 0xffff)
 495                         return r;
 496                     }
 497                 }
 498             }
 499
 500           /* We simply don't support invalid characters.  We also
 501              don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
 502              cannot be valid Java characters.  */
 503           java_lex_error ("malformed UTF-8 character", 0);
 504         }
 505     }
 506
 507   /* We only get here on error.  */
 508   return UEOF;
 509 }
 510
 511 static void
 512 java_store_unicode (l, c, unicode_escape_p)
 513     struct java_line *l;
 514     unicode_t c;
 515     int unicode_escape_p;
 516 {
 517   if (l->size == l->max)
 518     {
 519       l->max += JAVA_LINE_MAX;
 520       l->line = (unicode_t *) xrealloc (l->line, sizeof (unicode_t)*l->max);
 521       l->unicode_escape_p = (char *) xrealloc (l->unicode_escape_p,
 522                                                sizeof (char)*l->max);
 523     }
 524   l->line [l->size] = c;
 525   l->unicode_escape_p [l->size++] = unicode_escape_p;
 526 }
 527
 528 static int
 529 java_read_unicode (lex, unicode_escape_p)
 530      java_lexer *lex;
 531      int *unicode_escape_p;
 532 {
 533   int c;
 534
 535   c = java_read_char (lex);
 536   *unicode_escape_p = 0;
 537
 538   if (c != '\\')
 539     {
 540       lex->bs_count = 0;
 541       return c;
 542     }
 543
 544   ++lex->bs_count;
 545   if ((lex->bs_count) % 2 == 1)
 546     {
 547       /* Odd number of \ seen.  */
 548       c = java_read_char (lex);
 549       if (c == 'u')
 550         {
 551           unicode_t unicode = 0;
 552           int shift = 12;
 553
 554           /* Recognize any number of `u's in \u.  */
 555           while ((c = java_read_char (lex)) == 'u')
 556             ;
 557
 558           /* Unget the most recent character as it is not a `u'.  */
 559           if (c == UEOF)
 560             return UEOF;
 561           lex->unget_value = c;
 562
 563           /* Next should be 4 hex digits, otherwise it's an error.
 564              The hex value is converted into the unicode, pushed into
 565              the Unicode stream.  */
 566           for (shift = 12; shift >= 0; shift -= 4)
 567             {
 568               if ((c = java_read_char (lex)) == UEOF)
 569                 return UEOF;
 570               if (hex_p (c))
 571                 unicode |= (unicode_t)(hex_value (c) << shift);
 572               else
 573                 java_lex_error ("Non hex digit in Unicode escape sequence", 0);
 574             }
 575           lex->bs_count = 0;
 576           *unicode_escape_p = 1;
 577           return unicode;
 578         }
 579       lex->unget_value = c;
 580     }
 581   return (unicode_t) '\\';
 582 }
 583
 584 static int
 585 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
 586      java_lexer *lex;
 587      int *unicode_escape_p;
 588 {
 589   int c = java_read_unicode (lex, unicode_escape_p);
 590
 591   if (c == '\r')
 592     {
 593       /* We have to read ahead to see if we got \r\n.  In that case we
 594          return a single line terminator.  */
 595       int dummy;
 596       c = java_read_unicode (lex, &dummy);
 597       if (c != '\n')
 598         lex->unget_value = c;
 599       /* In either case we must return a newline.  */
 600       c = '\n';
 601     }
 602
 603   return c;
 604 }
 605
 606 static int
 607 java_get_unicode ()
 608 {
 609   /* It's time to read a line when...  */
 610   if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
 611     {
 612       int c;
 613       int found_chars = 0;
 614
 615       if (ctxp->lexer->hit_eof)
 616         return UEOF;
 617
 618       java_allocate_new_line ();
 619       if (ctxp->c_line->line[0] != '\n')
 620         {
 621           for (;;)
 622             {
 623               int unicode_escape_p;
 624               c = java_read_unicode_collapsing_terminators (ctxp->lexer,
 625                                                             &unicode_escape_p);
 626               if (c != UEOF)
 627                 {
 628                   found_chars = 1;
 629                   java_store_unicode (ctxp->c_line, c, unicode_escape_p);
 630                   if (ctxp->c_line->white_space_only
 631                       && !JAVA_WHITE_SPACE_P (c)
 632                       && c != '\n')
 633                     ctxp->c_line->white_space_only = 0;
 634                 }
 635               if ((c == '\n') || (c == UEOF))
 636                 break;
 637             }
 638
 639           if (c == UEOF && ! found_chars)
 640             {
 641               ctxp->lexer->hit_eof = 1;
 642               return UEOF;
 643             }
 644         }
 645     }
 646   ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
 647   JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
 648   return ctxp->c_line->line [ctxp->c_line->current++];
 649 }
 650
 651 /* Parse the end of a C style comment.
 652  * C is the first character following the '/' and '*'.  */
 653 static void
 654 java_parse_end_comment (c)
 655      int c;
 656 {
 657   for ( ;; c = java_get_unicode ())
 658     {
 659       switch (c)
 660         {
 661         case UEOF:
 662           java_lex_error ("Comment not terminated at end of input", 0);
 663           return;
 664         case '*':
 665           switch (c = java_get_unicode ())
 666             {
 667             case UEOF:
 668               java_lex_error ("Comment not terminated at end of input", 0);
 669               return;
 670             case '/':
 671               return;
 672             case '*':   /* Reparse only '*'.  */
 673               java_unget_unicode ();
 674             }
 675         }
 676     }
 677 }
 678
 679 /* Parse the documentation section. Keywords must be at the beginning
 680    of a documentation comment line (ignoring white space and any `*'
 681    character). Parsed keyword(s): @DEPRECATED.  */
 682
 683 static int
 684 java_parse_doc_section (c)
 685      int c;
 686 {
 687   int valid_tag = 0, seen_star = 0;
 688
 689   while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
 690     {
 691       switch (c)
 692         {
 693         case '*':
 694           seen_star = 1;
 695           break;
 696         case '\n': /* ULT */
 697           valid_tag = 1;
 698         default:
 699           seen_star = 0;
 700         }
 701       c = java_get_unicode();
 702     }
 703
 704   if (c == UEOF)
 705     java_lex_error ("Comment not terminated at end of input", 0);
 706
 707   if (seen_star && (c == '/'))
 708     return 1;                   /* Goto step1 in caller.  */
 709
 710   /* We're parsing `@deprecated'.  */
 711   if (valid_tag && (c == '@'))
 712     {
 713       char tag [11];
 714       int  tag_index = 0;
 715
 716       while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
 717         {
 718           c = java_get_unicode ();
 719           tag [tag_index++] = c;
 720         }
 721
 722       if (c == UEOF)
 723         java_lex_error ("Comment not terminated at end of input", 0);
 724       tag [tag_index] = '\0';
 725
 726       if (!strcmp (tag, "deprecated"))
 727         ctxp->deprecated = 1;
 728     }
 729   java_unget_unicode ();
 730   return 0;
 731 }
 732
 733 /* Return true if C is a valid start character for a Java identifier.
 734    This is only called if C >= 128 -- smaller values are handled
 735    inline.  However, this function handles all values anyway.  */
 736 static int
 737 java_start_char_p (c)
 738      unicode_t c;
 739 {
 740   unsigned int hi = c / 256;
 741   const char *const page = type_table[hi];
 742   unsigned long val = (unsigned long) page;
 743   int flags;
 744
 745   if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
 746     flags = page[c & 255];
 747   else
 748     flags = val;
 749
 750   return flags & LETTER_START;
 751 }
 752
 753 /* Return true if C is a valid part character for a Java identifier.
 754    This is only called if C >= 128 -- smaller values are handled
 755    inline.  However, this function handles all values anyway.  */
 756 static int
 757 java_part_char_p (c)
 758      unicode_t c;
 759 {
 760   unsigned int hi = c / 256;
 761   const char *const page = type_table[hi];
 762   unsigned long val = (unsigned long) page;
 763   int flags;
 764
 765   if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
 766     flags = page[c & 255];
 767   else
 768     flags = val;
 769
 770   return flags & LETTER_PART;
 771 }
 772
 773 static int
 774 java_parse_escape_sequence ()
 775 {
 776   unicode_t char_lit;
 777   int c;
 778
 779   switch (c = java_get_unicode ())
 780     {
 781     case 'b':
 782       return (unicode_t)0x8;
 783     case 't':
 784       return (unicode_t)0x9;
 785     case 'n':
 786       return (unicode_t)0xa;
 787     case 'f':
 788       return (unicode_t)0xc;
 789     case 'r':
 790       return (unicode_t)0xd;
 791     case '"':
 792       return (unicode_t)0x22;
 793     case '\'':
 794       return (unicode_t)0x27;
 795     case '\\':
 796       return (unicode_t)0x5c;
 797     case '0': case '1': case '2': case '3': case '4':
 798     case '5': case '6': case '7':
 799       {
 800         int octal_escape[3];
 801         int octal_escape_index = 0;
 802         int max = 3;
 803         int i, shift;
 804
 805         for (; octal_escape_index < max && RANGE (c, '0', '7');
 806              c = java_get_unicode ())
 807           {
 808             if (octal_escape_index == 0 && c > '3')
 809               {
 810                 /* According to the grammar, `\477' has a well-defined
 811                    meaning -- it is `\47' followed by `7'.  */
 812                 --max;
 813               }
 814             octal_escape [octal_escape_index++] = c;
 815           }
 816
 817         java_unget_unicode ();
 818
 819         for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
 820              i < octal_escape_index; i++, shift -= 3)
 821           char_lit |= (octal_escape [i] - '0') << shift;
 822
 823         return char_lit;
 824       }
 825     default:
 826       java_lex_error ("Invalid character in escape sequence", 0);
 827       return JAVA_CHAR_ERROR;
 828     }
 829 }
 830
 831 #ifndef JC1_LITE
 832 #define IS_ZERO(X) (ereal_cmp (X, dconst0) == 0)
 833
 834 /* Subroutine of java_lex: converts floating-point literals to tree
 835    nodes.  LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
 836    store the result.  FFLAG indicates whether the literal was tagged
 837    with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
 838    is the line number on which to report any error.  */
 839
 840 static void java_perform_atof   PARAMS ((YYSTYPE *, char *, int, int));
 841
 842 static void
 843 java_perform_atof (java_lval, literal_token, fflag, number_beginning)
 844      YYSTYPE *java_lval;
 845      char *literal_token;
 846      int fflag;
 847      int number_beginning;
 848 {
 849   REAL_VALUE_TYPE value;
 850   tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
 851
 852   SET_REAL_VALUE_ATOF (value,
 853                        REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
 854
 855   if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
 856     {
 857       JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
 858       value = DCONST0;
 859     }
 860   else if (IS_ZERO (value))
 861     {
 862       /* We check to see if the value is really 0 or if we've found an
 863          underflow.  We do this in the most primitive imaginable way.  */
 864       int really_zero = 1;
 865       char *p = literal_token;
 866       if (*p == '-')
 867         ++p;
 868       while (*p && *p != 'e' && *p != 'E')
 869         {
 870           if (*p != '0' && *p != '.')
 871             {
 872               really_zero = 0;
 873               break;
 874             }
 875           ++p;
 876         }
 877       if (! really_zero)
 878         {
 879           int i = ctxp->c_line->current;
 880           ctxp->c_line->current = number_beginning;
 881           java_lex_error ("Floating point literal underflow", 0);
 882           ctxp->c_line->current = i;
 883         }
 884     }
 885
 886   SET_LVAL_NODE_TYPE (build_real (type, value), type);
 887 }
 888 #endif
 889
 890 static int yylex                PARAMS ((YYSTYPE *));
 891
     /* Lexer entry point (compiled as yylex under JC1_LITE and as java_lex
        otherwise).  Skips white space and comments, then scans exactly one
        numeric, character or string literal, separator, operator, keyword
        or identifier.  Returns the token code; returns 0 at end of input
        and also after reporting an invalid input character.  When a token
        carries a value it is stored through JAVA_LVAL, either directly or
        (presumably) by the SET_LVAL_NODE* macros.  */
 892 static int
 893 #ifdef JC1_LITE
 894 yylex (java_lval)
 895 #else
 896 java_lex (java_lval)
 897 #endif
 898      YYSTYPE *java_lval;
 899 {
 900   int c;
 901   unicode_t first_unicode;
 902   int ascii_index, all_ascii;
 903   char *string;
 904
 905   /* Translation of the Unicode escape in the raw stream of Unicode
 906      characters. Takes care of line terminator.  */
 907  step1:
 908   /* Skip white spaces: SP, TAB and FF or ULT.  */
 909   for (c = java_get_unicode ();
 910        c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
 911     if (c == '\n')
 912       {
 913         ctxp->elc.line = ctxp->c_line->lineno;
 914         ctxp->elc.col  = ctxp->c_line->char_col-2;
 915       }
 916
 917   ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
 918
 919   if (c == 0x1a)                /* CTRL-Z.  */
 920     {
 921       if ((c = java_get_unicode ()) == UEOF)
 922         return 0;               /* Ok here.  */
 923       else
 924         java_unget_unicode ();  /* Caught later, at the end of the
 925                                    function.  */
 926     }
 927   /* Handle EOF here.  */
 928   if (c == UEOF)        /* Should probably do something here...  */
 929     return 0;
 930
 931   /* Take care of eventual comments.  */
 932   if (c == '/')
 933     {
 934       switch (c = java_get_unicode ())
 935         {
 936         case '/':
 937           for (;;)
 938             {
 939               c = java_get_unicode ();
 940               if (c == UEOF)
 941                 {
 942                   /* It is ok to end a `//' comment with EOF, unless
 943                      we're being pedantic.  */
 944                   if (pedantic)
 945                     java_lex_error ("Comment not terminated at end of input",
 946                                     0);
 947                   return 0;
 948                 }
 949               if (c == '\n')    /* ULT */
 950                 goto step1;
 951             }
 952           break;
 953
 954         case '*':
 955           if ((c = java_get_unicode ()) == '*')
 956             {
 957               if ((c = java_get_unicode ()) == '/')
 958                 goto step1;     /* Empty documentation comment.  */
 959               else if (java_parse_doc_section (c))
 960                 goto step1;
 961             }
 962
               /* Not finished above as a documentation comment: read one
                  more character and hand it to java_parse_end_comment,
                  then restart the scan.  */
 963           java_parse_end_comment ((c = java_get_unicode ()));
 964           goto step1;
 965           break;
 966         default:
 967           java_unget_unicode ();
 968           c = '/';
 969           break;
 970         }
 971     }
 972
 973   ctxp->elc.line = ctxp->c_line->lineno;
 974   ctxp->elc.prev_col = ctxp->elc.col;
 975   ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
 976   if (ctxp->elc.col < 0)
 977     abort ();
 978
 979   /* Numeric literals.  */
 980   if (JAVA_ASCII_DIGIT (c) || (c == '.'))
 981     {
 982       /* This section of code is borrowed from gcc/c-lex.c.  */
 983 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
 984       int parts[TOTAL_PARTS];
 985       HOST_WIDE_INT high, low;
 986       /* End borrowed section.  */
           /* NOTE(review): LITERAL_TOKEN has a fixed size and none of the
              stores through LITERAL_INDEX below check it, so a numeric
              literal of 256 or more characters would write past the end
              of the buffer.  */
 987       char literal_token [256];
 988       int  literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
 989       int  found_hex_digits = 0, found_non_octal_digits = 0;
 990       int  i;
 991 #ifndef JC1_LITE
 992       int  number_beginning = ctxp->c_line->current;
 993       tree value;
 994 #endif
 995
 996       /* We might have a . separator instead of a FP like .[0-9]*.  */
 997       if (c == '.')
 998         {
 999           unicode_t peep = java_sneak_unicode ();
1000
1001           if (!JAVA_ASCII_DIGIT (peep))
1002             {
                   /* NOTE(review): a lone `.' must not continue into the
                      numeric parsing below, so BUILD_OPERATOR presumably
                      returns from this function -- confirm against its
                      definition.  */
1003               JAVA_LEX_SEP('.');
1004               BUILD_OPERATOR (DOT_TK);
1005             }
1006         }
1007
1008       for (i = 0; i < TOTAL_PARTS; i++)
1009         parts [i] = 0;
1010
1011       if (c == '0')
1012         {
1013           c = java_get_unicode ();
1014           if (c == 'x' || c == 'X')
1015             {
1016               radix = 16;
1017               c = java_get_unicode ();
1018             }
1019           else if (JAVA_ASCII_DIGIT (c))
1020             radix = 8;
1021           else if (c == '.')
1022             {
1023               /* Push the '.' back and prepare for a FP parsing...  */
1024               java_unget_unicode ();
1025               c = '0';
1026             }
1027           else
1028             {
1029               /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}.  */
1030               JAVA_LEX_LIT ("0", 10);
1031               switch (c)
1032                 {
1033                 case 'L': case 'l':
1034                   SET_LVAL_NODE (long_zero_node);
1035                   return (INT_LIT_TK);
1036                 case 'f': case 'F':
1037                   SET_LVAL_NODE (float_zero_node);
1038                   return (FP_LIT_TK);
1039                 case 'd': case 'D':
1040                   SET_LVAL_NODE (double_zero_node);
1041                   return (FP_LIT_TK);
1042                 default:
1043                   java_unget_unicode ();
1044                   SET_LVAL_NODE (integer_zero_node);
1045                   return (INT_LIT_TK);
1046                 }
1047             }
1048         }
1049       /* Parse the first part of the literal, until we find something
1050          which is not a number.  */
1051       while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1052              JAVA_ASCII_DIGIT (c))
1053         {
1054           /* We store in a string (in case it turns out to be a FP) and in
1055              PARTS if we have to process an integer literal.  */
1056           int numeric = hex_value (c);
1057           int count;
1058
1059           /* Remember when we find a valid hexadecimal digit.  */
1060           if (radix == 16)
1061             found_hex_digits = 1;
1062           /* Remember when we find an invalid octal digit.  */
1063           else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c))
1064             found_non_octal_digits = 1;
1065
1066           literal_token [literal_index++] = c;
1067           /* This section of code is borrowed from gcc/c-lex.c.  */
1068           for (count = 0; count < TOTAL_PARTS; count++)
1069             {
1070               parts[count] *= radix;
1071               if (count)
1072                 {
1073                   parts[count]   += (parts[count-1] >> HOST_BITS_PER_CHAR);
1074                   parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1075                 }
1076               else
1077                 parts[0] += numeric;
1078             }
1079           if (parts [TOTAL_PARTS-1] != 0)
1080             overflow = 1;
1081           /* End borrowed section.  */
1082           c = java_get_unicode ();
1083         }
1084
1085       /* If we have something from the FP char set but not a digit, parse
1086          a FP literal.  */
1087       if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1088         {
               /* STAGE tracks how far into the literal we are: 0 before
                  the `.', 1 after it, 2 right after the exponent letter,
                  3 once an exponent sign or digit has been taken, and 4
                  when a type suffix ends the literal.  */
1089           int stage = 0;
1090           int seen_digit = (literal_index ? 1 : 0);
1091           int seen_exponent = 0;
1092           int fflag = 0;        /* 1 for {f,F}, 0 for {d,D}. FP literal are
1093                                    double unless specified.  */
1094
1095           /* It is ok if the radix is 8 because this just means we've
1096              seen a leading `0'.  However, radix==16 is invalid.  */
1097           if (radix == 16)
1098             java_lex_error ("Can't express non-decimal FP literal", 0);
1099           radix = 10;
1100
1101           for (;;)
1102             {
1103               if (c == '.')
1104                 {
1105                   if (stage < 1)
1106                     {
1107                       stage = 1;
1108                       literal_token [literal_index++ ] = c;
1109                       c = java_get_unicode ();
1110                     }
1111                   else
1112                     java_lex_error ("Invalid character in FP literal", 0);
1113                 }
1114
1115               if (c == 'e' || c == 'E')
1116                 {
1117                   if (stage < 2)
1118                     {
1119                       /* {E,e} must have seen at least a digit.  */
1120                       if (!seen_digit)
1121                         java_lex_error
1122                           ("Invalid FP literal, mantissa must have digit", 0);
1123                       seen_digit = 0;
1124                       seen_exponent = 1;
1125                       stage = 2;
1126                       literal_token [literal_index++] = c;
1127                       c = java_get_unicode ();
1128                     }
1129                   else
1130                     java_lex_error ("Invalid character in FP literal", 0);
1131                 }
1132               if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1133                 {
1134                   fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1135                   stage = 4;    /* So we fall through.  */
1136                 }
1137
1138               if ((c=='-' || c =='+') && stage == 2)
1139                 {
1140                   stage = 3;
1141                   literal_token [literal_index++] = c;
1142                   c = java_get_unicode ();
1143                 }
1144
1145               if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1146                   (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1147                   (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1148                   (stage == 3 && JAVA_ASCII_DIGIT (c)))
1149                 {
1150                   if (JAVA_ASCII_DIGIT (c))
1151                     seen_digit = 1;
1152                   if (stage == 2)
1153                     stage = 3;
1154                   literal_token [literal_index++ ] = c;
1155                   c = java_get_unicode ();
1156                 }
1157               else
1158                 {
1159                   if (stage != 4) /* Don't push back fF/dD.  */
1160                     java_unget_unicode ();
1161
1162                   /* An exponent (if any) must have seen a digit.  */
1163                   if (seen_exponent && !seen_digit)
1164                     java_lex_error
1165                       ("Invalid FP literal, exponent must have digit", 0);
1166
1167                   literal_token [literal_index] = '\0';
1168                   JAVA_LEX_LIT (literal_token, radix);
1169
1170 #ifndef JC1_LITE
1171                   java_perform_atof (java_lval, literal_token,
1172                                      fflag, number_beginning);
1173 #endif
1174                   return FP_LIT_TK;
1175                 }
1176             }
1177         } /* JAVA_ASCII_FPCHAR (c) */
1178
1179       /* Here we get back to converting the integral literal.  */
1180       if (radix == 16 && ! found_hex_digits)
1181         java_lex_error
1182           ("0x must be followed by at least one hexadecimal digit", 0);
1183       else if (radix == 8 && found_non_octal_digits)
1184         java_lex_error ("Octal literal contains digit out of range", 0);
1185       else if (c == 'L' || c == 'l')
1186         long_suffix = 1;
1187       else
1188         java_unget_unicode ();
1189
1190 #ifdef JAVA_LEX_DEBUG
1191       literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe.  */
1192       JAVA_LEX_LIT (literal_token, radix);
1193 #endif
1194       /* This section of code is borrowed from gcc/c-lex.c.  */
1195       if (!overflow)
1196         {
1197           bytes = GET_TYPE_PRECISION (long_type_node);
1198           for (i = bytes; i < TOTAL_PARTS; i++)
1199             if (parts [i])
1200               {
1201                 overflow = 1;
1202                 break;
1203               }
1204         }
1205       high = low = 0;
1206       for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1207         {
1208           high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1209                                               / HOST_BITS_PER_CHAR)]
1210                    << (i * HOST_BITS_PER_CHAR));
1211           low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1212         }
1213       /* End borrowed section.  */
1214
1215       /* Range checking.  */
1216       if (long_suffix)
1217         {
1218           /* 9223372036854775808L is valid if operand of a '-'. Otherwise
1219              9223372036854775807L is the biggest `long' literal that can be
1220              expressed using a 10 radix. For other radices, everything that
1221              fits within 64 bits is OK.  */
1222           int hb = (high >> 31);
1223           if (overflow || (hb && low && radix == 10)
1224               || (hb && high & 0x7fffffff && radix == 10))
1225             JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1226         }
1227       else
1228         {
1229           /* 2147483648 is valid if operand of a '-'. Otherwise,
1230              2147483647 is the biggest `int' literal that can be
1231              expressed using a 10 radix. For other radices, everything
1232              that fits within 32 bits is OK.  As all literals are
1233              signed, we sign extend here.  */
1234           int hb = (low >> 31) & 0x1;
1235           if (overflow || high || (hb && low & 0x7fffffff && radix == 10))
1236             JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1237           high = -hb;
1238         }
1239 #ifndef JC1_LITE
1240       value = build_int_2 (low, high);
1241       JAVA_RADIX10_FLAG (value) = radix == 10;
1242       SET_LVAL_NODE_TYPE (value, long_suffix ? long_type_node : int_type_node);
1243 #else
1244       SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1245                           long_suffix ? long_type_node : int_type_node);
1246 #endif
1247       return INT_LIT_TK;
1248     }
1249
1250   /* Character literals.  */
1251   if (c == '\'')
1252     {
1253       int char_lit;
1254       if ((c = java_get_unicode ()) == '\\')
1255         char_lit = java_parse_escape_sequence ();
1256       else
1257         {
1258           if (c == '\n' || c == '\'')
1259             java_lex_error ("Invalid character literal", 0);
1260           char_lit = c;
1261         }
1262
1263       c = java_get_unicode ();
1264
1265       if ((c == '\n') || (c == UEOF))
1266         java_lex_error ("Character literal not terminated at end of line", 0);
1267       if (c != '\'')
1268         java_lex_error ("Syntax error in character literal", 0);
1269
1270       if (char_lit == JAVA_CHAR_ERROR)
1271         char_lit = 0;           /* We silently convert it to zero.  */
1272
1273       JAVA_LEX_CHAR_LIT (char_lit);
1274       SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1275       return CHAR_LIT_TK;
1276     }
1277
1278   /* String literals.  */
1279   if (c == '"')
1280     {
1281       int no_error;
1282       char *string;
1283
           /* The characters of the literal are accumulated on
              TEMPORARY_OBSTACK by java_unicode_2_utf8 and released again
              before returning.  */
1284       for (no_error = 1, c = java_get_unicode ();
1285            c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1286         {
1287           if (c == '\\')
1288             c = java_parse_escape_sequence ();
1289           if (c == JAVA_CHAR_ERROR)
1290             {
1291               no_error = 0;
1292               c = 0;            /* We silently convert it to zero.  */
1293             }
1294           java_unicode_2_utf8 (c);
1295         }
1296       if (c == '\n' || c == UEOF) /* ULT.  */
1297         {
1298           lineno--;     /* Refer to the line where the terminator was seen.  */
1299           java_lex_error ("String not terminated at end of line", 0);
1300           lineno++;
1301         }
1302
1303       obstack_1grow (&temporary_obstack, '\0');
1304       string = obstack_finish (&temporary_obstack);
1305 #ifndef JC1_LITE
1306       if (!no_error || (c != '"'))
1307         java_lval->node = error_mark_node; /* FIXME: Requires further
1308                                               testing.  */
1309       else
1310         java_lval->node = build_string (strlen (string), string);
1311 #endif
1312       obstack_free (&temporary_obstack, string);
1313       return STRING_LIT_TK;
1314     }
1315
1316   /* Separator.  */
       /* NOTE(review): here and in the operator switch below, the cases
          that end in BUILD_OPERATOR or BUILD_OPERATOR2 carry no explicit
          return or break, which relies on those macros leaving the
          function (presumably by returning the token) -- confirm against
          their definitions.  */
1317   switch (c)
1318     {
1319     case '(':
1320       JAVA_LEX_SEP (c);
1321       BUILD_OPERATOR (OP_TK);
1322     case ')':
1323       JAVA_LEX_SEP (c);
1324       return CP_TK;
1325     case '{':
1326       JAVA_LEX_SEP (c);
1327       if (ctxp->ccb_indent == 1)
1328         ctxp->first_ccb_indent1 = lineno;
1329       ctxp->ccb_indent++;
1330       BUILD_OPERATOR (OCB_TK);
1331     case '}':
1332       JAVA_LEX_SEP (c);
1333       ctxp->ccb_indent--;
1334       if (ctxp->ccb_indent == 1)
1335         ctxp->last_ccb_indent1 = lineno;
1336       BUILD_OPERATOR (CCB_TK);
1337     case '[':
1338       JAVA_LEX_SEP (c);
1339       BUILD_OPERATOR (OSB_TK);
1340     case ']':
1341       JAVA_LEX_SEP (c);
1342       return CSB_TK;
1343     case ';':
1344       JAVA_LEX_SEP (c);
1345       return SC_TK;
1346     case ',':
1347       JAVA_LEX_SEP (c);
1348       return C_TK;
1349     case '.':
1350       JAVA_LEX_SEP (c);
1351       BUILD_OPERATOR (DOT_TK);
1352       /*      return DOT_TK; */
1353     }
1354
1355   /* Operators.  */
1356   switch (c)
1357     {
1358     case '=':
1359       if ((c = java_get_unicode ()) == '=')
1360         {
1361           BUILD_OPERATOR (EQ_TK);
1362         }
1363       else
1364         {
1365           /* Equals is used in two different locations. In the
1366              variable_declarator: rule, it has to be seen as '=' as opposed
1367              to being seen as an ordinary assignment operator in
1368              assignment_operators: rule.  */
1369           java_unget_unicode ();
1370           BUILD_OPERATOR (ASSIGN_TK);
1371         }
1372
1373     case '>':
1374       switch ((c = java_get_unicode ()))
1375         {
1376         case '=':
1377           BUILD_OPERATOR (GTE_TK);
1378         case '>':
1379           switch ((c = java_get_unicode ()))
1380             {
1381             case '>':
1382               if ((c = java_get_unicode ()) == '=')
1383                 {
1384                   BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1385                 }
1386               else
1387                 {
1388                   java_unget_unicode ();
1389                   BUILD_OPERATOR (ZRS_TK);
1390                 }
1391             case '=':
1392               BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1393             default:
1394               java_unget_unicode ();
1395               BUILD_OPERATOR (SRS_TK);
1396             }
1397         default:
1398           java_unget_unicode ();
1399           BUILD_OPERATOR (GT_TK);
1400         }
1401
1402     case '<':
1403       switch ((c = java_get_unicode ()))
1404         {
1405         case '=':
1406           BUILD_OPERATOR (LTE_TK);
1407         case '<':
1408           if ((c = java_get_unicode ()) == '=')
1409             {
1410               BUILD_OPERATOR2 (LS_ASSIGN_TK);
1411             }
1412           else
1413             {
1414               java_unget_unicode ();
1415               BUILD_OPERATOR (LS_TK);
1416             }
1417         default:
1418           java_unget_unicode ();
1419           BUILD_OPERATOR (LT_TK);
1420         }
1421
1422     case '&':
1423       switch ((c = java_get_unicode ()))
1424         {
1425         case '&':
1426           BUILD_OPERATOR (BOOL_AND_TK);
1427         case '=':
1428           BUILD_OPERATOR2 (AND_ASSIGN_TK);
1429         default:
1430           java_unget_unicode ();
1431           BUILD_OPERATOR (AND_TK);
1432         }
1433
1434     case '|':
1435       switch ((c = java_get_unicode ()))
1436         {
1437         case '|':
1438           BUILD_OPERATOR (BOOL_OR_TK);
1439         case '=':
1440           BUILD_OPERATOR2 (OR_ASSIGN_TK);
1441         default:
1442           java_unget_unicode ();
1443           BUILD_OPERATOR (OR_TK);
1444         }
1445
1446     case '+':
1447       switch ((c = java_get_unicode ()))
1448         {
1449         case '+':
1450           BUILD_OPERATOR (INCR_TK);
1451         case '=':
1452           BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1453         default:
1454           java_unget_unicode ();
1455           BUILD_OPERATOR (PLUS_TK);
1456         }
1457
1458     case '-':
1459       switch ((c = java_get_unicode ()))
1460         {
1461         case '-':
1462           BUILD_OPERATOR (DECR_TK);
1463         case '=':
1464           BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1465         default:
1466           java_unget_unicode ();
1467           BUILD_OPERATOR (MINUS_TK);
1468         }
1469
1470     case '*':
1471       if ((c = java_get_unicode ()) == '=')
1472         {
1473           BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1474         }
1475       else
1476         {
1477           java_unget_unicode ();
1478           BUILD_OPERATOR (MULT_TK);
1479         }
1480
1481     case '/':
1482       if ((c = java_get_unicode ()) == '=')
1483         {
1484           BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1485         }
1486       else
1487         {
1488           java_unget_unicode ();
1489           BUILD_OPERATOR (DIV_TK);
1490         }
1491
1492     case '^':
1493       if ((c = java_get_unicode ()) == '=')
1494         {
1495           BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1496         }
1497       else
1498         {
1499           java_unget_unicode ();
1500           BUILD_OPERATOR (XOR_TK);
1501         }
1502
1503     case '%':
1504       if ((c = java_get_unicode ()) == '=')
1505         {
1506           BUILD_OPERATOR2 (REM_ASSIGN_TK);
1507         }
1508       else
1509         {
1510           java_unget_unicode ();
1511           BUILD_OPERATOR (REM_TK);
1512         }
1513
1514     case '!':
1515       if ((c = java_get_unicode()) == '=')
1516         {
1517           BUILD_OPERATOR (NEQ_TK);
1518         }
1519       else
1520         {
1521           java_unget_unicode ();
1522           BUILD_OPERATOR (NEG_TK);
1523         }
1524
1525     case '?':
1526       JAVA_LEX_OP ("?");
1527       BUILD_OPERATOR (REL_QM_TK);
1528     case ':':
1529       JAVA_LEX_OP (":");
1530       BUILD_OPERATOR (REL_CL_TK);
1531     case '~':
1532       BUILD_OPERATOR (NOT_TK);
1533     }
1534
1535   /* Keyword, boolean literal or null literal.  */
1536   for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1537        JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1538     {
1539       java_unicode_2_utf8 (c);
1540       if (all_ascii && c >= 128)
1541         all_ascii = 0;
1542       ascii_index++;
1543     }
1544
1545   obstack_1grow (&temporary_obstack, '\0');
1546   string = obstack_finish (&temporary_obstack);
       /* Push back the character that stopped the scan above.  */
1547   java_unget_unicode ();
1548
1549   /* If we have something all ascii, we consider a keyword, a boolean
1550      literal, a null literal or an all ASCII identifier.  Otherwise,
1551      this is an identifier (possibly not respecting formation rule).  */
1552   if (all_ascii)
1553     {
1554       const struct java_keyword *kw;
1555       if ((kw=java_keyword (string, ascii_index)))
1556         {
1557           JAVA_LEX_KW (string);
1558           switch (kw->token)
1559             {
1560             case PUBLIC_TK:       case PROTECTED_TK: case STATIC_TK:
1561             case ABSTRACT_TK:     case FINAL_TK:     case NATIVE_TK:
1562             case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1563             case PRIVATE_TK:      case STRICT_TK:
1564               SET_MODIFIER_CTX (kw->token);
1565               return MODIFIER_TK;
1566             case FLOAT_TK:
1567               SET_LVAL_NODE (float_type_node);
1568               return FP_TK;
1569             case DOUBLE_TK:
1570               SET_LVAL_NODE (double_type_node);
1571               return FP_TK;
1572             case BOOLEAN_TK:
1573               SET_LVAL_NODE (boolean_type_node);
1574               return BOOLEAN_TK;
1575             case BYTE_TK:
1576               SET_LVAL_NODE (byte_type_node);
1577               return INTEGRAL_TK;
1578             case SHORT_TK:
1579               SET_LVAL_NODE (short_type_node);
1580               return INTEGRAL_TK;
1581             case INT_TK:
1582               SET_LVAL_NODE (int_type_node);
1583               return INTEGRAL_TK;
1584             case LONG_TK:
1585               SET_LVAL_NODE (long_type_node);
1586               return INTEGRAL_TK;
1587             case CHAR_TK:
1588               SET_LVAL_NODE (char_type_node);
1589               return INTEGRAL_TK;
1590
1591               /* Keyword based literals.  */
1592             case TRUE_TK:
1593             case FALSE_TK:
1594               SET_LVAL_NODE ((kw->token == TRUE_TK ?
1595                               boolean_true_node : boolean_false_node));
1596               return BOOL_LIT_TK;
1597             case NULL_TK:
1598               SET_LVAL_NODE (null_pointer_node);
1599               return NULL_TK;
1600
1601               /* Some keyword we want to retain information on the location
1602                  they were found.  */
1603             case CASE_TK:
1604             case DEFAULT_TK:
1605             case SUPER_TK:
1606             case THIS_TK:
1607             case RETURN_TK:
1608             case BREAK_TK:
1609             case CONTINUE_TK:
1610             case TRY_TK:
1611             case CATCH_TK:
1612             case THROW_TK:
1613             case INSTANCEOF_TK:
1614               BUILD_OPERATOR (kw->token);
1615
1616             default:
1617               return kw->token;
1618             }
1619         }
1620     }
1621
1622   /* We may have an ID here.  */
1623   if (JAVA_START_CHAR_P (first_unicode))
1624     {
1625       JAVA_LEX_ID (string);
1626       java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1627       return ID_TK;
1628     }
1629
1630   /* Everything else is an invalid character in the input.  */
1631   {
1632     char lex_error_buffer [128];
1633     sprintf (lex_error_buffer, "Invalid character `%s' in input",
1634              java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1635     java_lex_error (lex_error_buffer, 1);
1636   }
1637   return 0;
1638 }
1639
1640 #ifndef JC1_LITE
1641 /* This is called by the parser to see if an error should be generated
1642    due to numeric overflow.  This function only handles the particular
1643    case of the largest negative value, and is only called in the case
1644    where this value is not preceded by `-'.  */
1645 static void
1646 error_if_numeric_overflow (value)
1647      tree value;
1648 {
1649   if (TREE_CODE (value) == INTEGER_CST && JAVA_RADIX10_FLAG (value))
1650     {
1651       unsigned HOST_WIDE_INT lo, hi;
1652
1653       lo = TREE_INT_CST_LOW (value);
1654       hi = TREE_INT_CST_HIGH (value);
1655       if (TREE_TYPE (value) == long_type_node)
1656         {
1657           int hb = (hi >> 31);
1658           if (hb && !(hi & 0x7fffffff))
1659             java_lex_error ("Numeric overflow for `long' literal", 0);
1660         }
1661       else
1662         {
1663           int hb = (lo >> 31) & 0x1;
1664           if (hb && !(lo & 0x7fffffff))
1665             java_lex_error ("Numeric overflow for `int' literal", 0);
1666         }
1667     }
1668 }
1669 #endif /* JC1_LITE */
1670
1671 static void
1672 java_unicode_2_utf8 (unicode)
1673     unicode_t unicode;
1674 {
1675   if (RANGE (unicode, 0x01, 0x7f))
1676     obstack_1grow (&temporary_obstack, (char)unicode);
1677   else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
1678     {
1679       obstack_1grow (&temporary_obstack,
1680                      (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1681       obstack_1grow (&temporary_obstack,
1682                      (unsigned char)(0x80 | (unicode & 0x3f)));
1683     }
1684   else                          /* Range 0x800-0xffff.  */
1685     {
1686       obstack_1grow (&temporary_obstack,
1687                      (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1688       obstack_1grow (&temporary_obstack,
1689                      (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1690       obstack_1grow (&temporary_obstack,
1691                      (unsigned char)(0x80 | (unicode & 0x003f)));
1692     }
1693 }
1694
1695 #ifndef JC1_LITE
1696 static tree
1697 build_wfl_node (node)
1698      tree node;
1699 {
1700   node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1701   /* Prevent java_complete_lhs from short-circuiting node (if constant).  */
1702   TREE_TYPE (node) = NULL_TREE;
1703   return node;
1704 }
1705 #endif
1706
1707 static void
1708 java_lex_error (msg, forward)
1709      const char *msg ATTRIBUTE_UNUSED;
1710      int forward ATTRIBUTE_UNUSED;
1711 {
1712 #ifndef JC1_LITE
1713   ctxp->elc.line = ctxp->c_line->lineno;
1714   ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1715
1716   /* Might be caught in the middle of some error report.  */
1717   ctxp->java_error_flag = 0;
1718   java_error (NULL);
1719   java_error (msg);
1720 #endif
1721 }
1722
1723 #ifndef JC1_LITE
/* Return 1 if C, a character just read from FP, ends a line, and 0
   otherwise.  A line ends with `\n', `\r', or `\r' followed by `\n';
   in the last case the `\n' is consumed from FP as well.  */
static int
java_is_eol (fp, c)
  FILE *fp;
  int c;
{
  if (c == '\n')
    return 1;

  if (c == '\r')
    {
      /* Look one character ahead.  A `\n' belongs to this terminator
         and is swallowed; anything else, short of end of file,
         starts the next line and is given back.  */
      int peek = getc (fp);

      if (peek != EOF && peek != '\n')
        ungetc (peek, fp);
      return 1;
    }

  return 0;
}
1743 #endif
1744
1745 char *
1746 java_get_line_col (filename, line, col)
1747      const char *filename ATTRIBUTE_UNUSED;
1748      int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
1749 {
1750 #ifdef JC1_LITE
1751   return 0;
1752 #else
1753   /* Dumb implementation. Doesn't try to cache or optimize things.  */
1754   /* First line of the file is line 1, first column is 1.  */
1755
1756   /* COL == -1 means, at the CR/LF in LINE.  */
1757   /* COL == -2 means, at the first non space char in LINE.  */
1758
1759   FILE *fp;
1760   int c, ccol, cline = 1;
1761   int current_line_col = 0;
1762   int first_non_space = 0;
1763   char *base;
1764
1765   if (!(fp = fopen (filename, "r")))
1766     fatal_io_error ("can't open %s", filename);
1767
1768   while (cline != line)
1769     {
1770       c = getc (fp);
1771       if (c == EOF)
1772         {
1773           static const char msg[] = "<<file too short - unexpected EOF>>";
1774           obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1775           goto have_line;
1776         }
1777       if (java_is_eol (fp, c))
1778         cline++;
1779     }
1780
1781   /* Gather the chars of the current line in a buffer.  */
1782   for (;;)
1783     {
1784       c = getc (fp);
1785       if (c < 0 || java_is_eol (fp, c))
1786         break;
1787       if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1788         first_non_space = current_line_col;
1789       obstack_1grow (&temporary_obstack, c);
1790       current_line_col++;
1791     }
1792  have_line:
1793
1794   obstack_1grow (&temporary_obstack, '\n');
1795
1796   if (col == -1)
1797     {
1798       col = current_line_col;
1799       first_non_space = 0;
1800     }
1801   else if (col == -2)
1802     col = first_non_space;
1803   else
1804     first_non_space = 0;
1805
1806   /* Place the '^' a the right position.  */
1807   base = obstack_base (&temporary_obstack);
1808   for (ccol = 1; ccol <= col+3; ccol++)
1809     {
1810       /* Compute \t when reaching first_non_space.  */
1811       char c = (first_non_space ?
1812                 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1813       obstack_1grow (&temporary_obstack, c);
1814     }
1815   obstack_grow0 (&temporary_obstack, "^", 1);
1816
1817   fclose (fp);
1818   return obstack_finish (&temporary_obstack);
1819 #endif
1820 }
1821
1822 #ifndef JC1_LITE
/* Compare the LENGTH bytes at STR, read one character at a time with
   UTF8_GET, against the NUL-terminated string NAME.  At the first
   position where they differ, return the character from STR minus the
   one from NAME.  If all of NAME matched, return 0 when STR was used
   up exactly and 1 when some of STR is left over.  */
static int
utf8_cmp (str, length, name)
     const unsigned char *str;
     int length;
     const char *name;
{
  const unsigned char *limit = str + length;
  const char *expected;

  for (expected = name; *expected; ++expected)
    {
      /* UTF8_GET moves STR forward as it reads.  */
      int ch = UTF8_GET (str, limit);

      if (ch != *expected)
        return ch - *expected;
    }

  if (str == limit)
    return 0;
  return 1;
}
1841
1842 /* A sorted list of all C++ keywords.  The entries are in ascending
        byte order, so upper case letters come before `_' and `_' comes
        before lower case letters.
        NOTE(review): cxx_keyword_p below appears to bisect this table,
        so new entries presumably have to keep that order -- confirm.  */
1843
1844 static const char *const cxx_keywords[] =
1845 {
1846   "_Complex",
1847   "__alignof",
1848   "__alignof__",
1849   "__asm",
1850   "__asm__",
1851   "__attribute",
1852   "__attribute__",
1853   "__builtin_va_arg",
1854   "__complex",
1855   "__complex__",
1856   "__const",
1857   "__const__",
1858   "__extension__",
1859   "__imag",
1860   "__imag__",
1861   "__inline",
1862   "__inline__",
1863   "__label__",
1864   "__null",
1865   "__real",
1866   "__real__",
1867   "__restrict",
1868   "__restrict__",
1869   "__signed",
1870   "__signed__",
1871   "__typeof",
1872   "__typeof__",
1873   "__volatile",
1874   "__volatile__",
1875   "and",
1876   "and_eq",
1877   "asm",
1878   "auto",
1879   "bitand",
1880   "bitor",
1881   "bool",
1882   "break",
1883   "case",
1884   "catch",
1885   "char",
1886   "class",
1887   "compl",
1888   "const",
1889   "const_cast",
1890   "continue",
1891   "default",
1892   "delete",
1893   "do",
1894   "double",
1895   "dynamic_cast",
1896   "else",
1897   "enum",
1898   "explicit",
1899   "export",
1900   "extern",
1901   "false",
1902   "float",
1903   "for",
1904   "friend",
1905   "goto",
1906   "if",
1907   "inline",
1908   "int",
1909   "long",
1910   "mutable",
1911   "namespace",
1912   "new",
1913   "not",
1914   "not_eq",
1915   "operator",
1916   "or",
1917   "or_eq",
1918   "private",
1919   "protected",
1920   "public",
1921   "register",
1922   "reinterpret_cast",
1923   "return",
1924   "short",
1925   "signed",
1926   "sizeof",
1927   "static",
1928   "static_cast",
1929   "struct",
1930   "switch",
1931   "template",
1932   "this",
1933   "throw",
1934   "true",
1935   "try",
1936   "typedef",
1937   "typeid",
1938   "typename",
1939   "typeof",
1940   "union",
1941   "unsigned",
1942   "using",
1943   "virtual",
1944   "void",
1945   "volatile",
1946   "wchar_t",
1947   "while",
1948   "xor",
1949   "xor_eq"
1950 };
1951
1952 /* Return true if NAME is a C++ keyword.  */
1953
1954 int
1955 cxx_keyword_p (name, length)
1956      const char *name;
1957      int length;
1958 {
1959   int last = ARRAY_SIZE (cxx_keywords);
1960   int first = 0;
1961   int mid = (last + first) / 2;
1962   int old = -1;
1963
1964   for (mid = (last + first) / 2;
1965        mid != old;
1966        old = mid, mid = (last + first) / 2)
1967     {
1968       int kwl = strlen (cxx_keywords[mid]);
1969       int min_length = kwl > length ? length : kwl;
1970       int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
1971
1972       if (r == 0)
1973         {
1974           int i;
1975           /* We've found a match if all the remaining characters are `$'.  */
1976           for (i = min_length; i < length && name[i] == '$'; ++i)
1977             ;
1978           if (i == length)
1979             return 1;
1980           r = 1;
1981         }
1982
1983       if (r < 0)
1984         last = mid;
1985       else
1986         first = mid;
1987     }
1988   return 0;
1989 }
1990 #endif /* JC1_LITE */