1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
5 This file is part of GNU CC.
7 GNU CC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU CC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU CC; see the file COPYING. If not, write to
19 the Free Software Foundation, 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA.
22 Java and all Java-based marks are trademarks or registered trademarks
23 of Sun Microsystems, Inc. in the United States and other countries.
24 The Free Software Foundation is independent of Sun Microsystems, Inc. */
26 /* It defines java_lex (yylex) that reads a Java ASCII source file
27 possibly containing Unicode escape sequences or UTF-8 encoded
28 characters and returns a token for everything found but comments,
29 white spaces and line terminators. When necessary, it also fills
30 the java_lval (yylval) union. It's implemented to be called by a
31 re-entrant parser generated by Bison.
33 The lexical analysis conforms to the Java grammar described in "The
34 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
35 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
39 #include "chartables.h"
41 /* Function declarations. */
42 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
43 static void java_unicode_2_utf8 PARAMS ((unicode_t));
44 static void java_lex_error PARAMS ((const char *, int));
46 static int java_is_eol PARAMS ((FILE *, int));
47 static tree build_wfl_node PARAMS ((tree));
49 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
50 static int java_parse_escape_sequence PARAMS ((void));
51 static int java_start_char_p PARAMS ((unicode_t));
52 static int java_part_char_p PARAMS ((unicode_t));
53 static int java_parse_doc_section PARAMS ((int));
54 static void java_parse_end_comment PARAMS ((int));
55 static int java_get_unicode PARAMS ((void));
56 static int java_read_unicode PARAMS ((java_lexer *, int *));
57 static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
59 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
60 static int java_read_char PARAMS ((java_lexer *));
61 static void java_allocate_new_line PARAMS ((void));
62 static void java_unget_unicode PARAMS ((void));
63 static unicode_t java_sneak_unicode PARAMS ((void));
65 static int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
68 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
70 static void error_if_numeric_overflow PARAMS ((tree));
74 /* This is nonzero if we have initialized `need_byteswap'. */
75 static int byteswap_init = 0;
77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
78 big-endian order -- not native endian order. We handle this by
79 doing a conversion once at startup and seeing what happens. This
80 flag holds the results of this determination. */
81 static int need_byteswap = 0;
/* Prepare the parser context CTXP and the shared identifier and WFL
   nodes for lexing a new input, then create the lexer for FINPUT and
   ENCODING and store it in ctxp->lexer.
   NOTE(review): this listing has lost a number of source lines
   (braces, parameter declarations and some guards), so the comments
   here describe only the statements that are still visible.  */
85 java_init_lex (finput, encoding)
90 int java_lang_imported = 0;
/* Intern the identifiers for the well-known names used by the parser.  */
93 java_lang_id = get_identifier ("java.lang");
94 if (!java_lang_cloneable)
95 java_lang_cloneable = get_identifier ("java.lang.Cloneable");
96 if (!java_io_serializable)
97 java_io_serializable = get_identifier ("java.io.Serializable");
99 inst_id = get_identifier ("inst$");
101 wpv_id = get_identifier ("write_parm_value$");
/* Read the java.lang import directory and push an entry for it on the
   head of ctxp->import_demand_list.  */
103 if (!java_lang_imported)
105 tree node = build_tree_list
106 (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
107 read_import_dir (TREE_PURPOSE (node));
108 TREE_CHAIN (node) = ctxp->import_demand_list;
109 ctxp->import_demand_list = node;
110 java_lang_imported = 1;
/* Build the WFL nodes shared by the parser.  */
114 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
116 label_id = get_identifier ("$L");
118 wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
119 if (!wfl_string_buffer)
121 build_expr_wfl (get_identifier (flag_emit_class_files
122 ? "java.lang.StringBuffer"
123 : "gnu.gcj.runtime.StringBuffer"),
126 wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
/* Reset the per-file state kept in the parser context.  */
128 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
129 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
/* NOTE(review): the element count 11 is hard-coded; confirm that it
   matches the declared size of ctxp->modifier_ctx.  */
131 memset ((PTR) ctxp->modifier_ctx, 0, 11*sizeof (ctxp->modifier_ctx[0]));
132 memset ((PTR) current_jcf, 0, sizeof (JCF));
133 ctxp->current_parsed_class = NULL;
134 ctxp->package = NULL_TREE;
137 ctxp->filename = input_filename;
138 ctxp->lineno = lineno = 0;
141 ctxp->java_error_flag = 0;
142 ctxp->lexer = java_new_lexer (finput, encoding);
/* Format character I of LINE for use in a diagnostic.  The text is
   built in a static buffer, so each call overwrites the previous
   result.  A character that was written as a Unicode escape, or whose
   value is greater than 128, is printed as a backslash-u escape with
   four hex digits; any other character is stored as is in buffer[0].
   NOTE(review): the terminating store and the return statement are
   not visible in this listing; presumably BUFFER is returned.  */
146 java_sprint_unicode (line, i)
147 struct java_line *line;
150 static char buffer [10];
151 if (line->unicode_escape_p [i] || line->line [i] > 128)
152 sprintf (buffer, "\\u%04x", line->line [i]);
155 buffer [0] = line->line [i];
/* Return the character at the current read position of the current
   line without consuming it.  */
162 java_sneak_unicode ()
164 return (ctxp->c_line->line [ctxp->c_line->current]);
/* Push the most recently read character back: move the read position
   of the current line one step back and take the matching column
   delta off char_col.  A read position of zero is treated as the case
   where nothing can be pushed back (what is done then is not visible
   in this listing).  */
168 java_unget_unicode ()
170 if (!ctxp->c_line->current)
171 /* Can't unget unicode. */
174 ctxp->c_line->current--;
175 ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
/* Start a new current line in the parser context.
   NOTE(review): the conditions guarding several of the statements
   below are missing from this listing; the comments describe only the
   visible statements.  */
179 java_allocate_new_line ()
/* Remember the look-ahead character of the line being left, and
   whether it came from a Unicode escape, so that it can be carried
   over to the new line.  */
181 unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
182 char ahead_escape_p = (ctxp->c_line ?
183 ctxp->c_line->unicode_escape_ahead_p : 0);
/* A current line that holds more than white space becomes the
   previous line; the buffers of the old previous line are freed.  */
185 if (ctxp->c_line && !ctxp->c_line->white_space_only)
189 free (ctxp->p_line->unicode_escape_p);
190 free (ctxp->p_line->line);
193 ctxp->p_line = ctxp->c_line;
194 ctxp->c_line = NULL; /* Reallocated. */
/* Allocate a line with room for JAVA_LINE_MAX characters and the
   parallel array of escape flags.  */
199 ctxp->c_line = (struct java_line *)xmalloc (sizeof (struct java_line));
200 ctxp->c_line->max = JAVA_LINE_MAX;
201 ctxp->c_line->line = (unicode_t *)xmalloc
202 (sizeof (unicode_t)*ctxp->c_line->max);
203 ctxp->c_line->unicode_escape_p =
204 (char *)xmalloc (sizeof (char)*ctxp->c_line->max);
205 ctxp->c_line->white_space_only = 0;
/* Make the line empty and rewind its read position and column.  */
208 ctxp->c_line->line [0] = ctxp->c_line->size = 0;
209 ctxp->c_line->char_col = ctxp->c_line->current = 0;
/* Store the saved look-ahead character as the next character of the
   new line.  */
212 ctxp->c_line->line [ctxp->c_line->size] = ahead;
213 ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
214 ctxp->c_line->size++;
/* Clear the look-ahead of the new line, give it the next line number
   and mark it as white space only until proven otherwise.  */
216 ctxp->c_line->ahead [0] = 0;
217 ctxp->c_line->unicode_escape_ahead_p = 0;
218 ctxp->c_line->lineno = ++lineno;
219 ctxp->c_line->white_space_only = 1;
222 /* Create a new lexer object reading from FINPUT, whose contents are
   expected to be in ENCODING.  The object is allocated with xmalloc.
   When iconv can convert ENCODING to UCS-2 it is used, and a one-time
   probe decides whether its output must be byte swapped.  A failure
   ends in the fatal_error call at the end of this function.
   NOTE(review): several lines of this function, including the tail
   of one comment and the preprocessor conditionals, are missing from
   this listing.  */
225 java_new_lexer (finput, encoding)
227 const char *encoding;
229 java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
232 lex->finput = finput;
234 lex->unget_value = 0;
/* Ask iconv for a converter from ENCODING to UCS-2; a handle equal to
   (iconv_t) -1 means that the request failed.  */
238 lex->handle = iconv_open ("UCS-2", encoding);
239 if (lex->handle != (iconv_t) -1)
245 lex->read_anything = 0;
246 lex->use_fallback = 0;
248 /* Work around broken iconv() implementations by doing checking at
249 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
250 then all UCS-2 encoders will be broken. Perhaps not a valid
258 handle = iconv_open ("UCS-2", "UTF-8");
259 if (handle != (iconv_t) -1)
266 /* This is the UTF-8 encoding of \ufeff. */
273 outp = (char *) &result;
276 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
278 iconv_close (handle);
279 /* Conversion must be complete for us to use the result. */
280 if (r != (size_t) -1 && inc == 0 && outc == 0)
281 need_byteswap = (result != 0xfeff);
285 lex->byte_swap = need_byteswap;
288 #endif /* HAVE_ICONV */
290 /* If iconv failed, use the internal decoder if the default
291 encoding was requested. This code is used on platforms where
292 iconv exists but is insufficient for our needs. For
293 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2. */
294 if (strcmp (encoding, DEFAULT_ENCODING))
298 lex->use_fallback = 1;
299 #endif /* HAVE_ICONV */
303 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
/* Tear down LEX.  The iconv handle is closed only when the lexer is
   not using the fallback decoder.  Any further cleanup is not visible
   in this listing.  */
309 java_destroy_lexer (lex)
313 if (! lex->use_fallback)
314 iconv_close (lex->handle);
/* NOTE(review): the header of this function is missing from this
   listing.  Judging from the prototypes and the callers it is
   presumably the body of java_read_char (lex), which delivers the
   next character of the input.

   Visible behaviour: a pending lex->unget_value is consumed first.
   Unless lex->use_fallback is set, bytes are read from lex->finput
   with fread into lex->buffer, converted by iconv into
   lex->out_buffer, byte swapped pair by pair where required, and the
   result is taken from lex->out_buffer at lex->out_first.  Otherwise
   bytes are read with getc and decoded as UTF-8 sequences of one, two
   or three bytes; anything else is reported as a malformed UTF-8
   character.  */
323 if (lex->unget_value)
325 unicode_t r = lex->unget_value;
326 lex->unget_value = 0;
331 if (! lex->use_fallback)
333 size_t ir, inbytesleft, in_save, out_count, out_save;
337 /* If there is data which has already been converted, use it. */
338 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
345 /* See if we need to read more data. If FIRST == 0 then
346 the previous conversion attempt ended in the middle of
347 a character at the end of the buffer. Otherwise we
348 only have to read if the buffer is empty. */
349 if (lex->first == 0 || lex->first >= lex->last)
353 if (lex->first >= lex->last)
358 if (feof (lex->finput))
360 r = fread (&lex->buffer[lex->last], 1,
361 sizeof (lex->buffer) - lex->last,
366 inbytesleft = lex->last - lex->first;
367 out_count = sizeof (lex->out_buffer) - lex->out_last;
369 if (inbytesleft == 0)
371 /* We've tried to read and there is nothing left. */
375 in_save = inbytesleft;
376 out_save = out_count;
377 inp = &lex->buffer[lex->first];
378 outp = &lex->out_buffer[lex->out_last];
379 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
380 &inbytesleft, &outp, &out_count);
382 /* If we haven't read any bytes, then look to see if we
384 if (! lex->read_anything && out_save - out_count >= 2)
386 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
392 else if (uc == 0xfffe)
397 lex->read_anything = 1;
403 for (i = 0; i < out_save - out_count; i += 2)
405 char t = lex->out_buffer[lex->out_last + i];
406 lex->out_buffer[lex->out_last + i]
407 = lex->out_buffer[lex->out_last + i + 1];
408 lex->out_buffer[lex->out_last + i + 1] = t;
412 lex->first += in_save - inbytesleft;
413 lex->out_last += out_save - out_count;
415 /* If we converted anything at all, move along. */
416 if (out_count != out_save)
419 if (ir == (size_t) -1)
423 /* This is ok. This means that the end of our buffer
424 is in the middle of a character sequence. We just
425 move the valid part of the buffer to the beginning
427 memmove (&lex->buffer[0], &lex->buffer[lex->first],
428 lex->last - lex->first);
429 lex->last -= lex->first;
434 /* A more serious error. */
435 java_lex_error ("unrecognized character in input stream",
443 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
445 /* Don't have any data. */
450 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
455 #endif /* HAVE_ICONV */
/* Built-in decoder: read raw bytes with getc and decode UTF-8.  */
458 c = getc (lex->finput);
463 return (unicode_t) c;
466 if ((c & 0xe0) == 0xc0)
468 c1 = getc (lex->finput);
469 if ((c1 & 0xc0) == 0x80)
471 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
472 /* Check for valid 2-byte characters. We explicitly
473 allow \0 because this encoding is common in the
475 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
479 else if ((c & 0xf0) == 0xe0)
481 c1 = getc (lex->finput);
482 if ((c1 & 0xc0) == 0x80)
484 c2 = getc (lex->finput);
485 if ((c2 & 0xc0) == 0x80)
487 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
490 /* Check for valid 3-byte characters.
491 Don't allow surrogate, \ufffe or \uffff. */
492 if (r >= 0x800 && r <= 0xffff
493 && ! (r >= 0xd800 && r <= 0xdfff)
494 && r != 0xfffe && r != 0xffff)
500 /* We simply don't support invalid characters. We also
501 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
502 cannot be valid Java characters. */
503 java_lex_error ("malformed UTF-8 character", 0);
507 /* We only get here on error. */
/* Append character C to line L and record UNICODE_ESCAPE_P for it in
   the parallel unicode_escape_p array.  When the line is full, both
   arrays are first enlarged by JAVA_LINE_MAX entries with xrealloc.  */
512 java_store_unicode (l, c, unicode_escape_p)
515 int unicode_escape_p;
517 if (l->size == l->max)
519 l->max += JAVA_LINE_MAX;
520 l->line = (unicode_t *) xrealloc (l->line, sizeof (unicode_t)*l->max);
521 l->unicode_escape_p = (char *) xrealloc (l->unicode_escape_p,
522 sizeof (char)*l->max);
524 l->line [l->size] = c;
525 l->unicode_escape_p [l->size++] = unicode_escape_p;
/* Read the next character from LEX, translating Unicode escapes (a
   backslash, one or more u characters and four hex digits) into the
   character they denote.  *UNICODE_ESCAPE_P is cleared on entry and
   set to 1 when the result came from such an escape.  The parity of
   lex->bs_count is used to tell whether the backslash just read can
   start an escape.  A character read ahead that does not belong to an
   escape is handed back through lex->unget_value.
   NOTE(review): the tests that select between the branches below are
   partly missing from this listing.  */
529 java_read_unicode (lex, unicode_escape_p)
531 int *unicode_escape_p;
535 c = java_read_char (lex);
536 *unicode_escape_p = 0;
545 if ((lex->bs_count) % 2 == 1)
547 /* Odd number of \ seen. */
548 c = java_read_char (lex);
551 unicode_t unicode = 0;
554 /* Recognize any number of `u's in \u. */
555 while ((c = java_read_char (lex)) == 'u')
558 /* Unget the most recent character as it is not a `u'. */
561 lex->unget_value = c;
563 /* Next should be 4 hex digits, otherwise it's an error.
564 The hex value is converted into the unicode, pushed into
565 the Unicode stream. */
/* The digits are combined most significant first: shifts of 12, 8, 4
   and 0 bits.  */
566 for (shift = 12; shift >= 0; shift -= 4)
568 if ((c = java_read_char (lex)) == UEOF)
571 unicode |= (unicode_t)(hex_value (c) << shift);
573 java_lex_error ("Non hex digit in Unicode escape sequence", 0);
576 *unicode_escape_p = 1;
579 lex->unget_value = c;
581 return (unicode_t) '\\';
/* Read a character from LEX with java_read_unicode, reporting a line
   terminator made of two characters as a single newline.  After the
   first character of a possible pair one more character is read, and
   it is handed back through lex->unget_value when it does not
   complete the pair (the tests themselves are not visible in this
   listing).  */
585 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
587 int *unicode_escape_p;
589 int c = java_read_unicode (lex, unicode_escape_p);
593 /* We have to read ahead to see if we got \r\n. In that case we
594 return a single line terminator. */
596 c = java_read_unicode (lex, &dummy);
598 lex->unget_value = c;
599 /* In either case we must return a newline. */
/* NOTE(review): the header of this function is missing from this
   listing; judging from the prototypes and the callers it is
   presumably the body of java_get_unicode ().

   Visible behaviour: when there is no current line, or the current
   line has been read completely, a new line is allocated and filled
   with characters from ctxp->lexer up to and including a newline or
   UEOF.  The character at the read position of the current line is
   then returned, and the read position and the column are advanced.  */
609 /* It's time to read a line when... */
610 if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
615 if (ctxp->lexer->hit_eof)
618 java_allocate_new_line ();
619 if (ctxp->c_line->line[0] != '\n')
623 int unicode_escape_p;
624 c = java_read_unicode_collapsing_terminators (ctxp->lexer,
629 java_store_unicode (ctxp->c_line, c, unicode_escape_p);
/* The first character that is not white space clears the
   white_space_only flag of the line.  */
630 if (ctxp->c_line->white_space_only
631 && !JAVA_WHITE_SPACE_P (c)
633 ctxp->c_line->white_space_only = 0;
635 if ((c == '\n') || (c == UEOF))
/* UEOF on a line where no character was found marks the lexer as
   having hit the end of the input.  */
639 if (c == UEOF && ! found_chars)
641 ctxp->lexer->hit_eof = 1;
646 ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
647 JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
648 return ctxp->c_line->line [ctxp->c_line->current++];
651 /* Parse the end of a C style comment.
652 * C is the first character following the '/' and '*'.
   Characters are consumed with java_get_unicode.  Reaching the end of
   the input before the comment is closed is reported through
   java_lex_error.  */
654 java_parse_end_comment (c)
657 for ( ;; c = java_get_unicode ())
662 java_lex_error ("Comment not terminated at end of input", 0);
665 switch (c = java_get_unicode ())
668 java_lex_error ("Comment not terminated at end of input", 0);
672 case '*': /* Reparse only '*'. */
673 java_unget_unicode ();
679 /* Parse the documentation section. Keywords must be at the beginning
680 of a documentation comment line (ignoring white space and any `*'
681 character). Parsed keyword(s): @DEPRECATED.
   C is the first character of the section.  Returns 1 when the
   comment was closed while skipping the leading characters of a line,
   in which case the caller restarts at its step1 label.  Sets
   ctxp->deprecated when the tag read is deprecated.  */
684 java_parse_doc_section (c)
687 int valid_tag = 0, seen_star = 0;
689 while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
701 c = java_get_unicode();
705 java_lex_error ("Comment not terminated at end of input", 0);
707 if (seen_star && (c == '/'))
708 return 1; /* Goto step1 in caller. */
710 /* We're parsing `@deprecated'. */
711 if (valid_tag && (c == '@'))
/* Collect at most 10 tag characters, stopping early at UEOF, a space
   or a newline.  NOTE(review): the declaration of TAG is not visible
   here; it needs room for 10 characters plus the terminating NUL
   written below.  */
716 while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
718 c = java_get_unicode ();
719 tag [tag_index++] = c;
723 java_lex_error ("Comment not terminated at end of input", 0);
724 tag [tag_index] = '\0';
726 if (!strcmp (tag, "deprecated"))
727 ctxp->deprecated = 1;
729 java_unget_unicode ();
733 /* Return true if C is a valid start character for a Java identifier.
734 This is only called if C >= 128 -- smaller values are handled
735 inline. However, this function handles all values anyway.
   The high byte of C selects an entry of type_table and the low byte
   indexes the page that the entry points to.  The result is the
   LETTER_START bit of the flags found.  */
737 java_start_char_p (c)
740 unsigned int hi = c / 256;
741 const char *const page = type_table[hi];
742 unsigned long val = (unsigned long) page;
/* An entry with bits set outside LETTER_PART and LETTER_START is
   treated as a pointer to a page of per-character flags.
   NOTE(review): the other branch is not visible in this listing;
   presumably the entry value itself then serves as the flags for the
   whole page.  */
745 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
746 flags = page[c & 255];
750 return flags & LETTER_START;
753 /* Return true if C is a valid part character for a Java identifier.
754 This is only called if C >= 128 -- smaller values are handled
755 inline. However, this function handles all values anyway.
   The lookup is the same as for start characters: the high byte of C
   selects an entry of type_table and the low byte indexes the page.
   The result is the LETTER_PART bit of the flags found.
   NOTE(review): the function header is missing from this listing;
   presumably this is java_part_char_p (c).  */
760 unsigned int hi = c / 256;
761 const char *const page = type_table[hi];
762 unsigned long val = (unsigned long) page;
765 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
766 flags = page[c & 255];
770 return flags & LETTER_PART;
/* Parse the character that follows a backslash inside a character or
   string literal and return the character denoted by the escape.
   The single-character escapes return the code points 0x8, 0x9, 0xa,
   0xc, 0xd, 0x22, 0x27 and 0x5c (in ASCII: backspace, horizontal tab,
   line feed, form feed, carriage return, double quote, single quote
   and backslash); their case labels are missing from this listing.
   An octal escape is accumulated digit by digit.  Any other character
   is reported as an error and JAVA_CHAR_ERROR is returned.  */
774 java_parse_escape_sequence ()
779 switch (c = java_get_unicode ())
782 return (unicode_t)0x8;
784 return (unicode_t)0x9;
786 return (unicode_t)0xa;
788 return (unicode_t)0xc;
790 return (unicode_t)0xd;
792 return (unicode_t)0x22;
794 return (unicode_t)0x27;
796 return (unicode_t)0x5c;
797 case '0': case '1': case '2': case '3': case '4':
798 case '5': case '6': case '7':
801 int octal_escape_index = 0;
/* Collect up to MAX octal digits.  */
805 for (; octal_escape_index < max && RANGE (c, '0', '7');
806 c = java_get_unicode ())
808 if (octal_escape_index == 0 && c > '3')
810 /* According to the grammar, `\477' has a well-defined
811 meaning -- it is `\47' followed by `7'. */
814 octal_escape [octal_escape_index++] = c;
/* The character that ended the digit run is not part of the escape.  */
817 java_unget_unicode ();
/* Combine the digits, three bits each, most significant first.  */
819 for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
820 i < octal_escape_index; i++, shift -= 3)
821 char_lit |= (octal_escape [i] - '0') << shift;
826 java_lex_error ("Invalid character in escape sequence", 0);
827 return JAVA_CHAR_ERROR;
/* True when the real value X compares equal to dconst0.  */
832 #define IS_ZERO(X) (ereal_cmp (X, dconst0) == 0)
834 /* Subroutine of java_lex: converts floating-point literals to tree
835 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
836 store the result. FFLAG indicates whether the literal was tagged
837 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
838 is the position in the current line at which the literal starts; the
   read position is moved there temporarily when an underflow is
   reported.  */
840 static void java_perform_atof PARAMS ((YYSTYPE *, char *, int, int));
843 java_perform_atof (java_lval, literal_token, fflag, number_beginning)
847 int number_beginning;
849 REAL_VALUE_TYPE value;
850 tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
/* Convert the text in the machine mode of the selected type.  */
852 SET_REAL_VALUE_ATOF (value,
853 REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
/* An infinite or NaN result is reported as a range error.  */
855 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
857 JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
860 else if (IS_ZERO (value))
862 /* We check to see if the value is really 0 or if we've found an
863 underflow. We do this in the most primitive imaginable way. */
/* Scan the mantissa, that is the text before any exponent marker, for
   a character other than a zero digit or the decimal point.  */
865 char *p = literal_token;
868 while (*p && *p != 'e' && *p != 'E')
870 if (*p != '0' && *p != '.')
/* Report the underflow at the start of the literal, then restore the
   read position.  */
879 int i = ctxp->c_line->current;
880 ctxp->c_line->current = number_beginning;
881 java_lex_error ("Floating point literal underflow", 0);
882 ctxp->c_line->current = i;
886 SET_LVAL_NODE_TYPE (build_real (type, value), type);
890 static int yylex PARAMS ((YYSTYPE *));
901 unicode_t first_unicode;
902 int ascii_index, all_ascii;
905 /* Translation of the Unicode escape in the raw stream of Unicode
906 characters. Takes care of line terminator. */
908 /* Skip white spaces: SP, TAB and FF or ULT. */
909 for (c = java_get_unicode ();
910 c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
913 ctxp->elc.line = ctxp->c_line->lineno;
914 ctxp->elc.col = ctxp->c_line->char_col-2;
917 ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
919 if (c == 0x1a) /* CTRL-Z. */
921 if ((c = java_get_unicode ()) == UEOF)
922 return 0; /* Ok here. */
924 java_unget_unicode (); /* Caught later, at the end of the
927 /* Handle EOF here. */
928 if (c == UEOF) /* Should probably do something here... */
931 /* Take care of eventual comments. */
934 switch (c = java_get_unicode ())
939 c = java_get_unicode ();
942 /* It is ok to end a `//' comment with EOF, unless
943 we're being pedantic. */
945 java_lex_error ("Comment not terminated at end of input",
949 if (c == '\n') /* ULT */
955 if ((c = java_get_unicode ()) == '*')
957 if ((c = java_get_unicode ()) == '/')
958 goto step1; /* Empty documentation comment. */
959 else if (java_parse_doc_section (c))
963 java_parse_end_comment ((c = java_get_unicode ()));
967 java_unget_unicode ();
973 ctxp->elc.line = ctxp->c_line->lineno;
974 ctxp->elc.prev_col = ctxp->elc.col;
975 ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
976 if (ctxp->elc.col < 0)
979 /* Numeric literals. */
980 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
982 /* This section of code is borrowed from gcc/c-lex.c. */
983 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
984 int parts[TOTAL_PARTS];
985 HOST_WIDE_INT high, low;
986 /* End borrowed section. */
987 char literal_token [256];
988 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
989 int found_hex_digits = 0, found_non_octal_digits = 0;
992 int number_beginning = ctxp->c_line->current;
996 /* We might have a . separator instead of a FP like .[0-9]*. */
999 unicode_t peep = java_sneak_unicode ();
1001 if (!JAVA_ASCII_DIGIT (peep))
1004 BUILD_OPERATOR (DOT_TK);
1008 for (i = 0; i < TOTAL_PARTS; i++)
1013 c = java_get_unicode ();
1014 if (c == 'x' || c == 'X')
1017 c = java_get_unicode ();
1019 else if (JAVA_ASCII_DIGIT (c))
1023 /* Push the '.' back and prepare for a FP parsing... */
1024 java_unget_unicode ();
1029 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */
1030 JAVA_LEX_LIT ("0", 10);
1034 SET_LVAL_NODE (long_zero_node);
1035 return (INT_LIT_TK);
1037 SET_LVAL_NODE (float_zero_node);
1040 SET_LVAL_NODE (double_zero_node);
1043 java_unget_unicode ();
1044 SET_LVAL_NODE (integer_zero_node);
1045 return (INT_LIT_TK);
1049 /* Parse the first part of the literal, until we find something
1050 which is not a number. */
1051 while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1052 JAVA_ASCII_DIGIT (c))
1054 /* We store in a string (in case it turns out to be a FP) and in
1055 PARTS if we have to process a integer literal. */
1056 int numeric = hex_value (c);
1059 /* Remember when we find a valid hexadecimal digit. */
1061 found_hex_digits = 1;
1062 /* Remember when we find an invalid octal digit. */
1063 else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c))
1064 found_non_octal_digits = 1;
1066 literal_token [literal_index++] = c;
1067 /* This section of code if borrowed from gcc/c-lex.c. */
1068 for (count = 0; count < TOTAL_PARTS; count++)
1070 parts[count] *= radix;
1073 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1074 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1077 parts[0] += numeric;
1079 if (parts [TOTAL_PARTS-1] != 0)
1081 /* End borrowed section. */
1082 c = java_get_unicode ();
1085 /* If we have something from the FP char set but not a digit, parse
1087 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1090 int seen_digit = (literal_index ? 1 : 0);
1091 int seen_exponent = 0;
1092 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1093 double unless specified. */
1095 /* It is ok if the radix is 8 because this just means we've
1096 seen a leading `0'. However, radix==16 is invalid. */
1098 java_lex_error ("Can't express non-decimal FP literal", 0);
1108 literal_token [literal_index++ ] = c;
1109 c = java_get_unicode ();
1112 java_lex_error ("Invalid character in FP literal", 0);
1115 if (c == 'e' || c == 'E')
1119 /* {E,e} must have seen at least a digit. */
1122 ("Invalid FP literal, mantissa must have digit", 0);
1126 literal_token [literal_index++] = c;
1127 c = java_get_unicode ();
1130 java_lex_error ("Invalid character in FP literal", 0);
1132 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1134 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1135 stage = 4; /* So we fall through. */
1138 if ((c=='-' || c =='+') && stage == 2)
1141 literal_token [literal_index++] = c;
1142 c = java_get_unicode ();
1145 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1146 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1147 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1148 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1150 if (JAVA_ASCII_DIGIT (c))
1154 literal_token [literal_index++ ] = c;
1155 c = java_get_unicode ();
1159 if (stage != 4) /* Don't push back fF/dD. */
1160 java_unget_unicode ();
1162 /* An exponent (if any) must have seen a digit. */
1163 if (seen_exponent && !seen_digit)
1165 ("Invalid FP literal, exponent must have digit", 0);
1167 literal_token [literal_index] = '\0';
1168 JAVA_LEX_LIT (literal_token, radix);
1171 java_perform_atof (java_lval, literal_token,
1172 fflag, number_beginning);
1177 } /* JAVA_ASCII_FPCHAR (c) */
1179 /* Here we get back to converting the integral literal. */
1180 if (radix == 16 && ! found_hex_digits)
1182 ("0x must be followed by at least one hexadecimal digit", 0);
1183 else if (radix == 8 && found_non_octal_digits)
1184 java_lex_error ("Octal literal contains digit out of range", 0);
1185 else if (c == 'L' || c == 'l')
1188 java_unget_unicode ();
1190 #ifdef JAVA_LEX_DEBUG
1191 literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */
1192 JAVA_LEX_LIT (literal_token, radix);
1194 /* This section of code is borrowed from gcc/c-lex.c. */
1197 bytes = GET_TYPE_PRECISION (long_type_node);
1198 for (i = bytes; i < TOTAL_PARTS; i++)
1206 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1208 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1209 / HOST_BITS_PER_CHAR)]
1210 << (i * HOST_BITS_PER_CHAR));
1211 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1213 /* End borrowed section. */
1215 /* Range checking. */
1218 /* 9223372036854775808L is valid if operand of a '-'. Otherwise
1219 9223372036854775807L is the biggest `long' literal that can be
1220 expressed using a 10 radix. For other radices, everything that
1221 fits withing 64 bits is OK. */
1222 int hb = (high >> 31);
1223 if (overflow || (hb && low && radix == 10)
1224 || (hb && high & 0x7fffffff && radix == 10))
1225 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1229 /* 2147483648 is valid if operand of a '-'. Otherwise,
1230 2147483647 is the biggest `int' literal that can be
1231 expressed using a 10 radix. For other radices, everything
1232 that fits within 32 bits is OK. As all literals are
1233 signed, we sign extend here. */
1234 int hb = (low >> 31) & 0x1;
1235 if (overflow || high || (hb && low & 0x7fffffff && radix == 10))
1236 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1240 value = build_int_2 (low, high);
1241 JAVA_RADIX10_FLAG (value) = radix == 10;
1242 SET_LVAL_NODE_TYPE (value, long_suffix ? long_type_node : int_type_node);
1244 SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1245 long_suffix ? long_type_node : int_type_node);
1250 /* Character literals. */
1254 if ((c = java_get_unicode ()) == '\\')
1255 char_lit = java_parse_escape_sequence ();
1258 if (c == '\n' || c == '\'')
1259 java_lex_error ("Invalid character literal", 0);
1263 c = java_get_unicode ();
1265 if ((c == '\n') || (c == UEOF))
1266 java_lex_error ("Character literal not terminated at end of line", 0);
1268 java_lex_error ("Syntax error in character literal", 0);
1270 if (char_lit == JAVA_CHAR_ERROR)
1271 char_lit = 0; /* We silently convert it to zero. */
1273 JAVA_LEX_CHAR_LIT (char_lit);
1274 SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1278 /* String literals. */
1284 for (no_error = 1, c = java_get_unicode ();
1285 c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1288 c = java_parse_escape_sequence ();
1289 if (c == JAVA_CHAR_ERROR)
1292 c = 0; /* We silently convert it to zero. */
1294 java_unicode_2_utf8 (c);
1296 if (c == '\n' || c == UEOF) /* ULT. */
1298 lineno--; /* Refer to the line where the terminator was seen. */
1299 java_lex_error ("String not terminated at end of line", 0);
1303 obstack_1grow (&temporary_obstack, '\0');
1304 string = obstack_finish (&temporary_obstack);
1306 if (!no_error || (c != '"'))
1307 java_lval->node = error_mark_node; /* FIXME: Requires futher
1310 java_lval->node = build_string (strlen (string), string);
1312 obstack_free (&temporary_obstack, string);
1313 return STRING_LIT_TK;
1321 BUILD_OPERATOR (OP_TK);
1327 if (ctxp->ccb_indent == 1)
1328 ctxp->first_ccb_indent1 = lineno;
1330 BUILD_OPERATOR (OCB_TK);
1334 if (ctxp->ccb_indent == 1)
1335 ctxp->last_ccb_indent1 = lineno;
1336 BUILD_OPERATOR (CCB_TK);
1339 BUILD_OPERATOR (OSB_TK);
1351 BUILD_OPERATOR (DOT_TK);
1352 /* return DOT_TK; */
1359 if ((c = java_get_unicode ()) == '=')
1361 BUILD_OPERATOR (EQ_TK);
1365 /* Equals is used in two different locations. In the
1366 variable_declarator: rule, it has to be seen as '=' as opposed
1367 to being seen as an ordinary assignment operator in
1368 assignment_operators: rule. */
1369 java_unget_unicode ();
1370 BUILD_OPERATOR (ASSIGN_TK);
1374 switch ((c = java_get_unicode ()))
1377 BUILD_OPERATOR (GTE_TK);
1379 switch ((c = java_get_unicode ()))
1382 if ((c = java_get_unicode ()) == '=')
1384 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1388 java_unget_unicode ();
1389 BUILD_OPERATOR (ZRS_TK);
1392 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1394 java_unget_unicode ();
1395 BUILD_OPERATOR (SRS_TK);
1398 java_unget_unicode ();
1399 BUILD_OPERATOR (GT_TK);
1403 switch ((c = java_get_unicode ()))
1406 BUILD_OPERATOR (LTE_TK);
1408 if ((c = java_get_unicode ()) == '=')
1410 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1414 java_unget_unicode ();
1415 BUILD_OPERATOR (LS_TK);
1418 java_unget_unicode ();
1419 BUILD_OPERATOR (LT_TK);
1423 switch ((c = java_get_unicode ()))
1426 BUILD_OPERATOR (BOOL_AND_TK);
1428 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1430 java_unget_unicode ();
1431 BUILD_OPERATOR (AND_TK);
1435 switch ((c = java_get_unicode ()))
1438 BUILD_OPERATOR (BOOL_OR_TK);
1440 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1442 java_unget_unicode ();
1443 BUILD_OPERATOR (OR_TK);
1447 switch ((c = java_get_unicode ()))
1450 BUILD_OPERATOR (INCR_TK);
1452 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1454 java_unget_unicode ();
1455 BUILD_OPERATOR (PLUS_TK);
1459 switch ((c = java_get_unicode ()))
1462 BUILD_OPERATOR (DECR_TK);
1464 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1466 java_unget_unicode ();
1467 BUILD_OPERATOR (MINUS_TK);
1471 if ((c = java_get_unicode ()) == '=')
1473 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1477 java_unget_unicode ();
1478 BUILD_OPERATOR (MULT_TK);
1482 if ((c = java_get_unicode ()) == '=')
1484 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1488 java_unget_unicode ();
1489 BUILD_OPERATOR (DIV_TK);
1493 if ((c = java_get_unicode ()) == '=')
1495 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1499 java_unget_unicode ();
1500 BUILD_OPERATOR (XOR_TK);
1504 if ((c = java_get_unicode ()) == '=')
1506 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1510 java_unget_unicode ();
1511 BUILD_OPERATOR (REM_TK);
1515 if ((c = java_get_unicode()) == '=')
1517 BUILD_OPERATOR (NEQ_TK);
1521 java_unget_unicode ();
1522 BUILD_OPERATOR (NEG_TK);
1527 BUILD_OPERATOR (REL_QM_TK);
1530 BUILD_OPERATOR (REL_CL_TK);
1532 BUILD_OPERATOR (NOT_TK);
1535 /* Keyword, boolean literal or null literal. */
1536 for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1537 JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1539 java_unicode_2_utf8 (c);
1540 if (all_ascii && c >= 128)
1545 obstack_1grow (&temporary_obstack, '\0');
1546 string = obstack_finish (&temporary_obstack);
1547 java_unget_unicode ();
1549 /* If we have something all ascii, we consider a keyword, a boolean
1550 literal, a null literal or an all ASCII identifier. Otherwise,
1551 this is an identifier (possibly not respecting formation rule). */
1554 const struct java_keyword *kw;
1555 if ((kw=java_keyword (string, ascii_index)))
1557 JAVA_LEX_KW (string);
1560 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1561 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1562 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1563 case PRIVATE_TK: case STRICT_TK:
1564 SET_MODIFIER_CTX (kw->token);
1567 SET_LVAL_NODE (float_type_node);
1570 SET_LVAL_NODE (double_type_node);
1573 SET_LVAL_NODE (boolean_type_node);
1576 SET_LVAL_NODE (byte_type_node);
1579 SET_LVAL_NODE (short_type_node);
1582 SET_LVAL_NODE (int_type_node);
1585 SET_LVAL_NODE (long_type_node);
1588 SET_LVAL_NODE (char_type_node);
1591 /* Keyword based literals. */
1594 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1595 boolean_true_node : boolean_false_node));
1598 SET_LVAL_NODE (null_pointer_node);
1601 /* For some keywords, we want to retain information on the location
1602 where they were found. */
1614 BUILD_OPERATOR (kw->token);
1622 /* We may have an ID here. */
1623 if (JAVA_START_CHAR_P (first_unicode))
1625 JAVA_LEX_ID (string);
1626 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1630 /* Everything else is an invalid character in the input. */
1632 char lex_error_buffer [128];
1633 sprintf (lex_error_buffer, "Invalid character `%s' in input",
1634 java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1635 java_lex_error (lex_error_buffer, 1);
1641 /* This is called by the parser to see if an error should be generated
1642 due to numeric overflow. This function only handles the particular
1643 case of the largest negative value, and is only called in the case
1644 where this value is not preceded by `-'. */
/* VALUE is inspected only when it is an INTEGER_CST for which
   JAVA_RADIX10_FLAG is set.  Any error is reported through
   java_lex_error with a FORWARD argument of 0.  */
1646 error_if_numeric_overflow (value)
1649 if (TREE_CODE (value) == INTEGER_CST && JAVA_RADIX10_FLAG (value))
1651 unsigned HOST_WIDE_INT lo, hi;
/* Fetch the low and high parts of the constant.  */
1653 lo = TREE_INT_CST_LOW (value);
1654 hi = TREE_INT_CST_HIGH (value);
1655 if (TREE_TYPE (value) == long_type_node)
/* `long' case: only HI is examined.  The error fires when HI >> 31 is
   nonzero while the low 31 bits of HI are all clear.
   NOTE(review): the shift by 31 looks like it assumes HI carries
   exactly 32 significant bits -- confirm on hosts where
   HOST_WIDE_INT is wider.  */
1657 int hb = (hi >> 31);
1658 if (hb && !(hi & 0x7fffffff))
1659 java_lex_error ("Numeric overflow for `long' literal", 0);
/* `int' case (per the message below): only LO is examined.  The error
   fires when bit 31 of LO is set while the low 31 bits are all clear.  */
1663 int hb = (lo >> 31) & 0x1;
1664 if (hb && !(lo & 0x7fffffff))
1665 java_lex_error ("Numeric overflow for `int' literal", 0);
1669 #endif /* JC1_LITE */
/* Append an encoding of UNICODE to temporary_obstack, one byte at a
   time through obstack_1grow.  Three forms are produced:
     - one byte, the value itself, when RANGE (unicode, 0x01, 0x7f);
     - two bytes when RANGE (unicode, 0x80, 0x7ff) or when UNICODE is 0;
     - three bytes otherwise.
   RANGE is presumably an inclusive bounds test -- confirm against its
   definition.  */
1672 java_unicode_2_utf8 (unicode)
1675 if (RANGE (unicode, 0x01, 0x7f))
1676 obstack_1grow (&temporary_obstack, (char)unicode);
/* Zero is deliberately routed here: it comes out as the byte pair
   0xc0 0x80 rather than as a single zero byte.  */
1677 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
/* First byte: 0xc0 combined with bits 10..6 of UNICODE.  */
1679 obstack_1grow (&temporary_obstack,
1680 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
/* Second byte: 0x80 combined with bits 5..0.  */
1681 obstack_1grow (&temporary_obstack,
1682 (unsigned char)(0x80 | (unicode & 0x3f)));
1684 else /* Range 0x800-0xffff. */
/* Three bytes: 0xe0 with bits 15..12, then 0x80 with bits 11..6, then
   0x80 with bits 5..0.  `>>' binds tighter than `|', so each shift is
   applied to the masked value before the marker bits are or-ed in.  */
1686 obstack_1grow (&temporary_obstack,
1687 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1688 obstack_1grow (&temporary_obstack,
1689 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1690 obstack_1grow (&temporary_obstack,
1691 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Wrap NODE with build_expr_wfl, passing ctxp->filename together with
   the line and column currently stored in ctxp->elc, then clear the
   TREE_TYPE of the result.
   NOTE(review): no return statement is visible here; presumably the
   wrapped node is handed back to the caller -- confirm.  */
1697 build_wfl_node (node)
1700 node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1701 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1702 TREE_TYPE (node) = NULL_TREE;
/* Record where a lexical error occurred.  The statements visible here
   store the current line number in ctxp->elc.line, store the current
   column adjusted by FORWARD in ctxp->elc.col, and clear
   ctxp->java_error_flag.
   NOTE(review): MSG is not used by any statement visible here, and both
   parameters are marked ATTRIBUTE_UNUSED; presumably the reporting
   itself happens in statements that are conditionally compiled --
   confirm.  */
1708 java_lex_error (msg, forward)
1709 const char *msg ATTRIBUTE_UNUSED;
1710 int forward ATTRIBUTE_UNUSED;
1713 ctxp->elc.line = ctxp->c_line->lineno;
/* Column is char_col - 1, shifted by FORWARD.  */
1714 ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1716 /* Might be caught in the middle of some error report. */
1717 ctxp->java_error_flag = 0;
/* NOTE(review): orphaned condition; the header of its enclosing
   definition is not visible here (presumably java_is_eol, which is
   declared near the top of the file -- confirm).  The test holds when
   NEXT is neither a newline nor EOF.  */
1734 if (next != '\n' && next != EOF)
/* Build on temporary_obstack a two-line excerpt for FILENAME: the
   characters gathered from one source line, a newline, then padding
   that ends in a `^' marker.  The finished obstack object is returned.
   LINE selects the source line; COL selects the marker position, with
   the special values described in the comments below.  If FILENAME
   cannot be opened, fatal_io_error is called.  */
1746 java_get_line_col (filename, line, col)
1747 const char *filename ATTRIBUTE_UNUSED;
1748 int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
1753 /* Dumb implementation. Doesn't try to cache or optimize things. */
1754 /* First line of the file is line 1, first column is 1. */
1756 /* COL == -1 means, at the CR/LF in LINE. */
1757 /* COL == -2 means, at the first non space char in LINE. */
1760 int c, ccol, cline = 1;
1761 int current_line_col = 0;
1762 int first_non_space = 0;
1765 if (!(fp = fopen (filename, "r")))
1766 fatal_io_error ("can't open %s", filename);
/* Advance through the file until CLINE reaches LINE.  */
1768 while (cline != line)
/* Placeholder text appended to the obstack; judging by its wording it
   covers the case where the file ends before LINE is reached (the
   guarding test is not visible here).  The terminating NUL of MSG is
   not copied.  */
1773 static const char msg[] = "<<file too short - unexpected EOF>>";
1774 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1777 if (java_is_eol (fp, c))
1781 /* Gather the chars of the current line in a buffer. */
/* Gathering stops on a negative C or at an end of line.  */
1785 if (c < 0 || java_is_eol (fp, c))
/* Remember the column of the first non-white-space character; only
   the first such column is kept.  */
1787 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1788 first_non_space = current_line_col;
1789 obstack_1grow (&temporary_obstack, c);
1794 obstack_1grow (&temporary_obstack, '\n');
/* Resolve COL into the column used for the marker.  The selectors for
   the assignments below are not visible here; presumably they match
   the special COL values documented above -- confirm.  */
1798 col = current_line_col;
1799 first_non_space = 0;
1802 col = first_non_space;
1804 first_non_space = 0;
1806 /* Place the '^' at the right position. */
1807 base = obstack_base (&temporary_obstack);
/* Emit COL + 3 padding characters ahead of the marker.  */
1808 for (ccol = 1; ccol <= col+3; ccol++)
1810 /* Compute \t when reaching first_non_space. */
/* When FIRST_NON_SPACE is nonzero, a tab found at BASE[CCOL-1] is
   copied as a tab; every other position is padded with a space.  */
1811 char c = (first_non_space ?
1812 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1813 obstack_1grow (&temporary_obstack, c);
/* obstack_grow0 appends the marker followed by a terminating NUL.  */
1815 obstack_grow0 (&temporary_obstack, "^", 1);
1818 return obstack_finish (&temporary_obstack);
/* Compare the LENGTH bytes starting at STR against the NUL-terminated
   string NAME, taking one character at a time from STR with UTF8_GET.
   Once NAME is exhausted the result is 0 if STR has reached
   STR + LENGTH, and 1 otherwise.  The only other visible return yields
   CH - NAME[I]; its guarding test is not visible here (presumably the
   first mismatching character -- confirm).  */
1824 utf8_cmp (str, length, name)
1825 const unsigned char *str;
/* One past the last byte that may be read from STR.  */
1829 const unsigned char *limit = str + length;
1832 for (i = 0; name[i]; ++i)
/* UTF8_GET presumably advances STR past the character it decodes,
   never beyond LIMIT -- confirm against the macro.  */
1834 int ch = UTF8_GET (str, limit);
1836 return ch - name[i];
1839 return str == limit ? 0 : 1;
1842 /* A sorted list of all C++ keywords. */
/* NOTE(review): cxx_keyword_p probes this table at the midpoint of an
   index range, so the entries presumably have to stay in sorted order
   for lookups to succeed -- confirm before adding or reordering
   keywords.  */
1844 static const char *const cxx_keywords[] =
1952 /* Return true if NAME is a C++ keyword. */
/* NAME is LENGTH bytes long.  The table cxx_keywords is probed at MID,
   the midpoint of FIRST and LAST, and each probe compares at most
   min (LENGTH, keyword length) bytes through utf8_cmp.  Per the comment
   further down, a NAME that is a keyword followed only by `$'
   characters also counts as a match.  */
1955 cxx_keyword_p (name, length)
/* LAST starts at the number of table entries.  */
1959 int last = ARRAY_SIZE (cxx_keywords);
/* NOTE(review): MID is initialised here and again, with the same
   expression, in the `for' header below; one of the two looks
   redundant.  */
1961 int mid = (last + first) / 2;
1964 for (mid = (last + first) / 2;
1966 old = mid, mid = (last + first) / 2)
/* Compare only the prefix that both NAME and the probed keyword have.  */
1968 int kwl = strlen (cxx_keywords[mid]);
1969 int min_length = kwl > length ? length : kwl;
1970 int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
1975 /* We've found a match if all the remaining characters are `$'. */
/* Skip over any run of `$' that follows the compared prefix of NAME.  */
1976 for (i = min_length; i < length && name[i] == '$'; ++i)
1990 #endif /* JC1_LITE */