1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
5 This file is part of GNU CC.
7 GNU CC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU CC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU CC; see the file COPYING. If not, write to
19 the Free Software Foundation, 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA.
22 Java and all Java-based marks are trademarks or registered trademarks
23 of Sun Microsystems, Inc. in the United States and other countries.
24 The Free Software Foundation is independent of Sun Microsystems, Inc. */
26 /* It defines java_lex (yylex) that reads a Java ASCII source file
27 possibly containing Unicode escape sequence or utf8 encoded
28 characters and returns a token for everything found but comments,
29 white spaces and line terminators. When necessary, it also fills
30 the java_lval (yylval) union. It's implemented to be called by a
31 re-entrant parser generated by Bison.
33 The lexical analysis conforms to the Java grammar described in "The
34 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
35 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
39 #include "chartables.h"
41 /* Function declarations.
   NOTE(review): java_store_unicode is declared twice in this list; the
   second declaration repeats the first and looks redundant.  */
42 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
43 static void java_unicode_2_utf8 PARAMS ((unicode_t));
44 static void java_lex_error PARAMS ((const char *, int));
46 static int java_is_eol PARAMS ((FILE *, int));
47 static tree build_wfl_node PARAMS ((tree));
49 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
50 static int java_parse_escape_sequence PARAMS ((void));
51 static int java_start_char_p PARAMS ((unicode_t));
52 static int java_part_char_p PARAMS ((unicode_t));
53 static int java_parse_doc_section PARAMS ((int));
54 static void java_parse_end_comment PARAMS ((int));
55 static int java_get_unicode PARAMS ((void));
56 static int java_read_unicode PARAMS ((java_lexer *, int *));
57 static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
59 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
60 static int java_read_char PARAMS ((java_lexer *));
61 static void java_allocate_new_line PARAMS ((void));
62 static void java_unget_unicode PARAMS ((void));
63 static unicode_t java_sneak_unicode PARAMS ((void));
65 static int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
68 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
70 static void error_if_numeric_overflow PARAMS ((tree));
74 /* This is nonzero if we have initialized `need_byteswap'. */
75 static int byteswap_init = 0;
77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
78 big-endian order -- not native endian order. We handle this by
79 doing a conversion once at startup and seeing what happens. This
80 flag holds the results of this determination. */
81 static int need_byteswap = 0;
/* Prepare the parser context CTXP for lexing a new input.  FINPUT and
   ENCODING are handed to java_new_lexer and the resulting lexer is stored
   in ctxp->lexer.  Along the way a set of well-known identifiers and WFL
   nodes is created (some only when they are not set yet), java.lang is
   pushed on ctxp->import_demand_list, and the per-file fields of CTXP
   (filename, line number, error flag, current class, package) are
   reset.  */
85 java_init_lex (finput, encoding)
90 int java_lang_imported = 0;
93 java_lang_id = get_identifier ("java.lang");
94 if (!java_lang_cloneable)
95 java_lang_cloneable = get_identifier ("java.lang.Cloneable");
96 if (!java_io_serializable)
97 java_io_serializable = get_identifier ("java.io.Serializable");
99 inst_id = get_identifier ("inst$");
101 wpv_id = get_identifier ("write_parm_value$");
/* Put java.lang at the head of the import-on-demand list.  */
103 if (!java_lang_imported)
105 tree node = build_tree_list
106 (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
107 read_import_dir (TREE_PURPOSE (node));
108 TREE_CHAIN (node) = ctxp->import_demand_list;
109 ctxp->import_demand_list = node;
110 java_lang_imported = 1;
114 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
116 label_id = get_identifier ("$L");
118 wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
/* The string buffer class name depends on flag_emit_class_files.  */
119 if (!wfl_string_buffer)
121 build_expr_wfl (get_identifier (flag_emit_class_files
122 ? "java.lang.StringBuffer"
123 : "gnu.gcj.runtime.StringBuffer"),
126 wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
128 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
129 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
/* Clear the modifier context and the current JCF, then reset the
   per-file state.  */
131 memset ((PTR) ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
132 memset ((PTR) current_jcf, 0, sizeof (JCF));
133 ctxp->current_parsed_class = NULL;
134 ctxp->package = NULL_TREE;
137 ctxp->filename = input_filename;
138 ctxp->lineno = lineno = 0;
141 ctxp->java_error_flag = 0;
142 ctxp->lexer = java_new_lexer (finput, encoding);
/* Format character I of LINE into a function-static buffer.  A character
   that came from a Unicode escape, or whose value is above 128, is
   written as `\uXXXX' (four lower-case hex digits); any other character
   is copied unchanged.  Because the buffer is static, each call
   overwrites the previous result.  Presumably the buffer is what gets
   returned -- the return statement is not visible here.  */
146 java_sprint_unicode (line, i)
147 struct java_line *line;
150 static char buffer [10];
151 if (line->unicode_escape_p [i] || line->line [i] > 128)
152 sprintf (buffer, "\\u%04x", line->line [i]);
155 buffer [0] = line->line [i];
/* Peek at the character at the current read position of the current
   line without consuming it.  */
162 java_sneak_unicode ()
164 return (ctxp->c_line->line [ctxp->c_line->current]);
/* Push the last character back: the read position of the current line
   moves back by one and the column counter by JAVA_COLUMN_DELTA (0).
   A read position of zero is tested for first; what happens in that
   case is not visible here.  */
168 java_unget_unicode ()
170 if (!ctxp->c_line->current)
171 /* Can't unget unicode. */
174 ctxp->c_line->current--;
175 ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
/* Start a new current line in CTXP.  The look-ahead character of the old
   line (and its escape flag) is remembered first.  If the old line held
   something other than white space it becomes the previous line
   (ctxp->p_line), after the buffers of the former previous line have been
   freed, and a fresh java_line with room for JAVA_LINE_MAX characters is
   allocated.  The line is then emptied, the remembered look-ahead
   character is stored as its first character, the global line number is
   incremented, and the line is marked as containing only white space so
   far.  */
179 java_allocate_new_line ()
181 unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
182 char ahead_escape_p = (ctxp->c_line ?
183 ctxp->c_line->unicode_escape_ahead_p : 0);
185 if (ctxp->c_line && !ctxp->c_line->white_space_only)
189 free (ctxp->p_line->unicode_escape_p);
190 free (ctxp->p_line->line);
193 ctxp->p_line = ctxp->c_line;
194 ctxp->c_line = NULL; /* Reallocated. */
199 ctxp->c_line = (struct java_line *)xmalloc (sizeof (struct java_line));
200 ctxp->c_line->max = JAVA_LINE_MAX;
201 ctxp->c_line->line = (unicode_t *)xmalloc
202 (sizeof (unicode_t)*ctxp->c_line->max);
203 ctxp->c_line->unicode_escape_p =
204 (char *)xmalloc (sizeof (char)*ctxp->c_line->max);
205 ctxp->c_line->white_space_only = 0;
208 ctxp->c_line->line [0] = ctxp->c_line->size = 0;
209 ctxp->c_line->char_col = ctxp->c_line->current = 0;
212 ctxp->c_line->line [ctxp->c_line->size] = ahead;
213 ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
214 ctxp->c_line->size++;
216 ctxp->c_line->ahead [0] = 0;
217 ctxp->c_line->unicode_escape_ahead_p = 0;
218 ctxp->c_line->lineno = ++lineno;
219 ctxp->c_line->white_space_only = 1;
222 /* Create a new lexer object.
   The lexer reads from FINPUT, whose bytes are in ENCODING.  An iconv
   handle converting ENCODING to "UCS-2" is opened; a one-off probe that
   converts the UTF-8 form of \ufeff decides whether the converter's
   output has to be byte-swapped (need_byteswap).  When iconv cannot be
   used, the built-in decoder is selected (use_fallback) for the default
   encoding and for "646"; any other encoding ends in fatal_error.  */
225 java_new_lexer (finput, encoding)
227 const char *encoding;
229 java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
232 lex->finput = finput;
234 lex->unget_value = 0;
238 lex->handle = iconv_open ("UCS-2", encoding);
239 if (lex->handle != (iconv_t) -1)
245 lex->read_anything = 0;
246 lex->use_fallback = 0;
248 /* Work around broken iconv() implementations by doing checking at
249 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
250 then all UCS-2 encoders will be broken. Perhaps not a valid
258 handle = iconv_open ("UCS-2", "UTF-8");
259 if (handle != (iconv_t) -1)
266 /* This is the UTF-8 encoding of \ufeff. */
273 outp = (char *) &result;
276 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
278 iconv_close (handle);
279 /* Conversion must be complete for us to use the result. */
280 if (r != (size_t) -1 && inc == 0 && outc == 0)
281 need_byteswap = (result != 0xfeff);
285 lex->byte_swap = need_byteswap;
288 #endif /* HAVE_ICONV */
290 /* If iconv failed, use the internal decoder if the default
291 encoding was requested. This code is used on platforms where
292 iconv exists but is insufficient for our needs. For
293 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
295 On Solaris the default encoding, as returned by nl_langinfo(),
296 is `646' (aka ASCII), but the Solaris iconv_open() doesn't
297 understand that. We work around that by pretending
298 `646' to be the same as UTF-8. */
299 if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
303 lex->use_fallback = 1;
304 #endif /* HAVE_ICONV */
308 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
/* Release the resources of LEX.  The iconv handle is closed unless the
   lexer was using the built-in fallback decoder.  */
314 java_destroy_lexer (lex)
318 if (! lex->use_fallback)
319 iconv_close (lex->handle);
/* NOTE(review): the function header is not part of this view.  The
   prototype list and the callers suggest this is the body of
   java_read_char (LEX) -- confirm.

   A pushed-back character in LEX->unget_value is consumed first.

   When LEX->use_fallback is clear, bytes are read from LEX->finput into
   LEX->buffer with fread, converted into LEX->out_buffer with iconv, and
   the next unicode_t is taken from LEX->out_buffer at LEX->out_first.  A
   conversion that stops in the middle of a character moves the
   unconsumed bytes to the front of LEX->buffer; other iconv failures are
   reported as "unrecognized character in input stream".

   Otherwise bytes are fetched with getc and decoded here as UTF-8.  A
   two-byte sequence must decode to 0 or to 0x80..0x7ff, a three-byte
   sequence to 0x800..0xffff excluding the surrogates, 0xfffe and 0xffff.
   Anything else is reported as "malformed UTF-8 character".  */
328 if (lex->unget_value)
330 unicode_t r = lex->unget_value;
331 lex->unget_value = 0;
336 if (! lex->use_fallback)
338 size_t ir, inbytesleft, in_save, out_count, out_save;
342 /* If there is data which has already been converted, use it. */
343 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
350 /* See if we need to read more data. If FIRST == 0 then
351 the previous conversion attempt ended in the middle of
352 a character at the end of the buffer. Otherwise we
353 only have to read if the buffer is empty. */
354 if (lex->first == 0 || lex->first >= lex->last)
358 if (lex->first >= lex->last)
363 if (feof (lex->finput))
365 r = fread (&lex->buffer[lex->last], 1,
366 sizeof (lex->buffer) - lex->last,
371 inbytesleft = lex->last - lex->first;
372 out_count = sizeof (lex->out_buffer) - lex->out_last;
374 if (inbytesleft == 0)
376 /* We've tried to read and there is nothing left. */
380 in_save = inbytesleft;
381 out_save = out_count;
382 inp = &lex->buffer[lex->first];
383 outp = &lex->out_buffer[lex->out_last];
384 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
385 &inbytesleft, &outp, &out_count);
387 /* If we haven't read any bytes, then look to see if we
389 if (! lex->read_anything && out_save - out_count >= 2)
391 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
397 else if (uc == 0xfffe)
402 lex->read_anything = 1;
408 for (i = 0; i < out_save - out_count; i += 2)
410 char t = lex->out_buffer[lex->out_last + i];
411 lex->out_buffer[lex->out_last + i]
412 = lex->out_buffer[lex->out_last + i + 1];
413 lex->out_buffer[lex->out_last + i + 1] = t;
417 lex->first += in_save - inbytesleft;
418 lex->out_last += out_save - out_count;
420 /* If we converted anything at all, move along. */
421 if (out_count != out_save)
424 if (ir == (size_t) -1)
428 /* This is ok. This means that the end of our buffer
429 is in the middle of a character sequence. We just
430 move the valid part of the buffer to the beginning
432 memmove (&lex->buffer[0], &lex->buffer[lex->first],
433 lex->last - lex->first);
434 lex->last -= lex->first;
439 /* A more serious error. */
440 java_lex_error ("unrecognized character in input stream",
448 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
450 /* Don't have any data. */
455 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
460 #endif /* HAVE_ICONV */
463 c = getc (lex->finput);
468 return (unicode_t) c;
471 if ((c & 0xe0) == 0xc0)
473 c1 = getc (lex->finput);
474 if ((c1 & 0xc0) == 0x80)
476 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
477 /* Check for valid 2-byte characters. We explicitly
478 allow \0 because this encoding is common in the
480 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
484 else if ((c & 0xf0) == 0xe0)
486 c1 = getc (lex->finput);
487 if ((c1 & 0xc0) == 0x80)
489 c2 = getc (lex->finput);
490 if ((c2 & 0xc0) == 0x80)
492 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
495 /* Check for valid 3-byte characters.
496 Don't allow surrogate, \ufffe or \uffff. */
497 if (IN_RANGE (r, 0x800, 0xffff)
498 && ! IN_RANGE (r, 0xd800, 0xdfff)
499 && r != 0xfffe && r != 0xffff)
505 /* We simply don't support invalid characters. We also
506 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
507 cannot be valid Java characters. */
508 java_lex_error ("malformed UTF-8 character", 0);
512 /* We only get here on error. */
/* Append character C to line L and record UNICODE_ESCAPE_P for it in the
   parallel flag array.  When the line is full, both arrays are grown by
   JAVA_LINE_MAX entries first.  */
517 java_store_unicode (l, c, unicode_escape_p)
520 int unicode_escape_p;
522 if (l->size == l->max)
524 l->max += JAVA_LINE_MAX;
525 l->line = (unicode_t *) xrealloc (l->line, sizeof (unicode_t)*l->max);
526 l->unicode_escape_p = (char *) xrealloc (l->unicode_escape_p,
527 sizeof (char)*l->max);
529 l->line [l->size] = c;
530 l->unicode_escape_p [l->size++] = unicode_escape_p;
/* Read one character from LEX, translating Unicode escapes.
   *UNICODE_ESCAPE_P is cleared first and set to 1 only when the
   character returned was written as an escape.  After an odd number of
   backslashes (LEX->bs_count) the next character is examined: any
   number of `u's may follow, then four hex digits are folded into the
   result, most significant digit first.  A non-hex digit is reported as
   an error.  When the backslash does not start an escape, the character
   read after it is pushed back through LEX->unget_value and the
   backslash itself is returned.  */
534 java_read_unicode (lex, unicode_escape_p)
536 int *unicode_escape_p;
540 c = java_read_char (lex);
541 *unicode_escape_p = 0;
550 if ((lex->bs_count) % 2 == 1)
552 /* Odd number of \ seen. */
553 c = java_read_char (lex);
556 unicode_t unicode = 0;
559 /* Recognize any number of `u's in \u. */
560 while ((c = java_read_char (lex)) == 'u')
563 /* Unget the most recent character as it is not a `u'. */
566 lex->unget_value = c;
568 /* Next should be 4 hex digits, otherwise it's an error.
569 The hex value is converted into the unicode, pushed into
570 the Unicode stream. */
571 for (shift = 12; shift >= 0; shift -= 4)
573 if ((c = java_read_char (lex)) == UEOF)
576 unicode |= (unicode_t)(hex_value (c) << shift);
578 java_lex_error ("Non hex digit in Unicode escape sequence", 0);
581 *unicode_escape_p = 1;
584 lex->unget_value = c;
586 return (unicode_t) '\\';
/* Like java_read_unicode, but fold line terminators: when a terminator
   may be the first half of a \r\n pair, one more character is read, and
   it is pushed back through LEX->unget_value unless it is '\n' or UEOF.
   Per the comments below, a single newline is returned in both
   cases.  */
590 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
592 int *unicode_escape_p;
594 int c = java_read_unicode (lex, unicode_escape_p);
598 /* We have to read ahead to see if we got \r\n. In that case we
599 return a single line terminator. */
601 c = java_read_unicode (lex, &dummy);
602 if (c != '\n' && c != UEOF)
603 lex->unget_value = c;
604 /* In either case we must return a newline. */
/* NOTE(review): the function header is not part of this view.  The body
   reads like java_get_unicode () from the prototype list -- confirm.

   Hand out the next character of the current line.  When there is no
   current line, or it has been consumed completely, a new line is
   allocated and filled with characters obtained from
   java_read_unicode_collapsing_terminators; each one is stored, and the
   line stops being "white space only" at the first character that is not
   white space.  The fill loop tests for '\n' and UEOF after storing, and
   ctxp->lexer->hit_eof is set when UEOF arrives before any character was
   found.  For every character returned the column counter advances by
   JAVA_COLUMN_DELTA (0) and the read position by one.  */
614 /* It's time to read a line when... */
615 if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
620 if (ctxp->lexer->hit_eof)
623 java_allocate_new_line ();
624 if (ctxp->c_line->line[0] != '\n')
628 int unicode_escape_p;
629 c = java_read_unicode_collapsing_terminators (ctxp->lexer,
634 java_store_unicode (ctxp->c_line, c, unicode_escape_p);
635 if (ctxp->c_line->white_space_only
636 && !JAVA_WHITE_SPACE_P (c)
638 ctxp->c_line->white_space_only = 0;
640 if ((c == '\n') || (c == UEOF))
644 if (c == UEOF && ! found_chars)
646 ctxp->lexer->hit_eof = 1;
651 ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
652 JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
653 return ctxp->c_line->line [ctxp->c_line->current++];
656 /* Parse the end of a C style comment.
657 * C is the first character following the '/' and '*'.
   Characters are consumed with java_get_unicode until the comment is
   closed.  Running out of input first is reported as "Comment not
   terminated at end of input".  A `*' seen while looking for the
   closing `/' is pushed back so that it is examined again.  */
659 java_parse_end_comment (c)
662 for ( ;; c = java_get_unicode ())
667 java_lex_error ("Comment not terminated at end of input", 0);
670 switch (c = java_get_unicode ())
673 java_lex_error ("Comment not terminated at end of input", 0);
677 case '*': /* Reparse only '*'. */
678 java_unget_unicode ();
684 /* Parse the documentation section. Keywords must be at the beginning
685 of a documentation comment line (ignoring white space and any `*'
686 character). Parsed keyword(s): @DEPRECATED.
   C is the first character to examine.  Returns 1 when the end of the
   comment (a `/' right after a `*') is found while skipping leading
   white space and stars.  When a tag introduced by `@' spells
   "deprecated", ctxp->deprecated is set.  End of input inside the
   comment is reported as an error.
   NOTE(review): the declaration of TAG is not visible here.  The tag
   loop can stop with tag_index == 10, after which tag [tag_index] is
   written, so TAG needs room for at least 11 chars -- confirm.  */
689 java_parse_doc_section (c)
692 int valid_tag = 0, seen_star = 0;
694 while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
706 c = java_get_unicode();
710 java_lex_error ("Comment not terminated at end of input", 0);
712 if (seen_star && (c == '/'))
713 return 1; /* Goto step1 in caller. */
715 /* We're parsing `@deprecated'. */
716 if (valid_tag && (c == '@'))
721 while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
723 c = java_get_unicode ();
724 tag [tag_index++] = c;
728 java_lex_error ("Comment not terminated at end of input", 0);
729 tag [tag_index] = '\0';
731 if (!strcmp (tag, "deprecated"))
732 ctxp->deprecated = 1;
734 java_unget_unicode ();
738 /* Return true if C is a valid start character for a Java identifier.
739 This is only called if C >= 128 -- smaller values are handled
740 inline. However, this function handles all values anyway.
   The high byte of C selects an entry of type_table.  An entry that has
   bits set outside LETTER_PART | LETTER_START is used as a pointer to a
   256-entry page of flags indexed by the low byte of C.  Presumably any
   other entry is itself the flag value shared by the whole page -- that
   branch is not visible here.  The result is the LETTER_START bit of the
   flags.  */
742 java_start_char_p (c)
745 unsigned int hi = c / 256;
746 const char *const page = type_table[hi];
747 unsigned long val = (unsigned long) page;
750 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
751 flags = page[c & 255];
755 return flags & LETTER_START;
758 /* Return true if C is a valid part character for a Java identifier.
759 This is only called if C >= 128 -- smaller values are handled
760 inline. However, this function handles all values anyway.
   NOTE(review): the function header is not part of this view; from the
   prototype list this looks like java_part_char_p (C) -- confirm.
   The lookup is the same as for start characters: the high byte of C
   selects an entry of type_table, an entry with bits outside
   LETTER_PART | LETTER_START is used as a page of per-character flags
   indexed by the low byte, and the result is the LETTER_PART bit.  */
765 unsigned int hi = c / 256;
766 const char *const page = type_table[hi];
767 unsigned long val = (unsigned long) page;
770 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
771 flags = page[c & 255];
775 return flags & LETTER_PART;
/* Decode the escape sequence whose backslash has already been consumed.
   The next character is read with java_get_unicode.  The simple escapes
   return the fixed code points 0x8, 0x9, 0xa, 0xc, 0xd, 0x22, 0x27 and
   0x5c.  For an octal escape the digits are gathered (a first digit
   above '3' limits the escape, see the comment below), the first
   character that is not part of the escape is pushed back, and the value
   is assembled three bits per digit.  Otherwise "Invalid character in
   escape sequence" is reported and JAVA_CHAR_ERROR is returned.  */
779 java_parse_escape_sequence ()
784 switch (c = java_get_unicode ())
787 return (unicode_t)0x8;
789 return (unicode_t)0x9;
791 return (unicode_t)0xa;
793 return (unicode_t)0xc;
795 return (unicode_t)0xd;
797 return (unicode_t)0x22;
799 return (unicode_t)0x27;
801 return (unicode_t)0x5c;
802 case '0': case '1': case '2': case '3': case '4':
803 case '5': case '6': case '7':
806 int octal_escape_index = 0;
810 for (; octal_escape_index < max && RANGE (c, '0', '7');
811 c = java_get_unicode ())
813 if (octal_escape_index == 0 && c > '3')
815 /* According to the grammar, `\477' has a well-defined
816 meaning -- it is `\47' followed by `7'. */
819 octal_escape [octal_escape_index++] = c;
822 java_unget_unicode ();
824 for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
825 i < octal_escape_index; i++, shift -= 3)
826 char_lit |= (octal_escape [i] - '0') << shift;
831 java_lex_error ("Invalid character in escape sequence", 0);
832 return JAVA_CHAR_ERROR;
/* True when the real value X compares equal to dconst0.  */
837 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
839 /* Subroutine of java_lex: converts floating-point literals to tree
840 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
841 store the result. FFLAG indicates whether the literal was tagged
842 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
843 is the index into the current line (ctxp->c_line->current) that is
   put in place while an underflow error is reported, and restored
   afterwards.
   An infinite or NaN result is reported as a range error for "float"
   or "double".  A result of zero is checked against the text of the
   literal: the characters before any exponent are scanned for
   something other than '0' and '.', to tell a true zero from an
   underflow.  */
845 static void java_perform_atof PARAMS ((YYSTYPE *, char *, int, int));
848 java_perform_atof (java_lval, literal_token, fflag, number_beginning)
852 int number_beginning;
854 REAL_VALUE_TYPE value;
855 tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
857 SET_REAL_VALUE_ATOF (value,
858 REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
860 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
862 JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
865 else if (IS_ZERO (value))
867 /* We check to see if the value is really 0 or if we've found an
868 underflow. We do this in the most primitive imaginable way. */
870 char *p = literal_token;
873 while (*p && *p != 'e' && *p != 'E')
875 if (*p != '0' && *p != '.')
884 int i = ctxp->c_line->current;
885 ctxp->c_line->current = number_beginning;
886 java_lex_error ("Floating point literal underflow", 0);
887 ctxp->c_line->current = i;
891 SET_LVAL_NODE_TYPE (build_real (type, value), type);
895 static int yylex PARAMS ((YYSTYPE *));
906 unicode_t first_unicode;
907 int ascii_index, all_ascii;
910 /* Translation of the Unicode escape in the raw stream of Unicode
911 characters. Takes care of line terminator. */
913 /* Skip white spaces: SP, TAB and FF or ULT. */
914 for (c = java_get_unicode ();
915 c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
918 ctxp->elc.line = ctxp->c_line->lineno;
919 ctxp->elc.col = ctxp->c_line->char_col-2;
922 ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
924 if (c == 0x1a) /* CTRL-Z. */
926 if ((c = java_get_unicode ()) == UEOF)
927 return 0; /* Ok here. */
929 java_unget_unicode (); /* Caught later, at the end of the
932 /* Handle EOF here. */
933 if (c == UEOF) /* Should probably do something here... */
936 /* Take care of eventual comments. */
939 switch (c = java_get_unicode ())
944 c = java_get_unicode ();
947 /* It is ok to end a `//' comment with EOF, unless
948 we're being pedantic. */
950 java_lex_error ("Comment not terminated at end of input",
954 if (c == '\n') /* ULT */
960 if ((c = java_get_unicode ()) == '*')
962 if ((c = java_get_unicode ()) == '/')
963 goto step1; /* Empty documentation comment. */
964 else if (java_parse_doc_section (c))
968 java_parse_end_comment ((c = java_get_unicode ()));
972 java_unget_unicode ();
978 ctxp->elc.line = ctxp->c_line->lineno;
979 ctxp->elc.prev_col = ctxp->elc.col;
980 ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
981 if (ctxp->elc.col < 0)
984 /* Numeric literals. */
985 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
987 /* This section of code is borrowed from gcc/c-lex.c. */
988 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
989 int parts[TOTAL_PARTS];
990 HOST_WIDE_INT high, low;
991 /* End borrowed section. */
992 char literal_token [256];
993 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
994 int found_hex_digits = 0, found_non_octal_digits = 0;
997 int number_beginning = ctxp->c_line->current;
1001 /* We might have a . separator instead of a FP like .[0-9]*. */
1004 unicode_t peep = java_sneak_unicode ();
1006 if (!JAVA_ASCII_DIGIT (peep))
1009 BUILD_OPERATOR (DOT_TK);
1013 for (i = 0; i < TOTAL_PARTS; i++)
1018 c = java_get_unicode ();
1019 if (c == 'x' || c == 'X')
1022 c = java_get_unicode ();
1024 else if (JAVA_ASCII_DIGIT (c))
1026 else if (c == '.' || c == 'e' || c =='E')
1028 /* Push the '.', 'e', or 'E' back and prepare for a FP
1030 java_unget_unicode ();
1035 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */
1036 JAVA_LEX_LIT ("0", 10);
1040 SET_LVAL_NODE (long_zero_node);
1041 return (INT_LIT_TK);
1043 SET_LVAL_NODE (float_zero_node);
1046 SET_LVAL_NODE (double_zero_node);
1049 java_unget_unicode ();
1050 SET_LVAL_NODE (integer_zero_node);
1051 return (INT_LIT_TK);
1055 /* Parse the first part of the literal, until we find something
1056 which is not a number. */
1057 while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1058 JAVA_ASCII_DIGIT (c))
1060 /* We store in a string (in case it turns out to be a FP) and in
1061 PARTS if we have to process a integer literal. */
1062 int numeric = hex_value (c);
1065 /* Remember when we find a valid hexadecimal digit. */
1067 found_hex_digits = 1;
1068 /* Remember when we find an invalid octal digit. */
1069 else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c))
1070 found_non_octal_digits = 1;
1072 literal_token [literal_index++] = c;
1073 /* This section of code if borrowed from gcc/c-lex.c. */
1074 for (count = 0; count < TOTAL_PARTS; count++)
1076 parts[count] *= radix;
1079 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1080 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1083 parts[0] += numeric;
1085 if (parts [TOTAL_PARTS-1] != 0)
1087 /* End borrowed section. */
1088 c = java_get_unicode ();
1091 /* If we have something from the FP char set but not a digit, parse
1093 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1096 int seen_digit = (literal_index ? 1 : 0);
1097 int seen_exponent = 0;
1098 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1099 double unless specified. */
1101 /* It is ok if the radix is 8 because this just means we've
1102 seen a leading `0'. However, radix==16 is invalid. */
1104 java_lex_error ("Can't express non-decimal FP literal", 0);
1114 literal_token [literal_index++ ] = c;
1115 c = java_get_unicode ();
1118 java_lex_error ("Invalid character in FP literal", 0);
1121 if (c == 'e' || c == 'E')
1125 /* {E,e} must have seen at least a digit. */
1128 ("Invalid FP literal, mantissa must have digit", 0);
1132 literal_token [literal_index++] = c;
1133 c = java_get_unicode ();
1136 java_lex_error ("Invalid character in FP literal", 0);
1138 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1140 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1141 stage = 4; /* So we fall through. */
1144 if ((c=='-' || c =='+') && stage == 2)
1147 literal_token [literal_index++] = c;
1148 c = java_get_unicode ();
1151 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1152 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1153 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1154 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1156 if (JAVA_ASCII_DIGIT (c))
1160 literal_token [literal_index++ ] = c;
1161 c = java_get_unicode ();
1165 if (stage != 4) /* Don't push back fF/dD. */
1166 java_unget_unicode ();
1168 /* An exponent (if any) must have seen a digit. */
1169 if (seen_exponent && !seen_digit)
1171 ("Invalid FP literal, exponent must have digit", 0);
1173 literal_token [literal_index] = '\0';
1174 JAVA_LEX_LIT (literal_token, radix);
1177 java_perform_atof (java_lval, literal_token,
1178 fflag, number_beginning);
1183 } /* JAVA_ASCII_FPCHAR (c) */
1185 /* Here we get back to converting the integral literal. */
1186 if (radix == 16 && ! found_hex_digits)
1188 ("0x must be followed by at least one hexadecimal digit", 0);
1189 else if (radix == 8 && found_non_octal_digits)
1190 java_lex_error ("Octal literal contains digit out of range", 0);
1191 else if (c == 'L' || c == 'l')
1194 java_unget_unicode ();
1196 #ifdef JAVA_LEX_DEBUG
1197 literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */
1198 JAVA_LEX_LIT (literal_token, radix);
1200 /* This section of code is borrowed from gcc/c-lex.c. */
1203 bytes = GET_TYPE_PRECISION (long_type_node);
1204 for (i = bytes; i < TOTAL_PARTS; i++)
1212 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1214 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1215 / HOST_BITS_PER_CHAR)]
1216 << (i * HOST_BITS_PER_CHAR));
1217 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1219 /* End borrowed section. */
1222 /* Range checking. */
1223 value = build_int_2 (low, high);
1224 /* Temporarily set type to unsigned. */
1225 SET_LVAL_NODE_TYPE (value, (long_suffix
1226 ? unsigned_long_type_node
1227 : unsigned_int_type_node));
1229 /* For base 10 numbers, only values up to the highest value
1230 (plus one) can be written. For instance, only ints up to
1231 2147483648 can be written. The special case of the largest
1232 negative value is handled elsewhere. For other bases, any
1233 number can be represented. */
1234 if (overflow || (radix == 10
1235 && tree_int_cst_lt (long_suffix
1241 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1243 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1246 /* Sign extend the value. */
1247 SET_LVAL_NODE_TYPE (value, (long_suffix ? long_type_node : int_type_node));
1248 force_fit_type (value, 0);
1249 JAVA_RADIX10_FLAG (value) = radix == 10;
1251 SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1252 long_suffix ? long_type_node : int_type_node);
1257 /* Character literals. */
1261 if ((c = java_get_unicode ()) == '\\')
1262 char_lit = java_parse_escape_sequence ();
1265 if (c == '\n' || c == '\'')
1266 java_lex_error ("Invalid character literal", 0);
1270 c = java_get_unicode ();
1272 if ((c == '\n') || (c == UEOF))
1273 java_lex_error ("Character literal not terminated at end of line", 0);
1275 java_lex_error ("Syntax error in character literal", 0);
1277 if (char_lit == JAVA_CHAR_ERROR)
1278 char_lit = 0; /* We silently convert it to zero. */
1280 JAVA_LEX_CHAR_LIT (char_lit);
1281 SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1285 /* String literals. */
1291 for (no_error = 1, c = java_get_unicode ();
1292 c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1295 c = java_parse_escape_sequence ();
1296 if (c == JAVA_CHAR_ERROR)
1299 c = 0; /* We silently convert it to zero. */
1301 java_unicode_2_utf8 (c);
1303 if (c == '\n' || c == UEOF) /* ULT. */
1305 lineno--; /* Refer to the line where the terminator was seen. */
1306 java_lex_error ("String not terminated at end of line", 0);
1310 obstack_1grow (&temporary_obstack, '\0');
1311 string = obstack_finish (&temporary_obstack);
1313 if (!no_error || (c != '"'))
1314 java_lval->node = error_mark_node; /* FIXME: Requires futher
1317 java_lval->node = build_string (strlen (string), string);
1319 obstack_free (&temporary_obstack, string);
1320 return STRING_LIT_TK;
1328 BUILD_OPERATOR (OP_TK);
1334 if (ctxp->ccb_indent == 1)
1335 ctxp->first_ccb_indent1 = lineno;
1337 BUILD_OPERATOR (OCB_TK);
1341 if (ctxp->ccb_indent == 1)
1342 ctxp->last_ccb_indent1 = lineno;
1343 BUILD_OPERATOR (CCB_TK);
1346 BUILD_OPERATOR (OSB_TK);
1358 BUILD_OPERATOR (DOT_TK);
1359 /* return DOT_TK; */
1366 if ((c = java_get_unicode ()) == '=')
1368 BUILD_OPERATOR (EQ_TK);
1372 /* Equals is used in two different locations. In the
1373 variable_declarator: rule, it has to be seen as '=' as opposed
1374 to being seen as an ordinary assignment operator in
1375 assignment_operators: rule. */
1376 java_unget_unicode ();
1377 BUILD_OPERATOR (ASSIGN_TK);
1381 switch ((c = java_get_unicode ()))
1384 BUILD_OPERATOR (GTE_TK);
1386 switch ((c = java_get_unicode ()))
1389 if ((c = java_get_unicode ()) == '=')
1391 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1395 java_unget_unicode ();
1396 BUILD_OPERATOR (ZRS_TK);
1399 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1401 java_unget_unicode ();
1402 BUILD_OPERATOR (SRS_TK);
1405 java_unget_unicode ();
1406 BUILD_OPERATOR (GT_TK);
1410 switch ((c = java_get_unicode ()))
1413 BUILD_OPERATOR (LTE_TK);
1415 if ((c = java_get_unicode ()) == '=')
1417 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1421 java_unget_unicode ();
1422 BUILD_OPERATOR (LS_TK);
1425 java_unget_unicode ();
1426 BUILD_OPERATOR (LT_TK);
1430 switch ((c = java_get_unicode ()))
1433 BUILD_OPERATOR (BOOL_AND_TK);
1435 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1437 java_unget_unicode ();
1438 BUILD_OPERATOR (AND_TK);
1442 switch ((c = java_get_unicode ()))
1445 BUILD_OPERATOR (BOOL_OR_TK);
1447 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1449 java_unget_unicode ();
1450 BUILD_OPERATOR (OR_TK);
1454 switch ((c = java_get_unicode ()))
1457 BUILD_OPERATOR (INCR_TK);
1459 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1461 java_unget_unicode ();
1462 BUILD_OPERATOR (PLUS_TK);
1466 switch ((c = java_get_unicode ()))
1469 BUILD_OPERATOR (DECR_TK);
1471 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1473 java_unget_unicode ();
1474 BUILD_OPERATOR (MINUS_TK);
1478 if ((c = java_get_unicode ()) == '=')
1480 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1484 java_unget_unicode ();
1485 BUILD_OPERATOR (MULT_TK);
1489 if ((c = java_get_unicode ()) == '=')
1491 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1495 java_unget_unicode ();
1496 BUILD_OPERATOR (DIV_TK);
1500 if ((c = java_get_unicode ()) == '=')
1502 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1506 java_unget_unicode ();
1507 BUILD_OPERATOR (XOR_TK);
1511 if ((c = java_get_unicode ()) == '=')
1513 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1517 java_unget_unicode ();
1518 BUILD_OPERATOR (REM_TK);
1522 if ((c = java_get_unicode()) == '=')
1524 BUILD_OPERATOR (NEQ_TK);
1528 java_unget_unicode ();
1529 BUILD_OPERATOR (NEG_TK);
1534 BUILD_OPERATOR (REL_QM_TK);
1537 BUILD_OPERATOR (REL_CL_TK);
1539 BUILD_OPERATOR (NOT_TK);
1542 /* Keyword, boolean literal or null literal. */
1543 for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1544 JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1546 java_unicode_2_utf8 (c);
1547 if (all_ascii && c >= 128)
1552 obstack_1grow (&temporary_obstack, '\0');
1553 string = obstack_finish (&temporary_obstack);
1554 java_unget_unicode ();
1556 /* If we have something all ascii, we consider a keyword, a boolean
1557 literal, a null literal or an all ASCII identifier. Otherwise,
1558 this is an identifier (possibly not respecting formation rule). */
1561 const struct java_keyword *kw;
1562 if ((kw=java_keyword (string, ascii_index)))
1564 JAVA_LEX_KW (string);
1567 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1568 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1569 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1570 case PRIVATE_TK: case STRICT_TK:
1571 SET_MODIFIER_CTX (kw->token);
1574 SET_LVAL_NODE (float_type_node);
1577 SET_LVAL_NODE (double_type_node);
1580 SET_LVAL_NODE (boolean_type_node);
1583 SET_LVAL_NODE (byte_type_node);
1586 SET_LVAL_NODE (short_type_node);
1589 SET_LVAL_NODE (int_type_node);
1592 SET_LVAL_NODE (long_type_node);
1595 SET_LVAL_NODE (char_type_node);
1598 /* Keyword based literals. */
1601 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1602 boolean_true_node : boolean_false_node));
1605 SET_LVAL_NODE (null_pointer_node);
1611 BUILD_OPERATOR (kw->token);
1617 /* For some keywords we want to retain information on the location
1618 where they were found. */
1630 BUILD_OPERATOR (kw->token);
1638 /* We may have an ID here. */
1639 if (JAVA_START_CHAR_P (first_unicode))
1641 JAVA_LEX_ID (string);
1642 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1646 /* Everything else is an invalid character in the input. */
1648 char lex_error_buffer [128];
1649 sprintf (lex_error_buffer, "Invalid character `%s' in input",
1650 java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1651 java_lex_error (lex_error_buffer, 1);
1657 /* This is called by the parser to see if an error should be generated
1658 due to numeric overflow. This function only handles the particular
1659 case of the largest negative value, and is only called in the case
1660 where this value is not preceded by `-'. */
/* VALUE is the tree node of the literal being checked.  */
1662 error_if_numeric_overflow (value)
/* A literal is a candidate only if it is an INTEGER_CST carrying
   JAVA_RADIX10_FLAG whose sign, per tree_int_cst_sgn, is negative.  */
1665 if (TREE_CODE (value) == INTEGER_CST
1666 && JAVA_RADIX10_FLAG (value)
1667 && tree_int_cst_sgn (value) < 0)
/* The wording of the report depends on whether the literal's type is
   long_type_node; both reports pass 0 as java_lex_error's FORWARD
   argument.  */
1669 if (TREE_TYPE (value) == long_type_node)
1670 java_lex_error ("Numeric overflow for `long' literal", 0);
1672 java_lex_error ("Numeric overflow for `int' literal", 0);
1675 #endif /* JC1_LITE */
/* Append the encoding of UNICODE to temporary_obstack, one byte at a
   time with obstack_1grow.  Values in 0x01-0x7f take one byte, values
   in 0x80-0x7ff take two, and everything else takes three.  The value
   0 is deliberately routed to the two-byte form, so it is emitted as
   0xc0 0x80 and never as a single zero byte.  */
1678 java_unicode_2_utf8 (unicode)
/* One byte: the value itself.  */
1681 if (RANGE (unicode, 0x01, 0x7f))
1682 obstack_1grow (&temporary_obstack, (char)unicode);
1683 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
/* Two bytes: 110xxxxx holding bits 6-10, then 10xxxxxx holding the
   low six bits.  */
1685 obstack_1grow (&temporary_obstack,
1686 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1687 obstack_1grow (&temporary_obstack,
1688 (unsigned char)(0x80 | (unicode & 0x3f)));
1690 else /* Range 0x800-0xffff. */
/* Three bytes: 1110xxxx holding bits 12-15, then two 10xxxxxx bytes
   holding bits 6-11 and bits 0-5.  `>>' binds tighter than `|', so
   each masked field is shifted down before the marker bits are or-ed
   in.  */
1692 obstack_1grow (&temporary_obstack,
1693 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1694 obstack_1grow (&temporary_obstack,
1695 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1696 obstack_1grow (&temporary_obstack,
1697 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Replace NODE by the result of build_expr_wfl applied to it, tagged
   with the current context's file name and the line and column stored
   in ctxp->elc, then clear the TREE_TYPE of that result.  */
1703 build_wfl_node (node)
1706 node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1707 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1708 TREE_TYPE (node) = NULL_TREE;
/* Record where a lexical error occurred.  The location is stored in
   ctxp->elc: the line is the current line's number, and the column is
   the current line's char_col minus one, plus FORWARD.  Afterwards
   ctxp->java_error_flag is reset to 0.  */
1714 java_lex_error (msg, forward)
/* NOTE(review): both parameters are marked ATTRIBUTE_UNUSED although
   FORWARD is read below; presumably the statements using them are
   conditionally compiled -- confirm.  */
1715 const char *msg ATTRIBUTE_UNUSED;
1716 int forward ATTRIBUTE_UNUSED;
1719 ctxp->elc.line = ctxp->c_line->lineno;
1720 ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1722 /* Might be caught in the middle of some error report. */
1723 ctxp->java_error_flag = 0;
1740 if (next != '\n' && next != EOF)
/* Build in temporary_obstack, and return, a two-line string: the text
   gathered for line LINE of FILENAME followed by a newline, then a
   padding run ending in a `^' marker.  The padding is COL + 3
   characters long once the special COL values have been resolved.  */
1752 java_get_line_col (filename, line, col)
1753 const char *filename ATTRIBUTE_UNUSED;
1754 int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
1759 /* Dumb implementation. Doesn't try to cache or optimize things. */
1760 /* First line of the file is line 1, first column is 1. */
1762 /* COL == -1 means, at the CR/LF in LINE. */
1763 /* COL == -2 means, at the first non space char in LINE. */
1766 int c, ccol, cline = 1;
1767 int current_line_col = 0;
1768 int first_non_space = 0;
/* The file is reopened and rescanned on every call; failure to open it
   is reported through fatal_io_error.  */
1771 if (!(fp = fopen (filename, "r")))
1772 fatal_io_error ("can't open %s", filename);
/* Scan forward until CLINE reaches LINE.  */
1774 while (cline != line)
/* Placeholder text put in the buffer in place of the line's contents;
   sizeof(msg)-1 leaves out the terminating NUL.  */
1779 static const char msg[] = "<<file too short - unexpected EOF>>";
1780 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1783 if (java_is_eol (fp, c))
1787 /* Gather the chars of the current line in a buffer. */
1791 if (c < 0 || java_is_eol (fp, c))
/* Remember where the first non-blank character was seen.  0 doubles as
   "not seen yet", so a non-blank character at position 0 does not stop
   later characters from being tested.  */
1793 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1794 first_non_space = current_line_col;
1795 obstack_1grow (&temporary_obstack, c);
/* Terminate the source line; the marker line is appended after it in
   the same obstack object.  */
1800 obstack_1grow (&temporary_obstack, '\n');
/* Resolve the special COL values described at the top of the function
   into a real column.  FIRST_NON_SPACE is kept nonzero only when COL
   is taken from it, and it then also enables tab copying below.  */
1804 col = current_line_col;
1805 first_non_space = 0;
1808 col = first_non_space;
1810 first_non_space = 0;
1812 /* Place the '^' at the right position. */
/* NOTE(review): BASE is fetched once, before the obstack_1grow calls
   in the loop below.  If growing moves the object being built, BASE
   would be stale -- confirm against the obstack documentation.  */
1813 base = obstack_base (&temporary_obstack);
1814 for (ccol = 1; ccol <= col+3; ccol++)
1816 /* Compute \t when reaching first_non_space. */
/* This `char c' shadows the `int c' declared at the top.  When
   FIRST_NON_SPACE is nonzero, a tab found at the same index of the
   buffer is repeated as a tab; every other padding character is a
   space.  */
1817 char c = (first_non_space ?
1818 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1819 obstack_1grow (&temporary_obstack, c);
/* obstack_grow0 appends the marker followed by a terminating NUL.  */
1821 obstack_grow0 (&temporary_obstack, "^", 1);
1824 return obstack_finish (&temporary_obstack);
/* Compare the LENGTH bytes starting at STR with the NUL-terminated
   string NAME.  STR is read one character at a time with UTF8_GET,
   NAME one array element at a time.  The value returned from inside
   the loop is the difference between the two characters being
   compared.  After all of NAME has been walked, the result is 0 if STR
   has reached STR + LENGTH and 1 otherwise.  */
1830 utf8_cmp (str, length, name)
1831 const unsigned char *str;
/* LIMIT is one past the last byte of STR that may be read.  */
1835 const unsigned char *limit = str + length;
1838 for (i = 0; name[i]; ++i)
/* UTF8_GET presumably advances STR past the character it returns; the
   final comparison of STR against LIMIT relies on that -- confirm.  */
1840 int ch = UTF8_GET (str, limit);
1842 return ch - name[i];
/* NAME is exhausted: equal only if STR is exhausted too.  */
1845 return str == limit ? 0 : 1;
1848 /* A sorted list of all C++ keywords. */
1850 static const char *const cxx_keywords[] =
1958 /* Return true if NAME is a C++ keyword. */
/* NAME is LENGTH bytes long.  The cxx_keywords table is probed at the
   midpoint of the [FIRST, LAST) interval, and the midpoint is
   recomputed on every iteration.  */
1961 cxx_keyword_p (name, length)
/* LAST starts one past the final table entry.  */
1965 int last = ARRAY_SIZE (cxx_keywords);
1967 int mid = (last + first) / 2;
/* OLD records the previous midpoint on each step.  */
1970 for (mid = (last + first) / 2;
1972 old = mid, mid = (last + first) / 2)
/* Only the first MIN_LENGTH bytes of NAME are compared, MIN_LENGTH
   being the shorter of LENGTH and the candidate keyword's length.  */
1974 int kwl = strlen (cxx_keywords[mid]);
1975 int min_length = kwl > length ? length : kwl;
1976 int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
1981 /* We've found a match if all the remaining characters are `$'. */
/* Step I over any run of `$' that follows the compared prefix.  */
1982 for (i = min_length; i < length && name[i] == '$'; ++i)
1996 #endif /* JC1_LITE */