1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
5 This file is part of GNU CC.
7 GNU CC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU CC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU CC; see the file COPYING. If not, write to
19 the Free Software Foundation, 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA.
22 Java and all Java-based marks are trademarks or registered trademarks
23 of Sun Microsystems, Inc. in the United States and other countries.
24 The Free Software Foundation is independent of Sun Microsystems, Inc. */
26 /* It defines java_lex (yylex), which reads a Java ASCII source file
27 possibly containing Unicode escape sequences or UTF-8 encoded
28 characters and returns a token for everything found except comments,
29 white space and line terminators. When necessary, it also fills
30 the java_lval (yylval) union. It's implemented to be called by a
31 re-entrant parser generated by Bison.
33 The lexical analysis conforms to the Java grammar described in "The
34 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
35 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
39 #include "chartables.h"
41 /* Function declarations. */
42 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
43 static void java_unicode_2_utf8 PARAMS ((unicode_t));
44 static void java_lex_error PARAMS ((const char *, int));
46 static int java_is_eol PARAMS ((FILE *, int));
47 static tree build_wfl_node PARAMS ((tree));
49 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
50 static int java_parse_escape_sequence PARAMS ((void));
51 static int java_start_char_p PARAMS ((unicode_t));
52 static int java_part_char_p PARAMS ((unicode_t));
53 static int java_parse_doc_section PARAMS ((int));
54 static void java_parse_end_comment PARAMS ((int));
55 static int java_get_unicode PARAMS ((void));
56 static int java_read_unicode PARAMS ((java_lexer *, int *));
57 static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
59 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
60 static int java_read_char PARAMS ((java_lexer *));
61 static void java_allocate_new_line PARAMS ((void));
62 static void java_unget_unicode PARAMS ((void));
63 static unicode_t java_sneak_unicode PARAMS ((void));
65 static int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
68 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
70 static void error_if_numeric_overflow PARAMS ((tree));
74 /* This is nonzero if we have initialized `need_byteswap'. */
75 static int byteswap_init = 0;
77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
78 big-endian order -- not native endian order. We handle this by
79 doing a conversion once at startup and seeing what happens. This
80 flag holds the results of this determination. */
81 static int need_byteswap = 0;
/* Prepare the lexer and the parser context CTXP for a new input.
   FINPUT is the stream to read and ENCODING the name of its character
   encoding; both are handed to java_new_lexer.  Along the way this
   interns the identifiers and builds the WFL nodes used elsewhere
   (java.lang, Cloneable, Serializable, append, toString, ...), and
   resets the per-file fields of CTXP and the global line counter.  */
85 java_init_lex (finput, encoding)
90 int java_lang_imported = 0;
93 java_lang_id = get_identifier ("java.lang");
94 if (!java_lang_cloneable)
95 java_lang_cloneable = get_identifier ("java.lang.Cloneable");
96 if (!java_io_serializable)
97 java_io_serializable = get_identifier ("java.io.Serializable");
99 inst_id = get_identifier ("inst$");
101 wpv_id = get_identifier ("write_parm_value$");
/* Prepend java.lang to the import-on-demand list of CTXP.  */
103 if (!java_lang_imported)
105 tree node = build_tree_list
106 (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
107 read_import_dir (TREE_PURPOSE (node));
108 TREE_CHAIN (node) = ctxp->import_demand_list;
109 ctxp->import_demand_list = node;
110 java_lang_imported = 1;
114 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
116 label_id = get_identifier ("$L");
118 wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
/* The string buffer class depends on whether class files are being
   emitted (flag_emit_class_files).  */
119 if (!wfl_string_buffer)
121 build_expr_wfl (get_identifier (flag_emit_class_files
122 ? "java.lang.StringBuffer"
123 : "gnu.gcj.runtime.StringBuffer"),
126 wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
/* Reset the per-file state held in CTXP.  */
128 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
129 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
131 memset ((PTR) ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
132 memset ((PTR) current_jcf, 0, sizeof (JCF));
133 ctxp->current_parsed_class = NULL;
134 ctxp->package = NULL_TREE;
137 ctxp->filename = input_filename;
138 ctxp->lineno = lineno = 0;
141 ctxp->java_error_flag = 0;
142 ctxp->lexer = java_new_lexer (finput, encoding);
/* Format character number I of LINE for use in a diagnostic.  A
   character that was written as a Unicode escape, or whose value is
   greater than 128, is printed as a \uXXXX escape; any other character
   is stored as itself.  The text is built in a function-static buffer,
   so each call overwrites the result of the previous one.  */
146 java_sprint_unicode (line, i)
147 struct java_line *line;
150 static char buffer [10];
151 if (line->unicode_escape_p [i] || line->line [i] > 128)
152 sprintf (buffer, "\\u%04x", line->line [i]);
155 buffer [0] = line->line [i];
/* Peek at the character at the current position of the current line
   without consuming it.  */
162 java_sneak_unicode ()
164 return (ctxp->c_line->line [ctxp->c_line->current]);
/* Push the most recently read character back onto the current line by
   stepping the read position and the column back by one character.
   Nothing can be pushed back when the position is at the start of the
   line.  */
168 java_unget_unicode ()
170 if (!ctxp->c_line->current)
171 /* Can't unget unicode. */
174 ctxp->c_line->current--;
175 ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
/* Set up CTXP->c_line to receive the next line of input.  A current
   line that holds more than white space is kept as the previous line
   (the buffers of the old previous line are freed) and a new line
   structure is allocated.  The look-ahead character saved in the old
   line, with its escape flag, is carried over into the new line.  The
   global line counter is incremented and recorded in the new line.
   NOTE(review): the conditions guarding several of these steps are not
   visible in this listing.  */
179 java_allocate_new_line ()
181 unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
182 char ahead_escape_p = (ctxp->c_line ?
183 ctxp->c_line->unicode_escape_ahead_p : 0);
185 if (ctxp->c_line && !ctxp->c_line->white_space_only)
189 free (ctxp->p_line->unicode_escape_p);
190 free (ctxp->p_line->line);
193 ctxp->p_line = ctxp->c_line;
194 ctxp->c_line = NULL; /* Reallocated. */
199 ctxp->c_line = (struct java_line *)xmalloc (sizeof (struct java_line));
200 ctxp->c_line->max = JAVA_LINE_MAX;
201 ctxp->c_line->line = (unicode_t *)xmalloc
202 (sizeof (unicode_t)*ctxp->c_line->max);
203 ctxp->c_line->unicode_escape_p =
204 (char *)xmalloc (sizeof (char)*ctxp->c_line->max);
205 ctxp->c_line->white_space_only = 0;
208 ctxp->c_line->line [0] = ctxp->c_line->size = 0;
209 ctxp->c_line->char_col = ctxp->c_line->current = 0;
212 ctxp->c_line->line [ctxp->c_line->size] = ahead;
213 ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
214 ctxp->c_line->size++;
216 ctxp->c_line->ahead [0] = 0;
217 ctxp->c_line->unicode_escape_ahead_p = 0;
218 ctxp->c_line->lineno = ++lineno;
219 ctxp->c_line->white_space_only = 1;
222 /* Create a new lexer object reading from FINPUT, whose bytes are in
   the character encoding named ENCODING.  When iconv is available a
   converter from ENCODING to UCS-2 is opened, and the byte order that
   iconv produces is probed once by converting U+FEFF from UTF-8; the
   outcome is kept in need_byteswap and copied into the lexer.  When
   iconv cannot be used, the built-in decoder is selected
   (use_fallback) for the default encoding or "646"; any other
   encoding is a fatal error.  */
225 java_new_lexer (finput, encoding)
227 const char *encoding;
229 java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
232 lex->finput = finput;
234 lex->unget_value = 0;
238 lex->handle = iconv_open ("UCS-2", encoding);
239 if (lex->handle != (iconv_t) -1)
245 lex->read_anything = 0;
246 lex->use_fallback = 0;
248 /* Work around broken iconv() implementations by doing checking at
249 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
250 then all UCS-2 encoders will be broken. Perhaps not a valid
258 handle = iconv_open ("UCS-2", "UTF-8");
259 if (handle != (iconv_t) -1)
266 /* This is the UTF-8 encoding of \ufeff. */
273 outp = (char *) &result;
276 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
278 iconv_close (handle);
279 /* Conversion must be complete for us to use the result. */
280 if (r != (size_t) -1 && inc == 0 && outc == 0)
281 need_byteswap = (result != 0xfeff);
285 lex->byte_swap = need_byteswap;
288 #endif /* HAVE_ICONV */
290 /* If iconv failed, use the internal decoder if the default
291 encoding was requested. This code is used on platforms where
292 iconv exists but is insufficient for our needs. For
293 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
295 On Solaris the default encoding, as returned by nl_langinfo(),
296 is `646' (aka ASCII), but the Solaris iconv_open() doesn't
297 understand that. We work around that by pretending
298 `646' to be the same as UTF-8. */
299 if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
303 lex->use_fallback = 1;
304 #endif /* HAVE_ICONV */
308 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
/* Release the resources held by LEX.  The iconv handle is closed only
   when the lexer was not using the built-in fallback decoder.  */
314 java_destroy_lexer (lex)
318 if (! lex->use_fallback)
319 iconv_close (lex->handle);
/* Body of the routine that reads one character from the lexer LEX
   (its header line is missing from this listing; it appears to be
   java_read_char, judging by the prototype and the callers).

   A character pushed back in lex->unget_value is consumed first.
   Otherwise, when the fallback decoder is not in use, bytes are read
   from lex->finput into lex->buffer, converted with iconv into
   lex->out_buffer, optionally byte-swapped, and handed out one
   unicode_t at a time; an incomplete multi-byte sequence at the end
   of the buffer is moved to the front so that more input can complete
   it.  With the fallback decoder, UTF-8 is decoded by hand: 1-, 2- and
   3-byte sequences are accepted, and anything else is reported as
   "malformed UTF-8 character".  */
328 if (lex->unget_value)
330 unicode_t r = lex->unget_value;
331 lex->unget_value = 0;
336 if (! lex->use_fallback)
338 size_t ir, inbytesleft, in_save, out_count, out_save;
342 /* If there is data which has already been converted, use it. */
343 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
350 /* See if we need to read more data. If FIRST == 0 then
351 the previous conversion attempt ended in the middle of
352 a character at the end of the buffer. Otherwise we
353 only have to read if the buffer is empty. */
354 if (lex->first == 0 || lex->first >= lex->last)
358 if (lex->first >= lex->last)
363 if (feof (lex->finput))
365 r = fread (&lex->buffer[lex->last], 1,
366 sizeof (lex->buffer) - lex->last,
371 inbytesleft = lex->last - lex->first;
372 out_count = sizeof (lex->out_buffer) - lex->out_last;
374 if (inbytesleft == 0)
376 /* We've tried to read and there is nothing left. */
380 in_save = inbytesleft;
381 out_save = out_count;
382 inp = &lex->buffer[lex->first];
383 outp = &lex->out_buffer[lex->out_last];
384 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
385 &inbytesleft, &outp, &out_count);
387 /* If we haven't read any bytes, then look to see if we
389 if (! lex->read_anything && out_save - out_count >= 2)
391 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
397 else if (uc == 0xfffe)
402 lex->read_anything = 1;
408 for (i = 0; i < out_save - out_count; i += 2)
410 char t = lex->out_buffer[lex->out_last + i];
411 lex->out_buffer[lex->out_last + i]
412 = lex->out_buffer[lex->out_last + i + 1];
413 lex->out_buffer[lex->out_last + i + 1] = t;
417 lex->first += in_save - inbytesleft;
418 lex->out_last += out_save - out_count;
420 /* If we converted anything at all, move along. */
421 if (out_count != out_save)
424 if (ir == (size_t) -1)
428 /* This is ok. This means that the end of our buffer
429 is in the middle of a character sequence. We just
430 move the valid part of the buffer to the beginning
432 memmove (&lex->buffer[0], &lex->buffer[lex->first],
433 lex->last - lex->first);
434 lex->last -= lex->first;
439 /* A more serious error. */
440 java_lex_error ("unrecognized character in input stream",
448 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
450 /* Don't have any data. */
455 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
460 #endif /* HAVE_ICONV */
463 c = getc (lex->finput);
468 return (unicode_t) c;
471 if ((c & 0xe0) == 0xc0)
473 c1 = getc (lex->finput);
474 if ((c1 & 0xc0) == 0x80)
476 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
477 /* Check for valid 2-byte characters. We explicitly
478 allow \0 because this encoding is common in the
480 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
484 else if ((c & 0xf0) == 0xe0)
486 c1 = getc (lex->finput);
487 if ((c1 & 0xc0) == 0x80)
489 c2 = getc (lex->finput);
490 if ((c2 & 0xc0) == 0x80)
492 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
495 /* Check for valid 3-byte characters.
496 Don't allow surrogate, \ufffe or \uffff. */
497 if (IN_RANGE (r, 0x800, 0xffff)
498 && ! IN_RANGE (r, 0xd800, 0xdfff)
499 && r != 0xfffe && r != 0xffff)
505 /* We simply don't support invalid characters. We also
506 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
507 cannot be valid Java characters. */
508 java_lex_error ("malformed UTF-8 character", 0);
512 /* We only get here on error. */
/* Append character C to line L, recording in the parallel
   unicode_escape_p array whether it came from a Unicode escape
   (UNICODE_ESCAPE_P).  When the line is full, both arrays are grown by
   JAVA_LINE_MAX elements before the character is stored.  */
517 java_store_unicode (l, c, unicode_escape_p)
520 int unicode_escape_p;
522 if (l->size == l->max)
524 l->max += JAVA_LINE_MAX;
525 l->line = (unicode_t *) xrealloc (l->line, sizeof (unicode_t)*l->max);
526 l->unicode_escape_p = (char *) xrealloc (l->unicode_escape_p,
527 sizeof (char)*l->max);
529 l->line [l->size] = c;
530 l->unicode_escape_p [l->size++] = unicode_escape_p;
/* Read one character from LEX, translating Unicode escapes (a
   backslash, one or more `u's and four hex digits).
   *UNICODE_ESCAPE_P is set to 1 when the character returned was
   produced by such an escape and to 0 otherwise.  lex->bs_count is
   consulted so that an escape is only recognized after an odd number
   of backslashes.  A character read ahead but not consumed is pushed
   back through lex->unget_value; when the backslash does not start an
   escape, the backslash itself is returned.  */
534 java_read_unicode (lex, unicode_escape_p)
536 int *unicode_escape_p;
540 c = java_read_char (lex);
541 *unicode_escape_p = 0;
550 if ((lex->bs_count) % 2 == 1)
552 /* Odd number of \ seen. */
553 c = java_read_char (lex);
556 unicode_t unicode = 0;
559 /* Recognize any number of `u's in \u. */
560 while ((c = java_read_char (lex)) == 'u')
563 /* Unget the most recent character as it is not a `u'. */
566 lex->unget_value = c;
568 /* Next should be 4 hex digits, otherwise it's an error.
569 The hex value is converted into the unicode, pushed into
570 the Unicode stream. */
571 for (shift = 12; shift >= 0; shift -= 4)
573 if ((c = java_read_char (lex)) == UEOF)
576 unicode |= (unicode_t)(hex_value (c) << shift);
578 java_lex_error ("Non hex digit in Unicode escape sequence", 0);
581 *unicode_escape_p = 1;
584 lex->unget_value = c;
586 return (unicode_t) '\\';
/* Like java_read_unicode, but a \r\n pair is returned as a single
   line terminator.  This requires reading one character ahead; the
   look-ahead character is pushed back through lex->unget_value when
   it is not consumed.  UNICODE_ESCAPE_P is passed on to
   java_read_unicode for the first character read.  */
590 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
592 int *unicode_escape_p;
594 int c = java_read_unicode (lex, unicode_escape_p);
598 /* We have to read ahead to see if we got \r\n. In that case we
599 return a single line terminator. */
601 c = java_read_unicode (lex, &dummy);
603 lex->unget_value = c;
604 /* In either case we must return a newline. */
/* Body of the routine that returns the next character of the current
   line (its header line is missing from this listing; it appears to
   be java_get_unicode).  When there is no current line, or it has been
   consumed entirely, a new line is allocated and filled from the lexer
   up to and including the next '\n' or UEOF; reaching UEOF without
   having found any character sets the lexer's hit_eof flag.  The
   column counter is advanced for each character handed out.  */
614 /* It's time to read a line when... */
615 if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
620 if (ctxp->lexer->hit_eof)
623 java_allocate_new_line ();
624 if (ctxp->c_line->line[0] != '\n')
628 int unicode_escape_p;
629 c = java_read_unicode_collapsing_terminators (ctxp->lexer,
634 java_store_unicode (ctxp->c_line, c, unicode_escape_p);
635 if (ctxp->c_line->white_space_only
636 && !JAVA_WHITE_SPACE_P (c)
638 ctxp->c_line->white_space_only = 0;
640 if ((c == '\n') || (c == UEOF))
644 if (c == UEOF && ! found_chars)
646 ctxp->lexer->hit_eof = 1;
651 ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
652 JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
653 return ctxp->c_line->line [ctxp->c_line->current++];
656 /* Parse the end of a C style comment.
657 * C is the first character following the '/' and '*'.
 * Characters are consumed with java_get_unicode; a '*' that is not
 * followed by the closing character is pushed back so that it is
 * examined again. Running out of input first is reported as
 * "Comment not terminated at end of input". */
659 java_parse_end_comment (c)
662 for ( ;; c = java_get_unicode ())
667 java_lex_error ("Comment not terminated at end of input", 0);
670 switch (c = java_get_unicode ())
673 java_lex_error ("Comment not terminated at end of input", 0);
677 case '*': /* Reparse only '*'. */
678 java_unget_unicode ();
684 /* Parse the documentation section. Keywords must be at the beginning
685 of a documentation comment line (ignoring white space and any `*'
686 character). Parsed keyword(s): @DEPRECATED.
   C is the first character of the section. Returns 1 when the end of
   the comment ("*" followed by "/") was consumed here; finding
   `@deprecated' sets ctxp->deprecated. */
689 java_parse_doc_section (c)
692 int valid_tag = 0, seen_star = 0;
694 while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
706 c = java_get_unicode();
710 java_lex_error ("Comment not terminated at end of input", 0);
712 if (seen_star && (c == '/'))
713 return 1; /* Goto step1 in caller. */
715 /* We're parsing `@deprecated'. */
716 if (valid_tag && (c == '@'))
721 while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
723 c = java_get_unicode ();
724 tag [tag_index++] = c;
728 java_lex_error ("Comment not terminated at end of input", 0);
/* NOTE(review): the loop above can exit with tag_index == 10, so the
   store below needs TAG to hold at least 11 characters. The
   declaration of TAG is not visible in this listing -- confirm. */
729 tag [tag_index] = '\0';
731 if (!strcmp (tag, "deprecated"))
732 ctxp->deprecated = 1;
734 java_unget_unicode ();
738 /* Return true if C is a valid start character for a Java identifier.
739 This is only called if C >= 128 -- smaller values are handled
740 inline. However, this function handles all values anyway. */
/* The entry of type_table for the high byte of C is examined as an
   integer first: when it has bits set other than LETTER_PART and
   LETTER_START it is used as a pointer to a 256-entry table indexed by
   the low byte of C. NOTE(review): otherwise the entry itself
   presumably supplies the flags for the whole page -- that branch is
   not visible in this listing. */
742 java_start_char_p (c)
745 unsigned int hi = c / 256;
746 const char *const page = type_table[hi];
747 unsigned long val = (unsigned long) page;
750 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
751 flags = page[c & 255];
755 return flags & LETTER_START;
758 /* Return true if C is a valid part character for a Java identifier.
759 This is only called if C >= 128 -- smaller values are handled
760 inline. However, this function handles all values anyway. */
/* The lookup is the same as for start characters: the type_table
   entry for the high byte of C is used as a pointer to a per-character
   table when it has bits set other than LETTER_PART and LETTER_START.
   Only the flag tested on return differs (LETTER_PART). */
765 unsigned int hi = c / 256;
766 const char *const page = type_table[hi];
767 unsigned long val = (unsigned long) page;
770 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
771 flags = page[c & 255];
775 return flags & LETTER_PART;
/* Parse the escape sequence that follows a backslash in a character
   or string literal and return the character it denotes. The simple
   escapes map to 0x8, 0x9, 0xa, 0xc, 0xd, 0x22, 0x27 and 0x5c. An
   octal escape collects a bounded number of octal digits, pushes back
   the character that ended it and combines the digits three bits at a
   time. Any other character is reported as
   "Invalid character in escape sequence" and JAVA_CHAR_ERROR is
   returned. */
779 java_parse_escape_sequence ()
784 switch (c = java_get_unicode ())
787 return (unicode_t)0x8;
789 return (unicode_t)0x9;
791 return (unicode_t)0xa;
793 return (unicode_t)0xc;
795 return (unicode_t)0xd;
797 return (unicode_t)0x22;
799 return (unicode_t)0x27;
801 return (unicode_t)0x5c;
802 case '0': case '1': case '2': case '3': case '4':
803 case '5': case '6': case '7':
806 int octal_escape_index = 0;
810 for (; octal_escape_index < max && RANGE (c, '0', '7');
811 c = java_get_unicode ())
813 if (octal_escape_index == 0 && c > '3')
815 /* According to the grammar, `\477' has a well-defined
816 meaning -- it is `\47' followed by `7'. */
819 octal_escape [octal_escape_index++] = c;
822 java_unget_unicode ();
824 for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
825 i < octal_escape_index; i++, shift -= 3)
826 char_lit |= (octal_escape [i] - '0') << shift;
831 java_lex_error ("Invalid character in escape sequence", 0);
832 return JAVA_CHAR_ERROR;
837 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
839 /* Subroutine of java_lex: converts floating-point literals to tree
840 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
841 store the result. FFLAG indicates whether the literal was tagged
842 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
843 is the position in the current line where the literal begins; it is
   temporarily made the current position while an underflow is
   reported. A literal that converts to infinity or NaN is a range
   error; one that converts to zero is scanned up to any exponent
   marker for a character other than '0' and '.' to tell a true zero
   from an underflow. */
845 static void java_perform_atof PARAMS ((YYSTYPE *, char *, int, int));
848 java_perform_atof (java_lval, literal_token, fflag, number_beginning)
852 int number_beginning;
854 REAL_VALUE_TYPE value;
855 tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
857 SET_REAL_VALUE_ATOF (value,
858 REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
860 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
862 JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
865 else if (IS_ZERO (value))
867 /* We check to see if the value is really 0 or if we've found an
868 underflow. We do this in the most primitive imaginable way. */
870 char *p = literal_token;
873 while (*p && *p != 'e' && *p != 'E')
875 if (*p != '0' && *p != '.')
884 int i = ctxp->c_line->current;
885 ctxp->c_line->current = number_beginning;
886 java_lex_error ("Floating point literal underflow", 0);
887 ctxp->c_line->current = i;
891 SET_LVAL_NODE_TYPE (build_real (type, value), type);
895 static int yylex PARAMS ((YYSTYPE *));
906 unicode_t first_unicode;
907 int ascii_index, all_ascii;
910 /* Translation of the Unicode escape in the raw stream of Unicode
911 characters. Takes care of line terminator. */
913 /* Skip white spaces: SP, TAB and FF or ULT. */
914 for (c = java_get_unicode ();
915 c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
918 ctxp->elc.line = ctxp->c_line->lineno;
919 ctxp->elc.col = ctxp->c_line->char_col-2;
922 ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
924 if (c == 0x1a) /* CTRL-Z. */
926 if ((c = java_get_unicode ()) == UEOF)
927 return 0; /* Ok here. */
929 java_unget_unicode (); /* Caught later, at the end of the
932 /* Handle EOF here. */
933 if (c == UEOF) /* Should probably do something here... */
936 /* Take care of eventual comments. */
939 switch (c = java_get_unicode ())
944 c = java_get_unicode ();
947 /* It is ok to end a `//' comment with EOF, unless
948 we're being pedantic. */
950 java_lex_error ("Comment not terminated at end of input",
954 if (c == '\n') /* ULT */
960 if ((c = java_get_unicode ()) == '*')
962 if ((c = java_get_unicode ()) == '/')
963 goto step1; /* Empty documentation comment. */
964 else if (java_parse_doc_section (c))
968 java_parse_end_comment ((c = java_get_unicode ()));
972 java_unget_unicode ();
978 ctxp->elc.line = ctxp->c_line->lineno;
979 ctxp->elc.prev_col = ctxp->elc.col;
980 ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
981 if (ctxp->elc.col < 0)
984 /* Numeric literals. */
985 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
987 /* This section of code is borrowed from gcc/c-lex.c. */
988 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
989 int parts[TOTAL_PARTS];
990 HOST_WIDE_INT high, low;
991 /* End borrowed section. */
992 char literal_token [256];
993 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
994 int found_hex_digits = 0, found_non_octal_digits = 0;
997 int number_beginning = ctxp->c_line->current;
1001 /* We might have a . separator instead of a FP like .[0-9]*. */
1004 unicode_t peep = java_sneak_unicode ();
1006 if (!JAVA_ASCII_DIGIT (peep))
1009 BUILD_OPERATOR (DOT_TK);
1013 for (i = 0; i < TOTAL_PARTS; i++)
1018 c = java_get_unicode ();
1019 if (c == 'x' || c == 'X')
1022 c = java_get_unicode ();
1024 else if (JAVA_ASCII_DIGIT (c))
1026 else if (c == '.' || c == 'e' || c =='E')
1028 /* Push the '.', 'e', or 'E' back and prepare for a FP
1030 java_unget_unicode ();
1035 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */
1036 JAVA_LEX_LIT ("0", 10);
1040 SET_LVAL_NODE (long_zero_node);
1041 return (INT_LIT_TK);
1043 SET_LVAL_NODE (float_zero_node);
1046 SET_LVAL_NODE (double_zero_node);
1049 java_unget_unicode ();
1050 SET_LVAL_NODE (integer_zero_node);
1051 return (INT_LIT_TK);
1055 /* Parse the first part of the literal, until we find something
1056 which is not a number. */
1057 while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1058 JAVA_ASCII_DIGIT (c))
1060 /* We store in a string (in case it turns out to be a FP) and in
1061 PARTS if we have to process a integer literal. */
1062 int numeric = hex_value (c);
1065 /* Remember when we find a valid hexadecimal digit. */
1067 found_hex_digits = 1;
1068 /* Remember when we find an invalid octal digit. */
1069 else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c))
1070 found_non_octal_digits = 1;
1072 literal_token [literal_index++] = c;
1073 /* This section of code if borrowed from gcc/c-lex.c. */
1074 for (count = 0; count < TOTAL_PARTS; count++)
1076 parts[count] *= radix;
1079 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1080 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1083 parts[0] += numeric;
1085 if (parts [TOTAL_PARTS-1] != 0)
1087 /* End borrowed section. */
1088 c = java_get_unicode ();
1091 /* If we have something from the FP char set but not a digit, parse
1093 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1096 int seen_digit = (literal_index ? 1 : 0);
1097 int seen_exponent = 0;
1098 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1099 double unless specified. */
1101 /* It is ok if the radix is 8 because this just means we've
1102 seen a leading `0'. However, radix==16 is invalid. */
1104 java_lex_error ("Can't express non-decimal FP literal", 0);
1114 literal_token [literal_index++ ] = c;
1115 c = java_get_unicode ();
1118 java_lex_error ("Invalid character in FP literal", 0);
1121 if (c == 'e' || c == 'E')
1125 /* {E,e} must have seen at least a digit. */
1128 ("Invalid FP literal, mantissa must have digit", 0);
1132 literal_token [literal_index++] = c;
1133 c = java_get_unicode ();
1136 java_lex_error ("Invalid character in FP literal", 0);
1138 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1140 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1141 stage = 4; /* So we fall through. */
1144 if ((c=='-' || c =='+') && stage == 2)
1147 literal_token [literal_index++] = c;
1148 c = java_get_unicode ();
1151 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1152 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1153 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1154 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1156 if (JAVA_ASCII_DIGIT (c))
1160 literal_token [literal_index++ ] = c;
1161 c = java_get_unicode ();
1165 if (stage != 4) /* Don't push back fF/dD. */
1166 java_unget_unicode ();
1168 /* An exponent (if any) must have seen a digit. */
1169 if (seen_exponent && !seen_digit)
1171 ("Invalid FP literal, exponent must have digit", 0);
1173 literal_token [literal_index] = '\0';
1174 JAVA_LEX_LIT (literal_token, radix);
1177 java_perform_atof (java_lval, literal_token,
1178 fflag, number_beginning);
1183 } /* JAVA_ASCII_FPCHAR (c) */
1185 /* Here we get back to converting the integral literal. */
1186 if (radix == 16 && ! found_hex_digits)
1188 ("0x must be followed by at least one hexadecimal digit", 0);
1189 else if (radix == 8 && found_non_octal_digits)
1190 java_lex_error ("Octal literal contains digit out of range", 0);
1191 else if (c == 'L' || c == 'l')
1194 java_unget_unicode ();
1196 #ifdef JAVA_LEX_DEBUG
1197 literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */
1198 JAVA_LEX_LIT (literal_token, radix);
1200 /* This section of code is borrowed from gcc/c-lex.c. */
1203 bytes = GET_TYPE_PRECISION (long_type_node);
1204 for (i = bytes; i < TOTAL_PARTS; i++)
1212 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1214 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1215 / HOST_BITS_PER_CHAR)]
1216 << (i * HOST_BITS_PER_CHAR));
1217 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1219 /* End borrowed section. */
1221 /* Range checking. */
1224 /* 9223372036854775808L is valid if operand of a '-'. Otherwise
1225 9223372036854775807L is the biggest `long' literal that can be
1226 expressed using a 10 radix. For other radices, everything that
1227 fits withing 64 bits is OK. */
1228 int hb = (high >> 31);
1229 if (overflow || (hb && low && radix == 10)
1230 || (hb && high & 0x7fffffff && radix == 10))
1231 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1235 /* 2147483648 is valid if operand of a '-'. Otherwise,
1236 2147483647 is the biggest `int' literal that can be
1237 expressed using a 10 radix. For other radices, everything
1238 that fits within 32 bits is OK. As all literals are
1239 signed, we sign extend here. */
1240 int hb = (low >> 31) & 0x1;
1241 if (overflow || high || (hb && low & 0x7fffffff && radix == 10))
1242 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1246 value = build_int_2 (low, high);
1247 JAVA_RADIX10_FLAG (value) = radix == 10;
1248 SET_LVAL_NODE_TYPE (value, long_suffix ? long_type_node : int_type_node);
1250 SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1251 long_suffix ? long_type_node : int_type_node);
1256 /* Character literals. */
1260 if ((c = java_get_unicode ()) == '\\')
1261 char_lit = java_parse_escape_sequence ();
1264 if (c == '\n' || c == '\'')
1265 java_lex_error ("Invalid character literal", 0);
1269 c = java_get_unicode ();
1271 if ((c == '\n') || (c == UEOF))
1272 java_lex_error ("Character literal not terminated at end of line", 0);
1274 java_lex_error ("Syntax error in character literal", 0);
1276 if (char_lit == JAVA_CHAR_ERROR)
1277 char_lit = 0; /* We silently convert it to zero. */
1279 JAVA_LEX_CHAR_LIT (char_lit);
1280 SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1284 /* String literals. */
1290 for (no_error = 1, c = java_get_unicode ();
1291 c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1294 c = java_parse_escape_sequence ();
1295 if (c == JAVA_CHAR_ERROR)
1298 c = 0; /* We silently convert it to zero. */
1300 java_unicode_2_utf8 (c);
1302 if (c == '\n' || c == UEOF) /* ULT. */
1304 lineno--; /* Refer to the line where the terminator was seen. */
1305 java_lex_error ("String not terminated at end of line", 0);
1309 obstack_1grow (&temporary_obstack, '\0');
1310 string = obstack_finish (&temporary_obstack);
1312 if (!no_error || (c != '"'))
1313 java_lval->node = error_mark_node; /* FIXME: Requires futher
1316 java_lval->node = build_string (strlen (string), string);
1318 obstack_free (&temporary_obstack, string);
1319 return STRING_LIT_TK;
1327 BUILD_OPERATOR (OP_TK);
1333 if (ctxp->ccb_indent == 1)
1334 ctxp->first_ccb_indent1 = lineno;
1336 BUILD_OPERATOR (OCB_TK);
1340 if (ctxp->ccb_indent == 1)
1341 ctxp->last_ccb_indent1 = lineno;
1342 BUILD_OPERATOR (CCB_TK);
1345 BUILD_OPERATOR (OSB_TK);
1357 BUILD_OPERATOR (DOT_TK);
1358 /* return DOT_TK; */
1365 if ((c = java_get_unicode ()) == '=')
1367 BUILD_OPERATOR (EQ_TK);
1371 /* Equals is used in two different locations. In the
1372 variable_declarator: rule, it has to be seen as '=' as opposed
1373 to being seen as an ordinary assignment operator in
1374 assignment_operators: rule. */
1375 java_unget_unicode ();
1376 BUILD_OPERATOR (ASSIGN_TK);
1380 switch ((c = java_get_unicode ()))
1383 BUILD_OPERATOR (GTE_TK);
1385 switch ((c = java_get_unicode ()))
1388 if ((c = java_get_unicode ()) == '=')
1390 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1394 java_unget_unicode ();
1395 BUILD_OPERATOR (ZRS_TK);
1398 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1400 java_unget_unicode ();
1401 BUILD_OPERATOR (SRS_TK);
1404 java_unget_unicode ();
1405 BUILD_OPERATOR (GT_TK);
1409 switch ((c = java_get_unicode ()))
1412 BUILD_OPERATOR (LTE_TK);
1414 if ((c = java_get_unicode ()) == '=')
1416 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1420 java_unget_unicode ();
1421 BUILD_OPERATOR (LS_TK);
1424 java_unget_unicode ();
1425 BUILD_OPERATOR (LT_TK);
1429 switch ((c = java_get_unicode ()))
1432 BUILD_OPERATOR (BOOL_AND_TK);
1434 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1436 java_unget_unicode ();
1437 BUILD_OPERATOR (AND_TK);
1441 switch ((c = java_get_unicode ()))
1444 BUILD_OPERATOR (BOOL_OR_TK);
1446 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1448 java_unget_unicode ();
1449 BUILD_OPERATOR (OR_TK);
1453 switch ((c = java_get_unicode ()))
1456 BUILD_OPERATOR (INCR_TK);
1458 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1460 java_unget_unicode ();
1461 BUILD_OPERATOR (PLUS_TK);
1465 switch ((c = java_get_unicode ()))
1468 BUILD_OPERATOR (DECR_TK);
1470 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1472 java_unget_unicode ();
1473 BUILD_OPERATOR (MINUS_TK);
1477 if ((c = java_get_unicode ()) == '=')
1479 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1483 java_unget_unicode ();
1484 BUILD_OPERATOR (MULT_TK);
1488 if ((c = java_get_unicode ()) == '=')
1490 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1494 java_unget_unicode ();
1495 BUILD_OPERATOR (DIV_TK);
1499 if ((c = java_get_unicode ()) == '=')
1501 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1505 java_unget_unicode ();
1506 BUILD_OPERATOR (XOR_TK);
1510 if ((c = java_get_unicode ()) == '=')
1512 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1516 java_unget_unicode ();
1517 BUILD_OPERATOR (REM_TK);
1521 if ((c = java_get_unicode()) == '=')
1523 BUILD_OPERATOR (NEQ_TK);
1527 java_unget_unicode ();
1528 BUILD_OPERATOR (NEG_TK);
1533 BUILD_OPERATOR (REL_QM_TK);
1536 BUILD_OPERATOR (REL_CL_TK);
1538 BUILD_OPERATOR (NOT_TK);
1541 /* Keyword, boolean literal or null literal. */
1542 for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1543 JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1545 java_unicode_2_utf8 (c);
1546 if (all_ascii && c >= 128)
1551 obstack_1grow (&temporary_obstack, '\0');
1552 string = obstack_finish (&temporary_obstack);
1553 java_unget_unicode ();
1555 /* If we have something all ascii, we consider a keyword, a boolean
1556 literal, a null literal or an all ASCII identifier. Otherwise,
1557 this is an identifier (possibly not respecting formation rule). */
1560 const struct java_keyword *kw;
1561 if ((kw=java_keyword (string, ascii_index)))
1563 JAVA_LEX_KW (string);
1566 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1567 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1568 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1569 case PRIVATE_TK: case STRICT_TK:
1570 SET_MODIFIER_CTX (kw->token);
1573 SET_LVAL_NODE (float_type_node);
1576 SET_LVAL_NODE (double_type_node);
1579 SET_LVAL_NODE (boolean_type_node);
1582 SET_LVAL_NODE (byte_type_node);
1585 SET_LVAL_NODE (short_type_node);
1588 SET_LVAL_NODE (int_type_node);
1591 SET_LVAL_NODE (long_type_node);
1594 SET_LVAL_NODE (char_type_node);
1597 /* Keyword based literals. */
1600 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1601 boolean_true_node : boolean_false_node));
1604 SET_LVAL_NODE (null_pointer_node);
1607 /* For some keywords we want to retain information on the location
1608 where they were found. */
1621 BUILD_OPERATOR (kw->token);
1629 /* We may have an ID here. */
1630 if (JAVA_START_CHAR_P (first_unicode))
1632 JAVA_LEX_ID (string);
1633 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1637 /* Everything else is an invalid character in the input. */
1639 char lex_error_buffer [128];
1640 sprintf (lex_error_buffer, "Invalid character `%s' in input",
1641 java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1642 java_lex_error (lex_error_buffer, 1);
1648 /* This is called by the parser to see if an error should be generated
1649 due to numeric overflow. This function only handles the particular
1650 case of the largest negative value, and is only called in the case
1651 where this value is not preceded by `-'. */
/* VALUE is the literal's tree node.  Nothing is examined unless VALUE is
   an INTEGER_CST that carries JAVA_RADIX10_FLAG (presumably set for
   decimal literals -- confirm).  Overflow is reported through
   java_lex_error with a FORWARD offset of 0.  */
1653 error_if_numeric_overflow (value)
1656 if (TREE_CODE (value) == INTEGER_CST && JAVA_RADIX10_FLAG (value))
1658 unsigned HOST_WIDE_INT lo, hi;
/* Fetch both halves of the constant: LO from TREE_INT_CST_LOW and HI
   from TREE_INT_CST_HIGH.  */
1660 lo = TREE_INT_CST_LOW (value);
1661 hi = TREE_INT_CST_HIGH (value);
1662 if (TREE_TYPE (value) == long_type_node)
/* `long' literal: only the high word is inspected.  HB is HI shifted
   right by 31 bits (converted to int); the error fires when HB is
   nonzero while bits 0..30 of HI are all clear.  */
1664 int hb = (hi >> 31);
1665 if (hb && !(hi & 0x7fffffff))
1666 java_lex_error ("Numeric overflow for `long' literal", 0);
/* Here only the low word is inspected: the error fires when bit 31 of
   LO is set and bits 0..30 are all clear, i.e. the low 32 bits are
   exactly 0x80000000.  */
1670 int hb = (lo >> 31) & 0x1;
1671 if (hb && !(lo & 0x7fffffff))
1672 java_lex_error ("Numeric overflow for `int' literal", 0);
1676 #endif /* JC1_LITE */
/* Append the UTF-8 encoding of UNICODE to temporary_obstack, using a
   one-, two- or three-byte form depending on the range UNICODE falls
   in (assuming RANGE tests inclusive bounds -- confirm).  */
1679 java_unicode_2_utf8 (unicode)
/* 0x01..0x7f: stored unchanged as a single byte.  */
1682 if (RANGE (unicode, 0x01, 0x7f))
1683 obstack_1grow (&temporary_obstack, (char)unicode);
/* 0x80..0x7ff, and also 0: two bytes.  The first is 0xc0 plus bits
   6..10, the second 0x80 plus bits 0..5.  Because 0 is routed here it
   is written as 0xc0 0x80, so no zero byte is stored for it.  */
1684 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
1686 obstack_1grow (&temporary_obstack,
1687 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1688 obstack_1grow (&temporary_obstack,
1689 (unsigned char)(0x80 | (unicode & 0x3f)));
/* Everything else: three bytes.  0xe0 plus bits 12..15, then 0x80 plus
   bits 6..11, then 0x80 plus bits 0..5.  In each expression `>>' binds
   tighter than `|', so the masked value is shifted before being
   combined with the marker bits.  */
1691 else /* Range 0x800-0xffff. */
1693 obstack_1grow (&temporary_obstack,
1694 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1695 obstack_1grow (&temporary_obstack,
1696 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1697 obstack_1grow (&temporary_obstack,
1698 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Rebind NODE to the result of build_expr_wfl applied to NODE, tagged
   with the file name from ctxp->filename and the line and column saved
   in ctxp->elc, then clear the type of the resulting node.  */
1704 build_wfl_node (node)
1707 node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1708 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1709 TREE_TYPE (node) = NULL_TREE;
/* Record the position of a lexical error in ctxp->elc and reset
   ctxp->java_error_flag.  MSG is the error text; FORWARD is an offset
   added to the recorded column.
   NOTE(review): both parameters carry ATTRIBUTE_UNUSED, presumably
   because some build configuration compiles the body out -- confirm.  */
1715 java_lex_error (msg, forward)
1716 const char *msg ATTRIBUTE_UNUSED;
1717 int forward ATTRIBUTE_UNUSED;
/* The error line is the current line's number; the error column is the
   current line's char_col minus one, shifted by FORWARD.  */
1720 ctxp->elc.line = ctxp->c_line->lineno;
1721 ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1723 /* Might be caught in the middle of some error report. */
1724 ctxp->java_error_flag = 0;
1741 if (next != '\n' && next != EOF)
/* Build on temporary_obstack the text of line LINE of FILENAME, a
   newline, then a run of padding ending in a `^' marker, and return
   the finished obstack object.  COL selects where the marker goes; the
   special values -1 and -2 are described below.  */
1753 java_get_line_col (filename, line, col)
1754 const char *filename ATTRIBUTE_UNUSED;
1755 int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
1760 /* Dumb implementation. Doesn't try to cache or optimize things. */
1761 /* First line of the file is line 1, first column is 1. */
1763 /* COL == -1 means, at the CR/LF in LINE. */
1764 /* COL == -2 means, at the first non space char in LINE. */
/* CLINE counts lines starting from 1.  CURRENT_LINE_COL and
   FIRST_NON_SPACE are columns within the requested line;
   FIRST_NON_SPACE stays 0 until a non white space character is seen.  */
1767 int c, ccol, cline = 1;
1768 int current_line_col = 0;
1769 int first_non_space = 0;
/* A file that cannot be opened is reported through fatal_io_error.  */
1772 if (!(fp = fopen (filename, "r")))
1773 fatal_io_error ("can't open %s", filename);
/* Advance through the file until CLINE reaches LINE.  */
1775 while (cline != line)
/* Placeholder text put on the obstack (without its terminating NUL)
   when the file ends before the requested line.  */
1780 static const char msg[] = "<<file too short - unexpected EOF>>";
1781 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
/* java_is_eol decides whether C ends a line.  */
1784 if (java_is_eol (fp, c))
1788 /* Gather the chars of the current line in a buffer. */
/* Stop at a negative C or at an end of line.  */
1792 if (c < 0 || java_is_eol (fp, c))
/* Remember the column of the first character that is not white space.  */
1794 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1795 first_non_space = current_line_col;
1796 obstack_1grow (&temporary_obstack, c);
/* Terminate the copied source line.  */
1801 obstack_1grow (&temporary_obstack, '\n');
/* NOTE(review): presumably the COL == -1 case -- the marker goes at the
   column reached when the line ended; confirm.  */
1805 col = current_line_col;
1806 first_non_space = 0;
/* NOTE(review): presumably the COL == -2 case -- the marker goes at the
   first non space column.  FIRST_NON_SPACE is left set on this path,
   which enables the tab copying below; confirm.  */
1809 col = first_non_space;
/* NOTE(review): presumably the ordinary-column case; with
   FIRST_NON_SPACE cleared the padding below is all spaces.  */
1811 first_non_space = 0;
1813 /* Place the '^' at the right position. */
/* BASE is the start of the object being grown, i.e. the text gathered
   above.  */
1814 base = obstack_base (&temporary_obstack);
/* Emit COL+3 padding characters.  NOTE(review): the reason for the
   extra 3 is not evident from this function -- confirm against the
   caller.  */
1815 for (ccol = 1; ccol <= col+3; ccol++)
1817 /* Compute \t when reaching first_non_space. */
/* This `c' is a char local to the loop body and shadows the int `c'
   declared above.  When FIRST_NON_SPACE is nonzero, a tab at the same
   index of the gathered text is mirrored as a tab; every other
   position gets a space.  */
1818 char c = (first_non_space ?
1819 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1820 obstack_1grow (&temporary_obstack, c);
/* Append the marker followed by a terminating NUL.  */
1822 obstack_grow0 (&temporary_obstack, "^", 1);
1825 return obstack_finish (&temporary_obstack);
/* Compare the UTF-8 text STR, LENGTH bytes long, with NAME.  NAME is
   walked one element at a time up to its terminating NUL, and for each
   element one character is fetched from STR with UTF8_GET.  */
1831 utf8_cmp (str, length, name)
1832 const unsigned char *str;
/* LIMIT is one past the last byte of STR and is handed to UTF8_GET as
   the bound.  */
1836 const unsigned char *limit = str + length;
1839 for (i = 0; name[i]; ++i)
1841 int ch = UTF8_GET (str, limit);
/* The result is the difference between the fetched character and
   name[i].  NOTE(review): presumably guarded by a mismatch test --
   confirm.  */
1843 return ch - name[i];
/* All of NAME was walked: 0 if STR equals LIMIT at this point
   (UTF8_GET presumably advances STR -- confirm), 1 otherwise.  */
1846 return str == limit ? 0 : 1;
1849 /* A sorted list of all C++ keywords. */
/* cxx_keyword_p searches this table by repeatedly taking the midpoint
   of an index range and comparing through utf8_cmp.
   NOTE(review): presumably new entries must keep the order that
   utf8_cmp induces -- confirm before adding any.  */
1851 static const char *const cxx_keywords[] =
1959 /* Return true if NAME is a C++ keyword. */
/* NAME is LENGTH bytes long.  Candidates are taken from cxx_keywords at
   the midpoint of an index range whose upper end starts at the number
   of table entries.  */
1962 cxx_keyword_p (name, length)
1966 int last = ARRAY_SIZE (cxx_keywords);
/* MID is given the midpoint here and again in the loop header below.  */
1968 int mid = (last + first) / 2;
/* After each step the previous midpoint is kept in OLD and MID is
   recomputed from the current range.  */
1971 for (mid = (last + first) / 2;
1973 old = mid, mid = (last + first) / 2)
/* Compare only as many bytes as the shorter of NAME and the candidate
   keyword.  */
1975 int kwl = strlen (cxx_keywords[mid]);
1976 int min_length = kwl > length ? length : kwl;
1977 int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
1982 /* We've found a match if all the remaining characters are `$'. */
/* Skip over any `$' characters that follow the compared prefix.  */
1983 for (i = min_length; i < length && name[i] == '$'; ++i)
1997 #endif /* JC1_LITE */