1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
3 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
5 This file is part of GNU CC.
7 GNU CC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU CC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU CC; see the file COPYING. If not, write to
19 the Free Software Foundation, 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA.
22 Java and all Java-based marks are trademarks or registered trademarks
23 of Sun Microsystems, Inc. in the United States and other countries.
24 The Free Software Foundation is independent of Sun Microsystems, Inc. */
26 /* This file defines java_lex (yylex), which reads a Java ASCII source file
27 possibly containing Unicode escape sequences or UTF-8 encoded
28 characters, and returns a token for everything found but comments,
29 white spaces and line terminators. When necessary, it also fills
30 the java_lval (yylval) union. It's implemented to be called by a
31 re-entrant parser generated by Bison.
33 The lexical analysis conforms to the Java grammar described in "The
34 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
35 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
39 #include "chartables.h"
41 /* Function declarations.  NOTE(review): java_store_unicode is declared
   twice in this list; the second prototype is redundant.  */
42 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
43 static void java_unicode_2_utf8 PARAMS ((unicode_t));
44 static void java_lex_error PARAMS ((const char *, int));
46 static int java_is_eol PARAMS ((FILE *, int));
47 static tree build_wfl_node PARAMS ((tree));
49 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
50 static int java_parse_escape_sequence PARAMS ((void));
51 static int java_start_char_p PARAMS ((unicode_t));
52 static int java_part_char_p PARAMS ((unicode_t));
53 static int java_parse_doc_section PARAMS ((int));
54 static void java_parse_end_comment PARAMS ((int));
55 static int java_get_unicode PARAMS ((void));
56 static int java_read_unicode PARAMS ((java_lexer *, int *));
57 static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
59 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
60 static int java_read_char PARAMS ((java_lexer *));
61 static void java_allocate_new_line PARAMS ((void));
62 static void java_unget_unicode PARAMS ((void));
63 static unicode_t java_sneak_unicode PARAMS ((void));
65 static int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
68 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
70 static void error_if_numeric_overflow PARAMS ((tree));
74 /* This is nonzero if we have initialized `need_byteswap'. */
75 static int byteswap_init = 0;
77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
78 big-endian order -- not native endian order. We handle this by
79 doing a conversion once at startup and seeing what happens. This
80 flag holds the results of this determination. */
81 static int need_byteswap = 0;
/* Set up lexer and parser state before FINPUT is read.  The visible
   code interns the identifiers the parser refers to by name, puts
   java.lang on ctxp->import_demand_list, builds the wfl_operator,
   wfl_append and wfl_to_string nodes with build_expr_wfl, clears the
   per-file fields of ctxp and finally creates the lexer for FINPUT
   with the requested ENCODING through java_new_lexer.  */
85 java_init_lex (finput, encoding)
90 int java_lang_imported = 0;
93 java_lang_id = get_identifier ("java.lang");
94 if (!java_lang_cloneable)
95 java_lang_cloneable = get_identifier ("java.lang.Cloneable");
96 if (!java_io_serializable)
97 java_io_serializable = get_identifier ("java.io.Serializable");
99 inst_id = get_identifier ("inst$");
101 wpv_id = get_identifier ("write_parm_value$");
/* NOTE(review): java_lang_imported is set to 0 a few lines above; if
   that declaration is a plain local this test is always true and
   java.lang is pushed on the demand list on every call -- confirm.  */
103 if (!java_lang_imported)
105 tree node = build_tree_list
106 (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
107 read_import_dir (TREE_PURPOSE (node));
/* Push the new node on the front of the import-on-demand list.  */
108 TREE_CHAIN (node) = ctxp->import_demand_list;
109 ctxp->import_demand_list = node;
110 java_lang_imported = 1;
114 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
116 label_id = get_identifier ("$L");
118 wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
/* The string buffer class name depends on flag_emit_class_files.  */
119 if (!wfl_string_buffer)
121 build_expr_wfl (get_identifier (flag_emit_class_files
122 ? "java.lang.StringBuffer"
123 : "gnu.gcj.runtime.StringBuffer"),
126 wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
128 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
129 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
/* NOTE(review): 11 presumably matches the number of modifier keywords
   tracked in modifier_ctx -- TODO confirm against its declaration.  */
131 memset ((PTR) ctxp->modifier_ctx, 0, 11*sizeof (ctxp->modifier_ctx[0]));
132 memset ((PTR) current_jcf, 0, sizeof (JCF));
133 ctxp->current_parsed_class = NULL;
134 ctxp->package = NULL_TREE;
/* Line numbering restarts at 0 for the file named by input_filename.  */
137 ctxp->filename = input_filename;
138 ctxp->lineno = lineno = 0;
141 ctxp->java_error_flag = 0;
142 ctxp->lexer = java_new_lexer (finput, encoding);
/* Format character I of LINE for use in a message.  A character that
   came from a Unicode escape, or whose value is greater than 128, is
   written as a backslash-u escape with four hex digits; otherwise the
   character itself is stored.  The text goes into a static buffer, so
   a later call overwrites it.
   NOTE(review): the test is > 128, so the value 128 itself takes the
   single-character path -- confirm whether >= 128 was intended.  */
146 java_sprint_unicode (line, i)
147 struct java_line *line;
150 static char buffer [10];
151 if (line->unicode_escape_p [i] || line->line [i] > 128)
152 sprintf (buffer, "\\u%04x", line->line [i]);
155 buffer [0] = line->line [i];
/* Peek at the character at the current position of the current line
   (ctxp->c_line) without advancing that position.  */
162 java_sneak_unicode ()
164 return (ctxp->c_line->line [ctxp->c_line->current]);
/* Push the most recently read character back: step the position in the
   current line back by one and take JAVA_COLUMN_DELTA (0) off the
   column count.  Nothing can be pushed back when the position is
   already 0; how that case is reported is not visible here.  */
168 java_unget_unicode ()
170 if (!ctxp->c_line->current)
171 /* Can't unget unicode. */
174 ctxp->c_line->current--;
175 ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
/* Start a fresh current line in ctxp->c_line.  The look-ahead character
   of the old line and its escape flag are saved first so they can be
   stored as the first character of the new line.  An old line that is
   not white space only becomes ctxp->p_line; the buffers of the line
   previously held there are freed.  The new line gets the next line
   number and starts out marked as white space only.
   NOTE(review): several guarding conditions are not visible in this
   listing; the description follows the visible statements only.  */
179 java_allocate_new_line ()
181 unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
182 char ahead_escape_p = (ctxp->c_line ?
183 ctxp->c_line->unicode_escape_ahead_p : 0);
185 if (ctxp->c_line && !ctxp->c_line->white_space_only)
189 free (ctxp->p_line->unicode_escape_p);
190 free (ctxp->p_line->line);
193 ctxp->p_line = ctxp->c_line;
194 ctxp->c_line = NULL; /* Reallocated */
/* Allocate the line structure and its two parallel arrays, both sized
   for JAVA_LINE_MAX characters to begin with.  */
199 ctxp->c_line = (struct java_line *)xmalloc (sizeof (struct java_line));
200 ctxp->c_line->max = JAVA_LINE_MAX;
201 ctxp->c_line->line = (unicode_t *)xmalloc
202 (sizeof (unicode_t)*ctxp->c_line->max);
203 ctxp->c_line->unicode_escape_p =
204 (char *)xmalloc (sizeof (char)*ctxp->c_line->max);
205 ctxp->c_line->white_space_only = 0;
208 ctxp->c_line->line [0] = ctxp->c_line->size = 0;
209 ctxp->c_line->char_col = ctxp->c_line->current = 0;
/* Carry the saved look-ahead character over into the new line.  */
212 ctxp->c_line->line [ctxp->c_line->size] = ahead;
213 ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
214 ctxp->c_line->size++;
216 ctxp->c_line->ahead [0] = 0;
217 ctxp->c_line->unicode_escape_ahead_p = 0;
218 ctxp->c_line->lineno = ++lineno;
219 ctxp->c_line->white_space_only = 1;
222 /* Create a new lexer object reading from FINPUT.  When iconv is
   available a converter from ENCODING to UCS-2 is opened, and the byte
   order iconv produces is probed by converting the UTF-8 encoding of
   0xfeff; the outcome is kept in need_byteswap and copied into the
   lexer.  The visible code also sets use_fallback for the internal
   decoder and calls fatal_error for an encoding that cannot be
   handled; the exact conditions are not all visible here.  */
225 java_new_lexer (finput, encoding)
227 const char *encoding;
229 java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
232 lex->finput = finput;
234 lex->unget_value = 0;
238 lex->handle = iconv_open ("UCS-2", encoding);
239 if (lex->handle != (iconv_t) -1)
245 lex->read_anything = 0;
246 lex->use_fallback = 0;
248 /* Work around broken iconv() implementations by doing checking at
249 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
250 then all UCS-2 encoders will be broken. Perhaps not a valid
258 handle = iconv_open ("UCS-2", "UTF-8");
259 if (handle != (iconv_t) -1)
266 /* This is the UTF-8 encoding of \ufeff. */
273 outp = (char *) &result;
276 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
278 iconv_close (handle);
279 /* Conversion must be complete for us to use the result. */
280 if (r != (size_t) -1 && inc == 0 && outc == 0)
281 need_byteswap = (result != 0xfeff);
/* Record in this lexer whether converted output must be byte-swapped.  */
285 lex->byte_swap = need_byteswap;
288 #endif /* HAVE_ICONV */
290 /* If iconv failed, use the internal decoder if the default
291 encoding was requested. This code is used on platforms where
292 iconv exists but is insufficient for our needs. For
293 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2. */
294 if (strcmp (encoding, DEFAULT_ENCODING))
298 lex->use_fallback = 1;
299 #endif /* HAVE_ICONV */
303 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
/* Release the resources held by LEX.  The visible code closes the
   iconv handle unless the fallback decoder is in use; any further
   cleanup is not visible here.  */
309 java_destroy_lexer (lex)
313 if (! lex->use_fallback)
314 iconv_close (lex->handle);
/* NOTE(review): the header of this function is not visible here;
   judging by the prototype list and the callers it is presumably
   java_read_char (LEX), which hands back the next decoded character of
   the input.  A character pushed back through lex->unget_value is
   consumed first.  */
323 if (lex->unget_value)
325 unicode_t r = lex->unget_value;
326 lex->unget_value = 0;
/* iconv path: refill the raw buffer from the file when needed, convert
   into out_buffer and take characters from there.  */
331 if (! lex->use_fallback)
333 size_t ir, inbytesleft, in_save, out_count, out_save;
337 /* If there is data which has already been converted, use it. */
338 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
345 /* See if we need to read more data. If FIRST == 0 then
346 the previous conversion attempt ended in the middle of
347 a character at the end of the buffer. Otherwise we
348 only have to read if the buffer is empty. */
349 if (lex->first == 0 || lex->first >= lex->last)
353 if (lex->first >= lex->last)
358 if (feof (lex->finput))
360 r = fread (&lex->buffer[lex->last], 1,
361 sizeof (lex->buffer) - lex->last,
366 inbytesleft = lex->last - lex->first;
367 out_count = sizeof (lex->out_buffer) - lex->out_last;
369 if (inbytesleft == 0)
371 /* We've tried to read and there is nothing left. */
375 in_save = inbytesleft;
376 out_save = out_count;
377 inp = &lex->buffer[lex->first];
378 outp = &lex->out_buffer[lex->out_last];
379 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
380 &inbytesleft, &outp, &out_count);
382 /* If we haven't read any bytes, then look to see if we
384 if (! lex->read_anything && out_save - out_count >= 2)
386 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
392 else if (uc == 0xfffe)
397 lex->read_anything = 1;
403 for (i = 0; i < out_save - out_count; i += 2)
405 char t = lex->out_buffer[lex->out_last + i];
406 lex->out_buffer[lex->out_last + i]
407 = lex->out_buffer[lex->out_last + i + 1];
408 lex->out_buffer[lex->out_last + i + 1] = t;
412 lex->first += in_save - inbytesleft;
413 lex->out_last += out_save - out_count;
415 /* If we converted anything at all, move along. */
416 if (out_count != out_save)
419 if (ir == (size_t) -1)
423 /* This is ok. This means that the end of our buffer
424 is in the middle of a character sequence. We just
425 move the valid part of the buffer to the beginning
427 memmove (&lex->buffer[0], &lex->buffer[lex->first],
428 lex->last - lex->first);
429 lex->last -= lex->first;
434 /* A more serious error. */
435 java_lex_error ("unrecognized character in input stream",
443 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
445 /* Don't have any data. */
450 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
455 #endif /* HAVE_ICONV */
/* Fallback path: decode UTF-8 by hand, reading bytes with getc.  */
458 c = getc (lex->finput);
463 return (unicode_t) c;
/* Lead byte of the form 110xxxxx: a two-byte sequence.  */
466 if ((c & 0xe0) == 0xc0)
468 c1 = getc (lex->finput);
469 if ((c1 & 0xc0) == 0x80)
471 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
472 /* Check for valid 2-byte characters. We explicitly
473 allow \0 because this encoding is common in the
475 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
479 else if ((c & 0xf0) == 0xe0)
481 c1 = getc (lex->finput);
482 if ((c1 & 0xc0) == 0x80)
484 c2 = getc (lex->finput);
485 if ((c2 & 0xc0) == 0x80)
487 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
490 /* Check for valid 3-byte characters.
491 Don't allow surrogate, \ufffe or \uffff. */
492 if (r >= 0x800 && r <= 0xffff
493 && ! (r >= 0xd800 && r <= 0xdfff)
494 && r != 0xfffe && r != 0xffff)
500 /* We simply don't support invalid characters. We also
501 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
502 cannot be valid Java characters. */
503 java_lex_error ("malformed UTF-8 character", 0);
507 /* We only get here on error. */
/* Append character C to line L and record UNICODE_ESCAPE_P for it in
   the parallel flag array.  When the line is full, both arrays are
   grown by JAVA_LINE_MAX entries first.  */
512 java_store_unicode (l, c, unicode_escape_p)
515 int unicode_escape_p;
517 if (l->size == l->max)
519 l->max += JAVA_LINE_MAX;
520 l->line = (unicode_t *) xrealloc (l->line, sizeof (unicode_t)*l->max);
521 l->unicode_escape_p = (char *) xrealloc (l->unicode_escape_p,
522 sizeof (char)*l->max);
/* The flag store below is what advances l->size.  */
524 l->line [l->size] = c;
525 l->unicode_escape_p [l->size++] = unicode_escape_p;
/* Read the next character from LEX, translating a Unicode escape (a
   backslash, one or more u, then four hex digits) into the character
   it denotes.  *UNICODE_ESCAPE_P is cleared on entry and set to 1 when
   an escape was translated.  lex->bs_count is consulted to tell
   whether the backslash count is odd; where it is updated is not
   visible here.  */
529 java_read_unicode (lex, unicode_escape_p)
531 int *unicode_escape_p;
535 c = java_read_char (lex);
536 *unicode_escape_p = 0;
545 if ((lex->bs_count) % 2 == 1)
547 /* Odd number of \ seen. */
548 c = java_read_char (lex);
551 unicode_t unicode = 0;
554 /* Recognize any number of `u's in \u. */
555 while ((c = java_read_char (lex)) == 'u')
558 /* Unget the most recent character as it is not a `u'. */
561 lex->unget_value = c;
563 /* Next should be 4 hex digits, otherwise it's an error.
564 The hex value is converted into the unicode, pushed into
565 the Unicode stream. */
566 for (shift = 12; shift >= 0; shift -= 4)
568 if ((c = java_read_char (lex)) == UEOF)
571 unicode |= (unicode_t)((c-'0') << shift);
572 else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
/* c | 0x20 folds an upper-case ASCII hex letter to lower case.  */
573 unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift);
575 java_lex_error ("Non hex digit in Unicode escape sequence", 0);
578 *unicode_escape_p = 1;
/* Not an escape after all: push the character back and hand the
   backslash itself to the caller.  */
581 lex->unget_value = c;
583 return (unicode_t) '\\';
/* Wrapper around java_read_unicode that turns a carriage return /
   line feed pair into a single newline.  It reads one character ahead
   with java_read_unicode and pushes that character back through
   lex->unget_value; the tests that select these steps are not visible
   here.  UNICODE_ESCAPE_P is passed on to the first read only.  */
587 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
589 int *unicode_escape_p;
591 int c = java_read_unicode (lex, unicode_escape_p);
595 /* We have to read ahead to see if we got \r\n. In that case we
596 return a single line terminator. */
598 c = java_read_unicode (lex, &dummy);
600 lex->unget_value = c;
601 /* In either case we must return a newline. */
/* NOTE(review): the header of this function is not visible here;
   judging by the prototype list and the callers it is presumably
   java_get_unicode.  It hands out the characters of the current line
   one at a time; when the line is used up it reads the next one from
   the lexer, up to and including a newline or UEOF, and records end
   of file in ctxp->lexer->hit_eof.  */
611 /* It's time to read a line when... */
612 if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
617 if (ctxp->lexer->hit_eof)
620 java_allocate_new_line ();
621 if (ctxp->c_line->line[0] != '\n')
625 int unicode_escape_p;
626 c = java_read_unicode_collapsing_terminators (ctxp->lexer,
631 java_store_unicode (ctxp->c_line, c, unicode_escape_p);
/* The first character that is not white space clears the flag.  */
632 if (ctxp->c_line->white_space_only
633 && !JAVA_WHITE_SPACE_P (c)
635 ctxp->c_line->white_space_only = 0;
637 if ((c == '\n') || (c == UEOF))
641 if (c == UEOF && ! found_chars)
643 ctxp->lexer->hit_eof = 1;
/* Account for the column, then return the character and advance.  */
648 ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
649 JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
650 return ctxp->c_line->line [ctxp->c_line->current++];
653 /* Parse the end of a C style comment.
654 * C is the first character following the '/' and '*'.
 * Characters are consumed with java_get_unicode; a comment that is
 * still open at the end of the input is reported with java_lex_error.
 * After a star the next character is examined, and a second star is
 * pushed back so that it is looked at again.  */
656 java_parse_end_comment (c)
659 for ( ;; c = java_get_unicode ())
664 java_lex_error ("Comment not terminated at end of input", 0);
667 switch (c = java_get_unicode ())
670 java_lex_error ("Comment not terminated at end of input", 0);
674 case '*': /* reparse only '*' */
675 java_unget_unicode ();
681 /* Parse the documentation section. Keywords must be at the beginning
682 of a documentation comment line (ignoring white space and any `*'
683 character). Parsed keyword(s): @DEPRECATED.
   C is the first character of the section.  A return value of 1 tells
   the caller that the end of the comment was reached.  When the tag
   read after an at sign is deprecated, ctxp->deprecated is set.  */
686 java_parse_doc_section (c)
689 int valid_tag = 0, seen_star = 0;
691 while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
703 c = java_get_unicode();
707 java_lex_error ("Comment not terminated at end of input", 0);
709 if (seen_star && (c == '/'))
710 return 1; /* Goto step1 in caller */
712 /* We're parsing @deprecated */
713 if (valid_tag && (c == '@'))
/* NOTE(review): tag_index can reach 10 in this loop, and the
   terminator is then written at tag [tag_index].  That needs at least
   11 chars in tag; its declaration is not visible here -- confirm that
   it is large enough, otherwise this is a one-byte overrun.  */
718 while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
720 c = java_get_unicode ();
721 tag [tag_index++] = c;
725 java_lex_error ("Comment not terminated at end of input", 0);
726 tag [tag_index] = '\0';
728 if (!strcmp (tag, "deprecated"))
729 ctxp->deprecated = 1;
731 java_unget_unicode ();
735 /* Return true if C is a valid start character for a Java identifier.
736 This is only called if C >= 128 -- smaller values are handled
737 inline. However, this function handles all values anyway. */
739 java_start_char_p (c)
/* type_table is indexed by the high byte of C.  */
742 unsigned int hi = c / 256;
743 char *page = type_table[hi];
744 unsigned long val = (unsigned long) page;
/* An entry with bits set outside the two flag bits is used as a
   pointer to a table indexed by the low byte of C.  Otherwise the
   entry itself presumably carries the flags for the whole page; that
   branch is not visible here.  */
747 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
748 flags = page[c & 255];
752 return flags & LETTER_START;
755 /* Return true if C is a valid part character for a Java identifier.
756 This is only called if C >= 128 -- smaller values are handled
757 inline. However, this function handles all values anyway. */
/* NOTE(review): the function header is not visible here; the prototype
   list suggests this is java_part_char_p.  The lookup has the same
   shape as the start-character test, with LETTER_PART as the flag
   that is finally tested.  */
762 unsigned int hi = c / 256;
763 char *page = type_table[hi];
764 unsigned long val = (unsigned long) page;
/* An entry with bits set outside the two flag bits is used as a
   pointer to a table indexed by the low byte of C.  */
767 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
768 flags = page[c & 255];
772 return flags & LETTER_PART;
/* Parse the escape sequence that follows a backslash in a character or
   string literal and return the character it stands for.  The next
   character is read with java_get_unicode.  Simple escapes map to a
   fixed code; an octal escape collects up to max digits.  Any other
   character is reported and JAVA_CHAR_ERROR is returned.
   NOTE(review): the case labels of the simple escapes are not visible
   here; the trailing notes below name the ASCII character of each
   returned code.  */
776 java_parse_escape_sequence ()
781 switch (c = java_get_unicode ())
784 return (unicode_t)0x8; /* backspace */
786 return (unicode_t)0x9; /* horizontal tab */
788 return (unicode_t)0xa; /* line feed */
790 return (unicode_t)0xc; /* form feed */
792 return (unicode_t)0xd; /* carriage return */
794 return (unicode_t)0x22; /* double quote */
796 return (unicode_t)0x27; /* single quote */
798 return (unicode_t)0x5c; /* backslash */
799 case '0': case '1': case '2': case '3': case '4':
800 case '5': case '6': case '7':
803 int octal_escape_index = 0;
807 for (; octal_escape_index < max && RANGE (c, '0', '7');
808 c = java_get_unicode ())
810 if (octal_escape_index == 0 && c > '3')
812 /* According to the grammar, `\477' has a well-defined
813 meaning -- it is `\47' followed by `7'. */
816 octal_escape [octal_escape_index++] = c;
/* The character that ended the digit run is not part of the escape.  */
819 java_unget_unicode ();
/* Combine the digits, three bits each, most significant first.  */
821 for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
822 i < octal_escape_index; i++, shift -= 3)
823 char_lit |= (octal_escape [i] - '0') << shift;
828 java_lex_error ("Invalid character in escape sequence", 0);
829 return JAVA_CHAR_ERROR;
/* NOTE(review): this part of the listing holds the tail of the argument
   structure, the IS_ZERO helper macro and java_perform_atof.
   java_perform_atof receives its arguments through AV, converts
   a->literal_token to a real value of float or double type depending
   on a->fflag, reports a range error for an infinite or NaN result,
   checks a zero result for underflow by scanning the mantissa of the
   literal, and hands build_real (type, value) to SET_LVAL_NODE_TYPE.  */
833 /* Isolate the code which may raise an arithmetic exception in its
842 int number_beginning;
845 #ifdef REAL_ARITHMETIC
846 #define IS_ZERO(X) (ereal_cmp (X, dconst0) == 0)
848 #define IS_ZERO(X) ((X) == 0)
851 static void java_perform_atof PARAMS ((PTR));
854 java_perform_atof (av)
857 struct jpa_args *a = (struct jpa_args *)av;
858 YYSTYPE *java_lval = a->java_lval;
859 int number_beginning = a->number_beginning;
860 REAL_VALUE_TYPE value;
861 tree type = (a->fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
863 SET_REAL_VALUE_ATOF (value,
864 REAL_VALUE_ATOF (a->literal_token, TYPE_MODE (type)));
866 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
868 JAVA_FLOAT_RANGE_ERROR ((a->fflag ? "float" : "double"));
871 else if (IS_ZERO (value))
873 /* We check to see if the value is really 0 or if we've found an
874 underflow. We do this in the most primitive imaginable way. */
876 char *p = a->literal_token;
/* Scan the mantissa only: stop at the exponent marker, if any.  */
879 while (*p && *p != 'e' && *p != 'E')
881 if (*p != '0' && *p != '.')
/* Report the error at the start of the number, then restore the
   current position in the line.  */
890 int i = ctxp->c_line->current;
891 ctxp->c_line->current = number_beginning;
892 java_lex_error ("Floating point literal underflow", 0);
893 ctxp->c_line->current = i;
897 SET_LVAL_NODE_TYPE (build_real (type, value), type);
/* The lexer proper, referred to as java_lex (yylex) in the comment at
   the top of this file; its header is not visible here.  It skips
   white space and comments, then recognizes in turn numeric literals,
   character literals, string literals, separators and operators, and
   finally keywords and identifiers, filling java_lval as it goes.
   Anything else is reported as an invalid character.  */
901 static int yylex PARAMS ((YYSTYPE *));
912 unicode_t first_unicode;
913 int ascii_index, all_ascii;
916 /* Translation of the Unicode escape in the raw stream of Unicode
917 characters. Takes care of line terminator. */
919 /* Skip white spaces: SP, TAB and FF or ULT */
920 for (c = java_get_unicode ();
921 c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
924 ctxp->elc.line = ctxp->c_line->lineno;
925 ctxp->elc.col = ctxp->c_line->char_col-2;
928 ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
930 if (c == 0x1a) /* CTRL-Z */
932 if ((c = java_get_unicode ()) == UEOF)
933 return 0; /* Ok here */
935 java_unget_unicode (); /* Caught later, at the end of the function */
937 /* Handle EOF here */
938 if (c == UEOF) /* Should probably do something here... */
941 /* Take care of eventual comments. */
944 switch (c = java_get_unicode ())
949 c = java_get_unicode ();
952 /* It is ok to end a `//' comment with EOF, unless
953 we're being pedantic. */
955 java_lex_error ("Comment not terminated at end of input",
959 if (c == '\n') /* ULT */
965 if ((c = java_get_unicode ()) == '*')
967 if ((c = java_get_unicode ()) == '/')
968 goto step1; /* Empty documentation comment */
969 else if (java_parse_doc_section (c))
973 java_parse_end_comment ((c = java_get_unicode ()));
977 java_unget_unicode ();
/* Record where the token starts, for error reporting.  */
983 ctxp->elc.line = ctxp->c_line->lineno;
984 ctxp->elc.prev_col = ctxp->elc.col;
985 ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
986 if (ctxp->elc.col < 0)
989 /* Numeric literals */
990 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
992 /* This section of code is borrowed from gcc/c-lex.c */
993 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
994 int parts[TOTAL_PARTS];
995 HOST_WIDE_INT high, low;
996 /* End borrowed section */
/* NOTE(review): no bounds check on literal_index is visible in the
   loops below; a numeric literal longer than 255 characters would
   overrun this buffer -- confirm.  */
997 char literal_token [256];
998 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
999 int found_hex_digits = 0;
1002 int number_beginning = ctxp->c_line->current;
1006 /* We might have a . separator instead of a FP like .[0-9]* */
1009 unicode_t peep = java_sneak_unicode ();
1011 if (!JAVA_ASCII_DIGIT (peep))
1014 BUILD_OPERATOR (DOT_TK);
1018 for (i = 0; i < TOTAL_PARTS; i++)
1023 c = java_get_unicode ();
1024 if (c == 'x' || c == 'X')
1027 c = java_get_unicode ();
1029 else if (JAVA_ASCII_DIGIT (c))
1033 /* Push the '.' back and prepare for a FP parsing... */
1034 java_unget_unicode ();
1039 /* We have a zero literal: 0, 0{f,F}, 0{d,D} */
1040 JAVA_LEX_LIT ("0", 10);
1044 SET_LVAL_NODE (long_zero_node);
1045 return (INT_LIT_TK);
1047 SET_LVAL_NODE (float_zero_node);
1050 SET_LVAL_NODE (double_zero_node);
1053 java_unget_unicode ();
1054 SET_LVAL_NODE (integer_zero_node);
1055 return (INT_LIT_TK);
1059 /* Parse the first part of the literal, until we find something
1060 which is not a number. */
1061 while ((radix == 10 && JAVA_ASCII_DIGIT (c)) ||
1062 (radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1063 (radix == 8 && JAVA_ASCII_OCTDIGIT (c)))
1065 /* We store in a string (in case it turns out to be a FP) and in
1066 PARTS if we have to process an integer literal. */
1067 int numeric = (ISDIGIT (c) ? c-'0' : 10 +(c|0x20)-'a');
1070 /* Remember when we find a valid hexadecimal digit */
1072 found_hex_digits = 1;
1074 literal_token [literal_index++] = c;
1075 /* This section of code is borrowed from gcc/c-lex.c */
1076 for (count = 0; count < TOTAL_PARTS; count++)
1078 parts[count] *= radix;
1081 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1082 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1085 parts[0] += numeric;
1087 if (parts [TOTAL_PARTS-1] != 0)
1089 /* End borrowed section. */
1090 c = java_get_unicode ();
1093 /* If we have something from the FP char set but not a digit, parse
1095 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1098 int seen_digit = (literal_index ? 1 : 0);
1099 int seen_exponent = 0;
1100 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1101 double unless specified. */
1103 /* It is ok if the radix is 8 because this just means we've
1104 seen a leading `0'. However, radix==16 is invalid. */
1106 java_lex_error ("Can't express non-decimal FP literal", 0);
1116 literal_token [literal_index++ ] = c;
1117 c = java_get_unicode ();
1120 java_lex_error ("Invalid character in FP literal", 0);
1123 if (c == 'e' || c == 'E')
1127 /* {E,e} must have seen at least a digit */
1129 java_lex_error ("Invalid FP literal", 0);
1133 literal_token [literal_index++] = c;
1134 c = java_get_unicode ();
1137 java_lex_error ("Invalid character in FP literal", 0);
1139 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1141 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1142 stage = 4; /* So we fall through */
1145 if ((c=='-' || c =='+') && stage == 2)
1148 literal_token [literal_index++] = c;
1149 c = java_get_unicode ();
1152 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1153 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1154 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1155 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1157 if (JAVA_ASCII_DIGIT (c))
1159 literal_token [literal_index++ ] = c;
1160 c = java_get_unicode ();
1167 if (stage != 4) /* Don't push back fF/dD */
1168 java_unget_unicode ();
1170 /* An exponent (if any) must have seen a digit. */
1171 if (seen_exponent && !seen_digit)
1172 java_lex_error ("Invalid FP literal", 0);
1174 literal_token [literal_index] = '\0';
1175 JAVA_LEX_LIT (literal_token, radix);
/* The conversion itself runs under do_float_handler, with its
   arguments passed through the structure a.  */
1178 a.literal_token = literal_token;
1180 a.java_lval = java_lval;
1181 a.number_beginning = number_beginning;
1182 if (do_float_handler (java_perform_atof, (PTR) &a))
1185 JAVA_FLOAT_RANGE_ERROR ((fflag ? "float" : "double"));
1191 } /* JAVA_ASCII_FPCHAR (c) */
1193 if (radix == 16 && ! found_hex_digits)
1195 ("0x must be followed by at least one hexadecimal digit", 0);
1197 /* Here we get back to converting the integral literal. */
1198 if (c == 'L' || c == 'l')
1200 else if (radix == 16 && JAVA_ASCII_LETTER (c))
1201 java_lex_error ("Digit out of range in hexadecimal literal", 0);
1202 else if (radix == 8 && JAVA_ASCII_DIGIT (c))
1203 java_lex_error ("Digit out of range in octal literal", 0);
1204 else if (radix == 16 && !literal_index)
1205 java_lex_error ("No digit specified for hexadecimal literal", 0);
1207 java_unget_unicode ();
1209 #ifdef JAVA_LEX_DEBUG
1210 literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */
1211 JAVA_LEX_LIT (literal_token, radix);
1213 /* This section of code is borrowed from gcc/c-lex.c */
1216 bytes = GET_TYPE_PRECISION (long_type_node);
1217 for (i = bytes; i < TOTAL_PARTS; i++)
1225 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1227 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1228 / HOST_BITS_PER_CHAR)]
1229 << (i * HOST_BITS_PER_CHAR));
1230 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1232 /* End borrowed section. */
1234 /* Range checking */
1237 /* 9223372036854775808L is valid if operand of a '-'. Otherwise
1238 9223372036854775807L is the biggest `long' literal that can be
1239 expressed using a 10 radix. For other radixes, everything that
1240 fits within 64 bits is OK. */
1241 int hb = (high >> 31);
1242 if (overflow || (hb && low && radix == 10)
1243 || (hb && high & 0x7fffffff && radix == 10))
1244 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1248 /* 2147483648 is valid if operand of a '-'. Otherwise,
1249 2147483647 is the biggest `int' literal that can be
1250 expressed using a 10 radix. For other radixes, everything
1251 that fits within 32 bits is OK. As all literals are
1252 signed, we sign extend here. */
1253 int hb = (low >> 31) & 0x1;
1254 if (overflow || high || (hb && low & 0x7fffffff && radix == 10))
1255 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1259 value = build_int_2 (low, high);
1260 JAVA_RADIX10_FLAG (value) = radix == 10;
1261 SET_LVAL_NODE_TYPE (value, long_suffix ? long_type_node : int_type_node);
1263 SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1264 long_suffix ? long_type_node : int_type_node);
1269 /* Character literals */
1273 if ((c = java_get_unicode ()) == '\\')
1274 char_lit = java_parse_escape_sequence ();
1277 if (c == '\n' || c == '\'')
1278 java_lex_error ("Invalid character literal", 0);
1282 c = java_get_unicode ();
1284 if ((c == '\n') || (c == UEOF))
1285 java_lex_error ("Character literal not terminated at end of line", 0);
1287 java_lex_error ("Syntax error in character literal", 0);
1289 if (char_lit == JAVA_CHAR_ERROR)
1290 char_lit = 0; /* We silently convert it to zero */
1292 JAVA_LEX_CHAR_LIT (char_lit);
1293 SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1297 /* String literals */
1303 for (no_error = 1, c = java_get_unicode ();
1304 c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1307 c = java_parse_escape_sequence ();
1308 if (c == JAVA_CHAR_ERROR)
1311 c = 0; /* We silently convert it to zero. */
1313 java_unicode_2_utf8 (c);
1315 if (c == '\n' || c == UEOF) /* ULT */
1317 lineno--; /* Refer to the line the terminator was seen */
1318 java_lex_error ("String not terminated at end of line", 0);
1322 obstack_1grow (&temporary_obstack, '\0');
1323 string = obstack_finish (&temporary_obstack);
1325 if (!no_error || (c != '"'))
1326 java_lval->node = error_mark_node; /* Requires further testing FIXME */
1328 java_lval->node = build_string (strlen (string), string);
1330 obstack_free (&temporary_obstack, string);
1331 return STRING_LIT_TK;
1339 BUILD_OPERATOR (OP_TK);
1345 if (ctxp->ccb_indent == 1)
1346 ctxp->first_ccb_indent1 = lineno;
1348 BUILD_OPERATOR (OCB_TK);
1352 if (ctxp->ccb_indent == 1)
1353 ctxp->last_ccb_indent1 = lineno;
1354 BUILD_OPERATOR (CCB_TK);
1357 BUILD_OPERATOR (OSB_TK);
1369 BUILD_OPERATOR (DOT_TK);
1370 /* return DOT_TK; */
1377 if ((c = java_get_unicode ()) == '=')
1379 BUILD_OPERATOR (EQ_TK);
1383 /* Equals is used in two different locations. In the
1384 variable_declarator: rule, it has to be seen as '=' as opposed
1385 to being seen as an ordinary assignment operator in
1386 assignment_operators: rule. */
1387 java_unget_unicode ();
1388 BUILD_OPERATOR (ASSIGN_TK);
1392 switch ((c = java_get_unicode ()))
1395 BUILD_OPERATOR (GTE_TK);
1397 switch ((c = java_get_unicode ()))
1400 if ((c = java_get_unicode ()) == '=')
1402 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1406 java_unget_unicode ();
1407 BUILD_OPERATOR (ZRS_TK);
1410 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1412 java_unget_unicode ();
1413 BUILD_OPERATOR (SRS_TK);
1416 java_unget_unicode ();
1417 BUILD_OPERATOR (GT_TK);
1421 switch ((c = java_get_unicode ()))
1424 BUILD_OPERATOR (LTE_TK);
1426 if ((c = java_get_unicode ()) == '=')
1428 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1432 java_unget_unicode ();
1433 BUILD_OPERATOR (LS_TK);
1436 java_unget_unicode ();
1437 BUILD_OPERATOR (LT_TK);
1441 switch ((c = java_get_unicode ()))
1444 BUILD_OPERATOR (BOOL_AND_TK);
1446 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1448 java_unget_unicode ();
1449 BUILD_OPERATOR (AND_TK);
1453 switch ((c = java_get_unicode ()))
1456 BUILD_OPERATOR (BOOL_OR_TK);
1458 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1460 java_unget_unicode ();
1461 BUILD_OPERATOR (OR_TK);
1465 switch ((c = java_get_unicode ()))
1468 BUILD_OPERATOR (INCR_TK);
1470 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1472 java_unget_unicode ();
1473 BUILD_OPERATOR (PLUS_TK);
1477 switch ((c = java_get_unicode ()))
1480 BUILD_OPERATOR (DECR_TK);
1482 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1484 java_unget_unicode ();
1485 BUILD_OPERATOR (MINUS_TK);
1489 if ((c = java_get_unicode ()) == '=')
1491 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1495 java_unget_unicode ();
1496 BUILD_OPERATOR (MULT_TK);
1500 if ((c = java_get_unicode ()) == '=')
1502 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1506 java_unget_unicode ();
1507 BUILD_OPERATOR (DIV_TK);
1511 if ((c = java_get_unicode ()) == '=')
1513 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1517 java_unget_unicode ();
1518 BUILD_OPERATOR (XOR_TK);
1522 if ((c = java_get_unicode ()) == '=')
1524 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1528 java_unget_unicode ();
1529 BUILD_OPERATOR (REM_TK);
1533 if ((c = java_get_unicode()) == '=')
1535 BUILD_OPERATOR (NEQ_TK);
1539 java_unget_unicode ();
1540 BUILD_OPERATOR (NEG_TK);
1545 BUILD_OPERATOR (REL_QM_TK);
1548 BUILD_OPERATOR (REL_CL_TK);
1550 BUILD_OPERATOR (NOT_TK);
1553 /* Keyword, boolean literal or null literal */
1554 for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1555 JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1557 java_unicode_2_utf8 (c);
1558 if (all_ascii && c >= 128)
1563 obstack_1grow (&temporary_obstack, '\0');
1564 string = obstack_finish (&temporary_obstack);
1565 java_unget_unicode ();
1567 /* If we have something all ascii, we consider a keyword, a boolean
1568 literal, a null literal or an all ASCII identifier. Otherwise,
1569 this is an identifier (possibly not respecting formation rule). */
1572 struct java_keyword *kw;
1573 if ((kw=java_keyword (string, ascii_index)))
1575 JAVA_LEX_KW (string);
1578 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1579 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1580 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1581 case PRIVATE_TK: case STRICT_TK:
1582 SET_MODIFIER_CTX (kw->token);
1585 SET_LVAL_NODE (float_type_node);
1588 SET_LVAL_NODE (double_type_node);
1591 SET_LVAL_NODE (boolean_type_node);
1594 SET_LVAL_NODE (byte_type_node);
1597 SET_LVAL_NODE (short_type_node);
1600 SET_LVAL_NODE (int_type_node);
1603 SET_LVAL_NODE (long_type_node);
1606 SET_LVAL_NODE (char_type_node);
1609 /* Keyword based literals */
1612 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1613 boolean_true_node : boolean_false_node));
1616 SET_LVAL_NODE (null_pointer_node);
1619 /* Some keyword we want to retain information on the location
1632 BUILD_OPERATOR (kw->token);
1640 /* We may have an ID here */
1641 if (JAVA_START_CHAR_P (first_unicode))
1643 JAVA_LEX_ID (string);
1644 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1648 /* Everything else is an invalid character in the input */
1650 char lex_error_buffer [128];
1651 sprintf (lex_error_buffer, "Invalid character `%s' in input",
1652 java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1653 java_lex_error (lex_error_buffer, 1);
1659 /* This is called by the parser to see if an error should be generated
1660 due to numeric overflow. This function only handles the particular
1661 case of the largest negative value, and is only called in the case
1662 where this value is not preceded by `-'. */
/* VALUE is the tree node to inspect.  Nothing is checked unless it is
   an INTEGER_CST for which JAVA_RADIX10_FLAG is set.  An overflow is
   reported through java_lex_error.  */
1664 error_if_numeric_overflow (value)
1667 if (TREE_CODE (value) == INTEGER_CST && JAVA_RADIX10_FLAG (value))
1669 unsigned HOST_WIDE_INT lo, hi;
/* Fetch the low and high words of the constant.  */
1671 lo = TREE_INT_CST_LOW (value);
1672 hi = TREE_INT_CST_HIGH (value);
/* `long' literal: HB is HI shifted right by 31 (narrowed to int); the
   error fires when HB is non-zero while the low 31 bits of HI are all
   clear.  NOTE(review): LO is not consulted on this path and the test
   looks written for a 32-bit high word -- confirm against the actual
   width of HOST_WIDE_INT.  */
1673 if (TREE_TYPE (value) == long_type_node)
1675 int hb = (hi >> 31);
1676 if (hb && !(hi & 0x7fffffff))
1677 java_lex_error ("Numeric overflow for `long' literal", 0);
/* `int' literal check: fires when bit 31 of LO is set and the low 31
   bits of LO are all clear.  */
1681 int hb = (lo >> 31) & 0x1;
1682 if (hb && !(lo & 0x7fffffff))
1683 java_lex_error ("Numeric overflow for `int' literal", 0);
1687 #endif /* JC1_LITE */
/* Append the encoding of the character UNICODE to temporary_obstack,
   as one, two or three bytes depending on the range UNICODE falls in.  */
1690 java_unicode_2_utf8 (unicode)
/* RANGE (0x01, 0x7f): one byte holding the value itself.  */
1693 if (RANGE (unicode, 0x01, 0x7f))
1694 obstack_1grow (&temporary_obstack, (char)unicode);
/* RANGE (0x80, 0x7ff), and also 0: two bytes, 0xc0 | bits 10..6 and
   then 0x80 | bits 5..0.  Zero therefore comes out as 0xc0 0x80 rather
   than as a single NUL byte.  */
1695 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
1697 obstack_1grow (&temporary_obstack,
1698 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1699 obstack_1grow (&temporary_obstack,
1700 (unsigned char)(0x80 | (unicode & 0x3f)));
/* Everything else: three bytes, 0xe0 | bits 15..12, 0x80 | bits 11..6
   and 0x80 | bits 5..0.  The shifts bind tighter than the `|'.  */
1702 else /* Range 0x800-0xffff */
1704 obstack_1grow (&temporary_obstack,
1705 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1706 obstack_1grow (&temporary_obstack,
1707 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1708 obstack_1grow (&temporary_obstack,
1709 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Wrap NODE with build_expr_wfl, passing the file name and the
   line/column stored in ctxp->elc, then clear the type of the
   resulting node.  */
1715 build_wfl_node (node)
1718 node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1719 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1720 TREE_TYPE (node) = NULL_TREE;
/* Record the position of a lexical error in ctxp->elc and clear
   ctxp->java_error_flag.  MSG is the error text supplied by the caller
   (it is not referenced in the lines shown here); FORWARD is added to
   the recorded column.
   NOTE(review): both parameters are marked ATTRIBUTE_UNUSED although
   FORWARD is read below -- presumably the body is conditionally
   compiled; confirm.  */
1726 java_lex_error (msg, forward)
1727 const char *msg ATTRIBUTE_UNUSED;
1728 int forward ATTRIBUTE_UNUSED;
/* The line is the current line's lineno; the column is its char_col,
   minus one, plus FORWARD.  */
1731 ctxp->elc.line = ctxp->c_line->lineno;
1732 ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1734 /* Might be caught in the middle of some error report */
1735 ctxp->java_error_flag = 0;
1752 if (next != '\n' && next != EOF)
/* Build in temporary_obstack, and return through obstack_finish, a
   string describing position LINE/COL of the file FILENAME: the
   characters gathered for the line, a newline, then a run of padding
   followed by a `^' marker.
   NOTE(review): the parameters are marked ATTRIBUTE_UNUSED although
   they are used below -- presumably for a configuration in which the
   body is compiled out; confirm.  */
1764 java_get_line_col (filename, line, col)
1765 const char *filename ATTRIBUTE_UNUSED;
1766 int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
1771 /* Dumb implementation. Doesn't try to cache or optimize things. */
1772 /* First line of the file is line 1, first column is 1 */
1774 /* COL == -1 means, at the CR/LF in LINE */
1775 /* COL == -2 means, at the first non space char in LINE */
1778 int c, ccol, cline = 1;
1779 int current_line_col = 0;
1780 int first_non_space = 0;
/* The file is reopened for reading on each call; failure to open it is
   reported with fatal_io_error.  */
1783 if (!(fp = fopen (filename, "r")))
1784 fatal_io_error ("can't open %s", filename);
/* Loop until CLINE equals LINE, using java_is_eol to recognize line
   ends.  */
1786 while (cline != line)
/* Placeholder text for a file that ends too early; it is appended
   without its terminating NUL (sizeof (msg) - 1 bytes).  */
1791 static const char msg[] = "<<file too short - unexpected EOF>>";
1792 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1795 if (java_is_eol (fp, c))
1799 /* Gather the chars of the current line in a buffer */
/* Tests for end of input (C < 0) or a line terminator.  */
1803 if (c < 0 || java_is_eol (fp, c))
/* Remember current_line_col at the first character for which
   JAVA_WHITE_SPACE_P is false.  */
1805 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1806 first_non_space = current_line_col;
1807 obstack_1grow (&temporary_obstack, c);
1812 obstack_1grow (&temporary_obstack, '\n');
/* The two COL assignments below presumably implement the COL == -1 and
   COL == -2 cases described above; their guarding conditions are not
   visible here -- confirm.  */
1816 col = current_line_col;
1817 first_non_space = 0;
1820 col = first_non_space;
1822 first_non_space = 0;
1824 /* Place the '^' at the right position */
1825 base = obstack_base (&temporary_obstack);
/* Emit COL + 3 padding characters ahead of the marker.  */
1826 for (ccol = 1; ccol <= col+3; ccol++)
1828 /* Compute \t when reaching first_non_space */
/* With first_non_space non-zero, a tab found in the gathered text at
   this column is repeated as a tab; in every other case a space is
   emitted.  */
1829 char c = (first_non_space ?
1830 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1831 obstack_1grow (&temporary_obstack, c);
/* obstack_grow0 appends the marker and a terminating NUL.  */
1833 obstack_grow0 (&temporary_obstack, "^", 1);
1836 return obstack_finish (&temporary_obstack);
/* Compare the UTF-8 text STR, LENGTH bytes long, against the
   NUL-terminated string NAME.  Characters are decoded from STR with
   UTF8_GET and matched against successive bytes of NAME.  The result
   is CH - NAME[I] for the early return, otherwise 0 when STR was
   consumed exactly up to its end and 1 when it was not.  */
1842 utf8_cmp (str, length, name)
1843 const unsigned char *str;
/* LIMIT points one past the last byte of STR.  */
1847 const unsigned char *limit = str + length;
1850 for (i = 0; name[i]; ++i)
1852 int ch = UTF8_GET (str, limit);
/* NOTE(review): the condition guarding this return is not visible
   here; presumably it is taken when CH differs from NAME[I].  */
1854 return ch - name[i];
1857 return str == limit ? 0 : 1;
1860 /* A sorted list of all C++ keywords. */
/* NOTE(review): the ordering presumably matters because cxx_keyword_p
   searches this table by repeatedly taking the midpoint of an index
   range -- keep it sorted when adding entries.  */
1862 static const char *const cxx_keywords[] =
1970 /* Return true if NAME is a C++ keyword. */
/* NAME holds LENGTH bytes; it is compared against the entries of
   cxx_keywords with utf8_cmp.  */
1973 cxx_keyword_p (name, length)
/* The search range starts with LAST at the size of the table; MID is
   the midpoint of FIRST and LAST and is recomputed after every step.  */
1977 int last = ARRAY_SIZE (cxx_keywords);
1979 int mid = (last + first) / 2;
1982 for (mid = (last + first) / 2;
1984 old = mid, mid = (last + first) / 2)
/* Only the first MIN_LENGTH bytes of NAME -- the shorter of LENGTH and
   the keyword's length -- are passed to utf8_cmp.  */
1986 int kwl = strlen (cxx_keywords[mid]);
1987 int min_length = kwl > length ? length : kwl;
1988 int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
1993 /* We've found a match if all the remaining characters are
1995 for (i = min_length; i < length && name[i] == '$'; ++i)
2009 #endif /* JC1_LITE */