1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
4 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to
20 the Free Software Foundation, 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA.
23 Java and all Java-based marks are trademarks or registered trademarks
24 of Sun Microsystems, Inc. in the United States and other countries.
25 The Free Software Foundation is independent of Sun Microsystems, Inc. */
27 /* It defines java_lex (yylex) that reads a Java ASCII source file
28 possibly containing Unicode escape sequences or UTF-8 encoded
29 characters and returns a token for everything found but comments,
30 white spaces and line terminators. When necessary, it also fills
31 the java_lval (yylval) union. It's implemented to be called by a
32 re-entrant parser generated by Bison.
34 The lexical analysis conforms to the Java grammar described in "The
35 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
36 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
40 #include "chartables.h"
/* Prototypes for the helpers used by this lexer, followed by the two
   file-scope flags that record the outcome of the iconv byte-order
   probe.  Every prototype below is static except java_new_lexer,
   which is declared here without a storage class.  */
45 /* Function declarations. */
46 static char *java_sprint_unicode (int);
47 static void java_unicode_2_utf8 (unicode_t);
48 static void java_lex_error (const char *, int);
50 static int do_java_lex (YYSTYPE *);
51 static int java_lex (YYSTYPE *);
52 static int java_is_eol (FILE *, int);
53 static tree build_wfl_node (tree);
55 static int java_parse_escape_sequence (void);
56 static int java_start_char_p (unicode_t);
57 static int java_part_char_p (unicode_t);
58 static int java_space_char_p (unicode_t);
59 static void java_parse_doc_section (int);
60 static void java_parse_end_comment (int);
61 static int java_read_char (java_lexer *);
62 static int java_get_unicode (void);
63 static int java_peek_unicode (void);
64 static void java_next_unicode (void);
65 static int java_read_unicode (java_lexer *, int *);
67 static int utf8_cmp (const unsigned char *, int, const char *);
70 java_lexer *java_new_lexer (FILE *, const char *);
72 static void error_if_numeric_overflow (tree);
76 /* This is nonzero if we have initialized `need_byteswap'. */
77 static int byteswap_init = 0;
79 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
80 big-endian order -- not native endian order. We handle this by
81 doing a conversion once at startup and seeing what happens. This
82 flag holds the results of this determination. */
83 static int need_byteswap = 0;
/* Prepare the parser context CTXP for lexing a new input stream.

   FINPUT is the stream to read and ENCODING names its character
   encoding; both are handed unchanged to java_new_lexer, whose result
   is stored in ctxp->lexer.  Before that, the function interns the
   identifiers and builds the wfl_* nodes assigned below, and resets
   the per-compilation-unit state held in CTXP (initializer lists,
   modifier context, current class, package, saved location and error
   flag).

   NOTE(review): the return type, braces and several statements of
   this definition are not visible in this view.  */
87 java_init_lex (FILE *finput, const char *encoding)
/* As far as the visible lines show, this flag is a local that starts
   at zero, so the import branch below is taken -- TODO confirm
   whether a conditional-compilation line is missing here.  */
90 int java_lang_imported = 0;
93 java_lang_id = get_identifier ("java.lang");
95 inst_id = get_identifier ("inst$");
97 wpv_id = get_identifier ("write_parm_value$");
99 if (!java_lang_imported)
101 tree node = build_tree_list (build_unknown_wfl (java_lang_id),
103 read_import_dir (TREE_PURPOSE (node));
/* Push the new node on the front of the import-on-demand list.  */
104 TREE_CHAIN (node) = ctxp->import_demand_list;
105 ctxp->import_demand_list = node;
106 java_lang_imported = 1;
/* build_expr_wfl is called with a location value in one configuration
   and with a file name plus two zero arguments in the other.  */
111 #ifdef USE_MAPPED_LOCATION
112 wfl_operator = build_expr_wfl (NULL_TREE, input_location);
114 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
118 label_id = get_identifier ("$L");
120 wfl_append = build_unknown_wfl (get_identifier ("append"));
/* The string buffer class name depends on flag_emit_class_files.  */
121 if (!wfl_string_buffer)
123 build_unknown_wfl (get_identifier (flag_emit_class_files
124 ? "java.lang.StringBuffer"
125 : "gnu.gcj.runtime.StringBuffer"));
127 wfl_to_string = build_unknown_wfl (get_identifier ("toString"));
129 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
130 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
132 memset (ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
133 ctxp->current_parsed_class = NULL;
134 ctxp->package = NULL_TREE;
137 ctxp->save_location = input_location;
138 ctxp->java_error_flag = 0;
139 ctxp->lexer = java_new_lexer (finput, encoding);
/* Format the character C for use in a diagnostic.

   A value below the space character or at or above 127 is written
   into a function-local static buffer as a backslash-u escape with at
   least four lower-case hex digits.  The handling of the remaining
   (printable ASCII) values and the return statement are not visible
   in this view.

   Because the buffer is static, a result is presumably only valid
   until the next call -- TODO confirm against the missing return.

   NOTE(review): the conversion width 04 is a minimum, not a maximum.
   The buffer holds 10 bytes, which fits the two prefix characters, up
   to seven hex digits and the terminator.  A negative C or one above
   0xfffffff would not fit -- confirm that callers only pass 16-bit
   character values.  */
143 java_sprint_unicode (int c)
145 static char buffer [10];
146 if (c < ' ' || c >= 127)
147 sprintf (buffer, "\\u%04x", c);
/* Allocate and initialise a java_lexer that reads FINPUT, whose bytes
   are in the character encoding named by ENCODING.

   The visible steps are:
   - allocate the object with xmalloc and reset the pushback slot, the
     one-character lookahead cache and the line and column position;
   - open an iconv conversion from ENCODING to UCS-2;
   - probe a UTF-8 to UCS-2 conversion of the byte order mark and
     record in need_byteswap whether the result differs from 0xfeff,
     then copy that flag into the lexer;
   - otherwise select the built-in decoder by setting use_fallback and
     forcing the encoding name to UTF-8;
   - report an unknown encoding through fatal_error.

   NOTE(review): the conditions that choose between these paths, the
   return statement and the start of the HAVE_ICONV region are not
   visible in this view; the summary above only lists what the
   visible statements do.  */
156 /* Create a new lexer object. */
159 java_new_lexer (FILE *finput, const char *encoding)
161 java_lexer *lex = xmalloc (sizeof (java_lexer));
164 lex->finput = finput;
166 lex->unget_value = 0;
167 lex->next_unicode = 0;
168 lex->avail_unicode = 0;
169 lex->next_columns = 1;
170 lex->encoding = encoding;
171 lex->position.line = 1;
172 lex->position.col = 1;
174 #ifdef USE_MAPPED_LOCATION
176 = linemap_line_start (&line_table, 1, 120);
183 lex->handle = iconv_open ("UCS-2", encoding);
184 if (lex->handle != (iconv_t) -1)
190 lex->read_anything = 0;
191 lex->use_fallback = 0;
193 /* Work around broken iconv() implementations by doing checking at
194 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
195 then all UCS-2 encoders will be broken. Perhaps not a valid
203 handle = iconv_open ("UCS-2", "UTF-8");
204 if (handle != (iconv_t) -1)
211 /* This is the UTF-8 encoding of \ufeff. */
218 outp = (char *) &result;
221 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
223 iconv_close (handle);
224 /* Conversion must be complete for us to use the result. */
225 if (r != (size_t) -1 && inc == 0 && outc == 0)
226 need_byteswap = (result != 0xfeff);
230 lex->byte_swap = need_byteswap;
233 #endif /* HAVE_ICONV */
235 /* If iconv failed, use the internal decoder if the default
236 encoding was requested. This code is used on platforms where
237 iconv exists but is insufficient for our needs. For
238 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
240 On Solaris the default encoding, as returned by nl_langinfo(),
241 is `646' (aka ASCII), but the Solaris iconv_open() doesn't
242 understand that. We work around that by pretending
243 `646' to be the same as UTF-8. */
244 if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
249 lex->use_fallback = 1;
250 lex->encoding = "UTF-8";
252 #endif /* HAVE_ICONV */
256 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
/* Release the resources owned by LEX.

   When LEX is not using the built-in fallback decoder, its iconv
   conversion handle is closed.  Any further cleanup, such as freeing
   LEX itself, is not visible in this view -- TODO confirm.  */
262 java_destroy_lexer (java_lexer *lex)
265 if (! lex->use_fallback)
266 iconv_close (lex->handle);
/* Read one character from the input stream of LEX, before any
   Unicode escape processing, and return it as a unicode_t value.

   Two decoding paths are visible:

   - iconv path (use_fallback is zero).  Raw bytes are read with fread
     into lex->buffer, with first and last delimiting the unconsumed
     part.  They are converted by iconv into lex->out_buffer, with
     out_first and out_last delimiting the converted part.  Converted
     units are byte-swapped pairwise in the loop that exchanges
     adjacent bytes.  When iconv stops in the middle of a multi-byte
     sequence, the unconsumed tail is moved to the front of the buffer
     so that more input can be appended.  Other conversion failures
     are reported through java_lex_error.

   - fallback path.  Bytes are read with getc and decoded as UTF-8
     sequences of two or three bytes.  A two-byte result is accepted
     only if it is zero or lies in 0x80..0x7ff.  A three-byte result
     must lie in 0x800..0xffff and must not be a surrogate, 0xfffe or
     0xffff.  Anything else is reported as a malformed UTF-8
     character.

   NOTE(review): out_buffer is read through a cast to a unicode_t
   pointer.  Whether that is alignment- and aliasing-safe depends on
   the declared type of out_buffer, which is not visible here.

   NOTE(review): end-of-file handling, the return statements and the
   conditions guarding several of the statements below are among the
   lines missing from this view.  */
272 java_read_char (java_lexer *lex)
275 if (! lex->use_fallback)
277 size_t ir, inbytesleft, in_save, out_count, out_save;
281 /* If there is data which has already been converted, use it. */
282 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
289 /* See if we need to read more data. If FIRST == 0 then
290 the previous conversion attempt ended in the middle of
291 a character at the end of the buffer. Otherwise we
292 only have to read if the buffer is empty. */
293 if (lex->first == 0 || lex->first >= lex->last)
297 if (lex->first >= lex->last)
302 if (feof (lex->finput))
304 r = fread (&lex->buffer[lex->last], 1,
305 sizeof (lex->buffer) - lex->last,
310 inbytesleft = lex->last - lex->first;
311 out_count = sizeof (lex->out_buffer) - lex->out_last;
313 if (inbytesleft == 0)
315 /* We've tried to read and there is nothing left. */
319 in_save = inbytesleft;
320 out_save = out_count;
321 inp = &lex->buffer[lex->first];
322 outp = (char *) &lex->out_buffer[lex->out_last];
323 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
324 &inbytesleft, &outp, &out_count);
326 /* If we haven't read any bytes, then look to see if we
328 if (! lex->read_anything && out_save - out_count >= 2)
330 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
336 else if (uc == 0xfffe)
341 lex->read_anything = 1;
347 for (i = 0; i < out_save - out_count; i += 2)
349 char t = lex->out_buffer[lex->out_last + i];
350 lex->out_buffer[lex->out_last + i]
351 = lex->out_buffer[lex->out_last + i + 1];
352 lex->out_buffer[lex->out_last + i + 1] = t;
356 lex->first += in_save - inbytesleft;
357 lex->out_last += out_save - out_count;
359 /* If we converted anything at all, move along. */
360 if (out_count != out_save)
363 if (ir == (size_t) -1)
367 /* This is ok. This means that the end of our buffer
368 is in the middle of a character sequence. We just
369 move the valid part of the buffer to the beginning
371 memmove (&lex->buffer[0], &lex->buffer[lex->first],
372 lex->last - lex->first);
373 lex->last -= lex->first;
378 /* A more serious error. */
381 "Unrecognized character for encoding '%s'",
383 java_lex_error (buffer, 0);
390 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
392 /* Don't have any data. */
397 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
402 #endif /* HAVE_ICONV */
405 c = getc (lex->finput);
410 return (unicode_t) c;
413 if ((c & 0xe0) == 0xc0)
415 c1 = getc (lex->finput);
416 if ((c1 & 0xc0) == 0x80)
418 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
419 /* Check for valid 2-byte characters. We explicitly
420 allow \0 because this encoding is common in the
422 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
426 else if ((c & 0xf0) == 0xe0)
428 c1 = getc (lex->finput);
429 if ((c1 & 0xc0) == 0x80)
431 c2 = getc (lex->finput);
432 if ((c2 & 0xc0) == 0x80)
434 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
437 /* Check for valid 3-byte characters.
438 Don't allow surrogate, \ufffe or \uffff. */
439 if (IN_RANGE (r, 0x800, 0xffff)
440 && ! IN_RANGE (r, 0xd800, 0xdfff)
441 && r != 0xfffe && r != 0xffff)
447 /* We simply don't support invalid characters. We also
448 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
449 cannot be valid Java characters. */
450 java_lex_error ("malformed UTF-8 character", 0);
454 /* We only get here on error. */
/* Return the next character of LEX after translating Unicode escapes
   (a backslash, one or more letters u, then hex digits).

   *UNICODE_ESCAPE_P is cleared on entry and set to 1 on the path that
   has decoded an escape.  The one-slot pushback lex->unget_value is
   consumed first when it is non-zero, and is refilled when the
   function has read one character too many.

   A backslash only introduces an escape according to the parity of
   lex->bs_count, the running count of backslashes seen.  The
   statements that maintain that count are not visible here.

   Errors reported through java_lex_error: a sequence that ends
   prematurely and a non-hex digit inside the sequence.

   NOTE(review): because zero means that the pushback slot is empty, a
   pushed-back NUL character cannot be represented -- confirm that
   this is acceptable to callers.  */
459 java_read_unicode (java_lexer *lex, int *unicode_escape_p)
463 if (lex->unget_value)
465 c = lex->unget_value;
466 lex->unget_value = 0;
469 c = java_read_char (lex);
471 *unicode_escape_p = 0;
480 if ((lex->bs_count) % 2 == 1)
482 /* Odd number of \ seen. */
483 c = java_read_char (lex);
486 unicode_t unicode = 0;
489 /* Recognize any number of `u's in \u. */
490 while ((c = java_read_char (lex)) == 'u')
498 java_lex_error ("prematurely terminated \\u sequence", 0);
/* Each hex digit is merged into the result at the current shift.  */
503 unicode |= (unicode_t)(hex_value (c) << shift);
506 java_lex_error ("non-hex digit in \\u sequence", 0);
510 c = java_read_char (lex);
516 lex->unget_value = c;
519 *unicode_escape_p = 1;
522 lex->unget_value = c;
524 return (unicode_t) '\\';
527 /* Get the next Unicode character (post-Unicode-escape-handling).
528 Move the current position to just after returned character. */
/* Implemented as java_peek_unicode followed by java_next_unicode.
   The peeked value is kept in NEXT; the return statement is not
   visible in this view.  */
531 java_get_unicode (void)
533 int next = java_peek_unicode ();
534 java_next_unicode ();
538 /* Return the next Unicode character (post-Unicode-escape-handling).
539 Do not move the current position, which remains just before
540 the returned character. */
/* The result is cached in lex->next_unicode and flagged by
   lex->avail_unicode, so repeated peeks return the cached value
   without reading more input.

   Along with the character, the function computes lex->next_columns,
   the amount that java_next_unicode adds to the column when the
   character is consumed.  The visible assignments are:
   - 0;
   - 1 minus the current column, which brings the column back to 1
     once added;
   - for a tab, the distance to the next tab stop, with stops every
     8 columns;
   - 1;
   - 6 when the character came from a Unicode escape.
   NOTE(review): the conditions selecting the first, second and
   fourth value are not visible here.  */
543 java_peek_unicode (void)
545 int unicode_escape_p;
546 java_lexer *lex = ctxp->lexer;
547 if (lex->avail_unicode)
548 return lex->next_unicode;
551 next = java_read_unicode (lex, &unicode_escape_p);
555 /* We have to read ahead to see if we got \r\n.
556 In that case we return a single line terminator. */
558 next = java_read_unicode (lex, &dummy);
/* Anything other than a line feed or end of input is pushed back.  */
559 if (next != '\n' && next != UEOF)
560 lex->unget_value = next;
561 /* In either case we must return a newline. */
565 lex->next_unicode = next;
566 lex->avail_unicode = 1;
570 lex->next_columns = 0;
576 lex->next_columns = 1 - lex->position.col;
578 else if (next == '\t')
580 int cur_col = lex->position.col;
581 lex->next_columns = ((cur_col + 7) & ~7) + 1 - cur_col;
586 lex->next_columns = 1;
588 if (unicode_escape_p)
589 lex->next_columns = 6;
593 /* Move forward one Unicode character (post-Unicode-escape-handling).
594 Only allowed after java_peek_unicode. The combination java_peek_unicode
595 followed by java_next_unicode is equivalent to java_get_unicode. */
/* Consumes the character cached by java_peek_unicode: the column
   advances by lex->next_columns, a newline also increments the line
   number and updates the current source location, and the cache is
   marked empty by clearing lex->avail_unicode.  */
597 static void java_next_unicode (void)
599 struct java_lexer *lex = ctxp->lexer;
600 lex->position.col += lex->next_columns;
601 if (lex->next_unicode == '\n')
603 lex->position.line++;
605 #ifdef USE_MAPPED_LOCATION
607 = linemap_line_start (&line_table, lex->position.line, 120);
609 input_line = lex->position.line;
613 lex->avail_unicode = 0;
/* NOTE(review): the closing delimiter of the comment that starts on
   the next line is not visible in this view, so the definition that
   follows may be disabled text rather than live code.  The comment
   itself states that the function is not currently used.  Confirm
   against the complete file before relying on it.  */
617 /* The inverse of java_next_unicode.
618 Not currently used, but could be if it would be cleaner or faster.
619 java_peek_unicode == java_get_unicode + java_unget_unicode.
620 java_get_unicode == java_peek_unicode + java_next_unicode.
622 static void java_unget_unicode ()
624 struct java_lexer *lex = ctxp->lexer;
625 if (lex->avail_unicode)
626 fatal_error ("internal error - bad unget");
627 lex->avail_unicode = 1;
628 lex->position.col -= lex->next_columns;
632 /* Parse the end of a C style comment.
633 * C is the first character following the '/' and '*'. */
/* Loops over the comment body one character at a time, peeking at
   the character after a star so that a run of stars is re-examined
   rather than skipped.  Reaching the end of input inside the comment
   is reported through java_lex_error.  The case labels and the exit
   from the loop are not visible in this view.  */
635 java_parse_end_comment (int c)
637 for ( ;; c = java_get_unicode ())
642 java_lex_error ("Comment not terminated at end of input", 0);
645 switch (c = java_peek_unicode ())
648 java_lex_error ("Comment not terminated at end of input", 0);
651 java_next_unicode ();
653 case '*': /* Reparse only '*'. */
/* Summary for readers of this view.  C is the first character of the
   documentation comment body.  The function clears ctxp->deprecated,
   then scans the comment line by line.  On each line it skips white
   space and stars, and compares the following characters with the
   deprecation tag.  It sets ctxp->deprecated to 1 when the tag is
   followed by a space, a newline, a star or any Unicode space
   character.  A star immediately followed by a slash ends the
   comment.  Reaching the end of input first is reported through
   java_lex_error.
   NOTE(review): the loop header, the exits taken at the comment
   terminator and the mismatch handling are not visible here.  */
660 /* Parse the documentation section. Keywords must be at the beginning
661 of a documentation comment line (ignoring white space and any `*'
662 character). Parsed keyword(s): @DEPRECATED. */
665 java_parse_doc_section (int c)
669 /* We reset this here, because only the most recent doc comment
670 applies to the following declaration. */
671 ctxp->deprecated = 0;
673 /* We loop over all the lines of the comment. We'll eventually exit
674 if we hit EOF prematurely, or when we see the comment
678 /* These first steps need only be done if we're still looking
679 for the deprecated tag. If we've already seen it, we might
680 as well skip looking for it again. */
681 if (! ctxp->deprecated)
683 /* Skip whitespace and '*'s. We must also check for the end
684 of the comment here. */
685 while (JAVA_WHITE_SPACE_P (c) || c == '*')
687 last_was_star = (c == '*');
688 c = java_get_unicode ();
689 if (last_was_star && c == '/')
691 /* We just saw the comment terminator. */
701 const char *deprecated = "@deprecated";
704 for (i = 0; deprecated[i]; ++i)
706 if (c != deprecated[i])
708 /* We write the code in this way, with the
709 update at the end, so that after the loop
710 we're left with the next character in C. */
711 c = java_get_unicode ();
717 /* @deprecated must be followed by a space or newline.
718 We also allow a '*' in case it appears just before
719 the end of a comment. In this position only we also
720 must allow any Unicode space character. */
721 if (c == ' ' || c == '\n' || c == '*' || java_space_char_p (c))
724 ctxp->deprecated = 1;
729 /* We've examined the relevant content from this line. Now we
730 skip the remaining characters and start over with the next
731 line. We also check for end of comment here. */
732 while (c != '\n' && c != UEOF)
734 last_was_star = (c == '*');
735 c = java_get_unicode ();
736 if (last_was_star && c == '/')
742 /* We have to advance past the \n. */
743 c = java_get_unicode ();
749 java_lex_error ("Comment not terminated at end of input", 0);
752 /* Return true if C is a valid start character for a Java identifier.
753 This is only called if C >= 128 -- smaller values are handled
754 inline. However, this function handles all values anyway. */
/* The result is the LETTER_START bit of the flags for C, so callers
   should test it for non-zero rather than compare it with 1.  */
756 java_start_char_p (unicode_t c)
/* type_table is indexed by the high byte of C.  */
758 unsigned int hi = c / 256;
759 const char *const page = type_table[hi];
/* NOTE(review): converting the pointer to unsigned long assumes that
   type is at least as wide as a pointer -- confirm for the supported
   hosts.  */
760 unsigned long val = (unsigned long) page;
/* A table entry with any bit set outside LETTER_MASK is used as a
   pointer to a per-page array indexed by the low byte of C.  The
   other branch is not visible here; presumably the entry itself then
   holds the flags shared by the whole page -- TODO confirm.  */
763 if ((val & ~ LETTER_MASK) != 0)
764 flags = page[c & 255];
768 return flags & LETTER_START;
771 /* Return true if C is a valid part character for a Java identifier.
772 This is only called if C >= 128 -- smaller values are handled
773 inline. However, this function handles all values anyway. */
/* The result is the LETTER_PART bit of the flags for C, so callers
   should test it for non-zero rather than compare it with 1.  */
775 java_part_char_p (unicode_t c)
/* type_table is indexed by the high byte of C.  */
777 unsigned int hi = c / 256;
778 const char *const page = type_table[hi];
/* NOTE(review): converting the pointer to unsigned long assumes that
   type is at least as wide as a pointer -- confirm for the supported
   hosts.  */
779 unsigned long val = (unsigned long) page;
/* A table entry with any bit set outside LETTER_MASK is used as a
   pointer to a per-page array indexed by the low byte of C.  The
   other branch is not visible here; presumably the entry itself then
   holds the flags shared by the whole page -- TODO confirm.  */
782 if ((val & ~ LETTER_MASK) != 0)
783 flags = page[c & 255];
787 return flags & LETTER_PART;
790 /* Return true if C is whitespace. */
/* The result is the LETTER_SPACE bit of the flags for C, so callers
   should test it for non-zero rather than compare it with 1.  */
792 java_space_char_p (unicode_t c)
/* type_table is indexed by the high byte of C.  */
794 unsigned int hi = c / 256;
795 const char *const page = type_table[hi];
/* NOTE(review): converting the pointer to unsigned long assumes that
   type is at least as wide as a pointer -- confirm for the supported
   hosts.  */
796 unsigned long val = (unsigned long) page;
/* A table entry with any bit set outside LETTER_MASK is used as a
   pointer to a per-page array indexed by the low byte of C.  The
   other branch is not visible here; presumably the entry itself then
   holds the flags shared by the whole page -- TODO confirm.  */
799 if ((val & ~ LETTER_MASK) != 0)
800 flags = page[c & 255];
804 return flags & LETTER_SPACE;
/* Parse the remainder of a backslash escape inside a character or
   string literal and return the character it denotes.

   The next character is read with java_get_unicode and dispatched in
   a switch.  The visible return values are 0x8, 0x9, 0xa, 0xc, 0xd,
   0x22, 0x27 and 0x5c.  The case labels for them are not visible in
   this view; presumably they are the usual single-letter escapes
   plus the two quotes and the backslash -- TODO confirm.

   An octal escape starts with a digit 0..7.  Its value is built up
   as 8 * value + digit, peeking at each following character and
   stopping at the first one outside 0..7.  The limit on the number
   of digits is in lines not visible here.

   Any other character is reported through java_lex_error, and
   JAVA_CHAR_ERROR is returned.  */
808 java_parse_escape_sequence (void)
812 switch (c = java_get_unicode ())
815 return (unicode_t)0x8;
817 return (unicode_t)0x9;
819 return (unicode_t)0xa;
821 return (unicode_t)0xc;
823 return (unicode_t)0xd;
825 return (unicode_t)0x22;
827 return (unicode_t)0x27;
829 return (unicode_t)0x5c;
830 case '0': case '1': case '2': case '3': case '4':
831 case '5': case '6': case '7':
834 unicode_t char_lit = 0;
838 /* According to the grammar, `\477' has a well-defined
839 meaning -- it is `\47' followed by `7'. */
845 char_lit = 8 * char_lit + c - '0';
848 c = java_peek_unicode ();
849 if (! RANGE (c, '0', '7'))
851 java_next_unicode ();
857 java_lex_error ("Invalid character in escape sequence", -1);
858 return JAVA_CHAR_ERROR;
863 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
865 /* Subroutine of java_lex: converts floating-point literals to tree
866 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
867 store the result. FFLAG indicates whether the literal was tagged
868 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
869 is the column at which the literal starts, used as the position
869 when reporting an underflow. */
871 static void java_perform_atof (YYSTYPE *, char *, int, int);
874 java_perform_atof (YYSTYPE *java_lval, char *literal_token, int fflag,
875 int number_beginning)
877 REAL_VALUE_TYPE value;
/* FFLAG selects the float or the double type node.  */
878 tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
880 SET_REAL_VALUE_ATOF (value,
881 REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
/* An infinite or not-a-number result is reported as a range error.  */
883 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
885 JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
888 else if (IS_ZERO (value))
890 /* We check to see if the value is really 0 or if we've found an
891 underflow. We do this in the most primitive imaginable way. */
893 char *p = literal_token;
/* Scan the mantissa, stopping at the exponent marker if there is
   one.  A character other than zero or the decimal point means the
   literal was written as a non-zero value.  */
896 while (*p && *p != 'e' && *p != 'E')
898 if (*p != '0' && *p != '.')
/* Temporarily move the column back to the start of the literal so
   that the error is reported there, then restore it.  */
907 int save_col = ctxp->lexer->position.col;
908 ctxp->lexer->position.col = number_beginning;
909 java_lex_error ("Floating point literal underflow", 0);
910 ctxp->lexer->position.col = save_col;
914 SET_LVAL_NODE (build_real (type, value));
/* Core tokenizer.  Two definition headers are visible below, yylex
   and do_java_lex; presumably one of them is selected by conditional
   compilation lines that are missing from this view.

   JAVA_LVAL receives the semantic value of the token; the return
   value is the token code.  After skipping white space and recording
   the token start position, the function recognises, in this order:

   - numeric literals.  Digits are stored as text in literal_token
     and accumulated in PARTS using radix 8, 10 or 16.  Floating
     point literals are handed to java_perform_atof.  Integer
     literals are range checked and then converted with
     build_int_cst_wide;
   - identifiers and keywords, collected as UTF-8 on
     temporary_obstack and looked up with java_keyword;
   - character literals and string literals, using
     java_parse_escape_sequence for backslash escapes;
   - comments, which are consumed without producing a token, and the
     operators that begin with a slash;
   - the remaining operators and separators.

   A character that starts none of these is reported as invalid
   through java_lex_error.

   NOTE(review): literal_token is a fixed array of 256 chars, and the
   visible stores through literal_index++ are not preceded by any
   visible bounds check -- confirm that a guard exists in the lines
   missing from this view.

   NOTE(review): most case labels, braces and several statements are
   not visible here, so the list above only reflects the statements
   that can be seen.  */
918 static int yylex (YYSTYPE *);
922 yylex (YYSTYPE *java_lval)
924 do_java_lex (YYSTYPE *java_lval)
930 /* Translation of the Unicode escape in the raw stream of Unicode
931 characters. Takes care of line terminator. */
933 /* Skip white spaces: SP, TAB and FF or ULT. */
936 c = java_peek_unicode ();
937 if (c != '\n' && ! JAVA_WHITE_SPACE_P (c))
939 java_next_unicode ();
942 /* Handle EOF here. */
943 if (c == UEOF) /* Should probably do something here... */
947 #ifdef USE_MAPPED_LOCATION
948 LINEMAP_POSITION_FOR_COLUMN (input_location, &line_table,
949 ctxp->lexer->position.col);
951 ctxp->lexer->token_start = ctxp->lexer->position;
955 /* Numeric literals. */
956 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
958 /* This section of code is borrowed from gcc/c-lex.c. */
959 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
960 int parts[TOTAL_PARTS];
961 HOST_WIDE_INT high, low;
962 /* End borrowed section. */
963 char literal_token [256];
964 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
965 int found_hex_digits = 0, found_non_octal_digits = -1;
968 int number_beginning = ctxp->lexer->position.col;
972 for (i = 0; i < TOTAL_PARTS; i++)
977 java_next_unicode ();
978 c = java_peek_unicode ();
979 if (c == 'x' || c == 'X')
982 java_next_unicode ();
983 c = java_peek_unicode ();
985 else if (JAVA_ASCII_DIGIT (c))
987 literal_token [literal_index++] = '0';
990 else if (c == '.' || c == 'e' || c =='E')
992 literal_token [literal_index++] = '0';
993 /* Handle C during floating-point parsing. */
997 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */
1001 java_next_unicode ();
1002 SET_LVAL_NODE (long_zero_node);
1003 return (INT_LIT_TK);
1005 java_next_unicode ();
1006 SET_LVAL_NODE (float_zero_node);
1009 java_next_unicode ();
1010 SET_LVAL_NODE (double_zero_node);
1013 SET_LVAL_NODE (integer_zero_node);
1014 return (INT_LIT_TK);
1018 /* Parse the first part of the literal, until we find something
1019 which is not a number. */
1020 while (radix == 16 ? JAVA_ASCII_HEXDIGIT (c) : JAVA_ASCII_DIGIT (c))
1022 /* We store in a string (in case it turns out to be a FP) and in
1023 PARTS if we have to process an integer literal. */
1024 int numeric = hex_value (c);
1027 /* Remember when we find a valid hexadecimal digit. */
1029 found_hex_digits = 1;
1030 /* Remember when we find an invalid octal digit. */
1031 else if (radix == 8 && numeric >= 8 && found_non_octal_digits < 0)
1032 found_non_octal_digits = literal_index;
1034 literal_token [literal_index++] = c;
1035 /* This section of code is borrowed from gcc/c-lex.c. */
1036 for (count = 0; count < TOTAL_PARTS; count++)
1038 parts[count] *= radix;
1041 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1042 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1045 parts[0] += numeric;
1047 if (parts [TOTAL_PARTS-1] != 0)
1049 /* End borrowed section. */
1050 java_next_unicode ();
1051 c = java_peek_unicode ();
1054 /* If we have something from the FP char set but not a digit, parse
1056 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1058 /* stage==0: seen digits only
1059 * stage==1: seen '.'
1060 * stage==2: seen 'e' or 'E'.
1061 * stage==3: seen '+' or '-' after 'e' or 'E'.
1062 * stage==4: seen type suffix ('f'/'F'/'d'/'D')
1065 int seen_digit = (literal_index ? 1 : 0);
1066 int seen_exponent = 0;
1067 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1068 double unless specified. */
1070 /* It is ok if the radix is 8 because this just means we've
1071 seen a leading `0'. However, radix==16 is invalid. */
1073 java_lex_error ("Can't express non-decimal FP literal", 0);
1083 literal_token [literal_index++ ] = c;
1084 java_next_unicode ();
1085 c = java_peek_unicode ();
1086 if (literal_index == 1 && !JAVA_ASCII_DIGIT (c))
1087 BUILD_OPERATOR (DOT_TK);
1090 java_lex_error ("Invalid character in FP literal", 0);
1093 if (c == 'e' || c == 'E')
1097 /* {E,e} must have seen at least a digit. */
1100 ("Invalid FP literal, mantissa must have digit", 0);
1104 literal_token [literal_index++] = c;
1105 java_next_unicode ();
1106 c = java_peek_unicode ();
1109 java_lex_error ("Invalid character in FP literal", 0);
1111 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1113 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1114 stage = 4; /* So we fall through. */
1117 if ((c=='-' || c =='+') && stage == 2)
1120 literal_token [literal_index++] = c;
1121 java_next_unicode ();
1122 c = java_peek_unicode ();
1125 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1126 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1127 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1128 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1130 if (JAVA_ASCII_DIGIT (c))
1134 literal_token [literal_index++ ] = c;
1135 java_next_unicode ();
1136 c = java_peek_unicode ();
1140 if (stage == 4) /* Don't push back fF/dD. */
1141 java_next_unicode ();
1143 /* An exponent (if any) must have seen a digit. */
1144 if (seen_exponent && !seen_digit)
1146 ("Invalid FP literal, exponent must have digit", 0);
1148 literal_token [literal_index] = '\0';
1151 java_perform_atof (java_lval, literal_token,
1152 fflag, number_beginning);
1157 } /* JAVA_ASCII_FPCHAR (c) */
1159 /* Here we get back to converting the integral literal. */
1160 if (radix == 16 && ! found_hex_digits)
1162 ("0x must be followed by at least one hexadecimal digit", 0);
1163 else if (radix == 8 && found_non_octal_digits >= 0)
1165 int back = literal_index - found_non_octal_digits;
1166 ctxp->lexer->position.col -= back;
1167 java_lex_error ("Octal literal contains digit out of range", 0);
1168 ctxp->lexer->position.col += back;
1170 else if (c == 'L' || c == 'l')
1172 java_next_unicode ();
1176 /* This section of code is borrowed from gcc/c-lex.c. */
1179 bytes = GET_TYPE_PRECISION (long_type_node);
1180 for (i = bytes; i < TOTAL_PARTS; i++)
1188 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1190 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1191 / HOST_BITS_PER_CHAR)]
1192 << (i * HOST_BITS_PER_CHAR));
1193 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1195 /* End borrowed section. */
1198 /* Range checking. */
1199 /* Temporarily set type to unsigned. */
1200 value = build_int_cst_wide (long_suffix
1201 ? unsigned_long_type_node
1202 : unsigned_int_type_node, low, high);
1203 SET_LVAL_NODE (value);
1205 /* For base 10 numbers, only values up to the highest value
1206 (plus one) can be written. For instance, only ints up to
1207 2147483648 can be written. The special case of the largest
1208 negative value is handled elsewhere. For other bases, any
1209 number can be represented. */
1210 if (overflow || (radix == 10
1211 && tree_int_cst_lt (long_suffix
1217 JAVA_RANGE_ERROR ("Numeric overflow for 'long' literal");
1219 JAVA_RANGE_ERROR ("Numeric overflow for 'int' literal");
1222 /* Sign extend the value. */
1223 value = build_int_cst_wide (long_suffix ? long_type_node : int_type_node,
1225 value = force_fit_type (value, 0, false, false);
1229 value = copy_node (value);
1230 JAVA_NOT_RADIX10_FLAG (value) = 1;
1233 SET_LVAL_NODE (value);
1238 /* We may have an ID here. */
1239 if (JAVA_START_CHAR_P (c))
1241 int ascii_index = 0, all_ascii = 1;
1243 /* Keyword, boolean literal or null literal. */
1244 while (c != UEOF && JAVA_PART_CHAR_P (c))
1246 java_unicode_2_utf8 (c);
1249 java_next_unicode ();
1251 c = java_peek_unicode ();
1254 obstack_1grow (&temporary_obstack, '\0');
1255 string = obstack_finish (&temporary_obstack);
1257 /* If we have something all ascii, we consider a keyword, a boolean
1258 literal, a null literal or an all ASCII identifier. Otherwise,
1259 this is an identifier (possibly not respecting formation rule). */
1262 const struct java_keyword *kw;
1263 if ((kw=java_keyword (string, ascii_index)))
1267 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1268 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1269 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1270 case PRIVATE_TK: case STRICT_TK:
1271 SET_MODIFIER_CTX (kw->token);
1274 SET_LVAL_NODE (float_type_node);
1277 SET_LVAL_NODE (double_type_node);
1280 SET_LVAL_NODE (boolean_type_node);
1283 SET_LVAL_NODE (byte_type_node);
1286 SET_LVAL_NODE (short_type_node);
1289 SET_LVAL_NODE (int_type_node);
1292 SET_LVAL_NODE (long_type_node);
1295 SET_LVAL_NODE (char_type_node);
1298 /* Keyword based literals. */
1301 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1302 boolean_true_node : boolean_false_node));
1305 SET_LVAL_NODE (null_pointer_node);
1311 BUILD_OPERATOR (kw->token);
1317 /* For some keywords we want to retain information on the location
1318 where they were found. */
1330 BUILD_OPERATOR (kw->token);
1338 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1342 java_next_unicode ();
1344 /* Character literals. */
1349 if ((c = java_get_unicode ()) == '\\')
1350 char_lit = java_parse_escape_sequence ();
1353 if (c == '\n' || c == '\'')
1354 java_lex_error ("Invalid character literal", 0);
1358 c = java_get_unicode ();
1360 if ((c == '\n') || (c == UEOF))
1361 java_lex_error ("Character literal not terminated at end of line", 0);
1363 java_lex_error ("Syntax error in character literal", 0);
1365 if (char_lit == JAVA_CHAR_ERROR)
1366 char_lit = 0; /* We silently convert it to zero. */
1368 SET_LVAL_NODE (build_int_cst (char_type_node, char_lit));
1372 /* String literals. */
1380 c = java_peek_unicode ();
1381 if (c == '\n' || c == UEOF) /* ULT. */
1383 java_lex_error ("String not terminated at end of line", 0);
1386 java_next_unicode ();
1390 c = java_parse_escape_sequence ();
1391 if (c == JAVA_CHAR_ERROR)
1394 c = 0; /* We silently convert it to zero. */
1396 java_unicode_2_utf8 (c);
1399 obstack_1grow (&temporary_obstack, '\0');
1400 string = obstack_finish (&temporary_obstack);
1402 if (!no_error || (c != '"'))
1403 java_lval->node = error_mark_node; /* FIXME: Requires further
1406 java_lval->node = build_string (strlen (string), string);
1408 obstack_free (&temporary_obstack, string);
1409 return STRING_LIT_TK;
1415 /* Check for comment. */
1416 switch (c = java_peek_unicode ())
1419 java_next_unicode ();
1422 c = java_get_unicode ();
1425 /* It is ok to end a `//' comment with EOF, unless
1426 we're being pedantic. */
1428 java_lex_error ("Comment not terminated at end of input",
1432 if (c == '\n') /* ULT */
1438 java_next_unicode ();
1439 if ((c = java_get_unicode ()) == '*')
1441 c = java_get_unicode ();
1444 /* Empty documentation comment. We have to reset
1445 the deprecation marker as only the most recent
1446 doc comment applies. */
1447 ctxp->deprecated = 0;
1450 java_parse_doc_section (c);
1453 java_parse_end_comment ((c = java_get_unicode ()));
1458 java_next_unicode ();
1459 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1462 BUILD_OPERATOR (DIV_TK);
1466 BUILD_OPERATOR (OP_TK);
1471 java_lval->operator.token = OCB_TK;
1472 java_lval->operator.location = BUILD_LOCATION();
1474 #ifdef USE_MAPPED_LOCATION
1475 if (ctxp->ccb_indent == 1)
1476 ctxp->first_ccb_indent1 = input_location;
1478 if (ctxp->ccb_indent == 1)
1479 ctxp->first_ccb_indent1 = input_line;
1485 java_lval->operator.token = CCB_TK;
1486 java_lval->operator.location = BUILD_LOCATION();
1489 #ifdef USE_MAPPED_LOCATION
1490 if (ctxp->ccb_indent == 1)
1491 ctxp->last_ccb_indent1 = input_location;
1493 if (ctxp->ccb_indent == 1)
1494 ctxp->last_ccb_indent1 = input_line;
1498 BUILD_OPERATOR (OSB_TK);
1506 BUILD_OPERATOR (DOT_TK);
1510 c = java_peek_unicode ();
1513 java_next_unicode ();
1514 BUILD_OPERATOR (EQ_TK);
1518 /* Equals is used in two different locations. In the
1519 variable_declarator: rule, it has to be seen as '=' as opposed
1520 to being seen as an ordinary assignment operator in
1521 assignment_operators: rule. */
1522 BUILD_OPERATOR (ASSIGN_TK);
1526 switch ((c = java_peek_unicode ()))
1529 java_next_unicode ();
1530 BUILD_OPERATOR (GTE_TK);
1532 java_next_unicode ();
1533 switch ((c = java_peek_unicode ()))
1536 java_next_unicode ();
1537 c = java_peek_unicode ();
1540 java_next_unicode ();
1541 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1545 BUILD_OPERATOR (ZRS_TK);
1548 java_next_unicode ();
1549 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1551 BUILD_OPERATOR (SRS_TK);
1554 BUILD_OPERATOR (GT_TK);
1558 switch ((c = java_peek_unicode ()))
1561 java_next_unicode ();
1562 BUILD_OPERATOR (LTE_TK);
1564 java_next_unicode ();
1565 if ((c = java_peek_unicode ()) == '=')
1567 java_next_unicode ();
1568 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1572 BUILD_OPERATOR (LS_TK);
1575 BUILD_OPERATOR (LT_TK);
1579 switch ((c = java_peek_unicode ()))
1582 java_next_unicode ();
1583 BUILD_OPERATOR (BOOL_AND_TK);
1585 java_next_unicode ();
1586 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1588 BUILD_OPERATOR (AND_TK);
1592 switch ((c = java_peek_unicode ()))
1595 java_next_unicode ();
1596 BUILD_OPERATOR (BOOL_OR_TK);
1598 java_next_unicode ();
1599 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1601 BUILD_OPERATOR (OR_TK);
1605 switch ((c = java_peek_unicode ()))
1608 java_next_unicode ();
1609 BUILD_OPERATOR (INCR_TK);
1611 java_next_unicode ();
1612 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1614 BUILD_OPERATOR (PLUS_TK);
1618 switch ((c = java_peek_unicode ()))
1621 java_next_unicode ();
1622 BUILD_OPERATOR (DECR_TK);
1624 java_next_unicode ();
1625 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1627 BUILD_OPERATOR (MINUS_TK);
1631 if ((c = java_peek_unicode ()) == '=')
1633 java_next_unicode ();
1634 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1638 BUILD_OPERATOR (MULT_TK);
1642 if ((c = java_peek_unicode ()) == '=')
1644 java_next_unicode ();
1645 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1649 BUILD_OPERATOR (XOR_TK);
1653 if ((c = java_peek_unicode ()) == '=')
1655 java_next_unicode ();
1656 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1660 BUILD_OPERATOR (REM_TK);
1664 if ((c = java_peek_unicode()) == '=')
1666 java_next_unicode ();
1667 BUILD_OPERATOR (NEQ_TK);
1671 BUILD_OPERATOR (NEG_TK);
1675 BUILD_OPERATOR (REL_QM_TK);
1677 BUILD_OPERATOR (REL_CL_TK);
1679 BUILD_OPERATOR (NOT_TK);
1682 if (c == 0x1a) /* CTRL-Z. */
1684 if ((c = java_peek_unicode ()) == UEOF)
1685 return 0; /* Ok here. */
1688 /* Everything else is an invalid character in the input. */
1690 char lex_error_buffer [128];
1691 sprintf (lex_error_buffer, "Invalid character '%s' in input",
1692 java_sprint_unicode (c));
1693 java_lex_error (lex_error_buffer, -1);
1700 /* The exported interface to the lexer.
   Hands JAVA_LVAL to do_java_lex and keeps the value it yields in `r'.
   The call is bracketed by timevar_push (TV_LEX) / timevar_pop (TV_LEX).
   NOTE(review): the prototype near the top of this file declares
   java_lex `static'; confirm how it is made visible to the parser.  */
1702 java_lex (YYSTYPE *java_lval)
1706 timevar_push (TV_LEX);
1707 r = do_java_lex (java_lval);
1708 timevar_pop (TV_LEX);
1712 /* This is called by the parser to see if an error should be generated
1713 due to numeric overflow. This function only handles the particular
1714 case of the largest negative value, and is only called in the case
1715 where this value is not preceded by `-'. */
/* VALUE is the tree to inspect.  Any diagnostic is issued through
   java_lex_error with a FORWARD argument of 0.  */
1717 error_if_numeric_overflow (tree value)
/* Only an INTEGER_CST whose JAVA_NOT_RADIX10_FLAG is clear and whose
   sign, as reported by tree_int_cst_sgn, is negative is considered.  */
1719 if (TREE_CODE (value) == INTEGER_CST
1720 && !JAVA_NOT_RADIX10_FLAG (value)
1721 && tree_int_cst_sgn (value) < 0)
/* The wording of the message depends on whether the literal's type is
   long_type_node ('long') or not ('int').  */
1723 if (TREE_TYPE (value) == long_type_node)
1724 java_lex_error ("Numeric overflow for 'long' literal", 0);
1726 java_lex_error ("Numeric overflow for 'int' literal", 0);
1730 #endif /* JC1_LITE */
/* Append an encoding of UNICODE, one to three bytes long, to
   temporary_obstack:
     0x01 - 0x7f         one byte, the value itself;
     0x80 - 0x7ff and 0  two bytes, so 0 is emitted as 0xc0 0x80 and
                         never as a single zero byte;
     anything else       three bytes introduced by a 0xe0 lead byte.  */
1733 java_unicode_2_utf8 (unicode_t unicode)
1735 if (RANGE (unicode, 0x01, 0x7f))
1736 obstack_1grow (&temporary_obstack, (char)unicode);
1737 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
/* Lead byte 110xxxxx carries bits 6-10 of UNICODE.  */
1739 obstack_1grow (&temporary_obstack,
1740 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
/* Trailing byte 10xxxxxx carries bits 0-5.  */
1741 obstack_1grow (&temporary_obstack,
1742 (unsigned char)(0x80 | (unicode & 0x3f)));
1744 else /* Range 0x800-0xffff. */
/* `>>' binds tighter than `|', so each masked field is shifted down
   before the marker bits are or-ed in: 1110xxxx gets bits 12-15,
   then two 10xxxxxx bytes get bits 6-11 and bits 0-5.  */
1746 obstack_1grow (&temporary_obstack,
1747 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1748 obstack_1grow (&temporary_obstack,
1749 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1750 obstack_1grow (&temporary_obstack,
1751 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Replace NODE by the result of calling build_expr_wfl on it, then
   clear the TREE_TYPE of that result.  With USE_MAPPED_LOCATION the
   location handed to build_expr_wfl is input_location; the other call
   passes ctxp->filename together with the line and column recorded in
   the lexer's token_start.  */
1757 build_wfl_node (tree node)
1759 #ifdef USE_MAPPED_LOCATION
1760 node = build_expr_wfl (node, input_location);
1762 node = build_expr_wfl (node, ctxp->filename,
1763 ctxp->lexer->token_start.line,
1764 ctxp->lexer->token_start.col);
1766 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1767 TREE_TYPE (node) = NULL_TREE;
/* Lexer error helper.  MSG is the message text.  FORWARD is a signed
   multiplier applied to ctxp->lexer->next_columns when working out the
   column to blame (callers in this file pass 0 or -1).
   Both variants below follow the same pattern: remember the current
   position, point it at the computed column, reset
   ctxp->java_error_flag, and finally restore the remembered position.
   NOTE(review): both parameters are marked ATTRIBUTE_UNUSED although
   FORWARD is read just below; presumably part of the body is compiled
   conditionally -- confirm.  */
1773 java_lex_error (const char *msg ATTRIBUTE_UNUSED, int forward ATTRIBUTE_UNUSED)
/* Column = current lexer column shifted by FORWARD * next_columns.  */
1776 int col = (ctxp->lexer->position.col
1777 + forward * ctxp->lexer->next_columns);
/* NOTE(review): this tests USE_MAPPED_LOCATION with `#if' whereas
   other places in this file use `#ifdef'; the two differ if the macro
   is defined with an empty or zero value -- confirm which is meant.  */
1778 #if USE_MAPPED_LOCATION
/* Mapped locations: retarget input_location at COL for the duration
   of the report.  */
1779 source_location save_location = input_location;
1780 LINEMAP_POSITION_FOR_COLUMN (input_location, &line_table, col);
1782 /* Might be caught in the middle of some error report. */
1783 ctxp->java_error_flag = 0;
1786 input_location = save_location;
/* Line/column pairs: retarget the lexer's token_start at the current
   line and COL for the duration of the report.  */
1788 java_lc save = ctxp->lexer->token_start;
1789 ctxp->lexer->token_start.line = ctxp->lexer->position.line;
1790 ctxp->lexer->token_start.col = col;
1792 /* Might be caught in the middle of some error report. */
1793 ctxp->java_error_flag = 0;
1796 ctxp->lexer->token_start = save;
/* NOTE(review): going by the name, this presumably reports whether C,
   a character obtained from FP, terminates a line -- confirm.  */
1803 java_is_eol (FILE *fp, int c)
/* Singles out a NEXT value that is neither a newline nor EOF.  */
1810 if (next != '\n' && next != EOF)
/* Build, in temporary_obstack, a two-line excerpt for line LINE of
   FILENAME: the text of that line followed by '\n', then a row of
   padding ending in a '^' marker (NUL-terminated by obstack_grow0).
   The value handed back is whatever obstack_finish returns for
   temporary_obstack.  FILENAME is opened with fopen; failure to open
   it is reported through fatal_error.  */
1822 java_get_line_col (const char *filename ATTRIBUTE_UNUSED,
1823 int line ATTRIBUTE_UNUSED, int col ATTRIBUTE_UNUSED)
1828 /* Dumb implementation. Doesn't try to cache or optimize things. */
1829 /* First line of the file is line 1, first column is 1. */
1831 /* COL == -1 means, at the CR/LF in LINE. */
1832 /* COL == -2 means, at the first non space char in LINE. */
/* CLINE counts lines starting from 1.  CURRENT_LINE_COL and
   FIRST_NON_SPACE both start at 0; FIRST_NON_SPACE records
   CURRENT_LINE_COL at the first character that is not
   JAVA_WHITE_SPACE_P.  */
1835 int c, ccol, cline = 1;
1836 int current_line_col = 0;
1837 int first_non_space = 0;
1840 if (!(fp = fopen (filename, "r")))
1841 fatal_error ("can't open %s: %m", filename);
/* Advance until the requested line is reached.  */
1843 while (cline != line)
/* Placeholder text appended to the obstack without its trailing NUL
   (hence sizeof (msg) - 1).  */
1848 static const char msg[] = "<<file too short - unexpected EOF>>";
1849 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1852 if (java_is_eol (fp, c))
1856 /* Gather the chars of the current line in a buffer. */
/* A negative C or an end of line as judged by java_is_eol is tested
   for here, before the character is appended.  */
1860 if (c < 0 || java_is_eol (fp, c))
1862 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1863 first_non_space = current_line_col;
1864 obstack_1grow (&temporary_obstack, c);
1869 obstack_1grow (&temporary_obstack, '\n');
/* Resolve COL.  NOTE(review): presumably this first assignment is
   the COL == -1 case and the next one the COL == -2 case described
   above -- confirm.  */
1873 col = current_line_col;
1874 first_non_space = 0;
1877 col = first_non_space;
1879 first_non_space = 0;
1881 /* Place the '^' at the right position. */
1882 base = obstack_base (&temporary_obstack);
/* Emit COL + 2 padding characters.  A tab is copied from BASE only
   when FIRST_NON_SPACE is nonzero; otherwise a blank is used.  */
1883 for (col += 2, ccol = 0; ccol < col; ccol++)
1885 /* Compute \t when reaching first_non_space. */
/* This `char c' is a separate object from the `int c' declared
   above.  */
1886 char c = (first_non_space ?
1887 (base [ccol] == '\t' ? '\t' : ' ') : ' ');
1888 obstack_1grow (&temporary_obstack, c);
1890 obstack_grow0 (&temporary_obstack, "^", 1);
1893 return obstack_finish (&temporary_obstack);
/* Compare the LENGTH bytes at STR, decoded one character at a time
   with UTF8_GET, against the NUL-terminated string NAME.  The result
   is 0 when NAME is exhausted exactly at the end of STR, and 1 when
   NAME is exhausted while bytes of STR remain.  */
1899 utf8_cmp (const unsigned char *str, int length, const char *name)
/* LIMIT marks one past the last byte of STR; it bounds UTF8_GET.  */
1901 const unsigned char *limit = str + length;
1904 for (i = 0; name[i]; ++i)
1906 int ch = UTF8_GET (str, limit);
/* Difference between the decoded character and NAME's character.
   NOTE(review): presumably returned only when the two differ --
   confirm.  */
1908 return ch - name[i];
1911 return str == limit ? 0 : 1;
1914 /* A sorted list of all C++ keywords.
   cxx_keyword_p probes this table at the midpoint of a FIRST/LAST
   range.  NOTE(review): presumably a binary search, in which case the
   entries must stay sorted -- confirm before adding keywords.  */
1916 static const char *const cxx_keywords[] =
2024 /* Return true if NAME is a C++ keyword.
   NAME is LENGTH bytes long.  Candidates are taken from cxx_keywords
   at MID, the midpoint of FIRST and LAST.  A name made of a keyword
   followed only by `$' characters also counts as a match.  */
2027 cxx_keyword_p (const char *name, int length)
/* LAST starts at the number of entries in the table.  */
2029 int last = ARRAY_SIZE (cxx_keywords);
2031 int mid = (last + first) / 2;
/* OLD takes the previous MID before MID is recomputed each round.  */
2034 for (mid = (last + first) / 2;
2036 old = mid, mid = (last + first) / 2)
/* Only the first min (LENGTH, strlen (keyword)) bytes of NAME are
   handed to utf8_cmp.  */
2038 int kwl = strlen (cxx_keywords[mid]);
2039 int min_length = kwl > length ? length : kwl;
2040 int r = utf8_cmp ((const unsigned char *) name, min_length, cxx_keywords[mid]);
2045 /* We've found a match if all the remaining characters are `$'. */
/* Skip over any `$' characters that follow the compared prefix.  */
2046 for (i = min_length; i < length && name[i] == '$'; ++i)
2060 #endif /* JC1_LITE */