1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
3 Free Software Foundation, Inc.
4 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to
20 the Free Software Foundation, 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA.
23 Java and all Java-based marks are trademarks or registered trademarks
24 of Sun Microsystems, Inc. in the United States and other countries.
25 The Free Software Foundation is independent of Sun Microsystems, Inc. */
27 /* It defines java_lex (yylex) that reads a Java ASCII source file
28 possibly containing Unicode escape sequences or UTF-8 encoded
29 characters and returns a token for everything found but comments,
30 white spaces and line terminators. When necessary, it also fills
31 the java_lval (yylval) union. It's implemented to be called by a
32 re-entrant parser generated by Bison.
34 The lexical analysis conforms to the Java grammar described in "The
35 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
36 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
40 #include "chartables.h"
45 /* Function declarations. */
46 static char *java_sprint_unicode (int);
47 static void java_unicode_2_utf8 (unicode_t);
48 static void java_lex_error (const char *, int);
50 static int do_java_lex (YYSTYPE *);
51 static int java_lex (YYSTYPE *);
52 static int java_is_eol (FILE *, int);
53 static tree build_wfl_node (tree);
55 static int java_parse_escape_sequence (void);
56 static int java_start_char_p (unicode_t);
57 static int java_part_char_p (unicode_t);
58 static int java_space_char_p (unicode_t);
59 static void java_parse_doc_section (int);
60 static void java_parse_end_comment (int);
61 static int java_read_char (java_lexer *);
62 static int java_get_unicode (void);
63 static int java_peek_unicode (void);
64 static void java_next_unicode (void);
65 static int java_read_unicode (java_lexer *, int *);
67 static int utf8_cmp (const unsigned char *, int, const char *);
70 java_lexer *java_new_lexer (FILE *, const char *);
72 static void error_if_numeric_overflow (tree);
76 /* This is nonzero if we have initialized `need_byteswap'. */
77 static int byteswap_init = 0;
79 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
80 big-endian order -- not native endian order. We handle this by
81 doing a conversion once at startup and seeing what happens. This
82 flag holds the results of this determination. */
83 static int need_byteswap = 0;
/* Initialize lexer and parser state before reading FINPUT.  Interns
   the identifiers the front end refers to by name, pushes java.lang
   onto the import-on-demand list, builds the shared WFL nodes, resets
   fields of the current parser context (ctxp), and finally creates the
   lexer that decodes FINPUT according to ENCODING.  */
87 java_init_lex (FILE *finput, const char *encoding)
90 int java_lang_imported = 0;
93 java_lang_id = get_identifier ("java.lang");
95 inst_id = get_identifier ("inst$");
97 wpv_id = get_identifier ("write_parm_value$");
/* Push java.lang onto the front of ctxp->import_demand_list and
   remember that this has been done.  */
99 if (!java_lang_imported)
101 tree node = build_tree_list (build_unknown_wfl (java_lang_id),
103 read_import_dir (TREE_PURPOSE (node));
104 TREE_CHAIN (node) = ctxp->import_demand_list;
105 ctxp->import_demand_list = node;
106 java_lang_imported = 1;
/* wfl_operator is built from input_location when mapped locations are
   in use; the second form (from ctxp->filename) is presumably the
   alternative branch, whose preprocessor lines are not shown here.  */
111 #ifdef USE_MAPPED_LOCATION
112 wfl_operator = build_expr_wfl (NULL_TREE, input_location);
114 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
118 label_id = get_identifier ("$L");
120 wfl_append = build_unknown_wfl (get_identifier ("append"));
/* wfl_string_buffer is only built when it is not set yet; the class
   it names depends on flag_emit_class_files.  */
121 if (!wfl_string_buffer)
123 build_unknown_wfl (get_identifier (flag_emit_class_files
124 ? "java.lang.StringBuffer"
125 : "gnu.gcj.runtime.StringBuffer"));
127 wfl_to_string = build_unknown_wfl (get_identifier ("toString"));
/* Start from empty initializer lists and a cleared parser context.  */
129 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
130 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
132 memset (ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
133 ctxp->current_parsed_class = NULL;
134 ctxp->package = NULL_TREE;
137 ctxp->save_location = input_location;
138 ctxp->java_error_flag = 0;
/* Attach the lexer that will decode FINPUT.  */
139 ctxp->lexer = java_new_lexer (finput, encoding);
/* Format the character C as text.  Values below ' ' or at or above
   127 are written as a backslash-u hexadecimal escape into BUFFER,
   which is static and therefore shared between calls.
   NOTE(review): BUFFER holds 10 bytes, enough for a four-digit escape
   plus the terminator.  "%04x" emits more digits for values above
   0xffff or for negative values, which would overrun BUFFER -- confirm
   that callers only pass 16-bit character values.  */
143 java_sprint_unicode (int c)
145 static char buffer [10];
146 if (c < ' ' || c >= 127)
147 sprintf (buffer, "\\u%04x", c);
156 /* Create a new lexer object.  The lexer is allocated with xmalloc,
   reads from FINPUT and starts at line 1, column 1.  An iconv
   converter from ENCODING to UCS-2 is requested; the code also runs a
   UTF-8 to UCS-2 test conversion to learn whether iconv's UCS-2 output
   must be byte-swapped.  When iconv cannot be used, the internal UTF-8
   decoder is selected for the default encoding, and otherwise an
   "unknown encoding" fatal error is raised.  */
159 java_new_lexer (FILE *finput, const char *encoding)
161 java_lexer *lex = xmalloc (sizeof (java_lexer));
164 lex->finput = finput;
/* No pushed-back character and no buffered lookahead yet.  */
166 lex->unget_value = 0;
167 lex->next_unicode = 0;
168 lex->avail_unicode = 0;
169 lex->next_columns = 1;
170 lex->encoding = encoding;
/* Source positions are 1-based.  */
171 lex->position.line = 1;
172 lex->position.col = 1;
174 #ifdef USE_MAPPED_LOCATION
176 = linemap_line_start (&line_table, 1, 120);
/* Ask iconv for a converter from ENCODING to UCS-2; a handle equal to
   (iconv_t) -1 means the request failed.  */
183 lex->handle = iconv_open ("UCS-2", encoding);
184 if (lex->handle != (iconv_t) -1)
190 lex->read_anything = 0;
191 lex->use_fallback = 0;
193 /* Work around broken iconv() implementations by doing checking at
194 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
195 then all UCS-2 encoders will be broken. Perhaps not a valid
203 handle = iconv_open ("UCS-2", "UTF-8");
204 if (handle != (iconv_t) -1)
211 /* This is the UTF-8 encoding of \ufeff. */
/* The converted code unit is written directly into RESULT.  */
218 outp = (char *) &result;
221 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
223 iconv_close (handle);
224 /* Conversion must be complete for us to use the result. */
225 if (r != (size_t) -1 && inc == 0 && outc == 0)
226 need_byteswap = (result != 0xfeff);
/* Record the outcome of the byte-order test on this lexer.  */
230 lex->byte_swap = need_byteswap;
233 #endif /* HAVE_ICONV */
235 /* If iconv failed, use the internal decoder if the default
236 encoding was requested. This code is used on platforms where
237 iconv exists but is insufficient for our needs. For
238 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
240 On Solaris the default encoding, as returned by nl_langinfo(),
241 is `646' (aka ASCII), but the Solaris iconv_open() doesn't
242 understand that. We work around that by pretending
243 `646' to be the same as UTF-8. */
244 if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
249 lex->use_fallback = 1;
250 lex->encoding = "UTF-8";
252 #endif /* HAVE_ICONV */
/* Report the unusable encoding through fatal_error.  */
256 fatal_error ("unknown encoding: %qs\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n%<--encoding=UTF-8%> option", encoding);
/* Tear down LEX.  The iconv handle is closed only when the lexer is
   not using the fallback decoder.  */
262 java_destroy_lexer (java_lexer *lex)
265 if (! lex->use_fallback)
266 iconv_close (lex->handle);
/* Read one character from LEX's input, before any Unicode-escape
   processing (that is done by java_read_unicode).  Unless LEX uses the
   fallback decoder, bytes are read with fread into lex->buffer and
   converted by iconv into lex->out_buffer, from which one unicode_t is
   taken.  With the fallback decoder the bytes come from getc and are
   decoded here as one-, two- or three-byte UTF-8; anything else is
   reported as a malformed UTF-8 character.  */
272 java_read_char (java_lexer *lex)
275 if (! lex->use_fallback)
277 size_t ir, inbytesleft, in_save, out_count, out_save;
281 /* If there is data which has already been converted, use it. */
282 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
289 /* See if we need to read more data. If FIRST == 0 then
290 the previous conversion attempt ended in the middle of
291 a character at the end of the buffer. Otherwise we
292 only have to read if the buffer is empty. */
293 if (lex->first == 0 || lex->first >= lex->last)
297 if (lex->first >= lex->last)
302 if (feof (lex->finput))
304 r = fread (&lex->buffer[lex->last], 1,
305 sizeof (lex->buffer) - lex->last,
/* Pending input bytes, and free room left in the output buffer.  */
310 inbytesleft = lex->last - lex->first;
311 out_count = sizeof (lex->out_buffer) - lex->out_last;
313 if (inbytesleft == 0)
315 /* We've tried to read and there is nothing left. */
/* iconv updates both counters in place, so keep the starting values
   in order to compute how much was consumed and produced.  */
319 in_save = inbytesleft;
320 out_save = out_count;
321 inp = &lex->buffer[lex->first];
322 outp = (char *) &lex->out_buffer[lex->out_last];
323 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
324 &inbytesleft, &outp, &out_count);
326 /* If we haven't read any bytes, then look to see if we
328 if (! lex->read_anything && out_save - out_count >= 2)
330 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
336 else if (uc == 0xfffe)
341 lex->read_anything = 1;
347 for (i = 0; i < out_save - out_count; i += 2)
349 char t = lex->out_buffer[lex->out_last + i];
350 lex->out_buffer[lex->out_last + i]
351 = lex->out_buffer[lex->out_last + i + 1];
352 lex->out_buffer[lex->out_last + i + 1] = t;
356 lex->first += in_save - inbytesleft;
357 lex->out_last += out_save - out_count;
359 /* If we converted anything at all, move along. */
360 if (out_count != out_save)
/* (size_t) -1 is the failure return of iconv.  */
363 if (ir == (size_t) -1)
367 /* This is ok. This means that the end of our buffer
368 is in the middle of a character sequence. We just
369 move the valid part of the buffer to the beginning
371 memmove (&lex->buffer[0], &lex->buffer[lex->first],
372 lex->last - lex->first);
373 lex->last -= lex->first;
378 /* A more serious error. */
381 "Unrecognized character for encoding '%s'",
383 java_lex_error (buffer, 0);
390 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
392 /* Don't have any data. */
/* Take the next converted code unit from the output buffer.  */
397 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
402 #endif /* HAVE_ICONV */
/* Fallback path: decode UTF-8 by hand from the stdio stream.  */
405 c = getc (lex->finput);
410 return (unicode_t) c;
/* Lead byte of the form 110xxxxx: a two-byte sequence.  */
413 if ((c & 0xe0) == 0xc0)
415 c1 = getc (lex->finput);
/* A continuation byte must have the form 10xxxxxx.  */
416 if ((c1 & 0xc0) == 0x80)
418 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
419 /* Check for valid 2-byte characters. We explicitly
420 allow \0 because this encoding is common in the
422 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
426 else if ((c & 0xf0) == 0xe0)
428 c1 = getc (lex->finput);
429 if ((c1 & 0xc0) == 0x80)
431 c2 = getc (lex->finput);
432 if ((c2 & 0xc0) == 0x80)
434 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
437 /* Check for valid 3-byte characters.
438 Don't allow surrogate, \ufffe or \uffff. */
439 if (IN_RANGE (r, 0x800, 0xffff)
440 && ! IN_RANGE (r, 0xd800, 0xdfff)
441 && r != 0xfffe && r != 0xffff)
447 /* We simply don't support invalid characters. We also
448 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
449 cannot be valid Java characters. */
450 java_lex_error ("malformed UTF-8 character", 0);
454 /* We only get here on error. */
/* Return the next character from LEX with backslash-u escapes
   translated.  A character previously stored in lex->unget_value is
   consumed first; otherwise a raw character is read with
   java_read_char.  *UNICODE_ESCAPE_P is cleared on entry and set to 1
   when the result was produced from an escape.
   NOTE(review): lex->unget_value uses 0 to mean "nothing pushed back",
   so a pushed-back character with value 0 cannot be represented --
   confirm that this never matters.  */
459 java_read_unicode (java_lexer *lex, int *unicode_escape_p)
463 if (lex->unget_value)
465 c = lex->unget_value;
466 lex->unget_value = 0;
469 c = java_read_char (lex);
471 *unicode_escape_p = 0;
/* The parity of lex->bs_count decides how this backslash is treated.  */
480 if ((lex->bs_count) % 2 == 1)
482 /* Odd number of \ seen. */
483 c = java_read_char (lex);
486 unicode_t unicode = 0;
489 /* Recognize any number of `u's in \u. */
490 while ((c = java_read_char (lex)) == 'u')
498 java_lex_error ("prematurely terminated \\u sequence", 0);
/* Merge one hexadecimal digit into the value at the current shift.  */
503 unicode |= (unicode_t)(hex_value (c) << shift);
506 java_lex_error ("non-hex digit in \\u sequence", 0);
510 c = java_read_char (lex);
/* Keep the lookahead character for the next call.  */
516 lex->unget_value = c;
519 *unicode_escape_p = 1;
522 lex->unget_value = c;
/* Not an escape: hand back the backslash itself.  */
524 return (unicode_t) '\\';
527 /* Get the next Unicode character (post-Unicode-escape-handling).
528 Move the current position to just after the returned character. */
531 java_get_unicode (void)
/* Look at the pending character, then commit to it.  */
533 int next = java_peek_unicode ();
534 java_next_unicode ();
538 /* Return the next Unicode character (post-Unicode-escape-handling).
539 Do not move the current position, which remains just before
540 the returned character. */
543 java_peek_unicode (void)
545 int unicode_escape_p;
546 java_lexer *lex = ctxp->lexer;
/* A character is already buffered from an earlier peek.  */
549 if (lex->avail_unicode)
550 return lex->next_unicode;
552 next = java_read_unicode (lex, &unicode_escape_p);
556 /* We have to read ahead to see if we got \r\n.
557 In that case we return a single line terminator. */
559 next = java_read_unicode (lex, &dummy);
560 if (next != '\n' && next != UEOF)
561 lex->unget_value = next;
562 /* In either case we must return a newline. */
/* Buffer the character so that a later peek returns it without
   reading again, and so that java_next_unicode can consume it.  */
566 lex->next_unicode = next;
567 lex->avail_unicode = 1;
/* Work out in advance how far position.col moves when this character
   is consumed; java_next_unicode adds next_columns to the column.  */
571 lex->next_columns = 0;
/* Adding this delta to the current column yields column 1.  */
577 lex->next_columns = 1 - lex->position.col;
579 else if (next == '\t')
581 int cur_col = lex->position.col;
/* Move to the next tab stop; with 1-based columns the stops fall at
   columns 1, 9, 17, and so on.  */
582 lex->next_columns = ((cur_col + 7) & ~7) + 1 - cur_col;
587 lex->next_columns = 1;
/* A character that came from an escape spans six source columns
   (presumably the backslash, the `u' and four hex digits).  */
589 if (unicode_escape_p)
590 lex->next_columns = 6;
594 /* Move forward one Unicode character (post-Unicode-escape-handling).
595 Only allowed after java_peek_unicode. The combination java_peek_unicode
596 followed by java_next_unicode is equivalent to java_get_unicode. */
598 static void java_next_unicode (void)
600 struct java_lexer *lex = ctxp->lexer;
/* Apply the column delta that java_peek_unicode computed.  */
601 lex->position.col += lex->next_columns;
/* Consuming a newline starts a new line: bump the line counter and
   update the line map or input_line accordingly.  */
602 if (lex->next_unicode == '\n')
604 lex->position.line++;
606 #ifdef USE_MAPPED_LOCATION
608 = linemap_line_start (&line_table, lex->position.line, 120);
610 input_line = lex->position.line;
/* The buffered character has now been consumed.  */
614 lex->avail_unicode = 0;
618 /* The inverse of java_next_unicode.
619 Not currently used, but could be if it would be cleaner or faster.
620 java_peek_unicode == java_get_unicode + java_unget_unicode.
621 java_get_unicode == java_peek_unicode + java_next_unicode.
623 static void java_unget_unicode ()
625 struct java_lexer *lex = ctxp->lexer;
626 if (lex->avail_unicode)
627 fatal_error ("internal error - bad unget");
628 lex->avail_unicode = 1;
629 lex->position.col -= lex->next_columns;
633 /* Parse the end of a C style comment.
634 * C is the first character following the '/' and '*'. */
636 java_parse_end_comment (int c)
/* Walk through the comment text one character at a time; reaching the
   end of input before the terminator is reported as an error.  */
638 for ( ;; c = java_get_unicode ())
643 java_lex_error ("Comment not terminated at end of input", 0);
/* Peek rather than get, so that a second star can be examined again
   (see the '*' case).  */
646 switch (c = java_peek_unicode ())
649 java_lex_error ("Comment not terminated at end of input", 0);
652 java_next_unicode ();
654 case '*': /* Reparse only '*'. */
661 /* Parse the documentation section. Keywords must be at the beginning
662 of a documentation comment line (ignoring white space and any `*'
663 character). Parsed keyword(s): @deprecated, compared character by
   character against the lower-case string below.  C is the first
   character to examine.  When the tag is found, ctxp->deprecated is
   set to 1.  */
666 java_parse_doc_section (int c)
670 /* We reset this here, because only the most recent doc comment
671 applies to the following declaration. */
672 ctxp->deprecated = 0;
674 /* We loop over all the lines of the comment. We'll eventually exit
675 if we hit EOF prematurely, or when we see the comment
679 /* These first steps need only be done if we're still looking
680 for the deprecated tag. If we've already seen it, we might
681 as well skip looking for it again. */
682 if (! ctxp->deprecated)
684 /* Skip whitespace and '*'s. We must also check for the end
685 of the comment here. */
686 while (JAVA_WHITE_SPACE_P (c) || c == '*')
688 last_was_star = (c == '*');
689 c = java_get_unicode ();
690 if (last_was_star && c == '/')
692 /* We just saw the comment terminator. */
/* Compare the input with the tag, one character at a time.  */
702 const char *deprecated = "@deprecated";
705 for (i = 0; deprecated[i]; ++i)
707 if (c != deprecated[i])
709 /* We write the code in this way, with the
710 update at the end, so that after the loop
711 we're left with the next character in C. */
712 c = java_get_unicode ();
718 /* @deprecated must be followed by a space or newline.
719 We also allow a '*' in case it appears just before
720 the end of a comment. In this position only we also
721 must allow any Unicode space character. */
722 if (c == ' ' || c == '\n' || c == '*' || java_space_char_p (c))
/* Remember the tag for the declaration that follows the comment.  */
725 ctxp->deprecated = 1;
730 /* We've examined the relevant content from this line. Now we
731 skip the remaining characters and start over with the next
732 line. We also check for end of comment here. */
733 while (c != '\n' && c != UEOF)
735 last_was_star = (c == '*');
736 c = java_get_unicode ();
737 if (last_was_star && c == '/')
743 /* We have to advance past the \n. */
744 c = java_get_unicode ();
/* Input ended while still inside the comment.  */
750 java_lex_error ("Comment not terminated at end of input", 0);
753 /* Return true if C is a valid start character for a Java identifier.
754 This is only called if C >= 128 -- smaller values are handled
755 inline. However, this function handles all values anyway. */
757 java_start_char_p (unicode_t c)
/* The high byte of C selects an entry of type_table.  */
759 unsigned int hi = c / 256;
760 const char *const page = type_table[hi];
761 unsigned long val = (unsigned long) page;
/* An entry with bits set outside LETTER_MASK is used as a pointer to
   a table of per-character flags, indexed by the low byte of C.
   NOTE(review): the other branch is not visible here; presumably the
   entry value itself then supplies the flags for the whole page --
   confirm.  */
764 if ((val & ~ LETTER_MASK) != 0)
765 flags = page[c & 255];
/* Nonzero exactly when the LETTER_START flag is set.  */
769 return flags & LETTER_START;
772 /* Return true if C is a valid part character for a Java identifier.
773 This is only called if C >= 128 -- smaller values are handled
774 inline. However, this function handles all values anyway. */
776 java_part_char_p (unicode_t c)
/* The high byte of C selects an entry of type_table.  */
778 unsigned int hi = c / 256;
779 const char *const page = type_table[hi];
780 unsigned long val = (unsigned long) page;
/* An entry with bits set outside LETTER_MASK is used as a pointer to
   a table of per-character flags, indexed by the low byte of C.
   NOTE(review): the other branch is not visible here; presumably the
   entry value itself then supplies the flags for the whole page --
   confirm.  */
783 if ((val & ~ LETTER_MASK) != 0)
784 flags = page[c & 255];
/* Nonzero exactly when the LETTER_PART flag is set.  */
788 return flags & LETTER_PART;
791 /* Return true if C is whitespace, as recorded by the LETTER_SPACE
   flag in type_table.  */
793 java_space_char_p (unicode_t c)
/* The high byte of C selects an entry of type_table.  */
795 unsigned int hi = c / 256;
796 const char *const page = type_table[hi];
797 unsigned long val = (unsigned long) page;
/* An entry with bits set outside LETTER_MASK is used as a pointer to
   a table of per-character flags, indexed by the low byte of C.
   NOTE(review): the other branch is not visible here; presumably the
   entry value itself then supplies the flags for the whole page --
   confirm.  */
800 if ((val & ~ LETTER_MASK) != 0)
801 flags = page[c & 255];
/* Nonzero exactly when the LETTER_SPACE flag is set.  */
805 return flags & LETTER_SPACE;
/* Parse the rest of an escape sequence in a character or string
   literal; the callers invoke this after having seen the backslash.
   Returns the character value the escape denotes.  The simple escapes
   map to fixed values (0x8, 0x9, 0xa, 0xc, 0xd, 0x22, 0x27 and 0x5c),
   and a digit from '0' to '7' starts an octal escape.  For anything
   else an error is reported and JAVA_CHAR_ERROR is returned.  */
809 java_parse_escape_sequence (void)
813 switch (c = java_get_unicode ())
816 return (unicode_t)0x8;
818 return (unicode_t)0x9;
820 return (unicode_t)0xa;
822 return (unicode_t)0xc;
824 return (unicode_t)0xd;
826 return (unicode_t)0x22;
828 return (unicode_t)0x27;
830 return (unicode_t)0x5c;
831 case '0': case '1': case '2': case '3': case '4':
832 case '5': case '6': case '7':
835 unicode_t char_lit = 0;
839 /* According to the grammar, `\477' has a well-defined
840 meaning -- it is `\47' followed by `7'. */
/* Fold the current octal digit into the value.  */
846 char_lit = 8 * char_lit + c - '0';
/* Consume the lookahead only if it continues the octal number.  */
849 c = java_peek_unicode ();
850 if (! RANGE (c, '0', '7'))
852 java_next_unicode ();
858 java_lex_error ("Invalid character in escape sequence", -1);
859 return JAVA_CHAR_ERROR;
864 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
866 /* Subroutine of java_lex: converts floating-point literals to tree
867 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
868 store the result. FFLAG indicates whether the literal was tagged
869 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
870 is the column at which the literal starts; it is temporarily
   installed as the lexer column when an underflow is reported. */
872 static void java_perform_atof (YYSTYPE *, char *, int, int);
875 java_perform_atof (YYSTYPE *java_lval, char *literal_token, int fflag,
876 int number_beginning)
878 REAL_VALUE_TYPE value;
879 tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
/* Convert the text using the machine mode of the chosen type.  */
881 SET_REAL_VALUE_ATOF (value,
882 REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
/* An infinite or NaN result is reported as a range error.  */
884 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
886 JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
889 else if (IS_ZERO (value))
891 /* We check to see if the value is really 0 or if we've found an
892 underflow. We do this in the most primitive imaginable way. */
894 char *p = literal_token;
/* Scan the mantissa (everything before an exponent marker) for a
   character other than '0' and '.'.  */
897 while (*p && *p != 'e' && *p != 'E')
899 if (*p != '0' && *p != '.')
/* Point the diagnostic at the start of the literal, then restore the
   current column.  */
908 int save_col = ctxp->lexer->position.col;
909 ctxp->lexer->position.col = number_beginning;
910 java_lex_error ("Floating point literal underflow", 0);
911 ctxp->lexer->position.col = save_col;
/* Store the resulting constant through SET_LVAL_NODE.  */
915 SET_LVAL_NODE (build_real (type, value));
919 static int yylex (YYSTYPE *);
923 yylex (YYSTYPE *java_lval)
925 do_java_lex (YYSTYPE *java_lval)
931 /* Translation of the Unicode escape in the raw stream of Unicode
932 characters. Takes care of line terminator. */
934 /* Skip white spaces: SP, TAB and FF or ULT. */
937 c = java_peek_unicode ();
938 if (c != '\n' && ! JAVA_WHITE_SPACE_P (c))
940 java_next_unicode ();
943 /* Handle EOF here. */
944 if (c == UEOF) /* Should probably do something here... */
948 #ifdef USE_MAPPED_LOCATION
949 LINEMAP_POSITION_FOR_COLUMN (input_location, &line_table,
950 ctxp->lexer->position.col);
952 ctxp->lexer->token_start = ctxp->lexer->position;
956 /* Numeric literals. */
957 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
959 /* This section of code is borrowed from gcc/c-lex.c. */
960 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
961 int parts[TOTAL_PARTS];
962 HOST_WIDE_INT high, low;
963 /* End borrowed section. */
964 char literal_token [256];
965 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
966 int found_hex_digits = 0, found_non_octal_digits = -1;
969 int number_beginning = ctxp->lexer->position.col;
973 for (i = 0; i < TOTAL_PARTS; i++)
978 java_next_unicode ();
979 c = java_peek_unicode ();
980 if (c == 'x' || c == 'X')
983 java_next_unicode ();
984 c = java_peek_unicode ();
986 else if (JAVA_ASCII_DIGIT (c))
988 literal_token [literal_index++] = '0';
991 else if (c == '.' || c == 'e' || c =='E')
993 literal_token [literal_index++] = '0';
994 /* Handle C during floating-point parsing. */
998 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */
1002 java_next_unicode ();
1003 SET_LVAL_NODE (long_zero_node);
1004 return (INT_LIT_TK);
1006 java_next_unicode ();
1007 SET_LVAL_NODE (float_zero_node);
1010 java_next_unicode ();
1011 SET_LVAL_NODE (double_zero_node);
1014 SET_LVAL_NODE (integer_zero_node);
1015 return (INT_LIT_TK);
1019 /* Parse the first part of the literal, until we find something
1020 which is not a number. */
1021 while (radix == 16 ? JAVA_ASCII_HEXDIGIT (c) : JAVA_ASCII_DIGIT (c))
1023 /* We store in a string (in case it turns out to be a FP) and in
1024 PARTS if we have to process an integer literal. */
1025 int numeric = hex_value (c);
1028 /* Remember when we find a valid hexadecimal digit. */
1030 found_hex_digits = 1;
1031 /* Remember when we find an invalid octal digit. */
1032 else if (radix == 8 && numeric >= 8 && found_non_octal_digits < 0)
1033 found_non_octal_digits = literal_index;
1035 literal_token [literal_index++] = c;
1036 /* This section of code is borrowed from gcc/c-lex.c. */
1037 for (count = 0; count < TOTAL_PARTS; count++)
1039 parts[count] *= radix;
1042 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1043 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1046 parts[0] += numeric;
1048 if (parts [TOTAL_PARTS-1] != 0)
1050 /* End borrowed section. */
1051 java_next_unicode ();
1052 c = java_peek_unicode ();
1055 /* If we have something from the FP char set but not a digit, parse
1057 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1059 /* stage==0: seen digits only
1060 * stage==1: seen '.'
1061 * stage==2: seen 'e' or 'E'.
1062 * stage==3: seen '+' or '-' after 'e' or 'E'.
1063 * stage==4: seen type suffix ('f'/'F'/'d'/'D')
1066 int seen_digit = (literal_index ? 1 : 0);
1067 int seen_exponent = 0;
1068 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1069 double unless specified. */
1071 /* It is ok if the radix is 8 because this just means we've
1072 seen a leading `0'. However, radix==16 is invalid. */
1074 java_lex_error ("Can't express non-decimal FP literal", 0);
1084 literal_token [literal_index++ ] = c;
1085 java_next_unicode ();
1086 c = java_peek_unicode ();
1087 if (literal_index == 1 && !JAVA_ASCII_DIGIT (c))
1088 BUILD_OPERATOR (DOT_TK);
1091 java_lex_error ("Invalid character in FP literal", 0);
1094 if (c == 'e' || c == 'E')
1098 /* {E,e} must have seen at least a digit. */
1101 ("Invalid FP literal, mantissa must have digit", 0);
1105 literal_token [literal_index++] = c;
1106 java_next_unicode ();
1107 c = java_peek_unicode ();
1110 java_lex_error ("Invalid character in FP literal", 0);
1112 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1114 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1115 stage = 4; /* So we fall through. */
1118 if ((c=='-' || c =='+') && stage == 2)
1121 literal_token [literal_index++] = c;
1122 java_next_unicode ();
1123 c = java_peek_unicode ();
1126 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1127 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1128 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1129 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1131 if (JAVA_ASCII_DIGIT (c))
1135 literal_token [literal_index++ ] = c;
1136 java_next_unicode ();
1137 c = java_peek_unicode ();
1141 if (stage == 4) /* Don't push back fF/dD. */
1142 java_next_unicode ();
1144 /* An exponent (if any) must have seen a digit. */
1145 if (seen_exponent && !seen_digit)
1147 ("Invalid FP literal, exponent must have digit", 0);
1149 literal_token [literal_index] = '\0';
1152 java_perform_atof (java_lval, literal_token,
1153 fflag, number_beginning);
1158 } /* JAVA_ASCII_FPCHAR (c) */
1160 /* Here we get back to converting the integral literal. */
1161 if (radix == 16 && ! found_hex_digits)
1163 ("0x must be followed by at least one hexadecimal digit", 0);
1164 else if (radix == 8 && found_non_octal_digits >= 0)
1166 int back = literal_index - found_non_octal_digits;
1167 ctxp->lexer->position.col -= back;
1168 java_lex_error ("Octal literal contains digit out of range", 0);
1169 ctxp->lexer->position.col += back;
1171 else if (c == 'L' || c == 'l')
1173 java_next_unicode ();
1177 /* This section of code is borrowed from gcc/c-lex.c. */
1180 bytes = GET_TYPE_PRECISION (long_type_node);
1181 for (i = bytes; i < TOTAL_PARTS; i++)
1189 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1191 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1192 / HOST_BITS_PER_CHAR)]
1193 << (i * HOST_BITS_PER_CHAR));
1194 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1196 /* End borrowed section. */
1199 /* Range checking. */
1200 /* Temporarily set type to unsigned. */
1201 value = build_int_cst_wide (long_suffix
1202 ? unsigned_long_type_node
1203 : unsigned_int_type_node, low, high);
1204 SET_LVAL_NODE (value);
1206 /* For base 10 numbers, only values up to the highest value
1207 (plus one) can be written. For instance, only ints up to
1208 2147483648 can be written. The special case of the largest
1209 negative value is handled elsewhere. For other bases, any
1210 number can be represented. */
1211 if (overflow || (radix == 10
1212 && tree_int_cst_lt (long_suffix
1218 JAVA_RANGE_ERROR ("Numeric overflow for 'long' literal");
1220 JAVA_RANGE_ERROR ("Numeric overflow for 'int' literal");
1223 /* Sign extend the value. */
1224 value = build_int_cst_wide (long_suffix ? long_type_node : int_type_node,
1226 value = force_fit_type (value, 0, false, false);
1230 value = copy_node (value);
1231 JAVA_NOT_RADIX10_FLAG (value) = 1;
1234 SET_LVAL_NODE (value);
1239 /* We may have an ID here. */
1240 if (JAVA_START_CHAR_P (c))
1242 int ascii_index = 0, all_ascii = 1;
1244 /* Keyword, boolean literal or null literal. */
1245 while (c != UEOF && JAVA_PART_CHAR_P (c))
1247 java_unicode_2_utf8 (c);
1250 java_next_unicode ();
1252 c = java_peek_unicode ();
1255 obstack_1grow (&temporary_obstack, '\0');
1256 string = obstack_finish (&temporary_obstack);
1258 /* If we have something all ascii, we consider a keyword, a boolean
1259 literal, a null literal or an all ASCII identifier. Otherwise,
1260 this is an identifier (possibly not respecting formation rule). */
1263 const struct java_keyword *kw;
1264 if ((kw=java_keyword (string, ascii_index)))
1268 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1269 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1270 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1271 case PRIVATE_TK: case STRICT_TK:
1272 SET_MODIFIER_CTX (kw->token);
1275 SET_LVAL_NODE (float_type_node);
1278 SET_LVAL_NODE (double_type_node);
1281 SET_LVAL_NODE (boolean_type_node);
1284 SET_LVAL_NODE (byte_type_node);
1287 SET_LVAL_NODE (short_type_node);
1290 SET_LVAL_NODE (int_type_node);
1293 SET_LVAL_NODE (long_type_node);
1296 SET_LVAL_NODE (char_type_node);
1299 /* Keyword based literals. */
1302 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1303 boolean_true_node : boolean_false_node));
1306 SET_LVAL_NODE (null_pointer_node);
1312 BUILD_OPERATOR (kw->token);
1318 /* For some keywords we want to retain information on the location
1319 where they were found. */
1331 BUILD_OPERATOR (kw->token);
1339 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1343 java_next_unicode ();
1345 /* Character literals. */
1350 if ((c = java_get_unicode ()) == '\\')
1351 char_lit = java_parse_escape_sequence ();
1354 if (c == '\n' || c == '\'')
1355 java_lex_error ("Invalid character literal", 0);
1359 c = java_get_unicode ();
1361 if ((c == '\n') || (c == UEOF))
1362 java_lex_error ("Character literal not terminated at end of line", 0);
1364 java_lex_error ("Syntax error in character literal", 0);
1366 if (char_lit == JAVA_CHAR_ERROR)
1367 char_lit = 0; /* We silently convert it to zero. */
1369 SET_LVAL_NODE (build_int_cst (char_type_node, char_lit));
1373 /* String literals. */
1381 c = java_peek_unicode ();
1382 if (c == '\n' || c == UEOF) /* ULT. */
1384 java_lex_error ("String not terminated at end of line", 0);
1387 java_next_unicode ();
1391 c = java_parse_escape_sequence ();
1392 if (c == JAVA_CHAR_ERROR)
1395 c = 0; /* We silently convert it to zero. */
1397 java_unicode_2_utf8 (c);
1400 obstack_1grow (&temporary_obstack, '\0');
1401 string = obstack_finish (&temporary_obstack);
1403 if (!no_error || (c != '"'))
1404 java_lval->node = error_mark_node; /* FIXME: Requires further
1407 java_lval->node = build_string (strlen (string), string);
1409 obstack_free (&temporary_obstack, string);
1410 return STRING_LIT_TK;
1416 /* Check for comment. */
1417 switch (c = java_peek_unicode ())
1420 java_next_unicode ();
1423 c = java_get_unicode ();
1426 /* It is ok to end a `//' comment with EOF, unless
1427 we're being pedantic. */
1429 java_lex_error ("Comment not terminated at end of input",
1433 if (c == '\n') /* ULT */
1439 java_next_unicode ();
1440 if ((c = java_get_unicode ()) == '*')
1442 c = java_get_unicode ();
1445 /* Empty documentation comment. We have to reset
1446 the deprecation marker as only the most recent
1447 doc comment applies. */
1448 ctxp->deprecated = 0;
1451 java_parse_doc_section (c);
1454 java_parse_end_comment ((c = java_get_unicode ()));
1459 java_next_unicode ();
1460 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1463 BUILD_OPERATOR (DIV_TK);
1467 BUILD_OPERATOR (OP_TK);
1472 java_lval->operator.token = OCB_TK;
1473 java_lval->operator.location = BUILD_LOCATION();
1475 #ifdef USE_MAPPED_LOCATION
1476 if (ctxp->ccb_indent == 1)
1477 ctxp->first_ccb_indent1 = input_location;
1479 if (ctxp->ccb_indent == 1)
1480 ctxp->first_ccb_indent1 = input_line;
1486 java_lval->operator.token = CCB_TK;
1487 java_lval->operator.location = BUILD_LOCATION();
1490 #ifdef USE_MAPPED_LOCATION
1491 if (ctxp->ccb_indent == 1)
1492 ctxp->last_ccb_indent1 = input_location;
1494 if (ctxp->ccb_indent == 1)
1495 ctxp->last_ccb_indent1 = input_line;
1499 BUILD_OPERATOR (OSB_TK);
1507 BUILD_OPERATOR (DOT_TK);
1511 c = java_peek_unicode ();
1514 java_next_unicode ();
1515 BUILD_OPERATOR (EQ_TK);
1519 /* Equals is used in two different locations. In the
1520 variable_declarator: rule, it has to be seen as '=' as opposed
1521 to being seen as an ordinary assignment operator in
1522 assignment_operators: rule. */
1523 BUILD_OPERATOR (ASSIGN_TK);
1527 switch ((c = java_peek_unicode ()))
1530 java_next_unicode ();
1531 BUILD_OPERATOR (GTE_TK);
1533 java_next_unicode ();
1534 switch ((c = java_peek_unicode ()))
1537 java_next_unicode ();
1538 c = java_peek_unicode ();
1541 java_next_unicode ();
1542 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1546 BUILD_OPERATOR (ZRS_TK);
1549 java_next_unicode ();
1550 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1552 BUILD_OPERATOR (SRS_TK);
1555 BUILD_OPERATOR (GT_TK);
1559 switch ((c = java_peek_unicode ()))
1562 java_next_unicode ();
1563 BUILD_OPERATOR (LTE_TK);
1565 java_next_unicode ();
1566 if ((c = java_peek_unicode ()) == '=')
1568 java_next_unicode ();
1569 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1573 BUILD_OPERATOR (LS_TK);
1576 BUILD_OPERATOR (LT_TK);
1580 switch ((c = java_peek_unicode ()))
1583 java_next_unicode ();
1584 BUILD_OPERATOR (BOOL_AND_TK);
1586 java_next_unicode ();
1587 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1589 BUILD_OPERATOR (AND_TK);
1593 switch ((c = java_peek_unicode ()))
1596 java_next_unicode ();
1597 BUILD_OPERATOR (BOOL_OR_TK);
1599 java_next_unicode ();
1600 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1602 BUILD_OPERATOR (OR_TK);
1606 switch ((c = java_peek_unicode ()))
1609 java_next_unicode ();
1610 BUILD_OPERATOR (INCR_TK);
1612 java_next_unicode ();
1613 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1615 BUILD_OPERATOR (PLUS_TK);
1619 switch ((c = java_peek_unicode ()))
1622 java_next_unicode ();
1623 BUILD_OPERATOR (DECR_TK);
1625 java_next_unicode ();
1626 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1628 BUILD_OPERATOR (MINUS_TK);
1632 if ((c = java_peek_unicode ()) == '=')
1634 java_next_unicode ();
1635 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1639 BUILD_OPERATOR (MULT_TK);
1643 if ((c = java_peek_unicode ()) == '=')
1645 java_next_unicode ();
1646 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1650 BUILD_OPERATOR (XOR_TK);
1654 if ((c = java_peek_unicode ()) == '=')
1656 java_next_unicode ();
1657 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1661 BUILD_OPERATOR (REM_TK);
1665 if ((c = java_peek_unicode()) == '=')
1667 java_next_unicode ();
1668 BUILD_OPERATOR (NEQ_TK);
1672 BUILD_OPERATOR (NEG_TK);
1676 BUILD_OPERATOR (REL_QM_TK);
1678 BUILD_OPERATOR (REL_CL_TK);
1680 BUILD_OPERATOR (NOT_TK);
1683 if (c == 0x1a) /* CTRL-Z. */
1685 if ((c = java_peek_unicode ()) == UEOF)
1686 return 0; /* Ok here. */
1689 /* Everything else is an invalid character in the input. */
1691 char lex_error_buffer [128];
1692 sprintf (lex_error_buffer, "Invalid character '%s' in input",
1693 java_sprint_unicode (c));
1694 java_lex_error (lex_error_buffer, -1);
1701 /* The exported interface to the lexer. */
/* Hands JAVA_LVAL to do_java_lex and keeps its result in `r'.  The call
   is bracketed by timevar_push (TV_LEX) / timevar_pop (TV_LEX),
   presumably so that the time spent lexing is accounted under TV_LEX.  */
1703 java_lex (YYSTYPE *java_lval)
1707 timevar_push (TV_LEX);
1708 r = do_java_lex (java_lval);
1709 timevar_pop (TV_LEX);
1713 /* This is called by the parser to see if an error should be generated
1714 due to numeric overflow. This function only handles the particular
1715 case of the largest negative value, and is only called in the case
1716 where this value is not preceded by `-'. */
1718 error_if_numeric_overflow (tree value)
/* Only integer constants that are not flagged JAVA_NOT_RADIX10_FLAG and
   whose sign (tree_int_cst_sgn) is negative are diagnosed.  */
1720 if (TREE_CODE (value) == INTEGER_CST
1721 && !JAVA_NOT_RADIX10_FLAG (value)
1722 && tree_int_cst_sgn (value) < 0)
/* The wording names the literal's type: 'long' when TREE_TYPE is
   long_type_node, 'int' being the other case.  Both calls pass 0 as the
   FORWARD argument of java_lex_error, i.e. no column adjustment.  */
1724 if (TREE_TYPE (value) == long_type_node)
1725 java_lex_error ("Numeric overflow for 'long' literal", 0);
1727 java_lex_error ("Numeric overflow for 'int' literal", 0);
1731 #endif /* JC1_LITE */
/* Append the UTF-8 encoding of UNICODE to temporary_obstack, one byte
   per obstack_1grow call:
     - RANGE (unicode, 0x01, 0x7f): a single byte holding the value;
     - RANGE (unicode, 0x80, 0x7ff), and also the value 0: two bytes;
     - everything else: three bytes.
   Because 0 takes the two-byte branch it is emitted as 0xc0 0x80, so
   this function never grows the obstack with a zero byte.  */
1734 java_unicode_2_utf8 (unicode_t unicode)
1736 if (RANGE (unicode, 0x01, 0x7f))
1737 obstack_1grow (&temporary_obstack, (char)unicode);
1738 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
/* Lead byte: 0xc0 plus bits 6-10 of UNICODE.  */
1740 obstack_1grow (&temporary_obstack,
1741 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
/* Continuation byte: 0x80 plus the low six bits.  */
1742 obstack_1grow (&temporary_obstack,
1743 (unsigned char)(0x80 | (unicode & 0x3f)));
1745 else /* Range 0x800-0xffff. */
/* `>>' binds tighter than `|', so each expression below is
   PREFIX | ((unicode & MASK) >> SHIFT): bits 12-15, then bits 6-11,
   then bits 0-5.  */
1747 obstack_1grow (&temporary_obstack,
1748 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1749 obstack_1grow (&temporary_obstack,
1750 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1751 obstack_1grow (&temporary_obstack,
1752 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Replace NODE by the result of build_expr_wfl applied to it.  When
   USE_MAPPED_LOCATION is defined the position passed is input_location;
   the other call passes ctxp->filename together with the line and
   column of ctxp->lexer->token_start.  The TREE_TYPE of the resulting
   node is then cleared.  */
1758 build_wfl_node (tree node)
1760 #ifdef USE_MAPPED_LOCATION
1761 node = build_expr_wfl (node, input_location);
1763 node = build_expr_wfl (node, ctxp->filename,
1764 ctxp->lexer->token_start.line,
1765 ctxp->lexer->token_start.col);
1767 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1768 TREE_TYPE (node) = NULL_TREE;
/* Set up the source position used when reporting the lexer error MSG.
   FORWARD is multiplied by ctxp->lexer->next_columns and added to the
   current column; the callers in this file pass 0 (no adjustment) or
   -1 (move the column back by next_columns).

   In both configurations the position that is overwritten is saved
   first and put back at the end, and ctxp->java_error_flag is cleared
   in between.  */
1774 java_lex_error (const char *msg ATTRIBUTE_UNUSED, int forward ATTRIBUTE_UNUSED)
1777 int col = (ctxp->lexer->position.col
1778 + forward * ctxp->lexer->next_columns);
/* NOTE(review): this tests USE_MAPPED_LOCATION with `#if', whereas
   build_wfl_node and the CCB_TK handling in do_java_lex use `#ifdef'.
   The two differ when the macro is defined empty or defined to 0 --
   confirm which form is intended.  */
1779 #if USE_MAPPED_LOCATION
1780 source_location save_location = input_location;
/* Presumably repositions input_location at column COL of the current
   line; input_location is restored from save_location below.  */
1781 LINEMAP_POSITION_FOR_COLUMN (input_location, &line_table, col);
1783 /* Might be caught in the middle of some error report. */
1784 ctxp->java_error_flag = 0;
1787 input_location = save_location;
/* Non-mapped variant: temporarily make token_start describe the current
   line and the adjusted column.  */
1789 java_lc save = ctxp->lexer->token_start;
1790 ctxp->lexer->token_start.line = ctxp->lexer->position.line;
1791 ctxp->lexer->token_start.col = col;
1793 /* Might be caught in the middle of some error report. */
1794 ctxp->java_error_flag = 0;
1797 ctxp->lexer->token_start = save;
/* NOTE(review): presumably tells whether C, just read from FP, ends a
   line.  `next' looks like a one-character look-ahead taken from FP
   that gets special handling when it is neither '\n' nor EOF -- confirm
   against the full body.  */
1804 java_is_eol (FILE *fp, int c)
1811 if (next != '\n' && next != EOF)
/* Build, in temporary_obstack, a two-line excerpt for a diagnostic: the
   text of line LINE of FILENAME, a newline, then padding followed by a
   '^' marking column COL.  The finished obstack object is returned.

   FILENAME is opened with fopen; failure to open it is reported through
   fatal_error.  If the file ends before LINE is reached, the text
   "<<file too short - unexpected EOF>>" is placed in the obstack.  */
1823 java_get_line_col (const char *filename ATTRIBUTE_UNUSED,
1824 int line ATTRIBUTE_UNUSED, int col ATTRIBUTE_UNUSED)
1829 /* Dumb implementation. Doesn't try to cache or optimize things. */
1830 /* First line of the file is line 1, first column is 1. */
1832 /* COL == -1 means, at the CR/LF in LINE. */
1833 /* COL == -2 means, at the first non space char in LINE. */
1836 int c, ccol, cline = 1;
1837 int current_line_col = 0;
1838 int first_non_space = 0;
1841 if (!(fp = fopen (filename, "r")))
1842 fatal_error ("can't open %s: %m", filename);
/* Read forward until the requested line is the current one.  */
1844 while (cline != line)
/* sizeof (msg) - 1 leaves out the terminating NUL of the message.  */
1849 static const char msg[] = "<<file too short - unexpected EOF>>";
1850 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1853 if (java_is_eol (fp, c))
1857 /* Gather the chars of the current line in a buffer. */
1861 if (c < 0 || java_is_eol (fp, c))
/* Remember the column of the first character that is not white space;
   it is only recorded while first_non_space is still 0.  */
1863 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1864 first_non_space = current_line_col;
1865 obstack_1grow (&temporary_obstack, c);
1870 obstack_1grow (&temporary_obstack, '\n');
/* Resolve the special COL values described above: the end-of-line
   column comes from current_line_col, the first-non-space column from
   first_non_space.  */
1874 col = current_line_col;
1875 first_non_space = 0;
1878 col = first_non_space;
1880 first_non_space = 0;
1882 /* Place the '^' at the right position. */
1883 base = obstack_base (&temporary_obstack);
/* COL + 2 padding characters are emitted before the caret.  */
1884 for (col += 2, ccol = 0; ccol < col; ccol++)
1886 /* Compute \t when reaching first_non_space. */
/* While first_non_space is still non-zero, a tab in the gathered text
   is repeated as a tab in the padding; every other padding character is
   a space.  Note that this `c' shadows the outer int `c'.  */
1887 char c = (first_non_space ?
1888 (base [ccol] == '\t' ? '\t' : ' ') : ' ');
1889 obstack_1grow (&temporary_obstack, c);
/* obstack_grow0 appends the caret plus a terminating NUL.  */
1891 obstack_grow0 (&temporary_obstack, "^", 1);
1894 return obstack_finish (&temporary_obstack);
/* Compare the LENGTH bytes of UTF-8 text at STR with the NUL-terminated
   string NAME.  STR is decoded one character at a time with UTF8_GET,
   bounded by STR + LENGTH, and matched against successive chars of
   NAME.  The result is `ch - name[i]' for a character pair (presumably
   the first one that differs -- confirm); once NAME is exhausted it is
   0 if STR has been consumed up to its limit and 1 otherwise.  */
1900 utf8_cmp (const unsigned char *str, int length, const char *name)
1902 const unsigned char *limit = str + length;
1905 for (i = 0; name[i]; ++i)
/* Presumably UTF8_GET advances STR past the decoded character; the
   final comparison of STR with LIMIT relies on that.  */
1907 int ch = UTF8_GET (str, limit);
1909 return ch - name[i];
/* NAME ran out: equal only if STR has no bytes left.  */
1912 return str == limit ? 0 : 1;
1915 /* A sorted list of all C++ keywords. */
/* The ordering matters: cxx_keyword_p looks names up by repeatedly
   taking the midpoint of an index range over this array.  */
1917 static const char *const cxx_keywords[] =
2025 /* Return true if NAME is a C++ keyword. */
/* NAME is LENGTH bytes long and is compared as UTF-8 via utf8_cmp.  The
   table is searched by midpoint between `first' and `last', `last'
   starting at the number of entries in cxx_keywords.  A keyword
   followed only by `$' characters is accepted as a match too.  */
2028 cxx_keyword_p (const char *name, int length)
2030 int last = ARRAY_SIZE (cxx_keywords);
/* The for-loop initialiser below computes the same midpoint again.  */
2032 int mid = (last + first) / 2;
2035 for (mid = (last + first) / 2;
2037 old = mid, mid = (last + first) / 2)
/* Compare only as many bytes as the shorter of NAME and the candidate
   keyword.  */
2039 int kwl = strlen (cxx_keywords[mid]);
2040 int min_length = kwl > length ? length : kwl;
2041 int r = utf8_cmp ((const unsigned char *) name, min_length, cxx_keywords[mid]);
2046 /* We've found a match if all the remaining characters are `$'. */
/* Skip over any `$' characters that follow the compared prefix.  */
2047 for (i = min_length; i < length && name[i] == '$'; ++i)
2061 #endif /* JC1_LITE */