1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
4 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to
20 the Free Software Foundation, 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA.
23 Java and all Java-based marks are trademarks or registered trademarks
24 of Sun Microsystems, Inc. in the United States and other countries.
25 The Free Software Foundation is independent of Sun Microsystems, Inc. */
27 /* It defines java_lex (yylex) that reads a Java ASCII source file
28 possibly containing Unicode escape sequences or UTF-8 encoded
29 characters and returns a token for everything found but comments,
30 white spaces and line terminators. When necessary, it also fills
31 the java_lval (yylval) union. It's implemented to be called by a
32 re-entrant parser generated by Bison.
34 The lexical analysis conforms to the Java grammar described in "The
35 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
36 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
40 #include "chartables.h"
45 /* Function declarations. */
/* Forward declarations for the lexer's file-local helpers, so that they
   can be used before their definitions appear.  */
46 static char *java_sprint_unicode (struct java_line *, int);
47 static void java_unicode_2_utf8 (unicode_t);
48 static void java_lex_error (const char *, int);
50 static int do_java_lex (YYSTYPE *);
51 static int java_lex (YYSTYPE *);
52 static int java_is_eol (FILE *, int);
53 static tree build_wfl_node (tree);
55 static void java_store_unicode (struct java_line *, unicode_t, int);
56 static int java_parse_escape_sequence (void);
57 static int java_start_char_p (unicode_t);
58 static int java_part_char_p (unicode_t);
59 static int java_space_char_p (unicode_t);
60 static void java_parse_doc_section (int);
61 static void java_parse_end_comment (int);
62 static int java_get_unicode (void);
63 static int java_read_unicode (java_lexer *, int *);
64 static int java_read_unicode_collapsing_terminators (java_lexer *, int *);
/* NOTE(review): java_store_unicode was already declared above with an
   identical prototype; this repeat is harmless but redundant.  */
65 static void java_store_unicode (struct java_line *, unicode_t, int);
66 static int java_read_char (java_lexer *);
67 static void java_allocate_new_line (void);
68 static void java_unget_unicode (void);
69 static unicode_t java_sneak_unicode (void);
71 static int utf8_cmp (const unsigned char *, int, const char *);
/* Unlike the helpers above, java_new_lexer is not declared static.  */
74 java_lexer *java_new_lexer (FILE *, const char *);
76 static void error_if_numeric_overflow (tree);
80 /* This is nonzero if we have initialized `need_byteswap'. */
81 static int byteswap_init = 0;
83 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
84 big-endian order -- not native endian order. We handle this by
85 doing a conversion once at startup and seeing what happens. This
86 flag holds the results of this determination. */
/* java_new_lexer assigns this flag from a trial UTF-8 => UCS-2 conversion
   of the character 0xfeff (nonzero when the result does not read back as
   0xfeff) and copies it into each new lexer's byte_swap field.  */
87 static int need_byteswap = 0;
/* Set up lexer and parser-context state before reading an input file.
   FINPUT is the stream to lex and ENCODING names its character encoding;
   both are handed to java_new_lexer at the end.  Along the way this
   interns the identifiers used below, builds the shared WFL nodes and
   clears the per-file fields of the current parser context CTXP.  */
91 java_init_lex (FILE *finput, const char *encoding)
94 int java_lang_imported = 0;
97 java_lang_id = get_identifier ("java.lang");
99 inst_id = get_identifier ("inst$");
101 wpv_id = get_identifier ("write_parm_value$");
/* Put java.lang at the head of the context's import-on-demand list.  */
103 if (!java_lang_imported)
105 tree node = build_tree_list
106 (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
107 read_import_dir (TREE_PURPOSE (node));
108 TREE_CHAIN (node) = ctxp->import_demand_list;
109 ctxp->import_demand_list = node;
110 java_lang_imported = 1;
114 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
116 label_id = get_identifier ("$L");
118 wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
/* The string buffer class name depends on flag_emit_class_files; the node
   is only built when wfl_string_buffer is not set yet.  */
119 if (!wfl_string_buffer)
121 build_expr_wfl (get_identifier (flag_emit_class_files
122 ? "java.lang.StringBuffer"
123 : "gnu.gcj.runtime.StringBuffer"),
126 wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
/* Start with empty initializer lists and a cleared modifier context.  */
128 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
129 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
131 memset (ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
132 current_jcf = ggc_alloc_cleared (sizeof (JCF));
133 ctxp->current_parsed_class = NULL;
134 ctxp->package = NULL_TREE;
137 ctxp->filename = input_filename;
/* Line counting restarts at 0; java_allocate_new_line pre-increments
   input_line when it numbers a line.  */
138 ctxp->lineno = input_line = 0;
141 ctxp->java_error_flag = 0;
142 ctxp->lexer = java_new_lexer (finput, encoding);
/* Render character I of LINE into a function-local static buffer: as a
   `\uXXXX' style escape when the character was flagged as coming from a
   Unicode escape or its value is greater than 128, otherwise as the
   character itself.  Because the buffer is static, every call overwrites
   the text produced by the previous one.  */
146 java_sprint_unicode (struct java_line *line, int i)
148 static char buffer [10];
149 if (line->unicode_escape_p [i] || line->line [i] > 128)
150 sprintf (buffer, "\\u%04x", line->line [i]);
153 buffer [0] = line->line [i];
/* Peek at the character at the current read position of the current line
   without advancing that position.  */
160 java_sneak_unicode (void)
162 return (ctxp->c_line->line [ctxp->c_line->current]);
/* Push the most recently read character back: the read position of the
   current line moves back by one and the column counter is reduced by
   JAVA_COLUMN_DELTA (0).  A read position of zero means there is nothing
   to push back; how that case is handled is not visible in this excerpt.  */
166 java_unget_unicode (void)
168 if (!ctxp->c_line->current)
169 /* Can't unget unicode. */
172 ctxp->c_line->current--;
173 ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
/* Prepare CTXP->c_line to receive a new line of input.  The look-ahead
   character saved on the outgoing line is carried over, the outgoing line
   may become the previous line (p_line), and the new line is numbered
   with the next value of input_line.
   NOTE(review): several guarding conditions are not visible in this
   excerpt; confirm them against the full file.  */
177 java_allocate_new_line (void)
/* Remember the look-ahead character and its escape flag, if a current
   line exists.  */
179 unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
180 char ahead_escape_p = (ctxp->c_line ?
181 ctxp->c_line->unicode_escape_ahead_p : 0);
/* A current line that is not white space only is kept as the previous
   line; the old previous line's buffers are released.  */
183 if (ctxp->c_line && !ctxp->c_line->white_space_only)
187 free (ctxp->p_line->unicode_escape_p);
188 free (ctxp->p_line->line);
191 ctxp->p_line = ctxp->c_line;
192 ctxp->c_line = NULL; /* Reallocated. */
/* Fresh line structure whose two parallel buffers hold JAVA_LINE_MAX
   entries each.  */
197 ctxp->c_line = xmalloc (sizeof (struct java_line));
198 ctxp->c_line->max = JAVA_LINE_MAX;
199 ctxp->c_line->line = xmalloc (sizeof (unicode_t)*ctxp->c_line->max);
200 ctxp->c_line->unicode_escape_p =
201 xmalloc (sizeof (char)*ctxp->c_line->max);
202 ctxp->c_line->white_space_only = 0;
/* Empty the line and rewind the read position and column.  */
205 ctxp->c_line->line [0] = ctxp->c_line->size = 0;
206 ctxp->c_line->char_col = ctxp->c_line->current = 0;
/* Seed the line with the carried-over look-ahead character.  */
209 ctxp->c_line->line [ctxp->c_line->size] = ahead;
210 ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
211 ctxp->c_line->size++;
213 ctxp->c_line->ahead [0] = 0;
214 ctxp->c_line->unicode_escape_ahead_p = 0;
/* Advance the global line counter and stamp the line with it; the line
   starts out flagged as white space only.  */
215 ctxp->c_line->lineno = ++input_line;
216 ctxp->c_line->white_space_only = 1;
219 /* Create a new lexer object. */
/* FINPUT is the stream the lexer reads and ENCODING names its character
   encoding.  Where iconv is available a converter from ENCODING to UCS-2
   is opened; the fallback path below marks the lexer as using the
   internal decoder and treats the input as UTF-8.  An encoding that
   cannot be handled is reported through fatal_error.  */
222 java_new_lexer (FILE *finput, const char *encoding)
224 java_lexer *lex = xmalloc (sizeof (java_lexer));
227 lex->finput = finput;
229 lex->unget_value = 0;
231 lex->encoding = encoding;
/* Ask iconv for a converter from ENCODING to UCS-2.  */
234 lex->handle = iconv_open ("UCS-2", encoding);
235 if (lex->handle != (iconv_t) -1)
241 lex->read_anything = 0;
242 lex->use_fallback = 0;
244 /* Work around broken iconv() implementations by doing checking at
245 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
246 then all UCS-2 encoders will be broken. Perhaps not a valid
254 handle = iconv_open ("UCS-2", "UTF-8");
255 if (handle != (iconv_t) -1)
262 /* This is the UTF-8 encoding of \ufeff. */
269 outp = (char *) &result;
272 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
274 iconv_close (handle);
275 /* Conversion must be complete for us to use the result. */
276 if (r != (size_t) -1 && inc == 0 && outc == 0)
277 need_byteswap = (result != 0xfeff);
/* Each lexer keeps its own copy of the byte-swap decision.  */
281 lex->byte_swap = need_byteswap;
284 #endif /* HAVE_ICONV */
286 /* If iconv failed, use the internal decoder if the default
287 encoding was requested. This code is used on platforms where
288 iconv exists but is insufficient for our needs. For
289 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
291 On Solaris the default encoding, as returned by nl_langinfo(),
292 is `646' (aka ASCII), but the Solaris iconv_open() doesn't
293 understand that. We work around that by pretending
294 `646' to be the same as UTF-8. */
295 if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
/* Internal decoder: the input is treated as UTF-8.  */
300 lex->use_fallback = 1;
301 lex->encoding = "UTF-8";
303 #endif /* HAVE_ICONV */
307 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
/* Release resources held by LEX.  The iconv handle is closed only when
   the lexer was not using the internal fallback decoder.  */
313 java_destroy_lexer (java_lexer *lex)
316 if (! lex->use_fallback)
317 iconv_close (lex->handle);
/* Read one character from LEX.  A pushed-back character in unget_value is
   consumed first.  Otherwise, when the fallback decoder is not in use,
   bytes read from lex->finput are converted with iconv into out_buffer
   and characters are taken from there.  In the fallback case the stream
   is decoded as UTF-8 by hand; only one-, two- and three-byte sequences
   are accepted and anything else is reported with java_lex_error.  */
323 java_read_char (java_lexer *lex)
/* A pending pushed-back character is taken and its slot cleared.  */
325 if (lex->unget_value)
327 unicode_t r = lex->unget_value;
328 lex->unget_value = 0;
/* iconv path: convert buffered input bytes into out_buffer.  */
333 if (! lex->use_fallback)
335 size_t ir, inbytesleft, in_save, out_count, out_save;
339 /* If there is data which has already been converted, use it. */
340 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
347 /* See if we need to read more data. If FIRST == 0 then
348 the previous conversion attempt ended in the middle of
349 a character at the end of the buffer. Otherwise we
350 only have to read if the buffer is empty. */
351 if (lex->first == 0 || lex->first >= lex->last)
355 if (lex->first >= lex->last)
360 if (feof (lex->finput))
362 r = fread (&lex->buffer[lex->last], 1,
363 sizeof (lex->buffer) - lex->last,
368 inbytesleft = lex->last - lex->first;
369 out_count = sizeof (lex->out_buffer) - lex->out_last;
371 if (inbytesleft == 0)
373 /* We've tried to read and there is nothing left. */
/* Save both counts: iconv updates them in place, and the differences are
   used afterwards to advance first and out_last.  */
377 in_save = inbytesleft;
378 out_save = out_count;
379 inp = &lex->buffer[lex->first];
380 outp = (char *) &lex->out_buffer[lex->out_last];
381 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
382 &inbytesleft, &outp, &out_count);
384 /* If we haven't read any bytes, then look to see if we
386 if (! lex->read_anything && out_save - out_count >= 2)
388 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
394 else if (uc == 0xfffe)
399 lex->read_anything = 1;
405 for (i = 0; i < out_save - out_count; i += 2)
407 char t = lex->out_buffer[lex->out_last + i];
408 lex->out_buffer[lex->out_last + i]
409 = lex->out_buffer[lex->out_last + i + 1];
410 lex->out_buffer[lex->out_last + i + 1] = t;
414 lex->first += in_save - inbytesleft;
415 lex->out_last += out_save - out_count;
417 /* If we converted anything at all, move along. */
418 if (out_count != out_save)
421 if (ir == (size_t) -1)
425 /* This is ok. This means that the end of our buffer
426 is in the middle of a character sequence. We just
427 move the valid part of the buffer to the beginning
429 memmove (&lex->buffer[0], &lex->buffer[lex->first],
430 lex->last - lex->first);
431 lex->last -= lex->first;
436 /* A more serious error. */
439 "Unrecognized character for encoding '%s'",
441 java_lex_error (buffer, 0);
448 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
450 /* Don't have any data. */
455 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
460 #endif /* HAVE_ICONV */
/* Fallback path: decode UTF-8 directly, one byte at a time.  */
463 c = getc (lex->finput);
468 return (unicode_t) c;
/* A lead byte of the form 110xxxxx starts a two-byte sequence.  */
471 if ((c & 0xe0) == 0xc0)
473 c1 = getc (lex->finput);
474 if ((c1 & 0xc0) == 0x80)
476 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
477 /* Check for valid 2-byte characters. We explicitly
478 allow \0 because this encoding is common in the
480 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
484 else if ((c & 0xf0) == 0xe0)
486 c1 = getc (lex->finput);
487 if ((c1 & 0xc0) == 0x80)
489 c2 = getc (lex->finput);
490 if ((c2 & 0xc0) == 0x80)
492 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
495 /* Check for valid 3-byte characters.
496 Don't allow surrogate, \ufffe or \uffff. */
497 if (IN_RANGE (r, 0x800, 0xffff)
498 && ! IN_RANGE (r, 0xd800, 0xdfff)
499 && r != 0xfffe && r != 0xffff)
505 /* We simply don't support invalid characters. We also
506 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
507 cannot be valid Java characters. */
508 java_lex_error ("malformed UTF-8 character", 0);
512 /* We only get here on error. */
/* Append character C to line L and record, in the parallel
   unicode_escape_p array, whether it came from a Unicode escape
   (UNICODE_ESCAPE_P).  When the line is full both arrays are grown by
   JAVA_LINE_MAX entries first.  */
517 java_store_unicode (struct java_line *l, unicode_t c, int unicode_escape_p)
519 if (l->size == l->max)
521 l->max += JAVA_LINE_MAX;
522 l->line = xrealloc (l->line, sizeof (unicode_t)*l->max);
523 l->unicode_escape_p = xrealloc (l->unicode_escape_p,
524 sizeof (char)*l->max);
/* Store the character, then its flag; the flag store advances size.  */
526 l->line [l->size] = c;
527 l->unicode_escape_p [l->size++] = unicode_escape_p;
/* Read the next character from LEX, translating a `\u' escape into the
   character it denotes.  *UNICODE_ESCAPE_P is set to 0 on entry and to 1
   when an escape has been translated.  The parity of lex->bs_count is
   used to decide whether a backslash can start an escape.  A backslash
   that does not start one is returned as itself, with the character read
   after it pushed back through lex->unget_value.  Malformed escapes are
   reported with java_lex_error.  */
531 java_read_unicode (java_lexer *lex, int *unicode_escape_p)
535 c = java_read_char (lex);
536 *unicode_escape_p = 0;
545 if ((lex->bs_count) % 2 == 1)
547 /* Odd number of \ seen. */
548 c = java_read_char (lex);
551 unicode_t unicode = 0;
554 /* Recognize any number of `u's in \u. */
555 while ((c = java_read_char (lex)) == 'u')
563 java_lex_error ("prematurely terminated \\u sequence", 0);
/* Each hexadecimal digit is merged into the result at the current shift.  */
568 unicode |= (unicode_t)(hex_value (c) << shift);
571 java_lex_error ("non-hex digit in \\u sequence", 0);
575 c = java_read_char (lex);
581 lex->unget_value = c;
584 *unicode_escape_p = 1;
/* Not an escape: keep the character just read for the next call and hand
   back the backslash itself.  */
587 lex->unget_value = c;
589 return (unicode_t) '\\';
/* Wrapper around java_read_unicode that folds a two-character line
   terminator into a single one.  LEX and UNICODE_ESCAPE_P are passed
   through to java_read_unicode for the first character read.  */
593 java_read_unicode_collapsing_terminators (java_lexer *lex,
594 int *unicode_escape_p)
596 int c = java_read_unicode (lex, unicode_escape_p);
600 /* We have to read ahead to see if we got \r\n. In that case we
601 return a single line terminator. */
603 c = java_read_unicode (lex, &dummy);
/* A look-ahead character that is neither a newline nor end of input is
   kept for the next read.  */
604 if (c != '\n' && c != UEOF)
605 lex->unget_value = c;
606 /* In either case we must return a newline. */
/* Return the next character of the input.  Characters are handed out
   from the current line buffer (ctxp->c_line).  When there is no current
   line, or it has been consumed entirely, a new line is allocated and
   filled by reading up to the next newline or end of input.  */
614 java_get_unicode (void)
616 /* It's time to read a line when... */
617 if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
622 if (ctxp->lexer->hit_eof)
625 java_allocate_new_line ();
/* NOTE(review): line[0] holds the look-ahead carried over by
   java_allocate_new_line; presumably a newline there means the line is
   already complete -- confirm against the full file.  */
626 if (ctxp->c_line->line[0] != '\n')
630 int unicode_escape_p;
631 c = java_read_unicode_collapsing_terminators (ctxp->lexer,
636 java_store_unicode (ctxp->c_line, c, unicode_escape_p);
637 if (ctxp->c_line->white_space_only
638 && !JAVA_WHITE_SPACE_P (c)
640 ctxp->c_line->white_space_only = 0;
642 if ((c == '\n') || (c == UEOF))
/* End of input with nothing else found: remember it in hit_eof, which is
   checked above before another line is read.  */
646 if (c == UEOF && ! found_chars)
648 ctxp->lexer->hit_eof = 1;
/* Hand out the next buffered character, updating the column first.  */
653 ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
654 JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
655 return ctxp->c_line->line [ctxp->c_line->current++];
658 /* Parse the end of a C style comment.
659 * C is the first character following the '/' and '*'. */
/* Characters are consumed with java_get_unicode.  Reaching the end of the
   input before the comment is closed is reported with java_lex_error.  */
661 java_parse_end_comment (int c)
663 for ( ;; c = java_get_unicode ())
668 java_lex_error ("Comment not terminated at end of input", 0);
671 switch (c = java_get_unicode ())
674 java_lex_error ("Comment not terminated at end of input", 0);
/* A star seen here is pushed back so that it is examined again.  */
678 case '*': /* Reparse only '*'. */
679 java_unget_unicode ();
685 /* Parse the documentation section. Keywords must be at the beginning
686 of a documentation comment line (ignoring white space and any `*'
687 character). Parsed keyword(s): @DEPRECATED. */
/* C is the first character to examine.  The tag is matched against the
   lowercase literal "@deprecated"; when it is found and followed by an
   accepted separator, ctxp->deprecated is set to 1.  Reaching the end of
   the input inside the comment is reported with java_lex_error.  */
690 java_parse_doc_section (int c)
694 /* We reset this here, because only the most recent doc comment
695 applies to the following declaration. */
696 ctxp->deprecated = 0;
698 /* We loop over all the lines of the comment. We'll eventually exit
699 if we hit EOF prematurely, or when we see the comment
703 /* These first steps need only be done if we're still looking
704 for the deprecated tag. If we've already seen it, we might
705 as well skip looking for it again. */
706 if (! ctxp->deprecated)
708 /* Skip whitespace and '*'s. We must also check for the end
709 of the comment here. */
710 while (JAVA_WHITE_SPACE_P (c) || c == '*')
712 last_was_star = (c == '*');
713 c = java_get_unicode ();
714 if (last_was_star && c == '/')
716 /* We just saw the comment terminator. */
/* The tag is compared one character at a time against this literal.  */
726 const char *deprecated = "@deprecated";
729 for (i = 0; deprecated[i]; ++i)
731 if (c != deprecated[i])
733 /* We write the code in this way, with the
734 update at the end, so that after the loop
735 we're left with the next character in C. */
736 c = java_get_unicode ();
742 /* @deprecated must be followed by a space or newline.
743 We also allow a '*' in case it appears just before
744 the end of a comment. In this position only we also
745 must allow any Unicode space character. */
746 if (c == ' ' || c == '\n' || c == '*' || java_space_char_p (c))
749 ctxp->deprecated = 1;
754 /* We've examined the relevant content from this line. Now we
755 skip the remaining characters and start over with the next
756 line. We also check for end of comment here. */
757 while (c != '\n' && c != UEOF)
759 last_was_star = (c == '*');
760 c = java_get_unicode ();
761 if (last_was_star && c == '/')
767 /* We have to advance past the \n. */
768 c = java_get_unicode ();
774 java_lex_error ("Comment not terminated at end of input", 0);
777 /* Return true if C is a valid start character for a Java identifier.
778 This is only called if C >= 128 -- smaller values are handled
779 inline. However, this function handles all values anyway. */
/* type_table is indexed by the high byte of C.  When the entry has bits
   set outside LETTER_MASK it is used as a page indexed by the low byte of
   C; the handling of the other case is not visible in this excerpt.  The
   result is the LETTER_START bit of the flags obtained.  */
781 java_start_char_p (unicode_t c)
783 unsigned int hi = c / 256;
784 const char *const page = type_table[hi];
785 unsigned long val = (unsigned long) page;
788 if ((val & ~ LETTER_MASK) != 0)
789 flags = page[c & 255];
793 return flags & LETTER_START;
796 /* Return true if C is a valid part character for a Java identifier.
797 This is only called if C >= 128 -- smaller values are handled
798 inline. However, this function handles all values anyway. */
/* type_table is indexed by the high byte of C.  When the entry has bits
   set outside LETTER_MASK it is used as a page indexed by the low byte of
   C; the handling of the other case is not visible in this excerpt.  The
   result is the LETTER_PART bit of the flags obtained.  */
800 java_part_char_p (unicode_t c)
802 unsigned int hi = c / 256;
803 const char *const page = type_table[hi];
804 unsigned long val = (unsigned long) page;
807 if ((val & ~ LETTER_MASK) != 0)
808 flags = page[c & 255];
812 return flags & LETTER_PART;
815 /* Return true if C is whitespace. */
/* type_table is indexed by the high byte of C.  When the entry has bits
   set outside LETTER_MASK it is used as a page indexed by the low byte of
   C; the handling of the other case is not visible in this excerpt.  The
   result is the LETTER_SPACE bit of the flags obtained.  */
817 java_space_char_p (unicode_t c)
819 unsigned int hi = c / 256;
820 const char *const page = type_table[hi];
821 unsigned long val = (unsigned long) page;
824 if ((val & ~ LETTER_MASK) != 0)
825 flags = page[c & 255];
829 return flags & LETTER_SPACE;
/* Parse the character(s) that follow a backslash in a character or string
   literal and return the value they denote.  Simple escapes return fixed
   code points; octal escapes are assembled from their digits.  Any other
   character is reported with java_lex_error and JAVA_CHAR_ERROR is
   returned.  */
833 java_parse_escape_sequence (void)
838 switch (c = java_get_unicode ())
841 return (unicode_t)0x8;
843 return (unicode_t)0x9;
845 return (unicode_t)0xa;
847 return (unicode_t)0xc;
849 return (unicode_t)0xd;
851 return (unicode_t)0x22;
853 return (unicode_t)0x27;
855 return (unicode_t)0x5c;
/* Octal escape: collect digits in the range '0' to '7', up to MAX.  */
856 case '0': case '1': case '2': case '3': case '4':
857 case '5': case '6': case '7':
860 int octal_escape_index = 0;
864 for (; octal_escape_index < max && RANGE (c, '0', '7');
865 c = java_get_unicode ())
867 if (octal_escape_index == 0 && c > '3')
869 /* According to the grammar, `\477' has a well-defined
870 meaning -- it is `\47' followed by `7'. */
873 octal_escape [octal_escape_index++] = c;
/* The loop stopped on a character that is not part of the escape; push
   it back.  */
876 java_unget_unicode ();
/* Combine the digits, three bits each, most significant digit first.  */
878 for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
879 i < octal_escape_index; i++, shift -= 3)
880 char_lit |= (octal_escape [i] - '0') << shift;
885 java_lex_error ("Invalid character in escape sequence", 0);
886 return JAVA_CHAR_ERROR;
/* True when the real value X compares equal to dconst0.  */
891 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
893 /* Subroutine of java_lex: converts floating-point literals to tree
894 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
895 store the result. FFLAG indicates whether the literal was tagged
896 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
897 is the position in the current line at which to report any error. */
899 static void java_perform_atof (YYSTYPE *, char *, int, int);
902 java_perform_atof (YYSTYPE *java_lval, char *literal_token, int fflag,
903 int number_beginning)
905 REAL_VALUE_TYPE value;
906 tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
/* Convert the text in the machine mode of the chosen type.  */
908 SET_REAL_VALUE_ATOF (value,
909 REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
/* An infinite or NaN result is reported as a range error for the type.  */
911 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
913 JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
916 else if (IS_ZERO (value))
918 /* We check to see if the value is really 0 or if we've found an
919 underflow. We do this in the most primitive imaginable way. */
921 char *p = literal_token;
/* Scan the mantissa, up to any exponent marker, looking for a character
   other than '0' or '.'.  */
924 while (*p && *p != 'e' && *p != 'E')
926 if (*p != '0' && *p != '.')
/* Temporarily move the current position back to the start of the number
   so the error is reported there, then restore it.  */
935 int i = ctxp->c_line->current;
936 ctxp->c_line->current = number_beginning;
937 java_lex_error ("Floating point literal underflow", 0);
938 ctxp->c_line->current = i;
942 SET_LVAL_NODE (build_real (type, value));
946 static int yylex (YYSTYPE *);
950 yylex (YYSTYPE *java_lval)
952 do_java_lex (YYSTYPE *java_lval)
956 unicode_t first_unicode;
957 int ascii_index, all_ascii;
960 /* Translation of the Unicode escape in the raw stream of Unicode
961 characters. Takes care of line terminator. */
963 /* Skip white spaces: SP, TAB and FF or ULT. */
964 for (c = java_get_unicode ();
965 c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
968 ctxp->elc.line = ctxp->c_line->lineno;
969 ctxp->elc.col = ctxp->c_line->char_col-2;
972 ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
974 if (c == 0x1a) /* CTRL-Z. */
976 if ((c = java_get_unicode ()) == UEOF)
977 return 0; /* Ok here. */
979 java_unget_unicode (); /* Caught later, at the end of the
982 /* Handle EOF here. */
983 if (c == UEOF) /* Should probably do something here... */
986 /* Take care of possible comments. */
989 switch (c = java_get_unicode ())
994 c = java_get_unicode ();
997 /* It is ok to end a `//' comment with EOF, unless
998 we're being pedantic. */
1000 java_lex_error ("Comment not terminated at end of input",
1004 if (c == '\n') /* ULT */
1010 if ((c = java_get_unicode ()) == '*')
1012 c = java_get_unicode ();
1015 /* Empty documentation comment. We have to reset
1016 the deprecation marker as only the most recent
1017 doc comment applies. */
1018 ctxp->deprecated = 0;
1021 java_parse_doc_section (c);
1024 java_parse_end_comment ((c = java_get_unicode ()));
1028 java_unget_unicode ();
1034 ctxp->elc.line = ctxp->c_line->lineno;
1035 ctxp->elc.prev_col = ctxp->elc.col;
1036 ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
1037 if (ctxp->elc.col < 0)
1040 /* Numeric literals. */
1041 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
1043 /* This section of code is borrowed from gcc/c-lex.c. */
1044 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
1045 int parts[TOTAL_PARTS];
1046 HOST_WIDE_INT high, low;
1047 /* End borrowed section. */
1048 char literal_token [256];
1049 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
1050 int found_hex_digits = 0, found_non_octal_digits = 0;
1053 int number_beginning = ctxp->c_line->current;
1057 /* We might have a . separator instead of a FP like .[0-9]*. */
1060 unicode_t peep = java_sneak_unicode ();
1062 if (!JAVA_ASCII_DIGIT (peep))
1065 BUILD_OPERATOR (DOT_TK);
1069 for (i = 0; i < TOTAL_PARTS; i++)
1074 c = java_get_unicode ();
1075 if (c == 'x' || c == 'X')
1078 c = java_get_unicode ();
1080 else if (JAVA_ASCII_DIGIT (c))
1082 else if (c == '.' || c == 'e' || c =='E')
1084 /* Push the '.', 'e', or 'E' back and prepare for a FP
1086 java_unget_unicode ();
1091 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */
1092 JAVA_LEX_LIT ("0", 10);
1096 SET_LVAL_NODE (long_zero_node);
1097 return (INT_LIT_TK);
1099 SET_LVAL_NODE (float_zero_node);
1102 SET_LVAL_NODE (double_zero_node);
1105 java_unget_unicode ();
1106 SET_LVAL_NODE (integer_zero_node);
1107 return (INT_LIT_TK);
1111 /* Parse the first part of the literal, until we find something
1112 which is not a number. */
1113 while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1114 JAVA_ASCII_DIGIT (c))
1116 /* We store in a string (in case it turns out to be a FP) and in
1117 PARTS if we have to process an integer literal. */
1118 int numeric = hex_value (c);
1121 /* Remember when we find a valid hexadecimal digit. */
1123 found_hex_digits = 1;
1124 /* Remember when we find an invalid octal digit. */
1125 else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c))
1126 found_non_octal_digits = 1;
1128 literal_token [literal_index++] = c;
1129 /* This section of code is borrowed from gcc/c-lex.c. */
1130 for (count = 0; count < TOTAL_PARTS; count++)
1132 parts[count] *= radix;
1135 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1136 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1139 parts[0] += numeric;
1141 if (parts [TOTAL_PARTS-1] != 0)
1143 /* End borrowed section. */
1144 c = java_get_unicode ();
1147 /* If we have something from the FP char set but not a digit, parse
1149 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1152 int seen_digit = (literal_index ? 1 : 0);
1153 int seen_exponent = 0;
1154 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1155 double unless specified. */
1157 /* It is ok if the radix is 8 because this just means we've
1158 seen a leading `0'. However, radix==16 is invalid. */
1160 java_lex_error ("Can't express non-decimal FP literal", 0);
1170 literal_token [literal_index++ ] = c;
1171 c = java_get_unicode ();
1174 java_lex_error ("Invalid character in FP literal", 0);
1177 if (c == 'e' || c == 'E')
1181 /* {E,e} must have seen at least a digit. */
1184 ("Invalid FP literal, mantissa must have digit", 0);
1188 literal_token [literal_index++] = c;
1189 c = java_get_unicode ();
1192 java_lex_error ("Invalid character in FP literal", 0);
1194 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1196 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1197 stage = 4; /* So we fall through. */
1200 if ((c=='-' || c =='+') && stage == 2)
1203 literal_token [literal_index++] = c;
1204 c = java_get_unicode ();
1207 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1208 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1209 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1210 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1212 if (JAVA_ASCII_DIGIT (c))
1216 literal_token [literal_index++ ] = c;
1217 c = java_get_unicode ();
1221 if (stage != 4) /* Don't push back fF/dD. */
1222 java_unget_unicode ();
1224 /* An exponent (if any) must have seen a digit. */
1225 if (seen_exponent && !seen_digit)
1227 ("Invalid FP literal, exponent must have digit", 0);
1229 literal_token [literal_index] = '\0';
1230 JAVA_LEX_LIT (literal_token, radix);
1233 java_perform_atof (java_lval, literal_token,
1234 fflag, number_beginning);
1239 } /* JAVA_ASCII_FPCHAR (c) */
1241 /* Here we get back to converting the integral literal. */
1242 if (radix == 16 && ! found_hex_digits)
1244 ("0x must be followed by at least one hexadecimal digit", 0);
1245 else if (radix == 8 && found_non_octal_digits)
1246 java_lex_error ("Octal literal contains digit out of range", 0);
1247 else if (c == 'L' || c == 'l')
1250 java_unget_unicode ();
1252 #ifdef JAVA_LEX_DEBUG
1253 literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */
1254 JAVA_LEX_LIT (literal_token, radix);
1256 /* This section of code is borrowed from gcc/c-lex.c. */
1259 bytes = GET_TYPE_PRECISION (long_type_node);
1260 for (i = bytes; i < TOTAL_PARTS; i++)
1268 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1270 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1271 / HOST_BITS_PER_CHAR)]
1272 << (i * HOST_BITS_PER_CHAR));
1273 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1275 /* End borrowed section. */
1278 /* Range checking. */
1279 value = build_int_2 (low, high);
1280 /* Temporarily set type to unsigned. */
1281 TREE_TYPE (value) = (long_suffix
1282 ? unsigned_long_type_node
1283 : unsigned_int_type_node);
1284 SET_LVAL_NODE (value);
1286 /* For base 10 numbers, only values up to the highest value
1287 (plus one) can be written. For instance, only ints up to
1288 2147483648 can be written. The special case of the largest
1289 negative value is handled elsewhere. For other bases, any
1290 number can be represented. */
1291 if (overflow || (radix == 10
1292 && tree_int_cst_lt (long_suffix
1298 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1300 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1303 /* Sign extend the value. */
1304 TREE_TYPE (value) = long_suffix ? long_type_node : int_type_node;
1305 value = force_fit_type (value, 0, false, false);
1306 SET_LVAL_NODE (value);
1308 JAVA_RADIX10_FLAG (value) = radix == 10;
1313 /* Character literals. */
1318 if ((c = java_get_unicode ()) == '\\')
1319 char_lit = java_parse_escape_sequence ();
1322 if (c == '\n' || c == '\'')
1323 java_lex_error ("Invalid character literal", 0);
1327 c = java_get_unicode ();
1329 if ((c == '\n') || (c == UEOF))
1330 java_lex_error ("Character literal not terminated at end of line", 0);
1332 java_lex_error ("Syntax error in character literal", 0);
1334 if (char_lit == JAVA_CHAR_ERROR)
1335 char_lit = 0; /* We silently convert it to zero. */
1337 JAVA_LEX_CHAR_LIT (char_lit);
1340 tree value = build_int_2 (char_lit, 0);
1341 TREE_TYPE (value) = char_type_node;
1342 SET_LVAL_NODE (value);
1348 /* String literals. */
1354 for (no_error = 1, c = java_get_unicode ();
1355 c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1358 c = java_parse_escape_sequence ();
1359 if (c == JAVA_CHAR_ERROR)
1362 c = 0; /* We silently convert it to zero. */
1364 java_unicode_2_utf8 (c);
1366 if (c == '\n' || c == UEOF) /* ULT. */
1368 input_line--; /* Refer to the line where the terminator was seen. */
1369 java_lex_error ("String not terminated at end of line", 0);
1373 obstack_1grow (&temporary_obstack, '\0');
1374 string = obstack_finish (&temporary_obstack);
1376 if (!no_error || (c != '"'))
1377 java_lval->node = error_mark_node; /* FIXME: Requires further
1380 java_lval->node = build_string (strlen (string), string);
1382 obstack_free (&temporary_obstack, string);
1383 return STRING_LIT_TK;
1391 BUILD_OPERATOR (OP_TK);
1397 if (ctxp->ccb_indent == 1)
1398 ctxp->first_ccb_indent1 = input_line;
1400 BUILD_OPERATOR (OCB_TK);
1404 if (ctxp->ccb_indent == 1)
1405 ctxp->last_ccb_indent1 = input_line;
1406 BUILD_OPERATOR (CCB_TK);
1409 BUILD_OPERATOR (OSB_TK);
1421 BUILD_OPERATOR (DOT_TK);
1422 /* return DOT_TK; */
1429 if ((c = java_get_unicode ()) == '=')
1431 BUILD_OPERATOR (EQ_TK);
1435 /* Equals is used in two different locations. In the
1436 variable_declarator: rule, it has to be seen as '=' as opposed
1437 to being seen as an ordinary assignment operator in
1438 assignment_operators: rule. */
1439 java_unget_unicode ();
1440 BUILD_OPERATOR (ASSIGN_TK);
1444 switch ((c = java_get_unicode ()))
1447 BUILD_OPERATOR (GTE_TK);
1449 switch ((c = java_get_unicode ()))
1452 if ((c = java_get_unicode ()) == '=')
1454 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1458 java_unget_unicode ();
1459 BUILD_OPERATOR (ZRS_TK);
1462 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1464 java_unget_unicode ();
1465 BUILD_OPERATOR (SRS_TK);
1468 java_unget_unicode ();
1469 BUILD_OPERATOR (GT_TK);
1473 switch ((c = java_get_unicode ()))
1476 BUILD_OPERATOR (LTE_TK);
1478 if ((c = java_get_unicode ()) == '=')
1480 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1484 java_unget_unicode ();
1485 BUILD_OPERATOR (LS_TK);
1488 java_unget_unicode ();
1489 BUILD_OPERATOR (LT_TK);
1493 switch ((c = java_get_unicode ()))
1496 BUILD_OPERATOR (BOOL_AND_TK);
1498 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1500 java_unget_unicode ();
1501 BUILD_OPERATOR (AND_TK);
1505 switch ((c = java_get_unicode ()))
1508 BUILD_OPERATOR (BOOL_OR_TK);
1510 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1512 java_unget_unicode ();
1513 BUILD_OPERATOR (OR_TK);
1517 switch ((c = java_get_unicode ()))
1520 BUILD_OPERATOR (INCR_TK);
1522 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1524 java_unget_unicode ();
1525 BUILD_OPERATOR (PLUS_TK);
1529 switch ((c = java_get_unicode ()))
1532 BUILD_OPERATOR (DECR_TK);
1534 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1536 java_unget_unicode ();
1537 BUILD_OPERATOR (MINUS_TK);
1541 if ((c = java_get_unicode ()) == '=')
1543 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1547 java_unget_unicode ();
1548 BUILD_OPERATOR (MULT_TK);
1552 if ((c = java_get_unicode ()) == '=')
1554 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1558 java_unget_unicode ();
1559 BUILD_OPERATOR (DIV_TK);
1563 if ((c = java_get_unicode ()) == '=')
1565 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1569 java_unget_unicode ();
1570 BUILD_OPERATOR (XOR_TK);
1574 if ((c = java_get_unicode ()) == '=')
1576 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1580 java_unget_unicode ();
1581 BUILD_OPERATOR (REM_TK);
1585 if ((c = java_get_unicode()) == '=')
1587 BUILD_OPERATOR (NEQ_TK);
1591 java_unget_unicode ();
1592 BUILD_OPERATOR (NEG_TK);
1597 BUILD_OPERATOR (REL_QM_TK);
1600 BUILD_OPERATOR (REL_CL_TK);
1602 BUILD_OPERATOR (NOT_TK);
1605 /* Keyword, boolean literal or null literal. */
1606 for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1607 c != UEOF && JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1609 java_unicode_2_utf8 (c);
1610 if (all_ascii && c >= 128)
1615 obstack_1grow (&temporary_obstack, '\0');
1616 string = obstack_finish (&temporary_obstack);
1618 java_unget_unicode ();
1620 /* If we have something all ascii, we consider a keyword, a boolean
1621 literal, a null literal or an all ASCII identifier. Otherwise,
1622 this is an identifier (possibly not respecting formation rule). */
1625 const struct java_keyword *kw;
1626 if ((kw=java_keyword (string, ascii_index)))
1628 JAVA_LEX_KW (string);
1631 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1632 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1633 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1634 case PRIVATE_TK: case STRICT_TK:
1635 SET_MODIFIER_CTX (kw->token);
1638 SET_LVAL_NODE (float_type_node);
1641 SET_LVAL_NODE (double_type_node);
1644 SET_LVAL_NODE (boolean_type_node);
1647 SET_LVAL_NODE (byte_type_node);
1650 SET_LVAL_NODE (short_type_node);
1653 SET_LVAL_NODE (int_type_node);
1656 SET_LVAL_NODE (long_type_node);
1659 SET_LVAL_NODE (char_type_node);
1662 /* Keyword based literals. */
1665 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1666 boolean_true_node : boolean_false_node));
1669 SET_LVAL_NODE (null_pointer_node);
1675 BUILD_OPERATOR (kw->token);
1681 /* For some keywords we want to retain information on the location
1682 where they were found. */
1694 BUILD_OPERATOR (kw->token);
1702 /* We may have an ID here. */
1703 if (JAVA_START_CHAR_P (first_unicode))
1705 JAVA_LEX_ID (string);
1706 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1710 /* Everything else is an invalid character in the input. */
1712 char lex_error_buffer [128];
1713 sprintf (lex_error_buffer, "Invalid character `%s' in input",
1714 java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1715 java_lex_error (lex_error_buffer, 1);
1722 /* The exported interface to the lexer. */
/* Hands JAVA_LVAL straight to do_java_lex and keeps that call's result
   in R.  The call is bracketed by timevar_push and timevar_pop on
   TV_LEX, so the two must stay paired around it.  */
1724 java_lex (YYSTYPE *java_lval)
1728 timevar_push (TV_LEX);
1729 r = do_java_lex (java_lval);
1730 timevar_pop (TV_LEX);
1734 /* This is called by the parser to see if an error should be generated
1735 due to numeric overflow. This function only handles the particular
1736 case of the largest negative value, and is only called in the case
1737 where this value is not preceded by `-'. */
/* VALUE is reported only when all three tests below hold: it is an
   INTEGER_CST, it carries JAVA_RADIX10_FLAG, and tree_int_cst_sgn
   says it is negative.  */
1739 error_if_numeric_overflow (tree value)
1741 if (TREE_CODE (value) == INTEGER_CST
1742 && JAVA_RADIX10_FLAG (value)
1743 && tree_int_cst_sgn (value) < 0)
/* The wording of the message depends on the literal's type.  Both
   reports pass 0 as the FORWARD argument of java_lex_error.
   NOTE(review): the `int' message is presumably the fallback for
   every type other than long_type_node -- confirm.  */
1745 if (TREE_TYPE (value) == long_type_node)
1746 java_lex_error ("Numeric overflow for `long' literal", 0);
1748 java_lex_error ("Numeric overflow for `int' literal", 0);
1752 #endif /* JC1_LITE */
/* Append the UTF-8 encoding of UNICODE to temporary_obstack, one byte
   at a time with obstack_1grow.  Three cases are distinguished
   (assuming RANGE tests an inclusive interval -- TODO confirm):
     0x01..0x7f             one byte, the value itself;
     0x80..0x7ff, and 0     two bytes;
     everything else        three bytes.
   Zero is explicitly routed to the two-byte form, which yields the
   bytes 0xc0 0x80, so no branch of this function appends a zero
   byte.  */
1755 java_unicode_2_utf8 (unicode_t unicode)
1757 if (RANGE (unicode, 0x01, 0x7f))
1758 obstack_1grow (&temporary_obstack, (char)unicode);
1759 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
/* Lead byte 110xxxxx carries bits 6-10; the continuation byte
   10xxxxxx carries bits 0-5.  */
1761 obstack_1grow (&temporary_obstack,
1762 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1763 obstack_1grow (&temporary_obstack,
1764 (unsigned char)(0x80 | (unicode & 0x3f)));
1766 else /* Range 0x800-0xffff. */
/* Lead byte 1110xxxx carries bits 12-15, then two continuation bytes
   carry bits 6-11 and bits 0-5.  Bits above 15 are masked away.
   NOTE(review): presumably unicode_t never holds a value above
   0xffff -- confirm.  */
1768 obstack_1grow (&temporary_obstack,
1769 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1770 obstack_1grow (&temporary_obstack,
1771 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1772 obstack_1grow (&temporary_obstack,
1773 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Wrap NODE with build_expr_wfl, taking the file name from
   ctxp->filename and the line and column from ctxp->elc, then clear
   the TREE_TYPE of the resulting node.  */
1779 build_wfl_node (tree node)
1781 node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1782 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1783 TREE_TYPE (node) = NULL_TREE;
/* Record the current input position as the error location in
   ctxp->elc: the line is ctxp->c_line->lineno and the column is
   ctxp->c_line->char_col - 1 + FORWARD, so a positive FORWARD moves
   the recorded column to the right.  java_error_flag is then cleared.
   NOTE(review): MSG is not used by the statements shown below;
   presumably it is handed to the error reporter -- confirm.
   NOTE(review): both parameters are marked ATTRIBUTE_UNUSED;
   presumably some build configuration compiles the body out --
   confirm.  */
1789 java_lex_error (const char *msg ATTRIBUTE_UNUSED, int forward ATTRIBUTE_UNUSED)
1792 ctxp->elc.line = ctxp->c_line->lineno;
1793 ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1795 /* Might be caught in the middle of some error report. */
1796 ctxp->java_error_flag = 0;
/* NOTE(review): judging by the name and the test below, this decides
   whether C, a character taken from FP, ends a line, and it consults
   a look-ahead value NEXT to do so -- confirm against the full
   body.  */
1804 java_is_eol (FILE *fp, int c)
/* Holds when NEXT is neither a newline nor end of file.  */
1811 if (next != '\n' && next != EOF)
/* Build a string on temporary_obstack holding the text of line LINE
   of FILENAME, a newline, a run of padding and finally a '^' marker,
   and return what obstack_finish gives back.  fatal_error is called
   when FILENAME cannot be opened for reading.
   NOTE(review): all three parameters are marked ATTRIBUTE_UNUSED;
   presumably some build configuration compiles the body out --
   confirm.  */
1823 java_get_line_col (const char *filename ATTRIBUTE_UNUSED,
1824 int line ATTRIBUTE_UNUSED, int col ATTRIBUTE_UNUSED)
1829 /* Dumb implementation. Doesn't try to cache or optimize things. */
1830 /* First line of the file is line 1, first column is 1. */
1832 /* COL == -1 means, at the CR/LF in LINE. */
1833 /* COL == -2 means, at the first non space char in LINE. */
1836 int c, ccol, cline = 1;
1837 int current_line_col = 0;
1838 int first_non_space = 0;
1841 if (!(fp = fopen (filename, "r")))
1842 fatal_error ("can't open %s: %m", filename);
/* Read forward until CLINE equals LINE, consulting java_is_eol for the
   characters read.  */
1844 while (cline != line)
/* Placeholder text put in the buffer in place of a source line.
   NOTE(review): presumably used when the file ends before LINE is
   reached, as the message says -- confirm.  */
1849 static const char msg[] = "<<file too short - unexpected EOF>>";
1850 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1853 if (java_is_eol (fp, c))
1857 /* Gather the chars of the current line in a buffer. */
/* A negative C or an end of line terminates the gathering.  */
1861 if (c < 0 || java_is_eol (fp, c))
/* Remember the column of the first non-blank character.  The value 0
   doubles as "not found yet", so a non-blank character seen while
   current_line_col is 0 does not latch and a later one can overwrite
   it.  */
1863 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1864 first_non_space = current_line_col;
1865 obstack_1grow (&temporary_obstack, c);
1870 obstack_1grow (&temporary_obstack, '\n');
/* Resolve the special COL values described at the top.
   NOTE(review): the conditions selecting between the assignments
   below presumably test COL == -1 and COL == -2 -- confirm.  */
1874 col = current_line_col;
1875 first_non_space = 0;
1878 col = first_non_space;
1880 first_non_space = 0;
1882 /* Place the '^' at the right position. */
1883 base = obstack_base (&temporary_obstack);
/* COL + 3 padding characters are emitted before the marker.
   NOTE(review): the origin of the constant 3 is not evident here;
   presumably it matches a prefix printed by the caller -- confirm.  */
1884 for (ccol = 1; ccol <= col+3; ccol++)
1886 /* Compute \t when reaching first_non_space. */
/* This char C is a separate variable that hides the int C declared
   earlier.  A tab found in the gathered line is repeated in the
   padding only while first_non_space is nonzero; otherwise a space is
   used.  */
1887 char c = (first_non_space ?
1888 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1889 obstack_1grow (&temporary_obstack, c);
/* obstack_grow0 adds the marker followed by a terminating NUL.  */
1891 obstack_grow0 (&temporary_obstack, "^", 1);
1894 return obstack_finish (&temporary_obstack);
/* Compare the UTF-8 text STR, LENGTH bytes long, against the
   NUL-terminated string NAME.  For each character of NAME one
   character is decoded from STR with UTF8_GET, bounded by LIMIT.
   Once NAME is exhausted the result is 0 if STR has reached LIMIT and
   1 if it has not.
   NOTE(review): UTF8_GET is expected to advance STR, otherwise the
   final test against LIMIT would be meaningless -- confirm.  */
1900 utf8_cmp (const unsigned char *str, int length, const char *name)
1902 const unsigned char *limit = str + length;
1905 for (i = 0; name[i]; ++i)
1907 int ch = UTF8_GET (str, limit);
/* NOTE(review): presumably reached only when CH differs from name[i],
   so that the sign of the difference orders the two strings --
   confirm.  */
1909 return ch - name[i];
1912 return str == limit ? 0 : 1;
1915 /* A sorted list of all C++ keywords. */
/* The ordering matters: cxx_keyword_p searches this table by
   bisection (its probe index is (last + first) / 2) and compares
   entries with utf8_cmp, so a new entry must be inserted at its
   sorted position.  */
1917 static const char *const cxx_keywords[] =
2025 /* Return true if NAME is a C++ keyword. */
/* NAME is LENGTH bytes long.  The table cxx_keywords is searched by
   bisection: LAST starts at the number of table entries and MID is
   always the midpoint of FIRST and LAST.  */
2028 cxx_keyword_p (const char *name, int length)
2030 int last = ARRAY_SIZE (cxx_keywords);
2032 int mid = (last + first) / 2;
/* OLD receives the previous midpoint on every step.
   NOTE(review): presumably the loop ends once MID stops changing --
   confirm.  */
2035 for (mid = (last + first) / 2;
2037 old = mid, mid = (last + first) / 2)
/* Only the first min (KWL, LENGTH) bytes of NAME are compared with
   the probed keyword.  */
2039 int kwl = strlen (cxx_keywords[mid]);
2040 int min_length = kwl > length ? length : kwl;
2041 int r = utf8_cmp ((const unsigned char *) name, min_length, cxx_keywords[mid]);
2046 /* We've found a match if all the remaining characters are `$'. */
/* Step over the trailing '$' characters; I equals LENGTH afterwards
   exactly when nothing but '$' follows the compared prefix.  */
2047 for (i = min_length; i < length && name[i] == '$'; ++i)
2061 #endif /* JC1_LITE */