1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
4 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to
20 the Free Software Foundation, 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA.
23 Java and all Java-based marks are trademarks or registered trademarks
24 of Sun Microsystems, Inc. in the United States and other countries.
25 The Free Software Foundation is independent of Sun Microsystems, Inc. */
27 /* It defines java_lex (yylex) that reads a Java ASCII source file
28 possibly containing Unicode escape sequences or UTF-8 encoded
29 characters and returns a token for everything found but comments,
30 white spaces and line terminators. When necessary, it also fills
31 the java_lval (yylval) union. It's implemented to be called by a
32 re-entrant parser generated by Bison.
34 The lexical analysis conforms to the Java grammar described in "The
35 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
36 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
40 #include "chartables.h"
45 /* Function declarations. */
/* Prototypes for the helpers used by the lexer.  All but java_new_lexer
   are declared static.  */
46 static char *java_sprint_unicode (struct java_line *, int);
47 static void java_unicode_2_utf8 (unicode_t);
48 static void java_lex_error (const char *, int);
50 static int do_java_lex (YYSTYPE *);
51 static int java_lex (YYSTYPE *);
52 static int java_is_eol (FILE *, int);
53 static tree build_wfl_node (tree);
55 static void java_store_unicode (struct java_line *, unicode_t, int);
56 static int java_parse_escape_sequence (void);
57 static int java_start_char_p (unicode_t);
58 static int java_part_char_p (unicode_t);
59 static int java_space_char_p (unicode_t);
60 static void java_parse_doc_section (int);
61 static void java_parse_end_comment (int);
62 static int java_get_unicode (void);
63 static int java_read_unicode (java_lexer *, int *);
64 static int java_read_unicode_collapsing_terminators (java_lexer *, int *);
/* NOTE(review): java_store_unicode is declared a second time here with the
   same signature as the earlier prototype; the repetition is redundant.  */
65 static void java_store_unicode (struct java_line *, unicode_t, int);
66 static int java_read_char (java_lexer *);
67 static void java_allocate_new_line (void);
68 static void java_unget_unicode (void);
69 static unicode_t java_sneak_unicode (void);
71 static int utf8_cmp (const unsigned char *, int, const char *);
/* The only prototype in this group without `static'.  */
74 java_lexer *java_new_lexer (FILE *, const char *);
76 static void error_if_numeric_overflow (tree);
80 /* This is nonzero if we have initialized `need_byteswap'. */
81 static int byteswap_init = 0;
83 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
84 big-endian order -- not native endian order. We handle this by
85 doing a conversion once at startup and seeing what happens. This
86 flag holds the results of this determination. */
/* java_new_lexer assigns this from a test conversion (result != 0xfeff)
   and copies it into each lexer's byte_swap field.  */
87 static int need_byteswap = 0;
/* Prepare the lexer and the current parser context (ctxp) for a new input.
   FINPUT is the stream to read and ENCODING the name of its character
   encoding; both are handed to java_new_lexer at the end.  Along the way
   this interns several identifiers, queues "java.lang" on ctxp's
   import-on-demand list, builds a few shared WFL nodes and resets
   per-file fields of ctxp.
   NOTE(review): this listing omits some lines of the function (braces,
   preprocessor conditionals), so the nesting shown here is flattened.  */
91 java_init_lex (FILE *finput, const char *encoding)
94 int java_lang_imported = 0;
/* Intern identifiers that are assigned to variables defined elsewhere.  */
97 java_lang_id = get_identifier ("java.lang");
99 inst_id = get_identifier ("inst$");
101 wpv_id = get_identifier ("write_parm_value$");
/* Build a list node for java.lang, read its import directory, and push
   the node on the front of ctxp->import_demand_list.  */
103 if (!java_lang_imported)
105 tree node = build_tree_list
106 (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
107 read_import_dir (TREE_PURPOSE (node));
108 TREE_CHAIN (node) = ctxp->import_demand_list;
109 ctxp->import_demand_list = node;
110 java_lang_imported = 1;
114 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
116 label_id = get_identifier ("$L");
118 wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
/* Only built when wfl_string_buffer is not already set.  The class name
   depends on flag_emit_class_files.  NOTE(review): the assignment target
   and the trailing arguments of this call are not visible here.  */
119 if (!wfl_string_buffer)
121 build_expr_wfl (get_identifier (flag_emit_class_files
122 ? "java.lang.StringBuffer"
123 : "gnu.gcj.runtime.StringBuffer"),
126 wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
/* Clear the three initializer lists of the current parser context.  */
128 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
129 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
131 memset (ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
132 current_jcf = ggc_alloc_cleared (sizeof (JCF));
133 ctxp->current_parsed_class = NULL;
134 ctxp->package = NULL_TREE;
/* Line counting restarts at 0; java_allocate_new_line pre-increments
   input_line when it stamps each new line.  */
137 ctxp->filename = input_filename;
138 ctxp->lineno = input_line = 0;
141 ctxp->java_error_flag = 0;
142 ctxp->lexer = java_new_lexer (finput, encoding);
/* Format character I of LINE into a function-local static buffer.  If the
   character is flagged in LINE->unicode_escape_p, or its value is greater
   than 128, it is written as a backslash-u escape with four hex digits;
   otherwise its value is stored in buffer[0].  Because the buffer is
   static, every call overwrites the result of the previous one.
   NOTE(review): the NUL termination of the single-character case and the
   return statement are not visible in this listing; presumably the buffer
   is returned -- confirm.  */
146 java_sprint_unicode (struct java_line *line, int i)
148 static char buffer [10];
/* The comparison is strictly greater-than, so the value 128 itself takes
   the single-character path.  */
149 if (line->unicode_escape_p [i] || line->line [i] > 128)
150 sprintf (buffer, "\\u%04x", line->line [i]);
153 buffer [0] = line->line [i];
/* Peek at the character stored at the current read position of the
   current line (ctxp->c_line) without advancing that position.  No bounds
   check against the line's size is made here.  */
160 java_sneak_unicode (void)
162 return (ctxp->c_line->line [ctxp->c_line->current]);
/* Push back the most recently read character of the current line by
   stepping the read position back one slot and undoing the matching
   column advance.  */
166 java_unget_unicode (void)
/* Position 0 means there is nothing to push back.  NOTE(review): the
   statement guarded by this test is not visible in this listing.  */
168 if (!ctxp->c_line->current)
169 /* Can't unget unicode. */
172 ctxp->c_line->current--;
/* Mirrors the `char_col += JAVA_COLUMN_DELTA (0)' done in
   java_get_unicode.  */
173 ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
/* Install a fresh, empty line buffer as ctxp->c_line.  Any look-ahead
   character recorded on the old current line is carried over.  The old
   line may be kept as ctxp->p_line (see below).  The new line gets the
   next line number and starts out marked as white-space-only.
   NOTE(review): several lines of this function (braces, else arms, and
   possibly further frees) are not visible in this listing.  */
177 java_allocate_new_line (void)
/* Capture the look-ahead state before the current line is replaced.  */
179 unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
180 char ahead_escape_p = (ctxp->c_line ?
181 ctxp->c_line->unicode_escape_ahead_p : 0);
/* A current line that is not white-space-only becomes the new previous
   line; the buffers of the old previous line are released first.
   NOTE(review): any NULL check on ctxp->p_line is not visible here.  */
183 if (ctxp->c_line && !ctxp->c_line->white_space_only)
187 free (ctxp->p_line->unicode_escape_p);
188 free (ctxp->p_line->line);
191 ctxp->p_line = ctxp->c_line;
192 ctxp->c_line = NULL; /* Reallocated. */
/* Allocate the line record and its two parallel arrays (characters and
   per-character escape flags), each with room for JAVA_LINE_MAX entries.  */
197 ctxp->c_line = xmalloc (sizeof (struct java_line));
198 ctxp->c_line->max = JAVA_LINE_MAX;
199 ctxp->c_line->line = xmalloc (sizeof (unicode_t)*ctxp->c_line->max);
200 ctxp->c_line->unicode_escape_p =
201 xmalloc (sizeof (char)*ctxp->c_line->max);
202 ctxp->c_line->white_space_only = 0;
205 ctxp->c_line->line [0] = ctxp->c_line->size = 0;
206 ctxp->c_line->char_col = ctxp->c_line->current = 0;
/* Store the carried-over look-ahead character as the first character of
   the new line.  NOTE(review): presumably guarded by a test on `ahead'
   that is not visible here -- confirm.  */
209 ctxp->c_line->line [ctxp->c_line->size] = ahead;
210 ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
211 ctxp->c_line->size++;
213 ctxp->c_line->ahead [0] = 0;
214 ctxp->c_line->unicode_escape_ahead_p = 0;
/* input_line is advanced as a side effect of numbering the new line.  */
215 ctxp->c_line->lineno = ++input_line;
216 ctxp->c_line->white_space_only = 1;
219 /* Create a new lexer object. */
/* FINPUT is the stream the lexer will read and ENCODING names its
   character encoding.  The lexer record is obtained with xmalloc.  An
   iconv descriptor converting ENCODING to UCS-2 is requested; when that
   works the lexer uses iconv, and a one-off test conversion decides
   whether converted output must be byte-swapped.  The code after the
   first #endif handles the case where iconv is unavailable or failed.
   NOTE(review): many lines of this function are not visible in this
   listing, including the return statement; presumably LEX is returned.  */
222 java_new_lexer (FILE *finput, const char *encoding)
224 java_lexer *lex = xmalloc (sizeof (java_lexer));
227 lex->finput = finput;
229 lex->unget_value = 0;
231 lex->encoding = encoding;
/* iconv_open returns (iconv_t) -1 on failure.  */
234 lex->handle = iconv_open ("UCS-2", encoding);
235 if (lex->handle != (iconv_t) -1)
241 lex->read_anything = 0;
242 lex->use_fallback = 0;
/* NOTE(review): the comment that starts on the next line has lost its
   closing line in this listing.  The code following it converts a UTF-8
   encoded U+FEFF through a temporary descriptor and sets need_byteswap
   when the result does not read back as 0xfeff.  */
244 /* Work around broken iconv() implementations by doing checking at
245 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
246 then all UCS-2 encoders will be broken. Perhaps not a valid
254 handle = iconv_open ("UCS-2", "UTF-8");
255 if (handle != (iconv_t) -1)
262 /* This is the UTF-8 encoding of \ufeff. */
269 outp = (char *) &result;
272 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
274 iconv_close (handle);
275 /* Conversion must be complete for us to use the result. */
276 if (r != (size_t) -1 && inc == 0 && outc == 0)
277 need_byteswap = (result != 0xfeff);
/* Each lexer takes a copy of the file-level determination.  */
281 lex->byte_swap = need_byteswap;
284 #endif /* HAVE_ICONV */
286 /* If iconv failed, use the internal decoder if the default
287 encoding was requested. This code is used on platforms where
288 iconv exists but is insufficient for our needs. For
289 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
291 On Solaris the default encoding, as returned by nl_langinfo(),
292 is `646' (aka ASCII), but the Solaris iconv_open() doesn't
293 understand that. We work around that by pretending
294 `646' to be the same as UTF-8. */
/* The condition is true when ENCODING is neither DEFAULT_ENCODING nor
   "646".  NOTE(review): the statement it guards is not visible in this
   listing; presumably the two fallback assignments below belong to the
   opposite arm -- confirm.  */
295 if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
300 lex->use_fallback = 1;
301 lex->encoding = "UTF-8";
303 #endif /* HAVE_ICONV */
/* fatal_error reports the unusable encoding name.  NOTE(review): the
   condition guarding this call is not visible here.  */
307 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
/* Release the resources held by LEX.  The iconv descriptor is closed only
   when the lexer was not using the internal fallback decoder.
   NOTE(review): freeing of LEX itself is not visible in this listing.  */
313 java_destroy_lexer (java_lexer *lex)
316 if (! lex->use_fallback)
317 iconv_close (lex->handle);
/* Read the next character from LEX's input.  There are three sources, in
   order: a pending pushed-back value in LEX->unget_value; the iconv
   conversion path, used when LEX->use_fallback is zero; and a small
   built-in UTF-8 decoder that reads LEX->finput with getc.
   NOTE(review): many lines of this function (braces, else arms, several
   returns and three comment terminators) are missing from this listing,
   so the notes below describe only what is visible.  */
323 java_read_char (java_lexer *lex)
/* A pushed-back character takes priority.  The slot is cleared once it is
   copied out; presumably R is then returned (return not visible).  A
   pushed-back value of zero is indistinguishable from an empty slot.  */
325 if (lex->unget_value)
327 unicode_t r = lex->unget_value;
328 lex->unget_value = 0;
/* iconv path.  */
333 if (! lex->use_fallback)
335 size_t ir, inbytesleft, in_save, out_count, out_save;
339 /* If there is data which has already been converted, use it. */
340 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
347 /* See if we need to read more data. If FIRST == 0 then
348 the previous conversion attempt ended in the middle of
349 a character at the end of the buffer. Otherwise we
350 only have to read if the buffer is empty. */
351 if (lex->first == 0 || lex->first >= lex->last)
355 if (lex->first >= lex->last)
360 if (feof (lex->finput))
/* New bytes are appended after the data already held in LEX->buffer.  */
362 r = fread (&lex->buffer[lex->last], 1,
363 sizeof (lex->buffer) - lex->last,
/* Unconverted input bytes, and free room left in the output buffer.  */
368 inbytesleft = lex->last - lex->first;
369 out_count = sizeof (lex->out_buffer) - lex->out_last;
371 if (inbytesleft == 0)
373 /* We've tried to read and there is nothing left. */
/* The counts are saved so that the amounts consumed and produced by iconv
   can be computed afterwards as (saved - remaining).  */
377 in_save = inbytesleft;
378 out_save = out_count;
379 inp = &lex->buffer[lex->first];
380 outp = (char *) &lex->out_buffer[lex->out_last];
381 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
382 &inbytesleft, &outp, &out_count);
/* Post-processing of the converted output, as far as it is visible below:
   the first time at least two output bytes are produced, the leading
   16-bit unit is examined (0xfffe is one of the values tested) and
   read_anything is set; a loop exchanges the two bytes of every 16-bit
   unit just produced; finally `first' and `out_last' advance by the
   amounts consumed and produced.
   NOTE(review): the guard on the swap loop is not visible; presumably it
   is LEX->byte_swap.  The comment starting on the next line has lost its
   closing line in this listing.  */
384 /* If we haven't read any bytes, then look to see if we
386 if (! lex->read_anything && out_save - out_count >= 2)
388 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
394 else if (uc == 0xfffe)
399 lex->read_anything = 1;
405 for (i = 0; i < out_save - out_count; i += 2)
407 char t = lex->out_buffer[lex->out_last + i];
408 lex->out_buffer[lex->out_last + i]
409 = lex->out_buffer[lex->out_last + i + 1];
410 lex->out_buffer[lex->out_last + i + 1] = t;
414 lex->first += in_save - inbytesleft;
415 lex->out_last += out_save - out_count;
417 /* If we converted anything at all, move along. */
418 if (out_count != out_save)
/* iconv reports failure by returning (size_t) -1.  Two outcomes are
   visible below: the unconverted tail is moved to the start of the buffer
   with memmove and `last' is reduced to match, or a lexical error naming
   the encoding is reported.  NOTE(review): the errno test selecting
   between them is not visible, and the comment that starts two lines
   below has lost its closing line in this listing.  */
421 if (ir == (size_t) -1)
425 /* This is ok. This means that the end of our buffer
426 is in the middle of a character sequence. We just
427 move the valid part of the buffer to the beginning
429 memmove (&lex->buffer[0], &lex->buffer[lex->first],
430 lex->last - lex->first);
431 lex->last -= lex->first;
436 /* A more serious error. */
439 "Unrecognized character for encoding '%s'",
441 java_lex_error (buffer, 0);
448 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
450 /* Don't have any data. */
/* Fetch the next converted 16-bit unit from the output buffer.  */
455 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
460 #endif /* HAVE_ICONV */
/* Built-in UTF-8 decoder.  NOTE(review): the conditions guarding the
   first return below (presumably end of file and single-byte values) are
   not visible.  A lead byte matching 110xxxxx starts a two-byte sequence
   and one matching 1110xxxx a three-byte sequence; each continuation byte
   must match 10xxxxxx.  The comment inside the two-byte branch has lost
   its closing line in this listing.  */
463 c = getc (lex->finput);
468 return (unicode_t) c;
471 if ((c & 0xe0) == 0xc0)
473 c1 = getc (lex->finput);
474 if ((c1 & 0xc0) == 0x80)
476 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
477 /* Check for valid 2-byte characters. We explicitly
478 allow \0 because this encoding is common in the
480 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
484 else if ((c & 0xf0) == 0xe0)
486 c1 = getc (lex->finput);
487 if ((c1 & 0xc0) == 0x80)
489 c2 = getc (lex->finput);
490 if ((c2 & 0xc0) == 0x80)
492 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
495 /* Check for valid 3-byte characters.
496 Don't allow surrogate, \ufffe or \uffff. */
497 if (IN_RANGE (r, 0x800, 0xffff)
498 && ! IN_RANGE (r, 0xd800, 0xdfff)
499 && r != 0xfffe && r != 0xffff)
505 /* We simply don't support invalid characters. We also
506 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
507 cannot be valid Java characters. */
508 java_lex_error ("malformed UTF-8 character", 0);
512 /* We only get here on error. */
/* Append character C to line L, recording in the parallel
   unicode_escape_p array whether it came from a Unicode escape
   (UNICODE_ESCAPE_P).  Both arrays grow by JAVA_LINE_MAX entries whenever
   the line is full.  */
517 java_store_unicode (struct java_line *l, unicode_t c, int unicode_escape_p)
/* Full: enlarge both arrays by the same amount so they stay in step.  */
519 if (l->size == l->max)
521 l->max += JAVA_LINE_MAX;
522 l->line = xrealloc (l->line, sizeof (unicode_t)*l->max);
523 l->unicode_escape_p = xrealloc (l->unicode_escape_p,
524 sizeof (char)*l->max);
/* The size is advanced by the post-increment on the second store.  */
526 l->line [l->size] = c;
527 l->unicode_escape_p [l->size++] = unicode_escape_p;
/* Read one character from LEX, translating a backslash-u escape into the
   character it denotes.  *UNICODE_ESCAPE_P is set to 1 when the result
   came from such an escape and to 0 otherwise.  LEX->bs_count is
   consulted so that only a backslash preceded by an even number of
   backslashes can start an escape.
   NOTE(review): several lines (the test for a backslash, the bs_count
   updates, the digit loop header and some returns) are not visible in
   this listing.  */
531 java_read_unicode (java_lexer *lex, int *unicode_escape_p)
535 c = java_read_char (lex);
536 *unicode_escape_p = 0;
545 if ((lex->bs_count) % 2 == 1)
547 /* Odd number of \ seen. */
548 c = java_read_char (lex);
551 unicode_t unicode = 0;
554 /* Recognize any number of `u's in \u. */
555 while ((c = java_read_char (lex)) == 'u')
/* Hex digits are accumulated into UNICODE, each shifted into place; the
   two error messages cover a sequence that ends early and a character
   that is not a hex digit.  */
563 java_lex_error ("prematurely terminated \\u sequence", 0);
568 unicode |= (unicode_t)(hex_value (c) << shift);
571 java_lex_error ("non-hex digit in \\u sequence", 0);
575 c = java_read_char (lex);
/* A character read beyond what was needed is saved in unget_value so that
   the next java_read_char call returns it.  */
581 lex->unget_value = c;
584 *unicode_escape_p = 1;
/* Not an escape: keep the character that followed the backslash for the
   next read and return the backslash itself.  */
587 lex->unget_value = c;
589 return (unicode_t) '\\';
/* Like java_read_unicode, but a carriage return followed by a line feed
   is reported as a single line terminator.  LEX and UNICODE_ESCAPE_P are
   passed through to java_read_unicode for the first read.
   NOTE(review): the test for a carriage return and the return statements
   are not visible in this listing.  */
593 java_read_unicode_collapsing_terminators (java_lexer *lex,
594 int *unicode_escape_p)
596 int c = java_read_unicode (lex, unicode_escape_p);
600 /* We have to read ahead to see if we got \r\n. In that case we
601 return a single line terminator. */
/* The escape flag of the look-ahead character is discarded (DUMMY).  A
   look-ahead that is neither a line feed nor UEOF is pushed back.  */
603 c = java_read_unicode (lex, &dummy);
604 if (c != '\n' && c != UEOF)
605 lex->unget_value = c;
606 /* In either case we must return a newline. */
/* Return the next character of the input, reading in a whole new line
   first when there is no current line or the current one is used up.
   Characters are read through java_read_unicode_collapsing_terminators
   and appended to ctxp->c_line with java_store_unicode.
   NOTE(review): the loop header, several returns and the remaining
   arguments of some calls are not visible in this listing.  */
614 java_get_unicode (void)
616 /* It's time to read a line when... */
617 if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
/* hit_eof is set further down once UEOF is seen with no characters
   found; presumably this test then reports end of input (return not
   visible).  */
622 if (ctxp->lexer->hit_eof)
625 java_allocate_new_line ();
/* Nothing more is read when the first character of the fresh line is
   already a newline.  */
626 if (ctxp->c_line->line[0] != '\n')
630 int unicode_escape_p;
631 c = java_read_unicode_collapsing_terminators (ctxp->lexer,
636 java_store_unicode (ctxp->c_line, c, unicode_escape_p);
/* The line stops counting as white-space-only once a character that
   fails JAVA_WHITE_SPACE_P is stored.  NOTE(review): one more condition
   of this test is not visible here.  */
637 if (ctxp->c_line->white_space_only
638 && !JAVA_WHITE_SPACE_P (c)
640 ctxp->c_line->white_space_only = 0;
/* A newline or UEOF ends the line being read.  */
642 if ((c == '\n') || (c == UEOF))
646 if (c == UEOF && ! found_chars)
648 ctxp->lexer->hit_eof = 1;
/* Advance the column, trace the character, and return it while stepping
   the read position.  */
653 ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
654 JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
655 return ctxp->c_line->line [ctxp->c_line->current++];
658 /* Parse the end of a C style comment.
659 * C is the first character following the '/' and '*'. */
/* Characters are consumed with java_get_unicode until the terminator is
   found.  Reaching the end of input first is reported with "Comment not
   terminated at end of input".
   NOTE(review): the case labels for end of input and for the closing
   slash, and the returns, are not visible in this listing.  */
661 java_parse_end_comment (int c)
663 for ( ;; c = java_get_unicode ())
668 java_lex_error ("Comment not terminated at end of input", 0);
/* Look at the character that follows; presumably reached after a star
   was seen -- confirm.  */
671 switch (c = java_get_unicode ())
674 java_lex_error ("Comment not terminated at end of input", 0);
/* Another star is pushed back so that the loop examines it again as a
   possible start of the terminator.  */
678 case '*': /* Reparse only '*'. */
679 java_unget_unicode ();
685 /* Parse the documentation section. Keywords must be at the beginning
686 of a documentation comment line (ignoring white space and any `*'
687 character). Parsed keyword(s): @DEPRECATED. */
/* C is the first character to examine.  ctxp->deprecated is cleared on
   entry and set to 1 when a line starts with the text "@deprecated"
   followed by an accepted separator.  Reaching the end of input before
   the terminator is reported as an unterminated comment.
   NOTE(review): the loop header, local declarations and returns are not
   visible in this listing, and the comment starting at "We loop over"
   has lost its closing line.  */
690 java_parse_doc_section (int c)
694 /* We reset this here, because only the most recent doc comment
695 applies to the following declaration. */
696 ctxp->deprecated = 0;
698 /* We loop over all the lines of the comment. We'll eventually exit
699 if we hit EOF prematurely, or when we see the comment
703 /* These first steps need only be done if we're still looking
704 for the deprecated tag. If we've already seen it, we might
705 as well skip looking for it again. */
706 if (! ctxp->deprecated)
708 /* Skip whitespace and '*'s. We must also check for the end
709 of the comment here. */
710 while (JAVA_WHITE_SPACE_P (c) || c == '*')
/* Remember whether the character just skipped was a star, so that a
   following slash can be recognised as the end of the comment.  */
712 last_was_star = (c == '*');
713 c = java_get_unicode ();
714 if (last_was_star && c == '/')
716 /* We just saw the comment terminator. */
/* Compare the input against the tag one character at a time.  */
726 const char *deprecated = "@deprecated";
729 for (i = 0; deprecated[i]; ++i)
731 if (c != deprecated[i])
733 /* We write the code in this way, with the
734 update at the end, so that after the loop
735 we're left with the next character in C. */
736 c = java_get_unicode ();
742 /* @deprecated must be followed by a space or newline.
743 We also allow a '*' in case it appears just before
744 the end of a comment. In this position only we also
745 must allow any Unicode space character. */
746 if (c == ' ' || c == '\n' || c == '*' || java_space_char_p (c))
749 ctxp->deprecated = 1;
754 /* We've examined the relevant content from this line. Now we
755 skip the remaining characters and start over with the next
756 line. We also check for end of comment here. */
757 while (c != '\n' && c != UEOF)
759 last_was_star = (c == '*');
760 c = java_get_unicode ();
761 if (last_was_star && c == '/')
767 /* We have to advance past the \n. */
768 c = java_get_unicode ();
774 java_lex_error ("Comment not terminated at end of input", 0);
777 /* Return true if C is a valid start character for a Java identifier.
778 This is only called if C >= 128 -- smaller values are handled
779 inline. However, this function handles all values anyway. */
/* The lookup is two-level: the high part of C (C / 256) selects an entry
   of type_table and the low eight bits index the selected page.  The
   result is the LETTER_START bit of the flags found.  */
781 java_start_char_p (unicode_t c)
783 unsigned int hi = c / 256;
784 const char *const page = type_table[hi];
785 unsigned long val = (unsigned long) page;
/* The page is only indexed when the entry, viewed as an integer, has bits
   set outside LETTER_MASK.  NOTE(review): the declaration of FLAGS and
   the other arm are not visible in this listing; presumably the entry
   value itself then serves as the flags for the whole page -- confirm.  */
788 if ((val & ~ LETTER_MASK) != 0)
789 flags = page[c & 255];
793 return flags & LETTER_START;
796 /* Return true if C is a valid part character for a Java identifier.
797 This is only called if C >= 128 -- smaller values are handled
798 inline. However, this function handles all values anyway. */
/* The lookup is two-level: the high part of C (C / 256) selects an entry
   of type_table and the low eight bits index the selected page.  The
   result is the LETTER_PART bit of the flags found.  */
800 java_part_char_p (unicode_t c)
802 unsigned int hi = c / 256;
803 const char *const page = type_table[hi];
804 unsigned long val = (unsigned long) page;
/* The page is only indexed when the entry, viewed as an integer, has bits
   set outside LETTER_MASK.  NOTE(review): the declaration of FLAGS and
   the other arm are not visible in this listing; presumably the entry
   value itself then serves as the flags for the whole page -- confirm.  */
807 if ((val & ~ LETTER_MASK) != 0)
808 flags = page[c & 255];
812 return flags & LETTER_PART;
815 /* Return true if C is whitespace. */
/* The lookup is two-level: the high part of C (C / 256) selects an entry
   of type_table and the low eight bits index the selected page.  The
   result is the LETTER_SPACE bit of the flags found.  */
817 java_space_char_p (unicode_t c)
819 unsigned int hi = c / 256;
820 const char *const page = type_table[hi];
821 unsigned long val = (unsigned long) page;
/* The page is only indexed when the entry, viewed as an integer, has bits
   set outside LETTER_MASK.  NOTE(review): the declaration of FLAGS and
   the other arm are not visible in this listing; presumably the entry
   value itself then serves as the flags for the whole page -- confirm.  */
824 if ((val & ~ LETTER_MASK) != 0)
825 flags = page[c & 255];
829 return flags & LETTER_SPACE;
/* Parse the escape sequence that follows a backslash inside a character
   or string literal and return the character it denotes.  An unknown
   escape is reported with "Invalid character in escape sequence" and
   JAVA_CHAR_ERROR is returned.
   NOTE(review): the case labels of the single-character escapes, the
   local declarations and the return of the octal value are not visible
   in this listing.  */
833 java_parse_escape_sequence (void)
838 switch (c = java_get_unicode ())
/* Single-character escapes, by returned code point: 0x8 backspace, 0x9
   tab, 0xa line feed, 0xc form feed, 0xd carriage return, 0x22 double
   quote, 0x27 single quote, 0x5c backslash.  */
841 return (unicode_t)0x8;
843 return (unicode_t)0x9;
845 return (unicode_t)0xa;
847 return (unicode_t)0xc;
849 return (unicode_t)0xd;
851 return (unicode_t)0x22;
853 return (unicode_t)0x27;
855 return (unicode_t)0x5c;
/* Octal escape: collect up to MAX octal digits.  */
856 case '0': case '1': case '2': case '3': case '4':
857 case '5': case '6': case '7':
860 int octal_escape_index = 0;
864 for (; octal_escape_index < max && RANGE (c, '0', '7');
865 c = java_get_unicode ())
/* NOTE(review): the statement guarded by this test is not visible;
   presumably it lowers MAX when the first digit is above 3 -- confirm.  */
867 if (octal_escape_index == 0 && c > '3')
869 /* According to the grammar, `\477' has a well-defined
870 meaning -- it is `\47' followed by `7'. */
873 octal_escape [octal_escape_index++] = c;
/* The loop read one character past the escape; push it back.  */
876 java_unget_unicode ();
/* Combine the digits, most significant first, three bits per digit.  */
878 for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
879 i < octal_escape_index; i++, shift -= 3)
880 char_lit |= (octal_escape [i] - '0') << shift;
885 java_lex_error ("Invalid character in escape sequence", 0);
886 return JAVA_CHAR_ERROR;
/* True when the real value X compares equal to dconst0.  */
891 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
893 /* Subroutine of java_lex: converts floating-point literals to tree
894 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
895 store the result. FFLAG indicates whether the literal was tagged
896 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
897 is the line number on which to report any error. */
/* NOTE(review): despite the wording above, the body assigns
   NUMBER_BEGINNING to ctxp->c_line->current, i.e. it is used as a
   position within the current line rather than as a line number.  */
899 static void java_perform_atof (YYSTYPE *, char *, int, int);
902 java_perform_atof (YYSTYPE *java_lval, char *literal_token, int fflag,
903 int number_beginning)
905 REAL_VALUE_TYPE value;
906 tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
/* Convert the text in the machine mode of the chosen type.  */
908 SET_REAL_VALUE_ATOF (value,
909 REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
/* An infinite or NaN result is reported as a range error.  */
911 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
913 JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
916 else if (IS_ZERO (value))
918 /* We check to see if the value is really 0 or if we've found an
919 underflow. We do this in the most primitive imaginable way. */
/* Scan the mantissa (everything before an exponent marker) for any
   character other than a zero digit or a decimal point.
   NOTE(review): the statements recording the outcome of this scan are
   not visible in this listing.  */
921 char *p = literal_token;
924 while (*p && *p != 'e' && *p != 'E')
926 if (*p != '0' && *p != '.')
/* Point the current position at the start of the number while the error
   is reported, then restore it.  */
935 int i = ctxp->c_line->current;
936 ctxp->c_line->current = number_beginning;
937 java_lex_error ("Floating point literal underflow", 0);
938 ctxp->c_line->current = i;
942 SET_LVAL_NODE (build_real (type, value));
946 static int yylex (YYSTYPE *);
950 yylex (YYSTYPE *java_lval)
952 do_java_lex (YYSTYPE *java_lval)
956 unicode_t first_unicode;
957 int ascii_index, all_ascii;
960 /* Translation of the Unicode escape in the raw stream of Unicode
961 characters. Takes care of line terminator. */
963 /* Skip white spaces: SP, TAB and FF or ULT. */
964 for (c = java_get_unicode ();
965 c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
968 ctxp->elc.line = ctxp->c_line->lineno;
969 ctxp->elc.col = ctxp->c_line->char_col-2;
972 ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
974 if (c == 0x1a) /* CTRL-Z. */
976 if ((c = java_get_unicode ()) == UEOF)
977 return 0; /* Ok here. */
979 java_unget_unicode (); /* Caught later, at the end of the
982 /* Handle EOF here. */
983 if (c == UEOF) /* Should probably do something here... */
986 /* Take care of eventual comments. */
989 switch (c = java_get_unicode ())
994 c = java_get_unicode ();
997 /* It is ok to end a `//' comment with EOF, unless
998 we're being pedantic. */
1000 java_lex_error ("Comment not terminated at end of input",
1004 if (c == '\n') /* ULT */
1010 if ((c = java_get_unicode ()) == '*')
1012 c = java_get_unicode ();
1015 /* Empty documentation comment. We have to reset
1016 the deprecation marker as only the most recent
1017 doc comment applies. */
1018 ctxp->deprecated = 0;
1021 java_parse_doc_section (c);
1024 java_parse_end_comment ((c = java_get_unicode ()));
1028 java_unget_unicode ();
1034 ctxp->elc.line = ctxp->c_line->lineno;
1035 ctxp->elc.prev_col = ctxp->elc.col;
1036 ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
1037 if (ctxp->elc.col < 0)
1040 /* Numeric literals. */
1041 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
1043 /* This section of code is borrowed from gcc/c-lex.c. */
1044 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
1045 int parts[TOTAL_PARTS];
1046 HOST_WIDE_INT high, low;
1047 /* End borrowed section. */
1048 char literal_token [256];
1049 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
1050 int found_hex_digits = 0, found_non_octal_digits = 0;
1053 int number_beginning = ctxp->c_line->current;
1057 /* We might have a . separator instead of a FP like .[0-9]*. */
1060 unicode_t peep = java_sneak_unicode ();
1062 if (!JAVA_ASCII_DIGIT (peep))
1065 BUILD_OPERATOR (DOT_TK);
1069 for (i = 0; i < TOTAL_PARTS; i++)
1074 c = java_get_unicode ();
1075 if (c == 'x' || c == 'X')
1078 c = java_get_unicode ();
1080 else if (JAVA_ASCII_DIGIT (c))
1082 else if (c == '.' || c == 'e' || c =='E')
1084 /* Push the '.', 'e', or 'E' back and prepare for a FP
1086 java_unget_unicode ();
1091 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */
1092 JAVA_LEX_LIT ("0", 10);
1096 SET_LVAL_NODE (long_zero_node);
1097 return (INT_LIT_TK);
1099 SET_LVAL_NODE (float_zero_node);
1102 SET_LVAL_NODE (double_zero_node);
1105 java_unget_unicode ();
1106 SET_LVAL_NODE (integer_zero_node);
1107 return (INT_LIT_TK);
1111 /* Parse the first part of the literal, until we find something
1112 which is not a number. */
1113 while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1114 JAVA_ASCII_DIGIT (c))
1116 /* We store in a string (in case it turns out to be a FP) and in
1117 PARTS if we have to process a integer literal. */
1118 int numeric = hex_value (c);
1121 /* Remember when we find a valid hexadecimal digit. */
1123 found_hex_digits = 1;
1124 /* Remember when we find an invalid octal digit. */
1125 else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c))
1126 found_non_octal_digits = 1;
1128 literal_token [literal_index++] = c;
1129 /* This section of code is borrowed from gcc/c-lex.c. */
1130 for (count = 0; count < TOTAL_PARTS; count++)
1132 parts[count] *= radix;
1135 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1136 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1139 parts[0] += numeric;
1141 if (parts [TOTAL_PARTS-1] != 0)
1143 /* End borrowed section. */
1144 c = java_get_unicode ();
1147 /* If we have something from the FP char set but not a digit, parse
1149 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1152 int seen_digit = (literal_index ? 1 : 0);
1153 int seen_exponent = 0;
1154 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1155 double unless specified. */
1157 /* It is ok if the radix is 8 because this just means we've
1158 seen a leading `0'. However, radix==16 is invalid. */
1160 java_lex_error ("Can't express non-decimal FP literal", 0);
1170 literal_token [literal_index++ ] = c;
1171 c = java_get_unicode ();
1174 java_lex_error ("Invalid character in FP literal", 0);
1177 if (c == 'e' || c == 'E')
1181 /* {E,e} must have seen at least a digit. */
1184 ("Invalid FP literal, mantissa must have digit", 0);
1188 literal_token [literal_index++] = c;
1189 c = java_get_unicode ();
1192 java_lex_error ("Invalid character in FP literal", 0);
1194 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1196 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1197 stage = 4; /* So we fall through. */
1200 if ((c=='-' || c =='+') && stage == 2)
1203 literal_token [literal_index++] = c;
1204 c = java_get_unicode ();
1207 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1208 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1209 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1210 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1212 if (JAVA_ASCII_DIGIT (c))
1216 literal_token [literal_index++ ] = c;
1217 c = java_get_unicode ();
1221 if (stage != 4) /* Don't push back fF/dD. */
1222 java_unget_unicode ();
1224 /* An exponent (if any) must have seen a digit. */
1225 if (seen_exponent && !seen_digit)
1227 ("Invalid FP literal, exponent must have digit", 0);
1229 literal_token [literal_index] = '\0';
1230 JAVA_LEX_LIT (literal_token, radix);
1233 java_perform_atof (java_lval, literal_token,
1234 fflag, number_beginning);
1239 } /* JAVA_ASCII_FPCHAR (c) */
1241 /* Here we get back to converting the integral literal. */
1242 if (radix == 16 && ! found_hex_digits)
1244 ("0x must be followed by at least one hexadecimal digit", 0);
1245 else if (radix == 8 && found_non_octal_digits)
1246 java_lex_error ("Octal literal contains digit out of range", 0);
1247 else if (c == 'L' || c == 'l')
1250 java_unget_unicode ();
1252 #ifdef JAVA_LEX_DEBUG
1253 literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */
1254 JAVA_LEX_LIT (literal_token, radix);
1256 /* This section of code is borrowed from gcc/c-lex.c. */
1259 bytes = GET_TYPE_PRECISION (long_type_node);
1260 for (i = bytes; i < TOTAL_PARTS; i++)
1268 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1270 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1271 / HOST_BITS_PER_CHAR)]
1272 << (i * HOST_BITS_PER_CHAR));
1273 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1275 /* End borrowed section. */
1278 /* Range checking. */
1279 /* Temporarily set type to unsigned. */
1280 value = build_int_cst_wide (long_suffix
1281 ? unsigned_long_type_node
1282 : unsigned_int_type_node, low, high);
1283 SET_LVAL_NODE (value);
1285 /* For base 10 numbers, only values up to the highest value
1286 (plus one) can be written. For instance, only ints up to
1287 2147483648 can be written. The special case of the largest
1288 negative value is handled elsewhere. For other bases, any
1289 number can be represented. */
1290 if (overflow || (radix == 10
1291 && tree_int_cst_lt (long_suffix
1297 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1299 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1302 /* Sign extend the value. */
1303 value = build_int_cst_wide (long_suffix ? long_type_node : int_type_node,
1305 value = force_fit_type (value, 0, false, false);
1309 value = copy_node (value);
1310 JAVA_NOT_RADIX10_FLAG (value) = 1;
1313 SET_LVAL_NODE (value);
1318 /* Character literals. */
1323 if ((c = java_get_unicode ()) == '\\')
1324 char_lit = java_parse_escape_sequence ();
1327 if (c == '\n' || c == '\'')
1328 java_lex_error ("Invalid character literal", 0);
1332 c = java_get_unicode ();
1334 if ((c == '\n') || (c == UEOF))
1335 java_lex_error ("Character literal not terminated at end of line", 0);
1337 java_lex_error ("Syntax error in character literal", 0);
1339 if (char_lit == JAVA_CHAR_ERROR)
1340 char_lit = 0; /* We silently convert it to zero. */
1342 JAVA_LEX_CHAR_LIT (char_lit);
1343 SET_LVAL_NODE (build_int_cst (char_type_node, char_lit));
1347 /* String literals. */
1353 for (no_error = 1, c = java_get_unicode ();
1354 c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1357 c = java_parse_escape_sequence ();
1358 if (c == JAVA_CHAR_ERROR)
1361 c = 0; /* We silently convert it to zero. */
1363 java_unicode_2_utf8 (c);
1365 if (c == '\n' || c == UEOF) /* ULT. */
1367 input_line--; /* Refer to the line where the terminator was seen. */
1368 java_lex_error ("String not terminated at end of line", 0);
1372 obstack_1grow (&temporary_obstack, '\0');
1373 string = obstack_finish (&temporary_obstack);
1375 if (!no_error || (c != '"'))
1376 java_lval->node = error_mark_node; /* FIXME: Requires further
1379 java_lval->node = build_string (strlen (string), string);
1381 obstack_free (&temporary_obstack, string);
1382 return STRING_LIT_TK;
1390 BUILD_OPERATOR (OP_TK);
1396 if (ctxp->ccb_indent == 1)
1397 ctxp->first_ccb_indent1 = input_line;
1399 BUILD_OPERATOR (OCB_TK);
1403 if (ctxp->ccb_indent == 1)
1404 ctxp->last_ccb_indent1 = input_line;
1405 BUILD_OPERATOR (CCB_TK);
1408 BUILD_OPERATOR (OSB_TK);
1420 BUILD_OPERATOR (DOT_TK);
1421 /* return DOT_TK; */
1428 if ((c = java_get_unicode ()) == '=')
1430 BUILD_OPERATOR (EQ_TK);
1434 /* Equals is used in two different locations. In the
1435 variable_declarator: rule, it has to be seen as '=' as opposed
1436 to being seen as an ordinary assignment operator in
1437 assignment_operators: rule. */
1438 java_unget_unicode ();
1439 BUILD_OPERATOR (ASSIGN_TK);
1443 switch ((c = java_get_unicode ()))
1446 BUILD_OPERATOR (GTE_TK);
1448 switch ((c = java_get_unicode ()))
1451 if ((c = java_get_unicode ()) == '=')
1453 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1457 java_unget_unicode ();
1458 BUILD_OPERATOR (ZRS_TK);
1461 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1463 java_unget_unicode ();
1464 BUILD_OPERATOR (SRS_TK);
1467 java_unget_unicode ();
1468 BUILD_OPERATOR (GT_TK);
1472 switch ((c = java_get_unicode ()))
1475 BUILD_OPERATOR (LTE_TK);
1477 if ((c = java_get_unicode ()) == '=')
1479 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1483 java_unget_unicode ();
1484 BUILD_OPERATOR (LS_TK);
1487 java_unget_unicode ();
1488 BUILD_OPERATOR (LT_TK);
1492 switch ((c = java_get_unicode ()))
1495 BUILD_OPERATOR (BOOL_AND_TK);
1497 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1499 java_unget_unicode ();
1500 BUILD_OPERATOR (AND_TK);
1504 switch ((c = java_get_unicode ()))
1507 BUILD_OPERATOR (BOOL_OR_TK);
1509 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1511 java_unget_unicode ();
1512 BUILD_OPERATOR (OR_TK);
1516 switch ((c = java_get_unicode ()))
1519 BUILD_OPERATOR (INCR_TK);
1521 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1523 java_unget_unicode ();
1524 BUILD_OPERATOR (PLUS_TK);
1528 switch ((c = java_get_unicode ()))
1531 BUILD_OPERATOR (DECR_TK);
1533 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1535 java_unget_unicode ();
1536 BUILD_OPERATOR (MINUS_TK);
1540 if ((c = java_get_unicode ()) == '=')
1542 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1546 java_unget_unicode ();
1547 BUILD_OPERATOR (MULT_TK);
1551 if ((c = java_get_unicode ()) == '=')
1553 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1557 java_unget_unicode ();
1558 BUILD_OPERATOR (DIV_TK);
1562 if ((c = java_get_unicode ()) == '=')
1564 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1568 java_unget_unicode ();
1569 BUILD_OPERATOR (XOR_TK);
1573 if ((c = java_get_unicode ()) == '=')
1575 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1579 java_unget_unicode ();
1580 BUILD_OPERATOR (REM_TK);
1584 if ((c = java_get_unicode()) == '=')
1586 BUILD_OPERATOR (NEQ_TK);
1590 java_unget_unicode ();
1591 BUILD_OPERATOR (NEG_TK);
1596 BUILD_OPERATOR (REL_QM_TK);
1599 BUILD_OPERATOR (REL_CL_TK);
1601 BUILD_OPERATOR (NOT_TK);
1604 /* Keyword, boolean literal or null literal. */
1605 for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1606 c != UEOF && JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1608 java_unicode_2_utf8 (c);
1609 if (all_ascii && c >= 128)
1614 obstack_1grow (&temporary_obstack, '\0');
1615 string = obstack_finish (&temporary_obstack);
1617 java_unget_unicode ();
1619 /* If we have something all ascii, we consider a keyword, a boolean
1620 literal, a null literal or an all ASCII identifier. Otherwise,
1621 this is an identifier (possibly not respecting formation rule). */
1624 const struct java_keyword *kw;
1625 if ((kw=java_keyword (string, ascii_index)))
1627 JAVA_LEX_KW (string);
1630 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1631 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1632 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1633 case PRIVATE_TK: case STRICT_TK:
1634 SET_MODIFIER_CTX (kw->token);
1637 SET_LVAL_NODE (float_type_node);
1640 SET_LVAL_NODE (double_type_node);
1643 SET_LVAL_NODE (boolean_type_node);
1646 SET_LVAL_NODE (byte_type_node);
1649 SET_LVAL_NODE (short_type_node);
1652 SET_LVAL_NODE (int_type_node);
1655 SET_LVAL_NODE (long_type_node);
1658 SET_LVAL_NODE (char_type_node);
1661 /* Keyword based literals. */
1664 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1665 boolean_true_node : boolean_false_node));
1668 SET_LVAL_NODE (null_pointer_node);
1674 BUILD_OPERATOR (kw->token);
1680 /* For some keywords we want to retain information on the location
1681 where they were found. */
1693 BUILD_OPERATOR (kw->token);
1701 /* We may have an ID here. */
1702 if (JAVA_START_CHAR_P (first_unicode))
1704 JAVA_LEX_ID (string);
1705 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1709 /* Everything else is an invalid character in the input. */
1711 char lex_error_buffer [128];
1712 sprintf (lex_error_buffer, "Invalid character `%s' in input",
1713 java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1714 java_lex_error (lex_error_buffer, 1);
1721 /* The exported interface to the lexer.  JAVA_LVAL is handed unchanged
   to do_java_lex, and that call's result is stored in `r'.  The call is
   bracketed by timevar_push/timevar_pop on TV_LEX, so only the time spent
   inside do_java_lex falls between the two timevar calls.
   NOTE(review): `r' is presumably what gets returned -- confirm.  */
1723 java_lex (YYSTYPE *java_lval)
1727 timevar_push (TV_LEX);
1728 r = do_java_lex (java_lval);
1729 timevar_pop (TV_LEX);
1733 /* This is called by the parser to see if an error should be generated
1734 due to numeric overflow. This function only handles the particular
1735 case of the largest negative value, and is only called in the case
1736 where this value is not preceded by `-'. */
/* VALUE is the literal's tree node.  An error is raised through
   java_lex_error (with a FORWARD argument of 0) only when all three tests
   below hold.  */
1738 error_if_numeric_overflow (tree value)
/* The node must be an INTEGER_CST, must not carry JAVA_NOT_RADIX10_FLAG,
   and tree_int_cst_sgn must report it as negative.  */
1740 if (TREE_CODE (value) == INTEGER_CST
1741 && !JAVA_NOT_RADIX10_FLAG (value)
1742 && tree_int_cst_sgn (value) < 0)
/* The wording of the message depends on whether the literal's type is
   long_type_node.  NOTE(review): the `int' message is presumably the
   alternative branch of this test -- confirm.  */
1744 if (TREE_TYPE (value) == long_type_node)
1745 java_lex_error ("Numeric overflow for `long' literal", 0);
1747 java_lex_error ("Numeric overflow for `int' literal", 0);
1751 #endif /* JC1_LITE */
/* Append an encoding of UNICODE to temporary_obstack, one byte per
   obstack_1grow call.  Three byte lengths are produced, chosen by the
   value of UNICODE.  Nothing here terminates the growing object.
   NOTE(review): RANGE is presumably an inclusive interval test -- confirm
   against its definition.  */
1754 java_unicode_2_utf8 (unicode_t unicode)
/* 0x01-0x7f: emitted as a single byte equal to the value itself.  */
1756 if (RANGE (unicode, 0x01, 0x7f))
1757 obstack_1grow (&temporary_obstack, (char)unicode);
/* 0x80-0x7ff, and also the value 0: two bytes.  For UNICODE == 0 the
   expressions below evaluate to 0xc0 followed by 0x80, so a zero code
   point never puts a zero byte on the obstack.  */
1758 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
/* Lead byte: 0xc0 combined with bits 6-10 of UNICODE.  */
1760 obstack_1grow (&temporary_obstack,
1761 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
/* Trailing byte: 0x80 combined with the low six bits.  */
1762 obstack_1grow (&temporary_obstack,
1763 (unsigned char)(0x80 | (unicode & 0x3f)));
1765 else /* Range 0x800-0xffff. */
/* Lead byte: 0xe0 combined with bits 12-15.  `>>' binds tighter than `|',
   so the masked value is shifted before being or-ed in.  */
1767 obstack_1grow (&temporary_obstack,
1768 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
/* Middle byte: 0x80 combined with bits 6-11.  */
1769 obstack_1grow (&temporary_obstack,
1770 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
/* Last byte: 0x80 combined with the low six bits.  */
1771 obstack_1grow (&temporary_obstack,
1772 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Replace NODE by the result of build_expr_wfl applied to it, using the
   file name and the elc line/column currently recorded in ctxp, then
   clear the TREE_TYPE of the result.
   NOTE(review): the rewritten NODE is presumably the return value --
   confirm.  */
1778 build_wfl_node (tree node)
1780 node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1781 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1782 TREE_TYPE (node) = NULL_TREE;
/* Record where a lexical error happened.  ctxp->elc.line is set to the
   current line's lineno, and ctxp->elc.col to that line's char_col minus
   one plus FORWARD, so FORWARD shifts the recorded column to the right.
   ctxp->java_error_flag is then cleared.
   NOTE(review): both parameters carry ATTRIBUTE_UNUSED although FORWARD
   is used below; presumably some build configuration compiles these
   statements out -- confirm.  */
1788 java_lex_error (const char *msg ATTRIBUTE_UNUSED, int forward ATTRIBUTE_UNUSED)
1791 ctxp->elc.line = ctxp->c_line->lineno;
1792 ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1794 /* Might be caught in the middle of some error report. */
1795 ctxp->java_error_flag = 0;
/* Takes a stream FP and a character C; java_get_line_col calls it with
   the character it has just read from FP, and treats a nonzero result as
   the end of a line.  */
1803 java_is_eol (FILE *fp, int c)
/* `next' is compared against both '\n' and EOF.
   NOTE(review): presumably `next' is a character read ahead from FP and
   this branch deals with one that does not belong to the line
   terminator -- confirm.  */
1810 if (next != '\n' && next != EOF)
/* Build a two-line string on temporary_obstack for pointing at a source
   position: the text of line LINE of FILENAME followed by '\n', then a
   run of padding characters ending in '^'.  The finished obstack object
   is returned.  The file is opened afresh with fopen on every call.
   NOTE(review): the parameters carry ATTRIBUTE_UNUSED although they are
   used below; presumably some build configuration compiles this body
   out -- confirm.  */
1822 java_get_line_col (const char *filename ATTRIBUTE_UNUSED,
1823 int line ATTRIBUTE_UNUSED, int col ATTRIBUTE_UNUSED)
1828 /* Dumb implementation. Doesn't try to cache or optimize things. */
1829 /* First line of the file is line 1, first column is 1. */
1831 /* COL == -1 means, at the CR/LF in LINE. */
1832 /* COL == -2 means, at the first non space char in LINE. */
1835 int c, ccol, cline = 1;
1836 int current_line_col = 0;
1837 int first_non_space = 0;
/* Failure to open FILENAME goes to fatal_error.  */
1840 if (!(fp = fopen (filename, "r")))
1841 fatal_error ("can't open %s: %m", filename);
/* Skip forward until the line counter reaches LINE.  */
1843 while (cline != line)
/* This placeholder text is put on the obstack in place of the line; the
   terminating NUL of msg is left out by the sizeof(msg)-1 length.  */
1848 static const char msg[] = "<<file too short - unexpected EOF>>";
1849 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1852 if (java_is_eol (fp, c))
1856 /* Gather the chars of the current line in a buffer. */
/* A negative C or an end of line stops the gathering.  */
1860 if (c < 0 || java_is_eol (fp, c))
/* Remember the column of the first character that is not white space.
   Because the test is on first_non_space being zero, a recorded value of
   0 is indistinguishable from "not recorded yet".  */
1862 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1863 first_non_space = current_line_col;
1864 obstack_1grow (&temporary_obstack, c);
/* End the copied source line before the marker line is built.  */
1869 obstack_1grow (&temporary_obstack, '\n');
/* Resolve COL to a concrete column (see the COL == -1 and COL == -2
   notes above).  first_non_space is reset to 0 in two of the three
   assignments below, which turns off the tab copying in the padding
   loop.  */
1873 col = current_line_col;
1874 first_non_space = 0;
1877 col = first_non_space;
1879 first_non_space = 0;
1881 /* Place the '^' at the right position. */
1882 base = obstack_base (&temporary_obstack);
/* COL + 3 padding characters are emitted ahead of the '^'.  */
1883 for (ccol = 1; ccol <= col+3; ccol++)
1885 /* While first_non_space is nonzero, emit a tab wherever the gathered
   line holds a tab at index ccol-1, presumably so that the marker stays
   aligned under tab-indented text; emit a space in every other case.
   Note that this `c' shadows the `int c' declared above.  */
1886 char c = (first_non_space ?
1887 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1888 obstack_1grow (&temporary_obstack, c);
/* obstack_grow0 appends the '^' together with a terminating NUL.  */
1890 obstack_grow0 (&temporary_obstack, "^", 1);
1893 return obstack_finish (&temporary_obstack);
/* Compare the LENGTH bytes starting at STR with the NUL-terminated
   string NAME.  For each character of NAME, one character is fetched
   from STR with UTF8_GET, which is given the end of the buffer as its
   limit.  If NAME is exhausted, the result is 0 when STR was consumed
   exactly up to its end and 1 otherwise.  */
1899 utf8_cmp (const unsigned char *str, int length, const char *name)
1901 const unsigned char *limit = str + length;
1904 for (i = 0; name[i]; ++i)
1906 int ch = UTF8_GET (str, limit);
/* The result here is the difference between the decoded character and
   the NAME character.  NOTE(review): presumably guarded by a test that
   the two differ -- confirm.  */
1908 return ch - name[i];
1911 return str == limit ? 0 : 1;
1914 /* A sorted list of all C++ keywords. */
/* cxx_keyword_p probes this table at a midpoint index and compares the
   entry with utf8_cmp, so the ordering matters to that lookup.
   NOTE(review): the order is presumably plain byte order -- confirm
   before adding entries.  */
1916 static const char *const cxx_keywords[] =
2024 /* Return true if NAME is a C++ keyword. */
/* NAME is LENGTH bytes long.  The search probes cxx_keywords at the
   midpoint of first and last.  A NAME consisting of a keyword followed
   only by `$' characters is also treated as a match (see the scan at the
   end).  */
2027 cxx_keyword_p (const char *name, int length)
/* `last' starts at the number of entries in the table.  */
2029 int last = ARRAY_SIZE (cxx_keywords);
2031 int mid = (last + first) / 2;
/* Each iteration remembers the previous midpoint in `old' and
   recomputes `mid' from the current first and last.  */
2034 for (mid = (last + first) / 2;
2036 old = mid, mid = (last + first) / 2)
/* Only the shorter of the two lengths is passed to utf8_cmp, so NAME is
   compared against the keyword over at most the keyword's own length.  */
2038 int kwl = strlen (cxx_keywords[mid]);
2039 int min_length = kwl > length ? length : kwl;
2040 int r = utf8_cmp ((const unsigned char *) name, min_length, cxx_keywords[mid]);
2045 /* We've found a match if all the remaining characters are `$'. */
/* Advance i over the trailing `$' characters of NAME, starting right
   after the compared prefix.  */
2046 for (i = min_length; i < length && name[i] == '$'; ++i)
2060 #endif /* JC1_LITE */