1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
3 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
5 This file is part of GNU CC.
7 GNU CC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU CC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU CC; see the file COPYING. If not, write to
19 the Free Software Foundation, 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA.
22 Java and all Java-based marks are trademarks or registered trademarks
23 of Sun Microsystems, Inc. in the United States and other countries.
24 The Free Software Foundation is independent of Sun Microsystems, Inc. */
26 /* This file defines java_lex (yylex), which reads a Java ASCII source
27 file possibly containing Unicode escape sequences or UTF-8 encoded
28 characters and returns a token for everything found except comments,
29 white space and line terminators. When necessary, it also fills
30 the java_lval (yylval) union. It is implemented to be called by a
31 re-entrant parser generated by Bison.
33 The lexical analysis conforms to the Java grammar described in "The
34 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
35 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
39 #include "chartables.h"
/* Forward declarations for the helpers defined in this file, followed by
   the file-scope flags describing the byte order of the UCS-2 data
   returned by iconv.
   NOTE(review): java_store_unicode is declared twice below with the same
   parameter list; one of the two declarations looks redundant.  */
41 /* Function declaration */
42 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
43 static void java_unicode_2_utf8 PARAMS ((unicode_t));
44 static void java_lex_error PARAMS ((const char *, int));
46 static int java_is_eol PARAMS ((FILE *, int));
47 static tree build_wfl_node PARAMS ((tree));
49 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
50 static int java_parse_escape_sequence PARAMS ((void));
51 static int java_start_char_p PARAMS ((unicode_t));
52 static int java_part_char_p PARAMS ((unicode_t));
53 static int java_parse_doc_section PARAMS ((int));
54 static void java_parse_end_comment PARAMS ((int));
55 static int java_get_unicode PARAMS ((void));
56 static int java_read_unicode PARAMS ((java_lexer *, int *));
57 static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
59 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
60 static int java_read_char PARAMS ((java_lexer *));
61 static void java_allocate_new_line PARAMS ((void));
62 static void java_unget_unicode PARAMS ((void));
63 static unicode_t java_sneak_unicode PARAMS ((void));
65 static int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
68 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
70 static void error_if_numeric_overflow PARAMS ((tree));
74 /* This is nonzero if we have initialized `need_byteswap'. */
75 static int byteswap_init = 0;
77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
78 big-endian order -- not native endian order. We handle this by
79 doing a conversion once at startup and seeing what happens. This
80 flag holds the results of this determination. */
81 static int need_byteswap = 0;
/* Prepare the lexer and the parser context CTXP for a new input.
   FINPUT and ENCODING are passed on to java_new_lexer, whose result
   becomes ctxp->lexer.  Before that, the identifiers and WFL nodes
   shared with the parser are created and a number of CTXP fields are
   reset.
   NOTE(review): the parameter declarations and several guarding
   conditions of this function are not visible in this excerpt.  */
85 java_init_lex (finput, encoding)
90 int java_lang_imported = 0;
93 java_lang_id = get_identifier ("java.lang");
94 if (!java_lang_cloneable)
95 java_lang_cloneable = get_identifier ("java.lang.Cloneable");
96 if (!java_io_serializable)
97 java_io_serializable = get_identifier ("java.io.Serializable");
99 inst_id = get_identifier ("inst$");
101 wpv_id = get_identifier ("write_parm_value$");
/* Put java.lang at the head of the import-on-demand list and record
   that this has been done.  */
103 if (!java_lang_imported)
105 tree node = build_tree_list
106 (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
107 read_import_dir (TREE_PURPOSE (node));
108 TREE_CHAIN (node) = ctxp->import_demand_list;
109 ctxp->import_demand_list = node;
110 java_lang_imported = 1;
114 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
116 label_id = get_identifier ("$L");
118 wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
119 if (!wfl_string_buffer)
121 build_expr_wfl (get_identifier ("java.lang.StringBuffer"), NULL, 0, 0);
123 wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
125 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
126 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
/* Reset the context fields below, restart line counting at 0 and
   create the lexer for this input.  */
128 memset ((PTR) ctxp->modifier_ctx, 0, 11*sizeof (ctxp->modifier_ctx[0]));
129 memset ((PTR) current_jcf, 0, sizeof (JCF));
130 ctxp->current_parsed_class = NULL;
131 ctxp->package = NULL_TREE;
134 ctxp->filename = input_filename;
135 ctxp->lineno = lineno = 0;
138 ctxp->java_error_flag = 0;
139 ctxp->lexer = java_new_lexer (finput, encoding);
/* Format character I of LINE into a static buffer.  A character that
   came from a Unicode escape, or whose value is above 128, is printed
   as a backslash-u escape with four hex digits; otherwise the character
   itself is stored in buffer[0].  The buffer is static, so every call
   overwrites the previous result.
   NOTE(review): the test is > 128, so the value 128 itself is not
   escaped -- confirm this is intended.  */
143 java_sprint_unicode (line, i)
144 struct java_line *line;
147 static char buffer [10];
148 if (line->unicode_escape_p [i] || line->line [i] > 128)
149 sprintf (buffer, "\\u%04x", line->line [i]);
152 buffer [0] = line->line [i];
/* Peek at the character found at the current read position of the
   current line.  The read position is left unchanged.  */
159 java_sneak_unicode ()
161 return (ctxp->c_line->line [ctxp->c_line->current]);
/* Push the last character back: move the read position of the current
   line back by one and take JAVA_COLUMN_DELTA (0) off the column.  A
   read position of 0 is treated as a special case because there is
   nothing to unget (its handling is not visible in this excerpt).  */
165 java_unget_unicode ()
167 if (!ctxp->c_line->current)
168 /* Can't unget unicode. */
171 ctxp->c_line->current--;
172 ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
/* Start a new current line (ctxp->c_line).  The look-ahead character of
   the old current line and its escape flag are saved first so that they
   can be stored as the first character of the new line.  When the old
   line holds something other than white space it becomes the previous
   line (ctxp->p_line), whose old buffers are freed.  The new line gets
   the next line number and starts out flagged as white space only.
   NOTE(review): some of the conditions guarding the statements below
   are not visible in this excerpt.  */
176 java_allocate_new_line ()
178 unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
179 char ahead_escape_p = (ctxp->c_line ?
180 ctxp->c_line->unicode_escape_ahead_p : 0);
182 if (ctxp->c_line && !ctxp->c_line->white_space_only)
186 free (ctxp->p_line->unicode_escape_p);
187 free (ctxp->p_line->line);
190 ctxp->p_line = ctxp->c_line;
191 ctxp->c_line = NULL; /* Reallocated */
/* Allocate the line descriptor and its two parallel arrays: the
   characters and the per-character escape flags.  */
196 ctxp->c_line = (struct java_line *)xmalloc (sizeof (struct java_line));
197 ctxp->c_line->max = JAVA_LINE_MAX;
198 ctxp->c_line->line = (unicode_t *)xmalloc
199 (sizeof (unicode_t)*ctxp->c_line->max);
200 ctxp->c_line->unicode_escape_p =
201 (char *)xmalloc (sizeof (char)*ctxp->c_line->max);
202 ctxp->c_line->white_space_only = 0;
205 ctxp->c_line->line [0] = ctxp->c_line->size = 0;
206 ctxp->c_line->char_col = ctxp->c_line->current = 0;
/* Store the saved look-ahead character and its escape flag.  */
209 ctxp->c_line->line [ctxp->c_line->size] = ahead;
210 ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
211 ctxp->c_line->size++;
213 ctxp->c_line->ahead [0] = 0;
214 ctxp->c_line->unicode_escape_ahead_p = 0;
215 ctxp->c_line->lineno = ++lineno;
216 ctxp->c_line->white_space_only = 1;
/* Allocate and initialize a lexer reading from FINPUT.  ENCODING is the
   name of the input encoding; it is handed to iconv_open to obtain a
   converter to UCS-2.  A probe conversion of the UTF-8 form of the
   character feff decides whether the converted data must be
   byte-swapped.  Per the comment further down, the internal decoder is
   selected when iconv failed and the default encoding was requested;
   the fatal error at the end reports an encoding that cannot be
   handled.  */
219 /* Create a new lexer object. */
222 java_new_lexer (finput, encoding)
224 const char *encoding;
226 java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
229 lex->finput = finput;
231 lex->unget_value = 0;
/* Ask iconv for a converter from ENCODING to UCS-2.  */
235 lex->handle = iconv_open ("UCS-2", encoding);
236 if (lex->handle != (iconv_t) -1)
242 lex->read_anything = 0;
243 lex->use_fallback = 0;
245 /* Work around broken iconv() implementations by doing checking at
246 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
247 then all UCS-2 encoders will be broken. Perhaps not a valid
255 handle = iconv_open ("UCS-2", "UTF-8");
256 if (handle != (iconv_t) -1)
263 /* This is the UTF-8 encoding of \ufeff. */
270 outp = (char *) &result;
273 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
275 iconv_close (handle);
276 /* Conversion must be complete for us to use the result. */
277 if (r != (size_t) -1 && inc == 0 && outc == 0)
278 need_byteswap = (result != 0xfeff);
282 lex->byte_swap = need_byteswap;
285 #endif /* HAVE_ICONV */
287 /* If iconv failed, use the internal decoder if the default
288 encoding was requested. This code is used on platforms where
289 iconv exists but is insufficient for our needs. For
290 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2. */
291 if (strcmp (encoding, DEFAULT_ENCODING))
295 lex->use_fallback = 1;
296 #endif /* HAVE_ICONV */
300 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
/* Dispose of LEX.  The iconv handle is closed unless the lexer was
   using the fallback decoder.
   NOTE(review): any further cleanup of LEX is not visible in this
   excerpt.  */
306 java_destroy_lexer (lex)
310 if (! lex->use_fallback)
311 iconv_close (lex->handle);
/* NOTE(review): the header of this function is missing from the excerpt;
   judging from the prototypes and the callers it is presumably the body
   of java_read_char (lex) -- confirm.
   Produce the next character from LEX.  A value saved in unget_value is
   consumed first.  Otherwise, unless the fallback decoder is in use, the
   input buffer is refilled from lex->finput when required and converted
   through iconv into out_buffer, from which one unicode_t is taken.
   With the fallback decoder, UTF-8 is decoded by hand: only 1-, 2- and
   3-byte sequences are accepted and anything else is reported as a
   malformed UTF-8 character.  */
320 if (lex->unget_value)
322 unicode_t r = lex->unget_value;
323 lex->unget_value = 0;
328 if (! lex->use_fallback)
330 size_t ir, inbytesleft, in_save, out_count, out_save;
334 /* If there is data which has already been converted, use it. */
335 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
342 /* See if we need to read more data. If FIRST == 0 then
343 the previous conversion attempt ended in the middle of
344 a character at the end of the buffer. Otherwise we
345 only have to read if the buffer is empty. */
346 if (lex->first == 0 || lex->first >= lex->last)
350 if (lex->first >= lex->last)
355 if (feof (lex->finput))
357 r = fread (&lex->buffer[lex->last], 1,
358 sizeof (lex->buffer) - lex->last,
363 inbytesleft = lex->last - lex->first;
364 out_count = sizeof (lex->out_buffer) - lex->out_last;
366 if (inbytesleft == 0)
368 /* We've tried to read and there is nothing left. */
/* Remember the byte counts so that the amount consumed and produced by
   iconv can be computed afterwards.  */
372 in_save = inbytesleft;
373 out_save = out_count;
374 inp = &lex->buffer[lex->first];
375 outp = &lex->out_buffer[lex->out_last];
376 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
377 &inbytesleft, &outp, &out_count);
379 /* If we haven't read any bytes, then look to see if we
381 if (! lex->read_anything && out_save - out_count >= 2)
383 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
389 else if (uc == 0xfffe)
394 lex->read_anything = 1;
400 for (i = 0; i < out_save - out_count; i += 2)
402 char t = lex->out_buffer[lex->out_last + i];
403 lex->out_buffer[lex->out_last + i]
404 = lex->out_buffer[lex->out_last + i + 1];
405 lex->out_buffer[lex->out_last + i + 1] = t;
409 lex->first += in_save - inbytesleft;
410 lex->out_last += out_save - out_count;
412 /* If we converted anything at all, move along. */
413 if (out_count != out_save)
416 if (ir == (size_t) -1)
420 /* This is ok. This means that the end of our buffer
421 is in the middle of a character sequence. We just
422 move the valid part of the buffer to the beginning
424 memmove (&lex->buffer[0], &lex->buffer[lex->first],
425 lex->last - lex->first);
426 lex->last -= lex->first;
431 /* A more serious error. */
432 java_lex_error ("unrecognized character in input stream",
440 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
442 /* Don't have any data. */
447 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
452 #endif /* HAVE_ICONV */
/* Hand-written UTF-8 decoding starts here.  */
455 c = getc (lex->finput);
460 return (unicode_t) c;
463 if ((c & 0xe0) == 0xc0)
465 c1 = getc (lex->finput);
466 if ((c1 & 0xc0) == 0x80)
468 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
469 /* Check for valid 2-byte characters. We explicitly
470 allow \0 because this encoding is common in the
472 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
476 else if ((c & 0xf0) == 0xe0)
478 c1 = getc (lex->finput);
479 if ((c1 & 0xc0) == 0x80)
481 c2 = getc (lex->finput);
482 if ((c2 & 0xc0) == 0x80)
484 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
487 /* Check for valid 3-byte characters.
488 Don't allow surrogate, \ufffe or \uffff. */
489 if (r >= 0x800 && r <= 0xffff
490 && ! (r >= 0xd800 && r <= 0xdfff)
491 && r != 0xfffe && r != 0xffff)
497 /* We simply don't support invalid characters. We also
498 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
499 cannot be valid Java characters. */
500 java_lex_error ("malformed UTF-8 character", 0);
504 /* We only get here on error. */
/* Append character C, together with its UNICODE_ESCAPE_P flag, to line
   L.  When the line is full, both parallel arrays (characters and
   escape flags) are grown by JAVA_LINE_MAX entries before storing.  */
509 java_store_unicode (l, c, unicode_escape_p)
512 int unicode_escape_p;
514 if (l->size == l->max)
516 l->max += JAVA_LINE_MAX;
517 l->line = (unicode_t *) xrealloc (l->line, sizeof (unicode_t)*l->max);
518 l->unicode_escape_p = (char *) xrealloc (l->unicode_escape_p,
519 sizeof (char)*l->max);
521 l->line [l->size] = c;
522 l->unicode_escape_p [l->size++] = unicode_escape_p;
/* Read one character from LEX, translating Unicode escapes (a
   backslash, one or more u, then four hex digits) into the character
   they denote.  *UNICODE_ESCAPE_P is set to 1 when the returned
   character came from such an escape and to 0 otherwise.  An escape is
   only looked for when an odd number of backslashes has been seen
   (lex->bs_count).  When the backslash does not introduce an escape,
   the character read after it is saved in lex->unget_value and the
   backslash itself is returned.  Errors are reported through
   java_lex_error.  */
526 java_read_unicode (lex, unicode_escape_p)
528 int *unicode_escape_p;
532 c = java_read_char (lex);
533 *unicode_escape_p = 0;
542 if ((lex->bs_count) % 2 == 1)
544 /* Odd number of \ seen. */
545 c = java_read_char (lex);
548 unicode_t unicode = 0;
551 /* Recognize any number of `u's in \u. */
552 while ((c = java_read_char (lex)) == 'u')
555 /* Unget the most recent character as it is not a `u'. */
558 lex->unget_value = c;
560 /* Next should be 4 hex digits, otherwise it's an error.
561 The hex value is converted into the unicode, pushed into
562 the Unicode stream. */
563 for (shift = 12; shift >= 0; shift -= 4)
565 if ((c = java_read_char (lex)) == UEOF)
568 unicode |= (unicode_t)((c-'0') << shift);
569 else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
570 unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift);
572 java_lex_error ("Non hex digit in Unicode escape sequence", 0);
575 *unicode_escape_p = 1;
578 lex->unget_value = c;
580 return (unicode_t) '\\';
/* Read one character from LEX through java_read_unicode, passing
   UNICODE_ESCAPE_P along.  Per the comments below, a carriage return
   followed by a line feed is collapsed into a single line terminator:
   one more character is read ahead, it is saved in lex->unget_value
   when it has to be re-read, and a newline is returned.
   NOTE(review): the conditions selecting these paths are not visible in
   this excerpt.  */
584 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
586 int *unicode_escape_p;
588 int c = java_read_unicode (lex, unicode_escape_p);
592 /* We have to read ahead to see if we got \r\n. In that case we
593 return a single line terminator. */
595 c = java_read_unicode (lex, &dummy);
597 lex->unget_value = c;
598 /* In either case we must return a newline. */
/* NOTE(review): the header of this function is missing from the excerpt;
   it is presumably the body of java_get_unicode () -- confirm.
   Return the character at the read position of the current line and
   advance that position, adding JAVA_COLUMN_DELTA (0) to the column.
   When there is no current line, or it has been consumed, a new line is
   allocated and filled with the characters delivered by
   java_read_unicode_collapsing_terminators; the newline or UEOF ending
   the line is stored as well.  hit_eof is set on the lexer when UEOF is
   met while found_chars is zero.  */
608 /* It's time to read a line when... */
609 if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
614 if (ctxp->lexer->hit_eof)
617 java_allocate_new_line ();
618 if (ctxp->c_line->line[0] != '\n')
622 int unicode_escape_p;
623 c = java_read_unicode_collapsing_terminators (ctxp->lexer,
628 java_store_unicode (ctxp->c_line, c, unicode_escape_p);
629 if (ctxp->c_line->white_space_only
630 && !JAVA_WHITE_SPACE_P (c)
632 ctxp->c_line->white_space_only = 0;
634 if ((c == '\n') || (c == UEOF))
638 if (c == UEOF && ! found_chars)
640 ctxp->lexer->hit_eof = 1;
645 ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
646 JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
647 return ctxp->c_line->line [ctxp->c_line->current++];
/* Consume the rest of a C style comment, reading characters with
   java_get_unicode.  Reaching the end of the input first is reported
   through java_lex_error.  A star met while looking for the closing
   slash is pushed back so that it is examined again.  */
650 /* Parse the end of a C style comment.
651 * C is the first character following the '/' and '*'. */
653 java_parse_end_comment (c)
656 for ( ;; c = java_get_unicode ())
661 java_lex_error ("Comment not terminated at end of input", 0);
664 switch (c = java_get_unicode ())
667 java_lex_error ("Comment not terminated at end of input", 0);
671 case '*': /* reparse only '*' */
672 java_unget_unicode ();
/* C is the first character of the documentation comment text.  Returns
   1 when the comment is found to end here (a slash right after a star);
   the return value of the other paths is not visible in this excerpt.
   A tag spelled deprecated sets ctxp->deprecated.  Reaching the end of
   the input is reported through java_lex_error.
   NOTE(review): the declaration of tag is not visible here.  The loop
   below can leave tag_index at 10 before tag [tag_index] is written, so
   tag needs room for at least 11 characters -- confirm.  */
678 /* Parse the documentation section. Keywords must be at the beginning
679 of a documentation comment line (ignoring white space and any `*'
680 character). Parsed keyword(s): @DEPRECATED. */
683 java_parse_doc_section (c)
686 int valid_tag = 0, seen_star = 0;
688 while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
700 c = java_get_unicode();
704 java_lex_error ("Comment not terminated at end of input", 0);
706 if (seen_star && (c == '/'))
707 return 1; /* Goto step1 in caller */
709 /* We're parsing @deprecated */
710 if (valid_tag && (c == '@'))
715 while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
717 c = java_get_unicode ();
718 tag [tag_index++] = c;
722 java_lex_error ("Comment not terminated at end of input", 0);
723 tag [tag_index] = '\0';
725 if (!strcmp (tag, "deprecated"))
726 ctxp->deprecated = 1;
728 java_unget_unicode ();
/* The lookup is done per page of 256 characters: type_table is indexed
   by the high byte of C.  When the entry has bits set outside
   LETTER_PART | LETTER_START it is used as a pointer to a table giving
   the flags of each character of the page; the other case is not
   visible in this excerpt (presumably the entry itself holds the flags
   shared by the whole page -- confirm).  The result is nonzero when
   LETTER_START is set in the flags.  */
732 /* Return true if C is a valid start character for a Java identifier.
733 This is only called if C >= 128 -- smaller values are handled
734 inline. However, this function handles all values anyway. */
736 java_start_char_p (c)
739 unsigned int hi = c / 256;
740 char *page = type_table[hi];
741 unsigned long val = (unsigned long) page;
744 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
745 flags = page[c & 255];
749 return flags & LETTER_START;
/* NOTE(review): the header of this function is missing from the excerpt;
   given the comment below and the prototypes it is presumably
   java_part_char_p (c) -- confirm.
   The lookup is done per page of 256 characters: type_table is indexed
   by the high byte of C, and an entry with bits set outside
   LETTER_PART | LETTER_START is used as a pointer to the per-character
   flags of the page.  The result is nonzero when LETTER_PART is set in
   the flags.  */
752 /* Return true if C is a valid part character for a Java identifier.
753 This is only called if C >= 128 -- smaller values are handled
754 inline. However, this function handles all values anyway. */
759 unsigned int hi = c / 256;
760 char *page = type_table[hi];
761 unsigned long val = (unsigned long) page;
764 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
765 flags = page[c & 255];
769 return flags & LETTER_PART;
/* Decode the escape sequence whose introducing backslash has already
   been read.  The next character, read with java_get_unicode, selects
   one of the single character escapes (the values returned are 0x8,
   0x9, 0xa, 0xc, 0xd, 0x22, 0x27 and 0x5c) or starts an octal escape.
   The octal digits are collected in octal_escape, at most MAX of them;
   the character that stopped the scan is pushed back and the value is
   assembled three bits per digit.  Any other character is reported
   through java_lex_error and JAVA_CHAR_ERROR is returned.
   NOTE(review): the case labels of the single character escapes and
   the value given to MAX are not visible in this excerpt.  */
773 java_parse_escape_sequence ()
778 switch (c = java_get_unicode ())
781 return (unicode_t)0x8;
783 return (unicode_t)0x9;
785 return (unicode_t)0xa;
787 return (unicode_t)0xc;
789 return (unicode_t)0xd;
791 return (unicode_t)0x22;
793 return (unicode_t)0x27;
795 return (unicode_t)0x5c;
796 case '0': case '1': case '2': case '3': case '4':
797 case '5': case '6': case '7':
800 int octal_escape_index = 0;
804 for (; octal_escape_index < max && RANGE (c, '0', '7');
805 c = java_get_unicode ())
807 if (octal_escape_index == 0 && c > '3')
809 /* According to the grammar, `\477' has a well-defined
810 meaning -- it is `\47' followed by `7'. */
813 octal_escape [octal_escape_index++] = c;
816 java_unget_unicode ();
818 for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
819 i < octal_escape_index; i++, shift -= 3)
820 char_lit |= (octal_escape [i] - '0') << shift;
825 java_lex_error ("Invalid character in escape sequence", 0);
826 return JAVA_CHAR_ERROR;
/* Conversion of a floating point literal.  java_perform_atof takes its
   arguments through AV, a pointer to a struct jpa_args, which is how it
   is handed to do_float_handler by the lexer.  The text in
   a->literal_token is converted to a REAL_VALUE_TYPE in the mode of the
   float or double type, as selected by a->fflag.  An infinite or NaN
   result is reported as a range error.  A zero result is checked
   against the digits of the literal in order to report an underflow,
   the error being located at a->number_beginning.  The resulting node
   is stored through a->java_lval.
   NOTE(review): most of the struct jpa_args declaration is not visible
   in this excerpt.  */
830 /* Isolate the code which may raise an arithmetic exception in its
839 int number_beginning;
842 #ifdef REAL_ARITHMETIC
843 #define IS_ZERO(X) (ereal_cmp (X, dconst0) == 0)
845 #define IS_ZERO(X) ((X) == 0)
848 static void java_perform_atof PARAMS ((PTR));
851 java_perform_atof (av)
854 struct jpa_args *a = (struct jpa_args *)av;
855 YYSTYPE *java_lval = a->java_lval;
856 int number_beginning = a->number_beginning;
857 REAL_VALUE_TYPE value;
858 tree type = (a->fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
860 SET_REAL_VALUE_ATOF (value,
861 REAL_VALUE_ATOF (a->literal_token, TYPE_MODE (type)));
863 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
865 JAVA_FLOAT_RANGE_ERROR ((a->fflag ? "float" : "double"));
868 else if (IS_ZERO (value))
870 /* We check to see if the value is really 0 or if we've found an
871 underflow. We do this in the most primitive imaginable way. */
873 char *p = a->literal_token;
/* Scan the mantissa, stopping at the exponent marker, for anything
   other than zeros and the decimal point.  */
876 while (*p && *p != 'e' && *p != 'E')
878 if (*p != '0' && *p != '.')
/* Report the underflow at the start of the literal, then restore the
   read position.  */
887 int i = ctxp->c_line->current;
888 ctxp->c_line->current = number_beginning;
889 java_lex_error ("Floating point literal underflow", 0);
890 ctxp->c_line->current = i;
894 SET_LVAL_NODE_TYPE (build_real (type, value), type);
/* NOTE(review): the header of the function whose body follows the
   prototype below is missing from the excerpt; given the file comment
   it is presumably java_lex (java_lval) -- confirm.
   Return the next token of the input.  White space and comments are
   skipped first; 0 is returned when the end of the input follows a
   CTRL-Z.  Then, in this order, numeric literals, character literals,
   string literals, separators and operators, and finally keywords and
   identifiers are recognized.  Anything else is reported as an invalid
   character.  Token values are stored through java_lval and the
   position of the token is recorded in ctxp->elc.
   NOTE(review): literal_token holds 256 characters and the digit loops
   below append to it with no visible bound check; confirm that a very
   long numeric literal cannot overrun it.  */
898 static int yylex PARAMS ((YYSTYPE *));
909 unicode_t first_unicode;
910 int ascii_index, all_ascii;
913 /* Translation of the Unicode escape in the raw stream of Unicode
914 characters. Takes care of line terminator. */
916 /* Skip white spaces: SP, TAB and FF or ULT */
917 for (c = java_get_unicode ();
918 c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
921 ctxp->elc.line = ctxp->c_line->lineno;
922 ctxp->elc.col = ctxp->c_line->char_col-2;
925 ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
927 if (c == 0x1a) /* CTRL-Z */
929 if ((c = java_get_unicode ()) == UEOF)
930 return 0; /* Ok here */
932 java_unget_unicode (); /* Caught later, at the end of the function */
934 /* Handle EOF here */
935 if (c == UEOF) /* Should probably do something here... */
938 /* Take care of eventual comments. */
941 switch (c = java_get_unicode ())
946 c = java_get_unicode ();
949 /* It is ok to end a `//' comment with EOF, unless
950 we're being pedantic. */
952 java_lex_error ("Comment not terminated at end of input",
956 if (c == '\n') /* ULT */
962 if ((c = java_get_unicode ()) == '*')
964 if ((c = java_get_unicode ()) == '/')
965 goto step1; /* Empty documentation comment */
966 else if (java_parse_doc_section (c))
970 java_parse_end_comment ((c = java_get_unicode ()));
974 java_unget_unicode ();
980 ctxp->elc.line = ctxp->c_line->lineno;
981 ctxp->elc.prev_col = ctxp->elc.col;
982 ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
983 if (ctxp->elc.col < 0)
986 /* Numeric literals */
987 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
989 /* This section of code is borrowed from gcc/c-lex.c */
990 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
991 int parts[TOTAL_PARTS];
992 HOST_WIDE_INT high, low;
993 /* End borrowed section */
994 char literal_token [256];
995 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
996 int found_hex_digits = 0;
999 int number_beginning = ctxp->c_line->current;
1003 /* We might have a . separator instead of a FP like .[0-9]* */
1006 unicode_t peep = java_sneak_unicode ();
1008 if (!JAVA_ASCII_DIGIT (peep))
1011 BUILD_OPERATOR (DOT_TK);
1015 for (i = 0; i < TOTAL_PARTS; i++)
1020 c = java_get_unicode ();
1021 if (c == 'x' || c == 'X')
1024 c = java_get_unicode ();
1026 else if (JAVA_ASCII_DIGIT (c))
1030 /* Push the '.' back and prepare for a FP parsing... */
1031 java_unget_unicode ();
1036 /* We have a zero literal: 0, 0{f,F}, 0{d,D} */
1037 JAVA_LEX_LIT ("0", 10);
1041 SET_LVAL_NODE (long_zero_node);
1042 return (INT_LIT_TK);
1044 SET_LVAL_NODE (float_zero_node);
1047 SET_LVAL_NODE (double_zero_node);
1050 java_unget_unicode ();
1051 SET_LVAL_NODE (integer_zero_node);
1052 return (INT_LIT_TK);
1056 /* Parse the first part of the literal, until we find something
1057 which is not a number. */
1058 while ((radix == 10 && JAVA_ASCII_DIGIT (c)) ||
1059 (radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1060 (radix == 8 && JAVA_ASCII_OCTDIGIT (c)))
1062 /* We store in a string (in case it turns out to be a FP) and in
1063 PARTS if we have to process an integer literal. */
1064 int numeric = (ISDIGIT (c) ? c-'0' : 10 +(c|0x20)-'a');
1067 /* Remember when we find a valid hexadecimal digit */
1069 found_hex_digits = 1;
1071 literal_token [literal_index++] = c;
1072 /* This section of code is borrowed from gcc/c-lex.c */
1073 for (count = 0; count < TOTAL_PARTS; count++)
1075 parts[count] *= radix;
1078 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1079 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1082 parts[0] += numeric;
1084 if (parts [TOTAL_PARTS-1] != 0)
1086 /* End borrowed section. */
1087 c = java_get_unicode ();
1090 /* If we have something from the FP char set but not a digit, parse
1092 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1095 int seen_digit = (literal_index ? 1 : 0);
1096 int seen_exponent = 0;
1097 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1098 double unless specified. */
1100 /* It is ok if the radix is 8 because this just means we've
1101 seen a leading `0'. However, radix==16 is invalid. */
1103 java_lex_error ("Can't express non-decimal FP literal", 0);
1113 literal_token [literal_index++ ] = c;
1114 c = java_get_unicode ();
1117 java_lex_error ("Invalid character in FP literal", 0);
1120 if (c == 'e' || c == 'E')
1124 /* {E,e} must have seen at least a digit */
1126 java_lex_error ("Invalid FP literal", 0);
1130 literal_token [literal_index++] = c;
1131 c = java_get_unicode ();
1134 java_lex_error ("Invalid character in FP literal", 0);
1136 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1138 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1139 stage = 4; /* So we fall through */
1142 if ((c=='-' || c =='+') && stage == 2)
1145 literal_token [literal_index++] = c;
1146 c = java_get_unicode ();
1149 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1150 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1151 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1152 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1154 if (JAVA_ASCII_DIGIT (c))
1156 literal_token [literal_index++ ] = c;
1157 c = java_get_unicode ();
1164 if (stage != 4) /* Don't push back fF/dD */
1165 java_unget_unicode ();
1167 /* An exponent (if any) must have seen a digit. */
1168 if (seen_exponent && !seen_digit)
1169 java_lex_error ("Invalid FP literal", 0);
1171 literal_token [literal_index] = '\0';
1172 JAVA_LEX_LIT (literal_token, radix);
1175 a.literal_token = literal_token;
1177 a.java_lval = java_lval;
1178 a.number_beginning = number_beginning;
1179 if (do_float_handler (java_perform_atof, (PTR) &a))
1182 JAVA_FLOAT_RANGE_ERROR ((fflag ? "float" : "double"));
1188 } /* JAVA_ASCII_FPCHAR (c) */
1190 if (radix == 16 && ! found_hex_digits)
1192 ("0x must be followed by at least one hexadecimal digit", 0);
1194 /* Here we get back to converting the integral literal. */
1195 if (c == 'L' || c == 'l')
1197 else if (radix == 16 && JAVA_ASCII_LETTER (c))
1198 java_lex_error ("Digit out of range in hexadecimal literal", 0);
1199 else if (radix == 8 && JAVA_ASCII_DIGIT (c))
1200 java_lex_error ("Digit out of range in octal literal", 0);
1201 else if (radix == 16 && !literal_index)
1202 java_lex_error ("No digit specified for hexadecimal literal", 0);
1204 java_unget_unicode ();
1206 #ifdef JAVA_LEX_DEBUG
1207 literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */
1208 JAVA_LEX_LIT (literal_token, radix);
1210 /* This section of code is borrowed from gcc/c-lex.c */
1213 bytes = GET_TYPE_PRECISION (long_type_node);
1214 for (i = bytes; i < TOTAL_PARTS; i++)
1222 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1224 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1225 / HOST_BITS_PER_CHAR)]
1226 << (i * HOST_BITS_PER_CHAR));
1227 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1229 /* End borrowed section. */
1231 /* Range checking */
1234 /* 9223372036854775808L is valid if operand of a '-'. Otherwise
1235 9223372036854775807L is the biggest `long' literal that can be
1236 expressed using a 10 radix. For other radixes, everything that
1237 fits within 64 bits is OK. */
1238 int hb = (high >> 31);
1239 if (overflow || (hb && low && radix == 10)
1240 || (hb && high & 0x7fffffff && radix == 10))
1241 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1245 /* 2147483648 is valid if operand of a '-'. Otherwise,
1246 2147483647 is the biggest `int' literal that can be
1247 expressed using a 10 radix. For other radixes, everything
1248 that fits within 32 bits is OK. As all literals are
1249 signed, we sign extend here. */
1250 int hb = (low >> 31) & 0x1;
1251 if (overflow || high || (hb && low & 0x7fffffff && radix == 10))
1252 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1256 value = build_int_2 (low, high);
1257 JAVA_RADIX10_FLAG (value) = radix == 10;
1258 SET_LVAL_NODE_TYPE (value, long_suffix ? long_type_node : int_type_node);
1260 SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1261 long_suffix ? long_type_node : int_type_node);
1266 /* Character literals */
1270 if ((c = java_get_unicode ()) == '\\')
1271 char_lit = java_parse_escape_sequence ();
1274 if (c == '\n' || c == '\'')
1275 java_lex_error ("Invalid character literal", 0);
1279 c = java_get_unicode ();
1281 if ((c == '\n') || (c == UEOF))
1282 java_lex_error ("Character literal not terminated at end of line", 0);
1284 java_lex_error ("Syntax error in character literal", 0);
1286 if (char_lit == JAVA_CHAR_ERROR)
1287 char_lit = 0; /* We silently convert it to zero */
1289 JAVA_LEX_CHAR_LIT (char_lit);
1290 SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1294 /* String literals */
1300 for (no_error = 1, c = java_get_unicode ();
1301 c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1304 c = java_parse_escape_sequence ();
1305 if (c == JAVA_CHAR_ERROR)
1308 c = 0; /* We silently convert it to zero. */
1310 java_unicode_2_utf8 (c);
1312 if (c == '\n' || c == UEOF) /* ULT */
1314 lineno--; /* Refer to the line the terminator was seen */
1315 java_lex_error ("String not terminated at end of line", 0);
1319 obstack_1grow (&temporary_obstack, '\0');
1320 string = obstack_finish (&temporary_obstack);
1322 if (!no_error || (c != '"'))
1323 java_lval->node = error_mark_node; /* Requires further testing FIXME */
1325 java_lval->node = build_string (strlen (string), string);
1327 obstack_free (&temporary_obstack, string);
1328 return STRING_LIT_TK;
1336 BUILD_OPERATOR (OP_TK);
1342 if (ctxp->ccb_indent == 1)
1343 ctxp->first_ccb_indent1 = lineno;
1345 BUILD_OPERATOR (OCB_TK);
1349 if (ctxp->ccb_indent == 1)
1350 ctxp->last_ccb_indent1 = lineno;
1351 BUILD_OPERATOR (CCB_TK);
1354 BUILD_OPERATOR (OSB_TK);
1366 BUILD_OPERATOR (DOT_TK);
1367 /* return DOT_TK; */
1374 if ((c = java_get_unicode ()) == '=')
1376 BUILD_OPERATOR (EQ_TK);
1380 /* Equals is used in two different locations. In the
1381 variable_declarator: rule, it has to be seen as '=' as opposed
1382 to being seen as an ordinary assignment operator in
1383 assignment_operators: rule. */
1384 java_unget_unicode ();
1385 BUILD_OPERATOR (ASSIGN_TK);
1389 switch ((c = java_get_unicode ()))
1392 BUILD_OPERATOR (GTE_TK);
1394 switch ((c = java_get_unicode ()))
1397 if ((c = java_get_unicode ()) == '=')
1399 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1403 java_unget_unicode ();
1404 BUILD_OPERATOR (ZRS_TK);
1407 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1409 java_unget_unicode ();
1410 BUILD_OPERATOR (SRS_TK);
1413 java_unget_unicode ();
1414 BUILD_OPERATOR (GT_TK);
1418 switch ((c = java_get_unicode ()))
1421 BUILD_OPERATOR (LTE_TK);
1423 if ((c = java_get_unicode ()) == '=')
1425 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1429 java_unget_unicode ();
1430 BUILD_OPERATOR (LS_TK);
1433 java_unget_unicode ();
1434 BUILD_OPERATOR (LT_TK);
1438 switch ((c = java_get_unicode ()))
1441 BUILD_OPERATOR (BOOL_AND_TK);
1443 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1445 java_unget_unicode ();
1446 BUILD_OPERATOR (AND_TK);
1450 switch ((c = java_get_unicode ()))
1453 BUILD_OPERATOR (BOOL_OR_TK);
1455 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1457 java_unget_unicode ();
1458 BUILD_OPERATOR (OR_TK);
1462 switch ((c = java_get_unicode ()))
1465 BUILD_OPERATOR (INCR_TK);
1467 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1469 java_unget_unicode ();
1470 BUILD_OPERATOR (PLUS_TK);
1474 switch ((c = java_get_unicode ()))
1477 BUILD_OPERATOR (DECR_TK);
1479 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1481 java_unget_unicode ();
1482 BUILD_OPERATOR (MINUS_TK);
1486 if ((c = java_get_unicode ()) == '=')
1488 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1492 java_unget_unicode ();
1493 BUILD_OPERATOR (MULT_TK);
1497 if ((c = java_get_unicode ()) == '=')
1499 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1503 java_unget_unicode ();
1504 BUILD_OPERATOR (DIV_TK);
1508 if ((c = java_get_unicode ()) == '=')
1510 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1514 java_unget_unicode ();
1515 BUILD_OPERATOR (XOR_TK);
1519 if ((c = java_get_unicode ()) == '=')
1521 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1525 java_unget_unicode ();
1526 BUILD_OPERATOR (REM_TK);
1530 if ((c = java_get_unicode()) == '=')
1532 BUILD_OPERATOR (NEQ_TK);
1536 java_unget_unicode ();
1537 BUILD_OPERATOR (NEG_TK);
1542 BUILD_OPERATOR (REL_QM_TK);
1545 BUILD_OPERATOR (REL_CL_TK);
1547 BUILD_OPERATOR (NOT_TK);
1550 /* Keyword, boolean literal or null literal */
1551 for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1552 JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1554 java_unicode_2_utf8 (c);
1555 if (all_ascii && c >= 128)
1560 obstack_1grow (&temporary_obstack, '\0');
1561 string = obstack_finish (&temporary_obstack);
1562 java_unget_unicode ();
1564 /* If we have something all ascii, we consider a keyword, a boolean
1565 literal, a null literal or an all ASCII identifier. Otherwise,
1566 this is an identifier (possibly not respecting formation rule). */
1569 struct java_keyword *kw;
1570 if ((kw=java_keyword (string, ascii_index)))
1572 JAVA_LEX_KW (string);
1575 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1576 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1577 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1578 case PRIVATE_TK: case STRICT_TK:
1579 SET_MODIFIER_CTX (kw->token);
1582 SET_LVAL_NODE (float_type_node);
1585 SET_LVAL_NODE (double_type_node);
1588 SET_LVAL_NODE (boolean_type_node);
1591 SET_LVAL_NODE (byte_type_node);
1594 SET_LVAL_NODE (short_type_node);
1597 SET_LVAL_NODE (int_type_node);
1600 SET_LVAL_NODE (long_type_node);
1603 SET_LVAL_NODE (char_type_node);
1606 /* Keyword based literals */
1609 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1610 boolean_true_node : boolean_false_node));
1613 SET_LVAL_NODE (null_pointer_node);
1616 /* Some keyword we want to retain information on the location
1629 BUILD_OPERATOR (kw->token);
1637 /* We may have an ID here */
1638 if (JAVA_START_CHAR_P (first_unicode))
1640 JAVA_LEX_ID (string);
1641 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1645 /* Everything else is an invalid character in the input */
1647 char lex_error_buffer [128];
1648 sprintf (lex_error_buffer, "Invalid character `%s' in input",
1649 java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1650 java_lex_error (lex_error_buffer, 1);
1656 /* This is called by the parser to see if an error should be generated
1657 due to numeric overflow. This function only handles the particular
1658 case of the largest negative value, and is only called in the case
1659 where this value is not preceded by `-'. */
/* The checks below apply to a VALUE that is an INTEGER_CST with
   JAVA_RADIX10_FLAG set.  Problems are reported through java_lex_error
   with a FORWARD argument of 0. */
1661 error_if_numeric_overflow (value)
1664 if (TREE_CODE (value) == INTEGER_CST && JAVA_RADIX10_FLAG (value))
1666 unsigned HOST_WIDE_INT lo, hi;
/* Pull out the low and high words of the constant. */
1668 lo = TREE_INT_CST_LOW (value);
1669 hi = TREE_INT_CST_HIGH (value);
/* A constant typed as long_type_node is judged on its high word only. */
1670 if (TREE_TYPE (value) == long_type_node)
/* HB is HI shifted right by 31 and stored in an int with no mask, so
   every bit from position 31 upward contributes (the int check further
   down keeps bit 31 only).  The error fires when HB is nonzero and the
   low 31 bits of HI are all clear.
   NOTE(review): if HOST_WIDE_INT is wider than 32 bits this is not the
   same as testing bit 31 alone -- confirm the intended word size. */
1672 int hb = (hi >> 31);
1673 if (hb && !(hi & 0x7fffffff))
1674 java_lex_error ("Numeric overflow for `long' literal", 0);
/* Counterpart check on the low word: the error fires when bit 31 of LO
   is set and its low 31 bits are all clear. */
1678 int hb = (lo >> 31) & 0x1;
1679 if (hb && !(lo & 0x7fffffff))
1680 java_lex_error ("Numeric overflow for `int' literal", 0);
1684 #endif /* JC1_LITE */
/* Append an encoding of the character UNICODE to temporary_obstack,
   using one, two or three bytes depending on its value.  Nothing is
   returned; the only effect is the bytes grown onto the obstack.
   (Range bounds below assume RANGE is inclusive at both ends -- the
   macro is defined elsewhere, TODO confirm.) */
1687 java_unicode_2_utf8 (unicode)
/* 0x01-0x7f: emitted unchanged as a single byte. */
1690 if (RANGE (unicode, 0x01, 0x7f))
1691 obstack_1grow (&temporary_obstack, (char)unicode);
/* 0x80-0x7ff, and also the value 0: two bytes.  The first is 0xc0 plus
   bits 6-10 of the character, the second is 0x80 plus bits 0-5.  For 0
   this yields 0xc0 0x80, so no byte written by this function is ever
   zero. */
1692 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
1694 obstack_1grow (&temporary_obstack,
1695 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1696 obstack_1grow (&temporary_obstack,
1697 (unsigned char)(0x80 | (unicode & 0x3f)));
/* Everything else: three bytes carrying bits 12-15, bits 6-11 and
   bits 0-5.  In the first two, the shift binds tighter than the OR, so
   the masked bits are shifted before the marker bits are added. */
1699 else /* Range 0x800-0xffff */
1701 obstack_1grow (&temporary_obstack,
1702 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1703 obstack_1grow (&temporary_obstack,
1704 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1705 obstack_1grow (&temporary_obstack,
1706 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Wrap NODE with build_expr_wfl, passing the file name and the line and
   column stored in ctxp->elc, then clear the type of the result. */
1712 build_wfl_node (node)
/* NODE is overwritten with whatever build_expr_wfl returns. */
1715 node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1716 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1717 TREE_TYPE (node) = NULL_TREE;
/* Record the position of a lexical error in ctxp->elc and reset the
   error flag.  MSG is the message text and FORWARD is an offset added
   to the column.  Both parameters are marked ATTRIBUTE_UNUSED,
   presumably because some build configuration compiles the body out --
   verify against the surrounding preprocessor conditionals, which are
   not visible in this excerpt. */
1723 java_lex_error (msg, forward)
1724 const char *msg ATTRIBUTE_UNUSED;
1725 int forward ATTRIBUTE_UNUSED;
/* The line comes from the current line record; the column is that
   record's char_col minus one, shifted by FORWARD. */
1728 ctxp->elc.line = ctxp->c_line->lineno;
1729 ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1731 /* Might be caught in the middle of some error report */
1732 ctxp->java_error_flag = 0;
1749 if (next != '\n' && next != EOF)
/* Build, in temporary_obstack, the text of line LINE of FILENAME
   followed by a newline and a second line of padding that ends in a
   caret marker, and return the finished obstack object.  The file is
   opened and scanned on every call.  Failure to open it is reported
   through fatal_io_error.  The parameters are marked ATTRIBUTE_UNUSED,
   presumably for a build configuration that compiles the body out --
   the preprocessor conditionals are not visible in this excerpt. */
1761 java_get_line_col (filename, line, col)
1762 const char *filename ATTRIBUTE_UNUSED;
1763 int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
1768 /* Dumb implementation. Doesn't try to cache or optimize things. */
1769 /* First line of the file is line 1, first column is 1 */
1771 /* COL == -1 means, at the CR/LF in LINE */
1772 /* COL == -2 means, at the first non space char in LINE */
/* CLINE counts lines starting from 1.  CURRENT_LINE_COL and
   FIRST_NON_SPACE track positions inside the requested line. */
1775 int c, ccol, cline = 1;
1776 int current_line_col = 0;
1777 int first_non_space = 0;
1780 if (!(fp = fopen (filename, "r")))
1781 fatal_io_error ("can't open %s", filename);
/* Skip forward until the requested line is reached.  A placeholder
   message is grown onto the obstack when the file ends first (the test
   that detects this is not visible in this excerpt). */
1783 while (cline != line)
1788 static const char msg[] = "<<file too short - unexpected EOF>>";
1789 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1792 if (java_is_eol (fp, c))
1796 /* Gather the chars of the current line in a buffer */
/* Collection stops on a negative C or on an end of line. */
1800 if (c < 0 || java_is_eol (fp, c))
/* Remember the column of the first character that is not white space.
   A value of 0 doubles as "not found yet". */
1802 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1803 first_non_space = current_line_col;
1804 obstack_1grow (&temporary_obstack, c);
1809 obstack_1grow (&temporary_obstack, '\n');
/* Resolve the special COL values described at the top.  The conditions
   selecting between these assignments are not visible in this excerpt;
   presumably they test COL against -1 and -2. */
1813 col = current_line_col;
1814 first_non_space = 0;
1817 col = first_non_space;
1819 first_non_space = 0;
1821 /* Place the '^' at the right position */
1822 base = obstack_base (&temporary_obstack);
/* Emit COL+3 padding characters ahead of the marker. */
1823 for (ccol = 1; ccol <= col+3; ccol++)
1825 /* Compute \t when reaching first_non_space */
/* Note that this C is a new char that shadows the int C declared at the
   top of the function.  When FIRST_NON_SPACE is nonzero a tab found in
   the gathered text is repeated in the padding, otherwise a blank is
   used. */
1826 char c = (first_non_space ?
1827 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1828 obstack_1grow (&temporary_obstack, c);
/* obstack_grow0 appends the marker together with a terminating null. */
1830 obstack_grow0 (&temporary_obstack, "^", 1);
1833 return obstack_finish (&temporary_obstack);
/* Compare the LENGTH bytes starting at STR against the null-terminated
   string NAME.  Returns 0 when NAME is exhausted exactly as STR reaches
   its end, 1 when NAME is exhausted first, and otherwise the difference
   computed at the early return below. */
1839 utf8_cmp (str, length, name)
1840 const unsigned char *str;
/* LIMIT is one past the last byte of STR. */
1844 const unsigned char *limit = str + length;
1847 for (i = 0; name[i]; ++i)
/* UTF8_GET is defined elsewhere; presumably it decodes one character
   and advances STR toward LIMIT -- TODO confirm. */
1849 int ch = UTF8_GET (str, limit);
/* The guard on this return is not visible in this excerpt; presumably
   it is taken when CH differs from name[i]. */
1851 return ch - name[i];
/* NAME is exhausted: equal only if STR has been fully consumed too. */
1854 return str == limit ? 0 : 1;
1857 /* A sorted list of all C++ keywords. */
/* cxx_keyword_p probes this table at the midpoint between two bounds,
   so new entries should preserve the sorted order stated above. */
1859 static const char *const cxx_keywords[] =
1967 /* Return true if NAME is a C++ keyword. */
/* NAME is LENGTH bytes long.  The table cxx_keywords is probed at the
   midpoint between FIRST and LAST, and each probe is compared with
   utf8_cmp.  The statements that move FIRST and LAST are not visible in
   this excerpt; presumably they narrow the range like a binary search. */
1970 cxx_keyword_p (name, length)
/* LAST starts at the number of table entries. */
1974 int last = ARRAY_SIZE (cxx_keywords);
1976 int mid = (last + first) / 2;
/* Each iteration saves the previous probe in OLD and recomputes MID. */
1979 for (mid = (last + first) / 2;
1981 old = mid, mid = (last + first) / 2)
/* Only the first MIN_LENGTH bytes of NAME, the shorter of the two
   lengths, take part in the comparison.  The loop further down then
   steps over dollar signs that follow that prefix in NAME. */
1983 int kwl = strlen (cxx_keywords[mid]);
1984 int min_length = kwl > length ? length : kwl;
1985 int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
1990 /* We've found a match if all the remaining characters are
1992 for (i = min_length; i < length && name[i] == '$'; ++i)
2006 #endif /* JC1_LITE */