1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
3 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
5 This file is part of GNU CC.
7 GNU CC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU CC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU CC; see the file COPYING. If not, write to
19 the Free Software Foundation, 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA.
22 Java and all Java-based marks are trademarks or registered trademarks
23 of Sun Microsystems, Inc. in the United States and other countries.
24 The Free Software Foundation is independent of Sun Microsystems, Inc. */
26 /* It defines java_lex (yylex) that reads a Java ASCII source file
27 possibly containing Unicode escape sequences or UTF-8 encoded
28 characters and returns a token for everything found but comments,
29 white spaces and line terminators. When necessary, it also fills
30 the java_lval (yylval) union. It's implemented to be called by a
31 re-entrant parser generated by Bison.
33 The lexical analysis conforms to the Java grammar described in "The
34 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
35 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
39 #include "chartables.h"
/* Forward declarations for the lexer's helper routines, followed by
   the file-scope flags used to detect the byte order of iconv's UCS-2
   output.
   NOTE(review): java_store_unicode is declared twice in this list with
   an identical signature; the second declaration looks redundant.  */
41 /* Function declaration */
42 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
43 static void java_unicode_2_utf8 PARAMS ((unicode_t));
44 static void java_lex_error PARAMS ((const char *, int));
46 static int java_is_eol PARAMS ((FILE *, int));
47 static tree build_wfl_node PARAMS ((tree));
49 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
50 static int java_parse_escape_sequence PARAMS ((void));
51 static int java_start_char_p PARAMS ((unicode_t));
52 static int java_part_char_p PARAMS ((unicode_t));
53 static int java_parse_doc_section PARAMS ((int));
54 static void java_parse_end_comment PARAMS ((int));
55 static int java_get_unicode PARAMS ((void));
56 static int java_read_unicode PARAMS ((java_lexer *, int *));
57 static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
59 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
60 static int java_read_char PARAMS ((java_lexer *));
61 static void java_allocate_new_line PARAMS ((void));
62 static void java_unget_unicode PARAMS ((void));
63 static unicode_t java_sneak_unicode PARAMS ((void));
65 static int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
68 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
71 /* This is nonzero if we have initialized `need_byteswap'. */
72 static int byteswap_init = 0;
74 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
75 big-endian order -- not native endian order. We handle this by
76 doing a conversion once at startup and seeing what happens. This
77 flag holds the results of this determination. */
78 static int need_byteswap = 0;
/* Set up lexer-related state before scanning the input FINPUT, whose
   character encoding is named by ENCODING.  The visible code interns
   frequently used identifiers, adds `java.lang' to the import-on-demand
   list when java_lang_imported is zero, builds the shared WFL nodes,
   clears several fields of the parser context CTXP (initializer lists,
   modifier contexts, current class, package, line number, error flag),
   zeroes *current_jcf, and finally creates the lexer object with
   java_new_lexer.  */
82 java_init_lex (finput, encoding)
87 int java_lang_imported = 0;
90 java_lang_id = get_identifier ("java.lang");
91 if (!java_lang_cloneable)
92 java_lang_cloneable = get_identifier ("java.lang.Cloneable");
93 if (!java_io_serializable)
94 java_io_serializable = get_identifier ("java.io.Serializable");
96 inst_id = get_identifier ("inst$");
98 wpv_id = get_identifier ("write_parm_value$");
100 if (!java_lang_imported)
102 tree node = build_tree_list
103 (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
104 read_import_dir (TREE_PURPOSE (node));
105 TREE_CHAIN (node) = ctxp->import_demand_list;
106 ctxp->import_demand_list = node;
107 java_lang_imported = 1;
111 wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
113 label_id = get_identifier ("$L");
115 wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
116 if (!wfl_string_buffer)
118 build_expr_wfl (get_identifier ("java.lang.StringBuffer"), NULL, 0, 0);
120 wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
122 CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
123 CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
125 memset ((PTR) ctxp->modifier_ctx, 0, 11*sizeof (ctxp->modifier_ctx[0]));
126 memset ((PTR) current_jcf, 0, sizeof (JCF));
127 ctxp->current_parsed_class = NULL;
128 ctxp->package = NULL_TREE;
131 ctxp->filename = input_filename;
132 ctxp->lineno = lineno = 0;
135 ctxp->minus_seen = 0;
136 ctxp->java_error_flag = 0;
137 ctxp->lexer = java_new_lexer (finput, encoding);
/* Format character number I of LINE into a function-static buffer.
   A character that was written as a Unicode escape, or whose value is
   greater than 128, is printed as `\uXXXX' (four hex digits);
   otherwise the character itself is stored in the buffer.
   NOTE(review): the buffer is static, so a later call overwrites the
   text produced by an earlier one.  The return statement is not
   visible here; presumably the buffer is returned -- confirm.  */
141 java_sprint_unicode (line, i)
142 struct java_line *line;
145 static char buffer [10];
146 if (line->unicode_escape_p [i] || line->line [i] > 128)
147 sprintf (buffer, "\\u%04x", line->line [i]);
150 buffer [0] = line->line [i];
/* Peek at the character stored at the current read position of the
   current line, without advancing that position.  */
157 java_sneak_unicode ()
159 return (ctxp->c_line->line [ctxp->c_line->current]);
/* Push the most recently read character back: step the current line's
   read index back by one and reduce the column counter by
   JAVA_COLUMN_DELTA (0).  When the read index is already zero there is
   nothing to unget; the handling of that case is not visible here.  */
163 java_unget_unicode ()
165 if (!ctxp->c_line->current)
166 /* Can't unget unicode. */
169 ctxp->c_line->current--;
170 ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
/* Prepare ctxp->c_line to receive a new source line.  The look-ahead
   character of the previous current line (and its escape flag) is
   saved first.  If the previous current line held something other than
   white space it becomes ctxp->p_line, after the storage owned by the
   old p_line is freed, and c_line is cleared so that a fresh record is
   allocated with room for JAVA_LINE_MAX characters.  The line is then
   reset to empty, the saved look-ahead character is stored as its
   first character, and the global line number is incremented and
   recorded in the line.
   NOTE(review): the conditions guarding the individual steps are only
   partly visible in this excerpt.  */
174 java_allocate_new_line ()
176 unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
177 char ahead_escape_p = (ctxp->c_line ?
178 ctxp->c_line->unicode_escape_ahead_p : 0);
180 if (ctxp->c_line && !ctxp->c_line->white_space_only)
184 free (ctxp->p_line->unicode_escape_p);
185 free (ctxp->p_line->line);
188 ctxp->p_line = ctxp->c_line;
189 ctxp->c_line = NULL; /* Reallocated */
194 ctxp->c_line = (struct java_line *)xmalloc (sizeof (struct java_line));
195 ctxp->c_line->max = JAVA_LINE_MAX;
196 ctxp->c_line->line = (unicode_t *)xmalloc
197 (sizeof (unicode_t)*ctxp->c_line->max);
198 ctxp->c_line->unicode_escape_p =
199 (char *)xmalloc (sizeof (char)*ctxp->c_line->max);
200 ctxp->c_line->white_space_only = 0;
203 ctxp->c_line->line [0] = ctxp->c_line->size = 0;
204 ctxp->c_line->char_col = ctxp->c_line->current = 0;
207 ctxp->c_line->line [ctxp->c_line->size] = ahead;
208 ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
209 ctxp->c_line->size++;
211 ctxp->c_line->ahead [0] = 0;
212 ctxp->c_line->unicode_escape_ahead_p = 0;
213 ctxp->c_line->lineno = ++lineno;
214 ctxp->c_line->white_space_only = 1;
217 /* Create a new lexer object.  FINPUT is the stream to read and
   ENCODING names its character encoding.  When iconv is available
   (HAVE_ICONV) a converter from ENCODING to "UCS-2" is opened; a
   one-time probe that converts the UTF-8 encoding of \ufeff sets
   need_byteswap when the result differs from 0xfeff, and the lexer's
   byte_swap field is loaded from it.  When the converter cannot be
   used, the internal decoder is selected (use_fallback = 1) provided
   the comparison with DEFAULT_ENCODING allows it; otherwise a fatal
   "unknown encoding" error is raised.
   NOTE(review): the return statement is not visible in this excerpt;
   the prototype says a java_lexer * is returned.  */
220 java_new_lexer (finput, encoding)
222 const char *encoding;
224 java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
227 lex->finput = finput;
229 lex->unget_value = 0;
233 lex->handle = iconv_open ("UCS-2", encoding);
234 if (lex->handle != (iconv_t) -1)
240 lex->read_anything = 0;
241 lex->use_fallback = 0;
243 /* Work around broken iconv() implementations by doing checking at
244 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
245 then all UCS-2 encoders will be broken. Perhaps not a valid
253 handle = iconv_open ("UCS-2", "UTF-8");
254 if (handle != (iconv_t) -1)
261 /* This is the UTF-8 encoding of \ufeff. */
268 outp = (char *) &result;
271 r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
273 iconv_close (handle);
274 /* Conversion must be complete for us to use the result. */
275 if (r != (size_t) -1 && inc == 0 && outc == 0)
276 need_byteswap = (result != 0xfeff);
280 lex->byte_swap = need_byteswap;
283 #endif /* HAVE_ICONV */
285 /* If iconv failed, use the internal decoder if the default
286 encoding was requested. This code is used on platforms where
287 iconv exists but is insufficient for our needs. For
288 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2. */
289 if (strcmp (encoding, DEFAULT_ENCODING))
293 lex->use_fallback = 1;
294 #endif /* HAVE_ICONV */
298 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option.", encoding);
/* Release the resources held by lexer LEX.  The iconv conversion
   handle is closed unless the lexer was using the internal fallback
   decoder.  Any further cleanup is not visible in this excerpt.  */
304 java_destroy_lexer (lex)
308 if (! lex->use_fallback)
309 iconv_close (lex->handle);
/* Body of the routine that reads one character from lexer LEX
   (presumably java_read_char; the function header is not visible in
   this excerpt -- confirm).  A pending unget_value is consumed first.
   Unless use_fallback is set, bytes are read from lex->finput into
   lex->buffer and converted with iconv into lex->out_buffer, from
   which one unicode_t at a time is fetched; converted output is
   byte-swapped pairwise in the visible loop, and a partial trailing
   character is moved to the front of the input buffer for the next
   attempt.  In the fallback path the bytes are decoded by hand as 1-,
   2- or 3-byte UTF-8 sequences with range checks, and anything else is
   reported as "malformed UTF-8 character".  */
318 if (lex->unget_value)
320 unicode_t r = lex->unget_value;
321 lex->unget_value = 0;
326 if (! lex->use_fallback)
328 size_t ir, inbytesleft, in_save, out_count, out_save;
332 /* If there is data which has already been converted, use it. */
333 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
340 /* See if we need to read more data. If FIRST == 0 then
341 the previous conversion attempt ended in the middle of
342 a character at the end of the buffer. Otherwise we
343 only have to read if the buffer is empty. */
344 if (lex->first == 0 || lex->first >= lex->last)
348 if (lex->first >= lex->last)
353 if (feof (lex->finput))
355 r = fread (&lex->buffer[lex->last], 1,
356 sizeof (lex->buffer) - lex->last,
361 inbytesleft = lex->last - lex->first;
362 out_count = sizeof (lex->out_buffer) - lex->out_last;
364 if (inbytesleft == 0)
366 /* We've tried to read and there is nothing left. */
370 in_save = inbytesleft;
371 out_save = out_count;
372 inp = &lex->buffer[lex->first];
373 outp = &lex->out_buffer[lex->out_last];
374 ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
375 &inbytesleft, &outp, &out_count);
377 /* If we haven't read any bytes, then look to see if we
379 if (! lex->read_anything && out_save - out_count >= 2)
381 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
387 else if (uc == 0xfffe)
392 lex->read_anything = 1;
398 for (i = 0; i < out_save - out_count; i += 2)
400 char t = lex->out_buffer[lex->out_last + i];
401 lex->out_buffer[lex->out_last + i]
402 = lex->out_buffer[lex->out_last + i + 1];
403 lex->out_buffer[lex->out_last + i + 1] = t;
407 lex->first += in_save - inbytesleft;
408 lex->out_last += out_save - out_count;
410 /* If we converted anything at all, move along. */
411 if (out_count != out_save)
414 if (ir == (size_t) -1)
418 /* This is ok. This means that the end of our buffer
419 is in the middle of a character sequence. We just
420 move the valid part of the buffer to the beginning
422 memmove (&lex->buffer[0], &lex->buffer[lex->first],
423 lex->last - lex->first);
424 lex->last -= lex->first;
429 /* A more serious error. */
430 java_lex_error ("unrecognized character in input stream",
438 if (lex->out_first == -1 || lex->out_first >= lex->out_last)
440 /* Don't have any data. */
445 result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
450 #endif /* HAVE_ICONV */
453 c = getc (lex->finput);
458 return (unicode_t) c;
461 if ((c & 0xe0) == 0xc0)
463 c1 = getc (lex->finput);
464 if ((c1 & 0xc0) == 0x80)
466 unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
467 /* Check for valid 2-byte characters. We explicitly
468 allow \0 because this encoding is common in the
470 if (r == 0 || (r >= 0x80 && r <= 0x7ff))
474 else if ((c & 0xf0) == 0xe0)
476 c1 = getc (lex->finput);
477 if ((c1 & 0xc0) == 0x80)
479 c2 = getc (lex->finput);
480 if ((c2 & 0xc0) == 0x80)
482 unicode_t r = (unicode_t)(((c & 0xf) << 12) +
485 /* Check for valid 3-byte characters.
486 Don't allow surrogate, \ufffe or \uffff. */
487 if (r >= 0x800 && r <= 0xffff
488 && ! (r >= 0xd800 && r <= 0xdfff)
489 && r != 0xfffe && r != 0xffff)
495 /* We simply don't support invalid characters. We also
496 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
497 cannot be valid Java characters. */
498 java_lex_error ("malformed UTF-8 character", 0);
502 /* We only get here on error. */
/* Append character C to line L and record UNICODE_ESCAPE_P as its
   escape flag.  When the line is full (size == max) both parallel
   arrays are grown by JAVA_LINE_MAX entries with xrealloc before the
   character is stored.  */
507 java_store_unicode (l, c, unicode_escape_p)
510 int unicode_escape_p;
512 if (l->size == l->max)
514 l->max += JAVA_LINE_MAX;
515 l->line = (unicode_t *) xrealloc (l->line, sizeof (unicode_t)*l->max);
516 l->unicode_escape_p = (char *) xrealloc (l->unicode_escape_p,
517 sizeof (char)*l->max);
519 l->line [l->size] = c;
520 l->unicode_escape_p [l->size++] = unicode_escape_p;
/* Read one character from LEX, translating `\uXXXX' escapes.
   *UNICODE_ESCAPE_P is cleared on entry and set to 1 once an escape
   has been decoded.  The parity of lex->bs_count decides whether a
   backslash can start an escape: when it is odd the next character is
   read, any number of `u's is accepted, the first non-`u' character is
   pushed back through lex->unget_value, and four hex digits are then
   accumulated most significant first.  A non-hex digit is reported
   with java_lex_error.  When no escape is recognized the character
   read after the backslash is pushed back and a plain backslash is
   returned.
   NOTE(review): the tests that select these paths are only partly
   visible in this excerpt.  */
524 java_read_unicode (lex, unicode_escape_p)
526 int *unicode_escape_p;
530 c = java_read_char (lex);
531 *unicode_escape_p = 0;
540 if ((lex->bs_count) % 2 == 1)
542 /* Odd number of \ seen. */
543 c = java_read_char (lex);
546 unicode_t unicode = 0;
549 /* Recognize any number of `u's in \u. */
550 while ((c = java_read_char (lex)) == 'u')
553 /* Unget the most recent character as it is not a `u'. */
556 lex->unget_value = c;
558 /* Next should be 4 hex digits, otherwise it's an error.
559 The hex value is converted into the unicode, pushed into
560 the Unicode stream. */
561 for (shift = 12; shift >= 0; shift -= 4)
563 if ((c = java_read_char (lex)) == UEOF)
565 if (c >= '0' && c <= '9')
566 unicode |= (unicode_t)((c-'0') << shift);
567 else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
568 unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift);
570 java_lex_error ("Non hex digit in Unicode escape sequence", 0);
573 *unicode_escape_p = 1;
576 lex->unget_value = c;
578 return (unicode_t) '\\';
/* Like java_read_unicode, but collapse a two-character line terminator
   into one: after the first character is read, one more character is
   read ahead to detect `\r\n', and a character that does not belong to
   the terminator is pushed back through lex->unget_value.  Per the
   comment below, a newline is returned in either case.  */
582 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
584 int *unicode_escape_p;
586 int c = java_read_unicode (lex, unicode_escape_p);
590 /* We have to read ahead to see if we got \r\n. In that case we
591 return a single line terminator. */
593 c = java_read_unicode (lex, &dummy);
595 lex->unget_value = c;
596 /* In either case we must return a newline. */
/* Body of the routine that hands out the next character of the current
   line (presumably java_get_unicode; the function header is not
   visible in this excerpt -- confirm).  When there is no current line
   or it has been fully consumed, a new line is allocated and filled by
   reading characters with java_read_unicode_collapsing_terminators
   until a newline or UEOF is stored; white_space_only is cleared as
   soon as a non-white-space character is seen, and hit_eof is set when
   UEOF arrives before any character was found.  The column counter is
   then advanced and the character at the current index is returned,
   with the index post-incremented.  */
606 /* It's time to read a line when... */
607 if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
612 if (ctxp->lexer->hit_eof)
615 java_allocate_new_line ();
616 if (ctxp->c_line->line[0] != '\n')
620 int unicode_escape_p;
621 c = java_read_unicode_collapsing_terminators (ctxp->lexer,
626 java_store_unicode (ctxp->c_line, c, unicode_escape_p);
627 if (ctxp->c_line->white_space_only
628 && !JAVA_WHITE_SPACE_P (c)
630 ctxp->c_line->white_space_only = 0;
632 if ((c == '\n') || (c == UEOF))
636 if (c == UEOF && ! found_chars)
638 ctxp->lexer->hit_eof = 1;
643 ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
644 JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
645 return ctxp->c_line->line [ctxp->c_line->current++];
648 /* Parse the end of a C style comment.
   C is the first character following the '/' and '*'.  Characters are
   consumed with java_get_unicode until the comment is closed; reaching
   the end of input first is reported with "Comment not terminated at
   end of input".  When a `*' is followed by another `*', that second
   star is pushed back so that it is examined again.  */
651 java_parse_end_comment (c)
654 for ( ;; c = java_get_unicode ())
659 java_lex_error ("Comment not terminated at end of input", 0);
662 switch (c = java_get_unicode ())
665 java_lex_error ("Comment not terminated at end of input", 0);
669 case '*': /* reparse only '*' */
670 java_unget_unicode ();
676 /* Parse the documentation section.  Keywords must be at the beginning
   of a documentation comment line (ignoring white space and any `*'
   character).  Parsed keyword(s): @DEPRECATED.
   C is the first character of the section.  Leading white space, `*'
   and newline characters are skipped.  A `/' seen right after a `*'
   closes the comment and makes the function return 1 (the caller then
   jumps back to its `step1' label).  When a tag introduced by `@' is
   allowed at this point, up to ten characters are collected and
   compared with "deprecated"; a match sets ctxp->deprecated.  The last
   character read is pushed back before the visible code ends.
   NOTE(review): the collecting loop lets tag_index reach 10 before the
   terminating '\0' is stored at tag[tag_index].  The declaration of
   `tag' is not visible here -- confirm it has room for 11 chars.  */
681 java_parse_doc_section (c)
684 int valid_tag = 0, seen_star = 0;
686 while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
698 c = java_get_unicode();
702 java_lex_error ("Comment not terminated at end of input", 0);
704 if (seen_star && (c == '/'))
705 return 1; /* Goto step1 in caller */
707 /* We're parsing @deprecated */
708 if (valid_tag && (c == '@'))
713 while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
715 c = java_get_unicode ();
716 tag [tag_index++] = c;
720 java_lex_error ("Comment not terminated at end of input", 0);
721 tag [tag_index] = '\0';
723 if (!strcmp (tag, "deprecated"))
724 ctxp->deprecated = 1;
726 java_unget_unicode ();
730 /* Return true if C is a valid start character for a Java identifier.
   This is only called if C >= 128 -- smaller values are handled
   inline.  However, this function handles all values anyway.
   The high byte of C selects an entry of type_table.  When that entry,
   viewed as an integer, has bits set outside LETTER_PART |
   LETTER_START, it is used as a per-page table indexed by the low byte
   of C.  The other case is not visible in this excerpt; presumably the
   entry's value itself supplies the flags -- confirm.  The result is
   the LETTER_START bit of the flags.  */
734 java_start_char_p (c)
737 unsigned int hi = c / 256;
738 char *page = type_table[hi];
739 unsigned long val = (unsigned long) page;
742 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
743 flags = page[c & 255];
747 return flags & LETTER_START;
750 /* Return true if C is a valid part character for a Java identifier.
   This is only called if C >= 128 -- smaller values are handled
   inline.  However, this function handles all values anyway.
   The lookup is the same as for start characters: the high byte of C
   selects an entry of type_table, which is used as a per-page table
   indexed by the low byte when it has bits set outside LETTER_PART |
   LETTER_START.  The result is the LETTER_PART bit of the flags.
   NOTE(review): the function header is not visible in this excerpt;
   the name java_part_char_p is taken from the prototype list.  */
757 unsigned int hi = c / 256;
758 char *page = type_table[hi];
759 unsigned long val = (unsigned long) page;
762 if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
763 flags = page[c & 255];
767 return flags & LETTER_PART;
/* Parse the character(s) following a backslash in a character or
   string literal and return the value they denote.  The simple escapes
   return the fixed code points 0x8, 0x9, 0xa, 0xc, 0xd, 0x22, 0x27 and
   0x5c.  An octal escape collects octal digits up to a limit `max'
   that is adjusted when the first digit is greater than '3', pushes
   back the character that ended the digits, and combines the digits
   three bits at a time.  Any other character is reported with "Invalid
   character in escape sequence" and JAVA_CHAR_ERROR is returned.  */
771 java_parse_escape_sequence ()
776 switch (c = java_get_unicode ())
779 return (unicode_t)0x8;
781 return (unicode_t)0x9;
783 return (unicode_t)0xa;
785 return (unicode_t)0xc;
787 return (unicode_t)0xd;
789 return (unicode_t)0x22;
791 return (unicode_t)0x27;
793 return (unicode_t)0x5c;
794 case '0': case '1': case '2': case '3': case '4':
795 case '5': case '6': case '7':
798 int octal_escape_index = 0;
802 for (; octal_escape_index < max && RANGE (c, '0', '7');
803 c = java_get_unicode ())
805 if (octal_escape_index == 0 && c > '3')
807 /* According to the grammar, `\477' has a well-defined
808 meaning -- it is `\47' followed by `7'. */
811 octal_escape [octal_escape_index++] = c;
814 java_unget_unicode ();
816 for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
817 i < octal_escape_index; i++, shift -= 3)
818 char_lit |= (octal_escape [i] - '0') << shift;
823 java_lex_error ("Invalid character in escape sequence", 0);
824 return JAVA_CHAR_ERROR;
/* Floating-point literal conversion, run through do_float_handler by
   the lexer.  AV points to a struct jpa_args carrying the literal
   text, the float/double flag (fflag), the semantic value to fill
   (java_lval) and the index in the current line where the number
   began.  The text is converted in the machine mode of the selected
   type.  An infinite or NaN result raises a range error.  A zero
   result is double-checked by scanning the mantissa (up to any `e' or
   `E') for a character other than '0' and '.', in order to report
   "Floating point literal underflow" at the start of the number.
   Finally the literal node is stored with SET_LVAL_NODE_TYPE.  */
828 /* Isolate the code which may raise an arithmetic exception in its
837 int number_beginning;
840 #ifdef REAL_ARITHMETIC
841 #define IS_ZERO(X) (ereal_cmp (X, dconst0) == 0)
843 #define IS_ZERO(X) ((X) == 0)
846 static void java_perform_atof PARAMS ((PTR));
849 java_perform_atof (av)
852 struct jpa_args *a = (struct jpa_args *)av;
853 YYSTYPE *java_lval = a->java_lval;
854 int number_beginning = a->number_beginning;
855 REAL_VALUE_TYPE value;
856 tree type = (a->fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
858 SET_REAL_VALUE_ATOF (value,
859 REAL_VALUE_ATOF (a->literal_token, TYPE_MODE (type)));
861 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
863 JAVA_FLOAT_RANGE_ERROR ((a->fflag ? "float" : "double"));
866 else if (IS_ZERO (value))
868 /* We check to see if the value is really 0 or if we've found an
869 underflow. We do this in the most primitive imaginable way. */
871 char *p = a->literal_token;
874 while (*p && *p != 'e' && *p != 'E')
876 if (*p != '0' && *p != '.')
885 int i = ctxp->c_line->current;
886 ctxp->c_line->current = number_beginning;
887 java_lex_error ("Floating point literal underflow", 0);
888 ctxp->c_line->current = i;
892 SET_LVAL_NODE_TYPE (build_real (type, value), type);
896 static int yylex PARAMS ((YYSTYPE *));
907 unicode_t first_unicode;
908 int ascii_index, all_ascii;
911 /* Translation of the Unicode escape in the raw stream of Unicode
912 characters. Takes care of line terminator. */
914 /* Skip white spaces: SP, TAB and FF or ULT */
915 for (c = java_get_unicode ();
916 c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
919 ctxp->elc.line = ctxp->c_line->lineno;
920 ctxp->elc.col = ctxp->c_line->char_col-2;
923 ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
925 if (c == 0x1a) /* CTRL-Z */
927 if ((c = java_get_unicode ()) == UEOF)
928 return 0; /* Ok here */
930 java_unget_unicode (); /* Caught later, at the end of the function */
932 /* Handle EOF here */
933 if (c == UEOF) /* Should probably do something here... */
936 /* Take care of eventual comments. */
939 switch (c = java_get_unicode ())
944 c = java_get_unicode ();
947 /* It is ok to end a `//' comment with EOF, unless
948 we're being pedantic. */
950 java_lex_error ("Comment not terminated at end of input",
954 if (c == '\n') /* ULT */
960 if ((c = java_get_unicode ()) == '*')
962 if ((c = java_get_unicode ()) == '/')
963 goto step1; /* Empy documentation comment */
964 else if (java_parse_doc_section (c))
968 java_parse_end_comment ((c = java_get_unicode ()));
972 java_unget_unicode ();
978 ctxp->elc.line = ctxp->c_line->lineno;
979 ctxp->elc.prev_col = ctxp->elc.col;
980 ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
981 if (ctxp->elc.col < 0)
984 /* Numeric literals */
985 if (JAVA_ASCII_DIGIT (c) || (c == '.'))
987 /* This section of code is borrowed from gcc/c-lex.c */
988 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
989 int parts[TOTAL_PARTS];
990 HOST_WIDE_INT high, low;
991 /* End borrowed section */
992 char literal_token [256];
993 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
996 int number_beginning = ctxp->c_line->current;
999 /* We might have a . separator instead of a FP like .[0-9]* */
1002 unicode_t peep = java_sneak_unicode ();
1004 if (!JAVA_ASCII_DIGIT (peep))
1007 BUILD_OPERATOR (DOT_TK);
1011 for (i = 0; i < TOTAL_PARTS; i++)
1016 c = java_get_unicode ();
1017 if (c == 'x' || c == 'X')
1020 c = java_get_unicode ();
1022 else if (JAVA_ASCII_DIGIT (c))
1026 /* Push the '.' back and prepare for a FP parsing... */
1027 java_unget_unicode ();
1032 /* We have a zero literal: 0, 0{f,F}, 0{d,D} */
1033 JAVA_LEX_LIT ("0", 10);
1037 SET_LVAL_NODE (long_zero_node);
1038 return (INT_LIT_TK);
1040 SET_LVAL_NODE (float_zero_node);
1043 SET_LVAL_NODE (double_zero_node);
1046 java_unget_unicode ();
1047 SET_LVAL_NODE (integer_zero_node);
1048 return (INT_LIT_TK);
1052 /* Parse the first part of the literal, until we find something
1053 which is not a number. */
1054 while ((radix == 10 && JAVA_ASCII_DIGIT (c)) ||
1055 (radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1056 (radix == 8 && JAVA_ASCII_OCTDIGIT (c)))
1058 /* We store in a string (in case it turns out to be a FP) and in
1059 PARTS if we have to process a integer literal. */
1060 int numeric = (RANGE (c, '0', '9') ? c-'0' : 10 +(c|0x20)-'a');
1063 literal_token [literal_index++] = c;
1064 /* This section of code if borrowed from gcc/c-lex.c */
1065 for (count = 0; count < TOTAL_PARTS; count++)
1067 parts[count] *= radix;
1070 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR);
1071 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1074 parts[0] += numeric;
1076 if (parts [TOTAL_PARTS-1] != 0)
1078 /* End borrowed section. */
1079 c = java_get_unicode ();
1082 /* If we have something from the FP char set but not a digit, parse
1084 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1087 int seen_digit = (literal_index ? 1 : 0);
1088 int seen_exponent = 0;
1089 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1090 double unless specified. */
1092 /* It is ok if the radix is 8 because this just means we've
1093 seen a leading `0'. However, radix==16 is invalid. */
1095 java_lex_error ("Can't express non-decimal FP literal", 0);
1105 literal_token [literal_index++ ] = c;
1106 c = java_get_unicode ();
1109 java_lex_error ("Invalid character in FP literal", 0);
1112 if (c == 'e' || c == 'E')
1116 /* {E,e} must have seen at list a digit */
1118 java_lex_error ("Invalid FP literal", 0);
1122 literal_token [literal_index++] = c;
1123 c = java_get_unicode ();
1126 java_lex_error ("Invalid character in FP literal", 0);
1128 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1130 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1131 stage = 4; /* So we fall through */
1134 if ((c=='-' || c =='+') && stage == 2)
1137 literal_token [literal_index++] = c;
1138 c = java_get_unicode ();
1141 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1142 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1143 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1144 (stage == 3 && JAVA_ASCII_DIGIT (c)))
1146 if (JAVA_ASCII_DIGIT (c))
1148 literal_token [literal_index++ ] = c;
1149 c = java_get_unicode ();
1156 if (stage != 4) /* Don't push back fF/dD */
1157 java_unget_unicode ();
1159 /* An exponent (if any) must have seen a digit. */
1160 if (seen_exponent && !seen_digit)
1161 java_lex_error ("Invalid FP literal", 0);
1163 literal_token [literal_index] = '\0';
1164 JAVA_LEX_LIT (literal_token, radix);
1167 a.literal_token = literal_token;
1169 a.java_lval = java_lval;
1170 a.number_beginning = number_beginning;
1171 if (do_float_handler (java_perform_atof, (PTR) &a))
1174 JAVA_FLOAT_RANGE_ERROR ((fflag ? "float" : "double"));
1180 } /* JAVA_ASCCI_FPCHAR (c) */
1182 /* Here we get back to converting the integral literal. */
1183 if (c == 'L' || c == 'l')
1185 else if (radix == 16 && JAVA_ASCII_LETTER (c))
1186 java_lex_error ("Digit out of range in hexadecimal literal", 0);
1187 else if (radix == 8 && JAVA_ASCII_DIGIT (c))
1188 java_lex_error ("Digit out of range in octal literal", 0);
1189 else if (radix == 16 && !literal_index)
1190 java_lex_error ("No digit specified for hexadecimal literal", 0);
1192 java_unget_unicode ();
1194 #ifdef JAVA_LEX_DEBUG
1195 literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */
1196 JAVA_LEX_LIT (literal_token, radix);
1198 /* This section of code is borrowed from gcc/c-lex.c */
1201 bytes = GET_TYPE_PRECISION (long_type_node);
1202 for (i = bytes; i < TOTAL_PARTS; i++)
1210 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1212 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1213 / HOST_BITS_PER_CHAR)]
1214 << (i * HOST_BITS_PER_CHAR));
1215 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1217 /* End borrowed section. */
1219 /* Range checking */
1222 /* 9223372036854775808L is valid if operand of a '-'. Otherwise
1223 9223372036854775807L is the biggest `long' literal that can be
1224 expressed using a 10 radix. For other radixes, everything that
1225 fits withing 64 bits is OK. */
1226 int hb = (high >> 31);
1227 if (overflow || (hb && low && radix == 10) ||
1228 (hb && high & 0x7fffffff && radix == 10) ||
1229 (hb && !(high & 0x7fffffff) && !ctxp->minus_seen && radix == 10))
1230 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1234 /* 2147483648 is valid if operand of a '-'. Otherwise,
1235 2147483647 is the biggest `int' literal that can be
1236 expressed using a 10 radix. For other radixes, everything
1237 that fits within 32 bits is OK. As all literals are
1238 signed, we sign extend here. */
1239 int hb = (low >> 31) & 0x1;
1240 if (overflow || high || (hb && low & 0x7fffffff && radix == 10) ||
1241 (hb && !(low & 0x7fffffff) && !ctxp->minus_seen && radix == 10))
1242 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1245 ctxp->minus_seen = 0;
1246 SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1247 (long_suffix ? long_type_node : int_type_node));
1251 ctxp->minus_seen = 0;
1253 /* Character literals */
1257 if ((c = java_get_unicode ()) == '\\')
1258 char_lit = java_parse_escape_sequence ();
1261 if (c == '\n' || c == '\'')
1262 java_lex_error ("Invalid character literal", 0);
1266 c = java_get_unicode ();
1268 if ((c == '\n') || (c == UEOF))
1269 java_lex_error ("Character literal not terminated at end of line", 0);
1271 java_lex_error ("Syntax error in character literal", 0);
1273 if (char_lit == JAVA_CHAR_ERROR)
1274 char_lit = 0; /* We silently convert it to zero */
1276 JAVA_LEX_CHAR_LIT (char_lit);
1277 SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1281 /* String literals */
1287 for (no_error = 1, c = java_get_unicode ();
1288 c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1291 c = java_parse_escape_sequence ();
1292 if (c == JAVA_CHAR_ERROR)
1295 c = 0; /* We silently convert it to zero. */
1297 java_unicode_2_utf8 (c);
1299 if (c == '\n' || c == UEOF) /* ULT */
1301 lineno--; /* Refer to the line the terminator was seen */
1302 java_lex_error ("String not terminated at end of line.", 0);
1306 obstack_1grow (&temporary_obstack, '\0');
1307 string = obstack_finish (&temporary_obstack);
1309 if (!no_error || (c != '"'))
1310 java_lval->node = error_mark_node; /* Requires futher testing FIXME */
1312 java_lval->node = build_string (strlen (string), string);
1314 obstack_free (&temporary_obstack, string);
1315 return STRING_LIT_TK;
1323 BUILD_OPERATOR (OP_TK);
1329 if (ctxp->ccb_indent == 1)
1330 ctxp->first_ccb_indent1 = lineno;
1332 BUILD_OPERATOR (OCB_TK);
1336 if (ctxp->ccb_indent == 1)
1337 ctxp->last_ccb_indent1 = lineno;
1338 BUILD_OPERATOR (CCB_TK);
1341 BUILD_OPERATOR (OSB_TK);
1353 BUILD_OPERATOR (DOT_TK);
1354 /* return DOT_TK; */
1361 if ((c = java_get_unicode ()) == '=')
1363 BUILD_OPERATOR (EQ_TK);
1367 /* Equals is used in two different locations. In the
1368 variable_declarator: rule, it has to be seen as '=' as opposed
1369 to being seen as an ordinary assignment operator in
1370 assignment_operators: rule. */
1371 java_unget_unicode ();
1372 BUILD_OPERATOR (ASSIGN_TK);
1376 switch ((c = java_get_unicode ()))
1379 BUILD_OPERATOR (GTE_TK);
1381 switch ((c = java_get_unicode ()))
1384 if ((c = java_get_unicode ()) == '=')
1386 BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1390 java_unget_unicode ();
1391 BUILD_OPERATOR (ZRS_TK);
1394 BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1396 java_unget_unicode ();
1397 BUILD_OPERATOR (SRS_TK);
1400 java_unget_unicode ();
1401 BUILD_OPERATOR (GT_TK);
1405 switch ((c = java_get_unicode ()))
1408 BUILD_OPERATOR (LTE_TK);
1410 if ((c = java_get_unicode ()) == '=')
1412 BUILD_OPERATOR2 (LS_ASSIGN_TK);
1416 java_unget_unicode ();
1417 BUILD_OPERATOR (LS_TK);
1420 java_unget_unicode ();
1421 BUILD_OPERATOR (LT_TK);
1425 switch ((c = java_get_unicode ()))
1428 BUILD_OPERATOR (BOOL_AND_TK);
1430 BUILD_OPERATOR2 (AND_ASSIGN_TK);
1432 java_unget_unicode ();
1433 BUILD_OPERATOR (AND_TK);
1437 switch ((c = java_get_unicode ()))
1440 BUILD_OPERATOR (BOOL_OR_TK);
1442 BUILD_OPERATOR2 (OR_ASSIGN_TK);
1444 java_unget_unicode ();
1445 BUILD_OPERATOR (OR_TK);
1449 switch ((c = java_get_unicode ()))
1452 BUILD_OPERATOR (INCR_TK);
1454 BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1456 java_unget_unicode ();
1457 BUILD_OPERATOR (PLUS_TK);
1461 switch ((c = java_get_unicode ()))
1464 BUILD_OPERATOR (DECR_TK);
1466 BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1468 java_unget_unicode ();
1469 ctxp->minus_seen = 1;
1470 BUILD_OPERATOR (MINUS_TK);
1474 if ((c = java_get_unicode ()) == '=')
1476 BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1480 java_unget_unicode ();
1481 BUILD_OPERATOR (MULT_TK);
1485 if ((c = java_get_unicode ()) == '=')
1487 BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1491 java_unget_unicode ();
1492 BUILD_OPERATOR (DIV_TK);
1496 if ((c = java_get_unicode ()) == '=')
1498 BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1502 java_unget_unicode ();
1503 BUILD_OPERATOR (XOR_TK);
1507 if ((c = java_get_unicode ()) == '=')
1509 BUILD_OPERATOR2 (REM_ASSIGN_TK);
1513 java_unget_unicode ();
1514 BUILD_OPERATOR (REM_TK);
1518 if ((c = java_get_unicode()) == '=')
1520 BUILD_OPERATOR (NEQ_TK);
1524 java_unget_unicode ();
1525 BUILD_OPERATOR (NEG_TK);
1530 BUILD_OPERATOR (REL_QM_TK);
1533 BUILD_OPERATOR (REL_CL_TK);
1535 BUILD_OPERATOR (NOT_TK);
1538 /* Keyword, boolean literal or null literal */
1539 for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1540 JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1542 java_unicode_2_utf8 (c);
1543 if (all_ascii && c >= 128)
1548 obstack_1grow (&temporary_obstack, '\0');
1549 string = obstack_finish (&temporary_obstack);
1550 java_unget_unicode ();
1552 /* If we have something all ascii, we consider a keyword, a boolean
1553 literal, a null literal or an all ASCII identifier. Otherwise,
1554 this is an identifier (possibly not respecting formation rule). */
1557 struct java_keyword *kw;
1558 if ((kw=java_keyword (string, ascii_index)))
1560 JAVA_LEX_KW (string);
1563 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK:
1564 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK:
1565 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1566 case PRIVATE_TK: case STRICT_TK:
1567 SET_MODIFIER_CTX (kw->token);
1570 SET_LVAL_NODE (float_type_node);
1573 SET_LVAL_NODE (double_type_node);
1576 SET_LVAL_NODE (boolean_type_node);
1579 SET_LVAL_NODE (byte_type_node);
1582 SET_LVAL_NODE (short_type_node);
1585 SET_LVAL_NODE (int_type_node);
1588 SET_LVAL_NODE (long_type_node);
1591 SET_LVAL_NODE (char_type_node);
1594 /* Keyword based literals */
1597 SET_LVAL_NODE ((kw->token == TRUE_TK ?
1598 boolean_true_node : boolean_false_node));
1601 SET_LVAL_NODE (null_pointer_node);
1604 /* For some keywords we want to retain information on the location where they were found.  */
1617 BUILD_OPERATOR (kw->token);
1625 /* We may have an ID here */
1626 if (JAVA_START_CHAR_P (first_unicode))
1628 JAVA_LEX_ID (string);
1629 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1633 /* Everything else is an invalid character in the input */
1635 char lex_error_buffer [128];
1636 sprintf (lex_error_buffer, "Invalid character `%s' in input",
1637 java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1638 java_lex_error (lex_error_buffer, 1);
/* Append the UTF-8 encoding of UNICODE to temporary_obstack, one byte
   at a time, using obstack_1grow.

   - A value in RANGE 0x01..0x7f is emitted as a single byte.
   - A value in RANGE 0x80..0x7ff, and also the value 0, is emitted as
     two bytes (110xxxxx 10xxxxxx).  Zero therefore becomes 0xc0 0x80,
     so the encoded text never contains a literal NUL byte.
   - Every other value is emitted as three bytes
     (1110xxxx 10xxxxxx 10xxxxxx).

   NOTE(review): RANGE is assumed to be inclusive at both ends, which
   is what the "Range 0x800-0xffff" remark on the last branch suggests
   -- confirm against its definition.  */
1644 java_unicode_2_utf8 (unicode)
1647 if (RANGE (unicode, 0x01, 0x7f))
1648 obstack_1grow (&temporary_obstack, (char)unicode);
1649 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
/* Lead byte: bits 6..10 of the value; continuation byte: bits 0..5.  */
1651 obstack_1grow (&temporary_obstack,
1652 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1653 obstack_1grow (&temporary_obstack,
1654 (unsigned char)(0x80 | (unicode & 0x3f)));
1656 else /* Range 0x800-0xffff */
/* In the expressions below `>>' binds tighter than `|', so the masked
   bits are shifted first and the marker bits are OR-ed in afterwards.
   The three bytes carry bits 12..15, bits 6..11 and bits 0..5.  */
1658 obstack_1grow (&temporary_obstack,
1659 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1660 obstack_1grow (&temporary_obstack,
1661 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1662 obstack_1grow (&temporary_obstack,
1663 (unsigned char)(0x80 | (unicode & 0x003f)));
/* Wrap NODE with build_expr_wfl, passing the current file name
   (ctxp->filename) and the line/column last recorded in ctxp->elc, and
   then clear the type of the resulting node.
   NOTE(review): "wfl" presumably stands for "with file location" --
   confirm against build_expr_wfl.  */
1669 build_wfl_node (node)
1672 node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1673 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1674 TREE_TYPE (node) = NULL_TREE;
/* Prepare the parser context for reporting a lexical error.

   MSG is the error text.  FORWARD is an offset added to the column of
   the reported position.

   The visible statements record the error location in ctxp->elc (the
   line number of the current line, and its char_col minus one plus
   FORWARD) and reset ctxp->java_error_flag.

   NOTE(review): both parameters are marked ATTRIBUTE_UNUSED although
   FORWARD is used below; presumably the body is compiled
   conditionally.  Neither that conditional nor the code that actually
   reports MSG is visible in this excerpt -- confirm.  */
1680 java_lex_error (msg, forward)
1681 const char *msg ATTRIBUTE_UNUSED;
1682 int forward ATTRIBUTE_UNUSED;
1685 ctxp->elc.line = ctxp->c_line->lineno;
1686 ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1688 /* Might be caught in the middle of some error report */
1689 ctxp->java_error_flag = 0;
1706 if (next != '\n' && next != EOF)
/* Build, on temporary_obstack, a string that shows one source line
   followed by a marker line pointing at a column, and return the
   finished obstack object.

   FILENAME is opened for reading; failure to open it is reported
   through fatal_io_error.  The file is scanned until the line counter
   reaches LINE.  The characters of that line are then appended to the
   obstack, followed by a newline, a run of blanks (or tabs) and a
   final '^'.  If the file ends early, the text
   "<<file too short - unexpected EOF>>" is appended in place of the
   line contents.

   COL selects where the '^' goes; the special values -1 and -2 are
   described in the comments below.

   NOTE(review): the three parameters are marked ATTRIBUTE_UNUSED even
   though they are used here; presumably part of the body is compiled
   conditionally, which is not visible in this excerpt -- confirm.  */
1718 java_get_line_col (filename, line, col)
1719 const char *filename ATTRIBUTE_UNUSED;
1720 int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
1725 /* Dumb implementation. Doesn't try to cache or optimize things. */
1726 /* First line of the file is line 1, first column is 1 */
1728 /* COL == -1 means, at the CR/LF in LINE */
1729 /* COL == -2 means, at the first non space char in LINE */
1732 int c, ccol, cline = 1;
1733 int current_line_col = 0;
1734 int first_non_space = 0;
1737 if (!(fp = fopen (filename, "r")))
1738 fatal_io_error ("can't open %s", filename);
/* Skip forward to the requested line; line ends are recognized by
   java_is_eol.  */
1740 while (cline != line)
1745 static char msg[] = "<<file too short - unexpected EOF>>";
1746 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1749 if (java_is_eol (fp, c))
1753 /* Gather the chars of the current line in a buffer */
1757 if (c < 0 || java_is_eol (fp, c))
/* Remember the column of the first non-blank character.
   NOTE(review): 0 doubles as "not found yet", so a non-blank character
   seen while current_line_col is still 0 does not latch and a later
   one can overwrite it -- confirm this is intended.  */
1759 if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1760 first_non_space = current_line_col;
1761 obstack_1grow (&temporary_obstack, c);
1766 obstack_1grow (&temporary_obstack, '\n');
/* Resolve the special COL values.  first_non_space is reset to 0 in
   the branches that should pad the marker line with plain blanks
   only.  */
1770 col = current_line_col;
1771 first_non_space = 0;
1774 col = first_non_space;
1776 first_non_space = 0;
1778 /* Place the '^' at the right position */
/* NOTE(review): base is read once, before the marker line is grown.
   Growing an obstack object may move it, so base can become stale
   while the loop below is still indexing it -- confirm.
   NOTE(review): the +3 in the loop bound presumably accounts for a
   prefix that the caller prints in front of the source line --
   confirm.  */
1779 base = obstack_base (&temporary_obstack);
1780 for (ccol = 1; ccol <= col+3; ccol++)
1782 /* Compute \t when reaching first_non_space */
/* When first_non_space is non-zero, a tab in the source line is
   mirrored by a tab in the marker line so that the '^' lines up.
   This inner `c' shadows the `int c' declared at the top of the
   function.  */
1783 char c = (first_non_space ?
1784 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1785 obstack_1grow (&temporary_obstack, c);
/* obstack_grow0 appends the '^' together with a terminating NUL.  */
1787 obstack_grow0 (&temporary_obstack, "^", 1);
1790 return obstack_finish (&temporary_obstack);
/* Compare the UTF-8 encoded text STR, which is LENGTH bytes long, with
   the NUL-terminated string NAME.

   NAME is walked one char at a time, and for each of its characters
   one character is decoded from STR with UTF8_GET, which is given
   LIMIT (one past the last byte of STR) as its bound.  The function
   can return the difference between the decoded character and the
   current NAME character (presumably on the first mismatch; the test
   itself is not visible in this excerpt).  Once NAME is exhausted the
   result is 0 if STR has been consumed exactly up to LIMIT, and 1
   otherwise.  */
1796 utf8_cmp (str, length, name)
1797 const unsigned char *str;
1801 const unsigned char *limit = str + length;
1804 for (i = 0; name[i]; ++i)
1806 int ch = UTF8_GET (str, limit);
1808 return ch - name[i];
/* NAME matched completely; report equality only if STR has no bytes
   left over.  */
1811 return str == limit ? 0 : 1;
1814 /* A sorted list of all C++ keywords.
   NOTE(review): cxx_keyword_p appears to search this table by
   repeatedly taking the midpoint of a range and comparing with
   utf8_cmp, so new entries presumably have to keep the ordering that
   comparison expects -- confirm before editing the table.  */
1816 static const char *cxx_keywords[] =
1924 /* Return true if NAME is a C++ keyword.

   NAME is LENGTH bytes long and is compared against entries of
   cxx_keywords with utf8_cmp.  The search begins at the midpoint of
   first and last, where last starts out as the number of entries in
   the table, and recomputes the midpoint on every iteration while
   remembering the previous one in old.
   NOTE(review): this looks like a bisection over the sorted table;
   the statements that narrow first/last and the loop's termination
   test are not visible in this excerpt -- confirm.  */
1927 cxx_keyword_p (name, length)
1931 int last = ARRAY_SIZE (cxx_keywords);
1933 int mid = (last + first) / 2;
1936 for (mid = (last + first) / 2;
1938 old = mid, mid = (last + first) / 2)
/* Only the common prefix is compared: min_length is the shorter of
   the candidate keyword's length and LENGTH.  */
1940 int kwl = strlen (cxx_keywords[mid]);
1941 int min_length = kwl > length ? length : kwl;
1942 int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
/* The loop that follows the next comment steps over any run of '$'
   characters that comes after the compared prefix of NAME.
   NOTE(review): the comment that follows has no visible closing
   delimiter in this copy of the file -- restore it from upstream.  */
1947 /* We've found a match if all the remaining characters are
1949 for (i = min_length; i < length && name[i] == '$'; ++i)
1963 #endif /* JC1_LITE */