1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
37 enum spell_type category;
38 const unsigned char *name;
/* Spellings of the digraph forms of punctuators.  The table is
   indexed by (token type - CPP_FIRST_DIGRAPH); see its uses in
   cpp_spell_token, cpp_output_token and cpp_avoid_paste.  */
41 static const unsigned char *const digraph_spellings[] =
42 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
/* Entry builders for the token_spellings initializer.  OP (e, s)
   produces an entry of category SPELL_OPERATOR whose name is the
   string S; TK (e, s) produces an entry of category SPELL_<s> whose
   name is the stringified enumerator E.
   NOTE(review): TTYPE_TABLE is defined elsewhere; presumably it is a
   list of OP/TK invocations, one per token type -- confirm.  */
44 #define OP(e, s) { SPELL_OPERATOR, U s },
45 #define TK(e, s) { SPELL_ ## s, U #e },
/* Spelling category and name of each token type; N_TTYPES entries,
   indexed by the token's type.  */
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
/* Accessors taking a pointer to a token: TOKEN_SPELL yields the
   spelling category and TOKEN_NAME the spelling string recorded in
   token_spellings for the token's type.  */
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
/* Forward declarations of the file-local (static) helper routines.  */
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
59 unsigned int, enum cpp_ttype);
60 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
61 static int name_p (cpp_reader *, const cpp_string *);
62 static tokenrun *next_tokenrun (tokenrun *);
64 static _cpp_buff *new_buff (size_t);
69 Compares the token TOKEN to the NUL-terminated string STRING.
70 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 cpp_ideq (const cpp_token *token, const char *string)
74 if (token->type != CPP_NAME)
77 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
80 /* Record a note TYPE at byte POS into the current cleaned logical
83 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 if (buffer->notes_used == buffer->notes_cap)
87 buffer->notes_cap = buffer->notes_cap * 2 + 200;
88 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
92 buffer->notes[buffer->notes_used].pos = pos;
93 buffer->notes[buffer->notes_used].type = type;
97 /* Returns with a logical line that contains no escaped newlines or
98 trigraphs. This is a time-critical inner loop. */
100 _cpp_clean_line (cpp_reader *pfile)
106 buffer = pfile->buffer;
107 buffer->cur_note = buffer->notes_used = 0;
108 buffer->cur = buffer->line_base = buffer->next_line;
109 buffer->need_line = false;
110 s = buffer->next_line - 1;
112 if (!buffer->from_stage3)
114 /* Short circuit for the common case of an un-escaped line with
115 no trigraphs. The primary win here is by not writing any
116 data back to memory until we have to. */
120 if (c == '\n' || c == '\r')
124 if (s == buffer->rlimit)
127 /* DOS line ending? */
128 if (c == '\r' && s[1] == '\n')
131 if (s == buffer->rlimit)
134 /* check for escaped newline */
136 while (p != buffer->next_line && is_nvspace (p[-1]))
138 if (p == buffer->next_line || p[-1] != '\\')
141 /* Have an escaped newline; process it and proceed to
143 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
145 buffer->next_line = p - 1;
148 if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
150 /* Have a trigraph. We may or may not have to convert
151 it. Add a line note regardless, for -Wtrigraphs. */
152 add_line_note (buffer, s, s[2]);
153 if (CPP_OPTION (pfile, trigraphs))
155 /* We do, and that means we have to switch to the
158 *d = _cpp_trigraph_map[s[2]];
171 if (c == '\n' || c == '\r')
173 /* Handle DOS line endings. */
174 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
176 if (s == buffer->rlimit)
181 while (p != buffer->next_line && is_nvspace (p[-1]))
183 if (p == buffer->next_line || p[-1] != '\\')
186 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
188 buffer->next_line = p - 1;
190 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
192 /* Add a note regardless, for the benefit of -Wtrigraphs. */
193 add_line_note (buffer, d, s[2]);
194 if (CPP_OPTION (pfile, trigraphs))
196 *d = _cpp_trigraph_map[s[2]];
206 while (*s != '\n' && *s != '\r');
209 /* Handle DOS line endings. */
210 if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
216 /* A sentinel note that should never be processed. */
217 add_line_note (buffer, d + 1, '\n');
218 buffer->next_line = s + 1;
221 /* Return true if the trigraph indicated by NOTE should be warned
222 about in a comment. */
224 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
228 /* Within comments we don't warn about trigraphs, unless the
229 trigraph forms an escaped newline, as that may change
231 if (note->type != '/')
234 /* If -trigraphs, then this was an escaped newline iff the next note
236 if (CPP_OPTION (pfile, trigraphs))
237 return note[1].pos == note->pos;
239 /* Otherwise, see if this forms an escaped newline. */
241 while (is_nvspace (*p))
244 /* There might have been escaped newlines between the trigraph and the
245 newline we found. Hence the position test. */
246 return (*p == '\n' && p < note[1].pos);
249 /* Process the notes created by add_line_note as far as the current
252 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
254 cpp_buffer *buffer = pfile->buffer;
258 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
261 if (note->pos > buffer->cur)
265 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
267 if (note->type == '\\' || note->type == ' ')
269 if (note->type == ' ' && !in_comment)
270 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
271 "backslash and newline separated by space");
273 if (buffer->next_line > buffer->rlimit)
275 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
276 "backslash-newline at end of file");
277 /* Prevent "no newline at end of file" warning. */
278 buffer->next_line = buffer->rlimit;
281 buffer->line_base = note->pos;
282 CPP_INCREMENT_LINE (pfile, 0);
284 else if (_cpp_trigraph_map[note->type])
286 if (CPP_OPTION (pfile, warn_trigraphs)
287 && (!in_comment || warn_in_comment (pfile, note)))
289 if (CPP_OPTION (pfile, trigraphs))
290 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
291 "trigraph ??%c converted to %c",
293 (int) _cpp_trigraph_map[note->type]);
297 (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
298 "trigraph ??%c ignored, use -trigraphs to enable",
308 /* Skip a C-style block comment. We find the end of the comment by
309 seeing if an asterisk is before every '/' we encounter. Returns
310 nonzero if comment terminated by EOF, zero otherwise.
312 Buffer->cur points to the initial asterisk of the comment. */
314 _cpp_skip_block_comment (cpp_reader *pfile)
316 cpp_buffer *buffer = pfile->buffer;
317 const uchar *cur = buffer->cur;
326 /* People like decorating comments with '*', so check for '/'
327 instead for efficiency. */
335 /* Warn about potential nested comments, but not if the '/'
336 comes immediately before the true comment delimiter.
337 Don't bother to get it right across escaped newlines. */
338 if (CPP_OPTION (pfile, warn_comments)
339 && cur[0] == '*' && cur[1] != '/')
342 cpp_error_with_line (pfile, CPP_DL_WARNING,
343 pfile->line_table->highest_line, CPP_BUF_COL (buffer),
344 "\"/*\" within comment");
350 buffer->cur = cur - 1;
351 _cpp_process_line_notes (pfile, true);
352 if (buffer->next_line >= buffer->rlimit)
354 _cpp_clean_line (pfile);
356 cols = buffer->next_line - buffer->line_base;
357 CPP_INCREMENT_LINE (pfile, cols);
364 _cpp_process_line_notes (pfile, true);
368 /* Skip a C++ line comment, leaving buffer->cur pointing to the
369 terminating newline. Handles escaped newlines. Returns nonzero
370 if a multiline comment. */
372 skip_line_comment (cpp_reader *pfile)
374 cpp_buffer *buffer = pfile->buffer;
375 unsigned int orig_line = pfile->line_table->highest_line;
377 while (*buffer->cur != '\n')
380 _cpp_process_line_notes (pfile, true);
381 return orig_line != pfile->line_table->highest_line;
384 /* Skips whitespace, saving the next non-whitespace character. */
386 skip_whitespace (cpp_reader *pfile, cppchar_t c)
388 cpp_buffer *buffer = pfile->buffer;
389 bool saw_NUL = false;
393 /* Horizontal space always OK. */
394 if (c == ' ' || c == '\t')
396 /* Just \f \v or \0 left. */
399 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
400 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
401 CPP_BUF_COL (buffer),
402 "%s in preprocessing directive",
403 c == '\f' ? "form feed" : "vertical tab");
407 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
408 while (is_nvspace (c));
411 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
416 /* See if the characters of a number token are valid in a name (no
419 name_p (cpp_reader *pfile, const cpp_string *string)
423 for (i = 0; i < string->len; i++)
424 if (!is_idchar (string->text[i]))
430 /* After parsing an identifier or other sequence, produce a warning about
431 sequences not in NFC/NFKC. */
433 warn_about_normalization (cpp_reader *pfile,
434 const cpp_token *token,
435 const struct normalize_state *s)
437 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
438 && !pfile->state.skipping)
440 /* Make sure that the token is printed using UCNs, even
441 if we'd otherwise happily print UTF-8. */
442 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
445 sz = cpp_spell_token (pfile, token, buf, false) - buf;
446 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
447 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
448 "`%.*s' is not in NFKC", (int) sz, buf);
450 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
451 "`%.*s' is not in NFC", (int) sz, buf);
455 /* Returns TRUE if the sequence starting at buffer->cur is valid in
456 an identifier. FIRST is TRUE if this starts an identifier. */
458 forms_identifier_p (cpp_reader *pfile, int first,
459 struct normalize_state *state)
461 cpp_buffer *buffer = pfile->buffer;
463 if (*buffer->cur == '$')
465 if (!CPP_OPTION (pfile, dollars_in_ident))
469 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
471 CPP_OPTION (pfile, warn_dollars) = 0;
472 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
478 /* Is this a syntactically valid UCN? */
479 if (CPP_OPTION (pfile, extended_identifiers)
480 && *buffer->cur == '\\'
481 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
484 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
493 /* Lex an identifier starting at BUFFER->CUR - 1. */
494 static cpp_hashnode *
495 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
496 struct normalize_state *nst)
498 cpp_hashnode *result;
501 unsigned int hash = HT_HASHSTEP (0, *base);
503 cur = pfile->buffer->cur;
505 while (ISIDNUM (*cur))
507 hash = HT_HASHSTEP (hash, *cur);
510 pfile->buffer->cur = cur;
511 if (starts_ucn || forms_identifier_p (pfile, false, nst))
513 /* Slower version for identifiers containing UCNs (or $). */
515 while (ISIDNUM (*pfile->buffer->cur))
517 pfile->buffer->cur++;
518 NORMALIZE_STATE_UPDATE_IDNUM (nst);
520 } while (forms_identifier_p (pfile, false, nst));
521 result = _cpp_interpret_identifier (pfile, base,
522 pfile->buffer->cur - base);
527 hash = HT_HASHFINISH (hash, len);
529 result = (cpp_hashnode *)
530 ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
533 /* Rarely, identifiers require diagnostics when lexed. */
534 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
535 && !pfile->state.skipping, 0))
537 /* It is allowed to poison the same identifier twice. */
538 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
539 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
542 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
543 replacement list of a variadic macro. */
544 if (result == pfile->spec_nodes.n__VA_ARGS__
545 && !pfile->state.va_args_ok)
546 cpp_error (pfile, CPP_DL_PEDWARN,
547 "__VA_ARGS__ can only appear in the expansion"
548 " of a C99 variadic macro");
554 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
556 lex_number (cpp_reader *pfile, cpp_string *number,
557 struct normalize_state *nst)
563 base = pfile->buffer->cur - 1;
566 cur = pfile->buffer->cur;
568 /* N.B. ISIDNUM does not include $. */
569 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
572 NORMALIZE_STATE_UPDATE_IDNUM (nst);
575 pfile->buffer->cur = cur;
577 while (forms_identifier_p (pfile, false, nst));
579 number->len = cur - base;
580 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
581 memcpy (dest, base, number->len);
582 dest[number->len] = '\0';
586 /* Create a token of type TYPE with a literal spelling. */
588 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
589 unsigned int len, enum cpp_ttype type)
591 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
593 memcpy (dest, base, len);
596 token->val.str.len = len;
597 token->val.str.text = dest;
600 /* Lexes a string, character constant, or angle-bracketed header file
601 name. The stored string contains the spelling, including opening
602 quote and any leading 'L'. It returns the type of the
603 literal, or CPP_OTHER if it was not properly terminated.
605 The spelling is NUL-terminated, but it is not guaranteed that this
606 is the first NUL since embedded NULs are preserved. */
608 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
610 bool saw_NUL = false;
612 cppchar_t terminator;
617 if (terminator == 'L')
619 if (terminator == '\"')
620 type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
621 else if (terminator == '\'')
622 type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
624 terminator = '>', type = CPP_HEADER_NAME;
628 cppchar_t c = *cur++;
630 /* In #include-style directives, terminators are not escapable. */
631 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
633 else if (c == terminator)
645 if (saw_NUL && !pfile->state.skipping)
646 cpp_error (pfile, CPP_DL_WARNING,
647 "null character(s) preserved in literal");
649 pfile->buffer->cur = cur;
650 create_literal (pfile, token, base, cur - base, type);
653 /* The stored comment includes the comment start and any terminator. */
655 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
658 unsigned char *buffer;
659 unsigned int len, clen;
661 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
663 /* C++ comments probably (not definitely) have moved past a new
664 line, which we don't want to save in the comment. */
665 if (is_vspace (pfile->buffer->cur[-1]))
668 /* If we are currently in a directive, then we need to store all
669 C++ comments as C comments internally, and so we need to
670 allocate a little extra space in that case.
672 Note that the only time we encounter a directive here is
673 when we are saving comments in a "#define". */
674 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
676 buffer = _cpp_unaligned_alloc (pfile, clen);
678 token->type = CPP_COMMENT;
679 token->val.str.len = clen;
680 token->val.str.text = buffer;
683 memcpy (buffer + 1, from, len - 1);
685 /* Finish conversion to a C comment, if necessary. */
686 if (pfile->state.in_directive && type == '/')
689 buffer[clen - 2] = '*';
690 buffer[clen - 1] = '/';
694 /* Allocate COUNT tokens for RUN. */
696 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
698 run->base = XNEWVEC (cpp_token, count);
699 run->limit = run->base + count;
703 /* Returns the next tokenrun, or creates one if there is none. */
705 next_tokenrun (tokenrun *run)
707 if (run->next == NULL)
709 run->next = XNEW (tokenrun);
710 run->next->prev = run;
711 _cpp_init_tokenrun (run->next, 250);
717 /* Allocate a single token that is invalidated at the same time as the
718 rest of the tokens on the line. Has its line and col set to the
719 same as the last lexed token, so that diagnostics appear in the
722 _cpp_temp_token (cpp_reader *pfile)
724 cpp_token *old, *result;
726 old = pfile->cur_token - 1;
727 if (pfile->cur_token == pfile->cur_run->limit)
729 pfile->cur_run = next_tokenrun (pfile->cur_run);
730 pfile->cur_token = pfile->cur_run->base;
733 result = pfile->cur_token++;
734 result->src_loc = old->src_loc;
738 /* Lex a token into RESULT (external interface). Takes care of issues
739 like directive handling, token lookahead, multiple include
740 optimization and skipping. */
742 _cpp_lex_token (cpp_reader *pfile)
748 if (pfile->cur_token == pfile->cur_run->limit)
750 pfile->cur_run = next_tokenrun (pfile->cur_run);
751 pfile->cur_token = pfile->cur_run->base;
754 if (pfile->lookaheads)
757 result = pfile->cur_token++;
760 result = _cpp_lex_direct (pfile);
762 if (result->flags & BOL)
764 /* Is this a directive? If _cpp_handle_directive returns
765 false, it is an assembler #. */
766 if (result->type == CPP_HASH
767 /* 6.10.3 p 11: Directives in a list of macro arguments
768 give undefined behavior. This implementation
769 handles the directive as normal. */
770 && pfile->state.parsing_args != 1)
772 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
774 if (pfile->directive_result.type == CPP_PADDING)
776 result = &pfile->directive_result;
779 else if (pfile->state.in_deferred_pragma)
780 result = &pfile->directive_result;
782 if (pfile->cb.line_change && !pfile->state.skipping)
783 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
786 /* We don't skip tokens in directives. */
787 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
790 /* Outside a directive, invalidate controlling macros. At file
791 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
792 get here and MI optimization works. */
793 pfile->mi_valid = false;
795 if (!pfile->state.skipping || result->type == CPP_EOF)
802 /* Returns true if a fresh line has been loaded. */
804 _cpp_get_fresh_line (cpp_reader *pfile)
808 /* We can't get a new line until we leave the current directive. */
809 if (pfile->state.in_directive)
814 cpp_buffer *buffer = pfile->buffer;
816 if (!buffer->need_line)
819 if (buffer->next_line < buffer->rlimit)
821 _cpp_clean_line (pfile);
825 /* First, get out of parsing arguments state. */
826 if (pfile->state.parsing_args)
829 /* End of buffer. Non-empty files should end in a newline. */
830 if (buffer->buf != buffer->rlimit
831 && buffer->next_line > buffer->rlimit
832 && !buffer->from_stage3)
834 /* Only warn once. */
835 buffer->next_line = buffer->rlimit;
836 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
837 CPP_BUF_COLUMN (buffer, buffer->cur),
838 "no newline at end of file");
841 return_at_eof = buffer->return_at_eof;
842 _cpp_pop_buffer (pfile);
843 if (pfile->buffer == NULL || return_at_eof)
848 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE) \
851 result->type = ELSE_TYPE; \
852 if (*buffer->cur == CHAR) \
853 buffer->cur++, result->type = THEN_TYPE; \
857 /* Lex a token into pfile->cur_token, which is also incremented, to
858 get diagnostics pointing to the correct location.
860 Does not handle issues such as token lookahead, multiple-include
861 optimization, directives, skipping etc. This function is only
862 suitable for use by _cpp_lex_token, and in special cases like
863 lex_expansion_token which doesn't care for any of these issues.
865 When meeting a newline, returns CPP_EOF if parsing a directive,
866 otherwise returns to the start of the token buffer if permissible.
867 Returns the location of the lexed token. */
869 _cpp_lex_direct (cpp_reader *pfile)
873 const unsigned char *comment_start;
874 cpp_token *result = pfile->cur_token++;
878 buffer = pfile->buffer;
879 if (buffer->need_line)
881 if (pfile->state.in_deferred_pragma)
883 result->type = CPP_PRAGMA_EOL;
884 pfile->state.in_deferred_pragma = false;
885 if (!pfile->state.pragma_allow_expansion)
886 pfile->state.prevent_expansion--;
889 if (!_cpp_get_fresh_line (pfile))
891 result->type = CPP_EOF;
892 if (!pfile->state.in_directive)
894 /* Tell the compiler the line number of the EOF token. */
895 result->src_loc = pfile->line_table->highest_line;
900 if (!pfile->keep_tokens)
902 pfile->cur_run = &pfile->base_run;
903 result = pfile->base_run.base;
904 pfile->cur_token = result + 1;
907 if (pfile->state.parsing_args == 2)
908 result->flags |= PREV_WHITE;
910 buffer = pfile->buffer;
912 result->src_loc = pfile->line_table->highest_line;
915 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
916 && !pfile->overlaid_buffer)
918 _cpp_process_line_notes (pfile, false);
919 result->src_loc = pfile->line_table->highest_line;
923 LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
924 CPP_BUF_COLUMN (buffer, buffer->cur));
928 case ' ': case '\t': case '\f': case '\v': case '\0':
929 result->flags |= PREV_WHITE;
930 skip_whitespace (pfile, c);
934 if (buffer->cur < buffer->rlimit)
935 CPP_INCREMENT_LINE (pfile, 0);
936 buffer->need_line = true;
939 case '0': case '1': case '2': case '3': case '4':
940 case '5': case '6': case '7': case '8': case '9':
942 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
943 result->type = CPP_NUMBER;
944 lex_number (pfile, &result->val.str, &nst);
945 warn_about_normalization (pfile, result, &nst);
950 /* 'L' may introduce wide characters or strings. */
951 if (*buffer->cur == '\'' || *buffer->cur == '"')
953 lex_string (pfile, result, buffer->cur - 1);
959 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
960 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
961 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
962 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
964 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
965 case 'G': case 'H': case 'I': case 'J': case 'K':
966 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
967 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
969 result->type = CPP_NAME;
971 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
972 result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
974 warn_about_normalization (pfile, result, &nst);
977 /* Convert named operators to their proper types. */
978 if (result->val.node->flags & NODE_OPERATOR)
980 result->flags |= NAMED_OP;
981 result->type = (enum cpp_ttype) result->val.node->directive_index;
987 lex_string (pfile, result, buffer->cur - 1);
991 /* A potential block or line comment. */
992 comment_start = buffer->cur;
997 if (_cpp_skip_block_comment (pfile))
998 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
1000 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1001 || cpp_in_system_header (pfile)))
1003 /* Warn about comments only if pedantically GNUC89, and not
1004 in system headers. */
1005 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1006 && ! buffer->warned_cplusplus_comments)
1008 cpp_error (pfile, CPP_DL_PEDWARN,
1009 "C++ style comments are not allowed in ISO C90");
1010 cpp_error (pfile, CPP_DL_PEDWARN,
1011 "(this will be reported only once per input file)");
1012 buffer->warned_cplusplus_comments = 1;
1015 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1016 cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
1021 result->type = CPP_DIV_EQ;
1026 result->type = CPP_DIV;
1030 if (!pfile->state.save_comments)
1032 result->flags |= PREV_WHITE;
1033 goto update_tokens_line;
1036 /* Save the comment as a token in its own right. */
1037 save_comment (pfile, result, comment_start, c);
1041 if (pfile->state.angled_headers)
1043 lex_string (pfile, result, buffer->cur - 1);
1047 result->type = CPP_LESS;
1048 if (*buffer->cur == '=')
1049 buffer->cur++, result->type = CPP_LESS_EQ;
1050 else if (*buffer->cur == '<')
1053 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1055 else if (CPP_OPTION (pfile, digraphs))
1057 if (*buffer->cur == ':')
1060 result->flags |= DIGRAPH;
1061 result->type = CPP_OPEN_SQUARE;
1063 else if (*buffer->cur == '%')
1066 result->flags |= DIGRAPH;
1067 result->type = CPP_OPEN_BRACE;
1073 result->type = CPP_GREATER;
1074 if (*buffer->cur == '=')
1075 buffer->cur++, result->type = CPP_GREATER_EQ;
1076 else if (*buffer->cur == '>')
1079 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1084 result->type = CPP_MOD;
1085 if (*buffer->cur == '=')
1086 buffer->cur++, result->type = CPP_MOD_EQ;
1087 else if (CPP_OPTION (pfile, digraphs))
1089 if (*buffer->cur == ':')
1092 result->flags |= DIGRAPH;
1093 result->type = CPP_HASH;
1094 if (*buffer->cur == '%' && buffer->cur[1] == ':')
1095 buffer->cur += 2, result->type = CPP_PASTE;
1097 else if (*buffer->cur == '>')
1100 result->flags |= DIGRAPH;
1101 result->type = CPP_CLOSE_BRACE;
1107 result->type = CPP_DOT;
1108 if (ISDIGIT (*buffer->cur))
1110 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1111 result->type = CPP_NUMBER;
1112 lex_number (pfile, &result->val.str, &nst);
1113 warn_about_normalization (pfile, result, &nst);
1115 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1116 buffer->cur += 2, result->type = CPP_ELLIPSIS;
1117 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1118 buffer->cur++, result->type = CPP_DOT_STAR;
1122 result->type = CPP_PLUS;
1123 if (*buffer->cur == '+')
1124 buffer->cur++, result->type = CPP_PLUS_PLUS;
1125 else if (*buffer->cur == '=')
1126 buffer->cur++, result->type = CPP_PLUS_EQ;
1130 result->type = CPP_MINUS;
1131 if (*buffer->cur == '>')
1134 result->type = CPP_DEREF;
1135 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1136 buffer->cur++, result->type = CPP_DEREF_STAR;
1138 else if (*buffer->cur == '-')
1139 buffer->cur++, result->type = CPP_MINUS_MINUS;
1140 else if (*buffer->cur == '=')
1141 buffer->cur++, result->type = CPP_MINUS_EQ;
1145 result->type = CPP_AND;
1146 if (*buffer->cur == '&')
1147 buffer->cur++, result->type = CPP_AND_AND;
1148 else if (*buffer->cur == '=')
1149 buffer->cur++, result->type = CPP_AND_EQ;
1153 result->type = CPP_OR;
1154 if (*buffer->cur == '|')
1155 buffer->cur++, result->type = CPP_OR_OR;
1156 else if (*buffer->cur == '=')
1157 buffer->cur++, result->type = CPP_OR_EQ;
1161 result->type = CPP_COLON;
1162 if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1163 buffer->cur++, result->type = CPP_SCOPE;
1164 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1167 result->flags |= DIGRAPH;
1168 result->type = CPP_CLOSE_SQUARE;
1172 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1173 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1174 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1175 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1176 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1178 case '?': result->type = CPP_QUERY; break;
1179 case '~': result->type = CPP_COMPL; break;
1180 case ',': result->type = CPP_COMMA; break;
1181 case '(': result->type = CPP_OPEN_PAREN; break;
1182 case ')': result->type = CPP_CLOSE_PAREN; break;
1183 case '[': result->type = CPP_OPEN_SQUARE; break;
1184 case ']': result->type = CPP_CLOSE_SQUARE; break;
1185 case '{': result->type = CPP_OPEN_BRACE; break;
1186 case '}': result->type = CPP_CLOSE_BRACE; break;
1187 case ';': result->type = CPP_SEMICOLON; break;
1189 /* @ is a punctuator in Objective-C. */
1190 case '@': result->type = CPP_ATSIGN; break;
1195 const uchar *base = --buffer->cur;
1196 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1198 if (forms_identifier_p (pfile, true, &nst))
1200 result->type = CPP_NAME;
1201 result->val.node = lex_identifier (pfile, base, true, &nst);
1202 warn_about_normalization (pfile, result, &nst);
1209 create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1216 /* An upper bound on the number of bytes needed to spell TOKEN.
1217 Does not include preceding whitespace. */
1219 cpp_token_len (const cpp_token *token)
1223 switch (TOKEN_SPELL (token))
1225 default: len = 4; break;
1226 case SPELL_LITERAL: len = token->val.str.len; break;
1227 case SPELL_IDENT: len = NODE_LEN (token->val.node) * 10; break;
1233 /* Parse UTF-8 out of NAME and place a \U escape in BUFFER.
1234 Return the number of bytes read out of NAME. (There are always
1235 10 bytes written to BUFFER.) */
1238 utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
1244 unsigned long utf32;
1246 /* Compute the length of the UTF-8 sequence. */
1247 for (t = *name; t & 0x80; t <<= 1)
1250 utf32 = *name & (0x7F >> ucn_len);
1251 for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
1253 utf32 = (utf32 << 6) | (*++name & 0x3F);
1255 /* Ill-formed UTF-8. */
1256 if ((*name & ~0x3F) != 0x80)
1262 for (j = 7; j >= 0; j--)
1263 *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
1268 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1269 already contain enough space to hold the token's spelling.
1270 Returns a pointer to the character after the last character written.
1271 FORSTRING is true if this is to be the spelling after translation
1272 phase 1 (this is different for UCNs).
1273 FIXME: Would be nice if we didn't need the PFILE argument. */
1275 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1276 unsigned char *buffer, bool forstring)
1278 switch (TOKEN_SPELL (token))
1280 case SPELL_OPERATOR:
1282 const unsigned char *spelling;
1285 if (token->flags & DIGRAPH)
1287 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1288 else if (token->flags & NAMED_OP)
1291 spelling = TOKEN_NAME (token);
1293 while ((c = *spelling++) != '\0')
1302 memcpy (buffer, NODE_NAME (token->val.node),
1303 NODE_LEN (token->val.node));
1304 buffer += NODE_LEN (token->val.node);
1309 const unsigned char * name = NODE_NAME (token->val.node);
1311 for (i = 0; i < NODE_LEN (token->val.node); i++)
1312 if (name[i] & ~0x7F)
1314 i += utf8_to_ucn (buffer, name + i) - 1;
1318 *buffer++ = NODE_NAME (token->val.node)[i];
1323 memcpy (buffer, token->val.str.text, token->val.str.len);
1324 buffer += token->val.str.len;
1328 cpp_error (pfile, CPP_DL_ICE,
1329 "unspellable token %s", TOKEN_NAME (token));
1336 /* Returns TOKEN spelt as a null-terminated string. The string is
1337 freed when the reader is destroyed. Useful for diagnostics. */
1339 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1341 unsigned int len = cpp_token_len (token) + 1;
1342 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1344 end = cpp_spell_token (pfile, token, start, false);
1350 /* Used by C front ends, which really should move to using
1351 cpp_token_as_text. */
1353 cpp_type2name (enum cpp_ttype type)
1355 return (const char *) token_spellings[type].name;
1358 /* Writes the spelling of token to FP, without any preceding space.
1359 Separated from cpp_spell_token for efficiency - to avoid stdio
1360 double-buffering. */
1362 cpp_output_token (const cpp_token *token, FILE *fp)
1364 switch (TOKEN_SPELL (token))
1366 case SPELL_OPERATOR:
1368 const unsigned char *spelling;
1371 if (token->flags & DIGRAPH)
1373 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1374 else if (token->flags & NAMED_OP)
1377 spelling = TOKEN_NAME (token);
1382 while ((c = *++spelling) != '\0');
1390 const unsigned char * name = NODE_NAME (token->val.node);
1392 for (i = 0; i < NODE_LEN (token->val.node); i++)
1393 if (name[i] & ~0x7F)
1395 unsigned char buffer[10];
1396 i += utf8_to_ucn (buffer, name + i) - 1;
1397 fwrite (buffer, 1, 10, fp);
1400 fputc (NODE_NAME (token->val.node)[i], fp);
1405 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1409 /* An error, most probably. */
1414 /* Compare two tokens. */
1416 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1418 if (a->type == b->type && a->flags == b->flags)
1419 switch (TOKEN_SPELL (a))
1421 default: /* Keep compiler happy. */
1422 case SPELL_OPERATOR:
1425 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1427 return a->val.node == b->val.node;
1429 return (a->val.str.len == b->val.str.len
1430 && !memcmp (a->val.str.text, b->val.str.text,
1437 /* Returns nonzero if a space should be inserted to avoid an
1438 accidental token paste for output. For simplicity, it is
1439 conservative, and occasionally advises a space where one is not
1440 needed, e.g. "." and ".2". */
1442 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1443 const cpp_token *token2)
1445 enum cpp_ttype a = token1->type, b = token2->type;
1448 if (token1->flags & NAMED_OP)
1450 if (token2->flags & NAMED_OP)
1454 if (token2->flags & DIGRAPH)
1455 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1456 else if (token_spellings[b].category == SPELL_OPERATOR)
1457 c = token_spellings[b].name[0];
1459 /* Quickly get everything that can paste with an '='. */
1460 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1465 case CPP_GREATER: return c == '>';
1466 case CPP_LESS: return c == '<' || c == '%' || c == ':';
1467 case CPP_PLUS: return c == '+';
1468 case CPP_MINUS: return c == '-' || c == '>';
1469 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1470 case CPP_MOD: return c == ':' || c == '>';
1471 case CPP_AND: return c == '&';
1472 case CPP_OR: return c == '|';
1473 case CPP_COLON: return c == ':' || c == '>';
1474 case CPP_DEREF: return c == '*';
1475 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1476 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1477 case CPP_NAME: return ((b == CPP_NUMBER
1478 && name_p (pfile, &token2->val.str))
1480 || b == CPP_CHAR || b == CPP_STRING); /* L */
1481 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1482 || c == '.' || c == '+' || c == '-');
1484 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
1486 || (CPP_OPTION (pfile, objc)
1487 && token1->val.str.text[0] == '@'
1488 && (b == CPP_NAME || b == CPP_STRING)));
1495 /* Output all the remaining tokens on the current line, and a newline
1496 character, to FP. Leading whitespace is removed. If there are
1497 macros, special token padding is not performed. */
1499 cpp_output_line (cpp_reader *pfile, FILE *fp)
1501 const cpp_token *token;
1503 token = cpp_get_token (pfile);
1504 while (token->type != CPP_EOF)
1506 cpp_output_token (token, fp);
1507 token = cpp_get_token (pfile);
1508 if (token->flags & PREV_WHITE)
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will reuse for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the bytes between BUFF's
   cur and limit.  Every macro argument is parenthesized so that an
   argument containing an operator cannot change the arithmetic.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1530 /* Create a new allocation buffer. Place the control block at the end
1531 of the buffer, so that buffer overflows will cause immediate chaos. */
1533 new_buff (size_t len)
1536 unsigned char *base;
1538 if (len < MIN_BUFF_SIZE)
1539 len = MIN_BUFF_SIZE;
1540 len = CPP_ALIGN (len);
1542 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1543 result = (_cpp_buff *) (base + len);
1544 result->base = base;
1546 result->limit = base + len;
1547 result->next = NULL;
1551 /* Place a chain of unwanted allocation buffers on the free list. */
1553 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1555 _cpp_buff *end = buff;
1559 end->next = pfile->free_buffs;
1560 pfile->free_buffs = buff;
1563 /* Return a free buffer of size at least MIN_SIZE. */
1565 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1567 _cpp_buff *result, **p;
1569 for (p = &pfile->free_buffs;; p = &(*p)->next)
1574 return new_buff (min_size);
1576 size = result->limit - result->base;
1577 /* Return a buffer that's big enough, but don't waste one that's
1579 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1584 result->next = NULL;
1585 result->cur = result->base;
1589 /* Creates a new buffer with enough space to hold the uncommitted
1590 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
1591 the excess bytes to the new buffer. Chains the new buffer after
1592 BUFF, and returns the new buffer. */
1594 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1596 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1597 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1599 buff->next = new_buff;
1600 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1604 /* Creates a new buffer with enough space to hold the uncommitted
1605 remaining bytes of the buffer pointed to by BUFF, and at least
1606 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
1607 Chains the new buffer before the buffer pointed to by BUFF, and
1608 updates the pointer to point to the new buffer. */
1610 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1612 _cpp_buff *new_buff, *old_buff = *pbuff;
1613 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1615 new_buff = _cpp_get_buff (pfile, size);
1616 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1617 new_buff->next = old_buff;
1621 /* Free a chain of buffers starting at BUFF. */
1623 _cpp_free_buff (_cpp_buff *buff)
1627 for (; buff; buff = next)
1634 /* Allocate permanent, unaligned storage of length LEN. */
1636 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1638 _cpp_buff *buff = pfile->u_buff;
1639 unsigned char *result = buff->cur;
1641 if (len > (size_t) (buff->limit - result))
1643 buff = _cpp_get_buff (pfile, len);
1644 buff->next = pfile->u_buff;
1645 pfile->u_buff = buff;
1649 buff->cur = result + len;
1653 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1654 That buffer is used for growing allocations when saving macro
1655 replacement lists in a #define, and when parsing an answer to an
1656 assertion in #assert, #unassert or #if (and therefore possibly
1657 whilst expanding macros). It therefore must not be used by any
1658 code that they might call: specifically the lexer and the guts of
1661 All existing other uses clearly fit this restriction: storing
1662 registered pragmas during initialization. */
1664 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1666 _cpp_buff *buff = pfile->a_buff;
1667 unsigned char *result = buff->cur;
1669 if (len > (size_t) (buff->limit - result))
1671 buff = _cpp_get_buff (pfile, len);
1672 buff->next = pfile->a_buff;
1673 pfile->a_buff = buff;
1677 buff->cur = result + len;
1681 /* Say which field of TOK is in use. */
1683 enum cpp_token_fld_kind
1684 cpp_token_val_index (cpp_token *tok)
1686 switch (TOKEN_SPELL (tok))
1689 return CPP_TOKEN_FLD_NODE;
1691 return CPP_TOKEN_FLD_STR;
1693 if (tok->type == CPP_MACRO_ARG)
1694 return CPP_TOKEN_FLD_ARG_NO;
1695 else if (tok->type == CPP_PADDING)
1696 return CPP_TOKEN_FLD_SOURCE;
1697 else if (tok->type == CPP_PRAGMA)
1698 return CPP_TOKEN_FLD_PRAGMA;
1699 /* else fall through */
1701 return CPP_TOKEN_FLD_NONE;