1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
37 enum spell_type category;
38 const unsigned char *name;
41 static const unsigned char *const digraph_spellings[] =
42 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
44 #define OP(e, s) { SPELL_OPERATOR, U s },
45 #define TK(e, s) { SPELL_ ## s, U #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
59 unsigned int, enum cpp_ttype);
60 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
61 static int name_p (cpp_reader *, const cpp_string *);
62 static tokenrun *next_tokenrun (tokenrun *);
64 static _cpp_buff *new_buff (size_t);
69 Compares the token TOKEN to the NUL-terminated string STRING.
70 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 cpp_ideq (const cpp_token *token, const char *string)
74 if (token->type != CPP_NAME)
77 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
80 /* Record a note TYPE at byte POS into the current cleaned logical
83 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 if (buffer->notes_used == buffer->notes_cap)
87 buffer->notes_cap = buffer->notes_cap * 2 + 200;
88 buffer->notes = xrealloc (buffer->notes,
89 buffer->notes_cap * sizeof (_cpp_line_note));
92 buffer->notes[buffer->notes_used].pos = pos;
93 buffer->notes[buffer->notes_used].type = type;
97 /* Returns with a logical line that contains no escaped newlines or
98 trigraphs. This is a time-critical inner loop. */
100 _cpp_clean_line (cpp_reader *pfile)
106 buffer = pfile->buffer;
107 buffer->cur_note = buffer->notes_used = 0;
108 buffer->cur = buffer->line_base = buffer->next_line;
109 buffer->need_line = false;
110 s = buffer->next_line - 1;
112 if (!buffer->from_stage3)
114 /* Short circuit for the common case of an un-escaped line with
115 no trigraphs. The primary win here is by not writing any
116 data back to memory until we have to. */
120 if (c == '\n' || c == '\r')
124 if (s == buffer->rlimit)
127 /* DOS line ending? */
128 if (c == '\r' && s[1] == '\n')
131 if (s == buffer->rlimit)
134 /* check for escaped newline */
136 while (p != buffer->next_line && is_nvspace (p[-1]))
138 if (p == buffer->next_line || p[-1] != '\\')
141 /* Have an escaped newline; process it and proceed to
143 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
145 buffer->next_line = p - 1;
148 if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
150 /* Have a trigraph. We may or may not have to convert
151 it. Add a line note regardless, for -Wtrigraphs. */
152 add_line_note (buffer, s, s[2]);
153 if (CPP_OPTION (pfile, trigraphs))
155 /* We do, and that means we have to switch to the
158 *d = _cpp_trigraph_map[s[2]];
171 if (c == '\n' || c == '\r')
173 /* Handle DOS line endings. */
174 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
176 if (s == buffer->rlimit)
181 while (p != buffer->next_line && is_nvspace (p[-1]))
183 if (p == buffer->next_line || p[-1] != '\\')
186 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
188 buffer->next_line = p - 1;
190 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
192 /* Add a note regardless, for the benefit of -Wtrigraphs. */
193 add_line_note (buffer, d, s[2]);
194 if (CPP_OPTION (pfile, trigraphs))
196 *d = _cpp_trigraph_map[s[2]];
206 while (*s != '\n' && *s != '\r');
209 /* Handle DOS line endings. */
210 if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
216 /* A sentinel note that should never be processed. */
217 add_line_note (buffer, d + 1, '\n');
218 buffer->next_line = s + 1;
221 /* Return true if the trigraph indicated by NOTE should be warned
222 about in a comment. */
224 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
228 /* Within comments we don't warn about trigraphs, unless the
229 trigraph forms an escaped newline, as that may change
231 if (note->type != '/')
234 /* If -trigraphs, then this was an escaped newline iff the next note
236 if (CPP_OPTION (pfile, trigraphs))
237 return note[1].pos == note->pos;
239 /* Otherwise, see if this forms an escaped newline. */
241 while (is_nvspace (*p))
244 /* There might have been escaped newlines between the trigraph and the
245 newline we found. Hence the position test. */
246 return (*p == '\n' && p < note[1].pos);
249 /* Process the notes created by add_line_note as far as the current
252 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
254 cpp_buffer *buffer = pfile->buffer;
258 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
261 if (note->pos > buffer->cur)
265 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
267 if (note->type == '\\' || note->type == ' ')
269 if (note->type == ' ' && !in_comment)
270 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
271 "backslash and newline separated by space");
273 if (buffer->next_line > buffer->rlimit)
275 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
276 "backslash-newline at end of file");
277 /* Prevent "no newline at end of file" warning. */
278 buffer->next_line = buffer->rlimit;
281 buffer->line_base = note->pos;
282 CPP_INCREMENT_LINE (pfile, 0);
284 else if (_cpp_trigraph_map[note->type])
286 if (CPP_OPTION (pfile, warn_trigraphs)
287 && (!in_comment || warn_in_comment (pfile, note)))
289 if (CPP_OPTION (pfile, trigraphs))
290 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
291 "trigraph ??%c converted to %c",
293 (int) _cpp_trigraph_map[note->type]);
297 (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
298 "trigraph ??%c ignored, use -trigraphs to enable",
308 /* Skip a C-style block comment. We find the end of the comment by
309 seeing if an asterisk is before every '/' we encounter. Returns
310 nonzero if comment terminated by EOF, zero otherwise.
312 Buffer->cur points to the initial asterisk of the comment. */
314 _cpp_skip_block_comment (cpp_reader *pfile)
316 cpp_buffer *buffer = pfile->buffer;
317 const uchar *cur = buffer->cur;
326 /* People like decorating comments with '*', so check for '/'
327 instead for efficiency. */
335 /* Warn about potential nested comments, but not if the '/'
336 comes immediately before the true comment delimiter.
337 Don't bother to get it right across escaped newlines. */
338 if (CPP_OPTION (pfile, warn_comments)
339 && cur[0] == '*' && cur[1] != '/')
342 cpp_error_with_line (pfile, CPP_DL_WARNING,
343 pfile->line_table->highest_line, CPP_BUF_COL (buffer),
344 "\"/*\" within comment");
350 buffer->cur = cur - 1;
351 _cpp_process_line_notes (pfile, true);
352 if (buffer->next_line >= buffer->rlimit)
354 _cpp_clean_line (pfile);
356 cols = buffer->next_line - buffer->line_base;
357 CPP_INCREMENT_LINE (pfile, cols);
364 _cpp_process_line_notes (pfile, true);
368 /* Skip a C++ line comment, leaving buffer->cur pointing to the
369 terminating newline. Handles escaped newlines. Returns nonzero
370 if a multiline comment. */
372 skip_line_comment (cpp_reader *pfile)
374 cpp_buffer *buffer = pfile->buffer;
375 unsigned int orig_line = pfile->line_table->highest_line;
377 while (*buffer->cur != '\n')
380 _cpp_process_line_notes (pfile, true);
381 return orig_line != pfile->line_table->highest_line;
384 /* Skips whitespace, saving the next non-whitespace character. */
386 skip_whitespace (cpp_reader *pfile, cppchar_t c)
388 cpp_buffer *buffer = pfile->buffer;
389 bool saw_NUL = false;
393 /* Horizontal space always OK. */
394 if (c == ' ' || c == '\t')
396 /* Just \f \v or \0 left. */
399 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
400 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
401 CPP_BUF_COL (buffer),
402 "%s in preprocessing directive",
403 c == '\f' ? "form feed" : "vertical tab");
407 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
408 while (is_nvspace (c));
411 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
416 /* See if the characters of a number token are valid in a name (no
419 name_p (cpp_reader *pfile, const cpp_string *string)
423 for (i = 0; i < string->len; i++)
424 if (!is_idchar (string->text[i]))
430 /* After parsing an identifier or other sequence, produce a warning about
431 sequences not in NFC/NFKC. */
433 warn_about_normalization (cpp_reader *pfile,
434 const cpp_token *token,
435 const struct normalize_state *s)
437 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
438 && !pfile->state.skipping)
440 /* Make sure that the token is printed using UCNs, even
441 if we'd otherwise happily print UTF-8. */
442 unsigned char *buf = xmalloc (cpp_token_len (token));
445 sz = cpp_spell_token (pfile, token, buf, false) - buf;
446 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
447 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
448 "`%.*s' is not in NFKC", sz, buf);
450 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
451 "`%.*s' is not in NFC", sz, buf);
455 /* Returns TRUE if the sequence starting at buffer->cur is valid in
456 an identifier. FIRST is TRUE if this starts an identifier. */
458 forms_identifier_p (cpp_reader *pfile, int first,
459 struct normalize_state *state)
461 cpp_buffer *buffer = pfile->buffer;
463 if (*buffer->cur == '$')
465 if (!CPP_OPTION (pfile, dollars_in_ident))
469 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
471 CPP_OPTION (pfile, warn_dollars) = 0;
472 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
478 /* Is this a syntactically valid UCN? */
479 if (*buffer->cur == '\\'
480 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
483 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
492 /* Lex an identifier starting at BUFFER->CUR - 1. */
493 static cpp_hashnode *
494 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
495 struct normalize_state *nst)
497 cpp_hashnode *result;
500 unsigned int hash = HT_HASHSTEP (0, *base);
502 cur = pfile->buffer->cur;
504 while (ISIDNUM (*cur))
506 hash = HT_HASHSTEP (hash, *cur);
509 pfile->buffer->cur = cur;
510 if (starts_ucn || forms_identifier_p (pfile, false, nst))
512 /* Slower version for identifiers containing UCNs (or $). */
514 while (ISIDNUM (*pfile->buffer->cur))
516 pfile->buffer->cur++;
517 NORMALIZE_STATE_UPDATE_IDNUM (nst);
519 } while (forms_identifier_p (pfile, false, nst));
520 result = _cpp_interpret_identifier (pfile, base,
521 pfile->buffer->cur - base);
526 hash = HT_HASHFINISH (hash, len);
528 result = (cpp_hashnode *)
529 ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
532 /* Rarely, identifiers require diagnostics when lexed. */
533 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
534 && !pfile->state.skipping, 0))
536 /* It is allowed to poison the same identifier twice. */
537 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
538 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
541 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
542 replacement list of a variadic macro. */
543 if (result == pfile->spec_nodes.n__VA_ARGS__
544 && !pfile->state.va_args_ok)
545 cpp_error (pfile, CPP_DL_PEDWARN,
546 "__VA_ARGS__ can only appear in the expansion"
547 " of a C99 variadic macro");
553 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
555 lex_number (cpp_reader *pfile, cpp_string *number,
556 struct normalize_state *nst)
562 base = pfile->buffer->cur - 1;
565 cur = pfile->buffer->cur;
567 /* N.B. ISIDNUM does not include $. */
568 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
571 NORMALIZE_STATE_UPDATE_IDNUM (nst);
574 pfile->buffer->cur = cur;
576 while (forms_identifier_p (pfile, false, nst));
578 number->len = cur - base;
579 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
580 memcpy (dest, base, number->len);
581 dest[number->len] = '\0';
585 /* Create a token of type TYPE with a literal spelling. */
587 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
588 unsigned int len, enum cpp_ttype type)
590 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
592 memcpy (dest, base, len);
595 token->val.str.len = len;
596 token->val.str.text = dest;
599 /* Lexes a string, character constant, or angle-bracketed header file
600 name. The stored string contains the spelling, including the opening
601 quote and any leading 'L'. TOKEN is given the type of the
602 literal, or CPP_OTHER if it was not properly terminated.
604 The spelling is NUL-terminated, but it is not guaranteed that this
605 is the first NUL since embedded NULs are preserved. */
607 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
609 bool saw_NUL = false;
611 cppchar_t terminator;
616 if (terminator == 'L')
618 if (terminator == '\"')
619 type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
620 else if (terminator == '\'')
621 type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
623 terminator = '>', type = CPP_HEADER_NAME;
627 cppchar_t c = *cur++;
629 /* In #include-style directives, terminators are not escapable. */
630 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
632 else if (c == terminator)
644 if (saw_NUL && !pfile->state.skipping)
645 cpp_error (pfile, CPP_DL_WARNING,
646 "null character(s) preserved in literal");
648 pfile->buffer->cur = cur;
649 create_literal (pfile, token, base, cur - base, type);
652 /* The stored comment includes the comment start and any terminator. */
654 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
657 unsigned char *buffer;
658 unsigned int len, clen;
660 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
662 /* C++ comments probably (not definitely) have moved past a new
663 line, which we don't want to save in the comment. */
664 if (is_vspace (pfile->buffer->cur[-1]))
667 /* If we are currently in a directive, then we need to store all
668 C++ comments as C comments internally, and so we need to
669 allocate a little extra space in that case.
671 Note that the only time we encounter a directive here is
672 when we are saving comments in a "#define". */
673 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
675 buffer = _cpp_unaligned_alloc (pfile, clen);
677 token->type = CPP_COMMENT;
678 token->val.str.len = clen;
679 token->val.str.text = buffer;
682 memcpy (buffer + 1, from, len - 1);
684 /* Finish conversion to a C comment, if necessary. */
685 if (pfile->state.in_directive && type == '/')
688 buffer[clen - 2] = '*';
689 buffer[clen - 1] = '/';
693 /* Allocate COUNT tokens for RUN. */
695 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
697 run->base = XNEWVEC (cpp_token, count);
698 run->limit = run->base + count;
702 /* Returns the next tokenrun, or creates one if there is none. */
704 next_tokenrun (tokenrun *run)
706 if (run->next == NULL)
708 run->next = XNEW (tokenrun);
709 run->next->prev = run;
710 _cpp_init_tokenrun (run->next, 250);
716 /* Allocate a single token that is invalidated at the same time as the
717 rest of the tokens on the line. Has its line and col set to the
718 same as the last lexed token, so that diagnostics appear in the
721 _cpp_temp_token (cpp_reader *pfile)
723 cpp_token *old, *result;
725 old = pfile->cur_token - 1;
726 if (pfile->cur_token == pfile->cur_run->limit)
728 pfile->cur_run = next_tokenrun (pfile->cur_run);
729 pfile->cur_token = pfile->cur_run->base;
732 result = pfile->cur_token++;
733 result->src_loc = old->src_loc;
737 /* Lex a token into RESULT (external interface). Takes care of issues
738 like directive handling, token lookahead, multiple include
739 optimization and skipping. */
741 _cpp_lex_token (cpp_reader *pfile)
747 if (pfile->cur_token == pfile->cur_run->limit)
749 pfile->cur_run = next_tokenrun (pfile->cur_run);
750 pfile->cur_token = pfile->cur_run->base;
753 if (pfile->lookaheads)
756 result = pfile->cur_token++;
759 result = _cpp_lex_direct (pfile);
761 if (result->flags & BOL)
763 /* Is this a directive. If _cpp_handle_directive returns
764 false, it is an assembler #. */
765 if (result->type == CPP_HASH
766 /* 6.10.3 p 11: Directives in a list of macro arguments
767 gives undefined behavior. This implementation
768 handles the directive as normal. */
769 && pfile->state.parsing_args != 1
770 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
772 if (pfile->directive_result.type == CPP_PADDING)
776 result = &pfile->directive_result;
781 if (pfile->cb.line_change && !pfile->state.skipping)
782 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
785 /* We don't skip tokens in directives. */
786 if (pfile->state.in_directive)
789 /* Outside a directive, invalidate controlling macros. At file
790 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
791 get here and MI optimization works. */
792 pfile->mi_valid = false;
794 if (!pfile->state.skipping || result->type == CPP_EOF)
801 /* Returns true if a fresh line has been loaded. */
803 _cpp_get_fresh_line (cpp_reader *pfile)
807 /* We can't get a new line until we leave the current directive. */
808 if (pfile->state.in_directive)
813 cpp_buffer *buffer = pfile->buffer;
815 if (!buffer->need_line)
818 if (buffer->next_line < buffer->rlimit)
820 _cpp_clean_line (pfile);
824 /* First, get out of parsing arguments state. */
825 if (pfile->state.parsing_args)
828 /* End of buffer. Non-empty files should end in a newline. */
829 if (buffer->buf != buffer->rlimit
830 && buffer->next_line > buffer->rlimit
831 && !buffer->from_stage3)
833 /* Only warn once. */
834 buffer->next_line = buffer->rlimit;
835 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
836 CPP_BUF_COLUMN (buffer, buffer->cur),
837 "no newline at end of file");
840 return_at_eof = buffer->return_at_eof;
841 _cpp_pop_buffer (pfile);
842 if (pfile->buffer == NULL || return_at_eof)
847 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE) \
850 result->type = ELSE_TYPE; \
851 if (*buffer->cur == CHAR) \
852 buffer->cur++, result->type = THEN_TYPE; \
856 /* Lex a token into pfile->cur_token, which is also incremented, to
857 get diagnostics pointing to the correct location.
859 Does not handle issues such as token lookahead, multiple-include
860 optimization, directives, skipping etc. This function is only
861 suitable for use by _cpp_lex_token, and in special cases like
862 lex_expansion_token which doesn't care for any of these issues.
864 When meeting a newline, returns CPP_EOF if parsing a directive,
865 otherwise returns to the start of the token buffer if permissible.
866 Returns the location of the lexed token. */
868 _cpp_lex_direct (cpp_reader *pfile)
872 const unsigned char *comment_start;
873 cpp_token *result = pfile->cur_token++;
877 buffer = pfile->buffer;
878 if (buffer->need_line)
880 if (!_cpp_get_fresh_line (pfile))
882 result->type = CPP_EOF;
883 if (!pfile->state.in_directive)
885 /* Tell the compiler the line number of the EOF token. */
886 result->src_loc = pfile->line_table->highest_line;
891 if (!pfile->keep_tokens)
893 pfile->cur_run = &pfile->base_run;
894 result = pfile->base_run.base;
895 pfile->cur_token = result + 1;
898 if (pfile->state.parsing_args == 2)
899 result->flags |= PREV_WHITE;
901 buffer = pfile->buffer;
903 result->src_loc = pfile->line_table->highest_line;
906 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
907 && !pfile->overlaid_buffer)
909 _cpp_process_line_notes (pfile, false);
910 result->src_loc = pfile->line_table->highest_line;
914 LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
915 CPP_BUF_COLUMN (buffer, buffer->cur));
919 case ' ': case '\t': case '\f': case '\v': case '\0':
920 result->flags |= PREV_WHITE;
921 skip_whitespace (pfile, c);
925 if (buffer->cur < buffer->rlimit)
926 CPP_INCREMENT_LINE (pfile, 0);
927 buffer->need_line = true;
930 case '0': case '1': case '2': case '3': case '4':
931 case '5': case '6': case '7': case '8': case '9':
933 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
934 result->type = CPP_NUMBER;
935 lex_number (pfile, &result->val.str, &nst);
936 warn_about_normalization (pfile, result, &nst);
941 /* 'L' may introduce wide characters or strings. */
942 if (*buffer->cur == '\'' || *buffer->cur == '"')
944 lex_string (pfile, result, buffer->cur - 1);
950 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
951 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
952 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
953 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
955 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
956 case 'G': case 'H': case 'I': case 'J': case 'K':
957 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
958 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
960 result->type = CPP_NAME;
962 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
963 result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
965 warn_about_normalization (pfile, result, &nst);
968 /* Convert named operators to their proper types. */
969 if (result->val.node->flags & NODE_OPERATOR)
971 result->flags |= NAMED_OP;
972 result->type = result->val.node->directive_index;
978 lex_string (pfile, result, buffer->cur - 1);
982 /* A potential block or line comment. */
983 comment_start = buffer->cur;
988 if (_cpp_skip_block_comment (pfile))
989 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
991 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
992 || cpp_in_system_header (pfile)))
994 /* Warn about comments only if pedantically GNUC89, and not
995 in system headers. */
996 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
997 && ! buffer->warned_cplusplus_comments)
999 cpp_error (pfile, CPP_DL_PEDWARN,
1000 "C++ style comments are not allowed in ISO C90");
1001 cpp_error (pfile, CPP_DL_PEDWARN,
1002 "(this will be reported only once per input file)");
1003 buffer->warned_cplusplus_comments = 1;
1006 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1007 cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
1012 result->type = CPP_DIV_EQ;
1017 result->type = CPP_DIV;
1021 if (!pfile->state.save_comments)
1023 result->flags |= PREV_WHITE;
1024 goto update_tokens_line;
1027 /* Save the comment as a token in its own right. */
1028 save_comment (pfile, result, comment_start, c);
1032 if (pfile->state.angled_headers)
1034 lex_string (pfile, result, buffer->cur - 1);
1038 result->type = CPP_LESS;
1039 if (*buffer->cur == '=')
1040 buffer->cur++, result->type = CPP_LESS_EQ;
1041 else if (*buffer->cur == '<')
1044 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1046 else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
1049 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1051 else if (CPP_OPTION (pfile, digraphs))
1053 if (*buffer->cur == ':')
1056 result->flags |= DIGRAPH;
1057 result->type = CPP_OPEN_SQUARE;
1059 else if (*buffer->cur == '%')
1062 result->flags |= DIGRAPH;
1063 result->type = CPP_OPEN_BRACE;
1069 result->type = CPP_GREATER;
1070 if (*buffer->cur == '=')
1071 buffer->cur++, result->type = CPP_GREATER_EQ;
1072 else if (*buffer->cur == '>')
1075 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1077 else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
1080 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1085 result->type = CPP_MOD;
1086 if (*buffer->cur == '=')
1087 buffer->cur++, result->type = CPP_MOD_EQ;
1088 else if (CPP_OPTION (pfile, digraphs))
1090 if (*buffer->cur == ':')
1093 result->flags |= DIGRAPH;
1094 result->type = CPP_HASH;
1095 if (*buffer->cur == '%' && buffer->cur[1] == ':')
1096 buffer->cur += 2, result->type = CPP_PASTE;
1098 else if (*buffer->cur == '>')
1101 result->flags |= DIGRAPH;
1102 result->type = CPP_CLOSE_BRACE;
1108 result->type = CPP_DOT;
1109 if (ISDIGIT (*buffer->cur))
1111 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1112 result->type = CPP_NUMBER;
1113 lex_number (pfile, &result->val.str, &nst);
1114 warn_about_normalization (pfile, result, &nst);
1116 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1117 buffer->cur += 2, result->type = CPP_ELLIPSIS;
1118 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1119 buffer->cur++, result->type = CPP_DOT_STAR;
1123 result->type = CPP_PLUS;
1124 if (*buffer->cur == '+')
1125 buffer->cur++, result->type = CPP_PLUS_PLUS;
1126 else if (*buffer->cur == '=')
1127 buffer->cur++, result->type = CPP_PLUS_EQ;
1131 result->type = CPP_MINUS;
1132 if (*buffer->cur == '>')
1135 result->type = CPP_DEREF;
1136 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1137 buffer->cur++, result->type = CPP_DEREF_STAR;
1139 else if (*buffer->cur == '-')
1140 buffer->cur++, result->type = CPP_MINUS_MINUS;
1141 else if (*buffer->cur == '=')
1142 buffer->cur++, result->type = CPP_MINUS_EQ;
1146 result->type = CPP_AND;
1147 if (*buffer->cur == '&')
1148 buffer->cur++, result->type = CPP_AND_AND;
1149 else if (*buffer->cur == '=')
1150 buffer->cur++, result->type = CPP_AND_EQ;
1154 result->type = CPP_OR;
1155 if (*buffer->cur == '|')
1156 buffer->cur++, result->type = CPP_OR_OR;
1157 else if (*buffer->cur == '=')
1158 buffer->cur++, result->type = CPP_OR_EQ;
1162 result->type = CPP_COLON;
1163 if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1164 buffer->cur++, result->type = CPP_SCOPE;
1165 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1168 result->flags |= DIGRAPH;
1169 result->type = CPP_CLOSE_SQUARE;
1173 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1174 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1175 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1176 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1177 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1179 case '?': result->type = CPP_QUERY; break;
1180 case '~': result->type = CPP_COMPL; break;
1181 case ',': result->type = CPP_COMMA; break;
1182 case '(': result->type = CPP_OPEN_PAREN; break;
1183 case ')': result->type = CPP_CLOSE_PAREN; break;
1184 case '[': result->type = CPP_OPEN_SQUARE; break;
1185 case ']': result->type = CPP_CLOSE_SQUARE; break;
1186 case '{': result->type = CPP_OPEN_BRACE; break;
1187 case '}': result->type = CPP_CLOSE_BRACE; break;
1188 case ';': result->type = CPP_SEMICOLON; break;
1190 /* @ is a punctuator in Objective-C. */
1191 case '@': result->type = CPP_ATSIGN; break;
1196 const uchar *base = --buffer->cur;
1197 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1199 if (forms_identifier_p (pfile, true, &nst))
1201 result->type = CPP_NAME;
1202 result->val.node = lex_identifier (pfile, base, true, &nst);
1203 warn_about_normalization (pfile, result, &nst);
1210 create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1217 /* An upper bound on the number of bytes needed to spell TOKEN.
1218 Does not include preceding whitespace. */
1220 cpp_token_len (const cpp_token *token)
1224 switch (TOKEN_SPELL (token))
1226 default: len = 4; break;
1227 case SPELL_LITERAL: len = token->val.str.len; break;
1228 case SPELL_IDENT: len = NODE_LEN (token->val.node) * 10; break;
1234 /* Parse UTF-8 out of NAME and place a \U escape in BUFFER.
1235 Return the number of bytes read out of NAME. (There are always
1236 10 bytes written to BUFFER.) */
1239 utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
1245 unsigned long utf32;
1247 /* Compute the length of the UTF-8 sequence. */
1248 for (t = *name; t & 0x80; t <<= 1)
1251 utf32 = *name & (0x7F >> ucn_len);
1252 for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
1254 utf32 = (utf32 << 6) | (*++name & 0x3F);
1256 /* Ill-formed UTF-8. */
1257 if ((*name & ~0x3F) != 0x80)
1263 for (j = 7; j >= 0; j--)
1264 *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
1269 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1270 already contain enough space to hold the token's spelling.
1271 Returns a pointer to the character after the last character written.
1272 FORSTRING is true if this is to be the spelling after translation
1273 phase 1 (this is different for UCNs).
1274 FIXME: Would be nice if we didn't need the PFILE argument. */
1276 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1277 unsigned char *buffer, bool forstring)
1279 switch (TOKEN_SPELL (token))
1281 case SPELL_OPERATOR:
1283 const unsigned char *spelling;
1286 if (token->flags & DIGRAPH)
1288 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1289 else if (token->flags & NAMED_OP)
1292 spelling = TOKEN_NAME (token);
1294 while ((c = *spelling++) != '\0')
1303 memcpy (buffer, NODE_NAME (token->val.node),
1304 NODE_LEN (token->val.node));
1305 buffer += NODE_LEN (token->val.node);
1310 const unsigned char * name = NODE_NAME (token->val.node);
1312 for (i = 0; i < NODE_LEN (token->val.node); i++)
1313 if (name[i] & ~0x7F)
1315 i += utf8_to_ucn (buffer, name + i) - 1;
1319 *buffer++ = NODE_NAME (token->val.node)[i];
1324 memcpy (buffer, token->val.str.text, token->val.str.len);
1325 buffer += token->val.str.len;
1329 cpp_error (pfile, CPP_DL_ICE,
1330 "unspellable token %s", TOKEN_NAME (token));
1337 /* Returns TOKEN spelt as a null-terminated string. The string is
1338 freed when the reader is destroyed. Useful for diagnostics. */
1340 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1342 unsigned int len = cpp_token_len (token) + 1;
1343 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1345 end = cpp_spell_token (pfile, token, start, false);
1351 /* Used by C front ends, which really should move to using
1352 cpp_token_as_text. */
1354 cpp_type2name (enum cpp_ttype type)
1356 return (const char *) token_spellings[type].name;
1359 /* Writes the spelling of token to FP, without any preceding space.
1360 Separated from cpp_spell_token for efficiency - to avoid stdio
1361 double-buffering. */
1363 cpp_output_token (const cpp_token *token, FILE *fp)
1365 switch (TOKEN_SPELL (token))
1367 case SPELL_OPERATOR:
1369 const unsigned char *spelling;
1372 if (token->flags & DIGRAPH)
1374 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1375 else if (token->flags & NAMED_OP)
1378 spelling = TOKEN_NAME (token);
1383 while ((c = *++spelling) != '\0');
1391 const unsigned char * name = NODE_NAME (token->val.node);
1393 for (i = 0; i < NODE_LEN (token->val.node); i++)
1394 if (name[i] & ~0x7F)
1396 unsigned char buffer[10];
1397 i += utf8_to_ucn (buffer, name + i) - 1;
1398 fwrite (buffer, 1, 10, fp);
1401 fputc (NODE_NAME (token->val.node)[i], fp);
1406 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1410 /* An error, most probably. */
1415 /* Compare two tokens. */
1417 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1419 if (a->type == b->type && a->flags == b->flags)
1420 switch (TOKEN_SPELL (a))
1422 default: /* Keep compiler happy. */
1423 case SPELL_OPERATOR:
1426 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1428 return a->val.node == b->val.node;
1430 return (a->val.str.len == b->val.str.len
1431 && !memcmp (a->val.str.text, b->val.str.text,
1438 /* Returns nonzero if a space should be inserted to avoid an
1439 accidental token paste for output. For simplicity, it is
1440 conservative, and occasionally advises a space where one is not
1441 needed, e.g. "." and ".2". */
1443 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1444 const cpp_token *token2)
1446 enum cpp_ttype a = token1->type, b = token2->type;
1449 if (token1->flags & NAMED_OP)
1451 if (token2->flags & NAMED_OP)
1455 if (token2->flags & DIGRAPH)
1456 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1457 else if (token_spellings[b].category == SPELL_OPERATOR)
1458 c = token_spellings[b].name[0];
1460 /* Quickly get everything that can paste with an '='. */
1461 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1466 case CPP_GREATER: return c == '>' || c == '?';
1467 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1468 case CPP_PLUS: return c == '+';
1469 case CPP_MINUS: return c == '-' || c == '>';
1470 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1471 case CPP_MOD: return c == ':' || c == '>';
1472 case CPP_AND: return c == '&';
1473 case CPP_OR: return c == '|';
1474 case CPP_COLON: return c == ':' || c == '>';
1475 case CPP_DEREF: return c == '*';
1476 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1477 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1478 case CPP_NAME: return ((b == CPP_NUMBER
1479 && name_p (pfile, &token2->val.str))
1481 || b == CPP_CHAR || b == CPP_STRING); /* L */
1482 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1483 || c == '.' || c == '+' || c == '-');
1485 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
1487 || (CPP_OPTION (pfile, objc)
1488 && token1->val.str.text[0] == '@'
1489 && (b == CPP_NAME || b == CPP_STRING)));
1496 /* Output all the remaining tokens on the current line, and a newline
1497 character, to FP. Leading whitespace is removed. If there are
1498 macros, special token padding is not performed. */
1500 cpp_output_line (cpp_reader *pfile, FILE *fp)
1502 const cpp_token *token;
1504 token = cpp_get_token (pfile);
1505 while (token->type != CPP_EOF)
1507 cpp_output_token (token, fp);
1508 token = cpp_get_token (pfile);
1509 if (token->flags & PREV_WHITE)
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF's cur
   and limit.  Every macro argument is parenthesized so that an
   expression argument expands with the intended precedence.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1531 /* Create a new allocation buffer. Place the control block at the end
1532 of the buffer, so that buffer overflows will cause immediate chaos. */
1534 new_buff (size_t len)
1537 unsigned char *base;
1539 if (len < MIN_BUFF_SIZE)
1540 len = MIN_BUFF_SIZE;
1541 len = CPP_ALIGN (len);
1543 base = xmalloc (len + sizeof (_cpp_buff));
1544 result = (_cpp_buff *) (base + len);
1545 result->base = base;
1547 result->limit = base + len;
1548 result->next = NULL;
1552 /* Place a chain of unwanted allocation buffers on the free list. */
1554 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1556 _cpp_buff *end = buff;
1560 end->next = pfile->free_buffs;
1561 pfile->free_buffs = buff;
1564 /* Return a free buffer of size at least MIN_SIZE. */
1566 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1568 _cpp_buff *result, **p;
1570 for (p = &pfile->free_buffs;; p = &(*p)->next)
1575 return new_buff (min_size);
1577 size = result->limit - result->base;
1578 /* Return a buffer that's big enough, but don't waste one that's
1580 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1585 result->next = NULL;
1586 result->cur = result->base;
1590 /* Creates a new buffer with enough space to hold the uncommitted
1591 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
1592 the excess bytes to the new buffer. Chains the new buffer after
1593 BUFF, and returns the new buffer. */
1595 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1597 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1598 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1600 buff->next = new_buff;
1601 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1605 /* Creates a new buffer with enough space to hold the uncommitted
1606 remaining bytes of the buffer pointed to by BUFF, and at least
1607 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
1608 Chains the new buffer before the buffer pointed to by BUFF, and
1609 updates the pointer to point to the new buffer. */
1611 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1613 _cpp_buff *new_buff, *old_buff = *pbuff;
1614 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1616 new_buff = _cpp_get_buff (pfile, size);
1617 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1618 new_buff->next = old_buff;
1622 /* Free a chain of buffers starting at BUFF. */
1624 _cpp_free_buff (_cpp_buff *buff)
1628 for (; buff; buff = next)
1635 /* Allocate permanent, unaligned storage of length LEN. */
1637 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1639 _cpp_buff *buff = pfile->u_buff;
1640 unsigned char *result = buff->cur;
1642 if (len > (size_t) (buff->limit - result))
1644 buff = _cpp_get_buff (pfile, len);
1645 buff->next = pfile->u_buff;
1646 pfile->u_buff = buff;
1650 buff->cur = result + len;
1654 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1655 That buffer is used for growing allocations when saving macro
1656 replacement lists in a #define, and when parsing an answer to an
1657 assertion in #assert, #unassert or #if (and therefore possibly
1658 whilst expanding macros). It therefore must not be used by any
1659 code that they might call: specifically the lexer and the guts of
1662 All existing other uses clearly fit this restriction: storing
1663 registered pragmas during initialization. */
1665 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1667 _cpp_buff *buff = pfile->a_buff;
1668 unsigned char *result = buff->cur;
1670 if (len > (size_t) (buff->limit - result))
1672 buff = _cpp_get_buff (pfile, len);
1673 buff->next = pfile->a_buff;
1674 pfile->a_buff = buff;
1678 buff->cur = result + len;
1682 /* Say which field of TOK is in use. */
1684 enum cpp_token_fld_kind
1685 cpp_token_val_index (cpp_token *tok)
1687 switch (TOKEN_SPELL (tok))
1690 return CPP_TOKEN_FLD_NODE;
1692 return CPP_TOKEN_FLD_STR;
1694 if (tok->type == CPP_MACRO_ARG)
1695 return CPP_TOKEN_FLD_ARG_NO;
1696 else if (tok->type == CPP_PADDING)
1697 return CPP_TOKEN_FLD_SOURCE;
1698 else if (tok->type == CPP_PRAGMA)
1699 return CPP_TOKEN_FLD_STR;
1700 /* else fall through */
1702 return CPP_TOKEN_FLD_NONE;