gcc/cpplex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7    Single-pass line tokenization by Neil Booth, April 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 2, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; if not, write to the Free Software
  21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "cpphash.h"
  27
  28 /* MULTIBYTE_CHARS support only works for native compilers.
  29    ??? Ideally what we want is to model widechar support after
  30    the current floating point support.  */
  31 #ifdef CROSS_COMPILE
  32 #undef MULTIBYTE_CHARS
  33 #endif
  34
  35 #ifdef MULTIBYTE_CHARS
  36 #include "mbchar.h"
  37 #include <locale.h>
  38 #endif
  39
/* Spelling categories.  Each token type is assigned one of these in
   the token_spellings table.  Tokens with SPELL_STRING store their
   spelling in the token list, and its length in
   token->val.name.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,
  SPELL_NONE
};
  51
/* One entry of the token_spellings table: the spelling category of a
   token type together with a name string for it.  */
struct token_spelling
{
  /* Spelling category of the token type.  */
  enum spell_type category;
  /* Operator text for OP entries; the stringified enumerator for TK
     entries.  */
  const unsigned char *name;
};
  57
/* Spellings of the digraphs.  NOTE(review): presumably indexed by a
   digraph identifier carried by the token; the indexing code is not
   visible here -- confirm against the user of this table.  */
static const unsigned char *const digraph_spellings[] =
{ U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };

/* Build token_spellings by expanding TTYPE_TABLE.  An OP entry gets
   category SPELL_OPERATOR and is named by its operator text S; a TK
   entry gets category S and is named by its stringified enumerator E.
   The helper macros are undefined again once the table is built.  */
#define OP(e, s) { SPELL_OPERATOR, U s           },
#define TK(e, s) { s,              U STRINGX (e) },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK
  66
/* Spelling category and name of TOKEN, looked up in token_spellings
   by the token's type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
/* Rewind the read position to buffer->backup_to.  Requires a variable
   named `buffer' to be in scope at the point of use.  */
#define BACKUP() do {buffer->cur = buffer->backup_to;} while (0)
  70
/* Forward declarations of the file-local helpers.  PARAMS wraps each
   parameter list, matching the old-style (K&R) definitions used in
   this file.  */

/* Newline, trigraph and escaped-newline handling.  */
static void handle_newline PARAMS ((cpp_reader *));
static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
static cppchar_t get_effective_char PARAMS ((cpp_reader *));

/* Comment and whitespace skipping; token body parsers.  */
static int skip_block_comment PARAMS ((cpp_reader *));
static int skip_line_comment PARAMS ((cpp_reader *));
static void adjust_column PARAMS ((cpp_reader *));
static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
static U_CHAR *parse_slow PARAMS ((cpp_reader *, const U_CHAR *, int,
                                   unsigned int *));
static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
static bool trigraph_p PARAMS ((cpp_reader *));
static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
static int name_p PARAMS ((cpp_reader *, const cpp_string *));
static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
                                   const unsigned char *, unsigned int *));
static tokenrun *next_tokenrun PARAMS ((tokenrun *));

static unsigned int hex_digit_value PARAMS ((unsigned int));
static _cpp_buff *new_buff PARAMS ((size_t));
  94
  95 /* Utility routine:
  96
  97    Compares, the token TOKEN to the NUL-terminated string STRING.
  98    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  99 int
 100 cpp_ideq (token, string)
 101      const cpp_token *token;
 102      const char *string;
 103 {
 104   if (token->type != CPP_NAME)
 105     return 0;
 106
 107   return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
 108 }
 109
 110 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
 111    Returns with buffer->cur pointing to the character immediately
 112    following the newline (combination).  */
 113 static void
 114 handle_newline (pfile)
 115      cpp_reader *pfile;
 116 {
 117   cpp_buffer *buffer = pfile->buffer;
 118
 119   /* Handle CR-LF and LF-CR.  Most other implementations (e.g. java)
 120      only accept CR-LF; maybe we should fall back to that behaviour?  */
 121   if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
 122     buffer->cur++;
 123
 124   buffer->line_base = buffer->cur;
 125   buffer->col_adjust = 0;
 126   pfile->line++;
 127 }
 128
 129 /* Subroutine of skip_escaped_newlines; called when a 3-character
 130    sequence beginning with "??" is encountered.  buffer->cur points to
 131    the second '?'.
 132
 133    Warn if necessary, and returns true if the sequence forms a
 134    trigraph and the trigraph should be honoured.  */
 135 static bool
 136 trigraph_p (pfile)
 137      cpp_reader *pfile;
 138 {
 139   cpp_buffer *buffer = pfile->buffer;
 140   cppchar_t from_char = buffer->cur[1];
 141   bool accept;
 142
 143   if (!_cpp_trigraph_map[from_char])
 144     return false;
 145
 146   accept = CPP_OPTION (pfile, trigraphs);
 147
 148   /* Don't warn about trigraphs in comments.  */
 149   if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
 150     {
 151       if (accept)
 152         cpp_warning_with_line (pfile, pfile->line, CPP_BUF_COL (buffer) - 1,
 153                                "trigraph ??%c converted to %c",
 154                                (int) from_char,
 155                                (int) _cpp_trigraph_map[from_char]);
 156       else if (buffer->cur != buffer->last_Wtrigraphs)
 157         {
 158           buffer->last_Wtrigraphs = buffer->cur;
 159           cpp_warning_with_line (pfile, pfile->line,
 160                                  CPP_BUF_COL (buffer) - 1,
 161                                  "trigraph ??%c ignored", (int) from_char);
 162         }
 163     }
 164
 165   return accept;
 166 }
 167
/* Skips any escaped newlines introduced by '?' or a '\\', assumed to
   lie in buffer->cur[-1].  Returns the next byte, which will be in
   buffer->cur[-1]; a honoured trigraph is returned as the character
   it stands for.  The one exception is a backslash-newline at the
   very end of the buffer, for which EOF is returned.  This routine
   performs preprocessing stages 1 and 2 of the ISO C standard.

   After each escaped newline, buffer->backup_to is set to the first
   character of the following line.  Buffers marked from_stage3 are
   not processed at all: the character in buffer->cur[-1] is returned
   unchanged.  */
static cppchar_t
skip_escaped_newlines (pfile)
     cpp_reader *pfile;
{
  cpp_buffer *buffer = pfile->buffer;
  cppchar_t next = buffer->cur[-1];

  /* Only do this if we apply stages 1 and 2.  */
  if (!buffer->from_stage3)
    {
      const unsigned char *saved_cur;
      cppchar_t next1;

      do
        {
          if (next == '?')
            {
              /* A lone '?', a non-trigraph, or a trigraph that is not
                 to be honoured: return the '?' itself.  */
              if (buffer->cur[0] != '?' || !trigraph_p (pfile))
                break;

              /* Translate the trigraph.  */
              next = _cpp_trigraph_map[buffer->cur[1]];
              buffer->cur += 2;
              /* Only the trigraph for a backslash can start an
                 escaped newline; anything else is returned as is.  */
              if (next != '\\')
                break;
            }

          /* A backslash that is the last character of the buffer
             cannot escape anything.  */
          if (buffer->cur == buffer->rlimit)
            break;

          /* We have a backslash, and room for at least one more
             character.  Skip horizontal whitespace.  */
          saved_cur = buffer->cur;
          do
            next1 = *buffer->cur++;
          while (is_nvspace (next1) && buffer->cur < buffer->rlimit);

          /* No newline follows: it is an ordinary backslash.  Undo
             the whitespace skip and return it.  */
          if (!is_vspace (next1))
            {
              buffer->cur = saved_cur;
              break;
            }

          /* More than just the newline was consumed, so whitespace
             separated it from the backslash.  */
          if (saved_cur != buffer->cur - 1
              && !pfile->state.lexing_comment)
            cpp_warning (pfile, "backslash and newline separated by space");

          handle_newline (pfile);
          /* Backing up from here must not re-read the escaped
             newline.  */
          buffer->backup_to = buffer->cur;
          if (buffer->cur == buffer->rlimit)
            {
              cpp_pedwarn (pfile, "backslash-newline at end of file");
              next = EOF;
            }
          else
            next = *buffer->cur++;
        }
      /* The character after an escaped newline may itself begin
         another one.  */
      while (next == '\\' || next == '?');
    }

  return next;
}
 234
 235 /* Obtain the next character, after trigraph conversion and skipping
 236    an arbitrarily long string of escaped newlines.  The common case of
 237    no trigraphs or escaped newlines falls through quickly.  On return,
 238    buffer->backup_to points to where to return to if the character is
 239    not to be processed.  */
 240 static cppchar_t
 241 get_effective_char (pfile)
 242      cpp_reader *pfile;
 243 {
 244   cppchar_t next;
 245   cpp_buffer *buffer = pfile->buffer;
 246
 247   buffer->backup_to = buffer->cur;
 248   next = *buffer->cur++;
 249   if (__builtin_expect (next == '?' || next == '\\', 0))
 250     next = skip_escaped_newlines (pfile);
 251
 252    return next;
 253 }
 254
 255 /* Skip a C-style block comment.  We find the end of the comment by
 256    seeing if an asterisk is before every '/' we encounter.  Returns
 257    non-zero if comment terminated by EOF, zero otherwise.  */
 258 static int
 259 skip_block_comment (pfile)
 260      cpp_reader *pfile;
 261 {
 262   cpp_buffer *buffer = pfile->buffer;
 263   cppchar_t c = EOF, prevc = EOF;
 264
 265   pfile->state.lexing_comment = 1;
 266   while (buffer->cur != buffer->rlimit)
 267     {
 268       prevc = c, c = *buffer->cur++;
 269
 270       /* FIXME: For speed, create a new character class of characters
 271          of interest inside block comments.  */
 272       if (c == '?' || c == '\\')
 273         c = skip_escaped_newlines (pfile);
 274
 275       /* People like decorating comments with '*', so check for '/'
 276          instead for efficiency.  */
 277       if (c == '/')
 278         {
 279           if (prevc == '*')
 280             break;
 281
 282           /* Warn about potential nested comments, but not if the '/'
 283              comes immediately before the true comment delimiter.
 284              Don't bother to get it right across escaped newlines.  */
 285           if (CPP_OPTION (pfile, warn_comments)
 286               && buffer->cur[0] == '*' && buffer->cur[1] != '/')
 287             cpp_warning_with_line (pfile,
 288                                    pfile->line, CPP_BUF_COL (buffer),
 289                                    "\"/*\" within comment");
 290         }
 291       else if (is_vspace (c))
 292         handle_newline (pfile);
 293       else if (c == '\t')
 294         adjust_column (pfile);
 295     }
 296
 297   pfile->state.lexing_comment = 0;
 298   return c != '/' || prevc != '*';
 299 }
 300
 301 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 302    terminating newline.  Handles escaped newlines.  Returns non-zero
 303    if a multiline comment.  */
 304 static int
 305 skip_line_comment (pfile)
 306      cpp_reader *pfile;
 307 {
 308   cpp_buffer *buffer = pfile->buffer;
 309   unsigned int orig_line = pfile->line;
 310   cppchar_t c;
 311
 312   pfile->state.lexing_comment = 1;
 313   do
 314     {
 315       if (buffer->cur == buffer->rlimit)
 316         goto at_eof;
 317
 318       c = *buffer->cur++;
 319       if (c == '?' || c == '\\')
 320         c = skip_escaped_newlines (pfile);
 321     }
 322   while (!is_vspace (c));
 323
 324   /* Step back over the newline, except at EOF.  */
 325   buffer->cur--;
 326  at_eof:
 327
 328   pfile->state.lexing_comment = 0;
 329   return orig_line != pfile->line;
 330 }
 331
 332 /* pfile->buffer->cur is one beyond the \t character.  Update
 333    col_adjust so we track the column correctly.  */
 334 static void
 335 adjust_column (pfile)
 336      cpp_reader *pfile;
 337 {
 338   cpp_buffer *buffer = pfile->buffer;
 339   unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column.  */
 340
 341   /* Round it up to multiple of the tabstop, but subtract 1 since the
 342      tab itself occupies a character position.  */
 343   buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
 344                          - col % CPP_OPTION (pfile, tabstop)) - 1;
 345 }
 346
 347 /* Skips whitespace, saving the next non-whitespace character.
 348    Adjusts pfile->col_adjust to account for tabs.  Without this,
 349    tokens might be assigned an incorrect column.  */
 350 static int
 351 skip_whitespace (pfile, c)
 352      cpp_reader *pfile;
 353      cppchar_t c;
 354 {
 355   cpp_buffer *buffer = pfile->buffer;
 356   unsigned int warned = 0;
 357
 358   do
 359     {
 360       /* Horizontal space always OK.  */
 361       if (c == ' ')
 362         ;
 363       else if (c == '\t')
 364         adjust_column (pfile);
 365       /* Just \f \v or \0 left.  */
 366       else if (c == '\0')
 367         {
 368           if (buffer->cur - 1 == buffer->rlimit)
 369             return 0;
 370           if (!warned)
 371             {
 372               cpp_warning (pfile, "null character(s) ignored");
 373               warned = 1;
 374             }
 375         }
 376       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 377         cpp_pedwarn_with_line (pfile, pfile->line,
 378                                CPP_BUF_COL (buffer),
 379                                "%s in preprocessing directive",
 380                                c == '\f' ? "form feed" : "vertical tab");
 381
 382       c = *buffer->cur++;
 383     }
 384   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 385   while (is_nvspace (c));
 386
 387   buffer->cur--;
 388   return 1;
 389 }
 390
 391 /* See if the characters of a number token are valid in a name (no
 392    '.', '+' or '-').  */
 393 static int
 394 name_p (pfile, string)
 395      cpp_reader *pfile;
 396      const cpp_string *string;
 397 {
 398   unsigned int i;
 399
 400   for (i = 0; i < string->len; i++)
 401     if (!is_idchar (string->text[i]))
 402       return 0;
 403
 404   return 1;
 405 }
 406
 407 /* Parse an identifier, skipping embedded backslash-newlines.  This is
 408    a critical inner loop.  The common case is an identifier which has
 409    not been split by backslash-newline, does not contain a dollar
 410    sign, and has already been scanned (roughly 10:1 ratio of
 411    seen:unseen identifiers in normal code; the distribution is
 412    Poisson-like).  Second most common case is a new identifier, not
 413    split and no dollar sign.  The other possibilities are rare and
 414    have been relegated to parse_slow.  */
 415 static cpp_hashnode *
 416 parse_identifier (pfile)
 417      cpp_reader *pfile;
 418 {
 419   cpp_hashnode *result;
 420   const U_CHAR *cur, *base;
 421
 422   /* Fast-path loop.  Skim over a normal identifier.
 423      N.B. ISIDNUM does not include $.  */
 424   cur = pfile->buffer->cur;
 425   while (ISIDNUM (*cur))
 426     cur++;
 427
 428   /* Check for slow-path cases.  */
 429   if (*cur == '?' || *cur == '\\' || *cur == '$')
 430     {
 431       unsigned int len;
 432
 433       base = parse_slow (pfile, cur, 0, &len);
 434       result = (cpp_hashnode *)
 435         ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
 436     }
 437   else
 438     {
 439       base = pfile->buffer->cur - 1;
 440       pfile->buffer->cur = cur;
 441       result = (cpp_hashnode *)
 442         ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
 443     }
 444
 445   /* Rarely, identifiers require diagnostics when lexed.
 446      XXX Has to be forced out of the fast path.  */
 447   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 448                         && !pfile->state.skipping, 0))
 449     {
 450       /* It is allowed to poison the same identifier twice.  */
 451       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 452         cpp_error (pfile, "attempt to use poisoned \"%s\"",
 453                    NODE_NAME (result));
 454
 455       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 456          replacement list of a variadic macro.  */
 457       if (result == pfile->spec_nodes.n__VA_ARGS__
 458           && !pfile->state.va_args_ok)
 459         cpp_pedwarn (pfile,
 460         "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
 461     }
 462
 463   return result;
 464 }
 465
/* Slow path.  This handles numbers and identifiers which have been
   split, or contain dollar signs.  The part of the token from
   PFILE->buffer->cur-1 to CUR has already been scanned.  NUMBER_P is
   0 for an identifier, 1 if it's a number, and 2 if it is a number
   with a leading period.  The spelling is accumulated on the hash
   table's obstack.  Returns a pointer to the token's NUL-terminated
   spelling in permanent storage, and sets PLEN to its length (not
   counting the NUL).  On return buffer->cur has been backed up to the
   first character that is not part of the token.  */
static U_CHAR *
parse_slow (pfile, cur, number_p, plen)
     cpp_reader *pfile;
     const U_CHAR *cur;
     int number_p;
     unsigned int *plen;
{
  cpp_buffer *buffer = pfile->buffer;
  const U_CHAR *base = buffer->cur - 1;
  struct obstack *stack = &pfile->hash_table->stack;
  unsigned int c, prevc, saw_dollar = 0;

  /* Place any leading period.  */
  if (number_p == 2)
    obstack_1grow (stack, '.');

  /* Copy the part of the token which is known to be okay.  */
  obstack_grow (stack, base, cur - base);

  /* Now process the part which isn't.  We are looking at one of
     '$', '\\', or '?' on entry to this loop.  */
  prevc = cur[-1];
  c = *cur++;
  buffer->cur = cur;
  for (;;)
    {
      /* Potential escaped newline?  Remember where C was read from,
         so that BACKUP below can un-read it if it ends the token.  */
      buffer->backup_to = buffer->cur - 1;
      if (c == '?' || c == '\\')
        c = skip_escaped_newlines (pfile);

      if (!is_idchar (c))
        {
          /* Identifiers end at the first non-identifier character.
             Numbers additionally accept '.', and a sign that
             VALID_SIGN accepts given the previous character.  */
          if (!number_p)
            break;
          if (c != '.' && !VALID_SIGN (c, prevc))
            break;
        }

      /* Handle normal identifier characters in this loop.  */
      do
        {
          prevc = c;
          obstack_1grow (stack, c);

          if (c == '$')
            saw_dollar++;

          c = *buffer->cur++;
        }
      while (is_idchar (c));
    }

  /* Step back over the unwanted char.  */
  BACKUP ();

  /* $ is not an identifier character in the standard, but is commonly
     accepted as an extension.  Don't warn about it in skipped
     conditional blocks.  */
  if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
    cpp_pedwarn (pfile, "'$' character(s) in identifier or number");

  /* Identifiers and numbers are null-terminated.  The length is
     taken before the NUL is appended, so it excludes it.  */
  *plen = obstack_object_size (stack);
  obstack_1grow (stack, '\0');
  return obstack_finish (stack);
}
 539
 540 /* Parse a number, beginning with character C, skipping embedded
 541    backslash-newlines.  LEADING_PERIOD is non-zero if there was a "."
 542    before C.  Place the result in NUMBER.  */
 543 static void
 544 parse_number (pfile, number, leading_period)
 545      cpp_reader *pfile;
 546      cpp_string *number;
 547      int leading_period;
 548 {
 549   const U_CHAR *cur;
 550
 551   /* Fast-path loop.  Skim over a normal number.
 552      N.B. ISIDNUM does not include $.  */
 553   cur = pfile->buffer->cur;
 554   while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 555     cur++;
 556
 557   /* Check for slow-path cases.  */
 558   if (*cur == '?' || *cur == '\\' || *cur == '$')
 559     number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
 560   else
 561     {
 562       const U_CHAR *base = pfile->buffer->cur - 1;
 563       U_CHAR *dest;
 564
 565       number->len = cur - base + leading_period;
 566       dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 567       dest[number->len] = '\0';
 568       number->text = dest;
 569
 570       if (leading_period)
 571         *dest++ = '.';
 572       memcpy (dest, base, cur - base);
 573       pfile->buffer->cur = cur;
 574     }
 575 }
 576
 577 /* Subroutine of parse_string.  */
 578 static int
 579 unescaped_terminator_p (pfile, dest)
 580      cpp_reader *pfile;
 581      const unsigned char *dest;
 582 {
 583   const unsigned char *start, *temp;
 584
 585   /* In #include-style directives, terminators are not escapeable.  */
 586   if (pfile->state.angled_headers)
 587     return 1;
 588
 589   start = BUFF_FRONT (pfile->u_buff);
 590
 591   /* An odd number of consecutive backslashes represents an escaped
 592      terminator.  */
 593   for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
 594     ;
 595
 596   return ((dest - temp) & 1) == 0;
 597 }
 598
 599 /* Parses a string, character constant, or angle-bracketed header file
 600    name.  Handles embedded trigraphs and escaped newlines.  The stored
 601    string is guaranteed NUL-terminated, but it is not guaranteed that
 602    this is the first NUL since embedded NULs are preserved.
 603
 604    When this function returns, buffer->cur points to the next
 605    character to be processed.  */
 606 static void
 607 parse_string (pfile, token, terminator)
 608      cpp_reader *pfile;
 609      cpp_token *token;
 610      cppchar_t terminator;
 611 {
 612   cpp_buffer *buffer = pfile->buffer;
 613   unsigned char *dest, *limit;
 614   cppchar_t c;
 615   bool warned_nulls = false;
 616
 617   dest = BUFF_FRONT (pfile->u_buff);
 618   limit = BUFF_LIMIT (pfile->u_buff);
 619
 620   for (;;)
 621     {
 622       /* We need room for another char, possibly the terminating NUL.  */
 623       if ((size_t) (limit - dest) < 1)
 624         {
 625           size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
 626           _cpp_extend_buff (pfile, &pfile->u_buff, 2);
 627           dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
 628           limit = BUFF_LIMIT (pfile->u_buff);
 629         }
 630
 631       /* Handle trigraphs, escaped newlines etc.  */
 632       c = *buffer->cur++;
 633       if (c == '?' || c == '\\')
 634         c = skip_escaped_newlines (pfile);
 635
 636       if (c == terminator)
 637         {
 638           if (unescaped_terminator_p (pfile, dest))
 639             break;
 640         }
 641       else if (is_vspace (c))
 642         {
 643           /* No string literal may extend over multiple lines.  In
 644              assembly language, suppress the error except for <>
 645              includes.  This is a kludge around not knowing where
 646              comments are.  */
 647         unterminated:
 648           if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
 649             cpp_error (pfile, "missing terminating %c character", terminator);
 650           buffer->cur--;
 651           break;
 652         }
 653       else if (c == '\0')
 654         {
 655           if (buffer->cur - 1 == buffer->rlimit)
 656             goto unterminated;
 657           if (!warned_nulls)
 658             {
 659               warned_nulls = true;
 660               cpp_warning (pfile, "null character(s) preserved in literal");
 661             }
 662         }
 663
 664       *dest++ = c;
 665     }
 666
 667   *dest = '\0';
 668
 669   token->val.str.text = BUFF_FRONT (pfile->u_buff);
 670   token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
 671   BUFF_FRONT (pfile->u_buff) = dest + 1;
 672 }
 673
 674 /* The stored comment includes the comment start and any terminator.  */
 675 static void
 676 save_comment (pfile, token, from)
 677      cpp_reader *pfile;
 678      cpp_token *token;
 679      const unsigned char *from;
 680 {
 681   unsigned char *buffer;
 682   unsigned int len;
 683
 684   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
 685
 686   /* C++ comments probably (not definitely) have moved past a new
 687      line, which we don't want to save in the comment.  */
 688   if (is_vspace (pfile->buffer->cur[-1]))
 689     len--;
 690   buffer = _cpp_unaligned_alloc (pfile, len);
 691
 692   token->type = CPP_COMMENT;
 693   token->val.str.len = len;
 694   token->val.str.text = buffer;
 695
 696   buffer[0] = '/';
 697   memcpy (buffer + 1, from, len - 1);
 698 }
 699
 700 /* Allocate COUNT tokens for RUN.  */
 701 void
 702 _cpp_init_tokenrun (run, count)
 703      tokenrun *run;
 704      unsigned int count;
 705 {
 706   run->base = xnewvec (cpp_token, count);
 707   run->limit = run->base + count;
 708   run->next = NULL;
 709 }
 710
 711 /* Returns the next tokenrun, or creates one if there is none.  */
 712 static tokenrun *
 713 next_tokenrun (run)
 714      tokenrun *run;
 715 {
 716   if (run->next == NULL)
 717     {
 718       run->next = xnew (tokenrun);
 719       run->next->prev = run;
 720       _cpp_init_tokenrun (run->next, 250);
 721     }
 722
 723   return run->next;
 724 }
 725
 726 /* Allocate a single token that is invalidated at the same time as the
 727    rest of the tokens on the line.  Has its line and col set to the
 728    same as the last lexed token, so that diagnostics appear in the
 729    right place.  */
 730 cpp_token *
 731 _cpp_temp_token (pfile)
 732      cpp_reader *pfile;
 733 {
 734   cpp_token *old, *result;
 735
 736   old = pfile->cur_token - 1;
 737   if (pfile->cur_token == pfile->cur_run->limit)
 738     {
 739       pfile->cur_run = next_tokenrun (pfile->cur_run);
 740       pfile->cur_token = pfile->cur_run->base;
 741     }
 742
 743   result = pfile->cur_token++;
 744   result->line = old->line;
 745   result->col = old->col;
 746   return result;
 747 }
 748
 749 /* Lex a token into RESULT (external interface).  Takes care of issues
 750    like directive handling, token lookahead, multiple include
 751    optimization and skipping.  */
 752 const cpp_token *
 753 _cpp_lex_token (pfile)
 754      cpp_reader *pfile;
 755 {
 756   cpp_token *result;
 757
 758   for (;;)
 759     {
 760       if (pfile->cur_token == pfile->cur_run->limit)
 761         {
 762           pfile->cur_run = next_tokenrun (pfile->cur_run);
 763           pfile->cur_token = pfile->cur_run->base;
 764         }
 765
 766       if (pfile->lookaheads)
 767         {
 768           pfile->lookaheads--;
 769           result = pfile->cur_token++;
 770         }
 771       else
 772         result = _cpp_lex_direct (pfile);
 773
 774       if (result->flags & BOL)
 775         {
 776           /* Is this a directive.  If _cpp_handle_directive returns
 777              false, it is an assembler #.  */
 778           if (result->type == CPP_HASH
 779               /* 6.10.3 p 11: Directives in a list of macro arguments
 780                  gives undefined behavior.  This implementation
 781                  handles the directive as normal.  */
 782               && pfile->state.parsing_args != 1
 783               && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
 784             continue;
 785           if (pfile->cb.line_change && !pfile->state.skipping)
 786             (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
 787         }
 788
 789       /* We don't skip tokens in directives.  */
 790       if (pfile->state.in_directive)
 791         break;
 792
 793       /* Outside a directive, invalidate controlling macros.  At file
 794          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
 795          get here and MI optimisation works.  */
 796       pfile->mi_valid = false;
 797
 798       if (!pfile->state.skipping || result->type == CPP_EOF)
 799         break;
 800     }
 801
 802   return result;
 803 }
 804
 805 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
 806   do {                                          \
 807     if (get_effective_char (pfile) == CHAR)     \
 808       result->type = THEN_TYPE;                 \
 809     else                                        \
 810       {                                         \
 811         BACKUP ();                              \
 812         result->type = ELSE_TYPE;               \
 813       }                                         \
 814   } while (0)
 815
 816 /* Lex a token into pfile->cur_token, which is also incremented, to
 817    get diagnostics pointing to the correct location.
 818
 819    Does not handle issues such as token lookahead, multiple-include
 820    optimisation, directives, skipping etc.  This function is only
 821    suitable for use by _cpp_lex_token, and in special cases like
 822    lex_expansion_token which doesn't care for any of these issues.
 823
 824    When meeting a newline, returns CPP_EOF if parsing a directive,
 825    otherwise returns to the start of the token buffer if permissible.
 826    Returns the location of the lexed token.  */
 827 cpp_token *
 828 _cpp_lex_direct (pfile)
 829      cpp_reader *pfile;
 830 {
 831   cppchar_t c;
 832   cpp_buffer *buffer;
 833   const unsigned char *comment_start;
 834   cpp_token *result = pfile->cur_token++;
 835
 836  fresh_line:
 837   buffer = pfile->buffer;
 838   result->flags = buffer->saved_flags;
 839   buffer->saved_flags = 0;
 840  update_tokens_line:
 841   result->line = pfile->line;
 842
 843  skipped_white:
 844   c = *buffer->cur++;
 845   result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
 846
 847  trigraph:
 848   switch (c)
 849     {
 850     case ' ': case '\t': case '\f': case '\v': case '\0':
 851       result->flags |= PREV_WHITE;
 852       if (skip_whitespace (pfile, c))
 853         goto skipped_white;
 854
 855       /* EOF.  */
 856       buffer->cur--;
 857       buffer->saved_flags = BOL;
 858       if (!pfile->state.parsing_args && !pfile->state.in_directive)
 859         {
 860           if (buffer->cur != buffer->line_base)
 861             {
 862               /* Non-empty files should end in a newline.  Don't warn
 863                  for command line and _Pragma buffers.  */
 864               if (!buffer->from_stage3)
 865                 cpp_pedwarn (pfile, "no newline at end of file");
 866               handle_newline (pfile);
 867             }
 868
 869           /* Don't pop the last buffer.  */
 870           if (buffer->prev)
 871             {
 872               unsigned char stop = buffer->return_at_eof;
 873
 874               _cpp_pop_buffer (pfile);
 875               if (!stop)
 876                 goto fresh_line;
 877             }
 878         }
 879       result->type = CPP_EOF;
 880       break;
 881
 882     case '\n': case '\r':
 883       handle_newline (pfile);
 884       buffer->saved_flags = BOL;
 885       if (! pfile->state.in_directive)
 886         {
 887           if (pfile->state.parsing_args == 2)
 888             buffer->saved_flags |= PREV_WHITE;
 889           if (!pfile->keep_tokens)
 890             {
 891               pfile->cur_run = &pfile->base_run;
 892               result = pfile->base_run.base;
 893               pfile->cur_token = result + 1;
 894             }
 895           goto fresh_line;
 896         }
 897       result->type = CPP_EOF;
 898       break;
 899
 900     case '?':
 901     case '\\':
 902       /* These could start an escaped newline, or '?' a trigraph.  Let
 903          skip_escaped_newlines do all the work.  */
 904       {
 905         unsigned int line = pfile->line;
 906
 907         c = skip_escaped_newlines (pfile);
 908         if (line != pfile->line)
 909           {
 910             buffer->cur--;
 911             /* We had at least one escaped newline of some sort.
 912                Update the token's line and column.  */
 913             goto update_tokens_line;
 914           }
 915       }
 916
 917       /* We are either the original '?' or '\\', or a trigraph.  */
 918       if (c == '?')
 919         result->type = CPP_QUERY;
 920       else if (c == '\\')
 921         goto random_char;
 922       else
 923         goto trigraph;
 924       break;
 925
 926     case '0': case '1': case '2': case '3': case '4':
 927     case '5': case '6': case '7': case '8': case '9':
 928       result->type = CPP_NUMBER;
 929       parse_number (pfile, &result->val.str, 0);
 930       break;
 931
 932     case 'L':
 933       /* 'L' may introduce wide characters or strings.  */
 934         {
 935           const unsigned char *pos = buffer->cur;
 936
 937           c = get_effective_char (pfile);
 938           if (c == '\'' || c == '"')
 939             {
 940               result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
 941               parse_string (pfile, result, c);
 942               break;
 943             }
 944           buffer->cur = pos;
 945         }
 946         /* Fall through.  */
 947
 948     start_ident:
 949     case '_':
 950     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 951     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 952     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 953     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 954     case 'y': case 'z':
 955     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 956     case 'G': case 'H': case 'I': case 'J': case 'K':
 957     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 958     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 959     case 'Y': case 'Z':
 960       result->type = CPP_NAME;
 961       result->val.node = parse_identifier (pfile);
 962
 963       /* Convert named operators to their proper types.  */
 964       if (result->val.node->flags & NODE_OPERATOR)
 965         {
 966           result->flags |= NAMED_OP;
 967           result->type = result->val.node->value.operator;
 968         }
 969       break;
 970
 971     case '\'':
 972     case '"':
 973       result->type = c == '"' ? CPP_STRING: CPP_CHAR;
 974       parse_string (pfile, result, c);
 975       break;
 976
 977     case '/':
 978       /* A potential block or line comment.  */
 979       comment_start = buffer->cur;
 980       c = get_effective_char (pfile);
 981
 982       if (c == '*')
 983         {
 984           if (skip_block_comment (pfile))
 985             cpp_error (pfile, "unterminated comment");
 986         }
 987       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
 988                             || CPP_IN_SYSTEM_HEADER (pfile)))
 989         {
 990           /* Warn about comments only if pedantically GNUC89, and not
 991              in system headers.  */
 992           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
 993               && ! buffer->warned_cplusplus_comments)
 994             {
 995               cpp_pedwarn (pfile,
 996                            "C++ style comments are not allowed in ISO C89");
 997               cpp_pedwarn (pfile,
 998                            "(this will be reported only once per input file)");
 999               buffer->warned_cplusplus_comments = 1;
1000             }
1001
1002           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1003             cpp_warning (pfile, "multi-line comment");
1004         }
1005       else if (c == '=')
1006         {
1007           result->type = CPP_DIV_EQ;
1008           break;
1009         }
1010       else
1011         {
1012           BACKUP ();
1013           result->type = CPP_DIV;
1014           break;
1015         }
1016
1017       if (!pfile->state.save_comments)
1018         {
1019           result->flags |= PREV_WHITE;
1020           goto update_tokens_line;
1021         }
1022
1023       /* Save the comment as a token in its own right.  */
1024       save_comment (pfile, result, comment_start);
1025       break;
1026
1027     case '<':
1028       if (pfile->state.angled_headers)
1029         {
1030           result->type = CPP_HEADER_NAME;
1031           parse_string (pfile, result, '>');
1032           break;
1033         }
1034
1035       c = get_effective_char (pfile);
1036       if (c == '=')
1037         result->type = CPP_LESS_EQ;
1038       else if (c == '<')
1039         IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1040       else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1041         IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1042       else if (c == ':' && CPP_OPTION (pfile, digraphs))
1043         {
1044           result->type = CPP_OPEN_SQUARE;
1045           result->flags |= DIGRAPH;
1046         }
1047       else if (c == '%' && CPP_OPTION (pfile, digraphs))
1048         {
1049           result->type = CPP_OPEN_BRACE;
1050           result->flags |= DIGRAPH;
1051         }
1052       else
1053         {
1054           BACKUP ();
1055           result->type = CPP_LESS;
1056         }
1057       break;
1058
1059     case '>':
1060       c = get_effective_char (pfile);
1061       if (c == '=')
1062         result->type = CPP_GREATER_EQ;
1063       else if (c == '>')
1064         IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1065       else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1066         IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1067       else
1068         {
1069           BACKUP ();
1070           result->type = CPP_GREATER;
1071         }
1072       break;
1073
1074     case '%':
1075       c = get_effective_char (pfile);
1076       if (c == '=')
1077         result->type = CPP_MOD_EQ;
1078       else if (CPP_OPTION (pfile, digraphs) && c == ':')
1079         {
1080           result->flags |= DIGRAPH;
1081           result->type = CPP_HASH;
1082           if (get_effective_char (pfile) == '%')
1083             {
1084               const unsigned char *pos = buffer->cur;
1085
1086               if (get_effective_char (pfile) == ':')
1087                 result->type = CPP_PASTE;
1088               else
1089                 buffer->cur = pos - 1;
1090             }
1091           else
1092             BACKUP ();
1093         }
1094       else if (CPP_OPTION (pfile, digraphs) && c == '>')
1095         {
1096           result->flags |= DIGRAPH;
1097           result->type = CPP_CLOSE_BRACE;
1098         }
1099       else
1100         {
1101           BACKUP ();
1102           result->type = CPP_MOD;
1103         }
1104       break;
1105
1106     case '.':
1107       result->type = CPP_DOT;
1108       c = get_effective_char (pfile);
1109       if (c == '.')
1110         {
1111           const unsigned char *pos = buffer->cur;
1112
1113           if (get_effective_char (pfile) == '.')
1114             result->type = CPP_ELLIPSIS;
1115           else
1116             buffer->cur = pos - 1;
1117         }
1118       /* All known character sets have 0...9 contiguous.  */
1119       else if (ISDIGIT (c))
1120         {
1121           result->type = CPP_NUMBER;
1122           parse_number (pfile, &result->val.str, 1);
1123         }
1124       else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1125         result->type = CPP_DOT_STAR;
1126       else
1127         BACKUP ();
1128       break;
1129
1130     case '+':
1131       c = get_effective_char (pfile);
1132       if (c == '+')
1133         result->type = CPP_PLUS_PLUS;
1134       else if (c == '=')
1135         result->type = CPP_PLUS_EQ;
1136       else
1137         {
1138           BACKUP ();
1139           result->type = CPP_PLUS;
1140         }
1141       break;
1142
1143     case '-':
1144       c = get_effective_char (pfile);
1145       if (c == '>')
1146         {
1147           result->type = CPP_DEREF;
1148           if (CPP_OPTION (pfile, cplusplus))
1149             {
1150               if (get_effective_char (pfile) == '*')
1151                 result->type = CPP_DEREF_STAR;
1152               else
1153                 BACKUP ();
1154             }
1155         }
1156       else if (c == '-')
1157         result->type = CPP_MINUS_MINUS;
1158       else if (c == '=')
1159         result->type = CPP_MINUS_EQ;
1160       else
1161         {
1162           BACKUP ();
1163           result->type = CPP_MINUS;
1164         }
1165       break;
1166
1167     case '&':
1168       c = get_effective_char (pfile);
1169       if (c == '&')
1170         result->type = CPP_AND_AND;
1171       else if (c == '=')
1172         result->type = CPP_AND_EQ;
1173       else
1174         {
1175           BACKUP ();
1176           result->type = CPP_AND;
1177         }
1178       break;
1179
1180     case '|':
1181       c = get_effective_char (pfile);
1182       if (c == '|')
1183         result->type = CPP_OR_OR;
1184       else if (c == '=')
1185         result->type = CPP_OR_EQ;
1186       else
1187         {
1188           BACKUP ();
1189           result->type = CPP_OR;
1190         }
1191       break;
1192
1193     case ':':
1194       c = get_effective_char (pfile);
1195       if (c == ':' && CPP_OPTION (pfile, cplusplus))
1196         result->type = CPP_SCOPE;
1197       else if (c == '>' && CPP_OPTION (pfile, digraphs))
1198         {
1199           result->flags |= DIGRAPH;
1200           result->type = CPP_CLOSE_SQUARE;
1201         }
1202       else
1203         {
1204           BACKUP ();
1205           result->type = CPP_COLON;
1206         }
1207       break;
1208
1209     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1210     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1211     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1212     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1213     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1214
1215     case '~': result->type = CPP_COMPL; break;
1216     case ',': result->type = CPP_COMMA; break;
1217     case '(': result->type = CPP_OPEN_PAREN; break;
1218     case ')': result->type = CPP_CLOSE_PAREN; break;
1219     case '[': result->type = CPP_OPEN_SQUARE; break;
1220     case ']': result->type = CPP_CLOSE_SQUARE; break;
1221     case '{': result->type = CPP_OPEN_BRACE; break;
1222     case '}': result->type = CPP_CLOSE_BRACE; break;
1223     case ';': result->type = CPP_SEMICOLON; break;
1224
1225       /* @ is a punctuator in Objective C.  */
1226     case '@': result->type = CPP_ATSIGN; break;
1227
1228     case '$':
1229       if (CPP_OPTION (pfile, dollars_in_ident))
1230         goto start_ident;
1231       /* Fall through...  */
1232
1233     random_char:
1234     default:
1235       result->type = CPP_OTHER;
1236       result->val.c = c;
1237       break;
1238     }
1239
1240   return result;
1241 }
1242
1243 /* An upper bound on the number of bytes needed to spell TOKEN,
1244    including preceding whitespace.  */
1245 unsigned int
1246 cpp_token_len (token)
1247      const cpp_token *token;
1248 {
1249   unsigned int len;
1250
1251   switch (TOKEN_SPELL (token))
1252     {
1253     default:            len = 0;                                break;
1254     case SPELL_NUMBER:
1255     case SPELL_STRING:  len = token->val.str.len;               break;
1256     case SPELL_IDENT:   len = NODE_LEN (token->val.node);       break;
1257     }
1258   /* 1 for whitespace, 4 for comment delimiters.  */
1259   return len + 5;
1260 }
1261
1262 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1263    already contain the enough space to hold the token's spelling.
1264    Returns a pointer to the character after the last character
1265    written.  */
1266 unsigned char *
1267 cpp_spell_token (pfile, token, buffer)
1268      cpp_reader *pfile;         /* Would be nice to be rid of this...  */
1269      const cpp_token *token;
1270      unsigned char *buffer;
1271 {
1272   switch (TOKEN_SPELL (token))
1273     {
1274     case SPELL_OPERATOR:
1275       {
1276         const unsigned char *spelling;
1277         unsigned char c;
1278
1279         if (token->flags & DIGRAPH)
1280           spelling
1281             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1282         else if (token->flags & NAMED_OP)
1283           goto spell_ident;
1284         else
1285           spelling = TOKEN_NAME (token);
1286
1287         while ((c = *spelling++) != '\0')
1288           *buffer++ = c;
1289       }
1290       break;
1291
1292     case SPELL_CHAR:
1293       *buffer++ = token->val.c;
1294       break;
1295
1296     spell_ident:
1297     case SPELL_IDENT:
1298       memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1299       buffer += NODE_LEN (token->val.node);
1300       break;
1301
1302     case SPELL_NUMBER:
1303       memcpy (buffer, token->val.str.text, token->val.str.len);
1304       buffer += token->val.str.len;
1305       break;
1306
1307     case SPELL_STRING:
1308       {
1309         int left, right, tag;
1310         switch (token->type)
1311           {
1312           case CPP_STRING:      left = '"';  right = '"';  tag = '\0'; break;
1313           case CPP_WSTRING:     left = '"';  right = '"';  tag = 'L';  break;
1314           case CPP_CHAR:        left = '\''; right = '\''; tag = '\0'; break;
1315           case CPP_WCHAR:       left = '\''; right = '\''; tag = 'L';  break;
1316           case CPP_HEADER_NAME: left = '<';  right = '>';  tag = '\0'; break;
1317           default:
1318             cpp_ice (pfile, "unknown string token %s\n", TOKEN_NAME (token));
1319             return buffer;
1320           }
1321         if (tag) *buffer++ = tag;
1322         *buffer++ = left;
1323         memcpy (buffer, token->val.str.text, token->val.str.len);
1324         buffer += token->val.str.len;
1325         *buffer++ = right;
1326       }
1327       break;
1328
1329     case SPELL_NONE:
1330       cpp_ice (pfile, "unspellable token %s", TOKEN_NAME (token));
1331       break;
1332     }
1333
1334   return buffer;
1335 }
1336
1337 /* Returns TOKEN spelt as a null-terminated string.  The string is
1338    freed when the reader is destroyed.  Useful for diagnostics.  */
1339 unsigned char *
1340 cpp_token_as_text (pfile, token)
1341      cpp_reader *pfile;
1342      const cpp_token *token;
1343 {
1344   unsigned int len = cpp_token_len (token);
1345   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1346
1347   end = cpp_spell_token (pfile, token, start);
1348   end[0] = '\0';
1349
1350   return start;
1351 }
1352
1353 /* Used by C front ends, which really should move to using
1354    cpp_token_as_text.  */
1355 const char *
1356 cpp_type2name (type)
1357      enum cpp_ttype type;
1358 {
1359   return (const char *) token_spellings[type].name;
1360 }
1361
1362 /* Writes the spelling of token to FP, without any preceding space.
1363    Separated from cpp_spell_token for efficiency - to avoid stdio
1364    double-buffering.  */
1365 void
1366 cpp_output_token (token, fp)
1367      const cpp_token *token;
1368      FILE *fp;
1369 {
1370   switch (TOKEN_SPELL (token))
1371     {
1372     case SPELL_OPERATOR:
1373       {
1374         const unsigned char *spelling;
1375         int c;
1376
1377         if (token->flags & DIGRAPH)
1378           spelling
1379             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1380         else if (token->flags & NAMED_OP)
1381           goto spell_ident;
1382         else
1383           spelling = TOKEN_NAME (token);
1384
1385         c = *spelling;
1386         do
1387           putc (c, fp);
1388         while ((c = *++spelling) != '\0');
1389       }
1390       break;
1391
1392     case SPELL_CHAR:
1393       putc (token->val.c, fp);
1394       break;
1395
1396     spell_ident:
1397     case SPELL_IDENT:
1398       fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1399     break;
1400
1401     case SPELL_NUMBER:
1402       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1403       break;
1404
1405     case SPELL_STRING:
1406       {
1407         int left, right, tag;
1408         switch (token->type)
1409           {
1410           case CPP_STRING:      left = '"';  right = '"';  tag = '\0'; break;
1411           case CPP_WSTRING:     left = '"';  right = '"';  tag = 'L';  break;
1412           case CPP_CHAR:        left = '\''; right = '\''; tag = '\0'; break;
1413           case CPP_WCHAR:       left = '\''; right = '\''; tag = 'L';  break;
1414           case CPP_HEADER_NAME: left = '<';  right = '>';  tag = '\0'; break;
1415           default:
1416             fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1417             return;
1418           }
1419         if (tag) putc (tag, fp);
1420         putc (left, fp);
1421         fwrite (token->val.str.text, 1, token->val.str.len, fp);
1422         putc (right, fp);
1423       }
1424       break;
1425
1426     case SPELL_NONE:
1427       /* An error, most probably.  */
1428       break;
1429     }
1430 }
1431
1432 /* Compare two tokens.  */
1433 int
1434 _cpp_equiv_tokens (a, b)
1435      const cpp_token *a, *b;
1436 {
1437   if (a->type == b->type && a->flags == b->flags)
1438     switch (TOKEN_SPELL (a))
1439       {
1440       default:                  /* Keep compiler happy.  */
1441       case SPELL_OPERATOR:
1442         return 1;
1443       case SPELL_CHAR:
1444         return a->val.c == b->val.c; /* Character.  */
1445       case SPELL_NONE:
1446         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1447       case SPELL_IDENT:
1448         return a->val.node == b->val.node;
1449       case SPELL_NUMBER:
1450       case SPELL_STRING:
1451         return (a->val.str.len == b->val.str.len
1452                 && !memcmp (a->val.str.text, b->val.str.text,
1453                             a->val.str.len));
1454       }
1455
1456   return 0;
1457 }
1458
1459 /* Returns nonzero if a space should be inserted to avoid an
1460    accidental token paste for output.  For simplicity, it is
1461    conservative, and occasionally advises a space where one is not
1462    needed, e.g. "." and ".2".  */
1463 int
1464 cpp_avoid_paste (pfile, token1, token2)
1465      cpp_reader *pfile;
1466      const cpp_token *token1, *token2;
1467 {
1468   enum cpp_ttype a = token1->type, b = token2->type;
1469   cppchar_t c;
1470
1471   if (token1->flags & NAMED_OP)
1472     a = CPP_NAME;
1473   if (token2->flags & NAMED_OP)
1474     b = CPP_NAME;
1475
1476   c = EOF;
1477   if (token2->flags & DIGRAPH)
1478     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1479   else if (token_spellings[b].category == SPELL_OPERATOR)
1480     c = token_spellings[b].name[0];
1481
1482   /* Quickly get everything that can paste with an '='.  */
1483   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1484     return 1;
1485
1486   switch (a)
1487     {
1488     case CPP_GREATER:   return c == '>' || c == '?';
1489     case CPP_LESS:      return c == '<' || c == '?' || c == '%' || c == ':';
1490     case CPP_PLUS:      return c == '+';
1491     case CPP_MINUS:     return c == '-' || c == '>';
1492     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1493     case CPP_MOD:       return c == ':' || c == '>';
1494     case CPP_AND:       return c == '&';
1495     case CPP_OR:        return c == '|';
1496     case CPP_COLON:     return c == ':' || c == '>';
1497     case CPP_DEREF:     return c == '*';
1498     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1499     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1500     case CPP_NAME:      return ((b == CPP_NUMBER
1501                                  && name_p (pfile, &token2->val.str))
1502                                 || b == CPP_NAME
1503                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1504     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1505                                 || c == '.' || c == '+' || c == '-');
1506     case CPP_OTHER:     return (CPP_OPTION (pfile, objc)
1507                                 && token1->val.c == '@'
1508                                 && (b == CPP_NAME || b == CPP_STRING));
1509     default:            break;
1510     }
1511
1512   return 0;
1513 }
1514
1515 /* Output all the remaining tokens on the current line, and a newline
1516    character, to FP.  Leading whitespace is removed.  If there are
1517    macros, special token padding is not performed.  */
1518 void
1519 cpp_output_line (pfile, fp)
1520      cpp_reader *pfile;
1521      FILE *fp;
1522 {
1523   const cpp_token *token;
1524
1525   token = cpp_get_token (pfile);
1526   while (token->type != CPP_EOF)
1527     {
1528       cpp_output_token (token, fp);
1529       token = cpp_get_token (pfile);
1530       if (token->flags & PREV_WHITE)
1531         putc (' ', fp);
1532     }
1533
1534   putc ('\n', fp);
1535 }
1536
/* Return the numeric value of the hexadecimal digit C.  Aborts if C
   is not accepted by hex_p.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (!hex_p (c))
    abort ();

  return hex_value (c);
}
1547
1548 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence.  Returns 1 to indicate
1549    failure if cpplib is not parsing C++ or C99.  Such failure is
1550    silent, and no variables are updated.  Otherwise returns 0, and
1551    warns if -Wtraditional.
1552
1553    [lex.charset]: The character designated by the universal character
1554    name \UNNNNNNNN is that character whose character short name in
1555    ISO/IEC 10646 is NNNNNNNN; the character designated by the
1556    universal character name \uNNNN is that character whose character
1557    short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
1558    for a universal character name is less than 0x20 or in the range
1559    0x7F-0x9F (inclusive), or if the universal character name
1560    designates a character in the basic source character set, then the
1561    program is ill-formed.
1562
1563    We assume that wchar_t is Unicode, so we don't need to do any
1564    mapping.  Is this ever wrong?
1565
1566    PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1567    LIMIT is the end of the string or charconst.  PSTR is updated to
1568    point after the UCS on return, and the UCS is written into PC.  */
1569
1570 static int
1571 maybe_read_ucs (pfile, pstr, limit, pc)
1572      cpp_reader *pfile;
1573      const unsigned char **pstr;
1574      const unsigned char *limit;
1575      unsigned int *pc;
1576 {
1577   const unsigned char *p = *pstr;
1578   unsigned int code = 0;
1579   unsigned int c = *pc, length;
1580
1581   /* Only attempt to interpret a UCS for C++ and C99.  */
1582   if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1583     return 1;
1584
1585   if (CPP_WTRADITIONAL (pfile))
1586     cpp_warning (pfile, "the meaning of '\\%c' is different in traditional C", c);
1587
1588   length = (c == 'u' ? 4: 8);
1589
1590   if ((size_t) (limit - p) < length)
1591     {
1592       cpp_error (pfile, "incomplete universal-character-name");
1593       /* Skip to the end to avoid more diagnostics.  */
1594       p = limit;
1595     }
1596   else
1597     {
1598       for (; length; length--, p++)
1599         {
1600           c = *p;
1601           if (ISXDIGIT (c))
1602             code = (code << 4) + hex_digit_value (c);
1603           else
1604             {
1605               cpp_error (pfile,
1606                          "non-hex digit '%c' in universal-character-name", c);
1607               /* We shouldn't skip in case there are multibyte chars.  */
1608               break;
1609             }
1610         }
1611     }
1612
1613 #ifdef TARGET_EBCDIC
1614   cpp_error (pfile, "universal-character-name on EBCDIC target");
1615   code = 0x3f;  /* EBCDIC invalid character */
1616 #else
1617  /* True extended characters are OK.  */
1618   if (code >= 0xa0
1619       && !(code & 0x80000000)
1620       && !(code >= 0xD800 && code <= 0xDFFF))
1621     ;
1622   /* The standard permits $, @ and ` to be specified as UCNs.  We use
1623      hex escapes so that this also works with EBCDIC hosts.  */
1624   else if (code == 0x24 || code == 0x40 || code == 0x60)
1625     ;
1626   /* Don't give another error if one occurred above.  */
1627   else if (length == 0)
1628     cpp_error (pfile, "universal-character-name out of range");
1629 #endif
1630
1631   *pstr = p;
1632   *pc = code;
1633   return 0;
1634 }
1635
1636 /* Interpret an escape sequence, and return its value.  PSTR points to
1637    the input pointer, which is just after the backslash.  LIMIT is how
1638    much text we have.  MASK is a bitmask for the precision for the
1639    destination type (char or wchar_t).
1640
1641    Handles all relevant diagnostics.  */
1642 unsigned int
1643 cpp_parse_escape (pfile, pstr, limit, mask)
1644      cpp_reader *pfile;
1645      const unsigned char **pstr;
1646      const unsigned char *limit;
1647      unsigned HOST_WIDE_INT mask;
1648 {
1649   int unknown = 0;
1650   const unsigned char *str = *pstr;
1651   unsigned int c = *str++;
1652
1653   switch (c)
1654     {
1655     case '\\': case '\'': case '"': case '?': break;
1656     case 'b': c = TARGET_BS;      break;
1657     case 'f': c = TARGET_FF;      break;
1658     case 'n': c = TARGET_NEWLINE; break;
1659     case 'r': c = TARGET_CR;      break;
1660     case 't': c = TARGET_TAB;     break;
1661     case 'v': c = TARGET_VT;      break;
1662
1663     case '(': case '{': case '[': case '%':
1664       /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1665          '\%' is used to prevent SCCS from getting confused.  */
1666       unknown = CPP_PEDANTIC (pfile);
1667       break;
1668
1669     case 'a':
1670       if (CPP_WTRADITIONAL (pfile))
1671         cpp_warning (pfile, "the meaning of '\\a' is different in traditional C");
1672       c = TARGET_BELL;
1673       break;
1674
1675     case 'e': case 'E':
1676       if (CPP_PEDANTIC (pfile))
1677         cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1678       c = TARGET_ESC;
1679       break;
1680
1681     case 'u': case 'U':
1682       unknown = maybe_read_ucs (pfile, &str, limit, &c);
1683       break;
1684
1685     case 'x':
1686       if (CPP_WTRADITIONAL (pfile))
1687         cpp_warning (pfile, "the meaning of '\\x' is different in traditional C");
1688
1689         {
1690           unsigned int i = 0, overflow = 0;
1691           int digits_found = 0;
1692
1693           while (str < limit)
1694             {
1695               c = *str;
1696               if (! ISXDIGIT (c))
1697                 break;
1698               str++;
1699               overflow |= i ^ (i << 4 >> 4);
1700               i = (i << 4) + hex_digit_value (c);
1701               digits_found = 1;
1702             }
1703
1704           if (!digits_found)
1705             cpp_error (pfile, "\\x used with no following hex digits");
1706
1707           if (overflow | (i != (i & mask)))
1708             {
1709               cpp_pedwarn (pfile, "hex escape sequence out of range");
1710               i &= mask;
1711             }
1712           c = i;
1713         }
1714       break;
1715
1716     case '0':  case '1':  case '2':  case '3':
1717     case '4':  case '5':  case '6':  case '7':
1718       {
1719         unsigned int i = c - '0';
1720         int count = 0;
1721
1722         while (str < limit && ++count < 3)
1723           {
1724             c = *str;
1725             if (c < '0' || c > '7')
1726               break;
1727             str++;
1728             i = (i << 3) + c - '0';
1729           }
1730
1731         if (i != (i & mask))
1732           {
1733             cpp_pedwarn (pfile, "octal escape sequence out of range");
1734             i &= mask;
1735           }
1736         c = i;
1737       }
1738       break;
1739
1740     default:
1741       unknown = 1;
1742       break;
1743     }
1744
1745   if (unknown)
1746     {
1747       if (ISGRAPH (c))
1748         cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1749       else
1750         cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1751     }
1752
1753   if (c > mask)
1754     cpp_pedwarn (pfile, "escape sequence out of range for character");
1755
1756   *pstr = str;
1757   return c;
1758 }
1759
/* If no maximum character width has been defined, fall back to
   CHAR_TYPE_SIZE.  cpp_interpret_charconst uses this as the width
   in bits of a CPP_CHAR constant.  */
#ifndef MAX_CHAR_TYPE_SIZE
#define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
#endif

/* Likewise for wide characters, falling back to WCHAR_TYPE_SIZE.  */
#ifndef MAX_WCHAR_TYPE_SIZE
#define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
#endif
1767
1768 /* Interpret a (possibly wide) character constant in TOKEN.
1769    WARN_MULTI warns about multi-character charconsts.  PCHARS_SEEN points
1770    to a variable that is filled in with the number of characters seen.  */
1771 HOST_WIDE_INT
1772 cpp_interpret_charconst (pfile, token, warn_multi, pchars_seen)
1773      cpp_reader *pfile;
1774      const cpp_token *token;
1775      int warn_multi;
1776      unsigned int *pchars_seen;
1777 {
1778   const unsigned char *str = token->val.str.text;
1779   const unsigned char *limit = str + token->val.str.len;
1780   unsigned int chars_seen = 0;
1781   unsigned int width, max_chars, c;
1782   unsigned HOST_WIDE_INT mask;
1783   HOST_WIDE_INT result = 0;
1784   bool unsigned_p;
1785
1786 #ifdef MULTIBYTE_CHARS
1787   (void) local_mbtowc (NULL, NULL, 0);
1788 #endif
1789
1790   /* Width in bits.  */
1791   if (token->type == CPP_CHAR)
1792     {
1793       width = MAX_CHAR_TYPE_SIZE;
1794       unsigned_p = CPP_OPTION (pfile, signed_char) == 0;
1795     }
1796   else
1797     {
1798       width = MAX_WCHAR_TYPE_SIZE;
1799       unsigned_p = WCHAR_UNSIGNED;
1800     }
1801
1802   if (width < HOST_BITS_PER_WIDE_INT)
1803     mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1804   else
1805     mask = ~0;
1806   max_chars = HOST_BITS_PER_WIDE_INT / width;
1807
1808   while (str < limit)
1809     {
1810 #ifdef MULTIBYTE_CHARS
1811       wchar_t wc;
1812       int char_len;
1813
1814       char_len = local_mbtowc (&wc, str, limit - str);
1815       if (char_len == -1)
1816         {
1817           cpp_warning (pfile, "ignoring invalid multibyte character");
1818           c = *str++;
1819         }
1820       else
1821         {
1822           str += char_len;
1823           c = wc;
1824         }
1825 #else
1826       c = *str++;
1827 #endif
1828
1829       if (c == '\\')
1830         c = cpp_parse_escape (pfile, &str, limit, mask);
1831
1832 #ifdef MAP_CHARACTER
1833       if (ISPRINT (c))
1834         c = MAP_CHARACTER (c);
1835 #endif
1836
1837       /* Merge character into result; ignore excess chars.  */
1838       if (++chars_seen <= max_chars)
1839         {
1840           if (width < HOST_BITS_PER_WIDE_INT)
1841             result = (result << width) | (c & mask);
1842           else
1843             result = c;
1844         }
1845     }
1846
1847   if (chars_seen == 0)
1848     cpp_error (pfile, "empty character constant");
1849   else if (chars_seen > max_chars)
1850     {
1851       chars_seen = max_chars;
1852       cpp_warning (pfile, "character constant too long");
1853     }
1854   else if (chars_seen > 1 && warn_multi)
1855     cpp_warning (pfile, "multi-character character constant");
1856
1857   /* If relevant type is signed, sign-extend the constant.  */
1858   if (chars_seen)
1859     {
1860       unsigned int nbits = chars_seen * width;
1861
1862       mask = (unsigned HOST_WIDE_INT) ~0 >> (HOST_BITS_PER_WIDE_INT - nbits);
1863       if (unsigned_p || ((result >> (nbits - 1)) & 1) == 0)
1864         result &= mask;
1865       else
1866         result |= ~mask;
1867     }
1868
1869   *pchars_seen = chars_seen;
1870   return result;
1871 }
1872
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer size in bytes.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is MIN_BUFF_SIZE plus one and a
   half times MIN_SIZE.  EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is
   MIN_EXTRA plus twice the distance from BUFF->cur to BUFF->limit.
   Every macro argument is parenthesized so that an argument which is
   itself an expression expands with the intended precedence.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1887
/* Alignment probe.  The lone leading char makes the compiler insert
   whatever padding a union of a double and an object pointer needs,
   so the offset of U is the alignment of that union.  */
struct dummy
{
  char c;
  union { double d; int *p; } u;
};

/* Alignment applied to buffer sizes: the offset of the union inside
   the probe structure.  */
#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))

/* Round SIZE up to a multiple of ALIGN by adding ALIGN - 1 and then
   clearing the low bits.  The mask is only meaningful when ALIGN is a
   power of two.  ALIGN is evaluated twice.  */
#define CPP_ALIGN(size, align) \
  (((size) + ((align) - 1)) & ~((align) - 1))
1900
1901 /* Create a new allocation buffer.  Place the control block at the end
1902    of the buffer, so that buffer overflows will cause immediate chaos.  */
1903 static _cpp_buff *
1904 new_buff (len)
1905      size_t len;
1906 {
1907   _cpp_buff *result;
1908   unsigned char *base;
1909
1910   if (len < MIN_BUFF_SIZE)
1911     len = MIN_BUFF_SIZE;
1912   len = CPP_ALIGN (len, DEFAULT_ALIGNMENT);
1913
1914   base = xmalloc (len + sizeof (_cpp_buff));
1915   result = (_cpp_buff *) (base + len);
1916   result->base = base;
1917   result->cur = base;
1918   result->limit = base + len;
1919   result->next = NULL;
1920   return result;
1921 }
1922
1923 /* Place a chain of unwanted allocation buffers on the free list.  */
1924 void
1925 _cpp_release_buff (pfile, buff)
1926      cpp_reader *pfile;
1927      _cpp_buff *buff;
1928 {
1929   _cpp_buff *end = buff;
1930
1931   while (end->next)
1932     end = end->next;
1933   end->next = pfile->free_buffs;
1934   pfile->free_buffs = buff;
1935 }
1936
1937 /* Return a free buffer of size at least MIN_SIZE.  */
1938 _cpp_buff *
1939 _cpp_get_buff (pfile, min_size)
1940      cpp_reader *pfile;
1941      size_t min_size;
1942 {
1943   _cpp_buff *result, **p;
1944
1945   for (p = &pfile->free_buffs;; p = &(*p)->next)
1946     {
1947       size_t size;
1948
1949       if (*p == NULL)
1950         return new_buff (min_size);
1951       result = *p;
1952       size = result->limit - result->base;
1953       /* Return a buffer that's big enough, but don't waste one that's
1954          way too big.  */
1955       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1956         break;
1957     }
1958
1959   *p = result->next;
1960   result->next = NULL;
1961   result->cur = result->base;
1962   return result;
1963 }
1964
1965 /* Creates a new buffer with enough space to hold the uncommitted
1966    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1967    the excess bytes to the new buffer.  Chains the new buffer after
1968    BUFF, and returns the new buffer.  */
1969 _cpp_buff *
1970 _cpp_append_extend_buff (pfile, buff, min_extra)
1971      cpp_reader *pfile;
1972      _cpp_buff *buff;
1973      size_t min_extra;
1974 {
1975   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1976   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1977
1978   buff->next = new_buff;
1979   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1980   return new_buff;
1981 }
1982
1983 /* Creates a new buffer with enough space to hold the uncommitted
1984    remaining bytes of the buffer pointed to by BUFF, and at least
1985    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1986    Chains the new buffer before the buffer pointed to by BUFF, and
1987    updates the pointer to point to the new buffer.  */
1988 void
1989 _cpp_extend_buff (pfile, pbuff, min_extra)
1990      cpp_reader *pfile;
1991      _cpp_buff **pbuff;
1992      size_t min_extra;
1993 {
1994   _cpp_buff *new_buff, *old_buff = *pbuff;
1995   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1996
1997   new_buff = _cpp_get_buff (pfile, size);
1998   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1999   new_buff->next = old_buff;
2000   *pbuff = new_buff;
2001 }
2002
2003 /* Free a chain of buffers starting at BUFF.  */
2004 void
2005 _cpp_free_buff (buff)
2006      _cpp_buff *buff;
2007 {
2008   _cpp_buff *next;
2009
2010   for (; buff; buff = next)
2011     {
2012       next = buff->next;
2013       free (buff->base);
2014     }
2015 }
2016
2017 /* Allocate permanent, unaligned storage of length LEN.  */
2018 unsigned char *
2019 _cpp_unaligned_alloc (pfile, len)
2020      cpp_reader *pfile;
2021      size_t len;
2022 {
2023   _cpp_buff *buff = pfile->u_buff;
2024   unsigned char *result = buff->cur;
2025
2026   if (len > (size_t) (buff->limit - result))
2027     {
2028       buff = _cpp_get_buff (pfile, len);
2029       buff->next = pfile->u_buff;
2030       pfile->u_buff = buff;
2031       result = buff->cur;
2032     }
2033
2034   buff->cur = result + len;
2035   return result;
2036 }
2037
2038 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2039    That buffer is used for growing allocations when saving macro
2040    replacement lists in a #define, and when parsing an answer to an
2041    assertion in #assert, #unassert or #if (and therefore possibly
2042    whilst expanding macros).  It therefore must not be used by any
2043    code that they might call: specifically the lexer and the guts of
2044    the macro expander.
2045
2046    All existing other uses clearly fit this restriction: storing
2047    registered pragmas during initialization.  */
2048 unsigned char *
2049 _cpp_aligned_alloc (pfile, len)
2050      cpp_reader *pfile;
2051      size_t len;
2052 {
2053   _cpp_buff *buff = pfile->a_buff;
2054   unsigned char *result = buff->cur;
2055
2056   if (len > (size_t) (buff->limit - result))
2057     {
2058       buff = _cpp_get_buff (pfile, len);
2059       buff->next = pfile->a_buff;
2060       pfile->a_buff = buff;
2061       result = buff->cur;
2062     }
2063
2064   buff->cur = result + len;
2065   return result;
2066 }