gcc/cpplex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7    Single-pass line tokenization by Neil Booth, April 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 2, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; if not, write to the Free Software
  21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  22
  23 /* This lexer works with a single pass of the file.  Recently I
  24    re-wrote it to minimize the places where we step backwards in the
  25    input stream, to make future changes to support multi-byte
  26    character sets fairly straight-forward.
  27
  28    There is now only one routine where we do step backwards:
  29    skip_escaped_newlines.  This routine could probably also be changed
  30    so that it doesn't need to step back.  One possibility is to use a
   31    trick similar to that used in lex_dot and lex_percent.  Two
  32    extra characters might be needed, but skip_escaped_newlines itself
  33    would probably be the only place that needs to be aware of that,
  34    and changes to the remaining routines would probably only be needed
  35    if they process a backslash.  */
  36
  37 #include "config.h"
  38 #include "system.h"
  39 #include "cpplib.h"
  40 #include "cpphash.h"
  41
  42 /* MULTIBYTE_CHARS support only works for native compilers.
  43    ??? Ideally what we want is to model widechar support after
  44    the current floating point support.  */
  45 #ifdef CROSS_COMPILE
  46 #undef MULTIBYTE_CHARS
  47 #endif
  48
  49 #ifdef MULTIBYTE_CHARS
  50 #include "mbchar.h"
  51 #include <locale.h>
  52 #endif
  53
   54 /* Spelling category of a token type; every row of token_spellings[]
   55    carries one of these values.  Tokens with SPELL_STRING store
         their spelling in the token list, and its length in the
         token->val.name.len.
         NOTE(review): parse_string in this file stores the length in
         token->val.str.len -- confirm which member name is current.  */
   56 enum spell_type
   57 {
   58   SPELL_OPERATOR = 0, /* Category given to every OP() table entry.  */
   59   SPELL_CHAR,
   60   SPELL_IDENT,
   61   SPELL_STRING,
   62   SPELL_NONE
   63 };
  64
   /* One row of token_spellings[]: the spelling category of a token type
      together with a string.  For OP() entries the string is the
      spelling supplied in the table; for TK() entries it is the
      stringified enumerator name.  */
   65 struct token_spelling
   66 {
   67   enum spell_type category;
   68   const unsigned char *name;
   69 };
  70
   /* Spellings of the six digraphs.
      NOTE(review): U is used as a macro here (compare "U s" in OP below).
      Written without a space, U"..." is lexed as a char32_t string
      literal by C11 and C++11 compilers -- confirm the build dialect, or
      separate the macro from the literal with a space.  */
   71 const unsigned char *digraph_spellings [] = {U"%:", U"%:%:", U"<:",
   72                                              U":>", U"<%", U"%>"};
   73
      /* Build token_spellings[] from TTYPE_TABLE.  OP(e, s) rows become
         { SPELL_OPERATOR, s }; TK(e, s) rows take S as their category and
         the stringified enumerator E as their name.  Both helper macros
         are undefined again once the table has been expanded.  */
   74 #define OP(e, s) { SPELL_OPERATOR, U s           },
   75 #define TK(e, s) { s,              U STRINGX (e) },
   76 const struct token_spelling token_spellings [N_TTYPES] = {TTYPE_TABLE };
   77 #undef OP
   78 #undef TK
   79
      /* Look up the spelling category / name string for TOKEN's type.  */
   80 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
   81 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  82
   /* Forward declarations of the file-local helpers.  PARAMS wraps the
      parameter lists, as elsewhere in this file's K&R-style code.
      NOTE(review): maybe_read_ucs, new_chunk, chunk_suitable and
      hex_digit_value are not defined in the part of the file reviewed
      here; presumably they follow later.  */
   83 static cppchar_t handle_newline PARAMS ((cpp_reader *, cppchar_t));
   84 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *, cppchar_t));
   85 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
   86
   87 static int skip_block_comment PARAMS ((cpp_reader *));
   88 static int skip_line_comment PARAMS ((cpp_reader *));
   89 static void adjust_column PARAMS ((cpp_reader *));
   90 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
   91 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
   92 static cpp_hashnode *parse_identifier_slow PARAMS ((cpp_reader *,
   93                                                     const U_CHAR *));
   94 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
   95 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
   96 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
   97 static void unterminated PARAMS ((cpp_reader *, int));
   98 static int trigraph_ok PARAMS ((cpp_reader *, cppchar_t));
   99 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
  100 static void lex_percent PARAMS ((cpp_reader *, cpp_token *));
  101 static void lex_dot PARAMS ((cpp_reader *, cpp_token *));
  102 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
  103 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
  104                                    const unsigned char *, unsigned int *));
  105 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
  106
  107 static cpp_chunk *new_chunk PARAMS ((unsigned int));
  108 static int chunk_suitable PARAMS ((cpp_pool *, cpp_chunk *, unsigned int));
  109 static unsigned int hex_digit_value PARAMS ((unsigned int));
 110
 111 /* Utility routine:
 112
 113    Compares, the token TOKEN to the NUL-terminated string STRING.
 114    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
 115
 116 int
 117 cpp_ideq (token, string)
 118      const cpp_token *token;
 119      const char *string;
 120 {
 121   if (token->type != CPP_NAME)
 122     return 0;
 123
 124   return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
 125 }
 126
 127 /* Call when meeting a newline.  Returns the character after the newline
 128    (or carriage-return newline combination), or EOF.  */
 129 static cppchar_t
 130 handle_newline (pfile, newline_char)
 131      cpp_reader *pfile;
 132      cppchar_t newline_char;
 133 {
 134   cpp_buffer *buffer;
 135   cppchar_t next = EOF;
 136
 137   pfile->line++;
 138   buffer = pfile->buffer;
 139   buffer->col_adjust = 0;
 140   buffer->line_base = buffer->cur;
 141
 142   /* Handle CR-LF and LF-CR combinations, get the next character.  */
 143   if (buffer->cur < buffer->rlimit)
 144     {
 145       next = *buffer->cur++;
 146       if (next + newline_char == '\r' + '\n')
 147         {
 148           buffer->line_base = buffer->cur;
 149           if (buffer->cur < buffer->rlimit)
 150             next = *buffer->cur++;
 151           else
 152             next = EOF;
 153         }
 154     }
 155
 156   buffer->read_ahead = next;
 157   return next;
 158 }
 159
 160 /* Subroutine of skip_escaped_newlines; called when a trigraph is
 161    encountered.  It warns if necessary, and returns true if the
 162    trigraph should be honoured.  FROM_CHAR is the third character of a
 163    trigraph, and presumed to be the previous character for position
 164    reporting.  */
 165 static int
 166 trigraph_ok (pfile, from_char)
 167      cpp_reader *pfile;
 168      cppchar_t from_char;
 169 {
 170   int accept = CPP_OPTION (pfile, trigraphs);
 171
 172   /* Don't warn about trigraphs in comments.  */
 173   if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
 174     {
 175       cpp_buffer *buffer = pfile->buffer;
 176
 177       if (accept)
 178         cpp_warning_with_line (pfile, pfile->line, CPP_BUF_COL (buffer) - 2,
 179                                "trigraph ??%c converted to %c",
 180                                (int) from_char,
 181                                (int) _cpp_trigraph_map[from_char]);
 182       else if (buffer->cur != buffer->last_Wtrigraphs)
 183         {
 184           buffer->last_Wtrigraphs = buffer->cur;
 185           cpp_warning_with_line (pfile, pfile->line,
 186                                  CPP_BUF_COL (buffer) - 2,
 187                                  "trigraph ??%c ignored", (int) from_char);
 188         }
 189     }
 190
 191   return accept;
 192 }
 193
  194 /* Set result->type to T and reset buffer->read_ahead to EOF.
         Assumes local variables buffer and result.  */
  195 #define ACCEPT_CHAR(t) \
  196   do { result->type = t; buffer->read_ahead = EOF; } while (0)
  197
  198 /* SAVE_STATE records buffer->cur in the local saved_cur and
  199    RESTORE_STATE puts it back; both assume local variables buffer and
  200    saved_cur.  When we move to multibyte character sets, add to these
  201    something that saves and restores the state of the multibyte
  202    conversion library.  This probably involves saving and restoring a
         "cookie".  In the case of glibc it is an 8-byte structure, so is
         not a high overhead operation.  In any case, it's out of the fast
         path.  */
  203 #define SAVE_STATE() do { saved_cur = buffer->cur; } while (0)
  204 #define RESTORE_STATE() do { buffer->cur = saved_cur; } while (0)
 205
  206 /* Skips any escaped newlines introduced by NEXT, which is either a
  207    '?' or a '\\'.  Returns the next character, which will also have
  208    been placed in buffer->read_ahead.  This routine performs
  209    preprocessing stages 1 and 2 of the ISO C standard.

         NEXT has already been read, so buffer->cur points just past it.
         A '?' is checked for starting a trigraph; one that trigraph_ok
         accepts is replaced by its _cpp_trigraph_map entry.  A backslash
         (literal, or produced by a trigraph) followed by optional
         non-vertical space and then a newline is dropped, and the scan
         repeats on the character after the newline.  When a look-ahead
         does not match, buffer->cur is rewound to the most recently
         saved position and the character in hand is returned.  Buffers
         with from_stage3 set are left untouched.  */
  210 static cppchar_t
  211 skip_escaped_newlines (pfile, next)
  212      cpp_reader *pfile;
  213      cppchar_t next;
  214 {
  215   cpp_buffer *buffer = pfile->buffer;
  216
  217   /* Only do this if we apply stages 1 and 2.  */
  218   if (!buffer->from_stage3)
  219     {
  220       cppchar_t next1;
  221       const unsigned char *saved_cur;
  222       int space;
  223
  224       do
  225         {
                /* Nothing left to look ahead at.  */
  226           if (buffer->cur == buffer->rlimit)
  227             break;
  228
                /* Rewind point in case this is neither a trigraph nor an
                   escaped newline.  */
  229           SAVE_STATE ();
  230           if (next == '?')
  231             {
                    /* Second character of a candidate trigraph; a third
                       one must be available as well.  */
  232               next1 = *buffer->cur++;
  233               if (next1 != '?' || buffer->cur == buffer->rlimit)
  234                 {
  235                   RESTORE_STATE ();
  236                   break;
  237                 }
  238
                    /* Third character: it must have a mapping, and
                       trigraph_ok must agree to honour it.  */
  239               next1 = *buffer->cur++;
  240               if (!_cpp_trigraph_map[next1]
  241                   || !trigraph_ok (pfile, next1))
  242                 {
  243                   RESTORE_STATE ();
  244                   break;
  245                 }
  246
  247               /* We have a full trigraph here.  */
  248               next = _cpp_trigraph_map[next1];
  249               if (next != '\\' || buffer->cur == buffer->rlimit)
  250                 break;
                    /* The trigraph produced a backslash: from here on,
                       rewinding must not undo the trigraph itself.  */
  251               SAVE_STATE ();
  252             }
  253
  254           /* We have a backslash, and room for at least one more character.  */
  255           space = 0;
                /* Skip non-vertical space after the backslash, noting
                   whether there was any.  */
  256           do
  257             {
  258               next1 = *buffer->cur++;
  259               if (!is_nvspace (next1))
  260                 break;
  261               space = 1;
  262             }
  263           while (buffer->cur < buffer->rlimit);
  264
                /* Not followed by a newline, so this backslash does not
                   escape one.  */
  265           if (!is_vspace (next1))
  266             {
  267               RESTORE_STATE ();
  268               break;
  269             }
  270
  271           if (space && !pfile->state.lexing_comment)
  272             cpp_warning (pfile, "backslash and newline separated by space");
  273
                /* Consume the newline; the character after it may start
                   another escaped newline or trigraph.  */
  274           next = handle_newline (pfile, next1);
  275           if (next == EOF)
  276             cpp_pedwarn (pfile, "backslash-newline at end of file");
  277         }
  278       while (next == '\\' || next == '?');
  279     }
  280
  281   buffer->read_ahead = next;
  282   return next;
  283 }
 284
 285 /* Obtain the next character, after trigraph conversion and skipping
 286    an arbitrary string of escaped newlines.  The common case of no
 287    trigraphs or escaped newlines falls through quickly.  */
 288 static cppchar_t
 289 get_effective_char (pfile)
 290      cpp_reader *pfile;
 291 {
 292   cpp_buffer *buffer = pfile->buffer;
 293   cppchar_t next = EOF;
 294
 295   if (buffer->cur < buffer->rlimit)
 296     {
 297       next = *buffer->cur++;
 298
 299       /* '?' can introduce trigraphs (and therefore backslash); '\\'
 300          can introduce escaped newlines, which we want to skip, or
 301          UCNs, which, depending upon lexer state, we will handle in
 302          the future.  */
 303       if (next == '?' || next == '\\')
 304         next = skip_escaped_newlines (pfile, next);
 305     }
 306
 307   buffer->read_ahead = next;
 308   return next;
 309 }
 310
  311 /* Skip a C-style block comment.  We find the end of the comment by
  312    seeing if an asterisk is before every '/' we encounter.  Returns
  313    non-zero if comment terminated by EOF, zero otherwise.

         state.lexing_comment is set while scanning and cleared again
         before returning, and buffer->read_ahead is reset to EOF on exit.
         Newlines inside the comment go through handle_newline and tabs
         through adjust_column.  */
  314 static int
  315 skip_block_comment (pfile)
  316      cpp_reader *pfile;
  317 {
  318   cpp_buffer *buffer = pfile->buffer;
  319   cppchar_t c = EOF, prevc = EOF;
  320
  321   pfile->state.lexing_comment = 1;
  322   while (buffer->cur != buffer->rlimit)
  323     {
  324       prevc = c, c = *buffer->cur++;
  325
            /* Re-entered by goto whenever C has already been fetched.  */
  326     next_char:
  327       /* FIXME: For speed, create a new character class of characters
  328          of interest inside block comments.  */
  329       if (c == '?' || c == '\\')
  330         c = skip_escaped_newlines (pfile, c);
  331
  332       /* People like decorating comments with '*', so check for '/'
  333          instead for efficiency.  */
  334       if (c == '/')
  335         {
                /* An asterisk immediately before the slash closes the
                   comment.  */
  336           if (prevc == '*')
  337             break;
  338
  339           /* Warn about potential nested comments, but not if the '/'
  340              comes immediately before the true comment delimiter.
  341              Don't bother to get it right across escaped newlines.  */
  342           if (CPP_OPTION (pfile, warn_comments)
  343               && buffer->cur != buffer->rlimit)
  344             {
  345               prevc = c, c = *buffer->cur++;
  346               if (c == '*' && buffer->cur != buffer->rlimit)
  347                 {
  348                   prevc = c, c = *buffer->cur++;
  349                   if (c != '/')
  350                     cpp_warning_with_line (pfile, pfile->line,
  351                                            CPP_BUF_COL (buffer) - 2,
  352                                            "\"/*\" within comment");
  353                 }
                    /* C was read ahead above; process it without
                       fetching another character.  */
  354               goto next_char;
  355             }
  356         }
  357       else if (is_vspace (c))
  358         {
                /* handle_newline returns the character after the line
                   ending, which still has to be examined.  */
  359           prevc = c, c = handle_newline (pfile, c);
  360           goto next_char;
  361         }
  362       else if (c == '\t')
  363         adjust_column (pfile);
  364     }
  365
  366   pfile->state.lexing_comment = 0;
  367   buffer->read_ahead = EOF;
        /* Zero only when the last two characters seen were '*' and '/'.  */
  368   return c != '/' || prevc != '*';
  369 }
 370
 371 /* Skip a C++ line comment.  Handles escaped newlines.  Returns
 372    non-zero if a multiline comment.  The following new line, if any,
 373    is left in buffer->read_ahead.  */
 374 static int
 375 skip_line_comment (pfile)
 376      cpp_reader *pfile;
 377 {
 378   cpp_buffer *buffer = pfile->buffer;
 379   unsigned int orig_line = pfile->line;
 380   cppchar_t c;
 381
 382   pfile->state.lexing_comment = 1;
 383   do
 384     {
 385       c = EOF;
 386       if (buffer->cur == buffer->rlimit)
 387         break;
 388
 389       c = *buffer->cur++;
 390       if (c == '?' || c == '\\')
 391         c = skip_escaped_newlines (pfile, c);
 392     }
 393   while (!is_vspace (c));
 394
 395   pfile->state.lexing_comment = 0;
 396   buffer->read_ahead = c;       /* Leave any newline for caller.  */
 397   return orig_line != pfile->line;
 398 }
 399
 400 /* pfile->buffer->cur is one beyond the \t character.  Update
 401    col_adjust so we track the column correctly.  */
 402 static void
 403 adjust_column (pfile)
 404      cpp_reader *pfile;
 405 {
 406   cpp_buffer *buffer = pfile->buffer;
 407   unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column.  */
 408
 409   /* Round it up to multiple of the tabstop, but subtract 1 since the
 410      tab itself occupies a character position.  */
 411   buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
 412                          - col % CPP_OPTION (pfile, tabstop)) - 1;
 413 }
 414
 415 /* Skips whitespace, saving the next non-whitespace character.
 416    Adjusts pfile->col_adjust to account for tabs.  Without this,
 417    tokens might be assigned an incorrect column.  */
 418 static void
 419 skip_whitespace (pfile, c)
 420      cpp_reader *pfile;
 421      cppchar_t c;
 422 {
 423   cpp_buffer *buffer = pfile->buffer;
 424   unsigned int warned = 0;
 425
 426   do
 427     {
 428       /* Horizontal space always OK.  */
 429       if (c == ' ')
 430         ;
 431       else if (c == '\t')
 432         adjust_column (pfile);
 433       /* Just \f \v or \0 left.  */
 434       else if (c == '\0')
 435         {
 436           if (!warned)
 437             {
 438               cpp_warning (pfile, "null character(s) ignored");
 439               warned = 1;
 440             }
 441         }
 442       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 443         cpp_pedwarn_with_line (pfile, pfile->line,
 444                                CPP_BUF_COL (buffer),
 445                                "%s in preprocessing directive",
 446                                c == '\f' ? "form feed" : "vertical tab");
 447
 448       c = EOF;
 449       if (buffer->cur == buffer->rlimit)
 450         break;
 451       c = *buffer->cur++;
 452     }
 453   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 454   while (is_nvspace (c));
 455
 456   /* Remember the next character.  */
 457   buffer->read_ahead = c;
 458 }
 459
 460 /* See if the characters of a number token are valid in a name (no
 461    '.', '+' or '-').  */
 462 static int
 463 name_p (pfile, string)
 464      cpp_reader *pfile;
 465      const cpp_string *string;
 466 {
 467   unsigned int i;
 468
 469   for (i = 0; i < string->len; i++)
 470     if (!is_idchar (string->text[i]))
 471       return 0;
 472
 473   return 1;
 474 }
 475
 476 /* Parse an identifier, skipping embedded backslash-newlines.  This is
 477    a critical inner loop.  The common case is an identifier which has
 478    not been split by backslash-newline, does not contain a dollar
 479    sign, and has already been scanned (roughly 10:1 ratio of
 480    seen:unseen identifiers in normal code; the distribution is
 481    Poisson-like).  Second most common case is a new identifier, not
 482    split and no dollar sign.  The other possibilities are rare and
 483    have been relegated to parse_identifier_slow.  */
 484
 485 static cpp_hashnode *
 486 parse_identifier (pfile)
 487      cpp_reader *pfile;
 488 {
 489   cpp_hashnode *result;
 490   const U_CHAR *cur, *rlimit;
 491
 492   /* Fast-path loop.  Skim over a normal identifier.
 493      N.B. ISIDNUM does not include $.  */
 494   cur    = pfile->buffer->cur - 1;
 495   rlimit = pfile->buffer->rlimit;
 496   do
 497     cur++;
 498   while (cur < rlimit && ISIDNUM (*cur));
 499
 500   /* Check for slow-path cases.  */
 501   if (cur < rlimit && (*cur == '?' || *cur == '\\' || *cur == '$'))
 502     result = parse_identifier_slow (pfile, cur);
 503   else
 504     {
 505       const U_CHAR *base = pfile->buffer->cur - 1;
 506       result = (cpp_hashnode *)
 507         ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
 508       pfile->buffer->cur = cur;
 509     }
 510
 511   /* Rarely, identifiers require diagnostics when lexed.
 512      XXX Has to be forced out of the fast path.  */
 513   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 514                         && !pfile->state.skipping, 0))
 515     {
 516       /* It is allowed to poison the same identifier twice.  */
 517       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 518         cpp_error (pfile, "attempt to use poisoned \"%s\"",
 519                    NODE_NAME (result));
 520
 521       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 522          replacement list of a variadic macro.  */
 523       if (result == pfile->spec_nodes.n__VA_ARGS__
 524           && !pfile->state.va_args_ok)
 525         cpp_pedwarn (pfile,
 526         "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
 527     }
 528
 529   return result;
 530 }
 531
  532 /* Slow path.  This handles identifiers which have been split, and
  533    identifiers which contain dollar signs.  The part of the identifier
  534    from PFILE->buffer->cur-1 to CUR has already been scanned.

         The spelling is accumulated on the hash table's obstack, NUL
         terminated, and handed to ht_lookup with HT_ALLOCED; the
         resulting node is returned.  The first character that is not
         part of the identifier (or EOF) is left in buffer->read_ahead.  */
  535 static cpp_hashnode *
  536 parse_identifier_slow (pfile, cur)
  537      cpp_reader *pfile;
  538      const U_CHAR *cur;
  539 {
  540   cpp_buffer *buffer = pfile->buffer;
  541   const U_CHAR *base = buffer->cur - 1;
  542   struct obstack *stack = &pfile->hash_table->stack;
  543   unsigned int c, saw_dollar = 0, len;
  544
  545   /* Copy the part of the token which is known to be okay.  */
  546   obstack_grow (stack, base, cur - base);
  547
  548   /* Now process the part which isn't.  We are looking at one of
  549      '$', '\\', or '?' on entry to this loop.  */
  550   c = *cur++;
  551   buffer->cur = cur;
  552   do
  553     {
            /* Append identifier characters until one fails is_idchar or
               the buffer runs out (C is then EOF).  */
  554       while (is_idchar (c))
  555         {
  556           obstack_1grow (stack, c);
  557
  558           if (c == '$')
  559             saw_dollar++;
  560
  561           c = EOF;
  562           if (buffer->cur == buffer->rlimit)
  563             break;
  564
  565           c = *buffer->cur++;
  566         }
  567
  568       /* Potential escaped newline?  */
  569       if (c != '?' && c != '\\')
  570         break;
            /* Carry on only if what follows the escaped newline (or
               trigraph) is again an identifier character.  */
  571       c = skip_escaped_newlines (pfile, c);
  572     }
  573   while (is_idchar (c));
  574
  575   /* Remember the next character.  */
  576   buffer->read_ahead = c;
  577
  578   /* $ is not a identifier character in the standard, but is commonly
  579      accepted as an extension.  Don't warn about it in skipped
  580      conditional blocks.  */
  581   if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
  582     cpp_pedwarn (pfile, "'$' character(s) in identifier");
  583
  584   /* Identifiers are null-terminated.  */
        /* LEN is taken before the NUL is appended, so it excludes it.  */
  585   len = obstack_object_size (stack);
  586   obstack_1grow (stack, '\0');
  587
  588   return (cpp_hashnode *)
  589     ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
  590 }
 591
  592 /* Parse a number, skipping embedded backslash-newlines.

         C is the first character of the number and has already been
         read.  If LEADING_PERIOD is non-zero a '.' is stored in front of
         it.  The NUL-terminated spelling is built in pfile->ident_pool;
         on return NUMBER->text points at it and NUMBER->len is its
         length excluding the NUL.  The character that ended the number
         (or EOF) is left in buffer->read_ahead.  */
  593 static void
  594 parse_number (pfile, number, c, leading_period)
  595      cpp_reader *pfile;
  596      cpp_string *number;
  597      cppchar_t c;
  598      int leading_period;
  599 {
  600   cpp_buffer *buffer = pfile->buffer;
  601   cpp_pool *pool = &pfile->ident_pool;
  602   unsigned char *dest, *limit;
  603
  604   dest = POOL_FRONT (pool);
  605   limit = POOL_LIMIT (pool);
  606
  607   /* Place a leading period.  */
  608   if (leading_period)
  609     {
            /* NOTE(review): _cpp_next_chunk presumably supplies more room
               and updates DEST through the pointer -- confirm against
               its definition.  */
  610       if (dest >= limit)
  611         limit = _cpp_next_chunk (pool, 0, &dest);
  612       *dest++ = '.';
  613     }
  614
  615   do
  616     {
  617       do
  618         {
  619           /* Need room for terminating null.  */
  620           if (dest + 1 >= limit)
  621             limit = _cpp_next_chunk (pool, 0, &dest);
  622           *dest++ = c;
  623
  624           c = EOF;
  625           if (buffer->cur == buffer->rlimit)
  626             break;
  627
  628           c = *buffer->cur++;
  629         }
            /* VALID_SIGN is given the candidate character and the
               character stored just before it.  */
  630       while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
  631
  632       /* Potential escaped newline?  */
  633       if (c != '?' && c != '\\')
  634         break;
  635       c = skip_escaped_newlines (pfile, c);
  636     }
  637   while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
  638
  639   /* Remember the next character.  */
  640   buffer->read_ahead = c;
  641
  642   /* Null-terminate the number.  */
  643   *dest = '\0';
  644
  645   number->text = POOL_FRONT (pool);
  646   number->len = dest - number->text;
        /* Commit the spelling together with its terminating NUL.  */
  647   POOL_COMMIT (pool, number->len + 1);
  648 }
 649
 650 /* Subroutine of parse_string.  Emits error for unterminated strings.  */
 651 static void
 652 unterminated (pfile, term)
 653      cpp_reader *pfile;
 654      int term;
 655 {
 656   cpp_error (pfile, "missing terminating %c character", term);
 657
 658   if (term == '\"' && pfile->mls_line && pfile->mls_line != pfile->line)
 659     {
 660       cpp_error_with_line (pfile, pfile->mls_line, pfile->mls_col,
 661                            "possible start of unterminated string literal");
 662       pfile->mls_line = 0;
 663     }
 664 }
 665
 666 /* Subroutine of parse_string.  */
 667 static int
 668 unescaped_terminator_p (pfile, dest)
 669      cpp_reader *pfile;
 670      const unsigned char *dest;
 671 {
 672   const unsigned char *start, *temp;
 673
 674   /* In #include-style directives, terminators are not escapeable.  */
 675   if (pfile->state.angled_headers)
 676     return 1;
 677
 678   start = POOL_FRONT (&pfile->ident_pool);
 679
 680   /* An odd number of consecutive backslashes represents an escaped
 681      terminator.  */
 682   for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
 683     ;
 684
 685   return ((dest - temp) & 1) == 0;
 686 }
 687
  688 /* Parses a string, character constant, or angle-bracketed header file
  689    name.  Handles embedded trigraphs and escaped newlines.  The stored
  690    string is guaranteed NUL-terminated, but it is not guaranteed that
  691    this is the first NUL since embedded NULs are preserved.
  692
  693    Multi-line strings are allowed, but they are deprecated.

         TERMINATOR is the closing delimiter to look for.  The contents,
         without the terminator, are built in pfile->ident_pool and
         described by TOKEN->val.str (text, and len excluding the final
         NUL).  buffer->read_ahead is EOF after a properly terminated
         literal or at end of buffer, and holds the newline character
         when the literal was cut short at the end of a line.  */
  694 static void
  695 parse_string (pfile, token, terminator)
  696      cpp_reader *pfile;
  697      cpp_token *token;
  698      cppchar_t terminator;
  699 {
  700   cpp_buffer *buffer = pfile->buffer;
  701   cpp_pool *pool = &pfile->ident_pool;
  702   unsigned char *dest, *limit;
  703   cppchar_t c;
  704   bool warned_nulls = false, warned_multi = false;
  705
  706   dest = POOL_FRONT (pool);
  707   limit = POOL_LIMIT (pool);
  708
  709   for (;;)
  710     {
  711       if (buffer->cur == buffer->rlimit)
  712         c = EOF;
  713       else
  714         c = *buffer->cur++;
  715
            /* Re-entered by goto when C has already been fetched.  */
  716     have_char:
  717       /* We need space for the terminating NUL.  */
  718       if (dest >= limit)
  719         limit = _cpp_next_chunk (pool, 0, &dest);
  720
  721       if (c == EOF)
  722         {
  723           unterminated (pfile, terminator);
  724           break;
  725         }
  726
  727       /* Handle trigraphs, escaped newlines etc.  */
  728       if (c == '?' || c == '\\')
  729         c = skip_escaped_newlines (pfile, c);
  730
  731       if (c == terminator && unescaped_terminator_p (pfile, dest))
  732         {
                /* Proper end of the literal: the terminator is consumed
                   and not stored.  */
  733           c = EOF;
  734           break;
  735         }
  736       else if (is_vspace (c))
  737         {
  738           /* In assembly language, silently terminate string and
  739              character literals at end of line.  This is a kludge
  740              around not knowing where comments are.  */
  741           if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
  742             break;
  743
  744           /* Character constants and header names may not extend over
  745              multiple lines.  In Standard C, neither may strings.
  746              Unfortunately, we accept multiline strings as an
  747              extension, except in #include family directives.  */
  748           if (terminator != '"' || pfile->state.angled_headers)
  749             {
  750               unterminated (pfile, terminator);
  751               break;
  752             }
  753
  754           if (!warned_multi)
  755             {
  756               warned_multi = true;
  757               cpp_pedwarn (pfile, "multi-line string literals are deprecated");
  758             }
  759
                /* Record where the first multi-line string began, for
                   use by unterminated.  */
  760           if (pfile->mls_line == 0)
  761             {
  762               pfile->mls_line = token->line;
  763               pfile->mls_col = token->col;
  764             }
  765
                /* Whatever the line ending was, a single '\n' is stored;
                   the character after it is already in C.  */
  766           c = handle_newline (pfile, c);
  767           *dest++ = '\n';
  768           goto have_char;
  769         }
  770       else if (c == '\0' && !warned_nulls)
  771         {
  772           warned_nulls = true;
  773           cpp_warning (pfile, "null character(s) preserved in literal");
  774         }
  775
  776       *dest++ = c;
  777     }
  778
  779   /* Remember the next character.  */
  780   buffer->read_ahead = c;
  781   *dest = '\0';
  782
  783   token->val.str.text = POOL_FRONT (pool);
  784   token->val.str.len = dest - token->val.str.text;
        /* Commit the contents together with the terminating NUL.  */
  785   POOL_COMMIT (pool, token->val.str.len + 1);
  786 }
 787
 788 /* The stored comment includes the comment start and any terminator.  */
 789 static void
 790 save_comment (pfile, token, from)
 791      cpp_reader *pfile;
 792      cpp_token *token;
 793      const unsigned char *from;
 794 {
 795   unsigned char *buffer;
 796   unsigned int len;
 797
 798   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
 799   /* C++ comments probably (not definitely) have moved past a new
 800      line, which we don't want to save in the comment.  */
 801   if (pfile->buffer->read_ahead != EOF)
 802     len--;
 803   buffer = _cpp_pool_alloc (&pfile->ident_pool, len);
 804
 805   token->type = CPP_COMMENT;
 806   token->val.str.len = len;
 807   token->val.str.text = buffer;
 808
 809   buffer[0] = '/';
 810   memcpy (buffer + 1, from, len - 1);
 811 }
 812
  813 /* Subroutine of _cpp_lex_direct to handle '%'.  A little tricky, since we
  814    want to avoid stepping back when lexing %:%X.

         RESULT's type starts as CPP_MOD.  "%=" gives CPP_MOD_EQ.  With
         the digraphs option on, "%:" gives CPP_HASH and "%>" gives
         CPP_CLOSE_BRACE, both flagged DIGRAPH, and "%:%:" gives
         CPP_PASTE.  For "%:%X" the '%' is put back in buffer->read_ahead
         and X is parked in buffer->extra_char for the next call.  */
  815 static void
  816 lex_percent (pfile, result)
  817      cpp_reader *pfile;
  818      cpp_token *result;
  819 {
  820   cpp_buffer *buffer= pfile->buffer;
  821   cppchar_t c;
  822
  823   result->type = CPP_MOD;
  824   /* Parsing %:%X could leave an extra character.  */
  825   if (buffer->extra_char == EOF)
  826     c = get_effective_char (pfile);
  827   else
  828     {
            /* Use the parked character instead of reading a new one.  */
  829       c = buffer->read_ahead = buffer->extra_char;
  830       buffer->extra_char = EOF;
  831     }
  832
  833   if (c == '=')
  834     ACCEPT_CHAR (CPP_MOD_EQ);
  835   else if (CPP_OPTION (pfile, digraphs))
  836     {
  837       if (c == ':')
  838         {
  839           result->flags |= DIGRAPH;
  840           ACCEPT_CHAR (CPP_HASH);
                /* Look for a second "%:" to form the paste digraph.  */
  841           if (get_effective_char (pfile) == '%')
  842             {
  843               buffer->extra_char = get_effective_char (pfile);
  844               if (buffer->extra_char == ':')
  845                 {
  846                   buffer->extra_char = EOF;
  847                   ACCEPT_CHAR (CPP_PASTE);
  848                 }
  849               else
  850                 /* We'll catch the extra_char when we're called back.  */
  851                 buffer->read_ahead = '%';
  852             }
  853         }
  854       else if (c == '>')
  855         {
  856           result->flags |= DIGRAPH;
  857           ACCEPT_CHAR (CPP_CLOSE_BRACE);
  858         }
  859     }
  860 }
 861
 862 /* Subroutine of _cpp_lex_direct to handle '.'.  This is tricky, since we
 863    want to avoid stepping back when lexing '...' or '.123'.  In the
 864    latter case we should also set a flag for parse_number.  */
 865 static void
 866 lex_dot (pfile, result)
 867      cpp_reader *pfile;
 868      cpp_token *result;
 869 {
 870   cpp_buffer *buffer = pfile->buffer;
 871   cppchar_t c;
 872
 873   /* Parsing ..X could leave an extra character.  */
 874   if (buffer->extra_char == EOF)
 875     c = get_effective_char (pfile);
 876   else
 877     {
 878       c = buffer->read_ahead = buffer->extra_char;
 879       buffer->extra_char = EOF;
 880     }
 881
 882   /* All known character sets have 0...9 contiguous.  */
 883   if (c >= '0' && c <= '9')
 884     {
 885       result->type = CPP_NUMBER;
 886       parse_number (pfile, &result->val.str, c, 1);
 887     }
 888   else
 889     {
 890       result->type = CPP_DOT;
 891       if (c == '.')
 892         {
 893           buffer->extra_char = get_effective_char (pfile);
 894           if (buffer->extra_char == '.')
 895             {
 896               buffer->extra_char = EOF;
 897               ACCEPT_CHAR (CPP_ELLIPSIS);
 898             }
 899           else
 900             /* We'll catch the extra_char when we're called back.  */
 901             buffer->read_ahead = '.';
 902         }
 903       else if (c == '*' && CPP_OPTION (pfile, cplusplus))
 904         ACCEPT_CHAR (CPP_DOT_STAR);
 905     }
 906 }
 907
 908 /* Allocate COUNT tokens for RUN.  RUN->base is set to a vector of
        COUNT cpp_token objects obtained with xnewvec, RUN->limit to one
        past the last of them, and RUN->next to NULL.  RUN->prev is not
        touched here.  */
 909 void
 910 _cpp_init_tokenrun (run, count)
 911      tokenrun *run;
 912      unsigned int count;
 913 {
 914   run->base = xnewvec (cpp_token, count);
 915   run->limit = run->base + count;
 916   run->next = NULL;
 917 }
 918
 919 /* Returns the next tokenrun, or creates one if there is none.  A
        newly created run is linked back to RUN through its prev field
        and is given room for 250 tokens.  */
 920 static tokenrun *
 921 next_tokenrun (run)
 922      tokenrun *run;
 923 {
 924   if (run->next == NULL)
 925     {
 926       run->next = xnew (tokenrun);
 927       run->next->prev = run;
           /* This also sets the new run's next pointer to NULL.  */
 928       _cpp_init_tokenrun (run->next, 250);
 929     }
 930
 931   return run->next;
 932 }
 933
 934 /* Lex a token (external interface) and return a pointer to it.  Takes
 935    care of issues like directive handling, token lookahead, multiple
 936    include optimisation and skipping.

        The loop repeats until it has a token to hand back: a directive
        that _cpp_handle_directive accepts is consumed and lexing
        continues, and while pfile->state.skipping is set every token
        other than CPP_EOF is discarded, unless we are inside a
        directive.  */
 937 const cpp_token *
 938 _cpp_lex_token (pfile)
 939      cpp_reader *pfile;
 940 {
 941   cpp_token *result;
 942
 943   for (;;)
 944     {
           /* The current run is full: move to the next run, which
              next_tokenrun creates on demand, and continue at its
              base.  */
 945       if (pfile->cur_token == pfile->cur_run->limit)
 946         {
 947           pfile->cur_run = next_tokenrun (pfile->cur_run);
 948           pfile->cur_token = pfile->cur_run->base;
 949         }
 950
           /* With lookaheads outstanding, take the token already stored
              at cur_token instead of lexing a new one.  */
 951       if (pfile->lookaheads)
 952         {
 953           pfile->lookaheads--;
 954           result = pfile->cur_token++;
 955         }
 956       else
 957         result = _cpp_lex_direct (pfile);
 958
 959       if (result->flags & BOL)
 960         {
 961           /* Is this a directive?  If _cpp_handle_directive returns
 962              false, it is an assembler #.  */
 963           if (result->type == CPP_HASH
 964               && !pfile->state.parsing_args
 965               && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
 966             continue;
               /* Tell the client about the new line, but not while
                  skipping.  */
 967           if (pfile->cb.line_change && !pfile->state.skipping)
 968             (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
 969         }
 970
 971       /* We don't skip tokens in directives.  */
 972       if (pfile->state.in_directive)
 973         break;
 974
 975       /* Outside a directive, invalidate controlling macros.  At file
 976          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
 977          get here and MI optimisation works.  */
 978       pfile->mi_valid = false;
 979
 980       if (!pfile->state.skipping || result->type == CPP_EOF)
 981         break;
 982     }
 983
 984   return result;
 985 }
 986
 987 /* Lex a token into pfile->cur_token, which is also incremented, to
 988    get diagnostics pointing to the correct location.
 989
 990    Does not handle issues such as token lookahead, multiple-include
 991    optimisation, directives, skipping etc.  This function is only
 992    suitable for use by _cpp_lex_token, and in special cases like
 993    lex_expansion_token which doesn't care for any of these issues.
 994
 995    When meeting a newline, returns CPP_EOF if parsing a directive,
 996    otherwise returns to the start of the token buffer if permissible.
 997    Returns the location of the lexed token.

        The body is a single switch on the first character of the token.
        The labels fresh_line, update_tokens_line, skipped_white and
        trigraph are re-entry points used to restart lexing after a
        newline, a comment or escaped newline, whitespace, and a
        converted trigraph respectively.  */
 998 cpp_token *
 999 _cpp_lex_direct (pfile)
1000      cpp_reader *pfile;
1001 {
1002   cppchar_t c;
1003   cpp_buffer *buffer;
1004   const unsigned char *comment_start;
1005   cpp_token *result = pfile->cur_token++;
1006
       /* Re-read pfile->buffer here because the EOF case below may have
          popped a buffer.  The first token takes the flags saved by the
          previous newline or EOF.  */
1007  fresh_line:
1008   buffer = pfile->buffer;
1009   result->flags = buffer->saved_flags;
1010   buffer->saved_flags = 0;
1011  update_tokens_line:
1012   result->line = pfile->line;
1013
       /* Fetch the first character: a pending read_ahead if there is
          one, else the next byte of the buffer.  If the buffer is
          exhausted C stays EOF.  */
1014  skipped_white:
1015   c = buffer->read_ahead;
1016   if (c == EOF && buffer->cur < buffer->rlimit)
1017     c = *buffer->cur++;
1018   result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
1019   buffer->read_ahead = EOF;
1020
1021  trigraph:
1022   switch (c)
1023     {
1024     case EOF:
1025       buffer->saved_flags = BOL;
1026       if (!pfile->state.parsing_args && !pfile->state.in_directive)
1027         {
1028           if (buffer->cur != buffer->line_base)
1029             {
1030               /* Non-empty files should end in a newline.  Don't warn
1031                  for command line and _Pragma buffers.  */
1032               if (!buffer->from_stage3)
1033                 cpp_pedwarn (pfile, "no newline at end of file");
1034               handle_newline (pfile, '\n');
1035             }
1036
1037           /* Don't pop the last buffer.  */
1038           if (buffer->prev)
1039             {
                   /* Read return_at_eof before the pop, since BUFFER
                      refers to the buffer being popped.  */
1040               unsigned char stop = buffer->return_at_eof;
1041
1042               _cpp_pop_buffer (pfile);
1043               if (!stop)
1044                 goto fresh_line;
1045             }
1046         }
1047       result->type = CPP_EOF;
1048       break;
1049
1050     case ' ': case '\t': case '\f': case '\v': case '\0':
1051       skip_whitespace (pfile, c);
1052       result->flags |= PREV_WHITE;
1053       goto skipped_white;
1054
1055     case '\n': case '\r':
1056       handle_newline (pfile, c);
1057       buffer->saved_flags = BOL;
1058       if (! pfile->state.in_directive)
1059         {
               /* Unless tokens must be kept, start again from the first
                  slot of the base run.  */
1060           if (!pfile->keep_tokens)
1061             {
1062               pfile->cur_run = &pfile->base_run;
1063               result = pfile->base_run.base;
1064               pfile->cur_token = result + 1;
1065             }
1066           goto fresh_line;
1067         }
           /* Inside a directive the newline ends it.  */
1068       result->type = CPP_EOF;
1069       break;
1070
1071     case '?':
1072     case '\\':
1073       /* These could start an escaped newline, or '?' a trigraph.  Let
1074          skip_escaped_newlines do all the work.  */
1075       {
1076         unsigned int line = pfile->line;
1077
1078         c = skip_escaped_newlines (pfile, c);
1079         if (line != pfile->line)
1080           /* We had at least one escaped newline of some sort, and the
1081              next character is in buffer->read_ahead.  Update the
1082              token's line and column.  */
1083             goto update_tokens_line;
1084
1085         /* We are either the original '?' or '\\', or a trigraph.  */
1086         result->type = CPP_QUERY;
1087         buffer->read_ahead = EOF;
1088         if (c == '\\')
1089           goto random_char;
1090         else if (c != '?')
1091           goto trigraph;
1092       }
1093       break;
1094
1095     case '0': case '1': case '2': case '3': case '4':
1096     case '5': case '6': case '7': case '8': case '9':
1097       result->type = CPP_NUMBER;
1098       parse_number (pfile, &result->val.str, c, 0);
1099       break;
1100
1101     case '$':
1102       if (!CPP_OPTION (pfile, dollars_in_ident))
1103         goto random_char;
1104       /* Fall through...  */
1105
1106     case '_':
1107     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1108     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1109     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1110     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1111     case 'y': case 'z':
1112     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1113     case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1114     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1115     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1116     case 'Y': case 'Z':
1117       result->type = CPP_NAME;
1118       result->val.node = parse_identifier (pfile);
1119
1120       /* 'L' may introduce wide characters or strings.  */
1121       if (result->val.node == pfile->spec_nodes.n_L)
1122         {
               /* Peek at the following character without consuming it;
                  buffer->cur is advanced only once a quote is seen.  */
1123           c = buffer->read_ahead;
1124           if (c == EOF && buffer->cur < buffer->rlimit)
1125             c = *buffer->cur;
1126           if (c == '\'' || c == '"')
1127             {
1128               buffer->cur++;
1129               ACCEPT_CHAR (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1130               goto make_string;
1131             }
1132         }
1133       /* Convert named operators to their proper types.  */
1134       else if (result->val.node->flags & NODE_OPERATOR)
1135         {
               /* val.node is left in place, so the spelling of the
                  identifier remains available alongside the operator
                  type.  */
1136           result->flags |= NAMED_OP;
1137           result->type = result->val.node->value.operator;
1138         }
1139       break;
1140
1141     case '\'':
1142     case '"':
1143       result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1144     make_string:
           /* C is passed to parse_string as the terminating character.  */
1145       parse_string (pfile, result, c);
1146       break;
1147
1148     case '/':
1149       /* A potential block or line comment.  */
1150       comment_start = buffer->cur;
1151       result->type = CPP_DIV;
1152       c = get_effective_char (pfile);
1153       if (c == '=')
1154         ACCEPT_CHAR (CPP_DIV_EQ);
           /* Anything other than a comment opener ends the token here,
              including the divide-assign case just accepted.  */
1155       if (c != '/' && c != '*')
1156         break;
1157
1158       if (c == '*')
1159         {
1160           if (skip_block_comment (pfile))
1161             cpp_error (pfile, "unterminated comment");
1162         }
1163       else
1164         {
               /* With line comments disabled and outside system headers,
                  return a plain CPP_DIV and leave the second slash in
                  read_ahead.  */
1165           if (!CPP_OPTION (pfile, cplusplus_comments)
1166               && !CPP_IN_SYSTEM_HEADER (pfile))
1167             break;
1168
1169           /* Warn about comments only if pedantically GNUC89, and not
1170              in system headers.  */
1171           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1172               && ! buffer->warned_cplusplus_comments)
1173             {
1174               cpp_pedwarn (pfile,
1175                            "C++ style comments are not allowed in ISO C89");
1176               cpp_pedwarn (pfile,
1177                            "(this will be reported only once per input file)");
1178               buffer->warned_cplusplus_comments = 1;
1179             }
1180
1181           /* Skip_line_comment updates buffer->read_ahead.  */
1182           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1183             cpp_warning (pfile, "multi-line comment");
1184         }
1185
1186       /* Skipping the comment has updated buffer->read_ahead.  */
1187       if (!pfile->state.save_comments)
1188         {
               /* A discarded comment counts as whitespace before the
                  next token.  */
1189           result->flags |= PREV_WHITE;
1190           goto update_tokens_line;
1191         }
1192
1193       /* Save the comment as a token in its own right.  */
1194       save_comment (pfile, result, comment_start);
1195       /* Don't do MI optimisation.  */
1196       break;
1197
1198     case '<':
1199       if (pfile->state.angled_headers)
1200         {
1201           result->type = CPP_HEADER_NAME;
1202           c = '>';              /* terminator.  */
1203           goto make_string;
1204         }
1205
1206       result->type = CPP_LESS;
1207       c = get_effective_char (pfile);
1208       if (c == '=')
1209         ACCEPT_CHAR (CPP_LESS_EQ);
1210       else if (c == '<')
1211         {
1212           ACCEPT_CHAR (CPP_LSHIFT);
1213           if (get_effective_char (pfile) == '=')
1214             ACCEPT_CHAR (CPP_LSHIFT_EQ);
1215         }
1216       else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1217         {
1218           ACCEPT_CHAR (CPP_MIN);
1219           if (get_effective_char (pfile) == '=')
1220             ACCEPT_CHAR (CPP_MIN_EQ);
1221         }
1222       else if (c == ':' && CPP_OPTION (pfile, digraphs))
1223         {
1224           ACCEPT_CHAR (CPP_OPEN_SQUARE);
1225           result->flags |= DIGRAPH;
1226         }
1227       else if (c == '%' && CPP_OPTION (pfile, digraphs))
1228         {
1229           ACCEPT_CHAR (CPP_OPEN_BRACE);
1230           result->flags |= DIGRAPH;
1231         }
1232       break;
1233
1234     case '>':
1235       result->type = CPP_GREATER;
1236       c = get_effective_char (pfile);
1237       if (c == '=')
1238         ACCEPT_CHAR (CPP_GREATER_EQ);
1239       else if (c == '>')
1240         {
1241           ACCEPT_CHAR (CPP_RSHIFT);
1242           if (get_effective_char (pfile) == '=')
1243             ACCEPT_CHAR (CPP_RSHIFT_EQ);
1244         }
1245       else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1246         {
1247           ACCEPT_CHAR (CPP_MAX);
1248           if (get_effective_char (pfile) == '=')
1249             ACCEPT_CHAR (CPP_MAX_EQ);
1250         }
1251       break;
1252
1253     case '%':
1254       lex_percent (pfile, result);
1255       break;
1256
1257     case '.':
1258       lex_dot (pfile, result);
1259       break;
1260
1261     case '+':
1262       result->type = CPP_PLUS;
1263       c = get_effective_char (pfile);
1264       if (c == '=')
1265         ACCEPT_CHAR (CPP_PLUS_EQ);
1266       else if (c == '+')
1267         ACCEPT_CHAR (CPP_PLUS_PLUS);
1268       break;
1269
1270     case '-':
1271       result->type = CPP_MINUS;
1272       c = get_effective_char (pfile);
1273       if (c == '>')
1274         {
1275           ACCEPT_CHAR (CPP_DEREF);
1276           if (CPP_OPTION (pfile, cplusplus)
1277               && get_effective_char (pfile) == '*')
1278             ACCEPT_CHAR (CPP_DEREF_STAR);
1279         }
1280       else if (c == '=')
1281         ACCEPT_CHAR (CPP_MINUS_EQ);
1282       else if (c == '-')
1283         ACCEPT_CHAR (CPP_MINUS_MINUS);
1284       break;
1285
1286     case '*':
1287       result->type = CPP_MULT;
1288       if (get_effective_char (pfile) == '=')
1289         ACCEPT_CHAR (CPP_MULT_EQ);
1290       break;
1291
1292     case '=':
1293       result->type = CPP_EQ;
1294       if (get_effective_char (pfile) == '=')
1295         ACCEPT_CHAR (CPP_EQ_EQ);
1296       break;
1297
1298     case '!':
1299       result->type = CPP_NOT;
1300       if (get_effective_char (pfile) == '=')
1301         ACCEPT_CHAR (CPP_NOT_EQ);
1302       break;
1303
1304     case '&':
1305       result->type = CPP_AND;
1306       c = get_effective_char (pfile);
1307       if (c == '=')
1308         ACCEPT_CHAR (CPP_AND_EQ);
1309       else if (c == '&')
1310         ACCEPT_CHAR (CPP_AND_AND);
1311       break;
1312
1313     case '#':
1314       result->type = CPP_HASH;
1315       if (get_effective_char (pfile) == '#')
1316           ACCEPT_CHAR (CPP_PASTE);
1317       break;
1318
1319     case '|':
1320       result->type = CPP_OR;
1321       c = get_effective_char (pfile);
1322       if (c == '=')
1323         ACCEPT_CHAR (CPP_OR_EQ);
1324       else if (c == '|')
1325         ACCEPT_CHAR (CPP_OR_OR);
1326       break;
1327
1328     case '^':
1329       result->type = CPP_XOR;
1330       if (get_effective_char (pfile) == '=')
1331         ACCEPT_CHAR (CPP_XOR_EQ);
1332       break;
1333
1334     case ':':
1335       result->type = CPP_COLON;
1336       c = get_effective_char (pfile);
1337       if (c == ':' && CPP_OPTION (pfile, cplusplus))
1338         ACCEPT_CHAR (CPP_SCOPE);
1339       else if (c == '>' && CPP_OPTION (pfile, digraphs))
1340         {
1341           result->flags |= DIGRAPH;
1342           ACCEPT_CHAR (CPP_CLOSE_SQUARE);
1343         }
1344       break;
1345
1346     case '~': result->type = CPP_COMPL; break;
1347     case ',': result->type = CPP_COMMA; break;
1348     case '(': result->type = CPP_OPEN_PAREN; break;
1349     case ')': result->type = CPP_CLOSE_PAREN; break;
1350     case '[': result->type = CPP_OPEN_SQUARE; break;
1351     case ']': result->type = CPP_CLOSE_SQUARE; break;
1352     case '{': result->type = CPP_OPEN_BRACE; break;
1353     case '}': result->type = CPP_CLOSE_BRACE; break;
1354     case ';': result->type = CPP_SEMICOLON; break;
1355
1356       /* @ is a punctuator in Objective C.  */
1357     case '@': result->type = CPP_ATSIGN; break;
1358
           /* Any other character becomes a CPP_OTHER token carrying the
              character itself.  Also reached by goto for a stray
              backslash and for a dollar sign when dollars are not
              allowed in identifiers.  */
1359     random_char:
1360     default:
1361       result->type = CPP_OTHER;
1362       result->val.c = c;
1363       break;
1364     }
1365
1366   return result;
1367 }
1368
1369 /* An upper bound on the number of bytes needed to spell a token,
1370    including preceding whitespace.

        TOKEN is the token to measure.  The result is the length of the
        variable part of its spelling (the text of a string-like token,
        the name of an identifier, or the name of a named operator) plus
        a fixed allowance of 5 bytes.  */
1371 unsigned int
1372 cpp_token_len (token)
1373      const cpp_token *token;
1374 {
1375   unsigned int len;
1376
1377   switch (TOKEN_SPELL (token))
1378     {
1379     default:            len = 0;                                break;
1380     case SPELL_STRING:  len = token->val.str.len;               break;
1381     case SPELL_IDENT:   len = NODE_LEN (token->val.node);       break;
         case SPELL_OPERATOR:
           /* cpp_spell_token spells a NAMED_OP token from its identifier
              node, and such names can be longer than the fixed
              allowance below, so count the name.  Punctuator spellings
              are short enough to be covered by the allowance alone.  */
           len = (token->flags & NAMED_OP) ? NODE_LEN (token->val.node) : 0;
           break;
1382     }
1383   /* 1 for whitespace, 4 for comment delimiters.  */
1384   return len + 5;
1385 }
1386
1387 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1388    already contain enough space to hold the spelling of the token.
1389    Returns a pointer to the character after the last character
1390    written.

        No terminating NUL is written and no leading whitespace is
        emitted.  PFILE is used only to report an internal error for a
        token that has no spelling.  */
1391 unsigned char *
1392 cpp_spell_token (pfile, token, buffer)
1393      cpp_reader *pfile;         /* Would be nice to be rid of this...  */
1394      const cpp_token *token;
1395      unsigned char *buffer;
1396 {
1397   switch (TOKEN_SPELL (token))
1398     {
1399     case SPELL_OPERATOR:
1400       {
1401         const unsigned char *spelling;
1402         unsigned char c;
1403
             /* Digraph tokens use their alternative spelling, and named
                operators are spelt as the identifier they were written
                as.  */
1404         if (token->flags & DIGRAPH)
1405           spelling
1406             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1407         else if (token->flags & NAMED_OP)
1408           goto spell_ident;
1409         else
1410           spelling = TOKEN_NAME (token);
1411
1412         while ((c = *spelling++) != '\0')
1413           *buffer++ = c;
1414       }
1415       break;
1416
1417     case SPELL_IDENT:
1418       spell_ident:
1419       memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1420       buffer += NODE_LEN (token->val.node);
1421       break;
1422
1423     case SPELL_STRING:
1424       {
             /* LEFT and RIGHT are the delimiters and TAG the optional
                wide prefix; a zero value means nothing is written for
                that position.  */
1425         int left, right, tag;
1426         switch (token->type)
1427           {
1428           case CPP_STRING:      left = '"';  right = '"';  tag = '\0'; break;
1429           case CPP_WSTRING:     left = '"';  right = '"';  tag = 'L';  break;
1430           case CPP_CHAR:        left = '\''; right = '\''; tag = '\0'; break;
1431           case CPP_WCHAR:       left = '\''; right = '\''; tag = 'L';  break;
1432           case CPP_HEADER_NAME: left = '<';  right = '>';  tag = '\0'; break;
1433           default:              left = '\0'; right = '\0'; tag = '\0'; break;
1434           }
1435         if (tag) *buffer++ = tag;
1436         if (left) *buffer++ = left;
1437         memcpy (buffer, token->val.str.text, token->val.str.len);
1438         buffer += token->val.str.len;
1439         if (right) *buffer++ = right;
1440       }
1441       break;
1442
1443     case SPELL_CHAR:
1444       *buffer++ = token->val.c;
1445       break;
1446
1447     case SPELL_NONE:
1448       cpp_ice (pfile, "Unspellable token %s", TOKEN_NAME (token));
1449       break;
1450     }
1451
1452   return buffer;
1453 }
1454
1455 /* Returns a token as a null-terminated string.  The string is
1456    temporary, and automatically freed later.  Useful for diagnostics.

        The storage comes from pfile->ident_pool and is sized by
        cpp_token_len, so that bound must leave room for the terminating
        NUL written here as well as for the spelling itself.  */
1457 unsigned char *
1458 cpp_token_as_text (pfile, token)
1459      cpp_reader *pfile;
1460      const cpp_token *token;
1461 {
1462   unsigned int len = cpp_token_len (token);
1463   unsigned char *start = _cpp_pool_alloc (&pfile->ident_pool, len), *end;
1464
1465   end = cpp_spell_token (pfile, token, start);
1466   end[0] = '\0';
1467
1468   return start;
1469 }
1470
1471 /* Used by C front ends.  Should really move to using cpp_token_as_text.
        Returns the name recorded for TYPE in the token_spellings table;
        TYPE is used as an index without any range check.  */
1472 const char *
1473 cpp_type2name (type)
1474      enum cpp_ttype type;
1475 {
1476   return (const char *) token_spellings[type].name;
1477 }
1478
1479 /* Writes the spelling of token to FP.  Separate from cpp_spell_token
1480    for efficiency - to avoid double-buffering.  Also, outputs a space
1481    if PREV_WHITE is flagged.

        The cases mirror those of cpp_spell_token, except that a token
        with no spelling is silently ignored rather than reported.  */
1482 void
1483 cpp_output_token (token, fp)
1484      const cpp_token *token;
1485      FILE *fp;
1486 {
1487   if (token->flags & PREV_WHITE)
1488     putc (' ', fp);
1489
1490   switch (TOKEN_SPELL (token))
1491     {
1492     case SPELL_OPERATOR:
1493       {
1494         const unsigned char *spelling;
1495
             /* Digraph tokens use their alternative spelling, and named
                operators are written as the identifier they came
                from.  */
1496         if (token->flags & DIGRAPH)
1497           spelling
1498             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1499         else if (token->flags & NAMED_OP)
1500           goto spell_ident;
1501         else
1502           spelling = TOKEN_NAME (token);
1503
1504         ufputs (spelling, fp);
1505       }
1506       break;
1507
1508     spell_ident:
1509     case SPELL_IDENT:
           /* NOTE(review): unlike cpp_spell_token this does not use
              NODE_LEN, so it presumably relies on NODE_NAME being
              NUL-terminated - confirm.  */
1510       ufputs (NODE_NAME (token->val.node), fp);
1511     break;
1512
1513     case SPELL_STRING:
1514       {
             /* LEFT and RIGHT are the delimiters and TAG the optional
                wide prefix; a zero value means nothing is written for
                that position.  */
1515         int left, right, tag;
1516         switch (token->type)
1517           {
1518           case CPP_STRING:      left = '"';  right = '"';  tag = '\0'; break;
1519           case CPP_WSTRING:     left = '"';  right = '"';  tag = 'L';  break;
1520           case CPP_CHAR:        left = '\''; right = '\''; tag = '\0'; break;
1521           case CPP_WCHAR:       left = '\''; right = '\''; tag = 'L';  break;
1522           case CPP_HEADER_NAME: left = '<';  right = '>';  tag = '\0'; break;
1523           default:              left = '\0'; right = '\0'; tag = '\0'; break;
1524           }
1525         if (tag) putc (tag, fp);
1526         if (left) putc (left, fp);
             /* The text is written by length, not as a C string.  */
1527         fwrite (token->val.str.text, 1, token->val.str.len, fp);
1528         if (right) putc (right, fp);
1529       }
1530       break;
1531
1532     case SPELL_CHAR:
1533       putc (token->val.c, fp);
1534       break;
1535
1536     case SPELL_NONE:
1537       /* An error, most probably.  */
1538       break;
1539     }
1540 }
1541
1542 /* Compare two tokens.  Returns nonzero if A and B have the same type
        and flags and, depending on how the type is spelt, the same
        character, macro argument number, identifier node or string
        contents; returns 0 otherwise.  */
1543 int
1544 _cpp_equiv_tokens (a, b)
1545      const cpp_token *a, *b;
1546 {
1547   if (a->type == b->type && a->flags == b->flags)
1548     switch (TOKEN_SPELL (a))
1549       {
1550       default:                  /* Keep compiler happy.  */
1551       case SPELL_OPERATOR:
1552         return 1;
1553       case SPELL_CHAR:
1554         return a->val.c == b->val.c; /* Character.  */
1555       case SPELL_NONE:
             /* Only macro arguments carry a value to compare here.  */
1556         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1557       case SPELL_IDENT:
             /* Identifiers are compared by node pointer.  */
1558         return a->val.node == b->val.node;
1559       case SPELL_STRING:
1560         return (a->val.str.len == b->val.str.len
1561                 && !memcmp (a->val.str.text, b->val.str.text,
1562                             a->val.str.len));
1563       }
1564
1565   return 0;
1566 }
1567
1568 /* Determine whether two tokens can be pasted together, and if so,
1569    what the resulting token is.  Returns CPP_EOF if the tokens cannot
1570    be pasted, or the appropriate type for the merged token if they
1571    can.

        TOKEN1 is the left operand and TOKEN2 the right one.  *DIGRAPH
        is written only when the result is one of the digraph forms or
        CPP_PASTE; on every other path it is left untouched.  */
1572 enum cpp_ttype
1573 cpp_can_paste (pfile, token1, token2, digraph)
1574      cpp_reader * pfile;
1575      const cpp_token *token1, *token2;
1576      int* digraph;
1577 {
1578   enum cpp_ttype a = token1->type, b = token2->type;
1579   int cxx = CPP_OPTION (pfile, cplusplus);
1580
1581   /* Treat named operators as if they were ordinary NAMEs.  */
1582   if (token1->flags & NAMED_OP)
1583     a = CPP_NAME;
1584   if (token2->flags & NAMED_OP)
1585     b = CPP_NAME;
1586
       /* Operators up to CPP_LAST_EQ gain an assignment form by adding a
          fixed offset to their type.  NOTE(review): this depends on the
          ordering of enum cpp_ttype, which is declared elsewhere.  */
1587   if ((int) a <= (int) CPP_LAST_EQ && b == CPP_EQ)
1588     return (enum cpp_ttype) ((int) a + ((int) CPP_EQ_EQ - (int) CPP_EQ));
1589
1590   switch (a)
1591     {
1592     case CPP_GREATER:
1593       if (b == a) return CPP_RSHIFT;
1594       if (b == CPP_QUERY && cxx)        return CPP_MAX;
1595       if (b == CPP_GREATER_EQ)  return CPP_RSHIFT_EQ;
1596       break;
1597     case CPP_LESS:
1598       if (b == a) return CPP_LSHIFT;
1599       if (b == CPP_QUERY && cxx)        return CPP_MIN;
1600       if (b == CPP_LESS_EQ)     return CPP_LSHIFT_EQ;
1601       if (CPP_OPTION (pfile, digraphs))
1602         {
1603           if (b == CPP_COLON)
1604             {*digraph = 1; return CPP_OPEN_SQUARE;} /* <: digraph */
1605           if (b == CPP_MOD)
1606             {*digraph = 1; return CPP_OPEN_BRACE;}      /* <% digraph */
1607         }
1608       break;
1609
1610     case CPP_PLUS: if (b == a)  return CPP_PLUS_PLUS; break;
1611     case CPP_AND:  if (b == a)  return CPP_AND_AND; break;
1612     case CPP_OR:   if (b == a)  return CPP_OR_OR;   break;
1613
1614     case CPP_MINUS:
1615       if (b == a)               return CPP_MINUS_MINUS;
1616       if (b == CPP_GREATER)     return CPP_DEREF;
1617       break;
1618     case CPP_COLON:
1619       if (b == a && cxx)        return CPP_SCOPE;
1620       if (b == CPP_GREATER && CPP_OPTION (pfile, digraphs))
1621         {*digraph = 1; return CPP_CLOSE_SQUARE;} /* :> digraph */
1622       break;
1623
1624     case CPP_MOD:
1625       if (CPP_OPTION (pfile, digraphs))
1626         {
1627           if (b == CPP_GREATER)
1628             {*digraph = 1; return CPP_CLOSE_BRACE;}  /* %> digraph */
1629           if (b == CPP_COLON)
1630             {*digraph = 1; return CPP_HASH;}         /* %: digraph */
1631         }
1632       break;
1633     case CPP_DEREF:
1634       if (b == CPP_MULT && cxx) return CPP_DEREF_STAR;
1635       break;
1636     case CPP_DOT:
1637       if (b == CPP_MULT && cxx) return CPP_DOT_STAR;
1638       if (b == CPP_NUMBER)      return CPP_NUMBER;
1639       break;
1640
1641     case CPP_HASH:
           /* Both halves must be spelt the same way, either both plain
              or both digraphs.  *DIGRAPH receives the masked flag value
              rather than 1.  */
1642       if (b == a && (token1->flags & DIGRAPH) == (token2->flags & DIGRAPH))
1643         /* %:%: digraph */
1644         {*digraph = (token1->flags & DIGRAPH); return CPP_PASTE;}
1645       break;
1646
1647     case CPP_NAME:
1648       if (b == CPP_NAME)        return CPP_NAME;
1649       if (b == CPP_NUMBER
1650           && name_p (pfile, &token2->val.str)) return CPP_NAME;
           /* The identifier L followed by a character or string
              constant forms the wide variant.  */
1651       if (b == CPP_CHAR
1652           && token1->val.node == pfile->spec_nodes.n_L) return CPP_WCHAR;
1653       if (b == CPP_STRING
1654           && token1->val.node == pfile->spec_nodes.n_L) return CPP_WSTRING;
1655       break;
1656
1657     case CPP_NUMBER:
1658       if (b == CPP_NUMBER)      return CPP_NUMBER;
1659       if (b == CPP_NAME)        return CPP_NUMBER;
1660       if (b == CPP_DOT)         return CPP_NUMBER;
1661       /* Numbers cannot have length zero, so this is safe.  */
1662       if ((b == CPP_PLUS || b == CPP_MINUS)
1663           && VALID_SIGN ('+', token1->val.str.text[token1->val.str.len - 1]))
1664         return CPP_NUMBER;
1665       break;
1666
1667     default:
1668       break;
1669     }
1670
1671   return CPP_EOF;
1672 }
1673
1674 /* Returns nonzero if a space should be inserted to avoid an
1675    accidental token paste for output.  For simplicity, it is
1676    conservative, and occasionally advises a space where one is not
1677    needed, e.g. "." and ".2".

        TOKEN1 is the token already output and TOKEN2 the one about to
        follow it.  The decision is based on the type of TOKEN1 and, for
        operators, on the first character of the spelling of TOKEN2.  */
1678
1679 int
1680 cpp_avoid_paste (pfile, token1, token2)
1681      cpp_reader *pfile;
1682      const cpp_token *token1, *token2;
1683 {
1684   enum cpp_ttype a = token1->type, b = token2->type;
1685   cppchar_t c;
1686
       /* Named operators are output as identifiers, so treat them as
          names.  */
1687   if (token1->flags & NAMED_OP)
1688     a = CPP_NAME;
1689   if (token2->flags & NAMED_OP)
1690     b = CPP_NAME;
1691
       /* C is the first character TOKEN2 will be spelt with when it is
          an operator, and EOF otherwise.  */
1692   c = EOF;
1693   if (token2->flags & DIGRAPH)
1694     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1695   else if (token_spellings[b].category == SPELL_OPERATOR)
1696     c = token_spellings[b].name[0];
1697
1698   /* Quickly get everything that can paste with an '='.  */
1699   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1700     return 1;
1701
1702   switch (a)
1703     {
1704     case CPP_GREATER:   return c == '>' || c == '?';
1705     case CPP_LESS:      return c == '<' || c == '?' || c == '%' || c == ':';
1706     case CPP_PLUS:      return c == '+';
1707     case CPP_MINUS:     return c == '-' || c == '>';
1708     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1709     case CPP_MOD:       return c == ':' || c == '>';
1710     case CPP_AND:       return c == '&';
1711     case CPP_OR:        return c == '|';
1712     case CPP_COLON:     return c == ':' || c == '>';
1713     case CPP_DEREF:     return c == '*';
1714     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1715     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1716     case CPP_NAME:      return ((b == CPP_NUMBER
1717                                  && name_p (pfile, &token2->val.str))
1718                                 || b == CPP_NAME
1719                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1720     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1721                                 || c == '.' || c == '+' || c == '-');
1722     case CPP_OTHER:     return (CPP_OPTION (pfile, objc)
1723                                 && token1->val.c == '@'
1724                                 && (b == CPP_NAME || b == CPP_STRING));
1725     default:            break;
1726     }
1727
1728   return 0;
1729 }
1730
1731 /* Output all the remaining tokens on the current line, and a newline
1732    character, to FP.  Leading whitespace is removed.

        Tokens are fetched with cpp_get_token until one of type CPP_EOF
        is returned.  Only the first token has its PREV_WHITE flag
        cleared; later tokens keep their leading space.  */
1733 void
1734 cpp_output_line (pfile, fp)
1735      cpp_reader *pfile;
1736      FILE *fp;
1737 {
1738   cpp_token token;
1739
1740   cpp_get_token (pfile, &token);
1741   token.flags &= ~PREV_WHITE;
1742   while (token.type != CPP_EOF)
1743     {
1744       cpp_output_token (&token, fp);
1745       cpp_get_token (pfile, &token);
1746     }
1747
1748   putc ('\n', fp);
1749 }
1750
1751 /* Returns the value of a hexadecimal digit.  C must be one of 0-9,
        a-f or A-F; any other value is treated as a caller bug and
        aborts the program.  */
1752 static unsigned int
1753 hex_digit_value (c)
1754      unsigned int c;
1755 {
1756   if (c >= 'a' && c <= 'f')
1757     return c - 'a' + 10;
1758   if (c >= 'A' && c <= 'F')
1759     return c - 'A' + 10;
1760   if (c >= '0' && c <= '9')
1761     return c - '0';
1762   abort ();
1763 }
1764
1765 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence.  Returns 1 to indicate
1766    failure if cpplib is not parsing C++ or C99.  Such failure is
1767    silent, and no variables are updated.  Otherwise returns 0, and
1768    warns if -Wtraditional.
1769
1770    [lex.charset]: The character designated by the universal character
1771    name \UNNNNNNNN is that character whose character short name in
1772    ISO/IEC 10646 is NNNNNNNN; the character designated by the
1773    universal character name \uNNNN is that character whose character
1774    short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
1775    for a universal character name is less than 0x20 or in the range
1776    0x7F-0x9F (inclusive), or if the universal character name
1777    designates a character in the basic source character set, then the
1778    program is ill-formed.
1779
1780    We assume that wchar_t is Unicode, so we don't need to do any
1781    mapping.  Is this ever wrong?
1782
1783    PC points to the 'u' or 'U', PSTR points to the byte after PC,
1784    LIMIT is the end of the string or charconst.  PSTR is updated to
1785    point after the UCS on return, and the UCS is written into PC.

        When 0 is returned, *PSTR and *PC are updated even if a
        diagnostic was issued; *PC then holds whatever value had been
        accumulated when parsing stopped.  */
1786
1787 static int
1788 maybe_read_ucs (pfile, pstr, limit, pc)
1789      cpp_reader *pfile;
1790      const unsigned char **pstr;
1791      const unsigned char *limit;
1792      unsigned int *pc;
1793 {
1794   const unsigned char *p = *pstr;
1795   unsigned int code = 0;
1796   unsigned int c = *pc, length;
1797
1798   /* Only attempt to interpret a UCS for C++ and C99.  */
1799   if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1800     return 1;
1801
1802   if (CPP_WTRADITIONAL (pfile))
1803     cpp_warning (pfile, "the meaning of '\\%c' varies with -traditional", c);
1804
       /* Number of hex digits expected: 4 after u, 8 after U.  */
1805   length = (c == 'u' ? 4: 8);
1806
1807   if ((size_t) (limit - p) < length)
1808     {
1809       cpp_error (pfile, "incomplete universal-character-name");
1810       /* Skip to the end to avoid more diagnostics.  */
1811       p = limit;
1812     }
1813   else
1814     {
           /* LENGTH counts down to zero only if every digit is valid;
              the range check below relies on that.  */
1815       for (; length; length--, p++)
1816         {
1817           c = *p;
1818           if (ISXDIGIT (c))
1819             code = (code << 4) + hex_digit_value (c);
1820           else
1821             {
1822               cpp_error (pfile,
1823                          "non-hex digit '%c' in universal-character-name", c);
1824               /* We shouldn't skip in case there are multibyte chars.  */
1825               break;
1826             }
1827         }
1828     }
1829
1830 #ifdef TARGET_EBCDIC
1831   cpp_error (pfile, "universal-character-name on EBCDIC target");
1832   code = 0x3f;  /* EBCDIC invalid character */
1833 #else
1834  /* True extended characters are OK.  */
1835   if (code >= 0xa0
1836       && !(code & 0x80000000)
1837       && !(code >= 0xD800 && code <= 0xDFFF))
1838     ;
1839   /* The standard permits $, @ and ` to be specified as UCNs.  We use
1840      hex escapes so that this also works with EBCDIC hosts.  */
1841   else if (code == 0x24 || code == 0x40 || code == 0x60)
1842     ;
1843   /* Don't give another error if one occurred above.  */
1844   else if (length == 0)
1845     cpp_error (pfile, "universal-character-name out of range");
1846 #endif
1847
1848   *pstr = p;
1849   *pc = code;
1850   return 0;
1851 }
1852
1853 /* Interpret an escape sequence, and return its value.  PSTR points to
1854    the input pointer, which is just after the backslash.  LIMIT is how
1855    much text we have.  MASK is a bitmask for the precision for the
1856    destination type (char or wchar_t).  TRADITIONAL, if true, does not
1857    interpret escapes that did not exist in traditional C.
1858
1859    Handles all relevant diagnostics.  */
1860
1861 unsigned int
1862 cpp_parse_escape (pfile, pstr, limit, mask, traditional)
1863      cpp_reader *pfile;
1864      const unsigned char **pstr;
1865      const unsigned char *limit;
1866      unsigned HOST_WIDE_INT mask;
1867      int traditional;
1868 {
1869   int unknown = 0;
1870   const unsigned char *str = *pstr;
1871   unsigned int c = *str++;
1872
1873   switch (c)
1874     {
1875     case '\\': case '\'': case '"': case '?': break;
1876     case 'b': c = TARGET_BS;      break;
1877     case 'f': c = TARGET_FF;      break;
1878     case 'n': c = TARGET_NEWLINE; break;
1879     case 'r': c = TARGET_CR;      break;
1880     case 't': c = TARGET_TAB;     break;
1881     case 'v': c = TARGET_VT;      break;
1882
1883     case '(': case '{': case '[': case '%':
1884       /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1885          '\%' is used to prevent SCCS from getting confused.  */
1886       unknown = CPP_PEDANTIC (pfile);
1887       break;
1888
1889     case 'a':
1890       if (CPP_WTRADITIONAL (pfile))
1891         cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
1892       if (!traditional)
1893         c = TARGET_BELL;
1894       break;
1895
1896     case 'e': case 'E':
1897       if (CPP_PEDANTIC (pfile))
1898         cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1899       c = TARGET_ESC;
1900       break;
1901
1902     case 'u': case 'U':
1903       unknown = maybe_read_ucs (pfile, &str, limit, &c);
1904       break;
1905
1906     case 'x':
1907       if (CPP_WTRADITIONAL (pfile))
1908         cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
1909
1910       if (!traditional)
1911         {
1912           unsigned int i = 0, overflow = 0;
1913           int digits_found = 0;
1914
1915           while (str < limit)
1916             {
1917               c = *str;
1918               if (! ISXDIGIT (c))
1919                 break;
1920               str++;
1921               overflow |= i ^ (i << 4 >> 4);
1922               i = (i << 4) + hex_digit_value (c);
1923               digits_found = 1;
1924             }
1925
1926           if (!digits_found)
1927             cpp_error (pfile, "\\x used with no following hex digits");
1928
1929           if (overflow | (i != (i & mask)))
1930             {
1931               cpp_pedwarn (pfile, "hex escape sequence out of range");
1932               i &= mask;
1933             }
1934           c = i;
1935         }
1936       break;
1937
1938     case '0':  case '1':  case '2':  case '3':
1939     case '4':  case '5':  case '6':  case '7':
1940       {
1941         unsigned int i = c - '0';
1942         int count = 0;
1943
1944         while (str < limit && ++count < 3)
1945           {
1946             c = *str;
1947             if (c < '0' || c > '7')
1948               break;
1949             str++;
1950             i = (i << 3) + c - '0';
1951           }
1952
1953         if (i != (i & mask))
1954           {
1955             cpp_pedwarn (pfile, "octal escape sequence out of range");
1956             i &= mask;
1957           }
1958         c = i;
1959       }
1960       break;
1961
1962     default:
1963       unknown = 1;
1964       break;
1965     }
1966
1967   if (unknown)
1968     {
1969       if (ISGRAPH (c))
1970         cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1971       else
1972         cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1973     }
1974
1975   if (c > mask)
1976     cpp_pedwarn (pfile, "escape sequence out of range for character");
1977
1978   *pstr = str;
1979   return c;
1980 }
1981
/* Bit widths used when interpreting narrow and wide character
   constants.  Each falls back to the corresponding *_TYPE_SIZE macro
   when no MAX_ variant has been defined already.  */
#if !defined (MAX_CHAR_TYPE_SIZE)
# define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
#endif

#if !defined (MAX_WCHAR_TYPE_SIZE)
# define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
#endif
1989
1990 /* Interpret a (possibly wide) character constant in TOKEN.
1991    WARN_MULTI warns about multi-character charconsts, if not
1992    TRADITIONAL.  TRADITIONAL also indicates not to interpret escapes
1993    that did not exist in traditional C.  PCHARS_SEEN points to a
1994    variable that is filled in with the number of characters seen.  */
1995 HOST_WIDE_INT
1996 cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
1997      cpp_reader *pfile;
1998      const cpp_token *token;
1999      int warn_multi;
2000      int traditional;
2001      unsigned int *pchars_seen;
2002 {
2003   const unsigned char *str = token->val.str.text;
2004   const unsigned char *limit = str + token->val.str.len;
2005   unsigned int chars_seen = 0;
2006   unsigned int width, max_chars, c;
2007   unsigned HOST_WIDE_INT mask;
2008   HOST_WIDE_INT result = 0;
2009
2010 #ifdef MULTIBYTE_CHARS
2011   (void) local_mbtowc (NULL, NULL, 0);
2012 #endif
2013
2014   /* Width in bits.  */
2015   if (token->type == CPP_CHAR)
2016     width = MAX_CHAR_TYPE_SIZE;
2017   else
2018     width = MAX_WCHAR_TYPE_SIZE;
2019
2020   if (width < HOST_BITS_PER_WIDE_INT)
2021     mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
2022   else
2023     mask = ~0;
2024   max_chars = HOST_BITS_PER_WIDE_INT / width;
2025
2026   while (str < limit)
2027     {
2028 #ifdef MULTIBYTE_CHARS
2029       wchar_t wc;
2030       int char_len;
2031
2032       char_len = local_mbtowc (&wc, str, limit - str);
2033       if (char_len == -1)
2034         {
2035           cpp_warning (pfile, "ignoring invalid multibyte character");
2036           c = *str++;
2037         }
2038       else
2039         {
2040           str += char_len;
2041           c = wc;
2042         }
2043 #else
2044       c = *str++;
2045 #endif
2046
2047       if (c == '\\')
2048         c = cpp_parse_escape (pfile, &str, limit, mask, traditional);
2049
2050 #ifdef MAP_CHARACTER
2051       if (ISPRINT (c))
2052         c = MAP_CHARACTER (c);
2053 #endif
2054
2055       /* Merge character into result; ignore excess chars.  */
2056       if (++chars_seen <= max_chars)
2057         {
2058           if (width < HOST_BITS_PER_WIDE_INT)
2059             result = (result << width) | (c & mask);
2060           else
2061             result = c;
2062         }
2063     }
2064
2065   if (chars_seen == 0)
2066     cpp_error (pfile, "empty character constant");
2067   else if (chars_seen > max_chars)
2068     {
2069       chars_seen = max_chars;
2070       cpp_warning (pfile, "character constant too long");
2071     }
2072   else if (chars_seen > 1 && !traditional && warn_multi)
2073     cpp_warning (pfile, "multi-character character constant");
2074
2075   /* If char type is signed, sign-extend the constant.  The
2076      __CHAR_UNSIGNED__ macro is set by the driver if appropriate.  */
2077   if (token->type == CPP_CHAR && chars_seen)
2078     {
2079       unsigned int nbits = chars_seen * width;
2080       unsigned int mask = (unsigned int) ~0 >> (HOST_BITS_PER_INT - nbits);
2081
2082       if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
2083           || ((result >> (nbits - 1)) & 1) == 0)
2084         result &= mask;
2085       else
2086         result |= ~mask;
2087     }
2088
2089   *pchars_seen = chars_seen;
2090   return result;
2091 }
2092
/* Memory pools.  */

/* The members whose alignment the default pool alignment is taken
   from.  */
union dummy_align
{
  double d;
  int *p;
};

/* A single char followed by the union above: the offset of U within
   this structure is used as the default alignment.  */
struct dummy
{
  char c;
  union dummy_align u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
2106
2107 static int
2108 chunk_suitable (pool, chunk, size)
2109      cpp_pool *pool;
2110      cpp_chunk *chunk;
2111      unsigned int size;
2112 {
2113   /* Being at least twice SIZE means we can use memcpy in
2114      _cpp_next_chunk rather than memmove.  Besides, it's a good idea
2115      anyway.  */
2116   return (chunk && pool->locked != chunk
2117           && (unsigned int) (chunk->limit - chunk->base) >= size * 2);
2118 }
2119
2120 /* Returns the end of the new pool.  PTR points to a char in the old
2121    pool, and is updated to point to the same char in the new pool.  */
2122 unsigned char *
2123 _cpp_next_chunk (pool, len, ptr)
2124      cpp_pool *pool;
2125      unsigned int len;
2126      unsigned char **ptr;
2127 {
2128   cpp_chunk *chunk = pool->cur->next;
2129
2130   /* LEN is the minimum size we want in the new pool.  */
2131   len += POOL_ROOM (pool);
2132   if (! chunk_suitable (pool, chunk, len))
2133     {
2134       chunk = new_chunk (POOL_SIZE (pool) * 2 + len);
2135
2136       chunk->next = pool->cur->next;
2137       pool->cur->next = chunk;
2138     }
2139
2140   /* Update the pointer before changing chunk's front.  */
2141   if (ptr)
2142     *ptr += chunk->base - POOL_FRONT (pool);
2143
2144   memcpy (chunk->base, POOL_FRONT (pool), POOL_ROOM (pool));
2145   chunk->front = chunk->base;
2146
2147   pool->cur = chunk;
2148   return POOL_LIMIT (pool);
2149 }
2150
2151 static cpp_chunk *
2152 new_chunk (size)
2153      unsigned int size;
2154 {
2155   unsigned char *base;
2156   cpp_chunk *result;
2157
2158   size = POOL_ALIGN (size, DEFAULT_ALIGNMENT);
2159   base = (unsigned char *) xmalloc (size + sizeof (cpp_chunk));
2160   /* Put the chunk descriptor at the end.  Then chunk overruns will
2161      cause obvious chaos.  */
2162   result = (cpp_chunk *) (base + size);
2163   result->base = base;
2164   result->front = base;
2165   result->limit = base + size;
2166   result->next = 0;
2167
2168   return result;
2169 }
2170
2171 void
2172 _cpp_init_pool (pool, size, align, temp)
2173      cpp_pool *pool;
2174      unsigned int size, align, temp;
2175 {
2176   if (align == 0)
2177     align = DEFAULT_ALIGNMENT;
2178   if (align & (align - 1))
2179     abort ();
2180   pool->align = align;
2181   pool->first = new_chunk (size);
2182   pool->cur = pool->first;
2183   pool->locked = 0;
2184   pool->locks = 0;
2185   if (temp)
2186     pool->cur->next = pool->cur;
2187 }
2188
2189 void
2190 _cpp_lock_pool (pool)
2191      cpp_pool *pool;
2192 {
2193   if (pool->locks++ == 0)
2194     pool->locked = pool->cur;
2195 }
2196
2197 void
2198 _cpp_unlock_pool (pool)
2199      cpp_pool *pool;
2200 {
2201   if (--pool->locks == 0)
2202     pool->locked = 0;
2203 }
2204
2205 void
2206 _cpp_free_pool (pool)
2207      cpp_pool *pool;
2208 {
2209   cpp_chunk *chunk = pool->first, *next;
2210
2211   do
2212     {
2213       next = chunk->next;
2214       free (chunk->base);
2215       chunk = next;
2216     }
2217   while (chunk && chunk != pool->first);
2218 }
2219
2220 /* Reserve LEN bytes from a memory pool.  */
2221 unsigned char *
2222 _cpp_pool_reserve (pool, len)
2223      cpp_pool *pool;
2224      unsigned int len;
2225 {
2226   len = POOL_ALIGN (len, pool->align);
2227   if (len > (unsigned int) POOL_ROOM (pool))
2228     _cpp_next_chunk (pool, len, 0);
2229
2230   return POOL_FRONT (pool);
2231 }
2232
2233 /* Allocate LEN bytes from a memory pool.  */
2234 unsigned char *
2235 _cpp_pool_alloc (pool, len)
2236      cpp_pool *pool;
2237      unsigned int len;
2238 {
2239   unsigned char *result = _cpp_pool_reserve (pool, len);
2240
2241   POOL_COMMIT (pool, len);
2242   return result;
2243 }