libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; if not, write to the Free Software
  20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
   26
      /* Spelling categories recorded for each token type in the
         token_spellings table.  OP entries of TTYPE_TABLE are given
         SPELL_OPERATOR; TK entries name their category explicitly, and
         that name is pasted onto the SPELL_ prefix.  */
   27 enum spell_type
   28 {
   29   SPELL_OPERATOR = 0,
   30   SPELL_IDENT,
   31   SPELL_LITERAL,
   32   SPELL_NONE
   33 };
   34
      /* One row of the token_spellings table: the spelling category of
         a token type, and the text stored for it (the spelling string
         for OP entries, the stringified enumerator for TK entries).  */
   35 struct token_spelling
   36 {
   37   enum spell_type category;
   38   const unsigned char *name;
   39 };
   40
      /* Spellings of the six digraphs.  The array is not referenced in
         the part of the file visible here.  */
   41 static const unsigned char *const digraph_spellings[] =
   42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
   43
      /* Build token_spellings by expanding TTYPE_TABLE with temporary
         definitions of OP and TK: OP rows store the spelling S under
         SPELL_OPERATOR, TK rows store the enumerator name E as text
         under the category SPELL_<S>.  Both helpers are undefined again
         once the table has been generated.  */
   44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
   45 #define TK(e, s) { SPELL_ ## s,    UC #e },
   46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
   47 #undef OP
   48 #undef TK
   49
      /* Look up the spelling category / stored name for TOKEN, indexed
         by its type.  */
   50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
   51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
   52
      /* Forward declarations of helpers local to this file.  */
   53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
   54 static int skip_line_comment (cpp_reader *);
   55 static void skip_whitespace (cpp_reader *, cppchar_t);
   56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
   57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
   58 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
   59                             unsigned int, enum cpp_ttype);
   60 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
   61 static int name_p (cpp_reader *, const cpp_string *);
   62 static tokenrun *next_tokenrun (tokenrun *);
   63
      /* NOTE(review): the definition of new_buff is not in the part of
         the file visible here -- presumably it appears further down.  */
   64 static _cpp_buff *new_buff (size_t);
  65
  66
  67 /* Utility routine:
  68
  69    Compares, the token TOKEN to the NUL-terminated string STRING.
  70    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  71 int
  72 cpp_ideq (const cpp_token *token, const char *string)
  73 {
  74   if (token->type != CPP_NAME)
  75     return 0;
  76
  77   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
  78 }
  79
  80 /* Record a note TYPE at byte POS into the current cleaned logical
  81    line.  */
  82 static void
  83 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  84 {
  85   if (buffer->notes_used == buffer->notes_cap)
  86     {
  87       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  88       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  89                                   buffer->notes_cap);
  90     }
  91
  92   buffer->notes[buffer->notes_used].pos = pos;
  93   buffer->notes[buffer->notes_used].type = type;
  94   buffer->notes_used++;
  95 }
  96
   97 /* Returns with a logical line that contains no escaped newlines or
   98    trigraphs.  This is a time-critical inner loop.

         The line that starts at buffer->next_line is cleaned in place.
         buffer->cur and buffer->line_base are set to its start and the
         note list is reset.  Each escaped newline is spliced out and
         recorded with a '\\' note, or a ' ' note when whitespace
         separated the backslash from the newline.  Each trigraph is
         recorded with a note whose type is the trigraph's third
         character; the trigraph itself is only replaced when the
         trigraphs option is enabled.  Buffers marked from_stage3 are
         merely scanned for the end of the line.

         On exit the cleaned line ends in '\n', a sentinel note is
         queued just past that newline, and buffer->next_line points one
         past the last input character consumed.  */
   99 void
  100 _cpp_clean_line (cpp_reader *pfile)
  101 {
  102   cpp_buffer *buffer;
  103   const uchar *s;
  104   uchar c, *d, *p;
  105
  106   buffer = pfile->buffer;
  107   buffer->cur_note = buffer->notes_used = 0;
  108   buffer->cur = buffer->line_base = buffer->next_line;
  109   buffer->need_line = false;
        /* S (read position) starts one before the line because every
           loop below advances it before reading.  */
  110   s = buffer->next_line - 1;
  111
  112   if (!buffer->from_stage3)
  113     {
            /* Most recent backslash seen by the fast loop, if any.  */
  114       const uchar *pbackslash = NULL;
  115
  116       /* Short circuit for the common case of an un-escaped line with
  117          no trigraphs.  The primary win here is by not writing any
  118          data back to memory until we have to.  */
  119       for (;;)
  120         {
  121           c = *++s;
  122           if (__builtin_expect (c == '\n', false)
  123               || __builtin_expect (c == '\r', false))
  124             {
                    /* D marks where the terminating '\n' will be stored.  */
  125               d = (uchar *) s;
  126
                    /* NOTE(review): presumably the byte at rlimit is a
                       terminator supplied by whoever filled the buffer, so
                       nothing beyond it may be examined -- confirm.  */
  127               if (__builtin_expect (s == buffer->rlimit, false))
  128                 goto done;
  129
  130               /* DOS line ending? */
  131               if (__builtin_expect (c == '\r', false)
  132                   && s[1] == '\n')
  133                 {
  134                   s++;
  135                   if (s == buffer->rlimit)
  136                     goto done;
  137                 }
  138
                    /* No backslash on this line means no escaped newline.  */
  139               if (__builtin_expect (pbackslash == NULL, true))
  140                 goto done;
  141
  142               /* Check for escaped newline.  */
  143               p = d;
  144               while (is_nvspace (p[-1]))
  145                 p--;
  146               if (p - 1 != pbackslash)
  147                 goto done;
  148
  149               /* Have an escaped newline; process it and proceed to
  150                  the slow path.  */
  151               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
                    /* The slow loop pre-increments D, so leaving D one
                       before the backslash makes its first store overwrite
                       the backslash.  next_line temporarily marks where the
                       continuation text begins; the slow loop uses it as
                       the lower bound of its backward scan, and it is reset
                       at `done'.  */
  152               d = p - 2;
  153               buffer->next_line = p - 1;
  154               break;
  155             }
  156           if (__builtin_expect (c == '\\', false))
  157             pbackslash = s;
  158           else if (__builtin_expect (c == '?', false)
  159                    && __builtin_expect (s[1] == '?', false)
  160                    && _cpp_trigraph_map[s[2]])
  161             {
  162               /* Have a trigraph.  We may or may not have to convert
  163                  it.  Add a line note regardless, for -Wtrigraphs.  */
  164               add_line_note (buffer, s, s[2]);
  165               if (CPP_OPTION (pfile, trigraphs))
  166                 {
  167                   /* We do, and that means we have to switch to the
  168                      slow path.  */
  169                   d = (uchar *) s;
  170                   *d = _cpp_trigraph_map[s[2]];
  171                   s += 2;
  172                   break;
  173                 }
  174             }
  175         }
  176
  177
            /* Slow path: copy each character from S down to D, splicing
               out escaped newlines and replacing trigraphs on the way.  */
  178       for (;;)
  179         {
  180           c = *++s;
  181           *++d = c;
  182
  183           if (c == '\n' || c == '\r')
  184             {
  185                   /* Handle DOS line endings.  */
  186               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
  187                 s++;
  188               if (s == buffer->rlimit)
  189                 break;
  190
  191               /* Escaped?  */
  192               p = d;
  193               while (p != buffer->next_line && is_nvspace (p[-1]))
  194                 p--;
  195               if (p == buffer->next_line || p[-1] != '\\')
  196                 break;
  197
                    /* Same bookkeeping as in the fast loop: note the
                       splice, then rewind D so the next store overwrites
                       the backslash.  */
  198               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
  199               d = p - 2;
  200               buffer->next_line = p - 1;
  201             }
  202           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
  203             {
  204               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
  205               add_line_note (buffer, d, s[2]);
  206               if (CPP_OPTION (pfile, trigraphs))
  207                 {
                        /* The '?' already copied to *D is replaced, and the
                           other two trigraph characters are skipped.  */
  208                   *d = _cpp_trigraph_map[s[2]];
  209                   s += 2;
  210                 }
  211             }
  212         }
  213     }
  214   else
  215     {
            /* from_stage3 buffers: only find the end of the line.  */
  216       do
  217         s++;
  218       while (*s != '\n' && *s != '\r');
  219       d = (uchar *) s;
  220
  221       /* Handle DOS line endings.  */
  222       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
  223         s++;
  224     }
  225
  226  done:
        /* Terminate the cleaned line; this also turns a '\r' into '\n'.  */
  227   *d = '\n';
  228   /* A sentinel note that should never be processed.  */
  229   add_line_note (buffer, d + 1, '\n');
  230   buffer->next_line = s + 1;
  231 }
 232
 233 /* Return true if the trigraph indicated by NOTE should be warned
 234    about in a comment.  */
 235 static bool
 236 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 237 {
 238   const uchar *p;
 239
 240   /* Within comments we don't warn about trigraphs, unless the
 241      trigraph forms an escaped newline, as that may change
 242      behavior.  */
 243   if (note->type != '/')
 244     return false;
 245
 246   /* If -trigraphs, then this was an escaped newline iff the next note
 247      is coincident.  */
 248   if (CPP_OPTION (pfile, trigraphs))
 249     return note[1].pos == note->pos;
 250
 251   /* Otherwise, see if this forms an escaped newline.  */
 252   p = note->pos + 3;
 253   while (is_nvspace (*p))
 254     p++;
 255
 256   /* There might have been escaped newlines between the trigraph and the
 257      newline we found.  Hence the position test.  */
 258   return (*p == '\n' && p < note[1].pos);
 259 }
 260
 261 /* Process the notes created by add_line_note as far as the current
 262    location.  */
 263 void
 264 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 265 {
 266   cpp_buffer *buffer = pfile->buffer;
 267
 268   for (;;)
 269     {
 270       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 271       unsigned int col;
 272
 273       if (note->pos > buffer->cur)
 274         break;
 275
 276       buffer->cur_note++;
 277       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 278
 279       if (note->type == '\\' || note->type == ' ')
 280         {
 281           if (note->type == ' ' && !in_comment)
 282             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 283                                  "backslash and newline separated by space");
 284
 285           if (buffer->next_line > buffer->rlimit)
 286             {
 287               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 288                                    "backslash-newline at end of file");
 289               /* Prevent "no newline at end of file" warning.  */
 290               buffer->next_line = buffer->rlimit;
 291             }
 292
 293           buffer->line_base = note->pos;
 294           CPP_INCREMENT_LINE (pfile, 0);
 295         }
 296       else if (_cpp_trigraph_map[note->type])
 297         {
 298           if (CPP_OPTION (pfile, warn_trigraphs)
 299               && (!in_comment || warn_in_comment (pfile, note)))
 300             {
 301               if (CPP_OPTION (pfile, trigraphs))
 302                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 303                                      "trigraph ??%c converted to %c",
 304                                      note->type,
 305                                      (int) _cpp_trigraph_map[note->type]);
 306               else
 307                 {
 308                   cpp_error_with_line
 309                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 310                      "trigraph ??%c ignored, use -trigraphs to enable",
 311                      note->type);
 312                 }
 313             }
 314         }
 315       else
 316         abort ();
 317     }
 318 }
 319
 320 /* Skip a C-style block comment.  We find the end of the comment by
 321    seeing if an asterisk is before every '/' we encounter.  Returns
 322    nonzero if comment terminated by EOF, zero otherwise.
 323
 324    Buffer->cur points to the initial asterisk of the comment.  */
 325 bool
 326 _cpp_skip_block_comment (cpp_reader *pfile)
 327 {
 328   cpp_buffer *buffer = pfile->buffer;
 329   const uchar *cur = buffer->cur;
 330   uchar c;
 331
 332   cur++;
 333   if (*cur == '/')
 334     cur++;
 335
 336   for (;;)
 337     {
 338       /* People like decorating comments with '*', so check for '/'
 339          instead for efficiency.  */
 340       c = *cur++;
 341
 342       if (c == '/')
 343         {
 344           if (cur[-2] == '*')
 345             break;
 346
 347           /* Warn about potential nested comments, but not if the '/'
 348              comes immediately before the true comment delimiter.
 349              Don't bother to get it right across escaped newlines.  */
 350           if (CPP_OPTION (pfile, warn_comments)
 351               && cur[0] == '*' && cur[1] != '/')
 352             {
 353               buffer->cur = cur;
 354               cpp_error_with_line (pfile, CPP_DL_WARNING,
 355                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 356                                    "\"/*\" within comment");
 357             }
 358         }
 359       else if (c == '\n')
 360         {
 361           unsigned int cols;
 362           buffer->cur = cur - 1;
 363           _cpp_process_line_notes (pfile, true);
 364           if (buffer->next_line >= buffer->rlimit)
 365             return true;
 366           _cpp_clean_line (pfile);
 367
 368           cols = buffer->next_line - buffer->line_base;
 369           CPP_INCREMENT_LINE (pfile, cols);
 370
 371           cur = buffer->cur;
 372         }
 373     }
 374
 375   buffer->cur = cur;
 376   _cpp_process_line_notes (pfile, true);
 377   return false;
 378 }
 379
 380 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 381    terminating newline.  Handles escaped newlines.  Returns nonzero
 382    if a multiline comment.  */
 383 static int
 384 skip_line_comment (cpp_reader *pfile)
 385 {
 386   cpp_buffer *buffer = pfile->buffer;
 387   source_location orig_line = pfile->line_table->highest_line;
 388
 389   while (*buffer->cur != '\n')
 390     buffer->cur++;
 391
 392   _cpp_process_line_notes (pfile, true);
 393   return orig_line != pfile->line_table->highest_line;
 394 }
 395
 396 /* Skips whitespace, saving the next non-whitespace character.  */
 397 static void
 398 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 399 {
 400   cpp_buffer *buffer = pfile->buffer;
 401   bool saw_NUL = false;
 402
 403   do
 404     {
 405       /* Horizontal space always OK.  */
 406       if (c == ' ' || c == '\t')
 407         ;
 408       /* Just \f \v or \0 left.  */
 409       else if (c == '\0')
 410         saw_NUL = true;
 411       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 412         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 413                              CPP_BUF_COL (buffer),
 414                              "%s in preprocessing directive",
 415                              c == '\f' ? "form feed" : "vertical tab");
 416
 417       c = *buffer->cur++;
 418     }
 419   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 420   while (is_nvspace (c));
 421
 422   if (saw_NUL)
 423     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 424
 425   buffer->cur--;
 426 }
 427
 428 /* See if the characters of a number token are valid in a name (no
 429    '.', '+' or '-').  */
 430 static int
 431 name_p (cpp_reader *pfile, const cpp_string *string)
 432 {
 433   unsigned int i;
 434
 435   for (i = 0; i < string->len; i++)
 436     if (!is_idchar (string->text[i]))
 437       return 0;
 438
 439   return 1;
 440 }
 441
 442 /* After parsing an identifier or other sequence, produce a warning about
 443    sequences not in NFC/NFKC.  */
 444 static void
 445 warn_about_normalization (cpp_reader *pfile,
 446                           const cpp_token *token,
 447                           const struct normalize_state *s)
 448 {
 449   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 450       && !pfile->state.skipping)
 451     {
 452       /* Make sure that the token is printed using UCNs, even
 453          if we'd otherwise happily print UTF-8.  */
 454       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 455       size_t sz;
 456
 457       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 458       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 459         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 460                              "`%.*s' is not in NFKC", (int) sz, buf);
 461       else
 462         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 463                              "`%.*s' is not in NFC", (int) sz, buf);
 464     }
 465 }
 466
 467 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
 468    an identifier.  FIRST is TRUE if this starts an identifier.  */
 469 static bool
 470 forms_identifier_p (cpp_reader *pfile, int first,
 471                     struct normalize_state *state)
 472 {
 473   cpp_buffer *buffer = pfile->buffer;
 474
 475   if (*buffer->cur == '$')
 476     {
 477       if (!CPP_OPTION (pfile, dollars_in_ident))
 478         return false;
 479
 480       buffer->cur++;
 481       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 482         {
 483           CPP_OPTION (pfile, warn_dollars) = 0;
 484           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 485         }
 486
 487       return true;
 488     }
 489
 490   /* Is this a syntactically valid UCN?  */
 491   if (CPP_OPTION (pfile, extended_identifiers)
 492       && *buffer->cur == '\\'
 493       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 494     {
 495       buffer->cur += 2;
 496       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 497                           state))
 498         return true;
 499       buffer->cur -= 2;
 500     }
 501
 502   return false;
 503 }
 504
 505 /* Lex an identifier starting at BUFFER->CUR - 1.  */
 506 static cpp_hashnode *
 507 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 508                 struct normalize_state *nst)
 509 {
 510   cpp_hashnode *result;
 511   const uchar *cur;
 512   unsigned int len;
 513   unsigned int hash = HT_HASHSTEP (0, *base);
 514
 515   cur = pfile->buffer->cur;
 516   if (! starts_ucn)
 517     while (ISIDNUM (*cur))
 518       {
 519         hash = HT_HASHSTEP (hash, *cur);
 520         cur++;
 521       }
 522   pfile->buffer->cur = cur;
 523   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 524     {
 525       /* Slower version for identifiers containing UCNs (or $).  */
 526       do {
 527         while (ISIDNUM (*pfile->buffer->cur))
 528           {
 529             pfile->buffer->cur++;
 530             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 531           }
 532       } while (forms_identifier_p (pfile, false, nst));
 533       result = _cpp_interpret_identifier (pfile, base,
 534                                           pfile->buffer->cur - base);
 535     }
 536   else
 537     {
 538       len = cur - base;
 539       hash = HT_HASHFINISH (hash, len);
 540
 541       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
 542                                                   base, len, hash, HT_ALLOC));
 543     }
 544
 545   /* Rarely, identifiers require diagnostics when lexed.  */
 546   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 547                         && !pfile->state.skipping, 0))
 548     {
 549       /* It is allowed to poison the same identifier twice.  */
 550       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 551         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 552                    NODE_NAME (result));
 553
 554       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 555          replacement list of a variadic macro.  */
 556       if (result == pfile->spec_nodes.n__VA_ARGS__
 557           && !pfile->state.va_args_ok)
 558         cpp_error (pfile, CPP_DL_PEDWARN,
 559                    "__VA_ARGS__ can only appear in the expansion"
 560                    " of a C99 variadic macro");
 561     }
 562
 563   return result;
 564 }
 565
 566 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 567 static void
 568 lex_number (cpp_reader *pfile, cpp_string *number,
 569             struct normalize_state *nst)
 570 {
 571   const uchar *cur;
 572   const uchar *base;
 573   uchar *dest;
 574
 575   base = pfile->buffer->cur - 1;
 576   do
 577     {
 578       cur = pfile->buffer->cur;
 579
 580       /* N.B. ISIDNUM does not include $.  */
 581       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 582         {
 583           cur++;
 584           NORMALIZE_STATE_UPDATE_IDNUM (nst);
 585         }
 586
 587       pfile->buffer->cur = cur;
 588     }
 589   while (forms_identifier_p (pfile, false, nst));
 590
 591   number->len = cur - base;
 592   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 593   memcpy (dest, base, number->len);
 594   dest[number->len] = '\0';
 595   number->text = dest;
 596 }
 597
 598 /* Create a token of type TYPE with a literal spelling.  */
 599 static void
 600 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 601                 unsigned int len, enum cpp_ttype type)
 602 {
 603   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
 604
 605   memcpy (dest, base, len);
 606   dest[len] = '\0';
 607   token->type = type;
 608   token->val.str.len = len;
 609   token->val.str.text = dest;
 610 }
 611
 612 /* Lexes a string, character constant, or angle-bracketed header file
 613    name.  The stored string contains the spelling, including opening
 614    quote and leading any leading 'L', 'u' or 'U'.  It returns the type
 615    of the literal, or CPP_OTHER if it was not properly terminated.
 616
 617    The spelling is NUL-terminated, but it is not guaranteed that this
 618    is the first NUL since embedded NULs are preserved.  */
 619 static void
 620 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 621 {
 622   bool saw_NUL = false;
 623   const uchar *cur;
 624   cppchar_t terminator;
 625   enum cpp_ttype type;
 626
 627   cur = base;
 628   terminator = *cur++;
 629   if (terminator == 'L' || terminator == 'u' || terminator == 'U')
 630     terminator = *cur++;
 631   if (terminator == '\"')
 632     type = (*base == 'L' ? CPP_WSTRING :
 633             *base == 'U' ? CPP_STRING32 :
 634             *base == 'u' ? CPP_STRING16 : CPP_STRING);
 635   else if (terminator == '\'')
 636     type = (*base == 'L' ? CPP_WCHAR :
 637             *base == 'U' ? CPP_CHAR32 :
 638             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
 639   else
 640     terminator = '>', type = CPP_HEADER_NAME;
 641
 642   for (;;)
 643     {
 644       cppchar_t c = *cur++;
 645
 646       /* In #include-style directives, terminators are not escapable.  */
 647       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 648         cur++;
 649       else if (c == terminator)
 650         break;
 651       else if (c == '\n')
 652         {
 653           cur--;
 654           type = CPP_OTHER;
 655           break;
 656         }
 657       else if (c == '\0')
 658         saw_NUL = true;
 659     }
 660
 661   if (saw_NUL && !pfile->state.skipping)
 662     cpp_error (pfile, CPP_DL_WARNING,
 663                "null character(s) preserved in literal");
 664
 665   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
 666     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
 667                (int) terminator);
 668
 669   pfile->buffer->cur = cur;
 670   create_literal (pfile, token, base, cur - base, type);
 671 }
 672
 673 /* The stored comment includes the comment start and any terminator.  */
 674 static void
 675 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
 676               cppchar_t type)
 677 {
 678   unsigned char *buffer;
 679   unsigned int len, clen;
 680
 681   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
 682
 683   /* C++ comments probably (not definitely) have moved past a new
 684      line, which we don't want to save in the comment.  */
 685   if (is_vspace (pfile->buffer->cur[-1]))
 686     len--;
 687
 688   /* If we are currently in a directive, then we need to store all
 689      C++ comments as C comments internally, and so we need to
 690      allocate a little extra space in that case.
 691
 692      Note that the only time we encounter a directive here is
 693      when we are saving comments in a "#define".  */
 694   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
 695
 696   buffer = _cpp_unaligned_alloc (pfile, clen);
 697
 698   token->type = CPP_COMMENT;
 699   token->val.str.len = clen;
 700   token->val.str.text = buffer;
 701
 702   buffer[0] = '/';
 703   memcpy (buffer + 1, from, len - 1);
 704
 705   /* Finish conversion to a C comment, if necessary.  */
 706   if (pfile->state.in_directive && type == '/')
 707     {
 708       buffer[1] = '*';
 709       buffer[clen - 2] = '*';
 710       buffer[clen - 1] = '/';
 711     }
 712 }
 713
 714 /* Allocate COUNT tokens for RUN.  */
 715 void
 716 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
 717 {
 718   run->base = XNEWVEC (cpp_token, count);
 719   run->limit = run->base + count;
 720   run->next = NULL;
 721 }
 722
 723 /* Returns the next tokenrun, or creates one if there is none.  */
 724 static tokenrun *
 725 next_tokenrun (tokenrun *run)
 726 {
 727   if (run->next == NULL)
 728     {
 729       run->next = XNEW (tokenrun);
 730       run->next->prev = run;
 731       _cpp_init_tokenrun (run->next, 250);
 732     }
 733
 734   return run->next;
 735 }
 736
 737 /* Look ahead in the input stream.  */
 738 const cpp_token *
 739 cpp_peek_token (cpp_reader *pfile, int index)
 740 {
 741   cpp_context *context = pfile->context;
 742   const cpp_token *peektok;
 743   int count;
 744
 745   /* First, scan through any pending cpp_context objects.  */
 746   while (context->prev)
 747     {
 748       ptrdiff_t sz = (context->direct_p
 749                       ? LAST (context).token - FIRST (context).token
 750                       : LAST (context).ptoken - FIRST (context).ptoken);
 751
 752       if (index < (int) sz)
 753         return (context->direct_p
 754                 ? FIRST (context).token + index
 755                 : *(FIRST (context).ptoken + index));
 756
 757       index -= (int) sz;
 758       context = context->prev;
 759     }
 760
 761   /* We will have to read some new tokens after all (and do so
 762      without invalidating preceding tokens).  */
 763   count = index;
 764   pfile->keep_tokens++;
 765
 766   do
 767     {
 768       peektok = _cpp_lex_token (pfile);
 769       if (peektok->type == CPP_EOF)
 770         return peektok;
 771     }
 772   while (index--);
 773
 774   _cpp_backup_tokens_direct (pfile, count + 1);
 775   pfile->keep_tokens--;
 776
 777   return peektok;
 778 }
 779
  780 /* Allocate a single token that is invalidated at the same time as the
  781    rest of the tokens on the line.  Has its line and col set to the
  782    same as the last lexed token, so that diagnostics appear in the
  783    right place.

         The new token takes the slot at pfile->cur_token.  Lookahead
         tokens that were already lexed beyond that point are first moved
         up by one slot, spilling into the next token run when the
         current run has no room for them.  */
  784 cpp_token *
  785 _cpp_temp_token (cpp_reader *pfile)
  786 {
  787   cpp_token *old, *result;
        /* SZ: free slots from cur_token to the end of the current run.
           LA: number of lookahead tokens pending.  */
  788   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  789   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
  790
        /* The previously lexed token, whose location the new one copies.  */
  791   old = pfile->cur_token - 1;
  792   /* Any pre-existing lookaheads must not be clobbered.  */
  793   if (la)
  794     {
            /* The lookaheads fill the rest of this run (or more), so the
               last token of the run has to move into the next run.  */
  795       if (sz <= la)
  796         {
  797           tokenrun *next = next_tokenrun (pfile->cur_run);
  798
                /* Lookaheads already living in the next run shift up one.  */
  799           if (sz < la)
  800             memmove (next->base + 1, next->base,
  801                      (la - sz) * sizeof (cpp_token));
  802
  803           next->base[0] = pfile->cur_run->limit[-1];
  804         }
  805
            /* Shift the lookaheads that stay within the current run.  */
  806       if (sz > 1)
  807         memmove (pfile->cur_token + 1, pfile->cur_token,
  808                  MIN (la, sz - 1) * sizeof (cpp_token));
  809     }
  810
        /* No slot left in this run: continue in the next one.  Given how
           SZ is computed above, !sz already implies the second test.  */
  811   if (!sz && pfile->cur_token == pfile->cur_run->limit)
  812     {
  813       pfile->cur_run = next_tokenrun (pfile->cur_run);
  814       pfile->cur_token = pfile->cur_run->base;
  815     }
  816
  817   result = pfile->cur_token++;
  818   result->src_loc = old->src_loc;
  819   return result;
  820 }
 821
  822 /* Lex a token into RESULT (external interface).  Takes care of issues
  823    like directive handling, token lookahead, multiple include
  824    optimization and skipping.

         Returns the next token: either a pending lookahead token, a
         token freshly obtained from _cpp_lex_direct, or
         pfile->directive_result when a directive or deferred pragma
         supplies one.  While pfile->state.skipping is set, tokens
         outside directives are discarded until CPP_EOF.  */
  825 const cpp_token *
  826 _cpp_lex_token (cpp_reader *pfile)
  827 {
  828   cpp_token *result;
  829
  830   for (;;)
  831     {
            /* Current run exhausted: continue in the next one.  */
  832       if (pfile->cur_token == pfile->cur_run->limit)
  833         {
  834           pfile->cur_run = next_tokenrun (pfile->cur_run);
  835           pfile->cur_token = pfile->cur_run->base;
  836         }
  837       /* We assume that the current token is somewhere in the current
  838          run.  */
  839       if (pfile->cur_token < pfile->cur_run->base
  840           || pfile->cur_token >= pfile->cur_run->limit)
  841         abort ();
  842
            /* Tokens that were lexed earlier and backed up are handed
               out again before anything new is lexed.  */
  843       if (pfile->lookaheads)
  844         {
  845           pfile->lookaheads--;
  846           result = pfile->cur_token++;
  847         }
  848       else
  849         result = _cpp_lex_direct (pfile);
  850
            /* BOL flags a token as the first one on its line; only such
               tokens can start a directive.  */
  851       if (result->flags & BOL)
  852         {
  853           /* Is this a directive.  If _cpp_handle_directive returns
  854              false, it is an assembler #.  */
  855           if (result->type == CPP_HASH
  856               /* 6.10.3 p 11: Directives in a list of macro arguments
  857                  gives undefined behavior.  This implementation
  858                  handles the directive as normal.  */
  859               && pfile->state.parsing_args != 1)
  860             {
  861               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
  862                 {
                        /* A CPP_PADDING result means the directive yields
                           no token; go round again for the next one.  */
  863                   if (pfile->directive_result.type == CPP_PADDING)
  864                     continue;
  865                   result = &pfile->directive_result;
  866                 }
  867             }
  868           else if (pfile->state.in_deferred_pragma)
  869             result = &pfile->directive_result;
  870
                /* Tell the client about the new line, unless skipping.  */
  871           if (pfile->cb.line_change && !pfile->state.skipping)
  872             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
  873         }
  874
  875       /* We don't skip tokens in directives.  */
  876       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
  877         break;
  878
  879       /* Outside a directive, invalidate controlling macros.  At file
  880          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
  881          get here and MI optimization works.  */
  882       pfile->mi_valid = false;
  883
            /* When skipping, only CPP_EOF is let through.  */
  884       if (!pfile->state.skipping || result->type == CPP_EOF)
  885         break;
  886     }
  887
  888   return result;
  889 }
 890
 891 /* Returns true if a fresh line has been loaded.  */
 892 bool
 893 _cpp_get_fresh_line (cpp_reader *pfile)
 894 {
 895   int return_at_eof;
 896
 897   /* We can't get a new line until we leave the current directive.  */
 898   if (pfile->state.in_directive)
 899     return false;
 900
 901   for (;;)
 902     {
 903       cpp_buffer *buffer = pfile->buffer;
 904
 905       if (!buffer->need_line)
 906         return true;
 907
 908       if (buffer->next_line < buffer->rlimit)
 909         {
 910           _cpp_clean_line (pfile);
 911           return true;
 912         }
 913
 914       /* First, get out of parsing arguments state.  */
 915       if (pfile->state.parsing_args)
 916         return false;
 917
 918       /* End of buffer.  Non-empty files should end in a newline.  */
 919       if (buffer->buf != buffer->rlimit
 920           && buffer->next_line > buffer->rlimit
 921           && !buffer->from_stage3)
 922         {
 923           /* Clip to buffer size.  */
 924           buffer->next_line = buffer->rlimit;
 925         }
 926
 927       return_at_eof = buffer->return_at_eof;
 928       _cpp_pop_buffer (pfile);
 929       if (pfile->buffer == NULL || return_at_eof)
 930         return false;
 931     }
 932 }
 933
 /* Helper for _cpp_lex_direct.  Expects local variables BUFFER and
   RESULT to be in scope.  Sets RESULT's type to ELSE_TYPE; if the
   next unread character is CHAR, consumes it and sets the type to
   THEN_TYPE instead.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = ELSE_TYPE;                         \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
    }                                                   \
  while (0)
 942
 943 /* Lex a token into pfile->cur_token, which is also incremented, to
 944    get diagnostics pointing to the correct location.
 945
 946    Does not handle issues such as token lookahead, multiple-include
 947    optimization, directives, skipping etc.  This function is only
 948    suitable for use by _cpp_lex_token, and in special cases like
 949    lex_expansion_token which doesn't care for any of these issues.
 950
 951    When meeting a newline, returns CPP_EOF if parsing a directive,
 952    otherwise returns to the start of the token buffer if permissible.
 953    Returns a pointer to the lexed token.
     
        When the character just read does not produce a token, the
        function restarts at one of three labels rather than returning:
        fresh_line after a newline, update_tokens_line after a comment
        that is not being saved, and skipped_white after horizontal
        whitespace.  */
 954 cpp_token *
 955 _cpp_lex_direct (cpp_reader *pfile)
 956 {
 957   cppchar_t c;
 958   cpp_buffer *buffer;
 959   const unsigned char *comment_start;
 960   cpp_token *result = pfile->cur_token++;
 961
 962  fresh_line:
 963   result->flags = 0;
 964   buffer = pfile->buffer;
 965   if (buffer->need_line)
 966     {
           /* The line ending a deferred pragma is reported as its own
              token; leave deferred-pragma state and, unless expansion
              was allowed in the pragma, undo one level of
              prevent_expansion.  */
 967       if (pfile->state.in_deferred_pragma)
 968         {
 969           result->type = CPP_PRAGMA_EOL;
 970           pfile->state.in_deferred_pragma = false;
 971           if (!pfile->state.pragma_allow_expansion)
 972             pfile->state.prevent_expansion--;
 973           return result;
 974         }
           /* No further line could be obtained: hand back CPP_EOF.  */
 975       if (!_cpp_get_fresh_line (pfile))
 976         {
 977           result->type = CPP_EOF;
 978           if (!pfile->state.in_directive)
 979             {
 980               /* Tell the compiler the line number of the EOF token.  */
 981               result->src_loc = pfile->line_table->highest_line;
 982               result->flags = BOL;
 983             }
 984           return result;
 985         }
           /* Unless tokens are being kept, rewind to the first slot of
              the base token run so the new line reuses it.  */
 986       if (!pfile->keep_tokens)
 987         {
 988           pfile->cur_run = &pfile->base_run;
 989           result = pfile->base_run.base;
 990           pfile->cur_token = result + 1;
 991         }
 992       result->flags = BOL;
           /* NOTE(review): presumably parsing_args == 2 means macro
              arguments are being collected, so the line break counts as
              whitespace before this token -- confirm against the macro
              expander.  */
 993       if (pfile->state.parsing_args == 2)
 994         result->flags |= PREV_WHITE;
 995     }
       /* _cpp_get_fresh_line may have popped buffers, so reload.  */
 996   buffer = pfile->buffer;
 997  update_tokens_line:
 998   result->src_loc = pfile->line_table->highest_line;
 999
1000  skipped_white:
       /* Process pending line notes once the read position reaches the
          next note, unless an overlaid buffer is in use.  */
1001   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
1002       && !pfile->overlaid_buffer)
1003     {
1004       _cpp_process_line_notes (pfile, false);
1005       result->src_loc = pfile->line_table->highest_line;
1006     }
       /* After this, buffer->cur points one past C, which is why the
          cases below pass buffer->cur - 1 as the start of the token.  */
1007   c = *buffer->cur++;
1008
1009   LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
1010                                CPP_BUF_COLUMN (buffer, buffer->cur));
1011
1012   switch (c)
1013     {
1014     case ' ': case '\t': case '\f': case '\v': case '\0':
1015       result->flags |= PREV_WHITE;
1016       skip_whitespace (pfile, c);
1017       goto skipped_white;
1018
1019     case '\n':
           /* The line count is only advanced when the newline is not the
              last character of the buffer.  */
1020       if (buffer->cur < buffer->rlimit)
1021         CPP_INCREMENT_LINE (pfile, 0);
1022       buffer->need_line = true;
1023       goto fresh_line;
1024
1025     case '0': case '1': case '2': case '3': case '4':
1026     case '5': case '6': case '7': case '8': case '9':
1027       {
1028         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1029         result->type = CPP_NUMBER;
1030         lex_number (pfile, &result->val.str, &nst);
1031         warn_about_normalization (pfile, result, &nst);
1032         break;
1033       }
1034
1035     case 'L':
1036     case 'u':
1037     case 'U':
1038       /* 'L', 'u' or 'U' may introduce wide characters or strings.
              'u' and 'U' only do so when the uliterals option is set;
              otherwise, or when no quote follows, the letter starts an
              ordinary identifier.  */
1039       if (c == 'L' || CPP_OPTION (pfile, uliterals))
1040         {
1041           if (*buffer->cur == '\'' || *buffer->cur == '"')
1042             {
1043               lex_string (pfile, result, buffer->cur - 1);
1044               break;
1045             }
1046         }
1047       /* Fall through.  */
1048
1049     case '_':
1050     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1051     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1052     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1053     case 's': case 't':           case 'v': case 'w': case 'x':
1054     case 'y': case 'z':
1055     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1056     case 'G': case 'H': case 'I': case 'J': case 'K':
1057     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1058     case 'S': case 'T':           case 'V': case 'W': case 'X':
1059     case 'Y': case 'Z':
1060       result->type = CPP_NAME;
1061       {
1062         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1063         result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
1064                                            &nst);
1065         warn_about_normalization (pfile, result, &nst);
1066       }
1067
1068       /* Convert named operators to their proper types.  The node is
              kept in val.node and NAMED_OP is set so the token can still
              be spelled as written.  */
1069       if (result->val.node->flags & NODE_OPERATOR)
1070         {
1071           result->flags |= NAMED_OP;
1072           result->type = (enum cpp_ttype) result->val.node->directive_index;
1073         }
1074       break;
1075
1076     case '\'':
1077     case '"':
1078       lex_string (pfile, result, buffer->cur - 1);
1079       break;
1080
1081     case '/':
1082       /* A potential block or line comment.  */
1083       comment_start = buffer->cur;
1084       c = *buffer->cur;
1085
1086       if (c == '*')
1087         {
1088           if (_cpp_skip_block_comment (pfile))
1089             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
1090         }
1091       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1092                             || cpp_in_system_header (pfile)))
1093         {
1094           /* Warn about comments only if pedantically GNUC89, and not
1095              in system headers.  */
1096           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1097               && ! buffer->warned_cplusplus_comments)
1098             {
1099               cpp_error (pfile, CPP_DL_PEDWARN,
1100                          "C++ style comments are not allowed in ISO C90");
1101               cpp_error (pfile, CPP_DL_PEDWARN,
1102                          "(this will be reported only once per input file)");
1103               buffer->warned_cplusplus_comments = 1;
1104             }
1105
1106           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1107             cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
1108         }
1109       else if (c == '=')
1110         {
1111           buffer->cur++;
1112           result->type = CPP_DIV_EQ;
1113           break;
1114         }
1115       else
1116         {
1117           result->type = CPP_DIV;
1118           break;
1119         }
1120
           /* Only the two comment branches reach this point.  When
              comments are not being saved, the comment counts as
              whitespace and the next token is lexed into the same
              slot.  */
1121       if (!pfile->state.save_comments)
1122         {
1123           result->flags |= PREV_WHITE;
1124           goto update_tokens_line;
1125         }
1126
1127       /* Save the comment as a token in its own right.  C is '*' or
              '/' here, i.e. the character that followed the opening
              slash.  */
1128       save_comment (pfile, result, comment_start, c);
1129       break;
1130
1131     case '<':
           /* When angled_headers is set, '<' is passed to lex_string as
              the start of the token instead of being an operator.  */
1132       if (pfile->state.angled_headers)
1133         {
1134           lex_string (pfile, result, buffer->cur - 1);
1135           break;
1136         }
1137
1138       result->type = CPP_LESS;
1139       if (*buffer->cur == '=')
1140         buffer->cur++, result->type = CPP_LESS_EQ;
1141       else if (*buffer->cur == '<')
1142         {
1143           buffer->cur++;
1144           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1145         }
1146       else if (CPP_OPTION (pfile, digraphs))
1147         {
               /* Digraphs "<:" and "<%" stand for '[' and '{'.  */
1148           if (*buffer->cur == ':')
1149             {
1150               buffer->cur++;
1151               result->flags |= DIGRAPH;
1152               result->type = CPP_OPEN_SQUARE;
1153             }
1154           else if (*buffer->cur == '%')
1155             {
1156               buffer->cur++;
1157               result->flags |= DIGRAPH;
1158               result->type = CPP_OPEN_BRACE;
1159             }
1160         }
1161       break;
1162
1163     case '>':
1164       result->type = CPP_GREATER;
1165       if (*buffer->cur == '=')
1166         buffer->cur++, result->type = CPP_GREATER_EQ;
1167       else if (*buffer->cur == '>')
1168         {
1169           buffer->cur++;
1170           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1171         }
1172       break;
1173
1174     case '%':
1175       result->type = CPP_MOD;
1176       if (*buffer->cur == '=')
1177         buffer->cur++, result->type = CPP_MOD_EQ;
1178       else if (CPP_OPTION (pfile, digraphs))
1179         {
               /* Digraphs "%:" and "%:%:" stand for '#' and '##', and
                  "%>" for '}'.  */
1180           if (*buffer->cur == ':')
1181             {
1182               buffer->cur++;
1183               result->flags |= DIGRAPH;
1184               result->type = CPP_HASH;
1185               if (*buffer->cur == '%' && buffer->cur[1] == ':')
1186                 buffer->cur += 2, result->type = CPP_PASTE;
1187             }
1188           else if (*buffer->cur == '>')
1189             {
1190               buffer->cur++;
1191               result->flags |= DIGRAPH;
1192               result->type = CPP_CLOSE_BRACE;
1193             }
1194         }
1195       break;
1196
1197     case '.':
1198       result->type = CPP_DOT;
           /* A dot followed by a digit begins a number.  */
1199       if (ISDIGIT (*buffer->cur))
1200         {
1201           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1202           result->type = CPP_NUMBER;
1203           lex_number (pfile, &result->val.str, &nst);
1204           warn_about_normalization (pfile, result, &nst);
1205         }
1206       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1207         buffer->cur += 2, result->type = CPP_ELLIPSIS;
1208       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1209         buffer->cur++, result->type = CPP_DOT_STAR;
1210       break;
1211
1212     case '+':
1213       result->type = CPP_PLUS;
1214       if (*buffer->cur == '+')
1215         buffer->cur++, result->type = CPP_PLUS_PLUS;
1216       else if (*buffer->cur == '=')
1217         buffer->cur++, result->type = CPP_PLUS_EQ;
1218       break;
1219
1220     case '-':
1221       result->type = CPP_MINUS;
1222       if (*buffer->cur == '>')
1223         {
1224           buffer->cur++;
1225           result->type = CPP_DEREF;
1226           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1227             buffer->cur++, result->type = CPP_DEREF_STAR;
1228         }
1229       else if (*buffer->cur == '-')
1230         buffer->cur++, result->type = CPP_MINUS_MINUS;
1231       else if (*buffer->cur == '=')
1232         buffer->cur++, result->type = CPP_MINUS_EQ;
1233       break;
1234
1235     case '&':
1236       result->type = CPP_AND;
1237       if (*buffer->cur == '&')
1238         buffer->cur++, result->type = CPP_AND_AND;
1239       else if (*buffer->cur == '=')
1240         buffer->cur++, result->type = CPP_AND_EQ;
1241       break;
1242
1243     case '|':
1244       result->type = CPP_OR;
1245       if (*buffer->cur == '|')
1246         buffer->cur++, result->type = CPP_OR_OR;
1247       else if (*buffer->cur == '=')
1248         buffer->cur++, result->type = CPP_OR_EQ;
1249       break;
1250
1251     case ':':
1252       result->type = CPP_COLON;
1253       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1254         buffer->cur++, result->type = CPP_SCOPE;
1255       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1256         {
1257           buffer->cur++;
1258           result->flags |= DIGRAPH;
1259           result->type = CPP_CLOSE_SQUARE;
1260         }
1261       break;
1262
1263     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1264     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1265     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1266     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1267     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1268
1269     case '?': result->type = CPP_QUERY; break;
1270     case '~': result->type = CPP_COMPL; break;
1271     case ',': result->type = CPP_COMMA; break;
1272     case '(': result->type = CPP_OPEN_PAREN; break;
1273     case ')': result->type = CPP_CLOSE_PAREN; break;
1274     case '[': result->type = CPP_OPEN_SQUARE; break;
1275     case ']': result->type = CPP_CLOSE_SQUARE; break;
1276     case '{': result->type = CPP_OPEN_BRACE; break;
1277     case '}': result->type = CPP_CLOSE_BRACE; break;
1278     case ';': result->type = CPP_SEMICOLON; break;
1279
1280       /* @ is a punctuator in Objective-C.  */
1281     case '@': result->type = CPP_ATSIGN; break;
1282
1283     case '$':
1284     case '\\':
1285       {
             /* Step back onto the character so forms_identifier_p can
                examine it; if it begins an identifier, lex one starting
                at BASE.  */
1286         const uchar *base = --buffer->cur;
1287         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1288
1289         if (forms_identifier_p (pfile, true, &nst))
1290           {
1291             result->type = CPP_NAME;
1292             result->val.node = lex_identifier (pfile, base, true, &nst);
1293             warn_about_normalization (pfile, result, &nst);
1294             break;
1295           }
             /* Not an identifier: step forward again and fall through so
                the single character becomes a CPP_OTHER token.  */
1296         buffer->cur++;
1297       }
1298
1299     default:
1300       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1301       break;
1302     }
1303
1304   return result;
1305 }
1306
1307 /* An upper bound on the number of bytes needed to spell TOKEN.
1308    Does not include preceding whitespace.  */
1309 unsigned int
1310 cpp_token_len (const cpp_token *token)
1311 {
1312   unsigned int len;
1313
1314   switch (TOKEN_SPELL (token))
1315     {
1316     default:            len = 4;                                break;
1317     case SPELL_LITERAL: len = token->val.str.len;               break;
1318     case SPELL_IDENT:   len = NODE_LEN (token->val.node) * 10;  break;
1319     }
1320
1321   return len;
1322 }
1323
/* Decode the UTF-8 sequence starting at NAME and store the
   corresponding \U escape (backslash, 'U', eight lowercase hex
   digits) in BUFFER.  Exactly 10 bytes are written and no terminating
   nul is added.  Returns the number of bytes consumed from NAME,
   which is the count of leading one bits in the first byte.  Calls
   abort if a trailing byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *p = name;
  unsigned long code_point;
  unsigned lead_bits;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  lead_bits = *p;
  while (lead_bits & 0x80)
    {
      seq_len++;
      lead_bits <<= 1;
    }

  /* The payload of the first byte is whatever follows the length
     marker; each trailing byte then adds six more bits.  */
  code_point = *p & (0x7F >> seq_len);
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      p++;

      /* Ill-formed UTF-8.  */
      if ((*p & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (*p & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1357
1358
1359 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1360    already contain the enough space to hold the token's spelling.
1361    Returns a pointer to the character after the last character written.
1362    FORSTRING is true if this is to be the spelling after translation
1363    phase 1 (this is different for UCNs).
1364    FIXME: Would be nice if we didn't need the PFILE argument.  */
1365 unsigned char *
1366 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1367                  unsigned char *buffer, bool forstring)
1368 {
1369   switch (TOKEN_SPELL (token))
1370     {
1371     case SPELL_OPERATOR:
1372       {
1373         const unsigned char *spelling;
1374         unsigned char c;
1375
1376         if (token->flags & DIGRAPH)
1377           spelling
1378             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1379         else if (token->flags & NAMED_OP)
1380           goto spell_ident;
1381         else
1382           spelling = TOKEN_NAME (token);
1383
1384         while ((c = *spelling++) != '\0')
1385           *buffer++ = c;
1386       }
1387       break;
1388
1389     spell_ident:
1390     case SPELL_IDENT:
1391       if (forstring)
1392         {
1393           memcpy (buffer, NODE_NAME (token->val.node),
1394                   NODE_LEN (token->val.node));
1395           buffer += NODE_LEN (token->val.node);
1396         }
1397       else
1398         {
1399           size_t i;
1400           const unsigned char * name = NODE_NAME (token->val.node);
1401
1402           for (i = 0; i < NODE_LEN (token->val.node); i++)
1403             if (name[i] & ~0x7F)
1404               {
1405                 i += utf8_to_ucn (buffer, name + i) - 1;
1406                 buffer += 10;
1407               }
1408             else
1409               *buffer++ = NODE_NAME (token->val.node)[i];
1410         }
1411       break;
1412
1413     case SPELL_LITERAL:
1414       memcpy (buffer, token->val.str.text, token->val.str.len);
1415       buffer += token->val.str.len;
1416       break;
1417
1418     case SPELL_NONE:
1419       cpp_error (pfile, CPP_DL_ICE,
1420                  "unspellable token %s", TOKEN_NAME (token));
1421       break;
1422     }
1423
1424   return buffer;
1425 }
1426
1427 /* Returns TOKEN spelt as a null-terminated string.  The string is
1428    freed when the reader is destroyed.  Useful for diagnostics.  */
1429 unsigned char *
1430 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1431 {
1432   unsigned int len = cpp_token_len (token) + 1;
1433   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1434
1435   end = cpp_spell_token (pfile, token, start, false);
1436   end[0] = '\0';
1437
1438   return start;
1439 }
1440
1441 /* Used by C front ends, which really should move to using
1442    cpp_token_as_text.  */
1443 const char *
1444 cpp_type2name (enum cpp_ttype type)
1445 {
1446   return (const char *) token_spellings[type].name;
1447 }
1448
1449 /* Writes the spelling of token to FP, without any preceding space.
1450    Separated from cpp_spell_token for efficiency - to avoid stdio
1451    double-buffering.  */
1452 void
1453 cpp_output_token (const cpp_token *token, FILE *fp)
1454 {
1455   switch (TOKEN_SPELL (token))
1456     {
1457     case SPELL_OPERATOR:
1458       {
1459         const unsigned char *spelling;
1460         int c;
1461
1462         if (token->flags & DIGRAPH)
1463           spelling
1464             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1465         else if (token->flags & NAMED_OP)
1466           goto spell_ident;
1467         else
1468           spelling = TOKEN_NAME (token);
1469
1470         c = *spelling;
1471         do
1472           putc (c, fp);
1473         while ((c = *++spelling) != '\0');
1474       }
1475       break;
1476
1477     spell_ident:
1478     case SPELL_IDENT:
1479       {
1480         size_t i;
1481         const unsigned char * name = NODE_NAME (token->val.node);
1482
1483         for (i = 0; i < NODE_LEN (token->val.node); i++)
1484           if (name[i] & ~0x7F)
1485             {
1486               unsigned char buffer[10];
1487               i += utf8_to_ucn (buffer, name + i) - 1;
1488               fwrite (buffer, 1, 10, fp);
1489             }
1490           else
1491             fputc (NODE_NAME (token->val.node)[i], fp);
1492       }
1493       break;
1494
1495     case SPELL_LITERAL:
1496       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1497       break;
1498
1499     case SPELL_NONE:
1500       /* An error, most probably.  */
1501       break;
1502     }
1503 }
1504
1505 /* Compare two tokens.  */
1506 int
1507 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1508 {
1509   if (a->type == b->type && a->flags == b->flags)
1510     switch (TOKEN_SPELL (a))
1511       {
1512       default:                  /* Keep compiler happy.  */
1513       case SPELL_OPERATOR:
1514         return 1;
1515       case SPELL_NONE:
1516         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1517       case SPELL_IDENT:
1518         return a->val.node == b->val.node;
1519       case SPELL_LITERAL:
1520         return (a->val.str.len == b->val.str.len
1521                 && !memcmp (a->val.str.text, b->val.str.text,
1522                             a->val.str.len));
1523       }
1524
1525   return 0;
1526 }
1527
1528 /* Returns nonzero if a space should be inserted to avoid an
1529    accidental token paste for output.  For simplicity, it is
1530    conservative, and occasionally advises a space where one is not
1531    needed, e.g. "." and ".2".  */
1532 int
1533 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1534                  const cpp_token *token2)
1535 {
1536   enum cpp_ttype a = token1->type, b = token2->type;
1537   cppchar_t c;
1538
1539   if (token1->flags & NAMED_OP)
1540     a = CPP_NAME;
1541   if (token2->flags & NAMED_OP)
1542     b = CPP_NAME;
1543
1544   c = EOF;
1545   if (token2->flags & DIGRAPH)
1546     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1547   else if (token_spellings[b].category == SPELL_OPERATOR)
1548     c = token_spellings[b].name[0];
1549
1550   /* Quickly get everything that can paste with an '='.  */
1551   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1552     return 1;
1553
1554   switch (a)
1555     {
1556     case CPP_GREATER:   return c == '>';
1557     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
1558     case CPP_PLUS:      return c == '+';
1559     case CPP_MINUS:     return c == '-' || c == '>';
1560     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1561     case CPP_MOD:       return c == ':' || c == '>';
1562     case CPP_AND:       return c == '&';
1563     case CPP_OR:        return c == '|';
1564     case CPP_COLON:     return c == ':' || c == '>';
1565     case CPP_DEREF:     return c == '*';
1566     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1567     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1568     case CPP_NAME:      return ((b == CPP_NUMBER
1569                                  && name_p (pfile, &token2->val.str))
1570                                 || b == CPP_NAME
1571                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1572     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1573                                 || c == '.' || c == '+' || c == '-');
1574                                       /* UCNs */
1575     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1576                                  && b == CPP_NAME)
1577                                 || (CPP_OPTION (pfile, objc)
1578                                     && token1->val.str.text[0] == '@'
1579                                     && (b == CPP_NAME || b == CPP_STRING)));
1580     default:            break;
1581     }
1582
1583   return 0;
1584 }
1585
1586 /* Output all the remaining tokens on the current line, and a newline
1587    character, to FP.  Leading whitespace is removed.  If there are
1588    macros, special token padding is not performed.  */
1589 void
1590 cpp_output_line (cpp_reader *pfile, FILE *fp)
1591 {
1592   const cpp_token *token;
1593
1594   token = cpp_get_token (pfile);
1595   while (token->type != CPP_EOF)
1596     {
1597       cpp_output_token (token, fp);
1598       token = cpp_get_token (pfile);
1599       if (token->flags & PREV_WHITE)
1600         putc (' ', fp);
1601     }
1602
1603   putc ('\n', fp);
1604 }
1605
1606 /* Return a string representation of all the remaining tokens on the
1607    current line.  The result is allocated using xmalloc and must be
1608    freed by the caller.  */
1609 unsigned char *
1610 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1611 {
1612   const cpp_token *token;
1613   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
1614   unsigned int alloced = 120 + out;
1615   unsigned char *result = (unsigned char *) xmalloc (alloced);
1616
1617   /* If DIR_NAME is empty, there are no initial contents.  */
1618   if (dir_name)
1619     {
1620       sprintf ((char *) result, "#%s ", dir_name);
1621       out += 2;
1622     }
1623
1624   token = cpp_get_token (pfile);
1625   while (token->type != CPP_EOF)
1626     {
1627       unsigned char *last;
1628       /* Include room for a possible space and the terminating nul.  */
1629       unsigned int len = cpp_token_len (token) + 2;
1630
1631       if (out + len > alloced)
1632         {
1633           alloced *= 2;
1634           if (out + len > alloced)
1635             alloced = out + len;
1636           result = (unsigned char *) xrealloc (result, alloced);
1637         }
1638
1639       last = cpp_spell_token (pfile, token, &result[out], 0);
1640       out = last - result;
1641
1642       token = cpp_get_token (pfile);
1643       if (token->flags & PREV_WHITE)
1644         result[out++] = ' ';
1645     }
1646
1647   result[out] = '\0';
1648   return result;
1649 }
1650
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload a new buffer is given.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an argument containing
   operators expands as a single operand.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1665
1666 /* Create a new allocation buffer.  Place the control block at the end
1667    of the buffer, so that buffer overflows will cause immediate chaos.  */
1668 static _cpp_buff *
1669 new_buff (size_t len)
1670 {
1671   _cpp_buff *result;
1672   unsigned char *base;
1673
1674   if (len < MIN_BUFF_SIZE)
1675     len = MIN_BUFF_SIZE;
1676   len = CPP_ALIGN (len);
1677
1678   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1679   result = (_cpp_buff *) (base + len);
1680   result->base = base;
1681   result->cur = base;
1682   result->limit = base + len;
1683   result->next = NULL;
1684   return result;
1685 }
1686
1687 /* Place a chain of unwanted allocation buffers on the free list.  */
1688 void
1689 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1690 {
1691   _cpp_buff *end = buff;
1692
1693   while (end->next)
1694     end = end->next;
1695   end->next = pfile->free_buffs;
1696   pfile->free_buffs = buff;
1697 }
1698
1699 /* Return a free buffer of size at least MIN_SIZE.  */
1700 _cpp_buff *
1701 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1702 {
1703   _cpp_buff *result, **p;
1704
1705   for (p = &pfile->free_buffs;; p = &(*p)->next)
1706     {
1707       size_t size;
1708
1709       if (*p == NULL)
1710         return new_buff (min_size);
1711       result = *p;
1712       size = result->limit - result->base;
1713       /* Return a buffer that's big enough, but don't waste one that's
1714          way too big.  */
1715       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1716         break;
1717     }
1718
1719   *p = result->next;
1720   result->next = NULL;
1721   result->cur = result->base;
1722   return result;
1723 }
1724
1725 /* Creates a new buffer with enough space to hold the uncommitted
1726    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1727    the excess bytes to the new buffer.  Chains the new buffer after
1728    BUFF, and returns the new buffer.  */
1729 _cpp_buff *
1730 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1731 {
1732   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1733   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1734
1735   buff->next = new_buff;
1736   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1737   return new_buff;
1738 }
1739
1740 /* Creates a new buffer with enough space to hold the uncommitted
1741    remaining bytes of the buffer pointed to by BUFF, and at least
1742    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1743    Chains the new buffer before the buffer pointed to by BUFF, and
1744    updates the pointer to point to the new buffer.  */
1745 void
1746 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1747 {
1748   _cpp_buff *new_buff, *old_buff = *pbuff;
1749   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1750
1751   new_buff = _cpp_get_buff (pfile, size);
1752   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1753   new_buff->next = old_buff;
1754   *pbuff = new_buff;
1755 }
1756
1757 /* Free a chain of buffers starting at BUFF.  */
1758 void
1759 _cpp_free_buff (_cpp_buff *buff)
1760 {
1761   _cpp_buff *next;
1762
1763   for (; buff; buff = next)
1764     {
1765       next = buff->next;
1766       free (buff->base);
1767     }
1768 }
1769
1770 /* Allocate permanent, unaligned storage of length LEN.  */
1771 unsigned char *
1772 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1773 {
1774   _cpp_buff *buff = pfile->u_buff;
1775   unsigned char *result = buff->cur;
1776
1777   if (len > (size_t) (buff->limit - result))
1778     {
1779       buff = _cpp_get_buff (pfile, len);
1780       buff->next = pfile->u_buff;
1781       pfile->u_buff = buff;
1782       result = buff->cur;
1783     }
1784
1785   buff->cur = result + len;
1786   return result;
1787 }
1788
1789 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1790    That buffer is used for growing allocations when saving macro
1791    replacement lists in a #define, and when parsing an answer to an
1792    assertion in #assert, #unassert or #if (and therefore possibly
1793    whilst expanding macros).  It therefore must not be used by any
1794    code that they might call: specifically the lexer and the guts of
1795    the macro expander.
1796
1797    All existing other uses clearly fit this restriction: storing
1798    registered pragmas during initialization.  */
1799 unsigned char *
1800 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1801 {
1802   _cpp_buff *buff = pfile->a_buff;
1803   unsigned char *result = buff->cur;
1804
1805   if (len > (size_t) (buff->limit - result))
1806     {
1807       buff = _cpp_get_buff (pfile, len);
1808       buff->next = pfile->a_buff;
1809       pfile->a_buff = buff;
1810       result = buff->cur;
1811     }
1812
1813   buff->cur = result + len;
1814   return result;
1815 }
1816
1817 /* Say which field of TOK is in use.  */
1818
1819 enum cpp_token_fld_kind
1820 cpp_token_val_index (cpp_token *tok)
1821 {
1822   switch (TOKEN_SPELL (tok))
1823     {
1824     case SPELL_IDENT:
1825       return CPP_TOKEN_FLD_NODE;
1826     case SPELL_LITERAL:
1827       return CPP_TOKEN_FLD_STR;
1828     case SPELL_NONE:
1829       if (tok->type == CPP_MACRO_ARG)
1830         return CPP_TOKEN_FLD_ARG_NO;
1831       else if (tok->type == CPP_PADDING)
1832         return CPP_TOKEN_FLD_SOURCE;
1833       else if (tok->type == CPP_PRAGMA)
1834         return CPP_TOKEN_FLD_PRAGMA;
1835       /* else fall through */
1836     default:
1837       return CPP_TOKEN_FLD_NONE;
1838     }
1839 }