libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009
   3    Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
  28 enum spell_type
     /* Spelling category stored per token type in token_spellings.
        NOTE(review): the precise meaning of each category is decided by
        the code that reads TOKEN_SPELL, which is not in this part of the
        file -- confirm there before relying on the names alone.  */
  29 {
  30   SPELL_OPERATOR = 0,	/* Given to every OP() entry of TTYPE_TABLE.  */
  31   SPELL_IDENT,		/* Selected by a TK() entry via SPELL_ ## s.  */
  32   SPELL_LITERAL,		/* Selected by a TK() entry via SPELL_ ## s.  */
  33   SPELL_NONE		/* Selected by a TK() entry via SPELL_ ## s.  */
  34 };
  35
  36 struct token_spelling
     /* One row of the token_spellings table, indexed by token type.  */
  37 {
  38   enum spell_type category;	/* Read through TOKEN_SPELL.  */
     /* Read through TOKEN_NAME: the operator's spelling string for OP()
        rows, the stringified enumerator name for TK() rows.  */
  39   const unsigned char *name;
  40 };
  41
  42 static const unsigned char *const digraph_spellings[] =
     /* Spellings of the six digraph tokens.  NOTE(review): the index
        scheme used to select an entry is defined by the user of this
        table, which is not visible here -- keep the order unchanged.  */
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
     /* OP and TK are defined only while TTYPE_TABLE is expanded below:
        OP rows record the spelling string S with category SPELL_OPERATOR,
        TK rows record the stringified enumerator E with category
        SPELL_<S>.  The table therefore has one row per token type.  */
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50
     /* Accessors for the table row belonging to TOKEN's type.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
     /* The prototypes above are forward declarations of file-local
        helpers; each is defined further down in this file.  */
  65
     /* NOTE(review): the definition of new_buff is not within the part
        of the file reviewed here.  */
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 /* Returns with a logical line that contains no escaped newlines or
 100    trigraphs.  This is a time-critical inner loop.

        Scanning starts at buffer->next_line.  On return buffer->cur and
        buffer->line_base point at the start of the line, a '\n' has been
        stored at the end of the cleaned line, buffer->next_line points
        just past the input that was consumed, and buffer->need_line is
        false.  A line note is recorded for every escaped newline and
        every trigraph encountered, followed by one '\n' sentinel note.
        Trigraphs are replaced only when CPP_OPTION (pfile, trigraphs) is
        set; the note is recorded either way.  When a splice or a
        replacement is needed the line is compacted in place.  */
 101 void
 102 _cpp_clean_line (cpp_reader *pfile)
 103 {
 104   cpp_buffer *buffer;
     /* S is the read position; D is the write position once compaction
        has started; P is a scratch pointer for looking backwards.  */
 105   const uchar *s;
 106   uchar c, *d, *p;
 107
 108   buffer = pfile->buffer;
     /* Discard the notes of the previous line.  */
 109   buffer->cur_note = buffer->notes_used = 0;
 110   buffer->cur = buffer->line_base = buffer->next_line;
 111   buffer->need_line = false;
     /* One before the first character: both loops advance with *++s.  */
 112   s = buffer->next_line - 1;
 113
 114   if (!buffer->from_stage3)
 115     {
     /* Most recent backslash seen on this line, if any.  */
 116       const uchar *pbackslash = NULL;
 117
 118       /* Short circuit for the common case of an un-escaped line with
 119          no trigraphs.  The primary win here is by not writing any
 120          data back to memory until we have to.  */
 121       for (;;)
 122         {
 123           c = *++s;
 124           if (__builtin_expect (c == '\n', false)
 125               || __builtin_expect (c == '\r', false))
 126             {
     /* D marks the terminator; the '\n' is stored there at "done".  */
 127               d = (uchar *) s;
 128
     /* Terminator at the buffer limit: finish without examining
        anything beyond it.  */
 129               if (__builtin_expect (s == buffer->rlimit, false))
 130                 goto done;
 131
 132               /* DOS line ending? */
 133               if (__builtin_expect (c == '\r', false)
 134                   && s[1] == '\n')
 135                 {
     /* Consume the '\n' of the pair; D stays on the '\r'.  */
 136                   s++;
 137                   if (s == buffer->rlimit)
 138                     goto done;
 139                 }
 140
     /* No backslash on this line, so the newline cannot be escaped.  */
 141               if (__builtin_expect (pbackslash == NULL, true))
 142                 goto done;
 143
 144               /* Check for escaped newline.  */
     /* Step back over non-vertical whitespace before the terminator;
        the newline is escaped only if what precedes that whitespace is
        the most recent backslash.  */
 145               p = d;
 146               while (is_nvspace (p[-1]))
 147                 p--;
 148               if (p - 1 != pbackslash)
 149                 goto done;
 150
 151               /* Have an escaped newline; process it and proceed to
 152                  the slow path.  */
     /* The note sits on the backslash; its type is ' ' when whitespace
        separated backslash and newline, and a backslash otherwise.  */
 153               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
     /* The slow loop pre-increments D, so its first store lands on the
        backslash.  next_line temporarily marks the splice point and is
        the lower bound of the slow loop's backward scan; it is set to
        its final value at "done".  */
 154               d = p - 2;
 155               buffer->next_line = p - 1;
 156               break;
 157             }
 158           if (__builtin_expect (c == '\\', false))
 159             pbackslash = s;
 160           else if (__builtin_expect (c == '?', false)
 161                    && __builtin_expect (s[1] == '?', false)
 162                    && _cpp_trigraph_map[s[2]])
 163             {
 164               /* Have a trigraph.  We may or may not have to convert
 165                  it.  Add a line note regardless, for -Wtrigraphs.  */
     /* The note's type is the third character of the trigraph.  */
 166               add_line_note (buffer, s, s[2]);
 167               if (CPP_OPTION (pfile, trigraphs))
 168                 {
 169                   /* We do, and that means we have to switch to the
 170                      slow path.  */
     /* Overwrite the first '?' with the replacement character and skip
        the other two source characters.  */
 171                   d = (uchar *) s;
 172                   *d = _cpp_trigraph_map[s[2]];
 173                   s += 2;
 174                   break;
 175                 }
 176             }
 177         }
 178
 179
     /* Slow path: copy each character from S down to D so that removed
        characters are squeezed out of the line.  */
 180       for (;;)
 181         {
 182           c = *++s;
 183           *++d = c;
 184
 185           if (c == '\n' || c == '\r')
 186             {
 187                   /* Handle DOS line endings.  */
 188               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 189                 s++;
 190               if (s == buffer->rlimit)
 191                 break;
 192
 193               /* Escaped?  */
     /* Same backward scan as in the fast path, but bounded below by the
        previous splice point held in next_line.  */
 194               p = d;
 195               while (p != buffer->next_line && is_nvspace (p[-1]))
 196                 p--;
 197               if (p == buffer->next_line || p[-1] != '\\')
 198                 break;
 199
 200               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 201               d = p - 2;
 202               buffer->next_line = p - 1;
 203             }
 204           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 205             {
 206               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
     /* The note is placed at the destination position D.  */
 207               add_line_note (buffer, d, s[2]);
 208               if (CPP_OPTION (pfile, trigraphs))
 209                 {
     /* Replace the '?' just copied and skip the rest of the trigraph.  */
 210                   *d = _cpp_trigraph_map[s[2]];
 211                   s += 2;
 212                 }
 213             }
 214         }
 215     }
 216   else
 217     {
     /* from_stage3 buffers get no trigraph or escaped-newline handling:
        only the end of the line is located.  */
 218       do
 219         s++;
 220       while (*s != '\n' && *s != '\r');
 221       d = (uchar *) s;
 222
 223       /* Handle DOS line endings.  */
 224       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 225         s++;
 226     }
 227
 228  done:
     /* Whatever the terminator was, the cleaned line ends in '\n'.  */
 229   *d = '\n';
 230   /* A sentinel note that should never be processed.  */
 231   add_line_note (buffer, d + 1, '\n');
 232   buffer->next_line = s + 1;
 233 }
 234
 235 /* Return true if the trigraph indicated by NOTE should be warned
 236    about in a comment.  */
 237 static bool
 238 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 239 {
 240   const uchar *p;
 241
 242   /* Within comments we don't warn about trigraphs, unless the
 243      trigraph forms an escaped newline, as that may change
 244      behavior.  */
 245   if (note->type != '/')
 246     return false;
 247
 248   /* If -trigraphs, then this was an escaped newline iff the next note
 249      is coincident.  */
 250   if (CPP_OPTION (pfile, trigraphs))
 251     return note[1].pos == note->pos;
 252
 253   /* Otherwise, see if this forms an escaped newline.  */
 254   p = note->pos + 3;
 255   while (is_nvspace (*p))
 256     p++;
 257
 258   /* There might have been escaped newlines between the trigraph and the
 259      newline we found.  Hence the position test.  */
 260   return (*p == '\n' && p < note[1].pos);
 261 }
 262
 263 /* Process the notes created by add_line_note as far as the current
 264    location.  */
 265 void
 266 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 267 {
 268   cpp_buffer *buffer = pfile->buffer;
 269
 270   for (;;)
 271     {
 272       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 273       unsigned int col;
 274
 275       if (note->pos > buffer->cur)
 276         break;
 277
 278       buffer->cur_note++;
 279       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 280
 281       if (note->type == '\\' || note->type == ' ')
 282         {
 283           if (note->type == ' ' && !in_comment)
 284             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 285                                  "backslash and newline separated by space");
 286
 287           if (buffer->next_line > buffer->rlimit)
 288             {
 289               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 290                                    "backslash-newline at end of file");
 291               /* Prevent "no newline at end of file" warning.  */
 292               buffer->next_line = buffer->rlimit;
 293             }
 294
 295           buffer->line_base = note->pos;
 296           CPP_INCREMENT_LINE (pfile, 0);
 297         }
 298       else if (_cpp_trigraph_map[note->type])
 299         {
 300           if (CPP_OPTION (pfile, warn_trigraphs)
 301               && (!in_comment || warn_in_comment (pfile, note)))
 302             {
 303               if (CPP_OPTION (pfile, trigraphs))
 304                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 305                                      "trigraph ??%c converted to %c",
 306                                      note->type,
 307                                      (int) _cpp_trigraph_map[note->type]);
 308               else
 309                 {
 310                   cpp_error_with_line
 311                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 312                      "trigraph ??%c ignored, use -trigraphs to enable",
 313                      note->type);
 314                 }
 315             }
 316         }
 317       else
 318         abort ();
 319     }
 320 }
 321
 322 /* Skip a C-style block comment.  We find the end of the comment by
 323    seeing if an asterisk is before every '/' we encounter.  Returns
 324    nonzero if comment terminated by EOF, zero otherwise.
 325
 326    Buffer->cur points to the initial asterisk of the comment.

        On a normal return buffer->cur is just past the closing '/' and
        the line notes up to there have been processed.  When the input
        runs out first, buffer->cur is left on the newline at which that
        was detected.  */
 327 bool
 328 _cpp_skip_block_comment (cpp_reader *pfile)
 329 {
 330   cpp_buffer *buffer = pfile->buffer;
     /* Local cursor; written back to buffer->cur only where needed.  */
 331   const uchar *cur = buffer->cur;
 332   uchar c;
 333
     /* Step over the opening asterisk.  A '/' directly after it is
        skipped as well, so the "cur[-2] == '*'" test below can never
        pair the opening asterisk with that slash.  */
 334   cur++;
 335   if (*cur == '/')
 336     cur++;
 337
 338   for (;;)
 339     {
 340       /* People like decorating comments with '*', so check for '/'
 341          instead for efficiency.  */
 342       c = *cur++;
 343
 344       if (c == '/')
 345         {
     /* CUR is already one past the '/', so cur[-2] precedes it.  */
 346           if (cur[-2] == '*')
 347             break;
 348
 349           /* Warn about potential nested comments, but not if the '/'
 350              comes immediately before the true comment delimiter.
 351              Don't bother to get it right across escaped newlines.  */
 352           if (CPP_OPTION (pfile, warn_comments)
 353               && cur[0] == '*' && cur[1] != '/')
 354             {
     /* Publish the cursor so CPP_BUF_COL reports the right column.  */
 355               buffer->cur = cur;
 356               cpp_error_with_line (pfile, CPP_DL_WARNING,
 357                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 358                                    "\"/*\" within comment");
 359             }
 360         }
 361       else if (c == '\n')
 362         {
 363           unsigned int cols;
     /* Point buffer->cur at the newline itself and flush the notes of
        the line just finished.  */
 364           buffer->cur = cur - 1;
 365           _cpp_process_line_notes (pfile, true);
     /* No further input: the comment is unterminated.  */
 366           if (buffer->next_line >= buffer->rlimit)
 367             return true;
     /* Load the next line and account for it in the line table.  */
 368           _cpp_clean_line (pfile);
 369
 370           cols = buffer->next_line - buffer->line_base;
 371           CPP_INCREMENT_LINE (pfile, cols);
 372
     /* _cpp_clean_line reset buffer->cur to the start of the new line.  */
 373           cur = buffer->cur;
 374         }
 375     }
 376
 377   buffer->cur = cur;
 378   _cpp_process_line_notes (pfile, true);
 379   return false;
 380 }
 381
 382 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 383    terminating newline.  Handles escaped newlines.  Returns nonzero
 384    if a multiline comment.  */
 385 static int
 386 skip_line_comment (cpp_reader *pfile)
 387 {
 388   cpp_buffer *buffer = pfile->buffer;
 389   source_location orig_line = pfile->line_table->highest_line;
 390
 391   while (*buffer->cur != '\n')
 392     buffer->cur++;
 393
 394   _cpp_process_line_notes (pfile, true);
 395   return orig_line != pfile->line_table->highest_line;
 396 }
 397
 398 /* Skips whitespace, saving the next non-whitespace character.  */
 399 static void
 400 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 401 {
 402   cpp_buffer *buffer = pfile->buffer;
 403   bool saw_NUL = false;
 404
 405   do
 406     {
 407       /* Horizontal space always OK.  */
 408       if (c == ' ' || c == '\t')
 409         ;
 410       /* Just \f \v or \0 left.  */
 411       else if (c == '\0')
 412         saw_NUL = true;
 413       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 414         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 415                              CPP_BUF_COL (buffer),
 416                              "%s in preprocessing directive",
 417                              c == '\f' ? "form feed" : "vertical tab");
 418
 419       c = *buffer->cur++;
 420     }
 421   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 422   while (is_nvspace (c));
 423
 424   if (saw_NUL)
 425     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 426
 427   buffer->cur--;
 428 }
 429
 430 /* See if the characters of a number token are valid in a name (no
 431    '.', '+' or '-').  */
 432 static int
 433 name_p (cpp_reader *pfile, const cpp_string *string)
 434 {
 435   unsigned int i;
 436
 437   for (i = 0; i < string->len; i++)
 438     if (!is_idchar (string->text[i]))
 439       return 0;
 440
 441   return 1;
 442 }
 443
 444 /* After parsing an identifier or other sequence, produce a warning about
 445    sequences not in NFC/NFKC.  */
 446 static void
 447 warn_about_normalization (cpp_reader *pfile,
 448                           const cpp_token *token,
 449                           const struct normalize_state *s)
 450 {
 451   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 452       && !pfile->state.skipping)
 453     {
 454       /* Make sure that the token is printed using UCNs, even
 455          if we'd otherwise happily print UTF-8.  */
 456       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 457       size_t sz;
 458
 459       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 460       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 461         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 462                              "`%.*s' is not in NFKC", (int) sz, buf);
 463       else
 464         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 465                              "`%.*s' is not in NFC", (int) sz, buf);
 466     }
 467 }
 468
 469 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
 470    an identifier.  FIRST is TRUE if this starts an identifier.  */
 471 static bool
 472 forms_identifier_p (cpp_reader *pfile, int first,
 473                     struct normalize_state *state)
 474 {
 475   cpp_buffer *buffer = pfile->buffer;
 476
 477   if (*buffer->cur == '$')
 478     {
 479       if (!CPP_OPTION (pfile, dollars_in_ident))
 480         return false;
 481
 482       buffer->cur++;
 483       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 484         {
 485           CPP_OPTION (pfile, warn_dollars) = 0;
 486           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 487         }
 488
 489       return true;
 490     }
 491
 492   /* Is this a syntactically valid UCN?  */
 493   if (CPP_OPTION (pfile, extended_identifiers)
 494       && *buffer->cur == '\\'
 495       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 496     {
 497       buffer->cur += 2;
 498       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 499                           state))
 500         return true;
 501       buffer->cur -= 2;
 502     }
 503
 504   return false;
 505 }
 506
 507 /* Lex an identifier starting at BUFFER->CUR - 1.  */
 508 static cpp_hashnode *
 509 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 510                 struct normalize_state *nst)
 511 {
 512   cpp_hashnode *result;
 513   const uchar *cur;
 514   unsigned int len;
 515   unsigned int hash = HT_HASHSTEP (0, *base);
 516
 517   cur = pfile->buffer->cur;
 518   if (! starts_ucn)
 519     while (ISIDNUM (*cur))
 520       {
 521         hash = HT_HASHSTEP (hash, *cur);
 522         cur++;
 523       }
 524   pfile->buffer->cur = cur;
 525   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 526     {
 527       /* Slower version for identifiers containing UCNs (or $).  */
 528       do {
 529         while (ISIDNUM (*pfile->buffer->cur))
 530           {
 531             pfile->buffer->cur++;
 532             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 533           }
 534       } while (forms_identifier_p (pfile, false, nst));
 535       result = _cpp_interpret_identifier (pfile, base,
 536                                           pfile->buffer->cur - base);
 537     }
 538   else
 539     {
 540       len = cur - base;
 541       hash = HT_HASHFINISH (hash, len);
 542
 543       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
 544                                                   base, len, hash, HT_ALLOC));
 545     }
 546
 547   /* Rarely, identifiers require diagnostics when lexed.  */
 548   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 549                         && !pfile->state.skipping, 0))
 550     {
 551       /* It is allowed to poison the same identifier twice.  */
 552       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 553         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 554                    NODE_NAME (result));
 555
 556       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 557          replacement list of a variadic macro.  */
 558       if (result == pfile->spec_nodes.n__VA_ARGS__
 559           && !pfile->state.va_args_ok)
 560         cpp_error (pfile, CPP_DL_PEDWARN,
 561                    "__VA_ARGS__ can only appear in the expansion"
 562                    " of a C99 variadic macro");
 563     }
 564
 565   return result;
 566 }
 567
 568 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 569 static void
 570 lex_number (cpp_reader *pfile, cpp_string *number,
 571             struct normalize_state *nst)
 572 {
 573   const uchar *cur;
 574   const uchar *base;
 575   uchar *dest;
 576
 577   base = pfile->buffer->cur - 1;
 578   do
 579     {
 580       cur = pfile->buffer->cur;
 581
 582       /* N.B. ISIDNUM does not include $.  */
 583       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 584         {
 585           cur++;
 586           NORMALIZE_STATE_UPDATE_IDNUM (nst);
 587         }
 588
 589       pfile->buffer->cur = cur;
 590     }
 591   while (forms_identifier_p (pfile, false, nst));
 592
 593   number->len = cur - base;
 594   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 595   memcpy (dest, base, number->len);
 596   dest[number->len] = '\0';
 597   number->text = dest;
 598 }
 599
 600 /* Create a token of type TYPE with a literal spelling.  */
 601 static void
 602 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 603                 unsigned int len, enum cpp_ttype type)
 604 {
 605   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
 606
 607   memcpy (dest, base, len);
 608   dest[len] = '\0';
 609   token->type = type;
 610   token->val.str.len = len;
 611   token->val.str.text = dest;
 612 }
 613
 614 /* Lexes a string, character constant, or angle-bracketed header file
 615    name.  The stored string contains the spelling, including opening
 616    quote and leading any leading 'L', 'u' or 'U'.  It returns the type
 617    of the literal, or CPP_OTHER if it was not properly terminated, or
 618    CPP_LESS for an unterminated header name which must be relexed as
 619    normal tokens.
 620
 621    The spelling is NUL-terminated, but it is not guaranteed that this
 622    is the first NUL since embedded NULs are preserved.  */
 623 static void
 624 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 625 {
 626   bool saw_NUL = false;
 627   const uchar *cur;
 628   cppchar_t terminator;
 629   enum cpp_ttype type;
 630
 631   cur = base;
 632   terminator = *cur++;
 633   if (terminator == 'L' || terminator == 'u' || terminator == 'U')
 634     terminator = *cur++;
 635   if (terminator == '\"')
 636     type = (*base == 'L' ? CPP_WSTRING :
 637             *base == 'U' ? CPP_STRING32 :
 638             *base == 'u' ? CPP_STRING16 : CPP_STRING);
 639   else if (terminator == '\'')
 640     type = (*base == 'L' ? CPP_WCHAR :
 641             *base == 'U' ? CPP_CHAR32 :
 642             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
 643   else
 644     terminator = '>', type = CPP_HEADER_NAME;
 645
 646   for (;;)
 647     {
 648       cppchar_t c = *cur++;
 649
 650       /* In #include-style directives, terminators are not escapable.  */
 651       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 652         cur++;
 653       else if (c == terminator)
 654         break;
 655       else if (c == '\n')
 656         {
 657           cur--;
 658           /* Unmatched quotes always yield undefined behavior, but
 659              greedy lexing means that what appears to be an unterminated
 660              header name may actually be a legitimate sequence of tokens.  */
 661           if (terminator == '>')
 662             {
 663               token->type = CPP_LESS;
 664               return;
 665             }
 666           type = CPP_OTHER;
 667           break;
 668         }
 669       else if (c == '\0')
 670         saw_NUL = true;
 671     }
 672
 673   if (saw_NUL && !pfile->state.skipping)
 674     cpp_error (pfile, CPP_DL_WARNING,
 675                "null character(s) preserved in literal");
 676
 677   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
 678     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
 679                (int) terminator);
 680
 681   pfile->buffer->cur = cur;
 682   create_literal (pfile, token, base, cur - base, type);
 683 }
 684
 685 /* Return the comment table. The client may not make any assumption
 686    about the ordering of the table.  */
 687 cpp_comment_table *
 688 cpp_get_comments (cpp_reader *pfile)
 689 {
 690   return &pfile->comments;
 691 }
 692
 693 /* Append a comment to the end of the comment table. */
 694 static void
 695 store_comment (cpp_reader *pfile, cpp_token *token)
 696 {
 697   int len;
 698
 699   if (pfile->comments.allocated == 0)
 700     {
 701       pfile->comments.allocated = 256;
 702       pfile->comments.entries = (cpp_comment *) xmalloc
 703         (pfile->comments.allocated * sizeof (cpp_comment));
 704     }
 705
 706   if (pfile->comments.count == pfile->comments.allocated)
 707     {
 708       pfile->comments.allocated *= 2;
 709       pfile->comments.entries = (cpp_comment *) xrealloc
 710         (pfile->comments.entries,
 711          pfile->comments.allocated * sizeof (cpp_comment));
 712     }
 713
 714   len = token->val.str.len;
 715
 716   /* Copy comment. Note, token may not be NULL terminated. */
 717   pfile->comments.entries[pfile->comments.count].comment =
 718     (char *) xmalloc (sizeof (char) * (len + 1));
 719   memcpy (pfile->comments.entries[pfile->comments.count].comment,
 720           token->val.str.text, len);
 721   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
 722
 723   /* Set source location. */
 724   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
 725
 726   /* Increment the count of entries in the comment table. */
 727   pfile->comments.count++;
 728 }
 729
 730 /* The stored comment includes the comment start and any terminator.  */
 731 static void
 732 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
 733               cppchar_t type)
 734 {
 735   unsigned char *buffer;
 736   unsigned int len, clen;
 737
 738   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
 739
 740   /* C++ comments probably (not definitely) have moved past a new
 741      line, which we don't want to save in the comment.  */
 742   if (is_vspace (pfile->buffer->cur[-1]))
 743     len--;
 744
 745   /* If we are currently in a directive, then we need to store all
 746      C++ comments as C comments internally, and so we need to
 747      allocate a little extra space in that case.
 748
 749      Note that the only time we encounter a directive here is
 750      when we are saving comments in a "#define".  */
 751   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
 752
 753   buffer = _cpp_unaligned_alloc (pfile, clen);
 754
 755   token->type = CPP_COMMENT;
 756   token->val.str.len = clen;
 757   token->val.str.text = buffer;
 758
 759   buffer[0] = '/';
 760   memcpy (buffer + 1, from, len - 1);
 761
 762   /* Finish conversion to a C comment, if necessary.  */
 763   if (pfile->state.in_directive && type == '/')
 764     {
 765       buffer[1] = '*';
 766       buffer[clen - 2] = '*';
 767       buffer[clen - 1] = '/';
 768     }
 769
 770   /* Finally store this comment for use by clients of libcpp. */
 771   store_comment (pfile, token);
 772 }
 773
 774 /* Allocate COUNT tokens for RUN.  */
 775 void
 776 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
 777 {
 778   run->base = XNEWVEC (cpp_token, count);
 779   run->limit = run->base + count;
 780   run->next = NULL;
 781 }
 782
 783 /* Returns the next tokenrun, or creates one if there is none.  */
 784 static tokenrun *
 785 next_tokenrun (tokenrun *run)
 786 {
 787   if (run->next == NULL)
 788     {
 789       run->next = XNEW (tokenrun);
 790       run->next->prev = run;
 791       _cpp_init_tokenrun (run->next, 250);
 792     }
 793
 794   return run->next;
 795 }
 796
 797 /* Look ahead in the input stream.  */
 798 const cpp_token *
 799 cpp_peek_token (cpp_reader *pfile, int index)
 800 {
 801   cpp_context *context = pfile->context;
 802   const cpp_token *peektok;
 803   int count;
 804
 805   /* First, scan through any pending cpp_context objects.  */
 806   while (context->prev)
 807     {
 808       ptrdiff_t sz = (context->direct_p
 809                       ? LAST (context).token - FIRST (context).token
 810                       : LAST (context).ptoken - FIRST (context).ptoken);
 811
 812       if (index < (int) sz)
 813         return (context->direct_p
 814                 ? FIRST (context).token + index
 815                 : *(FIRST (context).ptoken + index));
 816
 817       index -= (int) sz;
 818       context = context->prev;
 819     }
 820
 821   /* We will have to read some new tokens after all (and do so
 822      without invalidating preceding tokens).  */
 823   count = index;
 824   pfile->keep_tokens++;
 825
 826   do
 827     {
 828       peektok = _cpp_lex_token (pfile);
 829       if (peektok->type == CPP_EOF)
 830         return peektok;
 831     }
 832   while (index--);
 833
 834   _cpp_backup_tokens_direct (pfile, count + 1);
 835   pfile->keep_tokens--;
 836
 837   return peektok;
 838 }
 839
 840 /* Allocate a single token that is invalidated at the same time as the
 841    rest of the tokens on the line.  Has its line and col set to the
 842    same as the last lexed token, so that diagnostics appear in the
 843    right place.

        The new token takes the slot at pfile->cur_token, and
        pfile->cur_token is advanced past it.  Pending lookahead tokens
        are first moved up by one slot, spilling into the following
        token run where necessary, so that they are not overwritten.  */
 844 cpp_token *
 845 _cpp_temp_token (cpp_reader *pfile)
 846 {
 847   cpp_token *old, *result;
     /* SZ: free slots left in the current run, starting at cur_token.
        LA: number of lookahead tokens stored from cur_token onwards.  */
 848   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
 849   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
 850
     /* The token before cur_token supplies the source location.  */
 851   old = pfile->cur_token - 1;
 852   /* Any pre-existing lookaheads must not be clobbered.  */
 853   if (la)
 854     {
     /* The lookaheads reach the end of this run, so shifting them by
        one needs room in the next run.  */
 855       if (sz <= la)
 856         {
 857           tokenrun *next = next_tokenrun (pfile->cur_run);
 858
     /* Lookaheads already stored in the next run move up first...  */
 859           if (sz < la)
 860             memmove (next->base + 1, next->base,
 861                      (la - sz) * sizeof (cpp_token));
 862
     /* ...then the last token of this run moves across.  */
 863           next->base[0] = pfile->cur_run->limit[-1];
 864         }
 865
     /* Shift the lookaheads that stay within this run.  */
 866       if (sz > 1)
 867         memmove (pfile->cur_token + 1, pfile->cur_token,
 868                  MIN (la, sz - 1) * sizeof (cpp_token));
 869     }
 870
     /* No slot left in this run: continue in the next one.  */
 871   if (!sz && pfile->cur_token == pfile->cur_run->limit)
 872     {
 873       pfile->cur_run = next_tokenrun (pfile->cur_run);
 874       pfile->cur_token = pfile->cur_run->base;
 875     }
 876
 877   result = pfile->cur_token++;
 878   result->src_loc = old->src_loc;
 879   return result;
 880 }
 881
 882 /* Lex a token into RESULT (external interface).  Takes care of issues
 883    like directive handling, token lookahead, multiple include
 884    optimization and skipping.

        Returns the next token: a pending lookahead token if there is
        one, otherwise one obtained from _cpp_lex_direct, or
        pfile->directive_result where a directive produced a token.
        While pfile->state.skipping is set and we are outside a
        directive, tokens are discarded until CPP_EOF.  */
 885 const cpp_token *
 886 _cpp_lex_token (cpp_reader *pfile)
 887 {
 888   cpp_token *result;
 889
 890   for (;;)
 891     {
     /* Current run exhausted: continue in the next one.  */
 892       if (pfile->cur_token == pfile->cur_run->limit)
 893         {
 894           pfile->cur_run = next_tokenrun (pfile->cur_run);
 895           pfile->cur_token = pfile->cur_run->base;
 896         }
 897       /* We assume that the current token is somewhere in the current
 898          run.  */
 899       if (pfile->cur_token < pfile->cur_run->base
 900           || pfile->cur_token >= pfile->cur_run->limit)
 901         abort ();
 902
     /* A token that was lexed earlier and pushed back is handed out
        again instead of lexing a new one.  */
 903       if (pfile->lookaheads)
 904         {
 905           pfile->lookaheads--;
 906           result = pfile->cur_token++;
 907         }
 908       else
 909         result = _cpp_lex_direct (pfile);
 910
     /* NOTE(review): BOL presumably flags the first token of a line --
        confirm against the flag's definition.  */
 911       if (result->flags & BOL)
 912         {
 913           /* Is this a directive.  If _cpp_handle_directive returns
 914              false, it is an assembler #.  */
 915           if (result->type == CPP_HASH
 916               /* 6.10.3 p 11: Directives in a list of macro arguments
 917                  gives undefined behavior.  This implementation
 918                  handles the directive as normal.  */
 919               && pfile->state.parsing_args != 1)
 920             {
 921               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
 922                 {
     /* A CPP_PADDING result means the directive yielded no token to
        return; go round again for the next token.  */
 923                   if (pfile->directive_result.type == CPP_PADDING)
 924                     continue;
 925                   result = &pfile->directive_result;
 926                 }
 927             }
 928           else if (pfile->state.in_deferred_pragma)
 929             result = &pfile->directive_result;
 930
     /* Tell the client about the new line, unless skipping.  */
 931           if (pfile->cb.line_change && !pfile->state.skipping)
 932             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
 933         }
 934
 935       /* We don't skip tokens in directives.  */
 936       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
 937         break;
 938
 939       /* Outside a directive, invalidate controlling macros.  At file
 940          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
 941          get here and MI optimization works.  */
 942       pfile->mi_valid = false;
 943
     /* When skipping, only CPP_EOF ends the loop; any other token is
        dropped and the next one is fetched.  */
 944       if (!pfile->state.skipping || result->type == CPP_EOF)
 945         break;
 946     }
 947
 948   return result;
 949 }
 950
 951 /* Returns true if a fresh line has been loaded.  */
 952 bool
 953 _cpp_get_fresh_line (cpp_reader *pfile)
 954 {
 955   int return_at_eof;
 956
 957   /* We can't get a new line until we leave the current directive.  */
 958   if (pfile->state.in_directive)
 959     return false;
 960
 961   for (;;)
 962     {
 963       cpp_buffer *buffer = pfile->buffer;
 964
 965       if (!buffer->need_line)
 966         return true;
 967
 968       if (buffer->next_line < buffer->rlimit)
 969         {
 970           _cpp_clean_line (pfile);
 971           return true;
 972         }
 973
 974       /* First, get out of parsing arguments state.  */
 975       if (pfile->state.parsing_args)
 976         return false;
 977
 978       /* End of buffer.  Non-empty files should end in a newline.  */
 979       if (buffer->buf != buffer->rlimit
 980           && buffer->next_line > buffer->rlimit
 981           && !buffer->from_stage3)
 982         {
 983           /* Clip to buffer size.  */
 984           buffer->next_line = buffer->rlimit;
 985         }
 986
 987       return_at_eof = buffer->return_at_eof;
 988       _cpp_pop_buffer (pfile);
 989       if (pfile->buffer == NULL || return_at_eof)
 990         return false;
 991     }
 992 }
 993
 994 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
 995   do                                                    \
 996     {                                                   \
 997       result->type = ELSE_TYPE;                         \
 998       if (*buffer->cur == CHAR)                         \
 999         buffer->cur++, result->type = THEN_TYPE;        \
1000     }                                                   \
1001   while (0)
1002
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  cpp_token *result = pfile->cur_token++;

  /* Entered at the start and again after each newline.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* A deferred pragma ends with its line: return the end-of-pragma
         marker and leave the deferred-pragma state.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless tokens must be kept, restart at the beginning of the
         base token run for the new line.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* Reload the buffer: _cpp_get_fresh_line can pop buffers.  */
  buffer = pfile->buffer;
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after whitespace has been consumed.  */
 skipped_white:
  /* Process line notes once the cursor has reached the next one.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
                               CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* End of the line: request a new one and start over.  */
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
      /* 'L', 'u' or 'U' may introduce wide characters or strings.  */
      if (c == 'L' || CPP_OPTION (pfile, uliterals))
        {
          if (*buffer->cur == '\'' || *buffer->cur == '"')
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through.  */

      /* 'L', 'u' and 'U' are absent from the list below because they
         are handled by the case above, which falls through to here.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
                                                &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only comments get here; both operator branches above have
         already left the switch.  A discarded comment is treated as
         whitespace in front of the next token.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      if (pfile->state.angled_headers)
        {
          /* NOTE(review): presumably lex_string leaves the type as
             CPP_LESS when no header name could be lexed - confirm
             against lex_string.  */
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* "%:" is the digraph for '#', "%:%:" the one for "##".  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      if (ISDIGIT (*buffer->cur))
        {
          /* A '.' directly followed by a digit starts a number.  */
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character so that forms_identifier_p
           examines it; the step is undone below if the character
           does not start an identifier.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Any other character becomes a one-character CPP_OTHER token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
1367
1368 /* An upper bound on the number of bytes needed to spell TOKEN.
1369    Does not include preceding whitespace.  */
1370 unsigned int
1371 cpp_token_len (const cpp_token *token)
1372 {
1373   unsigned int len;
1374
1375   switch (TOKEN_SPELL (token))
1376     {
1377     default:            len = 6;                                break;
1378     case SPELL_LITERAL: len = token->val.str.len;               break;
1379     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
1380     }
1381
1382   return len;
1383 }
1384
/* Decode the UTF-8 sequence that starts at NAME and store it in
   BUFFER as a \U escape: a backslash, 'U', and eight lower-case
   hexadecimal digits.  Exactly 10 bytes are written to BUFFER and no
   terminating NUL is added.  Returns the number of bytes of NAME that
   made up the sequence.  Aborts on a malformed continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  size_t seq_len = 0;
  size_t k;
  unsigned lead_bits;
  unsigned long code_point;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  lead_bits = name[0];
  while (lead_bits & 0x80)
    {
      seq_len++;
      lead_bits <<= 1;
    }

  /* The first byte contributes the bits below its length marker;
     each continuation byte contributes its low six bits.  */
  code_point = name[0] & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      unsigned char trail = name[k];

      code_point = (code_point << 6) | (trail & 0x3F);

      /* Ill-formed UTF-8: continuation bytes must look like 10xxxxxx.  */
      if ((trail & 0xC0) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
1418
1419 /* Given a token TYPE corresponding to a digraph, return a pointer to
1420    the spelling of the digraph.  */
1421 static const unsigned char *
1422 cpp_digraph2name (enum cpp_ttype type)
1423 {
1424   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
1425 }
1426
1427 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1428    already contain the enough space to hold the token's spelling.
1429    Returns a pointer to the character after the last character written.
1430    FORSTRING is true if this is to be the spelling after translation
1431    phase 1 (this is different for UCNs).
1432    FIXME: Would be nice if we didn't need the PFILE argument.  */
1433 unsigned char *
1434 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1435                  unsigned char *buffer, bool forstring)
1436 {
1437   switch (TOKEN_SPELL (token))
1438     {
1439     case SPELL_OPERATOR:
1440       {
1441         const unsigned char *spelling;
1442         unsigned char c;
1443
1444         if (token->flags & DIGRAPH)
1445           spelling = cpp_digraph2name (token->type);
1446         else if (token->flags & NAMED_OP)
1447           goto spell_ident;
1448         else
1449           spelling = TOKEN_NAME (token);
1450
1451         while ((c = *spelling++) != '\0')
1452           *buffer++ = c;
1453       }
1454       break;
1455
1456     spell_ident:
1457     case SPELL_IDENT:
1458       if (forstring)
1459         {
1460           memcpy (buffer, NODE_NAME (token->val.node.node),
1461                   NODE_LEN (token->val.node.node));
1462           buffer += NODE_LEN (token->val.node.node);
1463         }
1464       else
1465         {
1466           size_t i;
1467           const unsigned char * name = NODE_NAME (token->val.node.node);
1468
1469           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
1470             if (name[i] & ~0x7F)
1471               {
1472                 i += utf8_to_ucn (buffer, name + i) - 1;
1473                 buffer += 10;
1474               }
1475             else
1476               *buffer++ = NODE_NAME (token->val.node.node)[i];
1477         }
1478       break;
1479
1480     case SPELL_LITERAL:
1481       memcpy (buffer, token->val.str.text, token->val.str.len);
1482       buffer += token->val.str.len;
1483       break;
1484
1485     case SPELL_NONE:
1486       cpp_error (pfile, CPP_DL_ICE,
1487                  "unspellable token %s", TOKEN_NAME (token));
1488       break;
1489     }
1490
1491   return buffer;
1492 }
1493
1494 /* Returns TOKEN spelt as a null-terminated string.  The string is
1495    freed when the reader is destroyed.  Useful for diagnostics.  */
1496 unsigned char *
1497 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1498 {
1499   unsigned int len = cpp_token_len (token) + 1;
1500   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1501
1502   end = cpp_spell_token (pfile, token, start, false);
1503   end[0] = '\0';
1504
1505   return start;
1506 }
1507
1508 /* Returns a pointer to a string which spells the token defined by
1509    TYPE and FLAGS.  Used by C front ends, which really should move to
1510    using cpp_token_as_text.  */
1511 const char *
1512 cpp_type2name (enum cpp_ttype type, unsigned char flags)
1513 {
1514   if (flags & DIGRAPH)
1515     return (const char *) cpp_digraph2name (type);
1516   else if (flags & NAMED_OP)
1517     return cpp_named_operator2name (type);
1518
1519   return (const char *) token_spellings[type].name;
1520 }
1521
1522 /* Writes the spelling of token to FP, without any preceding space.
1523    Separated from cpp_spell_token for efficiency - to avoid stdio
1524    double-buffering.  */
1525 void
1526 cpp_output_token (const cpp_token *token, FILE *fp)
1527 {
1528   switch (TOKEN_SPELL (token))
1529     {
1530     case SPELL_OPERATOR:
1531       {
1532         const unsigned char *spelling;
1533         int c;
1534
1535         if (token->flags & DIGRAPH)
1536           spelling = cpp_digraph2name (token->type);
1537         else if (token->flags & NAMED_OP)
1538           goto spell_ident;
1539         else
1540           spelling = TOKEN_NAME (token);
1541
1542         c = *spelling;
1543         do
1544           putc (c, fp);
1545         while ((c = *++spelling) != '\0');
1546       }
1547       break;
1548
1549     spell_ident:
1550     case SPELL_IDENT:
1551       {
1552         size_t i;
1553         const unsigned char * name = NODE_NAME (token->val.node.node);
1554
1555         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
1556           if (name[i] & ~0x7F)
1557             {
1558               unsigned char buffer[10];
1559               i += utf8_to_ucn (buffer, name + i) - 1;
1560               fwrite (buffer, 1, 10, fp);
1561             }
1562           else
1563             fputc (NODE_NAME (token->val.node.node)[i], fp);
1564       }
1565       break;
1566
1567     case SPELL_LITERAL:
1568       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1569       break;
1570
1571     case SPELL_NONE:
1572       /* An error, most probably.  */
1573       break;
1574     }
1575 }
1576
1577 /* Compare two tokens.  */
1578 int
1579 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1580 {
1581   if (a->type == b->type && a->flags == b->flags)
1582     switch (TOKEN_SPELL (a))
1583       {
1584       default:                  /* Keep compiler happy.  */
1585       case SPELL_OPERATOR:
1586         /* token_no is used to track where multiple consecutive ##
1587            tokens were originally located.  */
1588         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
1589       case SPELL_NONE:
1590         return (a->type != CPP_MACRO_ARG
1591                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
1592       case SPELL_IDENT:
1593         return a->val.node.node == b->val.node.node;
1594       case SPELL_LITERAL:
1595         return (a->val.str.len == b->val.str.len
1596                 && !memcmp (a->val.str.text, b->val.str.text,
1597                             a->val.str.len));
1598       }
1599
1600   return 0;
1601 }
1602
1603 /* Returns nonzero if a space should be inserted to avoid an
1604    accidental token paste for output.  For simplicity, it is
1605    conservative, and occasionally advises a space where one is not
1606    needed, e.g. "." and ".2".  */
1607 int
1608 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1609                  const cpp_token *token2)
1610 {
1611   enum cpp_ttype a = token1->type, b = token2->type;
1612   cppchar_t c;
1613
1614   if (token1->flags & NAMED_OP)
1615     a = CPP_NAME;
1616   if (token2->flags & NAMED_OP)
1617     b = CPP_NAME;
1618
1619   c = EOF;
1620   if (token2->flags & DIGRAPH)
1621     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1622   else if (token_spellings[b].category == SPELL_OPERATOR)
1623     c = token_spellings[b].name[0];
1624
1625   /* Quickly get everything that can paste with an '='.  */
1626   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1627     return 1;
1628
1629   switch (a)
1630     {
1631     case CPP_GREATER:   return c == '>';
1632     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
1633     case CPP_PLUS:      return c == '+';
1634     case CPP_MINUS:     return c == '-' || c == '>';
1635     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1636     case CPP_MOD:       return c == ':' || c == '>';
1637     case CPP_AND:       return c == '&';
1638     case CPP_OR:        return c == '|';
1639     case CPP_COLON:     return c == ':' || c == '>';
1640     case CPP_DEREF:     return c == '*';
1641     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1642     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1643     case CPP_NAME:      return ((b == CPP_NUMBER
1644                                  && name_p (pfile, &token2->val.str))
1645                                 || b == CPP_NAME
1646                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1647     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1648                                 || c == '.' || c == '+' || c == '-');
1649                                       /* UCNs */
1650     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1651                                  && b == CPP_NAME)
1652                                 || (CPP_OPTION (pfile, objc)
1653                                     && token1->val.str.text[0] == '@'
1654                                     && (b == CPP_NAME || b == CPP_STRING)));
1655     default:            break;
1656     }
1657
1658   return 0;
1659 }
1660
1661 /* Output all the remaining tokens on the current line, and a newline
1662    character, to FP.  Leading whitespace is removed.  If there are
1663    macros, special token padding is not performed.  */
1664 void
1665 cpp_output_line (cpp_reader *pfile, FILE *fp)
1666 {
1667   const cpp_token *token;
1668
1669   token = cpp_get_token (pfile);
1670   while (token->type != CPP_EOF)
1671     {
1672       cpp_output_token (token, fp);
1673       token = cpp_get_token (pfile);
1674       if (token->flags & PREV_WHITE)
1675         putc (' ', fp);
1676     }
1677
1678   putc ('\n', fp);
1679 }
1680
1681 /* Return a string representation of all the remaining tokens on the
1682    current line.  The result is allocated using xmalloc and must be
1683    freed by the caller.  */
1684 unsigned char *
1685 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1686 {
1687   const cpp_token *token;
1688   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
1689   unsigned int alloced = 120 + out;
1690   unsigned char *result = (unsigned char *) xmalloc (alloced);
1691
1692   /* If DIR_NAME is empty, there are no initial contents.  */
1693   if (dir_name)
1694     {
1695       sprintf ((char *) result, "#%s ", dir_name);
1696       out += 2;
1697     }
1698
1699   token = cpp_get_token (pfile);
1700   while (token->type != CPP_EOF)
1701     {
1702       unsigned char *last;
1703       /* Include room for a possible space and the terminating nul.  */
1704       unsigned int len = cpp_token_len (token) + 2;
1705
1706       if (out + len > alloced)
1707         {
1708           alloced *= 2;
1709           if (out + len > alloced)
1710             alloced = out + len;
1711           result = (unsigned char *) xrealloc (result, alloced);
1712         }
1713
1714       last = cpp_spell_token (pfile, token, &result[out], 0);
1715       out = last - result;
1716
1717       token = cpp_get_token (pfile);
1718       if (token->flags & PREV_WHITE)
1719         result[out++] = ' ';
1720     }
1721
1722   result[out] = '\0';
1723   return result;
1724 }
1725
/* Memory buffers.  Changing MIN_BUFF_SIZE or either of the two
   sizing formulas below can have a dramatic effect on performance.
   The values here are reasonable defaults, but might be tuned.  If
   you adjust them, be sure to test across a range of uses of cpplib,
   including heavy nested function-like macro expansion.  Also check
   the change in peak memory usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever created.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused to satisfy a request for MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice BUFF's remaining room.
   All macro arguments are parenthesized so that any expression may
   be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1740
1741 /* Create a new allocation buffer.  Place the control block at the end
1742    of the buffer, so that buffer overflows will cause immediate chaos.  */
1743 static _cpp_buff *
1744 new_buff (size_t len)
1745 {
1746   _cpp_buff *result;
1747   unsigned char *base;
1748
1749   if (len < MIN_BUFF_SIZE)
1750     len = MIN_BUFF_SIZE;
1751   len = CPP_ALIGN (len);
1752
1753   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1754   result = (_cpp_buff *) (base + len);
1755   result->base = base;
1756   result->cur = base;
1757   result->limit = base + len;
1758   result->next = NULL;
1759   return result;
1760 }
1761
1762 /* Place a chain of unwanted allocation buffers on the free list.  */
1763 void
1764 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1765 {
1766   _cpp_buff *end = buff;
1767
1768   while (end->next)
1769     end = end->next;
1770   end->next = pfile->free_buffs;
1771   pfile->free_buffs = buff;
1772 }
1773
1774 /* Return a free buffer of size at least MIN_SIZE.  */
1775 _cpp_buff *
1776 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1777 {
1778   _cpp_buff *result, **p;
1779
1780   for (p = &pfile->free_buffs;; p = &(*p)->next)
1781     {
1782       size_t size;
1783
1784       if (*p == NULL)
1785         return new_buff (min_size);
1786       result = *p;
1787       size = result->limit - result->base;
1788       /* Return a buffer that's big enough, but don't waste one that's
1789          way too big.  */
1790       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1791         break;
1792     }
1793
1794   *p = result->next;
1795   result->next = NULL;
1796   result->cur = result->base;
1797   return result;
1798 }
1799
1800 /* Creates a new buffer with enough space to hold the uncommitted
1801    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1802    the excess bytes to the new buffer.  Chains the new buffer after
1803    BUFF, and returns the new buffer.  */
1804 _cpp_buff *
1805 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1806 {
1807   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1808   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1809
1810   buff->next = new_buff;
1811   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1812   return new_buff;
1813 }
1814
1815 /* Creates a new buffer with enough space to hold the uncommitted
1816    remaining bytes of the buffer pointed to by BUFF, and at least
1817    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1818    Chains the new buffer before the buffer pointed to by BUFF, and
1819    updates the pointer to point to the new buffer.  */
1820 void
1821 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1822 {
1823   _cpp_buff *new_buff, *old_buff = *pbuff;
1824   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1825
1826   new_buff = _cpp_get_buff (pfile, size);
1827   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1828   new_buff->next = old_buff;
1829   *pbuff = new_buff;
1830 }
1831
1832 /* Free a chain of buffers starting at BUFF.  */
1833 void
1834 _cpp_free_buff (_cpp_buff *buff)
1835 {
1836   _cpp_buff *next;
1837
1838   for (; buff; buff = next)
1839     {
1840       next = buff->next;
1841       free (buff->base);
1842     }
1843 }
1844
1845 /* Allocate permanent, unaligned storage of length LEN.  */
1846 unsigned char *
1847 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1848 {
1849   _cpp_buff *buff = pfile->u_buff;
1850   unsigned char *result = buff->cur;
1851
1852   if (len > (size_t) (buff->limit - result))
1853     {
1854       buff = _cpp_get_buff (pfile, len);
1855       buff->next = pfile->u_buff;
1856       pfile->u_buff = buff;
1857       result = buff->cur;
1858     }
1859
1860   buff->cur = result + len;
1861   return result;
1862 }
1863
1864 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1865    That buffer is used for growing allocations when saving macro
1866    replacement lists in a #define, and when parsing an answer to an
1867    assertion in #assert, #unassert or #if (and therefore possibly
1868    whilst expanding macros).  It therefore must not be used by any
1869    code that they might call: specifically the lexer and the guts of
1870    the macro expander.
1871
1872    All existing other uses clearly fit this restriction: storing
1873    registered pragmas during initialization.  */
1874 unsigned char *
1875 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1876 {
1877   _cpp_buff *buff = pfile->a_buff;
1878   unsigned char *result = buff->cur;
1879
1880   if (len > (size_t) (buff->limit - result))
1881     {
1882       buff = _cpp_get_buff (pfile, len);
1883       buff->next = pfile->a_buff;
1884       pfile->a_buff = buff;
1885       result = buff->cur;
1886     }
1887
1888   buff->cur = result + len;
1889   return result;
1890 }
1891
1892 /* Say which field of TOK is in use.  */
1893
1894 enum cpp_token_fld_kind
1895 cpp_token_val_index (cpp_token *tok)
1896 {
1897   switch (TOKEN_SPELL (tok))
1898     {
1899     case SPELL_IDENT:
1900       return CPP_TOKEN_FLD_NODE;
1901     case SPELL_LITERAL:
1902       return CPP_TOKEN_FLD_STR;
1903     case SPELL_OPERATOR:
1904       if (tok->type == CPP_PASTE)
1905         return CPP_TOKEN_FLD_TOKEN_NO;
1906       else
1907         return CPP_TOKEN_FLD_NONE;
1908     case SPELL_NONE:
1909       if (tok->type == CPP_MACRO_ARG)
1910         return CPP_TOKEN_FLD_ARG_NO;
1911       else if (tok->type == CPP_PADDING)
1912         return CPP_TOKEN_FLD_SOURCE;
1913       else if (tok->type == CPP_PRAGMA)
1914         return CPP_TOKEN_FLD_PRAGMA;
1915       /* else fall through */
1916     default:
1917       return CPP_TOKEN_FLD_NONE;
1918     }
1919 }