libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
   28 enum spell_type   /* Spelling category recorded for each token type; stored
                           in token_spelling.category, read via TOKEN_SPELL.  */
   29 {
   30   SPELL_OPERATOR = 0,  /* Category given to every OP() table entry.  */
   31   SPELL_IDENT,         /* May be named by a TK() entry (SPELL_ ## s).  */
   32   SPELL_LITERAL,       /* Likewise.  */
   33   SPELL_NONE           /* Likewise.  NOTE(review): the exact meaning of the
                                three TK categories is decided by code outside
                                this chunk -- confirm there.  */
   34 };
  35
   36 struct token_spelling   /* One row of the token_spellings table.  */
   37 {
   38   enum spell_type category;   /* Spelling category of the token type.  */
   39   const unsigned char *name;  /* OP(): the operator text; TK(): the
                                       stringified enumerator name.  */
   40 };
  41
   42 static const unsigned char *const digraph_spellings[] =
   43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
   44 /* The six digraph spellings above are stored as unsigned-char strings.
         NOTE(review): the code that indexes the array is outside this chunk.

         Below, TTYPE_TABLE is expanded into one token_spelling initializer per
         token type: OP(e, s) gives an operator entry whose name is the string
         S, and TK(e, s) gives an entry of category SPELL_<s> whose name is the
         stringified enumerator E.  Both helper macros are undefined again
         right after the table has been built.  */
   45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
   46 #define TK(e, s) { SPELL_ ## s,    UC #e },
   47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
   48 #undef OP
   49 #undef TK
   50 /* Accessors: look up the table row selected by TOKEN's type.  */
   51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
   52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
  116 /* Configure gives us an ifdef test.  Give the macro the value 0 when it
         is undefined so that it can also be tested in ordinary C conditions
         such as `if (WORDS_BIGENDIAN)'.  */
  117 #ifndef WORDS_BIGENDIAN
  118 #define WORDS_BIGENDIAN 0
  119 #endif
  120
  121 /* We'd like the largest integer that fits into a register.  There's nothing
  122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
  123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
  124    can get the "real" word size.  */
  125 #ifdef __GNUC__
  126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
  127 #else
  128 typedef unsigned long word_type;
  129 #endif
  130
  131 /* The code below is only expecting sizes 4 or 8.
  132    Die at compile-time if this expectation is violated: the array bound
         below evaluates to 1 when the size is acceptable and to -1, an
         invalid array size, otherwise.  */
  133 typedef char check_word_type_size
  134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 8 assembler cannot assemble SSE2/SSE4.2 insns.
 271    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 272    Before Solaris 9 Update 6, SSE insns cannot be executed.
 273    The Solaris 10+ assembler tags objects with the instruction set
 274    extensions used, so SSE4.2 executables cannot run on machines that
 275    don't support that extension.  */
 276
 277 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 278
  279 /* Replicated character data to be shared between implementations.
  280    Recall that outside of a context with vector support we can't
  281    define compatible vector types, therefore these are all defined
  282    in terms of raw characters.

         Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  Each row is
         16 bytes long; the MMX scanner loads only the first 8 bytes of a
         row, the SSE2 scanner all 16.  */
  283 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  284   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  285     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  286   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  287     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  288   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  289     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  290   { '?', '?', '?', '?', '?', '?', '?', '?',
  291     '?', '?', '?', '?', '?', '?', '?', '?' },
  292 };
 293
  294 /* A version of the fast scanner using MMX vectorized byte compare insns.
  295
  296    This uses the PMOVMSKB instruction which was introduced with "MMX2",
  297    which was packaged into SSE1; it is also present in the AMD 3dNOW-A
  298    extension.  Mark the function as using "sse" so that we emit a real
  299    "emms" instruction, rather than the 3dNOW "femms" instruction.

         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused; the loop only terminates when a match is found.  */
  300
  301 static const uchar *
  302 #ifndef __SSE__
  303 __attribute__((__target__("sse")))
  304 #endif
  305 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  306 {
  307   typedef char v8qi __attribute__ ((__vector_size__ (8)));
  308   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
  309
        /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
  310   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  311   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  312   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  313   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
  314
  315   unsigned int misalign, found, mask;
  316   const v8qi *p;
  317   v8qi data, t, c;
  318
  319   /* Align the source pointer.  While MMX doesn't generate unaligned data
  320      faults, this allows us to safely scan to the end of the buffer without
  321      reading beyond the end of the last page.  */
  322   misalign = (uintptr_t)s & 7;
  323   p = (const v8qi *)((uintptr_t)s & -8);
  324   data = *p;
  325
  326   /* Create a mask for the bytes that are valid within the first
  327      8-byte block.  The Idea here is that the AND with the mask
  328      within the loop is "free", since we need some AND or TEST
  329      insn in order to set the flags for the branch anyway.  */
  330   mask = -1u << misalign;
  331
  332   /* Main loop processing 8 bytes at a time.  The loop is entered in the
           middle so that the first, possibly partial, block is tested with
           the misalignment mask computed above.  */
  333   goto start;
  334   do
  335     {
  336       data = *++p;
            /* Every byte of a later block is valid.  */
  337       mask = -1;
  338
  339     start:
            /* OR together the per-byte equality masks of the four
               interesting characters, then collect one bit per byte.  */
  340       t = __builtin_ia32_pcmpeqb(data, repl_nl);
  341       c = __builtin_ia32_pcmpeqb(data, repl_cr);
  342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  343       c = __builtin_ia32_pcmpeqb(data, repl_bs);
  344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  345       c = __builtin_ia32_pcmpeqb(data, repl_qm);
  346       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  347       found = __builtin_ia32_pmovmskb (t);
  348       found &= mask;
  349     }
  350   while (!found);
  351
  352   __builtin_ia32_emms ();
  353
  354   /* FOUND contains 1 in bits for which we matched a relevant
  355      character.  Conversion to the byte index is trivial.  */
  356   found = __builtin_ctz(found);
  357   return (const uchar *)p + found;
  358 }
 359
  360 /* A version of the fast scanner using SSE2 vectorized byte compare insns.

         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused; the loop only terminates when a match is found.  */
  361
  362 static const uchar *
  363 #ifndef __SSE2__
  364 __attribute__((__target__("sse2")))
  365 #endif
  366 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  367 {
  368   typedef char v16qi __attribute__ ((__vector_size__ (16)));
  369
  370   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  371   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  372   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  373   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
  374
  375   unsigned int misalign, found, mask;
  376   const v16qi *p;
  377   v16qi data, t;
  378
  379   /* Align the source pointer.  The first load therefore starts at or
           before S, and may include bytes that precede it.  */
  380   misalign = (uintptr_t)s & 15;
  381   p = (const v16qi *)((uintptr_t)s & -16);
  382   data = *p;
  383
  384   /* Create a mask for the bytes that are valid within the first
  385      16-byte block.  The Idea here is that the AND with the mask
  386      within the loop is "free", since we need some AND or TEST
  387      insn in order to set the flags for the branch anyway.  */
  388   mask = -1u << misalign;
  389
  390   /* Main loop processing 16 bytes at a time.  The loop is entered in the
           middle so that the first, possibly partial, block is tested with
           the misalignment mask computed above.  */
  391   goto start;
  392   do
  393     {
  394       data = *++p;
            /* Every byte of a later block is valid.  */
  395       mask = -1;
  396
  397     start:
            /* OR together the per-byte equality masks of the four
               interesting characters, then collect one bit per byte.  */
  398       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
  399       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
  400       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
  401       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
  402       found = __builtin_ia32_pmovmskb128 (t);
  403       found &= mask;
  404     }
  405   while (!found);
  406
  407   /* FOUND contains 1 in bits for which we matched a relevant
  408      character.  Conversion to the byte index is trivial.  */
  409   found = __builtin_ctz(found);
  410   return (const uchar *)p + found;
  411 }
 412
  413 #ifdef HAVE_SSE4
  414 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

         Returns a pointer to the first \n, \r, ? or \\ at or after S.  END is
         consulted only to decide whether an unaligned 16-byte read at S is
         safe.  */
  415
  416 static const uchar *
  417 #ifndef __SSE4_2__
  418 __attribute__((__target__("sse4.2")))
  419 #endif
  420 search_line_sse42 (const uchar *s, const uchar *end)
  421 {
  422   typedef char v16qi __attribute__ ((__vector_size__ (16)));
        /* The four characters searched for.  The remaining lanes are zero;
           the asm statements below pass an explicit length of 4 in EAX for
           this operand.  */
  423   static const v16qi search = { '\n', '\r', '?', '\\' };
  424
  425   uintptr_t si = (uintptr_t)s;
  426   uintptr_t index;
  427
  428   /* Check for unaligned input.  */
  429   if (si & 15)
  430     {
  431       if (__builtin_expect (end - s < 16, 0)
  432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
  433         {
  434           /* There are less than 16 bytes left in the buffer, and less
  435              than 16 bytes left on the page.  Reading 16 bytes at this
  436              point might generate a spurious page fault.  Defer to the
  437              SSE2 implementation, which already handles alignment.  */
  438           return search_line_sse2 (s, end);
  439         }
  440
  441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
  442          memory need not be aligned.  */
            /* One unaligned probe of the 16 bytes at S.  The result index
               arrives in ECX; the test below treats a value below 16 as a
               match within those bytes.  */
  443       __asm ("%vpcmpestri $0, (%1), %2"
  444              : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
  445       if (__builtin_expect (index < 16, 0))
  446         goto found;
  447
  448       /* Advance the pointer to an aligned address.  We will re-scan a
  449          few bytes, but we no longer need care for reading past the
  450          end of a page, since we're guaranteed a match.  */
  451       s = (const uchar *)((si + 16) & -16);
  452     }
  453
  454   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
  455      in inline assembly, we can make proper use of the flags set.  S is
           first moved back by 16 so that the add at label 0 restores it on
           the first iteration; the loop repeats while the carry flag is
           clear, and leaves S at the block that ended it with the index in
           ECX.  */
  456   __asm (      "sub $16, %1\n"
  457         "       .balign 16\n"
  458         "0:     add $16, %1\n"
  459         "       %vpcmpestri $0, (%1), %2\n"
  460         "       jnc 0b"
  461         : "=&c"(index), "+r"(s)
  462         : "x"(search), "a"(4), "d"(16));
  463
  464  found:
  465   return s + index;
  466 }
  467
  468 #else
  469 /* Work around out-dated assemblers without sse4 support: the SSE2
         scanner is used wherever the SSE4.2 one is named.  */
  470 #define search_line_sse42 search_line_sse2
  471 #endif
 472
 473 /* Check the CPU capabilities.  */
 474
 475 #include "../gcc/config/i386/cpuid.h"
 476
 477 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 478 static search_line_fast_type search_line_fast;
 479
 480 static void __attribute__((constructor))
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__) || defined(__3dNOW_A__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1 || edx & bit_3DNOWP)
 509         impl = search_line_mmx;
 510     }
 511
 512   search_line_fast = impl;
 513 }
 514
 515 #elif defined(__GNUC__) && defined(__ALTIVEC__)
 516
  517 /* A version of the fast scanner using AltiVec vectorized byte compares.

         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused; the loop only terminates when a match is found.  */
  518 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
  519    so we can't compile this function without -maltivec on the command line
  520    (or implied by some other switch).  */
  521
  522 static const uchar *
  523 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  524 {
  525   typedef __attribute__((altivec(vector))) unsigned char vc;
  526
  527   const vc repl_nl = {
  528     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  529     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  530   };
  531   const vc repl_cr = {
  532     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  533     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  534   };
  535   const vc repl_bs = {
  536     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  537     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  538   };
  539   const vc repl_qm = {
  540     '?', '?', '?', '?', '?', '?', '?', '?',
  541     '?', '?', '?', '?', '?', '?', '?', '?',
  542   };
  543   const vc ones = {
  544     -1, -1, -1, -1, -1, -1, -1, -1,
  545     -1, -1, -1, -1, -1, -1, -1, -1,
  546   };
  547   const vc zero = { 0 };
  548
  549   vc data, mask, t;
  550
  551   /* Altivec loads automatically mask addresses with -16.  This lets us
  552      issue the first load as early as possible.  */
  553   data = __builtin_vec_ld(0, (const vc *)s);
  554
  555   /* Discard bytes before the beginning of the buffer.  Do this by
  556      beginning with all ones and shifting in zeros according to the
  557      mis-alignment.  The LVSR instruction pulls the exact shift we
  558      want from the address.  */
  559   mask = __builtin_vec_lvsr(0, s);
  560   mask = __builtin_vec_perm(zero, ones, mask);
  561   data &= mask;
  562
  563   /* While altivec loads mask addresses, we still need to align S so
  564      that the offset we compute at the end is correct.  */
  565   s = (const uchar *)((uintptr_t)s & -16);
  566
  567   /* Main loop processing 16 bytes at a time.  The loop is entered in the
           middle so that the first, already masked, block is tested before
           any further block is loaded.  */
  568   goto start;
  569   do
  570     {
  571       vc m_nl, m_cr, m_bs, m_qm;
  572
  573       s += 16;
  574       data = __builtin_vec_ld(0, (const vc *)s);
  575
  576     start:
  577       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
  578       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
  579       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
  580       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
  581       t = (m_nl | m_cr) | (m_bs | m_qm);
  582
  583       /* T now contains 0xff in bytes for which we matched one of the relevant
  584          characters.  We want to exit the loop if any byte in T is non-zero.
  585          Below is the expansion of vec_any_ne(t, zero).  */
  586     }
  587   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
  588
  589   {
        /* Number of unsigned longs that make up one vector.  */
  590 #define N  (sizeof(vc) / sizeof(long))
  591
        /* Compile-time check that N is 2 or 4: the array bound is -1, an
           invalid size, for any other value.  */
  592     typedef char check_count[(N == 2 || N == 4) * 2 - 1];
  593     union {
  594       vc v;
  595       unsigned long l[N];
  596     } u;
  597     unsigned long l, i = 0;
  598
  599     u.v = t;
  600
  601     /* Find the first word of T that is non-zero.  S is advanced past
             every all-zero word that gets skipped.  */
  602     switch (N)
  603       {
  604       case 4:
  605         l = u.l[i++];
  606         if (l != 0)
  607           break;
  608         s += sizeof(unsigned long);
  609         l = u.l[i++];
  610         if (l != 0)
  611           break;
  612         s += sizeof(unsigned long);
              /* Fall through: examine the remaining two words.  */
  613       case 2:
  614         l = u.l[i++];
  615         if (l != 0)
  616           break;
  617         s += sizeof(unsigned long);
  618         l = u.l[i];
  619       }
  620
  621     /* L now contains 0xff in bytes for which we matched one of the
  622        relevant characters.  We can find the byte index by finding
  623        its bit index and dividing by 8.
             NOTE(review): __builtin_clzl counts from the most significant
             bit, which corresponds to the lowest-addressed byte only on a
             big-endian host -- confirm that no little-endian AltiVec host
             compiles this path.  */
  624     l = __builtin_clzl(l) >> 3;
  625     return s + l;
  626
  627 #undef N
  628   }
  629 }
 630
 631 #else
 632
  633 /* We only have one accelerated alternative.  Use a direct call so that
  634    we encourage inlining.  */
 635
 636 #define search_line_fast  search_line_acc_char
 637
 638 #endif
 639
  640 /* Returns with a logical line that contains no escaped newlines or
  641    trigraphs.  This is a time-critical inner loop.

         The line starts at PFILE->buffer->next_line, which also becomes
         buffer->cur and buffer->line_base.  Escaped newlines, and trigraphs
         when the trigraphs option is set, are removed by rewriting the
         buffer in place; each one is recorded with add_line_note.  On
         return the cleaned line is terminated by a '\n' written at its end,
         a sentinel note has been added, and buffer->next_line points just
         past the line terminator that was consumed.  */
  642 void
  643 _cpp_clean_line (cpp_reader *pfile)
  644 {
  645   cpp_buffer *buffer;
  646   const uchar *s;
  647   uchar c, *d, *p;
  648
  649   buffer = pfile->buffer;
  650   buffer->cur_note = buffer->notes_used = 0;
  651   buffer->cur = buffer->line_base = buffer->next_line;
  652   buffer->need_line = false;
  653   s = buffer->next_line;
  654
  655   if (!buffer->from_stage3)
  656     {
            /* Most recent backslash seen on the fast path, if any.  */
  657       const uchar *pbackslash = NULL;
  658
  659       /* Fast path.  This is the common case of an un-escaped line with
  660          no trigraphs.  The primary win here is by not writing any
  661          data back to memory until we have to.  */
  662       while (1)
  663         {
  664           /* Perform an optimized search for \n, \r, \\, ?.  */
  665           s = search_line_fast (s, buffer->rlimit);
  666
  667           c = *s;
  668           if (c == '\\')
  669             {
  670               /* Record the location of the backslash and continue.  */
  671               pbackslash = s++;
  672             }
  673           else if (__builtin_expect (c == '?', 0))
  674             {
  675               if (__builtin_expect (s[1] == '?', false)
  676                    && _cpp_trigraph_map[s[2]])
  677                 {
  678                   /* Have a trigraph.  We may or may not have to convert
  679                      it.  Add a line note regardless, for -Wtrigraphs.  */
  680                   add_line_note (buffer, s, s[2]);
  681                   if (CPP_OPTION (pfile, trigraphs))
  682                     {
  683                       /* We do, and that means we have to switch to the
  684                          slow path.  The replacement character is stored
                               over the first '?', and S moves to the last
                               character of the trigraph so that the slow
                               path resumes right after it.  */
  685                       d = (uchar *) s;
  686                       *d = _cpp_trigraph_map[s[2]];
  687                       s += 2;
  688                       goto slow_path;
  689                     }
  690                 }
  691               /* Not a trigraph.  Continue on fast-path.  */
  692               s++;
  693             }
  694           else
  695             break;
  696         }
  697
  698       /* This must be \r or \n.  We're either done, or we'll be forced
  699          to write back to the buffer and continue on the slow path.  */
  700       d = (uchar *) s;
  701
  702       if (__builtin_expect (s == buffer->rlimit, false))
  703         goto done;
  704
  705       /* DOS line ending? */
  706       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
  707         {
  708           s++;
  709           if (s == buffer->rlimit)
  710             goto done;
  711         }
  712
  713       if (__builtin_expect (pbackslash == NULL, true))
  714         goto done;
  715
  716       /* Check for escaped newline: skip back over any non-vertical
               whitespace before the line end and see whether the character
               in front of it is the last backslash recorded.  */
  717       p = d;
  718       while (is_nvspace (p[-1]))
  719         p--;
  720       if (p - 1 != pbackslash)
  721         goto done;
  722
  723       /* Have an escaped newline; process it and proceed to
  724          the slow path.  The note type is ' ' when whitespace separated
               the backslash from the newline, and '\\' otherwise.  D is
               placed so that the next store in the slow path overwrites the
               backslash.  */
  725       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
  726       d = p - 2;
  727       buffer->next_line = p - 1;
  728
  729     slow_path:
            /* Slow path: S is the read position and D the write position;
               every character read is copied down to D.  */
  730       while (1)
  731         {
  732           c = *++s;
  733           *++d = c;
  734
  735           if (c == '\n' || c == '\r')
  736             {
  737               /* Handle DOS line endings.  */
  738               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
  739                 s++;
  740               if (s == buffer->rlimit)
  741                 break;
  742
  743               /* Escaped?  buffer->next_line bounds the backward scan; it
                       was set to the write position that followed the
                       previous escaped newline.  */
  744               p = d;
  745               while (p != buffer->next_line && is_nvspace (p[-1]))
  746                 p--;
  747               if (p == buffer->next_line || p[-1] != '\\')
  748                 break;
  749
  750               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
  751               d = p - 2;
  752               buffer->next_line = p - 1;
  753             }
  754           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
  755             {
  756               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
  757               add_line_note (buffer, d, s[2]);
  758               if (CPP_OPTION (pfile, trigraphs))
  759                 {
                        /* Replace the '?' just copied and skip the other two
                           characters of the trigraph.  */
  760                   *d = _cpp_trigraph_map[s[2]];
  761                   s += 2;
  762                 }
  763             }
  764         }
  765     }
  766   else
  767     {
            /* Nothing is cleaned on this path: only the end of the line is
               located.  */
  768       while (*s != '\n' && *s != '\r')
  769         s++;
  770       d = (uchar *) s;
  771
  772       /* Handle DOS line endings.  */
  773       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
  774         s++;
  775     }
  776
  777  done:
        /* D is the end of the cleaned line and S the last character of the
           line terminator that was consumed.  */
  778   *d = '\n';
  779   /* A sentinel note that should never be processed.  */
  780   add_line_note (buffer, d + 1, '\n');
  781   buffer->next_line = s + 1;
  782 }
 783
 784 /* Return true if the trigraph indicated by NOTE should be warned
 785    about in a comment.  */
 786 static bool
 787 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 788 {
 789   const uchar *p;
 790
 791   /* Within comments we don't warn about trigraphs, unless the
 792      trigraph forms an escaped newline, as that may change
 793      behavior.  */
 794   if (note->type != '/')
 795     return false;
 796
 797   /* If -trigraphs, then this was an escaped newline iff the next note
 798      is coincident.  */
 799   if (CPP_OPTION (pfile, trigraphs))
 800     return note[1].pos == note->pos;
 801
 802   /* Otherwise, see if this forms an escaped newline.  */
 803   p = note->pos + 3;
 804   while (is_nvspace (*p))
 805     p++;
 806
 807   /* There might have been escaped newlines between the trigraph and the
 808      newline we found.  Hence the position test.  */
 809   return (*p == '\n' && p < note[1].pos);
 810 }
 811
 812 /* Process the notes created by add_line_note as far as the current
 813    location.  */
 814 void
 815 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 816 {
 817   cpp_buffer *buffer = pfile->buffer;
 818
 819   for (;;)
 820     {
 821       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 822       unsigned int col;
 823
 824       if (note->pos > buffer->cur)
 825         break;
 826
 827       buffer->cur_note++;
 828       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 829
 830       if (note->type == '\\' || note->type == ' ')
 831         {
 832           if (note->type == ' ' && !in_comment)
 833             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 834                                  "backslash and newline separated by space");
 835
 836           if (buffer->next_line > buffer->rlimit)
 837             {
 838               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 839                                    "backslash-newline at end of file");
 840               /* Prevent "no newline at end of file" warning.  */
 841               buffer->next_line = buffer->rlimit;
 842             }
 843
 844           buffer->line_base = note->pos;
 845           CPP_INCREMENT_LINE (pfile, 0);
 846         }
 847       else if (_cpp_trigraph_map[note->type])
 848         {
 849           if (CPP_OPTION (pfile, warn_trigraphs)
 850               && (!in_comment || warn_in_comment (pfile, note)))
 851             {
 852               if (CPP_OPTION (pfile, trigraphs))
 853                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 854                                        pfile->line_table->highest_line, col,
 855                                        "trigraph ??%c converted to %c",
 856                                        note->type,
 857                                        (int) _cpp_trigraph_map[note->type]);
 858               else
 859                 {
 860                   cpp_warning_with_line
 861                     (pfile, CPP_W_TRIGRAPHS,
 862                      pfile->line_table->highest_line, col,
 863                      "trigraph ??%c ignored, use -trigraphs to enable",
 864                      note->type);
 865                 }
 866             }
 867         }
 868       else if (note->type == 0)
 869         /* Already processed in lex_raw_string.  */;
 870       else
 871         abort ();
 872     }
 873 }
 874
 875 /* Advance over a C-style block comment whose opening '*' is at
 876    buffer->cur.  The end is found by looking for a '/' that directly
 877    follows a '*'.  Returns true if the input ran out before the
 878    comment was closed, false once the closing delimiter has been
 879    consumed.  */
 880 bool
 881 _cpp_skip_block_comment (cpp_reader *pfile)
 882 {
 883   cpp_buffer *buffer = pfile->buffer;
 884   const uchar *p = buffer->cur + 1;
 885
 886   /* Step over a '/' that directly follows the opening '*' so that it
 887      cannot be mistaken for the end of the comment.  */
 888   if (*p == '/')
 889     p++;
 890
 891   for (;;)
 892     {
 893       /* Comments are often decorated with runs of '*', so it is
 894          cheaper to look for '/' and then check what preceded it.  */
 895       uchar ch = *p++;
 896
 897       if (ch == '\n')
 898         {
 899           unsigned int cols;
 900
 901           buffer->cur = p - 1;
 902           _cpp_process_line_notes (pfile, true);
 903           if (buffer->next_line >= buffer->rlimit)
 904             return true;
 905           _cpp_clean_line (pfile);
 906
 907           cols = buffer->next_line - buffer->line_base;
 908           CPP_INCREMENT_LINE (pfile, cols);
 909
 910           p = buffer->cur;
 911           continue;
 912         }
 913
 914       if (ch != '/')
 915         continue;
 916       if (p[-2] == '*')
 917         break;
 918
 919       /* Warn about what looks like a nested comment opener, unless the
 920          '/' sits right before the real terminator.  No attempt is made
 921          to get this right across escaped newlines.  */
 922       if (CPP_OPTION (pfile, warn_comments) && p[0] == '*' && p[1] != '/')
 923         {
 924           buffer->cur = p;
 925           cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 926                                  pfile->line_table->highest_line,
 927                                  CPP_BUF_COL (buffer),
 928                                  "\"/*\" within comment");
 929         }
 930     }
 931   buffer->cur = p;
 932   _cpp_process_line_notes (pfile, true);
 933   return false;
 934 }
 935
 936 /* Skip the rest of a C++ line comment, leaving buffer->cur at the
 937    newline that ends it.  Handles escaped newlines.  Returns nonzero
 938    if the line table's highest_line changed, i.e. a multiline comment.  */
 939 static int
 940 skip_line_comment (cpp_reader *pfile)
 941 {
 942   const source_location start_line = pfile->line_table->highest_line;
 943   const uchar *p = pfile->buffer->cur;
 944
 945   while (*p != '\n')
 946     p++;
 947   pfile->buffer->cur = p;
 948   _cpp_process_line_notes (pfile, true);
 949   return start_line != pfile->line_table->highest_line;
 950 }
 951
 952 /* Consume a run of non-vertical whitespace whose first character, C,
 953    has already been read.  On return buffer->cur points at the
 954    character that ended the run.  */
 955 static void
 956 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 957 {
 958   cpp_buffer *buffer = pfile->buffer;
 959   bool nul_seen = false;
 960
 961   for (;;)
 962     {
 963       if (c == '\0')
 964         nul_seen = true;
 965       /* Besides NUL and plain horizontal space only \f and \v remain.  */
 966       else if (c != ' ' && c != '\t'
 967                && pfile->state.in_directive && CPP_PEDANTIC (pfile))
 968         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
 969                              pfile->line_table->highest_line,
 970                              CPP_BUF_COL (buffer),
 971                              "%s in preprocessing directive",
 972                              c == '\f' ? "form feed" : "vertical tab");
 973
 974       /* Carry on only over non-vertical space: ' ' \t \f \v \0.  */
 975       c = *buffer->cur++;
 976       if (!is_nvspace (c))
 977         break;
 978     }
 979   if (nul_seen)
 980     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 981   buffer->cur--;
 982 }
 983
 984 /* Return 1 if every character of STRING's spelling passes is_idchar,
 985    i.e. a number token could serve as a name (no '.', '+' or '-'),
 986    and 0 otherwise.  */
 987 static int
 988 name_p (cpp_reader *pfile, const cpp_string *string)
 989 {
 990   unsigned int idx = 0;
 991
 992   while (idx < string->len && is_idchar (string->text[idx]))
 993     idx++;
 994   /* The scan stops short exactly when some character was rejected.  */
 995   return idx == string->len;
 996 }
 997
 998 /* After parsing an identifier or other sequence, produce a warning about
 999    sequences not in NFC/NFKC, unless we are skipping.  */
1000 static void
1001 warn_about_normalization (cpp_reader *pfile,
1002                           const cpp_token *token,
1003                           const struct normalize_state *s)
1004 {
1005   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1006       && !pfile->state.skipping)
1007     {
1008       /* Make sure that the token is printed using UCNs, even
1009          if we'd otherwise happily print UTF-8.  */
1010       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1011       size_t sz = cpp_spell_token (pfile, token, buf, false) - buf;
1012       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1013         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1014                                "`%.*s' is not in NFKC", (int) sz, buf);
1015       else
1016         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1017                                "`%.*s' is not in NFC", (int) sz, buf);
1018       /* BUF was only needed to format the message; do not leak it.  */
1019       free (buf);
1020     }
1021 }
1022
1023 /* Returns TRUE if the sequence starting at buffer->cur may continue an
1024    identifier: a '$' when dollars_in_ident is set (it is consumed), or,
1025    when extended_identifiers is set, a backslash followed by 'u' or 'U'
1026    that _cpp_valid_ucn accepts.  Returns FALSE otherwise.  FIRST is
1027    nonzero if the sequence starts the identifier.  */
1028 static bool
1029 forms_identifier_p (cpp_reader *pfile, int first,
1030                     struct normalize_state *state)
1031 {
1032   cpp_buffer *buffer = pfile->buffer;
1033   const uchar lead = *buffer->cur;
1034
1035   if (lead == '$')
1036     {
1037       if (!CPP_OPTION (pfile, dollars_in_ident))
1038         return false;
1039       buffer->cur++;
1040       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1041         {
1042           CPP_OPTION (pfile, warn_dollars) = 0;
1043           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1044         }
1045       return true;
1046     }
1047
1048   /* Otherwise only a syntactically valid UCN will do.  */
1049   if (lead != '\\'
1050       || !CPP_OPTION (pfile, extended_identifiers)
1051       || (buffer->cur[1] != 'u' && buffer->cur[1] != 'U'))
1052     return false;
1053   buffer->cur += 2;
1054   if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1055                       state))
1056     return true;
1057   buffer->cur -= 2;
1058   return false;
1059 }
1060
1061 /* Intern the identifier that starts at BASE: hash BASE[0] and the
1062    ISIDNUM characters following it, look the spelling up in PFILE's
1063    hash table with HT_ALLOC, and issue any diagnostics the node wants.  */
1064 static cpp_hashnode *
1065 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1066 {
1067   cpp_hashnode *node;
1068   const uchar *end = base + 1;
1069   unsigned int len;
1070   unsigned int hash = HT_HASHSTEP (0, *base);
1071
1072   while (ISIDNUM (*end))
1073     {
1074       hash = HT_HASHSTEP (hash, *end);
1075       end++;
1076     }
1077   len = end - base;
1078   hash = HT_HASHFINISH (hash, len);
1079   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1080                                             base, len, hash, HT_ALLOC));
1081
1082   /* Identifiers rarely need a diagnostic when lexed, so leave early.  */
1083   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
1084                         || pfile->state.skipping, 1))
1085     return node;
1086
1087   /* It is allowed to poison the same identifier twice.  */
1088   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1089     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1090                NODE_NAME (node));
1091
1092   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1093      replacement list of a variadic macro.  */
1094   if (node == pfile->spec_nodes.n__VA_ARGS__ && !pfile->state.va_args_ok)
1095     cpp_error (pfile, CPP_DL_PEDWARN,
1096                "__VA_ARGS__ can only appear in the expansion"
1097                " of a C99 variadic macro");
1098
1099   /* For -Wc++-compat, warn about use of C++ named operators.  */
1100   if (node->flags & NODE_WARN_OPERATOR)
1101     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1102                  "identifier \"%s\" is a special operator name in C++",
1103                  NODE_NAME (node));
1104
1105   return node;
1106 }
1107
1108 /* Return the cpp_hashnode of the identifier spelled by NAME in PFILE.
1109    The lookup is delegated to lex_identifier_intern, which uses HT_ALLOC
1110    and dereferences the node, so NULL is presumably never returned.  */
1111 cpp_hashnode *
1112 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1113 {
1114   const uchar *spelling = (const uchar *) name;
1115   return lex_identifier_intern (pfile, spelling);
1116 }
1117
1118 /* Lex an identifier starting at BUFFER->CUR - 1 (where BASE points) and
1119    return its hash node.  STARTS_UCN selects the slower UCN-aware path.  */
1120 static cpp_hashnode *
1121 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1122                 struct normalize_state *nst)
1123 {
1124   cpp_hashnode *node;
1125   const uchar *scan = pfile->buffer->cur;
1126   unsigned int hash = HT_HASHSTEP (0, *base);
1127
1128   if (!starts_ucn)
1129     while (ISIDNUM (*scan))
1130       {
1131         hash = HT_HASHSTEP (hash, *scan);
1132         scan++;
1133       }
1134   pfile->buffer->cur = scan;
1135   if (!starts_ucn && !forms_identifier_p (pfile, false, nst))
1136     {
1137       unsigned int len = scan - base;
1138       hash = HT_HASHFINISH (hash, len);
1139       node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1140                                                 base, len, hash, HT_ALLOC));
1141     }
1142   else
1143     {
1144       /* Slower version for identifiers containing UCNs (or $).  */
1145       for (;;)
1146         {
1147           while (ISIDNUM (*pfile->buffer->cur))
1148             {
1149               pfile->buffer->cur++;
1150               NORMALIZE_STATE_UPDATE_IDNUM (nst);
1151             }
1152           if (!forms_identifier_p (pfile, false, nst))
1153             break;
1154         }
1155       node = _cpp_interpret_identifier (pfile, base,
1156                                         pfile->buffer->cur - base);
1157     }
1158
1159   /* Identifiers rarely need a diagnostic when lexed, so leave early.  */
1160   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
1161                         || pfile->state.skipping, 1))
1162     return node;
1163
1164   /* It is allowed to poison the same identifier twice.  */
1165   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1166     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1167                NODE_NAME (node));
1168
1169   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1170      replacement list of a variadic macro.  */
1171   if (node == pfile->spec_nodes.n__VA_ARGS__ && !pfile->state.va_args_ok)
1172     cpp_error (pfile, CPP_DL_PEDWARN,
1173                "__VA_ARGS__ can only appear in the expansion"
1174                " of a C99 variadic macro");
1175
1176   /* For -Wc++-compat, warn about use of C++ named operators.  */
1177   if (node->flags & NODE_WARN_OPERATOR)
1178     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1179                  "identifier \"%s\" is a special operator name in C++",
1180                  NODE_NAME (node));
1181
1182   return node;
1183 }
1184
1185 /* Lex a number starting at BUFFER->CUR - 1 and store in NUMBER a
1186    NUL-terminated copy of its spelling from _cpp_unaligned_alloc.  */
1187 static void
1188 lex_number (cpp_reader *pfile, cpp_string *number,
1189             struct normalize_state *nst)
1190 {
1191   const uchar *const start = pfile->buffer->cur - 1;
1192   const uchar *end;
1193   uchar *copy;
1194
1195   for (;;)
1196     {
1197       end = pfile->buffer->cur;
1198       /* N.B. ISIDNUM does not include $.  */
1199       while (ISIDNUM (*end) || *end == '.' || VALID_SIGN (*end, end[-1]))
1200         {
1201           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1202           end++;
1203         }
1204       pfile->buffer->cur = end;
1205       /* Keep going only if forms_identifier_p accepts what follows.  */
1206       if (!forms_identifier_p (pfile, false, nst))
1207         break;
1208     }
1209
1210   number->len = end - start;
1211   copy = _cpp_unaligned_alloc (pfile, number->len + 1);
1212   memcpy (copy, start, number->len);
1213   copy[number->len] = '\0';
1214   number->text = copy;
1215 }
1216
1217 /* Make TOKEN a token of type TYPE whose spelling is a NUL-terminated
1218    copy, obtained from _cpp_unaligned_alloc, of the LEN bytes at BASE.  */
1219 static void
1220 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1221                 unsigned int len, enum cpp_ttype type)
1222 {
1223   uchar *spelling = _cpp_unaligned_alloc (pfile, len + 1);
1224   spelling[len] = '\0';
1225   memcpy (spelling, base, len);
1226   token->val.str.text = spelling;
1227   token->val.str.len = len;
1228   token->type = type;
1229 }
1230
1231 /* Subroutine of lex_raw_string: append LEN bytes from BASE to the chain
1232    of buffers running from *FIRST_BUFF_P to *LAST_BUFF_P, creating the
1233    chain on first use and extending it when LEN exceeds the room left.  */
1234 static void
1235 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1236                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1237 {
1238   _cpp_buff *head = *first_buff_p;
1239   _cpp_buff *tail = *last_buff_p;
1240
1241   if (head == NULL)
1242     head = tail = _cpp_get_buff (pfile, len);
1243   else if (len > BUFF_ROOM (tail))
1244     {
1245       /* Top off the last buffer, then chain a new one for the rest.  */
1246       const size_t room = BUFF_ROOM (tail);
1247       memcpy (BUFF_FRONT (tail), base, room);
1248       BUFF_FRONT (tail) += room;
1249       len -= room;
1250       base += room;
1251       tail = _cpp_append_extend_buff (pfile, tail, len);
1252     }
1253
1254   memcpy (BUFF_FRONT (tail), base, len);
1255   BUFF_FRONT (tail) += len;
1256   *last_buff_p = tail;
1257   *first_buff_p = head;
1258 }
1259
1260 /* Lexes a raw string.  The stored string contains the spelling, including
1261    double quotes, delimiter string, '(' and ')', any leading
1262    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1263    literal, or CPP_OTHER if it was not properly terminated.
1264
1265    The spelling is NUL-terminated, but it is not guaranteed that this
1266    is the first NUL since embedded NULs are preserved.  */
1267
1268 static void
1269 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1270                 const uchar *cur)
1271 {
1272   source_location saw_NUL = 0;
1273   const uchar *raw_prefix;
1274   unsigned int raw_prefix_len = 0;
1275   enum cpp_ttype type;
1276   size_t total_len = 0;
1277   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1278   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1279
1280   type = (*base == 'L' ? CPP_WSTRING :
1281           *base == 'U' ? CPP_STRING32 :
1282           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1283           : CPP_STRING);
1284
1285   raw_prefix = cur + 1;
1286   while (raw_prefix_len < 16)
1287     {
1288       switch (raw_prefix[raw_prefix_len])
1289         {
1290         case ' ': case '(': case ')': case '\\': case '\t':
1291         case '\v': case '\f': case '\n': default:
1292           break;
1293         /* Basic source charset except the above chars.  */
1294         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1295         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1296         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1297         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1298         case 'y': case 'z':
1299         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1300         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1301         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1302         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1303         case 'Y': case 'Z':
1304         case '0': case '1': case '2': case '3': case '4': case '5':
1305         case '6': case '7': case '8': case '9':
1306         case '_': case '{': case '}': case '#': case '[': case ']':
1307         case '<': case '>': case '%': case ':': case ';': case '.':
1308         case '?': case '*': case '+': case '-': case '/': case '^':
1309         case '&': case '|': case '~': case '!': case '=': case ',':
1310         case '"': case '\'':
1311           raw_prefix_len++;
1312           continue;
1313         }
1314       break;
1315     }
1316
1317   if (raw_prefix[raw_prefix_len] != '(')
1318     {
1319       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1320                 + 1;
1321       if (raw_prefix_len == 16)
1322         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1323                              "raw string delimiter longer than 16 characters");
1324       else
1325         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1326                              "invalid character '%c' in raw string delimiter",
1327                              (int) raw_prefix[raw_prefix_len]);
1328       pfile->buffer->cur = raw_prefix - 1;
1329       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1330       return;
1331     }
1332
1333   cur = raw_prefix + raw_prefix_len + 1;
1334   for (;;)
1335     {
1336 #define BUF_APPEND(STR,LEN)                                     \
1337       do {                                                      \
1338         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1339                         &first_buff, &last_buff);               \
1340         total_len += (LEN);                                     \
1341       } while (0);
1342
1343       cppchar_t c;
1344
1345       /* If we previously performed any trigraph or line splicing
1346          transformations, undo them within the body of the raw string.  */
1347       while (note->pos < cur)
1348         ++note;
1349       for (; note->pos == cur; ++note)
1350         {
1351           switch (note->type)
1352             {
1353             case '\\':
1354             case ' ':
1355               /* Restore backslash followed by newline.  */
1356               BUF_APPEND (base, cur - base);
1357               base = cur;
1358               BUF_APPEND ("\\", 1);
1359             after_backslash:
1360               if (note->type == ' ')
1361                 {
1362                   /* GNU backslash whitespace newline extension.  FIXME
1363                      could be any sequence of non-vertical space.  When we
1364                      can properly restore any such sequence, we should mark
1365                      this note as handled so _cpp_process_line_notes
1366                      doesn't warn.  */
1367                   BUF_APPEND (" ", 1);
1368                 }
1369
1370               BUF_APPEND ("\n", 1);
1371               break;
1372
1373             case 0:
1374               /* Already handled.  */
1375               break;
1376
1377             default:
1378               if (_cpp_trigraph_map[note->type])
1379                 {
1380                   /* Don't warn about this trigraph in
1381                      _cpp_process_line_notes, since trigraphs show up as
1382                      trigraphs in raw strings.  */
1383                   uchar type = note->type;
1384                   note->type = 0;
1385
1386                   if (!CPP_OPTION (pfile, trigraphs))
1387                     /* If we didn't convert the trigraph in the first
1388                        place, don't do anything now either.  */
1389                     break;
1390
1391                   BUF_APPEND (base, cur - base);
1392                   base = cur;
1393                   BUF_APPEND ("??", 2);
1394
1395                   /* ??/ followed by newline gets two line notes, one for
1396                      the trigraph and one for the backslash/newline.  */
1397                   if (type == '/' && note[1].pos == cur)
1398                     {
1399                       if (note[1].type != '\\'
1400                           && note[1].type != ' ')
1401                         abort ();
1402                       BUF_APPEND ("/", 1);
1403                       ++note;
1404                       goto after_backslash;
1405                     }
1406                   /* The ) from ??) could be part of the suffix.  */
1407                   else if (type == ')'
1408                            && strncmp ((const char *) cur+1,
1409                                        (const char *) raw_prefix,
1410                                        raw_prefix_len) == 0
1411                            && cur[raw_prefix_len+1] == '"')
1412                     {
1413                       cur += raw_prefix_len+2;
1414                       goto break_outer_loop;
1415                     }
1416                   else
1417                     {
1418                       /* Skip the replacement character.  */
1419                       base = ++cur;
1420                       BUF_APPEND (&type, 1);
1421                     }
1422                 }
1423               else
1424                 abort ();
1425               break;
1426             }
1427         }
1428       c = *cur++;
1429
1430       if (c == ')'
1431           && strncmp ((const char *) cur, (const char *) raw_prefix,
1432                       raw_prefix_len) == 0
1433           && cur[raw_prefix_len] == '"')
1434         {
1435           cur += raw_prefix_len + 1;
1436           break;
1437         }
1438       else if (c == '\n')
1439         {
1440           if (pfile->state.in_directive
1441               || pfile->state.parsing_args
1442               || pfile->state.in_deferred_pragma)
1443             {
1444               cur--;
1445               type = CPP_OTHER;
1446               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1447                                    "unterminated raw string");
1448               break;
1449             }
1450
1451           BUF_APPEND (base, cur - base);
1452
1453           if (pfile->buffer->cur < pfile->buffer->rlimit)
1454             CPP_INCREMENT_LINE (pfile, 0);
1455           pfile->buffer->need_line = true;
1456
1457           pfile->buffer->cur = cur-1;
1458           _cpp_process_line_notes (pfile, false);
1459           if (!_cpp_get_fresh_line (pfile))
1460             {
1461               source_location src_loc = token->src_loc;
1462               token->type = CPP_EOF;
1463               /* Tell the compiler the line number of the EOF token.  */
1464               token->src_loc = pfile->line_table->highest_line;
1465               token->flags = BOL;
1466               if (first_buff != NULL)
1467                 _cpp_release_buff (pfile, first_buff);
1468               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1469                                    "unterminated raw string");
1470               return;
1471             }
1472
1473           cur = base = pfile->buffer->cur;
1474           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1475         }
1476       else if (c == '\0' && !saw_NUL)
1477         LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table,
1478                                      CPP_BUF_COLUMN (pfile->buffer, cur));
1479     }
1480  break_outer_loop:
1481
1482   if (saw_NUL && !pfile->state.skipping)
1483     cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0,
1484                "null character(s) preserved in literal");
1485
1486   pfile->buffer->cur = cur;
1487   if (first_buff == NULL)
1488     create_literal (pfile, token, base, cur - base, type);
1489   else
1490     {
1491       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1492
1493       token->type = type;
1494       token->val.str.len = total_len + (cur - base);
1495       token->val.str.text = dest;
1496       last_buff = first_buff;
1497       while (last_buff != NULL)
1498         {
1499           memcpy (dest, last_buff->base,
1500                   BUFF_FRONT (last_buff) - last_buff->base);
1501           dest += BUFF_FRONT (last_buff) - last_buff->base;
1502           last_buff = last_buff->next;
1503         }
1504       _cpp_release_buff (pfile, first_buff);
1505       memcpy (dest, base, cur - base);
1506       dest[cur - base] = '\0';
1507     }
1508 }
1509
1510 /* Lexes a string, character constant, or angle-bracketed header file
1511    name starting at BASE into TOKEN.  The stored string contains the
1512    spelling, including opening quote and any leading 'L', 'u', 'U' or
1513    'u8' and optional 'R' modifier (raw strings are handed over to
1514    lex_raw_string).  TOKEN's type is set to the type of the literal,
1515    to CPP_OTHER if it was not properly terminated, or to CPP_LESS for
1516    an unterminated header name, which must be relexed as normal tokens.
1517    The spelling is NUL-terminated, but embedded NULs are preserved, so
1518    the terminator need not be the first NUL.  */
1519 static void
1520 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1521 {
1522   const uchar *cur = base;
1523   cppchar_t terminator = *cur++;
1524   enum cpp_ttype type;
1525   bool nul_seen = false;
1526
1527   if (terminator == 'L' || terminator == 'U')
1528     terminator = *cur++;
1529   else if (terminator == 'u')
1530     {
1531       terminator = *cur++;
1532       if (terminator == '8')
1533         terminator = *cur++;
1534     }
1535   switch (terminator)
1536     {
1537     case 'R':
1538       lex_raw_string (pfile, token, base, cur);
1539       return;
1540     case '"':
1541       type = (*base == 'L' ? CPP_WSTRING :
1542               *base == 'U' ? CPP_STRING32 :
1543               *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1544                            : CPP_STRING);
1545       break;
1546     case '\'':
1547       type = (*base == 'L' ? CPP_WCHAR :
1548               *base == 'U' ? CPP_CHAR32 :
1549               *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1550       break;
1551     default:
1552       terminator = '>';
1553       type = CPP_HEADER_NAME;
1554     }
1555
1556   for (;;)
1557     {
1558       cppchar_t c = *cur++;
1559
1560       /* In #include-style directives, terminators are not escapable.  */
1561       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1562         cur++;
1563       else if (c == terminator)
1564         break;
1565       else if (c == '\0')
1566         nul_seen = true;
1567       else if (c == '\n')
1568         {
1569           cur--;
1570           /* Unmatched quotes always yield undefined behavior, but
1571              greedy lexing means that what appears to be an unterminated
1572              header name may actually be a legitimate sequence of tokens.  */
1573           if (terminator == '>')
1574             {
1575               token->type = CPP_LESS;
1576               return;
1577             }
1578           type = CPP_OTHER;
1579           break;
1580         }
1581     }
1582
1583   if (nul_seen && !pfile->state.skipping)
1584     cpp_error (pfile, CPP_DL_WARNING,
1585                "null character(s) preserved in literal");
1586   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1587     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1588                (int) terminator);
1589   pfile->buffer->cur = cur;
1590   create_literal (pfile, token, base, cur - base, type);
1591 }
1592
1593 /* Return PFILE's comment table; its ordering must not be relied upon.  */
1594 cpp_comment_table *
1595 cpp_get_comments (cpp_reader *pfile)
1596 {
1597   cpp_comment_table *table = &pfile->comments;
1598   return table;
1599 }
1600
1601 /* Append a copy of TOKEN's spelling and its source location to the end
1602    of PFILE's comment table, growing the table as needed.  */
1603 static void
1604 store_comment (cpp_reader *pfile, cpp_token *token)
1605 {
1606   cpp_comment_table *table = &pfile->comments;
1607   cpp_comment *entry;
1608   int len = token->val.str.len;
1609
1610   if (table->allocated == 0)
1611     {
1612       /* First use: start with room for 256 entries.  */
1613       table->allocated = 256;
1614       table->entries = (cpp_comment *) xmalloc
1615         (table->allocated * sizeof (cpp_comment));
1616     }
1617
1618   if (table->count == table->allocated)
1619     {
1620       /* Full: double the capacity.  */
1621       table->allocated *= 2;
1622       table->entries = (cpp_comment *) xrealloc
1623         (table->entries, table->allocated * sizeof (cpp_comment));
1624     }
1625
1626   entry = &table->entries[table->count];
1627
1628   /* The token's text may not be NUL-terminated, so copy LEN bytes and
1629      terminate the copy ourselves.  */
1630   entry->comment = (char *) xmalloc (sizeof (char) * (len + 1));
1631   memcpy (entry->comment, token->val.str.text, len);
1632   entry->comment[len] = '\0';
1633   entry->sloc = token->src_loc;
1634
1635   table->count++;
1636 }
1637
1638 /* Store in TOKEN, as a CPP_COMMENT, the comment text that begins at FROM
1639    and ends at buffer->cur, and record it via store_comment.  The stored
1640    comment includes the comment start and any terminator.  TYPE is '/'
1641    for a C++ line comment.  */
1642 static void
1643 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1644               cppchar_t type)
1645 {
1646   /* Inside a directive, C++ comments are stored as C comments.  The
1647      only time we encounter a directive here is when we are saving
1648      comments in a "#define".  */
1649   const bool as_c_comment = pfile->state.in_directive && type == '/';
1650   unsigned char *text;
1651   unsigned int len, clen;
1652
1653   /* + 1 for the initial '/', which FROM does not include.  */
1654   len = pfile->buffer->cur - from + 1;
1655
1656   /* C++ comments probably (not definitely) have moved past a new
1657      line, which we don't want to save in the comment.  */
1658   if (is_vspace (pfile->buffer->cur[-1]))
1659     len--;
1660
1661   /* The conversion needs two extra bytes for the closing delimiter.  */
1662   clen = as_c_comment ? len + 2 : len;
1663   text = _cpp_unaligned_alloc (pfile, clen);
1664
1665   text[0] = '/';
1666   memcpy (text + 1, from, len - 1);
1667   if (as_c_comment)
1668     {
1669       text[1] = '*';
1670       text[clen - 2] = '*';
1671       text[clen - 1] = '/';
1672     }
1673
1674   token->type = CPP_COMMENT;
1675   token->val.str.len = clen;
1676   token->val.str.text = text;
1677
1678   /* Finally store this comment for use by clients of libcpp.  */
1679   store_comment (pfile, token);
1680 }
1681
1682 /* Give RUN storage for COUNT tokens and leave it without a successor.  */
1683 void
1684 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1685 {
1686   run->next = NULL;
1687   run->base = XNEWVEC (cpp_token, count);
1688   run->limit = &run->base[count];
1689 }
1690
1691 /* Return the run after RUN, first creating a 250-token one if needed.  */
1692 static tokenrun *
1693 next_tokenrun (tokenrun *run)
1694 {
1695   tokenrun *succ = run->next;
1696
1697   if (succ != NULL)
1698     return succ;
1699   succ = run->next = XNEW (tokenrun);
1700   succ->prev = run;
1701   _cpp_init_tokenrun (succ, 250);
1702   return succ;
1703 }
1704
1705 /* Look ahead in the input stream without consuming it: return the token
1706    INDEX places past the next one, or CPP_EOF if the input ends first.  */
1707 const cpp_token *
1708 cpp_peek_token (cpp_reader *pfile, int index)
1709 {
1710   cpp_context *context = pfile->context;
1711   const cpp_token *peektok;
1712   int count;
1713
1714   /* First, scan through any pending cpp_context objects.  */
1715   while (context->prev)
1716     {
1717       ptrdiff_t sz = (context->direct_p
1718                       ? LAST (context).token - FIRST (context).token
1719                       : LAST (context).ptoken - FIRST (context).ptoken);
1720       if (index < (int) sz)
1721         return (context->direct_p
1722                 ? FIRST (context).token + index
1723                 : *(FIRST (context).ptoken + index));
1724       index -= (int) sz;
1725       context = context->prev;
1726     }
1727   /* We will have to read some new tokens after all (and do so
1728      without invalidating preceding tokens).  */
1729   count = index;
1730   pfile->keep_tokens++;
1731   do
1732     {
1733       peektok = _cpp_lex_token (pfile);
1734       if (peektok->type == CPP_EOF)
1735         {
1736           /* Stop here, but still undo the peek below.  */
1737           index--;
1738           break;
1739         }
1740     }
1741   while (index--);
1742   /* COUNT - INDEX tokens were lexed above; put them all back.  */
1743   _cpp_backup_tokens_direct (pfile, count - index);
1744   pfile->keep_tokens--;
1745   return peektok;
1746 }
1747
1748 /* Allocate a single token that is invalidated at the same time as the
1749    rest of the tokens on the line.  Its src_loc is copied from the last
1750    lexed token so that diagnostics appear in the right place.  Pending
1751    lookahead tokens are shifted up one slot to make room for it.  */
1752 cpp_token *
1753 _cpp_temp_token (cpp_reader *pfile)
1754 {
1755   cpp_token *old, *result;
1756   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1757   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1758   /* SZ: slots from cur_token to the end of the run; LA: lookaheads.  */
1759   old = pfile->cur_token - 1;
1760   /* Any pre-existing lookaheads must not be clobbered.  */
1761   if (la)
1762     {
1763       if (sz <= la)
1764         {
1765           tokenrun *next = next_tokenrun (pfile->cur_run);
1766           /* Lookaheads already in the next run move up one slot first.  */
1767           if (sz < la)
1768             memmove (next->base + 1, next->base,
1769                      (la - sz) * sizeof (cpp_token));
1770           /* The token in this run's last slot moves to the next run.  */
1771           next->base[0] = pfile->cur_run->limit[-1];
1772         }
1773       /* Shift up the lookaheads that remain within this run.  */
1774       if (sz > 1)
1775         memmove (pfile->cur_token + 1, pfile->cur_token,
1776                  MIN (la, sz - 1) * sizeof (cpp_token));
1777     }
1778   /* No slot left in this run: the new token opens the next one.  */
1779   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1780     {
1781       pfile->cur_run = next_tokenrun (pfile->cur_run);
1782       pfile->cur_token = pfile->cur_run->base;
1783     }
1784   /* Hand out the slot and copy the previous token's location.  */
1785   result = pfile->cur_token++;
1786   result->src_loc = old->src_loc;
1787   return result;
1788 }
1789
1790 /* Lex a token into RESULT (external interface).  Takes care of issues
1791    like directive handling, token lookahead, multiple include
1792    optimization and skipping.  */
1793 const cpp_token *
1794 _cpp_lex_token (cpp_reader *pfile)
1795 {
1796   cpp_token *result;
1797
1798   for (;;)
1799     {
1800       if (pfile->cur_token == pfile->cur_run->limit)
1801         {
1802           pfile->cur_run = next_tokenrun (pfile->cur_run);
1803           pfile->cur_token = pfile->cur_run->base;
1804         }
1805       /* We assume that the current token is somewhere in the current
1806          run.  */
1807       if (pfile->cur_token < pfile->cur_run->base
1808           || pfile->cur_token >= pfile->cur_run->limit)
1809         abort ();
1810
1811       if (pfile->lookaheads)
1812         {
1813           pfile->lookaheads--;
1814           result = pfile->cur_token++;
1815         }
1816       else
1817         result = _cpp_lex_direct (pfile);
1818
1819       if (result->flags & BOL)
1820         {
1821           /* Is this a directive.  If _cpp_handle_directive returns
1822              false, it is an assembler #.  */
1823           if (result->type == CPP_HASH
1824               /* 6.10.3 p 11: Directives in a list of macro arguments
1825                  gives undefined behavior.  This implementation
1826                  handles the directive as normal.  */
1827               && pfile->state.parsing_args != 1)
1828             {
1829               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1830                 {
1831                   if (pfile->directive_result.type == CPP_PADDING)
1832                     continue;
1833                   result = &pfile->directive_result;
1834                 }
1835             }
1836           else if (pfile->state.in_deferred_pragma)
1837             result = &pfile->directive_result;
1838
1839           if (pfile->cb.line_change && !pfile->state.skipping)
1840             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1841         }
1842
1843       /* We don't skip tokens in directives.  */
1844       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1845         break;
1846
1847       /* Outside a directive, invalidate controlling macros.  At file
1848          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1849          get here and MI optimization works.  */
1850       pfile->mi_valid = false;
1851
1852       if (!pfile->state.skipping || result->type == CPP_EOF)
1853         break;
1854     }
1855
1856   return result;
1857 }
1858
1859 /* Returns true if a fresh line has been loaded.  */
1860 bool
1861 _cpp_get_fresh_line (cpp_reader *pfile)
1862 {
1863   int return_at_eof;
1864
1865   /* We can't get a new line until we leave the current directive.  */
1866   if (pfile->state.in_directive)
1867     return false;
1868
1869   for (;;)
1870     {
1871       cpp_buffer *buffer = pfile->buffer;
1872
1873       if (!buffer->need_line)
1874         return true;
1875
1876       if (buffer->next_line < buffer->rlimit)
1877         {
1878           _cpp_clean_line (pfile);
1879           return true;
1880         }
1881
1882       /* First, get out of parsing arguments state.  */
1883       if (pfile->state.parsing_args)
1884         return false;
1885
1886       /* End of buffer.  Non-empty files should end in a newline.  */
1887       if (buffer->buf != buffer->rlimit
1888           && buffer->next_line > buffer->rlimit
1889           && !buffer->from_stage3)
1890         {
1891           /* Clip to buffer size.  */
1892           buffer->next_line = buffer->rlimit;
1893         }
1894
1895       return_at_eof = buffer->return_at_eof;
1896       _cpp_pop_buffer (pfile);
1897       if (pfile->buffer == NULL || return_at_eof)
1898         return false;
1899     }
1900 }
1901
/* Finish lexing an operator that may have a second character.  If
   the next unread character equals CHAR it is consumed and
   result->type becomes THEN_TYPE; otherwise nothing is consumed and
   result->type becomes ELSE_TYPE.  Expects `result' and `buffer' to
   be in scope where the macro is used.  Every argument is
   parenthesized so that any expression may be passed safely.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
1910
1911 /* Lex a token into pfile->cur_token, which is also incremented, to
1912    get diagnostics pointing to the correct location.
1913
1914    Does not handle issues such as token lookahead, multiple-include
1915    optimization, directives, skipping etc.  This function is only
1916    suitable for use by _cpp_lex_token, and in special cases like
1917    lex_expansion_token which doesn't care for any of these issues.
1918
1919    When meeting a newline, returns CPP_EOF if parsing a directive,
1920    otherwise returns to the start of the token buffer if permissible.
1921    Returns the location of the lexed token.  */
     /* Control re-enters the body at one of three labels: `fresh_line'
        after a newline has been consumed, `update_tokens_line' after a
        comment that is not being saved, and `skipped_white' after a run
        of horizontal whitespace.  */
1922 cpp_token *
1923 _cpp_lex_direct (cpp_reader *pfile)
1924 {
1925   cppchar_t c;
1926   cpp_buffer *buffer;
1927   const unsigned char *comment_start;
       /* Claim the next token slot up front.  */
1928   cpp_token *result = pfile->cur_token++;
1929
1930  fresh_line:
1931   result->flags = 0;
1932   buffer = pfile->buffer;
1933   if (buffer->need_line)
1934     {
           /* The end of the line terminates a deferred pragma.  */
1935       if (pfile->state.in_deferred_pragma)
1936         {
1937           result->type = CPP_PRAGMA_EOL;
1938           pfile->state.in_deferred_pragma = false;
1939           if (!pfile->state.pragma_allow_expansion)
1940             pfile->state.prevent_expansion--;
1941           return result;
1942         }
1943       if (!_cpp_get_fresh_line (pfile))
1944         {
1945           result->type = CPP_EOF;
1946           if (!pfile->state.in_directive)
1947             {
1948               /* Tell the compiler the line number of the EOF token.  */
1949               result->src_loc = pfile->line_table->highest_line;
1950               result->flags = BOL;
1951             }
1952           return result;
1953         }
           /* keep_tokens is zero, so restart at the first slot of the
              base run instead of continuing to grow the token runs.  */
1954       if (!pfile->keep_tokens)
1955         {
1956           pfile->cur_run = &pfile->base_run;
1957           result = pfile->base_run.base;
1958           pfile->cur_token = result + 1;
1959         }
1960       result->flags = BOL;
1961       if (pfile->state.parsing_args == 2)
1962         result->flags |= PREV_WHITE;
1963     }
       /* Reload: _cpp_get_fresh_line may have changed pfile->buffer.  */
1964   buffer = pfile->buffer;
1965  update_tokens_line:
1966   result->src_loc = pfile->line_table->highest_line;
1967
1968  skipped_white:
1969   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
1970       && !pfile->overlaid_buffer)
1971     {
1972       _cpp_process_line_notes (pfile, false);
1973       result->src_loc = pfile->line_table->highest_line;
1974     }
1975   c = *buffer->cur++;
1976
1977   LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
1978                                CPP_BUF_COLUMN (buffer, buffer->cur));
1979
       /* Dispatch on the first character of the token.  buffer->cur
          already points one past it.  */
1980   switch (c)
1981     {
1982     case ' ': case '\t': case '\f': case '\v': case '\0':
1983       result->flags |= PREV_WHITE;
1984       skip_whitespace (pfile, c);
1985       goto skipped_white;
1986
1987     case '\n':
1988       if (buffer->cur < buffer->rlimit)
1989         CPP_INCREMENT_LINE (pfile, 0);
1990       buffer->need_line = true;
1991       goto fresh_line;
1992
1993     case '0': case '1': case '2': case '3': case '4':
1994     case '5': case '6': case '7': case '8': case '9':
1995       {
1996         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1997         result->type = CPP_NUMBER;
1998         lex_number (pfile, &result->val.str, &nst);
1999         warn_about_normalization (pfile, result, &nst);
2000         break;
2001       }
2002
2003     case 'L':
2004     case 'u':
2005     case 'U':
2006     case 'R':
2007       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2008          wide strings or raw strings.  */
2009       if (c == 'L' || CPP_OPTION (pfile, uliterals))
2010         {
2011           if ((*buffer->cur == '\'' && c != 'R')
2012               || *buffer->cur == '"'
2013               || (*buffer->cur == 'R'
2014                   && c != 'R'
2015                   && buffer->cur[1] == '"'
2016                   && CPP_OPTION (pfile, uliterals))
2017               || (*buffer->cur == '8'
2018                   && c == 'u'
2019                   && (buffer->cur[1] == '"'
2020                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
2021             {
2022               lex_string (pfile, result, buffer->cur - 1);
2023               break;
2024             }
2025         }
2026       /* Fall through.  */
2027
         /* 'L', 'R', 'U' and 'u' are absent from the list below because
            they are handled by the case above.  */
2028     case '_':
2029     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2030     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2031     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2032     case 's': case 't':           case 'v': case 'w': case 'x':
2033     case 'y': case 'z':
2034     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2035     case 'G': case 'H': case 'I': case 'J': case 'K':
2036     case 'M': case 'N': case 'O': case 'P': case 'Q':
2037     case 'S': case 'T':           case 'V': case 'W': case 'X':
2038     case 'Y': case 'Z':
2039       result->type = CPP_NAME;
2040       {
2041         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2042         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2043                                                 &nst);
2044         warn_about_normalization (pfile, result, &nst);
2045       }
2046
2047       /* Convert named operators to their proper types.  */
2048       if (result->val.node.node->flags & NODE_OPERATOR)
2049         {
2050           result->flags |= NAMED_OP;
2051           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2052         }
2053       break;
2054
2055     case '\'':
2056     case '"':
2057       lex_string (pfile, result, buffer->cur - 1);
2058       break;
2059
2060     case '/':
2061       /* A potential block or line comment.  */
2062       comment_start = buffer->cur;
2063       c = *buffer->cur;
2064
2065       if (c == '*')
2066         {
2067           if (_cpp_skip_block_comment (pfile))
2068             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2069         }
2070       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2071                             || cpp_in_system_header (pfile)))
2072         {
2073           /* Warn about comments only if pedantically GNUC89, and not
2074              in system headers.  */
2075           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2076               && ! buffer->warned_cplusplus_comments)
2077             {
2078               cpp_error (pfile, CPP_DL_PEDWARN,
2079                          "C++ style comments are not allowed in ISO C90");
2080               cpp_error (pfile, CPP_DL_PEDWARN,
2081                          "(this will be reported only once per input file)");
2082               buffer->warned_cplusplus_comments = 1;
2083             }
2084
2085           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2086             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2087         }
2088       else if (c == '=')
2089         {
2090           buffer->cur++;
2091           result->type = CPP_DIV_EQ;
2092           break;
2093         }
2094       else
2095         {
2096           result->type = CPP_DIV;
2097           break;
2098         }
2099
           /* Only the two comment branches above reach this point; the
              operator branches have already left the switch.  */
2100       if (!pfile->state.save_comments)
2101         {
2102           result->flags |= PREV_WHITE;
2103           goto update_tokens_line;
2104         }
2105
2106       /* Save the comment as a token in its own right.  */
2107       save_comment (pfile, result, comment_start, c);
2108       break;
2109
2110     case '<':
2111       if (pfile->state.angled_headers)
2112         {
2113           lex_string (pfile, result, buffer->cur - 1);
2114           if (result->type != CPP_LESS)
2115             break;
2116         }
2117
2118       result->type = CPP_LESS;
2119       if (*buffer->cur == '=')
2120         buffer->cur++, result->type = CPP_LESS_EQ;
2121       else if (*buffer->cur == '<')
2122         {
2123           buffer->cur++;
2124           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2125         }
2126       else if (CPP_OPTION (pfile, digraphs))
2127         {
2128           if (*buffer->cur == ':')
2129             {
2130               buffer->cur++;
2131               result->flags |= DIGRAPH;
2132               result->type = CPP_OPEN_SQUARE;
2133             }
2134           else if (*buffer->cur == '%')
2135             {
2136               buffer->cur++;
2137               result->flags |= DIGRAPH;
2138               result->type = CPP_OPEN_BRACE;
2139             }
2140         }
2141       break;
2142
2143     case '>':
2144       result->type = CPP_GREATER;
2145       if (*buffer->cur == '=')
2146         buffer->cur++, result->type = CPP_GREATER_EQ;
2147       else if (*buffer->cur == '>')
2148         {
2149           buffer->cur++;
2150           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2151         }
2152       break;
2153
2154     case '%':
2155       result->type = CPP_MOD;
2156       if (*buffer->cur == '=')
2157         buffer->cur++, result->type = CPP_MOD_EQ;
2158       else if (CPP_OPTION (pfile, digraphs))
2159         {
2160           if (*buffer->cur == ':')
2161             {
2162               buffer->cur++;
2163               result->flags |= DIGRAPH;
2164               result->type = CPP_HASH;
                   /* "%:%:" is the digraph spelling of "##".  */
2165               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2166                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2167             }
2168           else if (*buffer->cur == '>')
2169             {
2170               buffer->cur++;
2171               result->flags |= DIGRAPH;
2172               result->type = CPP_CLOSE_BRACE;
2173             }
2174         }
2175       break;
2176
2177     case '.':
2178       result->type = CPP_DOT;
2179       if (ISDIGIT (*buffer->cur))
2180         {
2181           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2182           result->type = CPP_NUMBER;
2183           lex_number (pfile, &result->val.str, &nst);
2184           warn_about_normalization (pfile, result, &nst);
2185         }
2186       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2187         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2188       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2189         buffer->cur++, result->type = CPP_DOT_STAR;
2190       break;
2191
2192     case '+':
2193       result->type = CPP_PLUS;
2194       if (*buffer->cur == '+')
2195         buffer->cur++, result->type = CPP_PLUS_PLUS;
2196       else if (*buffer->cur == '=')
2197         buffer->cur++, result->type = CPP_PLUS_EQ;
2198       break;
2199
2200     case '-':
2201       result->type = CPP_MINUS;
2202       if (*buffer->cur == '>')
2203         {
2204           buffer->cur++;
2205           result->type = CPP_DEREF;
2206           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2207             buffer->cur++, result->type = CPP_DEREF_STAR;
2208         }
2209       else if (*buffer->cur == '-')
2210         buffer->cur++, result->type = CPP_MINUS_MINUS;
2211       else if (*buffer->cur == '=')
2212         buffer->cur++, result->type = CPP_MINUS_EQ;
2213       break;
2214
2215     case '&':
2216       result->type = CPP_AND;
2217       if (*buffer->cur == '&')
2218         buffer->cur++, result->type = CPP_AND_AND;
2219       else if (*buffer->cur == '=')
2220         buffer->cur++, result->type = CPP_AND_EQ;
2221       break;
2222
2223     case '|':
2224       result->type = CPP_OR;
2225       if (*buffer->cur == '|')
2226         buffer->cur++, result->type = CPP_OR_OR;
2227       else if (*buffer->cur == '=')
2228         buffer->cur++, result->type = CPP_OR_EQ;
2229       break;
2230
2231     case ':':
2232       result->type = CPP_COLON;
2233       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2234         buffer->cur++, result->type = CPP_SCOPE;
2235       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2236         {
2237           buffer->cur++;
2238           result->flags |= DIGRAPH;
2239           result->type = CPP_CLOSE_SQUARE;
2240         }
2241       break;
2242
2243     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2244     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2245     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2246     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2247     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2248
2249     case '?': result->type = CPP_QUERY; break;
2250     case '~': result->type = CPP_COMPL; break;
2251     case ',': result->type = CPP_COMMA; break;
2252     case '(': result->type = CPP_OPEN_PAREN; break;
2253     case ')': result->type = CPP_CLOSE_PAREN; break;
2254     case '[': result->type = CPP_OPEN_SQUARE; break;
2255     case ']': result->type = CPP_CLOSE_SQUARE; break;
2256     case '{': result->type = CPP_OPEN_BRACE; break;
2257     case '}': result->type = CPP_CLOSE_BRACE; break;
2258     case ';': result->type = CPP_SEMICOLON; break;
2259
2260       /* @ is a punctuator in Objective-C.  */
2261     case '@': result->type = CPP_ATSIGN; break;
2262
2263     case '$':
2264     case '\\':
2265       {
             /* Step back onto the character so forms_identifier_p can
                examine it.  */
2266         const uchar *base = --buffer->cur;
2267         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2268
2269         if (forms_identifier_p (pfile, true, &nst))
2270           {
2271             result->type = CPP_NAME;
2272             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2273             warn_about_normalization (pfile, result, &nst);
2274             break;
2275           }
             /* Not an identifier: consume the character again.  */
2276         buffer->cur++;
2277       }
           /* Fall through: the character becomes a CPP_OTHER token.  */
2278
2279     default:
2280       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2281       break;
2282     }
2283
2284   return result;
2285 }
2286
2287 /* An upper bound on the number of bytes needed to spell TOKEN.
2288    Does not include preceding whitespace.  */
2289 unsigned int
2290 cpp_token_len (const cpp_token *token)
2291 {
2292   unsigned int len;
2293
2294   switch (TOKEN_SPELL (token))
2295     {
2296     default:            len = 6;                                break;
2297     case SPELL_LITERAL: len = token->val.str.len;               break;
2298     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2299     }
2300
2301   return len;
2302 }
2303
/* Decode the UTF-8 sequence starting at NAME and store the matching
   "\UXXXXXXXX" escape (lower-case hex, exactly 10 bytes, no
   terminating NUL) in BUFFER.  Returns the number of bytes of NAME
   that were consumed.  Aborts if a continuation byte is malformed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned lead_bits = *src;
  unsigned long code_point;
  int seq_len = 0;
  int remaining;
  int nibble;

  /* The number of leading one bits in the first byte gives the
     length of the whole sequence.  */
  while (lead_bits & 0x80)
    {
      seq_len++;
      lead_bits <<= 1;
    }

  /* Payload bits of the lead byte.  */
  code_point = *src & (0x7F >> seq_len);

  /* Each continuation byte contributes six more bits.  */
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      src++;
      code_point = (code_point << 6) | (*src & 0x3F);

      /* Ill-formed UTF-8: not of the form 10xxxxxx.  */
      if ((*src & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Eight hex digits, most significant nibble first.  */
  for (nibble = 0; nibble < 8; nibble++)
    buffer[2 + nibble] = hex_digits[(code_point >> (4 * (7 - nibble))) & 0xF];

  return seq_len;
}
2337
2338 /* Given a token TYPE corresponding to a digraph, return a pointer to
2339    the spelling of the digraph.  */
2340 static const unsigned char *
2341 cpp_digraph2name (enum cpp_ttype type)
2342 {
2343   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2344 }
2345
2346 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2347    already contain the enough space to hold the token's spelling.
2348    Returns a pointer to the character after the last character written.
2349    FORSTRING is true if this is to be the spelling after translation
2350    phase 1 (this is different for UCNs).
2351    FIXME: Would be nice if we didn't need the PFILE argument.  */
2352 unsigned char *
2353 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2354                  unsigned char *buffer, bool forstring)
2355 {
2356   switch (TOKEN_SPELL (token))
2357     {
2358     case SPELL_OPERATOR:
2359       {
2360         const unsigned char *spelling;
2361         unsigned char c;
2362
2363         if (token->flags & DIGRAPH)
2364           spelling = cpp_digraph2name (token->type);
2365         else if (token->flags & NAMED_OP)
2366           goto spell_ident;
2367         else
2368           spelling = TOKEN_NAME (token);
2369
2370         while ((c = *spelling++) != '\0')
2371           *buffer++ = c;
2372       }
2373       break;
2374
2375     spell_ident:
2376     case SPELL_IDENT:
2377       if (forstring)
2378         {
2379           memcpy (buffer, NODE_NAME (token->val.node.node),
2380                   NODE_LEN (token->val.node.node));
2381           buffer += NODE_LEN (token->val.node.node);
2382         }
2383       else
2384         {
2385           size_t i;
2386           const unsigned char * name = NODE_NAME (token->val.node.node);
2387
2388           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2389             if (name[i] & ~0x7F)
2390               {
2391                 i += utf8_to_ucn (buffer, name + i) - 1;
2392                 buffer += 10;
2393               }
2394             else
2395               *buffer++ = NODE_NAME (token->val.node.node)[i];
2396         }
2397       break;
2398
2399     case SPELL_LITERAL:
2400       memcpy (buffer, token->val.str.text, token->val.str.len);
2401       buffer += token->val.str.len;
2402       break;
2403
2404     case SPELL_NONE:
2405       cpp_error (pfile, CPP_DL_ICE,
2406                  "unspellable token %s", TOKEN_NAME (token));
2407       break;
2408     }
2409
2410   return buffer;
2411 }
2412
2413 /* Returns TOKEN spelt as a null-terminated string.  The string is
2414    freed when the reader is destroyed.  Useful for diagnostics.  */
2415 unsigned char *
2416 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2417 {
2418   unsigned int len = cpp_token_len (token) + 1;
2419   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2420
2421   end = cpp_spell_token (pfile, token, start, false);
2422   end[0] = '\0';
2423
2424   return start;
2425 }
2426
2427 /* Returns a pointer to a string which spells the token defined by
2428    TYPE and FLAGS.  Used by C front ends, which really should move to
2429    using cpp_token_as_text.  */
2430 const char *
2431 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2432 {
2433   if (flags & DIGRAPH)
2434     return (const char *) cpp_digraph2name (type);
2435   else if (flags & NAMED_OP)
2436     return cpp_named_operator2name (type);
2437
2438   return (const char *) token_spellings[type].name;
2439 }
2440
2441 /* Writes the spelling of token to FP, without any preceding space.
2442    Separated from cpp_spell_token for efficiency - to avoid stdio
2443    double-buffering.  */
2444 void
2445 cpp_output_token (const cpp_token *token, FILE *fp)
2446 {
2447   switch (TOKEN_SPELL (token))
2448     {
2449     case SPELL_OPERATOR:
2450       {
2451         const unsigned char *spelling;
2452         int c;
2453
2454         if (token->flags & DIGRAPH)
2455           spelling = cpp_digraph2name (token->type);
2456         else if (token->flags & NAMED_OP)
2457           goto spell_ident;
2458         else
2459           spelling = TOKEN_NAME (token);
2460
2461         c = *spelling;
2462         do
2463           putc (c, fp);
2464         while ((c = *++spelling) != '\0');
2465       }
2466       break;
2467
2468     spell_ident:
2469     case SPELL_IDENT:
2470       {
2471         size_t i;
2472         const unsigned char * name = NODE_NAME (token->val.node.node);
2473
2474         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2475           if (name[i] & ~0x7F)
2476             {
2477               unsigned char buffer[10];
2478               i += utf8_to_ucn (buffer, name + i) - 1;
2479               fwrite (buffer, 1, 10, fp);
2480             }
2481           else
2482             fputc (NODE_NAME (token->val.node.node)[i], fp);
2483       }
2484       break;
2485
2486     case SPELL_LITERAL:
2487       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2488       break;
2489
2490     case SPELL_NONE:
2491       /* An error, most probably.  */
2492       break;
2493     }
2494 }
2495
2496 /* Compare two tokens.  */
2497 int
2498 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2499 {
2500   if (a->type == b->type && a->flags == b->flags)
2501     switch (TOKEN_SPELL (a))
2502       {
2503       default:                  /* Keep compiler happy.  */
2504       case SPELL_OPERATOR:
2505         /* token_no is used to track where multiple consecutive ##
2506            tokens were originally located.  */
2507         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2508       case SPELL_NONE:
2509         return (a->type != CPP_MACRO_ARG
2510                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2511       case SPELL_IDENT:
2512         return a->val.node.node == b->val.node.node;
2513       case SPELL_LITERAL:
2514         return (a->val.str.len == b->val.str.len
2515                 && !memcmp (a->val.str.text, b->val.str.text,
2516                             a->val.str.len));
2517       }
2518
2519   return 0;
2520 }
2521
2522 /* Returns nonzero if a space should be inserted to avoid an
2523    accidental token paste for output.  For simplicity, it is
2524    conservative, and occasionally advises a space where one is not
2525    needed, e.g. "." and ".2".  */
2526 int
2527 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2528                  const cpp_token *token2)
2529 {
2530   enum cpp_ttype a = token1->type, b = token2->type;
2531   cppchar_t c;
2532
2533   if (token1->flags & NAMED_OP)
2534     a = CPP_NAME;
2535   if (token2->flags & NAMED_OP)
2536     b = CPP_NAME;
2537
2538   c = EOF;
2539   if (token2->flags & DIGRAPH)
2540     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2541   else if (token_spellings[b].category == SPELL_OPERATOR)
2542     c = token_spellings[b].name[0];
2543
2544   /* Quickly get everything that can paste with an '='.  */
2545   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2546     return 1;
2547
2548   switch (a)
2549     {
2550     case CPP_GREATER:   return c == '>';
2551     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2552     case CPP_PLUS:      return c == '+';
2553     case CPP_MINUS:     return c == '-' || c == '>';
2554     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2555     case CPP_MOD:       return c == ':' || c == '>';
2556     case CPP_AND:       return c == '&';
2557     case CPP_OR:        return c == '|';
2558     case CPP_COLON:     return c == ':' || c == '>';
2559     case CPP_DEREF:     return c == '*';
2560     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2561     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2562     case CPP_NAME:      return ((b == CPP_NUMBER
2563                                  && name_p (pfile, &token2->val.str))
2564                                 || b == CPP_NAME
2565                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2566     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2567                                 || c == '.' || c == '+' || c == '-');
2568                                       /* UCNs */
2569     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2570                                  && b == CPP_NAME)
2571                                 || (CPP_OPTION (pfile, objc)
2572                                     && token1->val.str.text[0] == '@'
2573                                     && (b == CPP_NAME || b == CPP_STRING)));
2574     default:            break;
2575     }
2576
2577   return 0;
2578 }
2579
2580 /* Output all the remaining tokens on the current line, and a newline
2581    character, to FP.  Leading whitespace is removed.  If there are
2582    macros, special token padding is not performed.  */
2583 void
2584 cpp_output_line (cpp_reader *pfile, FILE *fp)
2585 {
2586   const cpp_token *token;
2587
2588   token = cpp_get_token (pfile);
2589   while (token->type != CPP_EOF)
2590     {
2591       cpp_output_token (token, fp);
2592       token = cpp_get_token (pfile);
2593       if (token->flags & PREV_WHITE)
2594         putc (' ', fp);
2595     }
2596
2597   putc ('\n', fp);
2598 }
2599
2600 /* Return a string representation of all the remaining tokens on the
2601    current line.  The result is allocated using xmalloc and must be
2602    freed by the caller.  */
2603 unsigned char *
2604 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2605 {
2606   const cpp_token *token;
2607   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2608   unsigned int alloced = 120 + out;
2609   unsigned char *result = (unsigned char *) xmalloc (alloced);
2610
2611   /* If DIR_NAME is empty, there are no initial contents.  */
2612   if (dir_name)
2613     {
2614       sprintf ((char *) result, "#%s ", dir_name);
2615       out += 2;
2616     }
2617
2618   token = cpp_get_token (pfile);
2619   while (token->type != CPP_EOF)
2620     {
2621       unsigned char *last;
2622       /* Include room for a possible space and the terminating nul.  */
2623       unsigned int len = cpp_token_len (token) + 2;
2624
2625       if (out + len > alloced)
2626         {
2627           alloced *= 2;
2628           if (out + len > alloced)
2629             alloced = out + len;
2630           result = (unsigned char *) xrealloc (result, alloced);
2631         }
2632
2633       last = cpp_spell_token (pfile, token, &result[out], 0);
2634       out = last - result;
2635
2636       token = cpp_get_token (pfile);
2637       if (token->flags & PREV_WHITE)
2638         result[out++] = ' ';
2639     }
2640
2641   result[out] = '\0';
2642   return result;
2643 }
2644
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is MIN_BUFF_SIZE plus one and a
   half times MIN_SIZE.  EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is
   MIN_EXTRA plus twice the space left between BUFF's `cur' and
   `limit'.  All macro arguments are parenthesized so that arbitrary
   expressions may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2659
2660 /* Create a new allocation buffer.  Place the control block at the end
2661    of the buffer, so that buffer overflows will cause immediate chaos.  */
2662 static _cpp_buff *
2663 new_buff (size_t len)
2664 {
2665   _cpp_buff *result;
2666   unsigned char *base;
2667
2668   if (len < MIN_BUFF_SIZE)
2669     len = MIN_BUFF_SIZE;
2670   len = CPP_ALIGN (len);
2671
2672   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2673   result = (_cpp_buff *) (base + len);
2674   result->base = base;
2675   result->cur = base;
2676   result->limit = base + len;
2677   result->next = NULL;
2678   return result;
2679 }
2680
2681 /* Place a chain of unwanted allocation buffers on the free list.  */
2682 void
2683 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2684 {
2685   _cpp_buff *end = buff;
2686
2687   while (end->next)
2688     end = end->next;
2689   end->next = pfile->free_buffs;
2690   pfile->free_buffs = buff;
2691 }
2692
2693 /* Return a free buffer of size at least MIN_SIZE.  */
2694 _cpp_buff *
2695 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2696 {
2697   _cpp_buff *result, **p;
2698
2699   for (p = &pfile->free_buffs;; p = &(*p)->next)
2700     {
2701       size_t size;
2702
2703       if (*p == NULL)
2704         return new_buff (min_size);
2705       result = *p;
2706       size = result->limit - result->base;
2707       /* Return a buffer that's big enough, but don't waste one that's
2708          way too big.  */
2709       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2710         break;
2711     }
2712
2713   *p = result->next;
2714   result->next = NULL;
2715   result->cur = result->base;
2716   return result;
2717 }
2718
2719 /* Creates a new buffer with enough space to hold the uncommitted
2720    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2721    the excess bytes to the new buffer.  Chains the new buffer after
2722    BUFF, and returns the new buffer.  */
2723 _cpp_buff *
2724 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2725 {
2726   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2727   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2728
2729   buff->next = new_buff;
2730   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2731   return new_buff;
2732 }
2733
2734 /* Creates a new buffer with enough space to hold the uncommitted
2735    remaining bytes of the buffer pointed to by BUFF, and at least
2736    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2737    Chains the new buffer before the buffer pointed to by BUFF, and
2738    updates the pointer to point to the new buffer.  */
2739 void
2740 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2741 {
2742   _cpp_buff *new_buff, *old_buff = *pbuff;
2743   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2744
2745   new_buff = _cpp_get_buff (pfile, size);
2746   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2747   new_buff->next = old_buff;
2748   *pbuff = new_buff;
2749 }
2750
2751 /* Free a chain of buffers starting at BUFF.  */
2752 void
2753 _cpp_free_buff (_cpp_buff *buff)
2754 {
2755   _cpp_buff *next;
2756
2757   for (; buff; buff = next)
2758     {
2759       next = buff->next;
2760       free (buff->base);
2761     }
2762 }
2763
2764 /* Allocate permanent, unaligned storage of length LEN.  */
2765 unsigned char *
2766 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2767 {
2768   _cpp_buff *buff = pfile->u_buff;
2769   unsigned char *result = buff->cur;
2770
2771   if (len > (size_t) (buff->limit - result))
2772     {
2773       buff = _cpp_get_buff (pfile, len);
2774       buff->next = pfile->u_buff;
2775       pfile->u_buff = buff;
2776       result = buff->cur;
2777     }
2778
2779   buff->cur = result + len;
2780   return result;
2781 }
2782
2783 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2784    That buffer is used for growing allocations when saving macro
2785    replacement lists in a #define, and when parsing an answer to an
2786    assertion in #assert, #unassert or #if (and therefore possibly
2787    whilst expanding macros).  It therefore must not be used by any
2788    code that they might call: specifically the lexer and the guts of
2789    the macro expander.
2790
2791    All existing other uses clearly fit this restriction: storing
2792    registered pragmas during initialization.  */
2793 unsigned char *
2794 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2795 {
2796   _cpp_buff *buff = pfile->a_buff;
2797   unsigned char *result = buff->cur;
2798
2799   if (len > (size_t) (buff->limit - result))
2800     {
2801       buff = _cpp_get_buff (pfile, len);
2802       buff->next = pfile->a_buff;
2803       pfile->a_buff = buff;
2804       result = buff->cur;
2805     }
2806
2807   buff->cur = result + len;
2808   return result;
2809 }
2810
2811 /* Say which field of TOK is in use.  */
2812
2813 enum cpp_token_fld_kind
2814 cpp_token_val_index (cpp_token *tok)
2815 {
2816   switch (TOKEN_SPELL (tok))
2817     {
2818     case SPELL_IDENT:
2819       return CPP_TOKEN_FLD_NODE;
2820     case SPELL_LITERAL:
2821       return CPP_TOKEN_FLD_STR;
2822     case SPELL_OPERATOR:
2823       if (tok->type == CPP_PASTE)
2824         return CPP_TOKEN_FLD_TOKEN_NO;
2825       else
2826         return CPP_TOKEN_FLD_NONE;
2827     case SPELL_NONE:
2828       if (tok->type == CPP_MACRO_ARG)
2829         return CPP_TOKEN_FLD_ARG_NO;
2830       else if (tok->type == CPP_PADDING)
2831         return CPP_TOKEN_FLD_SOURCE;
2832       else if (tok->type == CPP_PRAGMA)
2833         return CPP_TOKEN_FLD_PRAGMA;
2834       /* else fall through */
2835     default:
2836       return CPP_TOKEN_FLD_NONE;
2837     }
2838 }