libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
   3    2011 Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
  28 enum spell_type
  29 {
       /* Spelling category recorded for each token type in the
          token_spellings[] table.  Table entries produced by the OP macro
          are always SPELL_OPERATOR; entries produced by the TK macro get
          whichever SPELL_* suffix the table supplies as its second
          argument.  */
  30   SPELL_OPERATOR = 0,
  31   SPELL_IDENT,
  32   SPELL_LITERAL,
  33   SPELL_NONE
  34 };
  35
  36 struct token_spelling
  37 {
       /* One entry of token_spellings[]: CATEGORY is the spelling class of
          the token type; NAME is either the literal spelling string (OP
          entries) or the stringified enumerator name (TK entries).  */
  38   enum spell_type category;
  39   const unsigned char *name;
  40 };
  41
  42 static const unsigned char *const digraph_spellings[] =
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
       /* The six strings above are the digraph forms of #, ##, [, ], { and }.
          NOTE(review): how callers index this array is not visible in this
          part of the file -- confirm before reordering.  */
  44
       /* Expand TTYPE_TABLE into one token_spelling initializer per token
          type.  OP (e, s) yields { SPELL_OPERATOR, s }; TK (e, s) yields
          { SPELL_<s>, "<e>" }, i.e. the enumerator name stringified.  Both
          helper macros are undefined again right after the table.  */
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50
       /* Look up the spelling category / name of TOKEN through its type
          field.  TOKEN is evaluated once and must be a pointer.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  /* Forward declarations of the file-local (static) helpers, so that they
     can be used before the point where each one is defined.  */
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
 116 /* Configure gives us an ifdef test.  Turn it into a macro that is always
        defined, so that it can be tested with an ordinary `if' in the
        helpers below.  */
 117 #ifndef WORDS_BIGENDIAN
 118 #define WORDS_BIGENDIAN 0
 119 #endif
 120
 121 /* We'd like the largest integer that fits into a register.  There's nothing
 122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 124    can get the "real" word size.  */
 125 #ifdef __GNUC__
 126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 127 #else
 128 typedef unsigned long word_type;
 129 #endif
 130
 131 /* The code below is only expecting sizes 4 or 8.
 132    Die at compile-time if this expectation is violated: the array bound
        evaluates to 1 when the size is 4 or 8 and to -1 otherwise, and a
        negative array bound is rejected by the compiler.  */
 133 typedef char check_word_type_size
 134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 8 assembler cannot assemble SSE2/SSE4.2 insns.
 271    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 272    Before Solaris 9 Update 6, SSE insns cannot be executed.
 273    The Solaris 10+ assembler tags objects with the instruction set
 274    extensions used, so SSE4.2 executables cannot run on machines that
 275    don't support that extension.  */
 276
 277 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 278
 279 /* Replicated character data to be shared between implementations.
 280    Recall that outside of a context with vector support we can't
 281    define compatible vector types, therefore these are all defined
 282    in terms of raw characters.

        Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?', each repeated
        16 times.  The scanners below load a row through a vector pointer:
        the whole row as one 16-byte vector, or its first 8 bytes as an
        8-byte vector.  */
 283 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 284   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 285     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 286   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 287     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 288   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 289     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 290   { '?', '?', '?', '?', '?', '?', '?', '?',
 291     '?', '?', '?', '?', '?', '?', '?', '?' },
 292 };
 293
 294 /* A version of the fast scanner using MMX vectorized byte compare insns.
 295
 296    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 297    which was packaged into SSE1; it is also present in the AMD MMX
 298    extension.  Mark the function as using "sse" so that we emit a real
 299    "emms" instruction, rather than the 3dNOW "femms" instruction.

        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
        unused: the loop has no end-of-buffer test and only stops when one
        of those characters is found.  */
 300
 301 static const uchar *
 302 #ifndef __SSE__
 303 __attribute__((__target__("sse")))
 304 #endif
 305 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 306 {
 307   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 308   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 309
       /* Only the first 8 bytes of each 16-byte row of repl_chars are used.  */
 310   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 311   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 312   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 313   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 314
 315   unsigned int misalign, found, mask;
 316   const v8qi *p;
 317   v8qi data, t, c;
 318
 319   /* Align the source pointer.  While MMX doesn't generate unaligned data
 320      faults, this allows us to safely scan to the end of the buffer without
 321      reading beyond the end of the last page.  */
 322   misalign = (uintptr_t)s & 7;
 323   p = (const v8qi *)((uintptr_t)s & -8);
 324   data = *p;
 325
 326   /* Create a mask for the bytes that are valid within the first
 327      8-byte block: bit I is set only for byte I at or after S.  The Idea
 328      here is that the AND with the mask within the loop is "free", since
 329      we need some AND or TEST insn in order to set the flags for the
          branch anyway.  */
 330   mask = -1u << misalign;
 331
 332   /* Main loop processing 8 bytes at a time.  The first block has already
          been loaded and has its own mask, so jump past the load.  */
 333   goto start;
 334   do
 335     {
 336       data = *++p;
 337       mask = -1;
 338
 339     start:
           /* OR together the byte-wise equality results for the four
              characters, then collapse them into one bit per byte.  */
 340       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 341       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 346       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 347       found = __builtin_ia32_pmovmskb (t);
 348       found &= mask;
 349     }
 350   while (!found);
 351
 352   __builtin_ia32_emms ();
 353
 354   /* FOUND contains 1 in bits for which we matched a relevant
 355      character.  Conversion to the byte index is trivial.  */
 356   found = __builtin_ctz(found);
 357   return (const uchar *)p + found;
 358 }
 359
 360 /* A version of the fast scanner using SSE2 vectorized byte compare insns.

        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
        unused: the loop has no end-of-buffer test and only stops when one
        of those characters is found.  All loads are 16-byte aligned.  */
 361
 362 static const uchar *
 363 #ifndef __SSE2__
 364 __attribute__((__target__("sse2")))
 365 #endif
 366 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 367 {
 368   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 369
 370   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 371   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 372   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 373   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 374
 375   unsigned int misalign, found, mask;
 376   const v16qi *p;
 377   v16qi data, t;
 378
 379   /* Align the source pointer.  */
 380   misalign = (uintptr_t)s & 15;
 381   p = (const v16qi *)((uintptr_t)s & -16);
 382   data = *p;
 383
 384   /* Create a mask for the bytes that are valid within the first
 385      16-byte block: bit I is set only for byte I at or after S.  The Idea
 386      here is that the AND with the mask within the loop is "free", since
 387      we need some AND or TEST insn in order to set the flags for the
          branch anyway.  */
 388   mask = -1u << misalign;
 389
 390   /* Main loop processing 16 bytes at a time.  The first block has already
          been loaded and has its own mask, so jump past the load.  */
 391   goto start;
 392   do
 393     {
 394       data = *++p;
 395       mask = -1;
 396
 397     start:
           /* OR together the byte-wise equality results for the four
              characters, then collapse them into one bit per byte.  */
 398       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 400       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 401       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 402       found = __builtin_ia32_pmovmskb128 (t);
 403       found &= mask;
 404     }
 405   while (!found);
 406
 407   /* FOUND contains 1 in bits for which we matched a relevant
 408      character.  Conversion to the byte index is trivial.  */
 409   found = __builtin_ctz(found);
 410   return (const uchar *)p + found;
 411 }
 412
 413 #ifdef HAVE_SSE4
 414 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

        Returns a pointer to the first \n, \r, ? or \\ at or after S.  END is
        only used to decide whether an unaligned 16-byte read at S is safe;
        the main loop itself has no end-of-buffer test.  */
 415
 416 static const uchar *
 417 #ifndef __SSE4_2__
 418 __attribute__((__target__("sse4.2")))
 419 #endif
 420 search_line_sse42 (const uchar *s, const uchar *end)
 421 {
 422   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The set of characters to look for; only the first four elements are
          meaningful, matching the length 4 passed in EAX below.  */
 423   static const v16qi search = { '\n', '\r', '?', '\\' };
 424
 425   uintptr_t si = (uintptr_t)s;
 426   uintptr_t index;
 427
 428   /* Check for unaligned input.  */
 429   if (si & 15)
 430     {
 431       if (__builtin_expect (end - s < 16, 0)
 432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 433         {
 434           /* There are less than 16 bytes left in the buffer, and less
 435              than 16 bytes left on the page.  Reading 16 bytes at this
 436              point might generate a spurious page fault.  Defer to the
 437              SSE2 implementation, which already handles alignment.  */
 438           return search_line_sse2 (s, end);
 439         }
 440
 441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 442          memory need not be aligned.

              EAX (4) and EDX (16) give the explicit lengths of SEARCH and of
              the data at S; the resulting index is returned in ECX.  The code
              below treats an index under 16 as a match within this block.  */
 443       __asm ("%vpcmpestri $0, (%1), %2"
 444              : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
 445       if (__builtin_expect (index < 16, 0))
 446         goto found;
 447
 448       /* Advance the pointer to an aligned address.  We will re-scan a
 449          few bytes, but we no longer need care for reading past the
 450          end of a page, since we're guaranteed a match.  */
 451       s = (const uchar *)((si + 16) & -16);
 452     }
 453
 454   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 455      in inline assembly, we can make proper use of the flags set.

          The pointer is first moved back by 16 so that the ADD at the head
          of the loop brings it to S on the first iteration.  JNC repeats the
          loop for as long as the instruction leaves the carry flag clear;
          on exit S addresses the block just scanned and INDEX holds the
          value returned in ECX.  */
 456   __asm (      "sub $16, %1\n"
 457         "       .balign 16\n"
 458         "0:     add $16, %1\n"
 459         "       %vpcmpestri $0, (%1), %2\n"
 460         "       jnc 0b"
 461         : "=&c"(index), "+r"(s)
 462         : "x"(search), "a"(4), "d"(16));
 463
 464  found:
 465   return s + index;
 466 }
 467
 468 #else
 469 /* Work around out-dated assemblers without sse4 support.  */
 470 #define search_line_sse42 search_line_sse2
 471 #endif
 472
 473 /* Check the CPU capabilities.  */
 474
 475 #include "../gcc/config/i386/cpuid.h"
 476
 /* Signature shared by all of the scanner implementations above, and the
    pointer through which the selected one is called.  The pointer is
    assigned by init_vectorized_lexer below.  */
 477 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 478 static search_line_fast_type search_line_fast;
 479
 480 static void __attribute__((constructor))
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1
 509           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 510         impl = search_line_mmx;
 511     }
 512
 513   search_line_fast = impl;
 514 }
 515
 516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 517
 518 /* A version of the fast scanner using AltiVec vectorized byte compares.

        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
        unused: the loop has no end-of-buffer test and only stops when one
        of those characters is found.  */
 519 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 520    so we can't compile this function without -maltivec on the command line
 521    (or implied by some other switch).  */
 522
 523 static const uchar *
 524 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 525 {
 526   typedef __attribute__((altivec(vector))) unsigned char vc;
 527
 528   const vc repl_nl = {
 529     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 531   };
 532   const vc repl_cr = {
 533     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 535   };
 536   const vc repl_bs = {
 537     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 539   };
 540   const vc repl_qm = {
 541     '?', '?', '?', '?', '?', '?', '?', '?',
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543   };
 544   const vc ones = {
 545     -1, -1, -1, -1, -1, -1, -1, -1,
 546     -1, -1, -1, -1, -1, -1, -1, -1,
 547   };
 548   const vc zero = { 0 };
 549
 550   vc data, mask, t;
 551
 552   /* Altivec loads automatically mask addresses with -16.  This lets us
 553      issue the first load as early as possible.  */
 554   data = __builtin_vec_ld(0, (const vc *)s);
 555
 556   /* Discard bytes before the beginning of the buffer.  Do this by
 557      beginning with all ones and shifting in zeros according to the
 558      mis-alignment.  The LVSR instruction pulls the exact shift we
 559      want from the address.  */
 560   mask = __builtin_vec_lvsr(0, s);
 561   mask = __builtin_vec_perm(zero, ones, mask);
 562   data &= mask;
 563
 564   /* While altivec loads mask addresses, we still need to align S so
 565      that the offset we compute at the end is correct.  */
 566   s = (const uchar *)((uintptr_t)s & -16);
 567
 568   /* Main loop processing 16 bytes at a time.  The first block has already
          been loaded and masked, so jump past the load.  */
 569   goto start;
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       s += 16;
 575       data = __builtin_vec_ld(0, (const vc *)s);
 576
 577     start:
 578       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 579       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 580       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 581       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 582       t = (m_nl | m_cr) | (m_bs | m_qm);
 583
 584       /* T now contains 0xff in bytes for which we matched one of the relevant
 585          characters.  We want to exit the loop if any byte in T is non-zero.
 586          Below is the expansion of vec_any_ne(t, zero).  */
 587     }
 588   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 589
 590   {
       /* Number of unsigned longs that make up one vector: 2 or 4, as
          enforced by the check_count typedef below.  */
 591 #define N  (sizeof(vc) / sizeof(long))
 592
 593     typedef char check_count[(N == 2 || N == 4) * 2 - 1];
 594     union {
 595       vc v;
 596       unsigned long l[N];
 597     } u;
 598     unsigned long l, i = 0;
 599
 600     u.v = t;
 601
 602     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  */
 603     switch (N)
 604       {
 605       case 4:
 606         l = u.l[i++];
 607         if (l != 0)
 608           break;
 609         s += sizeof(unsigned long);
 610         l = u.l[i++];
 611         if (l != 0)
 612           break;
 613         s += sizeof(unsigned long);
             /* FALLTHRU: the remaining two words are handled like N == 2.  */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.
            NOTE(review): counting from the most significant bit presumably
            relies on the first byte in memory being the most significant
            byte of L, i.e. a big-endian host -- confirm.  */
 625     l = __builtin_clzl(l) >> 3;
 626     return s + l;
 627
 628 #undef N
 629   }
 630 }
 631
 632 #else
 633
 634 /* We only have one accelerated alternative.  Use a direct call so that
 635    we encourage inlining.  */
 636
 637 #define search_line_fast  search_line_acc_char
 638
 639 #endif
 640
 641 /* Returns with a logical line that contains no escaped newlines or
 642    trigraphs.  This is a time-critical inner loop.

        The line starting at buffer->next_line is cleaned in place.  Unless
        the buffer is flagged from_stage3, backslash-newline sequences
        (optionally with horizontal space between the two) are spliced out
        and, when the trigraphs option is on, trigraphs are replaced by the
        character they stand for.  A line note is recorded for every escaped
        newline and for every trigraph, whether or not it was converted.

        On return buffer->cur and buffer->line_base point at the start of
        the line, the cleaned line ends in '\n', a sentinel note follows the
        notes recorded for the line, and buffer->next_line points just past
        the physical text that was consumed.  For from_stage3 buffers only
        the end of the line is located.  */
 643 void
 644 _cpp_clean_line (cpp_reader *pfile)
 645 {
 646   cpp_buffer *buffer;
 647   const uchar *s;
 648   uchar c, *d, *p;
 649
       /* Start a new line: forget the previous line's notes and make CUR and
          LINE_BASE point at the text about to be cleaned.  From here on S is
          the read position and D the position where the line ends or, on
          the slow path, the write position.  */
 650   buffer = pfile->buffer;
 651   buffer->cur_note = buffer->notes_used = 0;
 652   buffer->cur = buffer->line_base = buffer->next_line;
 653   buffer->need_line = false;
 654   s = buffer->next_line;
 655
 656   if (!buffer->from_stage3)
 657     {
           /* Most recent backslash seen on the fast path, if any.  */
 658       const uchar *pbackslash = NULL;
 659
 660       /* Fast path.  This is the common case of an un-escaped line with
 661          no trigraphs.  The primary win here is by not writing any
 662          data back to memory until we have to.  */
 663       while (1)
 664         {
 665           /* Perform an optimized search for \n, \r, \\, ?.  */
 666           s = search_line_fast (s, buffer->rlimit);
 667
 668           c = *s;
 669           if (c == '\\')
 670             {
 671               /* Record the location of the backslash and continue.  */
 672               pbackslash = s++;
 673             }
 674           else if (__builtin_expect (c == '?', 0))
 675             {
 676               if (__builtin_expect (s[1] == '?', false)
 677                    && _cpp_trigraph_map[s[2]])
 678                 {
 679                   /* Have a trigraph.  We may or may not have to convert
 680                      it.  Add a line note regardless, for -Wtrigraphs.  */
 681                   add_line_note (buffer, s, s[2]);
 682                   if (CPP_OPTION (pfile, trigraphs))
 683                     {
 684                       /* We do, and that means we have to switch to the
 685                          slow path.  The replacement character is stored
                              over the first '?', and S is left on the last
                              character of the trigraph so that the slow
                              path's pre-increment resumes right after it.  */
 686                       d = (uchar *) s;
 687                       *d = _cpp_trigraph_map[s[2]];
 688                       s += 2;
 689                       goto slow_path;
 690                     }
 691                 }
 692               /* Not a trigraph.  Continue on fast-path.  */
 693               s++;
 694             }
 695           else
 696             break;
 697         }
 698
 699       /* This must be \r or \n.  We're either done, or we'll be forced
 700          to write back to the buffer and continue on the slow path.  */
 701       d = (uchar *) s;
 702
 703       if (__builtin_expect (s == buffer->rlimit, false))
 704         goto done;
 705
 706       /* DOS line ending? */
 707       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 708         {
 709           s++;
 710           if (s == buffer->rlimit)
 711             goto done;
 712         }
 713
 714       if (__builtin_expect (pbackslash == NULL, true))
 715         goto done;
 716
 717       /* Check for escaped newline: skipping back over horizontal space
              from the line end must land directly after the last backslash
              that was seen.  */
 718       p = d;
 719       while (is_nvspace (p[-1]))
 720         p--;
 721       if (p - 1 != pbackslash)
 722         goto done;
 723
 724       /* Have an escaped newline; process it and proceed to
 725          the slow path.  The note type is ' ' when space separated the
              backslash from the newline and '\\' otherwise.  D is set so
              that the slow path's first store overwrites the backslash, and
              next_line temporarily marks that splice point.  */
 726       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 727       d = p - 2;
 728       buffer->next_line = p - 1;
 729
 730     slow_path:
           /* Slow path: copy the rest of the line down one character at a
              time, from S to D, splicing out further escaped newlines and
              converting further trigraphs on the way.  */
 731       while (1)
 732         {
 733           c = *++s;
 734           *++d = c;
 735
 736           if (c == '\n' || c == '\r')
 737             {
 738               /* Handle DOS line endings.  */
 739               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 740                 s++;
 741               if (s == buffer->rlimit)
 742                 break;
 743
 744               /* Escaped?  Look back over horizontal space, but no further
                      than the last splice point.  */
 745               p = d;
 746               while (p != buffer->next_line && is_nvspace (p[-1]))
 747                 p--;
 748               if (p == buffer->next_line || p[-1] != '\\')
 749                 break;
 750
 751               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 752               d = p - 2;
 753               buffer->next_line = p - 1;
 754             }
 755           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 756             {
 757               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 758               add_line_note (buffer, d, s[2]);
 759               if (CPP_OPTION (pfile, trigraphs))
 760                 {
 761                   *d = _cpp_trigraph_map[s[2]];
 762                   s += 2;
 763                 }
 764             }
 765         }
 766     }
 767   else
 768     {
           /* from_stage3 buffer: no trigraph or escaped-newline processing,
              just find the end of the line.  */
 769       while (*s != '\n' && *s != '\r')
 770         s++;
 771       d = (uchar *) s;
 772
 773       /* Handle DOS line endings.  */
 774       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 775         s++;
 776     }
 777
 778  done:
       /* Terminate the cleaned line at D; S is the last physical character
          consumed, so the next line starts right after it.  */
 779   *d = '\n';
 780   /* A sentinel note that should never be processed.  */
 781   add_line_note (buffer, d + 1, '\n');
 782   buffer->next_line = s + 1;
 783 }
 784
 785 /* Return true if the trigraph indicated by NOTE should be warned
 786    about in a comment.  */
 787 static bool
 788 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 789 {
 790   const uchar *p;
 791
 792   /* Within comments we don't warn about trigraphs, unless the
 793      trigraph forms an escaped newline, as that may change
 794      behavior.  */
 795   if (note->type != '/')
 796     return false;
 797
 798   /* If -trigraphs, then this was an escaped newline iff the next note
 799      is coincident.  */
 800   if (CPP_OPTION (pfile, trigraphs))
 801     return note[1].pos == note->pos;
 802
 803   /* Otherwise, see if this forms an escaped newline.  */
 804   p = note->pos + 3;
 805   while (is_nvspace (*p))
 806     p++;
 807
 808   /* There might have been escaped newlines between the trigraph and the
 809      newline we found.  Hence the position test.  */
 810   return (*p == '\n' && p < note[1].pos);
 811 }
 812
 813 /* Process the notes created by add_line_note as far as the current
 814    location.  */
 815 void
 816 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 817 {
 818   cpp_buffer *buffer = pfile->buffer;
 819
 820   for (;;)
 821     {
 822       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 823       unsigned int col;
 824
 825       if (note->pos > buffer->cur)
 826         break;
 827
 828       buffer->cur_note++;
 829       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 830
 831       if (note->type == '\\' || note->type == ' ')
 832         {
 833           if (note->type == ' ' && !in_comment)
 834             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 835                                  "backslash and newline separated by space");
 836
 837           if (buffer->next_line > buffer->rlimit)
 838             {
 839               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 840                                    "backslash-newline at end of file");
 841               /* Prevent "no newline at end of file" warning.  */
 842               buffer->next_line = buffer->rlimit;
 843             }
 844
 845           buffer->line_base = note->pos;
 846           CPP_INCREMENT_LINE (pfile, 0);
 847         }
 848       else if (_cpp_trigraph_map[note->type])
 849         {
 850           if (CPP_OPTION (pfile, warn_trigraphs)
 851               && (!in_comment || warn_in_comment (pfile, note)))
 852             {
 853               if (CPP_OPTION (pfile, trigraphs))
 854                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 855                                        pfile->line_table->highest_line, col,
 856                                        "trigraph ??%c converted to %c",
 857                                        note->type,
 858                                        (int) _cpp_trigraph_map[note->type]);
 859               else
 860                 {
 861                   cpp_warning_with_line
 862                     (pfile, CPP_W_TRIGRAPHS,
 863                      pfile->line_table->highest_line, col,
 864                      "trigraph ??%c ignored, use -trigraphs to enable",
 865                      note->type);
 866                 }
 867             }
 868         }
 869       else if (note->type == 0)
 870         /* Already processed in lex_raw_string.  */;
 871       else
 872         abort ();
 873     }
 874 }
 875
 876 /* Skip a C-style block comment.  We find the end of the comment by
 877    seeing if an asterisk is before every '/' we encounter.  Returns
 878    nonzero if comment terminated by EOF, zero otherwise.
 879
 880    Buffer->cur points to the initial asterisk of the comment.  */
 881 bool
 882 _cpp_skip_block_comment (cpp_reader *pfile)
 883 {
 884   cpp_buffer *buffer = pfile->buffer;
 885   const uchar *cur = buffer->cur;
 886   uchar c;
 887
 888   cur++;
 889   if (*cur == '/')
 890     cur++;
 891
 892   for (;;)
 893     {
 894       /* People like decorating comments with '*', so check for '/'
 895          instead for efficiency.  */
 896       c = *cur++;
 897
 898       if (c == '/')
 899         {
 900           if (cur[-2] == '*')
 901             break;
 902
 903           /* Warn about potential nested comments, but not if the '/'
 904              comes immediately before the true comment delimiter.
 905              Don't bother to get it right across escaped newlines.  */
 906           if (CPP_OPTION (pfile, warn_comments)
 907               && cur[0] == '*' && cur[1] != '/')
 908             {
 909               buffer->cur = cur;
 910               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 911                                      pfile->line_table->highest_line,
 912                                      CPP_BUF_COL (buffer),
 913                                      "\"/*\" within comment");
 914             }
 915         }
 916       else if (c == '\n')
 917         {
 918           unsigned int cols;
 919           buffer->cur = cur - 1;
 920           _cpp_process_line_notes (pfile, true);
 921           if (buffer->next_line >= buffer->rlimit)
 922             return true;
 923           _cpp_clean_line (pfile);
 924
 925           cols = buffer->next_line - buffer->line_base;
 926           CPP_INCREMENT_LINE (pfile, cols);
 927
 928           cur = buffer->cur;
 929         }
 930     }
 931
 932   buffer->cur = cur;
 933   _cpp_process_line_notes (pfile, true);
 934   return false;
 935 }
 936
 937 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 938    terminating newline.  Handles escaped newlines.  Returns nonzero
 939    if a multiline comment.  */
 940 static int
 941 skip_line_comment (cpp_reader *pfile)
 942 {
 943   cpp_buffer *buffer = pfile->buffer;
 944   source_location orig_line = pfile->line_table->highest_line;
 945
 946   while (*buffer->cur != '\n')
 947     buffer->cur++;
 948
 949   _cpp_process_line_notes (pfile, true);
 950   return orig_line != pfile->line_table->highest_line;
 951 }
 952
 953 /* Skips whitespace, saving the next non-whitespace character.  */
 954 static void
 955 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 956 {
 957   cpp_buffer *buffer = pfile->buffer;
 958   bool saw_NUL = false;
 959
 960   do
 961     {
 962       /* Horizontal space always OK.  */
 963       if (c == ' ' || c == '\t')
 964         ;
 965       /* Just \f \v or \0 left.  */
 966       else if (c == '\0')
 967         saw_NUL = true;
 968       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 969         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 970                              CPP_BUF_COL (buffer),
 971                              "%s in preprocessing directive",
 972                              c == '\f' ? "form feed" : "vertical tab");
 973
 974       c = *buffer->cur++;
 975     }
 976   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 977   while (is_nvspace (c));
 978
 979   if (saw_NUL)
 980     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 981
 982   buffer->cur--;
 983 }
 984
 985 /* See if the characters of a number token are valid in a name (no
 986    '.', '+' or '-').  */
 987 static int
 988 name_p (cpp_reader *pfile, const cpp_string *string)
 989 {
 990   unsigned int i;
 991
 992   for (i = 0; i < string->len; i++)
 993     if (!is_idchar (string->text[i]))
 994       return 0;
 995
 996   return 1;
 997 }
 998
 999 /* After parsing an identifier or other sequence, produce a warning about
1000    sequences not in NFC/NFKC.  */
1001 static void
1002 warn_about_normalization (cpp_reader *pfile,
1003                           const cpp_token *token,
1004                           const struct normalize_state *s)
1005 {
1006   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1007       && !pfile->state.skipping)
1008     {
1009       /* Make sure that the token is printed using UCNs, even
1010          if we'd otherwise happily print UTF-8.  */
1011       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1012       size_t sz;
1013
1014       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1015       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1016         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1017                                "`%.*s' is not in NFKC", (int) sz, buf);
1018       else
1019         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1020                                "`%.*s' is not in NFC", (int) sz, buf);
1021     }
1022 }
1023
1024 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1025    an identifier.  FIRST is TRUE if this starts an identifier.  */
1026 static bool
1027 forms_identifier_p (cpp_reader *pfile, int first,
1028                     struct normalize_state *state)
1029 {
1030   cpp_buffer *buffer = pfile->buffer;
1031
1032   if (*buffer->cur == '$')
1033     {
1034       if (!CPP_OPTION (pfile, dollars_in_ident))
1035         return false;
1036
1037       buffer->cur++;
1038       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1039         {
1040           CPP_OPTION (pfile, warn_dollars) = 0;
1041           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1042         }
1043
1044       return true;
1045     }
1046
1047   /* Is this a syntactically valid UCN?  */
1048   if (CPP_OPTION (pfile, extended_identifiers)
1049       && *buffer->cur == '\\'
1050       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1051     {
1052       buffer->cur += 2;
1053       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1054                           state))
1055         return true;
1056       buffer->cur -= 2;
1057     }
1058
1059   return false;
1060 }
1061
1062 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1063 static cpp_hashnode *
1064 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1065 {
1066   cpp_hashnode *result;
1067   const uchar *cur;
1068   unsigned int len;
1069   unsigned int hash = HT_HASHSTEP (0, *base);
1070
1071   cur = base + 1;
1072   while (ISIDNUM (*cur))
1073     {
1074       hash = HT_HASHSTEP (hash, *cur);
1075       cur++;
1076     }
1077   len = cur - base;
1078   hash = HT_HASHFINISH (hash, len);
1079   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1080                                               base, len, hash, HT_ALLOC));
1081
1082   /* Rarely, identifiers require diagnostics when lexed.  */
1083   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1084                         && !pfile->state.skipping, 0))
1085     {
1086       /* It is allowed to poison the same identifier twice.  */
1087       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1088         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1089                    NODE_NAME (result));
1090
1091       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1092          replacement list of a variadic macro.  */
1093       if (result == pfile->spec_nodes.n__VA_ARGS__
1094           && !pfile->state.va_args_ok)
1095         cpp_error (pfile, CPP_DL_PEDWARN,
1096                    "__VA_ARGS__ can only appear in the expansion"
1097                    " of a C99 variadic macro");
1098
1099       /* For -Wc++-compat, warn about use of C++ named operators.  */
1100       if (result->flags & NODE_WARN_OPERATOR)
1101         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1102                      "identifier \"%s\" is a special operator name in C++",
1103                      NODE_NAME (result));
1104     }
1105
1106   return result;
1107 }
1108
1109 /* Get the cpp_hashnode of an identifier specified by NAME in
1110    the current cpp_reader object.  If none is found, NULL is returned.  */
1111 cpp_hashnode *
1112 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1113 {
1114   cpp_hashnode *result;
1115   result = lex_identifier_intern (pfile, (uchar *) name);
1116   return result;
1117 }
1118
1119 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1120 static cpp_hashnode *
1121 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1122                 struct normalize_state *nst)
1123 {
1124   cpp_hashnode *result;
1125   const uchar *cur;
1126   unsigned int len;
1127   unsigned int hash = HT_HASHSTEP (0, *base);
1128
1129   cur = pfile->buffer->cur;
1130   if (! starts_ucn)
1131     while (ISIDNUM (*cur))
1132       {
1133         hash = HT_HASHSTEP (hash, *cur);
1134         cur++;
1135       }
1136   pfile->buffer->cur = cur;
1137   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1138     {
1139       /* Slower version for identifiers containing UCNs (or $).  */
1140       do {
1141         while (ISIDNUM (*pfile->buffer->cur))
1142           {
1143             pfile->buffer->cur++;
1144             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1145           }
1146       } while (forms_identifier_p (pfile, false, nst));
1147       result = _cpp_interpret_identifier (pfile, base,
1148                                           pfile->buffer->cur - base);
1149     }
1150   else
1151     {
1152       len = cur - base;
1153       hash = HT_HASHFINISH (hash, len);
1154
1155       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1156                                                   base, len, hash, HT_ALLOC));
1157     }
1158
1159   /* Rarely, identifiers require diagnostics when lexed.  */
1160   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1161                         && !pfile->state.skipping, 0))
1162     {
1163       /* It is allowed to poison the same identifier twice.  */
1164       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1165         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1166                    NODE_NAME (result));
1167
1168       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1169          replacement list of a variadic macro.  */
1170       if (result == pfile->spec_nodes.n__VA_ARGS__
1171           && !pfile->state.va_args_ok)
1172         cpp_error (pfile, CPP_DL_PEDWARN,
1173                    "__VA_ARGS__ can only appear in the expansion"
1174                    " of a C99 variadic macro");
1175
1176       /* For -Wc++-compat, warn about use of C++ named operators.  */
1177       if (result->flags & NODE_WARN_OPERATOR)
1178         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1179                      "identifier \"%s\" is a special operator name in C++",
1180                      NODE_NAME (result));
1181     }
1182
1183   return result;
1184 }
1185
1186 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1187 static void
1188 lex_number (cpp_reader *pfile, cpp_string *number,
1189             struct normalize_state *nst)
1190 {
1191   const uchar *cur;
1192   const uchar *base;
1193   uchar *dest;
1194
1195   base = pfile->buffer->cur - 1;
1196   do
1197     {
1198       cur = pfile->buffer->cur;
1199
1200       /* N.B. ISIDNUM does not include $.  */
1201       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1202         {
1203           cur++;
1204           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1205         }
1206
1207       pfile->buffer->cur = cur;
1208     }
1209   while (forms_identifier_p (pfile, false, nst));
1210
1211   number->len = cur - base;
1212   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1213   memcpy (dest, base, number->len);
1214   dest[number->len] = '\0';
1215   number->text = dest;
1216 }
1217
1218 /* Create a token of type TYPE with a literal spelling.  */
1219 static void
1220 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1221                 unsigned int len, enum cpp_ttype type)
1222 {
1223   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1224
1225   memcpy (dest, base, len);
1226   dest[len] = '\0';
1227   token->type = type;
1228   token->val.str.len = len;
1229   token->val.str.text = dest;
1230 }
1231
1232 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1233    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1234
1235 static void
1236 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1237                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1238 {
1239   _cpp_buff *first_buff = *first_buff_p;
1240   _cpp_buff *last_buff = *last_buff_p;
1241
1242   if (first_buff == NULL)
1243     first_buff = last_buff = _cpp_get_buff (pfile, len);
1244   else if (len > BUFF_ROOM (last_buff))
1245     {
1246       size_t room = BUFF_ROOM (last_buff);
1247       memcpy (BUFF_FRONT (last_buff), base, room);
1248       BUFF_FRONT (last_buff) += room;
1249       base += room;
1250       len -= room;
1251       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1252     }
1253
1254   memcpy (BUFF_FRONT (last_buff), base, len);
1255   BUFF_FRONT (last_buff) += len;
1256
1257   *first_buff_p = first_buff;
1258   *last_buff_p = last_buff;
1259 }
1260
1261 /* Lexes a raw string.  The stored string contains the spelling, including
1262    double quotes, delimiter string, '(' and ')', any leading
1263    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1264    literal, or CPP_OTHER if it was not properly terminated.
1265
1266    The spelling is NUL-terminated, but it is not guaranteed that this
1267    is the first NUL since embedded NULs are preserved.  */
1268
1269 static void
1270 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1271                 const uchar *cur)
1272 {
1273   const uchar *raw_prefix;
1274   unsigned int raw_prefix_len = 0;
1275   enum cpp_ttype type;
1276   size_t total_len = 0;
1277   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1278   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1279
1280   type = (*base == 'L' ? CPP_WSTRING :
1281           *base == 'U' ? CPP_STRING32 :
1282           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1283           : CPP_STRING);
1284
1285   raw_prefix = cur + 1;
1286   while (raw_prefix_len < 16)
1287     {
1288       switch (raw_prefix[raw_prefix_len])
1289         {
1290         case ' ': case '(': case ')': case '\\': case '\t':
1291         case '\v': case '\f': case '\n': default:
1292           break;
1293         /* Basic source charset except the above chars.  */
1294         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1295         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1296         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1297         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1298         case 'y': case 'z':
1299         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1300         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1301         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1302         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1303         case 'Y': case 'Z':
1304         case '0': case '1': case '2': case '3': case '4': case '5':
1305         case '6': case '7': case '8': case '9':
1306         case '_': case '{': case '}': case '#': case '[': case ']':
1307         case '<': case '>': case '%': case ':': case ';': case '.':
1308         case '?': case '*': case '+': case '-': case '/': case '^':
1309         case '&': case '|': case '~': case '!': case '=': case ',':
1310         case '"': case '\'':
1311           raw_prefix_len++;
1312           continue;
1313         }
1314       break;
1315     }
1316
1317   if (raw_prefix[raw_prefix_len] != '(')
1318     {
1319       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1320                 + 1;
1321       if (raw_prefix_len == 16)
1322         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1323                              "raw string delimiter longer than 16 characters");
1324       else
1325         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1326                              "invalid character '%c' in raw string delimiter",
1327                              (int) raw_prefix[raw_prefix_len]);
1328       pfile->buffer->cur = raw_prefix - 1;
1329       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1330       return;
1331     }
1332
1333   cur = raw_prefix + raw_prefix_len + 1;
1334   for (;;)
1335     {
1336 #define BUF_APPEND(STR,LEN)                                     \
1337       do {                                                      \
1338         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1339                         &first_buff, &last_buff);               \
1340         total_len += (LEN);                                     \
1341       } while (0);
1342
1343       cppchar_t c;
1344
1345       /* If we previously performed any trigraph or line splicing
1346          transformations, undo them within the body of the raw string.  */
1347       while (note->pos < cur)
1348         ++note;
1349       for (; note->pos == cur; ++note)
1350         {
1351           switch (note->type)
1352             {
1353             case '\\':
1354             case ' ':
1355               /* Restore backslash followed by newline.  */
1356               BUF_APPEND (base, cur - base);
1357               base = cur;
1358               BUF_APPEND ("\\", 1);
1359             after_backslash:
1360               if (note->type == ' ')
1361                 {
1362                   /* GNU backslash whitespace newline extension.  FIXME
1363                      could be any sequence of non-vertical space.  When we
1364                      can properly restore any such sequence, we should mark
1365                      this note as handled so _cpp_process_line_notes
1366                      doesn't warn.  */
1367                   BUF_APPEND (" ", 1);
1368                 }
1369
1370               BUF_APPEND ("\n", 1);
1371               break;
1372
1373             case 0:
1374               /* Already handled.  */
1375               break;
1376
1377             default:
1378               if (_cpp_trigraph_map[note->type])
1379                 {
1380                   /* Don't warn about this trigraph in
1381                      _cpp_process_line_notes, since trigraphs show up as
1382                      trigraphs in raw strings.  */
1383                   uchar type = note->type;
1384                   note->type = 0;
1385
1386                   if (!CPP_OPTION (pfile, trigraphs))
1387                     /* If we didn't convert the trigraph in the first
1388                        place, don't do anything now either.  */
1389                     break;
1390
1391                   BUF_APPEND (base, cur - base);
1392                   base = cur;
1393                   BUF_APPEND ("??", 2);
1394
1395                   /* ??/ followed by newline gets two line notes, one for
1396                      the trigraph and one for the backslash/newline.  */
1397                   if (type == '/' && note[1].pos == cur)
1398                     {
1399                       if (note[1].type != '\\'
1400                           && note[1].type != ' ')
1401                         abort ();
1402                       BUF_APPEND ("/", 1);
1403                       ++note;
1404                       goto after_backslash;
1405                     }
1406                   /* The ) from ??) could be part of the suffix.  */
1407                   else if (type == ')'
1408                            && strncmp ((const char *) cur+1,
1409                                        (const char *) raw_prefix,
1410                                        raw_prefix_len) == 0
1411                            && cur[raw_prefix_len+1] == '"')
1412                     {
1413                       BUF_APPEND (")", 1);
1414                       base++;
1415                       cur += raw_prefix_len + 2;
1416                       goto break_outer_loop;
1417                     }
1418                   else
1419                     {
1420                       /* Skip the replacement character.  */
1421                       base = ++cur;
1422                       BUF_APPEND (&type, 1);
1423                     }
1424                 }
1425               else
1426                 abort ();
1427               break;
1428             }
1429         }
1430       c = *cur++;
1431
1432       if (c == ')'
1433           && strncmp ((const char *) cur, (const char *) raw_prefix,
1434                       raw_prefix_len) == 0
1435           && cur[raw_prefix_len] == '"')
1436         {
1437           cur += raw_prefix_len + 1;
1438           break;
1439         }
1440       else if (c == '\n')
1441         {
1442           if (pfile->state.in_directive
1443               || pfile->state.parsing_args
1444               || pfile->state.in_deferred_pragma)
1445             {
1446               cur--;
1447               type = CPP_OTHER;
1448               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1449                                    "unterminated raw string");
1450               break;
1451             }
1452
1453           BUF_APPEND (base, cur - base);
1454
1455           if (pfile->buffer->cur < pfile->buffer->rlimit)
1456             CPP_INCREMENT_LINE (pfile, 0);
1457           pfile->buffer->need_line = true;
1458
1459           pfile->buffer->cur = cur-1;
1460           _cpp_process_line_notes (pfile, false);
1461           if (!_cpp_get_fresh_line (pfile))
1462             {
1463               source_location src_loc = token->src_loc;
1464               token->type = CPP_EOF;
1465               /* Tell the compiler the line number of the EOF token.  */
1466               token->src_loc = pfile->line_table->highest_line;
1467               token->flags = BOL;
1468               if (first_buff != NULL)
1469                 _cpp_release_buff (pfile, first_buff);
1470               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1471                                    "unterminated raw string");
1472               return;
1473             }
1474
1475           cur = base = pfile->buffer->cur;
1476           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1477         }
1478     }
1479  break_outer_loop:
1480
1481   if (CPP_OPTION (pfile, user_literals))
1482     {
1483       /* Grab user defined literal suffix.  */
1484       if (ISIDST (*cur))
1485         {
1486           type = cpp_userdef_string_add_type (type);
1487           ++cur;
1488         }
1489       while (ISIDNUM (*cur))
1490         ++cur;
1491     }
1492
1493   pfile->buffer->cur = cur;
1494   if (first_buff == NULL)
1495     create_literal (pfile, token, base, cur - base, type);
1496   else
1497     {
1498       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1499
1500       token->type = type;
1501       token->val.str.len = total_len + (cur - base);
1502       token->val.str.text = dest;
1503       last_buff = first_buff;
1504       while (last_buff != NULL)
1505         {
1506           memcpy (dest, last_buff->base,
1507                   BUFF_FRONT (last_buff) - last_buff->base);
1508           dest += BUFF_FRONT (last_buff) - last_buff->base;
1509           last_buff = last_buff->next;
1510         }
1511       _cpp_release_buff (pfile, first_buff);
1512       memcpy (dest, base, cur - base);
1513       dest[cur - base] = '\0';
1514     }
1515 }
1516
1517 /* Lexes a string, character constant, or angle-bracketed header file
1518    name.  The stored string contains the spelling, including opening
1519    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1520    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1521    if it was not properly terminated, or CPP_LESS for an unterminated
1522    header name which must be relexed as normal tokens.
1523
1524    The spelling is NUL-terminated, but it is not guaranteed that this
1525    is the first NUL since embedded NULs are preserved.  */
1526 static void
1527 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1528 {
1529   bool saw_NUL = false;
1530   const uchar *cur;
1531   cppchar_t terminator;
1532   enum cpp_ttype type;
1533
1534   cur = base;
1535   terminator = *cur++;
1536   if (terminator == 'L' || terminator == 'U')
1537     terminator = *cur++;
1538   else if (terminator == 'u')
1539     {
1540       terminator = *cur++;
1541       if (terminator == '8')
1542         terminator = *cur++;
1543     }
1544   if (terminator == 'R')
1545     {
1546       lex_raw_string (pfile, token, base, cur);
1547       return;
1548     }
1549   if (terminator == '"')
1550     type = (*base == 'L' ? CPP_WSTRING :
1551             *base == 'U' ? CPP_STRING32 :
1552             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1553                          : CPP_STRING);
1554   else if (terminator == '\'')
1555     type = (*base == 'L' ? CPP_WCHAR :
1556             *base == 'U' ? CPP_CHAR32 :
1557             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1558   else
1559     terminator = '>', type = CPP_HEADER_NAME;
1560
1561   for (;;)
1562     {
1563       cppchar_t c = *cur++;
1564
1565       /* In #include-style directives, terminators are not escapable.  */
1566       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1567         cur++;
1568       else if (c == terminator)
1569         break;
1570       else if (c == '\n')
1571         {
1572           cur--;
1573           /* Unmatched quotes always yield undefined behavior, but
1574              greedy lexing means that what appears to be an unterminated
1575              header name may actually be a legitimate sequence of tokens.  */
1576           if (terminator == '>')
1577             {
1578               token->type = CPP_LESS;
1579               return;
1580             }
1581           type = CPP_OTHER;
1582           break;
1583         }
1584       else if (c == '\0')
1585         saw_NUL = true;
1586     }
1587
1588   if (saw_NUL && !pfile->state.skipping)
1589     cpp_error (pfile, CPP_DL_WARNING,
1590                "null character(s) preserved in literal");
1591
1592   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1593     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1594                (int) terminator);
1595
1596   if (CPP_OPTION (pfile, user_literals))
1597     {
1598       /* Grab user defined literal suffix.  */
1599       if (ISIDST (*cur))
1600         {
1601           type = cpp_userdef_char_add_type (type);
1602           type = cpp_userdef_string_add_type (type);
1603           ++cur;
1604         }
1605       while (ISIDNUM (*cur))
1606         ++cur;
1607     }
1608
1609   pfile->buffer->cur = cur;
1610   create_literal (pfile, token, base, cur - base, type);
1611 }
1612
1613 /* Return the comment table. The client may not make any assumption
1614    about the ordering of the table.  */
1615 cpp_comment_table *
1616 cpp_get_comments (cpp_reader *pfile)
1617 {
1618   return &pfile->comments;
1619 }
1620
1621 /* Append a comment to the end of the comment table. */
1622 static void
1623 store_comment (cpp_reader *pfile, cpp_token *token)
1624 {
1625   int len;
1626
1627   if (pfile->comments.allocated == 0)
1628     {
1629       pfile->comments.allocated = 256;
1630       pfile->comments.entries = (cpp_comment *) xmalloc
1631         (pfile->comments.allocated * sizeof (cpp_comment));
1632     }
1633
1634   if (pfile->comments.count == pfile->comments.allocated)
1635     {
1636       pfile->comments.allocated *= 2;
1637       pfile->comments.entries = (cpp_comment *) xrealloc
1638         (pfile->comments.entries,
1639          pfile->comments.allocated * sizeof (cpp_comment));
1640     }
1641
1642   len = token->val.str.len;
1643
1644   /* Copy comment. Note, token may not be NULL terminated. */
1645   pfile->comments.entries[pfile->comments.count].comment =
1646     (char *) xmalloc (sizeof (char) * (len + 1));
1647   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1648           token->val.str.text, len);
1649   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1650
1651   /* Set source location. */
1652   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1653
1654   /* Increment the count of entries in the comment table. */
1655   pfile->comments.count++;
1656 }
1657
1658 /* The stored comment includes the comment start and any terminator.  */
1659 static void
1660 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1661               cppchar_t type)
1662 {
1663   unsigned char *buffer;
1664   unsigned int len, clen, i;
1665
1666   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1667
1668   /* C++ comments probably (not definitely) have moved past a new
1669      line, which we don't want to save in the comment.  */
1670   if (is_vspace (pfile->buffer->cur[-1]))
1671     len--;
1672
1673   /* If we are currently in a directive or in argument parsing, then
1674      we need to store all C++ comments as C comments internally, and
1675      so we need to allocate a little extra space in that case.
1676
1677      Note that the only time we encounter a directive here is
1678      when we are saving comments in a "#define".  */
1679   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1680           && type == '/') ? len + 2 : len;
1681
1682   buffer = _cpp_unaligned_alloc (pfile, clen);
1683
1684   token->type = CPP_COMMENT;
1685   token->val.str.len = clen;
1686   token->val.str.text = buffer;
1687
1688   buffer[0] = '/';
1689   memcpy (buffer + 1, from, len - 1);
1690
1691   /* Finish conversion to a C comment, if necessary.  */
1692   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1693     {
1694       buffer[1] = '*';
1695       buffer[clen - 2] = '*';
1696       buffer[clen - 1] = '/';
1697       /* As there can be in a C++ comments illegal sequences for C comments
1698          we need to filter them out.  */
1699       for (i = 2; i < (clen - 2); i++)
1700         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1701           buffer[i] = '|';
1702     }
1703
1704   /* Finally store this comment for use by clients of libcpp. */
1705   store_comment (pfile, token);
1706 }
1707
1708 /* Allocate COUNT tokens for RUN.  */
1709 void
1710 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1711 {
1712   run->base = XNEWVEC (cpp_token, count);
1713   run->limit = run->base + count;
1714   run->next = NULL;
1715 }
1716
1717 /* Returns the next tokenrun, or creates one if there is none.  */
1718 static tokenrun *
1719 next_tokenrun (tokenrun *run)
1720 {
1721   if (run->next == NULL)
1722     {
1723       run->next = XNEW (tokenrun);
1724       run->next->prev = run;
1725       _cpp_init_tokenrun (run->next, 250);
1726     }
1727
1728   return run->next;
1729 }
1730
1731 /* Return the number of not yet processed token in a given
1732    context.  */
1733 int
1734 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1735 {
1736   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1737     return (LAST (context).token - FIRST (context).token);
1738   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1739            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1740     return (LAST (context).ptoken - FIRST (context).ptoken);
1741   else
1742       abort ();
1743 }
1744
1745 /* Returns the token present at index INDEX in a given context.  If
1746    INDEX is zero, the next token to be processed is returned.  */
1747 static const cpp_token*
1748 _cpp_token_from_context_at (cpp_context *context, int index)
1749 {
1750   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1751     return &(FIRST (context).token[index]);
1752   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1753            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1754     return FIRST (context).ptoken[index];
1755  else
1756    abort ();
1757 }
1758
1759 /* Look ahead in the input stream.  */
1760 const cpp_token *
1761 cpp_peek_token (cpp_reader *pfile, int index)
1762 {
1763   cpp_context *context = pfile->context;
1764   const cpp_token *peektok;
1765   int count;
1766
1767   /* First, scan through any pending cpp_context objects.  */
1768   while (context->prev)
1769     {
1770       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1771
1772       if (index < (int) sz)
1773         return _cpp_token_from_context_at (context, index);
1774       index -= (int) sz;
1775       context = context->prev;
1776     }
1777
1778   /* We will have to read some new tokens after all (and do so
1779      without invalidating preceding tokens).  */
1780   count = index;
1781   pfile->keep_tokens++;
1782
1783   do
1784     {
1785       peektok = _cpp_lex_token (pfile);
1786       if (peektok->type == CPP_EOF)
1787         return peektok;
1788     }
1789   while (index--);
1790
1791   _cpp_backup_tokens_direct (pfile, count + 1);
1792   pfile->keep_tokens--;
1793
1794   return peektok;
1795 }
1796
1797 /* Allocate a single token that is invalidated at the same time as the
1798    rest of the tokens on the line.  Has its line and col set to the
1799    same as the last lexed token, so that diagnostics appear in the
1800    right place.  */
1801 cpp_token *
1802 _cpp_temp_token (cpp_reader *pfile)
1803 {
1804   cpp_token *old, *result;
1805   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1806   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1807
1808   old = pfile->cur_token - 1;
1809   /* Any pre-existing lookaheads must not be clobbered.  */
1810   if (la)
1811     {
1812       if (sz <= la)
1813         {
1814           tokenrun *next = next_tokenrun (pfile->cur_run);
1815
1816           if (sz < la)
1817             memmove (next->base + 1, next->base,
1818                      (la - sz) * sizeof (cpp_token));
1819
1820           next->base[0] = pfile->cur_run->limit[-1];
1821         }
1822
1823       if (sz > 1)
1824         memmove (pfile->cur_token + 1, pfile->cur_token,
1825                  MIN (la, sz - 1) * sizeof (cpp_token));
1826     }
1827
1828   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1829     {
1830       pfile->cur_run = next_tokenrun (pfile->cur_run);
1831       pfile->cur_token = pfile->cur_run->base;
1832     }
1833
1834   result = pfile->cur_token++;
1835   result->src_loc = old->src_loc;
1836   return result;
1837 }
1838
1839 /* Lex a token into RESULT (external interface).  Takes care of issues
1840    like directive handling, token lookahead, multiple include
1841    optimization and skipping.  */
1842 const cpp_token *
1843 _cpp_lex_token (cpp_reader *pfile)
1844 {
1845   cpp_token *result;
1846
1847   for (;;)
1848     {
1849       if (pfile->cur_token == pfile->cur_run->limit)
1850         {
1851           pfile->cur_run = next_tokenrun (pfile->cur_run);
1852           pfile->cur_token = pfile->cur_run->base;
1853         }
1854       /* We assume that the current token is somewhere in the current
1855          run.  */
1856       if (pfile->cur_token < pfile->cur_run->base
1857           || pfile->cur_token >= pfile->cur_run->limit)
1858         abort ();
1859
1860       if (pfile->lookaheads)
1861         {
1862           pfile->lookaheads--;
1863           result = pfile->cur_token++;
1864         }
1865       else
1866         result = _cpp_lex_direct (pfile);
1867
1868       if (result->flags & BOL)
1869         {
1870           /* Is this a directive.  If _cpp_handle_directive returns
1871              false, it is an assembler #.  */
1872           if (result->type == CPP_HASH
1873               /* 6.10.3 p 11: Directives in a list of macro arguments
1874                  gives undefined behavior.  This implementation
1875                  handles the directive as normal.  */
1876               && pfile->state.parsing_args != 1)
1877             {
1878               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1879                 {
1880                   if (pfile->directive_result.type == CPP_PADDING)
1881                     continue;
1882                   result = &pfile->directive_result;
1883                 }
1884             }
1885           else if (pfile->state.in_deferred_pragma)
1886             result = &pfile->directive_result;
1887
1888           if (pfile->cb.line_change && !pfile->state.skipping)
1889             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1890         }
1891
1892       /* We don't skip tokens in directives.  */
1893       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1894         break;
1895
1896       /* Outside a directive, invalidate controlling macros.  At file
1897          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1898          get here and MI optimization works.  */
1899       pfile->mi_valid = false;
1900
1901       if (!pfile->state.skipping || result->type == CPP_EOF)
1902         break;
1903     }
1904
1905   return result;
1906 }
1907
1908 /* Returns true if a fresh line has been loaded.  */
1909 bool
1910 _cpp_get_fresh_line (cpp_reader *pfile)
1911 {
1912   int return_at_eof;
1913
1914   /* We can't get a new line until we leave the current directive.  */
1915   if (pfile->state.in_directive)
1916     return false;
1917
1918   for (;;)
1919     {
1920       cpp_buffer *buffer = pfile->buffer;
1921
1922       if (!buffer->need_line)
1923         return true;
1924
1925       if (buffer->next_line < buffer->rlimit)
1926         {
1927           _cpp_clean_line (pfile);
1928           return true;
1929         }
1930
1931       /* First, get out of parsing arguments state.  */
1932       if (pfile->state.parsing_args)
1933         return false;
1934
1935       /* End of buffer.  Non-empty files should end in a newline.  */
1936       if (buffer->buf != buffer->rlimit
1937           && buffer->next_line > buffer->rlimit
1938           && !buffer->from_stage3)
1939         {
1940           /* Clip to buffer size.  */
1941           buffer->next_line = buffer->rlimit;
1942         }
1943
1944       return_at_eof = buffer->return_at_eof;
1945       _cpp_pop_buffer (pfile);
1946       if (pfile->buffer == NULL || return_at_eof)
1947         return false;
1948     }
1949 }
1950
/* Helper for two-character operators.  Sets result->type to ELSE_TYPE;
   if the next unread character (*buffer->cur) equals CHAR, that
   character is consumed and result->type becomes THEN_TYPE instead.
   Relies on variables named `result' and `buffer' being in scope at
   the point of use.  The arguments are not parenthesized in the
   expansion, so pass simple expressions only.  */
1951 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
1952   do                                                    \
1953     {                                                   \
1954       result->type = ELSE_TYPE;                         \
1955       if (*buffer->cur == CHAR)                         \
1956         buffer->cur++, result->type = THEN_TYPE;        \
1957     }                                                   \
1958   while (0)
1959
1960 /* Lex a token into pfile->cur_token, which is also incremented, to
1961    get diagnostics pointing to the correct location.
1962
1963    Does not handle issues such as token lookahead, multiple-include
1964    optimization, directives, skipping etc.  This function is only
1965    suitable for use by _cpp_lex_token, and in special cases like
1966    lex_expansion_token which doesn't care for any of these issues.
1967
1968    When meeting a newline, returns CPP_EOF if parsing a directive,
1969    otherwise returns to the start of the token buffer if permissible.
1970    Returns a pointer to the lexed token.

        The body is driven by three labels:
          fresh_line          re-entered after a newline has been consumed;
                              resets the token flags and fetches a new
                              line if the buffer needs one.
          update_tokens_line  re-entered after a comment that is not
                              being saved; refreshes the token's location
                              from the line table.
          skipped_white       re-entered after horizontal whitespace;
                              reads the next character and dispatches
                              on it.  */
1971 cpp_token *
1972 _cpp_lex_direct (cpp_reader *pfile)
1973 {
1974   cppchar_t c;
1975   cpp_buffer *buffer;
1976   const unsigned char *comment_start;
       /* Claim the next token slot up front.  */
1977   cpp_token *result = pfile->cur_token++;
1978
1979  fresh_line:
1980   result->flags = 0;
1981   buffer = pfile->buffer;
1982   if (buffer->need_line)
1983     {
1984       if (pfile->state.in_deferred_pragma)
1985         {
               /* The line of a deferred pragma has ended: report that
                  with a CPP_PRAGMA_EOL token and leave deferred-pragma
                  mode.  */
1986           result->type = CPP_PRAGMA_EOL;
1987           pfile->state.in_deferred_pragma = false;
1988           if (!pfile->state.pragma_allow_expansion)
1989             pfile->state.prevent_expansion--;
1990           return result;
1991         }
1992       if (!_cpp_get_fresh_line (pfile))
1993         {
1994           result->type = CPP_EOF;
1995           if (!pfile->state.in_directive)
1996             {
1997               /* Tell the compiler the line number of the EOF token.  */
1998               result->src_loc = pfile->line_table->highest_line;
1999               result->flags = BOL;
2000             }
2001           return result;
2002         }
           /* Unless earlier tokens must be kept, restart at the first
              slot of the base run so token storage is reused for each
              line.  */
2003       if (!pfile->keep_tokens)
2004         {
2005           pfile->cur_run = &pfile->base_run;
2006           result = pfile->base_run.base;
2007           pfile->cur_token = result + 1;
2008         }
2009       result->flags = BOL;
2010       if (pfile->state.parsing_args == 2)
2011         result->flags |= PREV_WHITE;
2012     }
       /* _cpp_get_fresh_line may have popped buffers, so reload.  */
2013   buffer = pfile->buffer;
2014  update_tokens_line:
2015   result->src_loc = pfile->line_table->highest_line;
2016
2017  skipped_white:
2018   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2019       && !pfile->overlaid_buffer)
2020     {
2021       _cpp_process_line_notes (pfile, false);
2022       result->src_loc = pfile->line_table->highest_line;
2023     }
       /* From here on buffer->cur points one past C, so the cases
          below use buffer->cur - 1 as the start of the token.  */
2024   c = *buffer->cur++;
2025
2026   if (pfile->forced_token_location_p)
2027     result->src_loc = *pfile->forced_token_location_p;
2028   else
2029     result->src_loc = linemap_position_for_column (pfile->line_table,
2030                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2031
2032   switch (c)
2033     {
2034     case ' ': case '\t': case '\f': case '\v': case '\0':
2035       result->flags |= PREV_WHITE;
2036       skip_whitespace (pfile, c);
2037       goto skipped_white;
2038
2039     case '\n':
2040       if (buffer->cur < buffer->rlimit)
2041         CPP_INCREMENT_LINE (pfile, 0);
2042       buffer->need_line = true;
2043       goto fresh_line;
2044
2045     case '0': case '1': case '2': case '3': case '4':
2046     case '5': case '6': case '7': case '8': case '9':
2047       {
2048         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2049         result->type = CPP_NUMBER;
2050         lex_number (pfile, &result->val.str, &nst);
2051         warn_about_normalization (pfile, result, &nst);
2052         break;
2053       }
2054
2055     case 'L':
2056     case 'u':
2057     case 'U':
2058     case 'R':
2059       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2060          wide strings or raw strings.  */
2061       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2062           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2063         {
2064           if ((*buffer->cur == '\'' && c != 'R')
2065               || *buffer->cur == '"'
2066               || (*buffer->cur == 'R'
2067                   && c != 'R'
2068                   && buffer->cur[1] == '"'
2069                   && CPP_OPTION (pfile, rliterals))
2070               || (*buffer->cur == '8'
2071                   && c == 'u'
2072                   && (buffer->cur[1] == '"'
2073                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2074                           && CPP_OPTION (pfile, rliterals)))))
2075             {
2076               lex_string (pfile, result, buffer->cur - 1);
2077               break;
2078             }
2079         }
2080       /* Fall through.  */
2081
         /* 'L', 'R', 'U' and 'u' are missing from the list below
            because the case above handles them and falls through to
            here when no literal prefix was recognized.  */
2082     case '_':
2083     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2084     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2085     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2086     case 's': case 't':           case 'v': case 'w': case 'x':
2087     case 'y': case 'z':
2088     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2089     case 'G': case 'H': case 'I': case 'J': case 'K':
2090     case 'M': case 'N': case 'O': case 'P': case 'Q':
2091     case 'S': case 'T':           case 'V': case 'W': case 'X':
2092     case 'Y': case 'Z':
2093       result->type = CPP_NAME;
2094       {
2095         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2096         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2097                                                 &nst);
2098         warn_about_normalization (pfile, result, &nst);
2099       }
2100
2101       /* Convert named operators to their proper types.  */
2102       if (result->val.node.node->flags & NODE_OPERATOR)
2103         {
2104           result->flags |= NAMED_OP;
2105           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2106         }
2107       break;
2108
2109     case '\'':
2110     case '"':
2111       lex_string (pfile, result, buffer->cur - 1);
2112       break;
2113
2114     case '/':
2115       /* A potential block or line comment.  */
2116       comment_start = buffer->cur;
           /* C becomes the character after the slash; for a comment it
              is '*' or '/', and is later passed to save_comment as the
              comment type.  */
2117       c = *buffer->cur;
2118
2119       if (c == '*')
2120         {
2121           if (_cpp_skip_block_comment (pfile))
2122             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2123         }
2124       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2125                             || cpp_in_system_header (pfile)))
2126         {
2127           /* Warn about comments only if pedantically GNUC89, and not
2128              in system headers.  */
2129           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2130               && ! buffer->warned_cplusplus_comments)
2131             {
2132               cpp_error (pfile, CPP_DL_PEDWARN,
2133                          "C++ style comments are not allowed in ISO C90");
2134               cpp_error (pfile, CPP_DL_PEDWARN,
2135                          "(this will be reported only once per input file)");
2136               buffer->warned_cplusplus_comments = 1;
2137             }
2138
2139           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2140             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2141         }
2142       else if (c == '=')
2143         {
2144           buffer->cur++;
2145           result->type = CPP_DIV_EQ;
2146           break;
2147         }
2148       else
2149         {
2150           result->type = CPP_DIV;
2151           break;
2152         }
2153
           /* Only the two comment branches reach this point.  A comment
              that is not being saved counts as whitespace before the
              next token, which is lexed into the same slot.  */
2154       if (!pfile->state.save_comments)
2155         {
2156           result->flags |= PREV_WHITE;
2157           goto update_tokens_line;
2158         }
2159
2160       /* Save the comment as a token in its own right.  */
2161       save_comment (pfile, result, comment_start, c);
2162       break;
2163
2164     case '<':
           /* With angled_headers set, first let lex_string try to lex
              a <header-name>.  NOTE(review): the check below implies
              lex_string leaves CPP_LESS in result->type when it did
              not form one -- confirm against lex_string.  */
2165       if (pfile->state.angled_headers)
2166         {
2167           lex_string (pfile, result, buffer->cur - 1);
2168           if (result->type != CPP_LESS)
2169             break;
2170         }
2171
2172       result->type = CPP_LESS;
2173       if (*buffer->cur == '=')
2174         buffer->cur++, result->type = CPP_LESS_EQ;
2175       else if (*buffer->cur == '<')
2176         {
2177           buffer->cur++;
2178           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2179         }
2180       else if (CPP_OPTION (pfile, digraphs))
2181         {
               /* Digraphs "<:" and "<%" stand for '[' and '{'.  */
2182           if (*buffer->cur == ':')
2183             {
2184               buffer->cur++;
2185               result->flags |= DIGRAPH;
2186               result->type = CPP_OPEN_SQUARE;
2187             }
2188           else if (*buffer->cur == '%')
2189             {
2190               buffer->cur++;
2191               result->flags |= DIGRAPH;
2192               result->type = CPP_OPEN_BRACE;
2193             }
2194         }
2195       break;
2196
2197     case '>':
2198       result->type = CPP_GREATER;
2199       if (*buffer->cur == '=')
2200         buffer->cur++, result->type = CPP_GREATER_EQ;
2201       else if (*buffer->cur == '>')
2202         {
2203           buffer->cur++;
2204           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2205         }
2206       break;
2207
2208     case '%':
2209       result->type = CPP_MOD;
2210       if (*buffer->cur == '=')
2211         buffer->cur++, result->type = CPP_MOD_EQ;
2212       else if (CPP_OPTION (pfile, digraphs))
2213         {
               /* Digraphs "%:" and "%:%:" stand for '#' and "##";
                  "%>" stands for '}'.  */
2214           if (*buffer->cur == ':')
2215             {
2216               buffer->cur++;
2217               result->flags |= DIGRAPH;
2218               result->type = CPP_HASH;
2219               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2220                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2221             }
2222           else if (*buffer->cur == '>')
2223             {
2224               buffer->cur++;
2225               result->flags |= DIGRAPH;
2226               result->type = CPP_CLOSE_BRACE;
2227             }
2228         }
2229       break;
2230
2231     case '.':
2232       result->type = CPP_DOT;
2233       if (ISDIGIT (*buffer->cur))
2234         {
2235           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2236           result->type = CPP_NUMBER;
2237           lex_number (pfile, &result->val.str, &nst);
2238           warn_about_normalization (pfile, result, &nst);
2239         }
2240       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2241         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2242       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2243         buffer->cur++, result->type = CPP_DOT_STAR;
2244       break;
2245
2246     case '+':
2247       result->type = CPP_PLUS;
2248       if (*buffer->cur == '+')
2249         buffer->cur++, result->type = CPP_PLUS_PLUS;
2250       else if (*buffer->cur == '=')
2251         buffer->cur++, result->type = CPP_PLUS_EQ;
2252       break;
2253
2254     case '-':
2255       result->type = CPP_MINUS;
2256       if (*buffer->cur == '>')
2257         {
2258           buffer->cur++;
2259           result->type = CPP_DEREF;
2260           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2261             buffer->cur++, result->type = CPP_DEREF_STAR;
2262         }
2263       else if (*buffer->cur == '-')
2264         buffer->cur++, result->type = CPP_MINUS_MINUS;
2265       else if (*buffer->cur == '=')
2266         buffer->cur++, result->type = CPP_MINUS_EQ;
2267       break;
2268
2269     case '&':
2270       result->type = CPP_AND;
2271       if (*buffer->cur == '&')
2272         buffer->cur++, result->type = CPP_AND_AND;
2273       else if (*buffer->cur == '=')
2274         buffer->cur++, result->type = CPP_AND_EQ;
2275       break;
2276
2277     case '|':
2278       result->type = CPP_OR;
2279       if (*buffer->cur == '|')
2280         buffer->cur++, result->type = CPP_OR_OR;
2281       else if (*buffer->cur == '=')
2282         buffer->cur++, result->type = CPP_OR_EQ;
2283       break;
2284
2285     case ':':
2286       result->type = CPP_COLON;
2287       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2288         buffer->cur++, result->type = CPP_SCOPE;
2289       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2290         {
2291           buffer->cur++;
2292           result->flags |= DIGRAPH;
2293           result->type = CPP_CLOSE_SQUARE;
2294         }
2295       break;
2296
2297     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2298     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2299     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2300     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2301     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2302
2303     case '?': result->type = CPP_QUERY; break;
2304     case '~': result->type = CPP_COMPL; break;
2305     case ',': result->type = CPP_COMMA; break;
2306     case '(': result->type = CPP_OPEN_PAREN; break;
2307     case ')': result->type = CPP_CLOSE_PAREN; break;
2308     case '[': result->type = CPP_OPEN_SQUARE; break;
2309     case ']': result->type = CPP_CLOSE_SQUARE; break;
2310     case '{': result->type = CPP_OPEN_BRACE; break;
2311     case '}': result->type = CPP_CLOSE_BRACE; break;
2312     case ';': result->type = CPP_SEMICOLON; break;
2313
2314       /* @ is a punctuator in Objective-C.  */
2315     case '@': result->type = CPP_ATSIGN; break;
2316
2317     case '$':
2318     case '\\':
2319       {
             /* Step back onto the character itself so that
                forms_identifier_p examines it; BASE remembers where
                the potential identifier starts.  */
2320         const uchar *base = --buffer->cur;
2321         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2322
2323         if (forms_identifier_p (pfile, true, &nst))
2324           {
2325             result->type = CPP_NAME;
2326             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2327             warn_about_normalization (pfile, result, &nst);
2328             break;
2329           }
             /* Not an identifier: undo the step back and treat the
                character like any other stray one.  */
2330         buffer->cur++;
2331       }
           /* Fall through.  */
2332
2333     default:
           /* Anything unrecognized becomes a one-character CPP_OTHER
              token.  */
2334       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2335       break;
2336     }
2337
2338   return result;
2339 }
2340
2341 /* An upper bound on the number of bytes needed to spell TOKEN.
2342    Does not include preceding whitespace.  */
2343 unsigned int
2344 cpp_token_len (const cpp_token *token)
2345 {
2346   unsigned int len;
2347
2348   switch (TOKEN_SPELL (token))
2349     {
2350     default:            len = 6;                                break;
2351     case SPELL_LITERAL: len = token->val.str.len;               break;
2352     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2353     }
2354
2355   return len;
2356 }
2357
/* Decode the UTF-8 sequence starting at NAME and store the matching
   universal character name, in the form \UXXXXXXXX with lower-case hex
   digits, in BUFFER.  Exactly 10 bytes are written to BUFFER and no
   terminating NUL is added.  Returns the number of bytes of NAME that
   made up the sequence, as announced by its leading byte.  Aborts if
   one of the trailing bytes is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned char *out = buffer;
  unsigned long code_point;
  unsigned lead_bits;
  int seq_len = 0;
  int trailing;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  for (lead_bits = *src; (lead_bits & 0x80) != 0; lead_bits <<= 1)
    seq_len++;

  /* Payload bits of the leading byte.  */
  code_point = *src & (0x7F >> seq_len);

  /* Each trailing byte contributes six more bits.  */
  for (trailing = seq_len - 1; trailing > 0; trailing--)
    {
      src++;
      code_point = (code_point << 6) | (*src & 0x3F);

      /* Ill-formed UTF-8: a trailing byte must look like 10xxxxxx.  */
      if ((*src & 0xC0) != 0x80)
        abort ();
    }

  *out++ = '\\';
  *out++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *out++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2391
2392 /* Given a token TYPE corresponding to a digraph, return a pointer to
2393    the spelling of the digraph.  */
2394 static const unsigned char *
2395 cpp_digraph2name (enum cpp_ttype type)
2396 {
2397   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2398 }
2399
2400 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2401    already contain the enough space to hold the token's spelling.
2402    Returns a pointer to the character after the last character written.
2403    FORSTRING is true if this is to be the spelling after translation
2404    phase 1 (this is different for UCNs).
2405    FIXME: Would be nice if we didn't need the PFILE argument.  */
2406 unsigned char *
2407 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2408                  unsigned char *buffer, bool forstring)
2409 {
2410   switch (TOKEN_SPELL (token))
2411     {
2412     case SPELL_OPERATOR:
2413       {
2414         const unsigned char *spelling;
2415         unsigned char c;
2416
2417         if (token->flags & DIGRAPH)
2418           spelling = cpp_digraph2name (token->type);
2419         else if (token->flags & NAMED_OP)
2420           goto spell_ident;
2421         else
2422           spelling = TOKEN_NAME (token);
2423
2424         while ((c = *spelling++) != '\0')
2425           *buffer++ = c;
2426       }
2427       break;
2428
2429     spell_ident:
2430     case SPELL_IDENT:
2431       if (forstring)
2432         {
2433           memcpy (buffer, NODE_NAME (token->val.node.node),
2434                   NODE_LEN (token->val.node.node));
2435           buffer += NODE_LEN (token->val.node.node);
2436         }
2437       else
2438         {
2439           size_t i;
2440           const unsigned char * name = NODE_NAME (token->val.node.node);
2441
2442           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2443             if (name[i] & ~0x7F)
2444               {
2445                 i += utf8_to_ucn (buffer, name + i) - 1;
2446                 buffer += 10;
2447               }
2448             else
2449               *buffer++ = NODE_NAME (token->val.node.node)[i];
2450         }
2451       break;
2452
2453     case SPELL_LITERAL:
2454       memcpy (buffer, token->val.str.text, token->val.str.len);
2455       buffer += token->val.str.len;
2456       break;
2457
2458     case SPELL_NONE:
2459       cpp_error (pfile, CPP_DL_ICE,
2460                  "unspellable token %s", TOKEN_NAME (token));
2461       break;
2462     }
2463
2464   return buffer;
2465 }
2466
2467 /* Returns TOKEN spelt as a null-terminated string.  The string is
2468    freed when the reader is destroyed.  Useful for diagnostics.  */
2469 unsigned char *
2470 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2471 {
2472   unsigned int len = cpp_token_len (token) + 1;
2473   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2474
2475   end = cpp_spell_token (pfile, token, start, false);
2476   end[0] = '\0';
2477
2478   return start;
2479 }
2480
2481 /* Returns a pointer to a string which spells the token defined by
2482    TYPE and FLAGS.  Used by C front ends, which really should move to
2483    using cpp_token_as_text.  */
2484 const char *
2485 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2486 {
2487   if (flags & DIGRAPH)
2488     return (const char *) cpp_digraph2name (type);
2489   else if (flags & NAMED_OP)
2490     return cpp_named_operator2name (type);
2491
2492   return (const char *) token_spellings[type].name;
2493 }
2494
2495 /* Writes the spelling of token to FP, without any preceding space.
2496    Separated from cpp_spell_token for efficiency - to avoid stdio
2497    double-buffering.  */
2498 void
2499 cpp_output_token (const cpp_token *token, FILE *fp)
2500 {
2501   switch (TOKEN_SPELL (token))
2502     {
2503     case SPELL_OPERATOR:
2504       {
2505         const unsigned char *spelling;
2506         int c;
2507
2508         if (token->flags & DIGRAPH)
2509           spelling = cpp_digraph2name (token->type);
2510         else if (token->flags & NAMED_OP)
2511           goto spell_ident;
2512         else
2513           spelling = TOKEN_NAME (token);
2514
2515         c = *spelling;
2516         do
2517           putc (c, fp);
2518         while ((c = *++spelling) != '\0');
2519       }
2520       break;
2521
2522     spell_ident:
2523     case SPELL_IDENT:
2524       {
2525         size_t i;
2526         const unsigned char * name = NODE_NAME (token->val.node.node);
2527
2528         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2529           if (name[i] & ~0x7F)
2530             {
2531               unsigned char buffer[10];
2532               i += utf8_to_ucn (buffer, name + i) - 1;
2533               fwrite (buffer, 1, 10, fp);
2534             }
2535           else
2536             fputc (NODE_NAME (token->val.node.node)[i], fp);
2537       }
2538       break;
2539
2540     case SPELL_LITERAL:
2541       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2542       break;
2543
2544     case SPELL_NONE:
2545       /* An error, most probably.  */
2546       break;
2547     }
2548 }
2549
2550 /* Compare two tokens.  */
2551 int
2552 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2553 {
2554   if (a->type == b->type && a->flags == b->flags)
2555     switch (TOKEN_SPELL (a))
2556       {
2557       default:                  /* Keep compiler happy.  */
2558       case SPELL_OPERATOR:
2559         /* token_no is used to track where multiple consecutive ##
2560            tokens were originally located.  */
2561         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2562       case SPELL_NONE:
2563         return (a->type != CPP_MACRO_ARG
2564                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2565       case SPELL_IDENT:
2566         return a->val.node.node == b->val.node.node;
2567       case SPELL_LITERAL:
2568         return (a->val.str.len == b->val.str.len
2569                 && !memcmp (a->val.str.text, b->val.str.text,
2570                             a->val.str.len));
2571       }
2572
2573   return 0;
2574 }
2575
2576 /* Returns nonzero if a space should be inserted to avoid an
2577    accidental token paste for output.  For simplicity, it is
2578    conservative, and occasionally advises a space where one is not
2579    needed, e.g. "." and ".2".  */
2580 int
2581 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2582                  const cpp_token *token2)
2583 {
2584   enum cpp_ttype a = token1->type, b = token2->type;
2585   cppchar_t c;
2586
2587   if (token1->flags & NAMED_OP)
2588     a = CPP_NAME;
2589   if (token2->flags & NAMED_OP)
2590     b = CPP_NAME;
2591
2592   c = EOF;
2593   if (token2->flags & DIGRAPH)
2594     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2595   else if (token_spellings[b].category == SPELL_OPERATOR)
2596     c = token_spellings[b].name[0];
2597
2598   /* Quickly get everything that can paste with an '='.  */
2599   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2600     return 1;
2601
2602   switch (a)
2603     {
2604     case CPP_GREATER:   return c == '>';
2605     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2606     case CPP_PLUS:      return c == '+';
2607     case CPP_MINUS:     return c == '-' || c == '>';
2608     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2609     case CPP_MOD:       return c == ':' || c == '>';
2610     case CPP_AND:       return c == '&';
2611     case CPP_OR:        return c == '|';
2612     case CPP_COLON:     return c == ':' || c == '>';
2613     case CPP_DEREF:     return c == '*';
2614     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2615     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2616     case CPP_NAME:      return ((b == CPP_NUMBER
2617                                  && name_p (pfile, &token2->val.str))
2618                                 || b == CPP_NAME
2619                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2620     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2621                                 || c == '.' || c == '+' || c == '-');
2622                                       /* UCNs */
2623     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2624                                  && b == CPP_NAME)
2625                                 || (CPP_OPTION (pfile, objc)
2626                                     && token1->val.str.text[0] == '@'
2627                                     && (b == CPP_NAME || b == CPP_STRING)));
2628     default:            break;
2629     }
2630
2631   return 0;
2632 }
2633
2634 /* Output all the remaining tokens on the current line, and a newline
2635    character, to FP.  Leading whitespace is removed.  If there are
2636    macros, special token padding is not performed.  */
2637 void
2638 cpp_output_line (cpp_reader *pfile, FILE *fp)
2639 {
2640   const cpp_token *token;
2641
2642   token = cpp_get_token (pfile);
2643   while (token->type != CPP_EOF)
2644     {
2645       cpp_output_token (token, fp);
2646       token = cpp_get_token (pfile);
2647       if (token->flags & PREV_WHITE)
2648         putc (' ', fp);
2649     }
2650
2651   putc ('\n', fp);
2652 }
2653
2654 /* Return a string representation of all the remaining tokens on the
2655    current line.  The result is allocated using xmalloc and must be
2656    freed by the caller.  */
2657 unsigned char *
2658 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2659 {
2660   const cpp_token *token;
2661   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2662   unsigned int alloced = 120 + out;
2663   unsigned char *result = (unsigned char *) xmalloc (alloced);
2664
2665   /* If DIR_NAME is empty, there are no initial contents.  */
2666   if (dir_name)
2667     {
2668       sprintf ((char *) result, "#%s ", dir_name);
2669       out += 2;
2670     }
2671
2672   token = cpp_get_token (pfile);
2673   while (token->type != CPP_EOF)
2674     {
2675       unsigned char *last;
2676       /* Include room for a possible space and the terminating nul.  */
2677       unsigned int len = cpp_token_len (token) + 2;
2678
2679       if (out + len > alloced)
2680         {
2681           alloced *= 2;
2682           if (out + len > alloced)
2683             alloced = out + len;
2684           result = (unsigned char *) xrealloc (result, alloced);
2685         }
2686
2687       last = cpp_spell_token (pfile, token, &result[out], 0);
2688       out = last - result;
2689
2690       token = cpp_get_token (pfile);
2691       if (token->flags & PREV_WHITE)
2692         result[out++] = ' ';
2693     }
2694
2695   result[out] = '\0';
2696   return result;
2697 }
2698
/* Memory buffers.  Changing these three macros can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.

   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free-list buffer
   that _cpp_get_buff will hand out for a request of MIN_SIZE bytes.

   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room left between
   BUFF->cur and BUFF->limit.  Both arguments are parenthesized in the
   expansion so that any expression may be passed safely; BUFF is
   evaluated twice, so it must be free of side effects.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2713
2714 /* Create a new allocation buffer.  Place the control block at the end
2715    of the buffer, so that buffer overflows will cause immediate chaos.  */
2716 static _cpp_buff *
2717 new_buff (size_t len)
2718 {
2719   _cpp_buff *result;
2720   unsigned char *base;
2721
2722   if (len < MIN_BUFF_SIZE)
2723     len = MIN_BUFF_SIZE;
2724   len = CPP_ALIGN (len);
2725
2726   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2727   result = (_cpp_buff *) (base + len);
2728   result->base = base;
2729   result->cur = base;
2730   result->limit = base + len;
2731   result->next = NULL;
2732   return result;
2733 }
2734
2735 /* Place a chain of unwanted allocation buffers on the free list.  */
2736 void
2737 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2738 {
2739   _cpp_buff *end = buff;
2740
2741   while (end->next)
2742     end = end->next;
2743   end->next = pfile->free_buffs;
2744   pfile->free_buffs = buff;
2745 }
2746
2747 /* Return a free buffer of size at least MIN_SIZE.  */
2748 _cpp_buff *
2749 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2750 {
2751   _cpp_buff *result, **p;
2752
2753   for (p = &pfile->free_buffs;; p = &(*p)->next)
2754     {
2755       size_t size;
2756
2757       if (*p == NULL)
2758         return new_buff (min_size);
2759       result = *p;
2760       size = result->limit - result->base;
2761       /* Return a buffer that's big enough, but don't waste one that's
2762          way too big.  */
2763       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2764         break;
2765     }
2766
2767   *p = result->next;
2768   result->next = NULL;
2769   result->cur = result->base;
2770   return result;
2771 }
2772
2773 /* Creates a new buffer with enough space to hold the uncommitted
2774    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2775    the excess bytes to the new buffer.  Chains the new buffer after
2776    BUFF, and returns the new buffer.  */
2777 _cpp_buff *
2778 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2779 {
2780   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2781   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2782
2783   buff->next = new_buff;
2784   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2785   return new_buff;
2786 }
2787
2788 /* Creates a new buffer with enough space to hold the uncommitted
2789    remaining bytes of the buffer pointed to by BUFF, and at least
2790    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2791    Chains the new buffer before the buffer pointed to by BUFF, and
2792    updates the pointer to point to the new buffer.  */
2793 void
2794 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2795 {
2796   _cpp_buff *new_buff, *old_buff = *pbuff;
2797   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2798
2799   new_buff = _cpp_get_buff (pfile, size);
2800   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2801   new_buff->next = old_buff;
2802   *pbuff = new_buff;
2803 }
2804
2805 /* Free a chain of buffers starting at BUFF.  */
2806 void
2807 _cpp_free_buff (_cpp_buff *buff)
2808 {
2809   _cpp_buff *next;
2810
2811   for (; buff; buff = next)
2812     {
2813       next = buff->next;
2814       free (buff->base);
2815     }
2816 }
2817
2818 /* Allocate permanent, unaligned storage of length LEN.  */
2819 unsigned char *
2820 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2821 {
2822   _cpp_buff *buff = pfile->u_buff;
2823   unsigned char *result = buff->cur;
2824
2825   if (len > (size_t) (buff->limit - result))
2826     {
2827       buff = _cpp_get_buff (pfile, len);
2828       buff->next = pfile->u_buff;
2829       pfile->u_buff = buff;
2830       result = buff->cur;
2831     }
2832
2833   buff->cur = result + len;
2834   return result;
2835 }
2836
2837 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2838    That buffer is used for growing allocations when saving macro
2839    replacement lists in a #define, and when parsing an answer to an
2840    assertion in #assert, #unassert or #if (and therefore possibly
2841    whilst expanding macros).  It therefore must not be used by any
2842    code that they might call: specifically the lexer and the guts of
2843    the macro expander.
2844
2845    All existing other uses clearly fit this restriction: storing
2846    registered pragmas during initialization.  */
2847 unsigned char *
2848 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2849 {
2850   _cpp_buff *buff = pfile->a_buff;
2851   unsigned char *result = buff->cur;
2852
2853   if (len > (size_t) (buff->limit - result))
2854     {
2855       buff = _cpp_get_buff (pfile, len);
2856       buff->next = pfile->a_buff;
2857       pfile->a_buff = buff;
2858       result = buff->cur;
2859     }
2860
2861   buff->cur = result + len;
2862   return result;
2863 }
2864
2865 /* Say which field of TOK is in use.  */
2866
2867 enum cpp_token_fld_kind
2868 cpp_token_val_index (cpp_token *tok)
2869 {
2870   switch (TOKEN_SPELL (tok))
2871     {
2872     case SPELL_IDENT:
2873       return CPP_TOKEN_FLD_NODE;
2874     case SPELL_LITERAL:
2875       return CPP_TOKEN_FLD_STR;
2876     case SPELL_OPERATOR:
2877       if (tok->type == CPP_PASTE)
2878         return CPP_TOKEN_FLD_TOKEN_NO;
2879       else
2880         return CPP_TOKEN_FLD_NONE;
2881     case SPELL_NONE:
2882       if (tok->type == CPP_MACRO_ARG)
2883         return CPP_TOKEN_FLD_ARG_NO;
2884       else if (tok->type == CPP_PADDING)
2885         return CPP_TOKEN_FLD_SOURCE;
2886       else if (tok->type == CPP_PRAGMA)
2887         return CPP_TOKEN_FLD_PRAGMA;
2888       /* else fall through */
2889     default:
2890       return CPP_TOKEN_FLD_NONE;
2891     }
2892 }
2893
2894 /* All tokens lexed in R after calling this function will be forced to have
2895    their source_location the same as the location referenced by P, until
2896    cpp_stop_forcing_token_locations is called for R.  */
2897
2898 void
2899 cpp_force_token_locations (cpp_reader *r, source_location *p)
2900 {
2901   r->forced_token_location_p = p;
2902 }
2903
2904 /* Go back to assigning locations naturally for lexed tokens.  */
2905
2906 void
2907 cpp_stop_forcing_token_locations (cpp_reader *r)
2908 {
2909   r->forced_token_location_p = NULL;
2910 }