gcc:

[pf3gnuchains/gcc-fork.git] / libcpp / lex.c
diff --git a/libcpp/lex.c b/libcpp/lex.c

index 7eafb13..621b9a6 100644 (file)
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1,5 +1,6 @@
  /* CPP Library - lexical analysis.
-   Copyright (C) 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009
+   Free Software Foundation, Inc.
     Contributed by Per Bothner, 1994-95.
     Based on CCCP program by Paul Rubin, June 1986
     Adapted to ANSI C, Richard Stallman, Jan 1987
@@ -7,7 +8,7 @@
  
  This program is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 2, or (at your option) any
+Free Software Foundation; either version 3, or (at your option) any
  later version.
  
  This program is distributed in the hope that it will be useful,
@@ -16,8 +17,8 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+along with this program; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
  
  #include "config.h"
  #include "system.h"
@@ -39,10 +40,10 @@ struct token_spelling
  };
  
  static const unsigned char *const digraph_spellings[] =
-{ U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
+{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  
-#define OP(e, s) { SPELL_OPERATOR, U s           },
-#define TK(e, s) { s,              U #e },
+#define OP(e, s) { SPELL_OPERATOR, UC s  },
+#define TK(e, s) { SPELL_ ## s,    UC #e },
  static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  #undef OP
  #undef TK
@@ -53,11 +54,9 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  static int skip_line_comment (cpp_reader *);
  static void skip_whitespace (cpp_reader *, cppchar_t);
-static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *);
-static void lex_number (cpp_reader *, cpp_string *);
-static bool forms_identifier_p (cpp_reader *, int);
  static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
+static void store_comment (cpp_reader *, cpp_token *);
  static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                             unsigned int, enum cpp_ttype);
  static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
@@ -77,7 +76,7 @@ cpp_ideq (const cpp_token *token, const char *string)
    if (token->type != CPP_NAME)
      return 0;
  
-  return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
+  return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  }
  
  /* Record a note TYPE at byte POS into the current cleaned logical
@@ -88,8 +87,8 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
    if (buffer->notes_used == buffer->notes_cap)
      {
        buffer->notes_cap = buffer->notes_cap * 2 + 200;
-      buffer->notes = xrealloc (buffer->notes,
-                               buffer->notes_cap * sizeof (_cpp_line_note));
+      buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
+                                  buffer->notes_cap);
      }
  
    buffer->notes[buffer->notes_used].pos = pos;
@@ -114,31 +113,39 @@ _cpp_clean_line (cpp_reader *pfile)
  
    if (!buffer->from_stage3)
      {
+      const uchar *pbackslash = NULL;
+
        /* Short circuit for the common case of an un-escaped line with
          no trigraphs.  The primary win here is by not writing any
          data back to memory until we have to.  */
        for (;;)
         {
           c = *++s;
-         if (c == '\n' || c == '\r')
+         if (__builtin_expect (c == '\n', false)
+             || __builtin_expect (c == '\r', false))
             {
               d = (uchar *) s;
  
-             if (s == buffer->rlimit)
+             if (__builtin_expect (s == buffer->rlimit, false))
                 goto done;
  
               /* DOS line ending? */
-             if (c == '\r' && s[1] == '\n')
-               s++;
+             if (__builtin_expect (c == '\r', false)
+                 && s[1] == '\n')
+               {
+                 s++;
+                 if (s == buffer->rlimit)
+                   goto done;
+               }
  
-             if (s == buffer->rlimit)
+             if (__builtin_expect (pbackslash == NULL, true))
                 goto done;
  
-             /* check for escaped newline */
+             /* Check for escaped newline.  */
               p = d;
-             while (p != buffer->next_line && is_nvspace (p[-1]))
+             while (is_nvspace (p[-1]))
                 p--;
-             if (p == buffer->next_line || p[-1] != '\\')
+             if (p - 1 != pbackslash)
                 goto done;
  
               /* Have an escaped newline; process it and proceed to
@@ -148,7 +155,11 @@ _cpp_clean_line (cpp_reader *pfile)
               buffer->next_line = p - 1;
               break;
             }
-         if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
+         if (__builtin_expect (c == '\\', false))
+           pbackslash = s;
+         else if (__builtin_expect (c == '?', false)
+                  && __builtin_expect (s[1] == '?', false)
+                  && _cpp_trigraph_map[s[2]])
             {
               /* Have a trigraph.  We may or may not have to convert
                  it.  Add a line note regardless, for -Wtrigraphs.  */
@@ -290,19 +301,23 @@ _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
               && (!in_comment || warn_in_comment (pfile, note)))
             {
               if (CPP_OPTION (pfile, trigraphs))
-               cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
-                                    "trigraph ??%c converted to %c",
-                                    note->type,
-                                    (int) _cpp_trigraph_map[note->type]);
+               cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
+                                       pfile->line_table->highest_line, col,
+                                      "trigraph ??%c converted to %c",
+                                      note->type,
+                                      (int) _cpp_trigraph_map[note->type]);
               else
                 {
-                 cpp_error_with_line 
-                   (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
+                 cpp_warning_with_line 
+                   (pfile, CPP_W_TRIGRAPHS,
+                     pfile->line_table->highest_line, col,
                      "trigraph ??%c ignored, use -trigraphs to enable",
                      note->type);
                 }
             }
         }
+      else if (note->type == 0)
+       /* Already processed in lex_raw_string.  */;
        else
         abort ();
      }
@@ -342,9 +357,10 @@ _cpp_skip_block_comment (cpp_reader *pfile)
               && cur[0] == '*' && cur[1] != '/')
             {
               buffer->cur = cur;
-             cpp_error_with_line (pfile, CPP_DL_WARNING,
-                                  pfile->line_table->highest_line, CPP_BUF_COL (buffer),
-                                  "\"/*\" within comment");
+             cpp_warning_with_line (pfile, CPP_W_COMMENTS,
+                                    pfile->line_table->highest_line,
+                                    CPP_BUF_COL (buffer),
+                                    "\"/*\" within comment");
             }
         }
        else if (c == '\n')
@@ -375,7 +391,7 @@ static int
  skip_line_comment (cpp_reader *pfile)
  {
    cpp_buffer *buffer = pfile->buffer;
-  unsigned int orig_line = pfile->line_table->highest_line;
+  source_location orig_line = pfile->line_table->highest_line;
  
    while (*buffer->cur != '\n')
      buffer->cur++;
@@ -430,10 +446,36 @@ name_p (cpp_reader *pfile, const cpp_string *string)
    return 1;
  }
  
+/* After parsing an identifier or other sequence, produce a warning about
+   sequences not in NFC/NFKC.  */
+static void
+warn_about_normalization (cpp_reader *pfile, 
+                         const cpp_token *token,
+                         const struct normalize_state *s)
+{
+  if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
+      && !pfile->state.skipping)
+    {
+      /* Make sure that the token is printed using UCNs, even
+        if we'd otherwise happily print UTF-8.  */
+      unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
+      size_t sz;
+
+      sz = cpp_spell_token (pfile, token, buf, false) - buf;
+      if (NORMALIZE_STATE_RESULT (s) == normalized_C)
+       cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
+                              "`%.*s' is not in NFKC", (int) sz, buf);
+      else
+       cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
+                              "`%.*s' is not in NFC", (int) sz, buf);
+    }
+}
+
  /* Returns TRUE if the sequence starting at buffer->cur is invalid in
     an identifier.  FIRST is TRUE if this starts an identifier.  */
  static bool
-forms_identifier_p (cpp_reader *pfile, int first)
+forms_identifier_p (cpp_reader *pfile, int first,
+                   struct normalize_state *state)
  {
    cpp_buffer *buffer = pfile->buffer;
  
@@ -453,11 +495,13 @@ forms_identifier_p (cpp_reader *pfile, int first)
      }
  
    /* Is this a syntactically valid UCN?  */
-  if (0 && *buffer->cur == '\\'
+  if (CPP_OPTION (pfile, extended_identifiers)
+      && *buffer->cur == '\\'
        && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
      {
        buffer->cur += 2;
-      if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first))
+      if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+                         state))
         return true;
        buffer->cur -= 2;
      }
@@ -465,41 +509,102 @@ forms_identifier_p (cpp_reader *pfile, int first)
    return false;
  }
  
-/* Lex an identifier starting at BUFFER->CUR - 1.  */
+/* Helper function to get the cpp_hashnode of the identifier BASE.  */
  static cpp_hashnode *
-lex_identifier (cpp_reader *pfile, const uchar *base)
+lex_identifier_intern (cpp_reader *pfile, const uchar *base)
  {
    cpp_hashnode *result;
-  const uchar *cur, *limit;
+  const uchar *cur;
    unsigned int len;
    unsigned int hash = HT_HASHSTEP (0, *base);
  
-  cur = pfile->buffer->cur;
-  for (;;)
+  cur = base + 1;
+  while (ISIDNUM (*cur))
      {
-      /* N.B. ISIDNUM does not include $.  */
-      while (ISIDNUM (*cur))
-       {
-         hash = HT_HASHSTEP (hash, *cur);
-         cur++;
-       }
-
-      pfile->buffer->cur = cur;
-      if (!forms_identifier_p (pfile, false))
-       break;
-
-      limit = pfile->buffer->cur;
-      while (cur < limit)
-       {
-         hash = HT_HASHSTEP (hash, *cur);
-         cur++;
-       }
+      hash = HT_HASHSTEP (hash, *cur);
+      cur++;
      }
    len = cur - base;
    hash = HT_HASHFINISH (hash, len);
+  result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
+                                             base, len, hash, HT_ALLOC));
+
+  /* Rarely, identifiers require diagnostics when lexed.  */
+  if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
+                       && !pfile->state.skipping, 0))
+    {
+      /* It is allowed to poison the same identifier twice.  */
+      if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
+       cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
+                  NODE_NAME (result));
+
+      /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
+        replacement list of a variadic macro.  */
+      if (result == pfile->spec_nodes.n__VA_ARGS__
+         && !pfile->state.va_args_ok)
+       cpp_error (pfile, CPP_DL_PEDWARN,
+                  "__VA_ARGS__ can only appear in the expansion"
+                  " of a C99 variadic macro");
+
+      /* For -Wc++-compat, warn about use of C++ named operators.  */
+      if (result->flags & NODE_WARN_OPERATOR)
+       cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
+                    "identifier \"%s\" is a special operator name in C++",
+                    NODE_NAME (result));
+    }
+
+  return result;
+}
+
+/* Get the cpp_hashnode of an identifier specified by NAME in
+   the current cpp_reader object.  If none is found, NULL is returned.  */
+cpp_hashnode *
+_cpp_lex_identifier (cpp_reader *pfile, const char *name)
+{
+  cpp_hashnode *result;
+  result = lex_identifier_intern (pfile, (uchar *) name);
+  return result;
+}
  
-  result = (cpp_hashnode *)
-    ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+/* Lex an identifier starting at BUFFER->CUR - 1.  */
+static cpp_hashnode *
+lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
+               struct normalize_state *nst)
+{
+  cpp_hashnode *result;
+  const uchar *cur;
+  unsigned int len;
+  unsigned int hash = HT_HASHSTEP (0, *base);
+
+  cur = pfile->buffer->cur;
+  if (! starts_ucn)
+    while (ISIDNUM (*cur))
+      {
+       hash = HT_HASHSTEP (hash, *cur);
+       cur++;
+      }
+  pfile->buffer->cur = cur;
+  if (starts_ucn || forms_identifier_p (pfile, false, nst))
+    {
+      /* Slower version for identifiers containing UCNs (or $).  */
+      do {
+       while (ISIDNUM (*pfile->buffer->cur))
+         {
+           pfile->buffer->cur++;
+           NORMALIZE_STATE_UPDATE_IDNUM (nst);
+         }
+      } while (forms_identifier_p (pfile, false, nst));
+      result = _cpp_interpret_identifier (pfile, base,
+                                         pfile->buffer->cur - base);
+    }
+  else
+    {
+      len = cur - base;
+      hash = HT_HASHFINISH (hash, len);
+
+      result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
+                                                 base, len, hash, HT_ALLOC));
+    }
  
    /* Rarely, identifiers require diagnostics when lexed.  */
    if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
@@ -517,6 +622,12 @@ lex_identifier (cpp_reader *pfile, const uchar *base)
         cpp_error (pfile, CPP_DL_PEDWARN,
                    "__VA_ARGS__ can only appear in the expansion"
                    " of a C99 variadic macro");
+
+      /* For -Wc++-compat, warn about use of C++ named operators.  */
+      if (result->flags & NODE_WARN_OPERATOR)
+       cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
+                    "identifier \"%s\" is a special operator name in C++",
+                    NODE_NAME (result));
      }
  
    return result;
@@ -524,7 +635,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base)
  
  /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
  static void
-lex_number (cpp_reader *pfile, cpp_string *number)
+lex_number (cpp_reader *pfile, cpp_string *number,
+           struct normalize_state *nst)
  {
    const uchar *cur;
    const uchar *base;
@@ -537,11 +649,14 @@ lex_number (cpp_reader *pfile, cpp_string *number)
  
        /* N.B. ISIDNUM does not include $.  */
        while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
-       cur++;
+       {
+         cur++;
+         NORMALIZE_STATE_UPDATE_IDNUM (nst);
+       }
  
        pfile->buffer->cur = cur;
      }
-  while (forms_identifier_p (pfile, false));
+  while (forms_identifier_p (pfile, false, nst));
  
    number->len = cur - base;
    dest = _cpp_unaligned_alloc (pfile, number->len + 1);
@@ -564,10 +679,291 @@ create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
    token->val.str.text = dest;
  }
  
+/* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
+   sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
+
+static void
+bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
+               _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
+{
+  _cpp_buff *first_buff = *first_buff_p;
+  _cpp_buff *last_buff = *last_buff_p;
+
+  if (first_buff == NULL)
+    first_buff = last_buff = _cpp_get_buff (pfile, len);
+  else if (len > BUFF_ROOM (last_buff))
+    {
+      size_t room = BUFF_ROOM (last_buff);
+      memcpy (BUFF_FRONT (last_buff), base, room);
+      BUFF_FRONT (last_buff) += room;
+      base += room;
+      len -= room;
+      last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
+    }
+
+  memcpy (BUFF_FRONT (last_buff), base, len);
+  BUFF_FRONT (last_buff) += len;
+
+  *first_buff_p = first_buff;
+  *last_buff_p = last_buff;
+}
+
+/* Lexes a raw string.  The stored string contains the spelling, including
+   double quotes, delimiter string, '(' and ')', any leading
+   'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
+   literal, or CPP_OTHER if it was not properly terminated.
+
+   The spelling is NUL-terminated, but it is not guaranteed that this
+   is the first NUL since embedded NULs are preserved.  */
+
+static void
+lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
+               const uchar *cur)
+{
+  source_location saw_NUL = 0;
+  const uchar *raw_prefix;
+  unsigned int raw_prefix_len = 0;
+  enum cpp_ttype type;
+  size_t total_len = 0;
+  _cpp_buff *first_buff = NULL, *last_buff = NULL;
+  _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
+
+  type = (*base == 'L' ? CPP_WSTRING :
+         *base == 'U' ? CPP_STRING32 :
+         *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
+         : CPP_STRING);
+
+  raw_prefix = cur + 1;
+  while (raw_prefix_len < 16)
+    {
+      switch (raw_prefix[raw_prefix_len])
+       {
+       case ' ': case '(': case ')': case '\\': case '\t':
+       case '\v': case '\f': case '\n': default:
+         break;
+       /* Basic source charset except the above chars.  */
+       case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+       case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+       case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+       case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+       case 'y': case 'z':
+       case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+       case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+       case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+       case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+       case 'Y': case 'Z':
+       case '0': case '1': case '2': case '3': case '4': case '5':
+       case '6': case '7': case '8': case '9':
+       case '_': case '{': case '}': case '#': case '[': case ']':
+       case '<': case '>': case '%': case ':': case ';': case '.':
+       case '?': case '*': case '+': case '-': case '/': case '^':
+       case '&': case '|': case '~': case '!': case '=': case ',':
+       case '"': case '\'':
+         raw_prefix_len++;
+         continue;
+       }
+      break;
+    }
+
+  if (raw_prefix[raw_prefix_len] != '(')
+    {
+      int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
+               + 1;
+      if (raw_prefix_len == 16)
+       cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
+                            "raw string delimiter longer than 16 characters");
+      else
+       cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
+                            "invalid character '%c' in raw string delimiter",
+                            (int) raw_prefix[raw_prefix_len]);
+      pfile->buffer->cur = raw_prefix - 1;
+      create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
+      return;
+    }
+
+  cur = raw_prefix + raw_prefix_len + 1;
+  for (;;)
+    {
+#define BUF_APPEND(STR,LEN)                                    \
+      do {                                                     \
+       bufring_append (pfile, (const uchar *)(STR), (LEN),     \
+                       &first_buff, &last_buff);               \
+       total_len += (LEN);                                     \
+      } while (0);
+
+      cppchar_t c;
+
+      /* If we previously performed any trigraph or line splicing
+        transformations, undo them within the body of the raw string.  */
+      while (note->pos < cur)
+       ++note;
+      for (; note->pos == cur; ++note)
+       {
+         switch (note->type)
+           {
+           case '\\':
+           case ' ':
+             /* Restore backslash followed by newline.  */
+             BUF_APPEND (base, cur - base);
+             base = cur;
+             BUF_APPEND ("\\", 1);
+           after_backslash:
+             if (note->type == ' ')
+               {
+                 /* GNU backslash whitespace newline extension.  FIXME
+                    could be any sequence of non-vertical space.  When we
+                    can properly restore any such sequence, we should mark
+                    this note as handled so _cpp_process_line_notes
+                    doesn't warn.  */
+                 BUF_APPEND (" ", 1);
+               }
+
+             BUF_APPEND ("\n", 1);
+             break;
+
+           case 0:
+             /* Already handled.  */
+             break;
+
+           default:
+             if (_cpp_trigraph_map[note->type])
+               {
+                 /* Don't warn about this trigraph in
+                    _cpp_process_line_notes, since trigraphs show up as
+                    trigraphs in raw strings.  */
+                 unsigned type = note->type;
+                 note->type = 0;
+
+                 if (!CPP_OPTION (pfile, trigraphs))
+                   /* If we didn't convert the trigraph in the first
+                      place, don't do anything now either.  */
+                   break;
+
+                 BUF_APPEND (base, cur - base);
+                 base = cur;
+                 BUF_APPEND ("??", 2);
+
+                 /* ??/ followed by newline gets two line notes, one for
+                    the trigraph and one for the backslash/newline.  */
+                 if (type == '/' && note[1].pos == cur)
+                   {
+                     if (note[1].type != '\\'
+                         && note[1].type != ' ')
+                       abort ();
+                     BUF_APPEND ("/", 1);
+                     ++note;
+                     goto after_backslash;
+                   }
+                 /* The ) from ??) could be part of the suffix.  */
+                 else if (type == ')'
+                          && strncmp ((const char *) cur+1,
+                                      (const char *) raw_prefix,
+                                      raw_prefix_len) == 0
+                          && cur[raw_prefix_len+1] == '"')
+                   {
+                     cur += raw_prefix_len+2;
+                     goto break_outer_loop;
+                   }
+                 else
+                   {
+                     /* Skip the replacement character.  */
+                     base = ++cur;
+                     BUF_APPEND (&type, 1);
+                   }
+               }
+             else
+               abort ();
+             break;
+           }
+       }
+      c = *cur++;
+
+      if (c == ')'
+         && strncmp ((const char *) cur, (const char *) raw_prefix,
+                     raw_prefix_len) == 0
+         && cur[raw_prefix_len] == '"')
+       {
+         cur += raw_prefix_len + 1;
+         break;
+       }
+      else if (c == '\n')
+       {
+         if (pfile->state.in_directive
+             || pfile->state.parsing_args
+             || pfile->state.in_deferred_pragma)
+           {
+             cur--;
+             type = CPP_OTHER;
+             cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
+                                  "unterminated raw string");
+             break;
+           }
+
+         BUF_APPEND (base, cur - base);
+
+         if (pfile->buffer->cur < pfile->buffer->rlimit)
+           CPP_INCREMENT_LINE (pfile, 0);
+         pfile->buffer->need_line = true;
+
+         pfile->buffer->cur = cur-1;
+         _cpp_process_line_notes (pfile, false);
+         if (!_cpp_get_fresh_line (pfile))
+           {
+             source_location src_loc = token->src_loc;
+             token->type = CPP_EOF;
+             /* Tell the compiler the line number of the EOF token.  */
+             token->src_loc = pfile->line_table->highest_line;
+             token->flags = BOL;
+             if (first_buff != NULL)
+               _cpp_release_buff (pfile, first_buff);
+             cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
+                                  "unterminated raw string");
+             return;
+           }
+
+         cur = base = pfile->buffer->cur;
+         note = &pfile->buffer->notes[pfile->buffer->cur_note];
+       }
+      else if (c == '\0' && !saw_NUL)
+       LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table,
+                                    CPP_BUF_COLUMN (pfile->buffer, cur));
+    }
+ break_outer_loop:
+
+  if (saw_NUL && !pfile->state.skipping)
+    cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0,
+              "null character(s) preserved in literal");
+
+  pfile->buffer->cur = cur;
+  if (first_buff == NULL)
+    create_literal (pfile, token, base, cur - base, type);
+  else
+    {
+      uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
+
+      token->type = type;
+      token->val.str.len = total_len + (cur - base);
+      token->val.str.text = dest;
+      last_buff = first_buff;
+      while (last_buff != NULL)
+       {
+         memcpy (dest, last_buff->base,
+                 BUFF_FRONT (last_buff) - last_buff->base);
+         dest += BUFF_FRONT (last_buff) - last_buff->base;
+         last_buff = last_buff->next;
+       }
+      _cpp_release_buff (pfile, first_buff);
+      memcpy (dest, base, cur - base);
+      dest[cur - base] = '\0';
+    }
+}
+
  /* Lexes a string, character constant, or angle-bracketed header file
     name.  The stored string contains the spelling, including opening
-   quote and leading any leading 'L'.  It returns the type of the
-   literal, or CPP_OTHER if it was not properly terminated.
+   quote and any leading 'L', 'u', 'U' or 'u8' and optional
+   'R' modifier.  It returns the type of the literal, or CPP_OTHER
+   if it was not properly terminated, or CPP_LESS for an unterminated
+   header name which must be relexed as normal tokens.
  
     The spelling is NUL-terminated, but it is not guaranteed that this
     is the first NUL since embedded NULs are preserved.  */
@@ -581,12 +977,28 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
  
    cur = base;
    terminator = *cur++;
-  if (terminator == 'L')
+  if (terminator == 'L' || terminator == 'U')
      terminator = *cur++;
-  if (terminator == '\"')
-    type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
+  else if (terminator == 'u')
+    {
+      terminator = *cur++;
+      if (terminator == '8')
+       terminator = *cur++;
+    }
+  if (terminator == 'R')
+    {
+      lex_raw_string (pfile, token, base, cur);
+      return;
+    }
+  if (terminator == '"')
+    type = (*base == 'L' ? CPP_WSTRING :
+           *base == 'U' ? CPP_STRING32 :
+           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
+                        : CPP_STRING);
    else if (terminator == '\'')
-    type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
+    type = (*base == 'L' ? CPP_WCHAR :
+           *base == 'U' ? CPP_CHAR32 :
+           *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
    else
      terminator = '>', type = CPP_HEADER_NAME;
  
@@ -602,6 +1014,14 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
        else if (c == '\n')
         {
           cur--;
+         /* Unmatched quotes always yield undefined behavior, but
+            greedy lexing means that what appears to be an unterminated
+            header name may actually be a legitimate sequence of tokens.  */
+         if (terminator == '>')
+           {
+             token->type = CPP_LESS;
+             return;
+           }
           type = CPP_OTHER;
           break;
         }
@@ -613,10 +1033,59 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
      cpp_error (pfile, CPP_DL_WARNING,
                "null character(s) preserved in literal");
  
+  if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
+    cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
+              (int) terminator);
+
    pfile->buffer->cur = cur;
    create_literal (pfile, token, base, cur - base, type);
  }
  
+/* Return the comment table. The client may not make any assumption
+   about the ordering of the table.  */
+cpp_comment_table *
+cpp_get_comments (cpp_reader *pfile)
+{
+  return &pfile->comments;
+}
+
+/* Append a comment to the end of the comment table. */
+static void 
+store_comment (cpp_reader *pfile, cpp_token *token) 
+{
+  int len;
+
+  if (pfile->comments.allocated == 0)
+    {
+      pfile->comments.allocated = 256; 
+      pfile->comments.entries = (cpp_comment *) xmalloc
+       (pfile->comments.allocated * sizeof (cpp_comment));
+    }
+
+  if (pfile->comments.count == pfile->comments.allocated)
+    {
+      pfile->comments.allocated *= 2;
+      pfile->comments.entries = (cpp_comment *) xrealloc
+       (pfile->comments.entries,
+        pfile->comments.allocated * sizeof (cpp_comment));
+    }
+
+  len = token->val.str.len;
+
+  /* Copy comment. Note, token may not be NULL terminated. */
+  pfile->comments.entries[pfile->comments.count].comment = 
+    (char *) xmalloc (sizeof (char) * (len + 1));
+  memcpy (pfile->comments.entries[pfile->comments.count].comment,
+         token->val.str.text, len);
+  pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
+
+  /* Set source location. */
+  pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
+
+  /* Increment the count of entries in the comment table. */
+  pfile->comments.count++;
+}
+
  /* The stored comment includes the comment start and any terminator.  */
  static void
  save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
@@ -656,13 +1125,16 @@ save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
        buffer[clen - 2] = '*';
        buffer[clen - 1] = '/';
      }
+
+  /* Finally store this comment for use by clients of libcpp. */
+  store_comment (pfile, token);
  }
  
  /* Allocate COUNT tokens for RUN.  */
  void
  _cpp_init_tokenrun (tokenrun *run, unsigned int count)
  {
-  run->base = xnewvec (cpp_token, count);
+  run->base = XNEWVEC (cpp_token, count);
    run->limit = run->base + count;
    run->next = NULL;
  }
@@ -673,7 +1145,7 @@ next_tokenrun (tokenrun *run)
  {
    if (run->next == NULL)
      {
-      run->next = xnew (tokenrun);
+      run->next = XNEW (tokenrun);
        run->next->prev = run;
        _cpp_init_tokenrun (run->next, 250);
      }
@@ -681,6 +1153,49 @@ next_tokenrun (tokenrun *run)
    return run->next;
  }
  
+/* Look ahead in the input stream.  */
+const cpp_token *
+cpp_peek_token (cpp_reader *pfile, int index)
+{
+  cpp_context *context = pfile->context;
+  const cpp_token *peektok;
+  int count;
+
+  /* First, scan through any pending cpp_context objects.  */
+  while (context->prev)
+    {
+      ptrdiff_t sz = (context->direct_p
+                      ? LAST (context).token - FIRST (context).token
+                      : LAST (context).ptoken - FIRST (context).ptoken);
+
+      if (index < (int) sz)
+        return (context->direct_p
+                ? FIRST (context).token + index
+                : *(FIRST (context).ptoken + index));
+
+      index -= (int) sz;
+      context = context->prev;
+    }
+
+  /* We will have to read some new tokens after all (and do so
+     without invalidating preceding tokens).  */
+  count = index;
+  pfile->keep_tokens++;
+
+  do
+    {
+      peektok = _cpp_lex_token (pfile);
+      if (peektok->type == CPP_EOF)
+       return peektok;
+    }
+  while (index--);
+
+  _cpp_backup_tokens_direct (pfile, count + 1);
+  pfile->keep_tokens--;
+
+  return peektok;
+}
+
  /* Allocate a single token that is invalidated at the same time as the
     rest of the tokens on the line.  Has its line and col set to the
     same as the last lexed token, so that diagnostics appear in the
@@ -689,9 +1204,30 @@ cpp_token *
  _cpp_temp_token (cpp_reader *pfile)
  {
    cpp_token *old, *result;
+  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
+  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
  
    old = pfile->cur_token - 1;
-  if (pfile->cur_token == pfile->cur_run->limit)
+  /* Any pre-existing lookaheads must not be clobbered.  */
+  if (la)
+    {
+      if (sz <= la)
+        {
+          tokenrun *next = next_tokenrun (pfile->cur_run);
+
+          if (sz < la)
+            memmove (next->base + 1, next->base,
+                     (la - sz) * sizeof (cpp_token));
+
+          next->base[0] = pfile->cur_run->limit[-1];
+        }
+
+      if (sz > 1)
+        memmove (pfile->cur_token + 1, pfile->cur_token,
+                 MIN (la, sz - 1) * sizeof (cpp_token));
+    }
+
+  if (!sz && pfile->cur_token == pfile->cur_run->limit)
      {
        pfile->cur_run = next_tokenrun (pfile->cur_run);
        pfile->cur_token = pfile->cur_run->base;
@@ -717,6 +1253,11 @@ _cpp_lex_token (cpp_reader *pfile)
           pfile->cur_run = next_tokenrun (pfile->cur_run);
           pfile->cur_token = pfile->cur_run->base;
         }
+      /* We assume that the current token is somewhere in the current
+        run.  */
+      if (pfile->cur_token < pfile->cur_run->base
+         || pfile->cur_token >= pfile->cur_run->limit)
+       abort ();
  
        if (pfile->lookaheads)
         {
@@ -734,15 +1275,24 @@ _cpp_lex_token (cpp_reader *pfile)
               /* 6.10.3 p 11: Directives in a list of macro arguments
                  gives undefined behavior.  This implementation
                  handles the directive as normal.  */
-             && pfile->state.parsing_args != 1
-             && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
-           continue;
+             && pfile->state.parsing_args != 1)
+           {
+             if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
+               {
+                 if (pfile->directive_result.type == CPP_PADDING)
+                   continue;
+                 result = &pfile->directive_result;
+               }
+           }
+         else if (pfile->state.in_deferred_pragma)
+           result = &pfile->directive_result;
+
           if (pfile->cb.line_change && !pfile->state.skipping)
             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
         }
  
        /* We don't skip tokens in directives.  */
-      if (pfile->state.in_directive)
+      if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
         break;
  
        /* Outside a directive, invalidate controlling macros.  At file
@@ -789,11 +1339,8 @@ _cpp_get_fresh_line (cpp_reader *pfile)
           && buffer->next_line > buffer->rlimit
           && !buffer->from_stage3)
         {
-         /* Only warn once.  */
+         /* Clip to buffer size.  */
           buffer->next_line = buffer->rlimit;
-         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
-                              CPP_BUF_COLUMN (buffer, buffer->cur),
-                              "no newline at end of file");
         }
  
        return_at_eof = buffer->return_at_eof;
@@ -836,6 +1383,14 @@ _cpp_lex_direct (cpp_reader *pfile)
    buffer = pfile->buffer;
    if (buffer->need_line)
      {
+      if (pfile->state.in_deferred_pragma)
+       {
+         result->type = CPP_PRAGMA_EOL;
+         pfile->state.in_deferred_pragma = false;
+         if (!pfile->state.pragma_allow_expansion)
+           pfile->state.prevent_expansion--;
+         return result;
+       }
        if (!_cpp_get_fresh_line (pfile))
         {
           result->type = CPP_EOF;
@@ -888,16 +1443,36 @@ _cpp_lex_direct (cpp_reader *pfile)
  
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
-      result->type = CPP_NUMBER;
-      lex_number (pfile, &result->val.str);
-      break;
+      {
+       struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+       result->type = CPP_NUMBER;
+       lex_number (pfile, &result->val.str, &nst);
+       warn_about_normalization (pfile, result, &nst);
+       break;
+      }
  
      case 'L':
-      /* 'L' may introduce wide characters or strings.  */
-      if (*buffer->cur == '\'' || *buffer->cur == '"')
+    case 'u':
+    case 'U':
+    case 'R':
+      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
+        wide strings or raw strings.  */
+      if (c == 'L' || CPP_OPTION (pfile, uliterals))
         {
-         lex_string (pfile, result, buffer->cur - 1);
-         break;
+         if ((*buffer->cur == '\'' && c != 'R')
+             || *buffer->cur == '"'
+             || (*buffer->cur == 'R'
+                 && c != 'R'
+                 && buffer->cur[1] == '"'
+                 && CPP_OPTION (pfile, uliterals))
+             || (*buffer->cur == '8'
+                 && c == 'u'
+                 && (buffer->cur[1] == '"'
+                     || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
+           {
+             lex_string (pfile, result, buffer->cur - 1);
+             break;
+           }
         }
        /* Fall through.  */
  
@@ -905,21 +1480,26 @@ _cpp_lex_direct (cpp_reader *pfile)
      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
      case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
      case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
-    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+    case 's': case 't':           case 'v': case 'w': case 'x':
      case 'y': case 'z':
      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
      case 'G': case 'H': case 'I': case 'J': case 'K':
-    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
-    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+    case 'M': case 'N': case 'O': case 'P': case 'Q':
+    case 'S': case 'T':           case 'V': case 'W': case 'X':
      case 'Y': case 'Z':
        result->type = CPP_NAME;
-      result->val.node = lex_identifier (pfile, buffer->cur - 1);
+      {
+       struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+       result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
+                                               &nst);
+       warn_about_normalization (pfile, result, &nst);
+      }
  
        /* Convert named operators to their proper types.  */
-      if (result->val.node->flags & NODE_OPERATOR)
+      if (result->val.node.node->flags & NODE_OPERATOR)
         {
           result->flags |= NAMED_OP;
-         result->type = result->val.node->directive_index;
+         result->type = (enum cpp_ttype) result->val.node.node->directive_index;
         }
        break;
  
@@ -954,7 +1534,7 @@ _cpp_lex_direct (cpp_reader *pfile)
             }
  
           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
-           cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
+           cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
         }
        else if (c == '=')
         {
@@ -982,7 +1562,8 @@ _cpp_lex_direct (cpp_reader *pfile)
        if (pfile->state.angled_headers)
         {
           lex_string (pfile, result, buffer->cur - 1);
-         break;
+         if (result->type != CPP_LESS)
+           break;
         }
  
        result->type = CPP_LESS;
@@ -993,11 +1574,6 @@ _cpp_lex_direct (cpp_reader *pfile)
           buffer->cur++;
           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
         }
-      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
-       {
-         buffer->cur++;
-         IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
-       }
        else if (CPP_OPTION (pfile, digraphs))
         {
           if (*buffer->cur == ':')
@@ -1024,11 +1600,6 @@ _cpp_lex_direct (cpp_reader *pfile)
           buffer->cur++;
           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
         }
-      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
-       {
-         buffer->cur++;
-         IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
-       }
        break;
  
      case '%':
@@ -1043,7 +1614,7 @@ _cpp_lex_direct (cpp_reader *pfile)
               result->flags |= DIGRAPH;
               result->type = CPP_HASH;
               if (*buffer->cur == '%' && buffer->cur[1] == ':')
-               buffer->cur += 2, result->type = CPP_PASTE;
+               buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
             }
           else if (*buffer->cur == '>')
             {
@@ -1058,8 +1629,10 @@ _cpp_lex_direct (cpp_reader *pfile)
        result->type = CPP_DOT;
        if (ISDIGIT (*buffer->cur))
         {
+         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
           result->type = CPP_NUMBER;
-         lex_number (pfile, &result->val.str);
+         lex_number (pfile, &result->val.str, &nst);
+         warn_about_normalization (pfile, result, &nst);
         }
        else if (*buffer->cur == '.' && buffer->cur[1] == '.')
         buffer->cur += 2, result->type = CPP_ELLIPSIS;
@@ -1122,7 +1695,7 @@ _cpp_lex_direct (cpp_reader *pfile)
      case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
      case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
      case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
-    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
+    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
  
      case '?': result->type = CPP_QUERY; break;
      case '~': result->type = CPP_COMPL; break;
@@ -1142,11 +1715,13 @@ _cpp_lex_direct (cpp_reader *pfile)
      case '\\':
        {
         const uchar *base = --buffer->cur;
+       struct normalize_state nst = INITIAL_NORMALIZE_STATE;
  
-       if (forms_identifier_p (pfile, true))
+       if (forms_identifier_p (pfile, true, &nst))
           {
             result->type = CPP_NAME;
-           result->val.node = lex_identifier (pfile, base);
+           result->val.node.node = lex_identifier (pfile, base, true, &nst);
+           warn_about_normalization (pfile, result, &nst);
             break;
           }
         buffer->cur++;
@@ -1169,21 +1744,65 @@ cpp_token_len (const cpp_token *token)
  
    switch (TOKEN_SPELL (token))
      {
-    default:           len = 4;                                break;
+    default:           len = 6;                                break;
      case SPELL_LITERAL:        len = token->val.str.len;               break;
-    case SPELL_IDENT:  len = NODE_LEN (token->val.node);       break;
+    case SPELL_IDENT:  len = NODE_LEN (token->val.node.node) * 10;     break;
      }
  
    return len;
  }
  
+/* Parse UTF-8 out of NAMEP and place a \U escape in BUFFER.
+   Return the number of bytes read out of NAME.  (There are always
+   10 bytes written to BUFFER.)  */
+
+static size_t
+utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
+{
+  int j;
+  int ucn_len = 0;
+  int ucn_len_c;
+  unsigned t;
+  unsigned long utf32;
+  
+  /* Compute the length of the UTF-8 sequence.  */
+  for (t = *name; t & 0x80; t <<= 1)
+    ucn_len++;
+  
+  utf32 = *name & (0x7F >> ucn_len);
+  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
+    {
+      utf32 = (utf32 << 6) | (*++name & 0x3F);
+      
+      /* Ill-formed UTF-8.  */
+      if ((*name & ~0x3F) != 0x80)
+       abort ();
+    }
+  
+  *buffer++ = '\\';
+  *buffer++ = 'U';
+  for (j = 7; j >= 0; j--)
+    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
+  return ucn_len;
+}
+
+/* Given a token TYPE corresponding to a digraph, return a pointer to
+   the spelling of the digraph.  */
+static const unsigned char *
+cpp_digraph2name (enum cpp_ttype type)
+{
+  return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
+}
+
  /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
     already contain the enough space to hold the token's spelling.
     Returns a pointer to the character after the last character written.
+   FORSTRING is true if this is to be the spelling after translation
+   phase 1 (this is different for UCNs).
     FIXME: Would be nice if we didn't need the PFILE argument.  */
  unsigned char *
  cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
-                unsigned char *buffer)
+                unsigned char *buffer, bool forstring)
  {
    switch (TOKEN_SPELL (token))
      {
@@ -1193,8 +1812,7 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
         unsigned char c;
  
         if (token->flags & DIGRAPH)
-         spelling
-           = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
+         spelling = cpp_digraph2name (token->type);
         else if (token->flags & NAMED_OP)
           goto spell_ident;
         else
@@ -1207,8 +1825,26 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
  
      spell_ident:
      case SPELL_IDENT:
-      memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
-      buffer += NODE_LEN (token->val.node);
+      if (forstring)
+       {
+         memcpy (buffer, NODE_NAME (token->val.node.node),
+                 NODE_LEN (token->val.node.node));
+         buffer += NODE_LEN (token->val.node.node);
+       }
+      else
+       {
+         size_t i;
+         const unsigned char * name = NODE_NAME (token->val.node.node);
+         
+         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
+           if (name[i] & ~0x7F)
+             {
+               i += utf8_to_ucn (buffer, name + i) - 1;
+               buffer += 10;
+             }
+           else
+             *buffer++ = NODE_NAME (token->val.node.node)[i];
+       }
        break;
  
      case SPELL_LITERAL:
@@ -1233,17 +1869,23 @@ cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
    unsigned int len = cpp_token_len (token) + 1;
    unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
  
-  end = cpp_spell_token (pfile, token, start);
+  end = cpp_spell_token (pfile, token, start, false);
    end[0] = '\0';
  
    return start;
  }
  
-/* Used by C front ends, which really should move to using
-   cpp_token_as_text.  */
+/* Returns a pointer to a string which spells the token defined by
+   TYPE and FLAGS.  Used by C front ends, which really should move to
+   using cpp_token_as_text.  */
  const char *
-cpp_type2name (enum cpp_ttype type)
+cpp_type2name (enum cpp_ttype type, unsigned char flags)
  {
+  if (flags & DIGRAPH)
+    return (const char *) cpp_digraph2name (type);
+  else if (flags & NAMED_OP)
+    return cpp_named_operator2name (type);
+
    return (const char *) token_spellings[type].name;
  }
  
@@ -1261,8 +1903,7 @@ cpp_output_token (const cpp_token *token, FILE *fp)
         int c;
  
         if (token->flags & DIGRAPH)
-         spelling
-           = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
+         spelling = cpp_digraph2name (token->type);
         else if (token->flags & NAMED_OP)
           goto spell_ident;
         else
@@ -1277,8 +1918,21 @@ cpp_output_token (const cpp_token *token, FILE *fp)
  
      spell_ident:
      case SPELL_IDENT:
-      fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
-    break;
+      {
+       size_t i;
+       const unsigned char * name = NODE_NAME (token->val.node.node);
+       
+       for (i = 0; i < NODE_LEN (token->val.node.node); i++)
+         if (name[i] & ~0x7F)
+           {
+             unsigned char buffer[10];
+             i += utf8_to_ucn (buffer, name + i) - 1;
+             fwrite (buffer, 1, 10, fp);
+           }
+         else
+           fputc (NODE_NAME (token->val.node.node)[i], fp);
+      }
+      break;
  
      case SPELL_LITERAL:
        fwrite (token->val.str.text, 1, token->val.str.len, fp);
@@ -1299,11 +1953,14 @@ _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
        {
        default:                 /* Keep compiler happy.  */
        case SPELL_OPERATOR:
-       return 1;
+       /* token_no is used to track where multiple consecutive ##
+          tokens were originally located.  */
+       return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
        case SPELL_NONE:
-       return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
+       return (a->type != CPP_MACRO_ARG
+               || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
        case SPELL_IDENT:
-       return a->val.node == b->val.node;
+       return a->val.node.node == b->val.node.node;
        case SPELL_LITERAL:
         return (a->val.str.len == b->val.str.len
                 && !memcmp (a->val.str.text, b->val.str.text,
@@ -1341,8 +1998,8 @@ cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
  
    switch (a)
      {
-    case CPP_GREATER:  return c == '>' || c == '?';
-    case CPP_LESS:     return c == '<' || c == '?' || c == '%' || c == ':';
+    case CPP_GREATER:  return c == '>';
+    case CPP_LESS:     return c == '<' || c == '%' || c == ':';
      case CPP_PLUS:     return c == '+';
      case CPP_MINUS:    return c == '-' || c == '>';
      case CPP_DIV:      return c == '/' || c == '*'; /* Comments.  */
@@ -1391,6 +2048,51 @@ cpp_output_line (cpp_reader *pfile, FILE *fp)
    putc ('\n', fp);
  }
  
+/* Return a string representation of all the remaining tokens on the
+   current line.  The result is allocated using xmalloc and must be
+   freed by the caller.  */
+unsigned char *
+cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
+{
+  const cpp_token *token;
+  unsigned int out = dir_name ? ustrlen (dir_name) : 0;
+  unsigned int alloced = 120 + out;
+  unsigned char *result = (unsigned char *) xmalloc (alloced);
+
+  /* If DIR_NAME is empty, there are no initial contents.  */
+  if (dir_name)
+    {
+      sprintf ((char *) result, "#%s ", dir_name);
+      out += 2;
+    }
+
+  token = cpp_get_token (pfile);
+  while (token->type != CPP_EOF)
+    {
+      unsigned char *last;
+      /* Include room for a possible space and the terminating nul.  */
+      unsigned int len = cpp_token_len (token) + 2;
+
+      if (out + len > alloced)
+       {
+         alloced *= 2;
+         if (out + len > alloced)
+           alloced = out + len;
+         result = (unsigned char *) xrealloc (result, alloced);
+       }
+
+      last = cpp_spell_token (pfile, token, &result[out], 0);
+      out = last - result;
+
+      token = cpp_get_token (pfile);
+      if (token->flags & PREV_WHITE)
+       result[out++] = ' ';
+    }
+
+  result[out] = '\0';
+  return result;
+}
+
  /* Memory buffers.  Changing these three constants can have a dramatic
     effect on performance.  The values here are reasonable defaults,
     but might be tuned.  If you adjust them, be sure to test across a
@@ -1418,7 +2120,7 @@ new_buff (size_t len)
      len = MIN_BUFF_SIZE;
    len = CPP_ALIGN (len);
  
-  base = xmalloc (len + sizeof (_cpp_buff));
+  base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
    result = (_cpp_buff *) (base + len);
    result->base = base;
    result->cur = base;
@@ -1568,11 +2270,18 @@ cpp_token_val_index (cpp_token *tok)
        return CPP_TOKEN_FLD_NODE;
      case SPELL_LITERAL:
        return CPP_TOKEN_FLD_STR;
+    case SPELL_OPERATOR:
+      if (tok->type == CPP_PASTE)
+       return CPP_TOKEN_FLD_TOKEN_NO;
+      else
+       return CPP_TOKEN_FLD_NONE;
      case SPELL_NONE:
        if (tok->type == CPP_MACRO_ARG)
         return CPP_TOKEN_FLD_ARG_NO;
        else if (tok->type == CPP_PADDING)
         return CPP_TOKEN_FLD_SOURCE;
+      else if (tok->type == CPP_PRAGMA)
+       return CPP_TOKEN_FLD_PRAGMA;
        /* else fall through */
      default:
        return CPP_TOKEN_FLD_NONE;