PR bootstrap/38862

[pf3gnuchains/gcc-fork.git] / libcpp / lex.c
diff --git a/libcpp/lex.c b/libcpp/lex.c

index 37df6ef..96d1a99 100644 (file)
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1,5 +1,5 @@
  /* CPP Library - lexical analysis.
-   Copyright (C) 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
     Contributed by Per Bothner, 1994-95.
     Based on CCCP program by Paul Rubin, June 1986
     Adapted to ANSI C, Richard Stallman, Jan 1987
@@ -17,7 +17,7 @@ GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
-Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  
  #include "config.h"
  #include "system.h"
@@ -39,10 +39,10 @@ struct token_spelling
  };
  
  static const unsigned char *const digraph_spellings[] =
-{ U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
+{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  
-#define OP(e, s) { SPELL_OPERATOR, U s           },
-#define TK(e, s) { s,              U #e },
+#define OP(e, s) { SPELL_OPERATOR, UC s  },
+#define TK(e, s) { SPELL_ ## s,    UC #e },
  static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  #undef OP
  #undef TK
@@ -53,11 +53,9 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  static int skip_line_comment (cpp_reader *);
  static void skip_whitespace (cpp_reader *, cppchar_t);
-static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *);
-static void lex_number (cpp_reader *, cpp_string *);
-static bool forms_identifier_p (cpp_reader *, int);
  static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
+static void store_comment (cpp_reader *, cpp_token *);
  static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                             unsigned int, enum cpp_ttype);
  static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
@@ -88,8 +86,8 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
    if (buffer->notes_used == buffer->notes_cap)
      {
        buffer->notes_cap = buffer->notes_cap * 2 + 200;
-      buffer->notes = xrealloc (buffer->notes,
-                               buffer->notes_cap * sizeof (_cpp_line_note));
+      buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
+                                  buffer->notes_cap);
      }
  
    buffer->notes[buffer->notes_used].pos = pos;
@@ -114,31 +112,39 @@ _cpp_clean_line (cpp_reader *pfile)
  
    if (!buffer->from_stage3)
      {
+      const uchar *pbackslash = NULL;
+
        /* Short circuit for the common case of an un-escaped line with
          no trigraphs.  The primary win here is by not writing any
          data back to memory until we have to.  */
        for (;;)
         {
           c = *++s;
-         if (c == '\n' || c == '\r')
+         if (__builtin_expect (c == '\n', false)
+             || __builtin_expect (c == '\r', false))
             {
               d = (uchar *) s;
  
-             if (s == buffer->rlimit)
+             if (__builtin_expect (s == buffer->rlimit, false))
                 goto done;
  
               /* DOS line ending? */
-             if (c == '\r' && s[1] == '\n')
-               s++;
+             if (__builtin_expect (c == '\r', false)
+                 && s[1] == '\n')
+               {
+                 s++;
+                 if (s == buffer->rlimit)
+                   goto done;
+               }
  
-             if (s == buffer->rlimit)
+             if (__builtin_expect (pbackslash == NULL, true))
                 goto done;
  
-             /* check for escaped newline */
+             /* Check for escaped newline.  */
               p = d;
-             while (p != buffer->next_line && is_nvspace (p[-1]))
+             while (is_nvspace (p[-1]))
                 p--;
-             if (p == buffer->next_line || p[-1] != '\\')
+             if (p - 1 != pbackslash)
                 goto done;
  
               /* Have an escaped newline; process it and proceed to
@@ -148,7 +154,11 @@ _cpp_clean_line (cpp_reader *pfile)
               buffer->next_line = p - 1;
               break;
             }
-         if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
+         if (__builtin_expect (c == '\\', false))
+           pbackslash = s;
+         else if (__builtin_expect (c == '?', false)
+                  && __builtin_expect (s[1] == '?', false)
+                  && _cpp_trigraph_map[s[2]])
             {
               /* Have a trigraph.  We may or may not have to convert
                  it.  Add a line note regardless, for -Wtrigraphs.  */
@@ -375,7 +385,7 @@ static int
  skip_line_comment (cpp_reader *pfile)
  {
    cpp_buffer *buffer = pfile->buffer;
-  unsigned int orig_line = pfile->line_table->highest_line;
+  source_location orig_line = pfile->line_table->highest_line;
  
    while (*buffer->cur != '\n')
      buffer->cur++;
@@ -430,10 +440,44 @@ name_p (cpp_reader *pfile, const cpp_string *string)
    return 1;
  }
  
+/* After parsing an identifier or other sequence, produce a warning about
+   sequences not in NFC/NFKC.
+
+   TOKEN is the token just lexed and S is the normalization state that
+   was gathered while lexing it.  Nothing is reported while skipping,
+   or when S's result does not exceed the warn_normalize option.  */
+static void
+warn_about_normalization (cpp_reader *pfile,
+                         const cpp_token *token,
+                         const struct normalize_state *s)
+{
+  if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
+      && !pfile->state.skipping)
+    {
+      /* Make sure that the token is printed using UCNs, even
+        if we'd otherwise happily print UTF-8.  */
+      unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
+      size_t sz;
+
+      sz = cpp_spell_token (pfile, token, buf, false) - buf;
+      if (NORMALIZE_STATE_RESULT (s) == normalized_C)
+       cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
+                            "`%.*s' is not in NFKC", (int) sz, buf);
+      else
+       cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
+                            "`%.*s' is not in NFC", (int) sz, buf);
+
+      /* BUF was only needed to build the message; release it so that
+        repeated warnings do not leak memory.  */
+      free (buf);
+    }
+}
+
  /* Returns TRUE if the sequence starting at buffer->cur is invalid in
     an identifier.  FIRST is TRUE if this starts an identifier.  */
  static bool
-forms_identifier_p (cpp_reader *pfile, int first)
+forms_identifier_p (cpp_reader *pfile, int first,
+                   struct normalize_state *state)
  {
    cpp_buffer *buffer = pfile->buffer;
  
@@ -453,11 +489,13 @@ forms_identifier_p (cpp_reader *pfile, int first)
      }
  
    /* Is this a syntactically valid UCN?  */
-  if (0 && *buffer->cur == '\\'
+  if (CPP_OPTION (pfile, extended_identifiers)
+      && *buffer->cur == '\\'
        && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
      {
        buffer->cur += 2;
-      if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first))
+      if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+                         state))
         return true;
        buffer->cur -= 2;
      }
@@ -467,39 +505,43 @@ forms_identifier_p (cpp_reader *pfile, int first)
  
  /* Lex an identifier starting at BUFFER->CUR - 1.  */
  static cpp_hashnode *
-lex_identifier (cpp_reader *pfile, const uchar *base)
+lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
+               struct normalize_state *nst)
  {
    cpp_hashnode *result;
-  const uchar *cur, *limit;
+  const uchar *cur;
    unsigned int len;
    unsigned int hash = HT_HASHSTEP (0, *base);
  
    cur = pfile->buffer->cur;
-  for (;;)
+  if (! starts_ucn)
+    while (ISIDNUM (*cur))
+      {
+       hash = HT_HASHSTEP (hash, *cur);
+       cur++;
+      }
+  pfile->buffer->cur = cur;
+  if (starts_ucn || forms_identifier_p (pfile, false, nst))
      {
-      /* N.B. ISIDNUM does not include $.  */
-      while (ISIDNUM (*cur))
-       {
-         hash = HT_HASHSTEP (hash, *cur);
-         cur++;
-       }
-
-      pfile->buffer->cur = cur;
-      if (!forms_identifier_p (pfile, false))
-       break;
-
-      limit = pfile->buffer->cur;
-      while (cur < limit)
-       {
-         hash = HT_HASHSTEP (hash, *cur);
-         cur++;
-       }
+      /* Slower version for identifiers containing UCNs (or $).  */
+      do {
+       while (ISIDNUM (*pfile->buffer->cur))
+         {
+           pfile->buffer->cur++;
+           NORMALIZE_STATE_UPDATE_IDNUM (nst);
+         }
+      } while (forms_identifier_p (pfile, false, nst));
+      result = _cpp_interpret_identifier (pfile, base,
+                                         pfile->buffer->cur - base);
      }
-  len = cur - base;
-  hash = HT_HASHFINISH (hash, len);
+  else
+    {
+      len = cur - base;
+      hash = HT_HASHFINISH (hash, len);
  
-  result = (cpp_hashnode *)
-    ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+      result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
+                                                 base, len, hash, HT_ALLOC));
+    }
  
    /* Rarely, identifiers require diagnostics when lexed.  */
    if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
@@ -524,7 +566,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base)
  
  /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
  static void
-lex_number (cpp_reader *pfile, cpp_string *number)
+lex_number (cpp_reader *pfile, cpp_string *number,
+           struct normalize_state *nst)
  {
    const uchar *cur;
    const uchar *base;
@@ -537,11 +580,14 @@ lex_number (cpp_reader *pfile, cpp_string *number)
  
        /* N.B. ISIDNUM does not include $.  */
        while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
-       cur++;
+       {
+         cur++;
+         NORMALIZE_STATE_UPDATE_IDNUM (nst);
+       }
  
        pfile->buffer->cur = cur;
      }
-  while (forms_identifier_p (pfile, false));
+  while (forms_identifier_p (pfile, false, nst));
  
    number->len = cur - base;
    dest = _cpp_unaligned_alloc (pfile, number->len + 1);
@@ -566,8 +612,8 @@ create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
  
  /* Lexes a string, character constant, or angle-bracketed header file
     name.  The stored string contains the spelling, including opening
-   quote and leading any leading 'L'.  It returns the type of the
-   literal, or CPP_OTHER if it was not properly terminated.
+   quote and any leading 'L', 'u' or 'U'.  It returns the type of the
+   literal, or CPP_OTHER if it was not properly terminated.
  
     The spelling is NUL-terminated, but it is not guaranteed that this
     is the first NUL since embedded NULs are preserved.  */
@@ -581,12 +627,16 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
  
    cur = base;
    terminator = *cur++;
-  if (terminator == 'L')
+  if (terminator == 'L' || terminator == 'u' || terminator == 'U')
      terminator = *cur++;
    if (terminator == '\"')
-    type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
+    type = (*base == 'L' ? CPP_WSTRING :
+           *base == 'U' ? CPP_STRING32 :
+           *base == 'u' ? CPP_STRING16 : CPP_STRING);
    else if (terminator == '\'')
-    type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
+    type = (*base == 'L' ? CPP_WCHAR :
+           *base == 'U' ? CPP_CHAR32 :
+           *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
    else
      terminator = '>', type = CPP_HEADER_NAME;
  
@@ -613,10 +663,59 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
      cpp_error (pfile, CPP_DL_WARNING,
                "null character(s) preserved in literal");
  
+  if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
+    cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
+              (int) terminator);
+
    pfile->buffer->cur = cur;
    create_literal (pfile, token, base, cur - base, type);
  }
  
+/* Return a pointer to PFILE's comment table.  The client may not make
+   any assumption about the ordering of the table.  */
+cpp_comment_table *
+cpp_get_comments (cpp_reader *pfile)
+{
+  return &pfile->comments;
+}
+
+/* Append a copy of the comment held in TOKEN, together with its source
+   location, to the end of PFILE's comment table.  The table starts at
+   256 entries and doubles whenever it fills up.  */
+static void
+store_comment (cpp_reader *pfile, cpp_token *token)
+{
+  cpp_comment_table *table = &pfile->comments;
+  cpp_comment *entry;
+  int len = token->val.str.len;
+
+  /* First use: create the table.  */
+  if (table->allocated == 0)
+    {
+      table->allocated = 256;
+      table->entries = XNEWVEC (cpp_comment, table->allocated);
+    }
+
+  /* Table full: double its capacity.  */
+  if (table->count == table->allocated)
+    {
+      table->allocated *= 2;
+      table->entries = XRESIZEVEC (cpp_comment, table->entries,
+                                   table->allocated);
+    }
+
+  /* Copy the comment text.  The token's text may not be NUL-terminated,
+     so terminate the copy explicitly.  */
+  entry = &table->entries[table->count];
+  entry->comment = XNEWVEC (char, len + 1);
+  memcpy (entry->comment, token->val.str.text, len);
+  entry->comment[len] = '\0';
+
+  /* Record the source location and count the new entry.  */
+  entry->sloc = token->src_loc;
+  table->count++;
+}
+
  /* The stored comment includes the comment start and any terminator.  */
  static void
  save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
@@ -656,13 +755,16 @@ save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
        buffer[clen - 2] = '*';
        buffer[clen - 1] = '/';
      }
+
+  /* Finally store this comment for use by clients of libcpp. */
+  store_comment (pfile, token);
  }
  
  /* Allocate COUNT tokens for RUN.  */
  void
  _cpp_init_tokenrun (tokenrun *run, unsigned int count)
  {
-  run->base = xnewvec (cpp_token, count);
+  run->base = XNEWVEC (cpp_token, count);
    run->limit = run->base + count;
    run->next = NULL;
  }
@@ -673,7 +775,7 @@ next_tokenrun (tokenrun *run)
  {
    if (run->next == NULL)
      {
-      run->next = xnew (tokenrun);
+      run->next = XNEW (tokenrun);
        run->next->prev = run;
        _cpp_init_tokenrun (run->next, 250);
      }
@@ -681,6 +783,61 @@ next_tokenrun (tokenrun *run)
    return run->next;
  }
  
+/* Look ahead in the input stream.  INDEX is the zero-based position
+   of the wanted token relative to the next token to be read.  Tokens
+   held by pending cpp_context objects are consulted first; if they
+   do not reach INDEX, fresh tokens are lexed and then backed up
+   again.  If CPP_EOF is lexed before INDEX is reached, that CPP_EOF
+   token is returned instead.  */
+const cpp_token *
+cpp_peek_token (cpp_reader *pfile, int index)
+{
+  cpp_context *context = pfile->context;
+  const cpp_token *peektok;
+  int count;
+
+  /* First, scan through any pending cpp_context objects.  */
+  while (context->prev)
+    {
+      ptrdiff_t sz = (context->direct_p
+                      ? LAST (context).token - FIRST (context).token
+                      : LAST (context).ptoken - FIRST (context).ptoken);
+
+      if (index < (int) sz)
+        return (context->direct_p
+                ? FIRST (context).token + index
+                : *(FIRST (context).ptoken + index));
+
+      index -= (int) sz;
+      context = context->prev;
+    }
+
+  /* We will have to read some new tokens after all (and do so
+     without invalidating preceding tokens).  */
+  count = index;
+  pfile->keep_tokens++;
+
+  do
+    {
+      peektok = _cpp_lex_token (pfile);
+      if (peektok->type == CPP_EOF)
+       {
+         /* Stop early, but still run the clean-up below: returning
+            here would leave keep_tokens raised and the tokens read
+            so far consumed.  Account for this token first.  */
+         index--;
+         break;
+       }
+    }
+  while (index--);
+
+  /* COUNT - INDEX is the number of tokens lexed by the loop above.  */
+  _cpp_backup_tokens_direct (pfile, count - index);
+  pfile->keep_tokens--;
+
+  return peektok;
+}
+
  /* Allocate a single token that is invalidated at the same time as the
     rest of the tokens on the line.  Has its line and col set to the
     same as the last lexed token, so that diagnostics appear in the
@@ -689,9 +834,30 @@ cpp_token *
  _cpp_temp_token (cpp_reader *pfile)
  {
    cpp_token *old, *result;
+  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
+  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
  
    old = pfile->cur_token - 1;
-  if (pfile->cur_token == pfile->cur_run->limit)
+  /* Any pre-existing lookaheads must not be clobbered.  */
+  if (la)
+    {
+      if (sz <= la)
+        {
+          tokenrun *next = next_tokenrun (pfile->cur_run);
+
+          if (sz < la)
+            memmove (next->base + 1, next->base,
+                     (la - sz) * sizeof (cpp_token));
+
+          next->base[0] = pfile->cur_run->limit[-1];
+        }
+
+      if (sz > 1)
+        memmove (pfile->cur_token + 1, pfile->cur_token,
+                 MIN (la, sz - 1) * sizeof (cpp_token));
+    }
+
+  if (!sz && pfile->cur_token == pfile->cur_run->limit)
      {
        pfile->cur_run = next_tokenrun (pfile->cur_run);
        pfile->cur_token = pfile->cur_run->base;
@@ -717,6 +883,11 @@ _cpp_lex_token (cpp_reader *pfile)
           pfile->cur_run = next_tokenrun (pfile->cur_run);
           pfile->cur_token = pfile->cur_run->base;
         }
+      /* We assume that the current token is somewhere in the current
+        run.  */
+      if (pfile->cur_token < pfile->cur_run->base
+         || pfile->cur_token >= pfile->cur_run->limit)
+       abort ();
  
        if (pfile->lookaheads)
         {
@@ -734,15 +905,24 @@ _cpp_lex_token (cpp_reader *pfile)
               /* 6.10.3 p 11: Directives in a list of macro arguments
                  gives undefined behavior.  This implementation
                  handles the directive as normal.  */
-             && pfile->state.parsing_args != 1
-             && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
-           continue;
+             && pfile->state.parsing_args != 1)
+           {
+             if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
+               {
+                 if (pfile->directive_result.type == CPP_PADDING)
+                   continue;
+                 result = &pfile->directive_result;
+               }
+           }
+         else if (pfile->state.in_deferred_pragma)
+           result = &pfile->directive_result;
+
           if (pfile->cb.line_change && !pfile->state.skipping)
             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
         }
  
        /* We don't skip tokens in directives.  */
-      if (pfile->state.in_directive)
+      if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
         break;
  
        /* Outside a directive, invalidate controlling macros.  At file
@@ -789,11 +969,8 @@ _cpp_get_fresh_line (cpp_reader *pfile)
           && buffer->next_line > buffer->rlimit
           && !buffer->from_stage3)
         {
-         /* Only warn once.  */
+         /* Clip to buffer size.  */
           buffer->next_line = buffer->rlimit;
-         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
-                              CPP_BUF_COLUMN (buffer, buffer->cur),
-                              "no newline at end of file");
         }
  
        return_at_eof = buffer->return_at_eof;
@@ -836,6 +1013,14 @@ _cpp_lex_direct (cpp_reader *pfile)
    buffer = pfile->buffer;
    if (buffer->need_line)
      {
+      if (pfile->state.in_deferred_pragma)
+       {
+         result->type = CPP_PRAGMA_EOL;
+         pfile->state.in_deferred_pragma = false;
+         if (!pfile->state.pragma_allow_expansion)
+           pfile->state.prevent_expansion--;
+         return result;
+       }
        if (!_cpp_get_fresh_line (pfile))
         {
           result->type = CPP_EOF;
@@ -888,16 +1073,25 @@ _cpp_lex_direct (cpp_reader *pfile)
  
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
-      result->type = CPP_NUMBER;
-      lex_number (pfile, &result->val.str);
-      break;
+      {
+       struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+       result->type = CPP_NUMBER;
+       lex_number (pfile, &result->val.str, &nst);
+       warn_about_normalization (pfile, result, &nst);
+       break;
+      }
  
      case 'L':
-      /* 'L' may introduce wide characters or strings.  */
-      if (*buffer->cur == '\'' || *buffer->cur == '"')
+    case 'u':
+    case 'U':
+      /* 'L', 'u' or 'U' may introduce wide characters or strings.  */
+      if (c == 'L' || CPP_OPTION (pfile, uliterals))
         {
-         lex_string (pfile, result, buffer->cur - 1);
-         break;
+         if (*buffer->cur == '\'' || *buffer->cur == '"')
+           {
+             lex_string (pfile, result, buffer->cur - 1);
+             break;
+           }
         }
        /* Fall through.  */
  
@@ -905,21 +1099,26 @@ _cpp_lex_direct (cpp_reader *pfile)
      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
      case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
      case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
-    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+    case 's': case 't':           case 'v': case 'w': case 'x':
      case 'y': case 'z':
      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
      case 'G': case 'H': case 'I': case 'J': case 'K':
      case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
-    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+    case 'S': case 'T':           case 'V': case 'W': case 'X':
      case 'Y': case 'Z':
        result->type = CPP_NAME;
-      result->val.node = lex_identifier (pfile, buffer->cur - 1);
+      {
+       struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+       result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
+                                          &nst);
+       warn_about_normalization (pfile, result, &nst);
+      }
  
        /* Convert named operators to their proper types.  */
        if (result->val.node->flags & NODE_OPERATOR)
         {
           result->flags |= NAMED_OP;
-         result->type = result->val.node->directive_index;
+         result->type = (enum cpp_ttype) result->val.node->directive_index;
         }
        break;
  
@@ -993,11 +1192,6 @@ _cpp_lex_direct (cpp_reader *pfile)
           buffer->cur++;
           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
         }
-      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
-       {
-         buffer->cur++;
-         IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
-       }
        else if (CPP_OPTION (pfile, digraphs))
         {
           if (*buffer->cur == ':')
@@ -1024,11 +1218,6 @@ _cpp_lex_direct (cpp_reader *pfile)
           buffer->cur++;
           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
         }
-      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
-       {
-         buffer->cur++;
-         IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
-       }
        break;
  
      case '%':
@@ -1058,8 +1247,10 @@ _cpp_lex_direct (cpp_reader *pfile)
        result->type = CPP_DOT;
        if (ISDIGIT (*buffer->cur))
         {
+         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
           result->type = CPP_NUMBER;
-         lex_number (pfile, &result->val.str);
+         lex_number (pfile, &result->val.str, &nst);
+         warn_about_normalization (pfile, result, &nst);
         }
        else if (*buffer->cur == '.' && buffer->cur[1] == '.')
         buffer->cur += 2, result->type = CPP_ELLIPSIS;
@@ -1142,11 +1333,13 @@ _cpp_lex_direct (cpp_reader *pfile)
      case '\\':
        {
         const uchar *base = --buffer->cur;
+       struct normalize_state nst = INITIAL_NORMALIZE_STATE;
  
-       if (forms_identifier_p (pfile, true))
+       if (forms_identifier_p (pfile, true, &nst))
           {
             result->type = CPP_NAME;
-           result->val.node = lex_identifier (pfile, base);
+           result->val.node = lex_identifier (pfile, base, true, &nst);
+           warn_about_normalization (pfile, result, &nst);
             break;
           }
         buffer->cur++;
@@ -1169,21 +1362,58 @@ cpp_token_len (const cpp_token *token)
  
    switch (TOKEN_SPELL (token))
      {
-    default:           len = 4;                                break;
+    default:           len = 6;                                break;
      case SPELL_LITERAL:        len = token->val.str.len;               break;
-    case SPELL_IDENT:  len = NODE_LEN (token->val.node);       break;
+    case SPELL_IDENT:  len = NODE_LEN (token->val.node) * 10;  break;
      }
  
    return len;
  }
  
+/* Decode one UTF-8 sequence starting at NAME and write it to BUFFER
+   as a \U escape with eight hex digits.  Return the number of bytes
+   read out of NAME.  (There are always 10 bytes written to BUFFER.)  */
+
+static size_t
+utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
+{
+  int j;
+  int ucn_len = 0;
+  int ucn_len_c;
+  unsigned t;
+  unsigned long utf32;
+  
+  /* The count of leading 1 bits in the first byte is the length.  */
+  for (t = *name; t & 0x80; t <<= 1)
+    ucn_len++;
+  
+  utf32 = *name & (0x7F >> ucn_len);
+  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
+    {
+      utf32 = (utf32 << 6) | (*++name & 0x3F);
+      
+      /* Ill-formed UTF-8: not a 10xxxxxx continuation byte.  */
+      if ((*name & ~0x3F) != 0x80)
+       abort ();
+    }
+  
+  *buffer++ = '\\';
+  *buffer++ = 'U';
+  for (j = 7; j >= 0; j--)
+    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
+  return ucn_len;
+}
+
+
  /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
     already contain the enough space to hold the token's spelling.
     Returns a pointer to the character after the last character written.
+   FORSTRING is true if this is to be the spelling after translation
+   phase 1 (this is different for UCNs).
     FIXME: Would be nice if we didn't need the PFILE argument.  */
  unsigned char *
  cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
-                unsigned char *buffer)
+                unsigned char *buffer, bool forstring)
  {
    switch (TOKEN_SPELL (token))
      {
@@ -1207,8 +1437,26 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
  
      spell_ident:
      case SPELL_IDENT:
-      memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
-      buffer += NODE_LEN (token->val.node);
+      if (forstring)
+       {
+         memcpy (buffer, NODE_NAME (token->val.node),
+                 NODE_LEN (token->val.node));
+         buffer += NODE_LEN (token->val.node);
+       }
+      else
+       {
+         size_t i;
+         const unsigned char * name = NODE_NAME (token->val.node);
+         
+         for (i = 0; i < NODE_LEN (token->val.node); i++)
+           if (name[i] & ~0x7F)
+             {
+               i += utf8_to_ucn (buffer, name + i) - 1;
+               buffer += 10;
+             }
+           else
+             *buffer++ = NODE_NAME (token->val.node)[i];
+       }
        break;
  
      case SPELL_LITERAL:
@@ -1233,7 +1481,7 @@ cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
    unsigned int len = cpp_token_len (token) + 1;
    unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
  
-  end = cpp_spell_token (pfile, token, start);
+  end = cpp_spell_token (pfile, token, start, false);
    end[0] = '\0';
  
    return start;
@@ -1277,8 +1525,21 @@ cpp_output_token (const cpp_token *token, FILE *fp)
  
      spell_ident:
      case SPELL_IDENT:
-      fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
-    break;
+      {
+       size_t i;
+       const unsigned char * name = NODE_NAME (token->val.node);
+       
+       for (i = 0; i < NODE_LEN (token->val.node); i++)
+         if (name[i] & ~0x7F)
+           {
+             unsigned char buffer[10];
+             i += utf8_to_ucn (buffer, name + i) - 1;
+             fwrite (buffer, 1, 10, fp);
+           }
+         else
+           fputc (NODE_NAME (token->val.node)[i], fp);
+      }
+      break;
  
      case SPELL_LITERAL:
        fwrite (token->val.str.text, 1, token->val.str.len, fp);
@@ -1341,8 +1602,8 @@ cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
  
    switch (a)
      {
-    case CPP_GREATER:  return c == '>' || c == '?';
-    case CPP_LESS:     return c == '<' || c == '?' || c == '%' || c == ':';
+    case CPP_GREATER:  return c == '>';
+    case CPP_LESS:     return c == '<' || c == '%' || c == ':';
      case CPP_PLUS:     return c == '+';
      case CPP_MINUS:    return c == '-' || c == '>';
      case CPP_DIV:      return c == '/' || c == '*'; /* Comments.  */
@@ -1391,6 +1652,51 @@ cpp_output_line (cpp_reader *pfile, FILE *fp)
    putc ('\n', fp);
  }
  
+/* Return a string holding DIR_NAME (if non-NULL, as "#DIR_NAME ")
+   followed by the spelling of all remaining tokens on the current line.
+   The result is allocated using xmalloc and must be freed by the caller.  */
+unsigned char *
+cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
+{
+  const cpp_token *token;
+  unsigned int out = dir_name ? ustrlen (dir_name) : 0;
+  unsigned int alloced = 120 + out;
+  unsigned char *result = (unsigned char *) xmalloc (alloced);
+
+  /* If DIR_NAME is NULL, there are no initial contents.  */
+  if (dir_name)
+    {
+      sprintf ((char *) result, "#%s ", dir_name);
+      out += 2;
+    }
+
+  token = cpp_get_token (pfile);
+  while (token->type != CPP_EOF)
+    {
+      unsigned char *last;
+      /* Reserve room for a following space and the terminating NUL too.  */
+      unsigned int len = cpp_token_len (token) + 2;
+
+      if (out + len > alloced)
+       {
+         alloced *= 2;
+         if (out + len > alloced)
+           alloced = out + len;
+         result = (unsigned char *) xrealloc (result, alloced);
+       }
+
+      last = cpp_spell_token (pfile, token, &result[out], 0);
+      out = last - result;
+
+      token = cpp_get_token (pfile);
+      if (token->flags & PREV_WHITE)
+       result[out++] = ' ';
+    }
+
+  result[out] = '\0';
+  return result;
+}
+
  /* Memory buffers.  Changing these three constants can have a dramatic
     effect on performance.  The values here are reasonable defaults,
     but might be tuned.  If you adjust them, be sure to test across a
@@ -1418,7 +1724,7 @@ new_buff (size_t len)
      len = MIN_BUFF_SIZE;
    len = CPP_ALIGN (len);
  
-  base = xmalloc (len + sizeof (_cpp_buff));
+  base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
    result = (_cpp_buff *) (base + len);
    result->base = base;
    result->cur = base;
@@ -1556,3 +1862,27 @@ _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
    buff->cur = result + len;
    return result;
  }
+
+/* Say which field of TOK is in use.  The answer is chosen by the token's
+   spelling category and, for SPELL_NONE tokens, by the token type.  */
+enum cpp_token_fld_kind
+cpp_token_val_index (cpp_token *tok)
+{
+  if (TOKEN_SPELL (tok) == SPELL_IDENT)
+    return CPP_TOKEN_FLD_NODE;
+  if (TOKEN_SPELL (tok) == SPELL_LITERAL)
+    return CPP_TOKEN_FLD_STR;
+  if (TOKEN_SPELL (tok) == SPELL_NONE)
+    switch (tok->type)
+      {
+      case CPP_MACRO_ARG:
+       return CPP_TOKEN_FLD_ARG_NO;
+      case CPP_PADDING:
+       return CPP_TOKEN_FLD_SOURCE;
+      case CPP_PRAGMA:
+       return CPP_TOKEN_FLD_PRAGMA;
+      default:
+       break;
+      }
+  return CPP_TOKEN_FLD_NONE;
+}