2007-01-17 Marco Trudel <mtrudel@gmx.ch>

[pf3gnuchains/gcc-fork.git] / libcpp / lex.c
diff --git a/libcpp/lex.c b/libcpp/lex.c

index 7eafb13..5d1a688 100644 (file)
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1,5 +1,5 @@
  /* CPP Library - lexical analysis.
-   Copyright (C) 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
     Contributed by Per Bothner, 1994-95.
     Based on CCCP program by Paul Rubin, June 1986
     Adapted to ANSI C, Richard Stallman, Jan 1987
@@ -17,7 +17,7 @@ GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
-Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  
  #include "config.h"
  #include "system.h"
@@ -41,8 +41,8 @@ struct token_spelling
  static const unsigned char *const digraph_spellings[] =
  { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
  
-#define OP(e, s) { SPELL_OPERATOR, U s           },
-#define TK(e, s) { s,              U #e },
+#define OP(e, s) { SPELL_OPERATOR, U s  },
+#define TK(e, s) { SPELL_ ## s,    U #e },
  static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  #undef OP
  #undef TK
@@ -53,9 +53,6 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  static int skip_line_comment (cpp_reader *);
  static void skip_whitespace (cpp_reader *, cppchar_t);
-static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *);
-static void lex_number (cpp_reader *, cpp_string *);
-static bool forms_identifier_p (cpp_reader *, int);
  static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  static void create_literal (cpp_reader *, cpp_token *, const uchar *,
@@ -88,8 +85,8 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
    if (buffer->notes_used == buffer->notes_cap)
      {
        buffer->notes_cap = buffer->notes_cap * 2 + 200;
-      buffer->notes = xrealloc (buffer->notes,
-                               buffer->notes_cap * sizeof (_cpp_line_note));
+      buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
+                                  buffer->notes_cap);
      }
  
    buffer->notes[buffer->notes_used].pos = pos;
@@ -114,31 +111,39 @@ _cpp_clean_line (cpp_reader *pfile)
  
    if (!buffer->from_stage3)
      {
+      const uchar *pbackslash = NULL;
+
        /* Short circuit for the common case of an un-escaped line with
          no trigraphs.  The primary win here is by not writing any
          data back to memory until we have to.  */
        for (;;)
         {
           c = *++s;
-         if (c == '\n' || c == '\r')
+         if (__builtin_expect (c == '\n', false)
+             || __builtin_expect (c == '\r', false))
             {
               d = (uchar *) s;
  
-             if (s == buffer->rlimit)
+             if (__builtin_expect (s == buffer->rlimit, false))
                 goto done;
  
               /* DOS line ending? */
-             if (c == '\r' && s[1] == '\n')
-               s++;
+             if (__builtin_expect (c == '\r', false)
+                 && s[1] == '\n')
+               {
+                 s++;
+                 if (s == buffer->rlimit)
+                   goto done;
+               }
  
-             if (s == buffer->rlimit)
+             if (__builtin_expect (pbackslash == NULL, true))
                 goto done;
  
-             /* check for escaped newline */
+             /* Check for escaped newline.  */
               p = d;
-             while (p != buffer->next_line && is_nvspace (p[-1]))
+             while (is_nvspace (p[-1]))
                 p--;
-             if (p == buffer->next_line || p[-1] != '\\')
+             if (p - 1 != pbackslash)
                 goto done;
  
               /* Have an escaped newline; process it and proceed to
@@ -148,7 +153,11 @@ _cpp_clean_line (cpp_reader *pfile)
               buffer->next_line = p - 1;
               break;
             }
-         if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
+         if (__builtin_expect (c == '\\', false))
+           pbackslash = s;
+         else if (__builtin_expect (c == '?', false)
+                  && __builtin_expect (s[1] == '?', false)
+                  && _cpp_trigraph_map[s[2]])
             {
               /* Have a trigraph.  We may or may not have to convert
                  it.  Add a line note regardless, for -Wtrigraphs.  */
@@ -430,10 +439,36 @@ name_p (cpp_reader *pfile, const cpp_string *string)
    return 1;
  }
  
+/* After parsing an identifier or other sequence, produce a warning about
+   sequences not in NFC/NFKC.  TOKEN is spelled into a scratch buffer for
+   the diagnostic; the buffer is freed before returning.  */
+static void
+warn_about_normalization (cpp_reader *pfile, const cpp_token *token,
+                         const struct normalize_state *s)
+{
+  if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
+      && !pfile->state.skipping)
+    {
+      /* Make sure that the token is printed using UCNs, even
+        if we'd otherwise happily print UTF-8.  */
+      unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
+      size_t sz = cpp_spell_token (pfile, token, buf, false) - buf;
+
+      if (NORMALIZE_STATE_RESULT (s) == normalized_C)
+       cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
+                            "`%.*s' is not in NFKC", (int) sz, buf);
+      else
+       cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
+                            "`%.*s' is not in NFC", (int) sz, buf);
+      free (buf);
+    }
+}
+
  /* Returns TRUE if the sequence starting at buffer->cur is invalid in
     an identifier.  FIRST is TRUE if this starts an identifier.  */
  static bool
-forms_identifier_p (cpp_reader *pfile, int first)
+forms_identifier_p (cpp_reader *pfile, int first,
+                   struct normalize_state *state)
  {
    cpp_buffer *buffer = pfile->buffer;
  
@@ -453,11 +488,13 @@ forms_identifier_p (cpp_reader *pfile, int first)
      }
  
    /* Is this a syntactically valid UCN?  */
-  if (0 && *buffer->cur == '\\'
+  if (CPP_OPTION (pfile, extended_identifiers)
+      && *buffer->cur == '\\'
        && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
      {
        buffer->cur += 2;
-      if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first))
+      if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+                         state))
         return true;
        buffer->cur -= 2;
      }
@@ -467,39 +504,43 @@ forms_identifier_p (cpp_reader *pfile, int first)
  
  /* Lex an identifier starting at BUFFER->CUR - 1.  */
  static cpp_hashnode *
-lex_identifier (cpp_reader *pfile, const uchar *base)
+lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
+               struct normalize_state *nst)
  {
    cpp_hashnode *result;
-  const uchar *cur, *limit;
+  const uchar *cur;
    unsigned int len;
    unsigned int hash = HT_HASHSTEP (0, *base);
  
    cur = pfile->buffer->cur;
-  for (;;)
+  if (! starts_ucn)
+    while (ISIDNUM (*cur))
+      {
+       hash = HT_HASHSTEP (hash, *cur);
+       cur++;
+      }
+  pfile->buffer->cur = cur;
+  if (starts_ucn || forms_identifier_p (pfile, false, nst))
      {
-      /* N.B. ISIDNUM does not include $.  */
-      while (ISIDNUM (*cur))
-       {
-         hash = HT_HASHSTEP (hash, *cur);
-         cur++;
-       }
-
-      pfile->buffer->cur = cur;
-      if (!forms_identifier_p (pfile, false))
-       break;
-
-      limit = pfile->buffer->cur;
-      while (cur < limit)
-       {
-         hash = HT_HASHSTEP (hash, *cur);
-         cur++;
-       }
+      /* Slower version for identifiers containing UCNs (or $).  */
+      do {
+       while (ISIDNUM (*pfile->buffer->cur))
+         {
+           pfile->buffer->cur++;
+           NORMALIZE_STATE_UPDATE_IDNUM (nst);
+         }
+      } while (forms_identifier_p (pfile, false, nst));
+      result = _cpp_interpret_identifier (pfile, base,
+                                         pfile->buffer->cur - base);
      }
-  len = cur - base;
-  hash = HT_HASHFINISH (hash, len);
+  else
+    {
+      len = cur - base;
+      hash = HT_HASHFINISH (hash, len);
  
-  result = (cpp_hashnode *)
-    ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+      result = (cpp_hashnode *)
+       ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+    }
  
    /* Rarely, identifiers require diagnostics when lexed.  */
    if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
@@ -524,7 +565,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base)
  
  /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
  static void
-lex_number (cpp_reader *pfile, cpp_string *number)
+lex_number (cpp_reader *pfile, cpp_string *number,
+           struct normalize_state *nst)
  {
    const uchar *cur;
    const uchar *base;
@@ -537,11 +579,14 @@ lex_number (cpp_reader *pfile, cpp_string *number)
  
        /* N.B. ISIDNUM does not include $.  */
        while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
-       cur++;
+       {
+         cur++;
+         NORMALIZE_STATE_UPDATE_IDNUM (nst);
+       }
  
        pfile->buffer->cur = cur;
      }
-  while (forms_identifier_p (pfile, false));
+  while (forms_identifier_p (pfile, false, nst));
  
    number->len = cur - base;
    dest = _cpp_unaligned_alloc (pfile, number->len + 1);
@@ -613,6 +658,10 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
      cpp_error (pfile, CPP_DL_WARNING,
                "null character(s) preserved in literal");
  
+  if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
+    cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
+              (int) terminator);
+
    pfile->buffer->cur = cur;
    create_literal (pfile, token, base, cur - base, type);
  }
@@ -662,7 +711,7 @@ save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
  void
  _cpp_init_tokenrun (tokenrun *run, unsigned int count)
  {
-  run->base = xnewvec (cpp_token, count);
+  run->base = XNEWVEC (cpp_token, count);
    run->limit = run->base + count;
    run->next = NULL;
  }
@@ -673,7 +722,7 @@ next_tokenrun (tokenrun *run)
  {
    if (run->next == NULL)
      {
-      run->next = xnew (tokenrun);
+      run->next = XNEW (tokenrun);
        run->next->prev = run;
        _cpp_init_tokenrun (run->next, 250);
      }
@@ -734,15 +783,24 @@ _cpp_lex_token (cpp_reader *pfile)
               /* 6.10.3 p 11: Directives in a list of macro arguments
                  gives undefined behavior.  This implementation
                  handles the directive as normal.  */
-             && pfile->state.parsing_args != 1
-             && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
-           continue;
+             && pfile->state.parsing_args != 1)
+           {
+             if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
+               {
+                 if (pfile->directive_result.type == CPP_PADDING)
+                   continue;
+                 result = &pfile->directive_result;
+               }
+           }
+         else if (pfile->state.in_deferred_pragma)
+           result = &pfile->directive_result;
+
           if (pfile->cb.line_change && !pfile->state.skipping)
             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
         }
  
        /* We don't skip tokens in directives.  */
-      if (pfile->state.in_directive)
+      if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
         break;
  
        /* Outside a directive, invalidate controlling macros.  At file
@@ -836,6 +894,14 @@ _cpp_lex_direct (cpp_reader *pfile)
    buffer = pfile->buffer;
    if (buffer->need_line)
      {
+      if (pfile->state.in_deferred_pragma)
+       {
+         result->type = CPP_PRAGMA_EOL;
+         pfile->state.in_deferred_pragma = false;
+         if (!pfile->state.pragma_allow_expansion)
+           pfile->state.prevent_expansion--;
+         return result;
+       }
        if (!_cpp_get_fresh_line (pfile))
         {
           result->type = CPP_EOF;
@@ -888,9 +954,13 @@ _cpp_lex_direct (cpp_reader *pfile)
  
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
-      result->type = CPP_NUMBER;
-      lex_number (pfile, &result->val.str);
-      break;
+      {
+       struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+       result->type = CPP_NUMBER;
+       lex_number (pfile, &result->val.str, &nst);
+       warn_about_normalization (pfile, result, &nst);
+       break;
+      }
  
      case 'L':
        /* 'L' may introduce wide characters or strings.  */
@@ -913,13 +983,18 @@ _cpp_lex_direct (cpp_reader *pfile)
      case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
      case 'Y': case 'Z':
        result->type = CPP_NAME;
-      result->val.node = lex_identifier (pfile, buffer->cur - 1);
+      {
+       struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+       result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
+                                          &nst);
+       warn_about_normalization (pfile, result, &nst);
+      }
  
        /* Convert named operators to their proper types.  */
        if (result->val.node->flags & NODE_OPERATOR)
         {
           result->flags |= NAMED_OP;
-         result->type = result->val.node->directive_index;
+         result->type = (enum cpp_ttype) result->val.node->directive_index;
         }
        break;
  
@@ -993,11 +1068,6 @@ _cpp_lex_direct (cpp_reader *pfile)
           buffer->cur++;
           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
         }
-      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
-       {
-         buffer->cur++;
-         IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
-       }
        else if (CPP_OPTION (pfile, digraphs))
         {
           if (*buffer->cur == ':')
@@ -1024,11 +1094,6 @@ _cpp_lex_direct (cpp_reader *pfile)
           buffer->cur++;
           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
         }
-      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
-       {
-         buffer->cur++;
-         IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
-       }
        break;
  
      case '%':
@@ -1058,8 +1123,10 @@ _cpp_lex_direct (cpp_reader *pfile)
        result->type = CPP_DOT;
        if (ISDIGIT (*buffer->cur))
         {
+         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
           result->type = CPP_NUMBER;
-         lex_number (pfile, &result->val.str);
+         lex_number (pfile, &result->val.str, &nst);
+         warn_about_normalization (pfile, result, &nst);
         }
        else if (*buffer->cur == '.' && buffer->cur[1] == '.')
         buffer->cur += 2, result->type = CPP_ELLIPSIS;
@@ -1142,11 +1209,13 @@ _cpp_lex_direct (cpp_reader *pfile)
      case '\\':
        {
         const uchar *base = --buffer->cur;
+       struct normalize_state nst = INITIAL_NORMALIZE_STATE;
  
-       if (forms_identifier_p (pfile, true))
+       if (forms_identifier_p (pfile, true, &nst))
           {
             result->type = CPP_NAME;
-           result->val.node = lex_identifier (pfile, base);
+           result->val.node = lex_identifier (pfile, base, true, &nst);
+           warn_about_normalization (pfile, result, &nst);
             break;
           }
         buffer->cur++;
@@ -1171,19 +1240,56 @@ cpp_token_len (const cpp_token *token)
      {
      default:           len = 4;                                break;
      case SPELL_LITERAL:        len = token->val.str.len;               break;
-    case SPELL_IDENT:  len = NODE_LEN (token->val.node);       break;
+    case SPELL_IDENT:  len = NODE_LEN (token->val.node) * 10;  break;
      }
  
    return len;
  }
  
+/* Parse one UTF-8 sequence starting at NAME and place the equivalent
+   \UXXXXXXXX escape in BUFFER.  Return the number of bytes read out of
+   NAME.  (There are always 10 bytes written to BUFFER; no NUL is added.)  */
+
+static size_t
+utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
+{
+  int j;
+  int ucn_len = 0;
+  int ucn_len_c;
+  unsigned t;
+  unsigned long utf32;
+  
+  /* Count the leading 1 bits of the first byte: the sequence length.  */
+  for (t = *name; t & 0x80; t <<= 1)
+    ucn_len++;
+  
+  utf32 = *name & (0x7F >> ucn_len);
+  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
+    {
+      utf32 = (utf32 << 6) | (*++name & 0x3F);
+      
+      /* Ill-formed UTF-8: continuation bytes must look like 10xxxxxx.  */
+      if ((*name & ~0x3F) != 0x80)
+       abort ();
+    }
+  
+  *buffer++ = '\\';
+  *buffer++ = 'U';
+  for (j = 7; j >= 0; j--)
+    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
+  return ucn_len;
+}
+
+
  /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
     already contain the enough space to hold the token's spelling.
     Returns a pointer to the character after the last character written.
+   FORSTRING is true if this is to be the spelling after translation
+   phase 1 (this is different for UCNs).
     FIXME: Would be nice if we didn't need the PFILE argument.  */
  unsigned char *
  cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
-                unsigned char *buffer)
+                unsigned char *buffer, bool forstring)
  {
    switch (TOKEN_SPELL (token))
      {
@@ -1207,8 +1313,26 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
  
      spell_ident:
      case SPELL_IDENT:
-      memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
-      buffer += NODE_LEN (token->val.node);
+      if (forstring)
+       {
+         memcpy (buffer, NODE_NAME (token->val.node),
+                 NODE_LEN (token->val.node));
+         buffer += NODE_LEN (token->val.node);
+       }
+      else
+       {
+         size_t i;
+         const unsigned char * name = NODE_NAME (token->val.node);
+         
+         for (i = 0; i < NODE_LEN (token->val.node); i++)
+           if (name[i] & ~0x7F)
+             {
+               i += utf8_to_ucn (buffer, name + i) - 1;
+               buffer += 10;
+             }
+           else
+             *buffer++ = NODE_NAME (token->val.node)[i];
+       }
        break;
  
      case SPELL_LITERAL:
@@ -1233,7 +1357,7 @@ cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
    unsigned int len = cpp_token_len (token) + 1;
    unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
  
-  end = cpp_spell_token (pfile, token, start);
+  end = cpp_spell_token (pfile, token, start, false);
    end[0] = '\0';
  
    return start;
@@ -1277,8 +1401,21 @@ cpp_output_token (const cpp_token *token, FILE *fp)
  
      spell_ident:
      case SPELL_IDENT:
-      fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
-    break;
+      {
+       size_t i;
+       const unsigned char * name = NODE_NAME (token->val.node);
+       
+       for (i = 0; i < NODE_LEN (token->val.node); i++)
+         if (name[i] & ~0x7F)
+           {
+             unsigned char buffer[10];
+             i += utf8_to_ucn (buffer, name + i) - 1;
+             fwrite (buffer, 1, 10, fp);
+           }
+         else
+           fputc (NODE_NAME (token->val.node)[i], fp);
+      }
+      break;
  
      case SPELL_LITERAL:
        fwrite (token->val.str.text, 1, token->val.str.len, fp);
@@ -1341,8 +1478,8 @@ cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
  
    switch (a)
      {
-    case CPP_GREATER:  return c == '>' || c == '?';
-    case CPP_LESS:     return c == '<' || c == '?' || c == '%' || c == ':';
+    case CPP_GREATER:  return c == '>';
+    case CPP_LESS:     return c == '<' || c == '%' || c == ':';
      case CPP_PLUS:     return c == '+';
      case CPP_MINUS:    return c == '-' || c == '>';
      case CPP_DIV:      return c == '/' || c == '*'; /* Comments.  */
@@ -1418,7 +1555,7 @@ new_buff (size_t len)
      len = MIN_BUFF_SIZE;
    len = CPP_ALIGN (len);
  
-  base = xmalloc (len + sizeof (_cpp_buff));
+  base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
    result = (_cpp_buff *) (base + len);
    result->base = base;
    result->cur = base;
@@ -1573,6 +1710,8 @@ cpp_token_val_index (cpp_token *tok)
         return CPP_TOKEN_FLD_ARG_NO;
        else if (tok->type == CPP_PADDING)
         return CPP_TOKEN_FLD_SOURCE;
+      else if (tok->type == CPP_PRAGMA)
+       return CPP_TOKEN_FLD_PRAGMA;
        /* else fall through */
      default:
        return CPP_TOKEN_FLD_NONE;