Fix PR c++/42225, take 2

[pf3gnuchains/gcc-fork.git] / libcpp / lex.c
diff --git a/libcpp/lex.c b/libcpp/lex.c

index 63e291c..ac28f92 100644 (file)
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -76,7 +76,7 @@ cpp_ideq (const cpp_token *token, const char *string)
    if (token->type != CPP_NAME)
      return 0;
  
-  return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
+  return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  }
  
  /* Record a note TYPE at byte POS into the current cleaned logical
@@ -504,6 +504,63 @@ forms_identifier_p (cpp_reader *pfile, int first,
    return false;
  }
  
+/* Helper: hash the identifier at BASE, look it up and diagnose its use.  */
+static cpp_hashnode *
+lex_identifier_intern (cpp_reader *pfile, const uchar *base)
+{
+  cpp_hashnode *result;
+  const uchar *cur;
+  unsigned int len;
+  unsigned int hash = HT_HASHSTEP (0, *base);
+
+  cur = base + 1;
+  while (ISIDNUM (*cur))
+    {
+      hash = HT_HASHSTEP (hash, *cur);
+      cur++;
+    }
+  len = cur - base;
+  hash = HT_HASHFINISH (hash, len);
+  result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
+                                             base, len, hash, HT_ALLOC));
+
+  /* Rarely, identifiers require diagnostics when lexed; skipped code
+     is never diagnosed.  */
+  if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
+                       && !pfile->state.skipping, 0))
+    {
+      /* It is allowed to poison the same identifier twice.  */
+      if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
+       cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
+                  NODE_NAME (result));
+
+      /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
+        replacement list of a variadic macro.  */
+      if (result == pfile->spec_nodes.n__VA_ARGS__
+         && !pfile->state.va_args_ok)
+       cpp_error (pfile, CPP_DL_PEDWARN,
+                  "__VA_ARGS__ can only appear in the expansion"
+                  " of a C99 variadic macro");
+
+      /* For -Wc++-compat, warn about use of C++ named operators.  */
+      if (result->flags & NODE_WARN_OPERATOR)
+       cpp_error (pfile, CPP_DL_WARNING,
+                  "identifier \"%s\" is a special operator name in C++",
+                  NODE_NAME (result));
+    }
+
+  return result;
+}
+
+/* Get the cpp_hashnode of the identifier NAME in the current cpp_reader
+   object.  The lookup passes HT_ALLOC, so NULL is not expected here.  */
+cpp_hashnode *
+_cpp_lex_identifier (cpp_reader *pfile, const char *name)
+{
+  cpp_hashnode *result;
+  result = lex_identifier_intern (pfile, (const uchar *) name);
+  return result;
+}
+
  /* Lex an identifier starting at BUFFER->CUR - 1.  */
  static cpp_hashnode *
  lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
@@ -560,6 +617,12 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
         cpp_error (pfile, CPP_DL_PEDWARN,
                    "__VA_ARGS__ can only appear in the expansion"
                    " of a C99 variadic macro");
+
+      /* For -Wc++-compat, warn about use of C++ named operators.  */
+      if (result->flags & NODE_WARN_OPERATOR)
+       cpp_error (pfile, CPP_DL_WARNING,
+                  "identifier \"%s\" is a special operator name in C++",
+                  NODE_NAME (result));
      }
  
    return result;
@@ -611,12 +674,192 @@ create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
    token->val.str.text = dest;
  }
  
+/* Lexes a raw string.  The stored string contains the spelling, including
+   double quotes, delimiter string, '[' and ']', any leading
+   'L', 'u', 'U' or 'u8' and 'R' modifier.  The type of the literal is
+   stored in TOKEN; it is CPP_OTHER if the string was not properly
+   terminated or its delimiter is invalid, and CPP_EOF if no line is left.
+   The spelling is NUL-terminated, but it is not guaranteed that this
+   is the first NUL since embedded NULs are preserved.  */
+
+static void
+lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
+               const uchar *cur)
+{
+  source_location saw_NUL = 0;
+  const uchar *raw_prefix;
+  unsigned int raw_prefix_len = 0;
+  enum cpp_ttype type;
+  size_t total_len = 0;
+  _cpp_buff *first_buff = NULL, *last_buff = NULL;
+
+  type = (*base == 'L' ? CPP_WSTRING :
+         *base == 'U' ? CPP_STRING32 :
+         *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
+         : CPP_STRING);
+
+  raw_prefix = cur + 1;
+  while (raw_prefix_len < 16)
+    {
+      switch (raw_prefix[raw_prefix_len])
+       {
+       case ' ': case '[': case ']': case '\t':
+       case '\v': case '\f': case '\n': default:
+         break;
+       /* Basic source charset except the above chars.  */
+       case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+       case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+       case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+       case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+       case 'y': case 'z':
+       case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+       case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+       case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+       case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+       case 'Y': case 'Z':
+       case '0': case '1': case '2': case '3': case '4': case '5':
+       case '6': case '7': case '8': case '9':
+       case '_': case '{': case '}': case '#': case '(': case ')':
+       case '<': case '>': case '%': case ':': case ';': case '.':
+       case '?': case '*': case '+': case '-': case '/': case '^':
+       case '&': case '|': case '~': case '!': case '=': case ',':
+       case '\\': case '"': case '\'':
+         raw_prefix_len++;
+         continue;
+       }
+      break;
+    }
+
+  if (raw_prefix[raw_prefix_len] != '[')
+    {
+      int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
+               + 1;
+      if (raw_prefix_len == 16)
+       cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
+                            "raw string delimiter longer than 16 characters");
+      else
+       cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
+                            "invalid character '%c' in raw string delimiter",
+                            (int) raw_prefix[raw_prefix_len]);
+      pfile->buffer->cur = raw_prefix - 1;
+      create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
+      return;
+    }
+
+  cur = raw_prefix + raw_prefix_len + 1;
+  for (;;)
+    {
+      cppchar_t c = *cur++;
+
+      if (c == ']'
+         && strncmp ((const char *) cur, (const char *) raw_prefix,
+                     raw_prefix_len) == 0
+         && cur[raw_prefix_len] == '"')
+       {
+         cur += raw_prefix_len + 1;
+         break;
+       }
+      else if (c == '\n')
+       {
+         if (pfile->state.in_directive
+             || pfile->state.parsing_args
+             || pfile->state.in_deferred_pragma)
+           {
+             cur--;
+             type = CPP_OTHER;
+             cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
+                                  "unterminated raw string");
+             break;
+           }
+
+         /* Raw strings allow embedded non-escaped newlines, so the text
+            lexed so far is copied to a buffer chain before moving on.  */
+         if (first_buff == NULL)
+           {
+             total_len = cur - base;
+             first_buff = last_buff = _cpp_get_buff (pfile, total_len);
+             memcpy (BUFF_FRONT (last_buff), base, total_len);
+             raw_prefix = BUFF_FRONT (last_buff) + (raw_prefix - base);
+             BUFF_FRONT (last_buff) += total_len;
+           }
+         else
+           {
+             size_t len = cur - base;
+             size_t cur_len = len > BUFF_ROOM (last_buff)
+                              ? BUFF_ROOM (last_buff) : len;
+
+             total_len += len;
+             memcpy (BUFF_FRONT (last_buff), base, cur_len);
+             BUFF_FRONT (last_buff) += cur_len;
+             if (len > cur_len)
+               {
+                 last_buff = _cpp_append_extend_buff (pfile, last_buff,
+                                                      len - cur_len);
+                 memcpy (BUFF_FRONT (last_buff), base + cur_len,
+                         len - cur_len);
+                 BUFF_FRONT (last_buff) += len - cur_len;
+               }
+           }
+
+         if (pfile->buffer->cur < pfile->buffer->rlimit)
+           CPP_INCREMENT_LINE (pfile, 0);
+         pfile->buffer->need_line = true;
+
+         if (!_cpp_get_fresh_line (pfile))
+           {
+             source_location src_loc = token->src_loc;
+             token->type = CPP_EOF;
+             /* Tell the compiler the line number of the EOF token.  */
+             token->src_loc = pfile->line_table->highest_line;
+             token->flags = BOL;
+             if (first_buff != NULL)
+               _cpp_release_buff (pfile, first_buff);
+             cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
+                                  "unterminated raw string");
+             return;
+           }
+
+         cur = base = pfile->buffer->cur;
+       }
+      else if (c == '\0' && !saw_NUL)
+       LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table,
+                                    CPP_BUF_COLUMN (pfile->buffer, cur));
+    }
+
+  if (saw_NUL && !pfile->state.skipping)
+    cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0,
+              "null character(s) preserved in literal");
+
+  pfile->buffer->cur = cur;
+  if (first_buff == NULL)
+    create_literal (pfile, token, base, cur - base, type);
+  else
+    {
+      uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
+
+      token->type = type;
+      token->val.str.len = total_len + (cur - base);
+      token->val.str.text = dest;
+      last_buff = first_buff;
+      while (last_buff != NULL)
+       {
+         memcpy (dest, last_buff->base,
+                 BUFF_FRONT (last_buff) - last_buff->base);
+         dest += BUFF_FRONT (last_buff) - last_buff->base;
+         last_buff = last_buff->next;
+       }
+      _cpp_release_buff (pfile, first_buff);
+      memcpy (dest, base, cur - base);
+      dest[cur - base] = '\0';
+    }
+}
+
  /* Lexes a string, character constant, or angle-bracketed header file
     name.  The stored string contains the spelling, including opening
-   quote and leading any leading 'L', 'u' or 'U'.  It returns the type
-   of the literal, or CPP_OTHER if it was not properly terminated, or
-   CPP_LESS for an unterminated header name which must be relexed as
-   normal tokens.
+   quote and any leading 'L', 'u', 'U' or 'u8' and optional
+   'R' modifier.  It returns the type of the literal, or CPP_OTHER
+   if it was not properly terminated, or CPP_LESS for an unterminated
+   header name which must be relexed as normal tokens.
  
     The spelling is NUL-terminated, but it is not guaranteed that this
     is the first NUL since embedded NULs are preserved.  */
@@ -630,12 +873,24 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
  
    cur = base;
    terminator = *cur++;
-  if (terminator == 'L' || terminator == 'u' || terminator == 'U')
+  if (terminator == 'L' || terminator == 'U')
      terminator = *cur++;
-  if (terminator == '\"')
+  else if (terminator == 'u')
+    {
+      terminator = *cur++;
+      if (terminator == '8')
+       terminator = *cur++;
+    }
+  if (terminator == 'R')
+    {
+      lex_raw_string (pfile, token, base, cur);
+      return;
+    }
+  if (terminator == '"')
      type = (*base == 'L' ? CPP_WSTRING :
             *base == 'U' ? CPP_STRING32 :
-           *base == 'u' ? CPP_STRING16 : CPP_STRING);
+           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
+                        : CPP_STRING);
    else if (terminator == '\'')
      type = (*base == 'L' ? CPP_WCHAR :
             *base == 'U' ? CPP_CHAR32 :
@@ -1095,10 +1350,21 @@ _cpp_lex_direct (cpp_reader *pfile)
      case 'L':
      case 'u':
      case 'U':
-      /* 'L', 'u' or 'U' may introduce wide characters or strings.  */
+    case 'R':
+      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
+        wide strings or raw strings.  */
        if (c == 'L' || CPP_OPTION (pfile, uliterals))
         {
-         if (*buffer->cur == '\'' || *buffer->cur == '"')
+         if ((*buffer->cur == '\'' && c != 'R')
+             || *buffer->cur == '"'
+             || (*buffer->cur == 'R'
+                 && c != 'R'
+                 && buffer->cur[1] == '"'
+                 && CPP_OPTION (pfile, uliterals))
+             || (*buffer->cur == '8'
+                 && c == 'u'
+                 && (buffer->cur[1] == '"'
+                     || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
             {
               lex_string (pfile, result, buffer->cur - 1);
               break;
@@ -1114,22 +1380,22 @@ _cpp_lex_direct (cpp_reader *pfile)
      case 'y': case 'z':
      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
      case 'G': case 'H': case 'I': case 'J': case 'K':
-    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+    case 'M': case 'N': case 'O': case 'P': case 'Q':
      case 'S': case 'T':           case 'V': case 'W': case 'X':
      case 'Y': case 'Z':
        result->type = CPP_NAME;
        {
         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
-       result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
-                                          &nst);
+       result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
+                                               &nst);
         warn_about_normalization (pfile, result, &nst);
        }
  
        /* Convert named operators to their proper types.  */
-      if (result->val.node->flags & NODE_OPERATOR)
+      if (result->val.node.node->flags & NODE_OPERATOR)
         {
           result->flags |= NAMED_OP;
-         result->type = (enum cpp_ttype) result->val.node->directive_index;
+         result->type = (enum cpp_ttype) result->val.node.node->directive_index;
         }
        break;
  
@@ -1244,7 +1510,7 @@ _cpp_lex_direct (cpp_reader *pfile)
               result->flags |= DIGRAPH;
               result->type = CPP_HASH;
               if (*buffer->cur == '%' && buffer->cur[1] == ':')
-               buffer->cur += 2, result->type = CPP_PASTE, result->val.arg_no = 0;
+               buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
             }
           else if (*buffer->cur == '>')
             {
@@ -1325,7 +1591,7 @@ _cpp_lex_direct (cpp_reader *pfile)
      case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
      case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
      case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
-    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.arg_no = 0; break;
+    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
  
      case '?': result->type = CPP_QUERY; break;
      case '~': result->type = CPP_COMPL; break;
@@ -1350,7 +1616,7 @@ _cpp_lex_direct (cpp_reader *pfile)
         if (forms_identifier_p (pfile, true, &nst))
           {
             result->type = CPP_NAME;
-           result->val.node = lex_identifier (pfile, base, true, &nst);
+           result->val.node.node = lex_identifier (pfile, base, true, &nst);
             warn_about_normalization (pfile, result, &nst);
             break;
           }
@@ -1376,7 +1642,7 @@ cpp_token_len (const cpp_token *token)
      {
      default:           len = 6;                                break;
      case SPELL_LITERAL:        len = token->val.str.len;               break;
-    case SPELL_IDENT:  len = NODE_LEN (token->val.node) * 10;  break;
+    case SPELL_IDENT:  len = NODE_LEN (token->val.node.node) * 10;     break;
      }
  
    return len;
@@ -1416,6 +1682,13 @@ utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
    return ucn_len;
  }
  
+/* Given a token TYPE corresponding to a digraph, return a pointer to its
+   spelling; TYPE indexes digraph_spellings relative to CPP_FIRST_DIGRAPH.  */
+static const unsigned char *
+cpp_digraph2name (enum cpp_ttype type)
+{
+  return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
+}
  
  /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
     already contain the enough space to hold the token's spelling.
@@ -1435,8 +1708,7 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
         unsigned char c;
  
         if (token->flags & DIGRAPH)
-         spelling
-           = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
+         spelling = cpp_digraph2name (token->type);
         else if (token->flags & NAMED_OP)
           goto spell_ident;
         else
@@ -1451,23 +1723,23 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
      case SPELL_IDENT:
        if (forstring)
         {
-         memcpy (buffer, NODE_NAME (token->val.node),
-                 NODE_LEN (token->val.node));
-         buffer += NODE_LEN (token->val.node);
+         memcpy (buffer, NODE_NAME (token->val.node.node),
+                 NODE_LEN (token->val.node.node));
+         buffer += NODE_LEN (token->val.node.node);
         }
        else
         {
           size_t i;
-         const unsigned char * name = NODE_NAME (token->val.node);
+         const unsigned char * name = NODE_NAME (token->val.node.node);
           
-         for (i = 0; i < NODE_LEN (token->val.node); i++)
+         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
             if (name[i] & ~0x7F)
               {
                 i += utf8_to_ucn (buffer, name + i) - 1;
                 buffer += 10;
               }
             else
-             *buffer++ = NODE_NAME (token->val.node)[i];
+             *buffer++ = NODE_NAME (token->val.node.node)[i];
         }
        break;
  
@@ -1499,11 +1771,17 @@ cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
    return start;
  }
  
-/* Used by C front ends, which really should move to using
-   cpp_token_as_text.  */
+/* Returns a pointer to a string which spells the token defined by
+   TYPE and FLAGS.  Used by C front ends, which really should move to
+   using cpp_token_as_text.  */
  const char *
-cpp_type2name (enum cpp_ttype type)
+cpp_type2name (enum cpp_ttype type, unsigned char flags)
  {
+  if (flags & DIGRAPH)
+    return (const char *) cpp_digraph2name (type);
+  else if (flags & NAMED_OP)
+    return cpp_named_operator2name (type);
+
    return (const char *) token_spellings[type].name;
  }
  
@@ -1521,8 +1799,7 @@ cpp_output_token (const cpp_token *token, FILE *fp)
         int c;
  
         if (token->flags & DIGRAPH)
-         spelling
-           = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
+         spelling = cpp_digraph2name (token->type);
         else if (token->flags & NAMED_OP)
           goto spell_ident;
         else
@@ -1539,9 +1816,9 @@ cpp_output_token (const cpp_token *token, FILE *fp)
      case SPELL_IDENT:
        {
         size_t i;
-       const unsigned char * name = NODE_NAME (token->val.node);
+       const unsigned char * name = NODE_NAME (token->val.node.node);
         
-       for (i = 0; i < NODE_LEN (token->val.node); i++)
+       for (i = 0; i < NODE_LEN (token->val.node.node); i++)
           if (name[i] & ~0x7F)
             {
               unsigned char buffer[10];
@@ -1549,7 +1826,7 @@ cpp_output_token (const cpp_token *token, FILE *fp)
               fwrite (buffer, 1, 10, fp);
             }
           else
-           fputc (NODE_NAME (token->val.node)[i], fp);
+           fputc (NODE_NAME (token->val.node.node)[i], fp);
        }
        break;
  
@@ -1572,13 +1849,14 @@ _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
        {
        default:                 /* Keep compiler happy.  */
        case SPELL_OPERATOR:
-       /* arg_no is used to track where multiple consecutive ##
+       /* token_no is used to track where multiple consecutive ##
            tokens were originally located.  */
-       return (a->type != CPP_PASTE || a->val.arg_no == b->val.arg_no);
+       return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
        case SPELL_NONE:
-       return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
+       return (a->type != CPP_MACRO_ARG
+               || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
        case SPELL_IDENT:
-       return a->val.node == b->val.node;
+       return a->val.node.node == b->val.node.node;
        case SPELL_LITERAL:
         return (a->val.str.len == b->val.str.len
                 && !memcmp (a->val.str.text, b->val.str.text,
@@ -1890,7 +2168,7 @@ cpp_token_val_index (cpp_token *tok)
        return CPP_TOKEN_FLD_STR;
      case SPELL_OPERATOR:
        if (tok->type == CPP_PASTE)
-       return CPP_TOKEN_FLD_ARG_NO;
+       return CPP_TOKEN_FLD_TOKEN_NO;
        else
         return CPP_TOKEN_FLD_NONE;
      case SPELL_NONE: