#include "system.h"
#include "cpplib.h"
#include "cpphash.h"
-#include "symcat.h"
+
+/* MULTIBYTE_CHARS support only works for native compilers.
+ ??? Ideally what we want is to model widechar support after
+ the current floating point support. */
+#ifdef CROSS_COMPILE
+#undef MULTIBYTE_CHARS
+#endif
+
+#ifdef MULTIBYTE_CHARS
+#include "mbchar.h"
+#include <locale.h>
+#endif
/* Tokens with SPELL_STRING store their spelling in the token list,
and it's length in the token->val.name.len. */
static void lex_percent PARAMS ((cpp_buffer *, cpp_token *));
static void lex_dot PARAMS ((cpp_reader *, cpp_token *));
static int name_p PARAMS ((cpp_reader *, const cpp_string *));
+static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
+ const unsigned char *, unsigned int *));
static cpp_chunk *new_chunk PARAMS ((unsigned int));
static int chunk_suitable PARAMS ((cpp_pool *, cpp_chunk *, unsigned int));
+static unsigned int hex_digit_value PARAMS ((unsigned int));
/* Utility routine:
if (token->type != CPP_NAME)
return 0;
- return !ustrcmp (token->val.node->name, (const U_CHAR *) string);
+ return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
}
/* Call when meeting a newline. Returns the character after the newline
{
cpp_hashnode *result;
cpp_buffer *buffer = pfile->buffer;
- unsigned char *dest, *limit;
- unsigned int r = 0, saw_dollar = 0;
-
- dest = POOL_FRONT (&pfile->ident_pool);
- limit = POOL_LIMIT (&pfile->ident_pool);
+ unsigned int saw_dollar = 0, len;
+ struct obstack *stack = &pfile->hash_table->stack;
do
{
do
{
- /* Need room for terminating null. */
- if (dest + 1 >= limit)
- limit = _cpp_next_chunk (&pfile->ident_pool, 0, &dest);
-
- *dest++ = c;
- r = HASHSTEP (r, c);
+ obstack_1grow (stack, c);
if (c == '$')
saw_dollar++;
cpp_pedwarn (pfile, "'$' character(s) in identifier");
/* Identifiers are null-terminated. */
- *dest = '\0';
+ len = obstack_object_size (stack);
+ obstack_1grow (stack, '\0');
/* This routine commits the memory if necessary. */
- result = _cpp_lookup_with_hash (pfile,
- dest - POOL_FRONT (&pfile->ident_pool), r);
+ result = (cpp_hashnode *)
+ ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
/* Some identifiers require diagnostics when lexed. */
if (result->flags & NODE_DIAGNOSTIC && !pfile->skipping)
{
/* It is allowed to poison the same identifier twice. */
if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
- cpp_error (pfile, "attempt to use poisoned \"%s\"", result->name);
+ cpp_error (pfile, "attempt to use poisoned \"%s\"",
+ NODE_NAME (result));
/* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
replacement list of a variadic macro. */
}
/* Parses a string, character constant, or angle-bracketed header file
- name. Handles embedded trigraphs and escaped newlines.
+ name. Handles embedded trigraphs and escaped newlines. The stored
+ string is guaranteed NUL-terminated, but it is not guaranteed that
+ this is the first NUL since embedded NULs are preserved.
- Multi-line strings are allowed, but they are deprecated within
- directives. */
+ Multi-line strings are allowed, but they are deprecated. */
static void
parse_string (pfile, token, terminator)
cpp_reader *pfile;
for (;;)
{
if (buffer->cur == buffer->rlimit)
+ c = EOF;
+ else
+ c = *buffer->cur++;
+
+ have_char:
+ /* We need space for the terminating NUL. */
+ if (dest >= limit)
+ limit = _cpp_next_chunk (pool, 0, &dest);
+
+ if (c == EOF)
{
- c = EOF;
unterminated (pfile, terminator);
break;
}
- c = *buffer->cur++;
- have_char:
/* Handle trigraphs, escaped newlines etc. */
if (c == '?' || c == '\\')
c = skip_escaped_newlines (buffer, c);
break;
}
+ cpp_pedwarn (pfile, "multi-line string literals are deprecated");
if (pfile->mlstring_pos.line == 0)
- {
- pfile->mlstring_pos = pfile->lexer_pos;
- if (CPP_PEDANTIC (pfile))
- cpp_pedwarn (pfile, "multi-line string constant");
- }
+ pfile->mlstring_pos = pfile->lexer_pos;
- handle_newline (buffer, c); /* Stores to read_ahead. */
- c = '\n';
+ c = handle_newline (buffer, c);
+ *dest++ = '\n';
+ goto have_char;
}
else if (c == '\0')
{
cpp_warning (pfile, "null character(s) preserved in literal");
}
- /* No terminating null for strings - they could contain nulls. */
- if (dest >= limit)
- limit = _cpp_next_chunk (pool, 0, &dest);
*dest++ = c;
-
- /* If we had a new line, the next character is in read_ahead. */
- if (c != '\n')
- continue;
- c = buffer->read_ahead;
- if (c != EOF)
- goto have_char;
}
/* Remember the next character. */
buffer->read_ahead = c;
+ *dest = '\0';
token->val.str.text = POOL_FRONT (pool);
token->val.str.len = dest - token->val.str.text;
- POOL_COMMIT (pool, token->val.str.len);
+ POOL_COMMIT (pool, token->val.str.len + 1);
}
/* The stored comment includes the comment start and any terminator. */
switch (c)
{
case EOF:
- /* Non-empty files should end in a newline. Ignore for command
- line and _Pragma buffers. */
- if (pfile->lexer_pos.col != 0 && !buffer->from_stage3)
+ /* Non-empty files should end in a newline. Checking "bol" too
+ prevents multiple warnings when hitting the EOF more than
+ once, like in a directive. Don't warn for command line and
+ _Pragma buffers. */
+ if (pfile->lexer_pos.col != 0 && !bol && !buffer->from_stage3)
cpp_pedwarn (pfile, "no newline at end of file");
pfile->state.next_bol = 1;
pfile->skipping = 0; /* In case missing #endif. */
ACCEPT_CHAR (CPP_DIV_EQ);
if (c != '/' && c != '*')
break;
-
+
if (c == '*')
{
if (skip_block_comment (pfile))
}
/* Skip_line_comment updates buffer->read_ahead. */
- if (skip_line_comment (pfile))
+ if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
cpp_warning_with_line (pfile, pfile->lexer_pos.line,
pfile->lexer_pos.col,
"multi-line comment");
result->type = CPP_HASH;
do_hash:
- if (bol)
- {
- if (pfile->state.parsing_args)
- {
- /* 6.10.3 paragraph 11: If there are sequences of
- preprocessing tokens within the list of arguments that
- would otherwise act as preprocessing directives, the
- behavior is undefined.
-
- This implementation will report a hard error, terminate
- the macro invocation, and proceed to process the
- directive. */
- cpp_error (pfile,
- "directives may not be used inside a macro argument");
-
- /* Put a '#' in lookahead, return CPP_EOF for parse_arg. */
- buffer->extra_char = buffer->read_ahead;
- buffer->read_ahead = '#';
- pfile->state.next_bol = 1;
- result->type = CPP_EOF;
+ if (!bol)
+ break;
+ /* 6.10.3 paragraph 11: If there are sequences of preprocessing
+ tokens within the list of arguments that would otherwise act
+ as preprocessing directives, the behavior is undefined.
- /* Get whitespace right - newline_in_args sets it. */
- if (pfile->lexer_pos.col == 1)
- result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
- }
- else
- {
- /* This is the hash introducing a directive. */
- if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
- goto done_directive; /* bol still 1. */
- /* This is in fact an assembler #. */
- }
+ This implementation will report a hard error, terminate the
+ macro invocation, and proceed to process the directive. */
+ if (pfile->state.parsing_args)
+ {
+ if (pfile->state.parsing_args == 2)
+ cpp_error (pfile,
+ "directives may not be used inside a macro argument");
+
+ /* Put a '#' in lookahead, return CPP_EOF for parse_arg. */
+ buffer->extra_char = buffer->read_ahead;
+ buffer->read_ahead = '#';
+ pfile->state.next_bol = 1;
+ result->type = CPP_EOF;
+
+ /* Get whitespace right - newline_in_args sets it. */
+ if (pfile->lexer_pos.col == 1)
+ result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
+ }
+ else
+ {
+ /* This is the hash introducing a directive. */
+ if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
+ goto done_directive; /* bol still 1. */
+ /* This is in fact an assembler #. */
}
break;
case '}': result->type = CPP_CLOSE_BRACE; break;
case ';': result->type = CPP_SEMICOLON; break;
- case '@':
- if (CPP_OPTION (pfile, objc))
- {
- /* In Objective C, '@' may begin keywords or strings, like
- @keyword or @"string". It would be nice to call
- get_effective_char here and test the result. However, we
- would then need to pass 2 characters to parse_identifier,
- making it ugly and slowing down its main loop. Instead,
- we assume we have an identifier, and recover if not. */
- result->type = CPP_NAME;
- result->val.node = parse_identifier (pfile, c);
- if (result->val.node->length != 1)
- break;
-
- /* OK, so it wasn't an identifier. Maybe a string? */
- if (buffer->read_ahead == '"')
- {
- c = '"';
- ACCEPT_CHAR (CPP_OSTRING);
- goto make_string;
- }
- }
- goto random_char;
+ /* @ is a punctuator in Objective C. */
+ case '@': result->type = CPP_ATSIGN; break;
random_char:
default:
switch (TOKEN_SPELL (token))
{
- default: len = 0; break;
- case SPELL_STRING: len = token->val.str.len; break;
- case SPELL_IDENT: len = token->val.node->length; break;
+ default: len = 0; break;
+ case SPELL_STRING: len = token->val.str.len; break;
+ case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
}
/* 1 for whitespace, 4 for comment delimeters. */
return len + 5;
case SPELL_IDENT:
spell_ident:
- memcpy (buffer, token->val.node->name, token->val.node->length);
- buffer += token->val.node->length;
+ memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
+ buffer += NODE_LEN (token->val.node);
break;
case SPELL_STRING:
{
case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
- case CPP_OSTRING: left = '"'; right = '"'; tag = '@'; break;
case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
spell_ident:
case SPELL_IDENT:
- ufputs (token->val.node->name, fp);
+ ufputs (NODE_NAME (token->val.node), fp);
break;
case SPELL_STRING:
{
case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
- case CPP_OSTRING: left = '"'; right = '"'; tag = '@'; break;
case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
return 0;
}
-#if 0
-/* Compare two token lists. */
-int
-_cpp_equiv_toklists (a, b)
- const struct toklist *a, *b;
-{
- unsigned int i, count;
-
- count = a->limit - a->first;
- if (count != (b->limit - b->first))
- return 0;
-
- for (i = 0; i < count; i++)
- if (! _cpp_equiv_tokens (&a->first[i], &b->first[i]))
- return 0;
-
- return 1;
-}
-#endif
-
/* Determine whether two tokens can be pasted together, and if so,
what the resulting token is. Returns CPP_EOF if the tokens cannot
be pasted, or the appropriate type for the merged token if they
return CPP_NUMBER;
break;
- case CPP_OTHER:
- if (CPP_OPTION (pfile, objc) && token1->val.c == '@')
- {
- if (b == CPP_NAME) return CPP_NAME;
- if (b == CPP_STRING) return CPP_OSTRING;
- }
-
default:
break;
}
putc ('\n', fp);
}
+/* Map the hexadecimal digit character C to the number it denotes
+   (0 through 15).  Letters are accepted in either case.  Aborts if
+   C is not a hexadecimal digit.  */
+static unsigned int
+hex_digit_value (c)
+     unsigned int c;
+{
+  unsigned int value;
+
+  if (c >= '0' && c <= '9')
+    value = c - '0';
+  else if (c >= 'a' && c <= 'f')
+    value = c - 'a' + 10;
+  else if (c >= 'A' && c <= 'F')
+    value = c - 'A' + 10;
+  else
+    abort ();
+
+  return value;
+}
+
+/* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
+ failure if cpplib is not parsing C++ or C99. Such failure is
+ silent, and no variables are updated. Otherwise returns 0, and
+ warns if -Wtraditional.
+
+ [lex.charset]: The character designated by the universal character
+ name \UNNNNNNNN is that character whose character short name in
+ ISO/IEC 10646 is NNNNNNNN; the character designated by the
+ universal character name \uNNNN is that character whose character
+ short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
+ for a universal character name is less than 0x20 or in the range
+ 0x7F-0x9F (inclusive), or if the universal character name
+ designates a character in the basic source character set, then the
+ program is ill-formed.
+
+ We assume that wchar_t is Unicode, so we don't need to do any
+ mapping. Is this ever wrong?
+
+ On entry *PC holds the 'u' or 'U' and *PSTR points to the byte
+ after it; LIMIT is the end of the string or charconst. On a return
+ of 0, *PSTR is updated to point after the UCS and the UCS value is
+ written into *PC. This also happens when the UCS was malformed and
+ an error was issued: *PC then receives whatever value had been
+ accumulated so far. */
+
+static int
+maybe_read_ucs (pfile, pstr, limit, pc)
+ cpp_reader *pfile;
+ const unsigned char **pstr;
+ const unsigned char *limit;
+ unsigned int *pc;
+{
+ const unsigned char *p = *pstr;
+ unsigned int code = 0;
+ unsigned int c = *pc, length;
+
+ /* Only attempt to interpret a UCS for C++ and C99. */
+ if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
+ return 1;
+
+ if (CPP_WTRADITIONAL (pfile))
+ cpp_warning (pfile, "the meaning of '\\%c' varies with -traditional", c);
+
+ /* Number of hex digits required: four after 'u', eight after 'U'. */
+ length = (c == 'u' ? 4: 8);
+
+ if ((size_t) (limit - p) < length)
+ {
+ cpp_error (pfile, "incomplete universal-character-name");
+ /* Skip to the end to avoid more diagnostics. */
+ p = limit;
+ }
+ else
+ {
+ /* LENGTH counts down to zero only if every digit is valid; the
+ range check below relies on this to detect an earlier error. */
+ for (; length; length--, p++)
+ {
+ c = *p;
+ if (ISXDIGIT (c))
+ code = (code << 4) + hex_digit_value (c);
+ else
+ {
+ cpp_error (pfile,
+ "non-hex digit '%c' in universal-character-name", c);
+ /* We shouldn't skip in case there are multibyte chars. */
+ break;
+ }
+ }
+ }
+
+#ifdef TARGET_EBCDIC
+ cpp_error (pfile, "universal-character-name on EBCDIC target");
+ code = 0x3f; /* EBCDIC invalid character */
+#else
+ /* True extended characters are OK. Values with the top bit set
+ and the range 0xD800-0xDFFF are excluded. */
+ if (code >= 0xa0
+ && !(code & 0x80000000)
+ && !(code >= 0xD800 && code <= 0xDFFF))
+ ;
+ /* The standard permits $, @ and ` to be specified as UCNs. We use
+ hex escapes so that this also works with EBCDIC hosts. */
+ else if (code == 0x24 || code == 0x40 || code == 0x60)
+ ;
+ /* Don't give another error if one occurred above. */
+ else if (length == 0)
+ cpp_error (pfile, "universal-character-name out of range");
+#endif
+
+ *pstr = p;
+ *pc = code;
+ return 0;
+}
+
+/* Interpret an escape sequence, and return its value. PSTR points to
+ the input pointer, which is just after the backslash. LIMIT is how
+ much text we have. MASK is a bitmask for the precision for the
+ destination type (char or wchar_t). TRADITIONAL, if true, does not
+ interpret escapes that did not exist in traditional C.
+
+ On return *PSTR has been advanced past the characters consumed by
+ the escape. The first character after the backslash is read
+ without comparing the pointer against LIMIT.
+
+ Handles all relevant diagnostics. */
+
+unsigned int
+cpp_parse_escape (pfile, pstr, limit, mask, traditional)
+ cpp_reader *pfile;
+ const unsigned char **pstr;
+ const unsigned char *limit;
+ unsigned HOST_WIDE_INT mask;
+ int traditional;
+{
+ int unknown = 0;
+ const unsigned char *str = *pstr;
+ unsigned int c = *str++;
+
+ switch (c)
+ {
+ /* These escapes stand for the character itself. */
+ case '\\': case '\'': case '"': case '?': break;
+ case 'b': c = TARGET_BS; break;
+ case 'f': c = TARGET_FF; break;
+ case 'n': c = TARGET_NEWLINE; break;
+ case 'r': c = TARGET_CR; break;
+ case 't': c = TARGET_TAB; break;
+ case 'v': c = TARGET_VT; break;
+
+ case '(': case '{': case '[': case '%':
+ /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
+ '\%' is used to prevent SCCS from getting confused. */
+ unknown = CPP_PEDANTIC (pfile);
+ break;
+
+ case 'a':
+ if (CPP_WTRADITIONAL (pfile))
+ cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
+ /* When TRADITIONAL, C is left as the letter 'a'. */
+ if (!traditional)
+ c = TARGET_BELL;
+ break;
+
+ case 'e': case 'E':
+ if (CPP_PEDANTIC (pfile))
+ cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
+ c = TARGET_ESC;
+ break;
+
+ case 'u': case 'U':
+ /* maybe_read_ucs returns 1 without touching STR or C when UCNs
+ are not enabled, so the escape is then reported as unknown. */
+ unknown = maybe_read_ucs (pfile, &str, limit, &c);
+ break;
+
+ case 'x':
+ if (CPP_WTRADITIONAL (pfile))
+ cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
+
+ /* When TRADITIONAL, no digits are consumed and C is left as the
+ letter 'x'. */
+ if (!traditional)
+ {
+ unsigned int i = 0, overflow = 0;
+ int digits_found = 0;
+
+ while (str < limit)
+ {
+ c = *str;
+ if (! ISXDIGIT (c))
+ break;
+ str++;
+ /* Becomes nonzero if any of the top four bits of I are set,
+ i.e. if the shift below is about to discard them. */
+ overflow |= i ^ (i << 4 >> 4);
+ i = (i << 4) + hex_digit_value (c);
+ digits_found = 1;
+ }
+
+ if (!digits_found)
+ cpp_error (pfile, "\\x used with no following hex digits");
+
+ if (overflow | (i != (i & mask)))
+ {
+ cpp_pedwarn (pfile, "hex escape sequence out of range");
+ i &= mask;
+ }
+ c = i;
+ }
+ break;
+
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ {
+ unsigned int i = c - '0';
+ int count = 0;
+
+ /* Accept at most two further octal digits, three in total. */
+ while (str < limit && ++count < 3)
+ {
+ c = *str;
+ if (c < '0' || c > '7')
+ break;
+ str++;
+ i = (i << 3) + c - '0';
+ }
+
+ if (i != (i & mask))
+ {
+ cpp_pedwarn (pfile, "octal escape sequence out of range");
+ i &= mask;
+ }
+ c = i;
+ }
+ break;
+
+ default:
+ unknown = 1;
+ break;
+ }
+
+ if (unknown)
+ {
+ /* Show the character itself when ISGRAPH accepts it, otherwise
+ its octal value. */
+ if (ISGRAPH (c))
+ cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
+ else
+ cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
+ }
+
+ /* Hex and octal values were already masked above, so this catches
+ the remaining cases, such as a UCS wider than the target type. */
+ if (c > mask)
+ cpp_pedwarn (pfile, "escape sequence out of range for character");
+
+ *pstr = str;
+ return c;
+}
+
+/* Widths in bits used when interpreting character constants. Fall
+ back to CHAR_TYPE_SIZE and WCHAR_TYPE_SIZE when the MAX_ variants
+ have not been defined elsewhere. */
+#ifndef MAX_CHAR_TYPE_SIZE
+#define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
+#endif
+
+#ifndef MAX_WCHAR_TYPE_SIZE
+#define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
+#endif
+
+/* Interpret a (possibly wide) character constant in TOKEN.
+   WARN_MULTI warns about multi-character charconsts, if not
+   TRADITIONAL.  TRADITIONAL also indicates not to interpret escapes
+   that did not exist in traditional C.  PCHARS_SEEN points to a
+   variable that is filled in with the number of characters seen,
+   clamped to the number that fit in a HOST_WIDE_INT.
+
+   Returns the value of the constant.  For a CPP_CHAR token the value
+   is sign-extended from the bits actually used, unless
+   __CHAR_UNSIGNED__ is defined as a macro, in which case it is
+   zero-extended.  Wide constants are returned as accumulated.  */
+HOST_WIDE_INT
+cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
+     cpp_reader *pfile;
+     const cpp_token *token;
+     int warn_multi;
+     int traditional;
+     unsigned int *pchars_seen;
+{
+  const unsigned char *str = token->val.str.text;
+  const unsigned char *limit = str + token->val.str.len;
+  unsigned int chars_seen = 0;
+  unsigned int width, max_chars, c;
+  unsigned HOST_WIDE_INT mask;
+  HOST_WIDE_INT result = 0;
+
+#ifdef MULTIBYTE_CHARS
+  /* Presumably resets the multibyte conversion state, as a null
+     string argument does for mbtowc.  */
+  (void) local_mbtowc (NULL, NULL, 0);
+#endif
+
+  /* Width in bits.  */
+  if (token->type == CPP_CHAR)
+    width = MAX_CHAR_TYPE_SIZE;
+  else
+    width = MAX_WCHAR_TYPE_SIZE;
+
+  if (width < HOST_BITS_PER_WIDE_INT)
+    mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
+  else
+    mask = ~(unsigned HOST_WIDE_INT) 0;
+  max_chars = HOST_BITS_PER_WIDE_INT / width;
+
+  while (str < limit)
+    {
+#ifdef MULTIBYTE_CHARS
+      wchar_t wc;
+      int char_len;
+
+      char_len = local_mbtowc (&wc, str, limit - str);
+      if (char_len <= 0)
+        {
+          /* A zero length (what mbtowc reports for a NUL byte, and
+             embedded NULs are preserved in literals) must still
+             consume a byte, otherwise STR never advances and this
+             loop does not terminate.  */
+          if (char_len < 0)
+            cpp_warning (pfile, "ignoring invalid multibyte character");
+          c = *str++;
+        }
+      else
+        {
+          str += char_len;
+          c = wc;
+        }
+#else
+      c = *str++;
+#endif
+
+      if (c == '\\')
+        c = cpp_parse_escape (pfile, &str, limit, mask, traditional);
+
+#ifdef MAP_CHARACTER
+      if (ISPRINT (c))
+        c = MAP_CHARACTER (c);
+#endif
+
+      /* Merge character into result; ignore excess chars.  The shift
+         is done on the unsigned type so that bits moving into the
+         sign position are well defined.  */
+      if (++chars_seen <= max_chars)
+        {
+          if (width < HOST_BITS_PER_WIDE_INT)
+            result = (HOST_WIDE_INT)
+              (((unsigned HOST_WIDE_INT) result << width) | (c & mask));
+          else
+            result = c;
+        }
+    }
+
+  if (chars_seen == 0)
+    cpp_error (pfile, "empty character constant");
+  else if (chars_seen > max_chars)
+    {
+      chars_seen = max_chars;
+      cpp_warning (pfile, "character constant too long");
+    }
+  else if (chars_seen > 1 && !traditional && warn_multi)
+    cpp_warning (pfile, "multi-character character constant");
+
+  /* If char type is signed, sign-extend the constant.  The
+     __CHAR_UNSIGNED__ macro is set by the driver if appropriate.  */
+  if (token->type == CPP_CHAR && chars_seen)
+    {
+      unsigned int nbits = chars_seen * width;
+
+      /* When the constant already fills a HOST_WIDE_INT there is
+         nothing to extend.  Otherwise build the mask in the full
+         result type: a mask of type unsigned int would be
+         zero-extended when inverted, leaving a negative constant
+         positive on hosts where HOST_WIDE_INT is wider than int.  */
+      if (nbits < HOST_BITS_PER_WIDE_INT)
+        {
+          unsigned HOST_WIDE_INT used_bits
+            = ((unsigned HOST_WIDE_INT) 1 << nbits) - 1;
+
+          if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
+              || ((result >> (nbits - 1)) & 1) == 0)
+            result &= used_bits;
+          else
+            result |= ~used_bits;
+        }
+    }
+
+  *pchars_seen = chars_seen;
+  return result;
+}
+
/* Memory pools. */
struct dummy