#include "cpplib.h"
#include "cpphash.h"
-/* Tokens with SPELL_STRING store their spelling in the token list,
- and it's length in the token->val.name.len. */
enum spell_type
{
SPELL_OPERATOR = 0,
- SPELL_CHAR,
SPELL_IDENT,
- SPELL_NUMBER,
- SPELL_STRING,
+ SPELL_LITERAL,
SPELL_NONE
};
static void add_line_note PARAMS ((cpp_buffer *, const uchar *, unsigned int));
static int skip_line_comment PARAMS ((cpp_reader *));
static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
-static cpp_hashnode *lex_identifier PARAMS ((cpp_reader *));
+static cpp_hashnode *lex_identifier PARAMS ((cpp_reader *, const uchar *));
static void lex_number PARAMS ((cpp_reader *, cpp_string *));
-static bool continues_identifier_p PARAMS ((cpp_reader *));
-static void lex_string PARAMS ((cpp_reader *, cpp_token *));
+static bool forms_identifier_p PARAMS ((cpp_reader *, int));
+static void lex_string PARAMS ((cpp_reader *, cpp_token *, const uchar *));
static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
cppchar_t));
+static void create_literal PARAMS ((cpp_reader *, cpp_token *, const uchar *,
+ unsigned int, enum cpp_ttype));
+static bool warn_in_comment PARAMS ((cpp_reader *, _cpp_line_note *));
static int name_p PARAMS ((cpp_reader *, const cpp_string *));
-static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
- const unsigned char *, cppchar_t *));
+static cppchar_t maybe_read_ucn PARAMS ((cpp_reader *, const uchar **));
static tokenrun *next_tokenrun PARAMS ((tokenrun *));
static unsigned int hex_digit_value PARAMS ((unsigned int));
if (p == buffer->next_line || p[-1] != '\\')
break;
- add_line_note (buffer, p - 1,
- p != d ? NOTE_ESC_SPACE_NL: NOTE_ESC_NL);
+ add_line_note (buffer, p - 1, p != d ? ' ': '\\');
d = p - 2;
buffer->next_line = p - 1;
}
else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
{
/* Add a note regardless, for the benefit of -Wtrigraphs. */
- add_line_note (buffer, d, NOTE_TRIGRAPH);
+ add_line_note (buffer, d, s[2]);
if (CPP_OPTION (pfile, trigraphs))
{
*d = _cpp_trigraph_map[s[2]];
}
*d = '\n';
- add_line_note (buffer, d + 1, NOTE_NEWLINE);
+ /* A sentinel note that should never be processed. */
+ add_line_note (buffer, d + 1, '\n');
buffer->next_line = s + 1;
}
+/* Return true if the trigraph indicated by NOTE should be warned
+ about in a comment. PFILE supplies the trigraphs option. NOTE
+ must be followed by another line note whenever its type is '/',
+ because note[1].pos is read on both of the remaining paths. */
+static bool
+warn_in_comment (pfile, note)
+ cpp_reader *pfile;
+ _cpp_line_note *note;
+{
+ const uchar *p;
+
+ /* Within comments we don't warn about trigraphs, unless the
+ trigraph forms an escaped newline, as that may change
+ behaviour. A trigraph note's type is the trigraph's third
+ character, so only a note of type '/' (the trigraph ??/) is a
+ candidate; any other type yields false here. */
+ if (note->type != '/')
+ return false;
+
+ /* If -trigraphs, then this was an escaped newline iff the next note
+ is coincident. */
+ if (CPP_OPTION (pfile, trigraphs))
+ return note[1].pos == note->pos;
+
+ /* Otherwise, see if this forms an escaped newline: step over three
+ characters (presumably the unconverted ??/ itself) and then over
+ anything is_nvspace accepts. */
+ p = note->pos + 3;
+ while (is_nvspace (*p))
+ p++;
+
+ /* There might have been escaped newlines between the trigraph and the
+ newline we found. Hence the position test. */
+ return (*p == '\n' && p < note[1].pos);
+}
+
/* Process the notes created by add_line_note as far as the current
location. */
void
buffer->cur_note++;
col = CPP_BUF_COLUMN (buffer, note->pos + 1);
- switch (note->type)
+ if (note->type == '\\' || note->type == ' ')
{
- case NOTE_NEWLINE:
- /* This note is a kind of sentinel we should never reach. */
- abort ();
-
- case NOTE_TRIGRAPH:
- if (!in_comment && CPP_OPTION (pfile, warn_trigraphs))
- {
- if (CPP_OPTION (pfile, trigraphs))
- cpp_error_with_line (pfile, DL_WARNING, pfile->line, col,
- "trigraph converted to %c",
- (int) note->pos[0]);
- else
- cpp_error_with_line (pfile, DL_WARNING, pfile->line, col,
- "trigraph ??%c ignored",
- (int) note->pos[2]);
- }
- break;
-
- case NOTE_ESC_SPACE_NL:
- if (!in_comment)
+ if (note->type == ' ' && !in_comment)
cpp_error_with_line (pfile, DL_WARNING, pfile->line, col,
"backslash and newline separated by space");
- /* Fall through... */
- case NOTE_ESC_NL:
+
if (buffer->next_line > buffer->rlimit)
{
cpp_error_with_line (pfile, DL_PEDWARN, pfile->line, col,
buffer->line_base = note->pos;
pfile->line++;
}
+ else if (_cpp_trigraph_map[note->type])
+ {
+ if (CPP_OPTION (pfile, warn_trigraphs)
+ && (!in_comment || warn_in_comment (pfile, note)))
+ {
+ if (CPP_OPTION (pfile, trigraphs))
+ cpp_error_with_line (pfile, DL_WARNING, pfile->line, col,
+ "trigraph ??%c converted to %c",
+ note->type,
+ (int) _cpp_trigraph_map[note->type]);
+ else
+ cpp_error_with_line (pfile, DL_WARNING, pfile->line, col,
+ "trigraph ??%c ignored",
+ note->type);
+ }
+ }
+ else
+ abort ();
}
}
}
}
+ _cpp_process_line_notes (pfile, true);
return false;
}
}
/* Returns TRUE if the sequence starting at buffer->cur is invalid in
- an identifier. */
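+ an identifier. FIRST is TRUE if this starts an identifier.
+ NOTE(review): despite "invalid" above, the code returns TRUE when
+ the sequence is accepted (advancing buffer->cur past it) and FALSE
+ otherwise; the wording looks inverted. */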
static bool
-continues_identifier_p (pfile)
+forms_identifier_p (pfile, first)
cpp_reader *pfile;
+ int first;
{
- if (*pfile->buffer->cur != '$' || !CPP_OPTION (pfile, dollars_in_ident))
- return false;
+ cpp_buffer *buffer = pfile->buffer;
- if (CPP_PEDANTIC (pfile) && !pfile->state.skipping && !pfile->warned_dollar)
+ if (*buffer->cur == '$')
{
- pfile->warned_dollar = true;
- cpp_error (pfile, DL_PEDWARN, "'$' in identifier or number");
+ if (!CPP_OPTION (pfile, dollars_in_ident))
+ return false;
+
+ buffer->cur++;
+ if (pfile->warn_dollars && !pfile->state.skipping)
+ {
+ pfile->warn_dollars = false;
+ cpp_error (pfile, DL_PEDWARN, "'$' in identifier or number");
+ }
+
+ return true;
}
- pfile->buffer->cur++;
- return true;
+ /* Is this a syntactically valid UCN? */
+ if (0 && *buffer->cur == '\\'
+ && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+ {
+ buffer->cur += 2;
+ if (_cpp_valid_ucn (pfile, &buffer->cur, 1 + !first))
+ return true;
+ buffer->cur -= 2;
+ }
+
+ return false;
}
/* Lex an identifier starting at BUFFER->CUR - 1. */
static cpp_hashnode *
-lex_identifier (pfile)
+lex_identifier (pfile, base)
cpp_reader *pfile;
+ const uchar *base;
{
cpp_hashnode *result;
- const uchar *cur, *base;
+ const uchar *cur;
- base = pfile->buffer->cur - 1;
do
{
cur = pfile->buffer->cur;
pfile->buffer->cur = cur;
}
- while (continues_identifier_p (pfile));
+ while (forms_identifier_p (pfile, false));
result = (cpp_hashnode *)
ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
pfile->buffer->cur = cur;
}
- while (continues_identifier_p (pfile));
+ while (forms_identifier_p (pfile, false));
number->len = cur - base;
dest = _cpp_unaligned_alloc (pfile, number->len + 1);
number->text = dest;
}
+/* Create a token of type TYPE with a literal spelling. LEN bytes
+ starting at BASE are copied into LEN + 1 bytes obtained from
+ _cpp_unaligned_alloc and the copy is NUL-terminated. TOKEN's type
+ is set to TYPE, and its val.str is set to describe the copy; the
+ stored length is LEN, i.e. it excludes the terminating NUL. */
+static void
+create_literal (pfile, token, base, len, type)
+ cpp_reader *pfile;
+ cpp_token *token;
+ const uchar *base;
+ unsigned int len;
+ enum cpp_ttype type;
+{
+ uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
+
+ memcpy (dest, base, len);
+ dest[len] = '\0';
+ token->type = type;
+ token->val.str.len = len;
+ token->val.str.text = dest;
+}
+
/* Lexes a string, character constant, or angle-bracketed header file
- name. The stored string is guaranteed NUL-terminated, but it is
- not guaranteed that this is the first NUL since embedded NULs are
- preserved. */
+ name. The stored string contains the spelling, including the
+ opening quote and any leading 'L'. TOKEN's type is set to the type
+ of the literal, or CPP_OTHER if it was not properly terminated.
+
+ The spelling is NUL-terminated, but it is not guaranteed that this
+ is the first NUL since embedded NULs are preserved. */
static void
-lex_string (pfile, token)
+lex_string (pfile, token, base)
cpp_reader *pfile;
cpp_token *token;
+ const uchar *base;
{
- cpp_buffer *buffer = pfile->buffer;
- bool warned_nulls = false;
- const uchar *base;
- uchar *dest;
+ bool saw_NUL = false;
+ const uchar *cur;
cppchar_t terminator;
-
- base = buffer->cur;
- terminator = base[-1];
- if (terminator == '<')
- terminator = '>';
+ enum cpp_ttype type;
+
+ cur = base;
+ terminator = *cur++;
+ if (terminator == 'L')
+ terminator = *cur++;
+ if (terminator == '\"')
+ type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
+ else if (terminator == '\'')
+ type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
+ else
+ terminator = '>', type = CPP_HEADER_NAME;
for (;;)
{
- cppchar_t c = *buffer->cur++;
+ cppchar_t c = *cur++;
/* In #include-style directives, terminators are not escapable. */
- if (c == '\\' && !pfile->state.angled_headers && *buffer->cur != '\n')
- buffer->cur++;
- else if (c == terminator || c == '\n')
+ if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
+ cur++;
+ else if (c == terminator)
break;
- else if (c == '\0')
+ else if (c == '\n')
{
- if (!warned_nulls)
- {
- warned_nulls = true;
- cpp_error (pfile, DL_WARNING,
- "null character(s) preserved in literal");
- }
+ cur--;
+ type = CPP_OTHER;
+ break;
}
+ else if (c == '\0')
+ saw_NUL = true;
}
- token->val.str.len = buffer->cur - base - 1;
- dest = _cpp_unaligned_alloc (pfile, token->val.str.len + 1);
- memcpy (dest, base, token->val.str.len);
- dest[token->val.str.len] = '\0';
- token->val.str.text = dest;
+ if (saw_NUL && !pfile->state.skipping)
+ cpp_error (pfile, DL_WARNING, "null character(s) preserved in literal");
- if (buffer->cur[-1] == '\n')
- {
- /* No string literal may extend over multiple lines. In
- assembly language, suppress the error except for <>
- includes. This is a kludge around not knowing where
- comments are. */
- if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
- cpp_error (pfile, DL_ERROR, "missing terminating %c character",
- (int) terminator);
- buffer->cur--;
- }
+ pfile->buffer->cur = cur;
+ create_literal (pfile, token, base, cur - base, type);
}
/* The stored comment includes the comment start and any terminator. */
"no newline at end of file");
}
+ if (!buffer->prev)
+ return false;
+
if (buffer->return_at_eof)
{
- buffer->return_at_eof = false;
+ _cpp_pop_buffer (pfile);
return false;
}
- if (!buffer->prev)
- return false;
-
_cpp_pop_buffer (pfile);
}
}
if (!_cpp_get_fresh_line (pfile))
{
result->type = CPP_EOF;
+ if (!pfile->state.in_directive)
+ {
+ /* Tell the compiler the line number of the EOF token. */
+ result->line = pfile->line;
+ result->flags = BOL;
+ }
return result;
}
if (!pfile->keep_tokens)
/* 'L' may introduce wide characters or strings. */
if (*buffer->cur == '\'' || *buffer->cur == '"')
{
- result->type = (*buffer->cur == '"' ? CPP_WSTRING: CPP_WCHAR);
- buffer->cur++;
- lex_string (pfile, result);
+ lex_string (pfile, result, buffer->cur - 1);
break;
}
/* Fall through. */
- start_ident:
case '_':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
case 'Y': case 'Z':
result->type = CPP_NAME;
- result->val.node = lex_identifier (pfile);
+ result->val.node = lex_identifier (pfile, buffer->cur - 1);
/* Convert named operators to their proper types. */
if (result->val.node->flags & NODE_OPERATOR)
case '\'':
case '"':
- result->type = c == '"' ? CPP_STRING: CPP_CHAR;
- lex_string (pfile, result);
+ lex_string (pfile, result, buffer->cur - 1);
break;
case '/':
case '<':
if (pfile->state.angled_headers)
{
- result->type = CPP_HEADER_NAME;
- lex_string (pfile, result);
+ lex_string (pfile, result, buffer->cur - 1);
break;
}
case '@': result->type = CPP_ATSIGN; break;
case '$':
- if (CPP_OPTION (pfile, dollars_in_ident))
- goto start_ident;
- /* Fall through... */
+ case '\\':
+ {
+ const uchar *base = --buffer->cur;
+
+ if (forms_identifier_p (pfile, true))
+ {
+ result->type = CPP_NAME;
+ result->val.node = lex_identifier (pfile, base);
+ break;
+ }
+ buffer->cur++;
+ }
default:
- result->type = CPP_OTHER;
- result->val.c = c;
+ create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
break;
}
return result;
}
-/* An upper bound on the number of bytes needed to spell TOKEN,
- including preceding whitespace. */
+/* An upper bound on the number of bytes needed to spell TOKEN.
+ Does not include preceding whitespace. */
unsigned int
cpp_token_len (token)
const cpp_token *token;
switch (TOKEN_SPELL (token))
{
- default: len = 0; break;
- case SPELL_NUMBER:
- case SPELL_STRING: len = token->val.str.len; break;
+ default: len = 4; break;
+ case SPELL_LITERAL: len = token->val.str.len; break;
case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
}
- /* 1 for whitespace, 4 for comment delimiters. */
- return len + 5;
+
+ return len;
}
/* Write the spelling of a token TOKEN to BUFFER. The buffer must
}
break;
- case SPELL_CHAR:
- *buffer++ = token->val.c;
- break;
-
spell_ident:
case SPELL_IDENT:
memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
buffer += NODE_LEN (token->val.node);
break;
- case SPELL_NUMBER:
+ case SPELL_LITERAL:
memcpy (buffer, token->val.str.text, token->val.str.len);
buffer += token->val.str.len;
break;
- case SPELL_STRING:
- {
- int left, right, tag;
- switch (token->type)
- {
- case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
- case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
- case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
- case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
- case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
- default:
- cpp_error (pfile, DL_ICE, "unknown string token %s\n",
- TOKEN_NAME (token));
- return buffer;
- }
- if (tag) *buffer++ = tag;
- *buffer++ = left;
- memcpy (buffer, token->val.str.text, token->val.str.len);
- buffer += token->val.str.len;
- *buffer++ = right;
- }
- break;
-
case SPELL_NONE:
cpp_error (pfile, DL_ICE, "unspellable token %s", TOKEN_NAME (token));
break;
cpp_token_as_text (pfile, token)
cpp_reader *pfile;
const cpp_token *token;
-{
- unsigned int len = cpp_token_len (token);
+{
+ unsigned int len = cpp_token_len (token) + 1;
unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
end = cpp_spell_token (pfile, token, start);
}
break;
- case SPELL_CHAR:
- putc (token->val.c, fp);
- break;
-
spell_ident:
case SPELL_IDENT:
fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
break;
- case SPELL_NUMBER:
+ case SPELL_LITERAL:
fwrite (token->val.str.text, 1, token->val.str.len, fp);
break;
- case SPELL_STRING:
- {
- int left, right, tag;
- switch (token->type)
- {
- case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
- case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
- case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
- case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
- case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
- default:
- fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
- return;
- }
- if (tag) putc (tag, fp);
- putc (left, fp);
- fwrite (token->val.str.text, 1, token->val.str.len, fp);
- putc (right, fp);
- }
- break;
-
case SPELL_NONE:
/* An error, most probably. */
break;
default: /* Keep compiler happy. */
case SPELL_OPERATOR:
return 1;
- case SPELL_CHAR:
- return a->val.c == b->val.c; /* Character. */
case SPELL_NONE:
return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
case SPELL_IDENT:
return a->val.node == b->val.node;
- case SPELL_NUMBER:
- case SPELL_STRING:
+ case SPELL_LITERAL:
return (a->val.str.len == b->val.str.len
&& !memcmp (a->val.str.text, b->val.str.text,
a->val.str.len));
|| b == CPP_CHAR || b == CPP_STRING); /* L */
case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
|| c == '.' || c == '+' || c == '-');
- case CPP_OTHER: return (CPP_OPTION (pfile, objc)
- && token1->val.c == '@'
- && (b == CPP_NAME || b == CPP_STRING));
+ /* UCNs */
+ case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
+ && b == CPP_NAME)
+ || (CPP_OPTION (pfile, objc)
+ && token1->val.str.text[0] == '@'
+ && (b == CPP_NAME || b == CPP_STRING)));
default: break;
}
abort ();
}
-/* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
- failure if cpplib is not parsing C++ or C99. Such failure is
- silent, and no variables are updated. Otherwise returns 0, and
- warns if -Wtraditional.
-
- [lex.charset]: The character designated by the universal character
- name \UNNNNNNNN is that character whose character short name in
- ISO/IEC 10646 is NNNNNNNN; the character designated by the
- universal character name \uNNNN is that character whose character
- short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
- for a universal character name is less than 0x20 or in the range
- 0x7F-0x9F (inclusive), or if the universal character name
- designates a character in the basic source character set, then the
- program is ill-formed.
-
- We assume that wchar_t is Unicode, so we don't need to do any
- mapping. Is this ever wrong?
-
- PC points to the 'u' or 'U', PSTR is points to the byte after PC,
- LIMIT is the end of the string or charconst. PSTR is updated to
- point after the UCS on return, and the UCS is written into PC. */
-
-static int
-maybe_read_ucs (pfile, pstr, limit, pc)
+/* Read a possible universal character name starting at *PSTR. */
+static cppchar_t
+maybe_read_ucn (pfile, pstr)
cpp_reader *pfile;
- const unsigned char **pstr;
- const unsigned char *limit;
- cppchar_t *pc;
+ const uchar **pstr;
{
- const unsigned char *p = *pstr;
- unsigned int code = 0;
- unsigned int c = *pc, length;
-
- /* Only attempt to interpret a UCS for C++ and C99. */
- if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
- return 1;
-
- if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, DL_WARNING,
- "the meaning of '\\%c' is different in traditional C", c);
-
- length = (c == 'u' ? 4: 8);
+ cppchar_t result, c = (*pstr)[-1];
- if ((size_t) (limit - p) < length)
+ result = _cpp_valid_ucn (pfile, pstr, false);
+ if (result)
{
- cpp_error (pfile, DL_ERROR, "incomplete universal-character-name");
- /* Skip to the end to avoid more diagnostics. */
- p = limit;
- }
- else
- {
- for (; length; length--, p++)
+ if (CPP_WTRADITIONAL (pfile))
+ cpp_error (pfile, DL_WARNING,
+ "the meaning of '\\%c' is different in traditional C",
+ (int) c);
+
+ if (CPP_OPTION (pfile, EBCDIC))
{
- c = *p;
- if (ISXDIGIT (c))
- code = (code << 4) + hex_digit_value (c);
- else
- {
- cpp_error (pfile, DL_ERROR,
- "non-hex digit '%c' in universal-character-name", c);
- /* We shouldn't skip in case there are multibyte chars. */
- break;
- }
+ cpp_error (pfile, DL_ERROR,
+ "universal character with an EBCDIC target");
+ result = 0x3f; /* EBCDIC invalid character */
}
}
- if (CPP_OPTION (pfile, EBCDIC))
- {
- cpp_error (pfile, DL_ERROR, "universal-character-name on EBCDIC target");
- code = 0x3f; /* EBCDIC invalid character */
- }
- /* True extended characters are OK. */
- else if (code >= 0xa0
- && !(code & 0x80000000)
- && !(code >= 0xD800 && code <= 0xDFFF))
- ;
- /* The standard permits $, @ and ` to be specified as UCNs. We use
- hex escapes so that this also works with EBCDIC hosts. */
- else if (code == 0x24 || code == 0x40 || code == 0x60)
- ;
- /* Don't give another error if one occurred above. */
- else if (length == 0)
- cpp_error (pfile, DL_ERROR, "universal-character-name out of range");
-
- *pstr = p;
- *pc = code;
- return 0;
+ return result;
}
/* Returns the value of an escape sequence, truncated to the correct
int unknown = 0;
const unsigned char *str = *pstr, *charconsts;
- cppchar_t c, mask;
+ cppchar_t c, ucn, mask;
unsigned int width;
if (CPP_OPTION (pfile, EBCDIC))
break;
case 'u': case 'U':
- unknown = maybe_read_ucs (pfile, &str, limit, &c);
+ ucn = maybe_read_ucn (pfile, &str);
+ if (ucn)
+ c = ucn;
+ else
+ unknown = true;
break;
case 'x':
unsigned int *pchars_seen;
int *unsignedp;
{
- const unsigned char *str = token->val.str.text;
- const unsigned char *limit = str + token->val.str.len;
+ const unsigned char *str, *limit;
unsigned int chars_seen = 0;
size_t width, max_chars;
cppchar_t c, mask, result = 0;
bool unsigned_p;
- /* Width in bits. */
+ str = token->val.str.text + 1 + (token->type == CPP_WCHAR);
+ limit = token->val.str.text + token->val.str.len - 1;
+
if (token->type == CPP_CHAR)
{
width = CPP_OPTION (pfile, char_precision);