/* CPP Library - lexical analysis.
- Copyright (C) 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
+ Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
Contributed by Per Bothner, 1994-95.
Based on CCCP program by Paul Rubin, June 1986
Adapted to ANSI C, Richard Stallman, Jan 1987
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
-Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
#include "config.h"
#include "system.h"
static const unsigned char *const digraph_spellings[] =
{ U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
-#define OP(e, s) { SPELL_OPERATOR, U s },
-#define TK(e, s) { s, U #e },
+#define OP(e, s) { SPELL_OPERATOR, U s },
+#define TK(e, s) { SPELL_ ## s, U #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
-static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *);
-static void lex_number (cpp_reader *, cpp_string *);
-static bool forms_identifier_p (cpp_reader *, int);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
if (buffer->notes_used == buffer->notes_cap)
{
buffer->notes_cap = buffer->notes_cap * 2 + 200;
- buffer->notes = xrealloc (buffer->notes,
- buffer->notes_cap * sizeof (_cpp_line_note));
+ buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
+ buffer->notes_cap);
}
buffer->notes[buffer->notes_used].pos = pos;
if (!buffer->from_stage3)
{
+ const uchar *pbackslash = NULL;
+
/* Short circuit for the common case of an un-escaped line with
no trigraphs. The primary win here is by not writing any
data back to memory until we have to. */
for (;;)
{
c = *++s;
- if (c == '\n' || c == '\r')
+ if (__builtin_expect (c == '\n', false)
+ || __builtin_expect (c == '\r', false))
{
d = (uchar *) s;
- if (s == buffer->rlimit)
+ if (__builtin_expect (s == buffer->rlimit, false))
goto done;
/* DOS line ending? */
- if (c == '\r' && s[1] == '\n')
- s++;
+ if (__builtin_expect (c == '\r', false)
+ && s[1] == '\n')
+ {
+ s++;
+ if (s == buffer->rlimit)
+ goto done;
+ }
- if (s == buffer->rlimit)
+ if (__builtin_expect (pbackslash == NULL, true))
goto done;
- /* check for escaped newline */
+ /* Check for escaped newline. */
p = d;
- while (p != buffer->next_line && is_nvspace (p[-1]))
+ while (is_nvspace (p[-1]))
p--;
- if (p == buffer->next_line || p[-1] != '\\')
+ if (p - 1 != pbackslash)
goto done;
/* Have an escaped newline; process it and proceed to
buffer->next_line = p - 1;
break;
}
- if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
+ if (__builtin_expect (c == '\\', false))
+ pbackslash = s;
+ else if (__builtin_expect (c == '?', false)
+ && __builtin_expect (s[1] == '?', false)
+ && _cpp_trigraph_map[s[2]])
{
/* Have a trigraph. We may or may not have to convert
it. Add a line note regardless, for -Wtrigraphs. */
return 1;
}
+/* After parsing an identifier or other sequence, produce a warning about
+   sequences not in NFC/NFKC.
+
+   PFILE is the reader; its warn_normalize option and skipping state
+   decide whether anything is reported.  TOKEN is the token that was
+   just lexed; its src_loc is used as the location of the diagnostic.
+   S is the normalization state accumulated while lexing TOKEN.
+
+   A warning is issued only when the warn_normalize option compares
+   below NORMALIZE_STATE_RESULT (S) and PFILE is not skipping.  */
+static void
+warn_about_normalization (cpp_reader *pfile,
+                          const cpp_token *token,
+                          const struct normalize_state *s)
+{
+  if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
+      && !pfile->state.skipping)
+    {
+      /* Make sure that the token is printed using UCNs, even
+         if we'd otherwise happily print UTF-8.  */
+      unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
+      size_t sz;
+
+      /* BUF is not NUL-terminated; the messages below print exactly
+         SZ bytes of it via "%.*s".  */
+      sz = cpp_spell_token (pfile, token, buf, false) - buf;
+      if (NORMALIZE_STATE_RESULT (s) == normalized_C)
+        cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
+                             "`%.*s' is not in NFKC", (int) sz, buf);
+      else
+        cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
+                             "`%.*s' is not in NFC", (int) sz, buf);
+
+      /* The spelling was only needed for the message; release it so
+         that each warning does not leak the scratch buffer.  */
+      free (buf);
+    }
+}
+
/* Returns TRUE if the sequence starting at buffer->cur is valid in
an identifier. FIRST is TRUE if this starts an identifier. */
static bool
-forms_identifier_p (cpp_reader *pfile, int first)
+forms_identifier_p (cpp_reader *pfile, int first,
+ struct normalize_state *state)
{
cpp_buffer *buffer = pfile->buffer;
}
/* Is this a syntactically valid UCN? */
- if (0 && *buffer->cur == '\\'
+ if (CPP_OPTION (pfile, extended_identifiers)
+ && *buffer->cur == '\\'
&& (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
{
buffer->cur += 2;
- if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first))
+ if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+ state))
return true;
buffer->cur -= 2;
}
/* Lex an identifier starting at BUFFER->CUR - 1. */
static cpp_hashnode *
-lex_identifier (cpp_reader *pfile, const uchar *base)
+lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
+ struct normalize_state *nst)
{
cpp_hashnode *result;
- const uchar *cur, *limit;
+ const uchar *cur;
unsigned int len;
unsigned int hash = HT_HASHSTEP (0, *base);
cur = pfile->buffer->cur;
- for (;;)
+ if (! starts_ucn)
+ while (ISIDNUM (*cur))
+ {
+ hash = HT_HASHSTEP (hash, *cur);
+ cur++;
+ }
+ pfile->buffer->cur = cur;
+ if (starts_ucn || forms_identifier_p (pfile, false, nst))
{
- /* N.B. ISIDNUM does not include $. */
- while (ISIDNUM (*cur))
- {
- hash = HT_HASHSTEP (hash, *cur);
- cur++;
- }
-
- pfile->buffer->cur = cur;
- if (!forms_identifier_p (pfile, false))
- break;
-
- limit = pfile->buffer->cur;
- while (cur < limit)
- {
- hash = HT_HASHSTEP (hash, *cur);
- cur++;
- }
+ /* Slower version for identifiers containing UCNs (or $). */
+ do {
+ while (ISIDNUM (*pfile->buffer->cur))
+ {
+ pfile->buffer->cur++;
+ NORMALIZE_STATE_UPDATE_IDNUM (nst);
+ }
+ } while (forms_identifier_p (pfile, false, nst));
+ result = _cpp_interpret_identifier (pfile, base,
+ pfile->buffer->cur - base);
}
- len = cur - base;
- hash = HT_HASHFINISH (hash, len);
+ else
+ {
+ len = cur - base;
+ hash = HT_HASHFINISH (hash, len);
- result = (cpp_hashnode *)
- ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+ result = (cpp_hashnode *)
+ ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+ }
/* Rarely, identifiers require diagnostics when lexed. */
if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
/* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
static void
-lex_number (cpp_reader *pfile, cpp_string *number)
+lex_number (cpp_reader *pfile, cpp_string *number,
+ struct normalize_state *nst)
{
const uchar *cur;
const uchar *base;
/* N.B. ISIDNUM does not include $. */
while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
- cur++;
+ {
+ cur++;
+ NORMALIZE_STATE_UPDATE_IDNUM (nst);
+ }
pfile->buffer->cur = cur;
}
- while (forms_identifier_p (pfile, false));
+ while (forms_identifier_p (pfile, false, nst));
number->len = cur - base;
dest = _cpp_unaligned_alloc (pfile, number->len + 1);
cpp_error (pfile, CPP_DL_WARNING,
"null character(s) preserved in literal");
+ if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
+ cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
+ (int) terminator);
+
pfile->buffer->cur = cur;
create_literal (pfile, token, base, cur - base, type);
}
/* Set up RUN to hold COUNT tokens: base is pointed at storage for
   COUNT cpp_token objects obtained from the allocation macro, limit
   is set COUNT elements past base, and next is cleared to NULL.  */
void
_cpp_init_tokenrun (tokenrun *run, unsigned int count)
{
- run->base = xnewvec (cpp_token, count);
+ run->base = XNEWVEC (cpp_token, count);
run->limit = run->base + count;
run->next = NULL;
}
{
if (run->next == NULL)
{
- run->next = xnew (tokenrun);
+ run->next = XNEW (tokenrun);
run->next->prev = run;
_cpp_init_tokenrun (run->next, 250);
}
pfile->cur_run = next_tokenrun (pfile->cur_run);
pfile->cur_token = pfile->cur_run->base;
}
+ /* We assume that the current token is somewhere in the current
+ run. */
+ if (pfile->cur_token < pfile->cur_run->base
+ || pfile->cur_token >= pfile->cur_run->limit)
+ abort ();
if (pfile->lookaheads)
{
/* 6.10.3 p 11: Directives in a list of macro arguments
gives undefined behavior. This implementation
handles the directive as normal. */
- && pfile->state.parsing_args != 1
- && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
- continue;
+ && pfile->state.parsing_args != 1)
+ {
+ if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
+ {
+ if (pfile->directive_result.type == CPP_PADDING)
+ continue;
+ result = &pfile->directive_result;
+ }
+ }
+ else if (pfile->state.in_deferred_pragma)
+ result = &pfile->directive_result;
+
if (pfile->cb.line_change && !pfile->state.skipping)
pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
}
/* We don't skip tokens in directives. */
- if (pfile->state.in_directive)
+ if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
break;
/* Outside a directive, invalidate controlling macros. At file
&& buffer->next_line > buffer->rlimit
&& !buffer->from_stage3)
{
- /* Only warn once. */
+ /* Clip to buffer size. */
buffer->next_line = buffer->rlimit;
- cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
- CPP_BUF_COLUMN (buffer, buffer->cur),
- "no newline at end of file");
}
return_at_eof = buffer->return_at_eof;
buffer = pfile->buffer;
if (buffer->need_line)
{
+ if (pfile->state.in_deferred_pragma)
+ {
+ result->type = CPP_PRAGMA_EOL;
+ pfile->state.in_deferred_pragma = false;
+ if (!pfile->state.pragma_allow_expansion)
+ pfile->state.prevent_expansion--;
+ return result;
+ }
if (!_cpp_get_fresh_line (pfile))
{
result->type = CPP_EOF;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- result->type = CPP_NUMBER;
- lex_number (pfile, &result->val.str);
- break;
+ {
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+ result->type = CPP_NUMBER;
+ lex_number (pfile, &result->val.str, &nst);
+ warn_about_normalization (pfile, result, &nst);
+ break;
+ }
case 'L':
/* 'L' may introduce wide characters or strings. */
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
case 'Y': case 'Z':
result->type = CPP_NAME;
- result->val.node = lex_identifier (pfile, buffer->cur - 1);
+ {
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+ result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
+ &nst);
+ warn_about_normalization (pfile, result, &nst);
+ }
/* Convert named operators to their proper types. */
if (result->val.node->flags & NODE_OPERATOR)
{
result->flags |= NAMED_OP;
- result->type = result->val.node->directive_index;
+ result->type = (enum cpp_ttype) result->val.node->directive_index;
}
break;
buffer->cur++;
IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
}
- else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
- {
- buffer->cur++;
- IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
- }
else if (CPP_OPTION (pfile, digraphs))
{
if (*buffer->cur == ':')
buffer->cur++;
IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
}
- else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
- {
- buffer->cur++;
- IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
- }
break;
case '%':
result->type = CPP_DOT;
if (ISDIGIT (*buffer->cur))
{
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
result->type = CPP_NUMBER;
- lex_number (pfile, &result->val.str);
+ lex_number (pfile, &result->val.str, &nst);
+ warn_about_normalization (pfile, result, &nst);
}
else if (*buffer->cur == '.' && buffer->cur[1] == '.')
buffer->cur += 2, result->type = CPP_ELLIPSIS;
case '\\':
{
const uchar *base = --buffer->cur;
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
- if (forms_identifier_p (pfile, true))
+ if (forms_identifier_p (pfile, true, &nst))
{
result->type = CPP_NAME;
- result->val.node = lex_identifier (pfile, base);
+ result->val.node = lex_identifier (pfile, base, true, &nst);
+ warn_about_normalization (pfile, result, &nst);
break;
}
buffer->cur++;
{
default: len = 4; break;
case SPELL_LITERAL: len = token->val.str.len; break;
- case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
+ case SPELL_IDENT: len = NODE_LEN (token->val.node) * 10; break;
}
return len;
}
+/* Parse UTF-8 out of NAME and place a \U escape in BUFFER.
+ Return the number of bytes read out of NAME. (There are always
+ 10 bytes written to BUFFER.)
+
+ NAME points at the first byte of one UTF-8 sequence. BUFFER receives
+ a backslash, the letter U and eight lower-case hexadecimal digits,
+ most significant digit first; no terminating NUL is written. The
+ function calls abort () if a byte after the first does not have the
+ form 10xxxxxx. */
+
+static size_t
+utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
+{
+ int j;
+ int ucn_len = 0;
+ int ucn_len_c;
+ unsigned t;
+ unsigned long utf32;
+
+ /* Compute the length of the UTF-8 sequence. This counts the
+ consecutive set bits of the first byte, starting at bit 7: each
+ left shift moves the next lower bit into the position tested by
+ the 0x80 mask. */
+ for (t = *name; t & 0x80; t <<= 1)
+ ucn_len++;
+
+ utf32 = *name & (0x7F >> ucn_len); /* Value bits of the first byte. */
+ for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
+ {
+ utf32 = (utf32 << 6) | (*++name & 0x3F); /* Append the low six bits. */
+
+ /* Ill-formed UTF-8. The byte just consumed must have its top
+ two bits equal to 10; note that it has already been folded
+ into UTF32 by the time it is checked. */
+ if ((*name & ~0x3F) != 0x80)
+ abort ();
+ }
+
+ *buffer++ = '\\';
+ *buffer++ = 'U';
+ for (j = 7; j >= 0; j--) /* One hex digit per nibble, highest first. */
+ *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
+ return ucn_len;
+}
+
+
/* Write the spelling of a token TOKEN to BUFFER. The buffer must
already contain enough space to hold the token's spelling.
Returns a pointer to the character after the last character written.
+ FORSTRING is true if this is to be the spelling after translation
+ phase 1 (this is different for UCNs).
FIXME: Would be nice if we didn't need the PFILE argument. */
unsigned char *
cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
- unsigned char *buffer)
+ unsigned char *buffer, bool forstring)
{
switch (TOKEN_SPELL (token))
{
spell_ident:
case SPELL_IDENT:
- memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
- buffer += NODE_LEN (token->val.node);
+ if (forstring)
+ {
+ memcpy (buffer, NODE_NAME (token->val.node),
+ NODE_LEN (token->val.node));
+ buffer += NODE_LEN (token->val.node);
+ }
+ else
+ {
+ size_t i;
+ const unsigned char * name = NODE_NAME (token->val.node);
+
+ for (i = 0; i < NODE_LEN (token->val.node); i++)
+ if (name[i] & ~0x7F)
+ {
+ i += utf8_to_ucn (buffer, name + i) - 1;
+ buffer += 10;
+ }
+ else
+ *buffer++ = NODE_NAME (token->val.node)[i];
+ }
break;
case SPELL_LITERAL:
unsigned int len = cpp_token_len (token) + 1;
unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
- end = cpp_spell_token (pfile, token, start);
+ end = cpp_spell_token (pfile, token, start, false);
end[0] = '\0';
return start;
spell_ident:
case SPELL_IDENT:
- fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
- break;
+ {
+ size_t i;
+ const unsigned char * name = NODE_NAME (token->val.node);
+
+ for (i = 0; i < NODE_LEN (token->val.node); i++)
+ if (name[i] & ~0x7F)
+ {
+ unsigned char buffer[10];
+ i += utf8_to_ucn (buffer, name + i) - 1;
+ fwrite (buffer, 1, 10, fp);
+ }
+ else
+ fputc (NODE_NAME (token->val.node)[i], fp);
+ }
+ break;
case SPELL_LITERAL:
fwrite (token->val.str.text, 1, token->val.str.len, fp);
switch (a)
{
- case CPP_GREATER: return c == '>' || c == '?';
- case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
+ case CPP_GREATER: return c == '>';
+ case CPP_LESS: return c == '<' || c == '%' || c == ':';
case CPP_PLUS: return c == '+';
case CPP_MINUS: return c == '-' || c == '>';
case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
len = MIN_BUFF_SIZE;
len = CPP_ALIGN (len);
- base = xmalloc (len + sizeof (_cpp_buff));
+ base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
result = (_cpp_buff *) (base + len);
result->base = base;
result->cur = base;
return CPP_TOKEN_FLD_ARG_NO;
else if (tok->type == CPP_PADDING)
return CPP_TOKEN_FLD_SOURCE;
+ else if (tok->type == CPP_PRAGMA)
+ return CPP_TOKEN_FLD_PRAGMA;
/* else fall through */
default:
return CPP_TOKEN_FLD_NONE;