X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=libcpp%2Flex.c;h=ef59f900c64ae8f6b24109e1b68b25071d29c1cd;hb=293e5c85a838327341c557c063c2647e463ad631;hp=7eafb13d3b432982d92e83bc4f7e447552ea3382;hpb=c39ed964c8cd9aec2fe6060830804416b0856b60;p=pf3gnuchains%2Fgcc-fork.git diff --git a/libcpp/lex.c b/libcpp/lex.c index 7eafb13d3b4..ef59f900c64 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -1,5 +1,5 @@ /* CPP Library - lexical analysis. - Copyright (C) 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc. + Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc. Contributed by Per Bothner, 1994-95. Based on CCCP program by Paul Rubin, June 1986 Adapted to ANSI C, Richard Stallman, Jan 1987 @@ -17,7 +17,7 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software -Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "config.h" #include "system.h" @@ -41,8 +41,8 @@ struct token_spelling static const unsigned char *const digraph_spellings[] = { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" }; -#define OP(e, s) { SPELL_OPERATOR, U s }, -#define TK(e, s) { s, U #e }, +#define OP(e, s) { SPELL_OPERATOR, U s }, +#define TK(e, s) { SPELL_ ## s, U #e }, static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE }; #undef OP #undef TK @@ -53,9 +53,6 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE }; static void add_line_note (cpp_buffer *, const uchar *, unsigned int); static int skip_line_comment (cpp_reader *); static void skip_whitespace (cpp_reader *, cppchar_t); -static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *); -static void lex_number (cpp_reader *, cpp_string *); -static bool forms_identifier_p (cpp_reader *, int); static void lex_string (cpp_reader *, cpp_token *, const uchar *); static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t); static void create_literal (cpp_reader *, cpp_token *, const uchar *, @@ -88,8 +85,8 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type) if (buffer->notes_used == buffer->notes_cap) { buffer->notes_cap = buffer->notes_cap * 2 + 200; - buffer->notes = xrealloc (buffer->notes, - buffer->notes_cap * sizeof (_cpp_line_note)); + buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes, + buffer->notes_cap); } buffer->notes[buffer->notes_used].pos = pos; @@ -114,31 +111,39 @@ _cpp_clean_line (cpp_reader *pfile) if (!buffer->from_stage3) { + const uchar *pbackslash = NULL; + /* Short circuit for the common case of an un-escaped line with no trigraphs. The primary win here is by not writing any data back to memory until we have to. */ for (;;) { c = *++s; - if (c == '\n' || c == '\r') + if (__builtin_expect (c == '\n', false) + || __builtin_expect (c == '\r', false)) { d = (uchar *) s; - if (s == buffer->rlimit) + if (__builtin_expect (s == buffer->rlimit, false)) goto done; /* DOS line ending? */ - if (c == '\r' && s[1] == '\n') - s++; + if (__builtin_expect (c == '\r', false) + && s[1] == '\n') + { + s++; + if (s == buffer->rlimit) + goto done; + } - if (s == buffer->rlimit) + if (__builtin_expect (pbackslash == NULL, true)) goto done; - /* check for escaped newline */ + /* Check for escaped newline. */ p = d; - while (p != buffer->next_line && is_nvspace (p[-1])) + while (is_nvspace (p[-1])) p--; - if (p == buffer->next_line || p[-1] != '\\') + if (p - 1 != pbackslash) goto done; /* Have an escaped newline; process it and proceed to @@ -148,7 +153,11 @@ _cpp_clean_line (cpp_reader *pfile) buffer->next_line = p - 1; break; } - if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]]) + if (__builtin_expect (c == '\\', false)) + pbackslash = s; + else if (__builtin_expect (c == '?', false) + && __builtin_expect (s[1] == '?', false) + && _cpp_trigraph_map[s[2]]) { /* Have a trigraph. We may or may not have to convert it. Add a line note regardless, for -Wtrigraphs. */ @@ -430,10 +439,36 @@ name_p (cpp_reader *pfile, const cpp_string *string) return 1; } +/* After parsing an identifier or other sequence, produce a warning about + sequences not in NFC/NFKC. */ +static void +warn_about_normalization (cpp_reader *pfile, + const cpp_token *token, + const struct normalize_state *s) +{ + if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s) + && !pfile->state.skipping) + { + /* Make sure that the token is printed using UCNs, even + if we'd otherwise happily print UTF-8. */ + unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token)); + size_t sz; + + sz = cpp_spell_token (pfile, token, buf, false) - buf; + if (NORMALIZE_STATE_RESULT (s) == normalized_C) + cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0, + "`%.*s' is not in NFKC", (int) sz, buf); + else + cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0, + "`%.*s' is not in NFC", (int) sz, buf); + } +} + /* Returns TRUE if the sequence starting at buffer->cur is invalid in an identifier. FIRST is TRUE if this starts an identifier. */ static bool -forms_identifier_p (cpp_reader *pfile, int first) +forms_identifier_p (cpp_reader *pfile, int first, + struct normalize_state *state) { cpp_buffer *buffer = pfile->buffer; @@ -453,11 +488,13 @@ forms_identifier_p (cpp_reader *pfile, int first) } /* Is this a syntactically valid UCN? */ - if (0 && *buffer->cur == '\\' + if (CPP_OPTION (pfile, extended_identifiers) + && *buffer->cur == '\\' && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) { buffer->cur += 2; - if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first)) + if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, + state)) return true; buffer->cur -= 2; } @@ -467,39 +504,43 @@ forms_identifier_p (cpp_reader *pfile, int first) /* Lex an identifier starting at BUFFER->CUR - 1. */ static cpp_hashnode * -lex_identifier (cpp_reader *pfile, const uchar *base) +lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn, + struct normalize_state *nst) { cpp_hashnode *result; - const uchar *cur, *limit; + const uchar *cur; unsigned int len; unsigned int hash = HT_HASHSTEP (0, *base); cur = pfile->buffer->cur; - for (;;) + if (! starts_ucn) + while (ISIDNUM (*cur)) + { + hash = HT_HASHSTEP (hash, *cur); + cur++; + } + pfile->buffer->cur = cur; + if (starts_ucn || forms_identifier_p (pfile, false, nst)) { - /* N.B. ISIDNUM does not include $. */ - while (ISIDNUM (*cur)) - { - hash = HT_HASHSTEP (hash, *cur); - cur++; - } - - pfile->buffer->cur = cur; - if (!forms_identifier_p (pfile, false)) - break; - - limit = pfile->buffer->cur; - while (cur < limit) - { - hash = HT_HASHSTEP (hash, *cur); - cur++; - } + /* Slower version for identifiers containing UCNs (or $). */ + do { + while (ISIDNUM (*pfile->buffer->cur)) + { + pfile->buffer->cur++; + NORMALIZE_STATE_UPDATE_IDNUM (nst); + } + } while (forms_identifier_p (pfile, false, nst)); + result = _cpp_interpret_identifier (pfile, base, + pfile->buffer->cur - base); } - len = cur - base; - hash = HT_HASHFINISH (hash, len); + else + { + len = cur - base; + hash = HT_HASHFINISH (hash, len); - result = (cpp_hashnode *) - ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC); + result = (cpp_hashnode *) + ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC); + } /* Rarely, identifiers require diagnostics when lexed. */ if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC) @@ -524,7 +565,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base) /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */ static void -lex_number (cpp_reader *pfile, cpp_string *number) +lex_number (cpp_reader *pfile, cpp_string *number, + struct normalize_state *nst) { const uchar *cur; const uchar *base; @@ -537,11 +579,14 @@ lex_number (cpp_reader *pfile, cpp_string *number) /* N.B. ISIDNUM does not include $. */ while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1])) - cur++; + { + cur++; + NORMALIZE_STATE_UPDATE_IDNUM (nst); + } pfile->buffer->cur = cur; } - while (forms_identifier_p (pfile, false)); + while (forms_identifier_p (pfile, false, nst)); number->len = cur - base; dest = _cpp_unaligned_alloc (pfile, number->len + 1); @@ -613,6 +658,10 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base) cpp_error (pfile, CPP_DL_WARNING, "null character(s) preserved in literal"); + if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM) + cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character", + (int) terminator); + pfile->buffer->cur = cur; create_literal (pfile, token, base, cur - base, type); } @@ -662,7 +711,7 @@ save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from, void _cpp_init_tokenrun (tokenrun *run, unsigned int count) { - run->base = xnewvec (cpp_token, count); + run->base = XNEWVEC (cpp_token, count); run->limit = run->base + count; run->next = NULL; } @@ -673,7 +722,7 @@ next_tokenrun (tokenrun *run) { if (run->next == NULL) { - run->next = xnew (tokenrun); + run->next = XNEW (tokenrun); run->next->prev = run; _cpp_init_tokenrun (run->next, 250); } @@ -717,6 +766,11 @@ _cpp_lex_token (cpp_reader *pfile) pfile->cur_run = next_tokenrun (pfile->cur_run); pfile->cur_token = pfile->cur_run->base; } + /* We assume that the current token is somewhere in the current + run. */ + if (pfile->cur_token < pfile->cur_run->base + || pfile->cur_token >= pfile->cur_run->limit) + abort (); if (pfile->lookaheads) { @@ -734,15 +788,24 @@ _cpp_lex_token (cpp_reader *pfile) /* 6.10.3 p 11: Directives in a list of macro arguments gives undefined behavior. This implementation handles the directive as normal. */ - && pfile->state.parsing_args != 1 - && _cpp_handle_directive (pfile, result->flags & PREV_WHITE)) - continue; + && pfile->state.parsing_args != 1) + { + if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE)) + { + if (pfile->directive_result.type == CPP_PADDING) + continue; + result = &pfile->directive_result; + } + } + else if (pfile->state.in_deferred_pragma) + result = &pfile->directive_result; + if (pfile->cb.line_change && !pfile->state.skipping) pfile->cb.line_change (pfile, result, pfile->state.parsing_args); } /* We don't skip tokens in directives. */ - if (pfile->state.in_directive) + if (pfile->state.in_directive || pfile->state.in_deferred_pragma) break; /* Outside a directive, invalidate controlling macros. At file @@ -789,11 +852,8 @@ _cpp_get_fresh_line (cpp_reader *pfile) && buffer->next_line > buffer->rlimit && !buffer->from_stage3) { - /* Only warn once. */ + /* Clip to buffer size. */ buffer->next_line = buffer->rlimit; - cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, - CPP_BUF_COLUMN (buffer, buffer->cur), - "no newline at end of file"); } return_at_eof = buffer->return_at_eof; @@ -836,6 +896,14 @@ _cpp_lex_direct (cpp_reader *pfile) buffer = pfile->buffer; if (buffer->need_line) { + if (pfile->state.in_deferred_pragma) + { + result->type = CPP_PRAGMA_EOL; + pfile->state.in_deferred_pragma = false; + if (!pfile->state.pragma_allow_expansion) + pfile->state.prevent_expansion--; + return result; + } if (!_cpp_get_fresh_line (pfile)) { result->type = CPP_EOF; @@ -888,9 +956,13 @@ _cpp_lex_direct (cpp_reader *pfile) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - result->type = CPP_NUMBER; - lex_number (pfile, &result->val.str); - break; + { + struct normalize_state nst = INITIAL_NORMALIZE_STATE; + result->type = CPP_NUMBER; + lex_number (pfile, &result->val.str, &nst); + warn_about_normalization (pfile, result, &nst); + break; + } case 'L': /* 'L' may introduce wide characters or strings. */ @@ -913,13 +985,18 @@ _cpp_lex_direct (cpp_reader *pfile) case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': result->type = CPP_NAME; - result->val.node = lex_identifier (pfile, buffer->cur - 1); + { + struct normalize_state nst = INITIAL_NORMALIZE_STATE; + result->val.node = lex_identifier (pfile, buffer->cur - 1, false, + &nst); + warn_about_normalization (pfile, result, &nst); + } /* Convert named operators to their proper types. */ if (result->val.node->flags & NODE_OPERATOR) { result->flags |= NAMED_OP; - result->type = result->val.node->directive_index; + result->type = (enum cpp_ttype) result->val.node->directive_index; } break; @@ -993,11 +1070,6 @@ _cpp_lex_direct (cpp_reader *pfile) buffer->cur++; IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT); } - else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus)) - { - buffer->cur++; - IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN); - } else if (CPP_OPTION (pfile, digraphs)) { if (*buffer->cur == ':') @@ -1024,11 +1096,6 @@ _cpp_lex_direct (cpp_reader *pfile) buffer->cur++; IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT); } - else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus)) - { - buffer->cur++; - IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX); - } break; case '%': @@ -1058,8 +1125,10 @@ _cpp_lex_direct (cpp_reader *pfile) result->type = CPP_DOT; if (ISDIGIT (*buffer->cur)) { + struct normalize_state nst = INITIAL_NORMALIZE_STATE; result->type = CPP_NUMBER; - lex_number (pfile, &result->val.str); + lex_number (pfile, &result->val.str, &nst); + warn_about_normalization (pfile, result, &nst); } else if (*buffer->cur == '.' && buffer->cur[1] == '.') buffer->cur += 2, result->type = CPP_ELLIPSIS; @@ -1142,11 +1211,13 @@ _cpp_lex_direct (cpp_reader *pfile) case '\\': { const uchar *base = --buffer->cur; + struct normalize_state nst = INITIAL_NORMALIZE_STATE; - if (forms_identifier_p (pfile, true)) + if (forms_identifier_p (pfile, true, &nst)) { result->type = CPP_NAME; - result->val.node = lex_identifier (pfile, base); + result->val.node = lex_identifier (pfile, base, true, &nst); + warn_about_normalization (pfile, result, &nst); break; } buffer->cur++; @@ -1171,19 +1242,56 @@ cpp_token_len (const cpp_token *token) { default: len = 4; break; case SPELL_LITERAL: len = token->val.str.len; break; - case SPELL_IDENT: len = NODE_LEN (token->val.node); break; + case SPELL_IDENT: len = NODE_LEN (token->val.node) * 10; break; } return len; } +/* Parse UTF-8 out of NAMEP and place a \U escape in BUFFER. + Return the number of bytes read out of NAME. (There are always + 10 bytes written to BUFFER.) */ + +static size_t +utf8_to_ucn (unsigned char *buffer, const unsigned char *name) +{ + int j; + int ucn_len = 0; + int ucn_len_c; + unsigned t; + unsigned long utf32; + + /* Compute the length of the UTF-8 sequence. */ + for (t = *name; t & 0x80; t <<= 1) + ucn_len++; + + utf32 = *name & (0x7F >> ucn_len); + for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++) + { + utf32 = (utf32 << 6) | (*++name & 0x3F); + + /* Ill-formed UTF-8. */ + if ((*name & ~0x3F) != 0x80) + abort (); + } + + *buffer++ = '\\'; + *buffer++ = 'U'; + for (j = 7; j >= 0; j--) + *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF]; + return ucn_len; +} + + /* Write the spelling of a token TOKEN to BUFFER. The buffer must already contain the enough space to hold the token's spelling. Returns a pointer to the character after the last character written. + FORSTRING is true if this is to be the spelling after translation + phase 1 (this is different for UCNs). FIXME: Would be nice if we didn't need the PFILE argument. */ unsigned char * cpp_spell_token (cpp_reader *pfile, const cpp_token *token, - unsigned char *buffer) + unsigned char *buffer, bool forstring) { switch (TOKEN_SPELL (token)) { @@ -1207,8 +1315,26 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token, spell_ident: case SPELL_IDENT: - memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node)); - buffer += NODE_LEN (token->val.node); + if (forstring) + { + memcpy (buffer, NODE_NAME (token->val.node), + NODE_LEN (token->val.node)); + buffer += NODE_LEN (token->val.node); + } + else + { + size_t i; + const unsigned char * name = NODE_NAME (token->val.node); + + for (i = 0; i < NODE_LEN (token->val.node); i++) + if (name[i] & ~0x7F) + { + i += utf8_to_ucn (buffer, name + i) - 1; + buffer += 10; + } + else + *buffer++ = NODE_NAME (token->val.node)[i]; + } break; case SPELL_LITERAL: @@ -1233,7 +1359,7 @@ cpp_token_as_text (cpp_reader *pfile, const cpp_token *token) unsigned int len = cpp_token_len (token) + 1; unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end; - end = cpp_spell_token (pfile, token, start); + end = cpp_spell_token (pfile, token, start, false); end[0] = '\0'; return start; @@ -1277,8 +1403,21 @@ cpp_output_token (const cpp_token *token, FILE *fp) spell_ident: case SPELL_IDENT: - fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp); - break; + { + size_t i; + const unsigned char * name = NODE_NAME (token->val.node); + + for (i = 0; i < NODE_LEN (token->val.node); i++) + if (name[i] & ~0x7F) + { + unsigned char buffer[10]; + i += utf8_to_ucn (buffer, name + i) - 1; + fwrite (buffer, 1, 10, fp); + } + else + fputc (NODE_NAME (token->val.node)[i], fp); + } + break; case SPELL_LITERAL: fwrite (token->val.str.text, 1, token->val.str.len, fp); @@ -1341,8 +1480,8 @@ cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1, switch (a) { - case CPP_GREATER: return c == '>' || c == '?'; - case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':'; + case CPP_GREATER: return c == '>'; + case CPP_LESS: return c == '<' || c == '%' || c == ':'; case CPP_PLUS: return c == '+'; case CPP_MINUS: return c == '-' || c == '>'; case CPP_DIV: return c == '/' || c == '*'; /* Comments. */ @@ -1418,7 +1557,7 @@ new_buff (size_t len) len = MIN_BUFF_SIZE; len = CPP_ALIGN (len); - base = xmalloc (len + sizeof (_cpp_buff)); + base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff)); result = (_cpp_buff *) (base + len); result->base = base; result->cur = base; @@ -1573,6 +1712,8 @@ cpp_token_val_index (cpp_token *tok) return CPP_TOKEN_FLD_ARG_NO; else if (tok->type == CPP_PADDING) return CPP_TOKEN_FLD_SOURCE; + else if (tok->type == CPP_PRAGMA) + return CPP_TOKEN_FLD_PRAGMA; /* else fall through */ default: return CPP_TOKEN_FLD_NONE;