/* CPP Library - charsets
- Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
- Free Software Foundation, Inc.
+ Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008, 2009,
+ 2010 Free Software Foundation, Inc.
Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 2, or (at your option) any
+Free Software Foundation; either version 3, or (at your option) any
later version.
This program is distributed in the hope that it will be useful,
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+along with this program; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "cpplib.h"
#include "internal.h"
-#include "ucnid.h"
/* Character set handling for C-family languages.
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
+#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
+#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif
one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
cppchar_t *cp)
{
- static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
+ static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
cppchar_t c;
outbytesleft += OUTBUF_BLOCK_SIZE;
to->asize += OUTBUF_BLOCK_SIZE;
- to->text = xrealloc (to->text, to->asize);
+ to->text = XRESIZEVEC (uchar, to->text, to->asize);
outbuf = to->text + to->asize - outbytesleft;
}
}
if (to->len + flen > to->asize)
{
to->asize = to->len + flen;
- to->text = xrealloc (to->text, to->asize);
+ to->text = XRESIZEVEC (uchar, to->text, to->asize);
}
memcpy (to->text + to->len, from, flen);
to->len += flen;
/* And this one uses the system iconv primitive. It's a little
different, since iconv's interface is a little different. */
#if HAVE_ICONV
+
+#define CONVERT_ICONV_GROW_BUFFER \
+ do { \
+ outbytesleft += OUTBUF_BLOCK_SIZE; \
+ to->asize += OUTBUF_BLOCK_SIZE; \
+ to->text = XRESIZEVEC (uchar, to->text, to->asize); \
+ outbuf = (char *)to->text + to->asize - outbytesleft; \
+ } while (0)
+
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
struct _cpp_strbuf *to)
iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
if (__builtin_expect (inbytesleft == 0, 1))
{
+ /* Close out any shift states, returning to the initial state. */
+ if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
+ {
+ if (errno != E2BIG)
+ return false;
+
+ CONVERT_ICONV_GROW_BUFFER;
+ if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
+ return false;
+ }
+
to->len = to->asize - outbytesleft;
return true;
}
if (errno != E2BIG)
return false;
- outbytesleft += OUTBUF_BLOCK_SIZE;
- to->asize += OUTBUF_BLOCK_SIZE;
- to->text = xrealloc (to->text, to->asize);
- outbuf = (char *)to->text + to->asize - outbytesleft;
+ CONVERT_ICONV_GROW_BUFFER;
}
}
#else
{
ret.func = convert_no_conversion;
ret.cd = (iconv_t) -1;
+ ret.width = -1;
return ret;
}
- pair = alloca(strlen(to) + strlen(from) + 2);
+ pair = (char *) alloca(strlen(to) + strlen(from) + 2);
strcpy(pair, from);
strcat(pair, "/");
{
ret.func = conversion_tab[i].func;
ret.cd = conversion_tab[i].fake_cd;
+ ret.width = -1;
return ret;
}
{
ret.func = convert_using_iconv;
ret.cd = iconv_open (to, from);
+ ret.width = -1;
if (ret.cd == (iconv_t) -1)
{
from, to);
ret.func = convert_no_conversion;
ret.cd = (iconv_t) -1;
+ ret.width = -1;
}
return ret;
}
wcset = default_wcset;
pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
+ pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
+ pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET);
+ pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
+ pfile->char16_cset_desc = init_iconv_desc (pfile,
+ be ? "UTF-16BE" : "UTF-16LE",
+ SOURCE_CHARSET);
+ pfile->char16_cset_desc.width = 16;
+ pfile->char32_cset_desc = init_iconv_desc (pfile,
+ be ? "UTF-32BE" : "UTF-32LE",
+ SOURCE_CHARSET);
+ pfile->char32_cset_desc.width = 32;
pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
+ pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
}
+/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
void
_cpp_destroy_iconv (cpp_reader *pfile)
{
{
if (pfile->narrow_cset_desc.func == convert_using_iconv)
iconv_close (pfile->narrow_cset_desc.cd);
+ if (pfile->utf8_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->utf8_cset_desc.cd);
+ if (pfile->char16_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->char16_cset_desc.cd);
+ if (pfile->char32_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->char32_cset_desc.cd);
if (pfile->wide_cset_desc.func == convert_using_iconv)
iconv_close (pfile->wide_cset_desc.cd);
}
}
+/* Utility routine for use by a full compiler. C is a character taken
+ from the *basic* source character set, encoded in the host's
+ execution encoding. Convert it to (the target's) execution
+ encoding, and return that value.
+
+ Issues an internal error if C's representation in the narrow
+ execution character set fails to be a single-byte value (C99
+ 5.2.1p3: "The representation of each member of the source and
+ execution character sets shall fit in a byte.") May also issue an
+ internal error if C fails to be a member of the basic source
+ character set (testing this exactly is too hard, especially when
+ the host character set is EBCDIC). */
+cppchar_t
+cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
+{
+ uchar sbuf[1];
+ struct _cpp_strbuf tbuf;
+
+ /* This test is merely an approximation, but it suffices to catch
+ the most important thing, which is that we don't get handed a
+ character outside the unibyte range of the host character set. */
+ if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
+ {
+ cpp_error (pfile, CPP_DL_ICE,
+ "character 0x%lx is not in the basic source character set\n",
+ (unsigned long)c);
+ return 0;
+ }
+
+ /* Being a character in the unibyte range of the host character set,
+ we can safely splat it into a one-byte buffer and trust that that
+ is a well-formed string. */
+ sbuf[0] = c;
+
+ /* This should never need to reallocate, but just in case... */
+ tbuf.asize = 1;
+ tbuf.text = XNEWVEC (uchar, tbuf.asize);
+ tbuf.len = 0;
+
+ if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
+ {
+ cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
+ return 0;
+ }
+ if (tbuf.len != 1)
+ {
+ cpp_error (pfile, CPP_DL_ICE,
+ "character 0x%lx is not unibyte in execution character set",
+ (unsigned long)c);
+ return 0;
+ }
+ c = tbuf.text[0];
+ free(tbuf.text);
+ return c;
+}
+
+\f
/* Utility routine that computes a mask of the form 0000...111... with
WIDTH 1-bits. */
return ((size_t) 1 << width) - 1;
}
-\f
+/* A large table of unicode character information. */
+enum {
+ /* Valid in a C99 identifier? */
+ C99 = 1,
+ /* Valid in a C99 identifier, but not as the first character? */
+ DIG = 2,
+ /* Valid in a C++ identifier? */
+ CXX = 4,
+ /* NFC representation is not valid in an identifier? */
+ CID = 8,
+ /* Might be valid NFC form? */
+ NFC = 16,
+ /* Might be valid NFKC form? */
+ NKC = 32,
+ /* Certain preceding characters might make it not valid NFC/NKFC form? */
+ CTX = 64
+};
+
+static const struct {
+ /* Bitmap of flags above. */
+ unsigned char flags;
+ /* Combining class of the character. */
+ unsigned char combine;
+ /* Last character in the range described by this entry. */
+ unsigned short end;
+} ucnranges[] = {
+#include "ucnid.h"
+};
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
the start of an identifier, and 0 if C is not valid in an
identifier. We assume C has already gone through the checks of
- _cpp_valid_ucn. The algorithm is a simple binary search on the
- table defined in cppucnid.h. */
+ _cpp_valid_ucn. Also update NST for C if returning nonzero. The
+ algorithm is a simple binary search on the table defined in
+ ucnid.h. */
static int
-ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
+ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
+ struct normalize_state *nst)
{
int mn, mx, md;
- mn = -1;
- mx = ARRAY_SIZE (ucnranges);
- while (mx - mn > 1)
+ if (c > 0xFFFF)
+ return 0;
+
+ mn = 0;
+ mx = ARRAY_SIZE (ucnranges) - 1;
+ while (mx != mn)
{
md = (mn + mx) / 2;
- if (c < ucnranges[md].lo)
+ if (c <= ucnranges[md].end)
mx = md;
- else if (c > ucnranges[md].hi)
- mn = md;
else
- goto found;
+ mn = md + 1;
}
- return 0;
- found:
/* When -pedantic, we require the character to have been listed by
the standard for the current language. Otherwise, we accept the
union of the acceptable sets for C++98 and C99. */
+ if (! (ucnranges[mn].flags & (C99 | CXX)))
+ return 0;
+
if (CPP_PEDANTIC (pfile)
- && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
+ && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
|| (CPP_OPTION (pfile, cplusplus)
- && !(ucnranges[md].flags & CXX))))
+ && !(ucnranges[mn].flags & CXX))))
return 0;
+ /* Update NST. */
+ if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
+ nst->level = normalized_none;
+ else if (ucnranges[mn].flags & CTX)
+ {
+ bool safe;
+ cppchar_t p = nst->previous;
+
+ /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */
+ if (c == 0x09BE)
+ safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
+ else if (c == 0x0B3E)
+ safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
+ else if (c == 0x0BBE)
+ safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
+ else if (c == 0x0CC2)
+ safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
+ else if (c == 0x0D3E)
+ safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
+ /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
+ and are combined algorithmically from a sequence of the form
+ 1100-1112 1161-1175 11A8-11C2
+ (if the third is not present, it is treated as 11A7, which is not
+ really a valid character).
+ Unfortunately, C99 allows (only) the NFC form, but C++ allows
+ only the combining characters. */
+ else if (c >= 0x1161 && c <= 0x1175)
+ safe = p < 0x1100 || p > 0x1112;
+ else if (c >= 0x11A8 && c <= 0x11C2)
+ safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
+ else
+ {
+ /* Uh-oh, someone updated ucnid.h without updating this code. */
+ cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
+ safe = true;
+ }
+ if (!safe && c < 0x1161)
+ nst->level = normalized_none;
+ else if (!safe)
+ nst->level = MAX (nst->level, normalized_identifier_C);
+ }
+ else if (ucnranges[mn].flags & NKC)
+ ;
+ else if (ucnranges[mn].flags & NFC)
+ nst->level = MAX (nst->level, normalized_C);
+ else if (ucnranges[mn].flags & CID)
+ nst->level = MAX (nst->level, normalized_identifier_C);
+ else
+ nst->level = normalized_none;
+ nst->previous = c;
+ nst->prev_class = ucnranges[mn].combine;
+
/* In C99, UCN digits may not begin identifiers. */
- if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
+ if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
return 2;
return 1;
ISO/IEC 10646 is NNNNNNNN; the character designated by the
universal character name \uNNNN is that character whose character
short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
- for a universal character name is less than 0x20 or in the range
- 0x7F-0x9F (inclusive), or if the universal character name
- designates a character in the basic source character set, then the
- program is ill-formed.
+ for a universal character name corresponds to a surrogate code point
+ (in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
+ Additionally, if the hexadecimal value for a universal-character-name
+ outside a character or string literal corresponds to a control character
+ (in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
+ character in the basic source character set, the program is ill-formed.
+
+ C99 6.4.3: A universal character name shall not specify a character
+ whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
+ or 0060 (`), nor one in the range D800 through DFFF inclusive.
*PSTR must be preceded by "\u" or "\U"; it is assumed that the
- buffer end is delimited by a non-hex digit. Returns zero if UCNs
- are not part of the relevant standard, or if the string beginning
- at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
+ buffer end is delimited by a non-hex digit. Returns zero if the
+ UCN has not been consumed.
Otherwise the nonzero value of the UCN, whether valid or invalid,
is returned. Diagnostics are emitted for invalid values. PSTR
invalid character.
IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
- an identifier, or 2 otherwise.
-*/
+ an identifier, or 2 otherwise. */
cppchar_t
_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
- const uchar *limit, int identifier_pos)
+ const uchar *limit, int identifier_pos,
+ struct normalize_state *nst)
{
cppchar_t result, c;
unsigned int length;
cpp_error (pfile, CPP_DL_WARNING,
"universal character names are only valid in C++ and C99");
else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
- cpp_error (pfile, CPP_DL_WARNING,
- "the meaning of '\\%c' is different in traditional C",
- (int) str[-1]);
+ cpp_warning (pfile, CPP_W_TRADITIONAL,
+ "the meaning of '\\%c' is different in traditional C",
+ (int) str[-1]);
if (str[-1] == 'u')
length = 4;
else if (str[-1] == 'U')
length = 8;
else
- abort();
+ {
+ cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
+ length = 4;
+ }
result = 0;
do
}
while (--length && str < limit);
+ /* Partial UCNs are not valid in strings, but decompose into
+ multiple tokens in identifiers, so we can't give a helpful
+ error message in that case. */
+ if (length && identifier_pos)
+ return 0;
+
*pstr = str;
if (length)
{
- /* We'll error when we try it out as the start of an identifier. */
cpp_error (pfile, CPP_DL_ERROR,
"incomplete universal character name %.*s",
(int) (str - base), base);
result = 1;
}
- /* The standard permits $, @ and ` to be specified as UCNs. We use
- hex escapes so that this also works with EBCDIC hosts. */
+ /* The C99 standard permits $, @ and ` to be specified as UCNs. We use
+ hex escapes so that this also works with EBCDIC hosts.
+ C++0x permits everything below 0xa0 within literals;
+ ucn_valid_in_identifier will complain about identifiers. */
else if ((result < 0xa0
+ && !CPP_OPTION (pfile, cplusplus)
&& (result != 0x24 && result != 0x40 && result != 0x60))
|| (result & 0x80000000)
|| (result >= 0xD800 && result <= 0xDFFF))
(int) (str - base), base);
result = 1;
}
+ else if (identifier_pos && result == 0x24
+ && CPP_OPTION (pfile, dollars_in_ident))
+ {
+ if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
+ {
+ CPP_OPTION (pfile, warn_dollars) = 0;
+ cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
+ }
+ NORMALIZE_STATE_UPDATE_IDNUM (nst);
+ }
else if (identifier_pos)
{
- int validity = ucn_valid_in_identifier (pfile, result);
+ int validity = ucn_valid_in_identifier (pfile, result, nst);
if (validity == 0)
cpp_error (pfile, CPP_DL_ERROR,
/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
it to the execution character set and write the result into TBUF.
An advanced pointer is returned. Issues all relevant diagnostics. */
-
-
static const uchar *
convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
- struct _cpp_strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, struct cset_converter cvt)
{
cppchar_t ucn;
uchar buf[6];
uchar *bufp = buf;
size_t bytesleft = 6;
int rval;
- struct cset_converter cvt
- = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
from++; /* Skip u/U. */
- ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
+ ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
if (rval)
return from;
}
+/* Subroutine of convert_hex and convert_oct. N is the representation
+ in the execution character set of a numeric escape; write it into the
+ string buffer TBUF and update the end-of-string pointer therein. WIDE
+ is true if it's a wide string that's being assembled in TBUF. This
+ function issues no diagnostics and never fails. */
static void
emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
- struct _cpp_strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, struct cset_converter cvt)
{
- if (wide)
+ size_t width = cvt.width;
+
+ if (width != CPP_OPTION (pfile, char_precision))
{
/* We have to render this into the target byte order, which may not
be our byte order. */
bool bigend = CPP_OPTION (pfile, bytes_big_endian);
- size_t width = CPP_OPTION (pfile, wchar_precision);
size_t cwidth = CPP_OPTION (pfile, char_precision);
size_t cmask = width_to_mask (cwidth);
size_t nbwc = width / cwidth;
if (tbuf->len + nbwc > tbuf->asize)
{
tbuf->asize += OUTBUF_BLOCK_SIZE;
- tbuf->text = xrealloc (tbuf->text, tbuf->asize);
+ tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
}
for (i = 0; i < nbwc; i++)
}
else
{
+ /* Note: this code does not handle the case where the target
+ and host have a different number of bits in a byte. */
if (tbuf->len + 1 > tbuf->asize)
{
tbuf->asize += OUTBUF_BLOCK_SIZE;
- tbuf->text = xrealloc (tbuf->text, tbuf->asize);
+ tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
}
tbuf->text[tbuf->len++] = n;
}
number. You can, e.g. generate surrogate pairs this way. */
static const uchar *
convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
- struct _cpp_strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, struct cset_converter cvt)
{
cppchar_t c, n = 0, overflow = 0;
int digits_found = 0;
- size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
- : CPP_OPTION (pfile, char_precision));
+ size_t width = cvt.width;
size_t mask = width_to_mask (width);
if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, CPP_DL_WARNING,
- "the meaning of '\\x' is different in traditional C");
+ cpp_warning (pfile, CPP_W_TRADITIONAL,
+ "the meaning of '\\x' is different in traditional C");
from++; /* Skip 'x'. */
while (from < limit)
n &= mask;
}
- emit_numeric_escape (pfile, n, tbuf, wide);
+ emit_numeric_escape (pfile, n, tbuf, cvt);
return from;
}
number. */
static const uchar *
convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
- struct _cpp_strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, struct cset_converter cvt)
{
size_t count = 0;
cppchar_t c, n = 0;
- size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
- : CPP_OPTION (pfile, char_precision));
+ size_t width = cvt.width;
size_t mask = width_to_mask (width);
bool overflow = false;
n &= mask;
}
- emit_numeric_escape (pfile, n, tbuf, wide);
+ emit_numeric_escape (pfile, n, tbuf, cvt);
return from;
}
pointer. Handles all relevant diagnostics. */
static const uchar *
convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
- struct _cpp_strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, struct cset_converter cvt)
{
/* Values of \a \b \e \f \n \r \t \v respectively. */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#endif
uchar c;
- struct cset_converter cvt
- = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
c = *from;
switch (c)
{
/* UCNs, hex escapes, and octal escapes are processed separately. */
case 'u': case 'U':
- return convert_ucn (pfile, from, limit, tbuf, wide);
+ return convert_ucn (pfile, from, limit, tbuf, cvt);
case 'x':
- return convert_hex (pfile, from, limit, tbuf, wide);
+ return convert_hex (pfile, from, limit, tbuf, cvt);
break;
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
- return convert_oct (pfile, from, limit, tbuf, wide);
+ return convert_oct (pfile, from, limit, tbuf, cvt);
/* Various letter escapes. Get the appropriate host-charset
value into C. */
case 'a':
if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, CPP_DL_WARNING,
- "the meaning of '\\a' is different in traditional C");
+ cpp_warning (pfile, CPP_W_TRADITIONAL,
+ "the meaning of '\\a' is different in traditional C");
c = charconsts[0];
break;
unknown:
if (ISGRAPH (c))
cpp_error (pfile, CPP_DL_PEDWARN,
- "unknown escape sequence '\\%c'", (int) c);
+ "unknown escape sequence: '\\%c'", (int) c);
else
- cpp_error (pfile, CPP_DL_PEDWARN,
- "unknown escape sequence: '\\%03o'", (int) c);
+ {
+ /* diagnostic.c does not support "%03o". When it does, this
+ code can use %03o directly in the diagnostic again. */
+ char buf[32];
+ sprintf(buf, "%03o", (int) c);
+ cpp_error (pfile, CPP_DL_PEDWARN,
+ "unknown escape sequence: '\\%s'", buf);
+ }
}
/* Now convert what we have to the execution character set. */
return from + 1;
}
\f
+/* TYPE is a token type. The return value is the conversion needed to
+ convert from source to execution character set for the given type. */
+static struct cset_converter
+converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
+{
+ switch (type)
+ {
+ default:
+ return pfile->narrow_cset_desc;
+ case CPP_UTF8STRING:
+ return pfile->utf8_cset_desc;
+ case CPP_CHAR16:
+ case CPP_STRING16:
+ return pfile->char16_cset_desc;
+ case CPP_CHAR32:
+ case CPP_STRING32:
+ return pfile->char32_cset_desc;
+ case CPP_WCHAR:
+ case CPP_WSTRING:
+ return pfile->wide_cset_desc;
+ }
+}
+
/* FROM is an array of cpp_string structures of length COUNT. These
are to be converted from the source to the execution character set,
escape sequences translated, and finally all are to be
false for failure. */
bool
cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
- cpp_string *to, bool wide)
+ cpp_string *to, enum cpp_ttype type)
{
struct _cpp_strbuf tbuf;
const uchar *p, *base, *limit;
size_t i;
- struct cset_converter cvt
- = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
+ struct cset_converter cvt = converter_for_type (pfile, type);
tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
- tbuf.text = xmalloc (tbuf.asize);
+ tbuf.text = XNEWVEC (uchar, tbuf.asize);
tbuf.len = 0;
for (i = 0; i < count; i++)
{
p = from[i].text;
- if (*p == 'L') p++;
+ if (*p == 'u')
+ {
+ if (*++p == '8')
+ p++;
+ }
+ else if (*p == 'L' || *p == 'U') p++;
+ if (*p == 'R')
+ {
+ const uchar *prefix;
+
+ /* Skip over 'R"'. */
+ p += 2;
+ prefix = p;
+ while (*p != '(')
+ p++;
+ p++;
+ limit = from[i].text + from[i].len;
+ if (limit >= p + (p - prefix) + 1)
+ limit -= (p - prefix) + 1;
+
+ /* Raw strings are all normal characters; these can be fed
+ directly to convert_cset. */
+ if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
+ goto fail;
+
+ continue;
+ }
+
p++; /* Skip leading quote. */
limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
if (p == limit)
break;
- p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
+ p = convert_escape (pfile, p + 1, limit, &tbuf, cvt);
}
}
/* NUL-terminate the 'to' buffer and translate it to a cpp_string
structure. */
- emit_numeric_escape (pfile, 0, &tbuf, wide);
- tbuf.text = xrealloc (tbuf.text, tbuf.len);
+ emit_numeric_escape (pfile, 0, &tbuf, cvt);
+ tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
to->text = tbuf.text;
to->len = tbuf.len;
return true;
in a string, but do not perform character set conversion. */
bool
cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
- size_t count, cpp_string *to, bool wide)
+ size_t count, cpp_string *to,
+ enum cpp_ttype type ATTRIBUTE_UNUSED)
{
struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
bool retval;
pfile->narrow_cset_desc.func = convert_no_conversion;
pfile->narrow_cset_desc.cd = (iconv_t) -1;
+ pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
- retval = cpp_interpret_string (pfile, from, count, to, wide);
+ retval = cpp_interpret_string (pfile, from, count, to, CPP_STRING);
pfile->narrow_cset_desc = save_narrow_cset_desc;
return retval;
"character constant too long for its type");
}
else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
- cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
+ cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
/* Multichar constants are of type int and therefore signed. */
if (i > 1)
/* Subroutine of cpp_interpret_charconst which performs the conversion
to a number, for wide strings. STR is the string structure returned
by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
- cpp_interpret_charconst. */
+ cpp_interpret_charconst. TYPE is the token type. */
static cppchar_t
wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
- unsigned int *pchars_seen, int *unsignedp)
+ unsigned int *pchars_seen, int *unsignedp,
+ enum cpp_ttype type)
{
bool bigend = CPP_OPTION (pfile, bytes_big_endian);
- size_t width = CPP_OPTION (pfile, wchar_precision);
+ size_t width = converter_for_type (pfile, type).width;
size_t cwidth = CPP_OPTION (pfile, char_precision);
size_t mask = width_to_mask (width);
size_t cmask = width_to_mask (cwidth);
/* Wide character constants have type wchar_t, and a single
character exactly fills a wchar_t, so a multi-character wide
character constant is guaranteed to overflow. */
- if (off > 0)
+ if (str.len > nbwc * 2)
cpp_error (pfile, CPP_DL_WARNING,
"character constant too long for its type");
sign- or zero-extend to the full width of cppchar_t. */
if (width < BITS_PER_CPPCHAR_T)
{
- if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
+ if (type == CPP_CHAR16 || type == CPP_CHAR32
+ || CPP_OPTION (pfile, unsigned_wchar)
+ || !(result & (1 << (width - 1))))
result &= mask;
else
result |= ~mask;
}
- *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
+ if (type == CPP_CHAR16 || type == CPP_CHAR32
+ || CPP_OPTION (pfile, unsigned_wchar))
+ *unsignedp = 1;
+ else
+ *unsignedp = 0;
+
*pchars_seen = 1;
return result;
}
unsigned int *pchars_seen, int *unsignedp)
{
cpp_string str = { 0, 0 };
- bool wide = (token->type == CPP_WCHAR);
+ bool wide = (token->type != CPP_CHAR);
cppchar_t result;
- /* an empty constant will appear as L'' or '' */
+ /* an empty constant will appear as L'', u'', U'' or '' */
if (token->val.str.len == (size_t) (2 + wide))
{
cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
return 0;
}
- else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
+ else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, token->type))
return 0;
if (wide)
- result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
+ result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
+ token->type);
else
result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
return result;
}
+\f
+/* Convert an identifier denoted by ID and LEN, which might contain
+ UCN escapes, to the source character set, either UTF-8 or
+ UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
+cpp_hashnode *
+_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
+{
+ /* It turns out that a UCN escape always turns into fewer characters
+ than the escape itself, so we can allocate a temporary in advance. */
+ uchar * buf = (uchar *) alloca (len + 1);
+ uchar * bufp = buf;
+ size_t idp;
+
+ for (idp = 0; idp < len; idp++)
+ if (id[idp] != '\\')
+ *bufp++ = id[idp];
+ else
+ {
+ unsigned length = id[idp+1] == 'u' ? 4 : 8;
+ cppchar_t value = 0;
+ size_t bufleft = len - (bufp - buf);
+ int rval;
+
+ idp += 2;
+ while (length && idp < len && ISXDIGIT (id[idp]))
+ {
+ value = (value << 4) + hex_value (id[idp]);
+ idp++;
+ length--;
+ }
+ idp--;
+
+ /* Special case for EBCDIC: if the identifier contains
+ a '$' specified using a UCN, translate it to EBCDIC. */
+ if (value == 0x24)
+ {
+ *bufp++ = '$';
+ continue;
+ }
+
+ rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
+ if (rval)
+ {
+ errno = rval;
+ cpp_errno (pfile, CPP_DL_ERROR,
+ "converting UCN to source character set");
+ break;
+ }
+ }
-uchar *
+ return CPP_HASHNODE (ht_lookup (pfile->hash_table,
+ buf, bufp - buf, HT_ALLOC));
+}
+\f
+/* Convert an input buffer (containing the complete contents of one
+ source file) from INPUT_CHARSET to the source character set. INPUT
+ points to the input buffer, SIZE is its allocated size, and LEN is
+ the length of the meaningful data within the buffer. The
+ translated buffer is returned, *ST_SIZE is set to the length of
+ the meaningful data within the translated buffer, and *BUFFER_START
+ is set to the start of the returned buffer. *BUFFER_START may
+ differ from the return value in the case of a BOM or other ignored
+ marker information.
+
+ INPUT is expected to have been allocated with xmalloc. This
+ function will either set *BUFFER_START to INPUT, or free it and set
+ *BUFFER_START to a pointer to another xmalloc-allocated block of
+ memory. */
+uchar *
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
- uchar *input, size_t size, size_t len, off_t *st_size)
+ uchar *input, size_t size, size_t len,
+ const unsigned char **buffer_start, off_t *st_size)
{
struct cset_converter input_cset;
struct _cpp_strbuf to;
+ unsigned char *buffer;
input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
if (input_cset.func == convert_no_conversion)
else
{
to.asize = MAX (65536, len);
- to.text = xmalloc (to.asize);
+ to.text = XNEWVEC (uchar, to.asize);
to.len = 0;
if (!APPLY_CONVERSION (input_cset, input, len, &to))
/* Resize buffer if we allocated substantially too much, or if we
haven't enough space for the \n-terminator. */
if (to.len + 4096 < to.asize || to.len >= to.asize)
- to.text = xrealloc (to.text, to.len + 1);
+ to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
+
+ /* If the file is using old-school Mac line endings (\r only),
+ terminate with another \r, not an \n, so that we do not mistake
+ the \r\n sequence for a single DOS line ending and erroneously
+ issue the "No newline at end of file" diagnostic. */
+ if (to.len && to.text[to.len - 1] == '\r')
+ to.text[to.len] = '\r';
+ else
+ to.text[to.len] = '\n';
- to.text[to.len] = '\n';
+ buffer = to.text;
*st_size = to.len;
- return to.text;
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+ /* The HOST_CHARSET test just above ensures that the source charset
+ is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
+ glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
+ BOM -- however, even if it did, we would still need this code due
+ to the 'convert_no_conversion' case. */
+ if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
+ && to.text[2] == 0xbf)
+ {
+ *st_size -= 3;
+ buffer += 3;
+ }
+#endif
+
+ *buffer_start = to.text;
+ return buffer;
}
+/* Decide on the default encoding to assume for input files. */
const char *
_cpp_default_encoding (void)
{
- now we can parse something like "#pragma GCC encoding <xyz>
on the first line, or even Emacs/VIM's mode line tags (there's
a problem here in that VIM uses the last line, and Emacs has
- its more elaborate "Local variables:" convention).
+ its more elaborate "local variables" convention).
- investigate whether Java has another common convention, which
would be friendly to support.
(Zack Weinberg and Paolo Bonzini, May 20th 2004) */