/* CPP Library - charsets
- Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008
+ Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008, 2009
Free Software Foundation, Inc.
Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 2, or (at your option) any
+Free Software Foundation; either version 3, or (at your option) any
later version.
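This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.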
You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+along with this program; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
cppchar_t *cp)
{
- static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
+ static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
cppchar_t c;
pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
+ pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET);
+ pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
pfile->char16_cset_desc = init_iconv_desc (pfile,
be ? "UTF-16BE" : "UTF-16LE",
SOURCE_CHARSET);
{
if (pfile->narrow_cset_desc.func == convert_using_iconv)
iconv_close (pfile->narrow_cset_desc.cd);
+ if (pfile->utf8_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->utf8_cset_desc.cd);
+ if (pfile->char16_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->char16_cset_desc.cd);
+ if (pfile->char32_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->char32_cset_desc.cd);
if (pfile->wide_cset_desc.func == convert_using_iconv)
iconv_close (pfile->wide_cset_desc.cd);
}
ISO/IEC 10646 is NNNNNNNN; the character designated by the
universal character name \uNNNN is that character whose character
short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
- for a universal character name is less than 0x20 or in the range
- 0x7F-0x9F (inclusive), or if the universal character name
- designates a character in the basic source character set, then the
- program is ill-formed.
+ for a universal character name corresponds to a surrogate code point
+ (in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
+ Additionally, if the hexadecimal value for a universal-character-name
+ outside a character or string literal corresponds to a control character
+ (in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
+ character in the basic source character set, the program is ill-formed.
+
+ C99 6.4.3: A universal character name shall not specify a character
+ whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
+ or 0060 (`), nor one in the range D800 through DFFF inclusive.
*PSTR must be preceded by "\u" or "\U"; it is assumed that the
buffer end is delimited by a non-hex digit. Returns zero if the
cpp_error (pfile, CPP_DL_WARNING,
"universal character names are only valid in C++ and C99");
else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
- cpp_error (pfile, CPP_DL_WARNING,
- "the meaning of '\\%c' is different in traditional C",
- (int) str[-1]);
+ cpp_warning (pfile, CPP_W_TRADITIONAL,
+ "the meaning of '\\%c' is different in traditional C",
+ (int) str[-1]);
if (str[-1] == 'u')
length = 4;
(int) (str - base), base);
result = 1;
}
- /* The standard permits $, @ and ` to be specified as UCNs. We use
- hex escapes so that this also works with EBCDIC hosts. */
+ /* The C99 standard permits $, @ and ` to be specified as UCNs. We use
+ hex escapes so that this also works with EBCDIC hosts.
+ C++0x permits everything below 0xa0 within literals;
+ ucn_valid_in_identifier will complain about identifiers. */
else if ((result < 0xa0
+ && !CPP_OPTION (pfile, cplusplus)
&& (result != 0x24 && result != 0x40 && result != 0x60))
|| (result & 0x80000000)
|| (result >= 0xD800 && result <= 0xDFFF))
size_t mask = width_to_mask (width);
if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, CPP_DL_WARNING,
- "the meaning of '\\x' is different in traditional C");
+ cpp_warning (pfile, CPP_W_TRADITIONAL,
+ "the meaning of '\\x' is different in traditional C");
from++; /* Skip 'x'. */
while (from < limit)
case 'a':
if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, CPP_DL_WARNING,
- "the meaning of '\\a' is different in traditional C");
+ cpp_warning (pfile, CPP_W_TRADITIONAL,
+ "the meaning of '\\a' is different in traditional C");
c = charconsts[0];
break;
unknown:
if (ISGRAPH (c))
cpp_error (pfile, CPP_DL_PEDWARN,
- "unknown escape sequence '\\%c'", (int) c);
+ "unknown escape sequence: '\\%c'", (int) c);
else
{
/* diagnostic.c does not support "%03o". When it does, this
{
default:
return pfile->narrow_cset_desc;
+ case CPP_UTF8STRING:
+ return pfile->utf8_cset_desc;
case CPP_CHAR16:
case CPP_STRING16:
return pfile->char16_cset_desc;
for (i = 0; i < count; i++)
{
p = from[i].text;
- if (*p == 'L' || *p == 'u' || *p == 'U') p++;
+ if (*p == 'u')
+ {
+ if (*++p == '8')
+ p++;
+ }
+ else if (*p == 'L' || *p == 'U') p++;
+ if (*p == 'R')
+ {
+ const uchar *prefix;
+
+ /* Skip over 'R"'. */
+ p += 2;
+ prefix = p;
+ while (*p != '(')
+ p++;
+ p++;
+ limit = from[i].text + from[i].len;
+ if (limit >= p + (p - prefix) + 1)
+ limit -= (p - prefix) + 1;
+
+ /* Raw strings are all normal characters; these can be fed
+ directly to convert_cset. */
+ if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
+ goto fail;
+
+ continue;
+ }
+
p++; /* Skip leading quote. */
limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
pfile->narrow_cset_desc.func = convert_no_conversion;
pfile->narrow_cset_desc.cd = (iconv_t) -1;
+ pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
retval = cpp_interpret_string (pfile, from, count, to, CPP_STRING);
"character constant too long for its type");
}
else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
- cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
+ cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
/* Multichar constants are of type int and therefore signed. */
if (i > 1)