/* CPP Library - charsets
- Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
+ Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
Free Software Foundation, Inc.
Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
#include "config.h"
#include "system.h"
-#include "coretypes.h"
-#include "tm.h"
#include "cpplib.h"
#include "cpphash.h"
#include "cppucnid.h"
#endif
/* This structure is used for a resizable string buffer throughout. */
-struct strbuf
+/* Don't call it strbuf, as that conflicts with unistd.h on systems
+ such as DYNIX/ptx where unistd.h includes stropts.h. */
+struct _cpp_strbuf
{
uchar *text;
size_t asize;
{
static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
-
+
cppchar_t c;
const uchar *inbuf = *inbufp;
size_t nbytes, i;
The return value is either 0 for success, or an errno value for
failure, which may be E2BIG (need more space), EILSEQ (ill-formed
- input sequence), ir EINVAL (incomplete input sequence). */
+ input sequence), or EINVAL (incomplete input sequence). */
-
+
static inline int
one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
uchar **outbufp, size_t *outbytesleftp)
{
uchar *outbuf;
- cppchar_t s;
+ cppchar_t s = 0;
int rval;
/* Check for space first, since we know exactly how much we need. */
uchar **outbufp, size_t *outbytesleftp)
{
int rval;
- cppchar_t s;
+ cppchar_t s = 0;
const uchar *save_inbuf = *inbufp;
size_t save_inbytesleft = *inbytesleftp;
uchar *outbuf = *outbufp;
/* Helper routine for the next few functions. The 'const' on
one_conversion means that we promise not to modify what function is
- pointed to, which lets the inliner see through it. */
+ pointed to, which lets the inliner see through it. */
static inline bool
conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
uchar **, size_t *),
- iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
+ iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
{
const uchar *inbuf;
uchar *outbuf;
outbuf = to->text + to->asize - outbytesleft;
}
}
-
+
/* These functions convert entire strings between character sets.
They all have the signature
- bool (*)(iconv_t cd, const uchar *from, size_t flen, struct strbuf *to);
+ bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
The input string FROM is converted as specified by the function
name plus the iconv descriptor CD (which may be fake), and the
/* These four use the custom conversion code above. */
static bool
convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
- struct strbuf *to)
+ struct _cpp_strbuf *to)
{
return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
}
static bool
convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
- struct strbuf *to)
+ struct _cpp_strbuf *to)
{
return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
}
static bool
convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
- struct strbuf *to)
+ struct _cpp_strbuf *to)
{
return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
}
static bool
convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
- struct strbuf *to)
+ struct _cpp_strbuf *to)
{
return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
}
/* Identity conversion, used when we have no alternative. */
static bool
convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
- const uchar *from, size_t flen, struct strbuf *to)
+ const uchar *from, size_t flen, struct _cpp_strbuf *to)
{
if (to->len + flen > to->asize)
{
#if HAVE_ICONV
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
- struct strbuf *to)
+ struct _cpp_strbuf *to)
{
ICONV_CONST char *inbuf;
char *outbuf;
struct cset_converter ret;
char *pair;
size_t i;
-
+
if (!strcasecmp (to, from))
{
ret.func = convert_no_conversion;
if (ret.cd == (iconv_t) -1)
{
if (errno == EINVAL)
- cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
"conversion from %s to %s not supported by iconv",
from, to);
else
- cpp_errno (pfile, DL_ERROR, "iconv_open");
+ cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
ret.func = convert_no_conversion;
}
}
else
{
- cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
"no iconv implementation, cannot convert from %s to %s",
from, to);
ret.func = convert_no_conversion;
const uchar *base = str - 2;
if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
- cpp_error (pfile, DL_WARNING,
+ cpp_error (pfile, CPP_DL_WARNING,
"universal character names are only valid in C++ and C99");
else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
- cpp_error (pfile, DL_WARNING,
+ cpp_error (pfile, CPP_DL_WARNING,
"the meaning of '\\%c' is different in traditional C",
(int) str[-1]);
if (length)
{
/* We'll error when we try it out as the start of an identifier. */
- cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
+ cpp_error (pfile, CPP_DL_ERROR,
+ "incomplete universal character name %.*s",
(int) (str - base), base);
result = 1;
}
|| (result & 0x80000000)
|| (result >= 0xD800 && result <= 0xDFFF))
{
- cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character",
+ cpp_error (pfile, CPP_DL_ERROR,
+ "%.*s is not a valid universal character",
(int) (str - base), base);
result = 1;
}
int validity = ucn_valid_in_identifier (pfile, result);
if (validity == 0)
- cpp_error (pfile, DL_ERROR,
+ cpp_error (pfile, CPP_DL_ERROR,
"universal character %.*s is not valid in an identifier",
(int) (str - base), base);
else if (validity == 2 && identifier_pos == 1)
- cpp_error (pfile, DL_ERROR,
+ cpp_error (pfile, CPP_DL_ERROR,
"universal character %.*s is not valid at the start of an identifier",
(int) (str - base), base);
}
static const uchar *
convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
- struct strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, bool wide)
{
cppchar_t ucn;
uchar buf[6];
struct cset_converter cvt
= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
- from++; /* skip u/U */
+ from++; /* Skip u/U. */
ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
if (rval)
{
errno = rval;
- cpp_errno (pfile, DL_ERROR, "converting UCN to source character set");
+ cpp_errno (pfile, CPP_DL_ERROR,
+ "converting UCN to source character set");
}
else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
- cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
+ cpp_errno (pfile, CPP_DL_ERROR,
+ "converting UCN to execution character set");
return from;
}
static void
emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
- struct strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, bool wide)
{
if (wide)
{
number. You can, e.g. generate surrogate pairs this way. */
static const uchar *
convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
- struct strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, bool wide)
{
cppchar_t c, n = 0, overflow = 0;
int digits_found = 0;
size_t mask = width_to_mask (width);
if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, DL_WARNING,
+ cpp_error (pfile, CPP_DL_WARNING,
"the meaning of '\\x' is different in traditional C");
- from++; /* skip 'x' */
+ from++; /* Skip 'x'. */
while (from < limit)
{
c = *from;
if (!digits_found)
{
- cpp_error (pfile, DL_ERROR,
+ cpp_error (pfile, CPP_DL_ERROR,
"\\x used with no following hex digits");
return from;
}
if (overflow | (n != (n & mask)))
{
- cpp_error (pfile, DL_PEDWARN,
+ cpp_error (pfile, CPP_DL_PEDWARN,
"hex escape sequence out of range");
n &= mask;
}
number. */
static const uchar *
convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
- struct strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, bool wide)
{
size_t count = 0;
cppchar_t c, n = 0;
if (n != (n & mask))
{
- cpp_error (pfile, DL_PEDWARN,
+ cpp_error (pfile, CPP_DL_PEDWARN,
"octal escape sequence out of range");
n &= mask;
}
pointer. Handles all relevant diagnostics. */
static const uchar *
convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
- struct strbuf *tbuf, bool wide)
+ struct _cpp_strbuf *tbuf, bool wide)
{
/* Values of \a \b \e \f \n \r \t \v respectively. */
#if HOST_CHARSET == HOST_CHARSET_ASCII
case 'a':
if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, DL_WARNING,
+ cpp_error (pfile, CPP_DL_WARNING,
"the meaning of '\\a' is different in traditional C");
c = charconsts[0];
break;
case 'e': case 'E':
if (CPP_PEDANTIC (pfile))
- cpp_error (pfile, DL_PEDWARN,
+ cpp_error (pfile, CPP_DL_PEDWARN,
"non-ISO-standard escape sequence, '\\%c'", (int) c);
c = charconsts[2];
break;
default:
unknown:
if (ISGRAPH (c))
- cpp_error (pfile, DL_PEDWARN,
+ cpp_error (pfile, CPP_DL_PEDWARN,
"unknown escape sequence '\\%c'", (int) c);
else
- cpp_error (pfile, DL_PEDWARN,
+ cpp_error (pfile, CPP_DL_PEDWARN,
"unknown escape sequence: '\\%03o'", (int) c);
}
/* Now convert what we have to the execution character set. */
if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
- cpp_errno (pfile, DL_ERROR,
+ cpp_errno (pfile, CPP_DL_ERROR,
"converting escape sequence to execution character set");
return from + 1;
cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
cpp_string *to, bool wide)
{
- struct strbuf tbuf;
+ struct _cpp_strbuf tbuf;
const uchar *p, *base, *limit;
size_t i;
struct cset_converter cvt
{
p = from[i].text;
if (*p == 'L') p++;
- p++; /* skip leading quote */
- limit = from[i].text + from[i].len - 1; /* skip trailing quote */
+ p++; /* Skip leading quote. */
+ limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
for (;;)
{
return true;
fail:
- cpp_errno (pfile, DL_ERROR, "converting to execution character set");
+ cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
free (tbuf.text);
return false;
}
/* Subroutine of do_line and do_linemarker. Convert escape sequences
in a string, but do not perform character set conversion. */
bool
-_cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
- cpp_string *out)
+cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
+ size_t count, cpp_string *to, bool wide)
{
struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
bool retval;
pfile->narrow_cset_desc.func = convert_no_conversion;
pfile->narrow_cset_desc.cd = (iconv_t) -1;
- retval = cpp_interpret_string (pfile, in, 1, out, false);
+ retval = cpp_interpret_string (pfile, from, count, to, wide);
pfile->narrow_cset_desc = save_narrow_cset_desc;
return retval;
if (i > max_chars)
{
i = max_chars;
- cpp_error (pfile, DL_WARNING, "character constant too long for its type");
+ cpp_error (pfile, CPP_DL_WARNING,
+ "character constant too long for its type");
}
else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
- cpp_error (pfile, DL_WARNING, "multi-character character constant");
+ cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
/* Multichar constants are of type int and therefore signed. */
if (i > 1)
*unsignedp = unsigned_p;
return result;
}
-
+
/* Subroutine of cpp_interpret_charconst which performs the conversion
to a number, for wide strings. STR is the string structure returned
by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
character exactly fills a wchar_t, so a multi-character wide
character constant is guaranteed to overflow. */
if (off > 0)
- cpp_error (pfile, DL_WARNING, "character constant too long for its type");
+ cpp_error (pfile, CPP_DL_WARNING,
+ "character constant too long for its type");
/* Truncate the constant to its natural width, and simultaneously
sign- or zero-extend to the full width of cppchar_t. */
/* an empty constant will appear as L'' or '' */
if (token->val.str.len == (size_t) (2 + wide))
{
- cpp_error (pfile, DL_ERROR, "empty character constant");
+ cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
return 0;
}
else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
return result;
}
+
+/* Convert the LEN bytes held in INPUT from INPUT_CHARSET to
+   SOURCE_CHARSET and return the resulting buffer with a '\n' stored
+   just past the data.  SIZE is used as the allocated size of INPUT.
+   The length of the result, not counting that trailing '\n', is
+   stored in *ST_SIZE.
+
+   INPUT must be heap-allocated and is consumed by this call: when no
+   conversion is needed it is returned (possibly after xrealloc),
+   otherwise it is freed and a newly allocated buffer is returned.
+   A conversion failure is reported through cpp_error; the output
+   buffer is still returned in whatever state the converter left it.  */
+uchar *
+_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
+                    uchar *input, size_t size, size_t len, off_t *st_size)
+{
+  struct cset_converter input_cset;
+  struct _cpp_strbuf to;
+
+  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
+  if (input_cset.func == convert_no_conversion)
+    {
+      /* Nothing to convert: keep working in the caller's buffer.  */
+      to.text = input;
+      to.asize = size;
+      to.len = len;
+    }
+  else
+    {
+      to.asize = MAX (65536, len);
+      to.text = xmalloc (to.asize);
+      to.len = 0;
+
+      /* Report the charset this converter was actually built for,
+         i.e. the INPUT_CHARSET argument, not the global option.  */
+      if (!APPLY_CONVERSION (input_cset, input, len, &to))
+        cpp_error (pfile, CPP_DL_ERROR,
+                   "failure to convert %s to %s",
+                   input_charset, SOURCE_CHARSET);
+
+      /* The unconverted bytes are no longer needed.  */
+      free (input);
+    }
+
+  /* Clean up the mess.  */
+  if (input_cset.func == convert_using_iconv)
+    iconv_close (input_cset.cd);
+
+  /* Resize buffer if we allocated substantially too much, or if we
+     haven't enough space for the \n-terminator.  */
+  if (to.len + 4096 < to.asize || to.len >= to.asize)
+    to.text = xrealloc (to.text, to.len + 1);
+
+  to.text[to.len] = '\n';
+  *st_size = to.len;
+  return to.text;
+}
+
+/* Return the name of the character set to assume by default.  When
+   both HAVE_LOCALE_H and HAVE_LANGINFO_CODESET are defined, this sets
+   the LC_CTYPE locale from the environment and asks nl_langinfo for
+   its codeset; if that yields nothing usable (or the query is not
+   compiled in), SOURCE_CHARSET is returned instead.  */
+const char *
+_cpp_default_encoding (void)
+{
+  const char *codeset = NULL;
+
+#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET)
+  setlocale (LC_CTYPE, "");
+  codeset = nl_langinfo (CODESET);
+#endif
+
+  /* A non-empty name from the locale wins.  */
+  if (codeset != NULL && *codeset != '\0')
+    return codeset;
+
+  return SOURCE_CHARSET;
+}