1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "coretypes.h"
29 /* Character set handling for C-family languages.
31 Terminological note: In what follows, "charset" or "character set"
32 will be taken to mean both an abstract set of characters and an
33 encoding for that set.
35 The C99 standard discusses two character sets: source and execution.
36 The source character set is used for internal processing in translation
37 phases 1 through 4; the execution character set is used thereafter.
38 Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
39 character encodings (see 3.7.2, 3.7.3 for the standardese meanings
40 of these terms). Furthermore, the "basic character set" (listed in
41 5.2.1p3) is to be encoded in each with values one byte wide, and is
42 to appear in the initial shift state.
44 It is not explicitly mentioned, but there is also a "wide execution
45 character set" used to encode wide character constants and wide
46 string literals; this is supposed to be the result of applying the
47 standard library function mbstowcs() to an equivalent narrow string
48 (6.4.5p5). However, the behavior of hexadecimal and octal
49 \-escapes is at odds with this; they are supposed to be translated
50 directly to wchar_t values (6.4.4.4p5,6).
52 The source character set is not necessarily the character set used
53 to encode physical source files on disk; translation phase 1 converts
54 from whatever that encoding is to the source character set.
56 The presence of universal character names in C99 (6.4.3 et seq.)
57 forces the source character set to be isomorphic to ISO 10646,
58 that is, Unicode. There is no such constraint on the execution
59 character set; note also that the conversion from source to
60 execution character set does not occur for identifiers (5.1.1.2p1#5).
62 For convenience of implementation, the source character set's
63 encoding of the basic character set should be identical to the
64 execution character set OF THE HOST SYSTEM's encoding of the basic
65 character set, and it should not be a state-dependent encoding.
67 cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
68 depending on whether the host is based on ASCII or EBCDIC (see
69 respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
70 Technical Report #16). It relies on the system library's iconv()
71 primitive to do charset conversion (specified in SUSv2). If this
72 primitive is not present, the source and execution character sets
73 must be identical and are limited to the basic ASCII or EBCDIC
74 range, and wide characters are implemented by padding narrow
75 characters to the size of wchar_t. */
78 /* Make certain that the uses of iconv(), iconv_open(), iconv_close()
79 below, which are guarded only by if statements with compile-time
80 constant conditions, do not cause link errors. */
81 #define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
82 #define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
83 #define iconv_close(x) 0
87 #if HOST_CHARSET == HOST_CHARSET_ASCII
88 #define SOURCE_CHARSET "UTF-8"
89 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
90 #define SOURCE_CHARSET "UTF-EBCDIC"
92 #error "Unrecognized basic host character set"
95 /* This structure is used for a resizable string buffer, mostly by
96 convert_cset and cpp_interpret_string. */
104 /* Output buffers grow in blocks of this size. A string that fits on
105 a single 80-column line normally fits in one block; if iconv expands
106 it beyond that (e.g. conversion from ASCII to UCS-4), more are added. */
107 #define OUTBUF_BLOCK_SIZE 256
109 /* Subroutine of cpp_init_iconv: initialize and return an iconv
110 descriptor for conversion from FROM to TO. If iconv_open() fails,
111 issue an error and return (iconv_t) -1. Silently return
112 (iconv_t) -1 if FROM and TO are identical. */
114 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
118 if (!strcmp (to, from))
121 dsc = iconv_open (to, from);
122 if (dsc == (iconv_t) -1)
125 cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
126 "conversion from %s to %s not supported by iconv",
129 cpp_errno (pfile, DL_ERROR, "iconv_open");
134 /* If charset conversion is requested, initialize iconv(3) descriptors
135 for conversion from the source character set to the execution
136 character sets. If iconv is not present in the C library, and
137 conversion is requested, issue an error. */
140 cpp_init_iconv (cpp_reader *pfile)
142 const char *ncset = CPP_OPTION (pfile, narrow_charset);
143 const char *wcset = CPP_OPTION (pfile, wide_charset);
144 const char *default_wcset;
146 bool be = CPP_OPTION (pfile, bytes_big_endian);
148 if (CPP_OPTION (pfile, wchar_precision) >= 32)
149 default_wcset = be ? "UCS-4BE" : "UCS-4LE";
150 else if (CPP_OPTION (pfile, wchar_precision) >= 16)
151 default_wcset = be ? "UCS-2BE" : "UCS-2LE";
153 /* This effectively means that wide strings are not supported,
154 so don't do any conversion at all. */
155 default_wcset = SOURCE_CHARSET;
159 if (ncset && strcmp (ncset, SOURCE_CHARSET))
160 cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
161 "no iconv implementation, cannot convert to %s", ncset);
163 if (wcset && strcmp (wcset, default_wcset))
164 cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
165 "no iconv implementation, cannot convert to %s", wcset);
170 ncset = SOURCE_CHARSET;
172 wcset = default_wcset;
174 pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
175 pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
180 _cpp_destroy_iconv (cpp_reader *pfile)
184 if (pfile->narrow_cset_desc != (iconv_t) -1)
185 iconv_close (pfile->narrow_cset_desc);
186 if (pfile->wide_cset_desc != (iconv_t) -1)
187 iconv_close (pfile->wide_cset_desc);
191 /* iconv(3) utility wrapper. Convert the string FROM, of length FLEN,
192 according to the iconv descriptor CD. The result is appended to
193 the string buffer TO. If CD is (iconv_t)-1 or iconv is not
194 available, the string is simply copied into TO.
196 Returns true on success, false on error. */
199 convert_cset (iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
201 if (!HAVE_ICONV || cd == (iconv_t)-1)
203 if (to->len + flen > to->asize)
205 to->asize = to->len + flen;
206 to->text = xrealloc (to->text, to->asize);
208 memcpy (to->text + to->len, from, flen);
214 ICONV_CONST char *inbuf;
216 size_t inbytesleft, outbytesleft;
218 /* Reset conversion descriptor and check that it is valid. */
219 if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
222 inbuf = (ICONV_CONST char *)from;
224 outbuf = (char *)to->text + to->len;
225 outbytesleft = to->asize - to->len;
229 iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
230 if (__builtin_expect (inbytesleft == 0, 1))
232 to->len = to->asize - outbytesleft;
238 outbytesleft += OUTBUF_BLOCK_SIZE;
239 to->asize += OUTBUF_BLOCK_SIZE;
240 to->text = xrealloc (to->text, to->asize);
241 outbuf = (char *)to->text + to->asize - outbytesleft;
246 /* Utility routine that computes a mask of the form 0000...111... with
249 width_to_mask (size_t width)
251 width = MIN (width, BITS_PER_CPPCHAR_T);
252 if (width >= CHAR_BIT * sizeof (size_t))
255 return ((size_t) 1 << width) - 1;
260 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
261 the start of an identifier, and 0 if C is not valid in an
262 identifier. We assume C has already gone through the checks of
263 _cpp_valid_ucn. The algorithm is a simple binary search on the
264 table defined in cppucnid.h. */
267 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
272 mx = ARRAY_SIZE (ucnranges);
276 if (c < ucnranges[md].lo)
278 else if (c > ucnranges[md].hi)
286 /* When -pedantic, we require the character to have been listed by
287 the standard for the current language. Otherwise, we accept the
288 union of the acceptable sets for C++98 and C99. */
289 if (CPP_PEDANTIC (pfile)
290 && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
291 || (CPP_OPTION (pfile, cplusplus)
292 && !(ucnranges[md].flags & CXX))))
295 /* In C99, UCN digits may not begin identifiers. */
296 if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
302 /* [lex.charset]: The character designated by the universal character
303 name \UNNNNNNNN is that character whose character short name in
304 ISO/IEC 10646 is NNNNNNNN; the character designated by the
305 universal character name \uNNNN is that character whose character
306 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
307 for a universal character name is less than 0x20 or in the range
308 0x7F-0x9F (inclusive), or if the universal character name
309 designates a character in the basic source character set, then the
310 program is ill-formed.
312 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
313 buffer end is delimited by a non-hex digit. Returns zero if UCNs
314 are not part of the relevant standard, or if the string beginning
315 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
317 Otherwise the nonzero value of the UCN, whether valid or invalid,
318 is returned. Diagnostics are emitted for invalid values. PSTR
319 is updated to point one beyond the UCN, or to the syntactically
322 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
323 an identifier, or 2 otherwise.
327 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
328 const uchar *limit, int identifier_pos)
332 const uchar *str = *pstr;
333 const uchar *base = str - 2;
335 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
336 cpp_error (pfile, DL_WARNING,
337 "universal character names are only valid in C++ and C99");
338 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
339 cpp_error (pfile, DL_WARNING,
340 "the meaning of '\\%c' is different in traditional C",
345 else if (str[-1] == 'U')
357 result = (result << 4) + hex_value (c);
359 while (--length && str < limit);
364 /* We'll error when we try it out as the start of an identifier. */
365 cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
366 (int) (str - base), base);
369 /* The standard permits $, @ and ` to be specified as UCNs. We use
370 hex escapes so that this also works with EBCDIC hosts. */
371 else if ((result < 0xa0
372 && (result != 0x24 && result != 0x40 && result != 0x60))
373 || (result & 0x80000000)
374 || (result >= 0xD800 && result <= 0xDFFF))
376 cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character",
377 (int) (str - base), base);
380 else if (identifier_pos)
382 int validity = ucn_valid_in_identifier (pfile, result);
385 cpp_error (pfile, DL_ERROR,
386 "universal character %.*s is not valid in an identifier",
387 (int) (str - base), base);
388 else if (validity == 2 && identifier_pos == 1)
389 cpp_error (pfile, DL_ERROR,
390 "universal character %.*s is not valid at the start of an identifier",
391 (int) (str - base), base);
393 /* We don't accept UCNs if iconv is not available or will not
394 convert to the target wide character set. */
395 else if (!HAVE_ICONV || pfile->wide_cset_desc == (iconv_t) -1)
397 /* XXX should be DL_SORRY */
398 cpp_error (pfile, DL_ERROR,
399 "universal character names are not supported in this configuration");
409 /* Convert a UCN, pointed to by FROM, to UTF-8 encoding, then translate
410 it to the execution character set and write the result into TBUF.
411 An advanced pointer is returned. Issues all relevant diagnostics.
413 UTF-8 encoding looks like this:
415 value range encoded as
416 00000000-0000007F 0xxxxxxx
417 00000080-000007FF 110xxxxx 10xxxxxx
418 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
419 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
420 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
421 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
423 Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
424 which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
425 never occur. Note also that any value that can be encoded by a
426 given row of the table can also be encoded by all successive rows,
427 but this is not done; only the shortest possible encoding for any
428 given value is valid. For instance, the character 07C0 could be
429 encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
430 FC 80 80 80 9F 80. Only the first is valid. */
433 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
434 struct strbuf *tbuf, bool wide)
437 uchar buf[6], *p = &buf[6];
438 static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
441 from++; /* skip u/U */
442 ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
453 *--p = ((ucn & 0x3F) | 0x80);
457 while (ucn >= 0x3F || (ucn & masks[nbytes-1]));
458 *--p = (ucn | masks[nbytes-1]);
461 if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
463 cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
469 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
470 struct strbuf *tbuf, bool wide)
474 /* We have to render this into the target byte order, which may not
475 be our byte order. */
476 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
477 size_t width = CPP_OPTION (pfile, wchar_precision);
478 size_t cwidth = CPP_OPTION (pfile, char_precision);
479 size_t cmask = width_to_mask (cwidth);
480 size_t nbwc = width / cwidth;
482 size_t off = tbuf->len;
485 if (tbuf->len + nbwc > tbuf->asize)
487 tbuf->asize += OUTBUF_BLOCK_SIZE;
488 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
491 for (i = 0; i < nbwc; i++)
495 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
501 if (tbuf->len + 1 > tbuf->asize)
503 tbuf->asize += OUTBUF_BLOCK_SIZE;
504 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
506 tbuf->text[tbuf->len++] = n;
510 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
511 character set and write it into the string buffer TBUF. Returns an
512 advanced pointer, and issues diagnostics as necessary.
513 No character set translation occurs; this routine always produces the
514 execution-set character with numeric value equal to the given hex
515 number. You can, e.g. generate surrogate pairs this way. */
517 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
518 struct strbuf *tbuf, bool wide)
520 cppchar_t c, n = 0, overflow = 0;
521 int digits_found = 0;
522 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
523 : CPP_OPTION (pfile, char_precision));
524 size_t mask = width_to_mask (width);
526 if (CPP_WTRADITIONAL (pfile))
527 cpp_error (pfile, DL_WARNING,
528 "the meaning of '\\x' is different in traditional C");
530 from++; /* skip 'x' */
537 overflow |= n ^ (n << 4 >> 4);
538 n = (n << 4) + hex_value (c);
544 cpp_error (pfile, DL_ERROR,
545 "\\x used with no following hex digits");
549 if (overflow | (n != (n & mask)))
551 cpp_error (pfile, DL_PEDWARN,
552 "hex escape sequence out of range");
556 emit_numeric_escape (pfile, n, tbuf, wide);
561 /* Convert an octal escape, pointed to by FROM, to the execution
562 character set and write it into the string buffer TBUF. Returns an
563 advanced pointer, and issues diagnostics as necessary.
564 No character set translation occurs; this routine always produces the
565 execution-set character with numeric value equal to the given octal
568 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
569 struct strbuf *tbuf, bool wide)
573 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
574 : CPP_OPTION (pfile, char_precision));
575 size_t mask = width_to_mask (width);
576 bool overflow = false;
578 while (from < limit && count++ < 3)
581 if (c < '0' || c > '7')
584 overflow |= n ^ (n << 3 >> 3);
585 n = (n << 3) + c - '0';
590 cpp_error (pfile, DL_PEDWARN,
591 "octal escape sequence out of range");
595 emit_numeric_escape (pfile, n, tbuf, wide);
600 /* Convert an escape sequence (pointed to by FROM) to its value on
601 the target, and to the execution character set. Do not scan past
602 LIMIT. Write the converted value into TBUF. Returns an advanced
603 pointer. Handles all relevant diagnostics. */
605 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
606 struct strbuf *tbuf, bool wide)
608 /* Values of \a \b \e \f \n \r \t \v respectively. */
609 #if HOST_CHARSET == HOST_CHARSET_ASCII
610 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
611 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
612 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
614 #error "unknown host character set"
622 /* UCNs, hex escapes, and octal escapes are processed separately. */
624 return convert_ucn (pfile, from, limit, tbuf, wide);
627 return convert_hex (pfile, from, limit, tbuf, wide);
630 case '0': case '1': case '2': case '3':
631 case '4': case '5': case '6': case '7':
632 return convert_oct (pfile, from, limit, tbuf, wide);
634 /* Various letter escapes. Get the appropriate host-charset
636 case '\\': case '\'': case '"': case '?': break;
638 case '(': case '{': case '[': case '%':
639 /* '\(', etc, can be used at the beginning of a line in a long
640 string split onto multiple lines with \-newline, to prevent
641 Emacs or other text editors from getting confused. '\%' can
642 be used to prevent SCCS from mangling printf format strings. */
643 if (CPP_PEDANTIC (pfile))
647 case 'b': c = charconsts[1]; break;
648 case 'f': c = charconsts[3]; break;
649 case 'n': c = charconsts[4]; break;
650 case 'r': c = charconsts[5]; break;
651 case 't': c = charconsts[6]; break;
652 case 'v': c = charconsts[7]; break;
655 if (CPP_WTRADITIONAL (pfile))
656 cpp_error (pfile, DL_WARNING,
657 "the meaning of '\\a' is different in traditional C");
662 if (CPP_PEDANTIC (pfile))
663 cpp_error (pfile, DL_PEDWARN,
664 "non-ISO-standard escape sequence, '\\%c'", (int) c);
671 cpp_error (pfile, DL_PEDWARN,
672 "unknown escape sequence '\\%c'", (int) c);
674 cpp_error (pfile, DL_PEDWARN,
675 "unknown escape sequence: '\\%03o'", (int) c);
678 /* Now convert what we have to the execution character set. */
679 if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
681 cpp_errno (pfile, DL_ERROR,
682 "converting escape sequence to execution character set");
687 /* FROM is an array of cpp_string structures of length COUNT. These
688 are to be converted from the source to the execution character set,
689 escape sequences translated, and finally all are to be
690 concatenated. WIDE indicates whether or not to produce a wide
691 string. The result is written into TO. Returns true for success,
692 false for failure. */
694 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
695 cpp_string *to, bool wide)
698 const uchar *p, *base, *limit;
700 iconv_t cd = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
702 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
703 tbuf.text = xmalloc (tbuf.asize);
706 for (i = 0; i < count; i++)
710 p++; /* skip leading quote */
711 limit = from[i].text + from[i].len - 1; /* skip trailing quote */
716 while (p < limit && *p != '\\')
720 /* We have a run of normal characters; these can be fed
721 directly to convert_cset. */
722 if (!convert_cset (cd, base, p - base, &tbuf))
728 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
731 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
733 emit_numeric_escape (pfile, 0, &tbuf, wide);
734 tbuf.text = xrealloc (tbuf.text, tbuf.len);
735 to->text = tbuf.text;
740 cpp_errno (pfile, DL_ERROR, "converting to execution character set");
745 /* Subroutine of cpp_interpret_charconst which performs the conversion
746 to a number, for narrow strings. STR is the string structure returned
747 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
748 cpp_interpret_charconst. */
750 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
751 unsigned int *pchars_seen, int *unsignedp)
753 size_t width = CPP_OPTION (pfile, char_precision);
754 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
755 size_t mask = width_to_mask (width);
760 /* The value of a multi-character character constant, or a
761 single-character character constant whose representation in the
762 execution character set is more than one byte long, is
763 implementation defined. This implementation defines it to be the
764 number formed by interpreting the byte sequence in memory as a
765 big-endian binary number. If overflow occurs, the high bytes are
766 lost, and a warning is issued.
768 We don't want to process the NUL terminator handed back by
769 cpp_interpret_string. */
771 for (i = 0; i < str.len - 1; i++)
773 c = str.text[i] & mask;
774 if (width < BITS_PER_CPPCHAR_T)
775 result = (result << width) | c;
783 cpp_error (pfile, DL_WARNING, "character constant too long for its type");
785 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
786 cpp_error (pfile, DL_WARNING, "multi-character character constant");
788 /* Multichar constants are of type int and therefore signed. */
792 unsigned_p = CPP_OPTION (pfile, unsigned_char);
794 /* Truncate the constant to its natural width, and simultaneously
795 sign- or zero-extend to the full width of cppchar_t.
796 For single-character constants, the value is WIDTH bits wide.
797 For multi-character constants, the value is INT_PRECISION bits wide. */
799 width = CPP_OPTION (pfile, int_precision);
800 if (width < BITS_PER_CPPCHAR_T)
802 mask = ((cppchar_t) 1 << width) - 1;
803 if (unsigned_p || !(result & (1 << (width - 1))))
809 *unsignedp = unsigned_p;
813 /* Subroutine of cpp_interpret_charconst which performs the conversion
814 to a number, for wide strings. STR is the string structure returned
815 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
816 cpp_interpret_charconst. */
818 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
819 unsigned int *pchars_seen, int *unsignedp)
821 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
822 size_t width = CPP_OPTION (pfile, wchar_precision);
823 size_t cwidth = CPP_OPTION (pfile, char_precision);
824 size_t mask = width_to_mask (width);
825 size_t cmask = width_to_mask (cwidth);
826 size_t nbwc = width / cwidth;
828 cppchar_t result = 0, c;
830 /* This is finicky because the string is in the target's byte order,
831 which may not be our byte order. Only the last character, ignoring
832 the NUL terminator, is relevant. */
833 off = str.len - (nbwc * 2);
835 for (i = 0; i < nbwc; i++)
837 c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
838 result = (result << cwidth) | (c & cmask);
841 /* Wide character constants have type wchar_t, and a single
842 character exactly fills a wchar_t, so a multi-character wide
843 character constant is guaranteed to overflow. */
845 cpp_error (pfile, DL_WARNING, "character constant too long for its type");
847 /* Truncate the constant to its natural width, and simultaneously
848 sign- or zero-extend to the full width of cppchar_t. */
849 if (width < BITS_PER_CPPCHAR_T)
851 if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
857 *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
862 /* Interpret a (possibly wide) character constant in TOKEN.
863 PCHARS_SEEN points to a variable that is filled in with the number
864 of characters seen, and UNSIGNEDP to a variable that indicates
865 whether the result has unsigned type. */
867 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
868 unsigned int *pchars_seen, int *unsignedp)
870 cpp_string str = { 0, 0 };
871 bool wide = (token->type == CPP_WCHAR);
874 /* an empty constant will appear as L'' or '' */
875 if (token->val.str.len == (size_t) (2 + wide))
877 cpp_error (pfile, DL_ERROR, "empty character constant");
880 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
884 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
886 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
888 if (str.text != token->val.str.text)
889 free ((void *)str.text);