gcc/cppcharset.c

   1 /* CPP Library - charsets
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
   3    Free Software Foundation, Inc.
   4
   5    Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
   6
   7 This program is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 This program is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with this program; if not, write to the Free Software
  19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "cpplib.h"
  26 #include "cpphash.h"
  27 #include "cppucnid.h"
  28
  29 /* Character set handling for C-family languages.
  30
  31    Terminological note: In what follows, "charset" or "character set"
  32    will be taken to mean both an abstract set of characters and an
  33    encoding for that set.
  34
  35    The C99 standard discusses two character sets: source and execution.
  36    The source character set is used for internal processing in translation
  37    phases 1 through 4; the execution character set is used thereafter.
  38    Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
  39    character encodings (see 3.7.2, 3.7.3 for the standardese meanings
  40    of these terms).  Furthermore, the "basic character set" (listed in
  41    5.2.1p3) is to be encoded in each with values one byte wide, and is
  42    to appear in the initial shift state.
  43
  44    It is not explicitly mentioned, but there is also a "wide execution
  45    character set" used to encode wide character constants and wide
  46    string literals; this is supposed to be the result of applying the
  47    standard library function mbstowcs() to an equivalent narrow string
  48    (6.4.5p5).  However, the behavior of hexadecimal and octal
  49    \-escapes is at odds with this; they are supposed to be translated
  50    directly to wchar_t values (6.4.4.4p5,6).
  51
  52    The source character set is not necessarily the character set used
  53    to encode physical source files on disk; translation phase 1 converts
  54    from whatever that encoding is to the source character set.
  55
  56    The presence of universal character names in C99 (6.4.3 et seq.)
  57    forces the source character set to be isomorphic to ISO 10646,
  58    that is, Unicode.  There is no such constraint on the execution
  59    character set; note also that the conversion from source to
  60    execution character set does not occur for identifiers (5.1.1.2p1#5).
  61
  62    For convenience of implementation, the source character set's
  63    encoding of the basic character set should be identical to the
  64    execution character set OF THE HOST SYSTEM's encoding of the basic
  65    character set, and it should not be a state-dependent encoding.
  66
  67    cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
  68    depending on whether the host is based on ASCII or EBCDIC (see
  69    respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
  70    Technical Report #16).  It relies on the system library's iconv()
  71    primitive to do charset conversion (specified in SUSv2).  If this
  72    primitive is not present, the source and execution character sets
  73    must be identical and are limited to the basic ASCII or EBCDIC
  74    range, and wide characters are implemented by padding narrow
  75    characters to the size of wchar_t.  */
  76
  #if !HAVE_ICONV
  /* Stub out iconv_open(), iconv() and iconv_close() when the host has
     no iconv.  The uses below are guarded only by if statements with
     compile-time constant conditions, so they must still compile and
     must not cause link errors.  The first two stubs fail with EINVAL.
     iconv_close expands to a void expression, so that using it as a
     statement does not leave a bare constant with no effect.  */
  #define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
  #define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
  #define iconv_close(x)   ((void) 0)
  #define ICONV_CONST
  #endif
  86
  /* Name of the source character set, chosen from the host's basic
     character set: UTF-8 on ASCII hosts, UTF-EBCDIC on EBCDIC hosts.
     Any other host character set is a build error.  */
  #if HOST_CHARSET == HOST_CHARSET_ASCII
  # define SOURCE_CHARSET "UTF-8"
  #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
  # define SOURCE_CHARSET "UTF-EBCDIC"
  #else
  # error "Unrecognized basic host character set"
  #endif
  94
  95 /* This structure is used for a resizable string buffer, mostly by
  96    convert_cset and cpp_interpret_string.  */
  97 struct strbuf
  98 {
  99   uchar *text;
 100   size_t asize;
 101   size_t len;
 102 };
 103
 /* Minimum initial size of, and growth increment for, string buffers.
    It holds the converted form of any string of up to 64 bytes even
    if iconv quadruples its size (e.g. conversion from ASCII to
    UCS-4); longer results make the buffer grow by further blocks of
    this size.  */
 107 #define OUTBUF_BLOCK_SIZE 256
 108
 109 /* Subroutine of cpp_init_iconv: initialize and return an iconv
 110    descriptor for conversion from FROM to TO.  If iconv_open() fails,
 111    issue an error and return (iconv_t) -1.  Silently return
 112    (iconv_t) -1 if FROM and TO are identical.  */
 113 static iconv_t
 114 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
 115 {
 116   iconv_t dsc;
 117
 118   if (!strcmp (to, from))
 119     return (iconv_t) -1;
 120
 121   dsc = iconv_open (to, from);
 122   if (dsc == (iconv_t) -1)
 123     {
 124       if (errno == EINVAL)
 125         cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
 126                    "conversion from %s to %s not supported by iconv",
 127                    from, to);
 128       else
 129         cpp_errno (pfile, DL_ERROR, "iconv_open");
 130     }
 131   return dsc;
 132 }
 133
 134 /* If charset conversion is requested, initialize iconv(3) descriptors
 135    for conversion from the source character set to the execution
 136    character sets.  If iconv is not present in the C library, and
 137    conversion is requested, issue an error.  */
 138
 139 void
 140 cpp_init_iconv (cpp_reader *pfile)
 141 {
 142   const char *ncset = CPP_OPTION (pfile, narrow_charset);
 143   const char *wcset = CPP_OPTION (pfile, wide_charset);
 144   const char *default_wcset;
 145
 146   bool be = CPP_OPTION (pfile, bytes_big_endian);
 147
 148   if (CPP_OPTION (pfile, wchar_precision) >= 32)
 149     default_wcset = be ? "UCS-4BE" : "UCS-4LE";
 150   else if (CPP_OPTION (pfile, wchar_precision) >= 16)
 151     default_wcset = be ? "UCS-2BE" : "UCS-2LE";
 152   else
 153     /* This effectively means that wide strings are not supported,
 154        so don't do any conversion at all.  */
 155    default_wcset = SOURCE_CHARSET;
 156
 157   if (!HAVE_ICONV)
 158     {
 159       if (ncset && strcmp (ncset, SOURCE_CHARSET))
 160         cpp_error (pfile, DL_ERROR,  /* XXX should be DL_SORRY */
 161                    "no iconv implementation, cannot convert to %s", ncset);
 162
 163       if (wcset && strcmp (wcset, default_wcset))
 164         cpp_error (pfile, DL_ERROR,  /* XXX should be DL_SORRY */
 165                    "no iconv implementation, cannot convert to %s", wcset);
 166     }
 167   else
 168     {
 169       if (!ncset)
 170         ncset = SOURCE_CHARSET;
 171       if (!wcset)
 172         wcset = default_wcset;
 173
 174       pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
 175       pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
 176     }
 177 }
 178
 179 void
 180 _cpp_destroy_iconv (cpp_reader *pfile)
 181 {
 182   if (HAVE_ICONV)
 183     {
 184       if (pfile->narrow_cset_desc != (iconv_t) -1)
 185         iconv_close (pfile->narrow_cset_desc);
 186       if (pfile->wide_cset_desc != (iconv_t) -1)
 187         iconv_close (pfile->wide_cset_desc);
 188     }
 189 }
 190
 191 /* iconv(3) utility wrapper.  Convert the string FROM, of length FLEN,
 192    according to the iconv descriptor CD.  The result is appended to
 193    the string buffer TO.  If DESC is (iconv_t)-1 or iconv is not
 194    available, the string is simply copied into TO.
 195
 196    Returns true on success, false on error.  */
 197
 198 static bool
 199 convert_cset (iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
 200 {
 201   if (!HAVE_ICONV || cd == (iconv_t)-1)
 202     {
 203       if (to->len + flen > to->asize)
 204         {
 205           to->asize = to->len + flen;
 206           to->text = xrealloc (to->text, to->asize);
 207         }
 208       memcpy (to->text + to->len, from, flen);
 209       to->len += flen;
 210       return true;
 211     }
 212   else
 213     {
 214       ICONV_CONST char *inbuf;
 215       char *outbuf;
 216       size_t inbytesleft, outbytesleft;
 217
 218       /* Reset conversion descriptor and check that it is valid.  */
 219       if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
 220         return false;
 221
 222       inbuf = (ICONV_CONST char *)from;
 223       inbytesleft = flen;
 224       outbuf = (char *)to->text + to->len;
 225       outbytesleft = to->asize - to->len;
 226
 227       for (;;)
 228         {
 229           iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
 230           if (__builtin_expect (inbytesleft == 0, 1))
 231             {
 232               to->len = to->asize - outbytesleft;
 233               return true;
 234             }
 235           if (errno != E2BIG)
 236             return false;
 237
 238           outbytesleft += OUTBUF_BLOCK_SIZE;
 239           to->asize += OUTBUF_BLOCK_SIZE;
 240           to->text = xrealloc (to->text, to->asize);
 241           outbuf = (char *)to->text + to->asize - outbytesleft;
 242         }
 243     }
 244 }
 245
 246 /* Utility routine that computes a mask of the form 0000...111... with
 247    WIDTH 1-bits.  */
 248 static inline size_t
 249 width_to_mask (size_t width)
 250 {
 251   width = MIN (width, BITS_PER_CPPCHAR_T);
 252   if (width >= CHAR_BIT * sizeof (size_t))
 253     return ~(size_t) 0;
 254   else
 255     return ((size_t) 1 << width) - 1;
 256 }
 257
 258 \f
 259
 260 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
 261    the start of an identifier, and 0 if C is not valid in an
 262    identifier.  We assume C has already gone through the checks of
 263    _cpp_valid_ucn.  The algorithm is a simple binary search on the
 264    table defined in cppucnid.h.  */
 265
 266 static int
 267 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
 268 {
 269   int mn, mx, md;
 270
 271   mn = -1;
 272   mx = ARRAY_SIZE (ucnranges);
 273   while (mx - mn > 1)
 274     {
 275       md = (mn + mx) / 2;
 276       if (c < ucnranges[md].lo)
 277         mx = md;
 278       else if (c > ucnranges[md].hi)
 279         mn = md;
 280       else
 281         goto found;
 282     }
 283   return 0;
 284
 285  found:
 286   /* When -pedantic, we require the character to have been listed by
 287      the standard for the current language.  Otherwise, we accept the
 288      union of the acceptable sets for C++98 and C99.  */
 289   if (CPP_PEDANTIC (pfile)
 290       && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
 291           || (CPP_OPTION (pfile, cplusplus)
 292               && !(ucnranges[md].flags & CXX))))
 293     return 0;
 294
 295   /* In C99, UCN digits may not begin identifiers.  */
 296   if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
 297     return 2;
 298
 299   return 1;
 300 }
 301
 302 /* [lex.charset]: The character designated by the universal character
 303    name \UNNNNNNNN is that character whose character short name in
 304    ISO/IEC 10646 is NNNNNNNN; the character designated by the
 305    universal character name \uNNNN is that character whose character
 306    short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
 307    for a universal character name is less than 0x20 or in the range
 308    0x7F-0x9F (inclusive), or if the universal character name
 309    designates a character in the basic source character set, then the
 310    program is ill-formed.
 311
 312    *PSTR must be preceded by "\u" or "\U"; it is assumed that the
 313    buffer end is delimited by a non-hex digit.  Returns zero if UCNs
 314    are not part of the relevant standard, or if the string beginning
 315    at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
 316
 317    Otherwise the nonzero value of the UCN, whether valid or invalid,
 318    is returned.  Diagnostics are emitted for invalid values.  PSTR
 319    is updated to point one beyond the UCN, or to the syntactically
 320    invalid character.
 321
 322    IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
 323    an identifier, or 2 otherwise.
 324 */
 325
 326 cppchar_t
 327 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
 328                 const uchar *limit, int identifier_pos)
 329 {
 330   cppchar_t result, c;
 331   unsigned int length;
 332   const uchar *str = *pstr;
 333   const uchar *base = str - 2;
 334
 335   if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
 336     cpp_error (pfile, DL_WARNING,
 337                "universal character names are only valid in C++ and C99");
 338   else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
 339     cpp_error (pfile, DL_WARNING,
 340                "the meaning of '\\%c' is different in traditional C",
 341                (int) str[-1]);
 342
 343   if (str[-1] == 'u')
 344     length = 4;
 345   else if (str[-1] == 'U')
 346     length = 8;
 347   else
 348     abort();
 349
 350   result = 0;
 351   do
 352     {
 353       c = *str;
 354       if (!ISXDIGIT (c))
 355         break;
 356       str++;
 357       result = (result << 4) + hex_value (c);
 358     }
 359   while (--length && str < limit);
 360
 361   *pstr = str;
 362   if (length)
 363     {
 364       /* We'll error when we try it out as the start of an identifier.  */
 365       cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
 366                  (int) (str - base), base);
 367       result = 1;
 368     }
 369   /* The standard permits $, @ and ` to be specified as UCNs.  We use
 370      hex escapes so that this also works with EBCDIC hosts.  */
 371   else if ((result < 0xa0
 372             && (result != 0x24 && result != 0x40 && result != 0x60))
 373            || (result & 0x80000000)
 374            || (result >= 0xD800 && result <= 0xDFFF))
 375     {
 376       cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character",
 377                  (int) (str - base), base);
 378       result = 1;
 379     }
 380   else if (identifier_pos)
 381     {
 382       int validity = ucn_valid_in_identifier (pfile, result);
 383
 384       if (validity == 0)
 385         cpp_error (pfile, DL_ERROR,
 386                    "universal character %.*s is not valid in an identifier",
 387                    (int) (str - base), base);
 388       else if (validity == 2 && identifier_pos == 1)
 389         cpp_error (pfile, DL_ERROR,
 390    "universal character %.*s is not valid at the start of an identifier",
 391                    (int) (str - base), base);
 392     }
 393   /* We don't accept UCNs if iconv is not available or will not
 394      convert to the target wide character set.  */
 395   else if (!HAVE_ICONV || pfile->wide_cset_desc == (iconv_t) -1)
 396     {
 397       /* XXX should be DL_SORRY */
 398       cpp_error (pfile, DL_ERROR,
 399         "universal character names are not supported in this configuration");
 400     }
 401
 402
 403   if (result == 0)
 404     result = 1;
 405
 406   return result;
 407 }
 408
 409 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
 410    it to the execution character set and write the result into TBUF.
 411    An advanced pointer is returned.  Issues all relevant diagnostics.
 412
 413    UTF-8 encoding looks like this:
 414
 415    value range         encoded as
 416    00000000-0000007F   0xxxxxxx
 417    00000080-000007FF   110xxxxx 10xxxxxx
 418    00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
 419    00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 420    00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 421    04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 422
 423    Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
 424    which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
 425    never occur.  Note also that any value that can be encoded by a
 426    given row of the table can also be encoded by all successive rows,
 427    but this is not done; only the shortest possible encoding for any
 428    given value is valid.  For instance, the character 07C0 could be
 429    encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
 430    FC 80 80 80 9F 80.  Only the first is valid.  */
 431
 432 static const uchar *
 433 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
 434              struct strbuf *tbuf, bool wide)
 435 {
 436   int nbytes;
 437   uchar buf[6], *p = &buf[6];
 438   static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 439   cppchar_t ucn;
 440
 441   from++; /* skip u/U */
 442   ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
 443   if (!ucn)
 444     return from;
 445
 446   nbytes = 1;
 447   if (ucn < 0x80)
 448     *--p = ucn;
 449   else
 450     {
 451       do
 452         {
 453           *--p = ((ucn & 0x3F) | 0x80);
 454           ucn >>= 6;
 455           nbytes++;
 456         }
 457       while (ucn >= 0x3F || (ucn & masks[nbytes-1]));
 458       *--p = (ucn | masks[nbytes-1]);
 459     }
 460
 461   if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
 462                      p, nbytes, tbuf))
 463     cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
 464
 465   return from;
 466 }
 467
 468 static void
 469 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
 470                      struct strbuf *tbuf, bool wide)
 471 {
 472   if (wide)
 473     {
 474       /* We have to render this into the target byte order, which may not
 475          be our byte order.  */
 476       bool bigend = CPP_OPTION (pfile, bytes_big_endian);
 477       size_t width = CPP_OPTION (pfile, wchar_precision);
 478       size_t cwidth = CPP_OPTION (pfile, char_precision);
 479       size_t cmask = width_to_mask (cwidth);
 480       size_t nbwc = width / cwidth;
 481       size_t i;
 482       size_t off = tbuf->len;
 483       cppchar_t c;
 484
 485       if (tbuf->len + nbwc > tbuf->asize)
 486         {
 487           tbuf->asize += OUTBUF_BLOCK_SIZE;
 488           tbuf->text = xrealloc (tbuf->text, tbuf->asize);
 489         }
 490
 491       for (i = 0; i < nbwc; i++)
 492         {
 493           c = n & cmask;
 494           n >>= cwidth;
 495           tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
 496         }
 497       tbuf->len += nbwc;
 498     }
 499   else
 500     {
 501       if (tbuf->len + 1 > tbuf->asize)
 502         {
 503           tbuf->asize += OUTBUF_BLOCK_SIZE;
 504           tbuf->text = xrealloc (tbuf->text, tbuf->asize);
 505         }
 506       tbuf->text[tbuf->len++] = n;
 507     }
 508 }
 509
 510 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
 511    character set and write it into the string buffer TBUF.  Returns an
 512    advanced pointer, and issues diagnostics as necessary.
 513    No character set translation occurs; this routine always produces the
 514    execution-set character with numeric value equal to the given hex
 515    number.  You can, e.g. generate surrogate pairs this way.  */
 516 static const uchar *
 517 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
 518              struct strbuf *tbuf, bool wide)
 519 {
 520   cppchar_t c, n = 0, overflow = 0;
 521   int digits_found = 0;
 522   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
 523                   : CPP_OPTION (pfile, char_precision));
 524   size_t mask = width_to_mask (width);
 525
 526   if (CPP_WTRADITIONAL (pfile))
 527     cpp_error (pfile, DL_WARNING,
 528                "the meaning of '\\x' is different in traditional C");
 529
 530   from++;  /* skip 'x' */
 531   while (from < limit)
 532     {
 533       c = *from;
 534       if (! hex_p (c))
 535         break;
 536       from++;
 537       overflow |= n ^ (n << 4 >> 4);
 538       n = (n << 4) + hex_value (c);
 539       digits_found = 1;
 540     }
 541
 542   if (!digits_found)
 543     {
 544       cpp_error (pfile, DL_ERROR,
 545                  "\\x used with no following hex digits");
 546       return from;
 547     }
 548
 549   if (overflow | (n != (n & mask)))
 550     {
 551       cpp_error (pfile, DL_PEDWARN,
 552                  "hex escape sequence out of range");
 553       n &= mask;
 554     }
 555
 556   emit_numeric_escape (pfile, n, tbuf, wide);
 557
 558   return from;
 559 }
 560
 561 /* Convert an octal escape, pointed to by FROM, to the execution
 562    character set and write it into the string buffer TBUF.  Returns an
 563    advanced pointer, and issues diagnostics as necessary.
 564    No character set translation occurs; this routine always produces the
 565    execution-set character with numeric value equal to the given octal
 566    number.  */
 567 static const uchar *
 568 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
 569              struct strbuf *tbuf, bool wide)
 570 {
 571   size_t count = 0;
 572   cppchar_t c, n = 0;
 573   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
 574                   : CPP_OPTION (pfile, char_precision));
 575   size_t mask = width_to_mask (width);
 576   bool overflow = false;
 577
 578   while (from < limit && count++ < 3)
 579     {
 580       c = *from;
 581       if (c < '0' || c > '7')
 582         break;
 583       from++;
 584       overflow |= n ^ (n << 3 >> 3);
 585       n = (n << 3) + c - '0';
 586     }
 587
 588   if (n != (n & mask))
 589     {
 590       cpp_error (pfile, DL_PEDWARN,
 591                  "octal escape sequence out of range");
 592       n &= mask;
 593     }
 594
 595   emit_numeric_escape (pfile, n, tbuf, wide);
 596
 597   return from;
 598 }
 599
 600 /* Convert an escape sequence (pointed to by FROM) to its value on
 601    the target, and to the execution character set.  Do not scan past
 602    LIMIT.  Write the converted value into TBUF.  Returns an advanced
 603    pointer.  Handles all relevant diagnostics.  */
 604 static const uchar *
 605 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
 606                 struct strbuf *tbuf, bool wide)
 607 {
 608   /* Values of \a \b \e \f \n \r \t \v respectively.  */
 609 #if HOST_CHARSET == HOST_CHARSET_ASCII
 610   static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
 611 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
 612   static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
 613 #else
 614 #error "unknown host character set"
 615 #endif
 616
 617   uchar c;
 618
 619   c = *from;
 620   switch (c)
 621     {
 622       /* UCNs, hex escapes, and octal escapes are processed separately.  */
 623     case 'u': case 'U':
 624       return convert_ucn (pfile, from, limit, tbuf, wide);
 625
 626     case 'x':
 627       return convert_hex (pfile, from, limit, tbuf, wide);
 628       break;
 629
 630     case '0':  case '1':  case '2':  case '3':
 631     case '4':  case '5':  case '6':  case '7':
 632       return convert_oct (pfile, from, limit, tbuf, wide);
 633
 634       /* Various letter escapes.  Get the appropriate host-charset
 635          value into C.  */
 636     case '\\': case '\'': case '"': case '?': break;
 637
 638     case '(': case '{': case '[': case '%':
 639       /* '\(', etc, can be used at the beginning of a line in a long
 640          string split onto multiple lines with \-newline, to prevent
 641          Emacs or other text editors from getting confused.  '\%' can
 642          be used to prevent SCCS from mangling printf format strings.  */
 643       if (CPP_PEDANTIC (pfile))
 644         goto unknown;
 645       break;
 646
 647     case 'b': c = charconsts[1];  break;
 648     case 'f': c = charconsts[3];  break;
 649     case 'n': c = charconsts[4];  break;
 650     case 'r': c = charconsts[5];  break;
 651     case 't': c = charconsts[6];  break;
 652     case 'v': c = charconsts[7];  break;
 653
 654     case 'a':
 655       if (CPP_WTRADITIONAL (pfile))
 656         cpp_error (pfile, DL_WARNING,
 657                    "the meaning of '\\a' is different in traditional C");
 658       c = charconsts[0];
 659       break;
 660
 661     case 'e': case 'E':
 662       if (CPP_PEDANTIC (pfile))
 663         cpp_error (pfile, DL_PEDWARN,
 664                    "non-ISO-standard escape sequence, '\\%c'", (int) c);
 665       c = charconsts[2];
 666       break;
 667
 668     default:
 669     unknown:
 670       if (ISGRAPH (c))
 671         cpp_error (pfile, DL_PEDWARN,
 672                    "unknown escape sequence '\\%c'", (int) c);
 673       else
 674         cpp_error (pfile, DL_PEDWARN,
 675                    "unknown escape sequence: '\\%03o'", (int) c);
 676     }
 677
 678   /* Now convert what we have to the execution character set.  */
 679   if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
 680                      &c, 1, tbuf))
 681     cpp_errno (pfile, DL_ERROR,
 682                "converting escape sequence to execution character set");
 683
 684   return from + 1;
 685 }
 686 \f
 687 /* FROM is an array of cpp_string structures of length COUNT.  These
 688    are to be converted from the source to the execution character set,
 689    escape sequences translated, and finally all are to be
 690    concatenated.  WIDE indicates whether or not to produce a wide
 691    string.  The result is written into TO.  Returns true for success,
 692    false for failure.  */
 693 bool
 694 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
 695                       cpp_string *to, bool wide)
 696 {
 697   struct strbuf tbuf;
 698   const uchar *p, *base, *limit;
 699   size_t i;
 700   iconv_t cd = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
 701
 702   tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
 703   tbuf.text = xmalloc (tbuf.asize);
 704   tbuf.len = 0;
 705
 706   for (i = 0; i < count; i++)
 707     {
 708       p = from[i].text;
 709       if (*p == 'L') p++;
 710       p++; /* skip leading quote */
 711       limit = from[i].text + from[i].len - 1; /* skip trailing quote */
 712
 713       for (;;)
 714         {
 715           base = p;
 716           while (p < limit && *p != '\\')
 717             p++;
 718           if (p > base)
 719             {
 720               /* We have a run of normal characters; these can be fed
 721                  directly to convert_cset.  */
 722               if (!convert_cset (cd, base, p - base, &tbuf))
 723                 goto fail;
 724             }
 725           if (p == limit)
 726             break;
 727
 728           p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
 729         }
 730     }
 731   /* NUL-terminate the 'to' buffer and translate it to a cpp_string
 732      structure.  */
 733   emit_numeric_escape (pfile, 0, &tbuf, wide);
 734   tbuf.text = xrealloc (tbuf.text, tbuf.len);
 735   to->text = tbuf.text;
 736   to->len = tbuf.len;
 737   return true;
 738
 739  fail:
 740   cpp_errno (pfile, DL_ERROR, "converting to execution character set");
 741   free (tbuf.text);
 742   return false;
 743 }
 744 \f
 745 /* Subroutine of cpp_interpret_charconst which performs the conversion
 746    to a number, for narrow strings.  STR is the string structure returned
 747    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
 748    cpp_interpret_charconst.  */
 749 static cppchar_t
 750 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
 751                          unsigned int *pchars_seen, int *unsignedp)
 752 {
 753   size_t width = CPP_OPTION (pfile, char_precision);
 754   size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
 755   size_t mask = width_to_mask (width);
 756   size_t i;
 757   cppchar_t result, c;
 758   bool unsigned_p;
 759
 760   /* The value of a multi-character character constant, or a
 761      single-character character constant whose representation in the
 762      execution character set is more than one byte long, is
 763      implementation defined.  This implementation defines it to be the
 764      number formed by interpreting the byte sequence in memory as a
 765      big-endian binary number.  If overflow occurs, the high bytes are
 766      lost, and a warning is issued.
 767
 768      We don't want to process the NUL terminator handed back by
 769      cpp_interpret_string.  */
 770   result = 0;
 771   for (i = 0; i < str.len - 1; i++)
 772     {
 773       c = str.text[i] & mask;
 774       if (width < BITS_PER_CPPCHAR_T)
 775         result = (result << width) | c;
 776       else
 777         result = c;
 778     }
 779
 780   if (i > max_chars)
 781     {
 782       i = max_chars;
 783       cpp_error (pfile, DL_WARNING, "character constant too long for its type");
 784     }
 785   else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
 786     cpp_error (pfile, DL_WARNING, "multi-character character constant");
 787
 788   /* Multichar constants are of type int and therefore signed.  */
 789   if (i > 1)
 790     unsigned_p = 0;
 791   else
 792     unsigned_p = CPP_OPTION (pfile, unsigned_char);
 793
 794   /* Truncate the constant to its natural width, and simultaneously
 795      sign- or zero-extend to the full width of cppchar_t.
 796      For single-character constants, the value is WIDTH bits wide.
 797      For multi-character constants, the value is INT_PRECISION bits wide.  */
 798   if (i > 1)
 799     width = CPP_OPTION (pfile, int_precision);
 800   if (width < BITS_PER_CPPCHAR_T)
 801     {
 802       mask = ((cppchar_t) 1 << width) - 1;
 803       if (unsigned_p || !(result & (1 << (width - 1))))
 804         result &= mask;
 805       else
 806         result |= ~mask;
 807     }
 808   *pchars_seen = i;
 809   *unsignedp = unsigned_p;
 810   return result;
 811 }
 812
 813 /* Subroutine of cpp_interpret_charconst which performs the conversion
 814    to a number, for wide strings.  STR is the string structure returned
 815    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
 816    cpp_interpret_charconst.  */
 817 static cppchar_t
 818 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
 819                        unsigned int *pchars_seen, int *unsignedp)
 820 {
 821   bool bigend = CPP_OPTION (pfile, bytes_big_endian);
 822   size_t width = CPP_OPTION (pfile, wchar_precision);
 823   size_t cwidth = CPP_OPTION (pfile, char_precision);
 824   size_t mask = width_to_mask (width);
 825   size_t cmask = width_to_mask (cwidth);
 826   size_t nbwc = width / cwidth;
 827   size_t off, i;
 828   cppchar_t result = 0, c;
 829
 830   /* This is finicky because the string is in the target's byte order,
 831      which may not be our byte order.  Only the last character, ignoring
 832      the NUL terminator, is relevant.  */
 833   off = str.len - (nbwc * 2);
 834   result = 0;
 835   for (i = 0; i < nbwc; i++)
 836     {
 837       c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
 838       result = (result << cwidth) | (c & cmask);
 839     }
 840
 841   /* Wide character constants have type wchar_t, and a single
 842      character exactly fills a wchar_t, so a multi-character wide
 843      character constant is guaranteed to overflow.  */
 844   if (off > 0)
 845     cpp_error (pfile, DL_WARNING, "character constant too long for its type");
 846
 847   /* Truncate the constant to its natural width, and simultaneously
 848      sign- or zero-extend to the full width of cppchar_t.  */
 849   if (width < BITS_PER_CPPCHAR_T)
 850     {
 851       if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
 852         result &= mask;
 853       else
 854         result |= ~mask;
 855     }
 856
 857   *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
 858   *pchars_seen = 1;
 859   return result;
 860 }
 861
 862 /* Interpret a (possibly wide) character constant in TOKEN.
 863    PCHARS_SEEN points to a variable that is filled in with the number
 864    of characters seen, and UNSIGNEDP to a variable that indicates
 865    whether the result has signed type.  */
 866 cppchar_t
 867 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
 868                          unsigned int *pchars_seen, int *unsignedp)
 869 {
 870   cpp_string str = { 0, 0 };
 871   bool wide = (token->type == CPP_WCHAR);
 872   cppchar_t result;
 873
 874   /* an empty constant will appear as L'' or '' */
 875   if (token->val.str.len == (size_t) (2 + wide))
 876     {
 877       cpp_error (pfile, DL_ERROR, "empty character constant");
 878       return 0;
 879     }
 880   else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
 881     return 0;
 882
 883   if (wide)
 884     result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
 885   else
 886     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
 887
 888   if (str.text != token->val.str.text)
 889     free ((void *)str.text);
 890
 891   return result;
 892 }