libc/misc/wchar/wchar.c

   1
   2 /*  Copyright (C) 2002, 2003, 2004     Manuel Novoa III
   3  *
   4  *  This library is free software; you can redistribute it and/or
   5  *  modify it under the terms of the GNU Library General Public
   6  *  License as published by the Free Software Foundation; either
   7  *  version 2 of the License, or (at your option) any later version.
   8  *
   9  *  This library is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  *  Library General Public License for more details.
  13  *
  14  *  You should have received a copy of the GNU Library General Public
  15  *  License along with this library; if not, write to the Free
  16  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17  */
  18
  19 /*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
  20  *
  21  *  Besides uClibc, I'm using this code in my libc for elks, which is
  22  *  a 16-bit environment with a fairly limited compiler.  It would make
  23  *  things much easier for me if this file isn't modified unnecessarily.
  24  *  In particular, please put any new or replacement functions somewhere
  25  *  else, and modify the makefile to use your version instead.
  26  *  Thanks.  Manuel
  27  *
  28  *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
  29
  30
  31 /* May 23, 2002     Initial Notes:
  32  *
  33  * I'm still tweaking this stuff, but it passes the tests I've thrown
  34  * at it, and Erik needs it for the gcc port.  The glibc extension
  35  * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  36  * in the glibc source.  I also need to fix the behavior of
  37  * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  38  *
  39  * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  40  * file on my platform (x86) show about 5-10% faster conversion speed than
  41  * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  42  * individual mbrtowc()/wcrtomb() calls.
  43  *
  44  * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  45  * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
  46  * needs to deal gracefully with whatever is sent to it.  In that mode,
  47  * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
  48  * an arg to force that behavior, so the interface will be changing.
  49  *
  50  * I need to fix the error checking for 16-bit wide chars.  This isn't
  51  * an issue for uClibc, but may be for ELKS.  I'm currently not sure
   52  * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  53  *
  54  * July 1, 2002
  55  *
  56  * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  57  * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  58  *    locales.
  59  * Enabled building of a C/POSIX-locale-only version, so full locale support
  60  *    no longer needs to be enabled.
  61  *
  62  * Nov 4, 2002
  63  *
  64  * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
  65  * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  66  *   order to support %ls in printf.  See comments below for details.
  67  * Change behaviour of wc<->mb functions when in the C locale.  Now they do
  68  *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
   69  *   and consistency with the standard's requirement that a printf format string be
   70  *   a valid multibyte string beginning and ending in its initial shift state.
  71  *
  72  * Nov 5, 2002
  73  *
  74  * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  75  *
  76  * Nov 7, 2002
  77  *
  78  * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  79  *   Added some size/speed optimizations and integrated it into my locale
  80  *   framework.  Minimally tested at the moment, but the stub C-locale
  81  *   version (which most people would probably be using) should be fine.
  82  *
  83  * Nov 21, 2002
  84  *
  85  * Revert the wc<->mb changes from earlier this month involving the C-locale.
  86  * Add a couple of ugly hacks to support *wprintf.
  87  * Add a mini iconv() and iconv implementation (requires locale support).
  88  *
  89  * Aug 1, 2003
  90  * Bug fix for mbrtowc.
  91  *
  92  * Aug 18, 2003
  93  * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
  94  *
  95  * Feb 11, 2004
  96  * Bug fix: Fix size check for remaining output space in iconv().
  97  *
  98  * Manuel
  99  */
 100
 101 #define _GNU_SOURCE
 102 #define _ISOC99_SOURCE
 103 #include <errno.h>
 104 #include <stddef.h>
 105 #include <limits.h>
 106 #include <stdint.h>
 107 #include <inttypes.h>
 108 #include <stdlib.h>
 109 #include <stdio.h>
 110 #include <assert.h>
 111 #include <locale.h>
 112 #include <wchar.h>
 113 #include <bits/uClibc_uwchar.h>
 114
 115 /**********************************************************************/
  116 #ifdef __UCLIBC_HAS_LOCALE__
      /* Locale support enabled: ENCODING is read from the current locale's
       * data, and the __LOCALE_DATA_* table parameters get short local names. */
  117 #ifdef __UCLIBC_MJN3_ONLY__
  118 #ifdef L_iswspace
  119 /* generates one warning */
  120 #warning TODO: Fix Cc2wc* and Cwc2c* defines!
  121 #endif
  122 #endif /* __UCLIBC_MJN3_ONLY__ */
  123
      /* Encoding identifier of the current locale; compared below against the
       * __ctype_encoding_* constants to select a conversion path. */
  124 #define ENCODING                ((__UCLIBC_CURLOCALE_DATA).encoding)
  125
      /* Short aliases for the locale table layout constants used by the
       * 8-bit conversion code in __mbsnrtowcs() and __wcsnrtombs(). */
  126 #define Cc2wc_IDX_SHIFT         __LOCALE_DATA_Cc2wc_IDX_SHIFT
  127 #define Cc2wc_ROW_LEN           __LOCALE_DATA_Cc2wc_ROW_LEN
  128 #define Cwc2c_DOMAIN_MAX        __LOCALE_DATA_Cwc2c_DOMAIN_MAX
  129 #define Cwc2c_TI_SHIFT          __LOCALE_DATA_Cwc2c_TI_SHIFT
  130 #define Cwc2c_TT_SHIFT          __LOCALE_DATA_Cwc2c_TT_SHIFT
  131 #define Cwc2c_TI_LEN            __LOCALE_DATA_Cwc2c_TI_LEN
  132
  133 #ifndef __CTYPE_HAS_UTF_8_LOCALES
  134 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
  135 #endif
  136
  137 #else  /* __UCLIBC_HAS_LOCALE__ */
  138
  139 #ifdef __UCLIBC_MJN3_ONLY__
  140 #ifdef L_btowc
  141 /* emit only once */
  142 #warning fix preprocessor logic testing locale settings
  143 #endif
  144 #endif
  145
      /* No locale support: the encoding is fixed to 7-bit.  Having either the
       * 8-bit or the UTF-8 locale macro defined in this configuration is a
       * build error. */
  146 #define ENCODING (__ctype_encoding_7_bit)
  147 #ifdef __CTYPE_HAS_8_BIT_LOCALES
  148 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
  149 #endif
  150 #ifdef __CTYPE_HAS_UTF_8_LOCALES
  151 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
  152 #endif
      /* Drop the build selectors for the two UTF-8 work functions so their
       * #ifdef L_... sections below compile to nothing. */
  153 #undef L__wchar_utf8sntowcs
  154 #undef L__wchar_wcsntoutf8s
  155
  156 #endif /* __UCLIBC_HAS_LOCALE__ */
 157 /**********************************************************************/
 158
  159 #if WCHAR_MAX > 0xffffUL
      /* wchar_t can hold values above 0xffff: select the 6-byte UTF-8 variant. */
  160 #define UTF_8_MAX_LEN 6
  161 #else
      /* wchar_t limited to 0xffff: select the 3-byte variant.  Note that
       * _wchar_utf8sntowcs() contains #error guards for this setting when
       * KUHN is defined. */
  162 #define UTF_8_MAX_LEN 3
  163 #endif
  164
      /* KUHN enables the extra validity checks in the UTF-8 converters below:
       * rejecting 0xfffe/0xffff and the surrogate range 0xd800-0xdfff. */
  165 #define KUHN 1
 166
  167 extern size_t __mbrtowc (wchar_t *__restrict __pwc,
  168                        __const char *__restrict __s, size_t __n,
  169                        mbstate_t *__p) attribute_hidden;
  170
  171 extern size_t __wcrtomb (char *__restrict __s, wchar_t __wc,
  172                        mbstate_t *__restrict __ps) attribute_hidden;
  173
  174 /* Implementation-specific work functions. */
  175
      /* UTF-8 -> wide chars.  Converts at most wn wide chars from at most n
       * bytes at *src; allow_continuation selects whether an incomplete
       * trailing sequence is saved in *ps (returning (size_t) -2). */
  176 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  177                                         const char **__restrict src, size_t n,
  178                                         mbstate_t *ps, int allow_continuation) attribute_hidden;
  179
      /* Wide chars -> UTF-8.  Converts at most wn wide chars from *src into at
       * most n bytes at s. */
  180 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  181                                         const wchar_t **__restrict src, size_t wn) attribute_hidden;
  182
  183 /* glibc extensions. */
  184
      /* Like mbsrtowcs(), but reads at most NMC bytes from *src. */
  185 extern size_t __mbsnrtowcs(wchar_t *__restrict dst,
  186                                    const char **__restrict src,
  187                                    size_t NMC, size_t len, mbstate_t *__restrict ps) attribute_hidden;
  188
      /* Like wcsrtombs(), but reads at most NWC wide chars from *src. */
  189 extern size_t __wcsnrtombs(char *__restrict dst,
  190                                    const wchar_t **__restrict src,
  191                                    size_t NWC, size_t len, mbstate_t *__restrict ps) attribute_hidden;
 192
 193 /**********************************************************************/
 194 #ifdef L_btowc
 195
 196 wint_t attribute_hidden __btowc(int c)
 197 {
 198 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 199
 200         wchar_t wc;
 201         unsigned char buf[1];
 202         mbstate_t mbstate;
 203
 204         if (c != EOF) {
 205                 *buf = (unsigned char) c;
 206                 mbstate.__mask = 0;             /* Initialize the mbstate. */
 207                 if (__mbrtowc(&wc, buf, 1, &mbstate) <= 1) {
 208                         return wc;
 209                 }
 210         }
 211         return WEOF;
 212
 213 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 214
 215 #ifdef __UCLIBC_HAS_LOCALE__
 216         assert((ENCODING == __ctype_encoding_7_bit)
 217                    || (ENCODING == __ctype_encoding_utf8));
 218 #endif /* __UCLIBC_HAS_LOCALE__ */
 219
 220         /* If we don't have 8-bit locale support, then this is trivial since
 221          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 222         return (((unsigned int)c) < 0x80) ? c : WEOF;
 223
 224 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 225 }
 226 strong_alias(__btowc,btowc)
 227
 228 #endif
 229 /**********************************************************************/
 230 #ifdef L_wctob
 231
 232 /* Note: We completely ignore ps in all currently supported conversions. */
 233
 234 int wctob(wint_t c)
 235 {
 236 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 237
 238         unsigned char buf[MB_LEN_MAX];
 239
 240         return (__wcrtomb(buf, c, NULL) == 1) ? *buf : EOF;
 241
 242 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 243
 244 #ifdef __UCLIBC_HAS_LOCALE__
 245         assert((ENCODING == __ctype_encoding_7_bit)
 246                    || (ENCODING == __ctype_encoding_utf8));
 247 #endif /* __UCLIBC_HAS_LOCALE__ */
 248
 249         /* If we don't have 8-bit locale support, then this is trivial since
 250          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 251
 252         /* TODO: need unsigned version of wint_t... */
 253 /*      return (((unsigned int)c) < 0x80) ? c : WEOF; */
 254         return ((c >= 0) && (c < 0x80)) ? c : EOF;
 255
 256 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 257 }
 258
 259 #endif
 260 /**********************************************************************/
 261 #ifdef L_mbsinit
 262
 263 int mbsinit(const mbstate_t *ps)
 264 {
 265         return !ps || !ps->__mask;
 266 }
 267
 268 #endif
 269 /**********************************************************************/
 270 #ifdef L_mbrlen
 271
 272 size_t attribute_hidden __mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
 273 {
 274         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 275
 276         return __mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
 277 }
 278 strong_alias(__mbrlen,mbrlen)
 279
 280 #endif
 281 /**********************************************************************/
 282 #ifdef L_mbrtowc
 283
 284 size_t attribute_hidden __mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
 285                            size_t n, mbstate_t *__restrict ps)
 286 {
 287         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 288         wchar_t wcbuf[1];
 289         const char *p;
 290         size_t r;
 291         char empty_string[1];           /* Avoid static to be fPIC friendly. */
 292
 293         if (!ps) {
 294                 ps = &mbstate;
 295         }
 296
 297         if (!s) {
 298                 pwc = (wchar_t *) s;    /* NULL */
 299                 empty_string[0] = 0;    /* Init the empty string when necessary. */
 300                 s = empty_string;
 301                 n = 1;
 302         } else if (!n) {
 303                 /* TODO: change error code? */
 304                 return (ps->__mask && (ps->__wc == 0xffffU))
 305                         ? ((size_t) -1) : ((size_t) -2);
 306         }
 307
 308         p = s;
 309
 310 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 311         /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
 312         if (ENCODING == __ctype_encoding_utf8) {
 313                 if (!pwc) {
 314                         pwc = wcbuf;
 315                 }
 316                 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
 317                 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
 318         }
 319 #endif
 320
 321 #ifdef __UCLIBC_MJN3_ONLY__
 322 #warning TODO: This adds a trailing nul!
 323 #endif /* __UCLIBC_MJN3_ONLY__ */
 324
 325         r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
 326
 327         if (((ssize_t) r) >= 0) {
 328                 if (pwc) {
 329                         *pwc = *wcbuf;
 330                 }
 331         }
 332         return (size_t) r;
 333 }
 334 strong_alias(__mbrtowc,mbrtowc)
 335
 336 #endif
 337 /**********************************************************************/
 338 #ifdef L_wcrtomb
 339
 340 /* Note: We completely ignore ps in all currently supported conversions. */
 341 /* TODO: Check for valid state anyway? */
 342
 343 size_t attribute_hidden __wcrtomb(register char *__restrict s, wchar_t wc,
 344                            mbstate_t *__restrict ps)
 345 {
 346 #ifdef __UCLIBC_MJN3_ONLY__
 347 #warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
 348 #endif /* __UCLIBC_MJN3_ONLY__ */
 349         wchar_t wcbuf[1];
 350         const wchar_t *pwc;
 351         size_t r;
 352         char buf[MB_LEN_MAX];
 353
 354         if (!s) {
 355                 s = buf;
 356                 wc = 0;
 357         }
 358
 359         pwc = wcbuf;
 360         wcbuf[0] = wc;
 361
 362         r = __wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps);
 363         return (r != 0) ? r : 1;
 364 }
 365 strong_alias(__wcrtomb,wcrtomb)
 366
 367 #endif
 368 /**********************************************************************/
 369 #ifdef L_mbsrtowcs
 370
 371 size_t attribute_hidden __mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 372                                  size_t len, mbstate_t *__restrict ps)
 373 {
 374         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 375
 376         return __mbsnrtowcs(dst, src, SIZE_MAX, len,
 377                                                 ((ps != NULL) ? ps : &mbstate));
 378 }
 379 strong_alias(__mbsrtowcs,mbsrtowcs)
 380
 381 #endif
 382 /**********************************************************************/
 383 #ifdef L_wcsrtombs
 384
 385 /* Note: We completely ignore ps in all currently supported conversions.
 386
 387  * TODO: Check for valid state anyway? */
 388
 389 size_t attribute_hidden __wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
 390                                  size_t len, mbstate_t *__restrict ps)
 391 {
 392         return __wcsnrtombs(dst, src, SIZE_MAX, len, ps);
 393 }
 394 strong_alias(__wcsrtombs,wcsrtombs)
 395
 396 #endif
 397 /**********************************************************************/
  398 #ifdef L__wchar_utf8sntowcs
  399
  400 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
  401  * UTF-8-test.txt stress test.
  402  */
  403 /*  #define DECODER */
  404
  405 #ifdef DECODER
  406 #ifndef KUHN
  407 #define KUHN
  408 #endif
  409 #endif
  410
      /* Convert UTF-8 bytes to wide characters.
       *
       * pwc   Destination.  NULL means count only, with wn treated as SIZE_MAX.
       *       Passing (wchar_t *) ps also counts without storing, but keeps
       *       the caller's wn (see the hack note in the body).
       * wn    Maximum number of wide chars to produce; 0 returns 0 at once.
       * src   In/out input position.  Asserted non-NULL on entry.
       * n     Maximum number of input bytes to read.
       * ps    Conversion state; asserted non-NULL.  A nonzero __mask marks a
       *       pending partial character; __wc == 0xffffU marks the error state.
       * allow_continuation
       *       Nonzero: an incomplete trailing sequence is saved in *ps, *src
       *       is advanced, and (size_t) -2 is returned.
       *       Zero: see the "potentially valid but incomplete" branch below.
       *
       * Returns the number of wide chars converted (a terminating nul is not
       * counted).  Without DECODER, an invalid sequence records the error
       * state in *ps, sets errno to EILSEQ and returns (size_t) -1; with
       * DECODER it is replaced by 0xfffd instead.
       */
  411 size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  412                                                   const char **__restrict src, size_t n,
  413                                                   mbstate_t *ps, int allow_continuation)
  414 {
  415         register const char *s;
  416         __uwchar_t mask;
  417         __uwchar_t wc;
  418         wchar_t wcbuf[1];
  419         size_t count;
  420         int incr;
  421
  422         s = *src;
  423
  424         assert(s != NULL);
  425         assert(ps != NULL);
  426
  427         incr = 1;
  428         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
  429          * wprintf, we need to be able to compute the number of wchars needed
  430          * for the mbs conversion, not to exceed the precision specified.
  431          * But if dst is NULL, the return value is the length assuming a
  432          * sufficiently sized buffer.  So, we allow passing of (wchar_t *) ps
  433          * as pwc in order to flag that we really want the length, subject
  434          * to the restricted buffer size and no partial conversions.
  435          * See mbsnrtowcs() as well. */
  436         if (!pwc || (pwc == ((wchar_t *)ps))) {
  437                 if (!pwc) {
  438                         wn = SIZE_MAX;
  439                 }
              /* Counting mode: every result is written to the same local
               * slot because incr is 0, so pwc never moves off wcbuf. */
  440                 pwc = wcbuf;
  441                 incr = 0;
  442         }
  443
  444         /* This is really here only to support the glibc extension function
  445          * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
  446          * check on the validity of the mbstate. */
  447         if (!(count = wn)) {
  448                 return 0;
  449         }
  450
  451         if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
  452 #ifdef DECODER
  453                 wc = (__uwchar_t) ps->__wc;
  454                 if (n) {
  455                         goto CONTINUE;
  456                 }
  457                 goto DONE;
  458 #else
              /* Resume a saved partial character unless the saved value is
               * the 0xffffU error marker stored at BAD below. */
  459                 if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
  460                         /* TODO: change error code here and below? */
  461                         if (n) {
  462                                 goto CONTINUE;
  463                         }
  464                         goto DONE;
  465                 }
  466                 __set_errno(EILSEQ);
  467                 return (size_t) -1;             /* We're in an error state. */
  468 #endif
  469         }
  470
  471         do {
  472                 if (!n) {
  473                         goto DONE;
  474                 }
  475                 --n;
  476                 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
  477                         mask = 0x40;
  478 #ifdef __UCLIBC_MJN3_ONLY__
  479 #warning TODO: Fix range for 16 bit wchar_t case.
  480 #endif
                      /* Accept lead bytes 0xc0..0xfd; anything else at or
                       * above 0x80 is not a valid start byte. */
  481                         if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
  482                                 goto START;
  483                         }
  484                 BAD:
  485 #ifdef DECODER
  486                         wc = 0xfffdU;
  487                         goto COMPLETE;
  488 #else
                      /* Record the error state so later calls keep failing. */
  489                         ps->__mask = mask;
  490                         ps->__wc = 0xffffU;
  491                         __set_errno(EILSEQ);
  492                         return (size_t) -1;     /* Illegal start byte! */
  493 #endif
  494
  495                 CONTINUE:
  496                         while (n) {
  497                                 --n;
                              /* Every trailing byte must look like 10xxxxxx. */
  498                                 if ((*s & 0xc0) != 0x80) {
  499                                         goto BAD;
  500                                 }
  501                                 mask <<= 5;
  502                                 wc <<= 6;
  503                                 wc += (*s & 0x3f);      /* keep separate for bcc (smaller code) */
  504                                 ++s;
  505                         START:
                              /* Clear the lead-byte length bit just above mask;
                               * if the bit at mask is clear too, no further
                               * trailing bytes are expected. */
  506                                 wc &= ~(mask << 1);
  507
  508                                 if ((wc & mask) == 0) { /* Character completed. */
                                      /* mask >> 5 is now the smallest value that
                                       * needs a sequence of this length; the
                                       * 2-byte case is bumped from 0x40 to 0x80. */
  509                                         if ((mask >>= 5) == 0x40) {
  510                                                 mask += mask;
  511                                         }
  512                                         /* Check for invalid sequences (longer than necessary)
  513                                          * and invalid chars.  */
  514                                         if ( (wc < mask) /* Sequence not minimal length. */
  515 #ifdef KUHN
  516 #if UTF_8_MAX_LEN == 3
  517 #error broken since mask can overflow!!
  518                                                  /* For plane 0, these are the only defined values.*/
  519                                                  || (wc > 0xfffdU)
  520 #else
  521                                                  /* Note that we don't need to worry about exceeding */
  522                                                  /* 31 bits as that is the most that UTF-8 provides. */
  523                                                  || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  524 #endif
  525                                                  || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  526 #endif /* KUHN */
  527                                                  ) {
  528                                                 goto BAD;
  529                                         }
  530                                         goto COMPLETE;
  531                                 }
  532                         }
  533                         /* Character potentially valid but incomplete. */
  534                         if (!allow_continuation) {
                              /* If at least one char was already converted in
                               * this call, 0 is returned without touching
                               * *src or *ps. */
  535                                 if (count != wn) {
  536                                         return 0;
  537                                 }
  538                                 /* NOTE: The following can fail if you allow and then disallow
  539                                  * continuation!!! */
  540 #if UTF_8_MAX_LEN == 3
  541 #error broken since mask can overflow!!
  542 #endif
  543                                 /* Need to back up... */
                              /* Step s back one byte per byte consumed of
                               * the partial character. */
  544                                 do {
  545                                         --s;
  546                                 } while ((mask >>= 5) >= 0x40);
  547                                 goto DONE;
  548                         }
                      /* Save the partial character so a later call can resume
                       * at CONTINUE. */
  549                         ps->__mask = (wchar_t) mask;
  550                         ps->__wc = (wchar_t) wc;
  551                         *src = s;
  552                         return (size_t) -2;
  553                 }
  554         COMPLETE:
  555                 *pwc = wc;
  556                 pwc += incr;
  557         }
  558 #ifdef DECODER
  559         while (--count);
  560 #else
          /* Stop after a nul; because of the short circuit the nul itself
           * does not decrement count and so is not included in the result. */
  561         while (wc && --count);
  562
  563         if (!wc) {
  564                 s = NULL;
  565         }
  566 #endif
  567
  568  DONE:
  569         /* ps->__wc is irrelevant here. */
  570         ps->__mask = 0;
          /* In counting mode pwc still equals wcbuf, so *src is left alone. */
  571         if (pwc != wcbuf) {
  572                 *src = s;
  573         }
  574
  575         return wn - count;
  576 }
  577
  578 #endif
 579 /**********************************************************************/
  580 #ifdef L__wchar_wcsntoutf8s
  581
      /* Convert wide characters to UTF-8.
       *
       * s     Destination.  NULL means count only, with n treated as SIZE_MAX.
       *       Passing (char *) src also counts without storing, but keeps the
       *       caller's n (see the hack note in the body).
       * n     Maximum number of bytes to produce.
       * src   In/out input position.  Written back only when output is really
       *       stored; it becomes NULL once a nul wide char has been reached.
       * wn    Maximum number of wide chars to read.
       *
       * Returns the number of bytes produced (the nul terminator is written
       * but not counted), or (size_t) -1 with errno set to EILSEQ when a wide
       * char fails the validity checks.
       */
  582 size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  583                                                   const wchar_t **__restrict src, size_t wn)
  584 {
  585         register char *p;
  586         size_t len, t;
  587         __uwchar_t wc;
  588         const __uwchar_t *swc;
  589         int store;
  590         char buf[MB_LEN_MAX];
  591         char m;
  592
  593         store = 1;
  594         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
  595          * printf, we need to be able to compute the number of bytes needed
  596          * for the mbs conversion, not to exceed the precision specified.
  597          * But if dst is NULL, the return value is the length assuming a
  598          * sufficiently sized buffer.  So, we allow passing of (char *) src
  599          * as dst in order to flag that we really want the length, subject
  600          * to the restricted buffer size and no partial conversions.
  601          * See wcsnrtombs() as well. */
  602         if (!s || (s == ((char *) src))) {
  603                 if (!s) {
  604                         n = SIZE_MAX;
  605                 }
              /* Counting mode: each character is encoded into the local
               * scratch buffer, and s is never advanced. */
  606             s = buf;
  607                 store = 0;
  608         }
  609
          /* t is the output space still available. */
  610         t = n;
  611         swc = (const __uwchar_t *) *src;
  612
  613         assert(swc != NULL);
  614
  615         while (wn && t) {
  616                 wc = *swc;
  617
              /* Assume a one-byte character; corrected below for wc >= 0x80. */
  618                 *s = wc;
  619                 len = 1;
  620
  621                 if (wc >= 0x80) {
  622 #ifdef KUHN
  623                         if (
  624 #if UTF_8_MAX_LEN == 3
  625                                 /* For plane 0, these are the only defined values.*/
  626                                 /* Note that we don't need to worry about exceeding */
  627                                 /* 31 bits as that is the most that UTF-8 provides. */
  628                                 (wc > 0xfffdU)
  629 #else
  630                                 /* UTF_8_MAX_LEN == 6 */
  631                                 (wc > 0x7fffffffUL)
  632                                 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  633 #endif
  634                                 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  635                                 ) {
  636                                 __set_errno(EILSEQ);
  637                                 return (size_t) -1;
  638                         }
  639 #else  /* KUHN */
  640 #if UTF_8_MAX_LEN != 3
  641                         if (wc > 0x7fffffffUL) { /* Value too large. */
  642                                 __set_errno(EILSEQ);
  643                                 return (size_t) -1;
  644                         }
  645 #endif
  646 #endif /* KUHN */
  647
                      /* Work out the encoded length: after the initial shift
                       * by 1, p advances once per 5 bits until wc is used up. */
  648                         wc >>= 1;
  649                         p = s;
  650                         do {
  651                                 ++p;
  652                         } while (wc >>= 5);
  653                         wc = *swc;
  654                         if ((len = p - s) > t) { /* Not enough space. */
  655                                 break;
  656                         }
  657
                      /* Fill the bytes from last to first with 6 payload bits
                       * each, while m collects the lead-byte prefix bits that
                       * are OR-ed into the first byte afterwards. */
  658                         m = 0x80;
  659                         while( p>s ) {
  660                                 m = (m >> 1) | 0x80;
  661                                 *--p = (wc & 0x3f) | 0x80;
  662                                 wc >>= 6;
  663                         }
  664                         *s |= (m << 1);
  665                 } else if (wc == 0) {   /* End of string. */
                      /* The nul byte was already stored above; t is not
                       * reduced, so it is excluded from the returned count. */
  666                         swc = NULL;
  667                         break;
  668                 }
  669
  670                 ++swc;
  671                 --wn;
  672                 t -= len;
  673                 if (store) {
  674                         s += len;
  675                 }
  676         }
  677
          /* The input position is reported only when output was stored. */
  678         if (store) {
  679                 *src = (const wchar_t *) swc;
  680         }
  681
  682         return n - t;
  683 }
  684
  685
  686 #endif
 687 /**********************************************************************/
  688 #ifdef L___mbsnrtowcs
  689
  690 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
  691
      /* Convert at most NMC bytes of a multibyte string to wide characters.
       *
       * dst   Destination.  NULL means count only, with len treated as
       *       SIZE_MAX.  Passing (wchar_t *) ps also counts without storing,
       *       but keeps the caller's len (see the hack note in the body).
       * src   In/out input position; on the single-byte paths it is written
       *       back only when output is stored, and becomes NULL once a nul
       *       byte has been converted.
       * NMC   Maximum number of input bytes to read.
       * len   Maximum number of wide chars to produce.
       * ps    Conversion state; NULL selects a function-private static state.
       *
       * In a UTF-8 locale the work is delegated to _wchar_utf8sntowcs(), and
       * its (size_t) -2 result is reported as 0.  Otherwise returns the number
       * of wide chars converted (terminating nul not counted), or (size_t) -1
       * with errno set to EILSEQ for a byte with no valid mapping.
       */
  692 size_t attribute_hidden __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
  693                                         size_t NMC, size_t len, mbstate_t *__restrict ps)
  694 {
  695         static mbstate_t mbstate;       /* Rely on bss 0-init. */
  696         wchar_t wcbuf[1];
  697         const char *s;
  698         size_t count;
  699         int incr;
  700
  701         if (!ps) {
  702                 ps = &mbstate;
  703         }
  704
  705 #ifdef __CTYPE_HAS_UTF_8_LOCALES
  706         if (ENCODING == __ctype_encoding_utf8) {
  707                 size_t r;
  708                 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
  709                                 != (size_t) -2) ? r : 0;
  710         }
  711 #endif
  712         incr = 1;
  713         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
  714          * wprintf, we need to be able to compute the number of wchars needed
  715          * for the mbs conversion, not to exceed the precision specified.
  716          * But if dst is NULL, the return value is the length assuming a
  717          * sufficiently sized buffer.  So, we allow passing of ((wchar_t *)ps)
  718          * as dst in order to flag that we really want the length, subject
  719          * to the restricted buffer size and no partial conversions.
  720          * See _wchar_utf8sntowcs() as well. */
  721         if (!dst || (dst == ((wchar_t *)ps))) {
  722                 if (!dst) {
  723                         len = SIZE_MAX;
  724                 }
              /* Counting mode: results go to a local slot and dst never
               * advances because incr is 0. */
  725                 dst = wcbuf;
  726                 incr = 0;
  727         }
  728
  729         /* Since all the following encodings are single-byte encodings... */
  730         if (len > NMC) {
  731                 len = NMC;
  732         }
  733
  734         count = len;
  735         s = *src;
  736
  737 #ifdef __CTYPE_HAS_8_BIT_LOCALES
  738         if (ENCODING == __ctype_encoding_8_bit) {
  739                 wchar_t wc;
  740                 while (count) {
  741                         if ((wc = ((unsigned char)(*s))) >= 0x80) {     /* Non-ASCII... */
                              /* Two-level lookup in the locale tables: the
                               * index table picks a row, the low bits pick
                               * the entry.  A 0 entry is treated as invalid. */
  742                                 wc -= 0x80;
  743                                 wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
  744                                                   (__UCLIBC_CURLOCALE_DATA.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
  745                                                    << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
  746                                 if (!wc) {
  747                                         goto BAD;
  748                                 }
  749                         }
                      /* Store the char; a nul ends the conversion and is not
                       * counted. */
  750                         if (!(*dst = wc)) {
  751                                 s = NULL;
  752                                 break;
  753                         }
  754                         dst += incr;
  755                         ++s;
  756                         --count;
  757                 }
  758                 if (dst != wcbuf) {
  759                         *src = s;
  760                 }
  761                 return len - count;
  762         }
  763 #endif
  764
  765 #ifdef __UCLIBC_HAS_LOCALE__
  766         assert(ENCODING == __ctype_encoding_7_bit);
  767 #endif
  768
          /* 7-bit path: bytes map to themselves; anything >= 0x80 is an error.
           * Note the byte is written to *dst before it is validated. */
  769         while (count) {
  770                 if ((*dst = (unsigned char) *s) == 0) {
  771                         s = NULL;
  772                         break;
  773                 }
  774                 if (*dst >= 0x80) {
  775 #ifdef __CTYPE_HAS_8_BIT_LOCALES
                      /* Also the target of the goto in the 8-bit loop above. */
  776                 BAD:
  777 #endif
  778                         __set_errno(EILSEQ);
  779                         return (size_t) -1;
  780                 }
  781                 ++s;
  782                 dst += incr;
  783                 --count;
  784         }
  785         if (dst != wcbuf) {
  786                 *src = s;
  787         }
  788         return len - count;
  789 }
  790
      /* Public name: a weak alias of the hidden implementation above. */
  791 size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
  792                                   size_t NMC, size_t len, mbstate_t *__restrict ps)
  793          __attribute__ ((__weak__, __alias__("__mbsnrtowcs")));
  794
  795 #endif
 796 /**********************************************************************/
 797 #ifdef L___wcsnrtombs
 798
 799 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 800
 801 /* Note: We completely ignore ps in all currently supported conversions.
 802  * TODO: Check for valid state anyway? */
 803
 804 size_t attribute_hidden __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
 805                                         size_t NWC, size_t len, mbstate_t *__restrict ps)
 806 {
 807         const __uwchar_t *s;
 808         size_t count;
 809         int incr;
 810         char buf[MB_LEN_MAX];
 811
 812 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 813         if (ENCODING == __ctype_encoding_utf8) {
 814                 return _wchar_wcsntoutf8s(dst, len, src, NWC);
 815         }
 816 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
 817
 818         incr = 1;
 819         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 820          * printf, we need to be able to compute the number of bytes needed
 821          * for the mbs conversion, not to exceed the precision specified.
 822          * But if dst is NULL, the return value is the length assuming a
 823          * sufficiently sized buffer.  So, we allow passing of (char *) src
 824          * as dst in order to flag that we really want the length, subject
 825          * to the restricted buffer size and no partial conversions.
 826          * See _wchar_wcsntoutf8s() as well. */
 827         if (!dst || (dst == ((char *) src))) {
 828                 if (!dst) {
 829                         len = SIZE_MAX;
 830                 }
 831                 dst = buf;
 832                 incr = 0;
 833         }
 834
 835         /* Since all the following encodings are single-byte encodings... */
 836         if (len > NWC) {
 837                 len = NWC;
 838         }
 839
 840         count = len;
 841         s = (const __uwchar_t *) *src;
 842
 843 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 844         if (ENCODING == __ctype_encoding_8_bit) {
 845                 __uwchar_t wc;
 846                 __uwchar_t u;
 847                 while (count) {
 848                         if ((wc = *s) <= 0x7f) {
 849                                 if (!(*dst = (unsigned char) wc)) {
 850                                         s = NULL;
 851                                         break;
 852                                 }
 853                         } else {
 854                                 u = 0;
 855                                 if (wc <= Cwc2c_DOMAIN_MAX) {
 856                                         u = __UCLIBC_CURLOCALE_DATA.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
 857                                                                                                                 + Cwc2c_TT_SHIFT)];
 858                                         u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
 859                                                                         + ((wc >> Cwc2c_TT_SHIFT)
 860                                                                            & ((1 << Cwc2c_TI_SHIFT)-1))];
 861                                         u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
 862                                                                         + (u << Cwc2c_TT_SHIFT)
 863                                                                         + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
 864                                 }
 865
 866 #define __WCHAR_REPLACEMENT_CHAR '?'
 867 #ifdef __WCHAR_REPLACEMENT_CHAR
 868                                 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
 869 #else  /* __WCHAR_REPLACEMENT_CHAR */
 870                                 if (!u) {
 871                                         goto BAD;
 872                                 }
 873                                 *dst = (unsigned char) u;
 874 #endif /* __WCHAR_REPLACEMENT_CHAR */
 875                         }
 876                         ++s;
 877                         dst += incr;
 878                         --count;
 879                 }
 880                 if (dst != buf) {
 881                         *src = (const wchar_t *) s;
 882                 }
 883                 return len - count;
 884         }
 885 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
 886
 887 #ifdef __UCLIBC_HAS_LOCALE__
 888         assert(ENCODING == __ctype_encoding_7_bit);
 889 #endif
 890
 891         while (count) {
 892                 if (*s >= 0x80) {
 893 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
 894                 BAD:
 895 #endif
 896                         __set_errno(EILSEQ);
 897                         return (size_t) -1;
 898                 }
 899                 if ((*dst = (unsigned char) *s) == 0) {
 900                         s = NULL;
 901                         break;
 902                 }
 903                 ++s;
 904                 dst += incr;
 905                 --count;
 906         }
 907         if (dst != buf) {
 908                 *src = (const wchar_t *) s;
 909         }
 910         return len - count;
 911 }
 912
 913 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
 914                                   size_t NWC, size_t len, mbstate_t *__restrict ps)
 915          __attribute__ ((__weak__, __alias__("__wcsnrtombs"))); /* Public name: a weak alias that resolves to __wcsnrtombs. */
 916
 917 #endif
 918 /**********************************************************************/
 919 #ifdef L_wcswidth
 920
 921 #ifdef __UCLIBC_MJN3_ONLY__
 922 #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
 923 #warning TODO: Update wcwidth to match latest by Kuhn.
 924 #endif
 925
 926 #if defined(__UCLIBC_HAS_LOCALE__) && \
 927 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
 928
 929 static const unsigned char new_idx[] = {
     /* Page index used by __wcswidth() for wide characters 0x100..0xffff:
      * for page h = wc >> 8, new_idx[h] and new_idx[h+1] bound the half-open
      * range of new_tbl[] / new_wtbl[] slots that is searched for that page.
      * 257 entries: one per page plus the closing bound for page 0xff. */
 930         0,    5,    5,    6,   10,   15,   28,   39,
 931         48,   48,   71,   94,  113,  128,  139,  154,
 932         175,  186,  188,  188,  188,  188,  188,  188,
 933         203,  208,  208,  208,  208,  208,  208,  208,
 934         208,  219,  219,  219,  222,  222,  222,  222,
 935         222,  222,  222,  222,  222,  222,  222,  224,
 936         224,  231,  231,  231,  231,  231,  231,  231,
 937         231,  231,  231,  231,  231,  231,  231,  231,
 938         231,  231,  231,  231,  231,  231,  231,  231,
 939         231,  231,  231,  231,  231,  231,  231,  231,
 940         231,  231,  231,  231,  231,  231,  231,  231,
 941         231,  231,  231,  231,  231,  231,  231,  231,
 942         231,  231,  231,  231,  231,  231,  231,  231,
 943         231,  231,  231,  231,  231,  231,  231,  231,
 944         231,  231,  231,  231,  231,  231,  231,  231,
 945         231,  231,  231,  231,  231,  231,  231,  231,
 946         231,  231,  231,  231,  231,  231,  231,  231,
 947         231,  231,  231,  231,  231,  231,  231,  231,
 948         231,  231,  231,  231,  231,  231,  231,  231,
 949         231,  231,  231,  231,  231,  231,  231,  231,
 950         231,  231,  231,  231,  231,  233,  233,  233,
 951         233,  233,  233,  233,  234,  234,  234,  234,
 952         234,  234,  234,  234,  234,  234,  234,  234,
 953         234,  234,  234,  234,  234,  234,  234,  234,
 954         234,  234,  234,  234,  234,  234,  234,  234,
 955         234,  234,  234,  234,  234,  234,  234,  234,
 956         234,  234,  234,  234,  234,  234,  234,  234,
 957         236,  236,  236,  236,  236,  236,  236,  236,
 958         236,  236,  236,  236,  236,  236,  236,  236,
 959         236,  236,  236,  236,  236,  236,  236,  236,
 960         236,  236,  236,  236,  236,  236,  236,  236,
 961         236,  237,  237,  238,  241,  241,  242,  249,
 962         255,
 963 };
 964
 965 static const unsigned char new_tbl[] = {
     /* Low-byte breakpoints, grouped per 256-character page as delimited by
      * new_idx[].  __wcswidth() binary-searches a page's slice with
      * b = wc & 0xff, moving the lower bound up whenever b >= new_tbl[m];
      * the slot it ends on selects the width in new_wtbl[].  255 entries. */
 966         0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
 967         0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
 968         0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
 969         0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
 970         0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
 971         0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
 972         0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
 973         0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
 974         0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
 975         0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
 976         0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
 977         0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
 978         0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
 979         0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
 980         0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
 981         0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
 982         0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
 983         0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
 984         0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
 985         0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
 986         0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
 987         0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
 988         0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
 989         0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
 990         0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
 991         0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
 992         0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
 993         0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
 994         0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
 995         0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
 996         0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
 997         0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
 998 };
 999
1000 static const signed char new_wtbl[] = {
     /* Column widths, parallel to new_tbl[]: __wcswidth() adds new_wtbl[l]
      * to its running count, where l is the new_tbl[] slot its binary search
      * ended on.  255 entries.  The code that reads this table notes that
      * none of the values it reaches should be -1. */
1001         0,   -1,    1,   -1,    1,    1,    0,    1,
1002         0,    1,    1,    0,    1,    0,    1,    1,
1003         0,    1,    0,    1,    0,    1,    0,    1,
1004         0,    1,    0,    1,    1,    0,    1,    0,
1005         1,    0,    1,    0,    1,    0,    1,    1,
1006         0,    1,    0,    1,    0,    1,    0,    1,
1007         1,    0,    1,    0,    1,    0,    1,    0,
1008         1,    0,    1,    0,    1,    0,    1,    0,
1009         1,    0,    1,    0,    1,    0,    1,    1,
1010         0,    1,    0,    1,    0,    1,    0,    1,
1011         0,    1,    0,    1,    0,    1,    0,    1,
1012         0,    1,    0,    1,    0,    1,    1,    0,
1013         1,    0,    1,    0,    1,    0,    1,    0,
1014         1,    0,    1,    0,    1,    0,    1,    0,
1015         1,    1,    0,    1,    0,    1,    0,    1,
1016         0,    1,    0,    1,    0,    1,    0,    1,
1017         1,    0,    1,    0,    1,    0,    1,    0,
1018         1,    0,    1,    1,    0,    1,    0,    1,
1019         0,    1,    0,    1,    0,    1,    0,    1,
1020         0,    1,    1,    0,    1,    0,    1,    0,
1021         1,    0,    1,    0,    1,    0,    1,    0,
1022         1,    0,    1,    0,    1,    0,    1,    1,
1023         0,    1,    0,    1,    0,    1,    0,    1,
1024         0,    1,    2,    0,    1,    0,    1,    0,
1025         1,    0,    1,    0,    1,    0,    1,    0,
1026         1,    0,    1,    1,    0,    1,    0,    1,
1027         1,    0,    1,    0,    1,    0,    1,    0,
1028         1,    0,    1,    1,    2,    1,    1,    2,
1029         2,    0,    2,    1,    2,    0,    2,    2,
1030         1,    1,    2,    1,    1,    2,    1,    0,
1031         1,    1,    0,    1,    0,    1,    2,    1,
1032         0,    2,    1,    2,    1,    0,    1,
1033 };
1034
1035 int attribute_hidden __wcswidth(const wchar_t *pwcs, size_t n)
1036 {
1037     int h, l, m, count;
1038     wchar_t wc;
1039     unsigned char b;
1040
1041         if (ENCODING == __ctype_encoding_7_bit) {
1042                 size_t i;
1043
1044                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1045                         if (pwcs[i] != ((unsigned char)(pwcs[i]))) {
1046                                 return -1;
1047                         }
1048                 }
1049         }
1050 #ifdef __CTYPE_HAS_8_BIT_LOCALES
1051         else if (ENCODING == __ctype_encoding_8_bit) {
1052                 mbstate_t mbstate;
1053
1054                 mbstate.__mask = 0;                     /* Initialize the mbstate. */
1055                 if (__wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1056                         return -1;
1057                 }
1058         }
1059 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
1060 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1061         /* For stricter handling of allowed unicode values... see comments above. */
1062         else if (ENCODING == __ctype_encoding_utf8) {
1063                 size_t i;
1064
1065                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1066                         if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1067                                  || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1068                                 ) {
1069                                 return -1;
1070                         }
1071                 }
1072         }
1073 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1074
1075     for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1076                 if (wc <= 0xff) {
1077                         /* If we're here, wc != 0. */
1078                         if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1079                                 return -1;
1080                         }
1081                         ++count;
1082                         continue;
1083                 }
1084                 if (((unsigned int) wc) <= 0xffff) {
1085                         b = wc & 0xff;
1086                         h = (wc >> 8);
1087                         l = new_idx[h];
1088                         h = new_idx[h+1];
1089                         while ((m = (l+h) >> 1) != l) {
1090                                 if (b >= new_tbl[m]) {
1091                                         l = m;
1092                                 } else {                /* wc < tbl[m] */
1093                                         h = m;
1094                                 }
1095                         }
1096                         count += new_wtbl[l]; /* none should be -1. */
1097                         continue;
1098                 }
1099
1100                 /* Redo this to minimize average number of compares?*/
1101                 if (wc >= 0x1d167) {
1102                         if (wc <= 0x1d1ad) {
1103                                 if ((wc <= 0x1d169
1104                                          || (wc >= 0x1d173
1105                                                  && (wc <= 0x1d182
1106                                                          || (wc >= 0x1d185
1107                                                                  && (wc <= 0x1d18b
1108                                                                          || (wc >= 0x1d1aa))))))
1109                                         ) {
1110                                         continue;
1111                                 }
1112                         } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1113                                 continue;
1114                         } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1115                                 ++count;                /* need 2.. add one here */
1116                         }
1117 #if (WCHAR_MAX > 0x7fffffffL)
1118                         else if (wc > 0x7fffffffL) {
1119                                 return -1;
1120                         }
1121 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1122                 }
1123
1124                 ++count;
1125     }
1126
1127     return count;
1128 }
1129
1130 #else  /*  __UCLIBC_HAS_LOCALE__ */
1131
1132 int attribute_hidden __wcswidth(const wchar_t *pwcs, size_t n)
1133 {
1134         int count;
1135         wchar_t wc;
1136
1137     for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1138                 if (wc <= 0xff) {
1139                         /* If we're here, wc != 0. */
1140                         if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1141                                 return -1;
1142                         }
1143                         ++count;
1144                         continue;
1145                 } else {
1146                         return -1;
1147                 }
1148         }
1149
1150         return count;
1151 }
1152
1153 #endif /*  __UCLIBC_HAS_LOCALE__ */
1154
1155 strong_alias(__wcswidth,wcswidth)
1156
1157 #endif
1158 /**********************************************************************/
1159 #ifdef L_wcwidth
1160
1161 extern int __wcswidth (__const wchar_t *__s, size_t __n) attribute_hidden;
1162
/* Column width of a single wide character: __wcswidth() applied to a
 * one-element string, so it returns whatever that call returns. */
int wcwidth(wchar_t wc)
{
	const wchar_t *single = &wc;

	return __wcswidth(single, 1);
}
1167
1168 #endif
1169 /**********************************************************************/
1170
1171
1172 typedef struct {
     /* Conversion descriptor allocated by iconv_open() and handed back to the
      * caller as an iconv_t.  The "...0" members keep the values chosen at
      * open time so that iconv() can restore them when asked to reset. */
1173         mbstate_t tostate;              /* Shift state of the output side. */
1174         mbstate_t fromstate;            /* Shift state of the input side. */
1175         int tocodeset;                  /* Target codeset id from find_codeset(). */
1176         int fromcodeset;                /* Source codeset id; its low bit may be toggled when a swapped BOM is read. */
1177         int frombom;                    /* Nonzero while a leading BOM may still be read from the input. */
1178         int tobom;                      /* Nonzero while a BOM still has to be written to the output. */
1179         int fromcodeset0;               /* fromcodeset as set by iconv_open(). */
1180         int frombom0;                   /* frombom as set by iconv_open(). */
1181         int tobom0;                     /* tobom as set by iconv_open(). */
1182         int skip_invalid_input;         /* To support iconv -c option. */
1183 } _UC_iconv_t;
1184
1185
1186
1187 #ifdef L_iconv
1188
1189 #include <iconv.h>
1190 #include <string.h>
1191 #include <endian.h>
1192 #include <byteswap.h>
1193
1194 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1195 #error unsupported endianness for iconv
1196 #endif
1197
1198 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1199 #error currently iconv requires 8 bit locales
1200 #endif
1201 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1202 #error currently iconv requires UTF-8 locales
1203 #endif
1204
1205
1206 enum {
     /* Codeset ids as returned by find_codeset().  Ids at or above
      * IC_MULTIBYTE are the fixed-width 2- and 4-byte encodings; iconv()
      * derives their unit size from (id & 6), with IC_WCHAR_T handled as a
      * 4-byte special case.  The byte-order dependent values differ only in
      * bit 0. */
1207         IC_WCHAR_T = 0xe0,
1208         IC_MULTIBYTE = 0xe0,            /* Lowest id of the fixed-width group (same value as IC_WCHAR_T). */
1209 #if __BYTE_ORDER == __BIG_ENDIAN
1210         IC_UCS_4 =      0xec,
1211         IC_UTF_32 = 0xe4,
1212         IC_UCS_2 =      0xe2,
1213         IC_UTF_16 = 0xea,
1214 #else
1215         IC_UCS_4 =      0xed,
1216         IC_UTF_32 = 0xe5,
1217         IC_UCS_2 =      0xe3,
1218         IC_UTF_16 = 0xeb,
1219 #endif
1220         IC_UTF_8 = 2,
1221         IC_ASCII = 1                    /* 7-bit; find_codeset() numbers the 8-bit locale codesets from 3 upwards. */
1222 };
1223
1224 /* For the multibyte
1225  * bit 0 means swap endian
1226  * bit 1 means 2 byte
1227  * bit 2 means 4 byte
1228  *
1229  */
1230
/*
 * Table of the built-in iconv codeset names, walked by find_codeset().
 *
 * Each entry is:  <entry length> <codeset id> <name> <nul>
 * where <entry length> counts every byte of the entry, so adding it to the
 * entry's address yields the next entry.  The final entry omits its own nul
 * and borrows the string literal's terminator, which also ends the table.
 *
 * Bit 0x10 of the codeset id is what iconv_open() reads as "uses a BOM".
 * NOTE(review): the little-endian table sets that bit on the plain
 * "UTF-32"/"UTF-16" entries while the big-endian table does not; confirm
 * which of the two is intended.
 */
const unsigned char __iconv_codesets[] =
	"\x0a\xe0""WCHAR_T\x00"		/* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
	"\x08\xec""UCS-4\x00"		/* always BE */
	"\x0a\xec""UCS-4BE\x00"
	"\x0a\xed""UCS-4LE\x00"
	/* The id must be a single \x.. byte: a "\f" here would insert a form
	 * feed plus the text "e4" and break the declared entry length. */
	"\x09\xe4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe4""UTF-32BE\x00"
	"\x0b\xe5""UTF-32LE\x00"
	"\x08\xe2""UCS-2\x00"		/* always BE */
	"\x0a\xe2""UCS-2BE\x00"
	"\x0a\xe3""UCS-2LE\x00"
	"\x09\xea""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xea""UTF-16BE\x00"
	"\x0b\xeb""UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
	"\x08\xed""UCS-4\x00"		/* always BE */
	"\x0a\xed""UCS-4BE\x00"
	"\x0a\xec""UCS-4LE\x00"
	"\x09\xf4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe5""UTF-32BE\x00"
	"\x0b\xe4""UTF-32LE\x00"
	"\x08\xe3""UCS-2\x00"		/* always BE */
	"\x0a\xe3""UCS-2BE\x00"
	"\x0a\xe2""UCS-2LE\x00"
	"\x09\xfa""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xeb""UTF-16BE\x00"
	"\x0b\xea""UTF-16LE\x00"
#endif
	"\x08\x02""UTF-8\x00"
	"\x0b\x01""US-ASCII\x00"
	"\x07\x01""ASCII";			/* Must be last! (special case to save a nul) */
1263
1264 static int find_codeset(const char *name)
1265 {
1266         const unsigned char *s;
1267         int codeset;
1268
1269         for (s = __iconv_codesets ; *s ; s += *s) {
1270                 if (!strcasecmp(s+2, name)) {
1271                         return s[1];
1272                 }
1273         }
1274
1275         /* The following is ripped from find_locale in locale.c. */
1276
1277         /* TODO: maybe CODESET_LIST + *s ??? */
1278         /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1279         codeset = 2;
1280         s = __LOCALE_DATA_CODESET_LIST;
1281         do {
1282                 ++codeset;              /* Increment codeset first. */
1283                 if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1284                         return codeset;
1285                 }
1286         } while (*++s);
1287
1288         return 0;                       /* No matching codeset! */
1289 }
1290
1291 iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1292 {
1293         register _UC_iconv_t *px;
1294         int tocodeset, fromcodeset;
1295
1296         if (((tocodeset = find_codeset(tocode)) != 0)
1297                 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1298                 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1299                         px->tocodeset = tocodeset;
1300                         px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
1301                         px->fromcodeset0 = px->fromcodeset = fromcodeset;
1302                         px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
1303                         px->skip_invalid_input = px->tostate.__mask
1304                                 = px->fromstate.__mask = 0;
1305                         return (iconv_t) px;
1306                 }
1307         } else {
1308                 __set_errno(EINVAL);
1309         }
1310         return (iconv_t)(-1);
1311 }
1312
1313 int weak_function iconv_close(iconv_t cd)
1314 {
1315         free(cd);
1316
1317         return 0;
1318 }
1319
1320 size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1321                                                    size_t *__restrict inbytesleft,
1322                                                    char **__restrict outbuf,
1323                                                    size_t *__restrict outbytesleft)
1324 {
1325         _UC_iconv_t *px = (_UC_iconv_t *) cd;
1326         size_t nrcount, r;
1327         wchar_t wc, wc2;
1328         int inci, inco;
1329
1330         assert(px != (_UC_iconv_t *)(-1));
1331         assert(sizeof(wchar_t) == 4);
1332
1333         if (!inbuf || !*inbuf) {        /* Need to reinitialze conversion state. */
1334                 /* Note: For shift-state encodings we possibly need to output the
1335                  * shift sequence to return to initial state! */
1336                 if ((px->fromcodeset & 0xf0) == 0xe0) {
1337                 }
1338                 px->tostate.__mask = px->fromstate.__mask = 0;
1339                 px->fromcodeset = px->fromcodeset0;
1340                 px->tobom = px->tobom0;
1341                 px->frombom = px->frombom0;
1342                 return 0;
1343         }
1344
1345         nrcount = 0;
1346         while (*inbytesleft) {
1347                 if (!*outbytesleft) {
1348                 TOO_BIG:
1349                         __set_errno(E2BIG);
1350                         return (size_t) -1;
1351                 }
1352
1353                 inci = inco = 1;
1354                 if (px->fromcodeset >= IC_MULTIBYTE) {
1355                         inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1356                         if (*inbytesleft < inci) goto INVALID;
1357                         wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1358                                 + ((unsigned char)((*inbuf)[1]));
1359                         if (inci == 4) {
1360                                 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1361                                         + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1362                                 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1363                         } else {
1364                                 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1365                                 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1366                                          && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1367                                         ) {                     /* surrogate */
1368                                         wc =- 0xd800U;
1369                                         if (*inbytesleft < 4) goto INVALID;
1370                                         wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1371                                                 + ((unsigned char)((*inbuf)[3]));
1372                                         if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1373                                         if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1374                                                 goto ILLEGAL;
1375                                         }
1376                                         inci = 4;       /* Change inci here in case skipping illegals. */
1377                                         wc = 0x10000UL + (wc << 10) + wc2;
1378                                 }
1379                         }
1380
1381                         if (px->frombom) {
1382                                 px->frombom = 0;
1383                                 if ((wc == 0xfeffU)
1384                                         || (wc == ((inci == 4)
1385                                                            ? (((wchar_t) 0xfffe0000UL))
1386                                                            : ((wchar_t)(0xfffeUL))))
1387                                         ) {
1388                                         if (wc != 0xfeffU) {
1389                                                 px->fromcodeset ^= 1; /* toggle endianness */
1390                                                 wc = 0xfeffU;
1391                                         }
1392                                         if (!px->frombom) {
1393                                                 goto BOM_SKIP_OUTPUT;
1394                                         }
1395                                         goto GOT_BOM;
1396                                 }
1397                         }
1398
1399                         if (px->fromcodeset != IC_WCHAR_T) {
1400                                 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1401                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1402 #ifdef KUHN
1403                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1404                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1405 #endif
1406                                         ) {
1407                                         goto ILLEGAL;
1408                                 }
1409                         }
1410                 } else if (px->fromcodeset == IC_UTF_8) {
1411                         const char *p = *inbuf;
1412                         r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1413                         if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1414                                 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1415                                         assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1416                                         if (r == (size_t)(-2)) {
1417                                         INVALID:
1418                                                 __set_errno(EINVAL);
1419                                         } else {
1420                                                 px->fromstate.__mask = 0;
1421                                                 inci = 1;
1422                                         ILLEGAL:
1423                                                 if (px->skip_invalid_input) {
1424                                                         px->skip_invalid_input = 2;     /* flag for iconv utility */
1425                                                         goto BOM_SKIP_OUTPUT;
1426                                                 }
1427                                                 __set_errno(EILSEQ);
1428                                         }
1429                                         return (size_t)(-1);
1430                                 }
1431 #ifdef __UCLIBC_MJN3_ONLY__
1432 #warning TODO: optimize this.
1433 #endif
1434                                 if (p != NULL) { /* incomplete char case */
1435                                         goto INVALID;
1436                                 }
1437                                 p = *inbuf + 1; /* nul */
1438                         }
1439                         inci = p - *inbuf;
1440                 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1441                         if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1442                                 goto ILLEGAL;
1443                         } else {                        /* some other 8-bit ascii-extension codeset */
1444                                 const __codeset_8_bit_t *c8b
1445                                         = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1446                                 wc -= 0x80;
1447                                 wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
1448                                                          (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1449                                                           << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1450                                 if (!wc) {
1451                                         goto ILLEGAL;
1452                                 }
1453                         }
1454                 }
1455
1456
1457                 if (px->tobom) {
1458                         inci = 0;
1459                         wc = 0xfeffU;
1460         GOT_BOM:
1461                         px->tobom = 0;
1462                 }
1463
1464                 if (px->tocodeset >= IC_MULTIBYTE) {
1465                         inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1466                         if (*outbytesleft < inco) goto TOO_BIG;
1467                         if (px->tocodeset != IC_WCHAR_T) {
1468                                 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1469                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1470 #ifdef KUHN
1471                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1472                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1473 #endif
1474                                         ) {
1475                                 REPLACE_32:
1476                                         wc = 0xfffd;
1477                                         ++nrcount;
1478                                 }
1479                         }
1480                         if (inco == 4) {
1481                                 if (px->tocodeset & 1) wc = bswap_32(wc);
1482                         } else {
1483                                 if (((__uwchar_t)wc ) > 0xffffU) {
1484                                         if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1485                                                 goto REPLACE_32;
1486                                         }
1487                                         if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1488                                         wc2 = 0xdc00U + (wc & 0x3ff);
1489                                         wc = 0xd800U + ((wc >> 10) & 0x3ff);
1490                                         if (px->tocodeset & 1) {
1491                                                 wc = bswap_16(wc);
1492                                                 wc2 = bswap_16(wc2);
1493                                         }
1494                                         wc += (wc2 << 16);
1495                                 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1496                         }
1497                         (*outbuf)[0] = (char)((unsigned char)(wc));
1498                         (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1499                         if (inco == 4) {
1500                                 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1501                                 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1502                         }
1503                 } else if (px->tocodeset == IC_UTF_8) {
1504                         const wchar_t *pw = &wc;
1505                         do {
1506                                 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1507                                 if (r != (size_t)(-1)) {
1508 #ifdef __UCLIBC_MJN3_ONLY__
1509 #warning TODO: What happens for a nul?
1510 #endif
1511                                         if (r == 0) {
1512                                                 if (wc != 0) {
1513                                                         goto TOO_BIG;
1514                                                 }
1515                                                 ++r;
1516                                         }
1517                                         break;
1518                                 }
1519                                 wc = 0xfffdU;
1520                                 ++nrcount;
1521                         } while (1);
1522                         inco = r;
1523                 } else if (((__uwchar_t)(wc)) < 0x80) {
1524                 CHAR_GOOD:
1525                                 **outbuf = wc;
1526                 } else {
1527                         if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1528                                 const __codeset_8_bit_t *c8b
1529                                         = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1530                                 __uwchar_t u;
1531                                 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1532                                 u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1533                                                  + ((wc >> Cwc2c_TT_SHIFT)
1534                                                         & ((1 << Cwc2c_TI_SHIFT)-1))];
1535                                 wc = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
1536                                                  + (u << Cwc2c_TT_SHIFT)
1537                                                  + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1538                                 if (wc) {
1539                                         goto CHAR_GOOD;
1540                                 }
1541                         }
1542                         **outbuf = '?';
1543                         ++nrcount;
1544                 }
1545
1546                 *outbuf += inco;
1547                 *outbytesleft -= inco;
1548         BOM_SKIP_OUTPUT:
1549                 *inbuf += inci;
1550                 *inbytesleft -= inci;
1551         }
1552         return nrcount;
1553 }
1554
1555 #endif
1556 /**********************************************************************/
1557 #ifdef L_iconv_main
1558
1559 #include <stdio.h>
1560 #include <stdlib.h>
1561 #include <string.h>
1562 #include <wchar.h>
1563 #include <iconv.h>
1564 #include <stdarg.h>
1565 #include <libgen.h>
1566
/* Table of built-in codeset names, defined outside this section.  As walked
 * by main() for -l: the first byte of each entry is the distance to the next
 * entry, the name string starts at offset 2, and a zero byte ends the table. */
extern const unsigned char __iconv_codesets[];

/* Sizes of main()'s input and output conversion buffers. */
#define IBUF BUFSIZ
#define OBUF BUFSIZ

/* Set to argv[0] by main(); prefixes error messages and, via basename(),
 * the usage text. */
char *progname;

/* Nonzero (set by -s) makes error_msg() exit without printing anything. */
int hide_errors;
1574
1575 static void error_msg(const char *fmt, ...)
1576          __attribute__ ((noreturn, format (printf, 1, 2)));
1577
1578 static void error_msg(const char *fmt, ...)
1579 {
1580         va_list arg;
1581
1582         if (!hide_errors) {
1583                 fprintf(stderr, "%s: ", progname);
1584                 va_start(arg, fmt);
1585                 vfprintf(stderr, fmt, arg);
1586                 va_end(arg);
1587         }
1588
1589         exit(EXIT_FAILURE);
1590 }
1591
1592 int main(int argc, char **argv)
1593 {
1594         FILE *ifile;
1595         FILE *ofile = stdout;
1596         const char *p;
1597         const char *s;
1598         static const char opt_chars[] = "tfocsl";
1599                                       /* 012345 */
1600         const char *opts[sizeof(opt_chars)]; /* last is infile name */
1601         iconv_t ic;
1602         char ibuf[IBUF];
1603         char obuf[OBUF];
1604         char *pi;
1605         char *po;
1606         size_t ni, no, r, pos;
1607
1608         hide_errors = 0;
1609
1610         for (s = opt_chars ; *s ; s++) {
1611                 opts[ s - opt_chars ] = NULL;
1612         }
1613
1614         progname = *argv;
1615         while (--argc) {
1616                 p = *++argv;
1617                 if ((*p != '-') || (*++p == 0)) {
1618                         break;
1619                 }
1620                 do {
1621                         if ((s = __strchr(opt_chars,*p)) == NULL) {
1622                         USAGE:
1623                                 s = basename(progname);
1624                                 fprintf(stderr,
1625                                                 "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
1626                                                 "  or\n%s -l\n", s, s);
1627                                 return EXIT_FAILURE;
1628                         }
1629                         if ((s - opt_chars) < 3) {
1630                                 if ((--argc == 0) || opts[s - opt_chars]) {
1631                                         goto USAGE;
1632                                 }
1633                                 opts[s - opt_chars] = *++argv;
1634                         } else {
1635                                 opts[s - opt_chars] = p;
1636                         }
1637                 } while (*++p);
1638         }
1639
1640         if (opts[5]) {                          /* -l */
1641                 fprintf(stderr, "Recognized codesets:\n");
1642                 for (s = __iconv_codesets ; *s ; s += *s) {
1643                         fprintf(stderr,"  %s\n", s+2);
1644                 }
1645                 s = __LOCALE_DATA_CODESET_LIST;
1646                 do {
1647                         fprintf(stderr,"  %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
1648                 } while (*++s);
1649
1650                 return EXIT_SUCCESS;
1651         }
1652
1653         if (opts[4]) {
1654                 hide_errors = 1;
1655         }
1656
1657         if (!opts[0] || !opts[1]) {
1658                 goto USAGE;
1659         }
1660         if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
1661                 error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
1662         }
1663         if (opts[3]) {                          /* -c */
1664                 ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
1665         }
1666
1667         if ((s = opts[2]) != NULL) {
1668                 if (!(ofile = fopen(s, "w"))) {
1669                         error_msg( "couldn't open %s for writing\n", s);
1670                 }
1671         }
1672
1673         pos = ni = 0;
1674         do {
1675                 if (!argc || ((**argv == '-') && !((*argv)[1]))) {
1676                         ifile = stdin;          /* we don't check for duplicates */
1677                 } else if (!(ifile = fopen(*argv, "r"))) {
1678                         error_msg( "couldn't open %s for reading\n", *argv);
1679                 }
1680
1681                 while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
1682                         pos += r;
1683                         ni += r;
1684                         no = OBUF;
1685                         pi = ibuf;
1686                         po = obuf;
1687                         if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
1688                                 if ((errno != EINVAL) && (errno != E2BIG)) {
1689                                         error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
1690                                 }
1691                         }
1692                         if ((r = OBUF - no) > 0) {
1693                                 if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
1694                                         error_msg( "write error\n");
1695                                 }
1696                         }
1697                         if (ni) {                       /* still bytes in buffer! */
1698                                 __memmove(ibuf, pi, ni);
1699                         }
1700                 }
1701
1702                 if (ferror(ifile)) {
1703                         error_msg( "read error\n");
1704                 }
1705
1706                 ++argv;
1707
1708                 if (ifile != stdin) {
1709                         fclose(ifile);
1710                 }
1711
1712         } while (--argc > 0);
1713
1714         iconv_close(ic);
1715
1716         if (ni) {
1717                 error_msg( "incomplete sequence\n");
1718         }
1719
1720         return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
1721                 ? EXIT_SUCCESS : EXIT_FAILURE;
1722 }
1723
1724 #endif
1725 /**********************************************************************/