/* Pointer to the active multibyte-to-wide conversion routine.  _mbtowc_r
   calls through it, passing the reentrancy structure, the destination wide
   character, the source bytes, the byte limit, the value returned by
   __locale_charset (), and the conversion state, in that order.  */
10 int (*__mbtowc) (struct _reent *, wchar_t *, const char *, size_t,
11 const char *, mbstate_t *)
/* Reentrant mbtowc entry point: forwards r, pwc, s, n and state to the
   routine currently installed in __mbtowc, supplying the result of
   __locale_charset () as the charset argument, and returns whatever that
   routine returns.  */
19 _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
26 return __mbtowc (r, pwc, s, n, __locale_charset (), state);
/* Conversion routine taking the same six arguments that _mbtowc_r hands to
   __mbtowc (r, pwc, s, n, charset, state).
   NOTE(review): presumably the plain single-byte converter -- confirm the
   accepted byte range against the full body.  */
30 _DEFUN (__ascii_mbtowc, (r, pwc, s, n, charset, state),
35 const char *charset _AND
/* Alias the input as unsigned char so every byte reads as a non-negative
   value regardless of the signedness of plain char.  */
39 unsigned char *t = (unsigned char *)s;
/* Input byte classes for the JIS converter.  The enumerators appear in the
   same order as the column labels of JIS_state_table and JIS_action_table.
   JIS_C_NUM is not a class of its own: it is the number of classes and is
   used as the second dimension of both tables.  */
59 typedef enum { ESCAPE, DOLLAR, BRACKET, AT, B, J,
60 NUL, JIS_CHAR, OTHER, JIS_C_NUM } JIS_CHAR_TYPE;
/* States of the JIS decoding state machine; they select the row of
   JIS_state_table and JIS_action_table.  JIS_S_NUM is the number of states
   and is used as the first dimension of both tables.  */
61 typedef enum { ASCII, JIS, A_ESC, A_ESC_DL, JIS_1, J_ESC, J_ESC_BR,
62 INV, JIS_S_NUM } JIS_STATE;
/* Actions stored in JIS_action_table; one is selected for every
   (state, input class) pair while decoding JIS input.  */
63 typedef enum { COPY_A, COPY_J1, COPY_J2, MAKE_A, NOOP, EMPTY, ERROR } JIS_ACTION;
65 /**************************************************************************************
66 * state/action tables for processing JIS encoding
67 * Where possible, switches to JIS are grouped with the following JIS characters and switches
68 * to ASCII are grouped with preceding JIS characters. Thus, maximum returned length
69 * is 2 (switch to JIS) + 2 (JIS characters) + 2 (switch back to ASCII) = 6.
70 *************************************************************************************/
/* Transition table: JIS_state_table[s][c] is the state entered from state s
   when a byte of class c is read.  Rows follow JIS_STATE order and columns
   follow JIS_CHAR_TYPE order.  Reading the rows: ESCAPE, DOLLAR, then AT or
   B leads ASCII -> A_ESC -> A_ESC_DL -> JIS, and ESCAPE, BRACKET, then B or
   J leads JIS -> J_ESC -> J_ESC_BR -> ASCII.  */
72 static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
73 /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER */
74 /* ASCII */ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII },
75 /* JIS */ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1, INV },
76 /* A_ESC */ { ASCII, A_ESC_DL, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII },
77 /* A_ESC_DL */{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII, ASCII, ASCII },
78 /* JIS_1 */ { INV, JIS, JIS, JIS, JIS, JIS, INV, JIS, INV },
79 /* J_ESC */ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV },
80 /* J_ESC_BR */{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
/* Action table: JIS_action_table[s][c] is the action to perform when a byte
   of class c is read in state s.  It has the same row/column layout as
   JIS_state_table and is indexed with the state held *before* the
   transition is applied.  */
83 static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
84 /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER */
85 /* ASCII */ { NOOP, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A, EMPTY, COPY_A, COPY_A},
86 /* JIS */ { NOOP, COPY_J1, COPY_J1, COPY_J1, COPY_J1, COPY_J1, ERROR, COPY_J1, ERROR },
87 /* A_ESC */ { COPY_A, NOOP, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A},
88 /* A_ESC_DL */{ COPY_A, COPY_A, COPY_A, NOOP, NOOP, COPY_A, COPY_A, COPY_A, COPY_A},
89 /* JIS_1 */ { ERROR, COPY_J2, COPY_J2, COPY_J2, COPY_J2, COPY_J2, ERROR, COPY_J2, ERROR },
90 /* J_ESC */ { ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR },
91 /* J_ESC_BR */{ ERROR, ERROR, ERROR, ERROR, MAKE_A, MAKE_A, ERROR, ERROR, ERROR },
94 /* We override the mbstate_t __count field for more complex encodings and use it to store a state value. */
95 #define __state __count
97 #ifdef _MB_EXTENDED_CHARSETS_ISO
/* Converter compiled only when _MB_EXTENDED_CHARSETS_ISO is defined.  It
   maps a byte to a wide character through the __iso_8859_conv table that
   is selected from the charset name.  */
99 _DEFUN (__iso_mbtowc, (r, pwc, s, n, charset, state),
100 struct _reent *r _AND
104 const char *charset _AND
108 unsigned char *t = (unsigned char *)s;
/* The table index is derived from the charset name with its first nine
   characters skipped (presumably the "ISO-8859-" prefix -- TODO confirm
   against the caller's charset naming).  */
121 int iso_idx = __iso_8859_index (charset + 9);
/* The second table index is the byte value minus 0xa0.  */
124 *pwc = __iso_8859_conv[iso_idx][*t - 0xa0];
125 if (*pwc == 0) /* Invalid character */
141 #endif /* _MB_EXTENDED_CHARSETS_ISO */
143 #ifdef _MB_EXTENDED_CHARSETS_WINDOWS
/* Converter compiled only when _MB_EXTENDED_CHARSETS_WINDOWS is defined.
   It maps a byte to a wide character through the __cp_conv table that is
   selected from the charset name.  */
145 _DEFUN (__cp_mbtowc, (r, pwc, s, n, charset, state),
146 struct _reent *r _AND
150 const char *charset _AND
154 unsigned char *t = (unsigned char *)s;
/* The table index is derived from the charset name with its first two
   characters skipped (presumably a "CP" prefix -- TODO confirm).  */
167 int cp_idx = __cp_index (charset + 2);
/* The second table index is the byte value minus 0x80.  */
170 *pwc = __cp_conv[cp_idx][*t - 0x80];
171 if (*pwc == 0) /* Invalid character */
187 #endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
/* UTF-8 converter.  Lead bytes 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 start
   two-, three- and four-byte sequences respectively.  Bytes of a sequence
   that has been started are saved in state->__value.__wchb[] and
   state->__count records how far the sequence has progressed, so a
   sequence can be continued from saved bytes instead of from the input.
   Overlong encodings and values above 0x10ffff are checked for.  When
   wchar_t is two bytes wide, a four-byte sequence is delivered as a
   UTF-16 surrogate pair.  */
190 _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state),
191 struct _reent *r _AND
195 const char *charset _AND
199 unsigned char *t = (unsigned char *)s;
212 if (state->__count == 0)
215 ch = state->__value.__wchb[0];
221 return 0; /* s points to the null character */
226 /* single-byte sequence */
231 if (ch >= 0xc0 && ch <= 0xdf)
233 /* two-byte sequence */
234 state->__value.__wchb[0] = ch;
235 if (state->__count == 0)
237 else if (n < (size_t)-1)
242 if (ch < 0x80 || ch > 0xbf)
247 if (state->__value.__wchb[0] < 0xc2)
249 /* overlong UTF-8 sequence */
/* Result: low 5 bits of the lead byte followed by the low 6 bits of the
   continuation byte.  */
254 *pwc = (wchar_t)((state->__value.__wchb[0] & 0x1f) << 6)
255 | (wchar_t)(ch & 0x3f);
258 if (ch >= 0xe0 && ch <= 0xef)
260 /* three-byte sequence */
262 state->__value.__wchb[0] = ch;
263 if (state->__count == 0)
265 else if (n < (size_t)-1)
/* With __count == 1 the second byte comes from the input; otherwise the
   copy saved in the state is reused.  */
269 ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
270 if (state->__value.__wchb[0] == 0xe0 && ch < 0xa0)
272 /* overlong UTF-8 sequence */
276 if (ch < 0x80 || ch > 0xbf)
281 state->__value.__wchb[1] = ch;
282 if (state->__count == 1)
284 else if (n < (size_t)-1)
289 if (ch < 0x80 || ch > 0xbf)
/* Result: 4 bits from the lead byte, then 6 bits from each of the two
   following bytes.  */
295 tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
296 | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
297 | (wchar_t)(ch & 0x3f);
301 if (ch >= 0xf0 && ch <= 0xf4)
303 /* four-byte sequence */
305 state->__value.__wchb[0] = ch;
306 if (state->__count == 0)
308 else if (n < (size_t)-1)
312 ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
313 if ((state->__value.__wchb[0] == 0xf0 && ch < 0x90)
314 || (state->__value.__wchb[0] == 0xf4 && ch >= 0x90))
316 /* overlong UTF-8 sequence or result is > 0x10ffff */
320 if (ch < 0x80 || ch > 0xbf)
325 state->__value.__wchb[1] = ch;
326 if (state->__count == 1)
328 else if (n < (size_t)-1)
332 ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
333 if (ch < 0x80 || ch > 0xbf)
338 state->__value.__wchb[2] = ch;
339 if (state->__count == 2)
341 else if (n < (size_t)-1)
343 if (state->__count == 3 && sizeof(wchar_t) == 2)
345 /* On systems which have wchar_t being UTF-16 values, the value
346 doesn't fit into a single wchar_t in this case. So what we
347 do here is to store the state with a special value of __count
348 and return the first half of a surrogate pair. The first
349 three bytes of a UTF-8 sequence are enough to generate the
350 first half of a UTF-16 surrogate pair. As return value we
351 choose to return the number of bytes actually read up to
353 The second half of the surrogate pair is returned in case we
354 recognize the special __count value of four, and the next
355 byte is actually a valid value. See below. */
356 tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
357 | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
358 | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6);
360 *pwc = 0xd800 | ((tmp - 0x10000) >> 10);
366 if (ch < 0x80 || ch > 0xbf)
/* Full value: 3 bits from the lead byte, then 6 bits from each of the
   three following bytes.  */
371 tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
372 | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
373 | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
374 | (wint_t)(ch & 0x3f);
375 if (state->__count == 4 && sizeof(wchar_t) == 2)
376 /* Create the second half of the surrogate pair for systems with
377 wchar_t == UTF-16 . */
378 *pwc = 0xdc00 | (tmp & 0x3ff);
389 /* Cygwin defines its own doublebyte charset conversion functions
390 because the underlying OS requires wchar_t == UTF-16. */
/* Shift-JIS converter.  A double-byte character is assembled across two
   steps tracked by state->__count: the first byte is saved in the state,
   and the second byte is combined with it to form the result.  */
393 _DEFUN (__sjis_mbtowc, (r, pwc, s, n, charset, state),
394 struct _reent *r _AND
398 const char *charset _AND
402 unsigned char *t = (unsigned char *)s;
410 return 0; /* not state-dependent */
416 if (state->__count == 0)
/* Save the first byte so it is still available when the second byte is
   processed.  */
420 state->__value.__wchb[0] = ch;
427 if (state->__count == 1)
/* Result: saved first byte in the high 8 bits, current byte in the low
   8 bits.  */
431 *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
/* EUC-JP converter.  state->__count tracks how many bytes of the current
   character have been taken (0, 1 or 2) and the bytes themselves are saved
   in state->__value.__wchb[].  Both two-byte and three-byte forms are
   handled; the three-byte form is tied to a first byte of 0x8f.  */
451 _DEFUN (__eucjp_mbtowc, (r, pwc, s, n, charset, state),
452 struct _reent *r _AND
456 const char *charset _AND
460 unsigned char *t = (unsigned char *)s;
474 if (state->__count == 0)
478 state->__value.__wchb[0] = ch;
485 if (state->__count == 1)
/* A first byte of 0x8f is special-cased: the current byte is saved as
   well (NOTE(review): presumably instead of completing a two-byte
   character here -- confirm against the full body).  */
489 if (state->__value.__wchb[0] == 0x8f)
491 state->__value.__wchb[1] = ch;
/* Two-byte result: saved first byte in the high 8 bits, current byte in
   the low 8 bits.  */
499 *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
510 if (state->__count == 2)
/* Three-byte result: saved second byte in the high 8 bits and the low
   7 bits of the current byte; the first byte does not contribute.  */
514 *pwc = (((wchar_t)state->__value.__wchb[1]) << 8)
515 + (wchar_t)(ch & 0x7f);
/* JIS converter driven by JIS_state_table and JIS_action_table.  The
   machine state is kept in state->__state (an alias of __count) between
   calls, and the first byte of a two-byte character is kept in
   state->__value.__wchb[0].  Returns -2 when n bytes are not enough to
   complete a character.  */
535 _DEFUN (__jis_mbtowc, (r, pwc, s, n, charset, state),
536 struct _reent *r _AND
540 const char *charset _AND
544 unsigned char *t = (unsigned char *)s;
545 JIS_STATE curr_state;
557 state->__state = ASCII;
558 return 1; /* state-dependent */
/* Resume from the machine state recorded in the conversion state.  */
564 curr_state = state->__state;
567 for (i = 0; i < n; ++i)
594 if (_isjis (curr_ch))
/* The action must be fetched first: both tables are indexed by the state
   held before the transition, and the next line overwrites curr_state.  */
600 action = JIS_action_table[curr_state][ch];
601 curr_state = JIS_state_table[curr_state][ch];
608 state->__state = ASCII;
612 state->__state = ASCII;
613 *pwc = (wchar_t)*ptr;
616 state->__value.__wchb[0] = t[i];
619 state->__state = JIS;
/* Result: saved first byte in the high 8 bits, current byte in the low
   8 bits.  */
620 *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)(t[i]);
623 ptr = (unsigned char *)(t + i + 1);
/* Record the machine state so that a later call can continue from it.  */
633 state->__state = curr_state;
634 return -2; /* n < bytes needed */
636 #endif /* !__CYGWIN__*/
637 #endif /* _MB_CAPABLE */