2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2009, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.0.9"
24 #define NKF_RELEASE_DATE "2009-02-20"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2009, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
213 {"ISO-2022-JP", ISO_2022_JP},
214 {"ISO2022JP-CP932", CP50220},
215 {"CP50220", CP50220},
216 {"CP50221", CP50221},
217 {"CSISO2022JP", CP50221},
218 {"CP50222", CP50222},
219 {"ISO-2022-JP-1", ISO_2022_JP_1},
220 {"ISO-2022-JP-3", ISO_2022_JP_3},
221 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
222 {"SHIFT_JIS", SHIFT_JIS},
224 {"WINDOWS-31J", WINDOWS_31J},
225 {"CSWINDOWS31J", WINDOWS_31J},
226 {"CP932", WINDOWS_31J},
227 {"MS932", WINDOWS_31J},
228 {"CP10001", CP10001},
231 {"EUCJP-NKF", EUCJP_NKF},
232 {"CP51932", CP51932},
233 {"EUC-JP-MS", EUCJP_MS},
234 {"EUCJP-MS", EUCJP_MS},
235 {"EUCJPMS", EUCJP_MS},
236 {"EUC-JP-ASCII", EUCJP_ASCII},
237 {"EUCJP-ASCII", EUCJP_ASCII},
238 {"SHIFT_JISX0213", SHIFT_JISX0213},
239 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
240 {"EUC-JISX0213", EUC_JISX0213},
241 {"EUC-JIS-2004", EUC_JIS_2004},
244 {"UTF-8-BOM", UTF_8_BOM},
245 {"UTF8-MAC", UTF8_MAC},
246 {"UTF-8-MAC", UTF8_MAC},
248 {"UTF-16BE", UTF_16BE},
249 {"UTF-16BE-BOM", UTF_16BE_BOM},
250 {"UTF-16LE", UTF_16LE},
251 {"UTF-16LE-BOM", UTF_16LE_BOM},
253 {"UTF-32BE", UTF_32BE},
254 {"UTF-32BE-BOM", UTF_32BE_BOM},
255 {"UTF-32LE", UTF_32LE},
256 {"UTF-32LE-BOM", UTF_32LE_BOM},
261 #if defined(DEFAULT_CODE_JIS)
262 #define DEFAULT_ENCIDX ISO_2022_JP
263 #elif defined(DEFAULT_CODE_SJIS)
264 #define DEFAULT_ENCIDX SHIFT_JIS
265 #elif defined(DEFAULT_CODE_WINDOWS_31J)
266 #define DEFAULT_ENCIDX WINDOWS_31J
267 #elif defined(DEFAULT_CODE_EUC)
268 #define DEFAULT_ENCIDX EUC_JP
269 #elif defined(DEFAULT_CODE_UTF8)
270 #define DEFAULT_ENCIDX UTF_8
/*
 * ASCII-only character classification and hex helpers.
 *
 * All of these are function-like macros: the argument may be evaluated
 * more than once, so do not pass expressions with side effects.  Every
 * use of a parameter is parenthesized so that compound arguments such as
 * `a | b` or `x ? y : z` expand with the intended precedence.
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP <= (c) && (c) <= '~')
#define nkf_isgraph(c)  ('!' <= (c) && (c) <= '~')
/* Value of one hexadecimal digit; anything that is not a hex digit yields 0. */
#define hex2bin(c) (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
                    ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
                    ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/* Upper-case hex digit for the low 4 bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c) & 15])
/* True when bits 8..15 of c2 (taken as unsigned short) equal SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, and the characters between SP and DEL (exclusive)
 * other than ? = _ ( ) . and the double quote (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) < (0xE0&0x7F))
300 #define HOLD_SIZE 1024
301 #if defined(INT_IS_SHORT)
302 #define IOBUF_SIZE 2048
304 #define IOBUF_SIZE 16384
307 #define DEFAULT_J 'B'
308 #define DEFAULT_R 'B'
315 /* MIME preprocessor */
317 #ifdef EASYWIN /*Easy Win */
318 extern POINT _BufferSize;
327 void (*status_func)(struct input_code *, nkf_char);
328 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
332 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
333 static nkf_encoding *input_encoding = NULL;
334 static nkf_encoding *output_encoding = NULL;
336 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
338 * 0: Shift_JIS, eucJP-ascii
343 #define UCS_MAP_ASCII 0
345 #define UCS_MAP_CP932 2
346 #define UCS_MAP_CP10001 3
347 static int ms_ucs_map_f = UCS_MAP_ASCII;
349 #ifdef UTF8_INPUT_ENABLE
350 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
351 static int no_cp932ext_f = FALSE;
352 /* ignore ZERO WIDTH NO-BREAK SPACE */
353 static int no_best_fit_chars_f = FALSE;
354 static int input_endian = ENDIAN_BIG;
355 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
356 static void (*encode_fallback)(nkf_char c) = NULL;
357 static void w_status(struct input_code *, nkf_char);
359 #ifdef UTF8_OUTPUT_ENABLE
360 static int output_bom_f = FALSE;
361 static int output_endian = ENDIAN_BIG;
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_getc(FILE *f);
373 static void mime_putc(nkf_char c);
377 #if !defined(PERL_XS) && !defined(WIN32DLL)
378 static unsigned char stdibuf[IOBUF_SIZE];
379 static unsigned char stdobuf[IOBUF_SIZE];
/* Conversion option flags and their initial values. */
383 static int unbuf_f = FALSE;
384 static int estab_f = FALSE;
385 static int nop_f = FALSE;
386 static int binmode_f = TRUE; /* binary mode */
387 static int rot_f = FALSE; /* ROT13/47 mode (NOTE(review): comment used to read "rot14/43", presumably a typo -- confirm) */
388 static int hira_f = FALSE; /* hiragana/katakana conversion */
389 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
390 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
391 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
392 static int mimebuf_f = FALSE; /* MIME buffered input */
393 static int broken_f = FALSE; /* convert ESC-less broken JIS */
394 static int iso8859_f = FALSE; /* ISO8859 through */
395 static int mimeout_f = FALSE; /* base64 mode */
396 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
397 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
399 #ifdef UNICODE_NORMALIZATION
400 static int nfc_f = FALSE;
401 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
402 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
406 static int cap_f = FALSE;
407 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
408 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
410 static int url_f = FALSE;
411 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
412 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Bit layout helpers for nkf_char values.
 *
 * The top byte (CLASS_MASK) is a class tag and the low 24 bits
 * (VALUE_MASK) are the value; CLASS_UNICODE tags a value as a Unicode
 * code point.  PREFIX_EUCG3 is OR-ed into a value by nkf_char_euc3_new().
 *
 * Macro parameters are parenthesized so that compound arguments
 * (e.g. `a | b`) are masked as a whole.
 */
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c)        ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c)     ((c) | CLASS_UNICODE)
/* True when the class byte of c is exactly CLASS_UNICODE. */
#define nkf_char_unicode_p(c)       (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the 24-bit value of c is at most UNICODE_BMP_MAX (class byte ignored). */
#define nkf_char_unicode_bmp_p(c)   (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* True when the 24-bit value of c is at most UNICODE_MAX (class byte ignored). */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
427 #ifdef NUMCHAR_OPTION
428 static int numchar_f = FALSE;
429 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
430 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
434 static int noout_f = FALSE;
435 static void no_putc(nkf_char c);
436 static int debug_f = FALSE;
437 static void debug(const char *str);
438 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
441 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
442 static void set_input_codename(const char *codename);
445 static int exec_f = 0;
448 #ifdef SHIFTJIS_CP932
449 /* invert IBM extended characters to others */
450 static int cp51932_f = FALSE;
452 /* invert NEC-selected IBM extended characters to IBM extended characters */
453 static int cp932inv_f = TRUE;
455 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
456 #endif /* SHIFTJIS_CP932 */
458 static int x0212_f = FALSE;
459 static int x0213_f = FALSE;
461 static unsigned char prefix_table[256];
463 static void e_status(struct input_code *, nkf_char);
464 static void s_status(struct input_code *, nkf_char);
466 struct input_code input_code_list[] = {
467 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
468 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
469 #ifdef UTF8_INPUT_ENABLE
470 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
471 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
472 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
477 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
478 static int base64_count = 0;
480 /* X0208 -> ASCII converter */
483 static int f_line = 0; /* chars in line */
484 static int f_prev = 0;
485 static int fold_preserve_f = FALSE; /* preserve new lines */
486 static int fold_f = FALSE;
487 static int fold_len = 0;
490 static unsigned char kanji_intro = DEFAULT_J;
491 static unsigned char ascii_intro = DEFAULT_R;
495 #define FOLD_MARGIN 10
496 #define DEFAULT_FOLD 60
498 static int fold_margin = FOLD_MARGIN;
500 /* process default */
503 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
505 fprintf(stderr,"nkf internal module connection failure.\n");
511 no_connection(nkf_char c2, nkf_char c1)
513 no_connection2(c2,c1,0);
516 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
517 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
519 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
520 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
521 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
522 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
523 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
524 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
525 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
527 /* static redirections */
529 static void (*o_putc)(nkf_char c) = std_putc;
531 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
532 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
534 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
535 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
537 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
539 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
540 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
542 /* for strict mime */
543 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
544 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
547 static int output_mode = ASCII; /* output kanji mode */
548 static int input_mode = ASCII; /* input kanji mode */
549 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
551 /* X0201 / X0208 conversion tables */
553 /* X0201 kana conversion table */
555 static const unsigned char cv[]= {
556 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
557 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
558 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
559 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
560 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
561 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
562 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
563 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
564 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
565 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
566 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
567 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
568 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
569 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
570 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
571 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
575 /* X0201 kana conversion table for dakuten */
577 static const unsigned char dv[]= {
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
582 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
583 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
584 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
585 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
586 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
587 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
589 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 /* X0201 kana conversion table for han-dakuten */
598 static const unsigned char ev[]= {
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
610 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
618 /* X0208 kigou conversion table */
619 /* 0x8140 - 0x819e */
620 static const unsigned char fv[] = {
622 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
623 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
624 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
625 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
626 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
627 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
628 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
629 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
630 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
632 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
638 static int option_mode = 0;
639 static int file_out_f = FALSE;
641 static int overwrite_f = FALSE;
642 static int preserve_time_f = FALSE;
643 static int backup_f = FALSE;
644 static char *backup_suffix = "";
647 static int eolmode_f = 0; /* CR, LF, CRLF */
648 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
649 static nkf_char prev_cr = 0; /* CR or 0 */
650 #ifdef EASYWIN /*Easy Win */
651 static int end_check;
655 nkf_xmalloc(size_t size)
659 if (size == 0) size = 1;
663 perror("can't malloc");
671 nkf_xrealloc(void *ptr, size_t size)
673 if (size == 0) size = 1;
675 ptr = realloc(ptr, size);
677 perror("can't realloc");
684 #define nkf_xfree(ptr) free(ptr)
/*
 * Compare two NUL-terminated strings, folding 'a'..'z' to upper case
 * with nkf_toupper() before each character comparison.
 * Returns FALSE on the first differing character, and FALSE when one
 * string ends before the other.
 */
687 nkf_str_caseeql(const char *src, const char *target)
/* walk both strings in step until either one ends */
690 for (i = 0; src[i] && target[i]; i++) {
691 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
/* one string is a proper prefix of the other: not equal */
693 if (src[i] || target[i]) return FALSE;
698 nkf_enc_from_index(int idx)
700 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
703 return &nkf_encoding_table[idx];
/*
 * Look up the encoding id registered for `name` in
 * encoding_name_to_id_table.  Names are compared with nkf_str_caseeql().
 */
707 nkf_enc_find_index(const char *name)
/* skip a leading "X-" prefix (only an upper-case 'X' is recognized here) */
710 if (name[0] == 'X' && *(name+1) == '-') name += 2;
/* the table is scanned until an entry with a negative id is reached */
711 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
712 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
713 return encoding_name_to_id_table[i].id;
720 nkf_enc_find(const char *name)
723 idx = nkf_enc_find_index(name);
724 if (idx < 0) return 0;
725 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for encoding table entries.
 * `enc` is used with `->`, so it must be a pointer to an entry that has
 * `name`, `id` and `base_encoding` members; it may be evaluated more
 * than once by the predicate macros.
 */
728 #define nkf_enc_name(enc) (enc)->name
729 #define nkf_enc_to_index(enc) (enc)->id
730 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* input / output converter of the entry's base encoding */
731 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
732 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* true when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP */
733 #define nkf_enc_asciicompat(enc) (\
734 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
735 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* true when the base encoding is one of the UTF-8 / UTF-16 / UTF-32 families */
736 #define nkf_enc_unicode_p(enc) (\
737 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
738 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
739 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* true when the entry's id is CP50220, CP50221 or CP50222 */
740 #define nkf_enc_cp5022x_p(enc) (\
741 nkf_enc_to_index(enc) == CP50220 ||\
742 nkf_enc_to_index(enc) == CP50221 ||\
743 nkf_enc_to_index(enc) == CP50222)
745 #ifdef DEFAULT_CODE_LOCALE
749 #ifdef HAVE_LANGINFO_H
750 return nl_langinfo(CODESET);
751 #elif defined(__WIN32__)
753 sprintf(buf, "CP%d", GetACP());
755 #elif defined(__OS2__)
756 # if defined(INT_IS_SHORT)
762 ULONG ulCP[1], ulncp;
763 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
764 if (ulCP[0] == 932 || ulCP[0] == 943)
765 strcpy(buf, "Shift_JIS");
767 sprintf(buf, "CP%lu", ulCP[0]);
775 nkf_locale_encoding()
777 nkf_encoding *enc = 0;
778 const char *encname = nkf_locale_charmap();
780 enc = nkf_enc_find(encname);
783 #endif /* DEFAULT_CODE_LOCALE */
788 return &nkf_encoding_table[UTF_8];
792 nkf_default_encoding()
794 nkf_encoding *enc = 0;
795 #ifdef DEFAULT_CODE_LOCALE
796 enc = nkf_locale_encoding();
797 #elif defined(DEFAULT_ENCIDX)
798 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
800 if (!enc) enc = nkf_utf8_encoding();
811 nkf_buf_new(int length)
813 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
814 buf->ptr = nkf_xmalloc(length);
822 nkf_buf_dispose(nkf_buf_t *buf)
829 #define nkf_buf_length(buf) ((buf)->len)
830 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
833 nkf_buf_at(nkf_buf_t *buf, int index)
835 assert(index <= buf->len);
836 return buf->ptr[index];
840 nkf_buf_clear(nkf_buf_t *buf)
846 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
848 if (buf->capa <= buf->len) {
851 buf->ptr[buf->len++] = c;
855 nkf_buf_pop(nkf_buf_t *buf)
857 assert(!nkf_buf_empty_p(buf));
858 return buf->ptr[--buf->len];
861 /* Normalization Form C */
864 #define fprintf dllprintf
870 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
877 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
878 #ifdef UTF8_OUTPUT_ENABLE
879 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
880 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
883 #ifdef UTF8_INPUT_ENABLE
884 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
885 " UTF option is -W[8,[16,32][B,L]]\n"
887 " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
891 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
892 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
893 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
896 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
897 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
898 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
899 " X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
902 " O Output to File (DEFAULT 'nkf.out')\n"
903 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
906 " --ic=<encoding> Specify the input encoding\n"
907 " --oc=<encoding> Specify the output encoding\n"
908 " --hiragana --katakana Hiragana/Katakana Conversion\n"
909 " --katakana-hiragana Converts each other\n"
913 " --{cap, url}-input Convert hex after ':' or '%%'\n"
915 #ifdef NUMCHAR_OPTION
916 " --numchar-input Convert Unicode Character Reference\n"
918 #ifdef UTF8_INPUT_ENABLE
919 " --fb-{skip, html, xml, perl, java, subchar}\n"
920 " Specify unassigned character's replacement\n"
925 " --in-place[=SUF] Overwrite original files\n"
926 " --overwrite[=SUF] Preserve timestamp of original files\n"
928 " -g --guess Guess the input code\n"
929 " -v --version Print the version\n"
930 " --help/-V Print this help / configuration\n"
936 show_configuration(void)
939 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
940 " Compile-time options:\n"
941 " Compiled at: " __DATE__ " " __TIME__ "\n"
944 " Default output encoding: "
945 #ifdef DEFAULT_CODE_LOCALE
946 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
947 #elif defined(DEFAULT_ENCIDX)
948 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
954 " Default output end of line: "
955 #if DEFAULT_NEWLINE == CR
957 #elif DEFAULT_NEWLINE == CRLF
963 " Decode MIME encoded string: "
964 #if MIME_DECODE_DEFAULT
970 " Convert JIS X 0201 Katakana: "
977 " --help, --version output: "
978 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build the backup file name for `filename` from `suffix`.
 *
 * If `suffix` contains one or more '*', each '*' is replaced by
 * `filename` and all other characters are copied as-is; otherwise the
 * result is `filename` followed by `suffix`.
 *
 * Returns a NUL-terminated string allocated with nkf_xmalloc(); the
 * caller owns it.
 */
static char *
get_backup_filename(const char *suffix, const char *filename)
{
    const size_t filename_length = strlen(filename);
    const size_t suffix_length = strlen(suffix);
    size_t asterisk_count = 0;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count == 0) {
        /* plain "<filename><suffix>"; the second copy includes the NUL */
        backup_filename = nkf_xmalloc(filename_length + suffix_length + 1);
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
        return backup_filename;
    }

    /* each '*' (one byte of suffix) expands to filename_length bytes */
    backup_filename = nkf_xmalloc(suffix_length - asterisk_count
                                  + asterisk_count * filename_length + 1);
    for (i = 0, j = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') {
            memcpy(backup_filename + j, filename, filename_length);
            j += filename_length;
        } else {
            backup_filename[j++] = suffix[i];
        }
    }
    backup_filename[j] = '\0';
    return backup_filename;
}
1024 #ifdef UTF8_INPUT_ENABLE
1026 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1033 (*f)(0, bin2hex(c>>shift));
1044 encode_fallback_html(nkf_char c)
1049 if(c >= NKF_INT32_C(1000000))
1050 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1051 if(c >= NKF_INT32_C(100000))
1052 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1054 (*oconv)(0, 0x30+(c/10000 )%10);
1056 (*oconv)(0, 0x30+(c/1000 )%10);
1058 (*oconv)(0, 0x30+(c/100 )%10);
1060 (*oconv)(0, 0x30+(c/10 )%10);
1062 (*oconv)(0, 0x30+ c %10);
1068 encode_fallback_xml(nkf_char c)
1073 nkf_each_char_to_hex(oconv, c);
1079 encode_fallback_java(nkf_char c)
1083 if(!nkf_char_unicode_bmp_p(c)){
1087 (*oconv)(0, bin2hex(c>>20));
1088 (*oconv)(0, bin2hex(c>>16));
1092 (*oconv)(0, bin2hex(c>>12));
1093 (*oconv)(0, bin2hex(c>> 8));
1094 (*oconv)(0, bin2hex(c>> 4));
1095 (*oconv)(0, bin2hex(c ));
1100 encode_fallback_perl(nkf_char c)
1105 nkf_each_char_to_hex(oconv, c);
1111 encode_fallback_subchar(nkf_char c)
1113 c = unicode_subchar;
1114 (*oconv)((c>>8)&0xFF, c&0xFF);
1119 static const struct {
1143 {"katakana-hiragana","h3"},
1151 #ifdef UTF8_OUTPUT_ENABLE
1161 {"fb-subchar=", ""},
1163 #ifdef UTF8_INPUT_ENABLE
1164 {"utf8-input", "W"},
1165 {"utf16-input", "W16"},
1166 {"no-cp932ext", ""},
1167 {"no-best-fit-chars",""},
1169 #ifdef UNICODE_NORMALIZATION
1170 {"utf8mac-input", ""},
1182 #ifdef NUMCHAR_OPTION
1183 {"numchar-input", ""},
1189 #ifdef SHIFTJIS_CP932
1200 set_input_encoding(nkf_encoding *enc)
1202 switch (nkf_enc_to_index(enc)) {
1209 #ifdef SHIFTJIS_CP932
1212 #ifdef UTF8_OUTPUT_ENABLE
1213 ms_ucs_map_f = UCS_MAP_CP932;
1223 case ISO_2022_JP_2004:
1230 #ifdef SHIFTJIS_CP932
1233 #ifdef UTF8_OUTPUT_ENABLE
1234 ms_ucs_map_f = UCS_MAP_CP932;
1239 #ifdef SHIFTJIS_CP932
1242 #ifdef UTF8_OUTPUT_ENABLE
1243 ms_ucs_map_f = UCS_MAP_CP10001;
1251 #ifdef SHIFTJIS_CP932
1254 #ifdef UTF8_OUTPUT_ENABLE
1255 ms_ucs_map_f = UCS_MAP_CP932;
1259 #ifdef SHIFTJIS_CP932
1262 #ifdef UTF8_OUTPUT_ENABLE
1263 ms_ucs_map_f = UCS_MAP_MS;
1267 #ifdef SHIFTJIS_CP932
1270 #ifdef UTF8_OUTPUT_ENABLE
1271 ms_ucs_map_f = UCS_MAP_ASCII;
1274 case SHIFT_JISX0213:
1275 case SHIFT_JIS_2004:
1277 #ifdef SHIFTJIS_CP932
1284 #ifdef SHIFTJIS_CP932
1288 #ifdef UTF8_INPUT_ENABLE
1289 #ifdef UNICODE_NORMALIZATION
1297 input_endian = ENDIAN_BIG;
1301 input_endian = ENDIAN_LITTLE;
1306 input_endian = ENDIAN_BIG;
1310 input_endian = ENDIAN_LITTLE;
1317 set_output_encoding(nkf_encoding *enc)
1319 switch (nkf_enc_to_index(enc)) {
1322 #ifdef SHIFTJIS_CP932
1323 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1325 #ifdef UTF8_OUTPUT_ENABLE
1326 ms_ucs_map_f = UCS_MAP_CP932;
1330 #ifdef SHIFTJIS_CP932
1331 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1333 #ifdef UTF8_OUTPUT_ENABLE
1334 ms_ucs_map_f = UCS_MAP_CP932;
1339 #ifdef SHIFTJIS_CP932
1340 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1346 #ifdef SHIFTJIS_CP932
1347 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1353 #ifdef UTF8_OUTPUT_ENABLE
1354 ms_ucs_map_f = UCS_MAP_CP932;
1358 #ifdef UTF8_OUTPUT_ENABLE
1359 ms_ucs_map_f = UCS_MAP_CP10001;
1364 #ifdef SHIFTJIS_CP932
1365 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1367 #ifdef UTF8_OUTPUT_ENABLE
1368 ms_ucs_map_f = UCS_MAP_ASCII;
1373 #ifdef SHIFTJIS_CP932
1374 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1376 #ifdef UTF8_OUTPUT_ENABLE
1377 ms_ucs_map_f = UCS_MAP_ASCII;
1381 #ifdef SHIFTJIS_CP932
1382 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1384 #ifdef UTF8_OUTPUT_ENABLE
1385 ms_ucs_map_f = UCS_MAP_CP932;
1390 #ifdef UTF8_OUTPUT_ENABLE
1391 ms_ucs_map_f = UCS_MAP_MS;
1396 #ifdef UTF8_OUTPUT_ENABLE
1397 ms_ucs_map_f = UCS_MAP_ASCII;
1400 case SHIFT_JISX0213:
1401 case SHIFT_JIS_2004:
1403 #ifdef SHIFTJIS_CP932
1404 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1411 #ifdef SHIFTJIS_CP932
1412 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1415 #ifdef UTF8_OUTPUT_ENABLE
1417 output_bom_f = TRUE;
1421 output_bom_f = TRUE;
1424 output_endian = ENDIAN_LITTLE;
1425 output_bom_f = FALSE;
1428 output_endian = ENDIAN_LITTLE;
1429 output_bom_f = TRUE;
1432 output_bom_f = TRUE;
1435 output_endian = ENDIAN_LITTLE;
1436 output_bom_f = FALSE;
1439 output_endian = ENDIAN_LITTLE;
1440 output_bom_f = TRUE;
1446 static struct input_code*
1447 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1450 struct input_code *p = input_code_list;
1452 if (iconv_func == p->iconv_func){
1462 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1464 #ifdef INPUT_CODE_FIX
1465 if (f || !input_encoding)
1472 #ifdef INPUT_CODE_FIX
1473 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1479 if (estab_f && iconv_for_check != iconv){
1480 struct input_code *p = find_inputcode_byfunc(iconv);
1482 set_input_codename(p->name);
1485 iconv_for_check = iconv;
1492 x0212_shift(nkf_char c)
1497 if (0x75 <= c && c <= 0x7f){
1498 ret = c + (0x109 - 0x75);
1501 if (0x75 <= c && c <= 0x7f){
1502 ret = c + (0x113 - 0x75);
1510 x0212_unshift(nkf_char c)
1513 if (0x7f <= c && c <= 0x88){
1514 ret = c + (0x75 - 0x7f);
1515 }else if (0x89 <= c && c <= 0x92){
1516 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1520 #endif /* X0212_ENABLE */
1523 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1529 if((0x21 <= ndx && ndx <= 0x2F)){
1530 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1531 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1533 }else if(0x6E <= ndx && ndx <= 0x7E){
1534 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1535 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1541 else if(nkf_isgraph(ndx)){
1543 const unsigned short *ptr;
1544 ptr = x0212_shiftjis[ndx - 0x21];
1546 val = ptr[(c1 & 0x7f) - 0x21];
1555 c2 = x0212_shift(c2);
1557 #endif /* X0212_ENABLE */
1559 if(0x7F < c2) return 1;
1560 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1561 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1566 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1568 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1571 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1572 if (0xFC < c1) return 1;
1573 #ifdef SHIFTJIS_CP932
1574 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1575 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1582 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1583 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1589 #endif /* SHIFTJIS_CP932 */
1591 if (!x0213_f && is_ibmext_in_sjis(c2)){
1592 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1595 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1608 if(x0213_f && c2 >= 0xF0){
1609 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1610 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1611 }else{ /* 78<=k<=94 */
1612 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1613 if (0x9E < c1) c2++;
1616 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1617 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1618 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1619 if (0x9E < c1) c2++;
1622 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1629 c2 = x0212_unshift(c2);
1636 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * Encode the code point `val` as UTF-8, storing one byte value through each
 * of p1..p4.  Branches visible here: two bytes for val < 0x800, three bytes
 * when nkf_char_unicode_bmp_p(val) holds, four bytes when
 * nkf_char_unicode_value_p(val) holds.
 */
1638 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1646 }else if (val < 0x800){
/* two bytes: 110xxxxx 10xxxxxx */
1647 *p1 = 0xc0 | (val >> 6);
1648 *p2 = 0x80 | (val & 0x3f);
1651 } else if (nkf_char_unicode_bmp_p(val)) {
/* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
1652 *p1 = 0xe0 | (val >> 12);
1653 *p2 = 0x80 | ((val >> 6) & 0x3f);
1654 *p3 = 0x80 | ( val & 0x3f);
1656 } else if (nkf_char_unicode_value_p(val)) {
/*
 * four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.  The lead byte carries
 * bits 18..20, because the three trail bytes below already consume bits
 * 12..17, 6..11 and 0..5.  (Was 0xe0 | (val >> 16): a three-byte lead marker
 * with bits that overlap the first trail byte.)
 */
1657 *p1 = 0xf0 | (val >> 18);
1658 *p2 = 0x80 | ((val >> 12) & 0x3f);
1659 *p3 = 0x80 | ((val >> 6) & 0x3f);
1660 *p4 = 0x80 | ( val & 0x3f);
/*
 * Decode a UTF-8 sequence, passed as separate bytes c1 (lead) and c2..c4
 * (trail bytes), into the code point accumulated in wc.  The sequence length
 * is chosen from the lead byte c1 alone.
 */
1670 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/*
 * 0x80..0xBF are trail bytes and 0xC0/0xC1 can only start overlong forms.
 * 0xC2 and 0xC3 are valid two-byte leads (U+0080..U+00FF) and must fall
 * through to the next branch (the bound used to be 0xC3).
 */
1677 else if (c1 <= 0xC1) {
1678 /* trail byte or invalid */
1681 else if (c1 <= 0xDF) {
/* two bytes: 5 payload bits in the lead */
1683 wc = (c1 & 0x1F) << 6;
1686 else if (c1 <= 0xEF) {
/* three bytes: 4 payload bits in the lead */
1688 wc = (c1 & 0x0F) << 12;
1689 wc |= (c2 & 0x3F) << 6;
/* four bytes: test the LEAD byte; the old test on c2 looked at a trail byte */
1692 else if (c1 <= 0xF4) {
/* for c1 in 0xF0..0xF4 the mask 0x0F keeps the same bits as 0x07 */
1694 wc = (c1 & 0x0F) << 18;
1695 wc |= (c2 & 0x3F) << 12;
1696 wc |= (c3 & 0x3F) << 6;
1706 #ifdef UTF8_INPUT_ENABLE
1708 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1709 const unsigned short *const *pp, nkf_char psize,
1710 nkf_char *p2, nkf_char *p1)
1713 const unsigned short *p;
1716 if (pp == 0) return 1;
1719 if (c1 < 0 || psize <= c1) return 1;
1721 if (p == 0) return 1;
1724 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1726 if (val == 0) return 1;
1727 if (no_cp932ext_f && (
1728 (val>>8) == 0x2D || /* NEC special characters */
1729 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1737 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1745 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1747 const unsigned short *const *pp;
1748 const unsigned short *const *const *ppp;
1749 static const char no_best_fit_chars_table_C2[] =
1750 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1751 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1752 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1753 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1754 static const char no_best_fit_chars_table_C2_ms[] =
1755 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1756 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1757 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1758 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1759 static const char no_best_fit_chars_table_932_C2[] =
1760 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1761 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1762 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1763 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1764 static const char no_best_fit_chars_table_932_C3[] =
1765 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1766 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1767 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1768 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1774 }else if(c2 < 0xe0){
1775 if(no_best_fit_chars_f){
1776 if(ms_ucs_map_f == UCS_MAP_CP932){
1779 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1782 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1785 }else if(!cp932inv_f){
1788 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1791 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1794 }else if(ms_ucs_map_f == UCS_MAP_MS){
1795 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1796 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1814 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1815 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1816 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1818 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1819 }else if(c0 < 0xF0){
1820 if(no_best_fit_chars_f){
1821 if(ms_ucs_map_f == UCS_MAP_CP932){
1822 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1823 }else if(ms_ucs_map_f == UCS_MAP_MS){
1828 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1831 if(c0 == 0x92) return 1;
1836 if(c1 == 0x80 || c0 == 0x9C) return 1;
1839 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1844 if(c0 == 0x94) return 1;
1847 if(c0 == 0xBB) return 1;
1857 if(c0 == 0x95) return 1;
1860 if(c0 == 0xA5) return 1;
1867 if(c0 == 0x8D) return 1;
1870 if(c0 == 0x9E && !cp932inv_f) return 1;
1873 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1881 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1882 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1883 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1885 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1887 #ifdef SHIFTJIS_CP932
1888 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1890 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1891 s2e_conv(s2, s1, p2, p1);
1900 #ifdef UTF8_OUTPUT_ENABLE
1902 e2w_conv(nkf_char c2, nkf_char c1)
1904 const unsigned short *p;
1906 if (c2 == JIS_X_0201_1976_K) {
1907 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1915 p = euc_to_utf8_1byte;
1917 } else if (is_eucg3(c2)){
1918 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1921 c2 = (c2&0x7f) - 0x21;
1922 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1923 p = x0212_to_utf8_2bytes[c2];
1929 c2 = (c2&0x7f) - 0x21;
1930 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1932 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1933 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1934 euc_to_utf8_2bytes_ms[c2];
1939 c1 = (c1 & 0x7f) - 0x21;
1940 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
1947 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1954 }else if (0xc0 <= c2 && c2 <= 0xef) {
1955 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1956 #ifdef NUMCHAR_OPTION
1959 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1967 #ifdef UTF8_INPUT_ENABLE
1969 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1971 nkf_char c1, c2, c3, c4;
1978 else if (nkf_char_unicode_bmp_p(val)){
1979 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1980 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1983 *p1 = nkf_char_unicode_new(val);
1989 *p1 = nkf_char_unicode_new(val);
1996 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1998 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1999 if (iso2022jp_f && !x0201_f) {
2000 c2 = GETA1; c1 = GETA2;
2002 c2 = JIS_X_0201_1976_K;
2006 }else if (c2 == 0x8f){
2010 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2011 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2012 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2015 c2 = (c2 << 8) | (c1 & 0x7f);
2017 #ifdef SHIFTJIS_CP932
2020 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2021 s2e_conv(s2, s1, &c2, &c1);
2028 #endif /* SHIFTJIS_CP932 */
2030 #endif /* X0212_ENABLE */
2031 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2034 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2035 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2036 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2041 #ifdef SHIFTJIS_CP932
2042 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2044 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2045 s2e_conv(s2, s1, &c2, &c1);
2052 #endif /* SHIFTJIS_CP932 */
2060 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2062 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2063 if (iso2022jp_f && !x0201_f) {
2064 c2 = GETA1; c1 = GETA2;
2068 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2070 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2072 if(c1 == 0x7F) return 0;
2073 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2076 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2077 if (ret) return ret;
2084 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2086 nkf_char ret = 0, c4 = 0;
2087 static const char w_iconv_utf8_1st_byte[] =
2089 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2090 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2091 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2092 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2099 if (c1 < 0 || 0xff < c1) {
2100 }else if (c1 == 0) { /* 0 : 1 byte*/
2102 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2105 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2107 if (c2 < 0x80 || 0xBF < c2) return 0;
2110 if (c3 == 0) return -1;
2111 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2116 if (c3 == 0) return -1;
2117 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2121 if (c3 == 0) return -1;
2122 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2126 if (c3 == 0) return -2;
2127 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2131 if (c3 == 0) return -2;
2132 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2136 if (c3 == 0) return -2;
2137 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2145 if (c1 == 0 || c1 == EOF){
2146 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2147 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2150 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/* Status returned by unicode_iconv() for values it refuses to convert. */
2158 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * Dispatch one decoded code point `wc` by range: surrogates are rejected,
 * smaller values go through w16e_conv(), larger ones are wrapped with
 * nkf_char_unicode_new(), and anything else is reported as out of range.
 */
2160 unicode_iconv(nkf_char wc)
/* (wc >> 11) == 27 holds exactly for 0xD800..0xDFFF (27 << 11 == 0xD800) */
2168 }else if ((wc>>11) == 27) {
2169 /* a surrogate code unit reached here without having been paired */
2170 return NKF_ICONV_INVALID_CODE_RANGE;
/* NOTE(review): strict '<' sends U+FFFF itself to the next branch - confirm intended */
2171 }else if (wc < 0xFFFF) {
2172 ret = w16e_conv(wc, &c2, &c1);
/* a non-zero status from w16e_conv is passed straight back to the caller */
2173 if (ret) return ret;
/* NOTE(review): strict '<' leaves U+10FFFF to the return below, presumably rejecting it - confirm */
2174 }else if (wc < 0x10FFFF) {
2176 c1 = nkf_char_unicode_new(wc);
2178 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * Status values a converter returns to ask for more input; for example
 * nkf_iconv_utf_16() returns NKF_ICONV_NEED_TWO_MORE_BYTES when it has a
 * lead surrogate but the following unit is not a trail surrogate.
 */
2184 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2185 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
/*
 * Combine a UTF-16 surrogate pair into one code point:
 *   ((lead - 0xD800) << 10) + (trail - 0xDC00) + 0x10000
 * folded into a single subtraction, since
 *   (0xD800 << 10) + 0xDC00 - 0x10000 == 0x35FDC00.
 */
2186 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
2188 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2197 if (input_endian == ENDIAN_BIG) {
2198 if (0xD8 <= c1 && c1 <= 0xDB) {
2199 if (0xDC <= c3 && c3 <= 0xDF) {
2200 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2201 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2206 if (0xD8 <= c2 && c2 <= 0xDB) {
2207 if (0xDC <= c4 && c4 <= 0xDF) {
2208 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2209 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2215 return (*unicode_iconv)(wc);
2219 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2225 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
2231 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2240 switch(input_endian){
2242 wc = c2 << 16 | c3 << 8 | c4;
2245 wc = c3 << 16 | c2 << 8 | c1;
2248 wc = c1 << 16 | c4 << 8 | c3;
2251 wc = c4 << 16 | c1 << 8 | c2;
2254 return NKF_ICONV_INVALID_CODE_RANGE;
2257 return (*unicode_iconv)(wc);
2261 #define output_ascii_escape_sequence(mode) do { \
2262 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2265 (*o_putc)(ascii_intro); \
2266 output_mode = mode; \
2271 output_escape_sequence(int mode)
2273 if (output_mode == mode)
2281 case JIS_X_0201_1976_K:
2289 (*o_putc)(kanji_intro);
2314 j_oconv(nkf_char c2, nkf_char c1)
2316 #ifdef NUMCHAR_OPTION
2317 if (c2 == 0 && nkf_char_unicode_p(c1)){
2318 w16e_conv(c1, &c2, &c1);
2319 if (c2 == 0 && nkf_char_unicode_p(c1)){
2320 c2 = c1 & VALUE_MASK;
2321 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2324 c2 = 0x7F + c1 / 94;
2325 c1 = 0x21 + c1 % 94;
2327 if (encode_fallback) (*encode_fallback)(c1);
2334 output_ascii_escape_sequence(ASCII);
2337 else if (c2 == EOF) {
2338 output_ascii_escape_sequence(ASCII);
2341 else if (c2 == ISO_8859_1) {
2342 output_ascii_escape_sequence(ISO_8859_1);
2345 else if (c2 == JIS_X_0201_1976_K) {
2346 output_escape_sequence(JIS_X_0201_1976_K);
2349 } else if (is_eucg3(c2)){
2350 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2351 (*o_putc)(c2 & 0x7f);
2356 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2357 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2358 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2365 e_oconv(nkf_char c2, nkf_char c1)
2367 if (c2 == 0 && nkf_char_unicode_p(c1)){
2368 w16e_conv(c1, &c2, &c1);
2369 if (c2 == 0 && nkf_char_unicode_p(c1)){
2370 c2 = c1 & VALUE_MASK;
2371 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2375 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2376 c1 = 0x21 + c1 % 94;
2379 (*o_putc)((c2 & 0x7f) | 0x080);
2380 (*o_putc)(c1 | 0x080);
2382 (*o_putc)((c2 & 0x7f) | 0x080);
2383 (*o_putc)(c1 | 0x080);
2387 if (encode_fallback) (*encode_fallback)(c1);
2395 } else if (c2 == 0) {
2396 output_mode = ASCII;
2398 } else if (c2 == JIS_X_0201_1976_K) {
2399 output_mode = EUC_JP;
2400 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2401 } else if (c2 == ISO_8859_1) {
2402 output_mode = ISO_8859_1;
2403 (*o_putc)(c1 | 0x080);
2405 } else if (is_eucg3(c2)){
2406 output_mode = EUC_JP;
2407 #ifdef SHIFTJIS_CP932
2410 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2411 s2e_conv(s2, s1, &c2, &c1);
2416 output_mode = ASCII;
2418 }else if (is_eucg3(c2)){
2421 (*o_putc)((c2 & 0x7f) | 0x080);
2422 (*o_putc)(c1 | 0x080);
2425 (*o_putc)((c2 & 0x7f) | 0x080);
2426 (*o_putc)(c1 | 0x080);
2430 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2431 set_iconv(FALSE, 0);
2432 return; /* too late to rescue this char */
2434 output_mode = EUC_JP;
2435 (*o_putc)(c2 | 0x080);
2436 (*o_putc)(c1 | 0x080);
2441 s_oconv(nkf_char c2, nkf_char c1)
2443 #ifdef NUMCHAR_OPTION
2444 if (c2 == 0 && nkf_char_unicode_p(c1)){
2445 w16e_conv(c1, &c2, &c1);
2446 if (c2 == 0 && nkf_char_unicode_p(c1)){
2447 c2 = c1 & VALUE_MASK;
2448 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2451 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2453 c1 += 0x40 + (c1 > 0x3e);
2458 if(encode_fallback)(*encode_fallback)(c1);
2467 } else if (c2 == 0) {
2468 output_mode = ASCII;
2470 } else if (c2 == JIS_X_0201_1976_K) {
2471 output_mode = SHIFT_JIS;
2473 } else if (c2 == ISO_8859_1) {
2474 output_mode = ISO_8859_1;
2475 (*o_putc)(c1 | 0x080);
2477 } else if (is_eucg3(c2)){
2478 output_mode = SHIFT_JIS;
2479 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2485 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2486 set_iconv(FALSE, 0);
2487 return; /* too late to rescue this char */
2489 output_mode = SHIFT_JIS;
2490 e2s_conv(c2, c1, &c2, &c1);
2492 #ifdef SHIFTJIS_CP932
2494 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2495 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2501 #endif /* SHIFTJIS_CP932 */
2504 if (prefix_table[(unsigned char)c1]){
2505 (*o_putc)(prefix_table[(unsigned char)c1]);
2511 #ifdef UTF8_OUTPUT_ENABLE
2513 w_oconv(nkf_char c2, nkf_char c1)
2519 output_bom_f = FALSE;
2530 if (c2 == 0 && nkf_char_unicode_p(c1)){
2531 val = c1 & VALUE_MASK;
2532 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2534 if (c2) (*o_putc)(c2);
2535 if (c3) (*o_putc)(c3);
2536 if (c4) (*o_putc)(c4);
2543 val = e2w_conv(c2, c1);
2545 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2547 if (c2) (*o_putc)(c2);
2548 if (c3) (*o_putc)(c3);
2549 if (c4) (*o_putc)(c4);
/*
 * UTF-16 output stage.  Emits bytes through (*o_putc) in the order selected
 * by output_endian.  A wrapped Unicode value (c2 == 0 and
 * nkf_char_unicode_p(c1)) is written directly, as one unit if it is in the
 * BMP or as a surrogate pair otherwise; other input is first mapped with
 * e2w_conv().
 */
2555 w_oconv16(nkf_char c2, nkf_char c1)
/* clearing the flag here suggests the BOM is handled only once - TODO confirm against the missing lines */
2558 output_bom_f = FALSE;
2559 if (output_endian == ENDIAN_LITTLE){
2573 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2574 if (nkf_char_unicode_bmp_p(c1)) {
/* BMP: split the 16-bit value into high byte (c2) and, presumably, low byte (c1) */
2575 c2 = (c1 >> 8) & 0xff;
2579 if (c1 <= UNICODE_MAX) {
/*
 * Surrogate pair.  0xD7C0 == 0xD800 - (0x10000 >> 10), so adding it to
 * (c1 >> 10) subtracts the 0x10000 offset and adds the 0xD800 base in one
 * step.  The low surrogate takes the bottom 10 bits on top of 0xDC00.
 */
2580 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2581 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2582 if (output_endian == ENDIAN_LITTLE){
/* little endian: low byte first, high surrogate before low surrogate */
2583 (*o_putc)(c2 & 0xff);
2584 (*o_putc)((c2 >> 8) & 0xff);
2585 (*o_putc)(c1 & 0xff);
2586 (*o_putc)((c1 >> 8) & 0xff);
/* otherwise: high byte first */
2588 (*o_putc)((c2 >> 8) & 0xff);
2589 (*o_putc)(c2 & 0xff);
2590 (*o_putc)((c1 >> 8) & 0xff);
2591 (*o_putc)(c1 & 0xff);
/* not a wrapped Unicode value: map (c2, c1) with e2w_conv() first */
2597 nkf_char val = e2w_conv(c2, c1);
2598 c2 = (val >> 8) & 0xff;
2603 if (output_endian == ENDIAN_LITTLE){
2613 w_oconv32(nkf_char c2, nkf_char c1)
2616 output_bom_f = FALSE;
2617 if (output_endian == ENDIAN_LITTLE){
2635 if (c2 == ISO_8859_1) {
2637 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2640 c1 = e2w_conv(c2, c1);
2643 if (output_endian == ENDIAN_LITTLE){
2644 (*o_putc)( c1 & 0xFF);
2645 (*o_putc)((c1 >> 8) & 0xFF);
2646 (*o_putc)((c1 >> 16) & 0xFF);
2650 (*o_putc)((c1 >> 16) & 0xFF);
2651 (*o_putc)((c1 >> 8) & 0xFF);
2652 (*o_putc)( c1 & 0xFF);
2657 #define SCORE_L2 (1) /* Kanji Level 2 */
2658 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2659 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2660 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2661 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2662 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */
2663 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2664 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
2666 #define SCORE_INIT (SCORE_iMIME)
2668 static const nkf_char score_table_A0[] = {
2671 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2672 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2675 static const nkf_char score_table_F0[] = {
2676 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2677 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2678 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2679 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2683 set_code_score(struct input_code *ptr, nkf_char score)
2686 ptr->score |= score;
2691 clr_code_score(struct input_code *ptr, nkf_char score)
2694 ptr->score &= ~score;
2699 code_score(struct input_code *ptr)
2701 nkf_char c2 = ptr->buf[0];
2702 #ifdef UTF8_OUTPUT_ENABLE
2703 nkf_char c1 = ptr->buf[1];
2706 set_code_score(ptr, SCORE_ERROR);
2707 }else if (c2 == SS2){
2708 set_code_score(ptr, SCORE_KANA);
2709 }else if (c2 == 0x8f){
2710 set_code_score(ptr, SCORE_X0212);
2711 #ifdef UTF8_OUTPUT_ENABLE
2712 }else if (!e2w_conv(c2, c1)){
2713 set_code_score(ptr, SCORE_NO_EXIST);
2715 }else if ((c2 & 0x70) == 0x20){
2716 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2717 }else if ((c2 & 0x70) == 0x70){
2718 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2719 }else if ((c2 & 0x70) >= 0x50){
2720 set_code_score(ptr, SCORE_L2);
2725 status_disable(struct input_code *ptr)
2730 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2734 status_push_ch(struct input_code *ptr, nkf_char c)
2736 ptr->buf[ptr->index++] = c;
2740 status_clear(struct input_code *ptr)
2747 status_reset(struct input_code *ptr)
2750 ptr->score = SCORE_INIT;
2754 status_reinit(struct input_code *ptr)
2757 ptr->_file_stat = 0;
2761 status_check(struct input_code *ptr, nkf_char c)
2763 if (c <= DEL && estab_f){
2769 s_status(struct input_code *ptr, nkf_char c)
2773 status_check(ptr, c);
2778 }else if (nkf_char_unicode_p(c)){
2780 }else if (0xa1 <= c && c <= 0xdf){
2781 status_push_ch(ptr, SS2);
2782 status_push_ch(ptr, c);
2785 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2787 status_push_ch(ptr, c);
2788 }else if (0xed <= c && c <= 0xee){
2790 status_push_ch(ptr, c);
2791 #ifdef SHIFTJIS_CP932
2792 }else if (is_ibmext_in_sjis(c)){
2794 status_push_ch(ptr, c);
2795 #endif /* SHIFTJIS_CP932 */
2797 }else if (0xf0 <= c && c <= 0xfc){
2799 status_push_ch(ptr, c);
2800 #endif /* X0212_ENABLE */
2802 status_disable(ptr);
2806 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2807 status_push_ch(ptr, c);
2808 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2812 status_disable(ptr);
2816 #ifdef SHIFTJIS_CP932
2817 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2818 status_push_ch(ptr, c);
2819 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2820 set_code_score(ptr, SCORE_CP932);
2825 #endif /* SHIFTJIS_CP932 */
2826 status_disable(ptr);
2829 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2830 status_push_ch(ptr, c);
2831 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2832 set_code_score(ptr, SCORE_CP932);
2835 status_disable(ptr);
2842 e_status(struct input_code *ptr, nkf_char c)
2846 status_check(ptr, c);
2851 }else if (nkf_char_unicode_p(c)){
2853 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2855 status_push_ch(ptr, c);
2857 }else if (0x8f == c){
2859 status_push_ch(ptr, c);
2860 #endif /* X0212_ENABLE */
2862 status_disable(ptr);
2866 if (0xa1 <= c && c <= 0xfe){
2867 status_push_ch(ptr, c);
2871 status_disable(ptr);
2876 if (0xa1 <= c && c <= 0xfe){
2878 status_push_ch(ptr, c);
2880 status_disable(ptr);
2882 #endif /* X0212_ENABLE */
2886 #ifdef UTF8_INPUT_ENABLE
2888 w_status(struct input_code *ptr, nkf_char c)
2892 status_check(ptr, c);
2897 }else if (nkf_char_unicode_p(c)){
2899 }else if (0xc0 <= c && c <= 0xdf){
2901 status_push_ch(ptr, c);
2902 }else if (0xe0 <= c && c <= 0xef){
2904 status_push_ch(ptr, c);
2905 }else if (0xf0 <= c && c <= 0xf4){
2907 status_push_ch(ptr, c);
2909 status_disable(ptr);
2914 if (0x80 <= c && c <= 0xbf){
2915 status_push_ch(ptr, c);
2916 if (ptr->index > ptr->stat){
2917 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2918 && ptr->buf[2] == 0xbf);
2919 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2920 &ptr->buf[0], &ptr->buf[1]);
2927 status_disable(ptr);
2931 if (0x80 <= c && c <= 0xbf){
2932 if (ptr->index < ptr->stat){
2933 status_push_ch(ptr, c);
2938 status_disable(ptr);
2946 code_status(nkf_char c)
2948 int action_flag = 1;
2949 struct input_code *result = 0;
2950 struct input_code *p = input_code_list;
2952 if (!p->status_func) {
2956 if (!p->status_func)
2958 (p->status_func)(p, c);
2961 }else if(p->stat == 0){
2972 if (result && !estab_f){
2973 set_iconv(TRUE, result->iconv_func);
2974 }else if (c <= DEL){
2975 struct input_code *ptr = input_code_list;
2985 nkf_buf_t *std_gc_buf;
2986 nkf_char broken_state;
2987 nkf_buf_t *broken_buf;
2988 nkf_char mimeout_state;
2992 static nkf_state_t *nkf_state = NULL;
2994 #define STD_GC_BUFSIZE (256)
2997 nkf_state_init(void)
3000 nkf_buf_clear(nkf_state->std_gc_buf);
3001 nkf_buf_clear(nkf_state->broken_buf);
3002 nkf_buf_clear(nkf_state->nfc_buf);
3005 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3006 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3007 nkf_state->broken_buf = nkf_buf_new(3);
3008 nkf_state->nfc_buf = nkf_buf_new(9);
3010 nkf_state->broken_state = 0;
3011 nkf_state->mimeout_state = 0;
3018 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3019 return nkf_buf_pop(nkf_state->std_gc_buf);
3026 std_ungetc(nkf_char c, FILE *f)
3028 nkf_buf_push(nkf_state->std_gc_buf, c);
3034 std_putc(nkf_char c)
3041 static unsigned char hold_buf[HOLD_SIZE*2];
3042 static int hold_count = 0;
3044 push_hold_buf(nkf_char c2)
3046 if (hold_count >= HOLD_SIZE*2)
3048 hold_buf[hold_count++] = (unsigned char)c2;
3049 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3053 h_conv(FILE *f, int c1, int c2)
3059 /** it must NOT be in the kanji shift sequence */
3060 /** it must NOT be written in JIS7 */
3061 /** and it must be after 2 byte 8bit code */
3067 while ((c2 = (*i_getc)(f)) != EOF) {
3073 if (push_hold_buf(c2) == EOF || estab_f) {
3079 struct input_code *p = input_code_list;
3080 struct input_code *result = p;
3085 if (p->status_func && p->score < result->score) {
3090 set_iconv(TRUE, result->iconv_func);
3095 ** 1) EOF is detected, or
3096 ** 2) Code is established, or
3097 ** 3) Buffer is FULL (but last word is pushed)
3099 ** in 1) and 3) cases, we continue to use
3100 ** Kanji codes by oconv and leave estab_f unchanged.
3105 while (hold_index < hold_count){
3106 c1 = hold_buf[hold_index++];
3110 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3111 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3114 if (hold_index < hold_count){
3115 c2 = hold_buf[hold_index++];
3125 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3128 if (hold_index < hold_count){
3129 c3 = hold_buf[hold_index++];
3130 } else if ((c3 = (*i_getc)(f)) == EOF) {
3135 if (hold_index < hold_count){
3136 c4 = hold_buf[hold_index++];
3137 } else if ((c4 = (*i_getc)(f)) == EOF) {
3142 (*iconv)(c1, c2, (c3<<8)|c4);
3147 /* 3 bytes EUC or UTF-8 */
3148 if (hold_index < hold_count){
3149 c3 = hold_buf[hold_index++];
3150 } else if ((c3 = (*i_getc)(f)) == EOF) {
3156 (*iconv)(c1, c2, c3);
3159 if (c3 == EOF) break;
3165 * Check and Ignore BOM
3171 switch(c2 = (*i_getc)(f)){
3173 if((c2 = (*i_getc)(f)) == 0x00){
3174 if((c2 = (*i_getc)(f)) == 0xFE){
3175 if((c2 = (*i_getc)(f)) == 0xFF){
3176 if(!input_encoding){
3177 set_iconv(TRUE, w_iconv32);
3179 if (iconv == w_iconv32) {
3180 input_endian = ENDIAN_BIG;
3183 (*i_ungetc)(0xFF,f);
3184 }else (*i_ungetc)(c2,f);
3185 (*i_ungetc)(0xFE,f);
3186 }else if(c2 == 0xFF){
3187 if((c2 = (*i_getc)(f)) == 0xFE){
3188 if(!input_encoding){
3189 set_iconv(TRUE, w_iconv32);
3191 if (iconv == w_iconv32) {
3192 input_endian = ENDIAN_2143;
3195 (*i_ungetc)(0xFF,f);
3196 }else (*i_ungetc)(c2,f);
3197 (*i_ungetc)(0xFF,f);
3198 }else (*i_ungetc)(c2,f);
3199 (*i_ungetc)(0x00,f);
3200 }else (*i_ungetc)(c2,f);
3201 (*i_ungetc)(0x00,f);
3204 if((c2 = (*i_getc)(f)) == 0xBB){
3205 if((c2 = (*i_getc)(f)) == 0xBF){
3206 if(!input_encoding){
3207 set_iconv(TRUE, w_iconv);
3209 if (iconv == w_iconv) {
3212 (*i_ungetc)(0xBF,f);
3213 }else (*i_ungetc)(c2,f);
3214 (*i_ungetc)(0xBB,f);
3215 }else (*i_ungetc)(c2,f);
3216 (*i_ungetc)(0xEF,f);
3219 if((c2 = (*i_getc)(f)) == 0xFF){
3220 if((c2 = (*i_getc)(f)) == 0x00){
3221 if((c2 = (*i_getc)(f)) == 0x00){
3222 if(!input_encoding){
3223 set_iconv(TRUE, w_iconv32);
3225 if (iconv == w_iconv32) {
3226 input_endian = ENDIAN_3412;
3229 (*i_ungetc)(0x00,f);
3230 }else (*i_ungetc)(c2,f);
3231 (*i_ungetc)(0x00,f);
3232 }else (*i_ungetc)(c2,f);
3233 if(!input_encoding){
3234 set_iconv(TRUE, w_iconv16);
3236 if (iconv == w_iconv16) {
3237 input_endian = ENDIAN_BIG;
3240 (*i_ungetc)(0xFF,f);
3241 }else (*i_ungetc)(c2,f);
3242 (*i_ungetc)(0xFE,f);
3245 if((c2 = (*i_getc)(f)) == 0xFE){
3246 if((c2 = (*i_getc)(f)) == 0x00){
3247 if((c2 = (*i_getc)(f)) == 0x00){
3248 if(!input_encoding){
3249 set_iconv(TRUE, w_iconv32);
3251 if (iconv == w_iconv32) {
3252 input_endian = ENDIAN_LITTLE;
3255 (*i_ungetc)(0x00,f);
3256 }else (*i_ungetc)(c2,f);
3257 (*i_ungetc)(0x00,f);
3258 }else (*i_ungetc)(c2,f);
3259 if(!input_encoding){
3260 set_iconv(TRUE, w_iconv16);
3262 if (iconv == w_iconv16) {
3263 input_endian = ENDIAN_LITTLE;
3266 (*i_ungetc)(0xFE,f);
3267 }else (*i_ungetc)(c2,f);
3268 (*i_ungetc)(0xFF,f);
3277 broken_getc(FILE *f)
3281 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3282 return nkf_buf_pop(nkf_state->broken_buf);
3285 if (c=='$' && nkf_state->broken_state != ESC
3286 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3288 nkf_state->broken_state = 0;
3289 if (c1=='@'|| c1=='B') {
3290 nkf_buf_push(nkf_state->broken_buf, c1);
3291 nkf_buf_push(nkf_state->broken_buf, c);
3297 } else if (c=='(' && nkf_state->broken_state != ESC
3298 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3300 nkf_state->broken_state = 0;
3301 if (c1=='J'|| c1=='B') {
3302 nkf_buf_push(nkf_state->broken_buf, c1);
3303 nkf_buf_push(nkf_state->broken_buf, c);
3310 nkf_state->broken_state = c;
3316 broken_ungetc(nkf_char c, FILE *f)
3318 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3319 nkf_buf_push(nkf_state->broken_buf, c);
3324 eol_conv(nkf_char c2, nkf_char c1)
3326 if (guess_f && input_eol != EOF) {
3327 if (c2 == 0 && c1 == LF) {
3328 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3329 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3330 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3332 else if (!input_eol) input_eol = CR;
3333 else if (input_eol != CR) input_eol = EOF;
3335 if (prev_cr || (c2 == 0 && c1 == LF)) {
3337 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3338 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3340 if (c2 == 0 && c1 == CR) prev_cr = CR;
3341 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3345 Return value of fold_conv()
3347 LF add newline and output char
3348 CR add newline and output nothing
3351 1 (or else) normal output
3353 fold state in prev (previous character)
3355 >0x80 Japanese (X0208/X0201)
3360 This fold algorithm does not preserve leading spaces in a line.
3361 This is the main difference from fmt.
3364 #define char_size(c2,c1) (c2?2:1)
3367 fold_conv(nkf_char c2, nkf_char c1)
3370 nkf_char fold_state;
3372 if (c1== CR && !fold_preserve_f) {
3373 fold_state=0; /* ignore cr */
3374 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3376 fold_state=0; /* ignore cr */
3377 } else if (c1== BS) {
3378 if (f_line>0) f_line--;
3380 } else if (c2==EOF && f_line != 0) { /* close open last line */
3382 } else if ((c1==LF && !fold_preserve_f)
3383 || ((c1==CR||(c1==LF&&f_prev!=CR))
3384 && fold_preserve_f)) {
3386 if (fold_preserve_f) {
3390 } else if ((f_prev == c1 && !fold_preserve_f)
3391 || (f_prev == LF && fold_preserve_f)
3392 ) { /* duplicate newline */
3395 fold_state = LF; /* output two newline */
3401 if (f_prev&0x80) { /* Japanese? */
3403 fold_state = 0; /* ignore given single newline */
3404 } else if (f_prev==SP) {
3408 if (++f_line<=fold_len)
3412 fold_state = CR; /* fold and output nothing */
3416 } else if (c1=='\f') {
3419 fold_state = LF; /* output newline and clear */
3420 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3421 /* X0208 kankaku or ascii space */
3423 fold_state = 0; /* remove duplicate spaces */
3426 if (++f_line<=fold_len)
3427 fold_state = SP; /* output ASCII space only */
3429 f_prev = SP; f_line = 0;
3430 fold_state = CR; /* fold and output nothing */
3434 prev0 = f_prev; /* we still need this one... , but almost done */
3436 if (c2 || c2 == JIS_X_0201_1976_K)
3437 f_prev |= 0x80; /* this is Japanese */
3438 f_line += char_size(c2,c1);
3439 if (f_line<=fold_len) { /* normal case */
3442 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3443 f_line = char_size(c2,c1);
3444 fold_state = LF; /* We can't wait, do fold now */
3445 } else if (c2 == JIS_X_0201_1976_K) {
3446 /* simple kinsoku rules return 1 means no folding */
3447 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3448 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3449 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3450 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3451 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3452 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3453 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3455 fold_state = LF;/* add one new f_line before this character */
3458 fold_state = LF;/* add one new f_line before this character */
3461 /* kinsoku point in ASCII */
3462 if ( c1==')'|| /* { [ ( */
3473 /* just after special */
3474 } else if (!is_alnum(prev0)) {
3475 f_line = char_size(c2,c1);
3477 } else if ((prev0==SP) || /* ignored new f_line */
3478 (prev0==LF)|| /* ignored new f_line */
3479 (prev0&0x80)) { /* X0208 - ASCII */
3480 f_line = char_size(c2,c1);
3481 fold_state = LF;/* add one new f_line before this character */
3483 fold_state = 1; /* default no fold in ASCII */
3487 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3488 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3489 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3490 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3491 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3492 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3493 else if (c1=='\'') fold_state = 1; /*
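written as JIS X 0208 0x215C (fullwidth plus sign); c1 is 0x27 here, so 0x2127 (fullwidth colon) was probably meant -- verify */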
3494 else if (c1=='(') fold_state = 1; /*
fullwidth semicolon (JIS X 0208 0x2128) */
3495 else if (c1==')') fold_state = 1; /*
fullwidth question mark (JIS X 0208 0x2129) */
3496 else if (c1=='*') fold_state = 1; /*
fullwidth exclamation mark (JIS X 0208 0x212A) */
3497 else if (c1=='+') fold_state = 1; /*
voiced sound mark, dakuten (JIS X 0208 0x212B) */
3498 else if (c1==',') fold_state = 1; /*
semi-voiced sound mark, handakuten (JIS X 0208 0x212C) */
3499 /* default no fold in kinsoku */
3502 f_line = char_size(c2,c1);
3503 /* add one new f_line before this character */
3506 f_line = char_size(c2,c1);
3508 /* add one new f_line before this character */
3513 /* terminator process */
3514 switch(fold_state) {
3516 OCONV_NEWLINE((*o_fconv));
3522 OCONV_NEWLINE((*o_fconv));
/* State carried between z_conv calls.  z_prev2 is compared against
 * JIS_X_0201_1976_K and z_prev1 indexes the cv/dv/ev tables below; presumably
 * they hold a halfwidth katakana still waiting for a possible sound mark
 * (the assignments are not visible in this excerpt). */
3533 static nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv -- width conversion stage; every result is handed to o_zconv.
 *
 *  - JIS X 0201 katakana (c2 == JIS_X_0201_1976_K) is looked up in the pair
 *    tables cv (plain), dv (followed by dakuten, 0xDE) and ev (followed by
 *    handakuten, 0xDF); an entry occupies the two slots starting at
 *    (c1-SP)*2.
 *  - alpha_f&1 enables the JIS X 0208 alphabet handling (c2 == 0x23) and the
 *    fv lookup, alpha_f&4 a further variant whose body is not visible here,
 *    alpha_f&8 the replacement of the four HTML special characters by
 *    entities.
 *  - The final part maps JIS X 0208 katakana and punctuation to JIS X 0201.
 *
 * Fix: the entity strings had lost their escaping (each special character
 * mapped to itself, and the one for the double quote was not a valid string
 * literal).  They are the real HTML entities again.
 */
3536 z_conv(nkf_char c2, nkf_char c1)
3539 /* if (c2) c1 &= 0x7f; assertion */
3541 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
/* A halfwidth katakana is pending: see whether this character completes it. */
3547 if (z_prev2 == JIS_X_0201_1976_K) {
3548 if (c2 == JIS_X_0201_1976_K) {
3549 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
3551 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3553 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
3555 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
/* no combining mark followed: emit the plain form of the pending katakana */
3560 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3562 if (c2 == JIS_X_0201_1976_K) {
3563 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3564 /* wait for dakuten or handakuten */
3569 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3580 if (alpha_f&1 && c2 == 0x23) {
3581 /* JISX0208 Alphabet */
3583 } else if (c2 == 0x21) {
3584 /* JISX0208 Kigou (symbols) */
3589 } else if (alpha_f&4) {
3594 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
/* HTML escaping of single-byte characters; the entity is emitted byte by byte */
3600 if (alpha_f&8 && c2 == 0) {
3602 const char *entity = 0;
3604 case '>': entity = "&gt;"; break;
3605 case '<': entity = "&lt;"; break;
3606 case '\"': entity = "&quot;"; break;
3607 case '&': entity = "&amp;"; break;
3610 while (*entity) (*o_zconv)(0, *entity++);
3616 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3621 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3625 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3629 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3633 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3637 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3641 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3645 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3649 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3654 (*o_zconv)(JIS_X_0201_1976_K, c);
3657 } else if (c2 == 0x25) {
3658 /* JISX0208 Katakana */
/* Indexed by c1-0x20.  High byte: the halfwidth katakana; low byte: an
 * optional trailing sound mark (0x5E = 0xDE&0x7F, 0x5F = 0xDF&0x7F).
 * A zero entry means there is no halfwidth form. */
3659 static const int fullwidth_to_halfwidth[] =
3661 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3662 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3663 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3664 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3665 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3666 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3667 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3668 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3669 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3670 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3671 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3672 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3674 if (fullwidth_to_halfwidth[c1-0x20]){
3675 c2 = fullwidth_to_halfwidth[c1-0x20];
3676 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
/* the sound mark, if the entry carries one (the guard is not visible here) */
3678 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
/* rot13(c): letters up to M / m are moved forward by 13, the remaining ones
 * up to Z / z back by 13.  rot47(c) does the same with a distance of 47,
 * splitting at O and ending at ~.  The lower-bound tests and the fallback
 * value are on lines not visible in this excerpt.  Both macros evaluate c
 * several times, so the argument must be free of side effects. */
3688 #define rot13(c) ( \
3690 (c <= 'M') ? (c + 13): \
3691 (c <= 'Z') ? (c - 13): \
3693 (c <= 'm') ? (c + 13): \
3694 (c <= 'z') ? (c - 13): \
3698 #define rot47(c) ( \
3700 ( c <= 'O') ? (c + 47) : \
3701 ( c <= '~') ? (c - 47) : \
/* rot_conv -- rotation stage of the output chain.  Single-byte input
 * (c2 is 0, JIS_X_0201_1976_K or ISO_8859_1) takes the first branch, whose
 * body is not visible here; the result is passed on through o_rot_conv. */
3706 rot_conv(nkf_char c2, nkf_char c1)
3708 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3714 (*o_rot_conv)(c2,c1);
/* hira_conv -- hiragana/katakana conversion stage; output goes to
 * o_hira_conv.  The enclosing tests that select the direction are on lines
 * not visible in this excerpt. */
3718 hira_conv(nkf_char c2, nkf_char c1)
3722 if (0x20 < c1 && c1 < 0x74) {
3724 (*o_hira_conv)(c2,c1);
/* c1 == 0x74 is only converted for Unicode output, where it becomes U+3094 */
3726 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3728 c1 = nkf_char_unicode_new(0x3094);
3729 (*o_hira_conv)(c2,c1);
3732 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3734 (*o_hira_conv)(c2,c1);
/* opposite direction: U+3094 and row 0x24 are the inputs here */
3739 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3742 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3744 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
/* everything not handled above is forwarded as is */
3748 (*o_hira_conv)(c2,c1);
/* iso2022jp_check_conv -- filter stage in front of o_iso2022jp_check_conv.
 * It tests c2/c1 against fixed byte ranges and against the RANGE_NUM_MAX
 * entries of range[] (range[i][0] is the start of entry i; the end is read
 * on a line not shown).  What is substituted on a hit is not visible in this
 * excerpt; the possibly modified pair is then forwarded. */
3753 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3755 #define RANGE_NUM_MAX 18
3756 static const nkf_char range[RANGE_NUM_MAX][2] = {
3777 nkf_char start, end, c;
3779 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3783 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3788 for (i = 0; i < RANGE_NUM_MAX; i++) {
3789 start = range[i][0];
3792 if (c >= start && c <= end) {
3797 (*o_iso2022jp_check_conv)(c2,c1);
3801 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/* The four tables below are parallel: entry i of each describes the same
 * MIME encoded-word header.  \075 is the octal escape for the equals sign.
 * mime_pattern is terminated by a null entry (open_mime and
 * mime_begin_strict loop until mime_pattern[i] is 0). */
3803 static const unsigned char *mime_pattern[] = {
3804 (const unsigned char *)"\075?EUC-JP?B?",
3805 (const unsigned char *)"\075?SHIFT_JIS?B?",
3806 (const unsigned char *)"\075?ISO-8859-1?Q?",
3807 (const unsigned char *)"\075?ISO-8859-1?B?",
3808 (const unsigned char *)"\075?ISO-2022-JP?B?",
3809 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3810 #if defined(UTF8_INPUT_ENABLE)
3811 (const unsigned char *)"\075?UTF-8?B?",
3812 (const unsigned char *)"\075?UTF-8?Q?",
3814 (const unsigned char *)"\075?US-ASCII?Q?",
3819 /* marker used to raise the priority of the matching input code */
3820 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3821 e_iconv, s_iconv, 0, 0, 0, 0,
3822 #if defined(UTF8_INPUT_ENABLE)
/* output mode that selects each header in open_mime */
3828 static const nkf_char mime_encode[] = {
3829 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3830 #if defined(UTF8_INPUT_ENABLE)
/* transfer encoding letter of each header: B (base64) or Q (quoted) */
3837 static const nkf_char mime_encode_method[] = {
3838 'B', 'B','Q', 'B', 'B', 'Q',
3839 #if defined(UTF8_INPUT_ENABLE)
3847 /* MIME preprocessor fifo */
/* The fifo is a ring buffer: indices grow without bound and are reduced with
 * MIME_BUF_MASK on every access, which is why the size must be a power of
 * two. */
3849 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3850 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3851 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3853 unsigned char buf[MIME_BUF_SIZE];
3855 unsigned int last; /* decoded */
3856 unsigned int input; /* undecoded */
/* input converter that was active before a MIME header switched it;
 * restored by unswitch_mime_getc */
3858 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* upper bound on the header bytes kept for recovery (see r[] in
 * mime_begin_strict and the scan loops of the non-strict variant) */
3860 #define MAXRECOVER 20
/* Put byte c back in front of the fifo by moving top one slot backwards. */
3863 mime_input_buf_unshift(nkf_char c)
3865 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
/* ungetc counterpart of mime_getc: the byte goes back into the fifo, the
 * stream f itself is not touched by the visible code. */
3869 mime_ungetc(nkf_char c, FILE *f)
3871 mime_input_buf_unshift(c);
/* ungetc for the undecoded side: either delegates to i_mungetc_buf or steps
 * the input index of the fifo back and stores c there.  The test choosing
 * between the two is not visible here; mime_getc_buf selects on mimebuf_f. */
3876 mime_ungetc_buf(nkf_char c, FILE *f)
3879 (*i_mungetc_buf)(c,f);
3881 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
/* Fetch the next undecoded byte: from i_mgetc_buf when mimebuf_f is set,
 * otherwise from the fifo at the input index (which is advanced). */
3886 mime_getc_buf(FILE *f)
3888 /* we don't keep eof of mime_input_buf, because it contains ?= as
3889 a terminator. It was checked in mime_integrity. */
3890 return ((mimebuf_f)?
3891 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
/* Install mime_getc/mime_ungetc as the current input functions, saving the
 * previous ones in i_mgetc/i_mungetc.  In STRICT_MIME mode one more layer is
 * inserted: the saved functions move to i_mgetc_buf/i_mungetc_buf and
 * i_mgetc/i_mungetc become the fifo readers.  Does nothing when mime_getc is
 * already installed. */
3895 switch_mime_getc(void)
3897 if (i_getc!=mime_getc) {
3898 i_mgetc = i_getc; i_getc = mime_getc;
3899 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3900 if(mime_f==STRICT_MIME) {
3901 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3902 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/* Undo switch_mime_getc: drop the STRICT_MIME layer if present, restore
 * i_ungetc (the matching i_getc restore is presumably on a line not shown),
 * and put back the input converter remembered in mime_iconv_back. */
3908 unswitch_mime_getc(void)
3910 if(mime_f==STRICT_MIME) {
3911 i_mgetc = i_mgetc_buf;
3912 i_mungetc = i_mungetc_buf;
3915 i_ungetc = i_mungetc;
3916 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3917 mime_iconv_back = NULL;
/* mime_integrity -- pre-read one encoded word into the fifo and check it.
 * The header text p is copied into the fifo first, then input is appended
 * until the terminator is found, the fifo is full, or input ends.  On
 * success the undecoded read index is set back to q, the position just
 * after the header.  Otherwise everything read so far is exposed as already
 * decoded data and mime_decode_mode is set to 1 (no decoding). */
3921 mime_integrity(FILE *f, const unsigned char *p)
3925 /* In buffered mode, read until =? or NL or buffer full
3927 mime_input_state.input = mime_input_state.top;
3928 mime_input_state.last = mime_input_state.top;
3930 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3932 q = mime_input_state.input;
3933 while((c=(*i_getc)(f))!=EOF) {
3934 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3935 break; /* buffer full */
/* d is presumably the previously read character, so this is the closing
 * question mark followed by the equals sign */
3937 if (c=='=' && d=='?') {
3938 /* checked. skip header, start decode */
3939 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3940 /* mime_last_input = mime_input_state.input; */
3941 mime_input_state.input = q;
/* only base64 alphabet characters plus = and ? may appear inside the word */
3945 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3947 /* Should we check length mod 4? */
3948 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3951 /* In case of Incomplete MIME, no MIME decode */
3952 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3953 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3954 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3955 switch_mime_getc(); /* anyway we need buffered getc */
/* mime_begin_strict -- called after the two characters that open an encoded
 * word.  Reads further input and compares it, case-insensitively, with the
 * entries of mime_pattern, moving to the next candidate that shares the
 * prefix read so far when a character does not match.  Each character read
 * is also kept in r[] so it can be replayed when no pattern matches.  On a
 * match the decode mode letter is taken from the pattern and the word is
 * handed to mime_integrity. */
3960 mime_begin_strict(FILE *f)
3964 const unsigned char *p,*q;
3965 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3967 mime_decode_mode = FALSE;
3968 /* =? has been checked */
3970 p = mime_pattern[j];
3973 for(i=2;p[i]>SP;i++) { /* start at =? */
3974 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3975 /* pattern fails, try next one */
3977 while (mime_pattern[++j]) {
3978 p = mime_pattern[j];
3979 for(k=2;k<i;k++) /* assume length(p) > i */
3980 if (p[k]!=q[k]) break;
3981 if (k==i && nkf_toupper(c1)==p[k]) break;
3983 p = mime_pattern[j];
3984 if (p) continue; /* found next one, continue */
3985 /* all fails, output from recovery buffer */
/* i is now the pattern length; every pattern ends in ?B? or ?Q?, so p[i-2]
 * is the encoding letter */
3993 mime_decode_mode = p[i-2];
/* NOTE(review): mime_priority_func[j] is 0 for several patterns; the guard
 * for that case is presumably on a line not visible here -- confirm */
3995 mime_iconv_back = iconv;
3996 set_iconv(FALSE, mime_priority_func[j]);
3997 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3999 if (mime_decode_mode=='B') {
4000 mimebuf_f = unbuf_f;
4002 /* do MIME integrity check */
4003 return mime_integrity(f,mime_pattern[j]);
/* Non-strict MIME header scan (the function header line is not visible in
 * this excerpt; the comments below refer to it as mime_begin).  Every byte
 * read is appended to the fifo at last, so a false header can simply be
 * re-read from there.  k remembers where the header started; on success
 * last is reset to k, which discards the header. */
4017 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4018 /* re-read and convert again from mime_buffer. */
4020 /* =? has been checked */
4021 k = mime_input_state.last;
4022 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4023 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4024 /* We accept any character type even if it is broken by new lines */
4025 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4026 if (c1==LF||c1==SP||c1==CR||
4027 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4029 /* Failed. But this could be another MIME preamble */
4031 mime_input_state.last--;
4037 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4038 if (!(++i<MAXRECOVER) || c1==EOF) break;
/* the encoding letter is accepted in either case */
4039 if (c1=='b'||c1=='B') {
4040 mime_decode_mode = 'B';
4041 } else if (c1=='q'||c1=='Q') {
4042 mime_decode_mode = 'Q';
4046 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4047 if (!(++i<MAXRECOVER) || c1==EOF) break;
4049 mime_decode_mode = FALSE;
4055 if (!mime_decode_mode) {
4056 /* false MIME preamble, restart from mime_buffer */
4057 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4058 /* Since we are in MIME mode until buffer becomes empty, */
4059 /* we never go into mime_begin again for a while. */
4062 /* discard mime preamble, and goto MIME mode */
4063 mime_input_state.last = k;
4064 /* do no MIME integrity check */
4065 return c1; /* used only for checking EOF */
/* Print a diagnostic string to stderr, one per line; a null pointer is
 * printed as the word NULL.  Any condition guarding the output is on a line
 * not visible here. */
4076 debug(const char *str)
4079 fprintf(stderr, "%s\n", str ? str : "NULL");
/* Record the name of the detected input encoding.  The first name is kept;
 * a later, different name replaces it by the empty string, which
 * get_guessed_code reports as BINARY.  Only the pointer is stored, so
 * codename must outlive the call. */
4085 set_input_codename(const char *codename)
4087 if (!input_codename) {
4088 input_codename = codename;
4089 } else if (strcmp(codename, input_codename) != 0) {
4090 input_codename = "";
/* Turn the recorded input code name into the name that is reported.
 * An empty name (conflicting detections) becomes BINARY and no name at all
 * becomes ASCII.  The three Japanese base names are refined to a vendor
 * variant according to the score bits of the active input converter.
 * The refined name is stored back into input_codename and returned. */
4095 get_guessed_code(void)
4097 if (input_codename && !*input_codename) {
4098 input_codename = "BINARY";
4100 struct input_code *p = find_inputcode_byfunc(iconv);
4101 if (!input_codename) {
4102 input_codename = "ASCII";
4103 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4104 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4105 input_codename = "CP932";
4106 } else if (strcmp(input_codename, "EUC-JP") == 0) {
/* the X0212 test comes first, so it wins over the CP932 bits */
4107 if (p->score & (SCORE_X0212))
4108 input_codename = "EUCJP-MS";
4109 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4110 input_codename = "CP51932";
4111 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4112 if (p->score & (SCORE_KANA))
4113 input_codename = "CP50221";
4114 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4115 input_codename = "CP50220";
4118 return input_codename;
4121 #if !defined(PERL_XS) && !defined(WIN32DLL)
/* Print the guessed input encoding on stdout, prefixed by the file name and
 * a colon when filename is not NULL.  The end-of-line style is rendered as
 * a suffix; input_eol == EOF stands for mixed line endings.  The condition
 * selecting the variant with the suffix is not visible in this excerpt. */
4123 print_guessed_code(char *filename)
4125 if (filename != NULL) printf("%s: ", filename);
4126 if (input_codename && !*input_codename) {
4129 input_codename = get_guessed_code();
4131 printf("%s\n", input_codename);
4135 input_eol == CR ? " (CR)" :
4136 input_eol == LF ? " (LF)" :
4137 input_eol == CRLF ? " (CRLF)" :
4138 input_eol == EOF ? " (MIXED NL)" :
/* hex_getc -- shared reader for two-digit hex escapes.
 *   ch : the escape introducer (the callers pass a colon or a percent sign)
 *   g/u: the underlying getc/ungetc pair
 * When the two characters after the introducer are hex digits, the decoded
 * byte is returned.  What happens to non-hex followers is decided on lines
 * not visible here (presumably they are pushed back through u). */
4148 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4150 nkf_char c1, c2, c3;
4156 if (!nkf_isxdigit(c2)){
4161 if (!nkf_isxdigit(c3)){
4166 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Colon-introduced hex escapes: the getc side (its header line is not
 * visible here) decodes through hex_getc on top of i_cgetc/i_cungetc. */
4172 return hex_getc(':', f, i_cgetc, i_cungetc);
/* The ungetc side passes the character straight to the saved i_cungetc. */
4176 cap_ungetc(nkf_char c, FILE *f)
4178 return (*i_cungetc)(c, f);
/* Percent-introduced hex escapes: the getc side (its header line is not
 * visible here) decodes through hex_getc on top of i_ugetc/i_uungetc. */
4184 return hex_getc('%', f, i_ugetc, i_uungetc);
/* The ungetc side passes the character straight to the saved i_uungetc. */
4188 url_ungetc(nkf_char c, FILE *f)
4190 return (*i_uungetc)(c, f);
4194 #ifdef NUMCHAR_OPTION
/* numchar_getc -- input filter that turns a numeric character notation into
 * one Unicode nkf_char (presumably numeric character references; the
 * introducer test is not visible here).  After an x or X up to 7 hex digits
 * are accepted, otherwise up to 8 decimal digits.  Reads through i_ngetc
 * and un-reads through i_nungetc. */
4196 numchar_getc(FILE *f)
4198 nkf_char (*g)(FILE *) = i_ngetc;
4199 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4210 if (buf[i] == 'x' || buf[i] == 'X'){
4211 for (j = 0; j < 7; j++){
4213 if (!nkf_isxdigit(buf[i])){
4220 c |= hex2bin(buf[i]);
4223 for (j = 0; j < 8; j++){
4227 if (!nkf_isdigit(buf[i])){
/* hex2bin is also used for decimal digits, for which it yields 0..9 */
4234 c += hex2bin(buf[i]);
4240 return nkf_char_unicode_new(c);
/* ungetc counterpart of numchar_getc: forwards to the saved i_nungetc. */
4250 numchar_ungetc(nkf_char c, FILE *f)
4252 return (*i_nungetc)(c, f);
4256 #ifdef UNICODE_NORMALIZATION
/* Normalizing input filter (header line not visible here; it reads through
 * i_nfc_getc).  Input bytes are collected in nkf_state->nfc_buf and
 * compared with the nfd byte sequences of normalization_table by binary
 * search (lower/upper/mid).  When an entry matches, its nfc sequence is
 * pushed into the buffer instead.  Finally all buffered bytes but one are
 * given back through the ungetc function and that one is returned. */
4261 nkf_char (*g)(FILE *f) = i_nfc_getc;
4262 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4263 nkf_buf_t *buf = nkf_state->nfc_buf;
4264 const unsigned char *array;
4265 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4266 nkf_char c = (*g)(f);
/* EOF, values above one byte and bytes of the form 10xxxxxx are returned
 * unchanged */
4268 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4270 nkf_buf_push(buf, c);
4272 while (lower <= upper) {
4273 int mid = (lower+upper) / 2;
4275 array = normalization_table[mid].nfd;
4276 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
/* more input is needed to compare against this table entry */
4277 if (len >= nkf_buf_length(buf)) {
/* lower > upper ends the search */
4281 lower = 1, upper = 0;
4284 nkf_buf_push(buf, c);
4286 if (array[len] != nkf_buf_at(buf, len)) {
4287 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4288 else upper = mid - 1;
4295 array = normalization_table[mid].nfc;
4297 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4298 nkf_buf_push(buf, array[i]);
4302 } while (lower <= upper);
4304 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4305 c = nkf_buf_pop(buf);
/* ungetc counterpart of the normalizing filter: forwards to i_nfc_ungetc. */
4311 nfc_ungetc(nkf_char c, FILE *f)
4313 return (*i_nfc_ungetc)(c, f);
4315 #endif /* UNICODE_NORMALIZATION */
/* base64decode -- map one base64 character to its 6-bit value (0..63).
 * Besides the standard alphabet, the minus sign is accepted for 62 and the
 * underscore for 63.  The odd-looking character constants are ASCII
 * arithmetic, as the inline comments spell out: G is a minus 26, the
 * character 4 has the value 52, > is 62 and ? is 63.  The range tests of
 * the first and third branch are on lines not visible in this excerpt. */
4319 base64decode(nkf_char c)
4324 i = c - 'A'; /* A..Z 0-25 */
4325 } else if (c == '_') {
4326 i = '?' /* 63 */ ; /* _ 63 */
4328 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4330 } else if (c > '/') {
4331 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4332 } else if (c == '+' || c == '-') {
4333 i = '>' /* 62 */ ; /* + and - 62 */
4335 i = '?' /* 63 */ ; /* / 63 */
/* MIME decoding getc (the header line is not visible in this excerpt;
 * switch_mime_getc installs it under the name mime_getc).
 *
 * Order of work:
 *  1. bytes already decoded into the fifo (top != last) are returned first;
 *  2. in the no-decode states (mode 1 or FALSE) the filter uninstalls
 *     itself and reads from the restored i_getc;
 *  3. mode Q decodes quoted-printable style text, mode B decodes base64 in
 *     groups of four characters into up to three fifo bytes.
 * After the closing marker of an encoded word, following white space is
 * collected in lwsp_buf so that it can be pushed back unless another
 * encoded word follows.  mime_decode_mode is set to exit_mode before each
 * read and set back only when decoding continues, so an early return
 * leaves MIME mode. */
4343 nkf_char c1, c2, c3, c4, cc;
4344 nkf_char t1, t2, t3, t4, mode, exit_mode;
4345 nkf_char lwsp_count;
4348 nkf_char lwsp_size = 128;
4350 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4351 return mime_input_buf(mime_input_state.top++);
4353 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4354 mime_decode_mode=FALSE;
4355 unswitch_mime_getc();
4356 return (*i_getc)(f);
4359 if (mimebuf_f == FIXED_MIME)
4360 exit_mode = mime_decode_mode;
4363 if (mime_decode_mode == 'Q') {
4364 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
/* an underscore stands for a space inside a Q encoded word */
4366 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4367 if (c1<=SP || DEL<=c1) {
4368 mime_decode_mode = exit_mode; /* prepare for quit */
4371 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4375 mime_decode_mode = exit_mode; /* prepare for quit */
4376 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4377 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4378 /* end Q encoding */
4379 input_mode = exit_mode;
/* the buffer is allocated with 5 spare bytes beyond lwsp_size */
4381 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4382 while ((c1=(*i_getc)(f))!=EOF) {
4387 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4395 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4396 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4411 lwsp_buf[lwsp_count] = (unsigned char)c1;
4412 if (lwsp_count++>lwsp_size){
4414 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4415 lwsp_buf = lwsp_buf_new;
4421 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
/* give the collected white space back, last byte first */
4423 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4424 i_ungetc(lwsp_buf[lwsp_count],f);
4427 nkf_xfree(lwsp_buf);
4430 if (c1=='='&&c2<SP) { /* this is soft wrap */
4431 while((c1 = (*i_mgetc)(f)) <=SP) {
4432 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4434 mime_decode_mode = 'Q'; /* still in MIME */
4435 goto restart_mime_q;
4438 mime_decode_mode = 'Q'; /* still in MIME */
4442 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4443 if (c2<=SP) return c2;
4444 mime_decode_mode = 'Q'; /* still in MIME */
4445 return ((hex2bin(c2)<<4) + hex2bin(c3));
4448 if (mime_decode_mode != 'B') {
4449 mime_decode_mode = FALSE;
4450 return (*i_mgetc)(f);
4454 /* Base64 encoding */
4456 MIME allows line break in the middle of
4457 Base64, but we are very pessimistic in decoding
4458 in unbuf mode because MIME encoded code may broken by
4459 less or editor's control sequence (such as ESC-[-K in unbuffered
4460 mode. ignore incomplete MIME.
4462 mode = mime_decode_mode;
4463 mime_decode_mode = exit_mode; /* prepare for quit */
4465 while ((c1 = (*i_mgetc)(f))<=SP) {
4470 if ((c2 = (*i_mgetc)(f))<=SP) {
4473 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4474 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4477 if ((c1 == '?') && (c2 == '=')) {
4480 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4481 while ((c1=(*i_getc)(f))!=EOF) {
4486 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4494 if ((c1=(*i_getc)(f))!=EOF) {
4498 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4513 lwsp_buf[lwsp_count] = (unsigned char)c1;
4514 if (lwsp_count++>lwsp_size){
4516 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4517 lwsp_buf = lwsp_buf_new;
4523 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4525 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4526 i_ungetc(lwsp_buf[lwsp_count],f);
4529 nkf_xfree(lwsp_buf);
4533 if ((c3 = (*i_mgetc)(f))<=SP) {
4536 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4537 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4541 if ((c4 = (*i_mgetc)(f))<=SP) {
4544 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4545 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4549 mime_decode_mode = mode; /* still in MIME sigh... */
4551 /* BASE 64 decoding */
4553 t1 = 0x3f & base64decode(c1);
4554 t2 = 0x3f & base64decode(c2);
4555 t3 = 0x3f & base64decode(c3);
4556 t4 = 0x3f & base64decode(c4);
4557 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4559 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4560 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4562 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4563 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4565 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4570 return mime_input_buf(mime_input_state.top++);
/* base64 alphabet, indexed by 6-bit value */
4573 static const char basis_64[] =
4574 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* capacity of the look-ahead buffer used by the MIME output stage; the
 * array has one extra byte because mime_putc stores a character first and
 * only then compares count with MIMEOUT_BUF_LENGTH */
4576 #define MIMEOUT_BUF_LENGTH 74
4578 char buf[MIMEOUT_BUF_LENGTH+1];
4582 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/* open_mime -- start an encoded word for output mode `mode`.
 * The header is the mime_pattern entry whose mime_encode value equals mode
 * (entry 0 when none matches) and mimeout_mode becomes the matching B/Q
 * letter.  When the current line is already longer than 45 columns
 * (base64_count) the line is folded first, keeping a buffered leading
 * blank.  Buffered white space is emitted raw and the rest of the
 * look-ahead buffer is replayed through mime_putc once the buffer has been
 * emptied. */
4585 open_mime(nkf_char mode)
4587 const unsigned char *p;
4590 p = mime_pattern[0];
4591 for(i=0;mime_pattern[i];i++) {
4592 if (mode == mime_encode[i]) {
4593 p = mime_pattern[i];
/* NOTE(review): if the loop above ends without a match, i indexes the
 * terminating slot of the tables -- confirm mime_encode_method has one */
4597 mimeout_mode = mime_encode_method[i];
4599 if (base64_count>45) {
4600 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4601 (*o_mputc)(mimeout_state.buf[i]);
4604 PUT_NEWLINE((*o_mputc));
4607 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
4611 for (;i<mimeout_state.count;i++) {
4612 if (nkf_isspace(mimeout_state.buf[i])) {
4613 (*o_mputc)(mimeout_state.buf[i]);
/* count is cleared before replaying so mime_putc starts with an empty
 * buffer */
4623 j = mimeout_state.count;
4624 mimeout_state.count = 0;
4626 mime_putc(mimeout_state.buf[i]);
/* mime_prechar -- line-length check run by base64_conv before each
 * character.  The projected width is base64_count plus the base64 size of
 * the buffered bytes (count/3*4).  When it exceeds the limit of the current
 * case (73, 66 or 60) the encoded word is ended by sending EOF to
 * o_base64conv, a newline is written and the continuation line is started
 * with a space.  The conditions separating the three cases are only partly
 * visible in this excerpt. */
4631 mime_prechar(nkf_char c2, nkf_char c1)
4633 if (mimeout_mode > 0){
4635 if (base64_count + mimeout_state.count/3*4> 73){
4636 (*o_base64conv)(EOF,0);
4637 OCONV_NEWLINE((*o_base64conv));
4638 (*o_base64conv)(0,SP);
4642 if (base64_count + mimeout_state.count/3*4> 66) {
4643 (*o_base64conv)(EOF,0);
4644 OCONV_NEWLINE((*o_base64conv));
4645 (*o_base64conv)(0,SP);
/* no encoded word is open: open one first, choosing Q for ASCII and
 * ISO-8859-1 output and B otherwise */
4651 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4652 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4653 open_mime(output_mode);
4654 (*o_base64conv)(EOF,0);
4655 OCONV_NEWLINE((*o_base64conv));
4656 (*o_base64conv)(0,SP);
/* End-of-data flush of the MIME output encoder (the function header is not
 * visible in this excerpt; presumably eof_mime).  The bits still held in
 * nkf_state->mimeout_state are written as a final base64 character: two
 * leftover bits shifted by 4, or four leftover bits shifted by 2.  Which
 * mimeout_mode value selects which case, and the padding characters, are on
 * lines not shown. */
4675 switch(mimeout_mode) {
4680 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
4686 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
4691 if (mimeout_mode > 0) {
4692 if (mimeout_f!=FIXED_MIME) {
4694 } else if (mimeout_mode != 'Q')
/* mimeout_addchar -- encode one byte in the current mimeout_mode and write
 * it through o_mputc.
 *  - In the branch that tests nkf_isalnum, other bytes are written as two
 *    hex digits (the escape prefix is on a line not visible here).
 *  - The base64 branches form a three-step encoder: the previous byte is
 *    kept in nkf_state->mimeout_state and combined with the next one, so
 *    three input bytes produce four output characters.  The case labels and
 *    the mode transitions are on lines not shown. */
4700 mimeout_addchar(nkf_char c)
4702 switch(mimeout_mode) {
4707 } else if(!nkf_isalnum(c)) {
4709 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4710 (*o_mputc)(bin2hex((c&0xf)));
/* first byte of a group: emit its top 6 bits, keep the byte for later */
4718 nkf_state->mimeout_state=c;
4719 (*o_mputc)(basis_64[c>>2]);
/* second byte: low 2 bits of the previous byte plus top 4 bits of this one */
4724 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4725 nkf_state->mimeout_state=c;
/* third byte: low 4 bits of the previous byte plus top 2 bits, then the
 * low 6 bits of this one */
4730 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4731 (*o_mputc)(basis_64[c & 0x3F]);
/* mime_putc -- output side of the MIME encoder; c == EOF flushes.
 *
 * Two regimes are visible:
 *  - mimeout_f == FIXED_MIME: the whole stream is encoded; lines are broken
 *    once base64_count exceeds 71.
 *  - otherwise: plain ASCII is passed through and only text that needs it
 *    is wrapped in encoded words.  Characters are held in mimeout_state.buf
 *    (at most MIMEOUT_BUF_LENGTH) until it is clear whether they belong
 *    inside or outside an encoded word; open_mime starts a word and replays
 *    the buffer.  lastchar is the most recently buffered character and
 *    drives the handling of header line breaks.
 * mimeout_mode <= 0 means no encoded word is open; the code treats -1 as a
 * special sub-state (its exact meaning is not established by the visible
 * lines). */
4743 mime_putc(nkf_char c)
4748 if (mimeout_f == FIXED_MIME){
4749 if (mimeout_mode == 'Q'){
4750 if (base64_count > 71){
4751 if (c!=CR && c!=LF) {
4753 PUT_NEWLINE((*o_mputc));
4758 if (base64_count > 71){
4760 PUT_NEWLINE((*o_mputc));
4763 if (c == EOF) { /* c==EOF */
4767 if (c != EOF) { /* c!=EOF */
4773 /* mimeout_f != FIXED_MIME */
4775 if (c == EOF) { /* c==EOF */
4776 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4777 j = mimeout_state.count;
4778 mimeout_state.count = 0;
4780 if (mimeout_mode > 0) {
4781 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4783 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4786 mimeout_addchar(mimeout_state.buf[i]);
4790 mimeout_addchar(mimeout_state.buf[i]);
4794 mimeout_addchar(mimeout_state.buf[i]);
4800 mimeout_addchar(mimeout_state.buf[i]);
4806 if (mimeout_state.count > 0){
4807 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4812 if (mimeout_mode=='Q') {
4813 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4814 if (c == CR || c == LF) {
4819 } else if (c <= SP) {
4821 if (base64_count > 70) {
4822 PUT_NEWLINE((*o_mputc));
4825 if (!nkf_isblank(c)) {
4830 if (base64_count > 70) {
4832 PUT_NEWLINE((*o_mputc));
4835 open_mime(output_mode);
4837 if (!nkf_noescape_mime(c)) {
4848 if (mimeout_mode <= 0) {
4849 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4850 if (nkf_isspace(c)) {
4852 if (mimeout_mode == -1) {
4855 if (c==CR || c==LF) {
4857 open_mime(output_mode);
4863 for (i=0;i<mimeout_state.count;i++) {
4864 (*o_mputc)(mimeout_state.buf[i]);
4865 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4876 mimeout_state.buf[0] = (char)c;
4877 mimeout_state.count = 1;
/* the pending word would push the line past 76 columns: look for a
 * boundary parameter in the buffer to decide where the line is broken */
4879 if (base64_count > 1
4880 && base64_count + mimeout_state.count > 76
4881 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4882 static const char *str = "boundary=\"";
4883 static int len = 10;
4886 for (; i < mimeout_state.count - len; ++i) {
4887 if (!strncmp(mimeout_state.buf+i, str, len)) {
4893 if (i == 0 || i == mimeout_state.count - len) {
4894 PUT_NEWLINE((*o_mputc));
4896 if (!nkf_isspace(mimeout_state.buf[0])){
4903 for (j = 0; j <= i; ++j) {
4904 (*o_mputc)(mimeout_state.buf[j]);
4906 PUT_NEWLINE((*o_mputc));
/* shift the unwritten tail to the front of the buffer.
 * NOTE(review): the loop bound is inclusive, so buf[count] is copied as
 * well -- confirm that slot is always initialised */
4908 for (; j <= mimeout_state.count; ++j) {
4909 mimeout_state.buf[j - i] = mimeout_state.buf[j];
4911 mimeout_state.count -= i;
4914 mimeout_state.buf[mimeout_state.count++] = (char)c;
4915 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4916 open_mime(output_mode);
4921 if (lastchar==CR || lastchar == LF){
4922 for (i=0;i<mimeout_state.count;i++) {
4923 (*o_mputc)(mimeout_state.buf[i]);
4926 mimeout_state.count = 0;
4929 for (i=0;i<mimeout_state.count-1;i++) {
4930 (*o_mputc)(mimeout_state.buf[i]);
4933 mimeout_state.buf[0] = SP;
4934 mimeout_state.count = 1;
4936 open_mime(output_mode);
4939 /* mimeout_mode == 'B', 1, 2 */
4940 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4941 if (lastchar == CR || lastchar == LF){
4942 if (nkf_isblank(c)) {
4943 for (i=0;i<mimeout_state.count;i++) {
4944 mimeout_addchar(mimeout_state.buf[i]);
4946 mimeout_state.count = 0;
4947 } else if (SP<c && c<DEL) {
4949 for (i=0;i<mimeout_state.count;i++) {
4950 (*o_mputc)(mimeout_state.buf[i]);
4953 mimeout_state.count = 0;
4955 mimeout_state.buf[mimeout_state.count++] = (char)c;
4958 if (nkf_isspace(c)) {
4959 for (i=0;i<mimeout_state.count;i++) {
4960 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4962 for (i=0;i<mimeout_state.count;i++) {
4963 (*o_mputc)(mimeout_state.buf[i]);
4966 mimeout_state.count = 0;
4969 mimeout_state.buf[mimeout_state.count++] = (char)c;
4970 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4972 for (i=0;i<mimeout_state.count;i++) {
4973 (*o_mputc)(mimeout_state.buf[i]);
4976 mimeout_state.count = 0;
4980 if (mimeout_state.count>0 && SP<c && c!='=') {
4981 mimeout_state.buf[mimeout_state.count++] = (char)c;
4982 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4983 j = mimeout_state.count;
4984 mimeout_state.count = 0;
4986 mimeout_addchar(mimeout_state.buf[i]);
4993 if (mimeout_state.count>0) {
4994 j = mimeout_state.count;
4995 mimeout_state.count = 0;
4997 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4999 mimeout_addchar(mimeout_state.buf[i]);
5005 (*o_mputc)(mimeout_state.buf[i]);
5007 open_mime(output_mode);
/* base64_conv -- MIME output stage of the converter chain: runs the
 * line-length check of mime_prechar, then forwards the character pair to
 * the next stage through o_base64conv. */
5014 base64_conv(nkf_char c2, nkf_char c1)
5016 mime_prechar(c2, c1);
5017 (*o_base64conv)(c2,c1);
/* State of one iconv based converter: an input and an output buffer with
 * their sizes (the descriptor cd and the input buffer pointer are used by
 * the functions below; their declarations are on lines not visible here). */
5021 typedef struct nkf_iconv_t {
5024 size_t input_buffer_size;
5025 char *output_buffer;
5026 size_t output_buffer_size;
/* nkf_iconv_new -- set up a converter from fromcode to tocode: an input
 * buffer of IOBUF_SIZE bytes, an output buffer of twice that size, and an
 * iconv descriptor.
 *
 * NOTE(review): as written this cannot compile -- converter is declared as
 * an object but used through ->, and perror is handed the int result of a
 * fprintf call that lacks its stream argument.  The unsupported-conversion
 * message presumably should go to stderr via fprintf.  Fixing it needs the
 * return statements and return type, which are not visible here. */
5030 nkf_iconv_new(char *tocode, char *fromcode)
5032 nkf_iconv_t converter;
5034 converter->input_buffer_size = IOBUF_SIZE;
5035 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5036 converter->output_buffer_size = IOBUF_SIZE * 2;
5037 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5038 converter->cd = iconv_open(tocode, fromcode);
/* iconv_open reports failure as (iconv_t)-1 */
5039 if (converter->cd == (iconv_t)-1)
5043 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5046 perror("can't iconv_open");
/* nkf_iconv_convert -- read bytes from `input`, run them through the iconv
 * descriptor of `converter` and write the result with o_putc.
 *
 * Fixes in this revision:
 *  - the read loop used an undeclared f; it now reads from the input
 *    parameter;
 *  - the buffer-full test was inverted (it stopped after the first byte and
 *    kept reading once the buffer was full);
 *  - the output buffer was reallocated through the nonexistent field
 *    outbuf instead of output_buffer.
 *
 * NOTE(review): the output loop below still looks wrong.  iconv advances
 * output_buffer and leaves the number of unused bytes in output_length, so
 * the converted bytes lie between converter->output_buffer and the advanced
 * pointer; the loop instead counts the unused bytes.  The local pointers
 * are also not reset for the next round.  Rework before enabling this
 * code. */
5052 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5054 size_t invalid = (size_t)0;
5055 char *input_buffer = converter->input_buffer;
5056 size_t input_length = (size_t)0;
5057 char *output_buffer = converter->output_buffer;
5058 size_t output_length = converter->output_buffer_size;
/* fill the input buffer, stopping when it is full or input ends */
5063 while ((c = (*i_getc)(input)) != EOF) {
5064 input_buffer[input_length++] = c;
5065 if (input_length >= converter->input_buffer_size) break;
5069 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5070 while (output_length-- > 0) {
5071 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
/* iconv reports failure as (size_t)-1 with the reason in errno */
5073 if (ret == (size_t) - 1) {
/* incomplete sequence at the end: keep the unread tail at the buffer start */
5076 if (input_buffer != converter->input_buffer)
5077 memmove(converter->input_buffer, input_buffer, input_length);
/* output buffer too small: double it */
5080 converter->output_buffer_size *= 2;
5081 output_buffer = realloc(converter->output_buffer, converter->output_buffer_size);
5082 if (output_buffer == NULL) {
5083 perror("can't realloc");
5086 converter->output_buffer = output_buffer;
5089 perror("can't iconv");
/* nkf_iconv_close -- release a converter set up by nkf_iconv_new: both I/O
 * buffers and the iconv descriptor.
 *
 * Fixes: the parameter was spelled convert while the body used converter,
 * and the buffers were referenced as inbuf/outbuf although the structure
 * and nkf_iconv_new call them input_buffer/output_buffer. */
5102 nkf_iconv_close(nkf_iconv_t *converter)
5104 nkf_xfree(converter->input_buffer);
5105 nkf_xfree(converter->output_buffer);
5106 iconv_close(converter->cd);
/* Reset of the global conversion state (the function header is not visible
 * in this excerpt; presumably reinit).  Option flags go back to their
 * defaults, all converter stage pointers are disconnected (no_connection),
 * the input functions return to the plain std_getc/std_ungetc pair and the
 * detected/selected encodings are cleared. */
5115 struct input_code *p = input_code_list;
5127 mime_f = MIME_DECODE_DEFAULT;
5128 mime_decode_f = FALSE;
5133 x0201_f = X0201_DEFAULT;
5134 iso2022jp_f = FALSE;
5135 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5136 ms_ucs_map_f = UCS_MAP_ASCII;
5138 #ifdef UTF8_INPUT_ENABLE
5139 no_cp932ext_f = FALSE;
5140 no_best_fit_chars_f = FALSE;
5141 encode_fallback = NULL;
5142 unicode_subchar = '?';
5143 input_endian = ENDIAN_BIG;
5145 #ifdef UTF8_OUTPUT_ENABLE
5146 output_bom_f = FALSE;
5147 output_endian = ENDIAN_BIG;
5149 #ifdef UNICODE_NORMALIZATION
5165 #ifdef SHIFTJIS_CP932
5175 for (i = 0; i < 256; i++){
5176 prefix_table[i] = 0;
5180 mimeout_state.count = 0;
5185 fold_preserve_f = FALSE;
5188 kanji_intro = DEFAULT_J;
5189 ascii_intro = DEFAULT_R;
5190 fold_margin = FOLD_MARGIN;
/* disconnect every optional output stage */
5191 o_zconv = no_connection;
5192 o_fconv = no_connection;
5193 o_eol_conv = no_connection;
5194 o_rot_conv = no_connection;
5195 o_hira_conv = no_connection;
5196 o_base64conv = no_connection;
5197 o_iso2022jp_check_conv = no_connection;
/* plain stdio style input until module_connection wraps it again */
5200 i_ungetc = std_ungetc;
5202 i_bungetc = std_ungetc;
5205 i_mungetc = std_ungetc;
5206 i_mgetc_buf = std_getc;
5207 i_mungetc_buf = std_ungetc;
5208 output_mode = ASCII;
5210 mime_decode_mode = FALSE;
5216 z_prev2=0,z_prev1=0;
5218 iconv_for_check = 0;
5220 input_codename = NULL;
5221 input_encoding = NULL;
5222 output_encoding = NULL;
/* module_connection -- wire up the converter pipeline for the current
 * option settings.
 *
 * Output side: starting from the converter of the output encoding, each
 * enabled stage is put in front of the chain with the same idiom: the
 * stage's o_* pointer receives the current head and oconv becomes the
 * stage.  Stages connected later therefore run earlier.
 * Input side: each enabled filter saves the current i_getc/i_ungetc pair in
 * its own pointers and installs itself.
 *
 * kanji_convert treats a negative return value as the error that no output
 * encoding was given. */
5230 module_connection(void)
5232 if (input_encoding) set_input_encoding(input_encoding);
5233 if (!output_encoding) {
5234 output_encoding = nkf_default_encoding();
5236 if (!output_encoding) {
/* guessing or discarding output still needs some encoding to run with */
5237 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5240 set_output_encoding(output_encoding);
5241 oconv = nkf_enc_to_oconv(output_encoding);
5244 /* replace continuation module, from output side */
5246 /* output redirection */
5248 if (noout_f || guess_f){
5255 if (mimeout_f == TRUE) {
5256 o_base64conv = oconv; oconv = base64_conv;
5258 /* base64_count = 0; */
5261 if (eolmode_f || guess_f) {
5262 o_eol_conv = oconv; oconv = eol_conv;
5265 o_rot_conv = oconv; oconv = rot_conv;
5268 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5271 o_hira_conv = oconv; oconv = hira_conv;
5274 o_fconv = oconv; oconv = fold_conv;
5277 if (alpha_f || x0201_f) {
5278 o_zconv = oconv; oconv = z_conv;
5282 i_ungetc = std_ungetc;
5283 /* input redirection */
5286 i_cgetc = i_getc; i_getc = cap_getc;
5287 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5290 i_ugetc = i_getc; i_getc = url_getc;
5291 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5294 #ifdef NUMCHAR_OPTION
5296 i_ngetc = i_getc; i_getc = numchar_getc;
5297 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5300 #ifdef UNICODE_NORMALIZATION
5302 i_nfc_getc = i_getc; i_getc = nfc_getc;
5303 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
/* with a fixed MIME setting the decoder is installed permanently instead of
 * being switched in when a header is seen */
5306 if (mime_f && mimebuf_f==FIXED_MIME) {
5307 i_mgetc = i_getc; i_getc = mime_getc;
5308 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5311 i_bgetc = i_getc; i_getc = broken_getc;
5312 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* an explicit input encoding fixes the input converter; otherwise e_iconv
 * is the provisional choice until detection settles */
5314 if (input_encoding) {
5315 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5317 set_iconv(FALSE, e_iconv);
5321 struct input_code *p = input_code_list;
5330 Conversion main loop. Code detection only.
5333 #if !defined(PERL_XS) && !defined(WIN32DLL)
/* Detection-only loop (function header not visible in this excerpt):
 * connects the modules and then reads the whole input through i_getc; the
 * loop body is on a line not shown. */
5340 module_connection();
5341 while ((c = (*i_getc)(f)) != EOF)
/* Control-flow shorthands for the main loop of kanji_convert.  They expand
 * to bare continue/break statements, so they only work directly inside
 * that loop.  SKIP and MORE expand to two statements and are not wrapped in
 * a block: used as the body of an unbraced if, only the assignment would be
 * conditional. */
5348 #define NEXT continue /* no output, get next */
5349 #define SKIP c2=0;continue /* no output, get next */
5350 #define MORE c2=c1;continue /* need one more byte */
5351 #define SEND ; /* output c1 and c2, get next */
5352 #define LAST break /* end of loop, go closing */
5353 #define set_input_mode(mode) do { \
5354 input_mode = mode; \
5356 set_input_codename("ISO-2022-JP"); \
5357 debug("ISO-2022-JP"); \
5361 kanji_convert(FILE *f)
5363 nkf_char c1=0, c2=0, c3=0, c4=0;
5364 int shift_mode = 0; /* 0, 1, 2, 3 */
5366 int is_8bit = FALSE;
5368 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5373 output_mode = ASCII;
5375 if (module_connection() < 0) {
5376 #if !defined(PERL_XS) && !defined(WIN32DLL)
5377 fprintf(stderr, "no output encoding given\n");
5383 #ifdef UTF8_INPUT_ENABLE
5384 if(iconv == w_iconv32){
5385 while ((c1 = (*i_getc)(f)) != EOF &&
5386 (c2 = (*i_getc)(f)) != EOF &&
5387 (c3 = (*i_getc)(f)) != EOF &&
5388 (c4 = (*i_getc)(f)) != EOF) {
5389 nkf_iconv_utf_32(c1, c2, c3, c4);
5391 (*i_ungetc)(EOF, f);
5393 else if (iconv == w_iconv16) {
5394 while ((c1 = (*i_getc)(f)) != EOF &&
5395 (c2 = (*i_getc)(f)) != EOF) {
5396 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5397 (c3 = (*i_getc)(f)) != EOF &&
5398 (c4 = (*i_getc)(f)) != EOF) {
5399 nkf_iconv_utf_16(c1, c2, c3, c4);
5402 (*i_ungetc)(EOF, f);
5406 while ((c1 = (*i_getc)(f)) != EOF) {
5407 #ifdef INPUT_CODE_FIX
5408 if (!input_encoding)
5414 /* in case of 8th bit is on */
5415 if (!estab_f&&!mime_decode_mode) {
5416 /* in case of not established yet */
5417 /* It is still ambiguous */
5418 if (h_conv(f, c2, c1)==EOF) {
5426 /* in case of already established */
5428 /* ignore bogus code */
5436 /* 2nd byte of 7 bit code or SJIS */
5440 else if (nkf_char_unicode_p(c1)) {
5446 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5449 } else if (c1 > DEL) {
5451 if (!estab_f && !iso8859_f) {
5452 /* not established yet */
5454 } else { /* estab_f==TRUE */
5460 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5461 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5463 c2 = JIS_X_0201_1976_K;
5468 /* already established */
5472 } else if (SP < c1 && c1 < DEL) {
5473 /* in case of Roman characters */
5475 /* output 1 shifted byte */
5479 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5480 /* output 1 shifted byte */
5481 c2 = JIS_X_0201_1976_K;
5484 /* looks like bogus code */
5487 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5488 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5489 /* in case of Kanji shifted */
5491 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5492 /* Check MIME code */
5493 if ((c1 = (*i_getc)(f)) == EOF) {
5496 } else if (c1 == '?') {
5497 /* =? is mime conversion start sequence */
5498 if(mime_f == STRICT_MIME) {
5499 /* check in real detail */
5500 if (mime_begin_strict(f) == EOF)
5503 } else if (mime_begin(f) == EOF)
5512 /* normal ASCII code */
5515 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5518 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5521 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5522 if ((c1 = (*i_getc)(f)) == EOF) {
5523 /* (*oconv)(0, ESC); don't send bogus code */
5526 else if (c1 == '&') {
5528 if ((c1 = (*i_getc)(f)) == EOF) {
5534 else if (c1 == '$') {
5536 if ((c1 = (*i_getc)(f)) == EOF) {
5537 /* don't send bogus code
5539 (*oconv)(0, '$'); */
5541 } else if (c1 == '@' || c1 == 'B') {
5543 set_input_mode(JIS_X_0208);
5545 } else if (c1 == '(') {
5547 if ((c1 = (*i_getc)(f)) == EOF) {
5548 /* don't send bogus code
5554 } else if (c1 == '@'|| c1 == 'B') {
5556 set_input_mode(JIS_X_0208);
5559 } else if (c1 == 'D'){
5560 set_input_mode(JIS_X_0212);
5562 #endif /* X0212_ENABLE */
5563 } else if (c1 == 'O' || c1 == 'Q'){
5564 set_input_mode(JIS_X_0213_1);
5566 } else if (c1 == 'P'){
5567 set_input_mode(JIS_X_0213_2);
5570 /* could be some special code */
5577 } else if (broken_f&0x2) {
5578 /* accept any ESC-(-x as broken code ... */
5579 input_mode = JIS_X_0208;
5588 } else if (c1 == '(') {
5590 if ((c1 = (*i_getc)(f)) == EOF) {
5591 /* don't send bogus code
5593 (*oconv)(0, '('); */
5596 else if (c1 == 'I') {
5597 /* JIS X 0201 Katakana */
5598 set_input_mode(JIS_X_0201_1976_K);
5601 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5602 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5603 set_input_mode(ASCII);
5606 else if (broken_f&0x2) {
5607 set_input_mode(ASCII);
5616 else if (c1 == '.') {
5618 if ((c1 = (*i_getc)(f)) == EOF) {
5621 else if (c1 == 'A') {
5632 else if (c1 == 'N') {
5635 if (g2 == ISO_8859_1) {
5650 } else if (c1 == ESC && iconv == s_iconv) {
5651 /* ESC in Shift_JIS */
5652 if ((c1 = (*i_getc)(f)) == EOF) {
5653 /* (*oconv)(0, ESC); don't send bogus code */
5655 } else if (c1 == '$') {
5657 if ((c1 = (*i_getc)(f)) == EOF) {
5659 } else if (('E' <= c1 && c1 <= 'G') ||
5660 ('O' <= c1 && c1 <= 'Q')) {
5668 static const nkf_char jphone_emoji_first_table[7] =
5669 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5670 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5671 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5672 while (SP <= c1 && c1 <= 'z') {
5673 (*oconv)(0, c1 + c3);
5674 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5689 } else if (c1 == LF || c1 == CR) {
5691 input_mode = ASCII; set_iconv(FALSE, 0);
5693 } else if (mime_decode_f && !mime_decode_mode){
5695 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5703 } else { /* if (c1 == CR)*/
5704 if ((c1=(*i_getc)(f))!=EOF) {
5708 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5728 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5731 if ((c3 = (*i_getc)(f)) != EOF) {
5734 if ((c4 = (*i_getc)(f)) != EOF) {
5736 (*iconv)(c2, c1, c3|c4);
5741 /* 3 bytes EUC or UTF-8 */
5742 if ((c3 = (*i_getc)(f)) != EOF) {
5744 (*iconv)(c2, c1, c3);
5752 0x7F <= c2 && c2 <= 0x92 &&
5753 0x21 <= c1 && c1 <= 0x7E) {
5755 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5758 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5762 (*oconv)(PREFIX_EUCG3 | c2, c1);
5764 #endif /* X0212_ENABLE */
5766 (*oconv)(PREFIX_EUCG3 | c2, c1);
5769 (*oconv)(input_mode, c1); /* other special case */
5775 /* goto next_word */
5779 (*iconv)(EOF, 0, 0);
5780 if (!input_codename)
5783 struct input_code *p = input_code_list;
5784 struct input_code *result = p;
5786 if (p->score < result->score) result = p;
5789 set_input_codename(result->name);
5791 debug(result->name);
5799 * int options(unsigned char *cp)
5806 options(unsigned char *cp)
5810 unsigned char *cp_back = NULL;
5815 while(*cp && *cp++!='-');
5816 while (*cp || cp_back) {
5824 case '-': /* literal options */
5825 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
5829 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5830 p = (unsigned char *)long_option[i].name;
5831 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5832 if (*p == cp[j] || cp[j] == SP){
5839 #if !defined(PERL_XS) && !defined(WIN32DLL)
5840 fprintf(stderr, "unknown long option: --%s\n", cp);
5844 while(*cp && *cp != SP && cp++);
5845 if (long_option[i].alias[0]){
5847 cp = (unsigned char *)long_option[i].alias;
5850 if (strcmp(long_option[i].name, "help") == 0){
5855 if (strcmp(long_option[i].name, "ic=") == 0){
5856 enc = nkf_enc_find((char *)p);
5858 input_encoding = enc;
5861 if (strcmp(long_option[i].name, "oc=") == 0){
5862 enc = nkf_enc_find((char *)p);
5863 /* if (enc <= 0) continue; */
5865 output_encoding = enc;
5868 if (strcmp(long_option[i].name, "guess=") == 0){
5869 if (p[0] == '0' || p[0] == '1') {
5877 if (strcmp(long_option[i].name, "overwrite") == 0){
5880 preserve_time_f = TRUE;
5883 if (strcmp(long_option[i].name, "overwrite=") == 0){
5886 preserve_time_f = TRUE;
5888 backup_suffix = (char *)p;
5891 if (strcmp(long_option[i].name, "in-place") == 0){
5894 preserve_time_f = FALSE;
5897 if (strcmp(long_option[i].name, "in-place=") == 0){
5900 preserve_time_f = FALSE;
5902 backup_suffix = (char *)p;
5907 if (strcmp(long_option[i].name, "cap-input") == 0){
5911 if (strcmp(long_option[i].name, "url-input") == 0){
5916 #ifdef NUMCHAR_OPTION
5917 if (strcmp(long_option[i].name, "numchar-input") == 0){
5923 if (strcmp(long_option[i].name, "no-output") == 0){
5927 if (strcmp(long_option[i].name, "debug") == 0){
5932 if (strcmp(long_option[i].name, "cp932") == 0){
5933 #ifdef SHIFTJIS_CP932
5937 #ifdef UTF8_OUTPUT_ENABLE
5938 ms_ucs_map_f = UCS_MAP_CP932;
5942 if (strcmp(long_option[i].name, "no-cp932") == 0){
5943 #ifdef SHIFTJIS_CP932
5947 #ifdef UTF8_OUTPUT_ENABLE
5948 ms_ucs_map_f = UCS_MAP_ASCII;
5952 #ifdef SHIFTJIS_CP932
5953 if (strcmp(long_option[i].name, "cp932inv") == 0){
5960 if (strcmp(long_option[i].name, "x0212") == 0){
5967 if (strcmp(long_option[i].name, "exec-in") == 0){
5971 if (strcmp(long_option[i].name, "exec-out") == 0){
5976 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5977 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5978 no_cp932ext_f = TRUE;
5981 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5982 no_best_fit_chars_f = TRUE;
5985 if (strcmp(long_option[i].name, "fb-skip") == 0){
5986 encode_fallback = NULL;
5989 if (strcmp(long_option[i].name, "fb-html") == 0){
5990 encode_fallback = encode_fallback_html;
5993 if (strcmp(long_option[i].name, "fb-xml") == 0){
5994 encode_fallback = encode_fallback_xml;
5997 if (strcmp(long_option[i].name, "fb-java") == 0){
5998 encode_fallback = encode_fallback_java;
6001 if (strcmp(long_option[i].name, "fb-perl") == 0){
6002 encode_fallback = encode_fallback_perl;
6005 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6006 encode_fallback = encode_fallback_subchar;
6009 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6010 encode_fallback = encode_fallback_subchar;
6011 unicode_subchar = 0;
6013 /* decimal number */
6014 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6015 unicode_subchar *= 10;
6016 unicode_subchar += hex2bin(p[i]);
6018 }else if(p[1] == 'x' || p[1] == 'X'){
6019 /* hexadecimal number */
6020 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6021 unicode_subchar <<= 4;
6022 unicode_subchar |= hex2bin(p[i]);
6026 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6027 unicode_subchar *= 8;
6028 unicode_subchar += hex2bin(p[i]);
6031 w16e_conv(unicode_subchar, &i, &j);
6032 unicode_subchar = i<<8 | j;
6036 #ifdef UTF8_OUTPUT_ENABLE
6037 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6038 ms_ucs_map_f = UCS_MAP_MS;
6042 #ifdef UNICODE_NORMALIZATION
6043 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
6048 if (strcmp(long_option[i].name, "prefix=") == 0){
6049 if (nkf_isgraph(p[0])){
6050 for (i = 1; nkf_isgraph(p[i]); i++){
6051 prefix_table[p[i]] = p[0];
6056 #if !defined(PERL_XS) && !defined(WIN32DLL)
6057 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6062 case 'b': /* buffered mode */
6065 case 'u': /* non-buffered mode */
6068 case 't': /* transparent mode */
6073 } else if (*cp=='2') {
6077 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6085 case 'j': /* JIS output */
6087 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6089 case 'e': /* AT&T EUC output */
6090 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6092 case 's': /* SJIS output */
6093 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6095 case 'l': /* ISO8859 Latin-1 support, no conversion */
6096 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6097 input_encoding = nkf_enc_from_index(ISO_8859_1);
6099 case 'i': /* Kanji IN ESC-$-@/B */
6100 if (*cp=='@'||*cp=='B')
6101 kanji_intro = *cp++;
6103 case 'o': /* ASCII IN ESC-(-J/B/H */
6104 /* ESC ( H was used in initial JUNET messages */
6105 if (*cp=='J'||*cp=='B'||*cp=='H')
6106 ascii_intro = *cp++;
6110 bit:1 katakana->hiragana
6111 bit:2 hiragana->katakana
6113 if ('9'>= *cp && *cp>='0')
6114 hira_f |= (*cp++ -'0');
6121 #if defined(MSDOS) || defined(__OS2__)
6128 show_configuration();
6136 #ifdef UTF8_OUTPUT_ENABLE
6137 case 'w': /* UTF-8 output */
6142 output_encoding = nkf_enc_from_index(UTF_8N);
6144 output_bom_f = TRUE;
6145 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6149 if ('1'== cp[0] && '6'==cp[1]) {
6152 } else if ('3'== cp[0] && '2'==cp[1]) {
6156 output_encoding = nkf_enc_from_index(UTF_8);
6161 output_endian = ENDIAN_LITTLE;
6162 } else if (cp[0] == 'B') {
6165 output_encoding = nkf_enc_from_index(enc_idx);
6170 enc_idx = enc_idx == UTF_16
6171 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6172 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6174 output_bom_f = TRUE;
6175 enc_idx = enc_idx == UTF_16
6176 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6177 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6179 output_encoding = nkf_enc_from_index(enc_idx);
6183 #ifdef UTF8_INPUT_ENABLE
6184 case 'W': /* UTF input */
6187 input_encoding = nkf_enc_from_index(UTF_8);
6190 if ('1'== cp[0] && '6'==cp[1]) {
6192 input_endian = ENDIAN_BIG;
6194 } else if ('3'== cp[0] && '2'==cp[1]) {
6196 input_endian = ENDIAN_BIG;
6199 input_encoding = nkf_enc_from_index(UTF_8);
6204 input_endian = ENDIAN_LITTLE;
6205 } else if (cp[0] == 'B') {
6207 input_endian = ENDIAN_BIG;
6209 enc_idx = (enc_idx == UTF_16
6210 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6211 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6212 input_encoding = nkf_enc_from_index(enc_idx);
6216 /* Input code assumption */
6217 case 'J': /* ISO-2022-JP input */
6218 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6220 case 'E': /* EUC-JP input */
6221 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6223 case 'S': /* Shift_JIS input */
6224 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6226 case 'Z': /* Convert X0208 alphabet to ASCII */
6228 bit:0 Convert JIS X 0208 Alphabet to ASCII
6229 bit:1 Convert Kankaku to one space
6230 bit:2 Convert Kankaku to two spaces
6231 bit:3 Convert HTML Entity
6232 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6234 while ('0'<= *cp && *cp <='9') {
6235 alpha_f |= 1 << (*cp++ - '0');
6237 if (alpha_f & ((1 << 2) | (1 << 3))) alpha_f |= 1;
6238 if (!alpha_f) alpha_f = 1;
6240 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6241 x0201_f = FALSE; /* No X0201->X0208 conversion */
6243 ESC-(-I in JIS, EUC, MS Kanji
6244 SI/SO in JIS, EUC, MS Kanji
6245 SS2 in EUC, JIS, not in MS Kanji
6246 MS Kanji (0xa0-0xdf)
6248 ESC-(-I in JIS (0x20-0x5f)
6249 SS2 in EUC (0xa0-0xdf)
6250 0xa0-0xdf in MS Kanji (0xa0-0xdf)
6253 case 'X': /* Convert X0201 kana to X0208 */
6256 case 'F': /* preserve new lines */
6257 fold_preserve_f = TRUE;
6258 case 'f': /* folding -f60 or -f */
6261 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6263 fold_len += *cp++ - '0';
6265 if (!(0<fold_len && fold_len<BUFSIZ))
6266 fold_len = DEFAULT_FOLD;
6270 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6272 fold_margin += *cp++ - '0';
6276 case 'm': /* MIME support */
6277 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6278 if (*cp=='B'||*cp=='Q') {
6279 mime_decode_mode = *cp++;
6280 mimebuf_f = FIXED_MIME;
6281 } else if (*cp=='N') {
6282 mime_f = TRUE; cp++;
6283 } else if (*cp=='S') {
6284 mime_f = STRICT_MIME; cp++;
6285 } else if (*cp=='0') {
6286 mime_decode_f = FALSE;
6287 mime_f = FALSE; cp++;
6289 mime_f = STRICT_MIME;
6292 case 'M': /* MIME output */
6295 mimeout_f = FIXED_MIME; cp++;
6296 } else if (*cp=='Q') {
6298 mimeout_f = FIXED_MIME; cp++;
6303 case 'B': /* Broken JIS support */
6305 bit:1 allow any x on ESC-(-x or ESC-$-x
6306 bit:2 reset to ascii on NL
6308 if ('9'>= *cp && *cp>='0')
6309 broken_f |= 1<<(*cp++ -'0');
6314 case 'O':/* for Output file */
6318 case 'c':/* add cr code */
6321 case 'd':/* delete cr code */
6324 case 'I': /* ISO-2022-JP output */
6327 case 'L': /* line mode */
6328 if (*cp=='u') { /* unix */
6329 eolmode_f = LF; cp++;
6330 } else if (*cp=='m') { /* mac */
6331 eolmode_f = CR; cp++;
6332 } else if (*cp=='w') { /* windows */
6333 eolmode_f = CRLF; cp++;
6334 } else if (*cp=='0') { /* no conversion */
6335 eolmode_f = 0; cp++;
6340 if ('2' <= *cp && *cp <= '9') {
6343 } else if (*cp == '0' || *cp == '1') {
6352 /* for the Perl module, multiple options in one string are allowed */
6353 while(*cp && *cp++!='-');
6356 #if !defined(PERL_XS) && !defined(WIN32DLL)
6357 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6359 /* bogus option but ignored */
6367 #include "nkf32dll.c"
6368 #elif defined(PERL_XS)
6369 #else /* WIN32DLL */
6371 main(int argc, char **argv)
6376 char *outfname = NULL;
6379 #ifdef EASYWIN /*Easy Win */
6380 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6382 #ifdef DEFAULT_CODE_LOCALE
6383 setlocale(LC_CTYPE, "");
6387 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6388 cp = (unsigned char *)*argv;
6393 if (pipe(fds) < 0 || (pid = fork()) < 0){
6404 execvp(argv[1], &argv[1]);
6421 int debug_f_back = debug_f;
6424 int exec_f_back = exec_f;
6427 int x0212_f_back = x0212_f;
6429 int x0213_f_back = x0213_f;
6430 int guess_f_back = guess_f;
6432 guess_f = guess_f_back;
6435 debug_f = debug_f_back;
6438 exec_f = exec_f_back;
6440 x0212_f = x0212_f_back;
6441 x0213_f = x0213_f_back;
6444 if (binmode_f == TRUE)
6445 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6446 if (freopen("","wb",stdout) == NULL)
6453 setbuf(stdout, (char *) NULL);
6455 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6458 if (binmode_f == TRUE)
6459 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6460 if (freopen("","rb",stdin) == NULL) return (-1);
6464 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
6468 kanji_convert(stdin);
6469 if (guess_f) print_guessed_code(NULL);
6473 int is_argument_error = FALSE;
6475 input_codename = NULL;
6478 iconv_for_check = 0;
6480 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6482 is_argument_error = TRUE;
6490 /* reopen file for stdout */
6491 if (file_out_f == TRUE) {
6494 outfname = nkf_xmalloc(strlen(origfname)
6495 + strlen(".nkftmpXXXXXX")
6497 strcpy(outfname, origfname);
6501 for (i = strlen(outfname); i; --i){
6502 if (outfname[i - 1] == '/'
6503 || outfname[i - 1] == '\\'){
6509 strcat(outfname, "ntXXXXXX");
6511 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6512 S_IREAD | S_IWRITE);
6514 strcat(outfname, ".nkftmpXXXXXX");
6515 fd = mkstemp(outfname);
6518 || (fd_backup = dup(fileno(stdout))) < 0
6519 || dup2(fd, fileno(stdout)) < 0
6530 outfname = "nkf.out";
6533 if(freopen(outfname, "w", stdout) == NULL) {
6537 if (binmode_f == TRUE) {
6538 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6539 if (freopen("","wb",stdout) == NULL)
6546 if (binmode_f == TRUE)
6547 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6548 if (freopen("","rb",fin) == NULL)
6553 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6557 char *filename = NULL;
6559 if (nfiles > 1) filename = origfname;
6560 if (guess_f) print_guessed_code(filename);
6566 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6574 if (dup2(fd_backup, fileno(stdout)) < 0){
6577 if (stat(origfname, &sb)) {
6578 fprintf(stderr, "Can't stat %s\n", origfname);
6580 /* restore the permissions */
6581 if (chmod(outfname, sb.st_mode)) {
6582 fprintf(stderr, "Can't set permission %s\n", outfname);
6585 /* restore the timestamp */
6586 if(preserve_time_f){
6587 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6588 tb[0] = tb[1] = sb.st_mtime;
6589 if (utime(outfname, tb)) {
6590 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6593 tb.actime = sb.st_atime;
6594 tb.modtime = sb.st_mtime;
6595 if (utime(outfname, &tb)) {
6596 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6601 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6603 unlink(backup_filename);
6605 if (rename(origfname, backup_filename)) {
6606 perror(backup_filename);
6607 fprintf(stderr, "Can't rename %s to %s\n",
6608 origfname, backup_filename);
6610 nkf_xfree(backup_filename);
6613 if (unlink(origfname)){
6618 if (rename(outfname, origfname)) {
6620 fprintf(stderr, "Can't rename %s to %s\n",
6621 outfname, origfname);
6623 nkf_xfree(outfname);
6628 if (is_argument_error)
6631 #ifdef EASYWIN /*Easy Win */
6632 if (file_out_f == FALSE)
6633 scanf("%d",&end_check);
6636 #else /* for Other OS */
6637 if (file_out_f == TRUE)
6639 #endif /*Easy Win */
6642 #endif /* WIN32DLL */