1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 * Currently, nkf is maintained on SourceForge.
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.157 2007/12/22 08:07:23 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-12-22"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
43 #if defined(DEFAULT_CODE_JIS)
44 #elif defined(DEFAULT_CODE_SJIS)
45 #elif defined(DEFAULT_CODE_EUC)
46 #elif defined(DEFAULT_CODE_UTF8)
48 #define DEFAULT_CODE_JIS 1
51 #ifndef MIME_DECODE_DEFAULT
52 #define MIME_DECODE_DEFAULT STRICT_MIME
55 #define X0201_DEFAULT TRUE
58 #if DEFAULT_NEWLINE == 0x0D0A
59 #define PUT_NEWLINE(func) do {\
63 #define OCONV_NEWLINE(func) do {\
67 #elif DEFAULT_NEWLINE == 0x0D
68 #define PUT_NEWLINE(func) func(0x0D)
69 #define OCONV_NEWLINE(func) func(0, 0x0D)
71 #define DEFAULT_NEWLINE 0x0A
72 #define PUT_NEWLINE(func) func(0x0A)
73 #define OCONV_NEWLINE(func) func(0, 0x0A)
76 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
78 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
94 #if defined(MSDOS) || defined(__OS2__)
97 #if defined(_MSC_VER) || defined(__WATCOMC__)
98 #define mktemp _mktemp
104 #define setbinmode(fp) fsetbin(fp)
105 #elif defined(__DJGPP__)
106 #include <libc/dosio.h>
107 #define setbinmode(fp) djgpp_setbinmode(fp)
108 #else /* Microsoft C, Turbo C */
109 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
112 #define setbinmode(fp)
115 #if defined(__DJGPP__)
116 void djgpp_setbinmode(FILE *fp)
118 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
121 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
122 __file_handle_set(fd, m);
126 #ifdef _IOFBF /* SysV and MSDOS, Windows */
127 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
129 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
132 /*Borland C++ 4.5 EasyWin*/
133 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
142 /* added by satoru@isoternet.org */
144 #include <sys/types.h>
146 #include <sys/stat.h>
147 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
149 #if defined(__WATCOMC__)
150 #include <sys/utime.h>
154 #else /* defined(MSDOS) */
156 #ifdef __BORLANDC__ /* BCC32 */
158 #else /* !defined(__BORLANDC__) */
159 #include <sys/utime.h>
160 #endif /* (__BORLANDC__) */
161 #else /* !defined(__WIN32__) */
162 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
163 #include <sys/utime.h>
164 #elif defined(__TURBOC__) /* BCC */
166 #elif defined(LSI_C) /* LSI C */
167 #endif /* (__WIN32__) */
175 /* state of output_mode and input_mode
186 /* Input Assumption */
191 #define LATIN1_INPUT 6
192 #define UTF8_INPUT 13
193 #define UTF16_INPUT 1015
194 #define UTF32_INPUT 1017
197 #define STRICT_MIME 8
204 #define ENDIAN_BIG 1234
205 #define ENDIAN_LITTLE 4321
206 #define ENDIAN_2143 2143
207 #define ENDIAN_3412 3412
271 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
272 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
273 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
274 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
275 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
276 void j_oconv(nkf_char c2, nkf_char c1);
277 void s_oconv(nkf_char c2, nkf_char c1);
278 void e_oconv(nkf_char c2, nkf_char c1);
279 void w_oconv(nkf_char c2, nkf_char c1);
280 void w_oconv16(nkf_char c2, nkf_char c1);
281 void w_oconv32(nkf_char c2, nkf_char c1);
285 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
286 void (*oconv_func)(nkf_char c2, nkf_char c1);
287 } nkf_native_encoding;
289 nkf_native_encoding NkfEncodingASCII = { "US_ASCII", e_iconv, e_oconv };
290 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
291 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
292 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
293 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
294 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
295 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
300 nkf_native_encoding *based_encoding;
302 nkf_encoding nkf_encoding_table[] = {
303 {ASCII, "ASCII", &NkfEncodingASCII},
304 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
305 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingASCII},
306 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
307 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
308 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
309 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
310 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
311 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
312 {WINDOWS_31J, "WINDOWS-31J", &NkfEncodingShift_JIS},
313 {CP10001, "CP10001", &NkfEncodingShift_JIS},
314 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
315 {CP51932, "CP51932", &NkfEncodingEUC_JP},
316 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
317 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
318 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
319 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
320 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
321 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
322 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
323 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
324 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
325 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
326 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
327 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
328 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
329 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
330 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
331 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
332 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
333 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
334 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
335 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
336 {BINARY, "BINARY", &NkfEncodingASCII},
339 #define NKF_ENCODING_TABLE_SIZE 34
343 } encoding_name_to_id_table[] = {
345 {"ISO-2022-JP", ISO_2022_JP},
346 {"X-ISO2022JP-CP932", CP50220},
347 {"CP50220", CP50220},
348 {"CP50221", CP50221},
349 {"CP50222", CP50222},
350 {"ISO-2022-JP-1", ISO_2022_JP_1},
351 {"ISO-2022-JP-3", ISO_2022_JP_3},
352 {"SHIFT_JIS", SHIFT_JIS},
354 {"WINDOWS-31J", WINDOWS_31J},
355 {"CSWINDOWS31J", WINDOWS_31J},
356 {"CP932", WINDOWS_31J},
357 {"MS932", WINDOWS_31J},
358 {"CP10001", CP10001},
361 {"CP51932", CP51932},
362 {"EUC-JP-MS", EUCJP_MS},
363 {"EUCJP-MS", EUCJP_MS},
364 {"EUCJPMS", EUCJP_MS},
365 {"EUC-JP-ASCII", EUCJP_ASCII},
366 {"EUCJP-ASCII", EUCJP_ASCII},
367 {"SHIFT_JISX0213", SHIFT_JISX0213},
368 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
369 {"EUC-JISX0213", EUC_JISX0213},
370 {"EUC-JIS-2004", EUC_JIS_2004},
373 {"UTF-8-BOM", UTF_8_BOM},
374 {"UTF8-MAC", UTF8_MAC},
375 {"UTF-8-MAC", UTF8_MAC},
377 {"UTF-16BE", UTF_16BE},
378 {"UTF-16BE-BOM", UTF_16BE_BOM},
379 {"UTF-16LE", UTF_16LE},
380 {"UTF-16LE-BOM", UTF_16LE_BOM},
382 {"UTF-32BE", UTF_32BE},
383 {"UTF-32BE-BOM", UTF_32BE_BOM},
384 {"UTF-32LE", UTF_32LE},
385 {"UTF-32LE-BOM", UTF_32LE_BOM},
389 #if defined(DEFAULT_CODE_JIS)
390 #define DEFAULT_ENCODING ISO_2022_JP
391 #elif defined(DEFAULT_CODE_SJIS)
392 #define DEFAULT_ENCODING SHIFT_JIS
393 #elif defined(DEFAULT_CODE_EUC)
394 #define DEFAULT_ENCODING EUC_JP
395 #elif defined(DEFAULT_CODE_UTF8)
396 #define DEFAULT_ENCODING UTF_8
/*
 * ASCII-only character classification and conversion macros.
 *
 * Every macro parameter is parenthesized so that an expression argument
 * (e.g. `a | b`, `x ? y : z`, `n + 1`) is treated as a single operand.
 * The macros still evaluate their argument more than once, so do not pass
 * expressions with side effects (`*p++`, `getc(f)`, ...).
 *
 * SP, TAB, CR, LF, DEL and SS3 are not defined here; they must be provided
 * elsewhere in the file before these macros are expanded.
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP <= (c) && (c) <= '~')
#define nkf_isgraph(c)  ('!' <= (c) && (c) <= '~')
/* Value of one hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
                    ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
                    ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/* Upper-case hexadecimal digit for the low 4 bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c) & 15])
/* True when the second-lowest byte of c2 (after truncation to unsigned short) equals SS3. */
#define is_eucg3(c2) ((((unsigned short)(c2)) >> 8) == SS3)
/* True for CR, LF, and for characters strictly between SP and DEL other
   than the ones listed below ('?', '=', '_', '(', ')', '.', '"'). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END 0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END 0xEE
/* True when c2 lies in the inclusive range CP932_TABLE_BEGIN..CP932_TABLE_END. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
429 #define HOLD_SIZE 1024
430 #if defined(INT_IS_SHORT)
431 #define IOBUF_SIZE 2048
433 #define IOBUF_SIZE 16384
436 #define DEFAULT_J 'B'
437 #define DEFAULT_R 'B'
439 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
440 #define SJ6394 0x0161 /* 63 - 94 ku offset */
442 #define RANGE_NUM_MAX 18
447 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
448 #define sizeof_euc_to_utf8_1byte 94
449 #define sizeof_euc_to_utf8_2bytes 94
450 #define sizeof_utf8_to_euc_C2 64
451 #define sizeof_utf8_to_euc_E5B8 64
452 #define sizeof_utf8_to_euc_2bytes 112
453 #define sizeof_utf8_to_euc_3bytes 16
456 /* MIME preprocessor */
458 #ifdef EASYWIN /*Easy Win */
459 extern POINT _BufferSize;
468 void (*status_func)(struct input_code *, nkf_char);
469 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
473 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
474 static nkf_encoding *output_encoding;
\r
476 #if !defined(PERL_XS) && !defined(WIN32DLL)
477 static nkf_char noconvert(FILE *f);
479 static void module_connection(void);
480 static nkf_char kanji_convert(FILE *f);
481 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
482 static nkf_char push_hold_buf(nkf_char c2);
483 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
484 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
485 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
487 * 0: Shift_JIS, eucJP-ascii
492 #define UCS_MAP_ASCII 0
494 #define UCS_MAP_CP932 2
495 #define UCS_MAP_CP10001 3
496 static int ms_ucs_map_f = UCS_MAP_ASCII;
498 #ifdef UTF8_INPUT_ENABLE
499 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
500 static int no_cp932ext_f = FALSE;
501 /* ignore ZERO WIDTH NO-BREAK SPACE */
502 static int no_best_fit_chars_f = FALSE;
503 static int input_endian = ENDIAN_BIG;
504 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
505 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
506 static void encode_fallback_html(nkf_char c);
507 static void encode_fallback_xml(nkf_char c);
508 static void encode_fallback_java(nkf_char c);
509 static void encode_fallback_perl(nkf_char c);
510 static void encode_fallback_subchar(nkf_char c);
511 static void (*encode_fallback)(nkf_char c) = NULL;
512 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
513 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
514 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
515 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
516 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
517 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
518 static void w_status(struct input_code *, nkf_char);
520 #ifdef UTF8_OUTPUT_ENABLE
521 static int output_bom_f = FALSE;
522 static int output_endian = ENDIAN_BIG;
523 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
525 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
526 static void fold_conv(nkf_char c2,nkf_char c1);
527 static void nl_conv(nkf_char c2,nkf_char c1);
528 static void z_conv(nkf_char c2,nkf_char c1);
529 static void rot_conv(nkf_char c2,nkf_char c1);
530 static void hira_conv(nkf_char c2,nkf_char c1);
531 static void base64_conv(nkf_char c2,nkf_char c1);
532 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
533 static void no_connection(nkf_char c2,nkf_char c1);
534 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
536 static void code_score(struct input_code *ptr);
537 static void code_status(nkf_char c);
539 static void std_putc(nkf_char c);
540 static nkf_char std_getc(FILE *f);
541 static nkf_char std_ungetc(nkf_char c,FILE *f);
543 static nkf_char broken_getc(FILE *f);
544 static nkf_char broken_ungetc(nkf_char c,FILE *f);
546 static nkf_char mime_begin(FILE *f);
547 static nkf_char mime_getc(FILE *f);
548 static nkf_char mime_ungetc(nkf_char c,FILE *f);
550 static void switch_mime_getc(void);
551 static void unswitch_mime_getc(void);
552 static nkf_char mime_begin_strict(FILE *f);
553 static nkf_char mime_getc_buf(FILE *f);
554 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
555 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
557 static nkf_char base64decode(nkf_char c);
558 static void mime_prechar(nkf_char c2, nkf_char c1);
559 static void mime_putc(nkf_char c);
560 static void open_mime(nkf_char c);
561 static void close_mime(void);
562 static void eof_mime(void);
563 static void mimeout_addchar(nkf_char c);
565 static void usage(void);
566 static void version(void);
567 static void show_configuration(void);
569 static void options(unsigned char *c);
570 static void reinit(void);
574 #if !defined(PERL_XS) && !defined(WIN32DLL)
575 static unsigned char stdibuf[IOBUF_SIZE];
576 static unsigned char stdobuf[IOBUF_SIZE];
578 static unsigned char hold_buf[HOLD_SIZE*2];
579 static int hold_count = 0;
581 /* MIME preprocessor fifo */
/* Fifo(n) addresses mime_buf with the index n wrapped by MIME_BUF_MASK.
   Masking is equivalent to "n modulo MIME_BUF_SIZE" only while
   MIME_BUF_SIZE stays a power of two, so keep it one when resizing.
   mime_top, mime_last and mime_input are unsigned indices that all start
   at 0. */
583 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
584 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
585 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
586 static unsigned char mime_buf[MIME_BUF_SIZE];
587 static unsigned int mime_top = 0;
588 static unsigned int mime_last = 0; /* decoded */
589 static unsigned int mime_input = 0; /* undecoded */
590 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* File-scope mode flags with their initial values. */
593 static int unbuf_f = FALSE;
594 static int estab_f = FALSE;
595 static int nop_f = FALSE;
596 static int binmode_f = TRUE; /* binary mode */
597 static int rot_f = FALSE; /* rot14/43 mode */
598 static int hira_f = FALSE; /* hira/kata henkan (hiragana/katakana conversion) */
599 static int input_f = FALSE; /* non fixed input code */
600 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
601 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
602 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
603 static int mimebuf_f = FALSE; /* MIME buffered input */
604 static int broken_f = FALSE; /* convert ESC-less broken JIS */
605 static int iso8859_f = FALSE; /* ISO8859 through */
606 static int mimeout_f = FALSE; /* base64 mode */
607 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
608 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
610 #ifdef UNICODE_NORMALIZATION
611 static int nfc_f = FALSE;
612 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
613 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
614 static nkf_char nfc_getc(FILE *f);
615 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
619 static int cap_f = FALSE;
620 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
621 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
622 static nkf_char cap_getc(FILE *f);
623 static nkf_char cap_ungetc(nkf_char c,FILE *f);
625 static int url_f = FALSE;
626 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
627 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
628 static nkf_char url_getc(FILE *f);
629 static nkf_char url_ungetc(nkf_char c,FILE *f);
632 #if defined(INT_IS_SHORT)
633 #define NKF_INT32_C(n) (n##L)
635 #define NKF_INT32_C(n) (n)
/*
 * Constants and predicates for codes that carry a class tag in the top
 * 8 bits (CLASS_MASK) and a value in the low 24 bits (VALUE_MASK).
 * NKF_INT32_C() must already be defined when these are expanded.
 *
 * The predicate arguments are parenthesized so that an expression such as
 * `a | b` is masked as a whole; the argument is evaluated exactly once.
 */
#define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
#define CLASS_MASK NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE NKF_INT32_C(0x01000000)
#define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
/* True when the class bits of c equal CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the value bits of c do not exceed 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
645 #ifdef NUMCHAR_OPTION
646 static int numchar_f = FALSE;
647 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
648 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
649 static nkf_char numchar_getc(FILE *f);
650 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
654 static int noout_f = FALSE;
655 static void no_putc(nkf_char c);
656 static int debug_f = FALSE;
657 static void debug(const char *str);
658 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
661 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
663 static void print_guessed_code(char *filename);
665 static void set_input_codename(char *codename);
668 static int exec_f = 0;
671 #ifdef SHIFTJIS_CP932
672 /* invert IBM extended characters to others */
673 static int cp51932_f = FALSE;
675 /* invert NEC-selected IBM extended characters to IBM extended characters */
676 static int cp932inv_f = TRUE;
678 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
679 #endif /* SHIFTJIS_CP932 */
682 static int x0212_f = FALSE;
683 static nkf_char x0212_shift(nkf_char c);
684 static nkf_char x0212_unshift(nkf_char c);
686 static int x0213_f = FALSE;
688 static unsigned char prefix_table[256];
690 static void set_code_score(struct input_code *ptr, nkf_char score);
691 static void clr_code_score(struct input_code *ptr, nkf_char score);
692 static void status_disable(struct input_code *ptr);
693 static void status_push_ch(struct input_code *ptr, nkf_char c);
694 static void status_clear(struct input_code *ptr);
695 static void status_reset(struct input_code *ptr);
696 static void status_reinit(struct input_code *ptr);
697 static void status_check(struct input_code *ptr, nkf_char c);
698 static void e_status(struct input_code *, nkf_char);
699 static void s_status(struct input_code *, nkf_char);
701 struct input_code input_code_list[] = {
702 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
703 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
704 #ifdef UTF8_INPUT_ENABLE
705 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
706 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
707 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
712 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
713 static int base64_count = 0;
715 /* X0208 -> ASCII converter */
718 static int f_line = 0; /* chars in line */
719 static int f_prev = 0;
720 static int fold_preserve_f = FALSE; /* preserve new lines */
721 static int fold_f = FALSE;
722 static int fold_len = 0;
725 static unsigned char kanji_intro = DEFAULT_J;
726 static unsigned char ascii_intro = DEFAULT_R;
730 #define FOLD_MARGIN 10
731 #define DEFAULT_FOLD 60
733 static int fold_margin = FOLD_MARGIN;
737 #ifdef DEFAULT_CODE_JIS
738 # define DEFAULT_CONV j_oconv
740 #ifdef DEFAULT_CODE_SJIS
741 # define DEFAULT_CONV s_oconv
743 #ifdef DEFAULT_CODE_EUC
744 # define DEFAULT_CONV e_oconv
746 #ifdef DEFAULT_CODE_UTF8
747 # define DEFAULT_CONV w_oconv
750 /* process default */
751 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
753 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
754 /* s_iconv or oconv */
755 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
757 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
758 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
759 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
760 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
761 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
762 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
763 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
765 /* static redirections */
767 static void (*o_putc)(nkf_char c) = std_putc;
769 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
770 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
772 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
773 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
775 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
777 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
778 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
780 /* for strict mime */
781 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
782 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
785 static int output_mode = ASCII, /* output kanji mode */
786 input_mode = ASCII, /* input kanji mode */
787 shift_mode = FALSE; /* TRUE shift out, or X0201 */
788 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
790 /* X0201 / X0208 conversion tables */
792 /* X0201 kana conversion table */
794 static const unsigned char cv[]= {
795 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
796 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
797 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
798 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
799 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
800 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
801 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
802 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
803 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
804 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
805 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
806 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
807 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
808 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
809 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
810 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
814 /* X0201 kana conversion table for dakuten */
816 static const unsigned char dv[]= {
817 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
818 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
819 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
820 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
821 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
822 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
823 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
824 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
825 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
826 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
827 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
828 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
829 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
830 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
831 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
832 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
835 /* X0201 kana conversion table for han-dakuten */
837 static const unsigned char ev[]= {
838 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
839 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
840 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
841 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
842 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
843 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
844 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
845 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
846 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
847 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
848 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
849 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
850 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
851 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
852 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
853 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
857 /* X0208 kigou conversion table */
858 /* 0x8140 - 0x819e */
859 static const unsigned char fv[] = {
861 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
862 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
863 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
864 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
865 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
866 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
867 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
868 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
869 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
870 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
871 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
872 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
877 static int file_out_f = FALSE;
879 static int overwrite_f = FALSE;
880 static int preserve_time_f = FALSE;
881 static int backup_f = FALSE;
882 static char *backup_suffix = "";
883 static char *get_backup_filename(const char *suffix, const char *filename);
886 static int nlmode_f = 0; /* CR, LF, CRLF */
887 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
888 static nkf_char prev_cr = 0; /* CR or 0 */
889 #ifdef EASYWIN /*Easy Win */
890 static int end_check;
893 #define STD_GC_BUFSIZE (256)
894 nkf_char std_gc_buf[STD_GC_BUFSIZE];
897 char* nkf_strcpy(const char *str)
899 char* result = malloc(strlen(str) + 1);
908 static void nkf_str_upcase(const char *str, char *res, size_t length)
911 for (; i < length && str[i]; i++) {
912 res[i] = nkf_toupper(str[i]);
917 static nkf_encoding *nkf_enc_from_index(int idx)
919 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
922 return &nkf_encoding_table[idx];
925 static int nkf_enc_find_index(const char *name)
928 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
929 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
930 return encoding_name_to_id_table[i].id;
936 static nkf_encoding *nkf_enc_find(const char *name)
939 idx = nkf_enc_find_index(name);
940 if (idx < 0) return 0;
941 return nkf_enc_from_index(idx);
944 #define nkf_enc_name(enc) (enc)->name
\r
945 #define nkf_enc_to_index(enc) (enc)->id
946 #define nkf_enc_to_base_encoding(enc) (enc)->based_encoding
949 #include "nkf32dll.c"
950 #elif defined(PERL_XS)
952 int main(int argc, char **argv)
957 char *outfname = NULL;
960 #ifdef EASYWIN /*Easy Win */
961 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
964 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
965 cp = (unsigned char *)*argv;
969 int debug_f_back = debug_f;
972 int exec_f_back = exec_f;
975 int x0212_f_back = x0212_f;
977 int x0213_f_back = x0213_f;
978 int guess_f_back = guess_f;
980 guess_f = guess_f_back;
983 debug_f = debug_f_back;
986 exec_f = exec_f_back;
989 x0212_f = x0212_f_back;
991 x0213_f = x0213_f_back;
996 if (pipe(fds) < 0 || (pid = fork()) < 0){
1007 execvp(argv[1], &argv[1]);
1022 if (binmode_f == TRUE)
1023 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1024 if (freopen("","wb",stdout) == NULL)
1031 setbuf(stdout, (char *) NULL);
1033 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
1036 if (binmode_f == TRUE)
1037 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1038 if (freopen("","rb",stdin) == NULL) return (-1);
1042 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
1046 kanji_convert(stdin);
1047 if (guess_f) print_guessed_code(NULL);
1051 int is_argument_error = FALSE;
1053 input_codename = NULL;
1056 iconv_for_check = 0;
1058 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
1061 is_argument_error = TRUE;
1069 /* reopen file for stdout */
1070 if (file_out_f == TRUE) {
1073 outfname = malloc(strlen(origfname)
1074 + strlen(".nkftmpXXXXXX")
1080 strcpy(outfname, origfname);
1084 for (i = strlen(outfname); i; --i){
1085 if (outfname[i - 1] == '/'
1086 || outfname[i - 1] == '\\'){
1092 strcat(outfname, "ntXXXXXX");
1094 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
1095 S_IREAD | S_IWRITE);
1097 strcat(outfname, ".nkftmpXXXXXX");
1098 fd = mkstemp(outfname);
1101 || (fd_backup = dup(fileno(stdout))) < 0
1102 || dup2(fd, fileno(stdout)) < 0
1113 outfname = "nkf.out";
1116 if(freopen(outfname, "w", stdout) == NULL) {
1120 if (binmode_f == TRUE) {
1121 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1122 if (freopen("","wb",stdout) == NULL)
1129 if (binmode_f == TRUE)
1130 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1131 if (freopen("","rb",fin) == NULL)
1136 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
1140 char *filename = NULL;
1142 if (nfiles > 1) filename = origfname;
1143 if (guess_f) print_guessed_code(filename);
1149 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1157 if (dup2(fd_backup, fileno(stdout)) < 0){
1160 if (stat(origfname, &sb)) {
1161 fprintf(stderr, "Can't stat %s\n", origfname);
1163 /* restore the permissions */
1164 if (chmod(outfname, sb.st_mode)) {
1165 fprintf(stderr, "Can't set permission %s\n", outfname);
1168 /* restore the timestamp */
1169 if(preserve_time_f){
1170 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1171 tb[0] = tb[1] = sb.st_mtime;
1172 if (utime(outfname, tb)) {
1173 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1176 tb.actime = sb.st_atime;
1177 tb.modtime = sb.st_mtime;
1178 if (utime(outfname, &tb)) {
1179 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1184 char *backup_filename = get_backup_filename(backup_suffix, origfname);
1186 unlink(backup_filename);
1188 if (rename(origfname, backup_filename)) {
1189 perror(backup_filename);
1190 fprintf(stderr, "Can't rename %s to %s\n",
1191 origfname, backup_filename);
1195 if (unlink(origfname)){
1200 if (rename(outfname, origfname)) {
1202 fprintf(stderr, "Can't rename %s to %s\n",
1203 outfname, origfname);
1210 if (is_argument_error)
1213 #ifdef EASYWIN /*Easy Win */
1214 if (file_out_f == FALSE)
1215 scanf("%d",&end_check);
1218 #else /* for Other OS */
1219 if (file_out_f == TRUE)
1221 #endif /*Easy Win */
1224 #endif /* WIN32DLL */
/**
 * Build the backup file name for `filename` from the pattern `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied literally
 * (e.g. suffix "old/*~" and filename "name.c" give "old/name.c~").
 * If `suffix` contains no '*', the result is `filename` followed by `suffix`.
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller, or NULL
 * (after reporting through perror) when the allocation fails.
 *
 * Fix: the no-asterisk branch used to call `malloc( + 1)`, i.e. it allocated
 * a single byte and then overflowed it with strcpy/strcat; it also did not
 * check the allocation result.  Both branches now allocate the exact size
 * and check for NULL.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t i, j;

    for (i = 0; suffix[i]; i++) {
	if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
	/* every '*' (one byte of suffix) expands to filename_length bytes;
	 * written so that an empty filename cannot underflow the size */
	backup_filename = malloc(suffix_length - asterisk_count
				 + asterisk_count * filename_length + 1);
	if (!backup_filename) {
	    perror("Can't malloc backup filename.");
	    return NULL;
	}

	for (i = 0, j = 0; suffix[i]; i++) {
	    if (suffix[i] == '*') {
		memcpy(backup_filename + j, filename, filename_length);
		j += filename_length;
	    } else {
		backup_filename[j++] = suffix[i];
	    }
	}
	backup_filename[j] = '\0';
    } else {
	backup_filename = malloc(filename_length + suffix_length + 1);
	if (!backup_filename) {
	    perror("Can't malloc backup filename.");
	    return NULL;
	}
	memcpy(backup_filename, filename, filename_length);
	/* + 1 copies the terminating NUL of suffix as well */
	memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }
    return backup_filename;
}
1267 static const struct {
1291 {"katakana-hiragana","h3"},
1299 #ifdef UTF8_OUTPUT_ENABLE
1309 {"fb-subchar=", ""},
1311 #ifdef UTF8_INPUT_ENABLE
1312 {"utf8-input", "W"},
1313 {"utf16-input", "W16"},
1314 {"no-cp932ext", ""},
1315 {"no-best-fit-chars",""},
1317 #ifdef UNICODE_NORMALIZATION
1318 {"utf8mac-input", ""},
1330 #ifdef NUMCHAR_OPTION
1331 {"numchar-input", ""},
1337 #ifdef SHIFTJIS_CP932
1347 static int option_mode = 0;
1349 void options(unsigned char *cp)
1353 unsigned char *cp_back = NULL;
1359 while(*cp && *cp++!='-');
1360 while (*cp || cp_back) {
1368 case '-': /* literal options */
1369 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1373 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1374 p = (unsigned char *)long_option[i].name;
1375 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1376 if (*p == cp[j] || cp[j] == SP){
1383 fprintf(stderr, "unknown long option: --%s\n", cp);
1386 while(*cp && *cp != SP && cp++);
1387 if (long_option[i].alias[0]){
1389 cp = (unsigned char *)long_option[i].alias;
1391 if (strcmp(long_option[i].name, "ic=") == 0){
1392 nkf_str_upcase(p, codeset, 32);
1393 enc = nkf_enc_find(codeset);
1394 switch (nkf_enc_to_index(enc)) {
1396 input_f = JIS_INPUT;
1401 input_f = JIS_INPUT;
1402 #ifdef SHIFTJIS_CP932
1405 #ifdef UTF8_OUTPUT_ENABLE
1406 ms_ucs_map_f = UCS_MAP_CP932;
1410 input_f = JIS_INPUT;
1416 input_f = JIS_INPUT;
1423 input_f = SJIS_INPUT;
1426 input_f = SJIS_INPUT;
1427 #ifdef SHIFTJIS_CP932
1430 #ifdef UTF8_OUTPUT_ENABLE
1431 ms_ucs_map_f = UCS_MAP_CP932;
1435 input_f = SJIS_INPUT;
1436 #ifdef SHIFTJIS_CP932
1439 #ifdef UTF8_OUTPUT_ENABLE
1440 ms_ucs_map_f = UCS_MAP_CP10001;
1444 input_f = EUC_INPUT;
1447 input_f = EUC_INPUT;
1448 #ifdef SHIFTJIS_CP932
1451 #ifdef UTF8_OUTPUT_ENABLE
1452 ms_ucs_map_f = UCS_MAP_CP932;
1456 input_f = EUC_INPUT;
1457 #ifdef SHIFTJIS_CP932
1460 #ifdef UTF8_OUTPUT_ENABLE
1461 ms_ucs_map_f = UCS_MAP_MS;
1465 input_f = EUC_INPUT;
1466 #ifdef SHIFTJIS_CP932
1469 #ifdef UTF8_OUTPUT_ENABLE
1470 ms_ucs_map_f = UCS_MAP_ASCII;
1473 case SHIFT_JISX0213:
1474 case SHIFT_JIS_2004:
1475 input_f = SJIS_INPUT;
1477 #ifdef SHIFTJIS_CP932
1483 input_f = EUC_INPUT;
1485 #ifdef SHIFTJIS_CP932
1489 #ifdef UTF8_INPUT_ENABLE
1493 input_f = UTF8_INPUT;
1495 #ifdef UNICODE_NORMALIZATION
1497 input_f = UTF8_INPUT;
1504 input_f = UTF16_INPUT;
1505 input_endian = ENDIAN_BIG;
1509 input_f = UTF16_INPUT;
1510 input_endian = ENDIAN_LITTLE;
1515 input_f = UTF32_INPUT;
1516 input_endian = ENDIAN_BIG;
1520 input_f = UTF32_INPUT;
1521 input_endian = ENDIAN_LITTLE;
1525 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1530 if (strcmp(long_option[i].name, "oc=") == 0){
1532 nkf_str_upcase(p, codeset, 32);
1533 output_encoding = nkf_enc_find(codeset);
\r
1534 switch (nkf_enc_to_index(output_encoding)) {
\r
1536 output_conv = j_oconv;
1539 output_conv = j_oconv;
1541 #ifdef SHIFTJIS_CP932
1544 #ifdef UTF8_OUTPUT_ENABLE
1545 ms_ucs_map_f = UCS_MAP_CP932;
1549 output_conv = j_oconv;
1550 #ifdef SHIFTJIS_CP932
1553 #ifdef UTF8_OUTPUT_ENABLE
1554 ms_ucs_map_f = UCS_MAP_CP932;
1558 output_conv = j_oconv;
1562 #ifdef SHIFTJIS_CP932
1567 output_conv = j_oconv;
1572 #ifdef SHIFTJIS_CP932
1577 output_conv = s_oconv;
1580 output_conv = s_oconv;
1581 #ifdef UTF8_OUTPUT_ENABLE
1582 ms_ucs_map_f = UCS_MAP_CP932;
1586 output_conv = s_oconv;
1587 #ifdef UTF8_OUTPUT_ENABLE
1588 ms_ucs_map_f = UCS_MAP_CP10001;
1592 output_conv = e_oconv;
1595 output_conv = e_oconv;
1596 #ifdef SHIFTJIS_CP932
1599 #ifdef UTF8_OUTPUT_ENABLE
1600 ms_ucs_map_f = UCS_MAP_CP932;
1604 output_conv = e_oconv;
1608 #ifdef UTF8_OUTPUT_ENABLE
1609 ms_ucs_map_f = UCS_MAP_MS;
1613 output_conv = e_oconv;
1617 #ifdef UTF8_OUTPUT_ENABLE
1618 ms_ucs_map_f = UCS_MAP_ASCII;
1621 case SHIFT_JISX0213:
1622 case SHIFT_JIS_2004:
1623 output_conv = s_oconv;
1625 #ifdef SHIFTJIS_CP932
1631 output_conv = e_oconv;
1636 #ifdef SHIFTJIS_CP932
1640 #ifdef UTF8_OUTPUT_ENABLE
1643 output_conv = w_oconv;
1646 output_conv = w_oconv;
1647 output_bom_f = TRUE;
1650 output_conv = w_oconv16;
1654 output_conv = w_oconv16;
1655 output_bom_f = TRUE;
1658 output_conv = w_oconv16;
1659 output_endian = ENDIAN_LITTLE;
1662 output_conv = w_oconv16;
1663 output_endian = ENDIAN_LITTLE;
1664 output_bom_f = TRUE;
1668 output_conv = w_oconv32;
1671 output_conv = w_oconv32;
1672 output_bom_f = TRUE;
1675 output_conv = w_oconv32;
1676 output_endian = ENDIAN_LITTLE;
1679 output_conv = w_oconv32;
1680 output_endian = ENDIAN_LITTLE;
1681 output_bom_f = TRUE;
1685 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1690 if (strcmp(long_option[i].name, "guess=") == 0){
1699 if (strcmp(long_option[i].name, "overwrite") == 0){
1702 preserve_time_f = TRUE;
1705 if (strcmp(long_option[i].name, "overwrite=") == 0){
1708 preserve_time_f = TRUE;
1710 backup_suffix = malloc(strlen((char *) p) + 1);
1711 strcpy(backup_suffix, (char *) p);
1714 if (strcmp(long_option[i].name, "in-place") == 0){
1717 preserve_time_f = FALSE;
1720 if (strcmp(long_option[i].name, "in-place=") == 0){
1723 preserve_time_f = FALSE;
1725 backup_suffix = malloc(strlen((char *) p) + 1);
1726 strcpy(backup_suffix, (char *) p);
1731 if (strcmp(long_option[i].name, "cap-input") == 0){
1735 if (strcmp(long_option[i].name, "url-input") == 0){
1740 #ifdef NUMCHAR_OPTION
1741 if (strcmp(long_option[i].name, "numchar-input") == 0){
1747 if (strcmp(long_option[i].name, "no-output") == 0){
1751 if (strcmp(long_option[i].name, "debug") == 0){
1756 if (strcmp(long_option[i].name, "cp932") == 0){
1757 #ifdef SHIFTJIS_CP932
1761 #ifdef UTF8_OUTPUT_ENABLE
1762 ms_ucs_map_f = UCS_MAP_CP932;
1766 if (strcmp(long_option[i].name, "no-cp932") == 0){
1767 #ifdef SHIFTJIS_CP932
1771 #ifdef UTF8_OUTPUT_ENABLE
1772 ms_ucs_map_f = UCS_MAP_ASCII;
1776 #ifdef SHIFTJIS_CP932
1777 if (strcmp(long_option[i].name, "cp932inv") == 0){
1784 if (strcmp(long_option[i].name, "x0212") == 0){
1791 if (strcmp(long_option[i].name, "exec-in") == 0){
1795 if (strcmp(long_option[i].name, "exec-out") == 0){
1800 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1801 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1802 no_cp932ext_f = TRUE;
1805 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1806 no_best_fit_chars_f = TRUE;
1809 if (strcmp(long_option[i].name, "fb-skip") == 0){
1810 encode_fallback = NULL;
1813 if (strcmp(long_option[i].name, "fb-html") == 0){
1814 encode_fallback = encode_fallback_html;
1817 if (strcmp(long_option[i].name, "fb-xml") == 0){
1818 encode_fallback = encode_fallback_xml;
1821 if (strcmp(long_option[i].name, "fb-java") == 0){
1822 encode_fallback = encode_fallback_java;
1825 if (strcmp(long_option[i].name, "fb-perl") == 0){
1826 encode_fallback = encode_fallback_perl;
1829 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1830 encode_fallback = encode_fallback_subchar;
1833 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1834 encode_fallback = encode_fallback_subchar;
1835 unicode_subchar = 0;
1837 /* decimal number */
1838 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1839 unicode_subchar *= 10;
1840 unicode_subchar += hex2bin(p[i]);
1842 }else if(p[1] == 'x' || p[1] == 'X'){
1843 /* hexadecimal number */
1844 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1845 unicode_subchar <<= 4;
1846 unicode_subchar |= hex2bin(p[i]);
1850 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1851 unicode_subchar *= 8;
1852 unicode_subchar += hex2bin(p[i]);
1855 w16e_conv(unicode_subchar, &i, &j);
1856 unicode_subchar = i<<8 | j;
1860 #ifdef UTF8_OUTPUT_ENABLE
1861 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1862 ms_ucs_map_f = UCS_MAP_MS;
1866 #ifdef UNICODE_NORMALIZATION
1867 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1868 input_f = UTF8_INPUT;
1873 if (strcmp(long_option[i].name, "prefix=") == 0){
1874 if (nkf_isgraph(p[0])){
1875 for (i = 1; nkf_isgraph(p[i]); i++){
1876 prefix_table[p[i]] = p[0];
1883 case 'b': /* buffered mode */
1886 case 'u': /* non bufferd mode */
1889 case 't': /* transparent mode */
1894 } else if (*cp=='2') {
1898 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1906 case 'j': /* JIS output */
1908 output_conv = j_oconv;
1909 output_encoding = nkf_enc_from_index(ISO_2022_JP);
\r
1911 case 'e': /* AT&T EUC output */
1912 output_conv = e_oconv;
1914 output_encoding = nkf_enc_from_index(EUC_JP);
\r
1916 case 's': /* SJIS output */
1917 output_conv = s_oconv;
1918 output_encoding = nkf_enc_from_index(SHIFT_JIS);
\r
1920 case 'l': /* ISO8859 Latin-1 support, no conversion */
1921 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1922 input_f = LATIN1_INPUT;
1924 case 'i': /* Kanji IN ESC-$-@/B */
1925 if (*cp=='@'||*cp=='B')
1926 kanji_intro = *cp++;
1928 case 'o': /* ASCII IN ESC-(-J/B */
1929 if (*cp=='J'||*cp=='B'||*cp=='H')
1930 ascii_intro = *cp++;
1934 bit:1 katakana->hiragana
1935 bit:2 hiragana->katakana
1937 if ('9'>= *cp && *cp>='0')
1938 hira_f |= (*cp++ -'0');
1945 #if defined(MSDOS) || defined(__OS2__)
1952 show_configuration();
1960 #ifdef UTF8_OUTPUT_ENABLE
1961 case 'w': /* UTF-8 output */
1963 output_conv = w_oconv; cp++;
1966 output_encoding = nkf_enc_from_index(UTF_8N);
\r
1968 output_bom_f = TRUE;
1969 output_encoding = nkf_enc_from_index(UTF_8_BOM);
\r
1973 if ('1'== cp[0] && '6'==cp[1]) {
\r
1974 output_conv = w_oconv16; cp+=2;
1976 } else if ('3'== cp[0] && '2'==cp[1]) {
1977 output_conv = w_oconv32; cp+=2;
1980 output_conv = w_oconv;
1981 output_encoding = nkf_enc_from_index(UTF_8);
\r
1986 output_endian = ENDIAN_LITTLE;
1987 } else if (cp[0] == 'B') {
1990 output_encoding = nkf_enc_from_index(enc_idx);
\r
1995 enc_idx = enc_idx == UTF_16
\r
1996 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
1997 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
1999 output_bom_f = TRUE;
2000 enc_idx = enc_idx == UTF_16
\r
2001 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
2002 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
2004 output_encoding = nkf_enc_from_index(enc_idx);
\r
2008 #ifdef UTF8_INPUT_ENABLE
2009 case 'W': /* UTF input */
2012 input_f = UTF8_INPUT;
2014 if ('1'== cp[0] && '6'==cp[1]) {
2016 input_f = UTF16_INPUT;
2017 input_endian = ENDIAN_BIG;
2018 } else if ('3'== cp[0] && '2'==cp[1]) {
2020 input_f = UTF32_INPUT;
2021 input_endian = ENDIAN_BIG;
2023 input_f = UTF8_INPUT;
2028 input_endian = ENDIAN_LITTLE;
2029 } else if (cp[0] == 'B') {
2035 /* Input code assumption */
2036 case 'J': /* JIS input */
2037 input_f = JIS_INPUT;
2039 case 'E': /* AT&T EUC input */
2040 input_f = EUC_INPUT;
2042 case 'S': /* MS Kanji input */
2043 input_f = SJIS_INPUT;
2045 case 'Z': /* Convert X0208 alphabet to asii */
2047 bit:0 Convert JIS X 0208 Alphabet to ASCII
2048 bit:1 Convert Kankaku to one space
2049 bit:2 Convert Kankaku to two spaces
2050 bit:3 Convert HTML Entity
2051 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
2053 while ('0'<= *cp && *cp <='9') {
2054 alpha_f |= 1 << (*cp++ - '0');
2056 if (!alpha_f) alpha_f = 1;
2058 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
2059 x0201_f = FALSE; /* No X0201->X0208 conversion */
2061 ESC-(-I in JIS, EUC, MS Kanji
2062 SI/SO in JIS, EUC, MS Kanji
2063 SSO in EUC, JIS, not in MS Kanji
2064 MS Kanji (0xa0-0xdf)
2066 ESC-(-I in JIS (0x20-0x5f)
2067 SSO in EUC (0xa0-0xdf)
2068 0xa0-0xd in MS Kanji (0xa0-0xdf)
2071 case 'X': /* Convert X0201 kana to X0208 */
2074 case 'F': /* prserve new lines */
2075 fold_preserve_f = TRUE;
2076 case 'f': /* folding -f60 or -f */
2079 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2081 fold_len += *cp++ - '0';
2083 if (!(0<fold_len && fold_len<BUFSIZ))
2084 fold_len = DEFAULT_FOLD;
2088 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
2090 fold_margin += *cp++ - '0';
2094 case 'm': /* MIME support */
2095 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
2096 if (*cp=='B'||*cp=='Q') {
2097 mime_decode_mode = *cp++;
2098 mimebuf_f = FIXED_MIME;
2099 } else if (*cp=='N') {
2100 mime_f = TRUE; cp++;
2101 } else if (*cp=='S') {
2102 mime_f = STRICT_MIME; cp++;
2103 } else if (*cp=='0') {
2104 mime_decode_f = FALSE;
2105 mime_f = FALSE; cp++;
2108 case 'M': /* MIME output */
2111 mimeout_f = FIXED_MIME; cp++;
2112 } else if (*cp=='Q') {
2114 mimeout_f = FIXED_MIME; cp++;
2119 case 'B': /* Broken JIS support */
2121 bit:1 allow any x on ESC-(-x or ESC-$-x
2122 bit:2 reset to ascii on NL
2124 if ('9'>= *cp && *cp>='0')
2125 broken_f |= 1<<(*cp++ -'0');
2130 case 'O':/* for Output file */
2134 case 'c':/* add cr code */
2137 case 'd':/* delete cr code */
2140 case 'I': /* ISO-2022-JP output */
2143 case 'L': /* line mode */
2144 if (*cp=='u') { /* unix */
2145 nlmode_f = LF; cp++;
2146 } else if (*cp=='m') { /* mac */
2147 nlmode_f = CR; cp++;
2148 } else if (*cp=='w') { /* windows */
2149 nlmode_f = CRLF; cp++;
2150 } else if (*cp=='0') { /* no conversion */
2159 } else if (*cp == '0') {
2168 /* module muliple options in a string are allowed for Perl moudle */
2169 while(*cp && *cp++!='-');
2172 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
2173 /* bogus option but ignored */
2179 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2182 struct input_code *p = input_code_list;
2184 if (iconv_func == p->iconv_func){
2193 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2195 #ifdef INPUT_CODE_FIX
2203 #ifdef INPUT_CODE_FIX
2204 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
2210 if (estab_f && iconv_for_check != iconv){
2211 struct input_code *p = find_inputcode_byfunc(iconv);
2213 set_input_codename(p->name);
2216 iconv_for_check = iconv;
/* Penalty flags accumulated in input_code.score while guessing the input
 * encoding.  Each flag is the previous one shifted left by one bit, so a
 * more serious finding always outweighs any combination of milder ones;
 * the candidate with the smallest score is preferred. */
2221 #define SCORE_L2 (1) /* JIS level-2 kanji */
2222 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width katakana */
2223 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
2224 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped through CP932 (IBM extended characters) */
2225 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2226 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* character that does not exist */
2227 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
2228 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* score a candidate starts from (assigned in status_reset) */
2230 #define SCORE_INIT (SCORE_iMIME)
2232 static const char score_table_A0[] = {
2235 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2236 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2239 static const char score_table_F0[] = {
2240 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2241 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2242 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2243 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* Raise the SCORE_* flag bits given in `score` on the candidate `ptr`.
 * NOTE(review): lines between the signature and the assignment are not
 * visible in this excerpt; any guard on `ptr` must be checked there. */
2246 void set_code_score(struct input_code *ptr, nkf_char score)
2249 ptr->score |= score;
/* Clear the SCORE_* flag bits given in `score` on the candidate `ptr`
 * (inverse of set_code_score).
 * NOTE(review): lines between the signature and the assignment are not
 * visible in this excerpt; any guard on `ptr` must be checked there. */
2253 void clr_code_score(struct input_code *ptr, nkf_char score)
2256 ptr->score &= ~score;
/* Classify the character currently held in ptr->buf and add the matching
 * SCORE_* penalty to the candidate through set_code_score().
 * Only the first byte (c2) drives the classification; the second byte (c1)
 * is read only when UTF8_OUTPUT_ENABLE is defined, for the e2w_conv lookup. */
2260 void code_score(struct input_code *ptr)
2262 nkf_char c2 = ptr->buf[0];
2263 #ifdef UTF8_OUTPUT_ENABLE
2264 nkf_char c1 = ptr->buf[1];
/* NOTE(review): the condition guarding SCORE_ERROR is on a line that is
 * not visible in this excerpt. */
2267 set_code_score(ptr, SCORE_ERROR);
2268 }else if (c2 == SSO){
2269 set_code_score(ptr, SCORE_KANA);
2270 }else if (c2 == 0x8f){
2271 set_code_score(ptr, SCORE_X0212);
2272 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv returning 0 is treated as "no such character" */
2273 }else if (!e2w_conv(c2, c1)){
2274 set_code_score(ptr, SCORE_NO_EXIST);
/* rows selected by bits 4-6 of c2; the low nibble indexes the table */
2276 }else if ((c2 & 0x70) == 0x20){
2277 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2278 }else if ((c2 & 0x70) == 0x70){
2279 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
/* 0x70 was handled above, so this matches 0x50 and 0x60 only */
2280 }else if ((c2 & 0x70) >= 0x50){
2281 set_code_score(ptr, SCORE_L2);
2285 void status_disable(struct input_code *ptr)
2290 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2293 void status_push_ch(struct input_code *ptr, nkf_char c)
2295 ptr->buf[ptr->index++] = c;
2298 void status_clear(struct input_code *ptr)
2304 void status_reset(struct input_code *ptr)
2307 ptr->score = SCORE_INIT;
2310 void status_reinit(struct input_code *ptr)
2313 ptr->_file_stat = 0;
2316 void status_check(struct input_code *ptr, nkf_char c)
2318 if (c <= DEL && estab_f){
2323 void s_status(struct input_code *ptr, nkf_char c)
2327 status_check(ptr, c);
2332 #ifdef NUMCHAR_OPTION
2333 }else if (is_unicode_capsule(c)){
2336 }else if (0xa1 <= c && c <= 0xdf){
2337 status_push_ch(ptr, SSO);
2338 status_push_ch(ptr, c);
2341 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2343 status_push_ch(ptr, c);
2344 }else if (0xed <= c && c <= 0xee){
2346 status_push_ch(ptr, c);
2347 #ifdef SHIFTJIS_CP932
2348 }else if (is_ibmext_in_sjis(c)){
2350 status_push_ch(ptr, c);
2351 #endif /* SHIFTJIS_CP932 */
2353 }else if (0xf0 <= c && c <= 0xfc){
2355 status_push_ch(ptr, c);
2356 #endif /* X0212_ENABLE */
2358 status_disable(ptr);
2362 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2363 status_push_ch(ptr, c);
2364 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2368 status_disable(ptr);
2372 #ifdef SHIFTJIS_CP932
2373 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2374 status_push_ch(ptr, c);
2375 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2376 set_code_score(ptr, SCORE_CP932);
2381 #endif /* SHIFTJIS_CP932 */
2382 status_disable(ptr);
2385 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2386 status_push_ch(ptr, c);
2387 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2388 set_code_score(ptr, SCORE_CP932);
2391 status_disable(ptr);
2397 void e_status(struct input_code *ptr, nkf_char c)
2401 status_check(ptr, c);
2406 #ifdef NUMCHAR_OPTION
2407 }else if (is_unicode_capsule(c)){
2410 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2412 status_push_ch(ptr, c);
2414 }else if (0x8f == c){
2416 status_push_ch(ptr, c);
2417 #endif /* X0212_ENABLE */
2419 status_disable(ptr);
2423 if (0xa1 <= c && c <= 0xfe){
2424 status_push_ch(ptr, c);
2428 status_disable(ptr);
2433 if (0xa1 <= c && c <= 0xfe){
2435 status_push_ch(ptr, c);
2437 status_disable(ptr);
2439 #endif /* X0212_ENABLE */
2443 #ifdef UTF8_INPUT_ENABLE
2444 void w_status(struct input_code *ptr, nkf_char c)
2448 status_check(ptr, c);
2453 #ifdef NUMCHAR_OPTION
2454 }else if (is_unicode_capsule(c)){
2457 }else if (0xc0 <= c && c <= 0xdf){
2459 status_push_ch(ptr, c);
2460 }else if (0xe0 <= c && c <= 0xef){
2462 status_push_ch(ptr, c);
2463 }else if (0xf0 <= c && c <= 0xf4){
2465 status_push_ch(ptr, c);
2467 status_disable(ptr);
2472 if (0x80 <= c && c <= 0xbf){
2473 status_push_ch(ptr, c);
2474 if (ptr->index > ptr->stat){
2475 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2476 && ptr->buf[2] == 0xbf);
2477 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2478 &ptr->buf[0], &ptr->buf[1]);
2485 status_disable(ptr);
2489 if (0x80 <= c && c <= 0xbf){
2490 if (ptr->index < ptr->stat){
2491 status_push_ch(ptr, c);
2496 status_disable(ptr);
2503 void code_status(nkf_char c)
2505 int action_flag = 1;
2506 struct input_code *result = 0;
2507 struct input_code *p = input_code_list;
2509 if (!p->status_func) {
2513 if (!p->status_func)
2515 (p->status_func)(p, c);
2518 }else if(p->stat == 0){
2529 if (result && !estab_f){
2530 set_iconv(TRUE, result->iconv_func);
2531 }else if (c <= DEL){
2532 struct input_code *ptr = input_code_list;
2542 nkf_char std_getc(FILE *f)
2545 return std_gc_buf[--std_gc_ndx];
2551 nkf_char std_ungetc(nkf_char c, FILE *f)
2553 if (std_gc_ndx == STD_GC_BUFSIZE){
2556 std_gc_buf[std_gc_ndx++] = c;
2561 void std_putc(nkf_char c)
2568 #if !defined(PERL_XS) && !defined(WIN32DLL)
2569 nkf_char noconvert(FILE *f)
2574 module_connection();
2575 while ((c = (*i_getc)(f)) != EOF)
/* Wire up the conversion pipeline from the current option flags.
 *
 * Output side: starting from output_conv, each enabled filter saves the
 * current `oconv` in its own o_* variable and installs itself as `oconv`,
 * so filters installed later run earlier.
 * Input side: each enabled reader saves the current i_getc / i_ungetc in
 * its own i_* variables and installs itself in their place.
 * Finally the input converter is selected with set_iconv() according to
 * input_f.
 * NOTE(review): several guarding conditions and closing braces are on
 * lines not visible in this excerpt. */
2582 void module_connection(void)
2584 oconv = output_conv;
2587 /* replace continuation module, from output side */
2589 /* output redirection */
2591 if (noout_f || guess_f){
2598 if (mimeout_f == TRUE) {
2599 o_base64conv = oconv; oconv = base64_conv;
2601 /* base64_count = 0; */
2604 if (nlmode_f || guess_f) {
2605 o_nlconv = oconv; oconv = nl_conv;
2608 o_rot_conv = oconv; oconv = rot_conv;
2611 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2614 o_hira_conv = oconv; oconv = hira_conv;
2617 o_fconv = oconv; oconv = fold_conv;
2620 if (alpha_f || x0201_f) {
2621 o_zconv = oconv; oconv = z_conv;
2625 i_ungetc = std_ungetc;
2626 /* input redirection */
2629 i_cgetc = i_getc; i_getc = cap_getc;
2630 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2633 i_ugetc = i_getc; i_getc = url_getc;
2634 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2637 #ifdef NUMCHAR_OPTION
2639 i_ngetc = i_getc; i_getc = numchar_getc;
2640 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2643 #ifdef UNICODE_NORMALIZATION
2644 if (nfc_f && input_f == UTF8_INPUT){
2645 i_nfc_getc = i_getc; i_getc = nfc_getc;
2646 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2649 if (mime_f && mimebuf_f==FIXED_MIME) {
2650 i_mgetc = i_getc; i_getc = mime_getc;
2651 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2654 i_bgetc = i_getc; i_getc = broken_getc;
2655 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* An explicitly requested input encoding is forced with -TRUE; JIS, EUC
 * and Latin-1 input all go through e_iconv. */
2657 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2658 set_iconv(-TRUE, e_iconv);
2659 } else if (input_f == SJIS_INPUT) {
2660 set_iconv(-TRUE, s_iconv);
2661 #ifdef UTF8_INPUT_ENABLE
2662 } else if (input_f == UTF8_INPUT) {
2663 set_iconv(-TRUE, w_iconv);
2664 } else if (input_f == UTF16_INPUT) {
2665 set_iconv(-TRUE, w_iconv16);
2666 } else if (input_f == UTF32_INPUT) {
2667 set_iconv(-TRUE, w_iconv32);
/* no forced encoding: start with e_iconv without establishing it */
2670 set_iconv(FALSE, e_iconv);
2674 struct input_code *p = input_code_list;
2682 * Check and Ignore BOM
/* Look for a Unicode byte order mark at the start of stream `f`.
 *
 * Up to four bytes are read through i_getc.  The byte patterns tested are
 *   00 00 FE FF  -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, ENDIAN_2143
 *   EF BB BF     -> w_iconv (UTF-8)
 *   FE FF 00 00  -> w_iconv32, ENDIAN_3412
 *   FE FF        -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, ENDIAN_LITTLE
 * When a pattern does not apply, the bytes read so far are handed back
 * through i_ungetc in reverse order so the stream is left untouched.
 *
 * NOTE(review): the `case` labels, the guards around the set_iconv() calls
 * and the early returns are on lines not visible in this excerpt; the first
 * byte of each arm is inferred from the value pushed back last in that arm. */
2684 void check_bom(FILE *f)
2687 switch(c2 = (*i_getc)(f)){
/* first byte 0x00: UTF-32 big-endian or the 2143 byte order */
2689 if((c2 = (*i_getc)(f)) == 0x00){
2690 if((c2 = (*i_getc)(f)) == 0xFE){
2691 if((c2 = (*i_getc)(f)) == 0xFF){
2693 set_iconv(TRUE, w_iconv32);
2695 if (iconv == w_iconv32) {
2696 input_endian = ENDIAN_BIG;
2699 (*i_ungetc)(0xFF,f);
2700 }else (*i_ungetc)(c2,f);
2701 (*i_ungetc)(0xFE,f);
2702 }else if(c2 == 0xFF){
2703 if((c2 = (*i_getc)(f)) == 0xFE){
2705 set_iconv(TRUE, w_iconv32);
2707 if (iconv == w_iconv32) {
2708 input_endian = ENDIAN_2143;
/* NOTE(review): the byte just matched on this path is 0xFE, but 0xFF is
 * pushed back, so 00 00 FF FE would be replayed as 00 00 FF FF when
 * w_iconv32 is not selected.  Looks like this should be 0xFE -- confirm. */
2711 (*i_ungetc)(0xFF,f);
2712 }else (*i_ungetc)(c2,f);
2713 (*i_ungetc)(0xFF,f);
2714 }else (*i_ungetc)(c2,f);
2715 (*i_ungetc)(0x00,f);
2716 }else (*i_ungetc)(c2,f);
2717 (*i_ungetc)(0x00,f);
/* first byte 0xEF: UTF-8 signature EF BB BF */
2720 if((c2 = (*i_getc)(f)) == 0xBB){
2721 if((c2 = (*i_getc)(f)) == 0xBF){
2723 set_iconv(TRUE, w_iconv);
2725 if (iconv == w_iconv) {
2728 (*i_ungetc)(0xBF,f);
2729 }else (*i_ungetc)(c2,f);
2730 (*i_ungetc)(0xBB,f);
2731 }else (*i_ungetc)(c2,f);
2732 (*i_ungetc)(0xEF,f);
/* first byte 0xFE: FE FF 00 00 (3412 order) is tried before UTF-16 BE */
2735 if((c2 = (*i_getc)(f)) == 0xFF){
2736 if((c2 = (*i_getc)(f)) == 0x00){
2737 if((c2 = (*i_getc)(f)) == 0x00){
2739 set_iconv(TRUE, w_iconv32);
2741 if (iconv == w_iconv32) {
2742 input_endian = ENDIAN_3412;
2745 (*i_ungetc)(0x00,f);
2746 }else (*i_ungetc)(c2,f);
2747 (*i_ungetc)(0x00,f);
2748 }else (*i_ungetc)(c2,f);
2750 set_iconv(TRUE, w_iconv16);
2752 if (iconv == w_iconv16) {
2753 input_endian = ENDIAN_BIG;
2756 (*i_ungetc)(0xFF,f);
2757 }else (*i_ungetc)(c2,f);
2758 (*i_ungetc)(0xFE,f);
/* first byte 0xFF: FF FE 00 00 (UTF-32 LE) is tried before UTF-16 LE */
2761 if((c2 = (*i_getc)(f)) == 0xFE){
2762 if((c2 = (*i_getc)(f)) == 0x00){
2763 if((c2 = (*i_getc)(f)) == 0x00){
2765 set_iconv(TRUE, w_iconv32);
2767 if (iconv == w_iconv32) {
2768 input_endian = ENDIAN_LITTLE;
2771 (*i_ungetc)(0x00,f);
2772 }else (*i_ungetc)(c2,f);
2773 (*i_ungetc)(0x00,f);
2774 }else (*i_ungetc)(c2,f);
2776 set_iconv(TRUE, w_iconv16);
2778 if (iconv == w_iconv16) {
2779 input_endian = ENDIAN_LITTLE;
2782 (*i_ungetc)(0xFE,f);
2783 }else (*i_ungetc)(c2,f);
2784 (*i_ungetc)(0xFF,f);
2793 Conversion main loop. Code detection only.
2796 nkf_char kanji_convert(FILE *f)
2798 nkf_char c3, c2=0, c1, c0=0;
2799 int is_8bit = FALSE;
2801 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2802 #ifdef UTF8_INPUT_ENABLE
2803 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2810 output_mode = ASCII;
2813 #define NEXT continue /* no output, get next */
2814 #define SEND ; /* output c1 and c2, get next */
2815 #define LAST break /* end of loop, go closing */
2817 module_connection();
2820 while ((c1 = (*i_getc)(f)) != EOF) {
2821 #ifdef INPUT_CODE_FIX
2827 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2828 /* in case of 8th bit is on */
2829 if (!estab_f&&!mime_decode_mode) {
2830 /* in case of not established yet */
2831 /* It is still ambiguious */
2832 if (h_conv(f, c2, c1)==EOF)
2838 /* in case of already established */
2840 /* ignore bogus code and not CP5022x UCD */
2848 /* second byte, 7 bit code */
2849 /* it might be kanji shitfted */
2850 if ((c1 == DEL) || (c1 <= SP)) {
2851 /* ignore bogus first code */
2858 #ifdef UTF8_INPUT_ENABLE
2859 if (iconv == w_iconv16) {
2860 if (input_endian == ENDIAN_BIG) {
2862 if ((c1 = (*i_getc)(f)) != EOF) {
2863 if (0xD8 <= c2 && c2 <= 0xDB) {
2864 if ((c0 = (*i_getc)(f)) != EOF) {
2866 if ((c3 = (*i_getc)(f)) != EOF) {
2873 if ((c2 = (*i_getc)(f)) != EOF) {
2874 if (0xD8 <= c2 && c2 <= 0xDB) {
2875 if ((c3 = (*i_getc)(f)) != EOF) {
2876 if ((c0 = (*i_getc)(f)) != EOF) {
2885 } else if(iconv == w_iconv32){
2887 if((c2 = (*i_getc)(f)) != EOF &&
2888 (c1 = (*i_getc)(f)) != EOF &&
2889 (c0 = (*i_getc)(f)) != EOF){
2890 switch(input_endian){
2892 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2895 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2898 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2901 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2911 #ifdef NUMCHAR_OPTION
2912 if (is_unicode_capsule(c1)){
2916 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2918 if (!estab_f && !iso8859_f) {
2919 /* not established yet */
2922 } else { /* estab_f==TRUE */
2927 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2928 /* SJIS X0201 Case... */
2929 if (iso2022jp_f && !x0201_f) {
2930 (*oconv)(GETA1, GETA2);
2937 } else if (c1==SSO && iconv != s_iconv) {
2938 /* EUC X0201 Case */
2939 c1 = (*i_getc)(f); /* skip SSO */
2941 if (SSP<=c1 && c1<0xe0) {
2942 if (iso2022jp_f && !x0201_f) {
2943 (*oconv)(GETA1, GETA2);
2950 } else { /* bogus code, skip SSO and one byte */
2953 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2954 (c1 == 0xFD || c1 == 0xFE)) {
2960 /* already established */
2965 } else if ((c1 > SP) && (c1 != DEL)) {
2966 /* in case of Roman characters */
2968 /* output 1 shifted byte */
2972 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2973 /* output 1 shifted byte */
2974 if (iso2022jp_f && !x0201_f) {
2975 (*oconv)(GETA1, GETA2);
2982 /* look like bogus code */
2985 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
2986 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
2987 /* in case of Kanji shifted */
2990 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2991 /* Check MIME code */
2992 if ((c1 = (*i_getc)(f)) == EOF) {
2995 } else if (c1 == '?') {
2996 /* =? is mime conversion start sequence */
2997 if(mime_f == STRICT_MIME) {
2998 /* check in real detail */
2999 if (mime_begin_strict(f) == EOF)
3003 } else if (mime_begin(f) == EOF)
3013 /* normal ASCII code */
3016 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
3019 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
3022 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
3023 if ((c1 = (*i_getc)(f)) == EOF) {
3024 /* (*oconv)(0, ESC); don't send bogus code */
3026 } else if (c1 == '$') {
3027 if ((c1 = (*i_getc)(f)) == EOF) {
3029 (*oconv)(0, ESC); don't send bogus code
3030 (*oconv)(0, '$'); */
3032 } else if (c1 == '@'|| c1 == 'B') {
3033 /* This is kanji introduction */
3034 input_mode = JIS_X_0208;
3036 set_input_codename("ISO-2022-JP");
3038 debug("ISO-2022-JP");
3041 } else if (c1 == '(') {
3042 if ((c1 = (*i_getc)(f)) == EOF) {
3043 /* don't send bogus code
3049 } else if (c1 == '@'|| c1 == 'B') {
3050 /* This is kanji introduction */
3051 input_mode = JIS_X_0208;
3055 } else if (c1 == 'D'){
3056 input_mode = JIS_X_0212;
3059 #endif /* X0212_ENABLE */
3060 } else if (c1 == 0x4F){
3061 input_mode = JIS_X_0213_1;
3064 } else if (c1 == 0x50){
3065 input_mode = JIS_X_0213_2;
3069 /* could be some special code */
3076 } else if (broken_f&0x2) {
3077 /* accept any ESC-(-x as broken code ... */
3078 input_mode = JIS_X_0208;
3087 } else if (c1 == '(') {
3088 if ((c1 = (*i_getc)(f)) == EOF) {
3089 /* don't send bogus code
3091 (*oconv)(0, '('); */
3095 /* This is X0201 kana introduction */
3096 input_mode = JIS_X_0201; shift_mode = JIS_X_0201;
3098 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
3099 /* This is X0208 kanji introduction */
3100 input_mode = ASCII; shift_mode = FALSE;
3102 } else if (broken_f&0x2) {
3103 input_mode = ASCII; shift_mode = FALSE;
3108 /* maintain various input_mode here */
3112 } else if ( c1 == 'N' || c1 == 'n'){
3114 c3 = (*i_getc)(f); /* skip SS2 */
3115 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
3130 } else if (c1 == ESC && iconv == s_iconv) {
3131 /* ESC in Shift_JIS */
3132 if ((c1 = (*i_getc)(f)) == EOF) {
3133 /* (*oconv)(0, ESC); don't send bogus code */
3135 } else if (c1 == '$') {
3137 if ((c1 = (*i_getc)(f)) == EOF) {
3139 (*oconv)(0, ESC); don't send bogus code
3140 (*oconv)(0, '$'); */
3143 if (('E' <= c1 && c1 <= 'G') ||
3144 ('O' <= c1 && c1 <= 'Q')) {
3152 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
3153 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
3154 while ((c1 = (*i_getc)(f)) != EOF) {
3155 if (SP <= c1 && c1 <= 'z') {
3156 (*oconv)(0, c1 + c0);
3157 } else break; /* c1 == SO */
3161 if (c1 == EOF) LAST;
3168 } else if (c1 == LF || c1 == CR) {
3170 input_mode = ASCII; set_iconv(FALSE, 0);
3172 } else if (mime_decode_f && !mime_decode_mode){
3174 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
3182 } else { /* if (c1 == CR)*/
3183 if ((c1=(*i_getc)(f))!=EOF) {
3187 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
3201 } else if (c1 == DEL && input_mode == JIS_X_0208) {
3211 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
3214 if ((c0 = (*i_getc)(f)) != EOF) {
3217 if ((c3 = (*i_getc)(f)) != EOF) {
3219 (*iconv)(c2, c1, c0|c3);
3224 /* 3 bytes EUC or UTF-8 */
3225 if ((c0 = (*i_getc)(f)) != EOF) {
3227 (*iconv)(c2, c1, c0);
3235 0x7F <= c2 && c2 <= 0x92 &&
3236 0x21 <= c1 && c1 <= 0x7E) {
3238 if(c1 == 0x7F) return 0;
3239 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
3242 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
3246 (*oconv)(PREFIX_EUCG3 | c2, c1);
3248 #endif /* X0212_ENABLE */
3250 (*oconv)(PREFIX_EUCG3 | c2, c1);
3253 (*oconv)(input_mode, c1); /* other special case */
3259 /* goto next_word */
3263 (*iconv)(EOF, 0, 0);
3264 if (!input_codename)
3267 struct input_code *p = input_code_list;
3268 struct input_code *result = p;
3270 if (p->score < result->score) result = p;
3273 set_input_codename(result->name);
3275 debug(result->name);
/*
 * h_conv: hold-and-convert for input whose encoding is not yet decided.
 * Bytes read through i_getc are queued with push_hold_buf() until EOF,
 * the buffer fills, or estab_f becomes set.  The candidate in
 * input_code_list with a status_func and the lowest score is presumably
 * chosen as `result` and installed with set_iconv().  The queued bytes are
 * then replayed through the selected iconv routine, pulling extra trailing
 * bytes from the hold buffer first and from i_getc once it is exhausted.
 */
3283 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3285 nkf_char ret, c3, c0;
3289 /** it must NOT be in the kanji shift sequence */
3290 /** it must NOT be written in JIS7 */
3291 /** and it must be after 2 byte 8bit code */
3297 while ((c1 = (*i_getc)(f)) != EOF) {
3303 if (push_hold_buf(c1) == EOF || estab_f){
3309 struct input_code *p = input_code_list;
3310 struct input_code *result = p;
3315 if (p->status_func && p->score < result->score){
3320 set_iconv(TRUE, result->iconv_func);
3325 ** 1) EOF is detected, or
3326 ** 2) Code is established, or
3327 ** 3) Buffer is FULL (but last word is pushed)
3329 ** in 1) and 3) cases, we continue to use
3330 ** Kanji codes by oconv and leave estab_f unchanged.
3335 while (hold_index < hold_count){
3336 c2 = hold_buf[hold_index++];
3338 #ifdef NUMCHAR_OPTION
3339 || is_unicode_capsule(c2)
3344 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3345 (*iconv)(JIS_X_0201, c2, 0);
3348 if (hold_index < hold_count){
3349 c1 = hold_buf[hold_index++];
3359 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3362 if (hold_index < hold_count){
3363 c0 = hold_buf[hold_index++];
3364 } else if ((c0 = (*i_getc)(f)) == EOF) {
3370 if (hold_index < hold_count){
3371 c3 = hold_buf[hold_index++];
3372 } else if ((c3 = (*i_getc)(f)) == EOF) {
3377 (*iconv)(c2, c1, c0|c3);
3382 /* 3 bytes EUC or UTF-8 */
3383 if (hold_index < hold_count){
3384 c0 = hold_buf[hold_index++];
3385 } else if ((c0 = (*i_getc)(f)) == EOF) {
3391 (*iconv)(c2, c1, c0);
3394 if (c0 == EOF) break;
/*
 * push_hold_buf: append one byte (c2, truncated to unsigned char) to
 * hold_buf.  Returns EOF once hold_count has reached HOLD_SIZE*2 after the
 * store, otherwise the new hold_count.  The leading check guards against a
 * buffer that is already full.
 */
3399 nkf_char push_hold_buf(nkf_char c2)
3401 if (hold_count >= HOLD_SIZE*2)
3403 hold_buf[hold_count++] = (unsigned char)c2;
3404 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv: convert a Shift_JIS byte pair (c2 = lead, c1 = trail) to the
 * internal EUC/JIS form.  Steps, in order:
 *   - CP932 extension table lookups (shiftjis_cp932, cp932inv),
 *   - JIS X 0212 lookup for IBM extension lead bytes (shiftjis_x0212),
 *   - JIS X 0213 plane 2 row selection for lead bytes >= 0xF0,
 *   - the arithmetic lead/trail conversion for ordinary characters.
 * NOTE(review): the result is presumably stored through p2/p1 and a status
 * code returned -- confirm against the full function.
 */
3407 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3409 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3412 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3413 #ifdef SHIFTJIS_CP932
3414 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3415 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3422 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3423 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3429 #endif /* SHIFTJIS_CP932 */
3431 if (!x0213_f && is_ibmext_in_sjis(c2)){
3432 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3435 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3448 if(x0213_f && c2 >= 0xF0){
3449 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3450 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3451 }else{ /* 78<=k<=94 */
3452 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3453 if (0x9E < c1) c2++;
/* ordinary double-byte character: a trail byte above 0x9E selects the second row of the pair */
3456 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3457 if (0x9E < c1) c2++;
3460 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3467 c2 = x0212_unshift(c2);
/*
 * s_iconv: input converter for Shift_JIS.  JIS_X_0201, EOF, zero and
 * control lead bytes are special-cased.  When x0213_f is off, lead bytes
 * 0xF0..0xF9 (user-defined area, 188 cells per lead byte) are mapped to
 * Unicode values starting at 0xE000 and tagged with CLASS_UNICODE.  Other
 * pairs go through s2e_conv(); a nonzero result from it is returned as is.
 */
3474 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3476 if (c2 == JIS_X_0201) {
3478 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3480 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3482 if(c1 == 0x7F) return 0;
/* trail bytes skip 0x7F, hence the (0x7E < c1) correction */
3483 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3486 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3487 if (ret) return ret;
/*
 * e_iconv: input converter for EUC-JP.  Handles JIS_X_0201, the 0x8f
 * three-byte prefix, the SSO prefix, and EOF/zero/control lead bytes as
 * special cases.  eucJP-ms user-defined rows 0xF5..0xFE are mapped to the
 * Unicode private use area (base 0xE3AC for the 0x8f set, 0xE000 for the
 * two-byte set).  With SHIFTJIS_CP932, some codes are round-tripped through
 * e2s_conv()/s2e_conv().
 */
3493 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3495 if (c2 == JIS_X_0201) {
3498 }else if (c2 == 0x8f){
3502 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3503 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3504 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3507 c2 = (c2 << 8) | (c1 & 0x7f);
3509 #ifdef SHIFTJIS_CP932
3512 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3513 s2e_conv(s2, s1, &c2, &c1);
3520 #endif /* SHIFTJIS_CP932 */
3522 #endif /* X0212_ENABLE */
3523 } else if (c2 == SSO){
3526 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3529 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3530 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3531 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3536 #ifdef SHIFTJIS_CP932
3537 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3539 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3540 s2e_conv(s2, s1, &c2, &c1);
3547 #endif /* SHIFTJIS_CP932 */
3554 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv: convert a UTF-8 sequence (c2, c1, c0) to the internal JIS/EUC
 * pair.  Lead bytes 0xc0..0xef are looked up with unicode_to_jis_common().
 * With NUMCHAR_OPTION, *p1 can instead receive the decoded value from
 * ww16_conv() tagged with CLASS_UNICODE.
 * NOTE(review): that is presumably the fallback when the table lookup
 * fails -- confirm.
 */
3555 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3562 }else if (0xc0 <= c2 && c2 <= 0xef) {
3563 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3564 #ifdef NUMCHAR_OPTION
3567 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * w_iconv: input converter for UTF-8.  c2 is the lead byte, c1 the second
 * byte, and c0 the remaining trail byte(s); for four-byte sequences c0
 * holds two bytes, which is why it is checked against 0xc0c0/0x8080.
 * w_iconv_utf8_1st_byte classifies lead bytes 0xC0..0xFF, and each class
 * applies its own range check to c1.  A c0 of 0 makes the three-byte
 * classes return -1 and the four-byte classes return -2.
 * NOTE(review): -1/-2 presumably ask the caller to supply more bytes and
 * call again -- confirm.
 */
3575 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3578 static const char w_iconv_utf8_1st_byte[] =
3580 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3581 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3582 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3583 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3585 if (c2 < 0 || 0xff < c2) {
3586 }else if (c2 == 0) { /* 0 : 1 byte*/
3588 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3591 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3593 if (c1 < 0x80 || 0xBF < c1) return 0;
3596 if (c0 == 0) return -1;
3597 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3602 if (c0 == 0) return -1;
3603 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3607 if (c0 == 0) return -1;
3608 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3612 if (c0 == 0) return -2;
3613 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3617 if (c0 == 0) return -2;
3618 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3622 if (c0 == 0) return -2;
3623 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3631 if (c2 == 0 || c2 == EOF){
3632 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3633 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3636 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3645 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3646 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3653 }else if (val < 0x800){
3654 *p2 = 0xc0 | (val >> 6);
3655 *p1 = 0x80 | (val & 0x3f);
3657 } else if (val <= NKF_INT32_C(0xFFFF)) {
3658 *p2 = 0xe0 | (val >> 12);
3659 *p1 = 0x80 | ((val >> 6) & 0x3f);
3660 *p0 = 0x80 | (val & 0x3f);
3661 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3662 *p2 = 0xe0 | (val >> 16);
3663 *p1 = 0x80 | ((val >> 12) & 0x3f);
3664 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3673 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: decode a UTF-8 sequence into a Unicode value.  c2 is the lead
 * byte and c1 the second byte.  For four-byte sequences (c2 >= 0xf0), c0
 * carries the third byte in bits 8..15 and the fourth byte in bits 0..7.
 */
3674 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3679 } else if (c2 >= 0xf0){
3680 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3681 val = (c2 & 0x0f) << 18;
3682 val |= (c1 & 0x3f) << 12;
3683 val |= (c0 & 0x3f00) >> 2;
3685 }else if (c2 >= 0xe0){
3686 val = (c2 & 0x0f) << 12;
3687 val |= (c1 & 0x3f) << 6;
3689 }else if (c2 >= 0xc0){
3690 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv: convert a Unicode value to the internal JIS/EUC pair.  The
 * value is first encoded as UTF-8 with w16w_conv() and then looked up with
 * unicode_to_jis_common().  With NUMCHAR_OPTION, *p1 can instead receive
 * the value tagged with CLASS_UNICODE.
 * NOTE(review): that is presumably the fallback when the lookup fails --
 * confirm.
 */
3698 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3700 nkf_char c2, c1, c0;
3707 w16w_conv(val, &c2, &c1, &c0);
3708 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3709 #ifdef NUMCHAR_OPTION
3712 *p1 = CLASS_UNICODE | val;
3721 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: input converter for UTF-16.  c2 and c1 are the high and low
 * bytes of a 16-bit unit.  When c2 is 0xD8..0xDB the unit is a high
 * surrogate and c0 must hold a low surrogate (0xDC00..0xDFFF).  Other units
 * are converted with w16e_conv().
 */
3722 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3725 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3728 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3729 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/* ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000, folded into one constant */
3731 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3733 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3738 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3739 if (ret) return ret;
/*
 * w_iconv32: input converter for UTF-32.  c1 carries the code point.  BMP
 * values are converted with w16e_conv(); other values are tagged with
 * CLASS_UNICODE.  A nonzero conversion result is returned to the caller.
 */
3744 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3748 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3749 } else if (is_unicode_bmp(c1)) {
3750 ret = w16e_conv(c1, &c2, &c1);
3753 c1 = CLASS_UNICODE | c1;
3755 if (ret) return ret;
/*
 * unicode_to_jis_common: map a UTF-8 sequence (c2, c1, c0) to a JIS/EUC
 * pair stored through p2/p1, using the table set chosen by ms_ucs_map_f
 * (CP932, MS, CP10001, or the default).
 *
 * When no_best_fit_chars_f is set, characters flagged in the
 * no_best_fit_chars_table_* arrays (indexed by c1 & 0x3F) and a few
 * hard-coded three-byte sequences are rejected with return value 1.
 * The lookup itself is done by w_iconv_common().  With SHIFTJIS_CP932, a
 * successful G3 result is round-tripped through e2s_conv()/s2e_conv()
 * unless cp932inv_f is set.
 */
3760 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3762 const unsigned short *const *pp;
3763 const unsigned short *const *const *ppp;
3764 static const char no_best_fit_chars_table_C2[] =
3765 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3766 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3767 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3768 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3769 static const char no_best_fit_chars_table_C2_ms[] =
3770 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3771 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3772 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3773 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3774 static const char no_best_fit_chars_table_932_C2[] =
3775 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3776 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3777 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3778 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3779 static const char no_best_fit_chars_table_932_C3[] =
3780 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3781 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3782 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3783 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* two-byte UTF-8 sequences (lead byte below 0xe0) */
3789 }else if(c2 < 0xe0){
3790 if(no_best_fit_chars_f){
3791 if(ms_ucs_map_f == UCS_MAP_CP932){
3794 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3797 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3800 }else if(!cp932inv_f){
3803 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3806 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3809 }else if(ms_ucs_map_f == UCS_MAP_MS){
3810 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3811 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3829 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3830 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3831 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3833 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* three-byte UTF-8 sequences */
3834 }else if(c0 < 0xF0){
3835 if(no_best_fit_chars_f){
3836 if(ms_ucs_map_f == UCS_MAP_CP932){
3837 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3838 }else if(ms_ucs_map_f == UCS_MAP_MS){
3843 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3846 if(c0 == 0x92) return 1;
3851 if(c1 == 0x80 || c0 == 0x9C) return 1;
3854 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3859 if(c0 == 0x94) return 1;
3862 if(c0 == 0xBB) return 1;
3872 if(c0 == 0x95) return 1;
3875 if(c0 == 0xA5) return 1;
3882 if(c0 == 0x8D) return 1;
3885 if(c0 == 0x9E && !cp932inv_f) return 1;
3888 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3896 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3897 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3898 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3900 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3902 #ifdef SHIFTJIS_CP932
3903 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3905 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3906 s2e_conv(s2, s1, p2, p1);
/*
 * w_iconv_common: two-level table lookup shared by the UTF-8 input paths.
 * pp is an array of psize row pointers.  Returns 1 when the table, the
 * selected row, or the entry is missing, or when an index is out of range.
 * When no_cp932ext_f is set, NEC special characters and IBM extended
 * characters are filtered.
 * NOTE(review): index derivation and the final stores through p2/p1 are
 * presumed -- confirm against the full function.
 */
3915 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3918 const unsigned short *p;
3921 if (pp == 0) return 1;
3924 if (c1 < 0 || psize <= c1) return 1;
3926 if (p == 0) return 1;
3929 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3931 if (val == 0) return 1;
3932 if (no_cp932ext_f && (
3933 (val>>8) == 0x2D || /* NEC special characters */
3934 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3942 if (c2 == SO) c2 = JIS_X_0201;
/*
 * nkf_each_char_to_hex: emit the hexadecimal digits of c by calling
 * f(0, digit) for each nibble, using bin2hex() to turn a shifted value
 * into a digit character.
 */
3949 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3956 (*f)(0, bin2hex(c>>shift));
/*
 * encode_fallback_html: fallback output that writes the decimal digits of
 * c through oconv, most significant digit first (0x30 is ASCII '0').  The
 * two leading digits shown here are emitted only when c is large enough to
 * need them.
 * NOTE(review): the name suggests an HTML numeric character reference;
 * the surrounding delimiters are presumed -- confirm.
 */
3966 void encode_fallback_html(nkf_char c)
3971 if(c >= NKF_INT32_C(1000000))
3972 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3973 if(c >= NKF_INT32_C(100000))
3974 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3976 (*oconv)(0, 0x30+(c/10000 )%10);
3978 (*oconv)(0, 0x30+(c/1000 )%10);
3980 (*oconv)(0, 0x30+(c/100 )%10);
3982 (*oconv)(0, 0x30+(c/10 )%10);
3984 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml: fallback output that writes c as hexadecimal digits
 * through oconv via nkf_each_char_to_hex().
 * NOTE(review): the name suggests an XML hexadecimal character reference;
 * the surrounding delimiters are presumed -- confirm.
 */
3989 void encode_fallback_xml(nkf_char c)
3994 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: fallback output that writes c as hexadecimal digits
 * through oconv.  The four low nibbles are always written; the two nibbles
 * above them (bits 16..23) are written for values outside the BMP.
 * NOTE(review): the escape prefix is presumed from the name -- confirm.
 */
3999 void encode_fallback_java(nkf_char c)
4003 if(!is_unicode_bmp(c)){
4007 (*oconv)(0, bin2hex(c>>20));
4008 (*oconv)(0, bin2hex(c>>16));
4012 (*oconv)(0, bin2hex(c>>12));
4013 (*oconv)(0, bin2hex(c>> 8));
4014 (*oconv)(0, bin2hex(c>> 4));
4015 (*oconv)(0, bin2hex(c ));
/*
 * encode_fallback_perl: fallback output that writes c as hexadecimal digits
 * through oconv via nkf_each_char_to_hex().
 * NOTE(review): the escape prefix and suffix are presumed from the name --
 * confirm.
 */
4019 void encode_fallback_perl(nkf_char c)
4024 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: fallback output that ignores the incoming value
 * and writes the configured substitute character unicode_subchar instead,
 * passing its high byte as c2 and its low byte as c1 to oconv.
 */
4029 void encode_fallback_subchar(nkf_char c)
4031 c = unicode_subchar;
4032 (*oconv)((c>>8)&0xFF, c&0xFF);
4037 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: convert an internal JIS/EUC pair (c2, c1) to a Unicode value by
 * table lookup.  JIS_X_0201 uses a one-byte table, G3 (JIS X 0212) codes
 * use x0212_to_utf8_2bytes, and other codes use the euc_to_utf8_2bytes
 * variant selected by ms_ucs_map_f.  Row and cell indices are obtained by
 * masking to 7 bits and subtracting 0x21, and are range-checked before use.
 */
4038 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
4040 const unsigned short *p;
4042 if (c2 == JIS_X_0201) {
4043 if (ms_ucs_map_f == UCS_MAP_CP10001) {
4051 p = euc_to_utf8_1byte;
4053 } else if (is_eucg3(c2)){
4054 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
4057 c2 = (c2&0x7f) - 0x21;
4058 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
4059 p = x0212_to_utf8_2bytes[c2];
4065 c2 = (c2&0x7f) - 0x21;
4066 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
4068 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
4069 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
4070 euc_to_utf8_2bytes_ms[c2];
4075 c1 = (c1 & 0x7f) - 0x21;
4076 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv: output converter for UTF-8.  With NUMCHAR_OPTION, a value
 * carried in c1 as a Unicode capsule is written directly as a two-, three-
 * or four-byte UTF-8 sequence.  ISO_8859_1 input is written as c1 with the
 * high bit set.  Other characters are converted with e2w_conv() and encoded
 * with w16w_conv().
 */
4081 void w_oconv(nkf_char c2, nkf_char c1)
4087 output_bom_f = FALSE;
4098 #ifdef NUMCHAR_OPTION
4099 if (c2 == 0 && is_unicode_capsule(c1)){
4100 val = c1 & VALUE_MASK;
4103 }else if (val < 0x800){
4104 (*o_putc)(0xC0 | (val >> 6));
4105 (*o_putc)(0x80 | (val & 0x3f));
4106 } else if (val <= NKF_INT32_C(0xFFFF)) {
4107 (*o_putc)(0xE0 | (val >> 12));
4108 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
4109 (*o_putc)(0x80 | (val & 0x3f));
4110 } else if (val <= NKF_INT32_C(0x10FFFF)) {
4111 (*o_putc)(0xF0 | ( val>>18));
4112 (*o_putc)(0x80 | ((val>>12) & 0x3f));
4113 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
4114 (*o_putc)(0x80 | ( val & 0x3f));
4121 output_mode = ASCII;
4123 } else if (c2 == ISO_8859_1) {
4124 output_mode = UTF_8;
4125 (*o_putc)(c1 | 0x080);
4127 output_mode = UTF_8;
4128 val = e2w_conv(c2, c1);
4130 w16w_conv(val, &c2, &c1, &c0);
4134 if (c0) (*o_putc)(c0);
/*
 * w_oconv16: output converter for UTF-16, in the byte order given by
 * output_endian.  Clears output_bom_f and writes a 0xFF byte in both
 * endian branches (presumably part of the byte-order mark -- confirm).
 * With NUMCHAR_OPTION, capsule values outside the BMP and not above
 * UNICODE_MAX are written as a surrogate pair.  Other characters are
 * converted with e2w_conv().
 */
4140 void w_oconv16(nkf_char c2, nkf_char c1)
4143 output_bom_f = FALSE;
4144 if (output_endian == ENDIAN_LITTLE){
4145 (*o_putc)((unsigned char)'\377');
4149 (*o_putc)((unsigned char)'\377');
4158 if (c2 == ISO_8859_1) {
4161 #ifdef NUMCHAR_OPTION
4162 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4163 if (is_unicode_bmp(c1)) {
4164 c2 = (c1 >> 8) & 0xff;
4168 if (c1 <= UNICODE_MAX) {
4169 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
4170 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
4171 if (output_endian == ENDIAN_LITTLE){
4172 (*o_putc)(c2 & 0xff);
4173 (*o_putc)((c2 >> 8) & 0xff);
4174 (*o_putc)(c1 & 0xff);
4175 (*o_putc)((c1 >> 8) & 0xff);
4177 (*o_putc)((c2 >> 8) & 0xff);
4178 (*o_putc)(c2 & 0xff);
4179 (*o_putc)((c1 >> 8) & 0xff);
4180 (*o_putc)(c1 & 0xff);
4187 nkf_char val = e2w_conv(c2, c1);
4188 c2 = (val >> 8) & 0xff;
4192 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output converter for UTF-32, in the byte order given by
 * output_endian.  Clears output_bom_f and writes a 0xFF byte in both
 * endian branches (presumably part of the byte-order mark -- confirm).
 * Characters that are neither ISO_8859_1 nor a Unicode capsule are
 * converted with e2w_conv(); the value in c1 is then written one byte at a
 * time.
 */
4201 void w_oconv32(nkf_char c2, nkf_char c1)
4204 output_bom_f = FALSE;
4205 if (output_endian == ENDIAN_LITTLE){
4206 (*o_putc)((unsigned char)'\377');
4214 (*o_putc)((unsigned char)'\377');
4223 if (c2 == ISO_8859_1) {
4225 #ifdef NUMCHAR_OPTION
4226 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4230 c1 = e2w_conv(c2, c1);
4233 if (output_endian == ENDIAN_LITTLE){
4234 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
4235 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4236 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4240 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4241 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4242 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * e_oconv: output converter for EUC-JP.  With NUMCHAR_OPTION, a Unicode
 * capsule is first converted with w16e_conv().  If it is still a capsule
 * afterwards, values 0xE000..0xE757 are mapped to user-defined rows when
 * x0212_f is set, and otherwise the encode_fallback hook is invoked if it
 * is installed.  JIS_X_0201 is written with an SSO prefix, ISO_8859_1 as a
 * single high-bit byte, G3 codes with their own handling, and ordinary
 * two-byte characters as (c2 | 0x80, c1 | 0x80).  Pairs that are not both
 * graphic characters are dropped after set_iconv(FALSE, 0).
 */
4247 void e_oconv(nkf_char c2, nkf_char c1)
4249 #ifdef NUMCHAR_OPTION
4250 if (c2 == 0 && is_unicode_capsule(c1)){
4251 w16e_conv(c1, &c2, &c1);
4252 if (c2 == 0 && is_unicode_capsule(c1)){
4253 c2 = c1 & VALUE_MASK;
4254 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4258 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4259 c1 = 0x21 + c1 % 94;
4262 (*o_putc)((c2 & 0x7f) | 0x080);
4263 (*o_putc)(c1 | 0x080);
4265 (*o_putc)((c2 & 0x7f) | 0x080);
4266 (*o_putc)(c1 | 0x080);
4270 if (encode_fallback) (*encode_fallback)(c1);
4279 } else if (c2 == 0) {
4280 output_mode = ASCII;
4282 } else if (c2 == JIS_X_0201) {
4283 output_mode = EUC_JP;
4284 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4285 } else if (c2 == ISO_8859_1) {
4286 output_mode = ISO_8859_1;
4287 (*o_putc)(c1 | 0x080);
4289 } else if (is_eucg3(c2)){
4290 output_mode = EUC_JP;
4291 #ifdef SHIFTJIS_CP932
4294 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4295 s2e_conv(s2, s1, &c2, &c1);
4300 output_mode = ASCII;
4302 }else if (is_eucg3(c2)){
4305 (*o_putc)((c2 & 0x7f) | 0x080);
4306 (*o_putc)(c1 | 0x080);
4309 (*o_putc)((c2 & 0x7f) | 0x080);
4310 (*o_putc)(c1 | 0x080);
4314 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4315 set_iconv(FALSE, 0);
4316 return; /* too late to rescue this char */
4318 output_mode = EUC_JP;
4319 (*o_putc)(c2 | 0x080);
4320 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: relocate a row number in 0x75..0x7f by a fixed offset.  Two
 * targets are used, with bases 0x109 and 0x113.
 * NOTE(review): which base applies presumably depends on whether the input
 * carries the G3 prefix -- confirm.
 */
4325 nkf_char x0212_shift(nkf_char c)
4330 if (0x75 <= c && c <= 0x7f){
4331 ret = c + (0x109 - 0x75);
4334 if (0x75 <= c && c <= 0x7f){
4335 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map a shifted row number back to the 0x75.. range.
 * Inputs 0x7f..0x88 map to 0x75.. directly.  Inputs 0x89..0x92 map to the
 * same range but with PREFIX_EUCG3 | 0x80 set.
 */
4342 nkf_char x0212_unshift(nkf_char c)
4345 if (0x7f <= c && c <= 0x88){
4346 ret = c + (0x75 - 0x7f);
4347 }else if (0x89 <= c && c <= 0x92){
4348 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4352 #endif /* X0212_ENABLE */
/*
 * e2s_conv: convert an internal JIS/EUC pair (c2, c1) to Shift_JIS bytes,
 * stored through p2 (lead) and p1 (trail) when those pointers are non-null.
 * Rows 0x21..0x2F and 0x6E..0x7E (by `ndx`) are computed arithmetically,
 * other graphic rows use the x0212_shiftjis table, and ordinary codes use
 * the standard JIS-to-Shift_JIS arithmetic.  Returns 1 when c2 is above
 * 0x7F at the final step.
 */
4354 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4360 if((0x21 <= ndx && ndx <= 0x2F)){
4361 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4362 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4364 }else if(0x6E <= ndx && ndx <= 0x7E){
4365 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4366 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4372 else if(nkf_isgraph(ndx)){
4374 const unsigned short *ptr;
4375 ptr = x0212_shiftjis[ndx - 0x21];
4377 val = ptr[(c1 & 0x7f) - 0x21];
4386 c2 = x0212_shift(c2);
4388 #endif /* X0212_ENABLE */
4390 if(0x7F < c2) return 1;
/* odd rows use trail offset 0x1f/0x20, even rows use 0x7e */
4391 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4392 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s_oconv: output converter for Shift_JIS.  With NUMCHAR_OPTION, a Unicode
 * capsule is first converted with w16e_conv().  If it is still a capsule,
 * values 0xE000..0xE757 are mapped to lead bytes from 0xF0 (188 cells per
 * lead byte) when x0213_f is off, and otherwise the encode_fallback hook is
 * invoked if it is installed.  Ordinary characters are converted with
 * e2s_conv(); pairs that are not both printable are dropped after
 * set_iconv(FALSE, 0).  With SHIFTJIS_CP932, the cp932inv table may remap
 * the result.  A nonzero prefix_table entry for the trail byte is written
 * to the output.
 */
4396 void s_oconv(nkf_char c2, nkf_char c1)
4398 #ifdef NUMCHAR_OPTION
4399 if (c2 == 0 && is_unicode_capsule(c1)){
4400 w16e_conv(c1, &c2, &c1);
4401 if (c2 == 0 && is_unicode_capsule(c1)){
4402 c2 = c1 & VALUE_MASK;
4403 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4406 c2 = c1 / 188 + 0xF0;
4408 c1 += 0x40 + (c1 > 0x3e);
4413 if(encode_fallback)(*encode_fallback)(c1);
4422 } else if (c2 == 0) {
4423 output_mode = ASCII;
4425 } else if (c2 == JIS_X_0201) {
4426 output_mode = SHIFT_JIS;
4428 } else if (c2 == ISO_8859_1) {
4429 output_mode = ISO_8859_1;
4430 (*o_putc)(c1 | 0x080);
4432 } else if (is_eucg3(c2)){
4433 output_mode = SHIFT_JIS;
4434 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4440 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4441 set_iconv(FALSE, 0);
4442 return; /* too late to rescue this char */
4444 output_mode = SHIFT_JIS;
4445 e2s_conv(c2, c1, &c2, &c1);
4447 #ifdef SHIFTJIS_CP932
4449 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4450 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4456 #endif /* SHIFTJIS_CP932 */
4459 if (prefix_table[(unsigned char)c1]){
4460 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * j_oconv: output converter for ISO-2022-JP style (JIS) output.  It tracks
 * the current designation in output_mode and, when the character set of the
 * next character differs, switches output_mode to the new set: ASCII,
 * JIS_X_0201, ISO_8859_1, JIS_X_0212, JIS_X_0213_1, JIS_X_0213_2 or
 * JIS_X_0208.  The ascii_intro and kanji_intro bytes are written as part of
 * those switches.  With NUMCHAR_OPTION, a Unicode capsule is first
 * converted with w16e_conv(); values 0xE000..0xE757 are mapped to rows from
 * 0x7F when ms_ucs_map_f is set, otherwise the encode_fallback hook is
 * invoked if it is installed.
 */
4466 void j_oconv(nkf_char c2, nkf_char c1)
4468 #ifdef NUMCHAR_OPTION
4469 if (c2 == 0 && is_unicode_capsule(c1)){
4470 w16e_conv(c1, &c2, &c1);
4471 if (c2 == 0 && is_unicode_capsule(c1)){
4472 c2 = c1 & VALUE_MASK;
4473 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4476 c2 = 0x7F + c1 / 94;
4477 c1 = 0x21 + c1 % 94;
4479 if (encode_fallback) (*encode_fallback)(c1);
4486 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4489 (*o_putc)(ascii_intro);
4490 output_mode = ASCII;
4494 } else if (is_eucg3(c2)){
4496 if(output_mode!=JIS_X_0213_2){
4497 output_mode = JIS_X_0213_2;
4504 if(output_mode!=JIS_X_0212){
4505 output_mode = JIS_X_0212;
4512 (*o_putc)(c2 & 0x7f);
4515 } else if (c2==JIS_X_0201) {
4516 if (output_mode!=JIS_X_0201) {
4517 output_mode = JIS_X_0201;
4523 } else if (c2==ISO_8859_1) {
4524 /* iso8859 introduction, or 8th bit on */
4525 /* Can we convert in 7bit form using ESC-'-'-A ?
4527 output_mode = ISO_8859_1;
4529 } else if (c2 == 0) {
4530 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4533 (*o_putc)(ascii_intro);
4534 output_mode = ASCII;
4539 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4540 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4542 if (output_mode!=JIS_X_0213_1) {
4543 output_mode = JIS_X_0213_1;
4549 }else if (output_mode != JIS_X_0208) {
4550 output_mode = JIS_X_0208;
4553 (*o_putc)(kanji_intro);
/*
 * base64_conv: filter stage that calls mime_prechar() for the character
 * and then forwards it unchanged to the next stage, o_base64conv.
 */
4560 void base64_conv(nkf_char c2, nkf_char c1)
4562 mime_prechar(c2, c1);
4563 (*o_base64conv)(c2,c1);
/*
 * State for the "broken JIS" input filter:
 *   broken_buf      small push-back stack, read from the top,
 *   broken_counter  number of characters currently on the stack,
 *   broken_last     presumably the previously returned character (it is
 *                   compared with ESC below) -- confirm.
 */
4567 static nkf_char broken_buf[3];
4568 static int broken_counter = 0;
4569 static int broken_last = 0;
/*
 * broken_getc: input filter that recognises kanji-in ("$@", "$B") and
 * kanji-out ("(J", "(B") sequences even when the preceding character was
 * not ESC.  When such a pair is seen in a matching input_mode, both
 * characters are placed on broken_buf.  Characters already on broken_buf
 * are returned first.
 * NOTE(review): the missing ESC is presumably synthesised as the return
 * value -- confirm.
 */
4570 nkf_char broken_getc(FILE *f)
4574 if (broken_counter>0) {
4575 return broken_buf[--broken_counter];
4578 if (c=='$' && broken_last != ESC
4579 && (input_mode==ASCII || input_mode==JIS_X_0201)) {
4582 if (c1=='@'|| c1=='B') {
4583 broken_buf[0]=c1; broken_buf[1]=c;
4590 } else if (c=='(' && broken_last != ESC
4591 && (input_mode==JIS_X_0208 || input_mode==JIS_X_0201)) { /* ) */
4594 if (c1=='J'|| c1=='B') {
4595 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push c back onto broken_buf so that broken_getc() returns
 * it next.  The push is skipped when two characters are already pending.
 * The FILE argument is not used in the statements shown.
 */
4608 nkf_char broken_ungetc(nkf_char c, FILE *f)
4610 if (broken_counter<2)
4611 broken_buf[broken_counter++]=c;
/*
 * nl_conv: newline conversion filter.
 *
 * When guess_f is set it also records the newline convention seen on input
 * in input_newline (LF, CR or CRLF).  input_newline is set to EOF once two
 * different conventions have been observed, which stops further tracking.
 *
 * For output, a pending CR (prev_cr) or an LF triggers emission of the
 * newline selected by nlmode_f: CR unless the mode is LF, and LF unless the
 * mode is CR.  A lone CR is deferred via prev_cr.  Any character other than
 * LF is forwarded to o_nlconv.
 */
4615 void nl_conv(nkf_char c2, nkf_char c1)
4617 if (guess_f && input_newline != EOF) {
4618 if (c2 == 0 && c1 == LF) {
4619 if (!input_newline) input_newline = prev_cr ? CRLF : LF;
4620 else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF;
4621 } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF;
4623 else if (!input_newline) input_newline = CR;
4624 else if (input_newline != CR) input_newline = EOF;
4626 if (prev_cr || (c2 == 0 && c1 == LF)) {
4628 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4629 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4631 if (c2 == 0 && c1 == CR) prev_cr = CR;
4632 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4636 Return value of fold_conv()
4638 LF add newline and output char
4639 CR add newline and output nothing
4642 1 (or else) normal output
4644 fold state in prev (previous character)
4646 >0x80 Japanese (X0208/X0201)
4651 This fold algorithm does not preserve leading spaces in a line.
4652 This is the main difference from fmt.
/* Display width used for folding: 2 columns when c2 is nonzero, else 1. */
4655 #define char_size(c2,c1) (c2?2:1)
/*
 * fold_conv: line-folding filter.  For each character it computes
 * fold_state, which the terminator switch at the end turns into output,
 * calling OCONV_NEWLINE on o_fconv where a line break is needed.  f_line
 * tracks the current column and f_prev the previous character, with bit
 * 0x80 set for Japanese.  Lines are folded once f_line exceeds fold_len.
 * Kinsoku characters, which must not start a line, may overhang by up to
 * fold_margin columns.  fold_preserve_f selects whether input newlines are
 * kept.
 */
4657 void fold_conv(nkf_char c2, nkf_char c1)
4660 nkf_char fold_state;
4662 if (c1== CR && !fold_preserve_f) {
4663 fold_state=0; /* ignore cr */
4664 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4666 fold_state=0; /* ignore cr */
4667 } else if (c1== BS) {
4668 if (f_line>0) f_line--;
4670 } else if (c2==EOF && f_line != 0) { /* close open last line */
4672 } else if ((c1==LF && !fold_preserve_f)
4673 || ((c1==CR||(c1==LF&&f_prev!=CR))
4674 && fold_preserve_f)) {
4676 if (fold_preserve_f) {
4680 } else if ((f_prev == c1 && !fold_preserve_f)
4681 || (f_prev == LF && fold_preserve_f)
4682 ) { /* duplicate newline */
4685 fold_state = LF; /* output two newline */
4691 if (f_prev&0x80) { /* Japanese? */
4693 fold_state = 0; /* ignore given single newline */
4694 } else if (f_prev==SP) {
4698 if (++f_line<=fold_len)
4702 fold_state = CR; /* fold and output nothing */
4706 } else if (c1=='\f') {
4709 fold_state = LF; /* output newline and clear */
4710 } else if ( (c2==0 && c1==SP)||
4711 (c2==0 && c1==TAB)||
4712 (c2=='!'&& c1=='!')) {
4713 /* X0208 kankaku or ascii space */
4715 fold_state = 0; /* remove duplicate spaces */
4718 if (++f_line<=fold_len)
4719 fold_state = SP; /* output ASCII space only */
4721 f_prev = SP; f_line = 0;
4722 fold_state = CR; /* fold and output nothing */
4726 prev0 = f_prev; /* we still need this one... , but almost done */
4728 if (c2 || c2==JIS_X_0201)
4729 f_prev |= 0x80; /* this is Japanese */
4730 f_line += char_size(c2,c1);
4731 if (f_line<=fold_len) { /* normal case */
4734 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4735 f_line = char_size(c2,c1);
4736 fold_state = LF; /* We can't wait, do fold now */
4737 } else if (c2==JIS_X_0201) {
4738 /* simple kinsoku rules return 1 means no folding */
4739 if (c1==(0xde&0x7f)) fold_state = 1; /* halfwidth voiced sound mark */
4740 else if (c1==(0xdf&0x7f)) fold_state = 1; /* halfwidth semi-voiced sound mark */
4741 else if (c1==(0xa4&0x7f)) fold_state = 1; /* halfwidth ideographic comma */
4742 else if (c1==(0xa3&0x7f)) fold_state = 1; /* halfwidth right corner bracket */
4743 else if (c1==(0xa1&0x7f)) fold_state = 1; /* halfwidth ideographic full stop */
4744 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4745 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4747 fold_state = LF;/* add one new f_line before this character */
4750 fold_state = LF;/* add one new f_line before this character */
4753 /* kinsoku point in ASCII */
4754 if ( c1==')'|| /* { [ ( */
4765 /* just after special */
4766 } else if (!is_alnum(prev0)) {
4767 f_line = char_size(c2,c1);
4769 } else if ((prev0==SP) || /* ignored new f_line */
4770 (prev0==LF)|| /* ignored new f_line */
4771 (prev0&0x80)) { /* X0208 - ASCII */
4772 f_line = char_size(c2,c1);
4773 fold_state = LF;/* add one new f_line before this character */
4775 fold_state = 1; /* default no fold in ASCII */
4779 if (c1=='"') fold_state = 1; /* ideographic comma (JIS X 0208 0x2122) */
4780 else if (c1=='#') fold_state = 1; /* ideographic full stop (0x2123) */
4781 else if (c1=='W') fold_state = 1; /* right corner bracket (0x2157) */
4782 else if (c1=='K') fold_state = 1; /* fullwidth right parenthesis (0x214B) */
4783 else if (c1=='$') fold_state = 1; /* fullwidth comma (0x2124) */
4784 else if (c1=='%') fold_state = 1; /* fullwidth full stop (0x2125) */
4785 else if (c1=='\'') fold_state = 1; /* fullwidth colon (0x2127) */
4786 else if (c1=='(') fold_state = 1; /* fullwidth semicolon (0x2128) */
4787 else if (c1==')') fold_state = 1; /* fullwidth question mark (0x2129) */
4788 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark (0x212A) */
4789 else if (c1=='+') fold_state = 1; /* voiced sound mark (0x212B) */
4790 else if (c1==',') fold_state = 1; /* semi-voiced sound mark (0x212C) */
4791 /* default no fold in kinsoku */
4794 f_line = char_size(c2,c1);
4795 /* add one new f_line before this character */
4798 f_line = char_size(c2,c1);
4800 /* add one new f_line before this character */
4805 /* terminator process */
4806 switch(fold_state) {
4808 OCONV_NEWLINE((*o_fconv));
4814 OCONV_NEWLINE((*o_fconv));
/* Character held back by z_conv while waiting for a possible sound mark. */
4825 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: width-conversion filter.  JIS X 0201 katakana is converted to
 * JIS X 0208 using the cv table, or the dv / ev tables when the next
 * character is a voiced / semi-voiced sound mark.  A katakana character
 * that can take such a mark is held in z_prev2 / z_prev1 until the next
 * character arrives.  Depending on alpha_f bits, JIS X 0208 alphabet and
 * symbols are converted to ASCII, some ASCII characters are replaced by
 * entity strings, and JIS X 0208 katakana is converted to JIS X 0201.
 */
4827 void z_conv(nkf_char c2, nkf_char c1)
4830 /* if (c2) c1 &= 0x7f; assertion */
4832 if (c2 == JIS_X_0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4838 if (z_prev2 == JIS_X_0201) {
4839 if (c2 == JIS_X_0201) {
4840 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
4842 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4844 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
4846 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4851 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4853 if (c2 == JIS_X_0201) {
4854 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4855 /* wait for dakuten or handakuten */
4860 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4871 if (alpha_f&1 && c2 == 0x23) {
4872 /* JISX0208 Alphabet */
4874 } else if (c2 == 0x21) {
4875 /* JISX0208 Kigou */
4880 } else if (alpha_f&4) {
4885 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4891 if (alpha_f&8 && c2 == 0) {
4895 case '>': entity = ">"; break;
4896 case '<': entity = "<"; break;
4897 case '\"': entity = """; break;
4898 case '&': entity = "&"; break;
4901 while (*entity) (*o_zconv)(0, *entity++);
4907 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4912 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4916 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4920 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4924 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4928 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4932 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4936 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4940 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4945 (*o_zconv)(JIS_X_0201, c);
4948 } else if (c2 == 0x25) {
4949 /* JISX0208 Katakana */
4950 static const int fullwidth_to_halfwidth[] =
4952 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4953 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4954 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4955 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4956 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4957 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4958 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4959 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4960 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4961 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4962 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4963 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4965 if (fullwidth_to_halfwidth[c1-0x20]){
4966 c2 = fullwidth_to_halfwidth[c1-0x20];
4967 (*o_zconv)(JIS_X_0201, c2>>8);
4969 (*o_zconv)(JIS_X_0201, c2&0xFF);
/*
 * rot13(c): rotate a letter by 13 places.  Letters up to 'M' / 'm' move
 * forward by 13 and the remaining letters up to 'Z' / 'z' move back by 13.
 * The argument is evaluated more than once.
 */
4979 #define rot13(c) ( \
4981 (c <= 'M') ? (c + 13): \
4982 (c <= 'Z') ? (c - 13): \
4984 (c <= 'm') ? (c + 13): \
4985 (c <= 'z') ? (c - 13): \
/*
 * rot47(c): rotate a printable character by 47 places.  Characters up to
 * 'O' move forward by 47 and the remaining characters up to '~' move back
 * by 47.  The argument is evaluated more than once.
 */
4989 #define rot47(c) ( \
4991 ( c <= 'O') ? (c + 47) : \
4992 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv: rotation filter.  Single-byte input (c2 of 0, JIS_X_0201 or
 * ISO_8859_1) is handled in one branch and other input in another.  The
 * result is forwarded to o_rot_conv.
 * NOTE(review): presumably rot13 for single-byte characters and rot47 for
 * double-byte ones -- confirm.
 */
4996 void rot_conv(nkf_char c2, nkf_char c1)
4998 if (c2==0 || c2==JIS_X_0201 || c2==ISO_8859_1) {
5004 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: filter that converts between the JIS X 0208 hiragana row
 * (0x24) and katakana row (0x25) for cells 0x21..0x73, then forwards to
 * o_hira_conv.  Special cases: cell 0x74, which is emitted as U+3094 when
 * the output is UTF-8 or UTF-16, and the row 0x21 pairs 0x33/0x34 and
 * 0x35/0x36.
 * NOTE(review): the direction of each conversion is presumably selected by
 * a flag -- confirm.
 */
5007 void hira_conv(nkf_char c2, nkf_char c1)
5011 if (0x20 < c1 && c1 < 0x74) {
5013 (*o_hira_conv)(c2,c1);
5015 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
5017 c1 = CLASS_UNICODE | 0x3094;
5018 (*o_hira_conv)(c2,c1);
5021 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
5023 (*o_hira_conv)(c2,c1);
5028 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
5031 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
5033 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
5037 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: filter that inspects each character before
 * forwarding it to o_iso2022jp_check_conv.  It tests for 8-bit single-byte
 * values, for rows 0x29..0x2f and 0x75..0x7e, and for membership in the
 * inclusive [start, end] ranges of the `range` table.
 * NOTE(review): matching characters are presumably replaced with a
 * substitute -- confirm.
 */
5041 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
5043 static const nkf_char range[RANGE_NUM_MAX][2] = {
5064 nkf_char start, end, c;
5066 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
5070 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
5075 for (i = 0; i < RANGE_NUM_MAX; i++) {
5076 start = range[i][0];
5079 if (c >= start && c <= end) {
5084 (*o_iso2022jp_check_conv)(c2,c1);
5088 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * Recognised MIME encoded-word prefixes ("\075" is '=').  The tables that
 * follow are parallel to this one and are indexed by the same pattern
 * number.
 */
5090 static const unsigned char *mime_pattern[] = {
5091 (const unsigned char *)"\075?EUC-JP?B?",
5092 (const unsigned char *)"\075?SHIFT_JIS?B?",
5093 (const unsigned char *)"\075?ISO-8859-1?Q?",
5094 (const unsigned char *)"\075?ISO-8859-1?B?",
5095 (const unsigned char *)"\075?ISO-2022-JP?B?",
5096 (const unsigned char *)"\075?ISO-2022-JP?Q?",
5097 #if defined(UTF8_INPUT_ENABLE)
5098 (const unsigned char *)"\075?UTF-8?B?",
5099 (const unsigned char *)"\075?UTF-8?Q?",
5101 (const unsigned char *)"\075?US-ASCII?Q?",
5106 /* Marker used to raise the priority of the corresponding input code */
5107 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
5108 e_iconv, s_iconv, 0, 0, 0, 0,
5109 #if defined(UTF8_INPUT_ENABLE)
/* Character set associated with each pattern. */
5115 static const nkf_char mime_encode[] = {
5116 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201,
5117 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding of each pattern: 'B' or 'Q', as in the pattern text. */
5124 static const nkf_char mime_encode_method[] = {
5125 'B', 'B','Q', 'B', 'B', 'Q',
5126 #if defined(UTF8_INPUT_ENABLE)
/* Size of the recovery buffer used while matching a pattern. */
5134 #define MAXRECOVER 20
/*
 * switch_mime_getc: install the MIME decoding reader.  The current
 * i_getc / i_ungetc are saved in i_mgetc / i_mungetc and replaced with
 * mime_getc / mime_ungetc.  In STRICT_MIME mode a second layer is added:
 * the saved readers move to i_mgetc_buf / i_mungetc_buf and the buffered
 * mime_getc_buf / mime_ungetc_buf take their place.  Does nothing if
 * mime_getc is already installed.
 */
5136 void switch_mime_getc(void)
5138 if (i_getc!=mime_getc) {
5139 i_mgetc = i_getc; i_getc = mime_getc;
5140 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5141 if(mime_f==STRICT_MIME) {
5142 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
5143 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc.  Also reinstalls the converter
 * saved in mime_iconv_back, if there is one, and then clears that pointer.
 */
5148 void unswitch_mime_getc(void)
/* Strict mode: take i_mgetc/i_mungetc back from the buffered slots first. */
5150 if(mime_f==STRICT_MIME) {
5151 i_mgetc = i_mgetc_buf;
5152 i_mungetc = i_mungetc_buf;
5155 i_ungetc = i_mungetc;
5156 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
5157 mime_iconv_back = NULL;
/*
 * mime_begin_strict: strict recognition of an encoded-word prefix.
 * Entered after the two characters =? have been consumed.  Reads further
 * characters from f and compares them, upper-cased, with the entries of
 * mime_pattern[].  On a full match it records the transfer encoding in
 * mime_decode_mode, switches the input converter to the one listed for the
 * pattern, and returns the result of mime_integrity().
 * The failure path (all patterns rejected) is only partly visible here.
 */
5160 nkf_char mime_begin_strict(FILE *f)
5164 const unsigned char *p,*q;
5165 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
5167 mime_decode_mode = FALSE;
5168 /* =? has been checked */
5170 p = mime_pattern[j];
/* Compare until the pattern runs out (first byte not above SP). */
/* Every character read is also stored in r[]. */
5173 for(i=2;p[i]>SP;i++) { /* start at =? */
5174 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
5175 /* pattern fails, try next one */
/* A later pattern qualifies when it agrees with q on positions 2..i-1 */
/* and its character at position i equals the upper-cased c1. */
5177 while (mime_pattern[++j]) {
5178 p = mime_pattern[j];
5179 for(k=2;k<i;k++) /* assume length(p) > i */
5180 if (p[k]!=q[k]) break;
5181 if (k==i && nkf_toupper(c1)==p[k]) break;
5183 p = mime_pattern[j];
5184 if (p) continue; /* found next one, continue */
5185 /* all fails, output from recovery buffer */
/* p[i] is the end of the pattern, so p[i-2] is its B or Q letter. */
5193 mime_decode_mode = p[i-2];
/* Remember the current converter; unswitch_mime_getc() restores it. */
5195 mime_iconv_back = iconv;
5196 set_iconv(FALSE, mime_priority_func[j]);
5197 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
5199 if (mime_decode_mode=='B') {
5200 mimebuf_f = unbuf_f;
5202 /* do MIME integrity check */
5203 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: fetch the next character for the MIME decoder.
 * When mimebuf_f is non-zero the character comes from (*i_mgetc_buf)(f);
 * otherwise it is taken from Fifo at mime_input, which is then advanced.
 */
5211 nkf_char mime_getc_buf(FILE *f)
5213 /* we don't keep eof of Fifo, because it contains ?= as
5214 a terminator. It was checked in mime_integrity. */
5215 return ((mimebuf_f)?
5216 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf: push c back, the counterpart of mime_getc_buf.
 * Two ways are visible: through (*i_mungetc_buf)(c,f), or by stepping
 * mime_input back and storing c in Fifo.  The condition selecting between
 * them is not visible in this excerpt.
 */
5219 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
5222 (*i_mungetc_buf)(c,f);
5224 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict recognition of an encoded-word prefix.
 * Every character read is appended to Fifo so that the input can be
 * replayed when the prefix turns out not to be MIME.  At most MAXRECOVER
 * characters are examined.  Sets mime_decode_mode to B or Q when an
 * encoding letter is found, or to 1 (replay without decoding) otherwise.
 * Returns the last character read, which callers use only to detect EOF.
 */
5228 nkf_char mime_begin(FILE *f)
5233 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
5234 /* re-read and convert again from mime_buffer. */
5236 /* =? has been checked */
5238 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
5239 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
5240 /* We accept any character type even if it is broken by new lines */
5241 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5242 if (c1==LF||c1==SP||c1==CR||
5243 c1=='-'||c1=='_'||is_alnum(c1)) continue;
5245 /* Failed. But this could be another MIME preamble */
5253 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5254 if (!(++i<MAXRECOVER) || c1==EOF) break;
/* The encoding letter is accepted in either case. */
5255 if (c1=='b'||c1=='B') {
5256 mime_decode_mode = 'B';
5257 } else if (c1=='q'||c1=='Q') {
5258 mime_decode_mode = 'Q';
5262 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5263 if (!(++i<MAXRECOVER) || c1==EOF) break;
5265 mime_decode_mode = FALSE;
5271 if (!mime_decode_mode) {
5272 /* false MIME preamble, restart from mime_buffer */
5273 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5274 /* Since we are in MIME mode until buffer becomes empty, */
5275 /* we never go into mime_begin again for a while. */
5278 /* discard mime preamble, and goto MIME mode */
5280 /* do no MIME integrity check */
5281 return c1; /* used only for checking EOF */
/* no_putc: function taking one character and returning nothing; its body is not visible in this excerpt. */
5285 void no_putc(nkf_char c)
/*
 * debug: write str and a newline to stderr.
 * A null str is printed as the text NULL instead of being dereferenced.
 */
5290 void debug(const char *str)
5293 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * set_input_codename: record the name of the detected input encoding.
 * The first call stores codename.  A later call with a different name
 * replaces the stored value with the empty string, which marks conflicting
 * detections.  The pointer is stored as given, not copied.
 */
5298 void set_input_codename(char *codename)
5300 if (!input_codename) {
5301 input_codename = codename;
5302 } else if (strcmp(codename, input_codename) != 0) {
5303 input_codename = "";
/*
 * get_guessed_code: turn input_codename into the name to report and
 * return it.  input_codename itself is updated.
 *   - empty string (conflicting detections)  -> BINARY
 *   - null pointer (nothing detected)        -> ASCII
 *   - Shift_JIS, EUC-JP and ISO-2022-JP may be replaced by a more specific
 *     name, depending on score bits of the input_code entry found for the
 *     current iconv function.
 */
5307 static char* get_guessed_code(void)
5309 if (input_codename && !*input_codename) {
5310 input_codename = "BINARY";
5312 struct input_code *p = find_inputcode_byfunc(iconv);
5313 if (!input_codename) {
5314 input_codename = "ASCII";
5315 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5316 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5317 input_codename = "CP932";
5318 } else if (strcmp(input_codename, "EUC-JP") == 0) {
/* The X0212 score bit is tested before the DEPEND/CP932 bits. */
5319 if (p->score & (SCORE_X0212))
5320 input_codename = "EUCJP-MS";
5321 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5322 input_codename = "CP51932";
5323 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
/* The KANA score bit is tested before the DEPEND/CP932 bits. */
5324 if (p->score & (SCORE_KANA))
5325 input_codename = "CP50221";
5326 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5327 input_codename = "CP50220";
5330 return input_codename;
5333 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code: print the guessed input encoding on stdout.
 * When filename is not null, the name and a colon are printed first.
 * The visible fragments also select a newline-type suffix from
 * input_newline: (CR), (LF), (CRLF), or (MIXED NL) when it equals EOF.
 * Parts of the body are not visible in this excerpt.
 */
5334 void print_guessed_code(char *filename)
5336 if (filename != NULL) printf("%s: ", filename);
/* An empty name marks conflicting detections (see set_input_codename). */
5337 if (input_codename && !*input_codename) {
5340 input_codename = get_guessed_code();
5342 printf("%s\n", input_codename);
5346 input_newline == CR ? " (CR)" :
5347 input_newline == LF ? " (LF)" :
5348 input_newline == CRLF ? " (CRLF)" :
5349 input_newline == EOF ? " (MIXED NL)" :
/*
 * hex_getc: decode an escape made of two hexadecimal digits.
 * ch is the escape character chosen by the caller (cap_getc passes a colon,
 * url_getc a percent sign); g and u are getc-style and ungetc-style
 * function pointers supplied by the caller.
 * When c2 and c3 are both hex digits the result is
 * (hex2bin(c2) << 4) | hex2bin(c3).  What happens when a digit check fails
 * is not visible in this excerpt.
 */
5358 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5360 nkf_char c1, c2, c3;
5366 if (!nkf_isxdigit(c2)){
5371 if (!nkf_isxdigit(c3)){
5376 return (hex2bin(c2) << 4) | hex2bin(c3);
/* cap_getc: read via hex_getc with a colon as the escape character, using i_cgetc and i_cungetc. */
5379 nkf_char cap_getc(FILE *f)
5381 return hex_getc(':', f, i_cgetc, i_cungetc);
/* cap_ungetc: push c back through the i_cungetc function pointer and return its result. */
5384 nkf_char cap_ungetc(nkf_char c, FILE *f)
5386 return (*i_cungetc)(c, f);
/* url_getc: read via hex_getc with a percent sign as the escape character, using i_ugetc and i_uungetc. */
5389 nkf_char url_getc(FILE *f)
5391 return hex_getc('%', f, i_ugetc, i_uungetc);
/* url_ungetc: push c back through the i_uungetc function pointer and return its result. */
5394 nkf_char url_ungetc(nkf_char c, FILE *f)
5396 return (*i_uungetc)(c, f);
5400 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: input filter for numeric character references.
 * g and u alias the i_ngetc and i_nungetc function pointers.
 * Two digit forms are visible: after an x or X, up to 7 characters are
 * checked with nkf_isxdigit; otherwise up to 8 characters are checked with
 * nkf_isdigit.  Digits are accumulated into c, and a recognised value is
 * returned as CLASS_UNICODE | c.
 * The surrounding reads and the failure paths are not visible here.
 */
5401 nkf_char numchar_getc(FILE *f)
5403 nkf_char (*g)(FILE *) = i_ngetc;
5404 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
/* Hexadecimal form. */
5415 if (buf[i] == 'x' || buf[i] == 'X'){
5416 for (j = 0; j < 7; j++){
5418 if (!nkf_isxdigit(buf[i])){
5425 c |= hex2bin(buf[i]);
/* Decimal form. */
5428 for (j = 0; j < 8; j++){
5432 if (!nkf_isdigit(buf[i])){
5439 c += hex2bin(buf[i]);
5445 return CLASS_UNICODE | c;
/* numchar_ungetc: push c back through the i_nungetc function pointer and return its result. */
5454 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5456 return (*i_nungetc)(c, f);
5460 #ifdef UNICODE_NORMALIZATION
5462 /* Normalization Form C */
/*
 * nfc_getc: input filter that looks up the bytes held in buf in
 * normalization_table[] by comparing them with each entry's nfd sequence,
 * and, in the visible fragments, overwrites buf with that entry's nfc
 * sequence.  g and u alias i_nfc_getc and i_nfc_ungetc.
 * How buf is filled and what is returned is not visible in this excerpt.
 */
5463 nkf_char nfc_getc(FILE *f)
5465 nkf_char (*g)(FILE *f) = i_nfc_getc;
5466 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5467 int i=0, j, k=1, lower, upper;
5469 const nkf_nfchar *array;
5472 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
/* Binary search of the table: j is the midpoint of [lower, upper]. */
5473 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5474 while (upper >= lower) {
5475 j = (lower+upper) / 2;
5476 array = normalization_table[j].nfd;
/* The first differing element decides which half to continue in. */
5477 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5478 if (array[k] != buf[k]){
5479 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* Copy the nfc sequence of entry j into buf (it ends at a zero element). */
5486 array = normalization_table[j].nfc;
5487 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5488 buf[i] = (nkf_char)(array[i]);
/* nfc_ungetc: push c back through the i_nfc_ungetc function pointer and return its result. */
5499 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5501 return (*i_nfc_ungetc)(c, f);
5503 #endif /* UNICODE_NORMALIZATION */
5509 nkf_char c1, c2, c3, c4, cc;
5510 nkf_char t1, t2, t3, t4, mode, exit_mode;
5511 nkf_char lwsp_count;
5514 nkf_char lwsp_size = 128;
5516 if (mime_top != mime_last) { /* Something is in FIFO */
5517 return Fifo(mime_top++);
5519 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5520 mime_decode_mode=FALSE;
5521 unswitch_mime_getc();
5522 return (*i_getc)(f);
5525 if (mimebuf_f == FIXED_MIME)
5526 exit_mode = mime_decode_mode;
5529 if (mime_decode_mode == 'Q') {
5530 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5532 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5533 if (c1<=SP || DEL<=c1) {
5534 mime_decode_mode = exit_mode; /* prepare for quit */
5537 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5541 mime_decode_mode = exit_mode; /* prepare for quit */
5542 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5543 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5544 /* end Q encoding */
5545 input_mode = exit_mode;
5547 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5548 if (lwsp_buf==NULL) {
5549 perror("can't malloc");
5552 while ((c1=(*i_getc)(f))!=EOF) {
5557 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5565 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5566 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5581 lwsp_buf[lwsp_count] = (unsigned char)c1;
5582 if (lwsp_count++>lwsp_size){
5584 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5585 if (lwsp_buf_new==NULL) {
5587 perror("can't realloc");
5590 lwsp_buf = lwsp_buf_new;
5596 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5598 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5599 i_ungetc(lwsp_buf[lwsp_count],f);
5605 if (c1=='='&&c2<SP) { /* this is soft wrap */
5606 while((c1 = (*i_mgetc)(f)) <=SP) {
5607 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5609 mime_decode_mode = 'Q'; /* still in MIME */
5610 goto restart_mime_q;
5613 mime_decode_mode = 'Q'; /* still in MIME */
5617 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5618 if (c2<=SP) return c2;
5619 mime_decode_mode = 'Q'; /* still in MIME */
5620 return ((hex2bin(c2)<<4) + hex2bin(c3));
5623 if (mime_decode_mode != 'B') {
5624 mime_decode_mode = FALSE;
5625 return (*i_mgetc)(f);
5629 /* Base64 encoding */
5631 MIME allows line break in the middle of
5632 Base64, but we are very pessimistic in decoding
5633 in unbuf mode because MIME encoded code may broken by
5634 less or editor's control sequence (such as ESC-[-K in unbuffered
5635 mode. ignore incomplete MIME.
5637 mode = mime_decode_mode;
5638 mime_decode_mode = exit_mode; /* prepare for quit */
5640 while ((c1 = (*i_mgetc)(f))<=SP) {
5645 if ((c2 = (*i_mgetc)(f))<=SP) {
5648 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5649 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5652 if ((c1 == '?') && (c2 == '=')) {
5655 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5656 if (lwsp_buf==NULL) {
5657 perror("can't malloc");
5660 while ((c1=(*i_getc)(f))!=EOF) {
5665 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5673 if ((c1=(*i_getc)(f))!=EOF) {
5677 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5692 lwsp_buf[lwsp_count] = (unsigned char)c1;
5693 if (lwsp_count++>lwsp_size){
5695 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5696 if (lwsp_buf_new==NULL) {
5698 perror("can't realloc");
5701 lwsp_buf = lwsp_buf_new;
5707 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5709 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5710 i_ungetc(lwsp_buf[lwsp_count],f);
5717 if ((c3 = (*i_mgetc)(f))<=SP) {
5720 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5721 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5725 if ((c4 = (*i_mgetc)(f))<=SP) {
5728 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5729 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5733 mime_decode_mode = mode; /* still in MIME sigh... */
5735 /* BASE 64 decoding */
5737 t1 = 0x3f & base64decode(c1);
5738 t2 = 0x3f & base64decode(c2);
5739 t3 = 0x3f & base64decode(c3);
5740 t4 = 0x3f & base64decode(c4);
5741 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5743 Fifo(mime_last++) = (unsigned char)cc;
5744 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5746 Fifo(mime_last++) = (unsigned char)cc;
5747 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5749 Fifo(mime_last++) = (unsigned char)cc;
5754 return Fifo(mime_top++);
/*
 * mime_ungetc: push c back into the MIME buffer by stepping mime_top back
 * one slot and storing c there as an unsigned char.
 * The return statement is not visible in this excerpt.
 */
5757 nkf_char mime_ungetc(nkf_char c, FILE *f)
5759 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: buffer the rest of an encoded word in Fifo and check it
 * before decoding starts.
 * f is the input stream; p is the matched prefix string, which is copied
 * into Fifo first.  mime_input and mime_last both start at mime_top.
 * Reading stops at EOF or when the buffer is full.  If the word is not
 * confirmed, the data stays in Fifo and mime_decode_mode is set to 1 so
 * that it is read back without decoding.
 * Parts of the body are not visible in this excerpt.
 */
5763 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5767 /* In buffered mode, read until =? or NL or buffer full
5769 mime_input = mime_top;
5770 mime_last = mime_top;
5772 while(*p) Fifo(mime_input++) = *p++;
5775 while((c=(*i_getc)(f))!=EOF) {
5776 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5777 break; /* buffer full */
/* c is = and d is ? (d presumably holds the previous character; TODO confirm). */
5779 if (c=='=' && d=='?') {
5780 /* checked. skip header, start decode */
5781 Fifo(mime_input++) = (unsigned char)c;
5782 /* mime_last_input = mime_input; */
/* Characters accepted inside the word: + / = ? and alphanumerics. */
5787 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5789 /* Should we check length mod 4? */
5790 Fifo(mime_input++) = (unsigned char)c;
5793 /* In case of Incomplete MIME, no MIME decode */
5794 Fifo(mime_input++) = (unsigned char)c;
5795 mime_last = mime_input; /* point undecoded buffer */
5796 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5797 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one Base64 alphabet character to its 6-bit value.
 * The arithmetic uses character constants as plain numbers; the inline
 * comments give the intended value of each.
 * Besides + (62) and / (63), the visible branches also accept - as 62 and
 * _ as 63.
 * Several branch conditions are not visible in this excerpt.
 */
5801 nkf_char base64decode(nkf_char c)
5806 i = c - 'A'; /* A..Z 0-25 */
5807 } else if (c == '_') {
5808 i = '?' /* 63 */ ; /* _ 63 */
5810 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5812 } else if (c > '/') {
5813 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5814 } else if (c == '+' || c == '-') {
5815 i = '>' /* 62 */ ; /* + and - 62 */
5817 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet, indexed by 6-bit value. */
5822 static const char basis_64[] =
5823 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Bits carried over between Base64 output steps; read by mimeout_addchar() and close_mime(). */
5825 static nkf_char b64c;
/* Characters held back before MIME output, and how many are held. */
/* The array has one element more than MIMEOUT_BUF_LENGTH. */
5826 #define MIMEOUT_BUF_LENGTH (60)
5827 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5828 int mimeout_buf_count = 0;
/*
 * open_mime: start MIME-encoded output for the encoding given by mode.
 * Picks the mime_pattern[] entry whose mime_encode[] value equals mode
 * (entry 0 when none matches) and sets mimeout_mode from
 * mime_encode_method[].  Held-back characters in mimeout_buf are then
 * handled: white space is written directly through o_mputc, and the buffer
 * is emptied by feeding its contents to mime_putc().
 * Several statements, including how i is set before indexing mimeout_buf,
 * are not visible in this excerpt.
 */
5830 void open_mime(nkf_char mode)
5832 const unsigned char *p;
5835 p = mime_pattern[0];
5836 for(i=0;mime_pattern[i];i++) {
5837 if (mode == mime_encode[i]) {
5838 p = mime_pattern[i];
5842 mimeout_mode = mime_encode_method[i];
/* More than 45 columns used: a newline is written, preceded by a held-back blank if any. */
5844 if (base64_count>45) {
5845 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5846 (*o_mputc)(mimeout_buf[i]);
5849 PUT_NEWLINE((*o_mputc));
5852 if (mimeout_buf_count>0
5853 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5854 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
/* White space found in the buffer is written unencoded. */
5858 for (;i<mimeout_buf_count;i++) {
5859 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5860 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5861 (*o_mputc)(mimeout_buf[i]);
/* The count is zeroed before the buffered characters are passed to mime_putc(). */
5871 j = mimeout_buf_count;
5872 mimeout_buf_count = 0;
5874 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish MIME-encoded output.
 * Depending on mimeout_mode, the bits still held in b64c are written as a
 * final Base64 symbol (low 2 bits shifted left by 4, or low 4 bits shifted
 * left by 2).  The case labels and the closing output are not visible in
 * this excerpt.
 */
5878 void close_mime(void)
5888 switch(mimeout_mode) {
5893 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5899 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5904 if (mimeout_mode > 0) {
5905 if (mimeout_f!=FIXED_MIME) {
5907 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one character c according to mimeout_mode and
 * write the result through o_mputc.
 * Visible fragments: a non-alphanumeric c is written as two hex digits
 * (high nibble, then low nibble) via bin2hex; the Base64 steps write
 * basis_64 symbols built from c and the carried bits in b64c.
 * The case labels and state updates are not visible in this excerpt.
 */
5912 void mimeout_addchar(nkf_char c)
5914 switch(mimeout_mode) {
5919 } else if(!nkf_isalnum(c)) {
5921 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5922 (*o_mputc)(bin2hex((c&0xf)));
/* First input byte: its top 6 bits form the first symbol. */
5931 (*o_mputc)(basis_64[c>>2]);
/* Second input byte: 2 carried bits plus its top 4 bits. */
5936 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
/* Third input byte: 4 carried bits plus its top 2 bits, then its low 6 bits. */
5942 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5943 (*o_mputc)(basis_64[c & 0x3F]);
5954 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * mime_prechar: line-length check made before a character pair (c2, c1) is
 * output.  The projected width is base64_count + mimeout_buf_count/3*4.
 * When it exceeds a limit (73, 66 or 60 in the visible branches), the pair
 * (EOF,0) is sent through o_base64conv, then a newline, then (0,SP).
 * The conditions choosing between the branches are only partly visible.
 */
5956 void mime_prechar(nkf_char c2, nkf_char c1)
5958 if (mimeout_mode > 0){
5960 if (base64_count + mimeout_buf_count/3*4> 73){
5961 (*o_base64conv)(EOF,0);
5962 OCONV_NEWLINE((*o_base64conv));
5963 (*o_base64conv)(0,SP);
5967 if (base64_count + mimeout_buf_count/3*4> 66) {
5968 (*o_base64conv)(EOF,0);
5969 OCONV_NEWLINE((*o_base64conv));
5970 (*o_base64conv)(0,SP);
/* Here the mode is chosen first: Q for ASCII or ISO-8859-1 output, B otherwise. */
5976 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5977 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5978 open_mime(output_mode);
5979 (*o_base64conv)(EOF,0);
5980 OCONV_NEWLINE((*o_base64conv));
5981 (*o_base64conv)(0,SP);
/*
 * mime_putc: output one character c through the MIME encoder; c == EOF
 * asks for pending data to be flushed.
 * Two regimes are visible: mimeout_f == FIXED_MIME, where lines are broken
 * once base64_count exceeds 71, and the other case, where characters are
 * collected in mimeout_buf and either written directly through o_mputc or
 * encoded through mimeout_addchar(), with open_mime() called when encoding
 * has to start.
 * Many statements of the body are not visible in this excerpt; the notes
 * below describe only what the visible lines show.
 */
5988 void mime_putc(nkf_char c)
5993 if (mimeout_f == FIXED_MIME){
5994 if (mimeout_mode == 'Q'){
5995 if (base64_count > 71){
5996 if (c!=CR && c!=LF) {
5998 PUT_NEWLINE((*o_mputc));
6003 if (base64_count > 71){
6005 PUT_NEWLINE((*o_mputc));
6008 if (c == EOF) { /* c==EOF */
6012 if (c != EOF) { /* c!=EOF */
6018 /* mimeout_f != FIXED_MIME */
/* Flush request: the buffer count is zeroed after being copied to j. */
6020 if (c == EOF) { /* c==EOF */
6021 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
6022 j = mimeout_buf_count;
6023 mimeout_buf_count = 0;
6025 if (mimeout_mode > 0) {
6026 if (!nkf_isblank(mimeout_buf[j-1])) {
6028 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
6031 mimeout_addchar(mimeout_buf[i]);
6035 mimeout_addchar(mimeout_buf[i]);
6039 mimeout_addchar(mimeout_buf[i]);
6045 mimeout_addchar(mimeout_buf[i]);
/* lastchar is the most recently buffered character, if any. */
6051 if (mimeout_buf_count > 0){
6052 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* Q mode. */
6057 if (mimeout_mode=='Q') {
6058 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6059 if (c == CR || c == LF) {
6064 } else if (c <= SP) {
6066 if (base64_count > 70) {
6067 PUT_NEWLINE((*o_mputc));
6070 if (!nkf_isblank(c)) {
6075 if (base64_count > 70) {
6077 PUT_NEWLINE((*o_mputc));
6080 open_mime(output_mode);
6082 if (!nkf_noescape_mime(c)) {
/* mimeout_mode is zero or negative here, i.e. not a value set by open_mime(). */
6093 if (mimeout_mode <= 0) {
6094 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6095 if (nkf_isspace(c)) {
6097 if (mimeout_mode == -1) {
6100 if (c==CR || c==LF) {
6102 open_mime(output_mode);
6108 for (i=0;i<mimeout_buf_count;i++) {
6109 (*o_mputc)(mimeout_buf[i]);
6110 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
6121 mimeout_buf[0] = (char)c;
6122 mimeout_buf_count = 1;
6124 if (base64_count > 1
6125 && base64_count + mimeout_buf_count > 76
6126 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
6127 PUT_NEWLINE((*o_mputc));
6129 if (!nkf_isspace(mimeout_buf[0])){
/* The count may reach MIMEOUT_BUF_LENGTH+1, the size of the array, before open_mime() is called. */
6134 mimeout_buf[mimeout_buf_count++] = (char)c;
6135 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6136 open_mime(output_mode);
6141 if (lastchar==CR || lastchar == LF){
6142 for (i=0;i<mimeout_buf_count;i++) {
6143 (*o_mputc)(mimeout_buf[i]);
6146 mimeout_buf_count = 0;
6149 for (i=0;i<mimeout_buf_count-1;i++) {
6150 (*o_mputc)(mimeout_buf[i]);
6153 mimeout_buf[0] = SP;
6154 mimeout_buf_count = 1;
6156 open_mime(output_mode);
6159 /* mimeout_mode == 'B', 1, 2 */
6160 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
6161 if (lastchar == CR || lastchar == LF){
/* After a line break: a blank sends the buffer through the encoder, */
/* a printable character sends it out unencoded. */
6162 if (nkf_isblank(c)) {
6163 for (i=0;i<mimeout_buf_count;i++) {
6164 mimeout_addchar(mimeout_buf[i]);
6166 mimeout_buf_count = 0;
6167 } else if (SP<c && c<DEL) {
6169 for (i=0;i<mimeout_buf_count;i++) {
6170 (*o_mputc)(mimeout_buf[i]);
6173 mimeout_buf_count = 0;
6175 mimeout_buf[mimeout_buf_count++] = (char)c;
6178 if (c==SP || c==TAB || c==CR || c==LF) {
6179 for (i=0;i<mimeout_buf_count;i++) {
6180 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
6182 for (i=0;i<mimeout_buf_count;i++) {
6183 (*o_mputc)(mimeout_buf[i]);
6186 mimeout_buf_count = 0;
6189 mimeout_buf[mimeout_buf_count++] = (char)c;
6190 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6192 for (i=0;i<mimeout_buf_count;i++) {
6193 (*o_mputc)(mimeout_buf[i]);
6196 mimeout_buf_count = 0;
6200 if (mimeout_buf_count>0 && SP<c && c!='=') {
6201 mimeout_buf[mimeout_buf_count++] = (char)c;
6202 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6203 j = mimeout_buf_count;
6204 mimeout_buf_count = 0;
6206 mimeout_addchar(mimeout_buf[i]);
6213 if (mimeout_buf_count>0) {
6214 j = mimeout_buf_count;
6215 mimeout_buf_count = 0;
6217 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
6219 mimeout_addchar(mimeout_buf[i]);
6225 (*o_mputc)(mimeout_buf[i]);
6227 open_mime(output_mode);
6237 struct input_code *p = input_code_list;
6250 mime_f = MIME_DECODE_DEFAULT;
6251 mime_decode_f = FALSE;
6256 x0201_f = X0201_DEFAULT;
6257 iso2022jp_f = FALSE;
6258 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
6259 ms_ucs_map_f = UCS_MAP_ASCII;
6261 #ifdef UTF8_INPUT_ENABLE
6262 no_cp932ext_f = FALSE;
6263 no_best_fit_chars_f = FALSE;
6264 encode_fallback = NULL;
6265 unicode_subchar = '?';
6266 input_endian = ENDIAN_BIG;
6268 #ifdef UTF8_OUTPUT_ENABLE
6269 output_bom_f = FALSE;
6270 output_endian = ENDIAN_BIG;
6272 #ifdef UNICODE_NORMALIZATION
6288 #ifdef SHIFTJIS_CP932
6298 for (i = 0; i < 256; i++){
6299 prefix_table[i] = 0;
6303 mimeout_buf_count = 0;
6308 fold_preserve_f = FALSE;
6311 kanji_intro = DEFAULT_J;
6312 ascii_intro = DEFAULT_R;
6313 fold_margin = FOLD_MARGIN;
6314 output_conv = DEFAULT_CONV;
6315 oconv = DEFAULT_CONV;
6316 o_zconv = no_connection;
6317 o_fconv = no_connection;
6318 o_nlconv = no_connection;
6319 o_rot_conv = no_connection;
6320 o_hira_conv = no_connection;
6321 o_base64conv = no_connection;
6322 o_iso2022jp_check_conv = no_connection;
6325 i_ungetc = std_ungetc;
6327 i_bungetc = std_ungetc;
6330 i_mungetc = std_ungetc;
6331 i_mgetc_buf = std_getc;
6332 i_mungetc_buf = std_ungetc;
6333 output_mode = ASCII;
6336 mime_decode_mode = FALSE;
6344 z_prev2=0,z_prev1=0;
6346 iconv_for_check = 0;
6348 input_codename = NULL;
6349 output_encoding = nkf_enc_from_index(DEFAULT_ENCODING);
\r
/*
 * no_connection: two-argument form of no_connection2; forwards c2 and c1
 * with 0 as the third argument and discards the result.
 */
6355 void no_connection(nkf_char c2, nkf_char c1)
6357 no_connection2(c2,c1,0);
/*
 * no_connection2: reports an internal module connection failure on stderr.
 * The visible lines do not use c2, c1 or c0.  The return of 0 is marked as
 * being there for lint; a line between the message and the return is not
 * visible in this excerpt.
 */
6360 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6362 fprintf(stderr,"nkf internal module connection failure.\n");
6364 return 0; /* LINT */
6369 #define fprintf dllprintf
6373 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6374 fprintf(stderr,"Flags:\n");
6375 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6376 #ifdef DEFAULT_CODE_SJIS
6377 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6379 #ifdef DEFAULT_CODE_JIS
6380 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6382 #ifdef DEFAULT_CODE_EUC
6383 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6385 #ifdef DEFAULT_CODE_UTF8
6386 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6388 #ifdef UTF8_OUTPUT_ENABLE
6389 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6391 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6392 #ifdef UTF8_INPUT_ENABLE
6393 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6395 fprintf(stderr,"t no conversion\n");
6396 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6397 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6398 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6399 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6400 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6401 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6402 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6403 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6404 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6405 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6406 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6407 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6408 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6410 fprintf(stderr,"T Text mode output\n");
6412 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6413 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6414 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6415 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6416 fprintf(stderr,"v, V Show this usage. V: show configuration\n");
6417 fprintf(stderr,"\n");
6418 fprintf(stderr,"Long name options\n");
6419 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6420 fprintf(stderr," Specify the input or output codeset\n");
6421 fprintf(stderr," --fj --unix --mac --windows\n");
6422 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6423 fprintf(stderr," Convert for the system or code\n");
6424 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6425 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6426 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6428 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6430 #ifdef NUMCHAR_OPTION
6431 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6433 #ifdef UTF8_INPUT_ENABLE
6434 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6435 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6438 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6439 fprintf(stderr," Overwrite original listed files by filtered result\n");
6440 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6442 fprintf(stderr," -g --guess Guess the input code\n");
6443 fprintf(stderr," --help --version Show this help/the version\n");
6444 fprintf(stderr," For more information, see also man nkf\n");
6445 fprintf(stderr,"\n");
/*
 * show_configuration: print a summary of compile-time settings to stderr,
 * headed by the NKF_VERSION and NKF_RELEASE_DATE strings.
 * Each item is an fprintf whose label is followed by text selected with
 * preprocessor conditionals; the selected texts themselves are not visible
 * in this excerpt.
 */
6449 void show_configuration(void)
6451 fprintf(stderr, "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n");
6452 fprintf(stderr, " Compile-time options:\n");
6453 fprintf(stderr, " Default output encoding: "
6454 #if defined(DEFAULT_CODE_JIS)
6456 #elif defined(DEFAULT_CODE_SJIS)
6458 #elif defined(DEFAULT_CODE_EUC)
6460 #elif defined(DEFAULT_CODE_UTF8)
6464 fprintf(stderr, " Default output newline: "
6465 #if DEFAULT_NEWLINE == CR
6467 #elif DEFAULT_NEWLINE == CRLF
6473 fprintf(stderr, " Decode MIME encoded string: "
6474 #if MIME_DECODE_DEFAULT
6480 fprintf(stderr, " Convert JIS X 0201 Katakana: "
6480 fprintf(stderr, " Convert JIS X 0201 Katakana: "
6491 fprintf(stderr,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");