1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 ** 連絡先： （株）富士通研究所　ソフト３研　市川　至
5 ** （E-Mail Address: ichikawa@flab.fujitsu.co.jp）
6 ** Copyright (C) 1996,1998
8 ** 連絡先： 琉球大学情報工学科 河野 真治 mime/X0208 support
9 ** （E-Mail Address: kono@ie.u-ryukyu.ac.jp）
10 ** 連絡先： COW for DOS & Win16 & Win32 & OS/2
11 ** （E-Mail Address: GHG00637@niftyserve.or.p）
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 * 現在、nkf は SourceForge にてメンテナンスが続けられています。
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.154 2007/12/18 18:20:16 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-12-19"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
43 #ifndef MIME_DECODE_DEFAULT
44 #define MIME_DECODE_DEFAULT STRICT_MIME
47 #define X0201_DEFAULT TRUE
50 #if DEFAULT_NEWLINE == 0x0D0A
51 #define PUT_NEWLINE(func) do {\
55 #define OCONV_NEWLINE(func) do {\
59 #elif DEFAULT_NEWLINE == 0x0D
60 #define PUT_NEWLINE(func) func(0x0D)
61 #define OCONV_NEWLINE(func) func(0, 0x0D)
63 #define DEFAULT_NEWLINE 0x0A
64 #define PUT_NEWLINE(func) func(0x0A)
65 #define OCONV_NEWLINE(func) func(0, 0x0A)
68 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
70 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
86 #if defined(MSDOS) || defined(__OS2__)
89 #if defined(_MSC_VER) || defined(__WATCOMC__)
90 #define mktemp _mktemp
96 #define setbinmode(fp) fsetbin(fp)
97 #elif defined(__DJGPP__)
98 #include <libc/dosio.h>
99 #define setbinmode(fp) djgpp_setbinmode(fp)
100 #else /* Microsoft C, Turbo C */
101 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
104 #define setbinmode(fp)
107 #if defined(__DJGPP__)
108 void djgpp_setbinmode(FILE *fp)
110 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
113 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
114 __file_handle_set(fd, m);
118 #ifdef _IOFBF /* SysV and MSDOS, Windows */
119 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
121 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
124 /*Borland C++ 4.5 EasyWin*/
125 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
134 /* added by satoru@isoternet.org */
136 #include <sys/types.h>
138 #include <sys/stat.h>
139 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
141 #if defined(__WATCOMC__)
142 #include <sys/utime.h>
146 #else /* defined(MSDOS) */
148 #ifdef __BORLANDC__ /* BCC32 */
150 #else /* !defined(__BORLANDC__) */
151 #include <sys/utime.h>
152 #endif /* (__BORLANDC__) */
153 #else /* !defined(__WIN32__) */
154 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
155 #include <sys/utime.h>
156 #elif defined(__TURBOC__) /* BCC */
158 #elif defined(LSI_C) /* LSI C */
159 #endif /* (__WIN32__) */
167 /* state of output_mode and input_mode
183 #define X0213_1 0x284F
184 #define X0213_2 0x2850
186 /* Input Assumption */
191 #define LATIN1_INPUT 6
193 #define STRICT_MIME 8
198 #define JAPANESE_EUC 10
202 #define UTF8_INPUT 13
203 #define UTF16_INPUT 1015
204 #define UTF32_INPUT 1017
208 #define ENDIAN_BIG 1234
209 #define ENDIAN_LITTLE 4321
210 #define ENDIAN_2143 2143
211 #define ENDIAN_3412 3412
/*
 * ASCII-range character classification and conversion macros.
 *
 * Every macro argument is parenthesized so that compound expressions
 * (e.g. `a | b` or `x ? y : z`) expand as intended.  Arguments are still
 * evaluated more than once, so never pass an expression with side effects
 * such as `*p++`.
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Hex digit character -> its value 0..15; any other character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Low four bits of c -> the matching upper-case hex digit character. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when bits 8..15 of c2 (after truncation to unsigned short) equal SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/*
 * True for CR, LF, or a character strictly between SP and DEL that is none
 * of  ? = _ ( ) .  and 0x22 (double quote).
 */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
253 #define CP932_TABLE_BEGIN 0xFA
254 #define CP932_TABLE_END 0xFC
255 #define CP932INV_TABLE_BEGIN 0xED
256 #define CP932INV_TABLE_END 0xEE
257 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END)
259 #define HOLD_SIZE 1024
260 #if defined(INT_IS_SHORT)
261 #define IOBUF_SIZE 2048
263 #define IOBUF_SIZE 16384
266 #define DEFAULT_J 'B'
267 #define DEFAULT_R 'B'
269 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
270 #define SJ6394 0x0161 /* 63 - 94 ku offset */
272 #define RANGE_NUM_MAX 18
277 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
278 #define sizeof_euc_to_utf8_1byte 94
279 #define sizeof_euc_to_utf8_2bytes 94
280 #define sizeof_utf8_to_euc_C2 64
281 #define sizeof_utf8_to_euc_E5B8 64
282 #define sizeof_utf8_to_euc_2bytes 112
283 #define sizeof_utf8_to_euc_3bytes 16
286 /* MIME preprocessor */
288 #ifdef EASYWIN /*Easy Win */
289 extern POINT _BufferSize;
298 void (*status_func)(struct input_code *, nkf_char);
299 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
303 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
305 #if !defined(PERL_XS) && !defined(WIN32DLL)
306 static nkf_char noconvert(FILE *f);
308 static void module_connection(void);
309 static nkf_char kanji_convert(FILE *f);
310 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
311 static nkf_char push_hold_buf(nkf_char c2);
312 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
313 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
314 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
315 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
316 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
318 * 0: Shift_JIS, eucJP-ascii
323 #define UCS_MAP_ASCII 0
325 #define UCS_MAP_CP932 2
326 #define UCS_MAP_CP10001 3
327 static int ms_ucs_map_f = UCS_MAP_ASCII;
329 #ifdef UTF8_INPUT_ENABLE
330 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
331 static int no_cp932ext_f = FALSE;
332 /* ignore ZERO WIDTH NO-BREAK SPACE */
333 static int no_best_fit_chars_f = FALSE;
334 static int input_endian = ENDIAN_BIG;
335 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
336 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
337 static void encode_fallback_html(nkf_char c);
338 static void encode_fallback_xml(nkf_char c);
339 static void encode_fallback_java(nkf_char c);
340 static void encode_fallback_perl(nkf_char c);
341 static void encode_fallback_subchar(nkf_char c);
342 static void (*encode_fallback)(nkf_char c) = NULL;
343 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
344 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
345 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
346 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
348 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
349 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
350 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
351 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
352 static void w_status(struct input_code *, nkf_char);
354 #ifdef UTF8_OUTPUT_ENABLE
355 static int output_bom_f = FALSE;
356 static int output_endian = ENDIAN_BIG;
357 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
358 static void w_oconv(nkf_char c2,nkf_char c1);
359 static void w_oconv16(nkf_char c2,nkf_char c1);
360 static void w_oconv32(nkf_char c2,nkf_char c1);
362 static void e_oconv(nkf_char c2,nkf_char c1);
363 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
364 static void s_oconv(nkf_char c2,nkf_char c1);
365 static void j_oconv(nkf_char c2,nkf_char c1);
366 static void fold_conv(nkf_char c2,nkf_char c1);
367 static void nl_conv(nkf_char c2,nkf_char c1);
368 static void z_conv(nkf_char c2,nkf_char c1);
369 static void rot_conv(nkf_char c2,nkf_char c1);
370 static void hira_conv(nkf_char c2,nkf_char c1);
371 static void base64_conv(nkf_char c2,nkf_char c1);
372 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
373 static void no_connection(nkf_char c2,nkf_char c1);
374 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
376 static void code_score(struct input_code *ptr);
377 static void code_status(nkf_char c);
379 static void std_putc(nkf_char c);
380 static nkf_char std_getc(FILE *f);
381 static nkf_char std_ungetc(nkf_char c,FILE *f);
383 static nkf_char broken_getc(FILE *f);
384 static nkf_char broken_ungetc(nkf_char c,FILE *f);
386 static nkf_char mime_begin(FILE *f);
387 static nkf_char mime_getc(FILE *f);
388 static nkf_char mime_ungetc(nkf_char c,FILE *f);
390 static void switch_mime_getc(void);
391 static void unswitch_mime_getc(void);
392 static nkf_char mime_begin_strict(FILE *f);
393 static nkf_char mime_getc_buf(FILE *f);
394 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
395 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
397 static nkf_char base64decode(nkf_char c);
398 static void mime_prechar(nkf_char c2, nkf_char c1);
399 static void mime_putc(nkf_char c);
400 static void open_mime(nkf_char c);
401 static void close_mime(void);
402 static void eof_mime(void);
403 static void mimeout_addchar(nkf_char c);
405 static void usage(void);
406 static void version(void);
407 static void show_configuration(void);
409 static void options(unsigned char *c);
410 static void reinit(void);
414 #if !defined(PERL_XS) && !defined(WIN32DLL)
415 static unsigned char stdibuf[IOBUF_SIZE];
416 static unsigned char stdobuf[IOBUF_SIZE];
418 static unsigned char hold_buf[HOLD_SIZE*2];
419 static int hold_count = 0;
421 /* MIME preprocessor fifo */
423 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
424 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
425 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
426 static unsigned char mime_buf[MIME_BUF_SIZE];
427 static unsigned int mime_top = 0;
428 static unsigned int mime_last = 0; /* decoded */
429 static unsigned int mime_input = 0; /* undecoded */
430 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Conversion option flags; options() assigns some of them (e.g. input_f). */
433 static int unbuf_f = FALSE;
434 static int estab_f = FALSE;
435 static int nop_f = FALSE;
436 static int binmode_f = TRUE; /* binary mode; main() checks it before putting stdin/stdout into binary mode */
437 static int rot_f = FALSE; /* rot14/43 mode (sic) -- NOTE(review): presumably ROT13/ROT47; confirm against rot_conv */
438 static int hira_f = FALSE; /* hiragana/katakana conversion ("henkan") */
439 static int input_f = FALSE; /* FALSE: input code not fixed; otherwise an *_INPUT value stored by options() */
440 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
441 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B (base64) or Q; starts as MIME_DECODE_DEFAULT */
442 static int mime_decode_f = FALSE; /* MIME decoding is explicitly on */
443 static int mimebuf_f = FALSE; /* MIME buffered input */
444 static int broken_f = FALSE; /* convert ESC-less broken JIS */
445 static int iso8859_f = FALSE; /* pass ISO 8859 through */
446 static int mimeout_f = FALSE; /* base64 mode */
447 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201; starts as X0201_DEFAULT */
448 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP characters with GETA */
450 #ifdef UNICODE_NORMALIZATION
451 static int nfc_f = FALSE;
452 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
453 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
454 static nkf_char nfc_getc(FILE *f);
455 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
459 static int cap_f = FALSE;
460 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
461 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
462 static nkf_char cap_getc(FILE *f);
463 static nkf_char cap_ungetc(nkf_char c,FILE *f);
465 static int url_f = FALSE;
466 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
467 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
468 static nkf_char url_getc(FILE *f);
469 static nkf_char url_ungetc(nkf_char c,FILE *f);
472 #if defined(INT_IS_SHORT)
473 #define NKF_INT32_C(n) (n##L)
475 #define NKF_INT32_C(n) (n)
477 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
478 #define CLASS_MASK NKF_INT32_C(0xFF000000)
479 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
480 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
481 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
482 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
483 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
485 #ifdef NUMCHAR_OPTION
486 static int numchar_f = FALSE;
487 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
488 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
489 static nkf_char numchar_getc(FILE *f);
490 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
494 static int noout_f = FALSE;
495 static void no_putc(nkf_char c);
496 static int debug_f = FALSE;
497 static void debug(const char *str);
498 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
501 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
503 static void print_guessed_code(char *filename);
505 static void set_input_codename(char *codename);
508 static int exec_f = 0;
511 #ifdef SHIFTJIS_CP932
512 /* invert IBM extended characters to others */
513 static int cp51932_f = FALSE;
515 /* invert NEC-selected IBM extended characters to IBM extended characters */
516 static int cp932inv_f = TRUE;
518 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
519 #endif /* SHIFTJIS_CP932 */
522 static int x0212_f = FALSE;
523 static nkf_char x0212_shift(nkf_char c);
524 static nkf_char x0212_unshift(nkf_char c);
526 static int x0213_f = FALSE;
528 static unsigned char prefix_table[256];
530 static void set_code_score(struct input_code *ptr, nkf_char score);
531 static void clr_code_score(struct input_code *ptr, nkf_char score);
532 static void status_disable(struct input_code *ptr);
533 static void status_push_ch(struct input_code *ptr, nkf_char c);
534 static void status_clear(struct input_code *ptr);
535 static void status_reset(struct input_code *ptr);
536 static void status_reinit(struct input_code *ptr);
537 static void status_check(struct input_code *ptr, nkf_char c);
538 static void e_status(struct input_code *, nkf_char);
539 static void s_status(struct input_code *, nkf_char);
541 struct input_code input_code_list[] = {
542 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
543 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
544 #ifdef UTF8_INPUT_ENABLE
545 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
546 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
547 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
552 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
553 static int base64_count = 0;
555 /* X0208 -> ASCII converter */
558 static int f_line = 0; /* chars in line */
559 static int f_prev = 0;
560 static int fold_preserve_f = FALSE; /* preserve new lines */
561 static int fold_f = FALSE;
562 static int fold_len = 0;
565 static unsigned char kanji_intro = DEFAULT_J;
566 static unsigned char ascii_intro = DEFAULT_R;
570 #define FOLD_MARGIN 10
571 #define DEFAULT_FOLD 60
573 static int fold_margin = FOLD_MARGIN;
577 #ifdef DEFAULT_CODE_JIS
578 # define DEFAULT_CONV j_oconv
580 #ifdef DEFAULT_CODE_SJIS
581 # define DEFAULT_CONV s_oconv
583 #ifdef DEFAULT_CODE_EUC
584 # define DEFAULT_CONV e_oconv
586 #ifdef DEFAULT_CODE_UTF8
587 # define DEFAULT_CONV w_oconv
590 /* process default */
591 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
593 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
594 /* s_iconv or oconv */
595 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
597 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
598 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
599 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
600 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
601 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
602 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
603 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
605 /* static redirections */
607 static void (*o_putc)(nkf_char c) = std_putc;
609 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
610 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
612 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
613 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
615 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
617 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
618 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
620 /* for strict mime */
621 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
622 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
625 static int output_mode = ASCII, /* output kanji mode */
626 input_mode = ASCII, /* input kanji mode */
627 shift_mode = FALSE; /* TRUE shift out, or X0201 */
628 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
630 /* X0201 / X0208 conversion tables */
632 /* X0201 kana conversion table */
634 static const unsigned char cv[]= {
635 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
636 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
637 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
638 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
639 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
640 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
641 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
642 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
643 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
644 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
645 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
646 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
647 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
648 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
649 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
650 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
654 /* X0201 kana conversion table for dakuten */
656 static const unsigned char dv[]= {
657 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
658 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
662 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
663 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
664 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
665 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
666 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
668 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 /* X0201 kana conversion table for han-dakuten */
677 static const unsigned char ev[]= {
678 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
679 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
681 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
682 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
683 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
684 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
685 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
686 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
687 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
688 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
689 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
690 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
691 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
692 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
697 /* X0208 kigou conversion table */
698 /* 0x8140 - 0x819e */
699 static const unsigned char fv[] = {
701 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
702 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
703 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
704 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
705 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
706 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
707 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
708 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
709 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
711 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
717 static int file_out_f = FALSE;
719 static int overwrite_f = FALSE;
720 static int preserve_time_f = FALSE;
721 static int backup_f = FALSE;
722 static char *backup_suffix = "";
723 static char *get_backup_filename(const char *suffix, const char *filename);
726 static int nlmode_f = 0; /* CR, LF, CRLF */
727 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
728 static nkf_char prev_cr = 0; /* CR or 0 */
729 #ifdef EASYWIN /*Easy Win */
730 static int end_check;
733 #define STD_GC_BUFSIZE (256)
734 nkf_char std_gc_buf[STD_GC_BUFSIZE];
738 #include "nkf32dll.c"
739 #elif defined(PERL_XS)
741 int main(int argc, char **argv)
746 char *outfname = NULL;
749 #ifdef EASYWIN /*Easy Win */
750 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
753 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
754 cp = (unsigned char *)*argv;
758 int debug_f_back = debug_f;
761 int exec_f_back = exec_f;
764 int x0212_f_back = x0212_f;
766 int x0213_f_back = x0213_f;
767 int guess_f_back = guess_f;
769 guess_f = guess_f_back;
772 debug_f = debug_f_back;
775 exec_f = exec_f_back;
778 x0212_f = x0212_f_back;
780 x0213_f = x0213_f_back;
785 if (pipe(fds) < 0 || (pid = fork()) < 0){
796 execvp(argv[1], &argv[1]);
811 if (binmode_f == TRUE)
812 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
813 if (freopen("","wb",stdout) == NULL)
820 setbuf(stdout, (char *) NULL);
822 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
825 if (binmode_f == TRUE)
826 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
827 if (freopen("","rb",stdin) == NULL) return (-1);
831 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
835 kanji_convert(stdin);
836 if (guess_f) print_guessed_code(NULL);
840 int is_argument_error = FALSE;
842 input_codename = NULL;
847 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
850 is_argument_error = TRUE;
858 /* reopen file for stdout */
859 if (file_out_f == TRUE) {
862 outfname = malloc(strlen(origfname)
863 + strlen(".nkftmpXXXXXX")
869 strcpy(outfname, origfname);
873 for (i = strlen(outfname); i; --i){
874 if (outfname[i - 1] == '/'
875 || outfname[i - 1] == '\\'){
881 strcat(outfname, "ntXXXXXX");
883 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
886 strcat(outfname, ".nkftmpXXXXXX");
887 fd = mkstemp(outfname);
890 || (fd_backup = dup(fileno(stdout))) < 0
891 || dup2(fd, fileno(stdout)) < 0
902 outfname = "nkf.out";
905 if(freopen(outfname, "w", stdout) == NULL) {
909 if (binmode_f == TRUE) {
910 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
911 if (freopen("","wb",stdout) == NULL)
918 if (binmode_f == TRUE)
919 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
920 if (freopen("","rb",fin) == NULL)
925 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
929 char *filename = NULL;
931 if (nfiles > 1) filename = origfname;
932 if (guess_f) print_guessed_code(filename);
938 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
946 if (dup2(fd_backup, fileno(stdout)) < 0){
949 if (stat(origfname, &sb)) {
950 fprintf(stderr, "Can't stat %s\n", origfname);
952 /* Restore the permissions */
953 if (chmod(outfname, sb.st_mode)) {
954 fprintf(stderr, "Can't set permission %s\n", outfname);
957 /* Restore the timestamp */
959 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
960 tb[0] = tb[1] = sb.st_mtime;
961 if (utime(outfname, tb)) {
962 fprintf(stderr, "Can't set timestamp %s\n", outfname);
965 tb.actime = sb.st_atime;
966 tb.modtime = sb.st_mtime;
967 if (utime(outfname, &tb)) {
968 fprintf(stderr, "Can't set timestamp %s\n", outfname);
973 char *backup_filename = get_backup_filename(backup_suffix, origfname);
975 unlink(backup_filename);
977 if (rename(origfname, backup_filename)) {
978 perror(backup_filename);
979 fprintf(stderr, "Can't rename %s to %s\n",
980 origfname, backup_filename);
984 if (unlink(origfname)){
989 if (rename(outfname, origfname)) {
991 fprintf(stderr, "Can't rename %s to %s\n",
992 outfname, origfname);
999 if (is_argument_error)
1002 #ifdef EASYWIN /*Easy Win */
1003 if (file_out_f == FALSE)
1004 scanf("%d",&end_check);
1007 #else /* for Other OS */
1008 if (file_out_f == TRUE)
1010 #endif /*Easy Win */
1013 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from the template `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters are copied verbatim (for example
 * "*.orig" with "a.txt" gives "a.txt.orig").  If `suffix` contains no '*',
 * the result is `filename` immediately followed by `suffix`.
 *
 * Returns a malloc()ed, NUL-terminated string that the caller must free(),
 * or NULL (after reporting through perror()) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* Each one-byte '*' grows to filename_length bytes; +1 for the NUL. */
        backup_filename = malloc((suffix_length - asterisk_count)
                                 + (asterisk_count * filename_length) + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                /*
                 * strcpy's terminating NUL lands at or before the final
                 * terminator position, so it never leaves the buffer; the
                 * next copied character overwrites it.
                 */
                strcpy(backup_filename + j, filename);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        /*
         * The buffer must hold filename + suffix + NUL.  The previous code
         * called malloc( + 1), i.e. allocated a single byte, and then
         * overflowed it with strcpy()/strcat().
         */
        j = suffix_length + filename_length;
        backup_filename = malloc(j + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        strcpy(backup_filename, filename);
        strcat(backup_filename, suffix);
        backup_filename[j] = '\0';
    }
    return backup_filename;
}
1056 static const struct {
1080 {"katakana-hiragana","h3"},
1088 #ifdef UTF8_OUTPUT_ENABLE
1098 {"fb-subchar=", ""},
1100 #ifdef UTF8_INPUT_ENABLE
1101 {"utf8-input", "W"},
1102 {"utf16-input", "W16"},
1103 {"no-cp932ext", ""},
1104 {"no-best-fit-chars",""},
1106 #ifdef UNICODE_NORMALIZATION
1107 {"utf8mac-input", ""},
1119 #ifdef NUMCHAR_OPTION
1120 {"numchar-input", ""},
1126 #ifdef SHIFTJIS_CP932
1136 static int option_mode = 0;
1138 void options(unsigned char *cp)
1142 unsigned char *cp_back = NULL;
1147 while(*cp && *cp++!='-');
1148 while (*cp || cp_back) {
1156 case '-': /* literal options */
1157 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1161 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1162 p = (unsigned char *)long_option[i].name;
1163 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1164 if (*p == cp[j] || cp[j] == SP){
1171 fprintf(stderr, "unknown long option: --%s\n", cp);
1174 while(*cp && *cp != SP && cp++);
1175 if (long_option[i].alias[0]){
1177 cp = (unsigned char *)long_option[i].alias;
1179 if (strcmp(long_option[i].name, "ic=") == 0){
1180 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1181 codeset[i] = nkf_toupper(p[i]);
1184 if(strcmp(codeset, "ISO-2022-JP") == 0){
1185 input_f = JIS_INPUT;
1186 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1187 strcmp(codeset, "CP50220") == 0 ||
1188 strcmp(codeset, "CP50221") == 0 ||
1189 strcmp(codeset, "CP50222") == 0){
1190 input_f = JIS_INPUT;
1191 #ifdef SHIFTJIS_CP932
1194 #ifdef UTF8_OUTPUT_ENABLE
1195 ms_ucs_map_f = UCS_MAP_CP932;
1197 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1198 input_f = JIS_INPUT;
1202 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1203 input_f = JIS_INPUT;
1208 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1209 input_f = SJIS_INPUT;
1210 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1211 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1212 strcmp(codeset, "CP932") == 0 ||
1213 strcmp(codeset, "MS932") == 0){
1214 input_f = SJIS_INPUT;
1215 #ifdef SHIFTJIS_CP932
1218 #ifdef UTF8_OUTPUT_ENABLE
1219 ms_ucs_map_f = UCS_MAP_CP932;
1221 }else if(strcmp(codeset, "CP10001") == 0){
1222 input_f = SJIS_INPUT;
1223 #ifdef SHIFTJIS_CP932
1226 #ifdef UTF8_OUTPUT_ENABLE
1227 ms_ucs_map_f = UCS_MAP_CP10001;
1229 }else if(strcmp(codeset, "EUCJP") == 0 ||
1230 strcmp(codeset, "EUC-JP") == 0){
1231 input_f = EUC_INPUT;
1232 }else if(strcmp(codeset, "CP51932") == 0){
1233 input_f = EUC_INPUT;
1234 #ifdef SHIFTJIS_CP932
1237 #ifdef UTF8_OUTPUT_ENABLE
1238 ms_ucs_map_f = UCS_MAP_CP932;
1240 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1241 strcmp(codeset, "EUCJP-MS") == 0 ||
1242 strcmp(codeset, "EUCJPMS") == 0){
1243 input_f = EUC_INPUT;
1244 #ifdef SHIFTJIS_CP932
1247 #ifdef UTF8_OUTPUT_ENABLE
1248 ms_ucs_map_f = UCS_MAP_MS;
1250 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1251 strcmp(codeset, "EUCJP-ASCII") == 0){
1252 input_f = EUC_INPUT;
1253 #ifdef SHIFTJIS_CP932
1256 #ifdef UTF8_OUTPUT_ENABLE
1257 ms_ucs_map_f = UCS_MAP_ASCII;
1259 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1260 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1261 input_f = SJIS_INPUT;
1263 #ifdef SHIFTJIS_CP932
1266 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1267 strcmp(codeset, "EUC-JIS-2004") == 0){
1268 input_f = EUC_INPUT;
1270 #ifdef SHIFTJIS_CP932
1273 #ifdef UTF8_INPUT_ENABLE
1274 }else if(strcmp(codeset, "UTF-8") == 0 ||
1275 strcmp(codeset, "UTF-8N") == 0 ||
1276 strcmp(codeset, "UTF-8-BOM") == 0){
1277 input_f = UTF8_INPUT;
1278 #ifdef UNICODE_NORMALIZATION
1279 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1280 strcmp(codeset, "UTF-8-MAC") == 0){
1281 input_f = UTF8_INPUT;
1284 }else if(strcmp(codeset, "UTF-16") == 0 ||
1285 strcmp(codeset, "UTF-16BE") == 0 ||
1286 strcmp(codeset, "UTF-16BE-BOM") == 0){
1287 input_f = UTF16_INPUT;
1288 input_endian = ENDIAN_BIG;
1289 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1290 strcmp(codeset, "UTF-16LE-BOM") == 0){
1291 input_f = UTF16_INPUT;
1292 input_endian = ENDIAN_LITTLE;
1293 }else if(strcmp(codeset, "UTF-32") == 0 ||
1294 strcmp(codeset, "UTF-32BE") == 0 ||
1295 strcmp(codeset, "UTF-32BE-BOM") == 0){
1296 input_f = UTF32_INPUT;
1297 input_endian = ENDIAN_BIG;
1298 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1299 strcmp(codeset, "UTF-32LE-BOM") == 0){
1300 input_f = UTF32_INPUT;
1301 input_endian = ENDIAN_LITTLE;
1304 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1308 if (strcmp(long_option[i].name, "oc=") == 0){
1310 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1311 codeset[i] = nkf_toupper(p[i]);
1314 if(strcmp(codeset, "ISO-2022-JP") == 0){
1315 output_conv = j_oconv;
1316 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1317 output_conv = j_oconv;
1318 no_cp932ext_f = TRUE;
1319 #ifdef SHIFTJIS_CP932
1322 #ifdef UTF8_OUTPUT_ENABLE
1323 ms_ucs_map_f = UCS_MAP_CP932;
1325 }else if(strcmp(codeset, "CP50220") == 0){
1326 output_conv = j_oconv;
1328 #ifdef SHIFTJIS_CP932
1331 #ifdef UTF8_OUTPUT_ENABLE
1332 ms_ucs_map_f = UCS_MAP_CP932;
1334 }else if(strcmp(codeset, "CP50221") == 0){
1335 output_conv = j_oconv;
1336 #ifdef SHIFTJIS_CP932
1339 #ifdef UTF8_OUTPUT_ENABLE
1340 ms_ucs_map_f = UCS_MAP_CP932;
1342 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1343 output_conv = j_oconv;
1347 #ifdef SHIFTJIS_CP932
1350 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1351 output_conv = j_oconv;
1356 #ifdef SHIFTJIS_CP932
1359 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1360 output_conv = s_oconv;
1361 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1362 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1363 strcmp(codeset, "CP932") == 0 ||
1364 strcmp(codeset, "MS932") == 0){
1365 output_conv = s_oconv;
1366 #ifdef UTF8_OUTPUT_ENABLE
1367 ms_ucs_map_f = UCS_MAP_CP932;
1369 }else if(strcmp(codeset, "CP10001") == 0){
1370 output_conv = s_oconv;
1371 #ifdef UTF8_OUTPUT_ENABLE
1372 ms_ucs_map_f = UCS_MAP_CP10001;
1374 }else if(strcmp(codeset, "EUCJP") == 0 ||
1375 strcmp(codeset, "EUC-JP") == 0){
1376 output_conv = e_oconv;
1377 }else if(strcmp(codeset, "CP51932") == 0){
1378 output_conv = e_oconv;
1379 #ifdef SHIFTJIS_CP932
1382 #ifdef UTF8_OUTPUT_ENABLE
1383 ms_ucs_map_f = UCS_MAP_CP932;
1385 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1386 strcmp(codeset, "EUCJP-MS") == 0 ||
1387 strcmp(codeset, "EUCJPMS") == 0){
1388 output_conv = e_oconv;
1392 #ifdef UTF8_OUTPUT_ENABLE
1393 ms_ucs_map_f = UCS_MAP_MS;
1395 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1396 strcmp(codeset, "EUCJP-ASCII") == 0){
1397 output_conv = e_oconv;
1401 #ifdef UTF8_OUTPUT_ENABLE
1402 ms_ucs_map_f = UCS_MAP_ASCII;
1404 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1405 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1406 output_conv = s_oconv;
1408 #ifdef SHIFTJIS_CP932
1411 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1412 strcmp(codeset, "EUC-JIS-2004") == 0){
1413 output_conv = e_oconv;
1418 #ifdef SHIFTJIS_CP932
1421 #ifdef UTF8_OUTPUT_ENABLE
1422 }else if(strcmp(codeset, "UTF-8") == 0){
1423 output_conv = w_oconv;
1424 }else if(strcmp(codeset, "UTF-8N") == 0){
1425 output_conv = w_oconv;
1426 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1427 output_conv = w_oconv;
1428 output_bom_f = TRUE;
1429 }else if(strcmp(codeset, "UTF-16BE") == 0){
1430 output_conv = w_oconv16;
1431 }else if(strcmp(codeset, "UTF-16") == 0 ||
1432 strcmp(codeset, "UTF-16BE-BOM") == 0){
1433 output_conv = w_oconv16;
1434 output_bom_f = TRUE;
1435 }else if(strcmp(codeset, "UTF-16LE") == 0){
1436 output_conv = w_oconv16;
1437 output_endian = ENDIAN_LITTLE;
1438 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1439 output_conv = w_oconv16;
1440 output_endian = ENDIAN_LITTLE;
1441 output_bom_f = TRUE;
1442 }else if(strcmp(codeset, "UTF-32") == 0 ||
1443 strcmp(codeset, "UTF-32BE") == 0){
1444 output_conv = w_oconv32;
1445 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1446 output_conv = w_oconv32;
1447 output_bom_f = TRUE;
1448 }else if(strcmp(codeset, "UTF-32LE") == 0){
1449 output_conv = w_oconv32;
1450 output_endian = ENDIAN_LITTLE;
1451 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1452 output_conv = w_oconv32;
1453 output_endian = ENDIAN_LITTLE;
1454 output_bom_f = TRUE;
1457 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1461 if (strcmp(long_option[i].name, "guess=") == 0){
1470 if (strcmp(long_option[i].name, "overwrite") == 0){
1473 preserve_time_f = TRUE;
1476 if (strcmp(long_option[i].name, "overwrite=") == 0){
1479 preserve_time_f = TRUE;
1481 backup_suffix = malloc(strlen((char *) p) + 1);
1482 strcpy(backup_suffix, (char *) p);
1485 if (strcmp(long_option[i].name, "in-place") == 0){
1488 preserve_time_f = FALSE;
1491 if (strcmp(long_option[i].name, "in-place=") == 0){
1494 preserve_time_f = FALSE;
1496 backup_suffix = malloc(strlen((char *) p) + 1);
1497 strcpy(backup_suffix, (char *) p);
1502 if (strcmp(long_option[i].name, "cap-input") == 0){
1506 if (strcmp(long_option[i].name, "url-input") == 0){
1511 #ifdef NUMCHAR_OPTION
1512 if (strcmp(long_option[i].name, "numchar-input") == 0){
1518 if (strcmp(long_option[i].name, "no-output") == 0){
1522 if (strcmp(long_option[i].name, "debug") == 0){
1527 if (strcmp(long_option[i].name, "cp932") == 0){
1528 #ifdef SHIFTJIS_CP932
1532 #ifdef UTF8_OUTPUT_ENABLE
1533 ms_ucs_map_f = UCS_MAP_CP932;
1537 if (strcmp(long_option[i].name, "no-cp932") == 0){
1538 #ifdef SHIFTJIS_CP932
1542 #ifdef UTF8_OUTPUT_ENABLE
1543 ms_ucs_map_f = UCS_MAP_ASCII;
1547 #ifdef SHIFTJIS_CP932
1548 if (strcmp(long_option[i].name, "cp932inv") == 0){
1555 if (strcmp(long_option[i].name, "x0212") == 0){
1562 if (strcmp(long_option[i].name, "exec-in") == 0){
1566 if (strcmp(long_option[i].name, "exec-out") == 0){
1571 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1572 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1573 no_cp932ext_f = TRUE;
1576 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1577 no_best_fit_chars_f = TRUE;
1580 if (strcmp(long_option[i].name, "fb-skip") == 0){
1581 encode_fallback = NULL;
1584 if (strcmp(long_option[i].name, "fb-html") == 0){
1585 encode_fallback = encode_fallback_html;
1588 if (strcmp(long_option[i].name, "fb-xml") == 0){
1589 encode_fallback = encode_fallback_xml;
1592 if (strcmp(long_option[i].name, "fb-java") == 0){
1593 encode_fallback = encode_fallback_java;
1596 if (strcmp(long_option[i].name, "fb-perl") == 0){
1597 encode_fallback = encode_fallback_perl;
1600 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1601 encode_fallback = encode_fallback_subchar;
1604 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1605 encode_fallback = encode_fallback_subchar;
1606 unicode_subchar = 0;
1608 /* decimal number */
1609 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1610 unicode_subchar *= 10;
1611 unicode_subchar += hex2bin(p[i]);
1613 }else if(p[1] == 'x' || p[1] == 'X'){
1614 /* hexadecimal number */
1615 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1616 unicode_subchar <<= 4;
1617 unicode_subchar |= hex2bin(p[i]);
1621 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1622 unicode_subchar *= 8;
1623 unicode_subchar += hex2bin(p[i]);
1626 w16e_conv(unicode_subchar, &i, &j);
1627 unicode_subchar = i<<8 | j;
1631 #ifdef UTF8_OUTPUT_ENABLE
1632 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1633 ms_ucs_map_f = UCS_MAP_MS;
1637 #ifdef UNICODE_NORMALIZATION
1638 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1639 input_f = UTF8_INPUT;
1644 if (strcmp(long_option[i].name, "prefix=") == 0){
1645 if (nkf_isgraph(p[0])){
1646 for (i = 1; nkf_isgraph(p[i]); i++){
1647 prefix_table[p[i]] = p[0];
1654 case 'b': /* buffered mode */
1657 case 'u': /* non bufferd mode */
1660 case 't': /* transparent mode */
1665 } else if (*cp=='2') {
1669 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1677 case 'j': /* JIS output */
1679 output_conv = j_oconv;
1681 case 'e': /* AT&T EUC output */
1682 output_conv = e_oconv;
1685 case 's': /* SJIS output */
1686 output_conv = s_oconv;
1688 case 'l': /* ISO8859 Latin-1 support, no conversion */
1689 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1690 input_f = LATIN1_INPUT;
1692 case 'i': /* Kanji IN ESC-$-@/B */
1693 if (*cp=='@'||*cp=='B')
1694 kanji_intro = *cp++;
1696 case 'o': /* ASCII IN ESC-(-J/B */
1697 if (*cp=='J'||*cp=='B'||*cp=='H')
1698 ascii_intro = *cp++;
1702 bit:1 katakana->hiragana
1703 bit:2 hiragana->katakana
1705 if ('9'>= *cp && *cp>='0')
1706 hira_f |= (*cp++ -'0');
1713 #if defined(MSDOS) || defined(__OS2__)
1720 show_configuration();
1728 #ifdef UTF8_OUTPUT_ENABLE
1729 case 'w': /* UTF-8 output */
1731 output_conv = w_oconv; cp++;
1735 output_bom_f = TRUE;
1738 if ('1'== cp[0] && '6'==cp[1]) {
1739 output_conv = w_oconv16; cp+=2;
1740 } else if ('3'== cp[0] && '2'==cp[1]) {
1741 output_conv = w_oconv32; cp+=2;
1743 output_conv = w_oconv;
1748 output_endian = ENDIAN_LITTLE;
1749 } else if (cp[0] == 'B') {
1757 output_bom_f = TRUE;
1762 #ifdef UTF8_INPUT_ENABLE
1763 case 'W': /* UTF input */
1766 input_f = UTF8_INPUT;
1768 if ('1'== cp[0] && '6'==cp[1]) {
1770 input_f = UTF16_INPUT;
1771 input_endian = ENDIAN_BIG;
1772 } else if ('3'== cp[0] && '2'==cp[1]) {
1774 input_f = UTF32_INPUT;
1775 input_endian = ENDIAN_BIG;
1777 input_f = UTF8_INPUT;
1782 input_endian = ENDIAN_LITTLE;
1783 } else if (cp[0] == 'B') {
1789 /* Input code assumption */
1790 case 'J': /* JIS input */
1791 input_f = JIS_INPUT;
1793 case 'E': /* AT&T EUC input */
1794 input_f = EUC_INPUT;
1796 case 'S': /* MS Kanji input */
1797 input_f = SJIS_INPUT;
1799 case 'Z': /* Convert X0208 alphabet to asii */
1801 bit:0 Convert JIS X 0208 Alphabet to ASCII
1802 bit:1 Convert Kankaku to one space
1803 bit:2 Convert Kankaku to two spaces
1804 bit:3 Convert HTML Entity
1805 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1807 while ('0'<= *cp && *cp <='9') {
1808 alpha_f |= 1 << (*cp++ - '0');
1810 if (!alpha_f) alpha_f = 1;
1812 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1813 x0201_f = FALSE; /* No X0201->X0208 conversion */
1815 ESC-(-I in JIS, EUC, MS Kanji
1816 SI/SO in JIS, EUC, MS Kanji
1817 SSO in EUC, JIS, not in MS Kanji
1818 MS Kanji (0xa0-0xdf)
1820 ESC-(-I in JIS (0x20-0x5f)
1821 SSO in EUC (0xa0-0xdf)
1822 0xa0-0xd in MS Kanji (0xa0-0xdf)
1825 case 'X': /* Convert X0201 kana to X0208 */
1828 case 'F': /* prserve new lines */
1829 fold_preserve_f = TRUE;
1830 case 'f': /* folding -f60 or -f */
1833 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1835 fold_len += *cp++ - '0';
1837 if (!(0<fold_len && fold_len<BUFSIZ))
1838 fold_len = DEFAULT_FOLD;
1842 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1844 fold_margin += *cp++ - '0';
1848 case 'm': /* MIME support */
1849 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1850 if (*cp=='B'||*cp=='Q') {
1851 mime_decode_mode = *cp++;
1852 mimebuf_f = FIXED_MIME;
1853 } else if (*cp=='N') {
1854 mime_f = TRUE; cp++;
1855 } else if (*cp=='S') {
1856 mime_f = STRICT_MIME; cp++;
1857 } else if (*cp=='0') {
1858 mime_decode_f = FALSE;
1859 mime_f = FALSE; cp++;
1862 case 'M': /* MIME output */
1865 mimeout_f = FIXED_MIME; cp++;
1866 } else if (*cp=='Q') {
1868 mimeout_f = FIXED_MIME; cp++;
1873 case 'B': /* Broken JIS support */
1875 bit:1 allow any x on ESC-(-x or ESC-$-x
1876 bit:2 reset to ascii on NL
1878 if ('9'>= *cp && *cp>='0')
1879 broken_f |= 1<<(*cp++ -'0');
1884 case 'O':/* for Output file */
1888 case 'c':/* add cr code */
1891 case 'd':/* delete cr code */
1894 case 'I': /* ISO-2022-JP output */
1897 case 'L': /* line mode */
1898 if (*cp=='u') { /* unix */
1899 nlmode_f = LF; cp++;
1900 } else if (*cp=='m') { /* mac */
1901 nlmode_f = CR; cp++;
1902 } else if (*cp=='w') { /* windows */
1903 nlmode_f = CRLF; cp++;
1904 } else if (*cp=='0') { /* no conversion */
1913 } else if (*cp == '0') {
1922 /* module muliple options in a string are allowed for Perl moudle */
1923 while(*cp && *cp++!='-');
1926 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
1927 /* bogus option but ignored */
/*
 * find_inputcode_byfunc: search input_code_list for the entry whose
 * iconv_func member equals the converter passed in iconv_func.
 * NOTE(review): the loop, the return statements and the braces are absent
 * from this listing; presumably the matching entry is returned and a null
 * pointer when nothing matches -- confirm against the complete source.
 */
1933 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1936 struct input_code *p = input_code_list;
/* an entry matches when it carries the same converter pointer */
1938 if (iconv_func == p->iconv_func){
/*
 * set_iconv: select the input converter.
 *   f          - flag; under INPUT_CODE_FIX the value -TRUE is tested
 *                explicitly and means FORCE (see the inline comment).
 *   iconv_func - converter candidate.
 * Visible effects: when estab_f is set and the current iconv differs from
 * iconv_for_check, the input_code entry for iconv is looked up, its name is
 * recorded through set_input_codename, and iconv_for_check is updated.
 * NOTE(review): the statements that assign estab_f and iconv are not part
 * of this listing -- confirm against the complete source.
 */
1947 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1949 #ifdef INPUT_CODE_FIX
1957 #ifdef INPUT_CODE_FIX
1958 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
/* report the input code name only when the converter actually changed */
1964 if (estab_f && iconv_for_check != iconv){
1965 struct input_code *p = find_inputcode_byfunc(iconv);
1967 set_input_codename(p->name);
1970 iconv_for_check = iconv;
/*
 * Score flags for input-code guessing.  Each flag is the previous one
 * shifted left by one bit, so they can be OR-ed together in a score field.
 * The result-selection code in this file prefers the candidate with the
 * numerically smaller score.
 */
1975 #define SCORE_L2 (1) /* JIS level-2 kanji */
1976 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1977 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1978 #define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932 remapping (IBM extended characters) */
1979 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
1980 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* nonexistent character */
1981 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1982 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* initial score given to a candidate by status_reset() */
1984 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score lookup tables used by code_score().  Both are indexed with the low
 * nibble of the first byte (c2 & 0x0f): score_table_A0 when
 * (c2 & 0x70) == 0x20 and score_table_F0 when (c2 & 0x70) == 0x70.
 * NOTE(review): some initializer rows of score_table_A0 and the closing
 * braces are absent from this listing.
 */
1986 static const char score_table_A0[] = {
1989 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1990 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/* table for first bytes whose bits 4-6 are all set */
1993 static const char score_table_F0[] = {
1994 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1995 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1996 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
1997 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score: OR the given SCORE_* bits into ptr->score.
 * NOTE(review): any guard around the assignment is not visible here.
 */
2000 void set_code_score(struct input_code *ptr, nkf_char score)
2003 ptr->score |= score;
/*
 * clr_code_score: clear the given SCORE_* bits from ptr->score.
 * NOTE(review): any guard around the assignment is not visible here.
 */
2007 void clr_code_score(struct input_code *ptr, nkf_char score)
2010 ptr->score &= ~score;
/*
 * code_score: classify the character held in ptr->buf (buf[0] is read as
 * c2, buf[1] as c1) and add the matching SCORE_* flag to the candidate:
 *   - SCORE_ERROR     (condition not visible in this listing)
 *   - SCORE_KANA      when c2 == SSO
 *   - SCORE_X0212     when c2 == 0x8f
 *   - SCORE_NO_EXIST  when e2w_conv(c2, c1) yields zero
 *                     (UTF8_OUTPUT_ENABLE builds only)
 *   - a table value   when (c2 & 0x70) is 0x20 or 0x70
 *   - SCORE_L2        when (c2 & 0x70) >= 0x50
 */
2014 void code_score(struct input_code *ptr)
2016 nkf_char c2 = ptr->buf[0];
2017 #ifdef UTF8_OUTPUT_ENABLE
2018 nkf_char c1 = ptr->buf[1];
2021 set_code_score(ptr, SCORE_ERROR);
2022 }else if (c2 == SSO){
2023 set_code_score(ptr, SCORE_KANA);
2024 }else if (c2 == 0x8f){
2025 set_code_score(ptr, SCORE_X0212);
2026 #ifdef UTF8_OUTPUT_ENABLE
2027 }else if (!e2w_conv(c2, c1)){
2028 set_code_score(ptr, SCORE_NO_EXIST);
/* the remaining branches look only at bits 4-6 of the first byte */
2030 }else if ((c2 & 0x70) == 0x20){
2031 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2032 }else if ((c2 & 0x70) == 0x70){
2033 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2034 }else if ((c2 & 0x70) >= 0x50){
2035 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: take a candidate out of the guessing process.
 * Visible effect: if the active converter iconv is this candidate's
 * iconv_func, it is reset with set_iconv(FALSE, 0).
 * NOTE(review): the statements that mark the candidate itself as disabled
 * are not part of this listing.
 */
2039 void status_disable(struct input_code *ptr)
2044 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: append c to ptr->buf at position ptr->index and advance
 * the index.  No bounds check is visible in this listing.
 */
2047 void status_push_ch(struct input_code *ptr, nkf_char c)
2049 ptr->buf[ptr->index++] = c;
/*
 * status_clear: helper operating on one input_code candidate.
 * NOTE(review): the body is not part of this listing; presumably it resets
 * the per-character state of the candidate -- confirm against the complete
 * source.
 */
2052 void status_clear(struct input_code *ptr)
/*
 * status_reset: put the candidate score back to SCORE_INIT.
 * NOTE(review): other statements of the body are not visible here.
 */
2058 void status_reset(struct input_code *ptr)
2061 ptr->score = SCORE_INIT;
/*
 * status_reinit: zero the candidate member _file_stat.
 * NOTE(review): other statements of the body are not visible here.
 */
2064 void status_reinit(struct input_code *ptr)
2067 ptr->_file_stat = 0;
/*
 * status_check: called by the s_status/e_status/w_status trackers with the
 * current byte.  It acts only when c <= DEL and the input code is already
 * established (estab_f).
 * NOTE(review): the action taken under that condition is not visible here.
 */
2070 void status_check(struct input_code *ptr, nkf_char c)
2072 if (c <= DEL && estab_f){
/*
 * s_status: per-byte state tracker for the Shift_JIS candidate.
 *   ptr - candidate state (buf, index, score are used here)
 *   c   - next input byte
 * First-byte handling, as far as visible: 0xa1-0xdf is stored as an
 * SSO-prefixed pair; 0x81-0x9f, 0xe0-0xea, 0xed-0xee, IBM-extension lead
 * bytes (SHIFTJIS_CP932) and 0xf0-0xfc are stored as lead bytes; anything
 * else disables the candidate.  A trail byte must lie in 0x40-0x7e or
 * 0x80-0xfc; the pair is then converted in place with s2e_conv, and some
 * branches add SCORE_CP932.
 * NOTE(review): the switch on the tracker state, its case labels and the
 * state assignments are absent from this listing.
 */
2077 void s_status(struct input_code *ptr, nkf_char c)
2081 status_check(ptr, c);
2086 #ifdef NUMCHAR_OPTION
2087 }else if (is_unicode_capsule(c)){
/* single-byte range 0xa1-0xdf: stored with an SSO prefix */
2090 }else if (0xa1 <= c && c <= 0xdf){
2091 status_push_ch(ptr, SSO);
2092 status_push_ch(ptr, c);
/* lead bytes of two-byte characters */
2095 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2097 status_push_ch(ptr, c);
2098 }else if (0xed <= c && c <= 0xee){
2100 status_push_ch(ptr, c);
2101 #ifdef SHIFTJIS_CP932
2102 }else if (is_ibmext_in_sjis(c)){
2104 status_push_ch(ptr, c);
2105 #endif /* SHIFTJIS_CP932 */
2107 }else if (0xf0 <= c && c <= 0xfc){
2109 status_push_ch(ptr, c);
2110 #endif /* X0212_ENABLE */
2112 status_disable(ptr);
/* trail byte: convert the stored pair in place */
2116 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2117 status_push_ch(ptr, c);
2118 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2122 status_disable(ptr);
2126 #ifdef SHIFTJIS_CP932
2127 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2128 status_push_ch(ptr, c);
/* a zero result from s2e_conv is scored as a CP932 remapping */
2129 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2130 set_code_score(ptr, SCORE_CP932);
2135 #endif /* SHIFTJIS_CP932 */
2136 status_disable(ptr);
2139 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2140 status_push_ch(ptr, c);
2141 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2142 set_code_score(ptr, SCORE_CP932);
2145 status_disable(ptr);
/*
 * e_status: per-byte state tracker for the EUC candidate.
 *   ptr - candidate state
 *   c   - next input byte
 * Visible first-byte handling: SSO or 0xa1-0xfe is stored as a lead byte,
 * 0x8f is stored as a lead byte in X0212_ENABLE builds, anything else
 * disables the candidate.  Following bytes must lie in 0xa1-0xfe,
 * otherwise the candidate is disabled.
 * NOTE(review): the switch on the tracker state, its case labels and the
 * state assignments are absent from this listing.
 */
2151 void e_status(struct input_code *ptr, nkf_char c)
2155 status_check(ptr, c);
2160 #ifdef NUMCHAR_OPTION
2161 }else if (is_unicode_capsule(c)){
2164 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2166 status_push_ch(ptr, c);
2168 }else if (0x8f == c){
2170 status_push_ch(ptr, c);
2171 #endif /* X0212_ENABLE */
2173 status_disable(ptr);
/* following bytes are accepted only in 0xa1-0xfe */
2177 if (0xa1 <= c && c <= 0xfe){
2178 status_push_ch(ptr, c);
2182 status_disable(ptr);
2187 if (0xa1 <= c && c <= 0xfe){
2189 status_push_ch(ptr, c);
2191 status_disable(ptr);
2193 #endif /* X0212_ENABLE */
2197 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: per-byte state tracker for the UTF-8 candidate.
 *   ptr - candidate state
 *   c   - next input byte
 * Visible first-byte handling: 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 are
 * stored as lead bytes, anything else disables the candidate.
 * Continuation bytes must lie in 0x80-0xbf.  Once enough bytes are stored
 * (ptr->index > ptr->stat) the sequence is checked against EF BB BF and
 * converted in place with w2e_conv.
 * NOTE(review): the switch on the tracker state, its case labels, the
 * state assignments and the use made of the bom flag are absent from this
 * listing.
 */
2198 void w_status(struct input_code *ptr, nkf_char c)
2202 status_check(ptr, c);
2207 #ifdef NUMCHAR_OPTION
2208 }else if (is_unicode_capsule(c)){
2211 }else if (0xc0 <= c && c <= 0xdf){
2213 status_push_ch(ptr, c);
2214 }else if (0xe0 <= c && c <= 0xef){
2216 status_push_ch(ptr, c);
2217 }else if (0xf0 <= c && c <= 0xf4){
2219 status_push_ch(ptr, c);
2221 status_disable(ptr);
2226 if (0x80 <= c && c <= 0xbf){
2227 status_push_ch(ptr, c);
2228 if (ptr->index > ptr->stat){
/* remember whether the completed sequence is the UTF-8 signature */
2229 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2230 && ptr->buf[2] == 0xbf);
2231 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2232 &ptr->buf[0], &ptr->buf[1]);
2239 status_disable(ptr);
2243 if (0x80 <= c && c <= 0xbf){
2244 if (ptr->index < ptr->stat){
2245 status_push_ch(ptr, c);
2250 status_disable(ptr);
/*
 * code_status: feed one input byte to every candidate tracker.
 * Walks input_code_list and calls the status_func of each entry that has
 * one.  When a result candidate was selected and the input code is not yet
 * established (estab_f is zero), the converter of that candidate is
 * installed with set_iconv(TRUE, ...).  Otherwise, for c <= DEL, the list
 * is walked a second time.
 * NOTE(review): the loop constructs, the rules that set result and
 * action_flag, and the body of the second walk are absent from this
 * listing.
 */
2257 void code_status(nkf_char c)
2259 int action_flag = 1;
2260 struct input_code *result = 0;
2261 struct input_code *p = input_code_list;
2263 if (!p->status_func) {
2267 if (!p->status_func)
2269 (p->status_func)(p, c);
2272 }else if(p->stat == 0){
2283 if (result && !estab_f){
2284 set_iconv(TRUE, result->iconv_func);
2285 }else if (c <= DEL){
2286 struct input_code *ptr = input_code_list;
/*
 * std_getc: base input routine.  The visible statement pops the most
 * recently stored byte from the pushback buffer std_gc_buf.
 * NOTE(review): the condition guarding the pop and the read from f are not
 * part of this listing.
 */
2296 nkf_char std_getc(FILE *f)
2299 return std_gc_buf[--std_gc_ndx];
/*
 * std_ungetc: push c back so that std_getc returns it again.  The byte is
 * stored at std_gc_buf[std_gc_ndx] and the index is advanced.
 * NOTE(review): what happens when the buffer is full
 * (std_gc_ndx == STD_GC_BUFSIZE) and the return value are not visible here.
 */
2305 nkf_char std_ungetc(nkf_char c, FILE *f)
2307 if (std_gc_ndx == STD_GC_BUFSIZE){
2310 std_gc_buf[std_gc_ndx++] = c;
/*
 * std_putc: base output routine taking one character.
 * NOTE(review): the body is not part of this listing.
 */
2315 void std_putc(nkf_char c)
2322 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * noconvert: pass-through mode.  Sets up the filter chain with
 * module_connection() and then reads from f through i_getc until EOF.
 * NOTE(review): the loop body and the return value are not visible here.
 */
2323 nkf_char noconvert(FILE *f)
2328 module_connection();
2329 while ((c = (*i_getc)(f)) != EOF)
/*
 * module_connection: wire up the output and input filter chains.
 *
 * Output side: starts from output_conv and, for each enabled feature,
 * saves the current oconv in a feature-specific slot (o_base64conv,
 * o_nlconv, o_rot_conv, ...) and replaces oconv with the wrapper of that
 * feature, so the wrapper added last runs first.
 *
 * Input side: the same pattern is applied to the i_getc / i_ungetc pair
 * for the cap, url, numchar, nfc, mime and broken wrappers.
 *
 * Finally the input converter is chosen from input_f: an explicit input
 * code is forced with set_iconv(-TRUE, ...), otherwise e_iconv is set with
 * set_iconv(FALSE, ...).
 * NOTE(review): several enabling conditions and the initial assignments of
 * o_putc and i_getc are absent from this listing.
 */
2336 void module_connection(void)
2338 oconv = output_conv;
2341 /* replace continuation module, from output side */
2343 /* output redirection */
2345 if (noout_f || guess_f){
2352 if (mimeout_f == TRUE) {
2353 o_base64conv = oconv; oconv = base64_conv;
2355 /* base64_count = 0; */
2358 if (nlmode_f || guess_f) {
2359 o_nlconv = oconv; oconv = nl_conv;
2362 o_rot_conv = oconv; oconv = rot_conv;
2365 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2368 o_hira_conv = oconv; oconv = hira_conv;
2371 o_fconv = oconv; oconv = fold_conv;
2374 if (alpha_f || x0201_f) {
2375 o_zconv = oconv; oconv = z_conv;
2379 i_ungetc = std_ungetc;
2380 /* input redirection */
2383 i_cgetc = i_getc; i_getc = cap_getc;
2384 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2387 i_ugetc = i_getc; i_getc = url_getc;
2388 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2391 #ifdef NUMCHAR_OPTION
2393 i_ngetc = i_getc; i_getc = numchar_getc;
2394 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2397 #ifdef UNICODE_NORMALIZATION
2398 if (nfc_f && input_f == UTF8_INPUT){
2399 i_nfc_getc = i_getc; i_getc = nfc_getc;
2400 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2403 if (mime_f && mimebuf_f==FIXED_MIME) {
2404 i_mgetc = i_getc; i_getc = mime_getc;
2405 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2408 i_bgetc = i_getc; i_getc = broken_getc;
2409 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* an explicitly requested input code is forced with -TRUE */
2411 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2412 set_iconv(-TRUE, e_iconv);
2413 } else if (input_f == SJIS_INPUT) {
2414 set_iconv(-TRUE, s_iconv);
2415 #ifdef UTF8_INPUT_ENABLE
2416 } else if (input_f == UTF8_INPUT) {
2417 set_iconv(-TRUE, w_iconv);
2418 } else if (input_f == UTF16_INPUT) {
2419 set_iconv(-TRUE, w_iconv16);
2420 } else if (input_f == UTF32_INPUT) {
2421 set_iconv(-TRUE, w_iconv32);
2424 set_iconv(FALSE, e_iconv);
2428 struct input_code *p = input_code_list;
2436 * Check and Ignore BOM
/*
 * check_bom: look for a byte order mark at the start of the input.
 * Reads up to four bytes through i_getc.  When a known signature is found
 * the matching converter is selected with set_iconv(TRUE, ...) and, if that
 * converter is the active one, input_endian is set; bytes that turn out not
 * to belong to a signature are returned with i_ungetc in reverse order.
 *
 * Signatures recognisable from the visible comparisons and push-backs:
 *   00 00 FE FF  -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, ENDIAN_2143
 *   EF BB BF     -> w_iconv
 *   FE FF 00 00  -> w_iconv32, ENDIAN_3412
 *   FE FF        -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, ENDIAN_LITTLE
 * NOTE(review): the case labels of the switch, the guards around the
 * set_iconv calls and the early returns are absent from this listing; the
 * first byte of each group above is inferred from the byte pushed back at
 * the end of that group.
 */
2438 void check_bom(FILE *f)
2441 switch(c2 = (*i_getc)(f)){
2443 if((c2 = (*i_getc)(f)) == 0x00){
2444 if((c2 = (*i_getc)(f)) == 0xFE){
2445 if((c2 = (*i_getc)(f)) == 0xFF){
2447 set_iconv(TRUE, w_iconv32);
2449 if (iconv == w_iconv32) {
2450 input_endian = ENDIAN_BIG;
2453 (*i_ungetc)(0xFF,f);
2454 }else (*i_ungetc)(c2,f);
2455 (*i_ungetc)(0xFE,f);
2456 }else if(c2 == 0xFF){
2457 if((c2 = (*i_getc)(f)) == 0xFE){
2459 set_iconv(TRUE, w_iconv32);
2461 if (iconv == w_iconv32) {
2462 input_endian = ENDIAN_2143;
2465 (*i_ungetc)(0xFF,f);
2466 }else (*i_ungetc)(c2,f);
2467 (*i_ungetc)(0xFF,f);
2468 }else (*i_ungetc)(c2,f);
2469 (*i_ungetc)(0x00,f);
2470 }else (*i_ungetc)(c2,f);
2471 (*i_ungetc)(0x00,f);
2474 if((c2 = (*i_getc)(f)) == 0xBB){
2475 if((c2 = (*i_getc)(f)) == 0xBF){
2477 set_iconv(TRUE, w_iconv);
2479 if (iconv == w_iconv) {
2482 (*i_ungetc)(0xBF,f);
2483 }else (*i_ungetc)(c2,f);
2484 (*i_ungetc)(0xBB,f);
2485 }else (*i_ungetc)(c2,f);
2486 (*i_ungetc)(0xEF,f);
2489 if((c2 = (*i_getc)(f)) == 0xFF){
2490 if((c2 = (*i_getc)(f)) == 0x00){
2491 if((c2 = (*i_getc)(f)) == 0x00){
2493 set_iconv(TRUE, w_iconv32);
2495 if (iconv == w_iconv32) {
2496 input_endian = ENDIAN_3412;
2499 (*i_ungetc)(0x00,f);
2500 }else (*i_ungetc)(c2,f);
2501 (*i_ungetc)(0x00,f);
2502 }else (*i_ungetc)(c2,f);
2504 set_iconv(TRUE, w_iconv16);
2506 if (iconv == w_iconv16) {
2507 input_endian = ENDIAN_BIG;
2510 (*i_ungetc)(0xFF,f);
2511 }else (*i_ungetc)(c2,f);
2512 (*i_ungetc)(0xFE,f);
2515 if((c2 = (*i_getc)(f)) == 0xFE){
2516 if((c2 = (*i_getc)(f)) == 0x00){
2517 if((c2 = (*i_getc)(f)) == 0x00){
2519 set_iconv(TRUE, w_iconv32);
2521 if (iconv == w_iconv32) {
2522 input_endian = ENDIAN_LITTLE;
2525 (*i_ungetc)(0x00,f);
2526 }else (*i_ungetc)(c2,f);
2527 (*i_ungetc)(0x00,f);
2528 }else (*i_ungetc)(c2,f);
2530 set_iconv(TRUE, w_iconv16);
2532 if (iconv == w_iconv16) {
2533 input_endian = ENDIAN_LITTLE;
2536 (*i_ungetc)(0xFE,f);
2537 }else (*i_ungetc)(c2,f);
2538 (*i_ungetc)(0xFF,f);
2547 Conversion main loop. Code detection only.
/*
 * kanji_convert: main conversion loop for one input stream.
 *   f - input stream, read only through the i_getc / i_ungetc chain.
 *
 * Outline, as far as this listing shows:
 *   1. module_connection() builds the filter chains.
 *   2. Bytes are read until EOF.  c2 holds a pending first byte and c1 the
 *      current byte; the local macros NEXT, SEND and LAST stand for
 *      continue, fall through to output, and break.
 *   3. While the input code is not established and no MIME decoding is in
 *      progress, an 8-bit pair is handed to h_conv(), which buffers input
 *      until the code can be decided.
 *   4. UTF-16 and UTF-32 input is assembled from 2 or 4 bytes according to
 *      input_endian (surrogate lead bytes 0xD8-0xDB pull in two more bytes).
 *   5. 7-bit input handles SI/SO, ESC sequences (kanji, ASCII and X0201
 *      introducers, X0212/X0213 designations, ESC N), MIME start sequences
 *      and line ends; in Shift_JIS input ESC-$ followed by E-G or O-Q is
 *      decoded through jphone_emoji_first_table.
 *   6. Completed characters go through (*iconv) or (*oconv).
 *   7. After the loop (*iconv)(EOF, 0, 0) is called and, when no input code
 *      name is set yet, the name of the input_code_list entry with the
 *      lowest score is recorded.
 *
 * NOTE(review): many short lines (braces, NEXT/SEND/LAST statements, several
 * else branches and the final return) are absent from this listing, so the
 * exact control flow must be confirmed against the complete source.
 * NOTE(review): in the 0x7F <= c2 <= 0x92 branch the test c1 == 0x7F follows
 * a condition that already limits c1 to 0x21-0x7E and therefore looks
 * unreachable -- confirm.
 */
2550 nkf_char kanji_convert(FILE *f)
2552 nkf_char c3, c2=0, c1, c0=0;
2553 int is_8bit = FALSE;
2555 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2556 #ifdef UTF8_INPUT_ENABLE
2557 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2564 output_mode = ASCII;
2567 #define NEXT continue /* no output, get next */
2568 #define SEND ; /* output c1 and c2, get next */
2569 #define LAST break /* end of loop, go closing */
2571 module_connection();
2574 while ((c1 = (*i_getc)(f)) != EOF) {
2575 #ifdef INPUT_CODE_FIX
2581 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2582 /* in case of 8th bit is on */
2583 if (!estab_f&&!mime_decode_mode) {
2584 /* in case of not established yet */
2585 /* It is still ambiguous */
2586 if (h_conv(f, c2, c1)==EOF)
2592 /* in case of already established */
2594 /* ignore bogus code and not CP5022x UCD */
2602 /* second byte, 7 bit code */
2603 /* it might be kanji shifted */
2604 if ((c1 == DEL) || (c1 <= SP)) {
2605 /* ignore bogus first code */
2612 #ifdef UTF8_INPUT_ENABLE
2613 if (iconv == w_iconv16) {
2614 if (input_endian == ENDIAN_BIG) {
2616 if ((c1 = (*i_getc)(f)) != EOF) {
2617 if (0xD8 <= c2 && c2 <= 0xDB) {
2618 if ((c0 = (*i_getc)(f)) != EOF) {
2620 if ((c3 = (*i_getc)(f)) != EOF) {
2627 if ((c2 = (*i_getc)(f)) != EOF) {
2628 if (0xD8 <= c2 && c2 <= 0xDB) {
2629 if ((c3 = (*i_getc)(f)) != EOF) {
2630 if ((c0 = (*i_getc)(f)) != EOF) {
2639 } else if(iconv == w_iconv32){
2641 if((c2 = (*i_getc)(f)) != EOF &&
2642 (c1 = (*i_getc)(f)) != EOF &&
2643 (c0 = (*i_getc)(f)) != EOF){
2644 switch(input_endian){
2646 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2649 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2652 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2655 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2665 #ifdef NUMCHAR_OPTION
2666 if (is_unicode_capsule(c1)){
2670 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2672 if (!estab_f && !iso8859_f) {
2673 /* not established yet */
2676 } else { /* estab_f==TRUE */
2681 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2682 /* SJIS X0201 Case... */
2683 if (iso2022jp_f && !x0201_f) {
2684 (*oconv)(GETA1, GETA2);
2691 } else if (c1==SSO && iconv != s_iconv) {
2692 /* EUC X0201 Case */
2693 c1 = (*i_getc)(f); /* skip SSO */
2695 if (SSP<=c1 && c1<0xe0) {
2696 if (iso2022jp_f && !x0201_f) {
2697 (*oconv)(GETA1, GETA2);
2704 } else { /* bogus code, skip SSO and one byte */
2707 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2708 (c1 == 0xFD || c1 == 0xFE)) {
2714 /* already established */
2719 } else if ((c1 > SP) && (c1 != DEL)) {
2720 /* in case of Roman characters */
2722 /* output 1 shifted byte */
2726 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2727 /* output 1 shifted byte */
2728 if (iso2022jp_f && !x0201_f) {
2729 (*oconv)(GETA1, GETA2);
2736 /* look like bogus code */
2739 } else if (input_mode == X0208 || input_mode == X0212 ||
2740 input_mode == X0213_1 || input_mode == X0213_2) {
2741 /* in case of Kanji shifted */
2744 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2745 /* Check MIME code */
2746 if ((c1 = (*i_getc)(f)) == EOF) {
2749 } else if (c1 == '?') {
2750 /* =? is mime conversion start sequence */
2751 if(mime_f == STRICT_MIME) {
2752 /* check in real detail */
2753 if (mime_begin_strict(f) == EOF)
2757 } else if (mime_begin(f) == EOF)
2767 /* normal ASCII code */
2770 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2773 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2776 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2777 if ((c1 = (*i_getc)(f)) == EOF) {
2778 /* (*oconv)(0, ESC); don't send bogus code */
2780 } else if (c1 == '$') {
2781 if ((c1 = (*i_getc)(f)) == EOF) {
2783 (*oconv)(0, ESC); don't send bogus code
2784 (*oconv)(0, '$'); */
2786 } else if (c1 == '@'|| c1 == 'B') {
2787 /* This is kanji introduction */
2790 set_input_codename("ISO-2022-JP");
2792 debug("ISO-2022-JP");
2795 } else if (c1 == '(') {
2796 if ((c1 = (*i_getc)(f)) == EOF) {
2797 /* don't send bogus code
2803 } else if (c1 == '@'|| c1 == 'B') {
2804 /* This is kanji introduction */
2809 } else if (c1 == 'D'){
2813 #endif /* X0212_ENABLE */
2814 } else if (c1 == (X0213_1&0x7F)){
2815 input_mode = X0213_1;
2818 } else if (c1 == (X0213_2&0x7F)){
2819 input_mode = X0213_2;
2823 /* could be some special code */
2830 } else if (broken_f&0x2) {
2831 /* accept any ESC-(-x as broken code ... */
2841 } else if (c1 == '(') {
2842 if ((c1 = (*i_getc)(f)) == EOF) {
2843 /* don't send bogus code
2845 (*oconv)(0, '('); */
2849 /* This is X0201 kana introduction */
2850 input_mode = X0201; shift_mode = X0201;
2852 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2853 /* This is X0208 kanji introduction */
2854 input_mode = ASCII; shift_mode = FALSE;
2856 } else if (broken_f&0x2) {
2857 input_mode = ASCII; shift_mode = FALSE;
2862 /* maintain various input_mode here */
2866 } else if ( c1 == 'N' || c1 == 'n'){
2868 c3 = (*i_getc)(f); /* skip SS2 */
2869 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2884 } else if (c1 == ESC && iconv == s_iconv) {
2885 /* ESC in Shift_JIS */
2886 if ((c1 = (*i_getc)(f)) == EOF) {
2887 /* (*oconv)(0, ESC); don't send bogus code */
2889 } else if (c1 == '$') {
2891 if ((c1 = (*i_getc)(f)) == EOF) {
2893 (*oconv)(0, ESC); don't send bogus code
2894 (*oconv)(0, '$'); */
2897 if (('E' <= c1 && c1 <= 'G') ||
2898 ('O' <= c1 && c1 <= 'Q')) {
2906 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2907 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2908 while ((c1 = (*i_getc)(f)) != EOF) {
2909 if (SP <= c1 && c1 <= 'z') {
2910 (*oconv)(0, c1 + c0);
2911 } else break; /* c1 == SO */
2915 if (c1 == EOF) LAST;
2922 } else if (c1 == LF || c1 == CR) {
2924 input_mode = ASCII; set_iconv(FALSE, 0);
2926 } else if (mime_decode_f && !mime_decode_mode){
2928 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2936 } else { /* if (c1 == CR)*/
2937 if ((c1=(*i_getc)(f))!=EOF) {
2941 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2955 } else if (c1 == DEL && input_mode == X0208) {
2965 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2968 if ((c0 = (*i_getc)(f)) != EOF) {
2971 if ((c3 = (*i_getc)(f)) != EOF) {
2973 (*iconv)(c2, c1, c0|c3);
2978 /* 3 bytes EUC or UTF-8 */
2979 if ((c0 = (*i_getc)(f)) != EOF) {
2981 (*iconv)(c2, c1, c0);
2989 0x7F <= c2 && c2 <= 0x92 &&
2990 0x21 <= c1 && c1 <= 0x7E) {
2992 if(c1 == 0x7F) return 0;
2993 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2996 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
3000 (*oconv)(PREFIX_EUCG3 | c2, c1);
3002 #endif /* X0212_ENABLE */
3004 (*oconv)(PREFIX_EUCG3 | c2, c1);
3007 (*oconv)(input_mode, c1); /* other special case */
3013 /* goto next_word */
3017 (*iconv)(EOF, 0, 0);
3018 if (!input_codename)
3021 struct input_code *p = input_code_list;
3022 struct input_code *result = p;
3024 if (p->score < result->score) result = p;
3027 set_input_codename(result->name);
3029 debug(result->name);
/*
 * h_conv: hold-buffer conversion, called from kanji_convert() when an 8-bit
 * pair arrives before the input code is established.
 *   f      - input stream (read through i_getc)
 *   c2, c1 - the two bytes that triggered the call
 *
 * Phase 1 reads further bytes and stores them with push_hold_buf() until
 * EOF, until the code becomes established (estab_f), or until the hold
 * buffer is full.  When the code is still undecided, the input_code_list
 * entry that has a status_func and the lowest score is installed with
 * set_iconv(TRUE, ...).
 * Phase 2 replays hold_buf through (*iconv); when a character needs more
 * bytes than the buffer holds, they are read directly from f.
 * NOTE(review): the return type line, the return statements, braces and
 * several branches are absent from this listing; the caller compares the
 * result with EOF.
 */
3037 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3039 nkf_char ret, c3, c0;
3043 /** it must NOT be in the kanji shifted sequence */
3044 /** it must NOT be written in JIS7 */
3045 /** and it must be after 2 byte 8bit code */
3051 while ((c1 = (*i_getc)(f)) != EOF) {
3057 if (push_hold_buf(c1) == EOF || estab_f){
3063 struct input_code *p = input_code_list;
3064 struct input_code *result = p;
3069 if (p->status_func && p->score < result->score){
3074 set_iconv(TRUE, result->iconv_func);
3079 ** 1) EOF is detected, or
3080 ** 2) Code is established, or
3081 ** 3) Buffer is FULL (but last word is pushed)
3083 ** in 1) and 3) cases, we continue to use
3084 ** Kanji codes by oconv and leave estab_f unchanged.
3089 while (hold_index < hold_count){
3090 c2 = hold_buf[hold_index++];
3092 #ifdef NUMCHAR_OPTION
3093 || is_unicode_capsule(c2)
3098 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3099 (*iconv)(X0201, c2, 0);
3102 if (hold_index < hold_count){
3103 c1 = hold_buf[hold_index++];
3113 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3116 if (hold_index < hold_count){
3117 c0 = hold_buf[hold_index++];
3118 } else if ((c0 = (*i_getc)(f)) == EOF) {
3124 if (hold_index < hold_count){
3125 c3 = hold_buf[hold_index++];
3126 } else if ((c3 = (*i_getc)(f)) == EOF) {
3131 (*iconv)(c2, c1, c0|c3);
3136 /* 3 bytes EUC or UTF-8 */
3137 if (hold_index < hold_count){
3138 c0 = hold_buf[hold_index++];
3139 } else if ((c0 = (*i_getc)(f)) == EOF) {
3145 (*iconv)(c2, c1, c0);
3148 if (c0 == EOF) break;
/*
 * push_hold_buf: append one byte to hold_buf.
 *   c2 - byte to store (truncated to unsigned char)
 * Returns EOF when the buffer has reached HOLD_SIZE*2 entries after the
 * store, otherwise the new hold_count.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not visible in this listing.
 */
3153 nkf_char push_hold_buf(nkf_char c2)
3155 if (hold_count >= HOLD_SIZE*2)
3157 hold_buf[hold_count++] = (unsigned char)c2;
3158 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv: convert a Shift_JIS byte pair to the internal EUC-style pair.
 *   c2, c1 - Shift_JIS lead and trail byte
 *   p2, p1 - output locations (the stores are not visible in this listing)
 *
 * Visible steps:
 *   - SHIFTJIS_CP932: IBM-extension lead bytes are remapped through
 *     shiftjis_cp932 unless cp932inv_f is set; a second remapping through
 *     cp932inv applies to lead bytes between CP932INV_TABLE_BEGIN and
 *     CP932INV_TABLE_END.
 *   - Without x0213_f, IBM-extension lead bytes are looked up in
 *     shiftjis_x0212 and tagged with PREFIX_EUCG3.
 *   - With x0213_f and c2 >= 0xF0 the pair is mapped with
 *     shift_jisx0213_s1a3_table or by arithmetic, tagged with PREFIX_EUCG3.
 *   - Otherwise the standard arithmetic using SJ0162 / SJ6394 is applied;
 *     a trail byte above 0x9E selects the second row of the pair.
 * NOTE(review): the return statements and several guards are absent from
 * this listing; callers in this file test the result against zero.
 */
3161 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3163 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3166 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3167 #ifdef SHIFTJIS_CP932
3168 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3169 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3176 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3177 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3183 #endif /* SHIFTJIS_CP932 */
3185 if (!x0213_f && is_ibmext_in_sjis(c2)){
3186 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3189 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3202 if(x0213_f && c2 >= 0xF0){
3203 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3204 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3205 }else{ /* 78<=k<=94 */
3206 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3207 if (0x9E < c1) c2++;
3210 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3211 if (0x9E < c1) c2++;
3214 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3221 c2 = x0212_unshift(c2);
/*
 * s_iconv: input-side converter selected when iconv == s_iconv
 * (Shift_JIS input, judging by the helpers it calls).
 * - A c2 that is EOF, 0 or below SP is tested for separately.
 * - Without x0213_f, lead bytes 0xF0..0xF9 with trail bytes 0x40..0xFC are
 *   mapped to CLASS_UNICODE values starting at 0xE000, 188 cells per lead
 *   byte; trail byte 0x7F is rejected with a return value of 0.
 * - Other pairs go through s2e_conv(); a non-zero result is returned as is.
 * c0 is not used in the lines annotated here.
 */
3228 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3232 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3234 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3236 if(c1 == 0x7F) return 0;
/* (0x7E < c1) closes the gap left by the unused trail byte 0x7F. */
3237 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3240 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3241 if (ret) return ret;
/*
 * e_iconv: input-side converter for EUC-style byte sequences (c2, c1, c0).
 * Branches visible here:
 * - c2 == 0x8f: three-byte form; c0 carries the third byte.
 * - c2 == SSO.
 * - c2 that is EOF, 0 or below SP.
 * - otherwise a two-byte code in c2/c1.
 * Rows 0xF5..0xFE are treated as eucJP-ms user-defined characters and mapped
 * to CLASS_UNICODE values when the flags tested below allow it.
 */
3247 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3252 }else if (c2 == 0x8f){
3256 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3257 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3258 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
/* Pack the 0x8f prefix and the 7-bit row number into c2. */
3261 c2 = (c2 << 8) | (c1 & 0x7f);
3263 #ifdef SHIFTJIS_CP932
/* Round trip through e2s_conv()/s2e_conv() when the first conversion succeeds
 * (returns 0); the enclosing condition is not annotated here. */
3266 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3267 s2e_conv(s2, s1, &c2, &c1);
3274 #endif /* SHIFTJIS_CP932 */
3276 #endif /* X0212_ENABLE */
3277 } else if (c2 == SSO){
3280 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3283 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3284 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3285 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3290 #ifdef SHIFTJIS_CP932
/* With cp51932_f, rows 0x79..0x7c get the same e2s_conv()/s2e_conv() round trip. */
3291 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3293 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3294 s2e_conv(s2, s1, &c2, &c1);
3301 #endif /* SHIFTJIS_CP932 */
3308 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv: convert a UTF-8 sequence (c2 = lead byte, c1, c0) and store the
 * result through p2/p1.
 * Lead bytes 0xc0..0xef are handed to unicode_to_jis_common().  Under
 * NUMCHAR_OPTION the code point decoded by ww16_conv() can be stored in *p1
 * tagged with CLASS_UNICODE (the condition for that is not annotated here --
 * TODO confirm).
 */
3309 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3316 }else if (0xc0 <= c2 && c2 <= 0xef) {
3317 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3318 #ifdef NUMCHAR_OPTION
3321 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * w_iconv: input-side converter for UTF-8.
 * c2 is the lead byte, c1 the second byte, c0 the remaining byte(s); for
 * four-byte sequences c0 holds two bytes (see the 0xc0c0 masks below).
 * Return values visible here: 0 for an ignored/invalid second byte, -1 and
 * -2 when c0 is still 0 -- presumably a request for one or two more bytes,
 * TODO confirm against the caller.
 */
3329 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* Class of each lead byte, indexed by c2 - 0xC0.  The tens digit appears to
 * be the sequence length and the units digit selects the range check applied
 * to the following bytes. */
3332 static const char w_iconv_utf8_1st_byte[] =
3334 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3335 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3336 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3337 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3339 if (c2 < 0 || 0xff < c2) {
3340 }else if (c2 == 0) { /* 0 : 1 byte*/
3342 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3345 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
/* Two-byte form: the second byte must be a continuation byte. */
3347 if (c1 < 0x80 || 0xBF < c1) return 0;
/* Three-byte forms: second byte limited to 0xA0..0xBF, any continuation
 * byte, or 0x80..0x9F depending on the class; c0 must be a continuation byte. */
3350 if (c0 == 0) return -1;
3351 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3356 if (c0 == 0) return -1;
3357 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3361 if (c0 == 0) return -1;
3362 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
/* Four-byte forms: second byte limited to 0x90..0xBF, 0x80..0xBF or
 * 0x80..0x8F; both bytes packed in c0 must be continuation bytes. */
3366 if (c0 == 0) return -2;
3367 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3371 if (c0 == 0) return -2;
3372 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3376 if (c0 == 0) return -2;
3377 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3385 if (c2 == 0 || c2 == EOF){
/* Four-byte sequences are kept as a CLASS_UNICODE code point; the rest is
 * converted by w2e_conv(). */
3386 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3387 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3390 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3399 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the Unicode value val as UTF-8.
 *   *p2 receives the lead byte and *p1 the second byte.
 *   *p0 receives the third byte; for four-byte sequences it receives the
 *   third and fourth bytes packed as (third << 8) | fourth, which is the
 *   layout ww16_conv() decodes.
 */
3400 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
/* Two-byte form: 110xxxxx 10xxxxxx. */
3407 }else if (val < 0x800){
3408 *p2 = 0xc0 | (val >> 6);
3409 *p1 = 0x80 | (val & 0x3f);
/* Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. */
3411 } else if (val <= NKF_INT32_C(0xFFFF)) {
3412 *p2 = 0xe0 | (val >> 12);
3413 *p1 = 0x80 | ((val >> 6) & 0x3f);
3414 *p0 = 0x80 | (val & 0x3f);
/* Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.  The lead byte
 * carries bits 18..20, matching w_oconv() and the decoder in ww16_conv();
 * 0xe0 | (val >> 16) would produce a three-byte lead byte. */
3415 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3416 *p2 = 0xf0 | (val >> 18);
3417 *p1 = 0x80 | ((val >> 12) & 0x3f);
3418 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3427 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: decode a UTF-8 sequence into its Unicode value.
 * c2 is the lead byte, c1 the second byte, and c0 the remaining byte(s);
 * for four-byte sequences c0 holds the third byte in bits 8..15 and the
 * fourth byte in bits 0..7.
 */
3428 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3433 } else if (c2 >= 0xf0){
3434 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3435 val = (c2 & 0x0f) << 18;
3436 val |= (c1 & 0x3f) << 12;
/* Payload of the third byte moves from bits 8..13 of c0 to bits 6..11. */
3437 val |= (c0 & 0x3f00) >> 2;
/* Three-byte form. */
3439 }else if (c2 >= 0xe0){
3440 val = (c2 & 0x0f) << 12;
3441 val |= (c1 & 0x3f) << 6;
/* Two-byte form. */
3443 }else if (c2 >= 0xc0){
3444 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv: convert the Unicode value val and store the result through
 * p2/p1.  The value is first re-encoded as UTF-8 bytes with w16w_conv() and
 * then looked up with unicode_to_jis_common().  Under NUMCHAR_OPTION *p1 can
 * instead receive val tagged with CLASS_UNICODE (presumably when the lookup
 * fails -- TODO confirm).
 */
3452 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3454 nkf_char c2, c1, c0;
3461 w16w_conv(val, &c2, &c1, &c0);
3462 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3463 #ifdef NUMCHAR_OPTION
3466 *p1 = CLASS_UNICODE | val;
3475 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: input-side converter for UTF-16 code units.
 * c2 is the high byte and c1 the low byte of the current unit; for a
 * surrogate pair c0 holds the complete low surrogate (0xDC00..0xDFFF).
 * A non-zero result from w16e_conv() is returned as is.
 */
3476 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3479 if ((c2==0 && c1 < 0x80) || c2==EOF) {
/* High byte 0xD8..0xDB: the unit is a high surrogate. */
3482 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3483 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/* Combine the pair: (high << 10) + low - 0x35FDC00, where
 * 0x35FDC00 = (0xD800 << 10) + 0xDC00 - 0x10000. */
3485 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3487 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3492 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3493 if (ret) return ret;
/*
 * w_iconv32: input-side converter for UTF-32 values; c1 carries the code
 * point.  Values for which is_unicode_bmp() holds are converted with
 * w16e_conv(); in the other visible branch c1 is tagged with CLASS_UNICODE.
 * A non-zero conversion result is returned as is.  c0 is not used in the
 * lines annotated here.
 */
3498 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3502 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3503 } else if (is_unicode_bmp(c1)) {
3504 ret = w16e_conv(c1, &c2, &c1);
3507 c1 = CLASS_UNICODE | c1;
3509 if (ret) return ret;
/*
 * unicode_to_jis_common: look up a UTF-8 sequence (c2, c1, c0) in the
 * utf8_to_euc_* tables and store the result through p2/p1.
 * The table set is chosen by ms_ucs_map_f (UCS_MAP_CP932, UCS_MAP_MS,
 * UCS_MAP_CP10001, or the default).  When no_best_fit_chars_f is set,
 * selected characters are rejected up front with a return value of 1.
 */
3514 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3516 const unsigned short *const *pp;
3517 const unsigned short *const *const *ppp;
/* Rejection tables indexed by the low six bits of the second byte (c1 & 0x3F);
 * a non-zero entry is tested in the no_best_fit_chars_f branches below. */
3518 static const char no_best_fit_chars_table_C2[] =
3519 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3520 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3521 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3522 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3523 static const char no_best_fit_chars_table_C2_ms[] =
3524 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3525 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3526 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3527 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3528 static const char no_best_fit_chars_table_932_C2[] =
3529 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3530 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3531 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3532 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3533 static const char no_best_fit_chars_table_932_C3[] =
3534 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3535 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3536 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3537 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Two-byte sequences (lead byte below 0xe0). */
3543 }else if(c2 < 0xe0){
3544 if(no_best_fit_chars_f){
3545 if(ms_ucs_map_f == UCS_MAP_CP932){
3548 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3551 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3554 }else if(!cp932inv_f){
3557 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3560 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3563 }else if(ms_ucs_map_f == UCS_MAP_MS){
3564 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3565 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3583 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3584 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3585 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3587 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* NOTE(review): this branch is selected on c0, not c2.  Four-byte sequences
 * arrive with two bytes packed in c0, so c0 < 0xF0 presumably singles out
 * three-byte input -- confirm before changing. */
3588 }else if(c0 < 0xF0){
3589 if(no_best_fit_chars_f){
3590 if(ms_ucs_map_f == UCS_MAP_CP932){
3591 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3592 }else if(ms_ucs_map_f == UCS_MAP_MS){
3597 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3600 if(c0 == 0x92) return 1;
3605 if(c1 == 0x80 || c0 == 0x9C) return 1;
3608 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3613 if(c0 == 0x94) return 1;
3616 if(c0 == 0xBB) return 1;
3626 if(c0 == 0x95) return 1;
3629 if(c0 == 0xA5) return 1;
3636 if(c0 == 0x8D) return 1;
3639 if(c0 == 0x9E && !cp932inv_f) return 1;
3642 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3650 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3651 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3652 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
/* The three-byte table set is indexed by the lead byte (c2 - 0xE0). */
3654 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3656 #ifdef SHIFTJIS_CP932
/* A successful G3 result is round-tripped through e2s_conv()/s2e_conv()
 * unless cp932inv_f is set. */
3657 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3659 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3660 s2e_conv(s2, s1, p2, p1);
/*
 * w_iconv_common: two-level table lookup shared by the UTF-8 converters.
 * pp is a table of psize row pointers selected by c1; the chosen row is
 * indexed by c0 and must have sizeof_utf8_to_euc_C2 entries.
 * Returns 1 when the table is missing, an index is out of range, the row is
 * absent, or the entry is 0.  With no_cp932ext_f set, entries in the NEC
 * special and IBM extended ranges are filtered as well.
 */
3669 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3672 const unsigned short *p;
3675 if (pp == 0) return 1;
3678 if (c1 < 0 || psize <= c1) return 1;
3680 if (p == 0) return 1;
3683 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3685 if (val == 0) return 1;
3686 if (no_cp932ext_f && (
3687 (val>>8) == 0x2D || /* NEC special characters */
3688 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* A first byte equal to SO is reported as X0201. */
3696 if (c2 == SO) c2 = X0201;
/*
 * nkf_each_char_to_hex: emit hexadecimal digits of c through the callback f.
 * Each digit is produced by bin2hex() on c shifted right by `shift` and is
 * passed as f(0, digit).
 */
3703 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3710 (*f)(0, bin2hex(c>>shift));
/*
 * encode_fallback_html: fallback output of c through oconv using decimal
 * digits.  Digits are emitted most significant first as ASCII
 * (0x30 + digit); the two highest digits are suppressed when c is too small
 * to need them.
 */
3720 void encode_fallback_html(nkf_char c)
3725 if(c >= NKF_INT32_C(1000000))
3726 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3727 if(c >= NKF_INT32_C(100000))
3728 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3730 (*oconv)(0, 0x30+(c/10000 )%10);
3732 (*oconv)(0, 0x30+(c/1000 )%10);
3734 (*oconv)(0, 0x30+(c/100 )%10);
3736 (*oconv)(0, 0x30+(c/10 )%10);
3738 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml: fallback output of c through oconv; the value itself
 * is written as hexadecimal digits by nkf_each_char_to_hex().
 */
3743 void encode_fallback_xml(nkf_char c)
3748 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: fallback output of c through oconv as hexadecimal
 * digits produced by bin2hex().  Values for which is_unicode_bmp() is false
 * additionally get the digits for bits 20 and up and bits 16..19.
 */
3753 void encode_fallback_java(nkf_char c)
3757 if(!is_unicode_bmp(c)){
3761 (*oconv)(0, bin2hex(c>>20));
3762 (*oconv)(0, bin2hex(c>>16));
/* Four digits for the low 16 bits, most significant first. */
3766 (*oconv)(0, bin2hex(c>>12));
3767 (*oconv)(0, bin2hex(c>> 8));
3768 (*oconv)(0, bin2hex(c>> 4));
3769 (*oconv)(0, bin2hex(c ));
/*
 * encode_fallback_perl: fallback output of c through oconv; the value itself
 * is written as hexadecimal digits by nkf_each_char_to_hex().
 */
3773 void encode_fallback_perl(nkf_char c)
3778 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: fallback that ignores the incoming value and
 * outputs unicode_subchar instead, passing its bits 8..15 and bits 0..7 to
 * oconv as the two bytes.
 */
3783 void encode_fallback_subchar(nkf_char c)
3785 c = unicode_subchar;
3786 (*oconv)((c>>8)&0xFF, c&0xFF);
3791 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the internal code (c2, c1) in the euc_to_utf8_* and
 * x0212_to_utf8_2bytes tables.  Callers use the returned value as a Unicode
 * code point (see w_oconv()).
 * c2 selects the row table, chosen according to ms_ucs_map_f; c1 selects the
 * entry within the row.
 */
3792 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3794 const unsigned short *p;
3797 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3805 p = euc_to_utf8_1byte;
/* G3 codes are looked up in x0212_to_utf8_2bytes. */
3807 } else if (is_eucg3(c2)){
3808 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
/* Row index: low seven bits of c2, rebased from 0x21. */
3811 c2 = (c2&0x7f) - 0x21;
3812 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3813 p = x0212_to_utf8_2bytes[c2];
3819 c2 = (c2&0x7f) - 0x21;
3820 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3822 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3823 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3824 euc_to_utf8_2bytes_ms[c2];
/* Column index: low seven bits of c1, rebased from 0x21. */
3829 c1 = (c1 & 0x7f) - 0x21;
3830 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv: output-side converter that writes UTF-8 bytes through o_putc.
 * CLASS_UNICODE values (NUMCHAR_OPTION) are encoded directly; other codes
 * are mapped with e2w_conv() and encoded with w16w_conv().
 * output_bom_f is cleared here, presumably once the BOM has been written.
 */
3835 void w_oconv(nkf_char c2, nkf_char c1)
3841 output_bom_f = FALSE;
3852 #ifdef NUMCHAR_OPTION
3853 if (c2 == 0 && is_unicode_capsule(c1)){
3854 val = c1 & VALUE_MASK;
/* Two-, three- and four-byte UTF-8 forms. */
3857 }else if (val < 0x800){
3858 (*o_putc)(0xC0 | (val >> 6));
3859 (*o_putc)(0x80 | (val & 0x3f));
3860 } else if (val <= NKF_INT32_C(0xFFFF)) {
3861 (*o_putc)(0xE0 | (val >> 12));
3862 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3863 (*o_putc)(0x80 | (val & 0x3f));
3864 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3865 (*o_putc)(0xF0 | ( val>>18));
3866 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3867 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3868 (*o_putc)(0x80 | ( val & 0x3f));
3875 output_mode = ASCII;
3877 } else if (c2 == ISO8859_1) {
3879 (*o_putc)(c1 | 0x080);
3882 val = e2w_conv(c2, c1);
3884 w16w_conv(val, &c2, &c1, &c0);
/* c0 is written only when it is non-zero. */
3888 if (c0) (*o_putc)(c0);
/*
 * w_oconv16: output-side converter that writes UTF-16 through o_putc, byte
 * order chosen by output_endian.
 * CLASS_UNICODE values outside the BMP, up to UNICODE_MAX, are written as a
 * surrogate pair; other codes are mapped with e2w_conv().
 */
3894 void w_oconv16(nkf_char c2, nkf_char c1)
/* BOM handling: output_bom_f is cleared and the 0xFF byte ('\377') is
 * written in the position the byte order requires. */
3897 output_bom_f = FALSE;
3898 if (output_endian == ENDIAN_LITTLE){
3899 (*o_putc)((unsigned char)'\377');
3903 (*o_putc)((unsigned char)'\377');
3912 if (c2 == ISO8859_1) {
3915 #ifdef NUMCHAR_OPTION
3916 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3917 if (is_unicode_bmp(c1)) {
3918 c2 = (c1 >> 8) & 0xff;
3922 if (c1 <= UNICODE_MAX) {
/* 0xD7C0 = 0xD800 - (0x10000 >> 10), so no separate subtraction is needed. */
3923 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3924 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3925 if (output_endian == ENDIAN_LITTLE){
3926 (*o_putc)(c2 & 0xff);
3927 (*o_putc)((c2 >> 8) & 0xff);
3928 (*o_putc)(c1 & 0xff);
3929 (*o_putc)((c1 >> 8) & 0xff);
3931 (*o_putc)((c2 >> 8) & 0xff);
3932 (*o_putc)(c2 & 0xff);
3933 (*o_putc)((c1 >> 8) & 0xff);
3934 (*o_putc)(c1 & 0xff);
/* Table-mapped codes: bits 8..15 of the e2w_conv() result go to c2. */
3941 nkf_char val = e2w_conv(c2, c1);
3942 c2 = (val >> 8) & 0xff;
3946 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output-side converter that writes UTF-32 through o_putc, byte
 * order chosen by output_endian.  Codes that are not already CLASS_UNICODE
 * values are mapped with e2w_conv().
 */
3955 void w_oconv32(nkf_char c2, nkf_char c1)
/* BOM handling: output_bom_f is cleared and the 0xFF byte ('\377') is
 * written in the position the byte order requires. */
3958 output_bom_f = FALSE;
3959 if (output_endian == ENDIAN_LITTLE){
3960 (*o_putc)((unsigned char)'\377');
3968 (*o_putc)((unsigned char)'\377');
3977 if (c2 == ISO8859_1) {
3979 #ifdef NUMCHAR_OPTION
3980 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3984 c1 = e2w_conv(c2, c1);
/* Little endian writes the low byte first; the other branch writes the same
 * bytes in the opposite order. */
3987 if (output_endian == ENDIAN_LITTLE){
3988 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3989 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3990 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3994 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3995 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3996 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * e_oconv: output-side converter that writes EUC-style bytes through o_putc
 * and records the current state in output_mode.
 * CLASS_UNICODE values are first converted with w16e_conv(); values that are
 * still unconverted are either mapped from the 0xE000..0xE757 range (when
 * x0212_f is set) or handed to encode_fallback when one is installed.
 */
4001 void e_oconv(nkf_char c2, nkf_char c1)
4003 #ifdef NUMCHAR_OPTION
4004 if (c2 == 0 && is_unicode_capsule(c1)){
4005 w16e_conv(c1, &c2, &c1);
/* Still a CLASS_UNICODE value: w16e_conv() found no mapping. */
4006 if (c2 == 0 && is_unicode_capsule(c1)){
4007 c2 = c1 & VALUE_MASK;
4008 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
/* The first ten rows start at 0x75; later rows carry the 0x8F prefix. */
4012 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4013 c1 = 0x21 + c1 % 94;
4016 (*o_putc)((c2 & 0x7f) | 0x080);
4017 (*o_putc)(c1 | 0x080);
4019 (*o_putc)((c2 & 0x7f) | 0x080);
4020 (*o_putc)(c1 | 0x080);
4024 if (encode_fallback) (*encode_fallback)(c1);
4033 } else if (c2 == 0) {
4034 output_mode = ASCII;
/* X0201 bytes are written after an SSO prefix with the high bit set. */
4036 } else if (c2 == X0201) {
4037 output_mode = JAPANESE_EUC;
4038 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4039 } else if (c2 == ISO8859_1) {
4040 output_mode = ISO8859_1;
4041 (*o_putc)(c1 | 0x080);
4043 } else if (is_eucg3(c2)){
4044 output_mode = JAPANESE_EUC;
4045 #ifdef SHIFTJIS_CP932
/* Round trip through e2s_conv()/s2e_conv() when the first conversion succeeds. */
4048 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4049 s2e_conv(s2, s1, &c2, &c1);
4054 output_mode = ASCII;
4056 }else if (is_eucg3(c2)){
4059 (*o_putc)((c2 & 0x7f) | 0x080);
4060 (*o_putc)(c1 | 0x080);
4063 (*o_putc)((c2 & 0x7f) | 0x080);
4064 (*o_putc)(c1 | 0x080);
/* Plain two-byte codes: both bytes must be graphic characters, otherwise
 * the input converter is reset and the character is dropped. */
4068 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4069 set_iconv(FALSE, 0);
4070 return; /* too late to rescue this char */
4072 output_mode = JAPANESE_EUC;
4073 (*o_putc)(c2 | 0x080);
4074 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: remap a row number c in 0x75..0x7f.  Two targets are visible:
 * the range starting at 0x109 and the range starting at 0x113; which one
 * applies depends on a condition that is not annotated here (TODO confirm).
 */
4079 nkf_char x0212_shift(nkf_char c)
4084 if (0x75 <= c && c <= 0x7f){
4085 ret = c + (0x109 - 0x75);
4088 if (0x75 <= c && c <= 0x7f){
4089 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map row numbers 0x7f..0x88 back to the range starting at
 * 0x75, and 0x89..0x92 to the same range tagged with PREFIX_EUCG3 | 0x80.
 */
4096 nkf_char x0212_unshift(nkf_char c)
4099 if (0x7f <= c && c <= 0x88){
4100 ret = c + (0x75 - 0x7f);
4101 }else if (0x89 <= c && c <= 0x92){
4102 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4106 #endif /* X0212_ENABLE */
/*
 * e2s_conv: convert the internal code (c2, c1) to a Shift_JIS byte pair
 * stored through p2/p1; either pointer may be null, in which case that byte
 * is not stored.
 * Returns 1 when c2 is above 0x7F at the final range check.  Callers treat
 * a return value of 0 as success.
 */
4108 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
/* Rows 0x21..0x2F and 0x6E..0x7E are converted arithmetically; ndx is
 * presumably the row number taken from c2 -- TODO confirm. */
4114 if((0x21 <= ndx && ndx <= 0x2F)){
4115 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4116 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4118 }else if(0x6E <= ndx && ndx <= 0x7E){
4119 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4120 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/* Other graphic rows are looked up in x0212_shiftjis. */
4126 else if(nkf_isgraph(ndx)){
4128 const unsigned short *ptr;
4129 ptr = x0212_shiftjis[ndx - 0x21];
4131 val = ptr[(c1 & 0x7f) - 0x21];
4140 c2 = x0212_shift(c2);
4142 #endif /* X0212_ENABLE */
/* Standard conversion: two rows share one lead byte; odd rows use the lower
 * half of the trail-byte range and even rows the upper half. */
4144 if(0x7F < c2) return 1;
4145 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4146 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s_oconv: output-side converter that writes Shift_JIS bytes through o_putc
 * and records the current state in output_mode.
 * CLASS_UNICODE values are first converted with w16e_conv(); values that are
 * still unconverted are either mapped from the 0xE000..0xE757 range (when
 * x0213_f is clear) or handed to encode_fallback when one is installed.
 */
4150 void s_oconv(nkf_char c2, nkf_char c1)
4152 #ifdef NUMCHAR_OPTION
4153 if (c2 == 0 && is_unicode_capsule(c1)){
4154 w16e_conv(c1, &c2, &c1);
/* Still a CLASS_UNICODE value: w16e_conv() found no mapping. */
4155 if (c2 == 0 && is_unicode_capsule(c1)){
4156 c2 = c1 & VALUE_MASK;
4157 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
/* 188 cells per lead byte starting at 0xF0; trail bytes start at 0x40 and
 * skip 0x7F.  This mirrors the mapping in s_iconv(). */
4160 c2 = c1 / 188 + 0xF0;
4162 c1 += 0x40 + (c1 > 0x3e);
4167 if(encode_fallback)(*encode_fallback)(c1);
4176 } else if (c2 == 0) {
4177 output_mode = ASCII;
4179 } else if (c2 == X0201) {
4180 output_mode = SHIFT_JIS;
4182 } else if (c2 == ISO8859_1) {
4183 output_mode = ISO8859_1;
4184 (*o_putc)(c1 | 0x080);
4186 } else if (is_eucg3(c2)){
4187 output_mode = SHIFT_JIS;
4188 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* Plain two-byte codes: both bytes must be printable, otherwise the input
 * converter is reset and the character is dropped. */
4194 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4195 set_iconv(FALSE, 0);
4196 return; /* too late to rescue this char */
4198 output_mode = SHIFT_JIS;
4199 e2s_conv(c2, c1, &c2, &c1);
4201 #ifdef SHIFTJIS_CP932
4203 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4204 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4210 #endif /* SHIFTJIS_CP932 */
/* A non-zero prefix_table entry for the trail byte is written out first. */
4213 if (prefix_table[(unsigned char)c1]){
4214 (*o_putc)(prefix_table[(unsigned char)c1]);
4220 void j_oconv(nkf_char c2, nkf_char c1)
4222 #ifdef NUMCHAR_OPTION
4223 if (c2 == 0 && is_unicode_capsule(c1)){
4224 w16e_conv(c1, &c2, &c1);
4225 if (c2 == 0 && is_unicode_capsule(c1)){