1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** About UTF-8 support
31 ** This version can be used as a drop-in replacement for the previous nkf.
32 ** When it is started as, for example, nkf -e, input that automatic detection
33 ** judges to be UTF-8 is converted to euc-jp as it is.
35 ** It is still quite likely to contain bugs
36 ** (especially in automatic detection, mixed encodings and error handling).
38 ** If you find any problem, please report it to
39 ** E-Mail: furukawa@tcp-ip.or.jp
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.111 2006/10/08 03:02:34 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2006-09-15"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2006 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * Locale-independent ASCII character classification and conversion macros.
 *
 * Every use of the argument is parenthesized so that an expression argument
 * (for example `v & 0xFF`) is classified as a whole instead of being torn
 * apart by operator precedence.  The argument is still evaluated more than
 * once, so do not pass expressions with side effects.
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* The portability of the library toupper() is not trusted, so the ASCII
 * lower-to-upper mapping is spelled out here. */
#define nkf_toupper(c) (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c) ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c) ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (' ' <= (c) && (c) <= '~')
#define nkf_isgraph(c) ('!' <= (c) && (c) <= '~')
/* Value of one hexadecimal digit character; any other character yields 0. */
#define hex2bin(c) (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
                    ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
                    ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/* True when the high byte of the low 16 bits of c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* First and last lead-byte values bounding the CP932 table and the
 * CP932 inverse table. */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END 0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END 0xEE
/* True when c2 lies in CP932_TABLE_BEGIN..CP932_TABLE_END (inclusive).
 * The argument is parenthesized so expression arguments are compared as a
 * whole; it is evaluated twice, so avoid side effects. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
355 #define UCS_MAP_ASCII 0
357 #define UCS_MAP_CP932 2
358 static int ms_ucs_map_f = UCS_MAP_ASCII;
360 #ifdef UTF8_INPUT_ENABLE
361 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
362 static int no_cp932ext_f = FALSE;
363 /* ignore ZERO WIDTH NO-BREAK SPACE */
364 static int no_best_fit_chars_f = FALSE;
365 static int input_endian = ENDIAN_BIG;
366 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
367 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
368 static void encode_fallback_html(nkf_char c);
369 static void encode_fallback_xml(nkf_char c);
370 static void encode_fallback_java(nkf_char c);
371 static void encode_fallback_perl(nkf_char c);
372 static void encode_fallback_subchar(nkf_char c);
373 static void (*encode_fallback)(nkf_char c) = NULL;
374 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
375 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
376 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
377 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
379 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
380 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
381 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
382 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
383 static void w_status(struct input_code *, nkf_char);
385 #ifdef UTF8_OUTPUT_ENABLE
386 static int output_bom_f = FALSE;
387 static int output_endian = ENDIAN_BIG;
388 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
389 static void w_oconv(nkf_char c2,nkf_char c1);
390 static void w_oconv16(nkf_char c2,nkf_char c1);
391 static void w_oconv32(nkf_char c2,nkf_char c1);
393 static void e_oconv(nkf_char c2,nkf_char c1);
394 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
395 static void s_oconv(nkf_char c2,nkf_char c1);
396 static void j_oconv(nkf_char c2,nkf_char c1);
397 static void fold_conv(nkf_char c2,nkf_char c1);
398 static void cr_conv(nkf_char c2,nkf_char c1);
399 static void z_conv(nkf_char c2,nkf_char c1);
400 static void rot_conv(nkf_char c2,nkf_char c1);
401 static void hira_conv(nkf_char c2,nkf_char c1);
402 static void base64_conv(nkf_char c2,nkf_char c1);
403 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
404 static void no_connection(nkf_char c2,nkf_char c1);
405 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
407 static void code_score(struct input_code *ptr);
408 static void code_status(nkf_char c);
410 static void std_putc(nkf_char c);
411 static nkf_char std_getc(FILE *f);
412 static nkf_char std_ungetc(nkf_char c,FILE *f);
414 static nkf_char broken_getc(FILE *f);
415 static nkf_char broken_ungetc(nkf_char c,FILE *f);
417 static nkf_char mime_begin(FILE *f);
418 static nkf_char mime_getc(FILE *f);
419 static nkf_char mime_ungetc(nkf_char c,FILE *f);
421 static void switch_mime_getc(void);
422 static void unswitch_mime_getc(void);
423 static nkf_char mime_begin_strict(FILE *f);
424 static nkf_char mime_getc_buf(FILE *f);
425 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
426 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
428 static nkf_char base64decode(nkf_char c);
429 static void mime_prechar(nkf_char c2, nkf_char c1);
430 static void mime_putc(nkf_char c);
431 static void open_mime(nkf_char c);
432 static void close_mime(void);
433 static void eof_mime(void);
434 static void mimeout_addchar(nkf_char c);
436 static void usage(void);
437 static void version(void);
439 static void options(unsigned char *c);
440 #if defined(PERL_XS) || defined(WIN32DLL)
441 static void reinit(void);
446 #if !defined(PERL_XS) && !defined(WIN32DLL)
447 static unsigned char stdibuf[IOBUF_SIZE];
448 static unsigned char stdobuf[IOBUF_SIZE];
450 static unsigned char hold_buf[HOLD_SIZE*2];
451 static int hold_count = 0;
453 /* MIME preprocessor fifo */
455 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
456 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
457 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
458 static unsigned char mime_buf[MIME_BUF_SIZE];
459 static unsigned int mime_top = 0;
460 static unsigned int mime_last = 0; /* decoded */
461 static unsigned int mime_input = 0; /* undecoded */
462 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* File-scope option flags and their initial values. */
465 static int unbuf_f = FALSE;
466 static int estab_f = FALSE;
467 static int nop_f = FALSE;
468 static int binmode_f = TRUE; /* binary mode */
469 static int rot_f = FALSE; /* ROT13/47 mode */
470 static int hira_f = FALSE; /* hiragana/katakana conversion */
471 static int input_f = FALSE; /* non fixed input code */
472 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
473 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
474 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
475 static int mimebuf_f = FALSE; /* MIME buffered input */
476 static int broken_f = FALSE; /* convert ESC-less broken JIS */
477 static int iso8859_f = FALSE; /* ISO8859 through */
478 static int mimeout_f = FALSE; /* base64 mode */
479 #if defined(MSDOS) || defined(__OS2__)
480 static int x0201_f = TRUE; /* Assume JISX0201 kana */
482 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
484 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
486 #ifdef UNICODE_NORMALIZATION
487 static int nfc_f = FALSE;
488 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
489 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
490 static nkf_char nfc_getc(FILE *f);
491 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
495 static int cap_f = FALSE;
496 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
497 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
498 static nkf_char cap_getc(FILE *f);
499 static nkf_char cap_ungetc(nkf_char c,FILE *f);
501 static int url_f = FALSE;
502 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
503 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
504 static nkf_char url_getc(FILE *f);
505 static nkf_char url_ungetc(nkf_char c,FILE *f);
/*
 * NKF_INT32_C(n) spells the integer constant n.  When INT_IS_SHORT is
 * defined an L suffix is appended to the constant; otherwise n is used as
 * written.
 */
#if defined(INT_IS_SHORT)
#define NKF_INT32_C(n) (n##L)
#else
#define NKF_INT32_C(n) (n)
#endif
#define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
#define CLASS_MASK NKF_INT32_C(0xFF000000)    /* top 8 bits of a 32-bit value */
#define CLASS_UNICODE NKF_INT32_C(0x01000000) /* class tag tested by is_unicode_capsule() */
#define VALUE_MASK NKF_INT32_C(0x00FFFFFF)    /* low 24 bits of a 32-bit value */
#define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
/* True when the bits of c selected by CLASS_MASK equal CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the bits of c selected by VALUE_MASK are at most 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
521 #ifdef NUMCHAR_OPTION
522 static int numchar_f = FALSE;
523 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
524 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
525 static nkf_char numchar_getc(FILE *f);
526 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
530 static int noout_f = FALSE;
531 static void no_putc(nkf_char c);
532 static nkf_char debug_f = FALSE;
533 static void debug(const char *str);
534 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
537 static int guess_f = FALSE;
539 static void print_guessed_code(char *filename);
541 static void set_input_codename(char *codename);
542 static int is_inputcode_mixed = FALSE;
543 static int is_inputcode_set = FALSE;
546 static int exec_f = 0;
549 #ifdef SHIFTJIS_CP932
550 /* invert IBM extended characters to others */
551 static int cp51932_f = TRUE;
553 /* invert NEC-selected IBM extended characters to IBM extended characters */
554 static int cp932inv_f = TRUE;
556 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
557 #endif /* SHIFTJIS_CP932 */
560 static int x0212_f = FALSE;
561 static nkf_char x0212_shift(nkf_char c);
562 static nkf_char x0212_unshift(nkf_char c);
564 static int x0213_f = FALSE;
566 static unsigned char prefix_table[256];
568 static void set_code_score(struct input_code *ptr, nkf_char score);
569 static void clr_code_score(struct input_code *ptr, nkf_char score);
570 static void status_disable(struct input_code *ptr);
571 static void status_push_ch(struct input_code *ptr, nkf_char c);
572 static void status_clear(struct input_code *ptr);
573 static void status_reset(struct input_code *ptr);
574 static void status_reinit(struct input_code *ptr);
575 static void status_check(struct input_code *ptr, nkf_char c);
576 static void e_status(struct input_code *, nkf_char);
577 static void s_status(struct input_code *, nkf_char);
579 struct input_code input_code_list[] = {
580 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
581 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
582 #ifdef UTF8_INPUT_ENABLE
583 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
584 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
585 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
590 static int mimeout_mode = 0;
591 static int base64_count = 0;
593 /* X0208 -> ASCII converter */
596 static int f_line = 0; /* chars in line */
597 static int f_prev = 0;
598 static int fold_preserve_f = FALSE; /* preserve new lines */
599 static int fold_f = FALSE;
600 static int fold_len = 0;
603 static unsigned char kanji_intro = DEFAULT_J;
604 static unsigned char ascii_intro = DEFAULT_R;
608 #define FOLD_MARGIN 10
609 #define DEFAULT_FOLD 60
611 static int fold_margin = FOLD_MARGIN;
615 #ifdef DEFAULT_CODE_JIS
616 # define DEFAULT_CONV j_oconv
618 #ifdef DEFAULT_CODE_SJIS
619 # define DEFAULT_CONV s_oconv
621 #ifdef DEFAULT_CODE_EUC
622 # define DEFAULT_CONV e_oconv
624 #ifdef DEFAULT_CODE_UTF8
625 # define DEFAULT_CONV w_oconv
628 /* process default */
629 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
631 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
632 /* s_iconv or oconv */
633 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
635 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
636 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
637 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
643 /* static redirections */
645 static void (*o_putc)(nkf_char c) = std_putc;
647 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
648 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
650 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
651 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
653 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
655 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
656 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
658 /* for strict mime */
659 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
660 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
663 static int output_mode = ASCII, /* output kanji mode */
664 input_mode = ASCII, /* input kanji mode */
665 shift_mode = FALSE; /* TRUE shift out, or X0201 */
666 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
668 /* X0201 / X0208 conversion tables */
670 /* X0201 kana conversion table */
673 unsigned char cv[]= {
674 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
675 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
676 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
677 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
678 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
679 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
680 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
681 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
682 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
683 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
684 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
685 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
686 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
687 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
688 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
689 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
693 /* X0201 kana conversion table for dakuten */
696 unsigned char dv[]= {
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
702 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
703 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
704 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
705 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
706 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
707 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
708 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
715 /* X0201 kana conversion table for han-dakuten */
718 unsigned char ev[]= {
719 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
720 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
730 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
732 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
738 /* X0208 kigou conversion table */
739 /* 0x8140 - 0x819e */
741 unsigned char fv[] = {
743 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
744 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
745 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
746 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
747 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
748 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
749 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
750 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
751 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
752 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
753 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
754 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
760 static int file_out_f = FALSE;
762 static int overwrite_f = FALSE;
763 static int preserve_time_f = FALSE;
764 static int backup_f = FALSE;
765 static char *backup_suffix = "";
766 static char *get_backup_filename(const char *suffix, const char *filename);
769 static int crmode_f = 0; /* CR, NL, CRLF */
770 #ifdef EASYWIN /*Easy Win */
771 static int end_check;
774 #define STD_GC_BUFSIZE (256)
775 nkf_char std_gc_buf[STD_GC_BUFSIZE];
779 #include "nkf32dll.c"
780 #elif defined(PERL_XS)
782 int main(int argc, char **argv)
787 char *outfname = NULL;
790 #ifdef EASYWIN /*Easy Win */
791 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
794 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
795 cp = (unsigned char *)*argv;
800 if (pipe(fds) < 0 || (pid = fork()) < 0){
811 execvp(argv[1], &argv[1]);
825 if(x0201_f == WISH_TRUE)
826 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
828 if (binmode_f == TRUE)
829 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
830 if (freopen("","wb",stdout) == NULL)
837 setbuf(stdout, (char *) NULL);
839 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
842 if (binmode_f == TRUE)
843 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
844 if (freopen("","rb",stdin) == NULL) return (-1);
848 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
852 kanji_convert(stdin);
853 if (guess_f) print_guessed_code(NULL);
858 is_inputcode_mixed = FALSE;
859 is_inputcode_set = FALSE;
864 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
874 /* reopen file for stdout */
875 if (file_out_f == TRUE) {
878 outfname = malloc(strlen(origfname)
879 + strlen(".nkftmpXXXXXX")
885 strcpy(outfname, origfname);
889 for (i = strlen(outfname); i; --i){
890 if (outfname[i - 1] == '/'
891 || outfname[i - 1] == '\\'){
897 strcat(outfname, "ntXXXXXX");
899 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
902 strcat(outfname, ".nkftmpXXXXXX");
903 fd = mkstemp(outfname);
906 || (fd_backup = dup(fileno(stdout))) < 0
907 || dup2(fd, fileno(stdout)) < 0
918 outfname = "nkf.out";
921 if(freopen(outfname, "w", stdout) == NULL) {
925 if (binmode_f == TRUE) {
926 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
927 if (freopen("","wb",stdout) == NULL)
934 if (binmode_f == TRUE)
935 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
936 if (freopen("","rb",fin) == NULL)
941 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
945 char *filename = NULL;
947 if (nfiles > 1) filename = origfname;
948 if (guess_f) print_guessed_code(filename);
954 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
962 if (dup2(fd_backup, fileno(stdout)) < 0){
965 if (stat(origfname, &sb)) {
966 fprintf(stderr, "Can't stat %s\n", origfname);
968 /* restore the permissions */
969 if (chmod(outfname, sb.st_mode)) {
970 fprintf(stderr, "Can't set permission %s\n", outfname);
973 /* restore the timestamp */
975 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
976 tb[0] = tb[1] = sb.st_mtime;
977 if (utime(outfname, tb)) {
978 fprintf(stderr, "Can't set timestamp %s\n", outfname);
981 tb.actime = sb.st_atime;
982 tb.modtime = sb.st_mtime;
983 if (utime(outfname, &tb)) {
984 fprintf(stderr, "Can't set timestamp %s\n", outfname);
989 char *backup_filename = get_backup_filename(backup_suffix, origfname);
991 unlink(backup_filename);
993 if (rename(origfname, backup_filename)) {
994 perror(backup_filename);
995 fprintf(stderr, "Can't rename %s to %s\n",
996 origfname, backup_filename);
1000 if (unlink(origfname)){
1005 if (rename(outfname, origfname)) {
1007 fprintf(stderr, "Can't rename %s to %s\n",
1008 outfname, origfname);
1016 #ifdef EASYWIN /*Easy Win */
1017 if (file_out_f == FALSE)
1018 scanf("%d",&end_check);
1021 #else /* for Other OS */
1022 if (file_out_f == TRUE)
1024 #endif /*Easy Win */
1027 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from the template `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters are copied unchanged.  Otherwise
 * `suffix` is appended to `filename`.
 *
 * Returns a malloc()ed, NUL-terminated string that the caller must free(),
 * or NULL (after reporting through perror()) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t result_length;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each '*' is dropped and replaced by one copy of filename */
        result_length = suffix_length - asterisk_count
                        + asterisk_count * filename_length;
    } else {
        result_length = filename_length + suffix_length;
    }

    /* +1 for the terminating NUL; the buffer must hold the whole name */
    backup_filename = malloc(result_length + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        memcpy(backup_filename, filename, filename_length);
        /* suffix_length + 1 also copies the terminating NUL */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }
    return backup_filename;
}
1095 {"katakana-hiragana","h3"},
1102 #ifdef UTF8_OUTPUT_ENABLE
1112 {"fb-subchar=", ""},
1114 #ifdef UTF8_INPUT_ENABLE
1115 {"utf8-input", "W"},
1116 {"utf16-input", "W16"},
1117 {"no-cp932ext", ""},
1118 {"no-best-fit-chars",""},
1120 #ifdef UNICODE_NORMALIZATION
1121 {"utf8mac-input", ""},
1133 #ifdef NUMCHAR_OPTION
1134 {"numchar-input", ""},
1140 #ifdef SHIFTJIS_CP932
1150 static int option_mode = 0;
1152 void options(unsigned char *cp)
1156 unsigned char *cp_back = NULL;
1161 while(*cp && *cp++!='-');
1162 while (*cp || cp_back) {
1170 case '-': /* literal options */
1171 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1175 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1176 p = (unsigned char *)long_option[i].name;
1177 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1178 if (*p == cp[j] || cp[j] == ' '){
1185 while(*cp && *cp != SPACE && cp++);
1186 if (long_option[i].alias[0]){
1188 cp = (unsigned char *)long_option[i].alias;
1190 if (strcmp(long_option[i].name, "ic=") == 0){
1191 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1192 codeset[i] = nkf_toupper(p[i]);
1195 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1196 strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1197 strcmp(codeset, "CP50220") == 0 ||
1198 strcmp(codeset, "CP50221") == 0 ||
1199 strcmp(codeset, "CP50222") == 0 ||
1200 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1201 input_f = JIS_INPUT;
1202 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1203 input_f = JIS_INPUT;
1207 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1208 input_f = JIS_INPUT;
1213 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1214 input_f = SJIS_INPUT;
1215 if (x0201_f==NO_X0201) x0201_f=TRUE;
1216 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1217 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1218 strcmp(codeset, "CP932") == 0 ||
1219 strcmp(codeset, "MS932") == 0){
1220 input_f = SJIS_INPUT;
1222 #ifdef SHIFTJIS_CP932
1225 #ifdef UTF8_OUTPUT_ENABLE
1226 ms_ucs_map_f = UCS_MAP_CP932;
1228 }else if(strcmp(codeset, "EUCJP") == 0 ||
1229 strcmp(codeset, "EUC-JP") == 0){
1230 input_f = EUC_INPUT;
1231 }else if(strcmp(codeset, "CP51932") == 0){
1232 input_f = EUC_INPUT;
1234 #ifdef SHIFTJIS_CP932
1237 #ifdef UTF8_OUTPUT_ENABLE
1238 ms_ucs_map_f = UCS_MAP_CP932;
1240 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1241 strcmp(codeset, "EUCJP-MS") == 0 ||
1242 strcmp(codeset, "EUCJPMS") == 0){
1243 input_f = EUC_INPUT;
1245 #ifdef SHIFTJIS_CP932
1248 #ifdef UTF8_OUTPUT_ENABLE
1249 ms_ucs_map_f = UCS_MAP_MS;
1251 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1252 strcmp(codeset, "EUCJP-ASCII") == 0){
1253 input_f = EUC_INPUT;
1255 #ifdef SHIFTJIS_CP932
1258 #ifdef UTF8_OUTPUT_ENABLE
1259 ms_ucs_map_f = UCS_MAP_ASCII;
1261 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1262 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1263 input_f = SJIS_INPUT;
1265 #ifdef SHIFTJIS_CP932
1269 if (x0201_f==NO_X0201) x0201_f=TRUE;
1270 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1271 strcmp(codeset, "EUC-JIS-2004") == 0){
1272 input_f = EUC_INPUT;
1275 #ifdef SHIFTJIS_CP932
1279 #ifdef UTF8_INPUT_ENABLE
1280 }else if(strcmp(codeset, "UTF-8") == 0 ||
1281 strcmp(codeset, "UTF-8N") == 0 ||
1282 strcmp(codeset, "UTF-8-BOM") == 0){
1283 input_f = UTF8_INPUT;
1284 #ifdef UNICODE_NORMALIZATION
1285 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1286 strcmp(codeset, "UTF-8-MAC") == 0){
1287 input_f = UTF8_INPUT;
1290 }else if(strcmp(codeset, "UTF-16") == 0 ||
1291 strcmp(codeset, "UTF-16BE") == 0 ||
1292 strcmp(codeset, "UTF-16BE-BOM") == 0){
1293 input_f = UTF16_INPUT;
1294 input_endian = ENDIAN_BIG;
1295 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1296 strcmp(codeset, "UTF-16LE-BOM") == 0){
1297 input_f = UTF16_INPUT;
1298 input_endian = ENDIAN_LITTLE;
1299 }else if(strcmp(codeset, "UTF-32") == 0 ||
1300 strcmp(codeset, "UTF-32BE") == 0 ||
1301 strcmp(codeset, "UTF-32BE-BOM") == 0){
1302 input_f = UTF32_INPUT;
1303 input_endian = ENDIAN_BIG;
1304 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1305 strcmp(codeset, "UTF-32LE-BOM") == 0){
1306 input_f = UTF32_INPUT;
1307 input_endian = ENDIAN_LITTLE;
1312 if (strcmp(long_option[i].name, "oc=") == 0){
1313 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1314 codeset[i] = nkf_toupper(p[i]);
1317 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1318 strcmp(codeset, "CP50220") == 0){
1319 output_conv = j_oconv;
1320 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1321 output_conv = j_oconv;
1322 no_cp932ext_f = TRUE;
1323 }else if(strcmp(codeset, "CP50221") == 0 ||
1324 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1325 output_conv = j_oconv;
1327 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1328 output_conv = j_oconv;
1332 #ifdef SHIFTJIS_CP932
1335 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1336 output_conv = j_oconv;
1341 #ifdef SHIFTJIS_CP932
1344 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1345 output_conv = j_oconv;
1350 #ifdef SHIFTJIS_CP932
1353 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1354 output_conv = s_oconv;
1355 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1356 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1357 strcmp(codeset, "CP932") == 0 ||
1358 strcmp(codeset, "MS932") == 0){
1359 output_conv = s_oconv;
1361 #ifdef SHIFTJIS_CP932
1365 #ifdef UTF8_OUTPUT_ENABLE
1366 ms_ucs_map_f = UCS_MAP_CP932;
1368 }else if(strcmp(codeset, "EUCJP") == 0 ||
1369 strcmp(codeset, "EUC-JP") == 0){
1370 output_conv = e_oconv;
1371 }else if(strcmp(codeset, "CP51932") == 0){
1372 output_conv = e_oconv;
1374 #ifdef SHIFTJIS_CP932
1377 #ifdef UTF8_OUTPUT_ENABLE
1378 ms_ucs_map_f = UCS_MAP_CP932;
1380 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1381 strcmp(codeset, "EUCJP-MS") == 0 ||
1382 strcmp(codeset, "EUCJPMS") == 0){
1383 output_conv = e_oconv;
1388 #ifdef SHIFTJIS_CP932
1391 #ifdef UTF8_OUTPUT_ENABLE
1392 ms_ucs_map_f = UCS_MAP_MS;
1394 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1395 strcmp(codeset, "EUCJP-ASCII") == 0){
1396 output_conv = e_oconv;
1401 #ifdef SHIFTJIS_CP932
1404 #ifdef UTF8_OUTPUT_ENABLE
1405 ms_ucs_map_f = UCS_MAP_ASCII;
1407 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1408 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1409 output_conv = s_oconv;
1411 #ifdef SHIFTJIS_CP932
1414 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1415 strcmp(codeset, "EUC-JIS-2004") == 0){
1416 output_conv = e_oconv;
1421 #ifdef SHIFTJIS_CP932
1424 #ifdef UTF8_OUTPUT_ENABLE
1425 }else if(strcmp(codeset, "UTF-8") == 0){
1426 output_conv = w_oconv;
1427 }else if(strcmp(codeset, "UTF-8N") == 0){
1428 output_conv = w_oconv;
1429 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1430 output_conv = w_oconv;
1431 output_bom_f = TRUE;
1432 }else if(strcmp(codeset, "UTF-16BE") == 0){
1433 output_conv = w_oconv16;
1434 }else if(strcmp(codeset, "UTF-16") == 0 ||
1435 strcmp(codeset, "UTF-16BE-BOM") == 0){
1436 output_conv = w_oconv16;
1437 output_bom_f = TRUE;
1438 }else if(strcmp(codeset, "UTF-16LE") == 0){
1439 output_conv = w_oconv16;
1440 output_endian = ENDIAN_LITTLE;
1441 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1442 output_conv = w_oconv16;
1443 output_endian = ENDIAN_LITTLE;
1444 output_bom_f = TRUE;
1445 }else if(strcmp(codeset, "UTF-32") == 0 ||
1446 strcmp(codeset, "UTF-32BE") == 0){
1447 output_conv = w_oconv32;
1448 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1449 output_conv = w_oconv32;
1450 output_bom_f = TRUE;
1451 }else if(strcmp(codeset, "UTF-32LE") == 0){
1452 output_conv = w_oconv32;
1453 output_endian = ENDIAN_LITTLE;
1454 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1455 output_conv = w_oconv32;
1456 output_endian = ENDIAN_LITTLE;
1457 output_bom_f = TRUE;
1463 if (strcmp(long_option[i].name, "overwrite") == 0){
1466 preserve_time_f = TRUE;
1469 if (strcmp(long_option[i].name, "overwrite=") == 0){
1472 preserve_time_f = TRUE;
1474 backup_suffix = malloc(strlen((char *) p) + 1);
1475 strcpy(backup_suffix, (char *) p);
1478 if (strcmp(long_option[i].name, "in-place") == 0){
1481 preserve_time_f = FALSE;
1484 if (strcmp(long_option[i].name, "in-place=") == 0){
1487 preserve_time_f = FALSE;
1489 backup_suffix = malloc(strlen((char *) p) + 1);
1490 strcpy(backup_suffix, (char *) p);
1495 if (strcmp(long_option[i].name, "cap-input") == 0){
1499 if (strcmp(long_option[i].name, "url-input") == 0){
1504 #ifdef NUMCHAR_OPTION
1505 if (strcmp(long_option[i].name, "numchar-input") == 0){
1511 if (strcmp(long_option[i].name, "no-output") == 0){
1515 if (strcmp(long_option[i].name, "debug") == 0){
1520 if (strcmp(long_option[i].name, "cp932") == 0){
1521 #ifdef SHIFTJIS_CP932
1525 #ifdef UTF8_OUTPUT_ENABLE
1526 ms_ucs_map_f = UCS_MAP_CP932;
1530 if (strcmp(long_option[i].name, "no-cp932") == 0){
1531 #ifdef SHIFTJIS_CP932
1535 #ifdef UTF8_OUTPUT_ENABLE
1536 ms_ucs_map_f = UCS_MAP_ASCII;
1540 #ifdef SHIFTJIS_CP932
1541 if (strcmp(long_option[i].name, "cp932inv") == 0){
1548 if (strcmp(long_option[i].name, "x0212") == 0){
1555 if (strcmp(long_option[i].name, "exec-in") == 0){
1559 if (strcmp(long_option[i].name, "exec-out") == 0){
1564 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1565 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1566 no_cp932ext_f = TRUE;
1569 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1570 no_best_fit_chars_f = TRUE;
1573 if (strcmp(long_option[i].name, "fb-skip") == 0){
1574 encode_fallback = NULL;
1577 if (strcmp(long_option[i].name, "fb-html") == 0){
1578 encode_fallback = encode_fallback_html;
1581 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1582 encode_fallback = encode_fallback_xml;
1585 if (strcmp(long_option[i].name, "fb-java") == 0){
1586 encode_fallback = encode_fallback_java;
1589 if (strcmp(long_option[i].name, "fb-perl") == 0){
1590 encode_fallback = encode_fallback_perl;
1593 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1594 encode_fallback = encode_fallback_subchar;
1597 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1598 encode_fallback = encode_fallback_subchar;
1599 unicode_subchar = 0;
1601 /* decimal number */
1602 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1603 unicode_subchar *= 10;
1604 unicode_subchar += hex2bin(p[i]);
1606 }else if(p[1] == 'x' || p[1] == 'X'){
1607 /* hexadecimal number */
1608 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1609 unicode_subchar <<= 4;
1610 unicode_subchar |= hex2bin(p[i]);
1614 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1615 unicode_subchar *= 8;
1616 unicode_subchar += hex2bin(p[i]);
1619 w16e_conv(unicode_subchar, &i, &j);
1620 unicode_subchar = i<<8 | j;
1624 #ifdef UTF8_OUTPUT_ENABLE
1625 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1626 ms_ucs_map_f = UCS_MAP_MS;
1630 #ifdef UNICODE_NORMALIZATION
1631 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1632 input_f = UTF8_INPUT;
1637 if (strcmp(long_option[i].name, "prefix=") == 0){
1638 if (nkf_isgraph(p[0])){
1639 for (i = 1; nkf_isgraph(p[i]); i++){
1640 prefix_table[p[i]] = p[0];
1647 case 'b': /* buffered mode */
1650 case 'u': /* non bufferd mode */
1653 case 't': /* transparent mode */
1658 } else if (*cp=='2') {
1662 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1670 case 'j': /* JIS output */
1672 output_conv = j_oconv;
1674 case 'e': /* AT&T EUC output */
1675 output_conv = e_oconv;
1677 case 's': /* SJIS output */
1678 output_conv = s_oconv;
1680 case 'l': /* ISO8859 Latin-1 support, no conversion */
1681 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1682 input_f = LATIN1_INPUT;
1684 case 'i': /* Kanji IN ESC-$-@/B */
1685 if (*cp=='@'||*cp=='B')
1686 kanji_intro = *cp++;
1688 case 'o': /* ASCII IN ESC-(-J/B */
1689 if (*cp=='J'||*cp=='B'||*cp=='H')
1690 ascii_intro = *cp++;
1694 bit:1 katakana->hiragana
1695 bit:2 hiragana->katakana
1697 if ('9'>= *cp && *cp>='0')
1698 hira_f |= (*cp++ -'0');
1705 #if defined(MSDOS) || defined(__OS2__)
1720 #ifdef UTF8_OUTPUT_ENABLE
1721 case 'w': /* UTF-8 output */
1723 output_conv = w_oconv; cp++;
1727 output_bom_f = TRUE;
1730 if ('1'== cp[0] && '6'==cp[1]) {
1731 output_conv = w_oconv16; cp+=2;
1732 } else if ('3'== cp[0] && '2'==cp[1]) {
1733 output_conv = w_oconv32; cp+=2;
1735 output_conv = w_oconv;
1740 output_endian = ENDIAN_LITTLE;
1741 } else if (cp[0] == 'B') {
1749 output_bom_f = TRUE;
1754 #ifdef UTF8_INPUT_ENABLE
1755 case 'W': /* UTF input */
1758 input_f = UTF8_INPUT;
1760 if ('1'== cp[0] && '6'==cp[1]) {
1762 input_f = UTF16_INPUT;
1763 input_endian = ENDIAN_BIG;
1764 } else if ('3'== cp[0] && '2'==cp[1]) {
1766 input_f = UTF32_INPUT;
1767 input_endian = ENDIAN_BIG;
1769 input_f = UTF8_INPUT;
1774 input_endian = ENDIAN_LITTLE;
1775 } else if (cp[0] == 'B') {
1781 /* Input code assumption */
1782 case 'J': /* JIS input */
1783 input_f = JIS_INPUT;
1785 case 'E': /* AT&T EUC input */
1786 input_f = EUC_INPUT;
1788 case 'S': /* MS Kanji input */
1789 input_f = SJIS_INPUT;
1790 if (x0201_f==NO_X0201) x0201_f=TRUE;
1792 case 'Z': /* Convert X0208 alphabet to asii */
1793 /* bit:0 Convert X0208
1794 bit:1 Convert Kankaku to one space
1795 bit:2 Convert Kankaku to two spaces
1796 bit:3 Convert HTML Entity
1798 if ('9'>= *cp && *cp>='0')
1799 alpha_f |= 1<<(*cp++ -'0');
1803 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1804 x0201_f = FALSE; /* No X0201->X0208 conversion */
1806 ESC-(-I in JIS, EUC, MS Kanji
1807 SI/SO in JIS, EUC, MS Kanji
1808 SSO in EUC, JIS, not in MS Kanji
1809 MS Kanji (0xa0-0xdf)
1811 ESC-(-I in JIS (0x20-0x5f)
1812 SSO in EUC (0xa0-0xdf)
1813 0xa0-0xd in MS Kanji (0xa0-0xdf)
1816 case 'X': /* Assume X0201 kana */
1817 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1820 case 'F': /* prserve new lines */
1821 fold_preserve_f = TRUE;
1822 case 'f': /* folding -f60 or -f */
1825 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1827 fold_len += *cp++ - '0';
1829 if (!(0<fold_len && fold_len<BUFSIZ))
1830 fold_len = DEFAULT_FOLD;
1834 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1836 fold_margin += *cp++ - '0';
1840 case 'm': /* MIME support */
1841 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1842 if (*cp=='B'||*cp=='Q') {
1843 mime_decode_mode = *cp++;
1844 mimebuf_f = FIXED_MIME;
1845 } else if (*cp=='N') {
1846 mime_f = TRUE; cp++;
1847 } else if (*cp=='S') {
1848 mime_f = STRICT_MIME; cp++;
1849 } else if (*cp=='0') {
1850 mime_decode_f = FALSE;
1851 mime_f = FALSE; cp++;
1854 case 'M': /* MIME output */
1857 mimeout_f = FIXED_MIME; cp++;
1858 } else if (*cp=='Q') {
1860 mimeout_f = FIXED_MIME; cp++;
1865 case 'B': /* Broken JIS support */
1867 bit:1 allow any x on ESC-(-x or ESC-$-x
1868 bit:2 reset to ascii on NL
1870 if ('9'>= *cp && *cp>='0')
1871 broken_f |= 1<<(*cp++ -'0');
1876 case 'O':/* for Output file */
1880 case 'c':/* add cr code */
1883 case 'd':/* delete cr code */
1886 case 'I': /* ISO-2022-JP output */
1889 case 'L': /* line mode */
1890 if (*cp=='u') { /* unix */
1891 crmode_f = NL; cp++;
1892 } else if (*cp=='m') { /* mac */
1893 crmode_f = CR; cp++;
1894 } else if (*cp=='w') { /* windows */
1895 crmode_f = CRLF; cp++;
1896 } else if (*cp=='0') { /* no conversion */
1906 /* module muliple options in a string are allowed for Perl moudle */
1907 while(*cp && *cp++!='-');
1910 /* bogus option but ignored */
/*
 * Look up the input_code_list entry whose iconv_func is the given
 * converter function; the search starts at the head of input_code_list
 * and compares function pointers.
 * NOTE(review): the loop and return statements are not visible in this
 * excerpt; presumably the matching entry (or a null pointer) is returned
 * -- confirm against the full source.
 */
1916 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1919 struct input_code *p = input_code_list;
1921 if (iconv_func == p->iconv_func){
/*
 * Select the input converter.
 *   f          - establishment flag; the special value -TRUE means "force".
 *   iconv_func - converter to install.
 * Once a code is established (estab_f) and the active converter differs
 * from the one last reported (iconv_for_check), the matching
 * input_code_list entry is looked up, its name is recorded with
 * set_input_codename(), and the name is passed to debug().
 * NOTE(review): several statements of this function are not visible in
 * this excerpt.
 */
1930 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1932 #ifdef INPUT_CODE_FIX
1940 #ifdef INPUT_CODE_FIX
1941 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
/* report the detected code name only when the converter actually changed */
1947 if (estab_f && iconv_for_check != iconv){
1948 struct input_code *p = find_inputcode_byfunc(iconv);
1950 set_input_codename(p->name);
1951 debug(input_codename);
/* remember which converter has been reported */
1953 iconv_for_check = iconv;
/*
 * Score bits accumulated per candidate input code (see set_code_score()).
 * Each constant is the previous one shifted left by one bit.
 */
1958 #define SCORE_L2 (1) /* JIS level-2 kanji */
1959 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1960 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1961 #ifdef SHIFTJIS_CP932
1962 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character re-read (remapped) via CP932 */
1963 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* character that does not exist */
1965 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* character that does not exist */
1967 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* code specified by MIME */
1968 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1970 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score bits used by code_score() when (c2 & 0x70) == 0x20; the table is
 * indexed with c2 & 0x0f.
 * NOTE(review): some initializer rows are not visible in this excerpt.
 */
1972 const nkf_char score_table_A0[] = {
1975 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1976 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score bits used by code_score() when (c2 & 0x70) == 0x70; the table is
 * indexed with c2 & 0x0f.
 */
1979 const nkf_char score_table_F0[] = {
1980 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1981 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1982 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1983 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * Set (OR in) the given score bit(s) on the candidate's score.
 * NOTE(review): any guard around the assignment is not visible here.
 */
1986 void set_code_score(struct input_code *ptr, nkf_char score)
1989 ptr->score |= score;
/*
 * Clear the given score bit(s) from the candidate's score.
 * NOTE(review): any guard around the assignment is not visible here.
 */
1993 void clr_code_score(struct input_code *ptr, nkf_char score)
1996 ptr->score &= ~score;
/*
 * Update the candidate's score from the character held in ptr->buf
 * (buf[0] is used as c2, buf[1] as c1).
 * NOTE(review): the first condition of the if/else chain is not visible
 * in this excerpt.
 */
2000 void code_score(struct input_code *ptr)
2002 nkf_char c2 = ptr->buf[0];
2003 #ifdef UTF8_OUTPUT_ENABLE
2004 nkf_char c1 = ptr->buf[1];
2007 set_code_score(ptr, SCORE_ERROR);
/* single-shift marker: the character is X0201 kana */
2008 }else if (c2 == SSO){
2009 set_code_score(ptr, SCORE_KANA);
2010 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv() returned 0 for this pair: flag it as non-existent */
2011 }else if (!e2w_conv(c2, c1)){
2012 set_code_score(ptr, SCORE_NO_EXIST);
/* rows selected by bits 4-6 of c2 use a per-row table indexed by the low nibble */
2014 }else if ((c2 & 0x70) == 0x20){
2015 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2016 }else if ((c2 & 0x70) == 0x70){
2017 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2018 }else if ((c2 & 0x70) >= 0x50){
2019 set_code_score(ptr, SCORE_L2);
/*
 * Rule this candidate code out.  If its converter is the one currently
 * installed in iconv, the selection is dropped via set_iconv(FALSE, 0).
 * NOTE(review): the preceding statements are not visible in this excerpt.
 */
2023 void status_disable(struct input_code *ptr)
2028 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append c to the candidate's byte buffer and advance its index. */
2031 void status_push_ch(struct input_code *ptr, nkf_char c)
2033 ptr->buf[ptr->index++] = c;
/*
 * NOTE(review): body not visible in this excerpt; presumably resets the
 * candidate's per-character state (stat/index) -- confirm.
 */
2036 void status_clear(struct input_code *ptr)
/*
 * Reset the candidate for a fresh detection pass; its score is put back
 * to SCORE_INIT.
 * NOTE(review): other statements are not visible in this excerpt.
 */
2042 void status_reset(struct input_code *ptr)
2045 ptr->score = SCORE_INIT;
/*
 * Re-initialise the candidate; clears its _file_stat field.
 * NOTE(review): other statements are not visible in this excerpt.
 */
2048 void status_reinit(struct input_code *ptr)
2051 ptr->_file_stat = 0;
/*
 * Hook called by the *_status() functions for a byte that starts a new
 * character.  Acts only for c <= DEL once a code is established (estab_f).
 * NOTE(review): the action taken is not visible in this excerpt.
 */
2054 void status_check(struct input_code *ptr, nkf_char c)
2056 if (c <= DEL && estab_f){
/*
 * Detection state machine for the Shift_JIS candidate: feeds one input
 * byte c, buffering lead/trail bytes with status_push_ch() and calling
 * status_disable() when a byte cannot belong to this code.
 * NOTE(review): the switch/case skeleton and several statements are not
 * visible in this excerpt.
 */
2061 void s_status(struct input_code *ptr, nkf_char c)
2065 status_check(ptr, c);
2070 #ifdef NUMCHAR_OPTION
2071 }else if (is_unicode_capsule(c)){
/* 0xa1-0xdf: single byte, buffered as SSO followed by the byte itself */
2074 }else if (0xa1 <= c && c <= 0xdf){
2075 status_push_ch(ptr, SSO);
2076 status_push_ch(ptr, c);
/* 0x81-0x9f / 0xe0-0xef: first byte of a two-byte character */
2079 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2081 status_push_ch(ptr, c);
2082 #ifdef SHIFTJIS_CP932
2084 && is_ibmext_in_sjis(c)){
2086 status_push_ch(ptr, c);
2087 #endif /* SHIFTJIS_CP932 */
2089 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2091 status_push_ch(ptr, c);
2092 #endif /* X0212_ENABLE */
2094 status_disable(ptr);
/* second byte: must be 0x40-0x7e or 0x80-0xfc */
2098 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2099 status_push_ch(ptr, c);
/* convert the buffered pair in place via s2e_conv() */
2100 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2104 status_disable(ptr);
2108 #ifdef SHIFTJIS_CP932
2109 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2110 status_push_ch(ptr, c);
/* a successful (0) conversion on this path is scored as a CP932 reading */
2111 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2112 set_code_score(ptr, SCORE_CP932);
2117 #endif /* SHIFTJIS_CP932 */
2118 #ifndef X0212_ENABLE
2119 status_disable(ptr);
/*
 * Detection state machine for the EUC candidate: feeds one input byte c,
 * buffering accepted bytes with status_push_ch() and calling
 * status_disable() when a byte cannot belong to this code.
 * NOTE(review): the switch/case skeleton and several statements are not
 * visible in this excerpt.
 */
2125 void e_status(struct input_code *ptr, nkf_char c)
2129 status_check(ptr, c);
2134 #ifdef NUMCHAR_OPTION
2135 }else if (is_unicode_capsule(c)){
/* SSO or 0xa1-0xfe: first byte of a two-byte sequence */
2138 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2140 status_push_ch(ptr, c);
/* 0x8f: prefix byte handled only when X0212_ENABLE is defined */
2142 }else if (0x8f == c){
2144 status_push_ch(ptr, c);
2145 #endif /* X0212_ENABLE */
2147 status_disable(ptr);
/* following bytes must lie in 0xa1-0xfe */
2151 if (0xa1 <= c && c <= 0xfe){
2152 status_push_ch(ptr, c);
2156 status_disable(ptr);
2161 if (0xa1 <= c && c <= 0xfe){
2163 status_push_ch(ptr, c);
2165 status_disable(ptr);
2167 #endif /* X0212_ENABLE */
2171 #ifdef UTF8_INPUT_ENABLE
/*
 * Detection state machine for the UTF-8 candidate: feeds one input byte
 * c, buffering lead and continuation bytes with status_push_ch() and
 * calling status_disable() when a byte cannot belong to this code.
 * NOTE(review): the switch/case skeleton and several statements are not
 * visible in this excerpt.
 */
2172 void w_status(struct input_code *ptr, nkf_char c)
2176 status_check(ptr, c);
2181 #ifdef NUMCHAR_OPTION
2182 }else if (is_unicode_capsule(c)){
/* lead bytes: 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 are each accepted */
2185 }else if (0xc0 <= c && c <= 0xdf){
2187 status_push_ch(ptr, c);
2188 }else if (0xe0 <= c && c <= 0xef){
2190 status_push_ch(ptr, c);
2191 }else if (0xf0 <= c && c <= 0xf4){
2193 status_push_ch(ptr, c);
2195 status_disable(ptr);
/* continuation bytes must lie in 0x80-0xbf */
2200 if (0x80 <= c && c <= 0xbf){
2201 status_push_ch(ptr, c);
2202 if (ptr->index > ptr->stat){
/* EF BB BF is the UTF-8 byte order mark */
2203 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2204 && ptr->buf[2] == 0xbf);
/* convert the buffered sequence in place via w2e_conv() */
2205 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2206 &ptr->buf[0], &ptr->buf[1]);
2213 status_disable(ptr);
2217 if (0x80 <= c && c <= 0xbf){
2218 if (ptr->index < ptr->stat){
2219 status_push_ch(ptr, c);
2224 status_disable(ptr);
/*
 * Feed one input byte to the status function of every candidate in
 * input_code_list.  When a resulting candidate exists and no code has
 * been established yet, its converter is installed with
 * set_iconv(TRUE, ...).
 * NOTE(review): the loop structure and the bookkeeping of action_flag and
 * result are not visible in this excerpt.
 */
2231 void code_status(nkf_char c)
2233 int action_flag = 1;
2234 struct input_code *result = 0;
2235 struct input_code *p = input_code_list;
2237 if (!p->status_func) {
2241 if (!p->status_func)
2243 (p->status_func)(p, c);
2246 }else if(p->stat == 0){
2257 if (result && !estab_f){
2258 set_iconv(TRUE, result->iconv_func);
2259 }else if (c <= DEL){
2260 struct input_code *ptr = input_code_list;
/*
 * Default input getter.  The visible return pops the most recently pushed
 * character from the push-back buffer std_gc_buf.
 * NOTE(review): the condition guarding this return and the fallback read
 * from f are not visible in this excerpt.
 */
2270 nkf_char std_getc(FILE *f)
2273 return std_gc_buf[--std_gc_ndx];
/*
 * Default push-back: stores c on top of std_gc_buf.  The buffer is
 * checked for being full (std_gc_ndx == STD_GC_BUFSIZE) first.
 * NOTE(review): what happens when the buffer is full, and the return
 * value, are not visible in this excerpt.
 */
2279 nkf_char std_ungetc(nkf_char c, FILE *f)
2281 if (std_gc_ndx == STD_GC_BUFSIZE){
2284 std_gc_buf[std_gc_ndx++] = c;
/*
 * NOTE(review): body not visible in this excerpt; presumably the default
 * output routine that writes c to the output stream -- confirm.
 */
2289 void std_putc(nkf_char c)
2296 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Pass-through mode: sets up the module chain with module_connection()
 * and then reads characters through i_getc until EOF.
 * NOTE(review): the loop body and return value are not visible in this
 * excerpt.
 */
2297 nkf_char noconvert(FILE *f)
2302 module_connection();
2303 while ((c = (*i_getc)(f)) != EOF)
/*
 * Wire up the conversion pipeline from the option flags.
 * Output side: oconv starts as output_conv, and each enabled filter saves
 * the current oconv in its own o_* pointer and then takes its place, so
 * the filter installed last runs first.
 * Input side: i_getc/i_ungetc are wrapped the same way by each enabled
 * input filter.
 * Finally the input converter is forced (set_iconv(-TRUE, ...)) when
 * input_f names an encoding, or defaults to e_iconv otherwise.
 * NOTE(review): most of the guarding conditions are not visible in this
 * excerpt.
 */
2310 void module_connection(void)
2312 oconv = output_conv;
2315 /* replace continuation module, from output side */
2317 /* output redirection */
2319 if (noout_f || guess_f){
2326 if (mimeout_f == TRUE) {
2327 o_base64conv = oconv; oconv = base64_conv;
2329 /* base64_count = 0; */
2333 o_crconv = oconv; oconv = cr_conv;
2336 o_rot_conv = oconv; oconv = rot_conv;
2339 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2342 o_hira_conv = oconv; oconv = hira_conv;
2345 o_fconv = oconv; oconv = fold_conv;
2348 if (alpha_f || x0201_f) {
2349 o_zconv = oconv; oconv = z_conv;
2353 i_ungetc = std_ungetc;
2354 /* input redirection */
2357 i_cgetc = i_getc; i_getc = cap_getc;
2358 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2361 i_ugetc = i_getc; i_getc = url_getc;
2362 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2365 #ifdef NUMCHAR_OPTION
2367 i_ngetc = i_getc; i_getc = numchar_getc;
2368 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2371 #ifdef UNICODE_NORMALIZATION
2372 if (nfc_f && input_f == UTF8_INPUT){
2373 i_nfc_getc = i_getc; i_getc = nfc_getc;
2374 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2377 if (mime_f && mimebuf_f==FIXED_MIME) {
2378 i_mgetc = i_getc; i_getc = mime_getc;
2379 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2382 i_bgetc = i_getc; i_getc = broken_getc;
2383 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* an explicitly requested input encoding forces the matching converter */
2385 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2386 set_iconv(-TRUE, e_iconv);
2387 } else if (input_f == SJIS_INPUT) {
2388 set_iconv(-TRUE, s_iconv);
2389 #ifdef UTF8_INPUT_ENABLE
2390 } else if (input_f == UTF8_INPUT) {
2391 set_iconv(-TRUE, w_iconv);
2392 } else if (input_f == UTF16_INPUT) {
2393 set_iconv(-TRUE, w_iconv16);
2394 } else if (input_f == UTF32_INPUT) {
2395 set_iconv(-TRUE, w_iconv32);
/* no encoding requested: start with e_iconv, not yet established */
2398 set_iconv(FALSE, e_iconv);
2402 struct input_code *p = input_code_list;
2410 * Check and Ignore BOM
/*
 * Inspect the first bytes of the stream for a Unicode byte order mark.
 * Recognised signatures (as read through i_getc):
 *   00 00 FE FF  UTF-32, ENDIAN_BIG
 *   00 00 FF FE  UTF-32, ENDIAN_2143
 *   EF BB BF     UTF-8
 *   FE FF 00 00  UTF-32, ENDIAN_3412
 *   FE FF        UTF-16, ENDIAN_BIG
 *   FF FE 00 00  UTF-32, ENDIAN_LITTLE
 *   FF FE        UTF-16, ENDIAN_LITTLE
 * When a signature matches and the corresponding converter is active,
 * input_endian is set.  Otherwise every byte that was read is pushed back
 * through i_ungetc in reverse order so the stream is left unchanged.
 * NOTE(review): the case labels, the conditions around set_iconv() and
 * the early returns are not visible in this excerpt.
 */
2412 void check_bom(FILE *f)
2415 switch(c2 = (*i_getc)(f)){
2417 if((c2 = (*i_getc)(f)) == 0x00){
2418 if((c2 = (*i_getc)(f)) == 0xFE){
2419 if((c2 = (*i_getc)(f)) == 0xFF){
2421 set_iconv(TRUE, w_iconv32);
2423 if (iconv == w_iconv32) {
2424 input_endian = ENDIAN_BIG;
2427 (*i_ungetc)(0xFF,f);
2428 }else (*i_ungetc)(c2,f);
2429 (*i_ungetc)(0xFE,f);
2430 }else if(c2 == 0xFF){
2431 if((c2 = (*i_getc)(f)) == 0xFE){
2433 set_iconv(TRUE, w_iconv32);
2435 if (iconv == w_iconv32) {
2436 input_endian = ENDIAN_2143;
/* the byte just consumed was 0xFE (see the test above), so that is the
   value to push back; pushing 0xFF here would turn 00 00 FF FE into
   00 00 FF FF */
2439 (*i_ungetc)(0xFE,f);
2440 }else (*i_ungetc)(c2,f);
2441 (*i_ungetc)(0xFF,f);
2442 }else (*i_ungetc)(c2,f);
2443 (*i_ungetc)(0x00,f);
2444 }else (*i_ungetc)(c2,f);
2445 (*i_ungetc)(0x00,f);
2448 if((c2 = (*i_getc)(f)) == 0xBB){
2449 if((c2 = (*i_getc)(f)) == 0xBF){
2451 set_iconv(TRUE, w_iconv);
2453 if (iconv == w_iconv) {
2456 (*i_ungetc)(0xBF,f);
2457 }else (*i_ungetc)(c2,f);
2458 (*i_ungetc)(0xBB,f);
2459 }else (*i_ungetc)(c2,f);
2460 (*i_ungetc)(0xEF,f);
2463 if((c2 = (*i_getc)(f)) == 0xFF){
2464 if((c2 = (*i_getc)(f)) == 0x00){
2465 if((c2 = (*i_getc)(f)) == 0x00){
2467 set_iconv(TRUE, w_iconv32);
2469 if (iconv == w_iconv32) {
2470 input_endian = ENDIAN_3412;
2473 (*i_ungetc)(0x00,f);
2474 }else (*i_ungetc)(c2,f);
2475 (*i_ungetc)(0x00,f);
2476 }else (*i_ungetc)(c2,f);
2478 set_iconv(TRUE, w_iconv16);
2480 if (iconv == w_iconv16) {
2481 input_endian = ENDIAN_BIG;
2484 (*i_ungetc)(0xFF,f);
2485 }else (*i_ungetc)(c2,f);
2486 (*i_ungetc)(0xFE,f);
2489 if((c2 = (*i_getc)(f)) == 0xFE){
2490 if((c2 = (*i_getc)(f)) == 0x00){
2491 if((c2 = (*i_getc)(f)) == 0x00){
2493 set_iconv(TRUE, w_iconv32);
2495 if (iconv == w_iconv32) {
2496 input_endian = ENDIAN_LITTLE;
2499 (*i_ungetc)(0x00,f);
2500 }else (*i_ungetc)(c2,f);
2501 (*i_ungetc)(0x00,f);
2502 }else (*i_ungetc)(c2,f);
2504 set_iconv(TRUE, w_iconv16);
2506 if (iconv == w_iconv16) {
2507 input_endian = ENDIAN_LITTLE;
2510 (*i_ungetc)(0xFE,f);
2511 }else (*i_ungetc)(c2,f);
2512 (*i_ungetc)(0xFF,f);
2521 Conversion main loop. Code detection only.
/*
 * Conversion main loop.  Reads characters from f through i_getc, tracks
 * the input mode (ASCII, X0201, X0208, X0212, X0213 planes), recognises
 * ESC sequences, SI/SO shifts and MIME encoded-word starts, assembles
 * UTF-16/UTF-32 code units when those converters are active, and hands
 * complete characters to iconv/oconv.  At end of input, if no input code
 * name has been set, the candidate with the lowest score is reported.
 * NOTE(review): a large part of the control flow (braces, labels, many
 * statements) is not visible in this excerpt.
 */
2524 nkf_char kanji_convert(FILE *f)
2526 nkf_char c3, c2=0, c1, c0=0;
2527 int is_8bit = FALSE;
2529 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2530 #ifdef UTF8_INPUT_ENABLE
2531 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2538 output_mode = ASCII;
2541 #define NEXT continue /* no output, get next */
2542 #define SEND ; /* output c1 and c2, get next */
2543 #define LAST break /* end of loop, go closing */
2545 module_connection();
2548 while ((c1 = (*i_getc)(f)) != EOF) {
2549 #ifdef INPUT_CODE_FIX
2556 /* in case of 8th bit is on */
2557 if (!estab_f&&!mime_decode_mode) {
2558 /* in case of not established yet */
2559 /* It is still ambiguous */
2560 if (h_conv(f, c2, c1)==EOF)
2566 /* in case of already established */
2568 /* ignore bogus code */
2574 /* second byte, 7 bit code */
2575 /* it might be kanji shifted */
2576 if ((c1 == DEL) || (c1 <= SPACE)) {
2577 /* ignore bogus first code */
2584 #ifdef UTF8_INPUT_ENABLE
/* UTF-16: read the remaining byte(s); 0xD8-0xDB in the high byte marks a
   high surrogate, so two more bytes are fetched */
2585 if (iconv == w_iconv16) {
2586 if (input_endian == ENDIAN_BIG) {
2588 if ((c1 = (*i_getc)(f)) != EOF) {
2589 if (0xD8 <= c2 && c2 <= 0xDB) {
2590 if ((c0 = (*i_getc)(f)) != EOF) {
2592 if ((c3 = (*i_getc)(f)) != EOF) {
2599 if ((c2 = (*i_getc)(f)) != EOF) {
2600 if (0xD8 <= c2 && c2 <= 0xDB) {
2601 if ((c3 = (*i_getc)(f)) != EOF) {
2602 if ((c0 = (*i_getc)(f)) != EOF) {
/* UTF-32: read three more bytes and combine them per input_endian */
2611 } else if(iconv == w_iconv32){
2613 if((c2 = (*i_getc)(f)) != EOF &&
2614 (c1 = (*i_getc)(f)) != EOF &&
2615 (c0 = (*i_getc)(f)) != EOF){
2616 switch(input_endian){
2618 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2621 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2624 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2627 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2637 #ifdef NUMCHAR_OPTION
2638 if (is_unicode_capsule(c1)){
2644 if (!estab_f && !iso8859_f) {
2645 /* not established yet */
2648 } else { /* estab_f==TRUE */
2653 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2654 /* SJIS X0201 Case... */
2655 if(iso2022jp_f && x0201_f==NO_X0201) {
2656 (*oconv)(GETA1, GETA2);
2663 } else if (c1==SSO && iconv != s_iconv) {
2664 /* EUC X0201 Case */
2665 c1 = (*i_getc)(f); /* skip SSO */
2667 if (SSP<=c1 && c1<0xe0) {
2668 if(iso2022jp_f && x0201_f==NO_X0201) {
2669 (*oconv)(GETA1, GETA2);
2676 } else { /* bogus code, skip SSO and one byte */
2680 /* already established */
2685 } else if ((c1 > SPACE) && (c1 != DEL)) {
2686 /* in case of Roman characters */
2688 /* output 1 shifted byte */
2692 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2693 /* output 1 shifted byte */
2694 if(iso2022jp_f && x0201_f==NO_X0201) {
2695 (*oconv)(GETA1, GETA2);
2702 /* look like bogus code */
2705 } else if (input_mode == X0208 || input_mode == X0212 ||
2706 input_mode == X0213_1 || input_mode == X0213_2) {
2707 /* in case of Kanji shifted */
2710 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2711 /* Check MIME code */
2712 if ((c1 = (*i_getc)(f)) == EOF) {
2715 } else if (c1 == '?') {
2716 /* =? is mime conversion start sequence */
2717 if(mime_f == STRICT_MIME) {
2718 /* check in real detail */
2719 if (mime_begin_strict(f) == EOF)
2723 } else if (mime_begin(f) == EOF)
2733 /* normal ASCII code */
2736 } else if (!is_8bit && c1 == SI) {
2739 } else if (!is_8bit && c1 == SO) {
2742 } else if (!is_8bit && c1 == ESC ) {
2743 if ((c1 = (*i_getc)(f)) == EOF) {
2744 /* (*oconv)(0, ESC); don't send bogus code */
2746 } else if (c1 == '$') {
2747 if ((c1 = (*i_getc)(f)) == EOF) {
2749 (*oconv)(0, ESC); don't send bogus code
2750 (*oconv)(0, '$'); */
2752 } else if (c1 == '@'|| c1 == 'B') {
2753 /* This is kanji introduction */
2756 set_input_codename("ISO-2022-JP");
2758 debug(input_codename);
2761 } else if (c1 == '(') {
2762 if ((c1 = (*i_getc)(f)) == EOF) {
2763 /* don't send bogus code
2769 } else if (c1 == '@'|| c1 == 'B') {
2770 /* This is kanji introduction */
2775 } else if (c1 == 'D'){
2779 #endif /* X0212_ENABLE */
2780 } else if (c1 == (X0213_1&0x7F)){
2781 input_mode = X0213_1;
2784 } else if (c1 == (X0213_2&0x7F)){
2785 input_mode = X0213_2;
2789 /* could be some special code */
2796 } else if (broken_f&0x2) {
2797 /* accept any ESC-(-x as broken code ... */
2807 } else if (c1 == '(') {
2808 if ((c1 = (*i_getc)(f)) == EOF) {
2809 /* don't send bogus code
2811 (*oconv)(0, '('); */
2815 /* This is X0201 kana introduction */
2816 input_mode = X0201; shift_mode = X0201;
2818 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2819 /* ESC ( B/J/H: back to single-byte mode (input_mode becomes ASCII) */
2820 input_mode = ASCII; shift_mode = FALSE;
2822 } else if (broken_f&0x2) {
2823 input_mode = ASCII; shift_mode = FALSE;
2828 /* maintain various input_mode here */
2832 } else if ( c1 == 'N' || c1 == 'n' ){
2834 c3 = (*i_getc)(f); /* skip SS2 */
2835 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2850 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2851 input_mode = ASCII; set_iconv(FALSE, 0);
2853 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2854 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2862 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2863 if ((c1=(*i_getc)(f))!=EOF) {
2867 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2885 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2888 if ((c0 = (*i_getc)(f)) != EOF) {
2891 if ((c3 = (*i_getc)(f)) != EOF) {
2893 (*iconv)(c2, c1, c0|c3);
2898 /* 3 bytes EUC or UTF-8 */
2899 if ((c0 = (*i_getc)(f)) != EOF) {
2901 (*iconv)(c2, c1, c0);
2908 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2912 (*oconv)(PREFIX_EUCG3 | c2, c1);
2914 #endif /* X0212_ENABLE */
2916 (*oconv)(PREFIX_EUCG3 | c2, c1);
2919 (*oconv)(input_mode, c1); /* other special case */
2925 /* goto next_word */
2929 (*iconv)(EOF, 0, 0);
2930 if (!is_inputcode_set)
2933 struct input_code *p = input_code_list;
2934 struct input_code *result = p;
/* the candidate with the lowest score wins */
2936 if (p->score < result->score) result = p;
2939 set_input_codename(result->name);
/*
 * Hold-and-decide conversion, used while the input code is still
 * ambiguous.  Incoming bytes are saved with push_hold_buf() until the
 * code is established or the buffer fills; the candidate with the lowest
 * score is then selected with set_iconv(FALSE, ...), and the held bytes
 * are replayed through iconv, fetching further bytes from f when a
 * multi-byte character extends past the held data.
 * NOTE(review): parts of the control flow and the return statements are
 * not visible in this excerpt.
 */
2946 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2948 nkf_char ret, c3, c0;
2952 /** it must NOT be in the kanji shifted sequence */
2953 /** it must NOT be written in JIS7 */
2954 /** and it must be after 2 byte 8bit code */
2960 while ((c1 = (*i_getc)(f)) != EOF) {
2966 if (push_hold_buf(c1) == EOF || estab_f){
2972 struct input_code *p = input_code_list;
2973 struct input_code *result = p;
2978 if (p->score < result->score){
2983 set_iconv(FALSE, result->iconv_func);
2988 ** 1) EOF is detected, or
2989 ** 2) Code is established, or
2990 ** 3) Buffer is FULL (but last word is pushed)
2992 ** in 1) and 3) cases, we continue to use
2993 ** Kanji codes by oconv and leave estab_f unchanged.
2998 while (hold_index < hold_count){
2999 c2 = hold_buf[hold_index++];
3001 #ifdef NUMCHAR_OPTION
3002 || is_unicode_capsule(c2)
3007 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3008 (*iconv)(X0201, c2, 0);
3011 if (hold_index < hold_count){
3012 c1 = hold_buf[hold_index++];
3022 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3025 if (hold_index < hold_count){
3026 c0 = hold_buf[hold_index++];
3027 } else if ((c0 = (*i_getc)(f)) == EOF) {
3033 if (hold_index < hold_count){
3034 c3 = hold_buf[hold_index++];
3035 } else if ((c3 = (*i_getc)(f)) == EOF) {
3040 (*iconv)(c2, c1, c0|c3);
3045 /* 3 bytes EUC or UTF-8 */
3046 if (hold_index < hold_count){
3047 c0 = hold_buf[hold_index++];
3048 } else if ((c0 = (*i_getc)(f)) == EOF) {
3054 (*iconv)(c2, c1, c0);
3057 if (c0 == EOF) break;
/*
 * Append one byte to hold_buf.  The value is stored as an unsigned char.
 * Returns EOF when the store makes the buffer reach its limit of
 * HOLD_SIZE*2 entries, otherwise the new hold_count.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not visible in this excerpt.
 */
3062 nkf_char push_hold_buf(nkf_char c2)
3064 if (hold_count >= HOLD_SIZE*2)
3066 hold_buf[hold_count++] = (unsigned char)c2;
3067 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair (c2, c1) to its EUC/JIS-style row and
 * cell, written through p2 and p1.  IBM-extension lead bytes are first
 * remapped through the shiftjis_cp932 / shiftjis_x0212 tables when the
 * corresponding options are active; JIS X 0213 plane-2 lead bytes
 * (c2 >= 0xF0 with x0213_f) are mapped arithmetically and tagged with
 * PREFIX_EUCG3.
 * NOTE(review): several statements, including the stores through p2/p1
 * and the return value, are not visible in this excerpt.
 */
3070 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3072 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3075 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3076 #ifdef SHIFTJIS_CP932
3077 if (cp51932_f && is_ibmext_in_sjis(c2)){
3079 extern const unsigned short shiftjis_cp932[3][189];
3081 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3087 #endif /* SHIFTJIS_CP932 */
3089 if (!x0213_f && is_ibmext_in_sjis(c2)){
3091 extern const unsigned short shiftjis_x0212[3][189];
3093 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3096 c2 = PREFIX_EUCG3 | (val >> 8);
3109 if(x0213_f && c2 >= 0xF0){
3110 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3111 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3112 }else{ /* 78<=k<=94 */
3113 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3114 if (0x9E < c1) c2++;
/* ordinary lead byte: each lead byte covers two rows; a trail byte above
   0x9E selects the second row */
3117 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3118 if (0x9E < c1) c2++;
3121 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3128 c2 = x0212_unshift(c2);
/*
 * Input converter for Shift_JIS.  EOF, zero and control-range values of
 * c2 are handled by a separate branch; other pairs go through s2e_conv(),
 * and a non-zero result from it is returned to the caller.
 * NOTE(review): the first branch and the final output call are not
 * visible in this excerpt.
 */
3135 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3139 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3142 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3143 if (ret) return ret;
/*
 * Input converter for EUC.  A 0x8f lead byte gets its own branch, in
 * which c2 and c1 are packed as (c2 << 8) | (c1 & 0x7f) and, with
 * SHIFTJIS_CP932, a pair that e2s_conv() converts successfully (returns
 * 0) is converted back with s2e_conv().  SSO and control-range values
 * have their own branches.
 * NOTE(review): most statements and the return value are not visible in
 * this excerpt.
 */
3149 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3154 }else if (c2 == 0x8f){
3158 c2 = (c2 << 8) | (c1 & 0x7f);
3160 #ifdef SHIFTJIS_CP932
3163 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3164 s2e_conv(s2, s1, &c2, &c1);
3171 #endif /* SHIFTJIS_CP932 */
3172 #endif /* X0212_ENABLE */
3173 } else if (c2 == SSO){
3176 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3186 #ifdef UTF8_INPUT_ENABLE
/*
 * Convert a UTF-8 sequence (c2 = first byte, c1, c0 = following bytes) to
 * the internal two-byte form, written through p2 and p1.  Sequences whose
 * first byte is 0xc0-0xef are looked up with unicode_to_jis_common();
 * with NUMCHAR_OPTION a code point can instead be passed on as
 * CLASS_UNICODE | code point in *p1.
 * NOTE(review): the other branches and the return value are not visible
 * in this excerpt.
 */
3187 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3194 }else if (0xc0 <= c2 && c2 <= 0xef) {
3195 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3196 #ifdef NUMCHAR_OPTION
3199 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * Input converter for UTF-8.  The first byte (c2, 0xC0 and up) is
 * classified through w_iconv_utf8_1st_byte, and the following bytes are
 * range-checked per class.  When c0 is still 0 the function returns -1 or
 * -2 (NOTE(review): presumably a request for one or two more bytes --
 * confirm in the caller).  For four-byte sequences c0 carries two
 * continuation bytes, checked with the 0xc0c0/0x8080 masks.  Valid
 * four-byte sequences are passed on as CLASS_UNICODE | code point; others
 * go through w2e_conv().
 * NOTE(review): the case labels and several statements are not visible in
 * this excerpt.
 */
3207 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3210 static const int w_iconv_utf8_1st_byte[] =
3212 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3213 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3214 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3215 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3217 if (c2 < 0 || 0xff < c2) {
3218 }else if (c2 == 0) { /* 0 : 1 byte*/
3220 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3223 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3225 if (c1 < 0x80 || 0xBF < c1) return 0;
3228 if (c0 == 0) return -1;
3229 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3234 if (c0 == 0) return -1;
3235 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3239 if (c0 == 0) return -1;
3240 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3244 if (c0 == 0) return -2;
3245 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3249 if (c0 == 0) return -2;
3250 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3254 if (c0 == 0) return -2;
3255 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3263 if (c2 == 0 || c2 == EOF){
3264 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3265 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3268 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3277 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the code point val as UTF-8 into three outputs.
 *   *p2 - first byte
 *   *p1 - second byte
 *   *p0 - third byte, or for values above 0xFFFF the third and fourth
 *         bytes packed as (third << 8) | fourth
 * The branch for the smallest values and any final else branch are not
 * visible in this listing.
 */
3278 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
/* Two-byte form: 110xxxxx 10xxxxxx. */
3285 }else if (val < 0x800){
3286 *p2 = 0xc0 | (val >> 6);
3287 *p1 = 0x80 | (val & 0x3f);
/* Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. */
3289 } else if (val <= NKF_INT32_C(0xFFFF)) {
3290 *p2 = 0xe0 | (val >> 12);
3291 *p1 = 0x80 | ((val >> 6) & 0x3f);
3292 *p0 = 0x80 | (val & 0x3f);
3293 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/*
 * NOTE(review): this lead byte looks wrong for a four-byte sequence.
 * w_oconv() emits 0xF0 | (val >> 18) for the same range and ww16_conv()
 * decodes the lead byte with a shift of 18, so 0xf0 | (val >> 18) is
 * presumably intended here -- confirm and fix in the complete source.
 */
3294 *p2 = 0xe0 | (val >> 16);
3295 *p1 = 0x80 | ((val >> 12) & 0x3f);
/* Bits 6..11 go to the high packed byte, bits 0..5 to the low one. */
3296 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3305 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: decode a UTF-8 sequence (c2 = lead byte, c1 = second byte,
 * c0 = third byte or packed third/fourth bytes) into a code point in val.
 * Only the high-order parts of each branch are visible in this listing;
 * the lines that merge the lowest six bits are not shown.
 */
3306 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3311 } else if (c2 >= 0xf0){
3312 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3313 val = (c2 & 0x0f) << 18;
3314 val |= (c1 & 0x3f) << 12;
/* Third byte sits in bits 8..13 of c0; move its payload to bits 6..11. */
3315 val |= (c0 & 0x3f00) >> 2;
/* Three-byte sequence: 4 payload bits in the lead byte. */
3317 }else if (c2 >= 0xe0){
3318 val = (c2 & 0x0f) << 12;
3319 val |= (c1 & 0x3f) << 6;
/* Two-byte sequence: 5 payload bits in the lead byte. */
3321 }else if (c2 >= 0xc0){
3322 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv: convert the code point val and store the result through
 * p2/p1.  The visible path re-encodes val as UTF-8 with w16w_conv() and
 * hands the bytes to unicode_to_jis_common(); its result is kept in ret.
 */
3330 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3332 nkf_char c2, c1, c0;
3339 w16w_conv(val, &c2, &c1, &c0);
3340 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3341 #ifdef NUMCHAR_OPTION
/*
 * Store the raw code point tagged with CLASS_UNICODE.
 * NOTE(review): the guarding condition is not visible; presumably this is
 * the no-mapping case -- confirm.
 */
3344 *p1 = CLASS_UNICODE | val;
3353 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: convert one UTF-16 unit given as high byte c2 and low byte c1;
 * for a surrogate pair c0 carries the second 16-bit unit.
 * A non-zero ret is returned to the caller (last visible line).
 */
3354 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3357 if ((c2==0 && c1 < 0x80) || c2==EOF) {
/* High byte 0xD8..0xDB: first unit of a surrogate pair. */
3360 }else if (0xD8 <= c2 && c2 <= 0xDB) {
/* The second unit must lie in 0xDC00..0xDFFF. */
3361 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/*
 * Combine the pair: with H = (c2 << 8) | c1 this equals
 * ((H - 0xD800) << 10) + (c0 - 0xDC00) + 0x10000, and
 * 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000.
 */
3363 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
/* 27 == 0xD8 >> 3, so this matches the high bytes 0xD8..0xDF left over. */
3365 }else if ((c2>>3) == 27) { /* unpaired surrogate */
/* Otherwise rebuild the 16-bit value and convert it, updating c2/c1. */
3370 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3371 if (ret) return ret;
/*
 * w_iconv32: convert one code point passed in c1 (the is_unicode_bmp(c1)
 * test and the w16e_conv(c1, ...) call below both treat c1 as the value).
 * A non-zero ret is returned to the caller (last visible line).
 */
3376 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3380 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
/* Values accepted by is_unicode_bmp() are converted through w16e_conv(). */
3381 } else if (is_unicode_bmp(c1)) {
3382 ret = w16e_conv(c1, &c2, &c1);
/*
 * Tag the value with CLASS_UNICODE.
 * NOTE(review): presumably the branch for values outside the BMP; the
 * enclosing else is not visible -- confirm.
 */
3385 c1 = CLASS_UNICODE | c1;
3387 if (ret) return ret;
/*
 * unicode_to_jis_common: look up a UTF-8 sequence (c2 lead byte, c1, c0) in
 * the utf8_to_euc_* tables and store the mapped value through p2/p1.
 * Visible behaviour:
 *  - when no_best_fit_chars_f is set, selected sequences are rejected with
 *    return value 1 before any table lookup;
 *  - the table family is chosen from ms_ucs_map_f (CP932, MS, or a default
 *    on a line not shown);
 *  - the lookup itself is done by w_iconv_common().
 * NOTE(review): 1 appears to mean "no mapping", matching the failure
 * returns of w_iconv_common() -- confirm.
 */
3392 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3395 extern const unsigned short *const utf8_to_euc_2bytes[];
3396 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3397 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3398 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3399 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3400 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3402 const unsigned short *const *pp;
3403 const unsigned short *const *const *ppp;
/*
 * Rejection tables for two-byte sequences.  Each has 64 entries and is
 * indexed with (c1 & 0x3F); any non-zero entry makes the lookups below
 * return 1.
 */
3404 static const int no_best_fit_chars_table_C2[] =
3405 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3406 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3407 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3408 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3409 static const int no_best_fit_chars_table_C2_ms[] =
3410 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3411 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3412 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3413 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3414 static const int no_best_fit_chars_table_932_C2[] =
3415 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3416 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3417 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3418 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3419 static const int no_best_fit_chars_table_932_C3[] =
3420 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3421 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3422 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3423 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Lead byte below 0xe0: two-byte sequence. */
3429 }else if(c2 < 0xe0){
3430 if(no_best_fit_chars_f){
3431 if(ms_ucs_map_f == UCS_MAP_CP932){
3434 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3437 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3440 }else if(cp51932_f){
3443 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3446 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3449 }else if(ms_ucs_map_f == UCS_MAP_MS){
3450 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3454 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3455 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3457 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/*
 * A single trail byte in c0 is below 0xF0, whereas the packed two-byte
 * form produced by w16w_conv() always has 0x8080 set, so this test
 * separates three-byte sequences from four-byte ones.
 */
3458 }else if(c0 < 0xF0){
3459 if(no_best_fit_chars_f){
3460 if(ms_ucs_map_f == UCS_MAP_CP932){
3461 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3462 }else if(ms_ucs_map_f == UCS_MAP_MS){
/*
 * Per-character rejections; the switch/case lines selecting c2 and c1 for
 * each test are not visible in this listing.
 */
3467 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3470 if(c0 == 0x92) return 1;
3475 if(c1 == 0x80 || c0 == 0x9C) return 1;
3483 if(c0 == 0x95) return 1;
3486 if(c0 == 0xA5) return 1;
3493 if(c0 == 0x8D) return 1;
3496 if(c0 == 0x9E && cp51932_f) return 1;
3499 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3507 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3508 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
/* The lead byte selects the second-level table; c1/c0 index within it. */
3510 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3512 #ifdef SHIFTJIS_CP932
/*
 * On success with cp51932_f set, a result for which is_eucg3() holds is
 * round-tripped through e2s_conv()/s2e_conv(), rewriting *p2 and *p1.
 */
3513 if (!ret && cp51932_f && is_eucg3(*p2)) {
3515 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3516 s2e_conv(s2, s1, p2, p1);
/*
 * w_iconv_common: two-level table lookup.
 *   c1    - index into pp, valid range 0 .. psize-1
 *   c0    - index into the selected row, valid range
 *           0 .. sizeof_utf8_to_euc_C2-1
 *   pp    - table of row pointers; rows may be null
 *   p2/p1 - receive the result (the stores are not visible in this listing)
 * Returns 1 when pp or the row is null, when an index is out of range, or
 * when the table entry is 0.
 * NOTE(review): the lines that adjust c1/c0 before the range checks are
 * not visible -- confirm how the raw bytes become indices.
 */
3525 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3528 const unsigned short *p;
3531 if (pp == 0) return 1;
3534 if (c1 < 0 || psize <= c1) return 1;
3536 if (p == 0) return 1;
3539 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3541 if (val == 0) return 1;
/*
 * With no_cp932ext_f set, values in row 0x2D and values above 0xF300 are
 * singled out; the action taken is on a line not shown.
 */
3542 if (no_cp932ext_f && (
3543 (val>>8) == 0x2D || /* NEC special characters */
3544 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* A high part equal to SO is reported as X0201 instead. */
3552 if (c2 == SO) c2 = X0201;
/*
 * nkf_each_char_to_hex: pass hexadecimal digits of c to the callback f,
 * one call per digit, as f(0, digit).  Digits are upper-case ASCII.
 * The loop that drives the shift variable is not visible in this listing.
 */
3559 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3561 const char *hex = "0123456789ABCDEF";
/* Select the 4-bit group at the current shift and map it to its digit. */
3567 (*f)(0, hex[(c>>shift)&0xF]);
/*
 * encode_fallback_html: write the decimal digits of c through oconv, most
 * significant digit first; 0x30 is ASCII '0'.
 * NOTE(review): presumably the digits of an HTML numeric character
 * reference; the surrounding prefix/suffix output is not visible -- confirm.
 */
3577 void encode_fallback_html(nkf_char c)
/* Leading digits are emitted only when c is large enough to have them. */
3582 if(c >= NKF_INT32_C(1000000))
3583 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3584 if(c >= NKF_INT32_C(100000))
3585 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
/* Guards for the following digits, if any, are on lines not shown. */
3587 (*oconv)(0, 0x30+(c/10000 )%10);
3589 (*oconv)(0, 0x30+(c/1000 )%10);
3591 (*oconv)(0, 0x30+(c/100 )%10);
3593 (*oconv)(0, 0x30+(c/10 )%10);
3595 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml: write c in hexadecimal through oconv using
 * nkf_each_char_to_hex().  Any prefix/suffix output is not visible in this
 * listing.
 */
3600 void encode_fallback_xml(nkf_char c)
3605 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: write c as upper-case hexadecimal digits through
 * oconv.  Values rejected by is_unicode_bmp() get two extra leading digits
 * (bits 20..23 and 16..19); four digits for bits 0..15 are also emitted.
 * Any escape prefix is written on lines not visible in this listing.
 */
3610 void encode_fallback_java(nkf_char c)
3612 const char *hex = "0123456789ABCDEF";
3615 if(!is_unicode_bmp(c)){
3619 (*oconv)(0, hex[(c>>20)&0xF]);
3620 (*oconv)(0, hex[(c>>16)&0xF]);
3624 (*oconv)(0, hex[(c>>12)&0xF]);
3625 (*oconv)(0, hex[(c>> 8)&0xF]);
3626 (*oconv)(0, hex[(c>> 4)&0xF]);
3627 (*oconv)(0, hex[ c &0xF]);
/*
 * encode_fallback_perl: write c in hexadecimal through oconv using
 * nkf_each_char_to_hex().  Any prefix/suffix output is not visible in this
 * listing.
 */
3631 void encode_fallback_perl(nkf_char c)
3636 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: ignore the incoming character and output the
 * global unicode_subchar instead, split into its second-lowest byte and its
 * lowest byte and passed to oconv as (c2, c1).
 */
3641 void encode_fallback_subchar(nkf_char c)
3643 c = unicode_subchar;
3644 (*oconv)((c>>8)&0xFF, c&0xFF);
3649 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the code pair (c2, c1) in the euc_to_utf8_* tables.
 * A row is selected from c2 and a column from c1; both indices are formed
 * as (value & 0x7f) - 0x21 and range-checked before use.
 * The final table read and the return statement are not visible in this
 * listing.
 */
3650 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3653 extern const unsigned short euc_to_utf8_1byte[];
3654 extern const unsigned short *const euc_to_utf8_2bytes[];
3655 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3656 extern const unsigned short *const x0212_to_utf8_2bytes[];
3658 const unsigned short *p;
3661 p = euc_to_utf8_1byte;
/* Values for which is_eucg3() holds use the x0212_to_utf8_2bytes rows. */
3663 } else if (is_eucg3(c2)){
/* One pair is special-cased for the ASCII map; the action is not shown. */
3664 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3667 c2 = (c2&0x7f) - 0x21;
3668 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3669 p = x0212_to_utf8_2bytes[c2];
3675 c2 = (c2&0x7f) - 0x21;
3676 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
/* Any map other than UCS_MAP_ASCII uses the _ms variant of the rows. */
3677 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3682 c1 = (c1 & 0x7f) - 0x21;
3683 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv: output routine that writes (c2, c1) as UTF-8 bytes via o_putc.
 * Visible paths:
 *  - a CLASS_UNICODE capsule in c1 is unwrapped with VALUE_MASK and encoded
 *    directly as a 2-, 3- or 4-byte sequence;
 *  - other pairs are mapped with e2w_conv() and encoded by w16w_conv().
 * output_bom_f is cleared on a visible line; the BOM output itself is not
 * visible in this listing.
 */
3688 void w_oconv(nkf_char c2, nkf_char c1)
3694 output_bom_f = FALSE;
3705 #ifdef NUMCHAR_OPTION
3706 if (c2 == 0 && is_unicode_capsule(c1)){
3707 val = c1 & VALUE_MASK;
/* 110xxxxx 10xxxxxx */
3710 }else if (val < 0x800){
3711 (*o_putc)(0xC0 | (val >> 6));
3712 (*o_putc)(0x80 | (val & 0x3f));
/* 1110xxxx 10xxxxxx 10xxxxxx */
3713 } else if (val <= NKF_INT32_C(0xFFFF)) {
3714 (*o_putc)(0xE0 | (val >> 12));
3715 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3716 (*o_putc)(0x80 | (val & 0x3f));
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
3717 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3718 (*o_putc)(0xF0 | ( val>>18));
3719 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3720 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3721 (*o_putc)(0x80 | ( val & 0x3f));
3728 output_mode = ASCII;
3730 } else if (c2 == ISO8859_1) {
3731 output_mode = ISO8859_1;
/*
 * NOTE(review): this writes a single byte >= 0x80, which on its own is not
 * a complete UTF-8 sequence -- confirm that this is intended.
 */
3732 (*o_putc)(c1 | 0x080);
3735 val = e2w_conv(c2, c1);
3737 w16w_conv(val, &c2, &c1, &c0);
/* c0 is written only when w16w_conv() produced a non-zero value for it. */
3741 if (c0) (*o_putc)(c0);
/*
 * w_oconv16: output routine that writes (c2, c1) as 16-bit units via
 * o_putc, in the byte order selected by output_endian.
 * Code points accepted by is_unicode_bmp() are split into two bytes; larger
 * ones up to UNICODE_MAX are written as a surrogate pair.  Other input is
 * mapped with e2w_conv() first.
 */
3747 void w_oconv16(nkf_char c2, nkf_char c1)
3750 output_bom_f = FALSE;
/* Byte 0xFF ('\377') is part of the byte-order mark in both orders. */
3751 if (output_endian == ENDIAN_LITTLE){
3752 (*o_putc)((unsigned char)'\377');
3756 (*o_putc)((unsigned char)'\377');
3765 if (c2 == ISO8859_1) {
3768 #ifdef NUMCHAR_OPTION
3769 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3770 if (is_unicode_bmp(c1)) {
3771 c2 = (c1 >> 8) & 0xff;
3775 if (c1 <= UNICODE_MAX) {
/* 0xD7C0 == 0xD800 - (0x10000 >> 10): the offset is folded in. */
3776 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3777 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
/* Little endian: low byte first for each unit; otherwise high byte first. */
3778 if (output_endian == ENDIAN_LITTLE){
3779 (*o_putc)(c2 & 0xff);
3780 (*o_putc)((c2 >> 8) & 0xff);
3781 (*o_putc)(c1 & 0xff);
3782 (*o_putc)((c1 >> 8) & 0xff);
3784 (*o_putc)((c2 >> 8) & 0xff);
3785 (*o_putc)(c2 & 0xff);
3786 (*o_putc)((c1 >> 8) & 0xff);
3787 (*o_putc)(c1 & 0xff);
/* Table-mapped path: the 16-bit result is split into high and low bytes. */
3794 nkf_char val = e2w_conv(c2, c1);
3795 c2 = (val >> 8) & 0xff;
3798 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output routine that writes a code point as bytes via o_putc,
 * in the byte order selected by output_endian.  Input that is not a
 * CLASS_UNICODE capsule is mapped with e2w_conv() first.
 * NOTE(review): the line writing the most significant byte is not visible
 * in this listing; only bits 0..23 are written by the lines shown.
 */
3807 void w_oconv32(nkf_char c2, nkf_char c1)
3810 output_bom_f = FALSE;
/* Byte 0xFF ('\377') is part of the byte-order mark in both orders. */
3811 if (output_endian == ENDIAN_LITTLE){
3812 (*o_putc)((unsigned char)'\377');
3820 (*o_putc)((unsigned char)'\377');
3829 if (c2 == ISO8859_1) {
3831 #ifdef NUMCHAR_OPTION
3832 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3836 c1 = e2w_conv(c2, c1);
/* Little endian: least significant byte first. */
3838 if (output_endian == ENDIAN_LITTLE){
3839 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3840 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3841 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
/* Other byte order: the same three bytes, most significant first. */
3845 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3846 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3847 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * e_oconv: output routine that writes (c2, c1) in 8-bit form via o_putc and
 * tracks the current state in output_mode (ASCII, JAPANESE_EUC or
 * ISO8859_1 on the visible lines).
 * NOTE(review): presumably the EUC-JP output path -- confirm.
 */
3852 void e_oconv(nkf_char c2, nkf_char c1)
3854 #ifdef NUMCHAR_OPTION
/*
 * A CLASS_UNICODE capsule is first converted with w16e_conv(); if it is
 * still a capsule afterwards, the configured encode_fallback (when set)
 * handles it.
 */
3855 if (c2 == 0 && is_unicode_capsule(c1)){
3856 w16e_conv(c1, &c2, &c1);
3857 if (c2 == 0 && is_unicode_capsule(c1)){
3858 if(encode_fallback)(*encode_fallback)(c1);
3866 } else if (c2 == 0) {
3867 output_mode = ASCII;
/* X0201: the SSO byte followed by c1 with the top bit set. */
3869 } else if (c2 == X0201) {
3870 output_mode = JAPANESE_EUC;
3871 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3872 } else if (c2 == ISO8859_1) {
3873 output_mode = ISO8859_1;
3874 (*o_putc)(c1 | 0x080);
3876 } else if (is_eucg3(c2)){
3877 output_mode = JAPANESE_EUC;
3878 #ifdef SHIFTJIS_CP932
/* Round trip through e2s_conv()/s2e_conv() when the first step succeeds. */
3881 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3882 s2e_conv(s2, s1, &c2, &c1);
3887 output_mode = ASCII;
3889 }else if (is_eucg3(c2)){
/* Only the low 7 bits of c2 are written, with the top bit forced on. */
3892 (*o_putc)((c2 & 0x7f) | 0x080);
3893 (*o_putc)(c1 | 0x080);
3896 (*o_putc)((c2 & 0x7f) | 0x080);
3897 (*o_putc)(c1 | 0x080);
/* Pairs failing nkf_isgraph() are dropped after calling set_iconv(). */
3901 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3902 set_iconv(FALSE, 0);
3903 return; /* too late to rescue this char */
3905 output_mode = JAPANESE_EUC;
3906 (*o_putc)(c2 | 0x080);
3907 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: remap a value c in 0x75..0x7f to a higher range, returning
 * the result in ret.  Two targets are visible: one starting at 0x109 and
 * one starting at 0x113.
 * NOTE(review): the conditions that choose between the two remappings are
 * not visible in this listing -- confirm against the complete source.
 */
3912 nkf_char x0212_shift(nkf_char c)
3917 if (0x75 <= c && c <= 0x7f){
3918 ret = c + (0x109 - 0x75);
3921 if (0x75 <= c && c <= 0x7f){
3922 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map a value c in 0x7f..0x92 back into the range starting
 * at 0x75, returning the result in ret.
 *   0x7f..0x88 -> 0x75..0x7e
 *   0x89..0x92 -> 0x75..0x7e with PREFIX_EUCG3 | 0x80 added
 * The value returned for other inputs is set on a line not shown.
 */
3929 nkf_char x0212_unshift(nkf_char c)
3932 if (0x7f <= c && c <= 0x88){
3933 ret = c + (0x75 - 0x7f);
3934 }else if (0x89 <= c && c <= 0x92){
3935 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
3939 #endif /* X0212_ENABLE */
/*
 * e2s_conv: convert the pair (c2, c1) and store the two result bytes
 * through p2 and p1; either pointer may be null, in which case that byte is
 * not stored.  Returns 1 on the visible failure path (c2 above 0x7F after
 * the optional x0212_shift() step).
 * NOTE(review): presumably EUC-JP to Shift_JIS; the success return and the
 * definition of ndx are not visible in this listing -- confirm.
 */
3941 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
/* Rows 0x21..0x2F: first byte computed arithmetically. */
3947 if((0x21 <= ndx && ndx <= 0x2F)){
3948 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
/*
 * Second byte: odd rows add 0x1f (c1 < 0x60) or 0x20, even rows add 0x7e.
 * The same expression is used in every branch below.
 */
3949 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/* Rows 0x6E..0x7E: first byte offset by 0xbe. */
3951 }else if(0x6E <= ndx && ndx <= 0x7E){
3952 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3953 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/* Other rows accepted by nkf_isgraph() use the x0212_shiftjis table. */
3959 else if(nkf_isgraph(ndx)){
3961 const unsigned short *ptr;
3963 extern const unsigned short *const x0212_shiftjis[];
3965 ptr = x0212_shiftjis[ndx - 0x21];
3967 val = ptr[(c1 & 0x7f) - 0x21];
3976 c2 = x0212_shift(c2);
3978 #endif /* X0212_ENABLE */
3980 if(0x7F < c2) return 1;
/* Two rows share one first byte; the base is 0x71 up to row 0x5e, else 0xb1. */
3981 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3982 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s_oconv: output routine that converts (c2, c1) with e2s_conv() and writes
 * the result via o_putc, tracking the state in output_mode (ASCII,
 * SHIFT_JIS or ISO8859_1 on the visible lines).
 */
3986 void s_oconv(nkf_char c2, nkf_char c1)
3988 #ifdef NUMCHAR_OPTION
/*
 * A CLASS_UNICODE capsule is first converted with w16e_conv(); if it is
 * still a capsule afterwards, the configured encode_fallback (when set)
 * handles it.
 */
3989 if (c2 == 0 && is_unicode_capsule(c1)){
3990 w16e_conv(c1, &c2, &c1);
3991 if (c2 == 0 && is_unicode_capsule(c1)){
3992 if(encode_fallback)(*encode_fallback)(c1);
4000 } else if (c2 == 0) {
4001 output_mode = ASCII;
4003 } else if (c2 == X0201) {
4004 output_mode = SHIFT_JIS;
4006 } else if (c2 == ISO8859_1) {
4007 output_mode = ISO8859_1;
4008 (*o_putc)(c1 | 0x080);
4010 } else if (is_eucg3(c2)){
4011 output_mode = SHIFT_JIS;
/* In-place conversion; 0 means e2s_conv() succeeded. */
4012 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* Pairs failing nkf_isprint() are dropped after calling set_iconv(). */
4018 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4019 set_iconv(FALSE, 0);
4020 return; /* too late to rescue this char */
4022 output_mode = SHIFT_JIS;
4023 e2s_conv(c2, c1, &c2, &c1);
4025 #ifdef SHIFTJIS_CP932
/*
 * cp932inv has 2 rows of 189 entries: the row index is
 * c2 - CP932INV_TABLE_BEGIN and the column index is c1 - 0x40.
 * The first half of the guarding condition is on a line not shown.
 */
4027 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4029 extern const unsigned short cp932inv[2][189];
4031 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4037 #endif /* SHIFTJIS_CP932 */
/* A non-zero prefix_table entry for c1 is written before the byte itself. */
4040 if (prefix_table[(unsigned char)c1]){
4041 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * j_oconv: output routine that writes (c2, c1) in 7-bit form via o_putc.
 * output_mode records the currently designated set (ASCII, ISO8859_1,
 * X0201, X0208, X0212, X0213_1, X0213_2 on the visible lines); when the
 * incoming character needs a different set, output_mode is updated and an
 * introducer is written before the character.  The visible introducer
 * bytes are ascii_intro, kanji_intro and the low 7 bits of the X0212 /
 * X0213_1 / X0213_2 constants; the leading escape bytes are on lines not
 * shown in this listing.
 * In the final branch, pairs with either byte outside 0x20..0x7e are
 * dropped silently.
 * NOTE(review): presumably the ISO-2022-JP output path -- confirm.
 */
4047 void j_oconv(nkf_char c2, nkf_char c1)
4049 #ifdef NUMCHAR_OPTION
/*
 * A CLASS_UNICODE capsule is first converted with w16e_conv(); if it is
 * still a capsule afterwards, the configured encode_fallback (when set)
 * handles it.
 */
4050 if (c2 == 0 && is_unicode_capsule(c1)){
4051 w16e_conv(c1, &c2, &c1);
4052 if (c2 == 0 && is_unicode_capsule(c1)){
4053 if(encode_fallback)(*encode_fallback)(c1);
/* Return to ASCII unless already in ASCII or ISO8859_1 mode. */
4059 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4062 (*o_putc)(ascii_intro);
4063 output_mode = ASCII;
4067 } else if (is_eucg3(c2)){
4069 if(output_mode!=X0213_2){
4070 output_mode = X0213_2;
4074 (*o_putc)(X0213_2&0x7F);
4077 if(output_mode!=X0212){
4078 output_mode = X0212;
4082 (*o_putc)(X0212&0x7F);
/* Only the low 7 bits of c2 are written. */
4085 (*o_putc)(c2 & 0x7f);
4088 } else if (c2==X0201) {
4089 if (output_mode!=X0201) {
4090 output_mode = X0201;
4096 } else if (c2==ISO8859_1) {
4097 /* iso8859 introduction, or 8th bit on */
4098 /* Can we convert in 7bit form using ESC-'-'-A ?
4100 output_mode = ISO8859_1;
4102 } else if (c2 == 0) {
4103 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4106 (*o_putc)(ascii_intro);
4107 output_mode = ASCII;
4111 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4113 if (output_mode!=X0213_1) {
4114 output_mode = X0213_1;
4118 (*o_putc)(X0213_1&0x7F);
4120 }else if (output_mode != X0208) {
4121 output_mode = X0208;
4124 (*o_putc)(kanji_intro);
// base64_conv: output filter stage.  Calls mime_prechar() with the pair
// (c2, c1) and then forwards the same pair to the next stage through the
// o_base64conv function pointer.
4131 void base64_conv(nkf_char c2, nkf_char c1)
4133 mime_prechar(c2, c1);
4134 (*o_base64conv)(c2,c1);
// Push-back state shared by broken_getc() and broken_ungetc():
//   broken_buf     - small stack of pending characters
//   broken_counter - number of characters currently on that stack
//   broken_last    - compared against ESC below; presumably the previously
//                    returned character (its update is not visible here)
4138 static nkf_char broken_buf[3];
4139 static int broken_counter = 0;
4140 static int broken_last = 0;
// broken_getc: character reader that inspects '$' and '(' when they are not
// preceded by ESC.  Pending characters on the stack are returned first.
// NOTE(review): presumably this repairs escape sequences whose ESC byte was
// lost; the lines that read from f and emit ESC are not visible -- confirm.
4141 nkf_char broken_getc(FILE *f)
4145 if (broken_counter>0) {
4146 return broken_buf[--broken_counter];
// '$' seen in ASCII or X0201 mode: the following '@' or 'B' is looked at.
4149 if (c=='$' && broken_last != ESC
4150 && (input_mode==ASCII || input_mode==X0201)) {
4153 if (c1=='@'|| c1=='B') {
// Stored so that c is popped before c1 (the stack is read from the top).
4154 broken_buf[0]=c1; broken_buf[1]=c;
// '(' seen in X0208 or X0201 mode: the following 'J' or 'B' is looked at.
4161 } else if (c=='(' && broken_last != ESC
4162 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4165 if (c1=='J'|| c1=='B') {
4166 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push c back onto the broken_buf stack used by
 * broken_getc().  At most two characters are accepted; when two are already
 * pending, c is not stored.  The return statement is not visible in this
 * listing.
 */
4179 nkf_char broken_ungetc(nkf_char c, FILE *f)
4181 if (broken_counter<2)
4182 broken_buf[broken_counter++]=c;
/* State kept between calls of cr_conv(); its use is on lines not shown. */
4186 static nkf_char prev_cr = 0;
/*
 * cr_conv: output filter stage that rewrites line endings according to
 * crmode_f and forwards characters through the o_crconv function pointer.
 * On the visible lines a '\n' causes a '\r' to be written when crmode_f is
 * CRLF or CR.
 * NOTE(review): whether '\n' itself is also forwarded in each mode is
 * decided on lines not visible in this listing -- confirm.
 */
4188 void cr_conv(nkf_char c2, nkf_char c1)
4192 if (! (c2==0&&c1==NL) ) {
4198 } else if (c1=='\r') {
4200 } else if (c1=='\n') {
4201 if (crmode_f==CRLF) {
4202 (*o_crconv)(0,'\r');
4203 } else if (crmode_f==CR) {
4204 (*o_crconv)(0,'\r');
/* '\032' is 0x1A; it is singled out only when crmode_f is NL. */
4208 } else if (c1!='\032' || crmode_f!=NL){
4214 Return value of fold_conv()
4216 \n add newline and output char
4217 \r add newline and output nothing
4220 1 (or else) normal output
4222 fold state in prev (previous character)
4224 >0x80 Japanese (X0208/X0201)
4229 This fold algorithm does not preserve leading space in a line.
4230 This is the main difference from fmt.
4233 #define char_size(c2,c1) (c2?2:1)
4235 void fold_conv(nkf_char c2, nkf_char c1)
4238 nkf_char fold_state;
4240 if (c1== '\r' && !fold_preserve_f) {
4241 fold_state=0; /* ignore cr */
4242 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4244 fold_state=0; /* ignore cr */
4245 } else if (c1== BS) {
4246 if (f_line>0) f_line--;
4248 } else if (c2==EOF && f_line != 0) { /* close open last line */
4250 } else if ((c1=='\n' && !fold_preserve_f)
4251 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4252 && fold_preserve_f)) {
4254 if (fold_preserve_f) {
4258 } else if ((f_prev == c1 && !fold_preserve_f)
4259 || (f_prev == '\n' && fold_preserve_f)
4260 ) { /* duplicate newline */
4263 fold_state = '\n'; /* output two newline */
4269 if (f_prev&0x80) { /* Japanese? */
4271 fold_state = 0; /* ignore given single newline */
4272 } else if (f_prev==' ') {
4276 if (++f_line<=fold_len)
4280 fold_state = '\r'; /* fold and output nothing */
4284 } else if (c1=='\f') {