1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** About UTF-8 support
31 ** This version can be used as a drop-in replacement for the previous nkf.
32 ** When it is started as, e.g., nkf -e and auto-detection judges the
33 ** input to be UTF-8, the input is converted to euc-jp as is.
35 ** It is still quite likely that bugs remain
36 ** (especially in auto-detection, mixed encodings and error handling).
38 ** If you find any problem, please report it to
39 ** E-Mail: furukawa@tcp-ip.or.jp
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.100 2006/05/06 12:40:44 naruse Exp $ */
43 #define NKF_VERSION "2.0.7"
44 #define NKF_RELEASE_DATE "2006-05-06"
48 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
49 " 2002-2006 Kono, Furukawa, Naruse, mastodon"
56 ** USAGE: nkf [flags] [file]
59 ** b Output is buffered (DEFAULT)
60 ** u Output is unbuffered
64 ** j Output code is JIS 7 bit (DEFAULT SELECT)
65 ** s Output code is MS Kanji (DEFAULT SELECT)
66 ** e Output code is AT&T JIS (DEFAULT SELECT)
67 ** w Output code is UTF-8 (DEFAULT SELECT)
68 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
70 ** m MIME conversion for ISO-2022-JP
71 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
72 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
73 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
74 ** M MIME output conversion
76 ** r {de/en}crypt ROT13/47
80 ** T Text mode output (for MS-DOS)
82 ** x Do not convert X0201 kana into X0208
83 ** Z Convert X0208 alphabet to ASCII
88 ** B try to fix broken JIS, missing Escape
89 ** B[1-9] broken level
91 ** O Output to 'nkf.out' file or last file name
92 ** d Delete \r in line feed
93 ** c Add \r in line feed
94 ** -- other long option
95 ** -- ignore following option (don't use with -O )
99 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
101 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
117 #if defined(MSDOS) || defined(__OS2__)
120 #if defined(_MSC_VER) || defined(__WATCOMC__)
121 #define mktemp _mktemp
127 #define setbinmode(fp) fsetbin(fp)
128 #elif defined(__DJGPP__)
129 #include <libc/dosio.h>
130 #define setbinmode(fp) djgpp_setbinmode(fp)
131 #else /* Microsoft C, Turbo C */
132 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
135 #define setbinmode(fp)
138 #if defined(__DJGPP__)
139 void djgpp_setbinmode(FILE *fp)
141 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
144 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
145 __file_handle_set(fd, m);
149 #ifdef _IOFBF /* SysV and MSDOS, Windows */
150 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
152 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
155 /*Borland C++ 4.5 EasyWin*/
156 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
165 /* added by satoru@isoternet.org */
167 #include <sys/types.h>
169 #include <sys/stat.h>
170 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
172 #if defined(__WATCOMC__)
173 #include <sys/utime.h>
177 #else /* defined(MSDOS) */
179 #ifdef __BORLANDC__ /* BCC32 */
181 #else /* !defined(__BORLANDC__) */
182 #include <sys/utime.h>
183 #endif /* (__BORLANDC__) */
184 #else /* !defined(__WIN32__) */
185 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
186 #include <sys/utime.h>
187 #elif defined(__TURBOC__) /* BCC */
189 #elif defined(LSI_C) /* LSI C */
190 #endif /* (__WIN32__) */
198 /* state of output_mode and input_mode
215 #define X0213_1 0x284F
216 #define X0213_2 0x2850
218 /* Input Assumption */
222 #define LATIN1_INPUT 6
224 #define STRICT_MIME 8
229 #define JAPANESE_EUC 10
233 #define UTF8_INPUT 13
234 #define UTF16BE_INPUT 14
235 #define UTF16LE_INPUT 15
/*
 * Locale-independent ASCII character classification and conversion helpers.
 *
 * Every use of a macro parameter is parenthesized so that an expression
 * argument (e.g. a conditional or bitwise expression) is classified as a
 * whole instead of being torn apart by operator precedence.
 *
 * NOTE: the parameters are still evaluated more than once; do not pass
 * arguments with side effects (c++, getc(), ...).
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
/* SPACE, TAB, CR and NL are character constants defined elsewhere in this file */
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
/* value of one hexadecimal digit; x must satisfy nkf_isxdigit(x) */
#define hex2bin(x) ( nkf_isdigit(x) ? (x) - '0' : nkf_toupper(x) - 'A' + 10)
269 #define HOLD_SIZE 1024
270 #define IOBUF_SIZE 16384
272 #define DEFAULT_J 'B'
273 #define DEFAULT_R 'B'
275 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
276 #define SJ6394 0x0161 /* 63 - 94 ku offset */
278 #define RANGE_NUM_MAX 18
283 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
284 #define sizeof_euc_to_utf8_1byte 94
285 #define sizeof_euc_to_utf8_2bytes 94
286 #define sizeof_utf8_to_euc_C2 64
287 #define sizeof_utf8_to_euc_E5B8 64
288 #define sizeof_utf8_to_euc_2bytes 112
289 #define sizeof_utf8_to_euc_3bytes 16
292 /* MIME preprocessor */
294 #ifdef EASYWIN /*Easy Win */
295 extern POINT _BufferSize;
304 void (*status_func)(struct input_code *, int);
305 int (*iconv_func)(int c2, int c1, int c0);
309 static char *input_codename = "";
312 static const char *CopyRight = COPY_RIGHT;
314 #if !defined(PERL_XS) && !defined(WIN32DLL)
315 static int noconvert(FILE *f);
317 static void module_connection(void);
318 static int kanji_convert(FILE *f);
319 static int h_conv(FILE *f,int c2,int c1);
320 static int push_hold_buf(int c2);
321 static void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0));
322 static int s_iconv(int c2,int c1,int c0);
323 static int s2e_conv(int c2, int c1, int *p2, int *p1);
324 static int e_iconv(int c2,int c1,int c0);
325 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
327 * 0: Shift_JIS, eucJP-ascii
331 #define UCS_MAP_ASCII 0
333 #define UCS_MAP_CP932 2
334 static int ms_ucs_map_f = UCS_MAP_ASCII;
336 #ifdef UTF8_INPUT_ENABLE
337 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
338 static int no_cp932ext_f = FALSE;
339 /* ignore ZERO WIDTH NO-BREAK SPACE */
340 static int ignore_zwnbsp_f = TRUE;
341 static int no_best_fit_chars_f = FALSE;
342 static int unicode_subchar = '?'; /* the regular substitution character */
343 static void nkf_each_char_to_hex(void (*f)(int c2,int c1), int c);
344 static void encode_fallback_html(int c);
345 static void encode_fallback_xml(int c);
346 static void encode_fallback_java(int c);
347 static void encode_fallback_perl(int c);
348 static void encode_fallback_subchar(int c);
349 static void (*encode_fallback)(int c) = NULL;
350 static int w2e_conv(int c2,int c1,int c0,int *p2,int *p1);
351 static int w_iconv(int c2,int c1,int c0);
352 static int w_iconv16(int c2,int c1,int c0);
353 static int unicode_to_jis_common(int c2,int c1,int c0,int *p2,int *p1);
354 static int w_iconv_common(int c1,int c0,const unsigned short *const *pp,int psize,int *p2,int *p1);
355 static void w16w_conv(int val, int *p2, int *p1, int *p0);
356 static int ww16_conv(int c2, int c1, int c0);
357 static int w16e_conv(int val,int *p2,int *p1);
359 #ifdef UTF8_OUTPUT_ENABLE
360 static int unicode_bom_f= 0; /* Output Unicode BOM */
361 static int w_oconv16_LE = 0; /* utf-16 little endian */
362 static int e2w_conv(int c2,int c1);
363 static void w_oconv(int c2,int c1);
364 static void w_oconv16(int c2,int c1);
366 static void e_oconv(int c2,int c1);
367 static int e2s_conv(int c2, int c1, int *p2, int *p1);
368 static void s_oconv(int c2,int c1);
369 static void j_oconv(int c2,int c1);
370 static void fold_conv(int c2,int c1);
371 static void cr_conv(int c2,int c1);
372 static void z_conv(int c2,int c1);
373 static void rot_conv(int c2,int c1);
374 static void hira_conv(int c2,int c1);
375 static void base64_conv(int c2,int c1);
376 static void iso2022jp_check_conv(int c2,int c1);
377 static void no_connection(int c2,int c1);
378 static int no_connection2(int c2,int c1,int c0);
380 static void code_score(struct input_code *ptr);
381 static void code_status(int c);
383 static void std_putc(int c);
384 static int std_getc(FILE *f);
385 static int std_ungetc(int c,FILE *f);
387 static int broken_getc(FILE *f);
388 static int broken_ungetc(int c,FILE *f);
390 static int mime_begin(FILE *f);
391 static int mime_getc(FILE *f);
392 static int mime_ungetc(int c,FILE *f);
394 static void switch_mime_getc(void);
395 static void unswitch_mime_getc(void);
396 static int mime_begin_strict(FILE *f);
397 static int mime_getc_buf(FILE *f);
398 static int mime_ungetc_buf(int c,FILE *f);
399 static int mime_integrity(FILE *f,const unsigned char *p);
401 static int base64decode(int c);
402 static void mime_prechar(int c2, int c1);
403 static void mime_putc(int c);
404 static void open_mime(int c);
405 static void close_mime(void);
406 static void eof_mime(void);
407 static void mimeout_addchar(int c);
409 static void usage(void);
410 static void version(void);
412 static void options(unsigned char *c);
413 #if defined(PERL_XS) || defined(WIN32DLL)
414 static void reinit(void);
419 #if !defined(PERL_XS) && !defined(WIN32DLL)
420 static unsigned char stdibuf[IOBUF_SIZE];
421 static unsigned char stdobuf[IOBUF_SIZE];
423 static unsigned char hold_buf[HOLD_SIZE*2];
424 static int hold_count;
426 /* MIME preprocessor fifo */
428 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
429 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
430 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
431 static unsigned char mime_buf[MIME_BUF_SIZE];
432 static unsigned int mime_top = 0;
433 static unsigned int mime_last = 0; /* decoded */
434 static unsigned int mime_input = 0; /* undecoded */
435 static int (*mime_iconv_back)(int c2,int c1,int c0) = NULL;
/* Conversion option flags and their start-up defaults. */
438 static int unbuf_f = FALSE; /* NOTE(review): presumably the 'u' (unbuffered output) flag -- confirm in options() */
439 static int estab_f = FALSE;
440 static int nop_f = FALSE;
441 static int binmode_f = TRUE; /* binary mode */
442 static int rot_f = FALSE; /* ROT13/47 mode */
443 static int hira_f = FALSE; /* hiragana/katakana conversion */
444 static int input_f = FALSE; /* input code (e.g. JIS_INPUT); FALSE while it is not fixed */
445 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
446 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
447 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
448 static int mimebuf_f = FALSE; /* MIME buffered input */
449 static int broken_f = FALSE; /* convert ESC-less broken JIS */
450 static int iso8859_f = FALSE; /* ISO8859 through */
451 static int mimeout_f = FALSE; /* base64 mode */
452 #if defined(MSDOS) || defined(__OS2__)
453 static int x0201_f = TRUE; /* Assume JISX0201 kana */
455 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
457 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
/*
 * Optional input filters.  Each one has an enable flag, a pair of function
 * pointers naming the getc/ungetc it reads from (std_getc/std_ungetc by
 * default) and the prototypes of the filter's own getc/ungetc.
 */
459 #ifdef UNICODE_NORMALIZATION
460 static int nfc_f = FALSE;
461 static int (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc */
462 static int (*i_nfc_ungetc)(int c ,FILE *f) = std_ungetc;
463 static int nfc_getc(FILE *f);
464 static int nfc_ungetc(int c,FILE *f);
468 static int cap_f = FALSE;
469 static int (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
470 static int (*i_cungetc)(int c ,FILE *f) = std_ungetc;
471 static int cap_getc(FILE *f);
472 static int cap_ungetc(int c,FILE *f);
474 static int url_f = FALSE;
475 static int (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
476 static int (*i_uungetc)(int c ,FILE *f) = std_ungetc;
477 static int url_getc(FILE *f);
478 static int url_ungetc(int c,FILE *f);
481 #ifdef NUMCHAR_OPTION
482 #define CLASS_MASK 0x0f000000
483 #define CLASS_UTF16 0x01000000
484 static int numchar_f = FALSE;
485 static int (*i_ngetc)(FILE *) = std_getc; /* input of ngetc */
486 static int (*i_nungetc)(int c ,FILE *f) = std_ungetc;
487 static int numchar_getc(FILE *f);
488 static int numchar_ungetc(int c,FILE *f);
492 static int noout_f = FALSE;
493 static void no_putc(int c);
494 static int debug_f = FALSE;
495 static void debug(const char *str);
496 static int (*iconv_for_check)(int c2,int c1,int c0) = 0;
499 static int guess_f = FALSE;
501 static void print_guessed_code(char *filename);
503 static void set_input_codename(char *codename);
504 static int is_inputcode_mixed = FALSE;
505 static int is_inputcode_set = FALSE;
508 static int exec_f = 0;
511 #ifdef SHIFTJIS_CP932
512 /* invert IBM extended characters to others */
513 static int cp51932_f = TRUE;
514 #define CP932_TABLE_BEGIN (0xfa)
515 #define CP932_TABLE_END (0xfc)
517 /* invert NEC-selected IBM extended characters to IBM extended characters */
518 static int cp932inv_f = TRUE;
519 #define CP932INV_TABLE_BEGIN (0xed)
520 #define CP932INV_TABLE_END (0xee)
522 /* static int cp932_conv(int c2, int c1); */
523 #endif /* SHIFTJIS_CP932 */
526 static int x0212_f = FALSE;
527 static int x0212_shift(int c);
528 static int x0212_unshift(int c);
530 static int x0213_f = FALSE;
532 static unsigned char prefix_table[256];
534 static void set_code_score(struct input_code *ptr, int score);
535 static void clr_code_score(struct input_code *ptr, int score);
536 static void status_disable(struct input_code *ptr);
537 static void status_push_ch(struct input_code *ptr, int c);
538 static void status_clear(struct input_code *ptr);
539 static void status_reset(struct input_code *ptr);
540 static void status_reinit(struct input_code *ptr);
541 static void status_check(struct input_code *ptr, int c);
542 static void e_status(struct input_code *, int);
543 static void s_status(struct input_code *, int);
545 #ifdef UTF8_INPUT_ENABLE
546 static void w_status(struct input_code *, int);
547 static void w16_status(struct input_code *, int);
548 static int utf16_mode = UTF16BE_INPUT;
551 struct input_code input_code_list[] = {
552 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
553 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
554 #ifdef UTF8_INPUT_ENABLE
555 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
556 {"UTF-16", 0, 0, 0, {0, 0, 0}, w16_status, w_iconv16, 0},
561 static int mimeout_mode = 0;
562 static int base64_count = 0;
564 /* X0208 -> ASCII converter */
567 static int f_line = 0; /* chars in line */
568 static int f_prev = 0;
569 static int fold_preserve_f = FALSE; /* preserve new lines */
570 static int fold_f = FALSE;
571 static int fold_len = 0;
574 static unsigned char kanji_intro = DEFAULT_J;
575 static unsigned char ascii_intro = DEFAULT_R;
579 #define FOLD_MARGIN 10
580 #define DEFAULT_FOLD 60
582 static int fold_margin = FOLD_MARGIN;
586 #ifdef DEFAULT_CODE_JIS
587 # define DEFAULT_CONV j_oconv
589 #ifdef DEFAULT_CODE_SJIS
590 # define DEFAULT_CONV s_oconv
592 #ifdef DEFAULT_CODE_EUC
593 # define DEFAULT_CONV e_oconv
595 #ifdef DEFAULT_CODE_UTF8
596 # define DEFAULT_CONV w_oconv
599 /* process default */
600 static void (*output_conv)(int c2,int c1) = DEFAULT_CONV;
602 static void (*oconv)(int c2,int c1) = no_connection;
603 /* s_iconv or oconv */
604 static int (*iconv)(int c2,int c1,int c0) = no_connection2;
606 static void (*o_zconv)(int c2,int c1) = no_connection;
607 static void (*o_fconv)(int c2,int c1) = no_connection;
608 static void (*o_crconv)(int c2,int c1) = no_connection;
609 static void (*o_rot_conv)(int c2,int c1) = no_connection;
610 static void (*o_hira_conv)(int c2,int c1) = no_connection;
611 static void (*o_base64conv)(int c2,int c1) = no_connection;
612 static void (*o_iso2022jp_check_conv)(int c2,int c1) = no_connection;
614 /* static redirections */
/* Function pointers for the character I/O stages; all start at the std_* routines. */
616 static void (*o_putc)(int c) = std_putc;
618 static int (*i_getc)(FILE *f) = std_getc; /* general input */
619 static int (*i_ungetc)(int c,FILE *f) =std_ungetc;
621 static int (*i_bgetc)(FILE *) = std_getc; /* input of bgetc */
622 static int (*i_bungetc)(int c ,FILE *f) = std_ungetc;
624 static void (*o_mputc)(int c) = std_putc ; /* output of mputc */
626 static int (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
627 static int (*i_mungetc)(int c ,FILE *f) = std_ungetc;
629 /* for strict mime */
630 static int (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
631 static int (*i_mungetc_buf)(int c,FILE *f) = std_ungetc;
634 static int output_mode = ASCII, /* output kanji mode */
635 input_mode = ASCII, /* input kanji mode */
636 shift_mode = FALSE; /* TRUE shift out, or X0201 */
637 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
639 /* X0201 / X0208 conversion tables */
641 /* X0201 kana conversion table */
644 unsigned char cv[]= {
645 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
646 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
647 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
648 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
649 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
650 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
651 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
652 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
653 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
654 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
655 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
656 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
657 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
658 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
659 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
660 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
664 /* X0201 kana conversion table for dakuten */
667 unsigned char dv[]= {
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
673 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
674 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
675 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
676 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
677 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
678 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
679 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
680 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
681 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
682 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
683 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
686 /* X0201 kana conversion table for han-dakuten */
689 unsigned char ev[]= {
690 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
691 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
692 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
694 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
695 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
696 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
701 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
702 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
703 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
704 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
705 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
709 /* X0208 kigou conversion table */
710 /* 0x8140 - 0x819e */
712 unsigned char fv[] = {
714 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
715 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
716 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
717 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
718 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
719 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
720 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
722 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
731 static int file_out_f = FALSE;
733 static int overwrite_f = FALSE;
734 static int preserve_time_f = FALSE;
735 static int backup_f = FALSE;
736 static char *backup_suffix = "";
737 static char *get_backup_filename(const char *suffix, const char *filename);
740 static int crmode_f = 0; /* CR, NL, CRLF */
741 #ifdef EASYWIN /*Easy Win */
742 static int end_check;
745 #define STD_GC_BUFSIZE (256)
746 int std_gc_buf[STD_GC_BUFSIZE];
750 #include "nkf32dll.c"
751 #elif defined(PERL_XS)
753 int main(int argc, char **argv)
758 char *outfname = NULL;
761 #ifdef EASYWIN /*Easy Win */
762 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
765 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
766 cp = (unsigned char *)*argv;
771 if (pipe(fds) < 0 || (pid = fork()) < 0){
782 execvp(argv[1], &argv[1]);
796 if(x0201_f == WISH_TRUE)
797 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
799 if (binmode_f == TRUE)
800 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
801 if (freopen("","wb",stdout) == NULL)
808 setbuf(stdout, (char *) NULL);
810 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
813 if (binmode_f == TRUE)
814 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
815 if (freopen("","rb",stdin) == NULL) return (-1);
819 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
823 kanji_convert(stdin);
824 if (guess_f) print_guessed_code(NULL);
829 is_inputcode_mixed = FALSE;
830 is_inputcode_set = FALSE;
835 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
844 /* reopen file for stdout */
845 if (file_out_f == TRUE) {
848 outfname = malloc(strlen(origfname)
849 + strlen(".nkftmpXXXXXX")
855 strcpy(outfname, origfname);
859 for (i = strlen(outfname); i; --i){
860 if (outfname[i - 1] == '/'
861 || outfname[i - 1] == '\\'){
867 strcat(outfname, "ntXXXXXX");
869 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
872 strcat(outfname, ".nkftmpXXXXXX");
873 fd = mkstemp(outfname);
876 || (fd_backup = dup(fileno(stdout))) < 0
877 || dup2(fd, fileno(stdout)) < 0
888 outfname = "nkf.out";
891 if(freopen(outfname, "w", stdout) == NULL) {
895 if (binmode_f == TRUE) {
896 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
897 if (freopen("","wb",stdout) == NULL)
904 if (binmode_f == TRUE)
905 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
906 if (freopen("","rb",fin) == NULL)
911 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
915 char *filename = NULL;
917 if (nfiles > 1) filename = origfname;
918 if (guess_f) print_guessed_code(filename);
924 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
932 if (dup2(fd_backup, fileno(stdout)) < 0){
935 if (stat(origfname, &sb)) {
936 fprintf(stderr, "Can't stat %s\n", origfname);
938 /* restore the permissions */
939 if (chmod(outfname, sb.st_mode)) {
940 fprintf(stderr, "Can't set permission %s\n", outfname);
943 /* restore the timestamp */
945 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
946 tb[0] = tb[1] = sb.st_mtime;
947 if (utime(outfname, tb)) {
948 fprintf(stderr, "Can't set timestamp %s\n", outfname);
951 tb.actime = sb.st_atime;
952 tb.modtime = sb.st_mtime;
953 if (utime(outfname, &tb)) {
954 fprintf(stderr, "Can't set timestamp %s\n", outfname);
959 char *backup_filename = get_backup_filename(backup_suffix, origfname);
961 unlink(backup_filename);
963 if (rename(origfname, backup_filename)) {
964 perror(backup_filename);
965 fprintf(stderr, "Can't rename %s to %s\n",
966 origfname, backup_filename);
970 if (unlink(origfname)){
975 if (rename(outfname, origfname)) {
977 fprintf(stderr, "Can't rename %s to %s\n",
978 outfname, origfname);
986 #ifdef EASYWIN /*Easy Win */
987 if (file_out_f == FALSE)
988 scanf("%d",&end_check);
991 #else /* for Other OS */
992 if (file_out_f == TRUE)
997 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename`.
 *
 * suffix   - backup pattern.  When it contains one or more '*', every '*'
 *            is replaced by `filename` and all other characters are copied
 *            literally (e.g. "*.orig").  When it contains no '*', it is
 *            appended to `filename`.
 * filename - name of the file being converted.
 *
 * Returns a NUL-terminated string obtained from malloc(); the caller owns
 * it.  Returns NULL, after reporting through perror(), when the allocation
 * fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each '*' (one byte of the pattern) expands to filename_length bytes */
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        /* room for filename, suffix and the terminating NUL */
        backup_filename = malloc(filename_length + suffix_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        memcpy(backup_filename, filename, filename_length);
        /* the + 1 copies the suffix's terminating NUL as well */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }
    return backup_filename;
}
1065 {"katakana-hiragana","h3"},
1072 #ifdef UTF8_OUTPUT_ENABLE
1082 {"fb-subchar=", ""},
1084 #ifdef UTF8_INPUT_ENABLE
1085 {"utf8-input", "W"},
1086 {"utf16-input", "W16"},
1087 {"no-cp932ext", ""},
1088 {"no-best-fit-chars",""},
1090 #ifdef UNICODE_NORMALIZATION
1091 {"utf8mac-input", ""},
1103 #ifdef NUMCHAR_OPTION
1104 {"numchar-input", ""},
1110 #ifdef SHIFTJIS_CP932
1120 static int option_mode = 0;
1122 void options(unsigned char *cp)
1126 unsigned char *cp_back = NULL;
1131 while(*cp && *cp++!='-');
1132 while (*cp || cp_back) {
1140 case '-': /* literal options */
1141 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1145 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1146 p = (unsigned char *)long_option[i].name;
1147 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1148 if (*p == cp[j] || cp[j] == ' '){
1155 while(*cp && *cp != SPACE && cp++);
1156 if (long_option[i].alias[0]){
1158 cp = (unsigned char *)long_option[i].alias;
1160 if (strcmp(long_option[i].name, "ic=") == 0){
1161 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1162 codeset[i] = nkf_toupper(p[i]);
1165 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1166 strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1167 strcmp(codeset, "CP50220") == 0 ||
1168 strcmp(codeset, "CP50221") == 0 ||
1169 strcmp(codeset, "CP50222") == 0 ||
1170 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1171 input_f = JIS_INPUT;
1172 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1173 input_f = JIS_INPUT;
1177 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1178 input_f = JIS_INPUT;
1183 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1184 input_f = SJIS_INPUT;
1185 if (x0201_f==NO_X0201) x0201_f=TRUE;
1186 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1187 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1188 strcmp(codeset, "CP932") == 0 ||
1189 strcmp(codeset, "MS932") == 0){
1190 input_f = SJIS_INPUT;
1192 #ifdef SHIFTJIS_CP932
1195 #ifdef UTF8_OUTPUT_ENABLE
1196 ms_ucs_map_f = UCS_MAP_CP932;
1198 }else if(strcmp(codeset, "EUCJP") == 0 ||
1199 strcmp(codeset, "EUC-JP") == 0){
1200 input_f = JIS_INPUT;
1201 }else if(strcmp(codeset, "CP51932") == 0){
1202 input_f = JIS_INPUT;
1204 #ifdef SHIFTJIS_CP932
1207 #ifdef UTF8_OUTPUT_ENABLE
1208 ms_ucs_map_f = UCS_MAP_CP932;
1210 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1211 strcmp(codeset, "EUCJP-MS") == 0 ||
1212 strcmp(codeset, "EUCJPMS") == 0){
1213 input_f = JIS_INPUT;
1215 #ifdef SHIFTJIS_CP932
1218 #ifdef UTF8_OUTPUT_ENABLE
1219 ms_ucs_map_f = UCS_MAP_MS;
1221 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1222 strcmp(codeset, "EUCJP-ASCII") == 0){
1223 input_f = JIS_INPUT;
1225 #ifdef SHIFTJIS_CP932
1228 #ifdef UTF8_OUTPUT_ENABLE
1229 ms_ucs_map_f = UCS_MAP_ASCII;
1231 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1232 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1233 input_f = SJIS_INPUT;
1235 #ifdef SHIFTJIS_CP932
1239 if (x0201_f==NO_X0201) x0201_f=TRUE;
1240 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1241 strcmp(codeset, "EUC-JIS-2004") == 0){
1242 input_f = JIS_INPUT;
1245 #ifdef SHIFTJIS_CP932
1249 #ifdef UTF8_INPUT_ENABLE
1250 }else if(strcmp(codeset, "UTF-8") == 0 ||
1251 strcmp(codeset, "UTF-8N") == 0 ||
1252 strcmp(codeset, "UTF-8-BOM") == 0){
1253 input_f = UTF8_INPUT;
1254 #ifdef UNICODE_NORMALIZATION
1255 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1256 strcmp(codeset, "UTF-8-MAC") == 0){
1257 input_f = UTF8_INPUT;
1260 }else if(strcmp(codeset, "UTF-16") == 0){
1261 input_f = UTF16BE_INPUT;
1262 utf16_mode = UTF16BE_INPUT;
1263 }else if(strcmp(codeset, "UTF-16BE") == 0 ||
1264 strcmp(codeset, "UTF-16BE-BOM") == 0){
1265 input_f = UTF16BE_INPUT;
1266 utf16_mode = UTF16BE_INPUT;
1267 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1268 strcmp(codeset, "UTF-16LE-BOM") == 0){
1269 input_f = UTF16LE_INPUT;
1270 utf16_mode = UTF16LE_INPUT;
1275 if (strcmp(long_option[i].name, "oc=") == 0){
1276 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1277 codeset[i] = nkf_toupper(p[i]);
1280 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1281 strcmp(codeset, "CP50220") == 0){
1282 output_conv = j_oconv;
1283 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1284 output_conv = j_oconv;
1285 no_cp932ext_f = TRUE;
1286 }else if(strcmp(codeset, "CP50221") == 0 ||
1287 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1288 output_conv = j_oconv;
1290 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1291 output_conv = j_oconv;
1295 #ifdef SHIFTJIS_CP932
1298 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1299 output_conv = j_oconv;
1304 #ifdef SHIFTJIS_CP932
1307 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1308 output_conv = j_oconv;
1313 #ifdef SHIFTJIS_CP932
1316 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1317 output_conv = s_oconv;
1318 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1319 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1320 strcmp(codeset, "CP932") == 0 ||
1321 strcmp(codeset, "MS932") == 0){
1322 output_conv = s_oconv;
1324 #ifdef SHIFTJIS_CP932
1328 #ifdef UTF8_OUTPUT_ENABLE
1329 ms_ucs_map_f = UCS_MAP_CP932;
1331 }else if(strcmp(codeset, "EUCJP") == 0 ||
1332 strcmp(codeset, "EUC-JP") == 0){
1333 output_conv = e_oconv;
1334 }else if(strcmp(codeset, "CP51932") == 0){
1335 output_conv = e_oconv;
1337 #ifdef SHIFTJIS_CP932
1340 #ifdef UTF8_OUTPUT_ENABLE
1341 ms_ucs_map_f = UCS_MAP_CP932;
1343 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1344 strcmp(codeset, "EUCJP-MS") == 0 ||
1345 strcmp(codeset, "EUCJPMS") == 0){
1346 output_conv = e_oconv;
1351 #ifdef SHIFTJIS_CP932
1354 #ifdef UTF8_OUTPUT_ENABLE
1355 ms_ucs_map_f = UCS_MAP_MS;
1357 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1358 strcmp(codeset, "EUCJP-ASCII") == 0){
1359 output_conv = e_oconv;
1364 #ifdef SHIFTJIS_CP932
1367 #ifdef UTF8_OUTPUT_ENABLE
1368 ms_ucs_map_f = UCS_MAP_ASCII;
1370 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1371 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1372 output_conv = s_oconv;
1374 #ifdef SHIFTJIS_CP932
1377 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1378 strcmp(codeset, "EUC-JIS-2004") == 0){
1379 output_conv = e_oconv;
1384 #ifdef SHIFTJIS_CP932
1387 #ifdef UTF8_OUTPUT_ENABLE
1388 }else if(strcmp(codeset, "UTF-8") == 0){
1389 output_conv = w_oconv;
1390 }else if(strcmp(codeset, "UTF-8N") == 0){
1391 output_conv = w_oconv;
1393 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1394 output_conv = w_oconv;
1396 }else if(strcmp(codeset, "UTF-16BE") == 0){
1397 output_conv = w_oconv16;
1399 }else if(strcmp(codeset, "UTF-16") == 0 ||
1400 strcmp(codeset, "UTF-16BE-BOM") == 0){
1401 output_conv = w_oconv16;
1403 }else if(strcmp(codeset, "UTF-16LE") == 0){
1404 output_conv = w_oconv16;
1407 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1408 output_conv = w_oconv16;
1416 if (strcmp(long_option[i].name, "overwrite") == 0){
1419 preserve_time_f = TRUE;
1422 if (strcmp(long_option[i].name, "overwrite=") == 0){
1425 preserve_time_f = TRUE;
1427 backup_suffix = malloc(strlen((char *) p) + 1);
1428 strcpy(backup_suffix, (char *) p);
1431 if (strcmp(long_option[i].name, "in-place") == 0){
1434 preserve_time_f = FALSE;
1437 if (strcmp(long_option[i].name, "in-place=") == 0){
1440 preserve_time_f = FALSE;
1442 backup_suffix = malloc(strlen((char *) p) + 1);
1443 strcpy(backup_suffix, (char *) p);
1448 if (strcmp(long_option[i].name, "cap-input") == 0){
1452 if (strcmp(long_option[i].name, "url-input") == 0){
1457 #ifdef NUMCHAR_OPTION
1458 if (strcmp(long_option[i].name, "numchar-input") == 0){
1464 if (strcmp(long_option[i].name, "no-output") == 0){
1468 if (strcmp(long_option[i].name, "debug") == 0){
1473 if (strcmp(long_option[i].name, "cp932") == 0){
1474 #ifdef SHIFTJIS_CP932
1478 #ifdef UTF8_OUTPUT_ENABLE
1479 ms_ucs_map_f = UCS_MAP_CP932;
1483 if (strcmp(long_option[i].name, "no-cp932") == 0){
1484 #ifdef SHIFTJIS_CP932
1488 #ifdef UTF8_OUTPUT_ENABLE
1489 ms_ucs_map_f = UCS_MAP_ASCII;
1493 #ifdef SHIFTJIS_CP932
1494 if (strcmp(long_option[i].name, "cp932inv") == 0){
1501 if (strcmp(long_option[i].name, "x0212") == 0){
1508 if (strcmp(long_option[i].name, "exec-in") == 0){
1512 if (strcmp(long_option[i].name, "exec-out") == 0){
1517 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1518 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1519 no_cp932ext_f = TRUE;
1522 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1523 no_best_fit_chars_f = TRUE;
1526 if (strcmp(long_option[i].name, "fb-skip") == 0){
1527 encode_fallback = NULL;
1530 if (strcmp(long_option[i].name, "fb-html") == 0){
1531 encode_fallback = encode_fallback_html;
1534 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1535 encode_fallback = encode_fallback_xml;
1538 if (strcmp(long_option[i].name, "fb-java") == 0){
1539 encode_fallback = encode_fallback_java;
1542 if (strcmp(long_option[i].name, "fb-perl") == 0){
1543 encode_fallback = encode_fallback_perl;
1546 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1547 encode_fallback = encode_fallback_subchar;
1550 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1551 encode_fallback = encode_fallback_subchar;
1552 unicode_subchar = 0;
1554 /* decimal number */
1555 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1556 unicode_subchar *= 10;
1557 unicode_subchar += hex2bin(p[i]);
1559 }else if(p[1] == 'x' || p[1] == 'X'){
1560 /* hexadecimal number */
1561 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1562 unicode_subchar <<= 4;
1563 unicode_subchar |= hex2bin(p[i]);
1567 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1568 unicode_subchar *= 8;
1569 unicode_subchar += hex2bin(p[i]);
1572 w16e_conv(unicode_subchar, &i, &j);
1573 unicode_subchar = i<<8 | j;
1577 #ifdef UTF8_OUTPUT_ENABLE
1578 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1579 ms_ucs_map_f = UCS_MAP_MS;
1583 #ifdef UNICODE_NORMALIZATION
1584 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1585 input_f = UTF8_INPUT;
1590 if (strcmp(long_option[i].name, "prefix=") == 0){
1591 if (' ' < p[0] && p[0] < 128){
1592 for (i = 1; ' ' < p[i] && p[i] < 128; i++){
1593 prefix_table[p[i]] = p[0];
1600 case 'b': /* buffered mode */
1603 case 'u': /* non bufferd mode */
1606 case 't': /* transparent mode */
1611 } else if (*cp=='2') {
1615 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1623 case 'j': /* JIS output */
1625 output_conv = j_oconv;
1627 case 'e': /* AT&T EUC output */
1628 output_conv = e_oconv;
1630 case 's': /* SJIS output */
1631 output_conv = s_oconv;
1633 case 'l': /* ISO8859 Latin-1 support, no conversion */
1634 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1635 input_f = LATIN1_INPUT;
1637 case 'i': /* Kanji IN ESC-$-@/B */
1638 if (*cp=='@'||*cp=='B')
1639 kanji_intro = *cp++;
1641 case 'o': /* ASCII IN ESC-(-J/B */
1642 if (*cp=='J'||*cp=='B'||*cp=='H')
1643 ascii_intro = *cp++;
1647 bit:1 katakana->hiragana
1648 bit:2 hiragana->katakana
1650 if ('9'>= *cp && *cp>='0')
1651 hira_f |= (*cp++ -'0');
1658 #if defined(MSDOS) || defined(__OS2__)
1673 #ifdef UTF8_OUTPUT_ENABLE
1674 case 'w': /* UTF-8 output */
1675 if ('1'== cp[0] && '6'==cp[1]) {
1676 output_conv = w_oconv16; cp+=2;
1678 unicode_bom_f=2; cp++;
1681 unicode_bom_f=1; cp++;
1683 } else if (cp[0] == 'B') {
1684 unicode_bom_f=2; cp++;
1686 unicode_bom_f=1; cp++;
1689 } else if (cp[0] == '8') {
1690 output_conv = w_oconv; cp++;
1693 unicode_bom_f=1; cp++;
1696 output_conv = w_oconv;
1699 #ifdef UTF8_INPUT_ENABLE
1700 case 'W': /* UTF-8 input */
1701 if ('1'== cp[0] && '6'==cp[1]) {
1702 input_f = UTF16BE_INPUT;
1703 utf16_mode = UTF16BE_INPUT;
1707 input_f = UTF16LE_INPUT;
1708 utf16_mode = UTF16LE_INPUT;
1709 } else if (cp[0] == 'B') {
1711 input_f = UTF16BE_INPUT;
1712 utf16_mode = UTF16BE_INPUT;
1714 } else if (cp[0] == '8') {
1716 input_f = UTF8_INPUT;
1718 input_f = UTF8_INPUT;
1721 /* Input code assumption */
1722 case 'J': /* JIS input */
1723 case 'E': /* AT&T EUC input */
1724 input_f = JIS_INPUT;
1726 case 'S': /* MS Kanji input */
1727 input_f = SJIS_INPUT;
1728 if (x0201_f==NO_X0201) x0201_f=TRUE;
1730 case 'Z': /* Convert X0208 alphabet to asii */
1731 /* bit:0 Convert X0208
1732 bit:1 Convert Kankaku to one space
1733 bit:2 Convert Kankaku to two spaces
1734 bit:3 Convert HTML Entity
1736 if ('9'>= *cp && *cp>='0')
1737 alpha_f |= 1<<(*cp++ -'0');
1741 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1742 x0201_f = FALSE; /* No X0201->X0208 conversion */
1744 ESC-(-I in JIS, EUC, MS Kanji
1745 SI/SO in JIS, EUC, MS Kanji
1746 SSO in EUC, JIS, not in MS Kanji
1747 MS Kanji (0xa0-0xdf)
1749 ESC-(-I in JIS (0x20-0x5f)
1750 SSO in EUC (0xa0-0xdf)
1751 0xa0-0xd in MS Kanji (0xa0-0xdf)
1754 case 'X': /* Assume X0201 kana */
1755 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1758 case 'F': /* prserve new lines */
1759 fold_preserve_f = TRUE;
1760 case 'f': /* folding -f60 or -f */
1763 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1765 fold_len += *cp++ - '0';
1767 if (!(0<fold_len && fold_len<BUFSIZ))
1768 fold_len = DEFAULT_FOLD;
1772 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1774 fold_margin += *cp++ - '0';
1778 case 'm': /* MIME support */
1779 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1780 if (*cp=='B'||*cp=='Q') {
1781 mime_decode_mode = *cp++;
1782 mimebuf_f = FIXED_MIME;
1783 } else if (*cp=='N') {
1784 mime_f = TRUE; cp++;
1785 } else if (*cp=='S') {
1786 mime_f = STRICT_MIME; cp++;
1787 } else if (*cp=='0') {
1788 mime_decode_f = FALSE;
1789 mime_f = FALSE; cp++;
1792 case 'M': /* MIME output */
1795 mimeout_f = FIXED_MIME; cp++;
1796 } else if (*cp=='Q') {
1798 mimeout_f = FIXED_MIME; cp++;
1803 case 'B': /* Broken JIS support */
1805 bit:1 allow any x on ESC-(-x or ESC-$-x
1806 bit:2 reset to ascii on NL
1808 if ('9'>= *cp && *cp>='0')
1809 broken_f |= 1<<(*cp++ -'0');
1814 case 'O':/* for Output file */
1818 case 'c':/* add cr code */
1821 case 'd':/* delete cr code */
1824 case 'I': /* ISO-2022-JP output */
1827 case 'L': /* line mode */
1828 if (*cp=='u') { /* unix */
1829 crmode_f = NL; cp++;
1830 } else if (*cp=='m') { /* mac */
1831 crmode_f = CR; cp++;
1832 } else if (*cp=='w') { /* windows */
1833 crmode_f = CRLF; cp++;
1834 } else if (*cp=='0') { /* no conversion */
1844 /* module: multiple options in one string are allowed for the Perl module */
1845 while(*cp && *cp++!='-');
1848 /* bogus option but ignored */
/* Compares the given converter function against p->iconv_func, starting at
 * input_code_list.  Both an ANSI prototype and a K&R-style header are provided.
 * NOTE(review): the loop and the return statements are not visible in this
 * excerpt; presumably the matching entry (or NULL) is returned - confirm. */
1854 #ifdef ANSI_C_PROTOTYPE
1855 struct input_code * find_inputcode_byfunc(int (*iconv_func)(int c2,int c1,int c0))
1857 struct input_code * find_inputcode_byfunc(iconv_func)
1858 int (*iconv_func)();
1862 struct input_code *p = input_code_list;
1864 if (iconv_func == p->iconv_func){
/* Visible behaviour: under INPUT_CODE_FIX a condition additionally requires
 * f == -TRUE or input_f being unset.  When estab_f is set and the current
 * iconv differs from iconv_for_check, the input_code entry for iconv is looked
 * up, its name is passed to set_input_codename() and input_codename to debug();
 * iconv_for_check is then updated to iconv.
 * NOTE(review): the assignments to estab_f / iconv are not visible here. */
1873 void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
1875 #ifdef INPUT_CODE_FIX
1883 #ifdef INPUT_CODE_FIX
1884 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1890 if (estab_f && iconv_for_check != iconv){
1891 struct input_code *p = find_inputcode_byfunc(iconv);
1893 set_input_codename(p->name);
1894 debug(input_codename);
1896 iconv_for_check = iconv;
/* Score bit flags: each macro is the previous one shifted left by one bit. */
1901 #define SCORE_L2 (1) /* JIS level-2 kanji */
1902 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1903 #define SCORE_DEPEND (SCORE_KANA << 1) /* machine-dependent characters */
1904 #ifdef SHIFTJIS_CP932
1905 #define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932 remapping (character re-read via CP932) */
1906 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* non-existent character */
1908 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* non-existent character */
1910 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1911 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1913 #define SCORE_INIT (SCORE_iMIME)
/* Score table read by code_score() as score_table_A0[c2 & 0x0f] in its
 * (c2 & 0x70) == 0x20 branch.  Only part of the initializer is visible here. */
1915 const int score_table_A0[] = {
1918 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1919 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/* Score table read by code_score() as score_table_F0[c2 & 0x0f] in its
 * (c2 & 0x70) == 0x70 branch. */
1922 const int score_table_F0[] = {
1923 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1924 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1925 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1926 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* Sets the given SCORE_* bit(s) in ptr->score (bitwise OR). */
1929 void set_code_score(struct input_code *ptr, int score)
1932 ptr->score |= score;
/* Clears the given SCORE_* bit(s) in ptr->score (AND with the complement). */
1936 void clr_code_score(struct input_code *ptr, int score)
1939 ptr->score &= ~score;
/* Classifies the character held in ptr->buf (c2 = buf[0]; c1 = buf[1] when
 * UTF8_OUTPUT_ENABLE is defined) and records one SCORE_* flag through
 * set_code_score(): SCORE_KANA for SSO, SCORE_NO_EXIST when e2w_conv() yields
 * 0, a table-driven score for the 0x20 and 0x70 rows of (c2 & 0x70), and
 * SCORE_L2 for rows >= 0x50.
 * NOTE(review): the condition guarding the SCORE_ERROR case is not visible. */
1943 void code_score(struct input_code *ptr)
1945 int c2 = ptr->buf[0];
1946 #ifdef UTF8_OUTPUT_ENABLE
1947 int c1 = ptr->buf[1];
1950 set_code_score(ptr, SCORE_ERROR);
1951 }else if (c2 == SSO){
1952 set_code_score(ptr, SCORE_KANA);
1953 #ifdef UTF8_OUTPUT_ENABLE
1954 }else if (!e2w_conv(c2, c1)){
1955 set_code_score(ptr, SCORE_NO_EXIST);
1957 }else if ((c2 & 0x70) == 0x20){
1958 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1959 }else if ((c2 & 0x70) == 0x70){
1960 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1961 }else if ((c2 & 0x70) >= 0x50){
1962 set_code_score(ptr, SCORE_L2);
/* Visible behaviour: if the global iconv currently points at this entry's
 * converter, it is reset with set_iconv(FALSE, 0).
 * NOTE(review): other state changes made by this function are not visible. */
1966 void status_disable(struct input_code *ptr)
1971 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Appends c to ptr->buf at ptr->index and advances the index. */
1974 void status_push_ch(struct input_code *ptr, int c)
1976 ptr->buf[ptr->index++] = c;
1979 void status_clear(struct input_code *ptr)
/* Visible behaviour: restores ptr->score to SCORE_INIT. */
1985 void status_reset(struct input_code *ptr)
1988 ptr->score = SCORE_INIT;
/* Visible behaviour: clears ptr->_file_stat to 0. */
1991 void status_reinit(struct input_code *ptr)
1994 ptr->_file_stat = 0;
/* Acts only when c <= DEL and an input code is already established (estab_f).
 * NOTE(review): the action taken inside the condition is not visible here. */
1997 void status_check(struct input_code *ptr, int c)
1999 if (c <= DEL && estab_f){
/* Detector state handler; presumably for Shift_JIS, given the s2e_conv() calls.
 * Visible behaviour: calls status_check(); for 0xa1..0xdf pushes SSO followed
 * by c; for 0x81..0x9f / 0xe0..0xef (plus the CP932 table range and, with
 * x0212_f, 0xf0..0xfc) pushes c; a following byte in 0x40..0x7e or 0x80..0xfc
 * is pushed and the pair is converted in place with s2e_conv(); a zero result
 * in the CP932 path sets SCORE_CP932; unexpected bytes call status_disable().
 * NOTE(review): the switch/case skeleton and stat updates are not visible. */
2004 void s_status(struct input_code *ptr, int c)
2008 status_check(ptr, c);
2013 #ifdef NUMCHAR_OPTION
2014 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2017 }else if (0xa1 <= c && c <= 0xdf){
2018 status_push_ch(ptr, SSO);
2019 status_push_ch(ptr, c);
2022 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2024 status_push_ch(ptr, c);
2025 #ifdef SHIFTJIS_CP932
2027 && CP932_TABLE_BEGIN <= c && c <= CP932_TABLE_END){
2029 status_push_ch(ptr, c);
2030 #endif /* SHIFTJIS_CP932 */
2032 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2034 status_push_ch(ptr, c);
2035 #endif /* X0212_ENABLE */
2037 status_disable(ptr);
2041 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2042 status_push_ch(ptr, c);
2043 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2047 status_disable(ptr);
2051 #ifdef SHIFTJIS_CP932
2052 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2053 status_push_ch(ptr, c);
2054 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2055 set_code_score(ptr, SCORE_CP932);
2060 #endif /* SHIFTJIS_CP932 */
2061 #ifndef X0212_ENABLE
2062 status_disable(ptr);
/* Detector state handler; presumably for EUC-JP, judging by the byte ranges.
 * Visible behaviour: calls status_check(); pushes c when it is SSO or in
 * 0xa1..0xfe, and also when it is 0x8f (X0212 section); later bytes in
 * 0xa1..0xfe are pushed; other bytes call status_disable().
 * NOTE(review): the switch/case skeleton and stat updates are not visible. */
2068 void e_status(struct input_code *ptr, int c)
2072 status_check(ptr, c);
2077 #ifdef NUMCHAR_OPTION
2078 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2081 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2083 status_push_ch(ptr, c);
2085 }else if (0x8f == c){
2087 status_push_ch(ptr, c);
2088 #endif /* X0212_ENABLE */
2090 status_disable(ptr);
2094 if (0xa1 <= c && c <= 0xfe){
2095 status_push_ch(ptr, c);
2099 status_disable(ptr);
2104 if (0xa1 <= c && c <= 0xfe){
2106 status_push_ch(ptr, c);
2108 status_disable(ptr);
2110 #endif /* X0212_ENABLE */
2114 #ifdef UTF8_INPUT_ENABLE
/* Detector state handler; presumably for UTF-16, given the 0xfe/0xff checks.
 * ptr->_file_stat is used as a three-way state: 0 = nothing decided yet
 * (a byte of 0xfe or 0xff is pushed and the state becomes 1; the other visible
 * path disables the detector and sets the state to -1), > 0 = bytes are
 * pushed, < 0 = the detector is disabled.
 * NOTE(review): the enclosing switch on ptr->stat is not visible here. */
2115 void w16_status(struct input_code *ptr, int c)
2121 if (ptr->_file_stat == 0){
2122 if (c == 0xfe || c == 0xff){
2124 status_push_ch(ptr, c);
2125 ptr->_file_stat = 1;
2127 status_disable(ptr);
2128 ptr->_file_stat = -1;
2130 }else if (ptr->_file_stat > 0){
2132 status_push_ch(ptr, c);
2133 }else if (ptr->_file_stat < 0){
2134 status_disable(ptr);
2140 status_disable(ptr);
2141 ptr->_file_stat = -1;
2143 status_push_ch(ptr, c);
2150 if (ptr->stat != c && (c == 0xfe || c == 0xff)){
2151 status_push_ch(ptr, c);
2154 status_disable(ptr);
2155 ptr->_file_stat = -1;
/* Detector state handler; presumably for UTF-8, judging by the byte ranges.
 * Visible behaviour: calls status_check(); lead bytes 0xc0..0xdf and
 * 0xe0..0xef are pushed; bytes in 0x80..0xbf are pushed and, once ptr->index
 * exceeds ptr->stat, the sequence is converted in place with w2e_conv() after
 * noting in `bom` whether it was EF BB BF; other bytes call status_disable().
 * NOTE(review): how `bom` is used afterwards is not visible in this excerpt. */
2161 void w_status(struct input_code *ptr, int c)
2165 status_check(ptr, c);
2170 #ifdef NUMCHAR_OPTION
2171 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2174 }else if (0xc0 <= c && c <= 0xdf){
2176 status_push_ch(ptr, c);
2177 }else if (0xe0 <= c && c <= 0xef){
2179 status_push_ch(ptr, c);
2181 status_disable(ptr);
2186 if (0x80 <= c && c <= 0xbf){
2187 status_push_ch(ptr, c);
2188 if (ptr->index > ptr->stat){
2189 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2190 && ptr->buf[2] == 0xbf);
2191 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2192 &ptr->buf[0], &ptr->buf[1]);
2199 status_disable(ptr);
/* Feeds byte c to detector entries via p->status_func(p, c), starting at
 * input_code_list.  If a result entry was chosen and no input code is
 * established yet (!estab_f), its converter is installed with
 * set_iconv(TRUE, ...); otherwise, for c <= DEL, the list is walked again.
 * NOTE(review): the loops and the use of action_flag are not visible here. */
2206 void code_status(int c)
2208 int action_flag = 1;
2209 struct input_code *result = 0;
2210 struct input_code *p = input_code_list;
2212 (p->status_func)(p, c);
2215 }else if(p->stat == 0){
2226 if (result && !estab_f){
2227 set_iconv(TRUE, result->iconv_func);
2228 }else if (c <= DEL){
2229 struct input_code *ptr = input_code_list;
/* Visible path: returns the most recently pushed-back byte from std_gc_buf,
 * decrementing std_gc_ndx first.
 * NOTE(review): the guard and the normal read path are not visible here. */
2239 int std_getc(FILE *f)
2242 return std_gc_buf[--std_gc_ndx];
/* Pushes c onto the std_gc_buf push-back stack and advances std_gc_ndx.
 * The std_gc_ndx == STD_GC_BUFSIZE test detects a full buffer.
 * NOTE(review): the full-buffer handling and return value are not visible. */
2248 int std_ungetc(int c, FILE *f)
2250 if (std_gc_ndx == STD_GC_BUFSIZE){
2253 std_gc_buf[std_gc_ndx++] = c;
2258 void std_putc(int c)
2265 #if !defined(PERL_XS) && !defined(WIN32DLL)
/* Sets up the filter chain with module_connection() and then reads from f
 * through i_getc until EOF.
 * NOTE(review): the loop body and return value are not visible here. */
2266 int noconvert(FILE *f)
2271 module_connection();
2272 while ((c = (*i_getc)(f)) != EOF)
/* Wires up the processing chains.
 * Output side: starts from oconv = output_conv and wraps it with optional
 * filters; each wrap saves the current oconv in an o_* pointer and replaces
 * oconv with the filter (base64, cr, rot, iso2022jp check, hira, fold, z).
 * Input side: i_getc / i_ungetc are wrapped the same way (cap, url, numchar,
 * nfc, mime, broken).
 * Finally the input converter is chosen from input_f with set_iconv(); a
 * fixed input code is forced with -TRUE.
 * NOTE(review): most of the conditions guarding each wrap are not visible. */
2279 void module_connection(void)
2281 oconv = output_conv;
2284 /* replace continuation module, from output side */
2286 /* output redirection */
2288 if (noout_f || guess_f){
2295 if (mimeout_f == TRUE) {
2296 o_base64conv = oconv; oconv = base64_conv;
2298 /* base64_count = 0; */
2302 o_crconv = oconv; oconv = cr_conv;
2305 o_rot_conv = oconv; oconv = rot_conv;
2308 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2311 o_hira_conv = oconv; oconv = hira_conv;
2314 o_fconv = oconv; oconv = fold_conv;
2317 if (alpha_f || x0201_f) {
2318 o_zconv = oconv; oconv = z_conv;
2322 i_ungetc = std_ungetc;
2323 /* input redirection */
2326 i_cgetc = i_getc; i_getc = cap_getc;
2327 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2330 i_ugetc = i_getc; i_getc = url_getc;
2331 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2334 #ifdef NUMCHAR_OPTION
2336 i_ngetc = i_getc; i_getc = numchar_getc;
2337 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2340 #ifdef UNICODE_NORMALIZATION
2341 if (nfc_f && input_f == UTF8_INPUT){
2342 i_nfc_getc = i_getc; i_getc = nfc_getc;
2343 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2346 if (mime_f && mimebuf_f==FIXED_MIME) {
2347 i_mgetc = i_getc; i_getc = mime_getc;
2348 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2351 i_bgetc = i_getc; i_getc = broken_getc;
2352 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2354 if (input_f == JIS_INPUT || input_f == LATIN1_INPUT) {
2355 set_iconv(-TRUE, e_iconv);
2356 } else if (input_f == SJIS_INPUT) {
2357 set_iconv(-TRUE, s_iconv);
2358 #ifdef UTF8_INPUT_ENABLE
2359 } else if (input_f == UTF8_INPUT) {
2360 set_iconv(-TRUE, w_iconv);
2361 } else if (input_f == UTF16BE_INPUT) {
2362 set_iconv(-TRUE, w_iconv16);
2363 } else if (input_f == UTF16LE_INPUT) {
2364 set_iconv(-TRUE, w_iconv16);
2367 set_iconv(FALSE, e_iconv);
2371 struct input_code *p = input_code_list;
2379 Conversion main loop. Code detection only.
/* Main conversion loop.  Calls module_connection(), then reads bytes from f
 * through i_getc until EOF, tracking input_mode / shift_mode, handling ESC
 * designation sequences, SI/SO, MIME "=?" starts and line-end cases, and
 * passing characters to iconv / oconv.  While no code is established, 8-bit
 * input is routed through h_conv().  After the loop, iconv is called with
 * EOF and, when is_inputcode_set is false, the detector entry with the lowest
 * score supplies the input code name.
 * NOTE(review): large parts of the control flow are not visible here. */
2382 int kanji_convert(FILE *f)
2386 int is_8bit = FALSE;
2388 module_connection();
2391 if(input_f == SJIS_INPUT
2392 #ifdef UTF8_INPUT_ENABLE
2393 || input_f == UTF8_INPUT || input_f == UTF16BE_INPUT || input_f == UTF16LE_INPUT
2401 output_mode = ASCII;
2404 #define NEXT continue /* no output, get next */
2405 #define SEND ; /* output c1 and c2, get next */
2406 #define LAST break /* end of loop, go closing */
2408 while ((c1 = (*i_getc)(f)) != EOF) {
2409 #ifdef INPUT_CODE_FIX
2416 /* in case of 8th bit is on */
2417 if (!estab_f&&!mime_decode_mode) {
2418 /* in case of not established yet */
2419 /* It is still ambiguous */
2420 if (h_conv(f, c2, c1)==EOF)
2426 /* in case of already established */
2428 /* ignore bogus code */
2434 /* second byte, 7 bit code */
2435 /* it might be kanji shifted */
2436 if ((c1 == DEL) || (c1 <= SPACE)) {
2437 /* ignore bogus first code */
2445 #ifdef UTF8_INPUT_ENABLE
2454 #ifdef NUMCHAR_OPTION
2455 } else if ((c1 & CLASS_MASK) == CLASS_UTF16){
2458 } else if (c1 > DEL) {
2460 if (!estab_f && !iso8859_f) {
2461 /* not established yet */
2462 if (!is_8bit) is_8bit = TRUE;
2465 } else { /* estab_f==TRUE */
2470 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2471 /* SJIS X0201 Case... */
2472 if(iso2022jp_f && x0201_f==NO_X0201) {
2473 (*oconv)(GETA1, GETA2);
2480 } else if (c1==SSO && iconv != s_iconv) {
2481 /* EUC X0201 Case */
2482 c1 = (*i_getc)(f); /* skip SSO */
2484 if (SSP<=c1 && c1<0xe0) {
2485 if(iso2022jp_f && x0201_f==NO_X0201) {
2486 (*oconv)(GETA1, GETA2);
2493 } else { /* bogus code, skip SSO and one byte */
2497 /* already established */
2502 } else if ((c1 > SPACE) && (c1 != DEL)) {
2503 /* in case of Roman characters */
2505 /* output 1 shifted byte */
2509 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2510 /* output 1 shifted byte */
2511 if(iso2022jp_f && x0201_f==NO_X0201) {
2512 (*oconv)(GETA1, GETA2);
2519 /* look like bogus code */
2522 } else if (input_mode == X0208 || input_mode == X0212 ||
2523 input_mode == X0213_1 || input_mode == X0213_2) {
2524 /* in case of Kanji shifted */
2527 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2528 /* Check MIME code */
2529 if ((c1 = (*i_getc)(f)) == EOF) {
2532 } else if (c1 == '?') {
2533 /* =? is mime conversion start sequence */
2534 if(mime_f == STRICT_MIME) {
2535 /* check in real detail */
2536 if (mime_begin_strict(f) == EOF)
2540 } else if (mime_begin(f) == EOF)
2550 /* normal ASCII code */
2553 } else if (!is_8bit && c1 == SI) {
2556 } else if (!is_8bit && c1 == SO) {
2559 } else if (!is_8bit && c1 == ESC ) {
2560 if ((c1 = (*i_getc)(f)) == EOF) {
2561 /* (*oconv)(0, ESC); don't send bogus code */
2563 } else if (c1 == '$') {
2564 if ((c1 = (*i_getc)(f)) == EOF) {
2566 (*oconv)(0, ESC); don't send bogus code
2567 (*oconv)(0, '$'); */
2569 } else if (c1 == '@'|| c1 == 'B') {
2570 /* This is kanji introduction */
2573 set_input_codename("ISO-2022-JP");
2575 debug(input_codename);
2578 } else if (c1 == '(') {
2579 if ((c1 = (*i_getc)(f)) == EOF) {
2580 /* don't send bogus code
2586 } else if (c1 == '@'|| c1 == 'B') {
2587 /* This is kanji introduction */
2592 } else if (c1 == 'D'){
2596 #endif /* X0212_ENABLE */
2597 } else if (c1 == (X0213_1&0x7F)){
2598 input_mode = X0213_1;
2601 } else if (c1 == (X0213_2&0x7F)){
2602 input_mode = X0213_2;
2606 /* could be some special code */
2613 } else if (broken_f&0x2) {
2614 /* accept any ESC-(-x as broken code ... */
2624 } else if (c1 == '(') {
2625 if ((c1 = (*i_getc)(f)) == EOF) {
2626 /* don't send bogus code
2628 (*oconv)(0, '('); */
2632 /* This is X0201 kana introduction */
2633 input_mode = X0201; shift_mode = X0201;
2635 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2636 /* This is X0208 kanji introduction */
2637 input_mode = ASCII; shift_mode = FALSE;
2639 } else if (broken_f&0x2) {
2640 input_mode = ASCII; shift_mode = FALSE;
2645 /* maintain various input_mode here */
2649 } else if ( c1 == 'N' || c1 == 'n' ){
2651 c3 = (*i_getc)(f); /* skip SS2 */
2652 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2667 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2668 input_mode = ASCII; set_iconv(FALSE, 0);
2670 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2671 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2679 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2680 if ((c1=(*i_getc)(f))!=EOF) {
2684 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2702 if ((*iconv)(c2, c1, 0) < 0){ /* can be EUC/SJIS */
2703 int c0 = (*i_getc)(f);
2706 (*iconv)(c2, c1, c0);
2712 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2716 (*oconv)((0x8f << 8) | c2, c1);
2718 #endif /* X0212_ENABLE */
2720 (*oconv)((0x8f << 8) | c2, c1);
2723 (*oconv)(input_mode, c1); /* other special case */
2728 /* goto next_word */
2732 (*iconv)(EOF, 0, 0);
2733 if (!is_inputcode_set)
2736 struct input_code *p = input_code_list;
2737 struct input_code *result = p;
2739 if (p->score < result->score) result = p;
2742 set_input_codename(result->name);
/* Look-ahead used while the input code is still undecided.  Reads bytes from
 * f via i_getc and stores them with push_hold_buf() until EOF, the hold
 * buffer is full, or estab_f becomes set.  The detector entry with the lowest
 * score is then installed through set_iconv(FALSE, ...), and the held bytes
 * are replayed from hold_buf through iconv (two bytes, or three when the
 * two-byte call returns a negative value).
 * NOTE(review): the return value and several branches are not visible. */
2749 h_conv(FILE *f, int c2, int c1)
2754 /** it must NOT be in the kanji shift sequence */
2755 /** it must NOT be written in JIS7 */
2756 /** and it must be after 2 byte 8bit code */
2762 while ((c1 = (*i_getc)(f)) != EOF) {
2768 if (push_hold_buf(c1) == EOF || estab_f){
2774 struct input_code *p = input_code_list;
2775 struct input_code *result = p;
2780 if (p->score < result->score){
2785 set_iconv(FALSE, result->iconv_func);
2790 ** 1) EOF is detected, or
2791 ** 2) Code is established, or
2792 ** 3) Buffer is FULL (but last word is pushed)
2794 ** in 1) and 3) cases, we continue to use
2795 ** Kanji codes by oconv and leave estab_f unchanged.
2800 while (wc < hold_count){
2801 c2 = hold_buf[wc++];
2803 #ifdef NUMCHAR_OPTION
2804 || (c2 & CLASS_MASK) == CLASS_UTF16
2809 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2810 (*iconv)(X0201, c2, 0);
2813 if (wc < hold_count){
2814 c1 = hold_buf[wc++];
2823 if ((*iconv)(c2, c1, 0) < 0){
2825 if (wc < hold_count){
2826 c0 = hold_buf[wc++];
2835 (*iconv)(c2, c1, c0);
/* Appends (unsigned char)c2 to hold_buf.  Returns EOF once hold_count has
 * reached HOLD_SIZE*2 after the store, otherwise the new hold_count.
 * NOTE(review): the action for an already-full buffer is not visible here. */
2844 push_hold_buf(int c2)
2846 if (hold_count >= HOLD_SIZE*2)
2848 hold_buf[hold_count++] = (unsigned char)c2;
2849 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/* Maps the byte pair (c2, c1); presumably Shift_JIS to EUC-style codes, given
 * the name.  Visible steps: with cp51932_f, lead bytes in the CP932 table
 * range are looked up in shiftjis_cp932; without x0213_f, lead bytes
 * 0xfa..0xfc are looked up in shiftjis_x0212 and tagged with the 0x8f plane
 * prefix; with x0213_f, lead bytes >= 0xF0 are mapped to 0x8Fxx rows;
 * otherwise c2 is computed arithmetically (SJ0162 / SJ6394 offsets, +1 when
 * c1 > 0x9E) and c1 is rebased.
 * NOTE(review): the stores through p2/p1 and the return value are not
 * visible in this excerpt. */
2852 int s2e_conv(int c2, int c1, int *p2, int *p1)
2854 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
2857 static const int shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
2858 #ifdef SHIFTJIS_CP932
2859 if (cp51932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
2860 extern const unsigned short shiftjis_cp932[3][189];
2861 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
2867 #endif /* SHIFTJIS_CP932 */
2869 if (!x0213_f && 0xfa <= c2 && c2 <= 0xfc){
2870 extern const unsigned short shiftjis_x0212[3][189];
2871 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
2874 c2 = (0x8f << 8) | (val >> 8);
2887 if(x0213_f && c2 >= 0xF0){
2888 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
2889 c2 = 0x8F20 + shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
2890 }else{ /* 78<=k<=94 */
2891 c2 = 0x8F00 | (c2 * 2 - 0x17B);
2892 if (0x9E < c1) c2++;
2895 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
2896 if (0x9E < c1) c2++;
2899 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
2906 c2 = x0212_unshift(c2);
/* Input converter built on s2e_conv().  EOF, 0 and values below SPACE in c2
 * take a separate branch; otherwise the pair is converted in place and a
 * non-zero s2e_conv() result is returned to the caller.
 * NOTE(review): the first branch and the final output call are not visible. */
2913 int s_iconv(int c2, int c1, int c0)
2917 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2920 int ret = s2e_conv(c2, c1, &c2, &c1);
2921 if (ret) return ret;
/* Input converter with branches for c2 == 0x8f (the value is combined as
 * (c2 << 8) | (c1 & 0x7f); under SHIFTJIS_CP932 it is round-tripped through
 * e2s_conv() / s2e_conv()), for c2 == SSO, and for EOF / 0 / control values.
 * NOTE(review): the first branch and the output calls are not visible. */
2927 int e_iconv(int c2, int c1, int c0)
2932 }else if (c2 == 0x8f){
2936 c2 = (c2 << 8) | (c1 & 0x7f);
2938 #ifdef SHIFTJIS_CP932
2941 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2942 s2e_conv(s2, s1, &c2, &c1);
2943 if ((c2 & 0xff00) == 0){
2949 #endif /* SHIFTJIS_CP932 */
2950 #endif /* X0212_ENABLE */
2951 } else if (c2 == SSO){
2954 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2964 #ifdef UTF8_INPUT_ENABLE
/* Converts a UTF-8 byte sequence (c2, c1, c0).  Lead bytes 0xc0..0xef go
 * through unicode_to_jis_common(); under NUMCHAR_OPTION a visible path stores
 * CLASS_UTF16 | ww16_conv(c2, c1, c0) through p1 when p1 is non-NULL.
 * NOTE(review): the first branch, the condition for the NUMCHAR path and the
 * return statement are not visible here. */
2965 int w2e_conv(int c2, int c1, int c0, int *p2, int *p1)
2972 }else if (0xc0 <= c2 && c2 <= 0xef) {
2973 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2974 #ifdef NUMCHAR_OPTION
2977 if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
/* UTF-8 input converter.  While ignore_zwnbsp_f is set, the flag is cleared
 * and a leading EF BB BF (U+FEFF) is recognised.  The lead byte c2 is then
 * classified: 0 = single byte (c1 masked to 7 bits); 0xc0..0xdf = two bytes
 * (over-long C0/C1 leads or an invalid trail byte return 0); 0xe0..0xef
 * returns -1 when the third byte is still needed; 0xf0 and above, and stray
 * trail bytes, return 0.  Three-byte sequences are range-checked per lead
 * byte before w2e_conv() is called.
 * NOTE(review): several conditions and the final output call are not visible. */
2985 int w_iconv(int c2, int c1, int c0)
2989 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
2990 if(ignore_zwnbsp_f){
2991 ignore_zwnbsp_f = FALSE;
2992 if(c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
2996 if (c2 == 0) /* 0x00-0x7f */
2997 c1 &= 0x7F; /* 1byte */
2999 if ((c2 & 0xe0) == 0xc0){ /* 0xc0-0xdf */
3001 if((c2 & 0xFE) == 0xC0 || c1 < 0x80 || 0xBF < c1) return 0;
3002 }else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
3003 return -1; /* 3bytes */
3005 else if (0xf0 <= c2)
3006 return 0; /* 4,5,6bytes */
3007 else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
3008 return 0; /* trail byte */
3012 /* must be 3bytes */
3014 if(c1 < 0xA0 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
3016 }else if(c2 == 0xED){
3017 if(c1 < 0x80 || 0x9F < c1 || c0 < 0x80 || 0xBF < c0)
3019 }else if((c2 & 0xf0) == 0xe0){
3020 if(c1 < 0x80 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
3024 if (c2 == 0 || c2 == EOF){
3026 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3035 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/* Encodes the code point val as UTF-8 bytes written through p2, p1 and p0.
 * Visible branches: values below 0x800 produce a two-byte sequence
 * (110xxxxx 10xxxxxx); the remaining branch produces a three-byte sequence
 * (1110xxxx 10xxxxxx 10xxxxxx).
 * NOTE(review): the first (presumably single-byte) branch is not visible. */
3036 void w16w_conv(int val, int *p2, int *p1, int *p0)
3043 }else if (val < 0x800){
3044 *p2 = 0xc0 | (val >> 6);
3045 *p1 = 0x80 | (val & 0x3f);
3048 *p2 = 0xe0 | (val >> 12);
3049 *p1 = 0x80 | ((val >> 6) & 0x3f);
3050 *p0 = 0x80 | (val & 0x3f);
3055 #ifdef UTF8_INPUT_ENABLE
/* Decodes UTF-8 bytes (c2, c1, c0) into a code point in val.  For a lead
 * byte >= 0xe0 the low 4 bits of c2 and 6 bits of c1 are combined; for a lead
 * byte >= 0xc0 the low 5 bits of c2 form the upper part.
 * NOTE(review): the remaining bit merges and the return are not visible. */
3056 int ww16_conv(int c2, int c1, int c0)
3061 }else if (c2 >= 0xe0){
3062 val = (c2 & 0x0f) << 12;
3063 val |= (c1 & 0x3f) << 6;
3065 }else if (c2 >= 0xc0){
3066 val = (c2 & 0x1f) << 6;
/* Converts the code point val by first encoding it to UTF-8 with w16w_conv()
 * and then calling unicode_to_jis_common().  Under NUMCHAR_OPTION a visible
 * path stores CLASS_UTF16 | val through p1.
 * NOTE(review): the conditions around these steps are not visible here. */
3074 int w16e_conv(int val, int *p2, int *p1)
3083 w16w_conv(val, &c2, &c1, &c0);
3084 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3085 #ifdef NUMCHAR_OPTION
3088 *p1 = CLASS_UTF16 | val;
3097 #ifdef UTF8_INPUT_ENABLE
/* UTF-16 input converter.  While ignore_zwnbsp_f is set, the flag is cleared
 * and a byte-order mark selects the mode: FE FF -> UTF16BE_INPUT, FF FE ->
 * UTF16LE_INPUT.  In little-endian mode c1 and c2 are swapped.  ASCII and EOF
 * take one branch, (c2 >> 3) == 27 marks a surrogate, and everything else is
 * converted with w16e_conv(); a non-zero result is returned to the caller.
 * NOTE(review): the bodies of the ASCII/EOF and surrogate branches and the
 * final output call are not visible here. */
3098 int w_iconv16(int c2, int c1, int c0)
3102 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
3103 if(ignore_zwnbsp_f){
3104 ignore_zwnbsp_f = FALSE;
3105 if (c2==0376 && c1==0377){
3106 utf16_mode = UTF16BE_INPUT;
3108 }else if(c2==0377 && c1==0376){
3109 utf16_mode = UTF16LE_INPUT;
3113 if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
3115 tmp=c1; c1=c2; c2=tmp;
3117 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3120 }else if((c2>>3)==27){ /* surrogate pair */
3122 }else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
3123 if (ret) return ret;
/* Converts a UTF-8 sequence (c2, c1, c0) by table lookup, storing the result
 * through p2/p1.  The table set is chosen from ms_ucs_map_f (CP932, MS, or
 * the default).  When no_best_fit_chars_f is set, selected code points are
 * rejected early with return value 1, using the no_best_fit_chars_table_*
 * arrays (indexed by c1 & 0x3F) for two-byte sequences and explicit byte
 * tests for three-byte sequences.  Two-byte sequences (c2 < 0xe0) are looked
 * up via w_iconv_common(c2, c1, ...); three-byte ones via
 * w_iconv_common(c1, c0, ppp[c2 - 0xE0], ...).  Under SHIFTJIS_CP932 with
 * cp51932_f, a result in the 0x8f plane is round-tripped through e2s_conv() /
 * s2e_conv().
 * NOTE(review): several guarding conditions and the return are not visible. */
3128 int unicode_to_jis_common(int c2, int c1, int c0, int *p2, int *p1)
3130 extern const unsigned short *const utf8_to_euc_2bytes[];
3131 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3132 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3133 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3134 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3135 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3136 const unsigned short *const *pp;
3137 const unsigned short *const *const *ppp;
3138 static const int no_best_fit_chars_table_C2[] =
3139 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3140 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3141 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3142 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3143 static const int no_best_fit_chars_table_C2_ms[] =
3144 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3145 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3146 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3147 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3148 static const int no_best_fit_chars_table_932_C2[] =
3149 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3150 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3151 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3152 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3153 static const int no_best_fit_chars_table_932_C3[] =
3154 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3155 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3157 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3163 }else if(c2 < 0xe0){
3164 if(no_best_fit_chars_f){
3165 if(ms_ucs_map_f == UCS_MAP_CP932){
3168 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3171 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3174 }else if(cp51932_f){
3177 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3180 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3183 }else if(ms_ucs_map_f == UCS_MAP_MS){
3184 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3188 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3189 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3191 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3193 if(no_best_fit_chars_f){
3194 if(ms_ucs_map_f == UCS_MAP_CP932){
3195 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3196 }else if(ms_ucs_map_f == UCS_MAP_MS){
3201 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3204 if(c0 == 0x92) return 1;
3209 if(c1 == 0x80 || c0 == 0x9C) return 1;
3217 if(c0 == 0x95) return 1;
3220 if(c0 == 0xA5) return 1;
3227 if(c0 == 0x8D) return 1;
3230 if(c0 == 0x9E && cp51932_f) return 1;
3233 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3241 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3242 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3244 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3246 #ifdef SHIFTJIS_CP932
3247 if (!ret && cp51932_f && (*p2 & 0xff00) >> 8 == 0x8f) {
3249 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3250 s2e_conv(s2, s1, p2, p1);
/* Two-level table lookup with bounds checks.  Returns 1 (no mapping) when pp
 * is NULL, c1 is outside [0, psize), the selected row is NULL, c0 is outside
 * [0, sizeof_utf8_to_euc_C2), or the table value is 0.  With no_cp932ext_f
 * set, values whose high byte is 0x2D or that exceed 0xF300 are filtered.
 * A resulting c2 equal to SO is replaced by X0201.
 * NOTE(review): the index adjustments, the stores through p2/p1 and the
 * success return are not visible in this excerpt. */
3259 int w_iconv_common(int c1, int c0, const unsigned short *const *pp, int psize, int *p2, int *p1)
3262 const unsigned short *p;
3265 if (pp == 0) return 1;
3268 if (c1 < 0 || psize <= c1) return 1;
3270 if (p == 0) return 1;
3273 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3275 if (val == 0) return 1;
3276 if (no_cp932ext_f && (
3277 (val>>8) == 0x2D || /* NEC special characters */
3278 val > 0xF300 /* NEC special characters */
3286 if (c2 == SO) c2 = X0201;
/* Emits c as uppercase hexadecimal digits, one nibble per call to the
 * callback f as f(0, digit); `shift` selects the nibble.
 * NOTE(review): the loop controlling `shift` is not visible here. */
3293 void nkf_each_char_to_hex(void (*f)(int c2,int c1), int c)
3295 const char *hex = "0123456789ABCDEF";
3301 (*f)(0, hex[(c>>shift)&0xF]);
/* Fallback emitter that writes the decimal digits of c through oconv, from
 * the millions digit down to the units digit (0x30 is ASCII '0').
 * NOTE(review): the surrounding prefix/suffix output and the conditions that
 * presumably suppress leading zeros are not visible in this excerpt. */
3311 void encode_fallback_html(int c)
3317 (*oconv)(0, 0x30+(c/1000000)%10);
3319 (*oconv)(0, 0x30+(c/100000 )%10);
3321 (*oconv)(0, 0x30+(c/10000 )%10);
3323 (*oconv)(0, 0x30+(c/1000 )%10);
3325 (*oconv)(0, 0x30+(c/100 )%10);
3327 (*oconv)(0, 0x30+(c/10 )%10);
3329 (*oconv)(0, 0x30+ c %10);
/* Fallback emitter that writes c as hexadecimal digits through oconv using
 * nkf_each_char_to_hex().
 * NOTE(review): the prefix/suffix characters emitted around the digits are
 * not visible in this excerpt. */
3334 void encode_fallback_xml(int c)
3339 nkf_each_char_to_hex(oconv, c);
/* Fallback emitter that writes uppercase hex nibbles of c through oconv.
 * The test (c & 0x00FFFFFF) > 0xFFFF distinguishes values wider than 16 bits;
 * the nibbles at bits 20 and 16 and those at bits 12, 8, 4 and 0 are emitted
 * by the visible calls.
 * NOTE(review): the escape prefix and the exact branch structure are not
 * visible in this excerpt. */
3344 void encode_fallback_java(int c)
3346 const char *hex = "0123456789ABCDEF";
3348 if((c&0x00FFFFFF) > 0xFFFF){
3352 (*oconv)(0, hex[(c>>20)&0xF]);
3353 (*oconv)(0, hex[(c>>16)&0xF]);
3357 (*oconv)(0, hex[(c>>12)&0xF]);
3358 (*oconv)(0, hex[(c>> 8)&0xF]);
3359 (*oconv)(0, hex[(c>> 4)&0xF]);
3360 (*oconv)(0, hex[ c &0xF]);
/* Fallback emitter that writes c as hexadecimal digits through oconv using
 * nkf_each_char_to_hex().
 * NOTE(review): the prefix/suffix characters emitted around the digits are
 * not visible in this excerpt. */
3364 void encode_fallback_perl(int c)
3369 nkf_each_char_to_hex(oconv, c);
3374 void encode_fallback_subchar(int c)
3376 c = unicode_subchar;
3377 (*oconv)((c>>8)&0xFF, c&0xFF);
3382 #ifdef UTF8_OUTPUT_ENABLE
3383 int e2w_conv(int c2, int c1)
3385 extern const unsigned short euc_to_utf8_1byte[];
3386 extern const unsigned short *const euc_to_utf8_2bytes[];
3387 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3388 extern const unsigned short *const x0212_to_utf8_2bytes[];
3389 const unsigned short *p;
3392 p = euc_to_utf8_1byte;
3394 } else if (c2 >> 8 == 0x8f){
3395 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == 0x8F22 && c1 == 0x43){
3398 c2 = (c2&0x7f) - 0x21;
3399 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3400 p = x0212_to_utf8_2bytes[c2];
3406 c2 = (c2&0x7f) - 0x21;
3407 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3408 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3413 c1 = (c1 & 0x7f) - 0x21;
3414 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3419 void w_oconv(int c2, int c1)
3428 if (unicode_bom_f==2) {
3435 #ifdef NUMCHAR_OPTION
3436 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3437 w16w_conv(c1, &c2, &c1, &c0);
3441 if (c0) (*o_putc)(c0);
3448 output_mode = ASCII;
3450 } else if (c2 == ISO8859_1) {
3451 output_mode = ISO8859_1;
3452 (*o_putc)(c1 | 0x080);
3455 val = e2w_conv(c2, c1);
3457 w16w_conv(val, &c2, &c1, &c0);
3461 if (c0) (*o_putc)(c0);
3467 void w_oconv16(int c2, int c1)
3474 if (unicode_bom_f==2) {
3476 (*o_putc)((unsigned char)'\377');
3480 (*o_putc)((unsigned char)'\377');
3485 if (c2 == ISO8859_1) {
3488 #ifdef NUMCHAR_OPTION
3489 } else if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16) {
3490 c2 = (c1 >> 8) & 0xff;
3494 int val = e2w_conv(c2, c1);
3495 c2 = (val >> 8) & 0xff;
3509 void e_oconv(int c2, int c1)
3511 #ifdef NUMCHAR_OPTION
3512 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3513 w16e_conv(c1, &c2, &c1);
3514 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3515 if(encode_fallback)(*encode_fallback)(c1);
3523 } else if (c2 == 0) {
3524 output_mode = ASCII;
3526 } else if (c2 == X0201) {
3527 output_mode = JAPANESE_EUC;
3528 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3529 } else if (c2 == ISO8859_1) {
3530 output_mode = ISO8859_1;
3531 (*o_putc)(c1 | 0x080);
3533 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3534 output_mode = JAPANESE_EUC;
3535 #ifdef SHIFTJIS_CP932
3538 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3539 s2e_conv(s2, s1, &c2, &c1);
3544 output_mode = ASCII;
3546 }else if ((c2 & 0xff00) >> 8 == 0x8f){
3549 (*o_putc)((c2 & 0x7f) | 0x080);
3550 (*o_putc)(c1 | 0x080);
3553 (*o_putc)((c2 & 0x7f) | 0x080);
3554 (*o_putc)(c1 | 0x080);
3558 if ((c1<0x21 || 0x7e<c1) ||
3559 (c2<0x21 || 0x7e<c2)) {
3560 set_iconv(FALSE, 0);
3561 return; /* too late to rescue this char */
3563 output_mode = JAPANESE_EUC;
3564 (*o_putc)(c2 | 0x080);
3565 (*o_putc)(c1 | 0x080);
3570 int x0212_shift(int c)
3574 if ((ret & 0xff00) == 0x8f00){
3575 if (0x75 <= c && c <= 0x7f){
3576 ret = c + (0x109 - 0x75);
3579 if (0x75 <= c && c <= 0x7f){
3580 ret = c + (0x113 - 0x75);
3587 int x0212_unshift(int c)
3590 if (0x7f <= c && c <= 0x88){
3591 ret = c + (0x75 - 0x7f);
3592 }else if (0x89 <= c && c <= 0x92){
3593 ret = (0x8f << 8) | 0x80 | (c + (0x75 - 0x89));
3597 #endif /* X0212_ENABLE */
3599 int e2s_conv(int c2, int c1, int *p2, int *p1)
3602 if ((c2 & 0xff00) == 0x8f00){
3605 if((0x21 <= ndx && ndx <= 0x2F)){
3606 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3607 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3609 }else if(0x6E <= ndx && ndx <= 0x7E){
3610 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3611 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3617 else if(0x21 <= ndx && ndx <= 0x7e){
3619 const unsigned short *ptr;
3620 extern const unsigned short *const x0212_shiftjis[];
3621 ptr = x0212_shiftjis[ndx - 0x21];
3623 val = ptr[(c1 & 0x7f) - 0x21];
3632 c2 = x0212_shift(c2);
3634 #endif /* X0212_ENABLE */
3636 if(0x7F < c2) return 1;
3637 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3638 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3642 void s_oconv(int c2, int c1)
3644 #ifdef NUMCHAR_OPTION
3645 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3646 w16e_conv(c1, &c2, &c1);
3647 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3648 if(encode_fallback)(*encode_fallback)(c1);
3656 } else if (c2 == 0) {
3657 output_mode = ASCII;
3659 } else if (c2 == X0201) {
3660 output_mode = SHIFT_JIS;
3662 } else if (c2 == ISO8859_1) {
3663 output_mode = ISO8859_1;
3664 (*o_putc)(c1 | 0x080);
3666 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3667 output_mode = SHIFT_JIS;
3668 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3674 if ((c1<0x20 || 0x7e<c1) ||
3675 (c2<0x20 || 0x7e<c2)) {
3676 set_iconv(FALSE, 0);
3677 return; /* too late to rescue this char */
3679 output_mode = SHIFT_JIS;
3680 e2s_conv(c2, c1, &c2, &c1);
3682 #ifdef SHIFTJIS_CP932
3684 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3685 extern const unsigned short cp932inv[2][189];
3686 int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3692 #endif /* SHIFTJIS_CP932 */
3695 if (prefix_table[(unsigned char)c1]){
3696 (*o_putc)(prefix_table[(unsigned char)c1]);
3702 void j_oconv(int c2, int c1)
3704 #ifdef NUMCHAR_OPTION
3705 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3706 w16e_conv(c1, &c2, &c1);
3707 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3708 if(encode_fallback)(*encode_fallback)(c1);
3714 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3717 (*o_putc)(ascii_intro);
3718 output_mode = ASCII;
3722 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3724 if(output_mode!=X0213_2){
3725 output_mode = X0213_2;
3729 (*o_putc)(X0213_2&0x7F);
3732 if(output_mode!=X0212){
3733 output_mode = X0212;
3737 (*o_putc)(X0212&0x7F);
3740 (*o_putc)(c2 & 0x7f);
3743 } else if (c2==X0201) {
3744 if (output_mode!=X0201) {
3745 output_mode = X0201;
3751 } else if (c2==ISO8859_1) {
3752 /* iso8859 introduction, or 8th bit on */
3753 /* Can we convert in 7bit form using ESC-'-'-A ?
3755 output_mode = ISO8859_1;
3757 } else if (c2 == 0) {
3758 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3761 (*o_putc)(ascii_intro);
3762 output_mode = ASCII;
3766 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
3768 if (output_mode!=X0213_1) {
3769 output_mode = X0213_1;
3773 (*o_putc)(X0213_1&0x7F);
3775 }else if (output_mode != X0208) {
3776 output_mode = X0208;
3779 (*o_putc)(kanji_intro);
3786 void base64_conv(int c2, int c1)
3788 mime_prechar(c2, c1);
3789 (*o_base64conv)(c2,c1);
3793 static int broken_buf[3];
3794 static int broken_counter = 0;
3795 static int broken_last = 0;
3796 int broken_getc(FILE *f)
3800 if (broken_counter>0) {
3801 return broken_buf[--broken_counter];
3804 if (c=='$' && broken_last != ESC
3805 && (input_mode==ASCII || input_mode==X0201)) {
3808 if (c1=='@'|| c1=='B') {
3809 broken_buf[0]=c1; broken_buf[1]=c;
3816 } else if (c=='(' && broken_last != ESC
3817 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
3820 if (c1=='J'|| c1=='B') {
3821 broken_buf[0]=c1; broken_buf[1]=c;
3834 int broken_ungetc(int c, FILE *f)
3836 if (broken_counter<2)
3837 broken_buf[broken_counter++]=c;
3841 static int prev_cr = 0;
3843 void cr_conv(int c2, int c1)
3847 if (! (c2==0&&c1==NL) ) {
3853 } else if (c1=='\r') {
3855 } else if (c1=='\n') {
3856 if (crmode_f==CRLF) {
3857 (*o_crconv)(0,'\r');
3858 } else if (crmode_f==CR) {
3859 (*o_crconv)(0,'\r');
3863 } else if (c1!='\032' || crmode_f!=NL){
3869 Return value of fold_conv()
3871 \n add newline and output char
3872 \r add newline and output nothing
3875 1 (or else) normal output
3877 fold state in prev (previous character)
3879 >0x80 Japanese (X0208/X0201)
3884 This fold algorithm does not preserve leading space in a line.
3885 This is the main difference from fmt.
3888 #define char_size(c2,c1) (c2?2:1)
3890 void fold_conv(int c2, int c1)
3895 if (c1== '\r' && !fold_preserve_f) {
3896 fold_state=0; /* ignore cr */
3897 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
3899 fold_state=0; /* ignore cr */
3900 } else if (c1== BS) {
3901 if (f_line>0) f_line--;
3903 } else if (c2==EOF && f_line != 0) { /* close open last line */
3905 } else if ((c1=='\n' && !fold_preserve_f)
3906 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
3907 && fold_preserve_f)) {
3909 if (fold_preserve_f) {
3913 } else if ((f_prev == c1 && !fold_preserve_f)
3914 || (f_prev == '\n' && fold_preserve_f)
3915 ) { /* duplicate newline */
3918 fold_state = '\n'; /* output two newline */
3924 if (f_prev&0x80) { /* Japanese? */
3926 fold_state = 0; /* ignore given single newline */
3927 } else if (f_prev==' ') {
3931 if (++f_line<=fold_len)
3935 fold_state = '\r'; /* fold and output nothing */
3939 } else if (c1=='\f') {
3942 fold_state = '\n'; /* output newline and clear */
3943 } else if ( (c2==0 && c1==' ')||
3944 (c2==0 && c1=='\t')||
3945 (c2=='!'&& c1=='!')) {
3946 /* X0208 kankaku or ascii space */
3947 if (f_prev == ' ') {
3948 fold_state = 0; /* remove duplicate spaces */
3951 if (++f_line<=fold_len)
3952 fold_state = ' '; /* output ASCII space only */
3954 f_prev = ' '; f_line = 0;
3955 fold_state = '\r'; /* fold and output nothing */
3959 prev0 = f_prev; /* we still need this one... , but almost done */
3961 if (c2 || c2==X0201)
3962 f_prev |= 0x80; /* this is Japanese */
3963 f_line += char_size(c2,c1);
3964 if (f_line<=fold_len) { /* normal case */
3967 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3968 f_line = char_size(c2,c1);
3969 fold_state = '\n'; /* We can't wait, do fold now */
3970 } else if (c2==X0201) {
3971 /* simple kinsoku rules return 1 means no folding */
3972 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3973 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3974 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3975 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3976 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3977 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3978 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3980 fold_state = '\n';/* add one new f_line before this character */
3983 fold_state = '\n';/* add one new f_line before this character */
3986 /* kinsoku point in ASCII */
3987 if ( c1==')'|| /* { [ ( */
3998 /* just after special */
3999 } else if (!is_alnum(prev0)) {
4000 f_line = char_size(c2,c1);
4002 } else if ((prev0==' ') || /* ignored new f_line */
4003 (prev0=='\n')|| /* ignored new f_line */
4004 (prev0&0x80)) { /* X0208 - ASCII */
4005 f_line = char_size(c2,c1);
4006 fold_state = '\n';/* add one new f_line before this character */
4008 fold_state = 1; /* default no fold in ASCII */
4012 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4013 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4014 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4015 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4016 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4017 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4018 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4019 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4020 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4021 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4022 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4023 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4024 /* default no fold in kinsoku */
4027 f_line = char_size(c2,c1);
4028 /* add one new f_line before this character */
4031 f_line = char_size(c2,c1);
4033 /* add one new f_line before this character */
4038 /* terminator process */
4039 switch(fold_state) {
4058 int z_prev2=0,z_prev1=0;
4060 void z_conv(int c2, int c1)
4063 /* if (c2) c1 &= 0x7f; assertion */
4065 if (x0201_f && z_prev2==X0201) { /* X0201 */
4066 if (c1==(0xde&0x7f)) { /*
\e$BByE@
\e(B */
4068 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4070 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /*
\e$BH>ByE@
\e(B */
4072 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4076 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4085 if (x0201_f && c2==X0201) {
4086 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4087 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4088 z_prev1 = c1; z_prev2 = c2;
4091 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4096 /* JISX0208 Alphabet */
4097 if (alpha_f && c2 == 0x23 ) {
4099 } else if (alpha_f && c2 == 0x21 ) {
4100 /* JISX0208 Kigou */
4105 } else if (alpha_f&0x4) {
4110 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4116 case '>': entity = ">"; break;
4117 case '<': entity = "<"; break;
4118 case '\"': entity = """; break;
4119 case '&': entity = "&"; break;
4122 while (*entity) (*o_zconv)(0, *entity++);
4132 #define rot13(c) ( \
4134 (c <= 'M') ? (c + 13): \
4135 (c <= 'Z') ? (c - 13): \
4137 (c <= 'm') ? (c + 13): \
4138 (c <= 'z') ? (c - 13): \
4142 #define rot47(c) ( \
4144 ( c <= 'O' ) ? (c + 47) : \
4145 ( c <= '~' ) ? (c - 47) : \
4149 void rot_conv(int c2, int c1)
4151 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4157 (*o_rot_conv)(c2,c1);
4160 void hira_conv(int c2, int c1)
4164 if (0x20 < c1 && c1 < 0x74) {
4166 (*o_hira_conv)(c2,c1);
4168 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4170 c1 = CLASS_UTF16 | 0x3094;
4171 (*o_hira_conv)(c2,c1);
4174 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4176 (*o_hira_conv)(c2,c1);
4181 if (c2 == 0 && c1 == (CLASS_UTF16 | 0x3094)) {
4184 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4186 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4190 (*o_hira_conv)(c2,c1);
4194 void iso2022jp_check_conv(int c2, int c1)
4196 static const int range[RANGE_NUM_MAX][2] = {
4219 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4223 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4228 for (i = 0; i < RANGE_NUM_MAX; i++) {
4229 start = range[i][0];
4232 if (c >= start && c <= end) {
4237 (*o_iso2022jp_check_conv)(c2,c1);
4241 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4243 const unsigned char *mime_pattern[] = {
4244 (const unsigned char *)"\075?EUC-JP?B?",
4245 (const unsigned char *)"\075?SHIFT_JIS?B?",
4246 (const unsigned char *)"\075?ISO-8859-1?Q?",
4247 (const unsigned char *)"\075?ISO-8859-1?B?",
4248 (const unsigned char *)"\075?ISO-2022-JP?B?",
4249 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4250 #if defined(UTF8_INPUT_ENABLE)
4251 (const unsigned char *)"\075?UTF-8?B?",
4252 (const unsigned char *)"\075?UTF-8?Q?",
4254 (const unsigned char *)"\075?US-ASCII?Q?",
4259 /*
\e$B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u
\e(B */
4260 int (*mime_priority_func[])(int c2, int c1, int c0) = {
4261 e_iconv, s_iconv, 0, 0, 0, 0,
4262 #if defined(UTF8_INPUT_ENABLE)
4268 const int mime_encode[] = {
4269 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4270 #if defined(UTF8_INPUT_ENABLE)
4277 const int mime_encode_method[] = {
4278 'B', 'B','Q', 'B', 'B', 'Q',
4279 #if defined(UTF8_INPUT_ENABLE)
4287 #define MAXRECOVER 20
4289 void switch_mime_getc(void)
4291 if (i_getc!=mime_getc) {
4292 i_mgetc = i_getc; i_getc = mime_getc;
4293 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4294 if(mime_f==STRICT_MIME) {
4295 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4296 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4301 void unswitch_mime_getc(void)
4303 if(mime_f==STRICT_MIME) {
4304 i_mgetc = i_mgetc_buf;
4305 i_mungetc = i_mungetc_buf;
4308 i_ungetc = i_mungetc;
4309 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4310 mime_iconv_back = NULL;
4313 int mime_begin_strict(FILE *f)
4317 const unsigned char *p,*q;
4318 int r[MAXRECOVER]; /* recovery buffer, max mime pattern lenght */
4320 mime_decode_mode = FALSE;
4321 /* =? has been checked */
4323 p = mime_pattern[j];
4326 for(i=2;p[i]>' ';i++) { /* start at =? */
4327 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4328 /* pattern fails, try next one */
4330 while (mime_pattern[++j]) {
4331 p = mime_pattern[j];
4332 for(k=2;k<i;k++) /* assume length(p) > i */
4333 if (p[k]!=q[k]) break;
4334 if (k==i && nkf_toupper(c1)==p[k]) break;
4336 p = mime_pattern[j];
4337 if (p) continue; /* found next one, continue */
4338 /* all fails, output from recovery buffer */
4346 mime_decode_mode = p[i-2];
4348 mime_iconv_back = iconv;
4349 set_iconv(FALSE, mime_priority_func[j]);
4350 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4352 if (mime_decode_mode=='B') {
4353 mimebuf_f = unbuf_f;
4355 /* do MIME integrity check */
4356 return mime_integrity(f,mime_pattern[j]);
4364 int mime_getc_buf(FILE *f)
4366 /* We don't keep the EOF of the Fifo, because it contains ?= as
4367 a terminator. It was checked in mime_integrity. */
4368 return ((mimebuf_f)?
4369 (*i_mgetc_buf)(f):Fifo(mime_input++));
4372 int mime_ungetc_buf(int c, FILE *f)
4375 (*i_mungetc_buf)(c,f);
4377 Fifo(--mime_input) = (unsigned char)c;
4381 int mime_begin(FILE *f)
4386 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4387 /* re-read and convert again from mime_buffer. */
4389 /* =? has been checked */
4391 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4392 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4393 /* We accept any character type even if it is broken by newlines */
4394 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4395 if (c1=='\n'||c1==' '||c1=='\r'||
4396 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4398 /* Failed. But this could be another MIME preamble */
4406 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4407 if (!(++i<MAXRECOVER) || c1==EOF) break;
4408 if (c1=='b'||c1=='B') {
4409 mime_decode_mode = 'B';
4410 } else if (c1=='q'||c1=='Q') {
4411 mime_decode_mode = 'Q';
4415 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4416 if (!(++i<MAXRECOVER) || c1==EOF) break;
4418 mime_decode_mode = FALSE;
4424 if (!mime_decode_mode) {
4425 /* false MIME preamble, restart from mime_buffer */
4426 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4427 /* Since we are in MIME mode until buffer becomes empty, */
4428 /* we never go into mime_begin again for a while. */
4431 /* discard MIME preamble, and go to MIME mode */
4433 /* do no MIME integrity check */
4434 return c1; /* used only for checking EOF */
4443 void debug(const char *str)
4446 fprintf(stderr, "%s\n", str);
4451 void set_input_codename(char *codename)
4455 strcmp(codename, "") != 0 &&
4456 strcmp(codename, input_codename) != 0)
4458 is_inputcode_mixed = TRUE;
4460 input_codename = codename;
4461 is_inputcode_set = TRUE;
4464 #if !defined(PERL_XS) && !defined(WIN32DLL)
4465 void print_guessed_code(char *filename)
4467 char *codename = "BINARY";
4468 if (!is_inputcode_mixed) {
4469 if (strcmp(input_codename, "") == 0) {
4472 codename = input_codename;
4475 if (filename != NULL) printf("%s:", filename);
4476 printf("%s\n", codename);
4482 int hex_getc(int ch, FILE *f, int (*g)(FILE *f), int (*u)(int c, FILE *f))
4490 if (!nkf_isxdigit(c2)){
4495 if (!nkf_isxdigit(c3)){
4500 return (hex2bin(c2) << 4) | hex2bin(c3);
4503 int cap_getc(FILE *f)
4505 return hex_getc(':', f, i_cgetc, i_cungetc);
4508 int cap_ungetc(int c, FILE *f)
4510 return (*i_cungetc)(c, f);
4513 int url_getc(FILE *f)
4515 return hex_getc('%', f, i_ugetc, i_uungetc);
4518 int url_ungetc(int c, FILE *f)
4520 return (*i_uungetc)(c, f);
4524 #ifdef NUMCHAR_OPTION
4525 int numchar_getc(FILE *f)
4527 int (*g)(FILE *) = i_ngetc;
4528 int (*u)(int c ,FILE *f) = i_nungetc;
4539 if (buf[i] == 'x' || buf[i] == 'X'){
4540 for (j = 0; j < 5; j++){
4542 if (!nkf_isxdigit(buf[i])){
4549 c |= hex2bin(buf[i]);
4552 for (j = 0; j < 6; j++){
4556 if (!nkf_isdigit(buf[i])){
4563 c += hex2bin(buf[i]);
4569 return CLASS_UTF16 | c;
4578 int numchar_ungetc(int c, FILE *f)
4580 return (*i_nungetc)(c, f);
4584 #ifdef UNICODE_NORMALIZATION
4586 /* Normalization Form C */
4587 int nfc_getc(FILE *f)
4589 int (*g)(FILE *f) = i_nfc_getc;
4590 int (*u)(int c ,FILE *f) = i_nfc_ungetc;
4591 int i=0, j, k=1, lower, upper;
4594 extern const struct normalization_pair normalization_table[];
4597 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4598 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4599 while (upper >= lower) {
4600 j = (lower+upper) / 2;
4601 array = normalization_table[j].nfd;
4602 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4603 if (array[k] != buf[k]){
4604 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4611 array = normalization_table[j].nfc;
4612 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4624 int nfc_ungetc(int c, FILE *f)
4626 return (*i_nfc_ungetc)(c, f);
4628 #endif /* UNICODE_NORMALIZATION */
4634 int c1, c2, c3, c4, cc;
4635 int t1, t2, t3, t4, mode, exit_mode;
4639 int lwsp_size = 128;
4641 if (mime_top != mime_last) { /* Something is in FIFO */
4642 return Fifo(mime_top++);
4644 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4645 mime_decode_mode=FALSE;
4646 unswitch_mime_getc();
4647 return (*i_getc)(f);
4650 if (mimebuf_f == FIXED_MIME)
4651 exit_mode = mime_decode_mode;
4654 if (mime_decode_mode == 'Q') {
4655 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4657 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
4658 if (c1<=' ' || DEL<=c1) {
4659 mime_decode_mode = exit_mode; /* prepare for quit */
4662 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4666 mime_decode_mode = exit_mode; /* prepare for quit */
4667 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4668 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4669 /* end Q encoding */
4670 input_mode = exit_mode;
4672 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4673 if (lwsp_buf==NULL) {
4674 perror("can't malloc");
4677 while ((c1=(*i_getc)(f))!=EOF) {
4682 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4690 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
4691 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4706 lwsp_buf[lwsp_count] = (unsigned char)c1;
4707 if (lwsp_count++>lwsp_size){
4709 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4710 if (lwsp_buf_new==NULL) {
4712 perror("can't realloc");
4715 lwsp_buf = lwsp_buf_new;
4721 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
4723 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4724 i_ungetc(lwsp_buf[lwsp_count],f);
4730 if (c1=='='&&c2<' ') { /* this is soft wrap */
4731 while((c1 = (*i_mgetc)(f)) <=' ') {
4732 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4734 mime_decode_mode = 'Q'; /* still in MIME */
4735 goto restart_mime_q;
4738 mime_decode_mode = 'Q'; /* still in MIME */
4742 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4743 if (c2<=' ') return c2;
4744 mime_decode_mode = 'Q'; /* still in MIME */
4745 #define hex(c) (('0'<=c&&c<='9')?(c-'0'):\
4746 ('A'<=c&&c<='F')?(c-'A'+10):('a'<=c&&c<='f')?(c-'a'+10):0)
4747 return ((hex(c2)<<4) + hex(c3));
4750 if (mime_decode_mode != 'B') {
4751 mime_decode_mode = FALSE;
4752 return (*i_mgetc)(f);
4756 /* Base64 encoding */
4758 MIME allows line breaks in the middle of
4759 Base64, but we are very pessimistic in decoding
4760 in unbuf mode because MIME encoded code may be broken by
4761 less or an editor's control sequence (such as ESC-[-K in unbuffered
4762 mode). Ignore incomplete MIME.
4764 mode = mime_decode_mode;
4765 mime_decode_mode = exit_mode; /* prepare for quit */
4767 while ((c1 = (*i_mgetc)(f))<=' ') {
4772 if ((c2 = (*i_mgetc)(f))<=' ') {
4775 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4776 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4779 if ((c1 == '?') && (c2 == '=')) {
4782 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4783 if (lwsp_buf==NULL) {
4784 perror("can't malloc");
4787 while ((c1=(*i_getc)(f))!=EOF) {
4792 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4800 if ((c1=(*i_getc)(f))!=EOF) {
4804 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4819 lwsp_buf[lwsp_count] = (unsigned char)c1;
4820 if (lwsp_count++>lwsp_size){
4822 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4823 if (lwsp_buf_new==NULL) {
4825 perror("can't realloc");
4828 lwsp_buf = lwsp_buf_new;
4834 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
4836 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4837 i_ungetc(lwsp_buf[lwsp_count],f);
4844 if ((c3 = (*i_mgetc)(f))<=' ') {
4847 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4848 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4852 if ((c4 = (*i_mgetc)(f))<=' ') {
4855 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4856 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4860 mime_decode_mode = mode; /* still in MIME sigh... */
4862 /* BASE 64 decoding */
4864 t1 = 0x3f & base64decode(c1);
4865 t2 = 0x3f & base64decode(c2);
4866 t3 = 0x3f & base64decode(c3);
4867 t4 = 0x3f & base64decode(c4);
4868 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4870 Fifo(mime_last++) = (unsigned char)cc;
4871 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4873 Fifo(mime_last++) = (unsigned char)cc;
4874 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4876 Fifo(mime_last++) = (unsigned char)cc;
4881 return Fifo(mime_top++);
4884 int mime_ungetc(int c, FILE *f)
4886 Fifo(--mime_top) = (unsigned char)c;
4890 int mime_integrity(FILE *f, const unsigned char *p)
4894 /* In buffered mode, read until =? or NL or buffer full
4896 mime_input = mime_top;
4897 mime_last = mime_top;
4899 while(*p) Fifo(mime_input++) = *p++;
4902 while((c=(*i_getc)(f))!=EOF) {
4903 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
4904 break; /* buffer full */
4906 if (c=='=' && d=='?') {
4907 /* checked. skip header, start decode */
4908 Fifo(mime_input++) = (unsigned char)c;
4909 /* mime_last_input = mime_input; */
4914 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4916 /* Should we check length mod 4? */
4917 Fifo(mime_input++) = (unsigned char)c;
4920 /* In case of Incomplete MIME, no MIME decode */
4921 Fifo(mime_input++) = (unsigned char)c;
4922 mime_last = mime_input; /* point undecoded buffer */
4923 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
4924 switch_mime_getc(); /* anyway we need buffered getc */
4928 int base64decode(int c)
4933 i = c - 'A'; /* A..Z 0-25 */
4935 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4937 } else if (c > '/') {
4938 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4939 } else if (c == '+') {
4940 i = '>' /* 62 */ ; /* + 62 */
4942 i = '?' /* 63 */ ; /* / 63 */
4947 static const char basis_64[] =
4948 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4951 #define MIMEOUT_BUF_LENGTH (60)
4952 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
4953 int mimeout_buf_count = 0;
4954 int mimeout_preserve_space = 0;
4955 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
4957 void open_mime(int mode)
4959 const unsigned char *p;
4962 p = mime_pattern[0];
4963 for(i=0;mime_encode[i];i++) {
4964 if (mode == mime_encode[i]) {
4965 p = mime_pattern[i];
4969 mimeout_mode = mime_encode_method[i];
4972 if (base64_count>45) {
4973 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
4974 (*o_mputc)(mimeout_buf[i]);
4980 if (!mimeout_preserve_space && mimeout_buf_count>0
4981 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4982 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
4986 if (!mimeout_preserve_space) {
4987 for (;i<mimeout_buf_count;i++) {
4988 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4989 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
4990 (*o_mputc)(mimeout_buf[i]);
4997 mimeout_preserve_space = FALSE;
5003 j = mimeout_buf_count;
5004 mimeout_buf_count = 0;
5006 mime_putc(mimeout_buf[i]);
5010 void close_mime(void)
5020 switch(mimeout_mode) {
5025 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5031 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5037 if (mimeout_f!=FIXED_MIME) {
5039 } else if (mimeout_mode != 'Q')
5044 void mimeout_addchar(int c)
5046 switch(mimeout_mode) {
5051 } else if(!nkf_isalnum(c)) {
5053 (*o_mputc)(itoh4(((c>>4)&0xf)));
5054 (*o_mputc)(itoh4((c&0xf)));
5063 (*o_mputc)(basis_64[c>>2]);
5068 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5074 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5075 (*o_mputc)(basis_64[c & 0x3F]);
5086 int mime_lastchar2, mime_lastchar1;
5088 void mime_prechar(int c2, int c1)
5092 if (base64_count + mimeout_buf_count/3*4> 66){
5093 (*o_base64conv)(EOF,0);
5094 (*o_base64conv)(0,NL);
5095 (*o_base64conv)(0,SPACE);
5097 }/*else if (mime_lastchar2){
5098 if (c1 <=DEL && !nkf_isspace(c1)){
5099 (*o_base64conv)(0,SPACE);
5103 if (c2 && mime_lastchar2 == 0
5104 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5105 (*o_base64conv)(0,SPACE);
5108 mime_lastchar2 = c2;
5109 mime_lastchar1 = c1;
5112 void mime_putc(int c)
5117 if (mimeout_f == FIXED_MIME){
5118 if (mimeout_mode == 'Q'){
5119 if (base64_count > 71){
5120 if (c!=CR && c!=NL) {
5127 if (base64_count > 71){
5132 if (c == EOF) { /* c==EOF */
5136 if (c != EOF) { /* c==EOF */
5142 /* mimeout_f != FIXED_MIME */
5144 if (c == EOF) { /* c==EOF */
5145 j = mimeout_buf_count;
5146 mimeout_buf_count = 0;
5150 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5153 mimeout_addchar(mimeout_buf[i]);
5157 mimeout_addchar(mimeout_buf[i]);
5161 mimeout_addchar(mimeout_buf[i]);
5167 if (mimeout_mode=='Q') {
5168 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5180 if (mimeout_buf_count > 0){
5181 lastchar = mimeout_buf[mimeout_buf_count - 1];
5186 if (!mimeout_mode) {
5187 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5188 if (nkf_isspace(c)) {
5189 if (c==CR || c==NL) {
5192 for (i=0;i<mimeout_buf_count;i++) {
5193 (*o_mputc)(mimeout_buf[i]);
5194 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5200 mimeout_buf[0] = (char)c;
5201 mimeout_buf_count = 1;
5203 if (base64_count > 1
5204 && base64_count + mimeout_buf_count > 76){
5207 if (!nkf_isspace(mimeout_buf[0])){
5212 mimeout_buf[mimeout_buf_count++] = (char)c;
5213 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5214 open_mime(output_mode);
5219 if (lastchar==CR || lastchar == NL){
5220 for (i=0;i<mimeout_buf_count;i++) {
5221 (*o_mputc)(mimeout_buf[i]);
5224 mimeout_buf_count = 0;
5226 if (lastchar==SPACE) {
5227 for (i=0;i<mimeout_buf_count-1;i++) {
5228 (*o_mputc)(mimeout_buf[i]);
5231 mimeout_buf[0] = SPACE;
5232 mimeout_buf_count = 1;
5234 open_mime(output_mode);
5237 /* mimeout_mode == 'B', 1, 2 */
5238 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5239 if (lastchar == CR || lastchar == NL){
5240 if (nkf_isblank(c)) {
5241 for (i=0;i<mimeout_buf_count;i++) {
5242 mimeout_addchar(mimeout_buf[i]);
5244 mimeout_buf_count = 0;
5245 } else if (SPACE<c && c<DEL) {
5247 for (i=0;i<mimeout_buf_count;i++) {
5248 (*o_mputc)(mimeout_buf[i]);
5251 mimeout_buf_count = 0;
5254 if (c==SPACE || c==TAB || c==CR || c==NL) {
5255 for (i=0;i<mimeout_buf_count;i++) {
5256 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5258 for (i=0;i<mimeout_buf_count;i++) {
5259 (*o_mputc)(mimeout_buf[i]);
5262 mimeout_buf_count = 0;
5265 mimeout_buf[mimeout_buf_count++] = (char)c;
5266 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5268 for (i=0;i<mimeout_buf_count;i++) {
5269 (*o_mputc)(mimeout_buf[i]);
5272 mimeout_buf_count = 0;
5276 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5277 mimeout_buf[mimeout_buf_count++] = (char)c;
5278 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5279 j = mimeout_buf_count;
5280 mimeout_buf_count = 0;
5282 mimeout_addchar(mimeout_buf[i]);
5289 if (mimeout_buf_count>0) {
5290 j = mimeout_buf_count;
5291 mimeout_buf_count = 0;
5293 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5295 mimeout_addchar(mimeout_buf[i]);
5301 (*o_mputc)(mimeout_buf[i]);
5303 open_mime(output_mode);
/*
 * Compiled only when PERL_XS or WIN32DLL is defined.  The visible statements
 * assign fixed values (mostly FALSE/0 or DEFAULT_* constants) to the global
 * option flags, counters and function-pointer hooks of the converter.
 * NOTE(review): the embedded line numbers jump, so the function header and
 * a number of statements are not visible in this listing -- confirm against
 * the full source before relying on this summary.
 */
5310 #if defined(PERL_XS) || defined(WIN32DLL)
5314 struct input_code *p = input_code_list;
5327 mime_f = STRICT_MIME;
5328 mime_decode_f = FALSE;
5333 #if defined(MSDOS) || defined(__OS2__)
5338 iso2022jp_f = FALSE;
5339 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5340 ms_ucs_map_f = UCS_MAP_ASCII;
5342 #ifdef UTF8_INPUT_ENABLE
5343 no_cp932ext_f = FALSE;
5344 ignore_zwnbsp_f = TRUE;
5345 no_best_fit_chars_f = FALSE;
5346 encode_fallback = NULL;
5347 unicode_subchar = '?';
5349 #ifdef UTF8_OUTPUT_ENABLE
5353 #ifdef UNICODE_NORMALIZATION
5366 is_inputcode_mixed = FALSE;
5367 is_inputcode_set = FALSE;
5371 #ifdef SHIFTJIS_CP932
/* Clear all 256 entries of prefix_table. */
5381 for (i = 0; i < 256; i++){
5382 prefix_table[i] = 0;
5385 #ifdef UTF8_INPUT_ENABLE
5386 utf16_mode = UTF16BE_INPUT;
/* Mark the MIME output buffer as empty. */
5388 mimeout_buf_count = 0;
5393 fold_preserve_f = FALSE;
5396 kanji_intro = DEFAULT_J;
5397 ascii_intro = DEFAULT_R;
5398 fold_margin = FOLD_MARGIN;
/* Both the selected output converter and the active one become DEFAULT_CONV. */
5399 output_conv = DEFAULT_CONV;
5400 oconv = DEFAULT_CONV;
/*
 * Each of the following converter hooks is pointed at no_connection, the
 * routine that reports "internal module connection failure".
 */
5401 o_zconv = no_connection;
5402 o_fconv = no_connection;
5403 o_crconv = no_connection;
5404 o_rot_conv = no_connection;
5405 o_hira_conv = no_connection;
5406 o_base64conv = no_connection;
5407 o_iso2022jp_check_conv = no_connection;
/* Input get/unget hooks are set to the std_getc / std_ungetc routines. */
5410 i_ungetc = std_ungetc;
5412 i_bungetc = std_ungetc;
5415 i_mungetc = std_ungetc;
5416 i_mgetc_buf = std_getc;
5417 i_mungetc_buf = std_ungetc;
5418 output_mode = ASCII;
5421 mime_decode_mode = FALSE;
5427 z_prev2=0,z_prev1=0;
5429 iconv_for_check = 0;
/* The input code name becomes the empty string. */
5431 input_codename = "";
/*
 * Two-argument form of the connection-failure handler: forwards c2 and c1
 * to no_connection2() with 0 as the third argument and ignores the value
 * it returns.
 */
5438 void no_connection(int c2, int c1)
5440 no_connection2(c2,c1,0);
/*
 * Writes "nkf internal module connection failure." to stderr.  The visible
 * statements do not use c2, c1 or c0, and the function returns 0.
 * NOTE(review): the embedded line numbers skip one line between the fprintf
 * and the return, so a statement may be missing from this listing --
 * confirm against the full source.
 */
5443 int no_connection2(int c2, int c1, int c0)
5445 fprintf(stderr,"nkf internal module connection failure.\n");
5447 return 0; /* LINT */
/*
 * From this point on, every fprintf token expands to dllprintf.
 * NOTE(review): any conditional that guards this definition is not visible
 * in this listing -- confirm which builds it applies to.
 */
5452 #define fprintf dllprintf
5456 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5457 fprintf(stderr,"Flags:\n");
5458 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5459 #ifdef DEFAULT_CODE_SJIS
5460 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5462 #ifdef DEFAULT_CODE_JIS
5463 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5465 #ifdef DEFAULT_CODE_EUC
5466 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5468 #ifdef DEFAULT_CODE_UTF8
5469 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5471 #ifdef UTF8_OUTPUT_ENABLE
5472 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5474 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5475 #ifdef UTF8_INPUT_ENABLE
5476 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5478 fprintf(stderr,"t no conversion\n");
5479 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5480 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5481 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5482 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5483 fprintf(stderr,"v Show this usage. V: show version\n");
5484 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5485 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5486 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5487 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5488 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5489 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5490 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5491 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5493 fprintf(stderr,"T Text mode output\n");
5495 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5496 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5497 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5498 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5499 fprintf(stderr,"\n");
5500 fprintf(stderr,"Long name options\n");
5501 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5502 fprintf(stderr," Specify the input or output codeset\n");
5503 fprintf(stderr," --fj --unix --mac --windows\n");
5504 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5505 fprintf(stderr," Convert for the system or code\n");
5506 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5507 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5508 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5510 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5512 #ifdef NUMCHAR_OPTION
5513 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5515 #ifdef UTF8_INPUT_ENABLE
5516 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5517 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5520 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5521 fprintf(stderr," Overwrite original listed files by filtered result\n");
5522 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5524 fprintf(stderr," -g --guess Guess the input code\n");
5525 fprintf(stderr," --help --version Show this help/the version\n");
5526 fprintf(stderr," For more information, see also man nkf\n");
5527 fprintf(stderr,"\n");
/*
 * Version banner: prints "Network Kanji Filter Version %s (%s) " filled in
 * with NKF_VERSION and NKF_RELEASE_DATE, then the CopyRight text on a line
 * of its own.
 * NOTE(review): the three #if tests below distinguish plain MSDOS, Win16 and
 * Win32 builds; what each branch adds to the banner is not visible in this
 * listing -- confirm against the full source.
 */
5533 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5534 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
5537 #if defined(MSDOS) && defined(__WIN16__)
5540 #if defined(MSDOS) && defined(__WIN32__)
5546 ,NKF_VERSION,NKF_RELEASE_DATE);
5547 fprintf(stderr,"\n%s\n",CopyRight);
5552 ** Patch authors
5553 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5554 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5555 ** ohta@src.ricoh.co.jp (Junn Ohta)
5556 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5557 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5558 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5559 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5560 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5561 ** GHG00637@nifty-serve.or.jp (COW)