1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.77 2005/08/10 20:48:31 naruse Exp $ */
43 #define NKF_VERSION "2.0.5"
44 #define NKF_RELEASE_DATE "2005-08-11"
48 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW, 2002-2005 Kono, Furukawa, Naruse"
55 ** USAGE: nkf [flags] [file]
58 ** b Output is buffered (DEFAULT)
59 ** u Output is unbuffered
63 ** j Output code is JIS 7 bit (DEFAULT SELECT)
64 ** s Output code is MS Kanji (DEFAULT SELECT)
65 ** e Output code is AT&T EUC (DEFAULT SELECT)
66 ** w Output code is UTF-8 (DEFAULT SELECT)
67 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
69 ** m MIME conversion for ISO-2022-JP
70 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
71 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
72 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
73 ** M MIME output conversion
75 ** r {de/en}crypt ROT13/47
79 ** T Text mode output (for MS-DOS)
81 ** x Do not convert X0201 kana into X0208
82 ** Z Convert X0208 alphabet to ASCII
87 ** B try to fix broken JIS, missing Escape
88 ** B[1-9] broken level
90 ** O Output to 'nkf.out' file or last file name
91 ** d Delete \r in line feed
92 ** c Add \r in line feed
93 ** -- other long option
94 ** -- ignore following option (don't use with -O )
98 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__)) && !defined(MSDOS)
100 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
116 #if defined(MSDOS) || defined(__OS2__)
123 #define setbinmode(fp) fsetbin(fp)
124 #else /* Microsoft C, Turbo C */
125 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
127 #else /* UNIX,OS/2 */
128 #define setbinmode(fp)
131 #ifdef _IOFBF /* SysV and MSDOS, Windows */
132 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
134 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
137 /*Borland C++ 4.5 EasyWin*/
138 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
147 /* added by satoru@isoternet.org */
148 #include <sys/stat.h>
149 #ifndef MSDOS /* UNIX, OS/2 */
152 #else /* defined(MSDOS) */
154 #ifdef __BORLANDC__ /* BCC32 */
156 #else /* !defined(__BORLANDC__) */
157 #include <sys/utime.h>
158 #endif /* (__BORLANDC__) */
159 #else /* !defined(__WIN32__) */
160 #if defined(_MSC_VER) || defined(__MINGW32__) /* VC++, MinGW */
161 #include <sys/utime.h>
162 #elif defined(__TURBOC__) /* BCC */
164 #elif defined(LSI_C) /* LSI C */
165 #endif /* (__WIN32__) */
177 /* state of output_mode and input_mode
195 /* Input Assumption */
199 #define LATIN1_INPUT 6
201 #define STRICT_MIME 8
206 #define JAPANESE_EUC 10
210 #define UTF8_INPUT 13
211 #define UTF16BE_INPUT 14
212 #define UTF16LE_INPUT 15
/*
 * is_alnum(c): nonzero when c is an ASCII letter ('a'-'z', 'A'-'Z')
 * or an ASCII digit ('0'-'9'), zero otherwise.
 *
 * Every use of the argument is parenthesized so that expression
 * arguments (e.g. a conditional expression) group as the caller intends.
 * The argument is still evaluated more than once, so do not pass an
 * expression with side effects.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))
235 #define HOLD_SIZE 1024
236 #define IOBUF_SIZE 16384
238 #define DEFAULT_J 'B'
239 #define DEFAULT_R 'B'
241 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
242 #define SJ6394 0x0161 /* 63 - 94 ku offset */
244 #define RANGE_NUM_MAX 18
249 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
250 #define sizeof_euc_utf8 94
251 #define sizeof_euc_to_utf8_1byte 94
252 #define sizeof_euc_to_utf8_2bytes 94
253 #define sizeof_utf8_to_euc_C2 64
254 #define sizeof_utf8_to_euc_E5B8 64
255 #define sizeof_utf8_to_euc_2bytes 112
256 #define sizeof_utf8_to_euc_3bytes 112
259 /* MIME preprocessor */
261 #ifdef EASYWIN /*Easy Win */
262 extern POINT _BufferSize;
265 /* function prototype */
267 #ifdef ANSI_C_PROTOTYPE
269 #define STATIC static
283 void (*status_func)PROTO((struct input_code *, int));
284 int (*iconv_func)PROTO((int c2, int c1, int c0));
288 STATIC char *input_codename = "";
291 STATIC const char *CopyRight = COPY_RIGHT;
293 #if !defined(PERL_XS) && !defined(WIN32DLL)
294 STATIC int noconvert PROTO((FILE *f));
296 STATIC int kanji_convert PROTO((FILE *f));
297 STATIC int h_conv PROTO((FILE *f,int c2,int c1));
298 STATIC int push_hold_buf PROTO((int c2));
299 STATIC void set_iconv PROTO((int f, int (*iconv_func)(int c2,int c1,int c0)));
300 STATIC int s_iconv PROTO((int c2,int c1,int c0));
301 STATIC int s2e_conv PROTO((int c2, int c1, int *p2, int *p1));
302 STATIC int e_iconv PROTO((int c2,int c1,int c0));
303 #ifdef UTF8_INPUT_ENABLE
304 STATIC int w2e_conv PROTO((int c2,int c1,int c0,int *p2,int *p1));
305 STATIC int w_iconv PROTO((int c2,int c1,int c0));
306 STATIC int w_iconv16 PROTO((int c2,int c1,int c0));
307 STATIC int w_iconv_common PROTO((int c1,int c0,const unsigned short *const *pp,int psize,int *p2,int *p1));
308 STATIC int ww16_conv PROTO((int c2, int c1, int c0));
310 #ifdef UTF8_OUTPUT_ENABLE
311 STATIC int e2w_conv PROTO((int c2,int c1));
312 STATIC void w_oconv PROTO((int c2,int c1));
313 STATIC void w_oconv16 PROTO((int c2,int c1));
315 STATIC void e_oconv PROTO((int c2,int c1));
316 STATIC int e2s_conv PROTO((int c2, int c1, int *p2, int *p1));
317 STATIC void s_oconv PROTO((int c2,int c1));
318 STATIC void j_oconv PROTO((int c2,int c1));
319 STATIC void fold_conv PROTO((int c2,int c1));
320 STATIC void cr_conv PROTO((int c2,int c1));
321 STATIC void z_conv PROTO((int c2,int c1));
322 STATIC void rot_conv PROTO((int c2,int c1));
323 STATIC void hira_conv PROTO((int c2,int c1));
324 STATIC void base64_conv PROTO((int c2,int c1));
325 STATIC void iso2022jp_check_conv PROTO((int c2,int c1));
326 STATIC void no_connection PROTO((int c2,int c1));
327 STATIC int no_connection2 PROTO((int c2,int c1,int c0));
329 STATIC void code_score PROTO((struct input_code *ptr));
330 STATIC void code_status PROTO((int c));
332 STATIC void std_putc PROTO((int c));
333 STATIC int std_getc PROTO((FILE *f));
334 STATIC int std_ungetc PROTO((int c,FILE *f));
336 STATIC int broken_getc PROTO((FILE *f));
337 STATIC int broken_ungetc PROTO((int c,FILE *f));
339 STATIC int mime_begin PROTO((FILE *f));
340 STATIC int mime_getc PROTO((FILE *f));
341 STATIC int mime_ungetc PROTO((int c,FILE *f));
343 STATIC int mime_begin_strict PROTO((FILE *f));
344 STATIC int mime_getc_buf PROTO((FILE *f));
345 STATIC int mime_ungetc_buf PROTO((int c,FILE *f));
346 STATIC int mime_integrity PROTO((FILE *f,const unsigned char *p));
348 STATIC int base64decode PROTO((int c));
349 STATIC void mime_prechar PROTO((int c2, int c1));
350 STATIC void mime_putc PROTO((int c));
351 STATIC void open_mime PROTO((int c));
352 STATIC void close_mime PROTO(());
354 STATIC void usage PROTO(());
355 STATIC void version PROTO(());
357 STATIC void options PROTO((unsigned char *c));
358 #if defined(PERL_XS) || defined(WIN32DLL)
359 STATIC void reinit PROTO(());
364 #if !defined(PERL_XS) && !defined(WIN32DLL)
365 STATIC unsigned char stdibuf[IOBUF_SIZE];
366 STATIC unsigned char stdobuf[IOBUF_SIZE];
368 STATIC unsigned char hold_buf[HOLD_SIZE*2];
369 STATIC int hold_count;
371 /* MIME preprocessor fifo */
373 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
374 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
375 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
376 STATIC unsigned char mime_buf[MIME_BUF_SIZE];
377 STATIC unsigned int mime_top = 0;
378 STATIC unsigned int mime_last = 0; /* decoded */
379 STATIC unsigned int mime_input = 0; /* undecoded */
380 STATIC int (*mime_iconv_back)PROTO((int c2,int c1,int c0)) = NULL;
/* File-scope mode flags and their initial values. */
383 STATIC int unbuf_f = FALSE;
384 STATIC int estab_f = FALSE;
385 STATIC int nop_f = FALSE;
386 STATIC int binmode_f = TRUE; /* binary mode */
387 STATIC int rot_f = FALSE; /* rot13/47 mode */
388 STATIC int hira_f = FALSE; /* hiragana/katakana conversion */
389 STATIC int input_f = FALSE; /* non fixed input code */
390 STATIC int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
391 STATIC int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
392 STATIC int mime_decode_f = FALSE; /* mime decode is explicitly on */
393 STATIC int mimebuf_f = FALSE; /* MIME buffered input */
394 STATIC int broken_f = FALSE; /* convert ESC-less broken JIS */
395 STATIC int iso8859_f = FALSE; /* ISO8859 through */
396 STATIC int mimeout_f = FALSE; /* base64 mode */
397 #if defined(MSDOS) || defined(__OS2__)
398 STATIC int x0201_f = TRUE; /* Assume JISX0201 kana */
400 STATIC int x0201_f = NO_X0201; /* Assume NO JISX0201 */
402 STATIC int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
403 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
404 STATIC int internal_unicode_f = FALSE; /* Internal Unicode Processing */
406 #ifdef UTF8_OUTPUT_ENABLE
407 STATIC int unicode_bom_f= 0; /* Output Unicode BOM */
408 STATIC int w_oconv16_LE = 0; /* utf-16 little endian */
409 STATIC int ms_ucs_map_f = FALSE; /* Microsoft UCS Mapping Compatible */
412 #ifdef UNICODE_NORMALIZATION
/* Unicode normalization input stage: flag, the getc/ungetc function
   pointers it reads through (both initialized to the std_* routines),
   and the prototypes of the stage's own getc/ungetc. */
413 STATIC int nfc_f = FALSE;
414 STATIC int (*i_nfc_getc)PROTO((FILE *)) = std_getc; /* input of nfc_getc */
415 STATIC int (*i_nfc_ungetc)PROTO((int c ,FILE *f)) = std_ungetc;
416 STATIC int nfc_getc PROTO((FILE *f));
417 STATIC int nfc_ungetc PROTO((int c,FILE *f));
421 STATIC int cap_f = FALSE;
422 STATIC int (*i_cgetc)PROTO((FILE *)) = std_getc; /* input of cgetc */
423 STATIC int (*i_cungetc)PROTO((int c ,FILE *f)) = std_ungetc;
424 STATIC int cap_getc PROTO((FILE *f));
425 STATIC int cap_ungetc PROTO((int c,FILE *f));
427 STATIC int url_f = FALSE;
428 STATIC int (*i_ugetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
429 STATIC int (*i_uungetc)PROTO((int c ,FILE *f)) = std_ungetc;
430 STATIC int url_getc PROTO((FILE *f));
431 STATIC int url_ungetc PROTO((int c,FILE *f));
434 #ifdef NUMCHAR_OPTION
435 #define CLASS_MASK 0x0f000000
436 #define CLASS_UTF16 0x01000000
/* Numeric-character input stage: flag, the getc/ungetc function
   pointers it reads through (both initialized to the std_* routines),
   and the prototypes of the stage's own getc/ungetc. */
437 STATIC int numchar_f = FALSE;
438 STATIC int (*i_ngetc)PROTO((FILE *)) = std_getc; /* input of ngetc */
439 STATIC int (*i_nungetc)PROTO((int c ,FILE *f)) = std_ungetc;
440 STATIC int numchar_getc PROTO((FILE *f));
441 STATIC int numchar_ungetc PROTO((int c,FILE *f));
445 STATIC int noout_f = FALSE;
446 STATIC void no_putc PROTO((int c));
447 STATIC int debug_f = FALSE;
448 STATIC void debug PROTO((const char *str));
449 STATIC int (*iconv_for_check)() = 0;
452 STATIC int guess_f = FALSE;
454 STATIC void print_guessed_code PROTO((char *filename));
456 STATIC void set_input_codename PROTO((char *codename));
457 STATIC int is_inputcode_mixed = FALSE;
458 STATIC int is_inputcode_set = FALSE;
461 STATIC int exec_f = 0;
464 #ifdef SHIFTJIS_CP932
465 STATIC int cp932_f = TRUE;
466 #define CP932_TABLE_BEGIN (0xfa)
467 #define CP932_TABLE_END (0xfc)
469 STATIC int cp932inv_f = TRUE;
470 #define CP932INV_TABLE_BEGIN (0xed)
471 #define CP932INV_TABLE_END (0xee)
473 /* STATIC int cp932_conv PROTO((int c2, int c1)); */
474 #endif /* SHIFTJIS_CP932 */
477 STATIC int x0212_f = FALSE;
478 STATIC int x0212_shift PROTO((int c));
479 STATIC int x0212_unshift PROTO((int c));
482 STATIC unsigned char prefix_table[256];
484 STATIC void e_status PROTO((struct input_code *, int));
485 STATIC void s_status PROTO((struct input_code *, int));
487 #ifdef UTF8_INPUT_ENABLE
488 STATIC void w_status PROTO((struct input_code *, int));
489 STATIC void w16_status PROTO((struct input_code *, int));
490 STATIC int utf16_mode = UTF16BE_INPUT;
493 struct input_code input_code_list[] = {
494 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
495 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
496 #ifdef UTF8_INPUT_ENABLE
497 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
498 {"UTF-16", 0, 0, 0, {0, 0, 0}, w16_status, w_iconv16, 0},
503 STATIC int mimeout_mode = 0;
504 STATIC int base64_count = 0;
506 /* X0208 -> ASCII converter */
509 STATIC int f_line = 0; /* chars in line */
510 STATIC int f_prev = 0;
511 STATIC int fold_preserve_f = FALSE; /* preserve new lines */
512 STATIC int fold_f = FALSE;
513 STATIC int fold_len = 0;
516 STATIC unsigned char kanji_intro = DEFAULT_J;
517 STATIC unsigned char ascii_intro = DEFAULT_R;
521 #define FOLD_MARGIN 10
522 #define DEFAULT_FOLD 60
524 STATIC int fold_margin = FOLD_MARGIN;
528 #ifdef DEFAULT_CODE_JIS
529 # define DEFAULT_CONV j_oconv
531 #ifdef DEFAULT_CODE_SJIS
532 # define DEFAULT_CONV s_oconv
534 #ifdef DEFAULT_CODE_EUC
535 # define DEFAULT_CONV e_oconv
537 #ifdef DEFAULT_CODE_UTF8
538 # define DEFAULT_CONV w_oconv
541 /* process default */
542 STATIC void (*output_conv)PROTO((int c2,int c1)) = DEFAULT_CONV;
544 STATIC void (*oconv)PROTO((int c2,int c1)) = no_connection;
545 /* s_iconv or oconv */
546 STATIC int (*iconv)PROTO((int c2,int c1,int c0)) = no_connection2;
548 STATIC void (*o_zconv)PROTO((int c2,int c1)) = no_connection;
549 STATIC void (*o_fconv)PROTO((int c2,int c1)) = no_connection;
550 STATIC void (*o_crconv)PROTO((int c2,int c1)) = no_connection;
551 STATIC void (*o_rot_conv)PROTO((int c2,int c1)) = no_connection;
552 STATIC void (*o_hira_conv)PROTO((int c2,int c1)) = no_connection;
553 STATIC void (*o_base64conv)PROTO((int c2,int c1)) = no_connection;
554 STATIC void (*o_iso2022jp_check_conv)PROTO((int c2,int c1)) = no_connection;
556 /* STATIC redirections */
/* Replaceable I/O entry points. Every pointer starts out aimed at the
   plain std_putc / std_getc / std_ungetc routines. */
558 STATIC void (*o_putc)PROTO((int c)) = std_putc;
560 STATIC int (*i_getc)PROTO((FILE *f)) = std_getc; /* general input */
561 STATIC int (*i_ungetc)PROTO((int c,FILE *f)) =std_ungetc;
563 STATIC int (*i_bgetc)PROTO((FILE *)) = std_getc; /* input of bgetc */
564 STATIC int (*i_bungetc)PROTO((int c ,FILE *f)) = std_ungetc;
566 STATIC void (*o_mputc)PROTO((int c)) = std_putc ; /* output of mputc */
568 STATIC int (*i_mgetc)PROTO((FILE *)) = std_getc; /* input of mgetc */
569 STATIC int (*i_mungetc)PROTO((int c ,FILE *f)) = std_ungetc;
571 /* for strict mime */
572 STATIC int (*i_mgetc_buf)PROTO((FILE *)) = std_getc; /* input of mgetc_buf */
573 STATIC int (*i_mungetc_buf)PROTO((int c,FILE *f)) = std_ungetc;
576 STATIC int output_mode = ASCII, /* output kanji mode */
577 input_mode = ASCII, /* input kanji mode */
578 shift_mode = FALSE; /* TRUE shift out, or X0201 */
579 STATIC int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
581 /* X0201 / X0208 conversion tables */
583 /* X0201 kana conversion table */
586 unsigned char cv[]= {
587 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
588 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
589 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
590 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
591 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
592 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
593 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
594 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
595 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
596 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
597 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
598 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
599 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
600 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
601 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
602 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
606 /* X0201 kana conversion table for dakuten (voiced sound mark) */
609 unsigned char dv[]= {
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
615 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
616 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
617 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
618 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
619 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
620 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
621 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
624 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
625 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
628 /* X0201 kana conversion table for han-dakuten (semi-voiced sound mark) */
631 unsigned char ev[]= {
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
635 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
636 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
638 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
643 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
646 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
651 /* X0208 kigou conversion table */
652 /* 0x8140 - 0x819e */
654 unsigned char fv[] = {
656 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
657 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
658 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
660 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
661 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
662 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
664 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
673 STATIC int file_out = FALSE;
675 STATIC int overwrite = FALSE;
678 STATIC int crmode_f = 0; /* CR, NL, CRLF */
679 #ifdef EASYWIN /*Easy Win */
680 STATIC int end_check;
683 #define STD_GC_BUFSIZE (256)
684 int std_gc_buf[STD_GC_BUFSIZE];
688 #include "nkf32dll.c"
689 #elif defined(PERL_XS)
699 char *outfname = NULL;
702 #ifdef EASYWIN /*Easy Win */
703 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
706 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
707 cp = (unsigned char *)*argv;
712 if (pipe(fds) < 0 || (pid = fork()) < 0){
723 execvp(argv[1], &argv[1]);
737 if(x0201_f == WISH_TRUE)
738 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
740 if (binmode_f == TRUE)
742 if (freopen("","wb",stdout) == NULL)
749 setbuf(stdout, (char *) NULL);
751 setvbuffer(stdout, stdobuf, IOBUF_SIZE);
754 if (binmode_f == TRUE)
756 if (freopen("","rb",stdin) == NULL) return (-1);
760 setvbuffer(stdin, stdibuf, IOBUF_SIZE);
764 kanji_convert(stdin);
765 if (guess_f) print_guessed_code(NULL);
770 is_inputcode_mixed = FALSE;
771 is_inputcode_set = FALSE;
776 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
785 /* reopen file for stdout */
786 if (file_out == TRUE) {
789 outfname = malloc(strlen(origfname)
790 + strlen(".nkftmpXXXXXX")
796 strcpy(outfname, origfname);
800 for (i = strlen(outfname); i; --i){
801 if (outfname[i - 1] == '/'
802 || outfname[i - 1] == '\\'){
808 strcat(outfname, "ntXXXXXX");
810 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC,
813 strcat(outfname, ".nkftmpXXXXXX");
814 fd = mkstemp(outfname);
817 || (fd_backup = dup(fileno(stdout))) < 0
818 || dup2(fd, fileno(stdout)) < 0
829 outfname = "nkf.out";
832 if(freopen(outfname, "w", stdout) == NULL) {
836 if (binmode_f == TRUE) {
838 if (freopen("","wb",stdout) == NULL)
845 if (binmode_f == TRUE)
847 if (freopen("","rb",fin) == NULL)
852 setvbuffer(fin, stdibuf, IOBUF_SIZE);
856 char *filename = NULL;
858 if (nfiles > 1) filename = origfname;
859 if (guess_f) print_guessed_code(filename);
865 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
873 if (dup2(fd_backup, fileno(stdout)) < 0){
876 if (stat(origfname, &sb)) {
877 fprintf(stderr, "Can't stat %s\n", origfname);
879 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
880 if (chmod(outfname, sb.st_mode)) {
881 fprintf(stderr, "Can't set permission %s\n", outfname);
884 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
885 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
886 tb[0] = tb[1] = sb.st_mtime;
887 if (utime(outfname, tb)) {
888 fprintf(stderr, "Can't set timestamp %s\n", outfname);
891 tb.actime = sb.st_atime;
892 tb.modtime = sb.st_mtime;
893 if (utime(outfname, &tb)) {
894 fprintf(stderr, "Can't set timestamp %s\n", outfname);
898 if (unlink(origfname)){
902 if (rename(outfname, origfname)) {
904 fprintf(stderr, "Can't rename %s to %s\n",
905 outfname, origfname);
913 #ifdef EASYWIN /*Easy Win */
914 if (file_out == FALSE)
915 scanf("%d",&end_check);
918 #else /* for Other OS */
919 if (file_out == TRUE)
924 #endif /* WIN32DLL */
949 {"katakana-hiragana","h3"},
956 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
957 {"internal-unicode", ""},
959 #ifdef UTF8_OUTPUT_ENABLE
964 #ifdef UTF8_INPUT_ENABLE
966 {"utf16-input", "W16"},
968 #ifdef UNICODE_NORMALIZATION
969 {"utf8mac-input", ""},
978 #ifdef NUMCHAR_OPTION
979 {"numchar-input", ""},
985 #ifdef SHIFTJIS_CP932
995 STATIC int option_mode = 0;
1002 unsigned char *p = NULL;
1006 while(*cp && *cp++!='-');
1010 case '-': /* literal options */
1011 if (!*cp) { /* ignore the rest of arguments */
1015 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1017 p = (unsigned char *)long_option[i].name;
1018 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1019 if (*p == cp[j] || cp[j] == ' '){
1026 cp = (unsigned char *)long_option[i].alias;
1030 if (strcmp(long_option[i].name, "overwrite") == 0){
1037 if (strcmp(long_option[i].name, "cap-input") == 0){
1041 if (strcmp(long_option[i].name, "url-input") == 0){
1046 #ifdef NUMCHAR_OPTION
1047 if (strcmp(long_option[i].name, "numchar-input") == 0){
1053 if (strcmp(long_option[i].name, "no-output") == 0){
1057 if (strcmp(long_option[i].name, "debug") == 0){
1062 if (strcmp(long_option[i].name, "cp932") == 0){
1063 #ifdef SHIFTJIS_CP932
1067 #ifdef UTF8_OUTPUT_ENABLE
1068 ms_ucs_map_f = TRUE;
1072 if (strcmp(long_option[i].name, "no-cp932") == 0){
1073 #ifdef SHIFTJIS_CP932
1077 #ifdef UTF8_OUTPUT_ENABLE
1078 ms_ucs_map_f = FALSE;
1082 #ifdef SHIFTJIS_CP932
1083 if (strcmp(long_option[i].name, "cp932inv") == 0){
1090 if (strcmp(long_option[i].name, "x0212") == 0){
1097 if (strcmp(long_option[i].name, "exec-in") == 0){
1101 if (strcmp(long_option[i].name, "exec-out") == 0){
1106 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1107 if (strcmp(long_option[i].name, "internal-unicode") == 0){
1108 internal_unicode_f = TRUE;
1112 #ifdef UTF8_OUTPUT_ENABLE
1113 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1114 ms_ucs_map_f = TRUE;
1118 #ifdef UNICODE_NORMALIZATION
1119 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1120 input_f = UTF8_INPUT;
1125 if (strcmp(long_option[i].name, "prefix=") == 0){
1126 if (*p == '=' && ' ' < p[1] && p[1] < 128){
1127 for (i = 2; ' ' < p[i] && p[i] < 128; i++){
1128 prefix_table[p[i]] = p[1];
1135 case 'b': /* buffered mode */
1138 case 'u': /* non bufferd mode */
1141 case 't': /* transparent mode */
1144 case 'j': /* JIS output */
1146 output_conv = j_oconv;
1148 case 'e': /* AT&T EUC output */
1149 output_conv = e_oconv;
1151 case 's': /* SJIS output */
1152 output_conv = s_oconv;
1154 case 'l': /* ISO8859 Latin-1 support, no conversion */
1155 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1156 input_f = LATIN1_INPUT;
1158 case 'i': /* Kanji IN ESC-$-@/B */
1159 if (*cp=='@'||*cp=='B')
1160 kanji_intro = *cp++;
1162 case 'o': /* ASCII IN ESC-(-J/B */
1163 if (*cp=='J'||*cp=='B'||*cp=='H')
1164 ascii_intro = *cp++;
1168 bit:1 katakana->hiragana
1169 bit:2 hiragana->katakana
1171 if ('9'>= *cp && *cp>='0')
1172 hira_f |= (*cp++ -'0');
1179 #if defined(MSDOS) || defined(__OS2__)
1194 #ifdef UTF8_OUTPUT_ENABLE
1195 case 'w': /* UTF-8 output */
1196 if ('1'== cp[0] && '6'==cp[1]) {
1197 output_conv = w_oconv16; cp+=2;
1199 unicode_bom_f=2; cp++;
1202 unicode_bom_f=1; cp++;
1204 } else if (cp[0] == 'B') {
1205 unicode_bom_f=2; cp++;
1207 unicode_bom_f=1; cp++;
1210 } else if (cp[0] == '8') {
1211 output_conv = w_oconv; cp++;
1214 unicode_bom_f=1; cp++;
1217 output_conv = w_oconv;
1220 #ifdef UTF8_INPUT_ENABLE
1221 case 'W': /* UTF-8 input */
1222 if ('1'== cp[0] && '6'==cp[1]) {
1223 input_f = UTF16BE_INPUT;
1224 utf16_mode = UTF16BE_INPUT;
1228 input_f = UTF16LE_INPUT;
1229 utf16_mode = UTF16LE_INPUT;
1230 } else if (cp[0] == 'B') {
1232 input_f = UTF16BE_INPUT;
1233 utf16_mode = UTF16BE_INPUT;
1235 } else if (cp[0] == '8') {
1237 input_f = UTF8_INPUT;
1239 input_f = UTF8_INPUT;
1242 /* Input code assumption */
1243 case 'J': /* JIS input */
1244 case 'E': /* AT&T EUC input */
1245 input_f = JIS_INPUT;
1247 case 'S': /* MS Kanji input */
1248 input_f = SJIS_INPUT;
1249 if (x0201_f==NO_X0201) x0201_f=TRUE;
1251 case 'Z': /* Convert X0208 alphabet to asii */
1252 /* bit:0 Convert X0208
1253 bit:1 Convert Kankaku to one space
1254 bit:2 Convert Kankaku to two spaces
1255 bit:3 Convert HTML Entity
1257 if ('9'>= *cp && *cp>='0')
1258 alpha_f |= 1<<(*cp++ -'0');
1262 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1263 x0201_f = FALSE; /* No X0201->X0208 conversion */
1265 ESC-(-I in JIS, EUC, MS Kanji
1266 SI/SO in JIS, EUC, MS Kanji
1267 SSO in EUC, JIS, not in MS Kanji
1268 MS Kanji (0xa0-0xdf)
1270 ESC-(-I in JIS (0x20-0x5f)
1271 SSO in EUC (0xa0-0xdf)
1272 0xa0-0xdf in MS Kanji (0xa0-0xdf)
1275 case 'X': /* Assume X0201 kana */
1276 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1279 case 'F': /* prserve new lines */
1280 fold_preserve_f = TRUE;
1281 case 'f': /* folding -f60 or -f */
1284 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1286 fold_len += *cp++ - '0';
1288 if (!(0<fold_len && fold_len<BUFSIZ))
1289 fold_len = DEFAULT_FOLD;
1293 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1295 fold_margin += *cp++ - '0';
1299 case 'm': /* MIME support */
1300 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1301 if (*cp=='B'||*cp=='Q') {
1302 mime_decode_mode = *cp++;
1303 mimebuf_f = FIXED_MIME;
1304 } else if (*cp=='N') {
1305 mime_f = TRUE; cp++;
1306 } else if (*cp=='S') {
1307 mime_f = STRICT_MIME; cp++;
1308 } else if (*cp=='0') {
1309 mime_decode_f = FALSE;
1310 mime_f = FALSE; cp++;
1313 case 'M': /* MIME output */
1316 mimeout_f = FIXED_MIME; cp++;
1317 } else if (*cp=='Q') {
1319 mimeout_f = FIXED_MIME; cp++;
1324 case 'B': /* Broken JIS support */
1326 bit:1 allow any x on ESC-(-x or ESC-$-x
1327 bit:2 reset to ascii on NL
1329 if ('9'>= *cp && *cp>='0')
1330 broken_f |= 1<<(*cp++ -'0');
1335 case 'O':/* for Output file */
1339 case 'c':/* add cr code */
1342 case 'd':/* delete cr code */
1345 case 'I': /* ISO-2022-JP output */
1348 case 'L': /* line mode */
1349 if (*cp=='u') { /* unix */
1350 crmode_f = NL; cp++;
1351 } else if (*cp=='m') { /* mac */
1352 crmode_f = CR; cp++;
1353 } else if (*cp=='w') { /* windows */
1354 crmode_f = CRLF; cp++;
1355 } else if (*cp=='0') { /* no conversion */
1365 /* multiple options in a string are allowed for the Perl module */
1366 while(*cp && *cp++!='-');
1369 /* bogus option but ignored */
1375 #ifdef ANSI_C_PROTOTYPE
1376 struct input_code * find_inputcode_byfunc(int (*iconv_func)(int c2,int c1,int c0))
1378 struct input_code * find_inputcode_byfunc(iconv_func)
1379 int (*iconv_func)();
1383 struct input_code *p = input_code_list;
1385 if (iconv_func == p->iconv_func){
1394 #ifdef ANSI_C_PROTOTYPE
1395 void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
1397 void set_iconv(f, iconv_func)
1399 int (*iconv_func)();
1402 #ifdef INPUT_CODE_FIX
1410 #ifdef INPUT_CODE_FIX
1411 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1417 if (estab_f && iconv_for_check != iconv){
1418 struct input_code *p = find_inputcode_byfunc(iconv);
1420 set_input_codename(p->name);
1421 debug(input_codename);
1423 iconv_for_check = iconv;
1428 #define SCORE_L2 (1) /*
\e$BBh
\e(B2
\e$B?e=`4A;z
\e(B */
1429 #define SCORE_KANA (SCORE_L2 << 1) /*
\e$B$$$o$f$kH>3Q%+%J
\e(B */
1430 #define SCORE_DEPEND (SCORE_KANA << 1) /*
\e$B5!<o0MB8J8;z
\e(B */
1431 #ifdef SHIFTJIS_CP932
1432 #define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932
\e$B$K$h$kFI$_49$(
\e(B */
1433 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /*
\e$BB8:_$7$J$$J8;z
\e(B */
1435 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /*
\e$BB8:_$7$J$$J8;z
\e(B */
1437 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME
\e$B$K$h$k;XDj
\e(B */
1438 #define SCORE_ERROR (SCORE_iMIME << 1) /*
\e$B%(%i!<
\e(B */
1440 #define SCORE_INIT (SCORE_iMIME)
1442 const int score_table_A0[] = {
1445 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1446 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1449 const int score_table_F0[] = {
1450 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1451 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1452 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1453 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
1456 void set_code_score(ptr, score)
1457 struct input_code *ptr;
1461 ptr->score |= score;
1465 void clr_code_score(ptr, score)
1466 struct input_code *ptr;
1470 ptr->score &= ~score;
1474 void code_score(ptr)
1475 struct input_code *ptr;
1477 int c2 = ptr->buf[0];
1478 #ifdef UTF8_OUTPUT_ENABLE
1479 int c1 = ptr->buf[1];
1482 set_code_score(ptr, SCORE_ERROR);
1483 }else if (c2 == SSO){
1484 set_code_score(ptr, SCORE_KANA);
1485 #ifdef UTF8_OUTPUT_ENABLE
1486 }else if (!e2w_conv(c2, c1)){
1487 set_code_score(ptr, SCORE_NO_EXIST);
1489 }else if ((c2 & 0x70) == 0x20){
1490 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1491 }else if ((c2 & 0x70) == 0x70){
1492 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1493 }else if ((c2 & 0x70) >= 0x50){
1494 set_code_score(ptr, SCORE_L2);
1498 void status_disable(ptr)
1499 struct input_code *ptr;
1504 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
1507 void status_push_ch(ptr, c)
1508 struct input_code *ptr;
1511 ptr->buf[ptr->index++] = c;
1514 void status_clear(ptr)
1515 struct input_code *ptr;
1521 void status_reset(ptr)
1522 struct input_code *ptr;
1525 ptr->score = SCORE_INIT;
1528 void status_reinit(ptr)
1529 struct input_code *ptr;
1532 ptr->_file_stat = 0;
1535 void status_check(ptr, c)
1536 struct input_code *ptr;
1539 if (c <= DEL && estab_f){
1544 void s_status(ptr, c)
1545 struct input_code *ptr;
1550 status_check(ptr, c);
1555 #ifdef NUMCHAR_OPTION
1556 }else if ((c & CLASS_MASK) == CLASS_UTF16){
1559 }else if (0xa1 <= c && c <= 0xdf){
1560 status_push_ch(ptr, SSO);
1561 status_push_ch(ptr, c);
1564 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
1566 status_push_ch(ptr, c);
1567 #ifdef SHIFTJIS_CP932
1569 && CP932_TABLE_BEGIN <= c && c <= CP932_TABLE_END){
1571 status_push_ch(ptr, c);
1572 #endif /* SHIFTJIS_CP932 */
1574 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
1576 status_push_ch(ptr, c);
1577 #endif /* X0212_ENABLE */
1579 status_disable(ptr);
1583 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
1584 status_push_ch(ptr, c);
1585 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
1589 status_disable(ptr);
1593 #ifdef SHIFTJIS_CP932
1594 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
1595 status_push_ch(ptr, c);
1596 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
1597 set_code_score(ptr, SCORE_CP932);
1602 #endif /* SHIFTJIS_CP932 */
1603 #ifndef X0212_ENABLE
1604 status_disable(ptr);
1610 void e_status(ptr, c)
1611 struct input_code *ptr;
1616 status_check(ptr, c);
1621 #ifdef NUMCHAR_OPTION
1622 }else if ((c & CLASS_MASK) == CLASS_UTF16){
1625 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
1627 status_push_ch(ptr, c);
1629 }else if (0x8f == c){
1631 status_push_ch(ptr, c);
1632 #endif /* X0212_ENABLE */
1634 status_disable(ptr);
1638 if (0xa1 <= c && c <= 0xfe){
1639 status_push_ch(ptr, c);
1643 status_disable(ptr);
1648 if (0xa1 <= c && c <= 0xfe){
1650 status_push_ch(ptr, c);
1652 status_disable(ptr);
1654 #endif /* X0212_ENABLE */
1658 #ifdef UTF8_INPUT_ENABLE
1659 void w16_status(ptr, c)
1660 struct input_code *ptr;
1667 if (ptr->_file_stat == 0){
1668 if (c == 0xfe || c == 0xff){
1670 status_push_ch(ptr, c);
1671 ptr->_file_stat = 1;
1673 status_disable(ptr);
1674 ptr->_file_stat = -1;
1676 }else if (ptr->_file_stat > 0){
1678 status_push_ch(ptr, c);
1679 }else if (ptr->_file_stat < 0){
1680 status_disable(ptr);
1686 status_disable(ptr);
1687 ptr->_file_stat = -1;
1689 status_push_ch(ptr, c);
1696 if (ptr->stat != c && (c == 0xfe || c == 0xff)){
1697 status_push_ch(ptr, c);
1700 status_disable(ptr);
1701 ptr->_file_stat = -1;
1707 void w_status(ptr, c)
1708 struct input_code *ptr;
1713 status_check(ptr, c);
1718 #ifdef NUMCHAR_OPTION
1719 }else if ((c & CLASS_MASK) == CLASS_UTF16){
1722 }else if (0xc0 <= c && c <= 0xdf){
1724 status_push_ch(ptr, c);
1725 }else if (0xe0 <= c && c <= 0xef){
1727 status_push_ch(ptr, c);
1729 status_disable(ptr);
1734 if (0x80 <= c && c <= 0xbf){
1735 status_push_ch(ptr, c);
1736 if (ptr->index > ptr->stat){
1737 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
1738 && ptr->buf[2] == 0xbf);
1739 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
1740 &ptr->buf[0], &ptr->buf[1]);
1747 status_disable(ptr);
1758 int action_flag = 1;
1759 struct input_code *result = 0;
1760 struct input_code *p = input_code_list;
1762 (p->status_func)(p, c);
1765 }else if(p->stat == 0){
1776 if (result && !estab_f){
1777 set_iconv(TRUE, result->iconv_func);
1778 }else if (c <= DEL){
1779 struct input_code *ptr = input_code_list;
1794 return std_gc_buf[--std_gc_ndx];
1805 if (std_gc_ndx == STD_GC_BUFSIZE){
1808 std_gc_buf[std_gc_ndx++] = c;
1822 #if !defined(PERL_XS) && !defined(WIN32DLL)
1829 while ((c = (*i_getc)(f)) != EOF)
1838 oconv = output_conv;
1841 /* replace continuation module, from output side */
1843 /* output redirection */
1845 if (noout_f || guess_f){
1852 if (mimeout_f == TRUE) {
1853 o_base64conv = oconv; oconv = base64_conv;
1855 /* base64_count = 0; */
1859 o_crconv = oconv; oconv = cr_conv;
1862 o_rot_conv = oconv; oconv = rot_conv;
1865 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
1868 o_hira_conv = oconv; oconv = hira_conv;
1871 o_fconv = oconv; oconv = fold_conv;
1874 if (alpha_f || x0201_f) {
1875 o_zconv = oconv; oconv = z_conv;
1879 i_ungetc = std_ungetc;
1880 /* input redirection */
1883 i_cgetc = i_getc; i_getc = cap_getc;
1884 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
1887 i_ugetc = i_getc; i_getc = url_getc;
1888 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
1891 #ifdef NUMCHAR_OPTION
1893 i_ngetc = i_getc; i_getc = numchar_getc;
1894 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
1897 #ifdef UNICODE_NORMALIZATION
1898 if (nfc_f && input_f == UTF8_INPUT){
1899 i_nfc_getc = i_getc; i_getc = nfc_getc;
1900 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
1903 if (mime_f && mimebuf_f==FIXED_MIME) {
1904 i_mgetc = i_getc; i_getc = mime_getc;
1905 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
1908 i_bgetc = i_getc; i_getc = broken_getc;
1909 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
1911 if (input_f == JIS_INPUT || input_f == LATIN1_INPUT) {
1912 set_iconv(-TRUE, e_iconv);
1913 } else if (input_f == SJIS_INPUT) {
1914 set_iconv(-TRUE, s_iconv);
1915 #ifdef UTF8_INPUT_ENABLE
1916 } else if (input_f == UTF8_INPUT) {
1917 set_iconv(-TRUE, w_iconv);
1918 } else if (input_f == UTF16BE_INPUT) {
1919 set_iconv(-TRUE, w_iconv16);
1920 } else if (input_f == UTF16LE_INPUT) {
1921 set_iconv(-TRUE, w_iconv16);
1924 set_iconv(FALSE, e_iconv);
1928 struct input_code *p = input_code_list;
1936 Conversion main loop. Code detection only.
1945 int is_8bit = FALSE;
1947 module_connection();
1952 output_mode = ASCII;
1955 #define NEXT continue /* no output, get next */
1956 #define SEND ; /* output c1 and c2, get next */
1957 #define LAST break /* end of loop, go closing */
1959 while ((c1 = (*i_getc)(f)) != EOF) {
1964 /* in case of 8th bit is on */
1965 if (!estab_f&&!mime_decode_mode) {
1966 /* in case of not established yet */
1967 /* It is still ambiguous */
1968 if (h_conv(f, c2, c1)==EOF)
1974 /* in case of already established */
1976 /* ignore bogus code */
1982 /* second byte, 7 bit code */
1983 /* it might be kanji shifted */
1984 if ((c1 == DEL) || (c1 <= SPACE)) {
1985 /* ignore bogus first code */
1993 #ifdef UTF8_INPUT_ENABLE
2002 #ifdef NUMCHAR_OPTION
2003 } else if ((c1 & CLASS_MASK) == CLASS_UTF16){
2006 } else if (c1 > DEL) {
2008 if (!estab_f && !iso8859_f) {
2009 /* not established yet */
2010 if (!is_8bit) is_8bit = TRUE;
2013 } else { /* estab_f==TRUE */
2018 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2019 /* SJIS X0201 Case... */
2020 if(iso2022jp_f && x0201_f==NO_X0201) {
2021 (*oconv)(GETA1, GETA2);
2028 } else if (c1==SSO && iconv != s_iconv) {
2029 /* EUC X0201 Case */
2030 c1 = (*i_getc)(f); /* skip SSO */
2032 if (SSP<=c1 && c1<0xe0) {
2033 if(iso2022jp_f && x0201_f==NO_X0201) {
2034 (*oconv)(GETA1, GETA2);
2041 } else { /* bogus code, skip SSO and one byte */
2045 /* already established */
2050 } else if ((c1 > SPACE) && (c1 != DEL)) {
2051 /* in case of Roman characters */
2053 /* output 1 shifted byte */
2057 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2058 /* output 1 shifted byte */
2059 if(iso2022jp_f && x0201_f==NO_X0201) {
2060 (*oconv)(GETA1, GETA2);
2067 /* look like bogus code */
2070 } else if (input_mode == X0208) {
2071 /* in case of Kanji shifted */
2074 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2075 /* Check MIME code */
2076 if ((c1 = (*i_getc)(f)) == EOF) {
2079 } else if (c1 == '?') {
2080 /* =? is mime conversion start sequence */
2081 if(mime_f == STRICT_MIME) {
2082 /* check in real detail */
2083 if (mime_begin_strict(f) == EOF)
2087 } else if (mime_begin(f) == EOF)
2097 /* normal ASCII code */
2100 } else if (c1 == SI) {
2103 } else if (c1 == SO) {
2106 } else if (c1 == ESC ) {
2107 if ((c1 = (*i_getc)(f)) == EOF) {
2108 /* (*oconv)(0, ESC); don't send bogus code */
2110 } else if (c1 == '$') {
2111 if ((c1 = (*i_getc)(f)) == EOF) {
2113 (*oconv)(0, ESC); don't send bogus code
2114 (*oconv)(0, '$'); */
2116 } else if (c1 == '@'|| c1 == 'B') {
2117 /* This is kanji introduction */
2120 set_input_codename("ISO-2022-JP");
2122 debug(input_codename);
2125 } else if (c1 == '(') {
2126 if ((c1 = (*i_getc)(f)) == EOF) {
2127 /* don't send bogus code
2133 } else if (c1 == '@'|| c1 == 'B') {
2134 /* This is kanji introduction */
2139 } else if (c1 == 'D'){
2143 #endif /* X0212_ENABLE */
2145 /* could be some special code */
2152 } else if (broken_f&0x2) {
2153 /* accept any ESC-(-x as broken code ... */
2163 } else if (c1 == '(') {
2164 if ((c1 = (*i_getc)(f)) == EOF) {
2165 /* don't send bogus code
2167 (*oconv)(0, '('); */
2171 /* This is X0201 kana introduction */
2172 input_mode = X0201; shift_mode = X0201;
2174 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2175 /* This is X0208 kanji introduction */
2176 input_mode = ASCII; shift_mode = FALSE;
2178 } else if (broken_f&0x2) {
2179 input_mode = ASCII; shift_mode = FALSE;
2184 /* maintain various input_mode here */
2188 } else if ( c1 == 'N' || c1 == 'n' ){
2190 c3 = (*i_getc)(f); /* skip SS2 */
2191 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2206 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2207 input_mode = ASCII; set_iconv(FALSE, 0);
2209 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2210 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2218 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2219 if ((c1=(*i_getc)(f))!=EOF) {
2223 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2239 if (input_mode == X0208)
2240 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2242 else if (input_mode == X0212)
2243 (*oconv)((0x8f << 8) | c2, c1);
2244 #endif /* X0212_ENABLE */
2245 else if (input_mode)
2246 (*oconv)(input_mode, c1); /* other special case */
2247 else if ((*iconv)(c2, c1, 0) < 0){ /* can be EUC/SJIS */
2248 int c0 = (*i_getc)(f);
2251 (*iconv)(c2, c1, c0);
2257 /* goto next_word */
2261 (*iconv)(EOF, 0, 0);
2262 if (!is_inputcode_set)
2265 struct input_code *p = input_code_list;
2266 struct input_code *result = p;
2268 if (p->score < result->score) result = p;
2271 set_input_codename(result->name);
2286 /** it must NOT be in the kanji shifted sequence */
2287 /** it must NOT be written in JIS7 */
2288 /** and it must be after 2 byte 8bit code */
2295 while ((c1 = (*i_getc)(f)) != EOF) {
2301 if (push_hold_buf(c1) == EOF || estab_f){
2307 struct input_code *p = input_code_list;
2308 struct input_code *result = p;
2313 if (p->score < result->score){
2318 set_iconv(FALSE, result->iconv_func);
2323 ** 1) EOF is detected, or
2324 ** 2) Code is established, or
2325 ** 3) Buffer is FULL (but last word is pushed)
2327 ** in 1) and 3) cases, we continue to use
2328 ** Kanji codes by oconv and leave estab_f unchanged.
2333 while (wc < hold_count){
2334 c2 = hold_buf[wc++];
2336 #ifdef NUMCHAR_OPTION
2337 || (c2 & CLASS_MASK) == CLASS_UTF16
2342 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2343 (*iconv)(X0201, c2, 0);
2346 if (wc < hold_count){
2347 c1 = hold_buf[wc++];
2356 if ((*iconv)(c2, c1, 0) < 0){
2358 if (wc < hold_count){
2359 c0 = hold_buf[wc++];
2368 (*iconv)(c2, c1, c0);
2381 if (hold_count >= HOLD_SIZE*2)
2383 hold_buf[hold_count++] = c2;
2384 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
2387 int s2e_conv(c2, c1, p2, p1)
2391 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
2394 #ifdef SHIFTJIS_CP932
2395 if (cp932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
2396 extern const unsigned short shiftjis_cp932[3][189];
2397 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
2403 #endif /* SHIFTJIS_CP932 */
2405 if (x0212_f && 0xfa <= c2 && c2 <= 0xfc){
2406 extern const unsigned short shiftjis_x0212[3][189];
2407 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
2410 c2 = (0x8f << 8) | (val >> 8);
2422 c2 = c2 + c2 - ((c2 <= 0x9f) ? SJ0162 : SJ6394);
2424 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1f);
2432 c2 = x0212_unshift(c2);
2447 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2450 int ret = s2e_conv(c2, c1, &c2, &c1);
2451 if (ret) return ret;
2465 }else if (c2 == 0x8f){
2469 c2 = (c2 << 8) | (c1 & 0x7f);
2471 #ifdef SHIFTJIS_CP932
2474 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2475 s2e_conv(s2, s1, &c2, &c1);
2476 if ((c2 & 0xff00) == 0){
2482 #endif /* SHIFTJIS_CP932 */
2483 #endif /* X0212_ENABLE */
2484 } else if (c2 == SSO){
2487 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2497 #ifdef UTF8_INPUT_ENABLE
2499 w2e_conv(c2, c1, c0, p2, p1)
2503 extern const unsigned short *const utf8_to_euc_2bytes[];
2504 extern const unsigned short *const *const utf8_to_euc_3bytes[];
2507 if (0xc0 <= c2 && c2 <= 0xef) {
2508 const unsigned short *const *pp;
2511 if (c0 == 0) return -1;
2512 pp = utf8_to_euc_3bytes[c2 - 0x80];
2513 ret = w_iconv_common(c1, c0, pp, sizeof_utf8_to_euc_C2, p2, p1);
2515 ret = w_iconv_common(c2, c1, utf8_to_euc_2bytes, sizeof_utf8_to_euc_2bytes, p2, p1);
2517 #ifdef NUMCHAR_OPTION
2520 if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
2525 } else if (c2 == X0201) {
2541 if (c2 == 0) /* 0x00-0x7f */
2543 else if ((c2 & 0xe0) == 0xc0) /* 0xc0-0xdf */
2545 else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
2546 return -1; /* 3bytes */
2548 else if (0xf0 <= c2)
2549 return 0; /* 4,5,6bytes */
2550 else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
2551 return 0; /* trail byte */
2556 else if (c2 == 0xef && c1 == 0xbb && c0 == 0xbf) {
2557 return 0; /* throw BOM */
2558 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
2559 } else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
2560 unsigned short val = 0;
2565 val = ww16_conv(c2, c1, c0);
2566 c2 = (val >> 8) & 0xff;
2570 ret = w2e_conv(c2, c1, c0, &c2, &c1);
2579 w16w_conv(val, p2, p1, p0)
2587 }else if (val < 0x800){
2588 *p2 = 0xc0 | (val >> 6);
2589 *p1 = 0x80 | (val & 0x3f);
2592 *p2 = 0xe0 | (val >> 12);
2593 *p1 = 0x80 | ((val >> 6) & 0x3f);
2594 *p0 = 0x80 | (val & 0x3f);
2599 ww16_conv(c2, c1, c0)
2604 val = (c2 & 0x0f) << 12;
2605 val |= (c1 & 0x3f) << 6;
2607 }else if (c2 >= 0xc0){
2608 val = (c2 & 0x1f) << 6;
2617 w16e_conv(val, p2, p1)
2621 extern const unsigned short *const utf8_to_euc_2bytes[];
2622 extern const unsigned short *const *const utf8_to_euc_3bytes[];
2624 const unsigned short *const *pp;
2628 w16w_conv(val, &c2, &c1, &c0);
2631 pp = utf8_to_euc_3bytes[c2 - 0x80];
2632 psize = sizeof_utf8_to_euc_C2;
2633 ret = w_iconv_common(c1, c0, pp, psize, p2, p1);
2635 pp = utf8_to_euc_2bytes;
2636 psize = sizeof_utf8_to_euc_2bytes;
2637 ret = w_iconv_common(c2, c1, pp, psize, p2, p1);
2639 #ifdef NUMCHAR_OPTION
2642 *p1 = CLASS_UTF16 | val;
2654 w_iconv16(c2, c1, c0)
2659 if (c2==0376 && c1==0377){
2660 utf16_mode = UTF16BE_INPUT;
2662 } else if (c2==0377 && c1==0376){
2663 utf16_mode = UTF16LE_INPUT;
2666 if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
2668 tmp=c1; c1=c2; c2=tmp;
2670 if ((c2==0 && c1 < 0x80) || c2==EOF) {
2674 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
2675 if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16));
2677 else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
2678 if (ret) return ret;
2684 w_iconv_common(c1, c0, pp, psize, p2, p1)
2686 const unsigned short *const *pp;
2691 const unsigned short *p;
2694 if (pp == 0) return 1;
2697 if (c1 < 0 || psize <= c1) return 1;
2699 if (p == 0) return 1;
2702 if (c0 < 0 || sizeof_utf8_to_euc_E5B8 <= c0) return 1;
2704 if (val == 0) return 1;
2711 if (c2 == SO) c2 = X0201;
2720 #ifdef UTF8_OUTPUT_ENABLE
2725 extern const unsigned short euc_to_utf8_1byte[];
2726 extern const unsigned short *const euc_to_utf8_2bytes[];
2727 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
2728 const unsigned short *p;
2731 p = euc_to_utf8_1byte;
2733 } else if (c2 >> 8 == 0x8f){
2734 extern const unsigned short *const x0212_to_utf8_2bytes[];
2735 c2 = (c2&0x7f) - 0x21;
2736 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2737 p = x0212_to_utf8_2bytes[c2];
2743 c2 = (c2&0x7f) - 0x21;
2744 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2745 p = ms_ucs_map_f ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
2750 c1 = (c1 & 0x7f) - 0x21;
2751 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
2768 if (unicode_bom_f==2) {
2775 #ifdef NUMCHAR_OPTION
2776 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
2777 w16w_conv(c1, &c2, &c1, &c0);
2781 if (c0) (*o_putc)(c0);
2788 output_mode = ASCII;
2790 } else if (c2 == ISO8859_1) {
2791 output_mode = ISO8859_1;
2792 (*o_putc)(c1 | 0x080);
2795 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16))
2796 val = ((c2<<8)&0xff00) + c1;
2797 else val = e2w_conv(c2, c1);
2799 w16w_conv(val, &c2, &c1, &c0);
2803 if (c0) (*o_putc)(c0);
2819 if (unicode_bom_f==2) {
2821 (*o_putc)((unsigned char)'\377');
2825 (*o_putc)((unsigned char)'\377');
2830 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16)){
2831 } else if (c2 == ISO8859_1) {
2834 #ifdef NUMCHAR_OPTION
2835 } else if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16) {
2836 c2 = (c1 >> 8) & 0xff;
2840 unsigned short val = e2w_conv(c2, c1);
2841 c2 = (val >> 8) & 0xff;
2860 #ifdef NUMCHAR_OPTION
2861 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
2862 w16e_conv(c1, &c2, &c1);
2863 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
2871 } else if (c2 == 0) {
2872 output_mode = ASCII;
2874 } else if (c2 == X0201) {
2875 output_mode = JAPANESE_EUC;
2876 (*o_putc)(SSO); (*o_putc)(c1|0x80);
2877 } else if (c2 == ISO8859_1) {
2878 output_mode = ISO8859_1;
2879 (*o_putc)(c1 | 0x080);
2881 } else if ((c2 & 0xff00) >> 8 == 0x8f){
2882 output_mode = JAPANESE_EUC;
2883 #ifdef SHIFTJIS_CP932
2886 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2887 s2e_conv(s2, s1, &c2, &c1);
2891 if ((c2 & 0xff00) >> 8 == 0x8f){
2894 (*o_putc)((c2 & 0x7f) | 0x080);
2895 (*o_putc)(c1 | 0x080);
2898 (*o_putc)((c2 & 0x7f) | 0x080);
2899 (*o_putc)(c1 | 0x080);
2903 if ((c1<0x21 || 0x7e<c1) ||
2904 (c2<0x21 || 0x7e<c2)) {
2905 set_iconv(FALSE, 0);
2906 return; /* too late to rescue this char */
2908 output_mode = JAPANESE_EUC;
2909 (*o_putc)(c2 | 0x080);
2910 (*o_putc)(c1 | 0x080);
2920 if ((ret & 0xff00) == 0x8f00){
2921 if (0x75 <= c && c <= 0x7f){
2922 ret = c + (0x109 - 0x75);
2925 if (0x75 <= c && c <= 0x7f){
2926 ret = c + (0x113 - 0x75);
2933 int x0212_unshift(c)
2937 if (0x7f <= c && c <= 0x88){
2938 ret = c + (0x75 - 0x7f);
2939 }else if (0x89 <= c && c <= 0x92){
2940 ret = (0x8f << 8) | 0x80 | (c + (0x75 - 0x89));
2944 #endif /* X0212_ENABLE */
2947 e2s_conv(c2, c1, p2, p1)
2948 int c2, c1, *p2, *p1;
2952 const unsigned short *ptr;
2954 extern const unsigned short *const x0212_shiftjis[];
2955 if ((c2 & 0xff00) == 0x8f00){
2957 if (0x21 <= ndx && ndx <= 0x7e){
2958 ptr = x0212_shiftjis[ndx - 0x21];
2960 val = ptr[(c1 & 0x7f) - 0x21];
2970 c2 = x0212_shift(c2);
2972 #endif /* X0212_ENABLE */
2973 if ((c2 & 0xff00) == 0x8f00){
2976 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
2977 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
2986 #ifdef NUMCHAR_OPTION
2987 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
2988 w16e_conv(c1, &c2, &c1);
2994 } else if (c2 == 0) {
2995 output_mode = ASCII;
2997 } else if (c2 == X0201) {
2998 output_mode = SHIFT_JIS;
3000 } else if (c2 == ISO8859_1) {
3001 output_mode = ISO8859_1;
3002 (*o_putc)(c1 | 0x080);
3004 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3005 output_mode = SHIFT_JIS;
3006 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3012 if ((c1<0x20 || 0x7e<c1) ||
3013 (c2<0x20 || 0x7e<c2)) {
3014 set_iconv(FALSE, 0);
3015 return; /* too late to rescue this char */
3017 output_mode = SHIFT_JIS;
3018 e2s_conv(c2, c1, &c2, &c1);
3020 #ifdef SHIFTJIS_CP932
3022 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3023 extern const unsigned short cp932inv[2][189];
3024 int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3030 #endif /* SHIFTJIS_CP932 */
3033 if (prefix_table[(unsigned char)c1]){
3034 (*o_putc)(prefix_table[(unsigned char)c1]);
3045 #ifdef NUMCHAR_OPTION
3046 if ((c1 & CLASS_MASK) == CLASS_UTF16){
3047 w16e_conv(c1, &c2, &c1);
3051 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3054 (*o_putc)(ascii_intro);
3055 output_mode = ASCII;
3059 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3060 if (output_mode!=X0212) {
3061 output_mode = X0212;
3067 (*o_putc)(c2 & 0x7f);
3070 } else if (c2==X0201) {
3071 if (output_mode!=X0201) {
3072 output_mode = X0201;
3078 } else if (c2==ISO8859_1) {
3079 /* iso8859 introduction, or 8th bit on */
3080 /* Can we convert in 7bit form using ESC-'-'-A ?
3082 output_mode = ISO8859_1;
3084 } else if (c2 == 0) {
3085 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3088 (*o_putc)(ascii_intro);
3089 output_mode = ASCII;
3093 if (output_mode != X0208) {
3094 output_mode = X0208;
3097 (*o_putc)(kanji_intro);
3099 if (c1<0x20 || 0x7e<c1)
3101 if (c2<0x20 || 0x7e<c2)
3113 mime_prechar(c2, c1);
3114 (*o_base64conv)(c2,c1);
3118 STATIC int broken_buf[3];
3119 STATIC int broken_counter = 0;
3120 STATIC int broken_last = 0;
3127 if (broken_counter>0) {
3128 return broken_buf[--broken_counter];
3131 if (c=='$' && broken_last != ESC
3132 && (input_mode==ASCII || input_mode==X0201)) {
3135 if (c1=='@'|| c1=='B') {
3136 broken_buf[0]=c1; broken_buf[1]=c;
3143 } else if (c=='(' && broken_last != ESC
3144 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
3147 if (c1=='J'|| c1=='B') {
3148 broken_buf[0]=c1; broken_buf[1]=c;
3166 if (broken_counter<2)
3167 broken_buf[broken_counter++]=c;
3171 STATIC int prev_cr = 0;
3179 if (! (c2==0&&c1==NL) ) {
3185 } else if (c1=='\r') {
3187 } else if (c1=='\n') {
3188 if (crmode_f==CRLF) {
3189 (*o_crconv)(0,'\r');
3190 } else if (crmode_f==CR) {
3191 (*o_crconv)(0,'\r');
3195 } else if (c1!='\032' || crmode_f!=NL){
3201 Return value of fold_conv()
3203 \n add newline and output char
3204 \r add newline and output nothing
3207 1 (or else) normal output
3209 fold state in prev (previous character)
3211 >0x80 Japanese (X0208/X0201)
3216 This fold algorithm does not preserve leading space in a line.
3217 This is the main difference from fmt.
3220 #define char_size(c2,c1) (c2?2:1)
3229 if (c1== '\r' && !fold_preserve_f) {
3230 fold_state=0; /* ignore cr */
3231 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
3233 fold_state=0; /* ignore cr */
3234 } else if (c1== BS) {
3235 if (f_line>0) f_line--;
3237 } else if (c2==EOF && f_line != 0) { /* close open last line */
3239 } else if ((c1=='\n' && !fold_preserve_f)
3240 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
3241 && fold_preserve_f)) {
3243 if (fold_preserve_f) {
3247 } else if ((f_prev == c1 && !fold_preserve_f)
3248 || (f_prev == '\n' && fold_preserve_f)
3249 ) { /* duplicate newline */
3252 fold_state = '\n'; /* output two newline */
3258 if (f_prev&0x80) { /* Japanese? */
3260 fold_state = 0; /* ignore given single newline */
3261 } else if (f_prev==' ') {
3265 if (++f_line<=fold_len)
3269 fold_state = '\r'; /* fold and output nothing */
3273 } else if (c1=='\f') {
3278 fold_state = '\n'; /* output newline and clear */
3279 } else if ( (c2==0 && c1==' ')||
3280 (c2==0 && c1=='\t')||
3281 (c2=='!'&& c1=='!')) {
3282 /* X0208 kankaku or ascii space */
3283 if (f_prev == ' ') {
3284 fold_state = 0; /* remove duplicate spaces */
3287 if (++f_line<=fold_len)
3288 fold_state = ' '; /* output ASCII space only */
3290 f_prev = ' '; f_line = 0;
3291 fold_state = '\r'; /* fold and output nothing */
3295 prev0 = f_prev; /* we still need this one... , but almost done */
3297 if (c2 || c2==X0201)
3298 f_prev |= 0x80; /* this is Japanese */
3299 f_line += char_size(c2,c1);
3300 if (f_line<=fold_len) { /* normal case */
3303 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3304 f_line = char_size(c2,c1);
3305 fold_state = '\n'; /* We can't wait, do fold now */
3306 } else if (c2==X0201) {
3307 /* simple kinsoku rules return 1 means no folding */
3308 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3309 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3310 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3311 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3312 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3313 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3314 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3316 fold_state = '\n';/* add one new f_line before this character */
3319 fold_state = '\n';/* add one new f_line before this character */
3322 /* kinsoku point in ASCII */
3323 if ( c1==')'|| /* { [ ( */
3334 /* just after special */
3335 } else if (!is_alnum(prev0)) {
3336 f_line = char_size(c2,c1);
3338 } else if ((prev0==' ') || /* ignored new f_line */
3339 (prev0=='\n')|| /* ignored new f_line */
3340 (prev0&0x80)) { /* X0208 - ASCII */
3341 f_line = char_size(c2,c1);
3342 fold_state = '\n';/* add one new f_line before this character */
3344 fold_state = 1; /* default no fold in ASCII */
3348 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3349 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3350 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3351 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3352 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3353 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3354 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3355 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3356 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3357 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3358 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3359 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3360 /* default no fold in kinsoku */
3363 f_line = char_size(c2,c1);
3364 /* add one new f_line before this character */
3367 f_line = char_size(c2,c1);
3369 /* add one new f_line before this character */
3374 /* terminator process */
3375 switch(fold_state) {
3394 int z_prev2=0,z_prev1=0;
3401 /* if (c2) c1 &= 0x7f; assertion */
3403 if (x0201_f && z_prev2==X0201) { /* X0201 */
3404 if (c1==(0xde&0x7f)) { /*
\e$BByE@
\e(B */
3406 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
3408 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /*
\e$BH>ByE@
\e(B */
3410 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
3414 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
3423 if (x0201_f && c2==X0201) {
3424 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
3425 /* wait for dakuten (voiced sound mark) or handakuten (semi-voiced sound mark) */
3426 z_prev1 = c1; z_prev2 = c2;
3429 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
3434 /* JISX0208 Alphabet */
3435 if (alpha_f && c2 == 0x23 ) {
3437 } else if (alpha_f && c2 == 0x21 ) {
3438 /* JISX0208 Kigou */
3443 } else if (alpha_f&0x4) {
3448 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3454 case '>': entity = ">"; break;
3455 case '<': entity = "<"; break;
3456 case '\"': entity = """; break;
3457 case '&': entity = "&"; break;
3460 while (*entity) (*o_zconv)(0, *entity++);
3470 #define rot13(c) ( \
3472 (c <= 'M') ? (c + 13): \
3473 (c <= 'Z') ? (c - 13): \
3475 (c <= 'm') ? (c + 13): \
3476 (c <= 'z') ? (c - 13): \
3480 #define rot47(c) ( \
3482 ( c <= 'O' ) ? (c + 47) : \
3483 ( c <= '~' ) ? (c - 47) : \
3491 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
3497 (*o_rot_conv)(c2,c1);
3504 if ((hira_f & 1) && c2==0x25 && 0x20<c1 && c1<0x74) {
3506 } else if ((hira_f & 2) && c2==0x24 && 0x20<c1 && c1<0x74) {
3509 (*o_hira_conv)(c2,c1);
3514 iso2022jp_check_conv(c2,c1)
3517 STATIC const int range[RANGE_NUM_MAX][2] = {
3540 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3544 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3549 for (i = 0; i < RANGE_NUM_MAX; i++) {
3550 start = range[i][0];
3553 if (c >= start && c <= end) {
3558 (*o_iso2022jp_check_conv)(c2,c1);
3562 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3564 const unsigned char *mime_pattern[] = {
3565 (const unsigned char *)"\075?EUC-JP?B?",
3566 (const unsigned char *)"\075?SHIFT_JIS?B?",
3567 (const unsigned char *)"\075?ISO-8859-1?Q?",
3568 (const unsigned char *)"\075?ISO-8859-1?B?",
3569 (const unsigned char *)"\075?ISO-2022-JP?B?",
3570 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3571 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3572 (const unsigned char *)"\075?UTF-8?B?",
3573 (const unsigned char *)"\075?UTF-8?Q?",
3575 (const unsigned char *)"\075?US-ASCII?Q?",
3580 /* Marker used to raise the priority of the corresponding input code */
3581 int (*mime_priority_func[])PROTO((int c2, int c1, int c0)) = {
3582 e_iconv, s_iconv, 0, 0, 0, 0,
3583 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3589 const int mime_encode[] = {
3590 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
3591 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3598 const int mime_encode_method[] = {
3599 'B', 'B','Q', 'B', 'B', 'Q',
3600 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3608 #define MAXRECOVER 20
3610 /* I don't trust portability of toupper */
3611 #define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c)
3612 #define nkf_isdigit(c) ('0'<=c && c<='9')
3613 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=c && c<='f') || ('A'<=c && c <= 'F'))
3614 #define nkf_isblank(c) (c == SPACE || c == TAB)
3615 #define nkf_isspace(c) (nkf_isblank(c) || c == CR || c == NL)
3616 #define nkf_isalpha(c) (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
3617 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
3622 if (i_getc!=mime_getc) {
3623 i_mgetc = i_getc; i_getc = mime_getc;
3624 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3625 if(mime_f==STRICT_MIME) {
3626 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3627 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3633 unswitch_mime_getc()
3635 if(mime_f==STRICT_MIME) {
3636 i_mgetc = i_mgetc_buf;
3637 i_mungetc = i_mungetc_buf;
3640 i_ungetc = i_mungetc;
3641 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3642 mime_iconv_back = NULL;
3646 mime_begin_strict(f)
3651 const unsigned char *p,*q;
3652 int r[MAXRECOVER]; /* recovery buffer, max mime pattern lenght */
3654 mime_decode_mode = FALSE;
3655 /* =? has been checked */
3657 p = mime_pattern[j];
3660 for(i=2;p[i]>' ';i++) { /* start at =? */
3661 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
3662 /* pattern fails, try next one */
3664 while ((p = mime_pattern[++j])) {
3665 for(k=2;k<i;k++) /* assume length(p) > i */
3666 if (p[k]!=q[k]) break;
3667 if (k==i && nkf_toupper(c1)==p[k]) break;
3669 if (p) continue; /* found next one, continue */
3670 /* all fails, output from recovery buffer */
3678 mime_decode_mode = p[i-2];
3680 mime_iconv_back = iconv;
3681 set_iconv(FALSE, mime_priority_func[j]);
3682 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3684 if (mime_decode_mode=='B') {
3685 mimebuf_f = unbuf_f;
3687 /* do MIME integrity check */
3688 return mime_integrity(f,mime_pattern[j]);
3700 /* we don't keep eof of Fifo, because it contains ?= as
3701 a terminator. It was checked in mime_integrity. */
3702 return ((mimebuf_f)?
3703 (*i_mgetc_buf)(f):Fifo(mime_input++));
3707 mime_ungetc_buf(c,f)
3712 (*i_mungetc_buf)(c,f);
3714 Fifo(--mime_input)=c;
3725 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
3726 /* re-read and convert again from mime_buffer. */
3728 /* =? has been checked */
3730 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
3731 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
3732 /* We accept any character type even if it is broken by new lines */
3733 c1 = (*i_getc)(f); Fifo(mime_last++)= c1 ;
3734 if (c1=='\n'||c1==' '||c1=='\r'||
3735 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
3737 /* Failed. But this could be another MIME preamble */
3745 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
3746 if (!(++i<MAXRECOVER) || c1==EOF) break;
3747 if (c1=='b'||c1=='B') {
3748 mime_decode_mode = 'B';
3749 } else if (c1=='q'||c1=='Q') {
3750 mime_decode_mode = 'Q';
3754 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
3755 if (!(++i<MAXRECOVER) || c1==EOF) break;
3757 mime_decode_mode = FALSE;
3763 if (!mime_decode_mode) {
3764 /* false MIME preamble, restart from mime_buffer */
3765 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
3766 /* Since we are in MIME mode until buffer becomes empty, */
3767 /* we never go into mime_begin again for a while. */
3770 /* discard mime preamble, and goto MIME mode */
3772 /* do no MIME integrity check */
3773 return c1; /* used only for checking EOF */
3788 fprintf(stderr, "%s\n", str);
3794 set_input_codename (codename)
3799 strcmp(codename, "") != 0 &&
3800 strcmp(codename, input_codename) != 0)
3802 is_inputcode_mixed = TRUE;
3804 input_codename = codename;
3805 is_inputcode_set = TRUE;
3808 #if !defined(PERL_XS) && !defined(WIN32DLL)
3810 print_guessed_code (filename)
3813 char *codename = "BINARY";
3814 if (!is_inputcode_mixed) {
3815 if (strcmp(input_codename, "") == 0) {
3818 codename = input_codename;
3821 if (filename != NULL) printf("%s:", filename);
3822 printf("%s\n", codename);
3830 if (nkf_isdigit(x)) return x - '0';
3831 return nkf_toupper(x) - 'A' + 10;
3836 #ifdef ANSI_C_PROTOTYPE
3837 int hex_getc(int ch, FILE *f, int (*g)(FILE *f), int (*u)(int c, FILE *f))
3840 hex_getc(ch, f, g, u)
3853 if (!nkf_isxdigit(c2)){
3858 if (!nkf_isxdigit(c3)){
3863 return (hex2bin(c2) << 4) | hex2bin(c3);
3870 return hex_getc(':', f, i_cgetc, i_cungetc);
3878 return (*i_cungetc)(c, f);
3885 return hex_getc('%', f, i_ugetc, i_uungetc);
3893 return (*i_uungetc)(c, f);
3897 #ifdef NUMCHAR_OPTION
3902 int (*g)() = i_ngetc;
3903 int (*u)() = i_nungetc;
3914 if (buf[i] == 'x' || buf[i] == 'X'){
3915 for (j = 0; j < 5; j++){
3917 if (!nkf_isxdigit(buf[i])){
3924 c |= hex2bin(buf[i]);
3927 for (j = 0; j < 6; j++){
3931 if (!nkf_isdigit(buf[i])){
3938 c += hex2bin(buf[i]);
3944 return CLASS_UTF16 | c;
3954 numchar_ungetc(c, f)
3958 return (*i_nungetc)(c, f);
3962 #ifdef UNICODE_NORMALIZATION
3964 /* Normalization Form C */
3969 int (*g)() = i_nfc_getc;
3970 int (*u)() = i_nfc_ungetc;
3971 int i=0, j, k=1, lower, upper;
3973 const int *array = NULL;
3974 extern const struct normalization_pair normalization_table[];
3977 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
3978 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
3979 while (upper >= lower) {
3980 j = (lower+upper) / 2;
3981 array = normalization_table[j].nfd;
3982 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
3983 if (array[k] != buf[k]){
3984 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
3991 array = normalization_table[j].nfc;
3992 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4009 return (*i_nfc_ungetc)(c, f);
4011 #endif /* UNICODE_NORMALIZATION */
4018 int c1, c2, c3, c4, cc;
4019 int t1, t2, t3, t4, mode, exit_mode;
4023 int lwsp_size = 128;
4025 if (mime_top != mime_last) { /* Something is in FIFO */
4026 return Fifo(mime_top++);
4028 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4029 mime_decode_mode=FALSE;
4030 unswitch_mime_getc();
4031 return (*i_getc)(f);
4034 if (mimebuf_f == FIXED_MIME)
4035 exit_mode = mime_decode_mode;
4038 if (mime_decode_mode == 'Q') {
4039 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4041 if (c1=='_') return ' ';
4042 if (c1<=' ' || DEL<=c1) {
4043 mime_decode_mode = exit_mode; /* prepare for quit */
4046 if (c1!='=' && c1!='?') {
4050 mime_decode_mode = exit_mode; /* prepare for quit */
4051 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4052 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4053 /* end Q encoding */
4054 input_mode = exit_mode;
4056 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4057 if (lwsp_buf==NULL) {
4058 perror("can't malloc");
4061 while ((c1=(*i_getc)(f))!=EOF) {
4066 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4074 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
4075 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4090 lwsp_buf[lwsp_count] = c1;
4091 if (lwsp_count++>lwsp_size){
4093 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4094 if (lwsp_buf_new==NULL) {
4097 perror("can't realloc");
4100 lwsp_buf = lwsp_buf_new;
4106 if (lwsp_count > 0) {
4107 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4111 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4112 i_ungetc(lwsp_buf[lwsp_count],f);
4120 if (c1=='='&&c2<' ') { /* this is soft wrap */
4121 while((c1 = (*i_mgetc)(f)) <=' ') {
4122 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4124 mime_decode_mode = 'Q'; /* still in MIME */
4125 goto restart_mime_q;
4128 mime_decode_mode = 'Q'; /* still in MIME */
4132 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4133 if (c2<=' ') return c2;
4134 mime_decode_mode = 'Q'; /* still in MIME */
/*
 * hex(c): numeric value of the hexadecimal digit character c.
 * Accepts '0'-'9', 'A'-'F' and 'a'-'f'; any other character yields 0.
 * Every use of the parameter is parenthesized so that an operator
 * expression can be passed safely.  The argument is still evaluated
 * several times, so it must not have side effects.
 */
#define hex(c) (('0'<=(c)&&(c)<='9')?((c)-'0'):\
    ('A'<=(c)&&(c)<='F')?((c)-'A'+10):('a'<=(c)&&(c)<='f')?((c)-'a'+10):0)
4137 return ((hex(c2)<<4) + hex(c3));
4140 if (mime_decode_mode != 'B') {
4141 mime_decode_mode = FALSE;
4142 return (*i_mgetc)(f);
4146 /* Base64 encoding */
4148 MIME allows line break in the middle of
4149 Base64, but we are very pessimistic in decoding
4150 in unbuf mode because MIME encoded code may be broken by
4151 less or an editor's control sequence (such as ESC-[-K) in unbuffered
4152 mode. Ignore incomplete MIME.
4154 mode = mime_decode_mode;
4155 mime_decode_mode = exit_mode; /* prepare for quit */
4157 while ((c1 = (*i_mgetc)(f))<=' ') {
4162 if ((c2 = (*i_mgetc)(f))<=' ') {
4165 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4166 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4169 if ((c1 == '?') && (c2 == '=')) {
4172 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4173 if (lwsp_buf==NULL) {
4174 perror("can't malloc");
4177 while ((c1=(*i_getc)(f))!=EOF) {
4182 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4190 if ((c1=(*i_getc)(f))!=EOF) {
4194 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4209 lwsp_buf[lwsp_count] = c1;
4210 if (lwsp_count++>lwsp_size){
4212 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4213 if (lwsp_buf_new==NULL) {
4216 perror("can't realloc");
4219 lwsp_buf = lwsp_buf_new;
4225 if (lwsp_count > 0) {
4226 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4230 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4231 i_ungetc(lwsp_buf[lwsp_count],f);
4240 if ((c3 = (*i_mgetc)(f))<=' ') {
4243 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4244 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4248 if ((c4 = (*i_mgetc)(f))<=' ') {
4251 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4252 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4256 mime_decode_mode = mode; /* still in MIME sigh... */
4258 /* BASE 64 decoding */
4260 t1 = 0x3f & base64decode(c1);
4261 t2 = 0x3f & base64decode(c2);
4262 t3 = 0x3f & base64decode(c3);
4263 t4 = 0x3f & base64decode(c4);
4264 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4266 Fifo(mime_last++) = cc;
4267 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4269 Fifo(mime_last++) = cc;
4270 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4272 Fifo(mime_last++) = cc;
4277 return Fifo(mime_top++);
4285 Fifo(--mime_top) = c;
4292 const unsigned char *p;
4296 /* In buffered mode, read until =? or NL or buffer full
4298 mime_input = mime_top;
4299 mime_last = mime_top;
4301 while(*p) Fifo(mime_input++) = *p++;
4304 while((c=(*i_getc)(f))!=EOF) {
4305 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
4306 break; /* buffer full */
4308 if (c=='=' && d=='?') {
4309 /* checked. skip header, start decode */
4310 Fifo(mime_input++) = c;
4311 /* mime_last_input = mime_input; */
4316 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4318 /* Should we check length mod 4? */
4319 Fifo(mime_input++) = c;
4322 /* In case of Incomplete MIME, no MIME decode */
4323 Fifo(mime_input++) = c;
4324 mime_last = mime_input; /* point undecoded buffer */
4325 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
4326 switch_mime_getc(); /* anyway we need buffered getc */
4337 i = c - 'A'; /* A..Z 0-25 */
4339 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4341 } else if (c > '/') {
4342 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4343 } else if (c == '+') {
4344 i = '>' /* 62 */ ; /* + 62 */
4346 i = '?' /* 63 */ ; /* / 63 */
4351 STATIC const char basis_64[] =
4352 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4355 #define MIMEOUT_BUF_LENGTH (60)
4356 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
4357 int mimeout_buf_count = 0;
4358 int mimeout_preserve_space = 0;
/*
 * itoh4(c): uppercase hexadecimal digit character for the nibble value c
 * (intended for 0..15; callers mask with 0xf before calling).
 * The parameter is parenthesized so that expressions such as
 * itoh4(x & 0xf) group correctly.  The argument is evaluated more than
 * once, so it must not have side effects.
 */
#define itoh4(c) ((c)>=10?(c)+'A'-10:(c)+'0')
4365 const unsigned char *p;
4368 p = mime_pattern[0];
4369 for(i=0;mime_encode[i];i++) {
4370 if (mode == mime_encode[i]) {
4371 p = mime_pattern[i];
4375 mimeout_mode = mime_encode_method[i];
4378 if (base64_count>45) {
4379 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
4380 (*o_mputc)(mimeout_buf[i]);
4386 if (!mimeout_preserve_space && mimeout_buf_count>0
4387 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4388 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
4392 if (!mimeout_preserve_space) {
4393 for (;i<mimeout_buf_count;i++) {
4394 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4395 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
4396 (*o_mputc)(mimeout_buf[i]);
4403 mimeout_preserve_space = FALSE;
4409 j = mimeout_buf_count;
4410 mimeout_buf_count = 0;
4412 mime_putc(mimeout_buf[i]);
4428 switch(mimeout_mode) {
4433 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
4439 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
4445 if (mimeout_f!=FIXED_MIME) {
4447 } else if (mimeout_mode != 'Q')
4456 switch(mimeout_mode) {
4461 } else if (c==CR||c==NL) {
4464 } else if(c<SPACE||c=='='||c=='?'||c=='_'||DEL<=c) {
4466 (*o_mputc)(itoh4(((c>>4)&0xf)));
4467 (*o_mputc)(itoh4((c&0xf)));
4476 (*o_mputc)(basis_64[c>>2]);
4481 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4487 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
4488 (*o_mputc)(basis_64[c & 0x3F]);
4499 int mime_lastchar2, mime_lastchar1;
4501 void mime_prechar(c2, c1)
4506 if (base64_count + mimeout_buf_count/3*4> 66){
4507 (*o_base64conv)(EOF,0);
4508 (*o_base64conv)(0,NL);
4509 (*o_base64conv)(0,SPACE);
4511 }/*else if (mime_lastchar2){
4512 if (c1 <=DEL && !nkf_isspace(c1)){
4513 (*o_base64conv)(0,SPACE);
4517 if (c2 && mime_lastchar2 == 0
4518 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
4519 (*o_base64conv)(0,SPACE);
4522 mime_lastchar2 = c2;
4523 mime_lastchar1 = c1;
4534 if (mimeout_f == FIXED_MIME){
4535 if (mimeout_mode == 'Q'){
4536 if (base64_count > 71){
4537 if (c!=CR && c!=NL) {
4544 if (base64_count > 71){
4549 if (c == EOF) { /* c==EOF */
4553 if (c != EOF) { /* c!=EOF */
4559 /* mimeout_f != FIXED_MIME */
4561 if (c == EOF) { /* c==EOF */
4562 j = mimeout_buf_count;
4563 mimeout_buf_count = 0;
4566 /*if (nkf_isspace(mimeout_buf[i])){
4569 mimeout_addchar(mimeout_buf[i]);
4573 (*o_mputc)(mimeout_buf[i]);
4579 if (mimeout_mode=='Q') {
4580 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
4592 if (mimeout_buf_count > 0){
4593 lastchar = mimeout_buf[mimeout_buf_count - 1];
4598 if (!mimeout_mode) {
4599 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
4600 if (nkf_isspace(c)) {
4601 if (c==CR || c==NL) {
4604 for (i=0;i<mimeout_buf_count;i++) {
4605 (*o_mputc)(mimeout_buf[i]);
4606 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
4613 mimeout_buf_count = 1;
4615 if (base64_count > 1
4616 && base64_count + mimeout_buf_count > 76){
4619 if (!nkf_isspace(mimeout_buf[0])){
4624 mimeout_buf[mimeout_buf_count++] = c;
4625 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
4626 open_mime(output_mode);
4631 if (lastchar==CR || lastchar == NL){
4632 for (i=0;i<mimeout_buf_count;i++) {
4633 (*o_mputc)(mimeout_buf[i]);
4636 mimeout_buf_count = 0;
4638 if (lastchar==SPACE) {
4639 for (i=0;i<mimeout_buf_count-1;i++) {
4640 (*o_mputc)(mimeout_buf[i]);
4643 mimeout_buf[0] = SPACE;
4644 mimeout_buf_count = 1;
4646 open_mime(output_mode);
4649 /* mimeout_mode == 'B', 1, 2 */
4650 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
4651 if (lastchar == CR || lastchar == NL){
4652 if (nkf_isblank(c)) {
4653 for (i=0;i<mimeout_buf_count;i++) {
4654 mimeout_addchar(mimeout_buf[i]);
4656 mimeout_buf_count = 0;
4657 } else if (SPACE<c && c<DEL) {
4659 for (i=0;i<mimeout_buf_count;i++) {
4660 (*o_mputc)(mimeout_buf[i]);
4663 mimeout_buf_count = 0;
4666 if (c==SPACE || c==TAB || c==CR || c==NL) {
4667 for (i=0;i<mimeout_buf_count;i++) {
4668 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
4670 for (i=0;i<mimeout_buf_count;i++) {
4671 (*o_mputc)(mimeout_buf[i]);
4674 mimeout_buf_count = 0;
4677 mimeout_buf[mimeout_buf_count++] = c;
4678 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
4680 for (i=0;i<mimeout_buf_count;i++) {
4681 (*o_mputc)(mimeout_buf[i]);
4684 mimeout_buf_count = 0;
4688 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
4689 mimeout_buf[mimeout_buf_count++] = c;
4690 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
4691 j = mimeout_buf_count;
4692 mimeout_buf_count = 0;
4694 mimeout_addchar(mimeout_buf[i]);
4701 if (mimeout_buf_count>0) {
4702 j = mimeout_buf_count;
4703 mimeout_buf_count = 0;
4705 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
4707 mimeout_addchar(mimeout_buf[i]);
4713 (*o_mputc)(mimeout_buf[i]);
4715 open_mime(output_mode);
4722 #if defined(PERL_XS) || defined(WIN32DLL)
4727 struct input_code *p = input_code_list;
4740 mime_f = STRICT_MIME;
4741 mime_decode_f = FALSE;
4746 #if defined(MSDOS) || defined(__OS2__)
4751 iso2022jp_f = FALSE;
4752 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
4753 internal_unicode_f = FALSE;
4755 #ifdef UTF8_OUTPUT_ENABLE
4758 ms_ucs_map_f = FALSE;
4760 #ifdef UNICODE_NORMALIZATION
4773 is_inputcode_mixed = FALSE;
4774 is_inputcode_set = FALSE;
4778 #ifdef SHIFTJIS_CP932
4784 for (i = 0; i < 256; i++){
4785 prefix_table[i] = 0;
4788 #ifdef UTF8_INPUT_ENABLE
4789 utf16_mode = UTF16BE_INPUT;
4791 mimeout_buf_count = 0;
4796 fold_preserve_f = FALSE;
4799 kanji_intro = DEFAULT_J;
4800 ascii_intro = DEFAULT_R;
4801 fold_margin = FOLD_MARGIN;
4802 output_conv = DEFAULT_CONV;
4803 oconv = DEFAULT_CONV;
4804 o_zconv = no_connection;
4805 o_fconv = no_connection;
4806 o_crconv = no_connection;
4807 o_rot_conv = no_connection;
4808 o_hira_conv = no_connection;
4809 o_base64conv = no_connection;
4810 o_iso2022jp_check_conv = no_connection;
4813 i_ungetc = std_ungetc;
4815 i_bungetc = std_ungetc;
4818 i_mungetc = std_ungetc;
4819 i_mgetc_buf = std_getc;
4820 i_mungetc_buf = std_ungetc;
4821 output_mode = ASCII;
4824 mime_decode_mode = FALSE;
4830 z_prev2=0,z_prev1=0;
4832 iconv_for_check = 0;
4834 input_codename = "";
4842 no_connection(c2,c1)
4845 no_connection2(c2,c1,0);
4849 no_connection2(c2,c1,c0)
4852 fprintf(stderr,"nkf internal module connection failure.\n");
4854 return 0; /* LINT */
4859 #define fprintf dllprintf
4864 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
4865 fprintf(stderr,"Flags:\n");
4866 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
4867 #ifdef DEFAULT_CODE_SJIS
4868 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC), UTF-8N\n");
4870 #ifdef DEFAULT_CODE_JIS
4871 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC), UTF-8N\n");
4873 #ifdef DEFAULT_CODE_EUC
4874 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT), UTF-8N\n");
4876 #ifdef DEFAULT_CODE_UTF8
4877 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC), UTF-8N (DEFAULT)\n");
4879 #ifdef UTF8_OUTPUT_ENABLE
4880 fprintf(stderr," After 'w' you can add more options. (80?|16((B|L)0?)?) \n");
4882 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, AT&T JIS (EUC), UTF-8\n");
4883 #ifdef UTF8_INPUT_ENABLE
4884 fprintf(stderr," After 'W' you can add more options. (8|16(B|L)?) \n");
4886 fprintf(stderr,"t no conversion\n");
4887 fprintf(stderr,"i_/o_ Output sequence to designate JIS-kanji/ASCII (DEFAULT B)\n");
4888 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
4889 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
4890 fprintf(stderr,"v Show this usage. V: show version\n");
4891 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
4892 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
4893 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
4894 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
4895 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII 1: Kankaku to space,2: 2 spaces,\n");
4896 fprintf(stderr," 3: Convert HTML Entity\n");
4897 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
4898 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
4900 fprintf(stderr,"T Text mode output\n");
4902 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
4903 fprintf(stderr,"d,c Delete \\r in line feed and \\032, Add \\r in line feed\n");
4904 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
4905 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
4906 fprintf(stderr,"long name options\n");
4907 fprintf(stderr," --fj,--unix,--mac,--windows convert for the system\n");
4908 fprintf(stderr," --jis,--euc,--sjis,--utf8,--utf16,--mime,--base64 convert for the code\n");
4909 fprintf(stderr," --hiragana, --katakana Hiragana/Katakana Conversion\n");
4910 fprintf(stderr," --x0212 Convert JISX0212\n");
4911 fprintf(stderr," --cp932, --no-cp932 CP932 compatibility\n");
4912 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
4914 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
4916 #ifdef NUMCHAR_OPTION
4917 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
4919 #ifdef UNICODE_NORMALIZATION
4920 fprintf(stderr," --utf8mac-input UTF-8-MAC input\n");
4922 #ifdef UTF8_OUTPUT_ENABLE
4923 fprintf(stderr," --ms-ucs-map Microsoft UCS Mapping Compatible\n");
4926 fprintf(stderr," --overwrite Overwrite original listed files by filtered result\n");
4928 fprintf(stderr," -g, --guess Guess the input code\n");
4929 fprintf(stderr," --help,--version\n");
4936 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
4937 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__)
4940 #if defined(MSDOS) && defined(__WIN16__)
4943 #if defined(MSDOS) && defined(__WIN32__)
4949 ,NKF_VERSION,NKF_RELEASE_DATE);
4950 fprintf(stderr,"\n%s\n",CopyRight);
4955 ** Patch authors
4956 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
4957 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
4958 ** ohta@src.ricoh.co.jp (Junn Ohta)
4959 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
4960 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
4961 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
4962 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
4963 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
4964 ** GHG00637@nifty-serve.or.jp (COW)