1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.74 2005/07/18 16:24:35 naruse Exp $ */
43 #define NKF_VERSION "2.0.5"
44 #define NKF_RELEASE_DATE "2005-07-19"
47 static char *CopyRight =
48 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW, 2002-2005 Kono, Furukawa, Naruse";
55 ** USAGE: nkf [flags] [file]
58 ** b Output is buffered (DEFAULT)
59 ** u Output is unbuffered
63 ** j Output code is JIS 7 bit (DEFAULT SELECT)
64 ** s Output code is MS Kanji (DEFAULT SELECT)
65 ** e Output code is AT&T EUC (DEFAULT SELECT)
66 ** w Output code is UTF-8 (DEFAULT SELECT)
67 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
69 ** m MIME conversion for ISO-2022-JP
70 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
71 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
72 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
73 ** M MIME output conversion
75 ** r {de/en}crypt ROT13/47
79 ** T Text mode output (for MS-DOS)
81 ** x Do not convert X0201 kana into X0208
82 ** Z Convert X0208 alphabet to ASCII
87 ** B try to fix broken JIS, missing Escape
88 ** B[1-9] broken level
90 ** O Output to 'nkf.out' file or last file name
91 ** d Delete \r in line feed
92 ** c Add \r in line feed
93 ** -- other long option
94 ** -- ignore following option (don't use with -O )
98 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__)) && !defined(MSDOS)
100 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
108 #if defined( UTF8_OUTPUT_ENABLE ) || defined( UTF8_INPUT_ENABLE )
109 #define UNICODE_ENABLE
111 #undef UNICODE_NORMALIZATION
120 #if defined(MSDOS) || defined(__OS2__)
127 #define setbinmode(fp) fsetbin(fp)
128 #else /* Microsoft C, Turbo C */
129 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
131 #else /* UNIX,OS/2 */
132 #define setbinmode(fp)
135 #ifdef _IOFBF /* SysV and MSDOS, Windows */
136 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
138 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
141 /*Borland C++ 4.5 EasyWin*/
142 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
151 /* added by satoru@isoternet.org */
153 #include <sys/stat.h>
154 #ifndef MSDOS /* UNIX, OS/2 */
157 #else /* defined(MSDOS) */
159 #ifdef __BORLANDC__ /* BCC32 */
161 #else /* !defined(__BORLANDC__) */
162 #include <sys/utime.h>
163 #endif /* (__BORLANDC__) */
164 #else /* !defined(__WIN32__) */
165 #if defined(_MSC_VER) || defined(__MINGW32__) /* VC++, MinGW */
166 #include <sys/utime.h>
167 #elif defined(__TURBOC__) /* BCC */
169 #elif defined(LSI_C) /* LSI C */
170 #endif /* (__WIN32__) */
182 /* state of output_mode and input_mode
200 /* Input Assumption */
204 #define LATIN1_INPUT 6
206 #define STRICT_MIME 8
211 #define JAPANESE_EUC 10
215 #define UTF8_INPUT 13
216 #define UTF16BE_INPUT 14
217 #define UTF16LE_INPUT 15
237 #define is_alnum(c) \
238 (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9'))
240 #define HOLD_SIZE 1024
241 #define IOBUF_SIZE 16384
243 #define DEFAULT_J 'B'
244 #define DEFAULT_R 'B'
246 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
247 #define SJ6394 0x0161 /* 63 - 94 ku offset */
249 #define RANGE_NUM_MAX 18
254 #ifdef UNICODE_ENABLE
255 #define sizeof_euc_utf8 94
256 #define sizeof_euc_to_utf8_1byte 94
257 #define sizeof_euc_to_utf8_2bytes 94
258 #define sizeof_utf8_to_euc_C2 64
259 #define sizeof_utf8_to_euc_E5B8 64
260 #define sizeof_utf8_to_euc_2bytes 112
261 #define sizeof_utf8_to_euc_3bytes 112
264 /* MIME preprocessor */
267 #ifdef EASYWIN /*Easy Win */
268 extern POINT _BufferSize;
271 /* function prototype */
273 #ifdef ANSI_C_PROTOTYPE
275 #define STATIC static
287 void (*status_func)PROTO((struct input_code *, int));
288 int (*iconv_func)PROTO((int c2, int c1, int c0));
292 STATIC char *input_codename = "";
294 STATIC int noconvert PROTO((FILE *f));
295 STATIC int kanji_convert PROTO((FILE *f));
296 STATIC int h_conv PROTO((FILE *f,int c2,int c1));
297 STATIC int push_hold_buf PROTO((int c2));
298 STATIC void set_iconv PROTO((int f, int (*iconv_func)(int c2,int c1,int c0)));
299 STATIC int s_iconv PROTO((int c2,int c1,int c0));
300 STATIC int s2e_conv PROTO((int c2, int c1, int *p2, int *p1));
301 STATIC int e_iconv PROTO((int c2,int c1,int c0));
302 #ifdef UTF8_INPUT_ENABLE
303 STATIC int w2e_conv PROTO((int c2,int c1,int c0,int *p2,int *p1));
304 STATIC int w_iconv PROTO((int c2,int c1,int c0));
305 STATIC int w_iconv16 PROTO((int c2,int c1,int c0));
306 STATIC int w_iconv_common PROTO((int c1,int c0,unsigned short **pp,int psize,int *p2,int *p1));
307 STATIC int ww16_conv PROTO((int c2, int c1, int c0));
309 #ifdef UTF8_OUTPUT_ENABLE
310 STATIC int e2w_conv PROTO((int c2,int c1));
311 STATIC void w_oconv PROTO((int c2,int c1));
312 STATIC void w_oconv16 PROTO((int c2,int c1));
314 STATIC void e_oconv PROTO((int c2,int c1));
315 STATIC int e2s_conv PROTO((int c2, int c1, int *p2, int *p1));
316 STATIC void s_oconv PROTO((int c2,int c1));
317 STATIC void j_oconv PROTO((int c2,int c1));
318 STATIC void fold_conv PROTO((int c2,int c1));
319 STATIC void cr_conv PROTO((int c2,int c1));
320 STATIC void z_conv PROTO((int c2,int c1));
321 STATIC void rot_conv PROTO((int c2,int c1));
322 STATIC void hira_conv PROTO((int c2,int c1));
323 STATIC void base64_conv PROTO((int c2,int c1));
324 STATIC void iso2022jp_check_conv PROTO((int c2,int c1));
325 STATIC void no_connection PROTO((int c2,int c1));
326 STATIC int no_connection2 PROTO((int c2,int c1,int c0));
328 STATIC void code_score PROTO((struct input_code *ptr));
329 STATIC void code_status PROTO((int c));
331 STATIC void std_putc PROTO((int c));
332 STATIC int std_getc PROTO((FILE *f));
333 STATIC int std_ungetc PROTO((int c,FILE *f));
335 STATIC int broken_getc PROTO((FILE *f));
336 STATIC int broken_ungetc PROTO((int c,FILE *f));
338 STATIC int mime_begin PROTO((FILE *f));
339 STATIC int mime_getc PROTO((FILE *f));
340 STATIC int mime_ungetc PROTO((int c,FILE *f));
342 STATIC int mime_begin_strict PROTO((FILE *f));
343 STATIC int mime_getc_buf PROTO((FILE *f));
344 STATIC int mime_ungetc_buf PROTO((int c,FILE *f));
345 STATIC int mime_integrity PROTO((FILE *f,unsigned char *p));
347 STATIC int base64decode PROTO((int c));
348 STATIC void mime_prechar PROTO((int c2, int c1));
349 STATIC void mime_putc PROTO((int c));
350 STATIC void open_mime PROTO((int c));
351 STATIC void close_mime PROTO(());
352 STATIC void usage PROTO(());
353 STATIC void version PROTO(());
354 STATIC void options PROTO((unsigned char *c));
355 #if defined(PERL_XS) || defined(WIN32DLL)
356 STATIC void reinit PROTO(());
361 static unsigned char stdibuf[IOBUF_SIZE];
362 static unsigned char stdobuf[IOBUF_SIZE];
363 static unsigned char hold_buf[HOLD_SIZE*2];
364 static int hold_count;
366 /* MIME preprocessor fifo */
368 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
369 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
370 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
371 static unsigned char mime_buf[MIME_BUF_SIZE];
372 static unsigned int mime_top = 0;
373 static unsigned int mime_last = 0; /* decoded */
374 static unsigned int mime_input = 0; /* undecoded */
375 static int (*mime_iconv_back)PROTO((int c2,int c1,int c0)) = NULL;
378 static int unbuf_f = FALSE; /* NOTE(review): presumably unbuffered output (-u) -- confirm */
379 static int estab_f = FALSE; /* tested in set_iconv/status_check; presumably "input code established" -- confirm */
380 static int nop_f = FALSE; /* NOTE(review): presumably no-conversion (transparent) mode -- confirm */
381 static int binmode_f = TRUE; /* binary mode */
382 static int rot_f = FALSE; /* ROT13/47 mode */
383 static int hira_f = FALSE; /* hiragana/katakana conversion */
384 static int input_f = FALSE; /* non fixed input code */
385 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
386 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
387 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
388 static int mimebuf_f = FALSE; /* MIME buffered input */
389 static int broken_f = FALSE; /* convert ESC-less broken JIS */
390 static int iso8859_f = FALSE; /* ISO8859 through */
391 static int mimeout_f = FALSE; /* base64 mode */
392 #if defined(MSDOS) || defined(__OS2__)
393 static int x0201_f = TRUE; /* Assume JISX0201 kana */
395 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
397 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
398 #ifdef UNICODE_ENABLE
399 static int internal_unicode_f = FALSE; /* Internal Unicode Processing */
401 #ifdef UTF8_OUTPUT_ENABLE
402 static int unicode_bom_f= 0; /* Output Unicode BOM */
403 static int w_oconv16_LE = 0; /* utf-16 little endian */
404 static int ms_ucs_map_f = FALSE; /* Microsoft UCS Mapping Compatible */
407 #ifdef UNICODE_NORMALIZATION
408 static int nfc_f = FALSE;
409 static int (*i_nfc_getc)PROTO((FILE *)) = std_getc; /* input of nfc_getc */
410 static int (*i_nfc_ungetc)PROTO((int c ,FILE *f)) = std_ungetc;
411 STATIC int nfc_getc PROTO((FILE *f));
412 STATIC int nfc_ungetc PROTO((int c,FILE *f));
416 static int cap_f = FALSE;
417 static int (*i_cgetc)PROTO((FILE *)) = std_getc; /* input of cgetc */
418 static int (*i_cungetc)PROTO((int c ,FILE *f)) = std_ungetc;
419 STATIC int cap_getc PROTO((FILE *f));
420 STATIC int cap_ungetc PROTO((int c,FILE *f));
422 static int url_f = FALSE;
423 static int (*i_ugetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
424 static int (*i_uungetc)PROTO((int c ,FILE *f)) = std_ungetc;
425 STATIC int url_getc PROTO((FILE *f));
426 STATIC int url_ungetc PROTO((int c,FILE *f));
429 #ifdef NUMCHAR_OPTION
430 #define CLASS_MASK 0x0f000000
431 #define CLASS_UTF16 0x01000000
432 static int numchar_f = FALSE;
433 static int (*i_ngetc)PROTO((FILE *)) = std_getc; /* input of numchar_getc */
434 static int (*i_nungetc)PROTO((int c ,FILE *f)) = std_ungetc;
435 STATIC int numchar_getc PROTO((FILE *f));
436 STATIC int numchar_ungetc PROTO((int c,FILE *f));
440 static int noout_f = FALSE;
441 STATIC void no_putc PROTO((int c));
442 static int debug_f = FALSE;
443 STATIC void debug PROTO((char *str));
444 static int (*iconv_for_check)() = 0;
447 static int guess_f = FALSE;
448 STATIC void print_guessed_code PROTO((char *filename));
449 STATIC void set_input_codename PROTO((char *codename));
450 static int is_inputcode_mixed = FALSE;
451 static int is_inputcode_set = FALSE;
454 static int exec_f = 0;
457 #ifdef SHIFTJIS_CP932
458 STATIC int cp932_f = TRUE;
459 #define CP932_TABLE_BEGIN (0xfa)
460 #define CP932_TABLE_END (0xfc)
462 STATIC int cp932inv_f = TRUE;
463 #define CP932INV_TABLE_BEGIN (0xed)
464 #define CP932INV_TABLE_END (0xee)
466 /* STATIC int cp932_conv PROTO((int c2, int c1)); */
467 #endif /* SHIFTJIS_CP932 */
470 STATIC int x0212_f = FALSE;
471 static int x0212_shift PROTO((int c));
472 static int x0212_unshift PROTO((int c));
475 STATIC unsigned char prefix_table[256];
477 STATIC void e_status PROTO((struct input_code *, int));
478 STATIC void s_status PROTO((struct input_code *, int));
480 #ifdef UTF8_INPUT_ENABLE
481 STATIC void w_status PROTO((struct input_code *, int));
482 STATIC void w16_status PROTO((struct input_code *, int));
483 static int utf16_mode = UTF16BE_INPUT;
486 struct input_code input_code_list[] = {
487 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
488 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
489 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
490 {"UTF-16", 0, 0, 0, {0, 0, 0}, w16_status, w_iconv16, 0},
494 static int mimeout_mode = 0;
495 static int base64_count = 0;
497 /* X0208 -> ASCII converter */
500 static int f_line = 0; /* chars in line */
501 static int f_prev = 0;
502 static int fold_preserve_f = FALSE; /* preserve new lines */
503 static int fold_f = FALSE;
504 static int fold_len = 0;
507 static unsigned char kanji_intro = DEFAULT_J;
508 static unsigned char ascii_intro = DEFAULT_R;
512 #define FOLD_MARGIN 10
513 #define DEFAULT_FOLD 60
515 static int fold_margin = FOLD_MARGIN;
519 #ifdef DEFAULT_CODE_JIS
520 # define DEFAULT_CONV j_oconv
522 #ifdef DEFAULT_CODE_SJIS
523 # define DEFAULT_CONV s_oconv
525 #ifdef DEFAULT_CODE_EUC
526 # define DEFAULT_CONV e_oconv
528 #ifdef DEFAULT_CODE_UTF8
529 # define DEFAULT_CONV w_oconv
532 /* process default */
533 static void (*output_conv)PROTO((int c2,int c1)) = DEFAULT_CONV;
535 static void (*oconv)PROTO((int c2,int c1)) = no_connection;
536 /* s_iconv or oconv */
537 static int (*iconv)PROTO((int c2,int c1,int c0)) = no_connection2;
539 static void (*o_zconv)PROTO((int c2,int c1)) = no_connection;
540 static void (*o_fconv)PROTO((int c2,int c1)) = no_connection;
541 static void (*o_crconv)PROTO((int c2,int c1)) = no_connection;
542 static void (*o_rot_conv)PROTO((int c2,int c1)) = no_connection;
543 static void (*o_hira_conv)PROTO((int c2,int c1)) = no_connection;
544 static void (*o_base64conv)PROTO((int c2,int c1)) = no_connection;
545 static void (*o_iso2022jp_check_conv)PROTO((int c2,int c1)) = no_connection;
547 /* static redirections: each getc/ungetc/putc stage reads from or writes to the function stored here */
549 static void (*o_putc)PROTO((int c)) = std_putc;
551 static int (*i_getc)PROTO((FILE *f)) = std_getc; /* general input */
552 static int (*i_ungetc)PROTO((int c,FILE *f)) =std_ungetc;
554 static int (*i_bgetc)PROTO((FILE *)) = std_getc; /* input of bgetc (presumably broken_getc -- confirm) */
555 static int (*i_bungetc)PROTO((int c ,FILE *f)) = std_ungetc;
557 static void (*o_mputc)PROTO((int c)) = std_putc ; /* output of mputc */
559 static int (*i_mgetc)PROTO((FILE *)) = std_getc; /* input of mgetc */
560 static int (*i_mungetc)PROTO((int c ,FILE *f)) = std_ungetc;
562 /* for strict mime */
563 static int (*i_mgetc_buf)PROTO((FILE *)) = std_getc; /* input of mgetc_buf */
564 static int (*i_mungetc_buf)PROTO((int c,FILE *f)) = std_ungetc;
567 static int output_mode = ASCII, /* output kanji mode */
568 input_mode = ASCII, /* input kanji mode */
569 shift_mode = FALSE; /* TRUE shift out, or X0201 */
570 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
572 /* X0201 / X0208 conversion tables */
574 /* X0201 kana conversion table */
577 unsigned char cv[]= {
578 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
579 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
580 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
581 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
582 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
583 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
584 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
585 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
586 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
587 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
588 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
589 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
590 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
591 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
592 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
593 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
597 /* X0201 kana conversion table for dakuten */
600 unsigned char dv[]= {
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
606 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
607 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
608 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
609 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
610 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
612 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
619 /* X0201 kana conversion table for han-dakuten */
622 unsigned char ev[]= {
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
624 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
625 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
626 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
627 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
628 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
633 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
634 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
635 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
636 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
638 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 /* X0208 kigou conversion table */
643 /* 0x8140 - 0x819e */
645 unsigned char fv[] = {
647 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
648 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
649 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
651 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
652 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
653 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
655 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
656 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
657 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
658 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
664 static int file_out = FALSE;
666 static int overwrite = FALSE;
669 static int crmode_f = 0; /* CR, NL, CRLF */
670 #ifdef EASYWIN /*Easy Win */
671 static int end_check;
674 #define STD_GC_BUFSIZE (256)
675 int std_gc_buf[STD_GC_BUFSIZE];
679 #include "nkf32dll.c"
680 #elif defined(PERL_XS)
690 char *outfname = NULL;
693 #ifdef EASYWIN /*Easy Win */
694 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
697 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
698 cp = (unsigned char *)*argv;
703 if (pipe(fds) < 0 || (pid = fork()) < 0){
714 execvp(argv[1], &argv[1]);
728 if(x0201_f == WISH_TRUE)
729 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
731 if (binmode_f == TRUE)
733 if (freopen("","wb",stdout) == NULL)
740 setbuf(stdout, (char *) NULL);
742 setvbuffer(stdout, stdobuf, IOBUF_SIZE);
745 if (binmode_f == TRUE)
747 if (freopen("","rb",stdin) == NULL) return (-1);
751 setvbuffer(stdin, stdibuf, IOBUF_SIZE);
755 kanji_convert(stdin);
756 if (guess_f) print_guessed_code(NULL);
761 is_inputcode_mixed = FALSE;
762 is_inputcode_set = FALSE;
767 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
776 /* reopen file for stdout */
777 if (file_out == TRUE) {
780 outfname = malloc(strlen(origfname)
781 + strlen(".nkftmpXXXXXX")
787 strcpy(outfname, origfname);
791 for (i = strlen(outfname); i; --i){
792 if (outfname[i - 1] == '/'
793 || outfname[i - 1] == '\\'){
799 strcat(outfname, "ntXXXXXX");
801 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC,
804 strcat(outfname, ".nkftmpXXXXXX");
805 fd = mkstemp(outfname);
808 || (fd_backup = dup(fileno(stdout))) < 0
809 || dup2(fd, fileno(stdout)) < 0
820 outfname = "nkf.out";
823 if(freopen(outfname, "w", stdout) == NULL) {
827 if (binmode_f == TRUE) {
829 if (freopen("","wb",stdout) == NULL)
836 if (binmode_f == TRUE)
838 if (freopen("","rb",fin) == NULL)
843 setvbuffer(fin, stdibuf, IOBUF_SIZE);
847 char *filename = NULL;
849 if (nfiles > 1) filename = origfname;
850 if (guess_f) print_guessed_code(filename);
856 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
864 if (dup2(fd_backup, fileno(stdout)) < 0){
867 if (stat(origfname, &sb)) {
868 fprintf(stderr, "Can't stat %s\n", origfname);
870 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
871 if (chmod(outfname, sb.st_mode)) {
872 fprintf(stderr, "Can't set permission %s\n", outfname);
875 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
876 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
877 tb[0] = tb[1] = sb.st_mtime;
878 if (utime(outfname, tb)) {
879 fprintf(stderr, "Can't set timestamp %s\n", outfname);
882 tb.actime = sb.st_atime;
883 tb.modtime = sb.st_mtime;
884 if (utime(outfname, &tb)) {
885 fprintf(stderr, "Can't set timestamp %s\n", outfname);
889 if (unlink(origfname)){
893 if (rename(outfname, origfname)) {
895 fprintf(stderr, "Can't rename %s to %s\n",
896 outfname, origfname);
904 #ifdef EASYWIN /*Easy Win */
905 if (file_out == FALSE)
906 scanf("%d",&end_check);
909 #else /* for Other OS */
910 if (file_out == TRUE)
915 #endif /* WIN32DLL */
940 {"katakana-hiragana","h3"},
947 #ifdef UNICODE_ENABLE
948 {"internal-unicode", ""},
950 #ifdef UTF8_OUTPUT_ENABLE
955 #ifdef UTF8_INPUT_ENABLE
957 {"utf16-input", "W16"},
959 #ifdef UNICODE_NORMALIZATION
960 {"utf8mac-input", ""},
969 #ifdef NUMCHAR_OPTION
970 {"numchar-input", ""},
976 #ifdef SHIFTJIS_CP932
986 static int option_mode = 0;
993 unsigned char *p = NULL;
997 while(*cp && *cp++!='-');
1001 case '-': /* literal options */
1002 if (!*cp) { /* ignore the rest of arguments */
1006 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1008 p = (unsigned char *)long_option[i].name;
1009 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1010 if (*p == cp[j] || cp[j] == ' '){
1017 cp = (unsigned char *)long_option[i].alias;
1021 if (strcmp(long_option[i].name, "overwrite") == 0){
1028 if (strcmp(long_option[i].name, "cap-input") == 0){
1032 if (strcmp(long_option[i].name, "url-input") == 0){
1037 #ifdef NUMCHAR_OPTION
1038 if (strcmp(long_option[i].name, "numchar-input") == 0){
1044 if (strcmp(long_option[i].name, "no-output") == 0){
1048 if (strcmp(long_option[i].name, "debug") == 0){
1053 if (strcmp(long_option[i].name, "cp932") == 0){
1054 #ifdef SHIFTJIS_CP932
1058 #ifdef UTF8_OUTPUT_ENABLE
1059 ms_ucs_map_f = TRUE;
1063 if (strcmp(long_option[i].name, "no-cp932") == 0){
1064 #ifdef SHIFTJIS_CP932
1068 #ifdef UTF8_OUTPUT_ENABLE
1069 ms_ucs_map_f = FALSE;
1073 #ifdef SHIFTJIS_CP932
1074 if (strcmp(long_option[i].name, "cp932inv") == 0){
1081 if (strcmp(long_option[i].name, "x0212") == 0){
1088 if (strcmp(long_option[i].name, "exec-in") == 0){
1092 if (strcmp(long_option[i].name, "exec-out") == 0){
1097 #ifdef UNICODE_ENABLE
1098 if (strcmp(long_option[i].name, "internal-unicode") == 0){
1099 internal_unicode_f = TRUE;
1103 #ifdef UTF8_OUTPUT_ENABLE
1104 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1105 ms_ucs_map_f = TRUE;
1109 #ifdef UNICODE_NORMALIZATION
1110 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1111 input_f = UTF8_INPUT;
1116 if (strcmp(long_option[i].name, "prefix=") == 0){
1117 if (*p == '=' && ' ' < p[1] && p[1] < 128){
1118 for (i = 2; ' ' < p[i] && p[i] < 128; i++){
1119 prefix_table[p[i]] = p[1];
1126 case 'b': /* buffered mode */
1129 case 'u': /* non bufferd mode */
1132 case 't': /* transparent mode */
1135 case 'j': /* JIS output */
1137 output_conv = j_oconv;
1139 case 'e': /* AT&T EUC output */
1140 output_conv = e_oconv;
1142 case 's': /* SJIS output */
1143 output_conv = s_oconv;
1145 case 'l': /* ISO8859 Latin-1 support, no conversion */
1146 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1147 input_f = LATIN1_INPUT;
1149 case 'i': /* Kanji IN ESC-$-@/B */
1150 if (*cp=='@'||*cp=='B')
1151 kanji_intro = *cp++;
1153 case 'o': /* ASCII IN ESC-(-J/B */
1154 if (*cp=='J'||*cp=='B'||*cp=='H')
1155 ascii_intro = *cp++;
1159 bit:1 katakana->hiragana
1160 bit:2 hiragana->katakana
1162 if ('9'>= *cp && *cp>='0')
1163 hira_f |= (*cp++ -'0');
1170 #if defined(MSDOS) || defined(__OS2__)
1185 #ifdef UTF8_OUTPUT_ENABLE
1186 case 'w': /* UTF-8 output */
1187 if ('1'== cp[0] && '6'==cp[1]) {
1188 output_conv = w_oconv16; cp+=2;
1190 unicode_bom_f=2; cp++;
1193 unicode_bom_f=1; cp++;
1195 } else if (cp[0] == 'B') {
1196 unicode_bom_f=2; cp++;
1198 unicode_bom_f=1; cp++;
1201 } else if (cp[0] == '8') {
1202 output_conv = w_oconv; cp++;
1205 unicode_bom_f=1; cp++;
1208 output_conv = w_oconv;
1211 #ifdef UTF8_INPUT_ENABLE
1212 case 'W': /* UTF-8 input */
1213 if ('1'== cp[0] && '6'==cp[1]) {
1214 input_f = UTF16BE_INPUT;
1215 utf16_mode = UTF16BE_INPUT;
1219 input_f = UTF16LE_INPUT;
1220 utf16_mode = UTF16LE_INPUT;
1221 } else if (cp[0] == 'B') {
1223 input_f = UTF16BE_INPUT;
1224 utf16_mode = UTF16BE_INPUT;
1226 } else if (cp[0] == '8') {
1228 input_f = UTF8_INPUT;
1230 input_f = UTF8_INPUT;
1233 /* Input code assumption */
1234 case 'J': /* JIS input */
1235 case 'E': /* AT&T EUC input */
1236 input_f = JIS_INPUT;
1238 case 'S': /* MS Kanji input */
1239 input_f = SJIS_INPUT;
1240 if (x0201_f==NO_X0201) x0201_f=TRUE;
1242 case 'Z': /* Convert X0208 alphabet to asii */
1243 /* bit:0 Convert X0208
1244 bit:1 Convert Kankaku to one space
1245 bit:2 Convert Kankaku to two spaces
1246 bit:3 Convert HTML Entity
1248 if ('9'>= *cp && *cp>='0')
1249 alpha_f |= 1<<(*cp++ -'0');
1253 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1254 x0201_f = FALSE; /* No X0201->X0208 conversion */
1256 ESC-(-I in JIS, EUC, MS Kanji
1257 SI/SO in JIS, EUC, MS Kanji
1258 SSO in EUC, JIS, not in MS Kanji
1259 MS Kanji (0xa0-0xdf)
1261 ESC-(-I in JIS (0x20-0x5f)
1262 SSO in EUC (0xa0-0xdf)
1263 0xa0-0xdf in MS Kanji (0xa0-0xdf)
1266 case 'X': /* Assume X0201 kana */
1267 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1270 case 'F': /* prserve new lines */
1271 fold_preserve_f = TRUE;
1272 case 'f': /* folding -f60 or -f */
1275 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1277 fold_len += *cp++ - '0';
1279 if (!(0<fold_len && fold_len<BUFSIZ))
1280 fold_len = DEFAULT_FOLD;
1284 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1286 fold_margin += *cp++ - '0';
1290 case 'm': /* MIME support */
1291 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1292 if (*cp=='B'||*cp=='Q') {
1293 mime_decode_mode = *cp++;
1294 mimebuf_f = FIXED_MIME;
1295 } else if (*cp=='N') {
1296 mime_f = TRUE; cp++;
1297 } else if (*cp=='S') {
1298 mime_f = STRICT_MIME; cp++;
1299 } else if (*cp=='0') {
1300 mime_decode_f = FALSE;
1301 mime_f = FALSE; cp++;
1304 case 'M': /* MIME output */
1307 mimeout_f = FIXED_MIME; cp++;
1308 } else if (*cp=='Q') {
1310 mimeout_f = FIXED_MIME; cp++;
1315 case 'B': /* Broken JIS support */
1317 bit:1 allow any x on ESC-(-x or ESC-$-x
1318 bit:2 reset to ascii on NL
1320 if ('9'>= *cp && *cp>='0')
1321 broken_f |= 1<<(*cp++ -'0');
1326 case 'O':/* for Output file */
1330 case 'c':/* add cr code */
1333 case 'd':/* delete cr code */
1336 case 'I': /* ISO-2022-JP output */
1339 case 'L': /* line mode */
1340 if (*cp=='u') { /* unix */
1341 crmode_f = NL; cp++;
1342 } else if (*cp=='m') { /* mac */
1343 crmode_f = CR; cp++;
1344 } else if (*cp=='w') { /* windows */
1345 crmode_f = CRLF; cp++;
1346 } else if (*cp=='0') { /* no conversion */
1356 /* multiple options in one string are allowed for the Perl module */
1357 while(*cp && *cp++!='-');
1360 /* bogus option but ignored */
1366 #ifdef ANSI_C_PROTOTYPE
1367 struct input_code * find_inputcode_byfunc(int (*iconv_func)(int c2,int c1,int c0))
1369 struct input_code * find_inputcode_byfunc(iconv_func)
1370 int (*iconv_func)();
1374 struct input_code *p = input_code_list;
1376 if (iconv_func == p->iconv_func){
1385 #ifdef ANSI_C_PROTOTYPE
1386 void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
1388 void set_iconv(f, iconv_func)
1390 int (*iconv_func)();
1393 #ifdef INPUT_CODE_FIX
1401 #ifdef INPUT_CODE_FIX
1402 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1408 if (estab_f && iconv_for_check != iconv){
1409 struct input_code *p = find_inputcode_byfunc(iconv);
1411 set_input_codename(p->name);
1412 debug(input_codename);
1414 iconv_for_check = iconv;
1419 #define SCORE_L2 (1) /*
\e$BBh
\e(B2
\e$B?e=`4A;z
\e(B */
1420 #define SCORE_KANA (SCORE_L2 << 1) /*
\e$B$$$o$f$kH>3Q%+%J
\e(B */
1421 #define SCORE_DEPEND (SCORE_KANA << 1) /*
\e$B5!<o0MB8J8;z
\e(B */
1422 #ifdef SHIFTJIS_CP932
1423 #define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932
\e$B$K$h$kFI$_49$(
\e(B */
1424 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /*
\e$BB8:_$7$J$$J8;z
\e(B */
1426 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /*
\e$BB8:_$7$J$$J8;z
\e(B */
1428 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME
\e$B$K$h$k;XDj
\e(B */
1429 #define SCORE_ERROR (SCORE_iMIME << 1) /*
\e$B%(%i!<
\e(B */
1431 #define SCORE_INIT (SCORE_iMIME)
1433 int score_table_A0[] = {
1436 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1437 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1440 int score_table_F0[] = {
1441 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1442 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1443 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1444 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
1447 void set_code_score(ptr, score)
1448 struct input_code *ptr;
1452 ptr->score |= score;
1456 void clr_code_score(ptr, score)
1457 struct input_code *ptr;
1461 ptr->score &= ~score;
1465 void code_score(ptr)
1466 struct input_code *ptr;
1468 int c2 = ptr->buf[0];
1469 int c1 = ptr->buf[1];
1471 set_code_score(ptr, SCORE_ERROR);
1472 }else if (c2 == SSO){
1473 set_code_score(ptr, SCORE_KANA);
1474 #ifdef UTF8_OUTPUT_ENABLE
1475 }else if (!e2w_conv(c2, c1)){
1476 set_code_score(ptr, SCORE_NO_EXIST);
1478 }else if ((c2 & 0x70) == 0x20){
1479 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1480 }else if ((c2 & 0x70) == 0x70){
1481 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1482 }else if ((c2 & 0x70) >= 0x50){
1483 set_code_score(ptr, SCORE_L2);
1487 void status_disable(ptr)
1488 struct input_code *ptr;
1493 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
1496 void status_push_ch(ptr, c)
1497 struct input_code *ptr;
1500 ptr->buf[ptr->index++] = c;
1503 void status_clear(ptr)
1504 struct input_code *ptr;
1510 void status_reset(ptr)
1511 struct input_code *ptr;
1514 ptr->score = SCORE_INIT;
1517 void status_reinit(ptr)
1518 struct input_code *ptr;
1521 ptr->_file_stat = 0;
1524 void status_check(ptr, c)
1525 struct input_code *ptr;
1528 if (c <= DEL && estab_f){
/* Per-byte status function of the candidate that decodes byte pairs with
   s2e_conv() (Shift_JIS style lead/trail byte ranges).  Bytes are
   collected in ptr->buf with status_push_ch(); a byte that fits no
   expected range disables the candidate through status_disable(). */
1533 void s_status(ptr, c)
1534 struct input_code *ptr;
1539 status_check(ptr, c);
1544 #ifdef NUMCHAR_OPTION
/* value carrying the CLASS_UTF16 tag (produced by numeric character input) */
1545 }else if ((c & CLASS_MASK) == CLASS_UTF16){
/* single byte 0xa1..0xdf: stored as the pair (SSO, c) */
1548 }else if (0xa1 <= c && c <= 0xdf){
1549 status_push_ch(ptr, SSO);
1550 status_push_ch(ptr, c);
/* lead byte 0x81..0x9f or 0xe0..0xef: keep it and wait for the trail byte */
1553 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
1555 status_push_ch(ptr, c);
1556 #ifdef SHIFTJIS_CP932
/* lead byte inside the CP932 extension table range */
1558 && CP932_TABLE_BEGIN <= c && c <= CP932_TABLE_END){
1560 status_push_ch(ptr, c);
1561 #endif /* SHIFTJIS_CP932 */
/* lead byte 0xf0..0xfc, accepted only when x0212_f is set */
1563 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
1565 status_push_ch(ptr, c);
1566 #endif /* X0212_ENABLE */
1568 status_disable(ptr);
/* trail byte must be 0x40..0x7e or 0x80..0xfc */
1572 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
1573 status_push_ch(ptr, c);
/* convert the collected pair in place inside ptr->buf */
1574 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
1578 status_disable(ptr);
1582 #ifdef SHIFTJIS_CP932
1583 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
1584 status_push_ch(ptr, c);
/* s2e_conv() returning 0 means the pair converted: flag SCORE_CP932 */
1585 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
1586 set_code_score(ptr, SCORE_CP932);
1591 #endif /* SHIFTJIS_CP932 */
1592 #ifndef X0212_ENABLE
1593 status_disable(ptr);
/* Per-byte status function of the candidate whose multi-byte characters
   use bytes 0xa1..0xfe, optionally introduced by SSO or 0x8f (EUC style
   ranges).  Accepted bytes are collected with status_push_ch(); a byte
   outside the expected range disables the candidate. */
1599 void e_status(ptr, c)
1600 struct input_code *ptr;
1605 status_check(ptr, c);
1610 #ifdef NUMCHAR_OPTION
/* value carrying the CLASS_UTF16 tag */
1611 }else if ((c & CLASS_MASK) == CLASS_UTF16){
/* first byte: SSO, or 0xa1..0xfe */
1614 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
1616 status_push_ch(ptr, c);
/* first byte 0x8f (compiled only with X0212_ENABLE) */
1618 }else if (0x8f == c){
1620 status_push_ch(ptr, c);
1621 #endif /* X0212_ENABLE */
1623 status_disable(ptr);
/* following byte must be 0xa1..0xfe */
1627 if (0xa1 <= c && c <= 0xfe){
1628 status_push_ch(ptr, c);
1632 status_disable(ptr);
/* byte after a 0x8f prefix: again 0xa1..0xfe */
1637 if (0xa1 <= c && c <= 0xfe){
1639 status_push_ch(ptr, c);
1641 status_disable(ptr);
1643 #endif /* X0212_ENABLE */
1647 #ifdef UTF8_INPUT_ENABLE
/* Per-byte status function for the UTF-16 candidate.  ptr->_file_stat
   tracks progress across calls: 0 = nothing seen yet, > 0 = a first
   0xfe/0xff byte was accepted, < 0 = candidate rejected for this input. */
1648 void w16_status(ptr, c)
1649 struct input_code *ptr;
/* very first byte of the input */
1656 if (ptr->_file_stat == 0){
/* 0xfe or 0xff may start a byte order mark: keep it */
1657 if (c == 0xfe || c == 0xff){
1659 status_push_ch(ptr, c);
1660 ptr->_file_stat = 1;
/* NOTE(review): presumably the else branch -- any other first byte
   rejects the candidate; confirm against the full source */
1662 status_disable(ptr);
1663 ptr->_file_stat = -1;
1665 }else if (ptr->_file_stat > 0){
1667 status_push_ch(ptr, c);
/* already rejected: stay disabled */
1668 }else if (ptr->_file_stat < 0){
1669 status_disable(ptr);
1675 status_disable(ptr);
1676 ptr->_file_stat = -1;
1678 status_push_ch(ptr, c);
/* second mark byte: must be 0xfe/0xff and differ from ptr->stat */
1685 if (ptr->stat != c && (c == 0xfe || c == 0xff)){
1686 status_push_ch(ptr, c);
1689 status_disable(ptr);
1690 ptr->_file_stat = -1;
/* Per-byte status function for the UTF-8 candidate: accepts lead bytes
   0xc0..0xdf and 0xe0..0xef followed by continuation bytes 0x80..0xbf,
   and converts a completed sequence with w2e_conv().  Any other byte
   disables the candidate. */
1696 void w_status(ptr, c)
1697 struct input_code *ptr;
1702 status_check(ptr, c);
1707 #ifdef NUMCHAR_OPTION
/* value carrying the CLASS_UTF16 tag */
1708 }else if ((c & CLASS_MASK) == CLASS_UTF16){
/* lead byte 0xc0..0xdf */
1711 }else if (0xc0 <= c && c <= 0xdf){
1713 status_push_ch(ptr, c);
/* lead byte 0xe0..0xef */
1714 }else if (0xe0 <= c && c <= 0xef){
1716 status_push_ch(ptr, c);
1718 status_disable(ptr);
/* continuation byte 0x80..0xbf */
1723 if (0x80 <= c && c <= 0xbf){
1724 status_push_ch(ptr, c);
/* more bytes collected than ptr->stat: the sequence is complete */
1725 if (ptr->index > ptr->stat){
/* bom is true when the sequence is EF BB BF (UTF-8 byte order mark) */
1726 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
1727 && ptr->buf[2] == 0xbf);
/* convert in place: result goes back into buf[0] and buf[1] */
1728 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
1729 &ptr->buf[0], &ptr->buf[1]);
1736 status_disable(ptr);
1747 int action_flag = 1;
1748 struct input_code *result = 0;
1749 struct input_code *p = input_code_list;
1751 (p->status_func)(p, c);
1754 }else if(p->stat == 0){
1765 if (result && !estab_f){
1766 set_iconv(TRUE, result->iconv_func);
1767 }else if (c <= DEL){
1768 struct input_code *ptr = input_code_list;
1783 return std_gc_buf[--std_gc_ndx];
1794 if (std_gc_ndx == STD_GC_BUFSIZE){
1797 std_gc_buf[std_gc_ndx++] = c;
1817 while ((c = (*i_getc)(f)) != EOF)
1826 oconv = output_conv;
1829 /* replace continuation module, from output side */
1831 /* output redirection */
1833 if (noout_f || guess_f){
1840 if (mimeout_f == TRUE) {
1841 o_base64conv = oconv; oconv = base64_conv;
1843 /* base64_count = 0; */
1847 o_crconv = oconv; oconv = cr_conv;
1850 o_rot_conv = oconv; oconv = rot_conv;
1853 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
1856 o_hira_conv = oconv; oconv = hira_conv;
1859 o_fconv = oconv; oconv = fold_conv;
1862 if (alpha_f || x0201_f) {
1863 o_zconv = oconv; oconv = z_conv;
1867 i_ungetc = std_ungetc;
1868 /* input redirection */
1871 i_cgetc = i_getc; i_getc = cap_getc;
1872 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
1875 i_ugetc = i_getc; i_getc = url_getc;
1876 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
1879 #ifdef NUMCHAR_OPTION
1881 i_ngetc = i_getc; i_getc = numchar_getc;
1882 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
1885 #ifdef UNICODE_NORMALIZATION
1886 if (nfc_f && input_f == UTF8_INPUT){
1887 i_nfc_getc = i_getc; i_getc = nfc_getc;
1888 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
1891 if (mime_f && mimebuf_f==FIXED_MIME) {
1892 i_mgetc = i_getc; i_getc = mime_getc;
1893 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
1896 i_bgetc = i_getc; i_getc = broken_getc;
1897 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
1899 if (input_f == JIS_INPUT || input_f == LATIN1_INPUT) {
1900 set_iconv(-TRUE, e_iconv);
1901 } else if (input_f == SJIS_INPUT) {
1902 set_iconv(-TRUE, s_iconv);
1903 #ifdef UTF8_INPUT_ENABLE
1904 } else if (input_f == UTF8_INPUT) {
1905 set_iconv(-TRUE, w_iconv);
1906 } else if (input_f == UTF16BE_INPUT) {
1907 set_iconv(-TRUE, w_iconv16);
1908 } else if (input_f == UTF16LE_INPUT) {
1909 set_iconv(-TRUE, w_iconv16);
1912 set_iconv(FALSE, e_iconv);
1916 struct input_code *p = input_code_list;
1924 Conversion main loop. Code detection only.
1933 int is_8bit = FALSE;
1935 module_connection();
1940 output_mode = ASCII;
1943 #define NEXT continue /* no output, get next */
1944 #define SEND ; /* output c1 and c2, get next */
1945 #define LAST break /* end of loop, go closing */
1947 while ((c1 = (*i_getc)(f)) != EOF) {
1952 /* in case of 8th bit is on */
1953 if (!estab_f&&!mime_decode_mode) {
1954 /* in case of not established yet */
1955 /* It is still ambiguous */
1956 if (h_conv(f, c2, c1)==EOF)
1962 /* in case of already established */
1964 /* ignore bogus code */
1970 /* second byte, 7 bit code */
1971 /* it might be kanji shifted */
1972 if ((c1 == DEL) || (c1 <= SPACE)) {
1973 /* ignore bogus first code */
1981 #ifdef UTF8_INPUT_ENABLE
1990 #ifdef NUMCHAR_OPTION
1991 } else if ((c1 & CLASS_MASK) == CLASS_UTF16){
1994 } else if (c1 > DEL) {
1996 if (!estab_f && !iso8859_f) {
1997 /* not established yet */
1998 if (!is_8bit) is_8bit = TRUE;
2001 } else { /* estab_f==TRUE */
2006 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2007 /* SJIS X0201 Case... */
2008 if(iso2022jp_f && x0201_f==NO_X0201) {
2009 (*oconv)(GETA1, GETA2);
2016 } else if (c1==SSO && iconv != s_iconv) {
2017 /* EUC X0201 Case */
2018 c1 = (*i_getc)(f); /* skip SSO */
2020 if (SSP<=c1 && c1<0xe0) {
2021 if(iso2022jp_f && x0201_f==NO_X0201) {
2022 (*oconv)(GETA1, GETA2);
2029 } else { /* bogus code, skip SSO and one byte */
2033 /* already established */
2038 } else if ((c1 > SPACE) && (c1 != DEL)) {
2039 /* in case of Roman characters */
2041 /* output 1 shifted byte */
2045 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2046 /* output 1 shifted byte */
2047 if(iso2022jp_f && x0201_f==NO_X0201) {
2048 (*oconv)(GETA1, GETA2);
2055 /* look like bogus code */
2058 } else if (input_mode == X0208) {
2059 /* in case of Kanji shifted */
2062 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2063 /* Check MIME code */
2064 if ((c1 = (*i_getc)(f)) == EOF) {
2067 } else if (c1 == '?') {
2068 /* =? is mime conversion start sequence */
2069 if(mime_f == STRICT_MIME) {
2070 /* check in real detail */
2071 if (mime_begin_strict(f) == EOF)
2075 } else if (mime_begin(f) == EOF)
2085 /* normal ASCII code */
2088 } else if (c1 == SI) {
2091 } else if (c1 == SO) {
2094 } else if (c1 == ESC ) {
2095 if ((c1 = (*i_getc)(f)) == EOF) {
2096 /* (*oconv)(0, ESC); don't send bogus code */
2098 } else if (c1 == '$') {
2099 if ((c1 = (*i_getc)(f)) == EOF) {
2101 (*oconv)(0, ESC); don't send bogus code
2102 (*oconv)(0, '$'); */
2104 } else if (c1 == '@'|| c1 == 'B') {
2105 /* This is kanji introduction */
2108 set_input_codename("ISO-2022-JP");
2109 debug(input_codename);
2111 } else if (c1 == '(') {
2112 if ((c1 = (*i_getc)(f)) == EOF) {
2113 /* don't send bogus code
2119 } else if (c1 == '@'|| c1 == 'B') {
2120 /* This is kanji introduction */
2125 } else if (c1 == 'D'){
2129 #endif /* X0212_ENABLE */
2131 /* could be some special code */
2138 } else if (broken_f&0x2) {
2139 /* accept any ESC-(-x as broken code ... */
2149 } else if (c1 == '(') {
2150 if ((c1 = (*i_getc)(f)) == EOF) {
2151 /* don't send bogus code
2153 (*oconv)(0, '('); */
2157 /* This is X0201 kana introduction */
2158 input_mode = X0201; shift_mode = X0201;
2160 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2161 /* ESC ( B / J / H: this is the ASCII (Roman) introduction, not kanji */
2162 input_mode = ASCII; shift_mode = FALSE;
2164 } else if (broken_f&0x2) {
2165 input_mode = ASCII; shift_mode = FALSE;
2170 /* maintain various input_mode here */
2174 } else if ( c1 == 'N' || c1 == 'n' ){
2176 c3 = (*i_getc)(f); /* skip SS2 */
2177 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2192 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2193 input_mode = ASCII; set_iconv(FALSE, 0);
2195 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2196 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2204 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2205 if ((c1=(*i_getc)(f))!=EOF) {
2209 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2225 if (input_mode == X0208)
2226 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2228 else if (input_mode == X0212)
2229 (*oconv)((0x8f << 8) | c2, c1);
2230 #endif /* X0212_ENABLE */
2231 else if (input_mode)
2232 (*oconv)(input_mode, c1); /* other special case */
2233 else if ((*iconv)(c2, c1, 0) < 0){ /* can be EUC/SJIS */
2234 int c0 = (*i_getc)(f);
2237 (*iconv)(c2, c1, c0);
2243 /* goto next_word */
2247 (*iconv)(EOF, 0, 0);
2248 if (!is_inputcode_set)
2251 struct input_code *p = input_code_list;
2252 struct input_code *result = p;
2254 if (p->score < result->score) result = p;
2257 set_input_codename(result->name);
2272 /** it must NOT be in the kanji shifted sequence */
2273 /** it must NOT be written in JIS7 */
2274 /** and it must be after 2 byte 8bit code */
2281 while ((c1 = (*i_getc)(f)) != EOF) {
2287 if (push_hold_buf(c1) == EOF || estab_f){
2293 struct input_code *p = input_code_list;
2294 struct input_code *result = p;
2299 if (p->score < result->score){
2304 set_iconv(FALSE, result->iconv_func);
2309 ** 1) EOF is detected, or
2310 ** 2) Code is established, or
2311 ** 3) Buffer is FULL (but last word is pushed)
2313 ** in 1) and 3) cases, we continue to use
2314 ** Kanji codes by oconv and leave estab_f unchanged.
2319 while (wc < hold_count){
2320 c2 = hold_buf[wc++];
2322 #ifdef NUMCHAR_OPTION
2323 || (c2 & CLASS_MASK) == CLASS_UTF16
2328 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2329 (*iconv)(X0201, c2, 0);
2332 if (wc < hold_count){
2333 c1 = hold_buf[wc++];
2342 if ((*iconv)(c2, c1, 0) < 0){
2344 if (wc < hold_count){
2345 c0 = hold_buf[wc++];
2354 (*iconv)(c2, c1, c0);
2367 if (hold_count >= HOLD_SIZE*2)
2369 hold_buf[hold_count++] = c2;
2370 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/* Convert the byte pair (c2, c1) using the Shift_JIS extension tables
   and arithmetic below.  The int return value is tested by callers
   against 0 (see s_status). */
2373 int s2e_conv(c2, c1, p2, p1)
2378 #ifdef SHIFTJIS_CP932
/* CP932 extension rows: looked up by (lead - CP932_TABLE_BEGIN, trail - 0x40) */
2379 if (cp932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
2380 extern unsigned short shiftjis_cp932[3][189];
/* NOTE(review): c1 - 0x40 must stay within 0..188; no range check is
   visible on this line -- confirm callers guarantee c1 in 0x40..0xfc */
2381 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
2387 #endif /* SHIFTJIS_CP932 */
/* lead bytes 0xfa..0xfc map through shiftjis_x0212 when x0212_f is set */
2389 if (x0212_f && 0xfa <= c2 && c2 <= 0xfc){
2390 extern unsigned short shiftjis_x0212[3][189];
2391 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
/* tag the result with 0x8f in the high byte */
2394 c2 = (0x8f << 8) | (val >> 8);
/* arithmetic conversion: double the lead byte and remove the bias
   (SJ0162 for lead bytes up to 0x9f, SJ6394 above) */
2406 c2 = c2 + c2 - ((c2 <= 0x9f) ? SJ0162 : SJ6394);
/* trail byte bias: SPACE for bytes above DEL, 0x1f otherwise */
2408 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1f);
2416 c2 = x0212_unshift(c2);
2431 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2434 int ret = s2e_conv(c2, c1, &c2, &c1);
2435 if (ret) return ret;
2449 }else if (c2 == 0x8f){
2453 c2 = (c2 << 8) | (c1 & 0x7f);
2455 #ifdef SHIFTJIS_CP932
2458 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2459 s2e_conv(s2, s1, &c2, &c1);
2460 if ((c2 & 0xff00) == 0){
2466 #endif /* SHIFTJIS_CP932 */
2467 #endif /* X0212_ENABLE */
2468 } else if (c2 == SSO){
2471 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2481 #ifdef UTF8_INPUT_ENABLE
2483 w2e_conv(c2, c1, c0, p2, p1)
2487 extern unsigned short * utf8_to_euc_2bytes[];
2488 extern unsigned short ** utf8_to_euc_3bytes[];
2491 if (0xc0 <= c2 && c2 <= 0xef) {
2492 unsigned short **pp;
2495 if (c0 == 0) return -1;
2496 pp = utf8_to_euc_3bytes[c2 - 0x80];
2497 ret = w_iconv_common(c1, c0, pp, sizeof_utf8_to_euc_C2, p2, p1);
2499 ret = w_iconv_common(c2, c1, utf8_to_euc_2bytes, sizeof_utf8_to_euc_2bytes, p2, p1);
2501 #ifdef NUMCHAR_OPTION
2504 if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
2509 } else if (c2 == X0201) {
2523 unsigned short val = 0;
2526 if (c2 == 0) /* 0x00-0x7f */
2528 else if ((c2 & 0xe0) == 0xc0) /* 0xc0-0xdf */
2530 else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
2531 return -1; /* 3bytes */
2533 else if (0xf0 <= c2)
2534 return 0; /* 4,5,6bytes */
2535 else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
2536 return 0; /* trail byte */
2541 else if (c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
2542 return 0; /* throw BOM */
2543 else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
2548 val = ww16_conv(c2, c1, c0);
2549 c2 = (val >> 8) & 0xff;
2552 ret = w2e_conv(c2, c1, c0, &c2, &c1);
/* Encode the 16-bit value val as UTF-8 bytes written through p2, p1 and
   p0 (lead byte first). */
2561 w16w_conv(val, p2, p1, p0)
/* val < 0x800: two-byte form 110xxxxx 10xxxxxx */
2569 }else if (val < 0x800){
2570 *p2 = 0xc0 | (val >> 6);
2571 *p1 = 0x80 | (val & 0x3f);
/* three-byte form 1110xxxx 10xxxxxx 10xxxxxx */
2574 *p2 = 0xe0 | (val >> 12);
2575 *p1 = 0x80 | ((val >> 6) & 0x3f);
2576 *p0 = 0x80 | (val & 0x3f);
/* Assemble a code value from UTF-8 bytes c2 (lead), c1 and c0: the
   payload bits of each byte are shifted into place and OR-ed together. */
2581 ww16_conv(c2, c1, c0)
/* three-byte sequence: 4 payload bits from the lead byte, 6 from c1 */
2586 val = (c2 & 0x0f) << 12;
2587 val |= (c1 & 0x3f) << 6;
/* two-byte sequence (lead >= 0xc0): 5 payload bits from the lead byte */
2589 }else if (c2 >= 0xc0){
2590 val = (c2 & 0x1f) << 6;
2599 w16e_conv(val, p2, p1)
2603 extern unsigned short * utf8_to_euc_2bytes[];
2604 extern unsigned short ** utf8_to_euc_3bytes[];
2606 unsigned short **pp;
2610 w16w_conv(val, &c2, &c1, &c0);
2613 pp = utf8_to_euc_3bytes[c2 - 0x80];
2614 psize = sizeof_utf8_to_euc_C2;
2615 ret = w_iconv_common(c1, c0, pp, psize, p2, p1);
2617 pp = utf8_to_euc_2bytes;
2618 psize = sizeof_utf8_to_euc_2bytes;
2619 ret = w_iconv_common(c2, c1, pp, psize, p2, p1);
2621 #ifdef NUMCHAR_OPTION
2624 *p1 = CLASS_UTF16 | val;
/* Input converter for UTF-16: takes one 16-bit unit as the byte pair
   (c2, c1), tracks the byte order in utf16_mode and converts the unit
   with w16e_conv() unless Unicode is passed through unchanged. */
2636 w_iconv16(c2, c1, c0)
/* FE FF (octal 0376 0377): byte order mark, big endian */
2641 if (c2==0376 && c1==0377){
2642 utf16_mode = UTF16BE_INPUT;
/* FF FE: byte order mark, little endian */
2644 } else if (c2==0377 && c1==0376){
2645 utf16_mode = UTF16LE_INPUT;
/* little endian input: swap so that c2 is the high byte */
2648 if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
2650 tmp=c1; c1=c2; c2=tmp;
/* 7-bit value or EOF: no table conversion needed */
2652 if ((c2==0 && c1 < 0x80) || c2==EOF) {
/* the empty statement is deliberate: when internal_unicode_f is set and
   the output side is w_oconv/w_oconv16, w16e_conv() is skipped */
2656 if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16));
2657 else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
/* a non-zero result from the conversion is passed back to the caller */
2658 if (ret) return ret;
/* Table lookup shared by the UTF-8/UTF-16 input converters: pp is a
   table of psize row pointers indexed by c1, each row indexed by c0.
   Returns 1 whenever the lookup cannot produce a value. */
2664 w_iconv_common(c1, c0, pp, psize, p2, p1)
2666 unsigned short **pp;
/* no table at all */
2674 if (pp == 0) return 1;
/* row index out of range */
2677 if (c1 < 0 || psize <= c1) return 1;
/* row not present */
2679 if (p == 0) return 1;
/* column index out of range (rows have sizeof_utf8_to_euc_E5B8 columns) */
2682 if (c0 < 0 || sizeof_utf8_to_euc_E5B8 <= c0) return 1;
/* zero entry: no mapping */
2684 if (val == 0) return 1;
/* a high byte equal to SO stands for X0201 */
2691 if (c2 == SO) c2 = X0201;
2700 #ifdef UTF8_OUTPUT_ENABLE
2705 extern unsigned short euc_to_utf8_1byte[];
2706 extern unsigned short * euc_to_utf8_2bytes[];
2707 extern unsigned short * euc_to_utf8_2bytes_ms[];
2711 p = euc_to_utf8_1byte;
2713 } else if (c2 >> 8 == 0x8f){
2714 extern unsigned short * x0212_to_utf8_2bytes[];
2715 c2 = (c2&0x7f) - 0x21;
2716 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2717 p = x0212_to_utf8_2bytes[c2];
2723 c2 = (c2&0x7f) - 0x21;
2724 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2725 p = ms_ucs_map_f ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
2730 c1 = (c1 & 0x7f) - 0x21;
2731 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
2748 if (unicode_bom_f==2) {
2755 #ifdef NUMCHAR_OPTION
2756 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
2757 w16w_conv(c1, &c2, &c1, &c0);
2761 if (c0) (*o_putc)(c0);
2768 output_mode = ASCII;
2770 } else if (c2 == ISO8859_1) {
2771 output_mode = ISO8859_1;
2772 (*o_putc)(c1 | 0x080);
2775 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16))
2776 val = ((c2<<8)&0xff00) + c1;
2777 else val = e2w_conv(c2, c1);
2779 w16w_conv(val, &c2, &c1, &c0);
2783 if (c0) (*o_putc)(c0);
2799 if (unicode_bom_f==2) {
2801 (*o_putc)((unsigned char)'\377');
2805 (*o_putc)((unsigned char)'\377');
2810 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16)){
2811 } else if (c2 == ISO8859_1) {
2814 #ifdef NUMCHAR_OPTION
2815 } else if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16) {
2816 c2 = (c1 >> 8) & 0xff;
2820 unsigned short val = e2w_conv(c2, c1);
2821 c2 = (val >> 8) & 0xff;
2840 #ifdef NUMCHAR_OPTION
2841 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
2842 w16e_conv(c1, &c2, &c1);
2843 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
2851 } else if (c2 == 0) {
2852 output_mode = ASCII;
2854 } else if (c2 == X0201) {
2855 output_mode = JAPANESE_EUC;
2856 (*o_putc)(SSO); (*o_putc)(c1|0x80);
2857 } else if (c2 == ISO8859_1) {
2858 output_mode = ISO8859_1;
2859 (*o_putc)(c1 | 0x080);
2861 } else if ((c2 & 0xff00) >> 8 == 0x8f){
2862 output_mode = JAPANESE_EUC;
2863 #ifdef SHIFTJIS_CP932
2866 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2867 s2e_conv(s2, s1, &c2, &c1);
2871 if ((c2 & 0xff00) >> 8 == 0x8f){
2874 (*o_putc)((c2 & 0x7f) | 0x080);
2875 (*o_putc)(c1 | 0x080);
2878 (*o_putc)((c2 & 0x7f) | 0x080);
2879 (*o_putc)(c1 | 0x080);
2883 if ((c1<0x21 || 0x7e<c1) ||
2884 (c2<0x21 || 0x7e<c2)) {
2885 set_iconv(FALSE, 0);
2886 return; /* too late to rescue this char */
2888 output_mode = JAPANESE_EUC;
2889 (*o_putc)(c2 | 0x080);
2890 (*o_putc)(c1 | 0x080);
2900 if ((ret & 0xff00) == 0x8f00){
2901 if (0x75 <= c && c <= 0x7f){
2902 ret = c + (0x109 - 0x75);
2905 if (0x75 <= c && c <= 0x7f){
2906 ret = c + (0x113 - 0x75);
/* Undo the row shift applied by x0212_shift(): rows 0x7f..0x88 map back
   to 0x75..0x7e, and rows 0x89..0x92 map back to 0x75..0x7e with bit 7
   set and tagged with 0x8f in the high byte. */
2913 int x0212_unshift(c)
2917 if (0x7f <= c && c <= 0x88){
2918 ret = c + (0x75 - 0x7f);
2919 }else if (0x89 <= c && c <= 0x92){
2920 ret = (0x8f << 8) | 0x80 | (c + (0x75 - 0x89));
2924 #endif /* X0212_ENABLE */
/* Convert the pair (c2, c1) to Shift_JIS style lead/trail bytes, stored
   through p2 and p1 (either pointer may be null and is then skipped).
   Values tagged with 0x8f in the high byte go through the
   x0212_shiftjis table first. */
2927 e2s_conv(c2, c1, p2, p1)
2928 int c2, c1, *p2, *p1;
2931 unsigned short *ptr;
2933 extern unsigned short *x0212_shiftjis[];
/* high byte 0x8f marks the tagged plane */
2935 if ((c2 & 0xff00) == 0x8f00){
/* table rows exist only for row indexes 0x21..0x7e */
2937 if (0x21 <= ndx && ndx <= 0x7e){
2938 ptr = x0212_shiftjis[ndx - 0x21];
/* column index: low 7 bits of c1, rebased to 0 */
2940 val = ptr[(c1 & 0x7f) - 0x21];
2950 c2 = x0212_shift(c2);
2952 #endif /* X0212_ENABLE */
/* still tagged after the table step */
2953 if ((c2 & 0xff00) == 0x8f00){
/* lead byte: two rows per lead byte, base 0x71 up to row 0x5e, 0xb1 above */
2956 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
/* trail byte: odd rows add 0x1f (0x20 from 0x60 up), even rows add 0x7e */
2957 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
2966 #ifdef NUMCHAR_OPTION
2967 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
2968 w16e_conv(c1, &c2, &c1);
2974 } else if (c2 == 0) {
2975 output_mode = ASCII;
2977 } else if (c2 == X0201) {
2978 output_mode = SHIFT_JIS;
2980 } else if (c2 == ISO8859_1) {
2981 output_mode = ISO8859_1;
2982 (*o_putc)(c1 | 0x080);
2984 } else if ((c2 & 0xff00) >> 8 == 0x8f){
2985 output_mode = SHIFT_JIS;
2986 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2992 if ((c1<0x20 || 0x7e<c1) ||
2993 (c2<0x20 || 0x7e<c2)) {
2994 set_iconv(FALSE, 0);
2995 return; /* too late to rescue this char */
2997 output_mode = SHIFT_JIS;
2998 e2s_conv(c2, c1, &c2, &c1);
3000 #ifdef SHIFTJIS_CP932
3002 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3003 extern unsigned short cp932inv[2][189];
3004 int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3010 #endif /* SHIFTJIS_CP932 */
3013 if (prefix_table[(unsigned char)c1]){
3014 (*o_putc)(prefix_table[(unsigned char)c1]);
3025 #ifdef NUMCHAR_OPTION
3026 if ((c1 & CLASS_MASK) == CLASS_UTF16){
3027 w16e_conv(c1, &c2, &c1);
3031 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3034 (*o_putc)(ascii_intro);
3035 output_mode = ASCII;
3039 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3040 if (output_mode!=X0212) {
3041 output_mode = X0212;
3047 (*o_putc)(c2 & 0x7f);
3050 } else if (c2==X0201) {
3051 if (output_mode!=X0201) {
3052 output_mode = X0201;
3058 } else if (c2==ISO8859_1) {
3059 /* iso8859 introduction, or 8th bit on */
3060 /* Can we convert in 7bit form using ESC-'-'-A ?
3062 output_mode = ISO8859_1;
3064 } else if (c2 == 0) {
3065 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3068 (*o_putc)(ascii_intro);
3069 output_mode = ASCII;
3073 if (output_mode != X0208) {
3074 output_mode = X0208;
3077 (*o_putc)(kanji_intro);
3079 if (c1<0x20 || 0x7e<c1)
3081 if (c2<0x20 || 0x7e<c2)
3093 mime_prechar(c2, c1);
3094 (*o_base64conv)(c2,c1);
3098 static int broken_buf[3];
3099 static int broken_counter = 0;
3100 static int broken_last = 0;
3107 if (broken_counter>0) {
3108 return broken_buf[--broken_counter];
3111 if (c=='$' && broken_last != ESC
3112 && (input_mode==ASCII || input_mode==X0201)) {
3115 if (c1=='@'|| c1=='B') {
3116 broken_buf[0]=c1; broken_buf[1]=c;
3123 } else if (c=='(' && broken_last != ESC
3124 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
3127 if (c1=='J'|| c1=='B') {
3128 broken_buf[0]=c1; broken_buf[1]=c;
3146 if (broken_counter<2)
3147 broken_buf[broken_counter++]=c;
3151 static int prev_cr = 0;
3159 if (! (c2==0&&c1==NL) ) {
3165 } else if (c1=='\r') {
3167 } else if (c1=='\n') {
3168 if (crmode_f==CRLF) {
3169 (*o_crconv)(0,'\r');
3170 } else if (crmode_f==CR) {
3171 (*o_crconv)(0,'\r');
3175 } else if (c1!='\032' || crmode_f!=NL){
3181 Return value of fold_conv()
3183 \n add newline and output char
3184 \r add newline and output nothing
3187 1 (or else) normal output
3189 fold state in prev (previous character)
3191 >0x80 Japanese (X0208/X0201)
3196 This fold algorithm does not preserve leading spaces in a line.
3197 This is the main difference from fmt.
/* Width contributed to the current line by the character (c2,c1): 2 when
   it has a non-zero first byte c2, 1 otherwise.  c1 is accepted for call
   symmetry but not used.  c2 is parenthesized so that any expression can
   be passed safely. */
#define char_size(c2,c1) ((c2)?2:1)
/* fold_conv body: choose fold_state for the incoming character (c2,c1).
   f_prev is the previous character class, f_line the current line
   width, fold_len the folding width and fold_margin the extra room
   allowed before a fold is forced. */
3209 if (c1== '\r' && !fold_preserve_f) {
3210 fold_state=0; /* ignore cr */
3211 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
3213 fold_state=0; /* ignore cr */
3214 } else if (c1== BS) {
3215 if (f_line>0) f_line--;
3217 } else if (c2==EOF && f_line != 0) { /* close open last line */
3219 } else if ((c1=='\n' && !fold_preserve_f)
3220 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
3221 && fold_preserve_f)) {
3223 if (fold_preserve_f) {
3227 } else if ((f_prev == c1 && !fold_preserve_f)
3228 || (f_prev == '\n' && fold_preserve_f)
3229 ) { /* duplicate newline */
3232 fold_state = '\n'; /* output two newlines */
3238 if (f_prev&0x80) { /* Japanese? */
3240 fold_state = 0; /* ignore given single newline */
3241 } else if (f_prev==' ') {
3245 if (++f_line<=fold_len)
3249 fold_state = '\r'; /* fold and output nothing */
3253 } else if (c1=='\f') {
3258 fold_state = '\n'; /* output newline and clear */
3259 } else if ( (c2==0 && c1==' ')||
3260 (c2==0 && c1=='\t')||
3261 (c2=='!'&& c1=='!')) {
3262 /* JIS X 0208 space (0x2121, "kankaku") or ASCII space / tab */
3263 if (f_prev == ' ') {
3264 fold_state = 0; /* remove duplicate spaces */
3267 if (++f_line<=fold_len)
3268 fold_state = ' '; /* output ASCII space only */
3270 f_prev = ' '; f_line = 0;
3271 fold_state = '\r'; /* fold and output nothing */
3275 prev0 = f_prev; /* we still need this one... , but almost done */
3277 if (c2 || c2==X0201)
3278 f_prev |= 0x80; /* this is Japanese */
3279 f_line += char_size(c2,c1);
3280 if (f_line<=fold_len) { /* normal case */
3283 if (f_line>=fold_len+fold_margin) { /* too many kinsoku suspensions */
3284 f_line = char_size(c2,c1);
3285 fold_state = '\n'; /* We can't wait, do fold now */
3286 } else if (c2==X0201) {
3287 /* simple kinsoku rules: fold_state 1 means no folding before this character */
3288 if (c1==(0xde&0x7f)) fold_state = 1; /* voiced sound mark (dakuten) */
3289 else if (c1==(0xdf&0x7f)) fold_state = 1; /* semi-voiced sound mark (handakuten) */
3290 else if (c1==(0xa4&0x7f)) fold_state = 1; /* original comment: ideographic full stop; NOTE(review): JIS X 0201 0xa4 is the halfwidth ideographic comma -- confirm */
3291 else if (c1==(0xa3&0x7f)) fold_state = 1; /* original comment: fullwidth comma; NOTE(review): JIS X 0201 0xa3 is the halfwidth closing corner bracket -- confirm */
3292 else if (c1==(0xa1&0x7f)) fold_state = 1; /* original comment: closing corner bracket; NOTE(review): JIS X 0201 0xa1 is the halfwidth ideographic full stop -- confirm */
3293 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3294 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3296 fold_state = '\n';/* add one new f_line before this character */
3299 fold_state = '\n';/* add one new f_line before this character */
3302 /* kinsoku point in ASCII */
3303 if ( c1==')'|| /* { [ ( */
3314 /* just after special */
3315 } else if (!is_alnum(prev0)) {
3316 f_line = char_size(c2,c1);
3318 } else if ((prev0==' ') || /* ignored new f_line */
3319 (prev0=='\n')|| /* ignored new f_line */
3320 (prev0&0x80)) { /* X0208 - ASCII */
3321 f_line = char_size(c2,c1);
3322 fold_state = '\n';/* add one new f_line before this character */
3324 fold_state = 1; /* default no fold in ASCII */
/* second bytes of JIS X 0208 punctuation that must not start a line;
   each comment names the character shown in the original comment */
3328 if (c1=='"') fold_state = 1; /* ideographic comma */
3329 else if (c1=='#') fold_state = 1; /* ideographic full stop */
3330 else if (c1=='W') fold_state = 1; /* closing corner bracket */
3331 else if (c1=='K') fold_state = 1; /* fullwidth closing parenthesis */
3332 else if (c1=='$') fold_state = 1; /* fullwidth comma */
3333 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
3334 else if (c1=='\'') fold_state = 1; /* original comment: fullwidth plus sign; NOTE(review): cell 0x27 of row 0x21 is the fullwidth colon -- confirm */
3335 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
3336 else if (c1==')') fold_state = 1; /* fullwidth question mark */
3337 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
3338 else if (c1=='+') fold_state = 1; /* voiced sound mark (dakuten) */
3339 else if (c1==',') fold_state = 1; /* semi-voiced sound mark (handakuten) */
3340 /* default no fold in kinsoku */
3343 f_line = char_size(c2,c1);
3344 /* add one new f_line before this character */
3347 f_line = char_size(c2,c1);
3349 /* add one new f_line before this character */
3354 /* terminator process */
3355 switch(fold_state) {
/* State kept between z_conv calls: a JIS X 0201 kana (z_prev2 == X0201,
   code in z_prev1) that is being held back until the next character
   shows whether a voiced / semi-voiced sound mark follows. */
3374 int z_prev2=0,z_prev1=0;
3381 /* if (c2) c1 &= 0x7f; assertion */
3383 if (x0201_f && z_prev2==X0201) { /* X0201 */
3384 if (c1==(0xde&0x7f)) { /* voiced sound mark (dakuten) */
/* emit the voiced form of the held kana from table dv */
3386 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
3388 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* semi-voiced sound mark (handakuten) */
/* emit the semi-voiced form of the held kana from table ev */
3390 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
/* no sound mark followed: emit the plain form from table cv */
3394 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
3403 if (x0201_f && c2==X0201) {
3404 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
3405 /* wait for a voiced (dakuten) or semi-voiced (handakuten) sound mark */
3406 z_prev1 = c1; z_prev2 = c2;
3409 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
3414 /* JISX0208 Alphabet */
3415 if (alpha_f && c2 == 0x23 ) {
3417 } else if (alpha_f && c2 == 0x21 ) {
3418 /* JISX0208 Kigou (symbols) */
3423 } else if (alpha_f&0x4) {
3428 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
/* HTML escaping: the four characters that are special in HTML are
   replaced by their named character references */
3434 case '>': entity = "&gt;"; break;
3435 case '<': entity = "&lt;"; break;
3436 case '\"': entity = "&quot;"; break;
3437 case '&': entity = "&amp;"; break;
/* send the reference one character at a time as single-byte output */
3440 while (*entity) (*o_zconv)(0, *entity++);
3450 #define rot13(c) ( \
3452 (c <= 'M') ? (c + 13): \
3453 (c <= 'Z') ? (c - 13): \
3455 (c <= 'm') ? (c + 13): \
3456 (c <= 'z') ? (c - 13): \
3460 #define rot47(c) ( \
3462 ( c <= 'O' ) ? (c + 47) : \
3463 ( c <= '~' ) ? (c - 47) : \
3471 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
3477 (*o_rot_conv)(c2,c1);
3484 if ((hira_f & 1) && c2==0x25 && 0x20<c1 && c1<0x74) {
3486 } else if ((hira_f & 2) && c2==0x24 && 0x20<c1 && c1<0x74) {
3489 (*o_hira_conv)(c2,c1);
/* Output filter placed in front of o_iso2022jp_check_conv: tests (c2,c1)
   against several byte ranges and a static table of code ranges, then
   passes the character on to the next stage. */
3494 iso2022jp_check_conv(c2,c1)
/* table of [start, end] pairs, RANGE_NUM_MAX rows */
3497 static int range[RANGE_NUM_MAX][2] = {
/* c2 in 0x00..0x20 together with c1 in 0x7f..0xff */
3520 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
/* first byte in rows 0x29..0x2f or 0x75..0x7e */
3524 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
/* scan the range table for an interval containing c */
3529 for (i = 0; i < RANGE_NUM_MAX; i++) {
3530 start = range[i][0];
3533 if (c >= start && c <= end) {
/* hand the (possibly replaced) character to the next output stage */
3538 (*o_iso2022jp_check_conv)(c2,c1);
3542 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/* Recognised MIME encoded-word prefixes, "=?charset?encoding?", written
   in upper case ("\075" is octal for '=').  mime_begin_strict() compares
   the input against these entries and uses the index of the match to
   look up mime_priority_func. */
3544 unsigned char *mime_pattern[] = {
3545 (unsigned char *)"\075?EUC-JP?B?",
3546 (unsigned char *)"\075?SHIFT_JIS?B?",
3547 (unsigned char *)"\075?ISO-8859-1?Q?",
3548 (unsigned char *)"\075?ISO-8859-1?B?",
3549 (unsigned char *)"\075?ISO-2022-JP?B?",
3550 (unsigned char *)"\075?ISO-2022-JP?Q?",
3551 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3552 (unsigned char *)"\075?UTF-8?B?",
3553 (unsigned char *)"\075?UTF-8?Q?",
3555 (unsigned char *)"\075?US-ASCII?Q?",
3560 /* Marker used to raise the priority of the matching input code */
/* Indexed like mime_pattern: the input converter that
   mime_begin_strict() installs with set_iconv() when the pattern at the
   same index matches. */
3561 int (*mime_priority_func[])PROTO((int c2, int c1, int c0)) = {
3562 e_iconv, s_iconv, 0, 0, 0, 0,
3563 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/* Character set code per MIME pattern; presumably in the same order as
   mime_pattern -- the first six entries line up with its first six
   prefixes (confirm against the users of this table). */
3569 int mime_encode[] = {
3570 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
3571 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/* Encoding letter per MIME pattern ('B' or 'Q'); the first six entries
   repeat the letter that ends the corresponding mime_pattern prefix. */
3578 int mime_encode_method[] = {
3579 'B', 'B','Q', 'B', 'B', 'Q',
3580 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/* Size of the recovery buffer used while matching a MIME prefix, and the
   upper bound on the characters mime_begin() examines. */
#define MAXRECOVER 20

/* Locale-independent character classification (toupper and friends are
   deliberately not used: their portability is not trusted).
   Every use of the argument is parenthesized so that expressions such as
   `c & 0xff` or `a ? b : c` can be passed.  The argument is still
   evaluated more than once: do not pass expressions with side effects. */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c) <= 'F'))
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
3602 if (i_getc!=mime_getc) {
3603 i_mgetc = i_getc; i_getc = mime_getc;
3604 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3605 if(mime_f==STRICT_MIME) {
3606 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3607 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/* Take the MIME reader back out of the input chain: restore the saved
   getc/ungetc pointers and, if a converter was saved when MIME decoding
   started, reinstall it. */
3613 unswitch_mime_getc()
/* strict mode inserted one more buffering layer: remove it first */
3615 if(mime_f==STRICT_MIME) {
3616 i_mgetc = i_mgetc_buf;
3617 i_mungetc = i_mungetc_buf;
3620 i_ungetc = i_mungetc;
/* restore the converter saved in mime_iconv_back, then forget it */
3621 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3622 mime_iconv_back = NULL;
/* Strict MIME start check, entered after "=?" has been read: match the
   following input against the mime_pattern entries, falling over to the
   next pattern that shares the characters read so far.  On a match the
   decode mode and input converter are set up and the result of
   mime_integrity() is returned. */
3626 mime_begin_strict(f)
3631 unsigned char *p,*q;
3632 int r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3634 mime_decode_mode = FALSE;
3635 /* =? has been checked */
3637 p = mime_pattern[j];
/* patterns end at their NUL terminator (anything <= ' ' stops the loop) */
3640 for(i=2;p[i]>' ';i++) { /* start at =? */
/* read one character, remember it in r[], compare case-insensitively */
3641 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
3642 /* pattern fails, try next one */
/* look for a later pattern with the same first i characters (q is the
   pattern tried so far) that also accepts the character just read */
3644 while ((p = mime_pattern[++j])) {
3645 for(k=2;k<i;k++) /* assume length(p) > i */
3646 if (p[k]!=q[k]) break;
3647 if (k==i && nkf_toupper(c1)==p[k]) break;
3649 if (p) continue; /* found next one, continue */
3650 /* all fails, output from recovery buffer */
/* the encoding letter ('B' or 'Q') sits two characters before the end */
3658 mime_decode_mode = p[i-2];
/* remember the current converter so unswitch_mime_getc() can restore it */
3660 mime_iconv_back = iconv;
3661 set_iconv(FALSE, mime_priority_func[j]);
3662 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3664 if (mime_decode_mode=='B') {
3665 mimebuf_f = unbuf_f;
3667 /* do MIME integrity check */
3668 return mime_integrity(f,mime_pattern[j]);
3680 /* we don't keep eof of Fifo, because it contains ?= as
3681 a terminator. It was checked in mime_integrity. */
3682 return ((mimebuf_f)?
3683 (*i_mgetc_buf)(f):Fifo(mime_input++));
3687 mime_ungetc_buf(c,f)
3692 (*i_mungetc_buf)(c,f);
3694 Fifo(--mime_input)=c;
3705 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
3706 /* re-read and convert again from mime_buffer. */
3708 /* =? has been checked */
3710 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
3711 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
3712 /* We accept any character type even if it is broken by new lines */
3713 c1 = (*i_getc)(f); Fifo(mime_last++)= c1 ;
3714 if (c1=='\n'||c1==' '||c1=='\r'||
3715 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
3717 /* Failed. But this could be another MIME preamble */
3725 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
3726 if (!(++i<MAXRECOVER) || c1==EOF) break;
3727 if (c1=='b'||c1=='B') {
3728 mime_decode_mode = 'B';
3729 } else if (c1=='q'||c1=='Q') {
3730 mime_decode_mode = 'Q';
3734 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
3735 if (!(++i<MAXRECOVER) || c1==EOF) break;
3737 mime_decode_mode = FALSE;
3743 if (!mime_decode_mode) {
3744 /* false MIME preamble, restart from mime_buffer */
3745 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
3746 /* Since we are in MIME mode until buffer becomes empty, */
3747 /* we never go into mime_begin again for a while. */
3750 /* discard mime preamble, and goto MIME mode */
3752 /* do no MIME integrity check */
3753 return c1; /* used only for checking EOF */
3768 fprintf(stderr, "%s\n", str);
/* Record codename as the detected input code.  If a different, non-empty
   name arrives than the one already stored, the input is flagged as
   mixed.  The pointer itself is stored (the string is not copied), so
   it must stay valid for as long as input_codename is used. */
3774 set_input_codename (codename)
3779 strcmp(codename, "") != 0 &&
3780 strcmp(codename, input_codename) != 0)
3782 is_inputcode_mixed = TRUE;
3784 input_codename = codename;
3785 is_inputcode_set = TRUE;
/* Print the guessed input code on stdout, one line, prefixed with
   "filename:" when filename is not NULL. */
3790 print_guessed_code (filename)
/* "BINARY" is what gets printed when the input was flagged as mixed */
3793 char *codename = "BINARY";
3794 if (!is_inputcode_mixed) {
/* nothing detected: input_codename is still the empty string */
3795 if (strcmp(input_codename, "") == 0) {
3798 codename = input_codename;
3801 if (filename != NULL) printf("%s:", filename);
3802 printf("%s\n", codename);
3810 if (nkf_isdigit(x)) return x - '0';
3811 return nkf_toupper(x) - 'A' + 10;
/* Shared reader for two-digit hexadecimal escapes introduced by the
   character ch (cap_getc passes ':', url_getc passes '%').  g and u are
   the underlying getc / ungetc functions.  The header is given twice:
   as an ANSI prototype when ANSI_C_PROTOTYPE is defined, otherwise in
   K&R form. */
3816 #ifdef ANSI_C_PROTOTYPE
3817 int hex_getc(int ch, FILE *f, int (*g)(FILE *f), int (*u)(int c, FILE *f))
3820 hex_getc(ch, f, g, u)
/* first character after ch is not a hex digit */
3833 if (!nkf_isxdigit(c2)){
/* second character is not a hex digit */
3838 if (!nkf_isxdigit(c3)){
/* both are hex digits: combine them into one byte value */
3843 return (hex2bin(c2) << 4) | hex2bin(c3);
3850 return hex_getc(':', f, i_cgetc, i_cungetc);
3858 return (*i_cungetc)(c, f);
3865 return hex_getc('%', f, i_ugetc, i_uungetc);
3873 return (*i_uungetc)(c, f);
3877 #ifdef NUMCHAR_OPTION
3882 int (*g)() = i_ngetc;
3883 int (*u)() = i_nungetc;
3894 if (buf[i] == 'x' || buf[i] == 'X'){
3895 for (j = 0; j < 5; j++){
3897 if (!nkf_isxdigit(buf[i])){
3904 c |= hex2bin(buf[i]);
3907 for (j = 0; j < 6; j++){
3911 if (!nkf_isdigit(buf[i])){
3918 c += hex2bin(buf[i]);
3924 return CLASS_UTF16 | c;
3934 numchar_ungetc(c, f)
3938 return (*i_nungetc)(c, f);
3942 #ifdef UNICODE_NORMALIZATION
3944 /* Normalization Form C */
3949 int (*g)() = i_nfc_getc;
3950 int (*u)() = i_nfc_ungetc;
3951 int i=0, j, k=1, lower, upper;
3954 extern struct normalization_pair normalization_table[];
3957 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
3958 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
3959 while (upper >= lower) {
3960 j = (lower+upper) / 2;
3961 array = normalization_table[j].nfd;
3962 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
3963 if (array[k] != buf[k]){
3964 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
3971 array = normalization_table[j].nfc;
3972 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
3989 return (*i_nfc_ungetc)(c, f);
3991 #endif /* UNICODE_NORMALIZATION */
3998 int c1, c2, c3, c4, cc;
3999 int t1, t2, t3, t4, mode, exit_mode;
4003 int lwsp_size = 128;
4005 if (mime_top != mime_last) { /* Something is in FIFO */
4006 return Fifo(mime_top++);
4008 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4009 mime_decode_mode=FALSE;
4010 unswitch_mime_getc();
4011 return (*i_getc)(f);
4014 if (mimebuf_f == FIXED_MIME)
4015 exit_mode = mime_decode_mode;
4018 if (mime_decode_mode == 'Q') {
4019 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4021 if (c1=='_') return ' ';
4022 if (c1<=' ' || DEL<=c1) {
4023 mime_decode_mode = exit_mode; /* prepare for quit */
4026 if (c1!='=' && c1!='?') {
4030 mime_decode_mode = exit_mode; /* prepare for quit */
4031 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4032 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4033 /* end Q encoding */
4034 input_mode = exit_mode;
4036 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4037 if (lwsp_buf==NULL) {
4038 perror("can't malloc");
4041 while ((c1=(*i_getc)(f))!=EOF) {
4046 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4054 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
4055 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4070 lwsp_buf[lwsp_count] = c1;
4071 if (lwsp_count++>lwsp_size){
4073 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4074 if (lwsp_buf_new==NULL) {
4077 perror("can't realloc");
4080 lwsp_buf = lwsp_buf_new;
4086 if (lwsp_count > 0) {
4087 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4091 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4092 i_ungetc(lwsp_buf[lwsp_count],f);
4100 if (c1=='='&&c2<' ') { /* this is soft wrap */
4101 while((c1 = (*i_mgetc)(f)) <=' ') {
4102 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4104 mime_decode_mode = 'Q'; /* still in MIME */
4105 goto restart_mime_q;
4108 mime_decode_mode = 'Q'; /* still in MIME */
4112 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4113 if (c2<=' ') return c2;
4114 mime_decode_mode = 'Q'; /* still in MIME */
4115 #define hex(c) (('0'<=c&&c<='9')?(c-'0'):\
4116 ('A'<=c&&c<='F')?(c-'A'+10):('a'<=c&&c<='f')?(c-'a'+10):0)
4117 return ((hex(c2)<<4) + hex(c3));
4120 if (mime_decode_mode != 'B') {
4121 mime_decode_mode = FALSE;
4122 return (*i_mgetc)(f);
4126 /* Base64 encoding */
4128 MIME allows line break in the middle of
4129 Base64, but we are very pessimistic in decoding
4130 in unbuf mode because MIME encoded code may be broken by
4131 less or an editor's control sequence (such as ESC-[-K) in unbuffered
4132 mode. Ignore incomplete MIME.
4134 mode = mime_decode_mode;
4135 mime_decode_mode = exit_mode; /* prepare for quit */
4137 while ((c1 = (*i_mgetc)(f))<=' ') {
4142 if ((c2 = (*i_mgetc)(f))<=' ') {
4145 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4146 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4149 if ((c1 == '?') && (c2 == '=')) {
4152 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4153 if (lwsp_buf==NULL) {
4154 perror("can't malloc");
4157 while ((c1=(*i_getc)(f))!=EOF) {
4162 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4170 if ((c1=(*i_getc)(f))!=EOF) {
4174 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4189 lwsp_buf[lwsp_count] = c1;
4190 if (lwsp_count++>lwsp_size){
4192 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4193 if (lwsp_buf_new==NULL) {
4196 perror("can't realloc");
4199 lwsp_buf = lwsp_buf_new;
4205 if (lwsp_count > 0) {
4206 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4210 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4211 i_ungetc(lwsp_buf[lwsp_count],f);
4220 if ((c3 = (*i_mgetc)(f))<=' ') {
4223 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4224 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4228 if ((c4 = (*i_mgetc)(f))<=' ') {
4231 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4232 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4236 mime_decode_mode = mode; /* still in MIME sigh... */
4238 /* BASE 64 decoding */
4240 t1 = 0x3f & base64decode(c1);
4241 t2 = 0x3f & base64decode(c2);
4242 t3 = 0x3f & base64decode(c3);
4243 t4 = 0x3f & base64decode(c4);
4244 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4246 Fifo(mime_last++) = cc;
4247 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4249 Fifo(mime_last++) = cc;
4250 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4252 Fifo(mime_last++) = cc;
4257 return Fifo(mime_top++);
4265 Fifo(--mime_top) = c;
4276 /* In buffered mode, read until =? or NL or buffer full
4278 mime_input = mime_top;
4279 mime_last = mime_top;
4280 while(*p) Fifo(mime_input++) = *p++;
4283 while((c=(*i_getc)(f))!=EOF) {
4284 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
4285 break; /* buffer full */
4287 if (c=='=' && d=='?') {
4288 /* checked. skip header, start decode */
4289 Fifo(mime_input++) = c;
4290 /* mime_last_input = mime_input; */
4295 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4297 /* Should we check length mod 4? */
4298 Fifo(mime_input++) = c;
4301 /* In case of Incomplete MIME, no MIME decode */
4302 Fifo(mime_input++) = c;
4303 mime_last = mime_input; /* point undecoded buffer */
4304 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
4305 switch_mime_getc(); /* anyway we need buffered getc */
4316 i = c - 'A'; /* A..Z 0-25 */
4318 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4320 } else if (c > '/') {
4321 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4322 } else if (c == '+') {
4323 i = '>' /* 62 */ ; /* + 62 */
4325 i = '?' /* 63 */ ; /* / 63 */
/* Base64 output alphabet: the encoder indexes this table with a 6-bit
 * value (0..63) to obtain the character to emit. */
4330 static char basis_64[] =
4331 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Number of characters the MIME encoder may hold back before it is
 * forced to flush. */
4334 #define MIMEOUT_BUF_LENGTH (60)
/* Pending characters of the MIME encoder.  One slot larger than
 * MIMEOUT_BUF_LENGTH because the encoder appends a character first and
 * only afterwards tests whether the count exceeds MIMEOUT_BUF_LENGTH. */
4335 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of valid characters currently held in mimeout_buf. */
4336 int mimeout_buf_count = 0;
/* When non-zero, the whitespace checks guarded by
 * `!mimeout_preserve_space` in the encoder are skipped; the encoder
 * resets it to FALSE.
 * NOTE(review): where it is set to a non-zero value is not visible in
 * this excerpt. */
4337 int mimeout_preserve_space = 0;
/* Map a value in 0..15 to its uppercase hexadecimal digit character.
 * NOTE(review): the argument is not parenthesized in the expansion, so
 * callers must pass a fully parenthesized expression (the existing call
 * sites do); consider `((c)>=10?(c)+'A'-10:(c)+'0')`. */
4338 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
4347 p = mime_pattern[0];
4348 for(i=0;mime_encode[i];i++) {
4349 if (mode == mime_encode[i]) {
4350 p = mime_pattern[i];
4354 mimeout_mode = mime_encode_method[i];
4357 if (base64_count>45) {
4358 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
4359 (*o_mputc)(mimeout_buf[i]);
4365 if (!mimeout_preserve_space && mimeout_buf_count>0
4366 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4367 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
4371 if (!mimeout_preserve_space) {
4372 for (;i<mimeout_buf_count;i++) {
4373 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4374 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
4375 (*o_mputc)(mimeout_buf[i]);
4382 mimeout_preserve_space = FALSE;
4388 j = mimeout_buf_count;
4389 mimeout_buf_count = 0;
4391 mime_putc(mimeout_buf[i]);
4407 switch(mimeout_mode) {
4412 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
4418 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
4424 if (mimeout_f!=FIXED_MIME) {
4426 } else if (mimeout_mode != 'Q')
4435 switch(mimeout_mode) {
4440 } else if (c==CR||c==NL) {
4443 } else if(c<SPACE||c=='='||c=='?'||c=='_'||DEL<=c) {
4445 (*o_mputc)(itoh4(((c>>4)&0xf)));
4446 (*o_mputc)(itoh4((c&0xf)));
4455 (*o_mputc)(basis_64[c>>2]);
4460 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4466 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
4467 (*o_mputc)(basis_64[c & 0x3F]);
/* Last (c2, c1) pair seen by mime_prechar. */
/* mime_prechar(c2, c1): hook invoked with a character pair before it is
 * passed on for MIME encoding.
 *
 * Visible behavior: when base64_count + mimeout_buf_count/3*4 exceeds 66
 * it pushes (EOF,0), (0,NL) and (0,SPACE) through o_base64conv, i.e. it
 * closes the current encoded word and continues on a folded line.
 *
 * NOTE(review): the body contains a commented-out `else if` branch whose
 * end is not visible in this excerpt, so it cannot be determined from here
 * which of the later statements (the extra SPACE output and the updates
 * of mime_lastchar2/mime_lastchar1) are live code; confirm against the
 * full source before relying on them. */
4478 int mime_lastchar2, mime_lastchar1;
4480 void mime_prechar(c2, c1)
4485 if (base64_count + mimeout_buf_count/3*4> 66){
4486 (*o_base64conv)(EOF,0);
4487 (*o_base64conv)(0,NL);
4488 (*o_base64conv)(0,SPACE);
4490 }/*else if (mime_lastchar2){
4491 if (c1 <=DEL && !nkf_isspace(c1)){
4492 (*o_base64conv)(0,SPACE);
4496 if (c2 && mime_lastchar2 == 0
4497 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
4498 (*o_base64conv)(0,SPACE);
4501 mime_lastchar2 = c2;
4502 mime_lastchar1 = c1;
4513 if (mimeout_f == FIXED_MIME){
4514 if (mimeout_mode == 'Q'){
4515 if (base64_count > 71){
4516 if (c!=CR && c!=NL) {
4523 if (base64_count > 71){
4528 if (c == EOF) { /* c==EOF */
4532 if (c != EOF) { /* c!=EOF */
4538 /* mimeout_f != FIXED_MIME */
4540 if (c == EOF) { /* c==EOF */
4541 j = mimeout_buf_count;
4542 mimeout_buf_count = 0;
4545 /*if (nkf_isspace(mimeout_buf[i])){
4548 mimeout_addchar(mimeout_buf[i]);
4552 (*o_mputc)(mimeout_buf[i]);
4558 if (mimeout_mode=='Q') {
4559 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
4571 if (mimeout_buf_count > 0){
4572 lastchar = mimeout_buf[mimeout_buf_count - 1];
4577 if (!mimeout_mode) {
4578 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
4579 if (nkf_isspace(c)) {
4580 if (c==CR || c==NL) {
4583 for (i=0;i<mimeout_buf_count;i++) {
4584 (*o_mputc)(mimeout_buf[i]);
4585 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
4592 mimeout_buf_count = 1;
4594 if (base64_count > 1
4595 && base64_count + mimeout_buf_count > 76){
4598 if (!nkf_isspace(mimeout_buf[0])){
4603 mimeout_buf[mimeout_buf_count++] = c;
4604 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
4605 open_mime(output_mode);
4610 if (lastchar==CR || lastchar == NL){
4611 for (i=0;i<mimeout_buf_count;i++) {
4612 (*o_mputc)(mimeout_buf[i]);
4615 mimeout_buf_count = 0;
4617 if (lastchar==SPACE) {
4618 for (i=0;i<mimeout_buf_count-1;i++) {
4619 (*o_mputc)(mimeout_buf[i]);
4622 mimeout_buf[0] = SPACE;
4623 mimeout_buf_count = 1;
4625 open_mime(output_mode);
4628 /* mimeout_mode == 'B', 1, 2 */
4629 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
4630 if (lastchar == CR || lastchar == NL){
4631 if (nkf_isblank(c)) {
4632 for (i=0;i<mimeout_buf_count;i++) {
4633 mimeout_addchar(mimeout_buf[i]);
4635 mimeout_buf_count = 0;
4636 } else if (SPACE<c && c<DEL) {
4638 for (i=0;i<mimeout_buf_count;i++) {
4639 (*o_mputc)(mimeout_buf[i]);
4642 mimeout_buf_count = 0;
4645 if (c==SPACE || c==TAB || c==CR || c==NL) {
4646 for (i=0;i<mimeout_buf_count;i++) {
4647 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
4649 for (i=0;i<mimeout_buf_count;i++) {
4650 (*o_mputc)(mimeout_buf[i]);
4653 mimeout_buf_count = 0;
4656 mimeout_buf[mimeout_buf_count++] = c;
4657 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
4659 for (i=0;i<mimeout_buf_count;i++) {
4660 (*o_mputc)(mimeout_buf[i]);
4663 mimeout_buf_count = 0;
4667 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
4668 mimeout_buf[mimeout_buf_count++] = c;
4669 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
4670 j = mimeout_buf_count;
4671 mimeout_buf_count = 0;
4673 mimeout_addchar(mimeout_buf[i]);
4680 if (mimeout_buf_count>0) {
4681 j = mimeout_buf_count;
4682 mimeout_buf_count = 0;
4684 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
4686 mimeout_addchar(mimeout_buf[i]);
4692 (*o_mputc)(mimeout_buf[i]);
4694 open_mime(output_mode);
4701 #if defined(PERL_XS) || defined(WIN32DLL)
4706 struct input_code *p = input_code_list;
4719 mime_f = STRICT_MIME;
4720 mime_decode_f = FALSE;
4725 #if defined(MSDOS) || defined(__OS2__)
4730 iso2022jp_f = FALSE;
4731 #ifdef UNICODE_ENABLE
4732 internal_unicode_f = FALSE;
4734 #ifdef UTF8_OUTPUT_ENABLE
4737 ms_ucs_map_f = FALSE;
4739 #ifdef UNICODE_NORMALIZATION
4752 is_inputcode_mixed = FALSE;
4753 is_inputcode_set = FALSE;
4757 #ifdef SHIFTJIS_CP932
4763 for (i = 0; i < 256; i++){
4764 prefix_table[i] = 0;
4767 #ifdef UTF8_INPUT_ENABLE
4768 utf16_mode = UTF16BE_INPUT;
4770 mimeout_buf_count = 0;
4775 fold_preserve_f = FALSE;
4778 kanji_intro = DEFAULT_J;
4779 ascii_intro = DEFAULT_R;
4780 fold_margin = FOLD_MARGIN;
4781 output_conv = DEFAULT_CONV;
4782 oconv = DEFAULT_CONV;
4783 o_zconv = no_connection;
4784 o_fconv = no_connection;
4785 o_crconv = no_connection;
4786 o_rot_conv = no_connection;
4787 o_hira_conv = no_connection;
4788 o_base64conv = no_connection;
4789 o_iso2022jp_check_conv = no_connection;
4792 i_ungetc = std_ungetc;
4794 i_bungetc = std_ungetc;
4797 i_mungetc = std_ungetc;
4798 i_mgetc_buf = std_getc;
4799 i_mungetc_buf = std_ungetc;
4800 output_mode = ASCII;
4803 mime_decode_mode = FALSE;
4809 z_prev2=0,z_prev1=0;
4811 iconv_for_check = 0;
4813 input_codename = "";
/* Placeholder converter with the two-argument (c2, c1) signature.  It is
 * the value assigned to the optional conversion hooks (o_zconv, o_fconv,
 * o_crconv, ...) elsewhere in this file, and simply forwards to
 * no_connection2 with c0 = 0. */
4821 no_connection(c2,c1)
4824 no_connection2(c2,c1,0);
/* Three-argument placeholder converter: reports on stderr that a
 * conversion hook was called without a real module connected to it.
 * The trailing `return 0` is marked as being there for lint only.
 * NOTE(review): any statement between the message and the return is not
 * visible in this excerpt. */
4828 no_connection2(c2,c1,c0)
4831 fprintf(stderr,"nkf internal module connection failure.\n");
4833 return 0; /* LINT */
4838 #define fprintf dllprintf
4843 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
4844 fprintf(stderr,"Flags:\n");
4845 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
4846 #ifdef DEFAULT_CODE_SJIS
4847 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC), UTF-8N\n");
4849 #ifdef DEFAULT_CODE_JIS
4850 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC), UTF-8N\n");
4852 #ifdef DEFAULT_CODE_EUC
4853 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT), UTF-8N\n");
4855 #ifdef DEFAULT_CODE_UTF8
4856 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC), UTF-8N (DEFAULT)\n");
4858 #ifdef UTF8_OUTPUT_ENABLE
4859 fprintf(stderr," After 'w' you can add more options. (80?|16((B|L)0?)?) \n");
4861 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, AT&T JIS (EUC), UTF-8\n");
4862 #ifdef UTF8_INPUT_ENABLE
4863 fprintf(stderr," After 'W' you can add more options. (8|16(B|L)?) \n");
4865 fprintf(stderr,"t no conversion\n");
4866 fprintf(stderr,"i_/o_ Output sequence to designate JIS-kanji/ASCII (DEFAULT B)\n");
4867 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
4868 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
4869 fprintf(stderr,"v Show this usage. V: show version\n");
4870 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
4871 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
4872 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
4873 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
4874 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII 1: Kankaku to space,2: 2 spaces,\n");
4875 fprintf(stderr," 3: Convert HTML Entity\n");
4876 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
4877 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
4879 fprintf(stderr,"T Text mode output\n");
4881 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
4882 fprintf(stderr,"d,c Delete \\r in line feed and \\032, Add \\r in line feed\n");
4883 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
4884 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
4885 fprintf(stderr,"long name options\n");
4886 fprintf(stderr," --fj,--unix,--mac,--windows convert for the system\n");
4887 fprintf(stderr," --jis,--euc,--sjis,--utf8,--utf16,--mime,--base64 convert for the code\n");
4888 fprintf(stderr," --hiragana, --katakana Hiragana/Katakana Conversion\n");
4889 fprintf(stderr," --x0212 Convert JISX0212\n");
4890 fprintf(stderr," --cp932, --no-cp932 CP932 compatibility\n");
4891 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
4893 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
4895 #ifdef NUMCHAR_OPTION
4896 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
4898 #ifdef UNICODE_NORMALIZATION
4899 fprintf(stderr," --utf8mac-input UTF-8-MAC input\n");
4901 #ifdef UTF8_OUTPUT_ENABLE
4902 fprintf(stderr," --ms-ucs-map Microsoft UCS Mapping Compatible\n");
4905 fprintf(stderr," --overwrite Overwrite original listed files by filtered result\n");
4907 fprintf(stderr," -g, --guess Guess the input code\n");
4908 fprintf(stderr," --help,--version\n");
4915 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
4916 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__)
4919 #if defined(MSDOS) && defined(__WIN16__)
4922 #if defined(MSDOS) && defined(__WIN32__)
4928 ,NKF_VERSION,NKF_RELEASE_DATE);
4929 fprintf(stderr,"\n%s\n",CopyRight);
4934 ** Patch authors:
4935 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
4936 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
4937 ** ohta@src.ricoh.co.jp (Junn Ohta)
4938 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
4939 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
4940 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
4941 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
4942 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
4943 ** GHG00637@nifty-serve.or.jp (COW)