1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.91 2006/03/04 17:07:59 naruse Exp $ */
43 #define NKF_VERSION "2.0.5"
44 #define NKF_RELEASE_DATE "2006-03-04"
48 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW, 2002-2006 Kono, Furukawa, Naruse"
55 ** USAGE: nkf [flags] [file]
58 ** b Output is buffered (DEFAULT)
59 ** u Output is unbuffered
63 ** j Output code is JIS 7 bit (DEFAULT SELECT)
64 ** s Output code is MS Kanji (DEFAULT SELECT)
65 ** e Output code is AT&T JIS (DEFAULT SELECT)
66 ** w Output code is UTF-8 (DEFAULT SELECT)
67 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
69 ** m MIME conversion for ISO-2022-JP
70 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
71 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
72 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
73 ** M MIME output conversion
75 ** r {de/en}crypt ROT13/47
79 ** T Text mode output (for MS-DOS)
81 ** x Do not convert X0201 kana into X0208
82 ** Z Convert X0208 alphabet to ASCII
87 ** B try to fix broken JIS, missing Escape
88 ** B[1-9] broken level
90 ** O Output to 'nkf.out' file or last file name
91 ** d Delete \r in line feed
92 ** c Add \r in line feed
93 ** -- other long option
94 ** -- ignore following option (don't use with -O )
98 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__)) && !defined(MSDOS)
100 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
116 #if defined(MSDOS) || defined(__OS2__)
123 #define setbinmode(fp) fsetbin(fp)
124 #else /* Microsoft C, Turbo C */
125 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
127 #else /* UNIX,OS/2 */
128 #define setbinmode(fp)
131 #ifdef _IOFBF /* SysV and MSDOS, Windows */
132 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
134 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
137 /*Borland C++ 4.5 EasyWin*/
138 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
147 /* added by satoru@isoternet.org */
148 #include <sys/stat.h>
149 #ifndef MSDOS /* UNIX, OS/2 */
152 #else /* defined(MSDOS) */
154 #ifdef __BORLANDC__ /* BCC32 */
156 #else /* !defined(__BORLANDC__) */
157 #include <sys/utime.h>
158 #endif /* (__BORLANDC__) */
159 #else /* !defined(__WIN32__) */
160 #if defined(_MSC_VER) || defined(__MINGW32__) /* VC++, MinGW */
161 #include <sys/utime.h>
162 #elif defined(__TURBOC__) /* BCC */
164 #elif defined(LSI_C) /* LSI C */
165 #endif /* (__WIN32__) */
177 /* state of output_mode and input_mode
/*
 * Mode codes for the two planes of JIS X 0213.
 *
 * Each plane needs its own value; previously both macros were 0x2850, so
 * plane 1 and plane 2 were indistinguishable.  The low byte follows the
 * final byte of the ISO-2022-JP-3 designation: ESC $ ( O (0x4F) selects
 * plane 1 and ESC $ ( P (0x50) selects plane 2.
 * NOTE(review): the users of these constants are outside this view --
 * confirm that they expect the final byte in the low 7 bits.
 */
#define X0213_1 0x284F
#define X0213_2 0x2850
197 /* Input Assumption */
201 #define LATIN1_INPUT 6
203 #define STRICT_MIME 8
208 #define JAPANESE_EUC 10
212 #define UTF8_INPUT 13
213 #define UTF16BE_INPUT 14
214 #define UTF16LE_INPUT 15
/*
 * ASCII-only character classification and conversion helpers.
 *
 * These are used instead of <ctype.h> because the portability of the
 * library versions (toupper in particular) is not trusted.
 *
 * Every use of a macro parameter is parenthesized so that an argument
 * containing a low-precedence operator (for example a ?: expression) is
 * classified as a single value.  Arguments are still evaluated more than
 * once, so do not pass expressions with side effects such as *p++.
 *
 * nkf_isblank and nkf_isspace rely on SPACE, TAB, CR and NL, which are
 * defined elsewhere in this file.
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
/* Value of one hexadecimal digit; the caller must ensure x is a hex digit. */
#define hex2bin(x)      (nkf_isdigit(x) ? (x) - '0' : nkf_toupper(x) - 'A' + 10)
248 #define HOLD_SIZE 1024
249 #define IOBUF_SIZE 16384
251 #define DEFAULT_J 'B'
252 #define DEFAULT_R 'B'
254 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
255 #define SJ6394 0x0161 /* 63 - 94 ku offset */
257 #define RANGE_NUM_MAX 18
262 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
263 #define sizeof_euc_to_utf8_1byte 94
264 #define sizeof_euc_to_utf8_2bytes 94
265 #define sizeof_utf8_to_euc_C2 64
266 #define sizeof_utf8_to_euc_E5B8 64
267 #define sizeof_utf8_to_euc_2bytes 112
268 #define sizeof_utf8_to_euc_3bytes 16
271 /* MIME preprocessor */
273 #ifdef EASYWIN /*Easy Win */
274 extern POINT _BufferSize;
277 /* function prototype */
279 #ifdef ANSI_C_PROTOTYPE
281 #define STATIC static
295 void (*status_func)PROTO((struct input_code *, int));
296 int (*iconv_func)PROTO((int c2, int c1, int c0));
300 STATIC char *input_codename = "";
303 STATIC const char *CopyRight = COPY_RIGHT;
305 #if !defined(PERL_XS) && !defined(WIN32DLL)
306 STATIC int noconvert PROTO((FILE *f));
308 STATIC int kanji_convert PROTO((FILE *f));
309 STATIC int h_conv PROTO((FILE *f,int c2,int c1));
310 STATIC int push_hold_buf PROTO((int c2));
311 STATIC void set_iconv PROTO((int f, int (*iconv_func)(int c2,int c1,int c0)));
312 STATIC int s_iconv PROTO((int c2,int c1,int c0));
313 STATIC int s2e_conv PROTO((int c2, int c1, int *p2, int *p1));
314 STATIC int e_iconv PROTO((int c2,int c1,int c0));
315 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
317 * 0: Shift_JIS, eucJP-ascii
321 #define UCS_MAP_ASCII 0
323 #define UCS_MAP_CP932 2
324 STATIC int ms_ucs_map_f = UCS_MAP_ASCII;
326 #ifdef UTF8_INPUT_ENABLE
327 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
328 STATIC int no_cp932ext_f = FALSE;
329 /* ignore ZERO WIDTH NO-BREAK SPACE */
330 STATIC int ignore_zwnbsp_f = TRUE;
331 STATIC int no_best_fit_chars_f = FALSE;
332 STATIC int unicode_subchar = '?'; /* the regular substitution character */
333 STATIC void encode_fallback_html PROTO((int c));
334 STATIC void encode_fallback_xml PROTO((int c));
335 STATIC void encode_fallback_java PROTO((int c));
336 STATIC void encode_fallback_perl PROTO((int c));
337 STATIC void encode_fallback_subchar PROTO((int c));
338 STATIC void (*encode_fallback)PROTO((int c)) = NULL;
339 STATIC int w2e_conv PROTO((int c2,int c1,int c0,int *p2,int *p1));
340 STATIC int w_iconv PROTO((int c2,int c1,int c0));
341 STATIC int w_iconv16 PROTO((int c2,int c1,int c0));
342 STATIC int unicode_to_jis_common PROTO((int c2,int c1,int c0,int *p2,int *p1));
343 STATIC int w_iconv_common PROTO((int c1,int c0,const unsigned short *const *pp,int psize,int *p2,int *p1));
344 STATIC int ww16_conv PROTO((int c2, int c1, int c0));
345 STATIC int w16e_conv PROTO((unsigned short val,int *p2,int *p1));
347 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
348 STATIC int internal_unicode_f = FALSE; /* Internal Unicode Processing */
350 #ifdef UTF8_OUTPUT_ENABLE
351 STATIC int unicode_bom_f= 0; /* Output Unicode BOM */
352 STATIC int w_oconv16_LE = 0; /* utf-16 little endian */
353 STATIC int e2w_conv PROTO((int c2,int c1));
354 STATIC void w_oconv PROTO((int c2,int c1));
355 STATIC void w_oconv16 PROTO((int c2,int c1));
357 STATIC void e_oconv PROTO((int c2,int c1));
358 STATIC int e2s_conv PROTO((int c2, int c1, int *p2, int *p1));
359 STATIC void s_oconv PROTO((int c2,int c1));
360 STATIC void j_oconv PROTO((int c2,int c1));
361 STATIC void fold_conv PROTO((int c2,int c1));
362 STATIC void cr_conv PROTO((int c2,int c1));
363 STATIC void z_conv PROTO((int c2,int c1));
364 STATIC void rot_conv PROTO((int c2,int c1));
365 STATIC void hira_conv PROTO((int c2,int c1));
366 STATIC void base64_conv PROTO((int c2,int c1));
367 STATIC void iso2022jp_check_conv PROTO((int c2,int c1));
368 STATIC void no_connection PROTO((int c2,int c1));
369 STATIC int no_connection2 PROTO((int c2,int c1,int c0));
371 STATIC void code_score PROTO((struct input_code *ptr));
372 STATIC void code_status PROTO((int c));
374 STATIC void std_putc PROTO((int c));
375 STATIC int std_getc PROTO((FILE *f));
376 STATIC int std_ungetc PROTO((int c,FILE *f));
378 STATIC int broken_getc PROTO((FILE *f));
379 STATIC int broken_ungetc PROTO((int c,FILE *f));
381 STATIC int mime_begin PROTO((FILE *f));
382 STATIC int mime_getc PROTO((FILE *f));
383 STATIC int mime_ungetc PROTO((int c,FILE *f));
385 STATIC int mime_begin_strict PROTO((FILE *f));
386 STATIC int mime_getc_buf PROTO((FILE *f));
387 STATIC int mime_ungetc_buf PROTO((int c,FILE *f));
388 STATIC int mime_integrity PROTO((FILE *f,const unsigned char *p));
390 STATIC int base64decode PROTO((int c));
391 STATIC void mime_prechar PROTO((int c2, int c1));
392 STATIC void mime_putc PROTO((int c));
393 STATIC void open_mime PROTO((int c));
394 STATIC void close_mime PROTO(());
396 STATIC void usage PROTO(());
397 STATIC void version PROTO(());
399 STATIC void options PROTO((unsigned char *c));
400 #if defined(PERL_XS) || defined(WIN32DLL)
401 STATIC void reinit PROTO(());
406 #if !defined(PERL_XS) && !defined(WIN32DLL)
407 STATIC unsigned char stdibuf[IOBUF_SIZE];
408 STATIC unsigned char stdobuf[IOBUF_SIZE];
410 STATIC unsigned char hold_buf[HOLD_SIZE*2];
411 STATIC int hold_count;
413 /* MIME preprocessor fifo */
415 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
416 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
417 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
418 STATIC unsigned char mime_buf[MIME_BUF_SIZE];
419 STATIC unsigned int mime_top = 0;
420 STATIC unsigned int mime_last = 0; /* decoded */
421 STATIC unsigned int mime_input = 0; /* undecoded */
422 STATIC int (*mime_iconv_back)PROTO((int c2,int c1,int c0)) = NULL;
425 STATIC int unbuf_f = FALSE;
426 STATIC int estab_f = FALSE;
427 STATIC int nop_f = FALSE;
428 STATIC int binmode_f = TRUE; /* binary mode */
429 STATIC int rot_f = FALSE; /* rot13/47 mode */
430 STATIC int hira_f = FALSE; /* hira/kata henkan */
431 STATIC int input_f = FALSE; /* non fixed input code */
432 STATIC int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
433 STATIC int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
434 STATIC int mime_decode_f = FALSE; /* mime decode is explicitly on */
435 STATIC int mimebuf_f = FALSE; /* MIME buffered input */
436 STATIC int broken_f = FALSE; /* convert ESC-less broken JIS */
437 STATIC int iso8859_f = FALSE; /* ISO8859 through */
438 STATIC int mimeout_f = FALSE; /* base64 mode */
439 #if defined(MSDOS) || defined(__OS2__)
440 STATIC int x0201_f = TRUE; /* Assume JISX0201 kana */
442 STATIC int x0201_f = NO_X0201; /* Assume NO JISX0201 */
444 STATIC int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
446 #ifdef UNICODE_NORMALIZATION
447 STATIC int nfc_f = FALSE;
448 STATIC int (*i_nfc_getc)PROTO((FILE *)) = std_getc; /* input of ugetc */
449 STATIC int (*i_nfc_ungetc)PROTO((int c ,FILE *f)) = std_ungetc;
450 STATIC int nfc_getc PROTO((FILE *f));
451 STATIC int nfc_ungetc PROTO((int c,FILE *f));
455 STATIC int cap_f = FALSE;
456 STATIC int (*i_cgetc)PROTO((FILE *)) = std_getc; /* input of cgetc */
457 STATIC int (*i_cungetc)PROTO((int c ,FILE *f)) = std_ungetc;
458 STATIC int cap_getc PROTO((FILE *f));
459 STATIC int cap_ungetc PROTO((int c,FILE *f));
461 STATIC int url_f = FALSE;
462 STATIC int (*i_ugetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
463 STATIC int (*i_uungetc)PROTO((int c ,FILE *f)) = std_ungetc;
464 STATIC int url_getc PROTO((FILE *f));
465 STATIC int url_ungetc PROTO((int c,FILE *f));
468 #ifdef NUMCHAR_OPTION
469 #define CLASS_MASK 0x0f000000
470 #define CLASS_UTF16 0x01000000
471 STATIC int numchar_f = FALSE;
472 STATIC int (*i_ngetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
473 STATIC int (*i_nungetc)PROTO((int c ,FILE *f)) = std_ungetc;
474 STATIC int numchar_getc PROTO((FILE *f));
475 STATIC int numchar_ungetc PROTO((int c,FILE *f));
479 STATIC int noout_f = FALSE;
480 STATIC void no_putc PROTO((int c));
481 STATIC int debug_f = FALSE;
482 STATIC void debug PROTO((const char *str));
483 STATIC int (*iconv_for_check)() = 0;
486 STATIC int guess_f = FALSE;
488 STATIC void print_guessed_code PROTO((char *filename));
490 STATIC void set_input_codename PROTO((char *codename));
491 STATIC int is_inputcode_mixed = FALSE;
492 STATIC int is_inputcode_set = FALSE;
495 STATIC int exec_f = 0;
498 #ifdef SHIFTJIS_CP932
499 /* invert IBM extended characters to others */
500 STATIC int cp51932_f = TRUE;
501 #define CP932_TABLE_BEGIN (0xfa)
502 #define CP932_TABLE_END (0xfc)
504 /* invert NEC-selected IBM extended characters to IBM extended characters */
505 STATIC int cp932inv_f = TRUE;
506 #define CP932INV_TABLE_BEGIN (0xed)
507 #define CP932INV_TABLE_END (0xee)
509 /* STATIC int cp932_conv PROTO((int c2, int c1)); */
510 #endif /* SHIFTJIS_CP932 */
513 STATIC int x0212_f = FALSE;
514 STATIC int x0212_shift PROTO((int c));
515 STATIC int x0212_unshift PROTO((int c));
517 STATIC int x0213_f = FALSE;
519 STATIC unsigned char prefix_table[256];
521 STATIC void e_status PROTO((struct input_code *, int));
522 STATIC void s_status PROTO((struct input_code *, int));
524 #ifdef UTF8_INPUT_ENABLE
525 STATIC void w_status PROTO((struct input_code *, int));
526 STATIC void w16_status PROTO((struct input_code *, int));
527 STATIC int utf16_mode = UTF16BE_INPUT;
530 struct input_code input_code_list[] = {
531 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
532 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
533 #ifdef UTF8_INPUT_ENABLE
534 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
535 {"UTF-16", 0, 0, 0, {0, 0, 0}, w16_status, w_iconv16, 0},
540 STATIC int mimeout_mode = 0;
541 STATIC int base64_count = 0;
543 /* X0208 -> ASCII converter */
546 STATIC int f_line = 0; /* chars in line */
547 STATIC int f_prev = 0;
548 STATIC int fold_preserve_f = FALSE; /* preserve new lines */
549 STATIC int fold_f = FALSE;
550 STATIC int fold_len = 0;
553 STATIC unsigned char kanji_intro = DEFAULT_J;
554 STATIC unsigned char ascii_intro = DEFAULT_R;
558 #define FOLD_MARGIN 10
559 #define DEFAULT_FOLD 60
561 STATIC int fold_margin = FOLD_MARGIN;
565 #ifdef DEFAULT_CODE_JIS
566 # define DEFAULT_CONV j_oconv
568 #ifdef DEFAULT_CODE_SJIS
569 # define DEFAULT_CONV s_oconv
571 #ifdef DEFAULT_CODE_EUC
572 # define DEFAULT_CONV e_oconv
574 #ifdef DEFAULT_CODE_UTF8
575 # define DEFAULT_CONV w_oconv
578 /* process default */
579 STATIC void (*output_conv)PROTO((int c2,int c1)) = DEFAULT_CONV;
581 STATIC void (*oconv)PROTO((int c2,int c1)) = no_connection;
582 /* s_iconv or oconv */
583 STATIC int (*iconv)PROTO((int c2,int c1,int c0)) = no_connection2;
585 STATIC void (*o_zconv)PROTO((int c2,int c1)) = no_connection;
586 STATIC void (*o_fconv)PROTO((int c2,int c1)) = no_connection;
587 STATIC void (*o_crconv)PROTO((int c2,int c1)) = no_connection;
588 STATIC void (*o_rot_conv)PROTO((int c2,int c1)) = no_connection;
589 STATIC void (*o_hira_conv)PROTO((int c2,int c1)) = no_connection;
590 STATIC void (*o_base64conv)PROTO((int c2,int c1)) = no_connection;
591 STATIC void (*o_iso2022jp_check_conv)PROTO((int c2,int c1)) = no_connection;
593 /* STATIC redirections */
595 STATIC void (*o_putc)PROTO((int c)) = std_putc;
597 STATIC int (*i_getc)PROTO((FILE *f)) = std_getc; /* general input */
598 STATIC int (*i_ungetc)PROTO((int c,FILE *f)) =std_ungetc;
600 STATIC int (*i_bgetc)PROTO((FILE *)) = std_getc; /* input of mgetc */
601 STATIC int (*i_bungetc)PROTO((int c ,FILE *f)) = std_ungetc;
603 STATIC void (*o_mputc)PROTO((int c)) = std_putc ; /* output of mputc */
605 STATIC int (*i_mgetc)PROTO((FILE *)) = std_getc; /* input of mgetc */
606 STATIC int (*i_mungetc)PROTO((int c ,FILE *f)) = std_ungetc;
608 /* for strict mime */
609 STATIC int (*i_mgetc_buf)PROTO((FILE *)) = std_getc; /* input of mgetc_buf */
610 STATIC int (*i_mungetc_buf)PROTO((int c,FILE *f)) = std_ungetc;
613 STATIC int output_mode = ASCII, /* output kanji mode */
614 input_mode = ASCII, /* input kanji mode */
615 shift_mode = FALSE; /* TRUE shift out, or X0201 */
616 STATIC int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
618 /* X0201 / X0208 conversion tables */
620 /* X0201 kana conversion table */
623 unsigned char cv[]= {
624 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
625 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
626 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
627 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
628 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
629 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
630 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
631 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
632 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
633 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
634 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
635 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
636 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
637 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
638 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
639 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
643 /* X0201 kana conversion table for dakuten (voiced sound mark) */
646 unsigned char dv[]= {
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
648 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
651 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
652 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
653 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
654 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
655 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
656 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
657 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
658 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 /* X0201 kana conversion table for handakuten (semi-voiced sound mark) */
668 unsigned char ev[]= {
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
677 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
678 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
679 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
680 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
681 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
682 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
683 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
684 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
688 /* X0208 kigou conversion table */
689 /* 0x8140 - 0x819e */
691 unsigned char fv[] = {
693 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
694 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
695 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
696 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
697 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
698 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
699 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
701 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
702 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
703 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
704 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
710 STATIC int file_out = FALSE;
712 STATIC int overwrite = FALSE;
715 STATIC int crmode_f = 0; /* CR, NL, CRLF */
716 #ifdef EASYWIN /*Easy Win */
717 STATIC int end_check;
720 #define STD_GC_BUFSIZE (256)
721 int std_gc_buf[STD_GC_BUFSIZE];
725 #include "nkf32dll.c"
726 #elif defined(PERL_XS)
736 char *outfname = NULL;
739 #ifdef EASYWIN /*Easy Win */
740 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
743 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
744 cp = (unsigned char *)*argv;
749 if (pipe(fds) < 0 || (pid = fork()) < 0){
760 execvp(argv[1], &argv[1]);
774 if(x0201_f == WISH_TRUE)
775 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
777 if (binmode_f == TRUE)
779 if (freopen("","wb",stdout) == NULL)
786 setbuf(stdout, (char *) NULL);
788 setvbuffer(stdout, stdobuf, IOBUF_SIZE);
791 if (binmode_f == TRUE)
793 if (freopen("","rb",stdin) == NULL) return (-1);
797 setvbuffer(stdin, stdibuf, IOBUF_SIZE);
801 kanji_convert(stdin);
802 if (guess_f) print_guessed_code(NULL);
807 is_inputcode_mixed = FALSE;
808 is_inputcode_set = FALSE;
813 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
822 /* reopen file for stdout */
823 if (file_out == TRUE) {
826 outfname = malloc(strlen(origfname)
827 + strlen(".nkftmpXXXXXX")
833 strcpy(outfname, origfname);
837 for (i = strlen(outfname); i; --i){
838 if (outfname[i - 1] == '/'
839 || outfname[i - 1] == '\\'){
845 strcat(outfname, "ntXXXXXX");
847 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC,
850 strcat(outfname, ".nkftmpXXXXXX");
851 fd = mkstemp(outfname);
854 || (fd_backup = dup(fileno(stdout))) < 0
855 || dup2(fd, fileno(stdout)) < 0
866 outfname = "nkf.out";
869 if(freopen(outfname, "w", stdout) == NULL) {
873 if (binmode_f == TRUE) {
875 if (freopen("","wb",stdout) == NULL)
882 if (binmode_f == TRUE)
884 if (freopen("","rb",fin) == NULL)
889 setvbuffer(fin, stdibuf, IOBUF_SIZE);
893 char *filename = NULL;
895 if (nfiles > 1) filename = origfname;
896 if (guess_f) print_guessed_code(filename);
902 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
910 if (dup2(fd_backup, fileno(stdout)) < 0){
913 if (stat(origfname, &sb)) {
914 fprintf(stderr, "Can't stat %s\n", origfname);
916 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
917 if (chmod(outfname, sb.st_mode)) {
918 fprintf(stderr, "Can't set permission %s\n", outfname);
921 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
922 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
923 tb[0] = tb[1] = sb.st_mtime;
924 if (utime(outfname, tb)) {
925 fprintf(stderr, "Can't set timestamp %s\n", outfname);
928 tb.actime = sb.st_atime;
929 tb.modtime = sb.st_mtime;
930 if (utime(outfname, &tb)) {
931 fprintf(stderr, "Can't set timestamp %s\n", outfname);
935 if (unlink(origfname)){
939 if (rename(outfname, origfname)) {
941 fprintf(stderr, "Can't rename %s to %s\n",
942 outfname, origfname);
950 #ifdef EASYWIN /*Easy Win */
951 if (file_out == FALSE)
952 scanf("%d",&end_check);
955 #else /* for Other OS */
956 if (file_out == TRUE)
961 #endif /* WIN32DLL */
988 {"katakana-hiragana","h3"},
995 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
996 {"internal-unicode", ""},
998 #ifdef UTF8_OUTPUT_ENABLE
1008 {"fb-subchar=", ""},
1010 #ifdef UTF8_INPUT_ENABLE
1011 {"utf8-input", "W"},
1012 {"utf16-input", "W16"},
1013 {"no-cp932ext", ""},
1014 {"no-best-fit-chars",""},
1016 #ifdef UNICODE_NORMALIZATION
1017 {"utf8mac-input", ""},
1026 #ifdef NUMCHAR_OPTION
1027 {"numchar-input", ""},
1033 #ifdef SHIFTJIS_CP932
1043 STATIC int option_mode = 0;
1050 unsigned char *p = NULL;
1051 unsigned char *cp_back = NULL;
1052 unsigned char codeset[32];
1056 while(*cp && *cp++!='-');
1057 while (*cp || cp_back) {
1065 case '-': /* literal options */
1066 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1070 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1071 p = (unsigned char *)long_option[i].name;
1072 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1073 if (*p == cp[j] || cp[j] == ' '){
1080 while(*cp && *cp != SPACE && cp++);
1081 if (long_option[i].alias[0]){
1083 cp = (unsigned char *)long_option[i].alias;
1085 if (strcmp(long_option[i].name, "ic=") == 0){
1086 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1087 codeset[i] = nkf_toupper(p[i]);
1090 if(strcmp(codeset, "ISO-2022-JP") == 0){
1091 input_f = JIS_INPUT;
1092 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1093 input_f = SJIS_INPUT;
1094 if (x0201_f==NO_X0201) x0201_f=TRUE;
1095 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1096 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1097 strcmp(codeset, "CP932") == 0 ||
1098 strcmp(codeset, "MS932") == 0){
1099 input_f = SJIS_INPUT;
1101 #ifdef SHIFTJIS_CP932
1104 #ifdef UTF8_OUTPUT_ENABLE
1105 ms_ucs_map_f = UCS_MAP_CP932;
1107 }else if(strcmp(codeset, "EUCJP") == 0 ||
1108 strcmp(codeset, "EUC-JP") == 0){
1109 input_f = JIS_INPUT;
1110 }else if(strcmp(codeset, "CP51932") == 0){
1111 input_f = JIS_INPUT;
1113 #ifdef SHIFTJIS_CP932
1116 #ifdef UTF8_OUTPUT_ENABLE
1117 ms_ucs_map_f = UCS_MAP_CP932;
1119 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1120 strcmp(codeset, "EUCJP-MS") == 0 ||
1121 strcmp(codeset, "EUCJPMS") == 0){
1122 input_f = JIS_INPUT;
1124 #ifdef SHIFTJIS_CP932
1127 #ifdef UTF8_OUTPUT_ENABLE
1128 ms_ucs_map_f = UCS_MAP_MS;
1130 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1131 strcmp(codeset, "EUCJP-ASCII") == 0){
1132 input_f = JIS_INPUT;
1134 #ifdef SHIFTJIS_CP932
1137 #ifdef UTF8_OUTPUT_ENABLE
1138 ms_ucs_map_f = UCS_MAP_ASCII;
1140 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0){
1141 input_f = SJIS_INPUT;
1143 #ifdef SHIFTJIS_CP932
1147 if (x0201_f==NO_X0201) x0201_f=TRUE;
1148 }else if(strcmp(codeset, "EUC-JISX0213") == 0){
1149 input_f = JIS_INPUT;
1152 #ifdef SHIFTJIS_CP932
1156 #ifdef UTF8_INPUT_ENABLE
1157 }else if(strcmp(codeset, "UTF-8") == 0 ||
1158 strcmp(codeset, "UTF-8N") == 0 ||
1159 strcmp(codeset, "UTF-8-BOM") == 0){
1160 input_f = UTF8_INPUT;
1161 #ifdef UNICODE_NORMALIZATION
1162 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1163 strcmp(codeset, "UTF-8-MAC") == 0){
1164 input_f = UTF8_INPUT;
1167 }else if(strcmp(codeset, "UTF-16") == 0){
1168 input_f = UTF16BE_INPUT;
1169 utf16_mode = UTF16BE_INPUT;
1170 }else if(strcmp(codeset, "UTF-16BE") == 0 ||
1171 strcmp(codeset, "UTF-16BE-BOM") == 0){
1172 input_f = UTF16BE_INPUT;
1173 utf16_mode = UTF16BE_INPUT;
1174 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1175 strcmp(codeset, "UTF-16LE-BOM") == 0){
1176 input_f = UTF16LE_INPUT;
1177 utf16_mode = UTF16LE_INPUT;
1182 if (strcmp(long_option[i].name, "oc=") == 0){
1183 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1184 codeset[i] = nkf_toupper(p[i]);
1187 if(strcmp(codeset, "ISO-2022-JP") == 0){
1188 output_conv = j_oconv;
1189 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1190 output_conv = s_oconv;
1191 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1192 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1193 strcmp(codeset, "CP932") == 0 ||
1194 strcmp(codeset, "MS932") == 0){
1195 output_conv = s_oconv;
1197 #ifdef SHIFTJIS_CP932
1201 #ifdef UTF8_OUTPUT_ENABLE
1202 ms_ucs_map_f = UCS_MAP_CP932;
1204 }else if(strcmp(codeset, "EUCJP") == 0 ||
1205 strcmp(codeset, "EUC-JP") == 0){
1206 output_conv = e_oconv;
1207 }else if(strcmp(codeset, "CP51932") == 0){
1208 output_conv = e_oconv;
1210 #ifdef SHIFTJIS_CP932
1213 #ifdef UTF8_OUTPUT_ENABLE
1214 ms_ucs_map_f = UCS_MAP_CP932;
1216 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1217 strcmp(codeset, "EUCJP-MS") == 0 ||
1218 strcmp(codeset, "EUCJPMS") == 0){
1219 output_conv = e_oconv;
1224 #ifdef SHIFTJIS_CP932
1227 #ifdef UTF8_OUTPUT_ENABLE
1228 ms_ucs_map_f = UCS_MAP_MS;
1230 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1231 strcmp(codeset, "EUCJP-ASCII") == 0){
1232 output_conv = e_oconv;
1237 #ifdef SHIFTJIS_CP932
1240 #ifdef UTF8_OUTPUT_ENABLE
1241 ms_ucs_map_f = UCS_MAP_ASCII;
1243 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0){
1244 output_conv = s_oconv;
1246 #ifdef SHIFTJIS_CP932
1249 }else if(strcmp(codeset, "EUC-JISX0213") == 0){
1250 output_conv = e_oconv;
1255 #ifdef SHIFTJIS_CP932
1258 #ifdef UTF8_OUTPUT_ENABLE
1259 }else if(strcmp(codeset, "UTF-8") == 0){
1260 output_conv = w_oconv;
1261 }else if(strcmp(codeset, "UTF-8N") == 0){
1262 output_conv = w_oconv;
1264 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1265 output_conv = w_oconv;
1267 }else if(strcmp(codeset, "UTF-16BE") == 0){
1268 output_conv = w_oconv16;
1270 }else if(strcmp(codeset, "UTF-16") == 0 ||
1271 strcmp(codeset, "UTF-16BE-BOM") == 0){
1272 output_conv = w_oconv16;
1274 }else if(strcmp(codeset, "UTF-16LE") == 0){
1275 output_conv = w_oconv16;
1278 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1279 output_conv = w_oconv16;
1287 if (strcmp(long_option[i].name, "overwrite") == 0){
1294 if (strcmp(long_option[i].name, "cap-input") == 0){
1298 if (strcmp(long_option[i].name, "url-input") == 0){
1303 #ifdef NUMCHAR_OPTION
1304 if (strcmp(long_option[i].name, "numchar-input") == 0){
1310 if (strcmp(long_option[i].name, "no-output") == 0){
1314 if (strcmp(long_option[i].name, "debug") == 0){
1319 if (strcmp(long_option[i].name, "cp932") == 0){
1320 #ifdef SHIFTJIS_CP932
1324 #ifdef UTF8_OUTPUT_ENABLE
1325 ms_ucs_map_f = UCS_MAP_CP932;
1329 if (strcmp(long_option[i].name, "no-cp932") == 0){
1330 #ifdef SHIFTJIS_CP932
1334 #ifdef UTF8_OUTPUT_ENABLE
1335 ms_ucs_map_f = UCS_MAP_ASCII;
1339 #ifdef SHIFTJIS_CP932
1340 if (strcmp(long_option[i].name, "cp932inv") == 0){
1347 if (strcmp(long_option[i].name, "x0212") == 0){
1354 if (strcmp(long_option[i].name, "exec-in") == 0){
1358 if (strcmp(long_option[i].name, "exec-out") == 0){
1363 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1364 if (strcmp(long_option[i].name, "internal-unicode") == 0){
1365 internal_unicode_f = TRUE;
1368 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1369 no_cp932ext_f = TRUE;
1372 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1373 no_best_fit_chars_f = TRUE;
1376 if (strcmp(long_option[i].name, "fb-skip") == 0){
1377 encode_fallback = NULL;
1380 if (strcmp(long_option[i].name, "fb-html") == 0){
1381 encode_fallback = encode_fallback_html;
1384 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1385 encode_fallback = encode_fallback_xml;
1388 if (strcmp(long_option[i].name, "fb-java") == 0){
1389 encode_fallback = encode_fallback_java;
1392 if (strcmp(long_option[i].name, "fb-perl") == 0){
1393 encode_fallback = encode_fallback_perl;
1396 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1397 encode_fallback = encode_fallback_subchar;
1400 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1401 encode_fallback = encode_fallback_subchar;
1402 unicode_subchar = 0;
1404 /* decimal number */
1405 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1406 unicode_subchar *= 10;
1407 unicode_subchar += hex2bin(p[i]);
1409 }else if(p[1] == 'x' || p[1] == 'X'){
1410 /* hexadecimal number */
1411 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1412 unicode_subchar <<= 4;
1413 unicode_subchar |= hex2bin(p[i]);
1417 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1418 unicode_subchar *= 8;
1419 unicode_subchar += hex2bin(p[i]);
1422 w16e_conv(unicode_subchar, &i, &j);
1423 unicode_subchar = i<<8 | j;
1427 #ifdef UTF8_OUTPUT_ENABLE
1428 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1429 ms_ucs_map_f = UCS_MAP_MS;
1433 #ifdef UNICODE_NORMALIZATION
1434 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1435 input_f = UTF8_INPUT;
1440 if (strcmp(long_option[i].name, "prefix=") == 0){
1441 if (' ' < p[0] && p[0] < 128){
1442 for (i = 1; ' ' < p[i] && p[i] < 128; i++){
1443 prefix_table[p[i]] = p[0];
1450 case 'b': /* buffered mode */
1453 case 'u': /* non bufferd mode */
1456 case 't': /* transparent mode */
1459 case 'j': /* JIS output */
1461 output_conv = j_oconv;
1463 case 'e': /* AT&T EUC output */
1464 output_conv = e_oconv;
1466 case 's': /* SJIS output */
1467 output_conv = s_oconv;
1469 case 'l': /* ISO8859 Latin-1 support, no conversion */
1470 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1471 input_f = LATIN1_INPUT;
1473 case 'i': /* Kanji IN ESC-$-@/B */
1474 if (*cp=='@'||*cp=='B')
1475 kanji_intro = *cp++;
1477 case 'o': /* ASCII IN ESC-(-J/B */
1478 if (*cp=='J'||*cp=='B'||*cp=='H')
1479 ascii_intro = *cp++;
1483 bit:1 katakana->hiragana
1484 bit:2 hiragana->katakana
1486 if ('9'>= *cp && *cp>='0')
1487 hira_f |= (*cp++ -'0');
1494 #if defined(MSDOS) || defined(__OS2__)
1509 #ifdef UTF8_OUTPUT_ENABLE
1510 case 'w': /* UTF-8 output */
1511 if ('1'== cp[0] && '6'==cp[1]) {
1512 output_conv = w_oconv16; cp+=2;
1514 unicode_bom_f=2; cp++;
1517 unicode_bom_f=1; cp++;
1519 } else if (cp[0] == 'B') {
1520 unicode_bom_f=2; cp++;
1522 unicode_bom_f=1; cp++;
1525 } else if (cp[0] == '8') {
1526 output_conv = w_oconv; cp++;
1529 unicode_bom_f=1; cp++;
1532 output_conv = w_oconv;
1535 #ifdef UTF8_INPUT_ENABLE
1536 case 'W': /* UTF-8 input */
1537 if ('1'== cp[0] && '6'==cp[1]) {
1538 input_f = UTF16BE_INPUT;
1539 utf16_mode = UTF16BE_INPUT;
1543 input_f = UTF16LE_INPUT;
1544 utf16_mode = UTF16LE_INPUT;
1545 } else if (cp[0] == 'B') {
1547 input_f = UTF16BE_INPUT;
1548 utf16_mode = UTF16BE_INPUT;
1550 } else if (cp[0] == '8') {
1552 input_f = UTF8_INPUT;
1554 input_f = UTF8_INPUT;
1557 /* Input code assumption */
1558 case 'J': /* JIS input */
1559 case 'E': /* AT&T EUC input */
1560 input_f = JIS_INPUT;
1562 case 'S': /* MS Kanji input */
1563 input_f = SJIS_INPUT;
1564 if (x0201_f==NO_X0201) x0201_f=TRUE;
1566 case 'Z': /* Convert X0208 alphabet to asii */
1567 /* bit:0 Convert X0208
1568 bit:1 Convert Kankaku to one space
1569 bit:2 Convert Kankaku to two spaces
1570 bit:3 Convert HTML Entity
1572 if ('9'>= *cp && *cp>='0')
1573 alpha_f |= 1<<(*cp++ -'0');
1577 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1578 x0201_f = FALSE; /* No X0201->X0208 conversion */
1580 ESC-(-I in JIS, EUC, MS Kanji
1581 SI/SO in JIS, EUC, MS Kanji
1582 SSO in EUC, JIS, not in MS Kanji
1583 MS Kanji (0xa0-0xdf)
1585 ESC-(-I in JIS (0x20-0x5f)
1586 SSO in EUC (0xa0-0xdf)
1587 0xa0-0xd in MS Kanji (0xa0-0xdf)
1590 case 'X': /* Assume X0201 kana */
1591 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1594 case 'F': /* prserve new lines */
1595 fold_preserve_f = TRUE;
1596 case 'f': /* folding -f60 or -f */
1599 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1601 fold_len += *cp++ - '0';
1603 if (!(0<fold_len && fold_len<BUFSIZ))
1604 fold_len = DEFAULT_FOLD;
1608 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1610 fold_margin += *cp++ - '0';
1614 case 'm': /* MIME support */
1615 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1616 if (*cp=='B'||*cp=='Q') {
1617 mime_decode_mode = *cp++;
1618 mimebuf_f = FIXED_MIME;
1619 } else if (*cp=='N') {
1620 mime_f = TRUE; cp++;
1621 } else if (*cp=='S') {
1622 mime_f = STRICT_MIME; cp++;
1623 } else if (*cp=='0') {
1624 mime_decode_f = FALSE;
1625 mime_f = FALSE; cp++;
1628 case 'M': /* MIME output */
1631 mimeout_f = FIXED_MIME; cp++;
1632 } else if (*cp=='Q') {
1634 mimeout_f = FIXED_MIME; cp++;
1639 case 'B': /* Broken JIS support */
1641 bit:1 allow any x on ESC-(-x or ESC-$-x
1642 bit:2 reset to ascii on NL
1644 if ('9'>= *cp && *cp>='0')
1645 broken_f |= 1<<(*cp++ -'0');
1650 case 'O':/* for Output file */
1654 case 'c':/* add cr code */
1657 case 'd':/* delete cr code */
1660 case 'I': /* ISO-2022-JP output */
1663 case 'L': /* line mode */
1664 if (*cp=='u') { /* unix */
1665 crmode_f = NL; cp++;
1666 } else if (*cp=='m') { /* mac */
1667 crmode_f = CR; cp++;
1668 } else if (*cp=='w') { /* windows */
1669 crmode_f = CRLF; cp++;
1670 } else if (*cp=='0') { /* no conversion */
1680 /* multiple options in a single string are allowed for the Perl module */
1681 while(*cp && *cp++!='-');
1684 /* bogus option but ignored */
1690 #ifdef ANSI_C_PROTOTYPE
1691 struct input_code * find_inputcode_byfunc(int (*iconv_func)(int c2,int c1,int c0))
1693 struct input_code * find_inputcode_byfunc(iconv_func)
1694 int (*iconv_func)();
1698 struct input_code *p = input_code_list;
1700 if (iconv_func == p->iconv_func){
1709 #ifdef ANSI_C_PROTOTYPE
1710 void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
1712 void set_iconv(f, iconv_func)
1714 int (*iconv_func)();
1717 #ifdef INPUT_CODE_FIX
1725 #ifdef INPUT_CODE_FIX
1726 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1732 if (estab_f && iconv_for_check != iconv){
1733 struct input_code *p = find_inputcode_byfunc(iconv);
1735 set_input_codename(p->name);
1736 debug(input_codename);
1738 iconv_for_check = iconv;
/* Penalty bits OR'ed into an input_code's score during encoding detection. */
1743 #define SCORE_L2 (1) /* JIS level-2 kanji */
1744 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1745 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1746 #ifdef SHIFTJIS_CP932
1747 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character remapped via CP932 */
1748 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1750 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1752 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
1753 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1755 #define SCORE_INIT (SCORE_iMIME)
1757 const int score_table_A0[] = {
1760 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1761 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1764 const int score_table_F0[] = {
1765 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1766 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1767 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1768 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
1771 void set_code_score(ptr, score)
1772 struct input_code *ptr;
1776 ptr->score |= score;
1780 void clr_code_score(ptr, score)
1781 struct input_code *ptr;
1785 ptr->score &= ~score;
1789 void code_score(ptr)
1790 struct input_code *ptr;
1792 int c2 = ptr->buf[0];
1793 #ifdef UTF8_OUTPUT_ENABLE
1794 int c1 = ptr->buf[1];
1797 set_code_score(ptr, SCORE_ERROR);
1798 }else if (c2 == SSO){
1799 set_code_score(ptr, SCORE_KANA);
1800 #ifdef UTF8_OUTPUT_ENABLE
1801 }else if (!e2w_conv(c2, c1)){
1802 set_code_score(ptr, SCORE_NO_EXIST);
1804 }else if ((c2 & 0x70) == 0x20){
1805 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1806 }else if ((c2 & 0x70) == 0x70){
1807 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1808 }else if ((c2 & 0x70) >= 0x50){
1809 set_code_score(ptr, SCORE_L2);
1813 void status_disable(ptr)
1814 struct input_code *ptr;
1819 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
1822 void status_push_ch(ptr, c)
1823 struct input_code *ptr;
1826 ptr->buf[ptr->index++] = c;
1829 void status_clear(ptr)
1830 struct input_code *ptr;
1836 void status_reset(ptr)
1837 struct input_code *ptr;
1840 ptr->score = SCORE_INIT;
1843 void status_reinit(ptr)
1844 struct input_code *ptr;
1847 ptr->_file_stat = 0;
1850 void status_check(ptr, c)
1851 struct input_code *ptr;
1854 if (c <= DEL && estab_f){
1859 void s_status(ptr, c)
1860 struct input_code *ptr;
1865 status_check(ptr, c);
1870 #ifdef NUMCHAR_OPTION
1871 }else if ((c & CLASS_MASK) == CLASS_UTF16){
1874 }else if (0xa1 <= c && c <= 0xdf){
1875 status_push_ch(ptr, SSO);
1876 status_push_ch(ptr, c);
1879 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
1881 status_push_ch(ptr, c);
1882 #ifdef SHIFTJIS_CP932
1884 && CP932_TABLE_BEGIN <= c && c <= CP932_TABLE_END){
1886 status_push_ch(ptr, c);
1887 #endif /* SHIFTJIS_CP932 */
1889 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
1891 status_push_ch(ptr, c);
1892 #endif /* X0212_ENABLE */
1894 status_disable(ptr);
1898 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
1899 status_push_ch(ptr, c);
1900 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
1904 status_disable(ptr);
1908 #ifdef SHIFTJIS_CP932
1909 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
1910 status_push_ch(ptr, c);
1911 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
1912 set_code_score(ptr, SCORE_CP932);
1917 #endif /* SHIFTJIS_CP932 */
1918 #ifndef X0212_ENABLE
1919 status_disable(ptr);
1925 void e_status(ptr, c)
1926 struct input_code *ptr;
1931 status_check(ptr, c);
1936 #ifdef NUMCHAR_OPTION
1937 }else if ((c & CLASS_MASK) == CLASS_UTF16){
1940 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
1942 status_push_ch(ptr, c);
1944 }else if (0x8f == c){
1946 status_push_ch(ptr, c);
1947 #endif /* X0212_ENABLE */
1949 status_disable(ptr);
1953 if (0xa1 <= c && c <= 0xfe){
1954 status_push_ch(ptr, c);
1958 status_disable(ptr);
1963 if (0xa1 <= c && c <= 0xfe){
1965 status_push_ch(ptr, c);
1967 status_disable(ptr);
1969 #endif /* X0212_ENABLE */
1973 #ifdef UTF8_INPUT_ENABLE
1974 void w16_status(ptr, c)
1975 struct input_code *ptr;
1982 if (ptr->_file_stat == 0){
1983 if (c == 0xfe || c == 0xff){
1985 status_push_ch(ptr, c);
1986 ptr->_file_stat = 1;
1988 status_disable(ptr);
1989 ptr->_file_stat = -1;
1991 }else if (ptr->_file_stat > 0){
1993 status_push_ch(ptr, c);
1994 }else if (ptr->_file_stat < 0){
1995 status_disable(ptr);
2001 status_disable(ptr);
2002 ptr->_file_stat = -1;
2004 status_push_ch(ptr, c);
2011 if (ptr->stat != c && (c == 0xfe || c == 0xff)){
2012 status_push_ch(ptr, c);
2015 status_disable(ptr);
2016 ptr->_file_stat = -1;
2022 void w_status(ptr, c)
2023 struct input_code *ptr;
2028 status_check(ptr, c);
2033 #ifdef NUMCHAR_OPTION
2034 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2037 }else if (0xc0 <= c && c <= 0xdf){
2039 status_push_ch(ptr, c);
2040 }else if (0xe0 <= c && c <= 0xef){
2042 status_push_ch(ptr, c);
2044 status_disable(ptr);
2049 if (0x80 <= c && c <= 0xbf){
2050 status_push_ch(ptr, c);
2051 if (ptr->index > ptr->stat){
2052 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2053 && ptr->buf[2] == 0xbf);
2054 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2055 &ptr->buf[0], &ptr->buf[1]);
2062 status_disable(ptr);
2073 int action_flag = 1;
2074 struct input_code *result = 0;
2075 struct input_code *p = input_code_list;
2077 (p->status_func)(p, c);
2080 }else if(p->stat == 0){
2091 if (result && !estab_f){
2092 set_iconv(TRUE, result->iconv_func);
2093 }else if (c <= DEL){
2094 struct input_code *ptr = input_code_list;
2109 return std_gc_buf[--std_gc_ndx];
2120 if (std_gc_ndx == STD_GC_BUFSIZE){
2123 std_gc_buf[std_gc_ndx++] = c;
2137 #if !defined(PERL_XS) && !defined(WIN32DLL)
2144 while ((c = (*i_getc)(f)) != EOF)
2153 oconv = output_conv;
2156 /* replace continuation module, from output side */
2158 /* output redirection */
2160 if (noout_f || guess_f){
2167 if (mimeout_f == TRUE) {
2168 o_base64conv = oconv; oconv = base64_conv;
2170 /* base64_count = 0; */
2174 o_crconv = oconv; oconv = cr_conv;
2177 o_rot_conv = oconv; oconv = rot_conv;
2180 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2183 o_hira_conv = oconv; oconv = hira_conv;
2186 o_fconv = oconv; oconv = fold_conv;
2189 if (alpha_f || x0201_f) {
2190 o_zconv = oconv; oconv = z_conv;
2194 i_ungetc = std_ungetc;
2195 /* input redirection */
2198 i_cgetc = i_getc; i_getc = cap_getc;
2199 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2202 i_ugetc = i_getc; i_getc = url_getc;
2203 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2206 #ifdef NUMCHAR_OPTION
2208 i_ngetc = i_getc; i_getc = numchar_getc;
2209 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2212 #ifdef UNICODE_NORMALIZATION
2213 if (nfc_f && input_f == UTF8_INPUT){
2214 i_nfc_getc = i_getc; i_getc = nfc_getc;
2215 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2218 if (mime_f && mimebuf_f==FIXED_MIME) {
2219 i_mgetc = i_getc; i_getc = mime_getc;
2220 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2223 i_bgetc = i_getc; i_getc = broken_getc;
2224 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2226 if (input_f == JIS_INPUT || input_f == LATIN1_INPUT) {
2227 set_iconv(-TRUE, e_iconv);
2228 } else if (input_f == SJIS_INPUT) {
2229 set_iconv(-TRUE, s_iconv);
2230 #ifdef UTF8_INPUT_ENABLE
2231 } else if (input_f == UTF8_INPUT) {
2232 set_iconv(-TRUE, w_iconv);
2233 } else if (input_f == UTF16BE_INPUT) {
2234 set_iconv(-TRUE, w_iconv16);
2235 } else if (input_f == UTF16LE_INPUT) {
2236 set_iconv(-TRUE, w_iconv16);
2239 set_iconv(FALSE, e_iconv);
2243 struct input_code *p = input_code_list;
2251 Conversion main loop. Code detection only.
2260 int is_8bit = FALSE;
2262 module_connection();
2265 if(input_f == SJIS_INPUT
2266 #ifdef UTF8_INPUT_ENABLE
2267 || input_f == UTF8_INPUT || input_f == UTF16BE_INPUT || input_f == UTF16LE_INPUT
2275 output_mode = ASCII;
2278 #define NEXT continue /* no output, get next */
2279 #define SEND ; /* output c1 and c2, get next */
2280 #define LAST break /* end of loop, go closing */
2282 while ((c1 = (*i_getc)(f)) != EOF) {
2283 #ifdef INPUT_CODE_FIX
2290 /* in case of 8th bit is on */
2291 if (!estab_f&&!mime_decode_mode) {
2292 /* in case of not established yet */
2293 /* It is still ambiguous */
2294 if (h_conv(f, c2, c1)==EOF)
2300 /* in case of already established */
2302 /* ignore bogus code */
2308 /* second byte, 7 bit code */
2309 /* it might be shifted kanji */
2310 if ((c1 == DEL) || (c1 <= SPACE)) {
2311 /* ignore bogus first code */
2319 #ifdef UTF8_INPUT_ENABLE
2328 #ifdef NUMCHAR_OPTION
2329 } else if ((c1 & CLASS_MASK) == CLASS_UTF16){
2332 } else if (c1 > DEL) {
2334 if (!estab_f && !iso8859_f) {
2335 /* not established yet */
2336 if (!is_8bit) is_8bit = TRUE;
2339 } else { /* estab_f==TRUE */
2344 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2345 /* SJIS X0201 Case... */
2346 if(iso2022jp_f && x0201_f==NO_X0201) {
2347 (*oconv)(GETA1, GETA2);
2354 } else if (c1==SSO && iconv != s_iconv) {
2355 /* EUC X0201 Case */
2356 c1 = (*i_getc)(f); /* skip SSO */
2358 if (SSP<=c1 && c1<0xe0) {
2359 if(iso2022jp_f && x0201_f==NO_X0201) {
2360 (*oconv)(GETA1, GETA2);
2367 } else { /* bogus code, skip SSO and one byte */
2371 /* already established */
2376 } else if ((c1 > SPACE) && (c1 != DEL)) {
2377 /* in case of Roman characters */
2379 /* output 1 shifted byte */
2383 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2384 /* output 1 shifted byte */
2385 if(iso2022jp_f && x0201_f==NO_X0201) {
2386 (*oconv)(GETA1, GETA2);
2393 /* look like bogus code */
2396 } else if (input_mode == X0208) {
2397 /* in case of Kanji shifted */
2400 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2401 /* Check MIME code */
2402 if ((c1 = (*i_getc)(f)) == EOF) {
2405 } else if (c1 == '?') {
2406 /* =? is mime conversion start sequence */
2407 if(mime_f == STRICT_MIME) {
2408 /* check in real detail */
2409 if (mime_begin_strict(f) == EOF)
2413 } else if (mime_begin(f) == EOF)
2423 /* normal ASCII code */
2426 } else if (!is_8bit && c1 == SI) {
2429 } else if (!is_8bit && c1 == SO) {
2432 } else if (!is_8bit && c1 == ESC ) {
2433 if ((c1 = (*i_getc)(f)) == EOF) {
2434 /* (*oconv)(0, ESC); don't send bogus code */
2436 } else if (c1 == '$') {
2437 if ((c1 = (*i_getc)(f)) == EOF) {
2439 (*oconv)(0, ESC); don't send bogus code
2440 (*oconv)(0, '$'); */
2442 } else if (c1 == '@'|| c1 == 'B') {
2443 /* This is kanji introduction */
2446 set_input_codename("ISO-2022-JP");
2448 debug(input_codename);
2451 } else if (c1 == '(') {
2452 if ((c1 = (*i_getc)(f)) == EOF) {
2453 /* don't send bogus code
2459 } else if (c1 == '@'|| c1 == 'B') {
2460 /* This is kanji introduction */
2465 } else if (c1 == 'D'){
2469 #endif /* X0212_ENABLE */
2471 /* could be some special code */
2478 } else if (broken_f&0x2) {
2479 /* accept any ESC-(-x as broken code ... */
2489 } else if (c1 == '(') {
2490 if ((c1 = (*i_getc)(f)) == EOF) {
2491 /* don't send bogus code
2493 (*oconv)(0, '('); */
2497 /* This is X0201 kana introduction */
2498 input_mode = X0201; shift_mode = X0201;
2500 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2501 /* This is ASCII / JIS-Roman introduction (ESC ( B, J or H) */
2502 input_mode = ASCII; shift_mode = FALSE;
2504 } else if (broken_f&0x2) {
2505 input_mode = ASCII; shift_mode = FALSE;
2510 /* maintain various input_mode here */
2514 } else if ( c1 == 'N' || c1 == 'n' ){
2516 c3 = (*i_getc)(f); /* skip SS2 */
2517 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2532 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2533 input_mode = ASCII; set_iconv(FALSE, 0);
2535 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2536 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2544 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2545 if ((c1=(*i_getc)(f))!=EOF) {
2549 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2565 if (input_mode == X0208)
2566 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2568 else if (input_mode == X0212)
2569 (*oconv)((0x8f << 8) | c2, c1);
2570 #endif /* X0212_ENABLE */
2571 else if (input_mode)
2572 (*oconv)(input_mode, c1); /* other special case */
2573 else if ((*iconv)(c2, c1, 0) < 0){ /* can be EUC/SJIS */
2574 int c0 = (*i_getc)(f);
2577 (*iconv)(c2, c1, c0);
2583 /* goto next_word */
2587 (*iconv)(EOF, 0, 0);
2588 if (!is_inputcode_set)
2591 struct input_code *p = input_code_list;
2592 struct input_code *result = p;
2594 if (p->score < result->score) result = p;
2597 set_input_codename(result->name);
2612 /** it must NOT be in the kanji shift sequence */
2613 /** it must NOT be written in JIS7 */
2614 /** and it must be after 2 byte 8bit code */
2621 while ((c1 = (*i_getc)(f)) != EOF) {
2627 if (push_hold_buf(c1) == EOF || estab_f){
2633 struct input_code *p = input_code_list;
2634 struct input_code *result = p;
2639 if (p->score < result->score){
2644 set_iconv(FALSE, result->iconv_func);
2649 ** 1) EOF is detected, or
2650 ** 2) Code is established, or
2651 ** 3) Buffer is FULL (but last word is pushed)
2653 ** in 1) and 3) cases, we continue to use
2654 ** Kanji codes by oconv and leave estab_f unchanged.
2659 while (wc < hold_count){
2660 c2 = hold_buf[wc++];
2662 #ifdef NUMCHAR_OPTION
2663 || (c2 & CLASS_MASK) == CLASS_UTF16
2668 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2669 (*iconv)(X0201, c2, 0);
2672 if (wc < hold_count){
2673 c1 = hold_buf[wc++];
2682 if ((*iconv)(c2, c1, 0) < 0){
2684 if (wc < hold_count){
2685 c0 = hold_buf[wc++];
2694 (*iconv)(c2, c1, c0);
2707 if (hold_count >= HOLD_SIZE*2)
2709 hold_buf[hold_count++] = c2;
2710 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
2713 const int shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
2715 int s2e_conv(c2, c1, p2, p1)
2719 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
2722 #ifdef SHIFTJIS_CP932
2723 if (cp51932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
2724 extern const unsigned short shiftjis_cp932[3][189];
2725 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
2731 #endif /* SHIFTJIS_CP932 */
2733 if (!x0213_f && x0212_f && 0xfa <= c2 && c2 <= 0xfc){
2734 extern const unsigned short shiftjis_x0212[3][189];
2735 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
2738 c2 = (0x8f << 8) | (val >> 8);
2751 if(x0213_f && c2 >= 0xF0){
2752 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
2753 c2 = 0x8F20 + shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
2754 }else{ /* 78<=k<=94 */
2755 c2 = 0x8F00 | (c2 * 2 - 0x17B);
2756 if (0x9E < c1) c2++;
2759 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
2760 if (0x9E < c1) c2++;
2763 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
2771 c2 = x0212_unshift(c2);
2786 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2789 int ret = s2e_conv(c2, c1, &c2, &c1);
2790 if (ret) return ret;
2804 }else if (c2 == 0x8f){
2808 c2 = (c2 << 8) | (c1 & 0x7f);
2810 #ifdef SHIFTJIS_CP932
2813 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2814 s2e_conv(s2, s1, &c2, &c1);
2815 if ((c2 & 0xff00) == 0){
2821 #endif /* SHIFTJIS_CP932 */
2822 #endif /* X0212_ENABLE */
2823 } else if (c2 == SSO){
2826 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2836 #ifdef UTF8_INPUT_ENABLE
2838 w2e_conv(c2, c1, c0, p2, p1)
2847 }else if (0xc0 <= c2 && c2 <= 0xef) {
2848 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2849 #ifdef NUMCHAR_OPTION
2852 if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
2867 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
2868 if(ignore_zwnbsp_f){
2869 ignore_zwnbsp_f = FALSE;
2870 if(c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
2874 if (c2 == 0) /* 0x00-0x7f */
2875 c1 &= 0x7F; /* 1byte */
2877 if ((c2 & 0xe0) == 0xc0){ /* 0xc0-0xdf */
2879 if((c2 & 0xFE) == 0xC0 || c1 < 0x80 || 0xBF < c1) return 0;
2880 }else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
2881 return -1; /* 3bytes */
2883 else if (0xf0 <= c2)
2884 return 0; /* 4,5,6bytes */
2885 else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
2886 return 0; /* trail byte */
2890 /* must be 3bytes */
2892 if(c1 < 0xA0 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
2894 }else if(c2 == 0xED){
2895 if(c1 < 0x80 || 0x9F < c1 || c0 < 0x80 || 0xBF < c0)
2897 }else if((c2 & 0xf0) == 0xe0){
2898 if(c1 < 0x80 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
2902 if (c2 == 0 || c2 == EOF){
2903 #ifdef UTF8_OUTPUT_ENABLE
2904 } else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
2905 unsigned short val = 0;
2910 val = ww16_conv(c2, c1, c0);
2911 c2 = (val >> 8) & 0xff;
2915 ret = w2e_conv(c2, c1, c0, &c2, &c1);
2924 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
2926 w16w_conv(val, p2, p1, p0)
2934 }else if (val < 0x800){
2935 *p2 = 0xc0 | (val >> 6);
2936 *p1 = 0x80 | (val & 0x3f);
2939 *p2 = 0xe0 | (val >> 12);
2940 *p1 = 0x80 | ((val >> 6) & 0x3f);
2941 *p0 = 0x80 | (val & 0x3f);
2946 #ifdef UTF8_INPUT_ENABLE
2948 ww16_conv(c2, c1, c0)
2954 }else if (c2 >= 0xe0){
2955 val = (c2 & 0x0f) << 12;
2956 val |= (c1 & 0x3f) << 6;
2958 }else if (c2 >= 0xc0){
2959 val = (c2 & 0x1f) << 6;
2968 w16e_conv(val, p2, p1)
2979 w16w_conv(val, &c2, &c1, &c0);
2980 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2981 #ifdef NUMCHAR_OPTION
2984 *p1 = CLASS_UTF16 | val;
2993 #ifdef UTF8_INPUT_ENABLE
2995 w_iconv16(c2, c1, c0)
3000 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
3001 if(ignore_zwnbsp_f){
3002 ignore_zwnbsp_f = FALSE;
3003 if (c2==0376 && c1==0377){
3004 utf16_mode = UTF16BE_INPUT;
3006 }else if(c2==0377 && c1==0376){
3007 utf16_mode = UTF16LE_INPUT;
3011 if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
3013 tmp=c1; c1=c2; c2=tmp;
3015 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3018 }else if((c2>>3)==27){ /* surrogate pair */
3020 #ifdef UTF8_OUTPUT_ENABLE
3021 }else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
3023 }else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
3024 if (ret) return ret;
3030 unicode_to_jis_common(c2, c1, c0, p2, p1)
3034 extern const unsigned short *const utf8_to_euc_2bytes[];
3035 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3036 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3037 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3038 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3039 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3040 const unsigned short *const *pp;
3041 const unsigned short *const *const *ppp;
3042 STATIC const int no_best_fit_chars_table_C2[] =
3043 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3044 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3045 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3046 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3047 STATIC const int no_best_fit_chars_table_932_C2[] =
3048 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3049 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3050 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3051 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3052 STATIC const int no_best_fit_chars_table_932_C3[] =
3053 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3054 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3055 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3056 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3062 }else if(c2 < 0xe0){
3063 if(no_best_fit_chars_f){
3064 if(ms_ucs_map_f == UCS_MAP_CP932){
3067 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3070 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3074 if(c2 == 0xC2 && no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3078 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3079 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3081 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3083 if(no_best_fit_chars_f){
3084 if(ms_ucs_map_f == UCS_MAP_CP932){
3085 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3086 }else if(ms_ucs_map_f == UCS_MAP_MS){
3091 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3094 if(c0 == 0x92) return 1;
3099 if(c1 == 0x80 || c0 == 0x9C) return 1;
3107 if(c0 == 0x95) return 1;
3110 if(c0 == 0xA5) return 1;
3117 if(c0 == 0x8D) return 1;
3120 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3128 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3129 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3131 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3137 w_iconv_common(c1, c0, pp, psize, p2, p1)
3139 const unsigned short *const *pp;
3144 const unsigned short *p;
3147 if (pp == 0) return 1;
3150 if (c1 < 0 || psize <= c1) return 1;
3152 if (p == 0) return 1;
3155 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3157 if (val == 0) return 1;
3158 if (no_cp932ext_f && (
3159 (val>>8) == 0x2D || /* NEC special characters */
3160 val > 0xF300 /* NEC special characters */
3168 if (c2 == SO) c2 = X0201;
3176 nkf_each_char_to_hex(f, c)
3177 void (*f)PROTO((int c2,int c1));
3180 const char *hex = "0123456789ABCDEF";
3186 (*f)(0, hex[(c>>shift)&0xF]);
3197 encode_fallback_html(c)
3204 (*oconv)(0, 0x30+(c/1000000)%10);
3206 (*oconv)(0, 0x30+(c/100000 )%10);
3208 (*oconv)(0, 0x30+(c/10000 )%10);
3210 (*oconv)(0, 0x30+(c/1000 )%10);
3212 (*oconv)(0, 0x30+(c/100 )%10);
3214 (*oconv)(0, 0x30+(c/10 )%10);
3216 (*oconv)(0, 0x30+ c %10);
3222 encode_fallback_xml(c)
3228 nkf_each_char_to_hex(oconv, c);
3234 encode_fallback_java(c)
3237 const char *hex = "0123456789ABCDEF";
3239 if((c&0x00FFFFFF) > 0xFFFF){
3243 (*oconv)(0, hex[(c>>20)&0xF]);
3244 (*oconv)(0, hex[(c>>16)&0xF]);
3248 (*oconv)(0, hex[(c>>12)&0xF]);
3249 (*oconv)(0, hex[(c>> 8)&0xF]);
3250 (*oconv)(0, hex[(c>> 4)&0xF]);
3251 (*oconv)(0, hex[ c &0xF]);
3256 encode_fallback_perl(c)
3262 nkf_each_char_to_hex(oconv, c);
3268 encode_fallback_subchar(c)
3271 c = unicode_subchar;
3272 (*oconv)((c>>8)&0xFF, c&0xFF);
3278 (*oconv)(0, (c>>shift)&0xFF);
3289 #ifdef UTF8_OUTPUT_ENABLE
3294 extern const unsigned short euc_to_utf8_1byte[];
3295 extern const unsigned short *const euc_to_utf8_2bytes[];
3296 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3297 const unsigned short *p;
3300 p = euc_to_utf8_1byte;
3302 } else if (c2 >> 8 == 0x8f){
3303 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == 0x8F22 && c1 == 0x43){
3306 extern const unsigned short *const x0212_to_utf8_2bytes[];
3307 c2 = (c2&0x7f) - 0x21;
3308 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3309 p = x0212_to_utf8_2bytes[c2];
3315 c2 = (c2&0x7f) - 0x21;
3316 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3317 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3322 c1 = (c1 & 0x7f) - 0x21;
3323 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3340 if (unicode_bom_f==2) {
3347 #ifdef NUMCHAR_OPTION
3348 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3349 w16w_conv(c1, &c2, &c1, &c0);
3353 if (c0) (*o_putc)(c0);
3360 output_mode = ASCII;
3362 } else if (c2 == ISO8859_1) {
3363 output_mode = ISO8859_1;
3364 (*o_putc)(c1 | 0x080);
3367 #ifdef UTF8_INPUT_ENABLE
3368 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16))
3369 val = ((c2<<8)&0xff00) + c1;
3372 val = e2w_conv(c2, c1);
3374 w16w_conv(val, &c2, &c1, &c0);
3378 if (c0) (*o_putc)(c0);
3394 if (unicode_bom_f==2) {
3396 (*o_putc)((unsigned char)'\377');
3400 (*o_putc)((unsigned char)'\377');
3405 #ifdef UTF8_INPUT_ENABLE
3406 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16)){
3409 if (c2 == ISO8859_1) {
3412 #ifdef NUMCHAR_OPTION
3413 } else if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16) {
3414 c2 = (c1 >> 8) & 0xff;
3418 unsigned short val = e2w_conv(c2, c1);
3419 c2 = (val >> 8) & 0xff;
3438 #ifdef NUMCHAR_OPTION
3439 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3440 w16e_conv(c1, &c2, &c1);
3441 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3442 if(encode_fallback)(*encode_fallback)(c1);
3450 } else if (c2 == 0) {
3451 output_mode = ASCII;
3453 } else if (c2 == X0201) {
3454 output_mode = JAPANESE_EUC;
3455 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3456 } else if (c2 == ISO8859_1) {
3457 output_mode = ISO8859_1;
3458 (*o_putc)(c1 | 0x080);
3460 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3461 output_mode = JAPANESE_EUC;
3462 #ifdef SHIFTJIS_CP932
3465 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3466 s2e_conv(s2, s1, &c2, &c1);
3471 output_mode = ASCII;
3473 }else if ((c2 & 0xff00) >> 8 == 0x8f){
3476 (*o_putc)((c2 & 0x7f) | 0x080);
3477 (*o_putc)(c1 | 0x080);
3480 (*o_putc)((c2 & 0x7f) | 0x080);
3481 (*o_putc)(c1 | 0x080);
3485 if ((c1<0x21 || 0x7e<c1) ||
3486 (c2<0x21 || 0x7e<c2)) {
3487 set_iconv(FALSE, 0);
3488 return; /* too late to rescue this char */
3490 output_mode = JAPANESE_EUC;
3491 (*o_putc)(c2 | 0x080);
3492 (*o_putc)(c1 | 0x080);
3502 if ((ret & 0xff00) == 0x8f00){
3503 if (0x75 <= c && c <= 0x7f){
3504 ret = c + (0x109 - 0x75);
3507 if (0x75 <= c && c <= 0x7f){
3508 ret = c + (0x113 - 0x75);
3515 int x0212_unshift(c)
3519 if (0x7f <= c && c <= 0x88){
3520 ret = c + (0x75 - 0x7f);
3521 }else if (0x89 <= c && c <= 0x92){
3522 ret = (0x8f << 8) | 0x80 | (c + (0x75 - 0x89));
3526 #endif /* X0212_ENABLE */
3529 e2s_conv(c2, c1, p2, p1)
3530 int c2, c1, *p2, *p1;
3533 if ((c2 & 0xff00) == 0x8f00){
3536 if((0x21 <= ndx && ndx <= 0x2F)){
3537 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3538 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3540 }else if(0x6E <= ndx && ndx <= 0x7E){
3541 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3542 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3548 else if(0x21 <= ndx && ndx <= 0x7e){
3550 const unsigned short *ptr;
3551 extern const unsigned short *const x0212_shiftjis[];
3553 ptr = x0212_shiftjis[ndx - 0x21];
3555 val = ptr[(c1 & 0x7f) - 0x21];
3564 c2 = x0212_shift(c2);
3566 #endif /* X0212_ENABLE */
3568 if(0x7F < c2) return 1;
3569 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3570 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3579 #ifdef NUMCHAR_OPTION
3580 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3581 w16e_conv(c1, &c2, &c1);
3582 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3583 if(encode_fallback)(*encode_fallback)(c1);
3591 } else if (c2 == 0) {
3592 output_mode = ASCII;
3594 } else if (c2 == X0201) {
3595 output_mode = SHIFT_JIS;
3597 } else if (c2 == ISO8859_1) {
3598 output_mode = ISO8859_1;
3599 (*o_putc)(c1 | 0x080);
3601 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3602 output_mode = SHIFT_JIS;
3603 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3609 if ((c1<0x20 || 0x7e<c1) ||
3610 (c2<0x20 || 0x7e<c2)) {
3611 set_iconv(FALSE, 0);
3612 return; /* too late to rescue this char */
3614 output_mode = SHIFT_JIS;
3615 e2s_conv(c2, c1, &c2, &c1);
3617 #ifdef SHIFTJIS_CP932
3619 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3620 extern const unsigned short cp932inv[2][189];
3621 int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3627 #endif /* SHIFTJIS_CP932 */
3630 if (prefix_table[(unsigned char)c1]){
3631 (*o_putc)(prefix_table[(unsigned char)c1]);
3642 #ifdef NUMCHAR_OPTION
3643 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3644 w16e_conv(c1, &c2, &c1);
3645 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3646 if(encode_fallback)(*encode_fallback)(c1);
3652 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3655 (*o_putc)(ascii_intro);
3656 output_mode = ASCII;
3660 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3662 if(output_mode!=X0213_2){
3663 output_mode = X0213_2;
3666 if(output_mode!=X0212){
3667 output_mode = X0212;
3673 (*o_putc)(output_mode & 0x7F);
3674 (*o_putc)(c2 & 0x7f);
3677 } else if (c2==X0201) {
3678 if (output_mode!=X0201) {
3679 output_mode = X0201;
3685 } else if (c2==ISO8859_1) {
3686 /* iso8859 introduction, or 8th bit on */
3687 /* Can we convert in 7bit form using ESC-'-'-A ?
3689 output_mode = ISO8859_1;
3691 } else if (c2 == 0) {
3692 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3695 (*o_putc)(ascii_intro);
3696 output_mode = ASCII;
3701 if (output_mode!=X0213_1) {
3702 output_mode = X0213_1;
3706 (*o_putc)(output_mode & 0x7F);
3708 }else if (output_mode != X0208) {
3709 output_mode = X0208;
3712 (*o_putc)(kanji_intro);
3714 if (c1<0x20 || 0x7e<c1)
3716 if (c2<0x20 || 0x7e<c2)
3728 mime_prechar(c2, c1);
3729 (*o_base64conv)(c2,c1);
3733 STATIC int broken_buf[3];
3734 STATIC int broken_counter = 0;
3735 STATIC int broken_last = 0;
3742 if (broken_counter>0) {
3743 return broken_buf[--broken_counter];
3746 if (c=='$' && broken_last != ESC
3747 && (input_mode==ASCII || input_mode==X0201)) {
3750 if (c1=='@'|| c1=='B') {
3751 broken_buf[0]=c1; broken_buf[1]=c;
3758 } else if (c=='(' && broken_last != ESC
3759 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
3762 if (c1=='J'|| c1=='B') {
3763 broken_buf[0]=c1; broken_buf[1]=c;
3781 if (broken_counter<2)
3782 broken_buf[broken_counter++]=c;
3786 STATIC int prev_cr = 0;
3794 if (! (c2==0&&c1==NL) ) {
3800 } else if (c1=='\r') {
3802 } else if (c1=='\n') {
3803 if (crmode_f==CRLF) {
3804 (*o_crconv)(0,'\r');
3805 } else if (crmode_f==CR) {
3806 (*o_crconv)(0,'\r');
3810 } else if (c1!='\032' || crmode_f!=NL){
3816 Return value of fold_conv()
3818 \n add newline and output char
3819 \r add newline and output nothing
3822 1 (or else) normal output
3824 fold state in prev (previous character)
3826 >0x80 Japanese (X0208/X0201)
3831 This fold algorithm does not preserve leading space in a line.
3832 This is the main difference from fmt.
3835 #define char_size(c2,c1) (c2?2:1)
3844 if (c1== '\r' && !fold_preserve_f) {
3845 fold_state=0; /* ignore cr */
3846 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
3848 fold_state=0; /* ignore cr */
3849 } else if (c1== BS) {
3850 if (f_line>0) f_line--;
3852 } else if (c2==EOF && f_line != 0) { /* close open last line */
3854 } else if ((c1=='\n' && !fold_preserve_f)
3855 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
3856 && fold_preserve_f)) {
3858 if (fold_preserve_f) {
3862 } else if ((f_prev == c1 && !fold_preserve_f)
3863 || (f_prev == '\n' && fold_preserve_f)
3864 ) { /* duplicate newline */
3867 fold_state = '\n'; /* output two newline */
3873 if (f_prev&0x80) { /* Japanese? */
3875 fold_state = 0; /* ignore given single newline */
3876 } else if (f_prev==' ') {
3880 if (++f_line<=fold_len)
3884 fold_state = '\r'; /* fold and output nothing */
3888 } else if (c1=='\f') {
3893 fold_state = '\n'; /* output newline and clear */
3894 } else if ( (c2==0 && c1==' ')||
3895 (c2==0 && c1=='\t')||
3896 (c2=='!'&& c1=='!')) {
3897 /* X0208 kankaku or ascii space */
3898 if (f_prev == ' ') {
3899 fold_state = 0; /* remove duplicate spaces */
3902 if (++f_line<=fold_len)
3903 fold_state = ' '; /* output ASCII space only */
3905 f_prev = ' '; f_line = 0;
3906 fold_state = '\r'; /* fold and output nothing */
3910 prev0 = f_prev; /* we still need this one... , but almost done */
3912 if (c2 || c2==X0201)
3913 f_prev |= 0x80; /* this is Japanese */
3914 f_line += char_size(c2,c1);
3915 if (f_line<=fold_len) { /* normal case */
3918 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3919 f_line = char_size(c2,c1);
3920 fold_state = '\n'; /* We can't wait, do fold now */
3921 } else if (c2==X0201) {
3922 /* simple kinsoku rules: fold_state 1 means no folding before this character */
3923 if (c1==(0xde&0x7f)) fold_state = 1; /* dakuten (voiced sound mark) */
3924 else if (c1==(0xdf&0x7f)) fold_state = 1; /* handakuten (semi-voiced sound mark) */
3925 else if (c1==(0xa4&0x7f)) fold_state = 1; /* original comment: ideographic full stop; NOTE(review): confirm against the JIS X 0201 code chart */
3926 else if (c1==(0xa3&0x7f)) fold_state = 1; /* original comment: fullwidth comma; NOTE(review): confirm against the JIS X 0201 code chart */
3927 else if (c1==(0xa1&0x7f)) fold_state = 1; /* original comment: closing corner bracket; NOTE(review): confirm against the JIS X 0201 code chart */
3928 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3929 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3931 fold_state = '\n';/* add one new f_line before this character */
3934 fold_state = '\n';/* add one new f_line before this character */
3937 /* kinsoku point in ASCII */
3938 if ( c1==')'|| /* { [ ( */
3949 /* just after special */
3950 } else if (!is_alnum(prev0)) {
3951 f_line = char_size(c2,c1);
3953 } else if ((prev0==' ') || /* ignored new f_line */
3954 (prev0=='\n')|| /* ignored new f_line */
3955 (prev0&0x80)) { /* X0208 - ASCII */
3956 f_line = char_size(c2,c1);
3957 fold_state = '\n';/* add one new f_line before this character */
3959 fold_state = 1; /* default no fold in ASCII */
3963 if (c1=='"') fold_state = 1; /* ideographic comma */
3964 else if (c1=='#') fold_state = 1; /* ideographic full stop */
3965 else if (c1=='W') fold_state = 1; /* closing corner bracket */
3966 else if (c1=='K') fold_state = 1; /* fullwidth closing parenthesis */
3967 else if (c1=='$') fold_state = 1; /* fullwidth comma */
3968 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
3969 else if (c1=='\'') fold_state = 1; /* original comment shows a fullwidth plus sign; NOTE(review): every other label here is the JIS X 0208 character (0x21, c1), which for this byte would be a fullwidth colon - confirm */
3970 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
3971 else if (c1==')') fold_state = 1; /* fullwidth question mark */
3972 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
3973 else if (c1=='+') fold_state = 1; /* dakuten (voiced sound mark) */
3974 else if (c1==',') fold_state = 1; /* handakuten (semi-voiced sound mark) */
3975 /* any other character: default is no fold in kinsoku */
3978 f_line = char_size(c2,c1);
3979 /* add one new f_line before this character */
3982 f_line = char_size(c2,c1);
3984 /* add one new f_line before this character */
3989 /* terminator process */
3990 switch(fold_state) {
4009 int z_prev2=0,z_prev1=0;
4016 /* if (c2) c1 &= 0x7f; assertion */
4018 if (x0201_f && z_prev2==X0201) { /* an X0201 character is being held in z_prev1 */
4019 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark): emit the dv[] pair for the held character */
4021 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4023 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) and an ev[] entry exists */
4025 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4029 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4038 if (x0201_f && c2==X0201) {
4039 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4040 /* this character has a dv[] or ev[] entry: hold it and wait for a dakuten or handakuten */
4041 z_prev1 = c1; z_prev2 = c2;
4044 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4049 /* JISX0208 Alphabet */
4050 if (alpha_f && c2 == 0x23 ) {
4052 } else if (alpha_f && c2 == 0x21 ) {
4053 /* JISX0208 Kigou */
4058 } else if (alpha_f&0x4) {
4063 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4069 case '>': entity = "&gt;"; break; /* HTML named character references; emitted byte by byte via o_zconv */
4070 case '<': entity = "&lt;"; break;
4071 case '\"': entity = "&quot;"; break;
4072 case '&': entity = "&amp;"; break;
4075 while (*entity) (*o_zconv)(0, *entity++);
4085 #define rot13(c) ( \
4087 (c <= 'M') ? (c + 13): \
4088 (c <= 'Z') ? (c - 13): \
4090 (c <= 'm') ? (c + 13): \
4091 (c <= 'z') ? (c - 13): \
4095 #define rot47(c) ( \
4097 ( c <= 'O' ) ? (c + 47) : \
4098 ( c <= '~' ) ? (c - 47) : \
4106 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4112 (*o_rot_conv)(c2,c1);
4119 if ((hira_f & 1) && c2==0x25 && 0x20<c1 && c1<0x74) {
4121 } else if ((hira_f & 2) && c2==0x24 && 0x20<c1 && c1<0x74) {
4124 (*o_hira_conv)(c2,c1);
4129 iso2022jp_check_conv(c2,c1)
4132 STATIC const int range[RANGE_NUM_MAX][2] = {
4155 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4159 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4164 for (i = 0; i < RANGE_NUM_MAX; i++) {
4165 start = range[i][0];
4168 if (c >= start && c <= end) {
4173 (*o_iso2022jp_check_conv)(c2,c1);
4177 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4179 const unsigned char *mime_pattern[] = {
4180 (const unsigned char *)"\075?EUC-JP?B?",
4181 (const unsigned char *)"\075?SHIFT_JIS?B?",
4182 (const unsigned char *)"\075?ISO-8859-1?Q?",
4183 (const unsigned char *)"\075?ISO-8859-1?B?",
4184 (const unsigned char *)"\075?ISO-2022-JP?B?",
4185 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4186 #if defined(UTF8_INPUT_ENABLE)
4187 (const unsigned char *)"\075?UTF-8?B?",
4188 (const unsigned char *)"\075?UTF-8?Q?",
4190 (const unsigned char *)"\075?US-ASCII?Q?",
4195 /* Marker used to raise the priority of the matching input code */
4196 int (*mime_priority_func[])PROTO((int c2, int c1, int c0)) = {
4197 e_iconv, s_iconv, 0, 0, 0, 0,
4198 #if defined(UTF8_INPUT_ENABLE)
4204 const int mime_encode[] = {
4205 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4206 #if defined(UTF8_INPUT_ENABLE)
4213 const int mime_encode_method[] = {
4214 'B', 'B','Q', 'B', 'B', 'Q',
4215 #if defined(UTF8_INPUT_ENABLE)
4223 #define MAXRECOVER 20
4228 if (i_getc!=mime_getc) {
4229 i_mgetc = i_getc; i_getc = mime_getc;
4230 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4231 if(mime_f==STRICT_MIME) {
4232 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4233 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4239 unswitch_mime_getc()
4241 if(mime_f==STRICT_MIME) {
4242 i_mgetc = i_mgetc_buf;
4243 i_mungetc = i_mungetc_buf;
4246 i_ungetc = i_mungetc;
4247 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4248 mime_iconv_back = NULL;
4252 mime_begin_strict(f)
4257 const unsigned char *p,*q;
4258 int r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4260 mime_decode_mode = FALSE;
4261 /* =? has been checked */
4263 p = mime_pattern[j];
4266 for(i=2;p[i]>' ';i++) { /* start after the leading =? */
4267 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4268 /* pattern fails, try next one */
4270 while ((p = mime_pattern[++j])) {
4271 for(k=2;k<i;k++) /* assume length(p) > i */
4272 if (p[k]!=q[k]) break;
4273 if (k==i && nkf_toupper(c1)==p[k]) break;
4275 if (p) continue; /* found next one, continue */
4276 /* every pattern failed, output from recovery buffer */
4284 mime_decode_mode = p[i-2]; /* every pattern ends in ?B? or ?Q?, so p[i-2] is the encoding letter */
4286 mime_iconv_back = iconv;
4287 set_iconv(FALSE, mime_priority_func[j]);
4288 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4290 if (mime_decode_mode=='B') {
4291 mimebuf_f = unbuf_f;
4293 /* do MIME integrity check */
4294 return mime_integrity(f,mime_pattern[j]);
4306 /* We don't keep the EOF of the Fifo, because it contains ?= as
4307 a terminator. It was checked in mime_integrity. */
4308 return ((mimebuf_f)?
4309 (*i_mgetc_buf)(f):Fifo(mime_input++));
4313 mime_ungetc_buf(c,f)
4318 (*i_mungetc_buf)(c,f);
4320 Fifo(--mime_input)=c;
4331 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4332 /* re-read and convert again from mime_buffer. */
4334 /* =? has been checked */
4336 Fifo(mime_last++)='='; Fifo(mime_last++)='?'; /* put the already-checked =? into the Fifo */
4337 for(i=2;i<MAXRECOVER;i++) { /* start after the =? */
4338 /* Accept letters, digits, '-', '_' and whitespace, so a name broken by newlines still passes */
4339 c1 = (*i_getc)(f); Fifo(mime_last++)= c1 ;
4340 if (c1=='\n'||c1==' '||c1=='\r'||
4341 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4343 /* Failed. But this could be another MIME preamble */
4351 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
4352 if (!(++i<MAXRECOVER) || c1==EOF) break;
4353 if (c1=='b'||c1=='B') {
4354 mime_decode_mode = 'B';
4355 } else if (c1=='q'||c1=='Q') {
4356 mime_decode_mode = 'Q';
4360 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
4361 if (!(++i<MAXRECOVER) || c1==EOF) break;
4363 mime_decode_mode = FALSE;
4369 if (!mime_decode_mode) {
4370 /* false MIME preamble, restart from mime_buffer */
4371 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4372 /* Since we are in MIME mode until buffer becomes empty, */
4373 /* we never go into mime_begin again for a while. */
4376 /* discard the MIME preamble and go to MIME mode */
4378 /* no MIME integrity check is done here */
4379 return c1; /* used only for checking EOF */
4394 fprintf(stderr, "%s\n", str);
4400 set_input_codename (codename)
4405 strcmp(codename, "") != 0 &&
4406 strcmp(codename, input_codename) != 0)
4408 is_inputcode_mixed = TRUE;
4410 input_codename = codename;
4411 is_inputcode_set = TRUE;
4414 #if !defined(PERL_XS) && !defined(WIN32DLL)
4416 print_guessed_code (filename)
4419 char *codename = "BINARY";
4420 if (!is_inputcode_mixed) {
4421 if (strcmp(input_codename, "") == 0) {
4424 codename = input_codename;
4427 if (filename != NULL) printf("%s:", filename);
4428 printf("%s\n", codename);
4434 #ifdef ANSI_C_PROTOTYPE
4435 int hex_getc(int ch, FILE *f, int (*g)(FILE *f), int (*u)(int c, FILE *f))
4438 hex_getc(ch, f, g, u)
4451 if (!nkf_isxdigit(c2)){
4456 if (!nkf_isxdigit(c3)){
4461 return (hex2bin(c2) << 4) | hex2bin(c3);
4468 return hex_getc(':', f, i_cgetc, i_cungetc);
4476 return (*i_cungetc)(c, f);
4483 return hex_getc('%', f, i_ugetc, i_uungetc);
4491 return (*i_uungetc)(c, f);
4495 #ifdef NUMCHAR_OPTION
4500 int (*g)() = i_ngetc;
4501 int (*u)() = i_nungetc;
4512 if (buf[i] == 'x' || buf[i] == 'X'){
4513 for (j = 0; j < 5; j++){
4515 if (!nkf_isxdigit(buf[i])){
4522 c |= hex2bin(buf[i]);
4525 for (j = 0; j < 6; j++){
4529 if (!nkf_isdigit(buf[i])){
4536 c += hex2bin(buf[i]);
4542 return CLASS_UTF16 | c;
4552 numchar_ungetc(c, f)
4556 return (*i_nungetc)(c, f);
4560 #ifdef UNICODE_NORMALIZATION
4562 /* Normalization Form C */
4567 int (*g)() = i_nfc_getc;
4568 int (*u)() = i_nfc_ungetc;
4569 int i=0, j, k=1, lower, upper;
4571 const int *array = NULL;
4572 extern const struct normalization_pair normalization_table[];
4575 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4576 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4577 while (upper >= lower) {
4578 j = (lower+upper) / 2;
4579 array = normalization_table[j].nfd;
4580 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4581 if (array[k] != buf[k]){
4582 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4589 array = normalization_table[j].nfc;
4590 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4607 return (*i_nfc_ungetc)(c, f);
4609 #endif /* UNICODE_NORMALIZATION */
4616 int c1, c2, c3, c4, cc;
4617 int t1, t2, t3, t4, mode, exit_mode;
4621 int lwsp_size = 128;
4623 if (mime_top != mime_last) { /* Something is in FIFO */
4624 return Fifo(mime_top++);
4626 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4627 mime_decode_mode=FALSE;
4628 unswitch_mime_getc();
4629 return (*i_getc)(f);
4632 if (mimebuf_f == FIXED_MIME)
4633 exit_mode = mime_decode_mode;
4636 if (mime_decode_mode == 'Q') {
4637 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4639 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
4640 if (c1<=' ' || DEL<=c1) {
4641 mime_decode_mode = exit_mode; /* prepare for quit */
4644 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4648 mime_decode_mode = exit_mode; /* prepare for quit */
4649 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4650 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4651 /* end Q encoding */
4652 input_mode = exit_mode;
4654 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4655 if (lwsp_buf==NULL) {
4656 perror("can't malloc");
4659 while ((c1=(*i_getc)(f))!=EOF) {
4664 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4672 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
4673 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4688 lwsp_buf[lwsp_count] = c1;
4689 if (lwsp_count++>lwsp_size){
4691 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4692 if (lwsp_buf_new==NULL) {
4695 perror("can't realloc");
4698 lwsp_buf = lwsp_buf_new;
4704 if (lwsp_count > 0) {
4705 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4709 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4710 i_ungetc(lwsp_buf[lwsp_count],f);
4718 if (c1=='='&&c2<' ') { /* this is soft wrap */
4719 while((c1 = (*i_mgetc)(f)) <=' ') { /* skip the rest of the line break; EOF also satisfies <=' ' */
4720 if (c1 == EOF) return (EOF); /* test the character just read; reading again here would drop an input byte */
4722 mime_decode_mode = 'Q'; /* still in MIME */
4723 goto restart_mime_q;
4726 mime_decode_mode = 'Q'; /* still in MIME */
4730 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4731 if (c2<=' ') return c2;
4732 mime_decode_mode = 'Q'; /* still in MIME */
4733 #define hex(c) (('0'<=c&&c<='9')?(c-'0'):\
4734 ('A'<=c&&c<='F')?(c-'A'+10):('a'<=c&&c<='f')?(c-'a'+10):0)
4735 return ((hex(c2)<<4) + hex(c3));
4738 if (mime_decode_mode != 'B') {
4739 mime_decode_mode = FALSE;
4740 return (*i_mgetc)(f);
4744 /* Base64 encoding */
4746 MIME allows a line break in the middle of
4747 Base64, but we are very pessimistic when decoding
4748 in unbuf mode because MIME-encoded text may be broken by
4749 control sequences from less or an editor (such as ESC-[-K) in unbuffered
4750 mode, so incomplete MIME is ignored.
4752 mode = mime_decode_mode;
4753 mime_decode_mode = exit_mode; /* prepare for quit */
4755 while ((c1 = (*i_mgetc)(f))<=' ') {
4760 if ((c2 = (*i_mgetc)(f))<=' ') {
4763 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4764 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4767 if ((c1 == '?') && (c2 == '=')) {
4770 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4771 if (lwsp_buf==NULL) {
4772 perror("can't malloc");
4775 while ((c1=(*i_getc)(f))!=EOF) {
4780 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4788 if ((c1=(*i_getc)(f))!=EOF) {
4792 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4807 lwsp_buf[lwsp_count] = c1;
4808 if (lwsp_count++>lwsp_size){
4810 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4811 if (lwsp_buf_new==NULL) {
4814 perror("can't realloc");
4817 lwsp_buf = lwsp_buf_new;
4823 if (lwsp_count > 0) {
4824 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4828 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4829 i_ungetc(lwsp_buf[lwsp_count],f);
4838 if ((c3 = (*i_mgetc)(f))<=' ') {
4841 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4842 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4846 if ((c4 = (*i_mgetc)(f))<=' ') {
4849 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4850 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4854 mime_decode_mode = mode; /* still in MIME sigh... */
4856 /* BASE 64 decoding */
4858 t1 = 0x3f & base64decode(c1);
4859 t2 = 0x3f & base64decode(c2);
4860 t3 = 0x3f & base64decode(c3);
4861 t4 = 0x3f & base64decode(c4);
4862 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4864 Fifo(mime_last++) = cc;
4865 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4867 Fifo(mime_last++) = cc;
4868 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4870 Fifo(mime_last++) = cc;
4875 return Fifo(mime_top++);
4883 Fifo(--mime_top) = c;
4890 const unsigned char *p;
4894 /* In buffered mode, read until =? or NL or buffer full
4896 mime_input = mime_top;
4897 mime_last = mime_top;
4899 while(*p) Fifo(mime_input++) = *p++;
4902 while((c=(*i_getc)(f))!=EOF) {
4903 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
4904 break; /* buffer full */
4906 if (c=='=' && d=='?') {
4907 /* checked. skip header, start decode */
4908 Fifo(mime_input++) = c;
4909 /* mime_last_input = mime_input; */
4914 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4916 /* Should we check length mod 4? */
4917 Fifo(mime_input++) = c;
4920 /* In case of Incomplete MIME, no MIME decode */
4921 Fifo(mime_input++) = c;
4922 mime_last = mime_input; /* point undecoded buffer */
4923 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
4924 switch_mime_getc(); /* anyway we need buffered getc */
4935 i = c - 'A'; /* A..Z 0-25 */
4937 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4939 } else if (c > '/') {
4940 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4941 } else if (c == '+') {
4942 i = '>' /* 62 */ ; /* + 62 */
4944 i = '?' /* 63 */ ; /* / 63 */
4949 STATIC const char basis_64[] =
4950 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4953 #define MIMEOUT_BUF_LENGTH (60)
4954 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
4955 int mimeout_buf_count = 0;
4956 int mimeout_preserve_space = 0;
4957 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
4963 const unsigned char *p;
4966 p = mime_pattern[0];
4967 for(i=0;mime_encode[i];i++) {
4968 if (mode == mime_encode[i]) {
4969 p = mime_pattern[i];
4973 mimeout_mode = mime_encode_method[i];
4976 if (base64_count>45) {
4977 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
4978 (*o_mputc)(mimeout_buf[i]);
4984 if (!mimeout_preserve_space && mimeout_buf_count>0
4985 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4986 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
4990 if (!mimeout_preserve_space) {
4991 for (;i<mimeout_buf_count;i++) {
4992 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4993 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
4994 (*o_mputc)(mimeout_buf[i]);
5001 mimeout_preserve_space = FALSE;
5007 j = mimeout_buf_count;
5008 mimeout_buf_count = 0;
5010 mime_putc(mimeout_buf[i]);
5026 switch(mimeout_mode) {
5031 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5037 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5043 if (mimeout_f!=FIXED_MIME) {
5045 } else if (mimeout_mode != 'Q')
5054 switch(mimeout_mode) {
5059 } else if(!nkf_isalnum(c)) {
5061 (*o_mputc)(itoh4(((c>>4)&0xf)));
5062 (*o_mputc)(itoh4((c&0xf)));
5071 (*o_mputc)(basis_64[c>>2]);
5076 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5082 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5083 (*o_mputc)(basis_64[c & 0x3F]);
5094 int mime_lastchar2, mime_lastchar1;
5096 void mime_prechar(c2, c1)
5101 if (base64_count + mimeout_buf_count/3*4> 66){
5102 (*o_base64conv)(EOF,0);
5103 (*o_base64conv)(0,NL);
5104 (*o_base64conv)(0,SPACE);
5106 }/*else if (mime_lastchar2){
5107 if (c1 <=DEL && !nkf_isspace(c1)){
5108 (*o_base64conv)(0,SPACE);
5112 if (c2 && mime_lastchar2 == 0
5113 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5114 (*o_base64conv)(0,SPACE);
5117 mime_lastchar2 = c2;
5118 mime_lastchar1 = c1;
5129 if (mimeout_f == FIXED_MIME){
5130 if (mimeout_mode == 'Q'){
5131 if (base64_count > 71){
5132 if (c!=CR && c!=NL) {
5139 if (base64_count > 71){
5144 if (c == EOF) { /* c==EOF */
5148 if (c != EOF) { /* c==EOF */
5154 /* mimeout_f != FIXED_MIME */
5156 if (c == EOF) { /* c==EOF */
5157 j = mimeout_buf_count;
5158 mimeout_buf_count = 0;
5161 /*if (nkf_isspace(mimeout_buf[i])){
5164 mimeout_addchar(mimeout_buf[i]);
5168 (*o_mputc)(mimeout_buf[i]);
5174 if (mimeout_mode=='Q') {
5175 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5187 if (mimeout_buf_count > 0){
5188 lastchar = mimeout_buf[mimeout_buf_count - 1];
5193 if (!mimeout_mode) {
5194 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5195 if (nkf_isspace(c)) {
5196 if (c==CR || c==NL) {
5199 for (i=0;i<mimeout_buf_count;i++) {
5200 (*o_mputc)(mimeout_buf[i]);
5201 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5208 mimeout_buf_count = 1;
5210 if (base64_count > 1
5211 && base64_count + mimeout_buf_count > 76){
5214 if (!nkf_isspace(mimeout_buf[0])){
5219 mimeout_buf[mimeout_buf_count++] = c;
5220 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5221 open_mime(output_mode);
5226 if (lastchar==CR || lastchar == NL){
5227 for (i=0;i<mimeout_buf_count;i++) {
5228 (*o_mputc)(mimeout_buf[i]);
5231 mimeout_buf_count = 0;
5233 if (lastchar==SPACE) {
5234 for (i=0;i<mimeout_buf_count-1;i++) {
5235 (*o_mputc)(mimeout_buf[i]);
5238 mimeout_buf[0] = SPACE;
5239 mimeout_buf_count = 1;
5241 open_mime(output_mode);
5244 /* mimeout_mode == 'B', 1, 2 */
5245 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5246 if (lastchar == CR || lastchar == NL){
5247 if (nkf_isblank(c)) {
5248 for (i=0;i<mimeout_buf_count;i++) {
5249 mimeout_addchar(mimeout_buf[i]);
5251 mimeout_buf_count = 0;
5252 } else if (SPACE<c && c<DEL) {
5254 for (i=0;i<mimeout_buf_count;i++) {
5255 (*o_mputc)(mimeout_buf[i]);
5258 mimeout_buf_count = 0;
5261 if (c==SPACE || c==TAB || c==CR || c==NL) {
5262 for (i=0;i<mimeout_buf_count;i++) {
5263 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5265 for (i=0;i<mimeout_buf_count;i++) {
5266 (*o_mputc)(mimeout_buf[i]);
5269 mimeout_buf_count = 0;
5272 mimeout_buf[mimeout_buf_count++] = c;
5273 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5275 for (i=0;i<mimeout_buf_count;i++) {
5276 (*o_mputc)(mimeout_buf[i]);
5279 mimeout_buf_count = 0;
5283 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5284 mimeout_buf[mimeout_buf_count++] = c;
5285 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5286 j = mimeout_buf_count;
5287 mimeout_buf_count = 0;
5289 mimeout_addchar(mimeout_buf[i]);
5296 if (mimeout_buf_count>0) {
5297 j = mimeout_buf_count;
5298 mimeout_buf_count = 0;
5300 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5302 mimeout_addchar(mimeout_buf[i]);
5308 (*o_mputc)(mimeout_buf[i]);
5310 open_mime(output_mode);
5317 #if defined(PERL_XS) || defined(WIN32DLL)
5322 struct input_code *p = input_code_list;
5335 mime_f = STRICT_MIME;
5336 mime_decode_f = FALSE;
5341 #if defined(MSDOS) || defined(__OS2__)
5346 iso2022jp_f = FALSE;
5347 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5348 ms_ucs_map_f = UCS_MAP_ASCII;
5350 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5351 internal_unicode_f = FALSE;
5353 #ifdef UTF8_INPUT_ENABLE
5354 no_cp932ext_f = FALSE;
5355 ignore_zwnbsp_f = TRUE;
5356 no_best_fit_chars_f = FALSE;
5357 encode_fallback = NULL;
5358 unicode_subchar = '?';
5360 #ifdef UTF8_OUTPUT_ENABLE
5364 #ifdef UNICODE_NORMALIZATION
5377 is_inputcode_mixed = FALSE;
5378 is_inputcode_set = FALSE;
5382 #ifdef SHIFTJIS_CP932
5392 for (i = 0; i < 256; i++){
5393 prefix_table[i] = 0;
5396 #ifdef UTF8_INPUT_ENABLE
5397 utf16_mode = UTF16BE_INPUT;
5399 mimeout_buf_count = 0;
5404 fold_preserve_f = FALSE;
5407 kanji_intro = DEFAULT_J;
5408 ascii_intro = DEFAULT_R;
5409 fold_margin = FOLD_MARGIN;
5410 output_conv = DEFAULT_CONV;
5411 oconv = DEFAULT_CONV;
5412 o_zconv = no_connection;
5413 o_fconv = no_connection;
5414 o_crconv = no_connection;
5415 o_rot_conv = no_connection;
5416 o_hira_conv = no_connection;
5417 o_base64conv = no_connection;
5418 o_iso2022jp_check_conv = no_connection;
5421 i_ungetc = std_ungetc;
5423 i_bungetc = std_ungetc;
5426 i_mungetc = std_ungetc;
5427 i_mgetc_buf = std_getc;
5428 i_mungetc_buf = std_ungetc;
5429 output_mode = ASCII;
5432 mime_decode_mode = FALSE;
5438 z_prev2=0,z_prev1=0;
5440 iconv_for_check = 0;
5442 input_codename = "";
5450 no_connection(c2,c1)
5453 no_connection2(c2,c1,0);
5457 no_connection2(c2,c1,c0)
5460 fprintf(stderr,"nkf internal module connection failure.\n");
5462 return 0; /* LINT */
5467 #define fprintf dllprintf
5472 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5473 fprintf(stderr,"Flags:\n");
5474 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5475 #ifdef DEFAULT_CODE_SJIS /* the (DEFAULT) tag follows the compiled-in default output code */
5476 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC), UTF-8N\n");
5478 #ifdef DEFAULT_CODE_JIS
5479 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC), UTF-8N\n");
5481 #ifdef DEFAULT_CODE_EUC
5482 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT), UTF-8N\n");
5484 #ifdef DEFAULT_CODE_UTF8
5485 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, AT&T JIS (EUC), UTF-8N (DEFAULT)\n");
5487 #ifdef UTF8_OUTPUT_ENABLE
5488 fprintf(stderr," After 'w' you can add more options. (80?|16((B|L)0?)?) \n");
5490 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, AT&T JIS (EUC), UTF-8\n");
5491 #ifdef UTF8_INPUT_ENABLE
5492 fprintf(stderr," After 'W' you can add more options. (8|16(B|L)?) \n");
5494 fprintf(stderr,"t no conversion\n");
5495 fprintf(stderr,"i_/o_ Output sequence to designate JIS-kanji/ASCII (DEFAULT B)\n");
5496 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5497 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5498 fprintf(stderr,"v Show this usage. V: show version\n");
5499 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5500 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5501 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5502 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5503 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII 1: Kankaku to space,2: 2 spaces,\n");
5504 fprintf(stderr," 3: Convert HTML Entity\n");
5505 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5506 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5508 fprintf(stderr,"T Text mode output\n");
5510 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5511 fprintf(stderr,"d,c Delete \\r in line feed and \\032, Add \\r in line feed\n");
5512 fprintf(stderr,"I Convert non ISO-2022-JP character to GETA\n"); /* help text for the -I flag */
5513 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5514 fprintf(stderr,"long name options\n");
5515 fprintf(stderr," --ic=<input codeset> --oc=<output codeset> set the input or output codeset\n");
5516 fprintf(stderr," --fj,--unix,--mac,--windows convert for the system\n");
5517 fprintf(stderr," --jis,--euc,--sjis,--utf8,--utf16,--mime,--base64 convert for the code\n");
5518 fprintf(stderr," --hiragana, --katakana Hiragana/Katakana Conversion\n");
5519 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5521 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5523 #ifdef NUMCHAR_OPTION
5524 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5526 #ifdef UTF8_INPUT_ENABLE
5527 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5528 fprintf(stderr," set the way nkf handles unassigned characters\n");
5531 fprintf(stderr," --overwrite Overwrite original listed files by filtered result\n");
5533 fprintf(stderr," -g, --guess Guess the input code\n");
5534 fprintf(stderr," --help,--version\n");
5541 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5542 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__)
5545 #if defined(MSDOS) && defined(__WIN16__)
5548 #if defined(MSDOS) && defined(__WIN32__)
5554 ,NKF_VERSION,NKF_RELEASE_DATE);
5555 fprintf(stderr,"\n%s\n",CopyRight);
5560 ** Patch authors
5561 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5562 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5563 ** ohta@src.ricoh.co.jp (Junn Ohta)
5564 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5565 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5566 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5567 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5568 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5569 ** GHG00637@nifty-serve.or.jp (COW)