1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.93 2006/03/14 15:55:58 naruse Exp $ */
43 #define NKF_VERSION "2.0.6"
44 #define NKF_RELEASE_DATE "2006-03-14"
48 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
49 " 2002-2006 Kono, Furukawa, Naruse, mastodon"
56 ** USAGE: nkf [flags] [file]
59 ** b Output is buffered (DEFAULT)
60 ** u Output is unbuffered
64 ** j Output code is JIS 7 bit (DEFAULT SELECT)
65 ** s Output code is MS Kanji (DEFAULT SELECT)
66 ** e Output code is AT&T JIS (DEFAULT SELECT)
67 ** w Output code is UTF-8 (DEFAULT SELECT)
68 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
70 ** m MIME conversion for ISO-2022-JP
71 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
72 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
73 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
74 ** M MIME output conversion
76 ** r {de/en}crypt ROT13/47
80 ** T Text mode output (for MS-DOS)
82 ** x Do not convert X0201 kana into X0208
83 ** Z Convert X0208 alphabet to ASCII
88 ** B try to fix broken JIS, missing Escape
89 ** B[1-9] broken level
91 ** O Output to 'nkf.out' file or last file name
92 ** d Delete \r in line feed
93 ** c Add \r in line feed
94 ** -- other long option
95 ** -- ignore following option (don't use with -O )
99 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__)) && !defined(MSDOS)
101 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
117 #if defined(MSDOS) || defined(__OS2__)
124 #define setbinmode(fp) fsetbin(fp)
125 #else /* Microsoft C, Turbo C */
126 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
128 #else /* UNIX,OS/2 */
129 #define setbinmode(fp)
132 #ifdef _IOFBF /* SysV and MSDOS, Windows */
133 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
135 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
138 /*Borland C++ 4.5 EasyWin*/
139 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
148 /* added by satoru@isoternet.org */
149 #include <sys/stat.h>
150 #ifndef MSDOS /* UNIX, OS/2 */
153 #else /* defined(MSDOS) */
155 #ifdef __BORLANDC__ /* BCC32 */
157 #else /* !defined(__BORLANDC__) */
158 #include <sys/utime.h>
159 #endif /* (__BORLANDC__) */
160 #else /* !defined(__WIN32__) */
161 #if defined(_MSC_VER) || defined(__MINGW32__) /* VC++, MinGW */
162 #include <sys/utime.h>
163 #elif defined(__TURBOC__) /* BCC */
165 #elif defined(LSI_C) /* LSI C */
166 #endif /* (__WIN32__) */
178 /* state of output_mode and input_mode
/*
 * Mode values for the two planes of JIS X 0213.
 * Each value packs the last two bytes of the ISO-2022 designation
 * sequence ESC $ ( F: 0x28 is '(' and the low byte is the final byte F.
 *   plane 1: ESC $ ( O  -> 0x284F
 *   plane 2: ESC $ ( P  -> 0x2850
 * The two values must differ, otherwise the planes cannot be told apart.
 */
#define X0213_1 0x284F
#define X0213_2 0x2850
198 /* Input Assumption */
202 #define LATIN1_INPUT 6
204 #define STRICT_MIME 8
209 #define JAPANESE_EUC 10
213 #define UTF8_INPUT 13
214 #define UTF16BE_INPUT 14
215 #define UTF16LE_INPUT 15
/*
 * ASCII-only character classification helpers.
 *
 * Every macro parameter is parenthesized so that an expression argument
 * (for example `c & 0x7f`) keeps its meaning after expansion.
 * NOTE: the argument is still evaluated more than once, so do not pass
 * expressions with side effects (e.g. `*p++`).
 */

/* Non-zero when c is an ASCII letter or decimal digit. */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
/* SPACE, TAB, CR and NL are constants defined elsewhere in this file. */
#define nkf_isblank(c)  ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c)  (('a'<=(c) && (c)<='z') || ('A'<=(c) && (c)<='Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
/* Value of one hexadecimal digit; x must satisfy nkf_isxdigit(x). */
#define hex2bin(x)      (nkf_isdigit(x) ? (x) - '0' : nkf_toupper(x) - 'A' + 10)
249 #define HOLD_SIZE 1024
250 #define IOBUF_SIZE 16384
252 #define DEFAULT_J 'B'
253 #define DEFAULT_R 'B'
255 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
256 #define SJ6394 0x0161 /* 63 - 94 ku offset */
258 #define RANGE_NUM_MAX 18
263 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
264 #define sizeof_euc_to_utf8_1byte 94
265 #define sizeof_euc_to_utf8_2bytes 94
266 #define sizeof_utf8_to_euc_C2 64
267 #define sizeof_utf8_to_euc_E5B8 64
268 #define sizeof_utf8_to_euc_2bytes 112
269 #define sizeof_utf8_to_euc_3bytes 16
272 /* MIME preprocessor */
274 #ifdef EASYWIN /*Easy Win */
275 extern POINT _BufferSize;
278 /* function prototype */
280 #ifdef ANSI_C_PROTOTYPE
282 #define STATIC static
296 void (*status_func)PROTO((struct input_code *, int));
297 int (*iconv_func)PROTO((int c2, int c1, int c0));
301 STATIC char *input_codename = "";
304 STATIC const char *CopyRight = COPY_RIGHT;
306 #if !defined(PERL_XS) && !defined(WIN32DLL)
307 STATIC int noconvert PROTO((FILE *f));
309 STATIC int kanji_convert PROTO((FILE *f));
310 STATIC int h_conv PROTO((FILE *f,int c2,int c1));
311 STATIC int push_hold_buf PROTO((int c2));
312 STATIC void set_iconv PROTO((int f, int (*iconv_func)(int c2,int c1,int c0)));
313 STATIC int s_iconv PROTO((int c2,int c1,int c0));
314 STATIC int s2e_conv PROTO((int c2, int c1, int *p2, int *p1));
315 STATIC int e_iconv PROTO((int c2,int c1,int c0));
316 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
318 * 0: Shift_JIS, eucJP-ascii
322 #define UCS_MAP_ASCII 0
324 #define UCS_MAP_CP932 2
325 STATIC int ms_ucs_map_f = UCS_MAP_ASCII;
327 #ifdef UTF8_INPUT_ENABLE
328 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
329 STATIC int no_cp932ext_f = FALSE;
330 /* ignore ZERO WIDTH NO-BREAK SPACE */
331 STATIC int ignore_zwnbsp_f = TRUE;
332 STATIC int no_best_fit_chars_f = FALSE;
333 STATIC int unicode_subchar = '?'; /* the regular substitution character */
334 STATIC void encode_fallback_html PROTO((int c));
335 STATIC void encode_fallback_xml PROTO((int c));
336 STATIC void encode_fallback_java PROTO((int c));
337 STATIC void encode_fallback_perl PROTO((int c));
338 STATIC void encode_fallback_subchar PROTO((int c));
339 STATIC void (*encode_fallback)PROTO((int c)) = NULL;
340 STATIC int w2e_conv PROTO((int c2,int c1,int c0,int *p2,int *p1));
341 STATIC int w_iconv PROTO((int c2,int c1,int c0));
342 STATIC int w_iconv16 PROTO((int c2,int c1,int c0));
343 STATIC int unicode_to_jis_common PROTO((int c2,int c1,int c0,int *p2,int *p1));
344 STATIC int w_iconv_common PROTO((int c1,int c0,const unsigned short *const *pp,int psize,int *p2,int *p1));
345 STATIC int ww16_conv PROTO((int c2, int c1, int c0));
346 STATIC int w16e_conv PROTO((unsigned short val,int *p2,int *p1));
348 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
349 STATIC int internal_unicode_f = FALSE; /* Internal Unicode Processing */
351 #ifdef UTF8_OUTPUT_ENABLE
352 STATIC int unicode_bom_f= 0; /* Output Unicode BOM */
353 STATIC int w_oconv16_LE = 0; /* utf-16 little endian */
354 STATIC int e2w_conv PROTO((int c2,int c1));
355 STATIC void w_oconv PROTO((int c2,int c1));
356 STATIC void w_oconv16 PROTO((int c2,int c1));
358 STATIC void e_oconv PROTO((int c2,int c1));
359 STATIC int e2s_conv PROTO((int c2, int c1, int *p2, int *p1));
360 STATIC void s_oconv PROTO((int c2,int c1));
361 STATIC void j_oconv PROTO((int c2,int c1));
362 STATIC void fold_conv PROTO((int c2,int c1));
363 STATIC void cr_conv PROTO((int c2,int c1));
364 STATIC void z_conv PROTO((int c2,int c1));
365 STATIC void rot_conv PROTO((int c2,int c1));
366 STATIC void hira_conv PROTO((int c2,int c1));
367 STATIC void base64_conv PROTO((int c2,int c1));
368 STATIC void iso2022jp_check_conv PROTO((int c2,int c1));
369 STATIC void no_connection PROTO((int c2,int c1));
370 STATIC int no_connection2 PROTO((int c2,int c1,int c0));
372 STATIC void code_score PROTO((struct input_code *ptr));
373 STATIC void code_status PROTO((int c));
375 STATIC void std_putc PROTO((int c));
376 STATIC int std_getc PROTO((FILE *f));
377 STATIC int std_ungetc PROTO((int c,FILE *f));
379 STATIC int broken_getc PROTO((FILE *f));
380 STATIC int broken_ungetc PROTO((int c,FILE *f));
382 STATIC int mime_begin PROTO((FILE *f));
383 STATIC int mime_getc PROTO((FILE *f));
384 STATIC int mime_ungetc PROTO((int c,FILE *f));
386 STATIC int mime_begin_strict PROTO((FILE *f));
387 STATIC int mime_getc_buf PROTO((FILE *f));
388 STATIC int mime_ungetc_buf PROTO((int c,FILE *f));
389 STATIC int mime_integrity PROTO((FILE *f,const unsigned char *p));
391 STATIC int base64decode PROTO((int c));
392 STATIC void mime_prechar PROTO((int c2, int c1));
393 STATIC void mime_putc PROTO((int c));
394 STATIC void open_mime PROTO((int c));
395 STATIC void close_mime PROTO(());
397 STATIC void usage PROTO(());
398 STATIC void version PROTO(());
400 STATIC void options PROTO((unsigned char *c));
401 #if defined(PERL_XS) || defined(WIN32DLL)
402 STATIC void reinit PROTO(());
407 #if !defined(PERL_XS) && !defined(WIN32DLL)
408 STATIC unsigned char stdibuf[IOBUF_SIZE];
409 STATIC unsigned char stdobuf[IOBUF_SIZE];
411 STATIC unsigned char hold_buf[HOLD_SIZE*2];
412 STATIC int hold_count;
414 /* MIME preprocessor fifo */
416 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
417 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
418 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
419 STATIC unsigned char mime_buf[MIME_BUF_SIZE];
420 STATIC unsigned int mime_top = 0;
421 STATIC unsigned int mime_last = 0; /* decoded */
422 STATIC unsigned int mime_input = 0; /* undecoded */
423 STATIC int (*mime_iconv_back)PROTO((int c2,int c1,int c0)) = NULL;
/* Global option flags with their start-up defaults. */
426 STATIC int unbuf_f = FALSE; /* presumably set for unbuffered output (-u) -- TODO confirm */
427 STATIC int estab_f = FALSE;
428 STATIC int nop_f = FALSE;
429 STATIC int binmode_f = TRUE; /* binary mode */
430 STATIC int rot_f = FALSE; /* ROT13/47 mode */
431 STATIC int hira_f = FALSE; /* hiragana/katakana conversion */
432 STATIC int input_f = FALSE; /* FALSE: input code is not fixed */
433 STATIC int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
434 STATIC int mime_f = STRICT_MIME; /* convert MIME B (base64) or Q */
435 STATIC int mime_decode_f = FALSE; /* MIME decoding is explicitly on */
436 STATIC int mimebuf_f = FALSE; /* MIME buffered input */
437 STATIC int broken_f = FALSE; /* convert ESC-less broken JIS */
438 STATIC int iso8859_f = FALSE; /* pass ISO 8859 through */
439 STATIC int mimeout_f = FALSE; /* MIME output (base64) mode */
440 #if defined(MSDOS) || defined(__OS2__)
441 STATIC int x0201_f = TRUE; /* Assume JISX0201 kana */
443 STATIC int x0201_f = NO_X0201; /* Assume NO JISX0201 */
445 STATIC int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
447 #ifdef UNICODE_NORMALIZATION
448 STATIC int nfc_f = FALSE;
449 STATIC int (*i_nfc_getc)PROTO((FILE *)) = std_getc; /* input of ugetc */
450 STATIC int (*i_nfc_ungetc)PROTO((int c ,FILE *f)) = std_ungetc;
451 STATIC int nfc_getc PROTO((FILE *f));
452 STATIC int nfc_ungetc PROTO((int c,FILE *f));
456 STATIC int cap_f = FALSE;
457 STATIC int (*i_cgetc)PROTO((FILE *)) = std_getc; /* input of cgetc */
458 STATIC int (*i_cungetc)PROTO((int c ,FILE *f)) = std_ungetc;
459 STATIC int cap_getc PROTO((FILE *f));
460 STATIC int cap_ungetc PROTO((int c,FILE *f));
462 STATIC int url_f = FALSE;
463 STATIC int (*i_ugetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
464 STATIC int (*i_uungetc)PROTO((int c ,FILE *f)) = std_ungetc;
465 STATIC int url_getc PROTO((FILE *f));
466 STATIC int url_ungetc PROTO((int c,FILE *f));
469 #ifdef NUMCHAR_OPTION
470 #define CLASS_MASK 0x0f000000
471 #define CLASS_UTF16 0x01000000
472 STATIC int numchar_f = FALSE;
473 STATIC int (*i_ngetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
474 STATIC int (*i_nungetc)PROTO((int c ,FILE *f)) = std_ungetc;
475 STATIC int numchar_getc PROTO((FILE *f));
476 STATIC int numchar_ungetc PROTO((int c,FILE *f));
480 STATIC int noout_f = FALSE;
481 STATIC void no_putc PROTO((int c));
482 STATIC int debug_f = FALSE;
483 STATIC void debug PROTO((const char *str));
484 STATIC int (*iconv_for_check)() = 0;
487 STATIC int guess_f = FALSE;
489 STATIC void print_guessed_code PROTO((char *filename));
491 STATIC void set_input_codename PROTO((char *codename));
492 STATIC int is_inputcode_mixed = FALSE;
493 STATIC int is_inputcode_set = FALSE;
496 STATIC int exec_f = 0;
499 #ifdef SHIFTJIS_CP932
500 /* invert IBM extended characters to others */
501 STATIC int cp51932_f = TRUE;
502 #define CP932_TABLE_BEGIN (0xfa)
503 #define CP932_TABLE_END (0xfc)
505 /* invert NEC-selected IBM extended characters to IBM extended characters */
506 STATIC int cp932inv_f = TRUE;
507 #define CP932INV_TABLE_BEGIN (0xed)
508 #define CP932INV_TABLE_END (0xee)
510 /* STATIC int cp932_conv PROTO((int c2, int c1)); */
511 #endif /* SHIFTJIS_CP932 */
514 STATIC int x0212_f = FALSE;
515 STATIC int x0212_shift PROTO((int c));
516 STATIC int x0212_unshift PROTO((int c));
518 STATIC int x0213_f = FALSE;
520 STATIC unsigned char prefix_table[256];
522 STATIC void e_status PROTO((struct input_code *, int));
523 STATIC void s_status PROTO((struct input_code *, int));
525 #ifdef UTF8_INPUT_ENABLE
526 STATIC void w_status PROTO((struct input_code *, int));
527 STATIC void w16_status PROTO((struct input_code *, int));
528 STATIC int utf16_mode = UTF16BE_INPUT;
531 struct input_code input_code_list[] = {
532 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
533 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
534 #ifdef UTF8_INPUT_ENABLE
535 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
536 {"UTF-16", 0, 0, 0, {0, 0, 0}, w16_status, w_iconv16, 0},
541 STATIC int mimeout_mode = 0;
542 STATIC int base64_count = 0;
544 /* X0208 -> ASCII converter */
547 STATIC int f_line = 0; /* chars in line */
548 STATIC int f_prev = 0;
549 STATIC int fold_preserve_f = FALSE; /* preserve new lines */
550 STATIC int fold_f = FALSE;
551 STATIC int fold_len = 0;
554 STATIC unsigned char kanji_intro = DEFAULT_J;
555 STATIC unsigned char ascii_intro = DEFAULT_R;
559 #define FOLD_MARGIN 10
560 #define DEFAULT_FOLD 60
562 STATIC int fold_margin = FOLD_MARGIN;
566 #ifdef DEFAULT_CODE_JIS
567 # define DEFAULT_CONV j_oconv
569 #ifdef DEFAULT_CODE_SJIS
570 # define DEFAULT_CONV s_oconv
572 #ifdef DEFAULT_CODE_EUC
573 # define DEFAULT_CONV e_oconv
575 #ifdef DEFAULT_CODE_UTF8
576 # define DEFAULT_CONV w_oconv
579 /* process default */
580 STATIC void (*output_conv)PROTO((int c2,int c1)) = DEFAULT_CONV;
582 STATIC void (*oconv)PROTO((int c2,int c1)) = no_connection;
583 /* s_iconv or oconv */
584 STATIC int (*iconv)PROTO((int c2,int c1,int c0)) = no_connection2;
586 STATIC void (*o_zconv)PROTO((int c2,int c1)) = no_connection;
587 STATIC void (*o_fconv)PROTO((int c2,int c1)) = no_connection;
588 STATIC void (*o_crconv)PROTO((int c2,int c1)) = no_connection;
589 STATIC void (*o_rot_conv)PROTO((int c2,int c1)) = no_connection;
590 STATIC void (*o_hira_conv)PROTO((int c2,int c1)) = no_connection;
591 STATIC void (*o_base64conv)PROTO((int c2,int c1)) = no_connection;
592 STATIC void (*o_iso2022jp_check_conv)PROTO((int c2,int c1)) = no_connection;
594 /* STATIC redirections */
596 STATIC void (*o_putc)PROTO((int c)) = std_putc;
598 STATIC int (*i_getc)PROTO((FILE *f)) = std_getc; /* general input */
599 STATIC int (*i_ungetc)PROTO((int c,FILE *f)) =std_ungetc;
601 STATIC int (*i_bgetc)PROTO((FILE *)) = std_getc; /* input of mgetc */
602 STATIC int (*i_bungetc)PROTO((int c ,FILE *f)) = std_ungetc;
604 STATIC void (*o_mputc)PROTO((int c)) = std_putc ; /* output of mputc */
606 STATIC int (*i_mgetc)PROTO((FILE *)) = std_getc; /* input of mgetc */
607 STATIC int (*i_mungetc)PROTO((int c ,FILE *f)) = std_ungetc;
609 /* for strict mime */
610 STATIC int (*i_mgetc_buf)PROTO((FILE *)) = std_getc; /* input of mgetc_buf */
611 STATIC int (*i_mungetc_buf)PROTO((int c,FILE *f)) = std_ungetc;
614 STATIC int output_mode = ASCII, /* output kanji mode */
615 input_mode = ASCII, /* input kanji mode */
616 shift_mode = FALSE; /* TRUE shift out, or X0201 */
617 STATIC int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
619 /* X0201 / X0208 conversion tables */
621 /* X0201 kana conversion table */
624 unsigned char cv[]= {
625 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
626 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
627 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
628 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
629 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
630 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
631 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
632 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
633 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
634 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
635 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
636 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
637 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
638 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
639 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
640 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
644 /* X0201 kana conversion table for dakuten */
647 unsigned char dv[]= {
648 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
653 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
654 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
655 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
656 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
657 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
658 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
659 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 /* X0201 kana conversion table for han-dakuten */
669 unsigned char ev[]= {
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
677 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
678 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
679 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
681 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
682 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
683 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
684 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
685 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
689 /* X0208 kigou conversion table */
690 /* 0x8140 - 0x819e */
692 unsigned char fv[] = {
694 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
695 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
696 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
697 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
698 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
699 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
700 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
702 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
703 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
704 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
705 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
711 STATIC int file_out_f = FALSE;
713 STATIC int overwrite_f = FALSE;
714 STATIC int preserve_time_f = FALSE;
715 STATIC int backup_f = FALSE;
716 STATIC char *backup_suffix = "";
717 STATIC char *get_backup_filename PROTO((const char *suffix, const char *filename));
720 STATIC int crmode_f = 0; /* CR, NL, CRLF */
721 #ifdef EASYWIN /*Easy Win */
722 STATIC int end_check;
725 #define STD_GC_BUFSIZE (256)
726 int std_gc_buf[STD_GC_BUFSIZE];
730 #include "nkf32dll.c"
731 #elif defined(PERL_XS)
741 char *outfname = NULL;
744 #ifdef EASYWIN /*Easy Win */
745 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
748 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
749 cp = (unsigned char *)*argv;
754 if (pipe(fds) < 0 || (pid = fork()) < 0){
765 execvp(argv[1], &argv[1]);
779 if(x0201_f == WISH_TRUE)
780 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
782 if (binmode_f == TRUE)
784 if (freopen("","wb",stdout) == NULL)
791 setbuf(stdout, (char *) NULL);
793 setvbuffer(stdout, stdobuf, IOBUF_SIZE);
796 if (binmode_f == TRUE)
798 if (freopen("","rb",stdin) == NULL) return (-1);
802 setvbuffer(stdin, stdibuf, IOBUF_SIZE);
806 kanji_convert(stdin);
807 if (guess_f) print_guessed_code(NULL);
812 is_inputcode_mixed = FALSE;
813 is_inputcode_set = FALSE;
818 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
827 /* reopen file for stdout */
828 if (file_out_f == TRUE) {
831 outfname = malloc(strlen(origfname)
832 + strlen(".nkftmpXXXXXX")
838 strcpy(outfname, origfname);
842 for (i = strlen(outfname); i; --i){
843 if (outfname[i - 1] == '/'
844 || outfname[i - 1] == '\\'){
850 strcat(outfname, "ntXXXXXX");
852 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
855 strcat(outfname, ".nkftmpXXXXXX");
856 fd = mkstemp(outfname);
859 || (fd_backup = dup(fileno(stdout))) < 0
860 || dup2(fd, fileno(stdout)) < 0
871 outfname = "nkf.out";
874 if(freopen(outfname, "w", stdout) == NULL) {
878 if (binmode_f == TRUE) {
880 if (freopen("","wb",stdout) == NULL)
887 if (binmode_f == TRUE)
889 if (freopen("","rb",fin) == NULL)
894 setvbuffer(fin, stdibuf, IOBUF_SIZE);
898 char *filename = NULL;
900 if (nfiles > 1) filename = origfname;
901 if (guess_f) print_guessed_code(filename);
907 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
915 if (dup2(fd_backup, fileno(stdout)) < 0){
918 if (stat(origfname, &sb)) {
919 fprintf(stderr, "Can't stat %s\n", origfname);
921 /* Restore the permissions */
922 if (chmod(outfname, sb.st_mode)) {
923 fprintf(stderr, "Can't set permission %s\n", outfname);
926 /* Restore the timestamp */
928 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
929 tb[0] = tb[1] = sb.st_mtime;
930 if (utime(outfname, tb)) {
931 fprintf(stderr, "Can't set timestamp %s\n", outfname);
934 tb.actime = sb.st_atime;
935 tb.modtime = sb.st_mtime;
936 if (utime(outfname, &tb)) {
937 fprintf(stderr, "Can't set timestamp %s\n", outfname);
942 char *backup_filename = get_backup_filename(backup_suffix, origfname);
944 unlink(backup_filename);
946 if (rename(origfname, backup_filename)) {
947 perror(backup_filename);
948 fprintf(stderr, "Can't rename %s to %s\n",
949 origfname, backup_filename);
953 if (unlink(origfname)){
958 if (rename(outfname, origfname)) {
960 fprintf(stderr, "Can't rename %s to %s\n",
961 outfname, origfname);
969 #ifdef EASYWIN /*Easy Win */
970 if (file_out_f == FALSE)
971 scanf("%d",&end_check);
974 #else /* for Other OS */
975 if (file_out_f == TRUE)
980 #endif /* WIN32DLL */
983 char *get_backup_filename(suffix, filename)
985 const char *filename;
987 char *backup_filename = NULL;
988 int asterisk_count = 0;
990 int filename_length = strlen(filename);
992 for(i = 0; suffix[i]; i++){
993 if(suffix[i] == '*') asterisk_count++;
997 backup_filename = malloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
998 if (!backup_filename){
999 perror("Can't malloc backup filename.");
1003 for(i = 0, j = 0; suffix[i];){
1004 if(suffix[i] == '*'){
1005 backup_filename[j] = '\0';
1006 strncat(backup_filename, filename, filename_length);
1008 j += filename_length;
1010 backup_filename[j++] = suffix[i++];
1013 backup_filename[j] = '\0';
1015 j = strlen(suffix) + filename_length;
1016 backup_filename = malloc( + 1);
1017 strcpy(backup_filename, filename);
1018 strcat(backup_filename, suffix);
1019 backup_filename[j] = '\0';
1021 return backup_filename;
1050 {"katakana-hiragana","h3"},
1057 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1058 {"internal-unicode", ""},
1060 #ifdef UTF8_OUTPUT_ENABLE
1070 {"fb-subchar=", ""},
1072 #ifdef UTF8_INPUT_ENABLE
1073 {"utf8-input", "W"},
1074 {"utf16-input", "W16"},
1075 {"no-cp932ext", ""},
1076 {"no-best-fit-chars",""},
1078 #ifdef UNICODE_NORMALIZATION
1079 {"utf8mac-input", ""},
1091 #ifdef NUMCHAR_OPTION
1092 {"numchar-input", ""},
1098 #ifdef SHIFTJIS_CP932
1108 STATIC int option_mode = 0;
1115 unsigned char *p = NULL;
1116 unsigned char *cp_back = NULL;
1117 unsigned char codeset[32];
1121 while(*cp && *cp++!='-');
1122 while (*cp || cp_back) {
1130 case '-': /* literal options */
1131 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1135 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1136 p = (unsigned char *)long_option[i].name;
1137 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1138 if (*p == cp[j] || cp[j] == ' '){
1145 while(*cp && *cp != SPACE && cp++);
1146 if (long_option[i].alias[0]){
1148 cp = (unsigned char *)long_option[i].alias;
1150 if (strcmp(long_option[i].name, "ic=") == 0){
1151 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1152 codeset[i] = nkf_toupper(p[i]);
1155 if(strcmp(codeset, "ISO-2022-JP") == 0){
1156 input_f = JIS_INPUT;
1157 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1158 input_f = SJIS_INPUT;
1159 if (x0201_f==NO_X0201) x0201_f=TRUE;
1160 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1161 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1162 strcmp(codeset, "CP932") == 0 ||
1163 strcmp(codeset, "MS932") == 0){
1164 input_f = SJIS_INPUT;
1166 #ifdef SHIFTJIS_CP932
1169 #ifdef UTF8_OUTPUT_ENABLE
1170 ms_ucs_map_f = UCS_MAP_CP932;
1172 }else if(strcmp(codeset, "EUCJP") == 0 ||
1173 strcmp(codeset, "EUC-JP") == 0){
1174 input_f = JIS_INPUT;
1175 }else if(strcmp(codeset, "CP51932") == 0){
1176 input_f = JIS_INPUT;
1178 #ifdef SHIFTJIS_CP932
1181 #ifdef UTF8_OUTPUT_ENABLE
1182 ms_ucs_map_f = UCS_MAP_CP932;
1184 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1185 strcmp(codeset, "EUCJP-MS") == 0 ||
1186 strcmp(codeset, "EUCJPMS") == 0){
1187 input_f = JIS_INPUT;
1189 #ifdef SHIFTJIS_CP932
1192 #ifdef UTF8_OUTPUT_ENABLE
1193 ms_ucs_map_f = UCS_MAP_MS;
1195 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1196 strcmp(codeset, "EUCJP-ASCII") == 0){
1197 input_f = JIS_INPUT;
1199 #ifdef SHIFTJIS_CP932
1202 #ifdef UTF8_OUTPUT_ENABLE
1203 ms_ucs_map_f = UCS_MAP_ASCII;
1205 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0){
1206 input_f = SJIS_INPUT;
1208 #ifdef SHIFTJIS_CP932
1212 if (x0201_f==NO_X0201) x0201_f=TRUE;
1213 }else if(strcmp(codeset, "EUC-JISX0213") == 0){
1214 input_f = JIS_INPUT;
1217 #ifdef SHIFTJIS_CP932
1221 #ifdef UTF8_INPUT_ENABLE
1222 }else if(strcmp(codeset, "UTF-8") == 0 ||
1223 strcmp(codeset, "UTF-8N") == 0 ||
1224 strcmp(codeset, "UTF-8-BOM") == 0){
1225 input_f = UTF8_INPUT;
1226 #ifdef UNICODE_NORMALIZATION
1227 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1228 strcmp(codeset, "UTF-8-MAC") == 0){
1229 input_f = UTF8_INPUT;
1232 }else if(strcmp(codeset, "UTF-16") == 0){
1233 input_f = UTF16BE_INPUT;
1234 utf16_mode = UTF16BE_INPUT;
1235 }else if(strcmp(codeset, "UTF-16BE") == 0 ||
1236 strcmp(codeset, "UTF-16BE-BOM") == 0){
1237 input_f = UTF16BE_INPUT;
1238 utf16_mode = UTF16BE_INPUT;
1239 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1240 strcmp(codeset, "UTF-16LE-BOM") == 0){
1241 input_f = UTF16LE_INPUT;
1242 utf16_mode = UTF16LE_INPUT;
1247 if (strcmp(long_option[i].name, "oc=") == 0){
1248 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1249 codeset[i] = nkf_toupper(p[i]);
1252 if(strcmp(codeset, "ISO-2022-JP") == 0){
1253 output_conv = j_oconv;
1254 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1255 output_conv = s_oconv;
1256 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1257 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1258 strcmp(codeset, "CP932") == 0 ||
1259 strcmp(codeset, "MS932") == 0){
1260 output_conv = s_oconv;
1262 #ifdef SHIFTJIS_CP932
1266 #ifdef UTF8_OUTPUT_ENABLE
1267 ms_ucs_map_f = UCS_MAP_CP932;
1269 }else if(strcmp(codeset, "EUCJP") == 0 ||
1270 strcmp(codeset, "EUC-JP") == 0){
1271 output_conv = e_oconv;
1272 }else if(strcmp(codeset, "CP51932") == 0){
1273 output_conv = e_oconv;
1275 #ifdef SHIFTJIS_CP932
1278 #ifdef UTF8_OUTPUT_ENABLE
1279 ms_ucs_map_f = UCS_MAP_CP932;
1281 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1282 strcmp(codeset, "EUCJP-MS") == 0 ||
1283 strcmp(codeset, "EUCJPMS") == 0){
1284 output_conv = e_oconv;
1289 #ifdef SHIFTJIS_CP932
1292 #ifdef UTF8_OUTPUT_ENABLE
1293 ms_ucs_map_f = UCS_MAP_MS;
1295 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1296 strcmp(codeset, "EUCJP-ASCII") == 0){
1297 output_conv = e_oconv;
1302 #ifdef SHIFTJIS_CP932
1305 #ifdef UTF8_OUTPUT_ENABLE
1306 ms_ucs_map_f = UCS_MAP_ASCII;
1308 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0){
1309 output_conv = s_oconv;
1311 #ifdef SHIFTJIS_CP932
1314 }else if(strcmp(codeset, "EUC-JISX0213") == 0){
1315 output_conv = e_oconv;
1320 #ifdef SHIFTJIS_CP932
1323 #ifdef UTF8_OUTPUT_ENABLE
1324 }else if(strcmp(codeset, "UTF-8") == 0){
1325 output_conv = w_oconv;
1326 }else if(strcmp(codeset, "UTF-8N") == 0){
1327 output_conv = w_oconv;
1329 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1330 output_conv = w_oconv;
1332 }else if(strcmp(codeset, "UTF-16BE") == 0){
1333 output_conv = w_oconv16;
1335 }else if(strcmp(codeset, "UTF-16") == 0 ||
1336 strcmp(codeset, "UTF-16BE-BOM") == 0){
1337 output_conv = w_oconv16;
1339 }else if(strcmp(codeset, "UTF-16LE") == 0){
1340 output_conv = w_oconv16;
1343 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1344 output_conv = w_oconv16;
1352 if (strcmp(long_option[i].name, "overwrite") == 0){
1355 preserve_time_f = TRUE;
1358 if (strcmp(long_option[i].name, "overwrite=") == 0){
1361 preserve_time_f = TRUE;
1363 backup_suffix = malloc(strlen(p) + 1);
1364 strcpy(backup_suffix, p);
1367 if (strcmp(long_option[i].name, "in-place") == 0){
1370 preserve_time_f = FALSE;
1373 if (strcmp(long_option[i].name, "in-place=") == 0){
1376 preserve_time_f = FALSE;
1378 backup_suffix = malloc(strlen(p) + 1);
1379 strcpy(backup_suffix, p);
1384 if (strcmp(long_option[i].name, "cap-input") == 0){
1388 if (strcmp(long_option[i].name, "url-input") == 0){
1393 #ifdef NUMCHAR_OPTION
1394 if (strcmp(long_option[i].name, "numchar-input") == 0){
1400 if (strcmp(long_option[i].name, "no-output") == 0){
1404 if (strcmp(long_option[i].name, "debug") == 0){
1409 if (strcmp(long_option[i].name, "cp932") == 0){
1410 #ifdef SHIFTJIS_CP932
1414 #ifdef UTF8_OUTPUT_ENABLE
1415 ms_ucs_map_f = UCS_MAP_CP932;
1419 if (strcmp(long_option[i].name, "no-cp932") == 0){
1420 #ifdef SHIFTJIS_CP932
1424 #ifdef UTF8_OUTPUT_ENABLE
1425 ms_ucs_map_f = UCS_MAP_ASCII;
1429 #ifdef SHIFTJIS_CP932
1430 if (strcmp(long_option[i].name, "cp932inv") == 0){
1437 if (strcmp(long_option[i].name, "x0212") == 0){
1444 if (strcmp(long_option[i].name, "exec-in") == 0){
1448 if (strcmp(long_option[i].name, "exec-out") == 0){
1453 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1454 if (strcmp(long_option[i].name, "internal-unicode") == 0){
1455 internal_unicode_f = TRUE;
1458 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1459 no_cp932ext_f = TRUE;
1462 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1463 no_best_fit_chars_f = TRUE;
1466 if (strcmp(long_option[i].name, "fb-skip") == 0){
1467 encode_fallback = NULL;
1470 if (strcmp(long_option[i].name, "fb-html") == 0){
1471 encode_fallback = encode_fallback_html;
1474 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1475 encode_fallback = encode_fallback_xml;
1478 if (strcmp(long_option[i].name, "fb-java") == 0){
1479 encode_fallback = encode_fallback_java;
1482 if (strcmp(long_option[i].name, "fb-perl") == 0){
1483 encode_fallback = encode_fallback_perl;
1486 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1487 encode_fallback = encode_fallback_subchar;
1490 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1491 encode_fallback = encode_fallback_subchar;
1492 unicode_subchar = 0;
1494 /* decimal number */
1495 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1496 unicode_subchar *= 10;
1497 unicode_subchar += hex2bin(p[i]);
1499 }else if(p[1] == 'x' || p[1] == 'X'){
1500 /* hexadecimal number */
1501 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1502 unicode_subchar <<= 4;
1503 unicode_subchar |= hex2bin(p[i]);
1507 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1508 unicode_subchar *= 8;
1509 unicode_subchar += hex2bin(p[i]);
1512 w16e_conv(unicode_subchar, &i, &j);
1513 unicode_subchar = i<<8 | j;
1517 #ifdef UTF8_OUTPUT_ENABLE
1518 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1519 ms_ucs_map_f = UCS_MAP_MS;
1523 #ifdef UNICODE_NORMALIZATION
1524 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1525 input_f = UTF8_INPUT;
1530 if (strcmp(long_option[i].name, "prefix=") == 0){
1531 if (' ' < p[0] && p[0] < 128){
1532 for (i = 1; ' ' < p[i] && p[i] < 128; i++){
1533 prefix_table[p[i]] = p[0];
1540 case 'b': /* buffered mode */
1543 case 'u': /* non-buffered mode */
1546 case 't': /* transparent mode */
1549 case 'j': /* JIS output */
1551 output_conv = j_oconv;
1553 case 'e': /* AT&T EUC output */
1554 output_conv = e_oconv;
1556 case 's': /* SJIS output */
1557 output_conv = s_oconv;
1559 case 'l': /* ISO8859 Latin-1 support, no conversion */
1560 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1561 input_f = LATIN1_INPUT;
1563 case 'i': /* Kanji IN ESC-$-@/B */
1564 if (*cp=='@'||*cp=='B')
1565 kanji_intro = *cp++;
1567 case 'o': /* ASCII IN ESC-(-J/B */
1568 if (*cp=='J'||*cp=='B'||*cp=='H')
1569 ascii_intro = *cp++;
1573 bit:1 katakana->hiragana
1574 bit:2 hiragana->katakana
1576 if ('9'>= *cp && *cp>='0')
1577 hira_f |= (*cp++ -'0');
1584 #if defined(MSDOS) || defined(__OS2__)
1599 #ifdef UTF8_OUTPUT_ENABLE
1600 case 'w': /* UTF-8 output */
1601 if ('1'== cp[0] && '6'==cp[1]) {
1602 output_conv = w_oconv16; cp+=2;
1604 unicode_bom_f=2; cp++;
1607 unicode_bom_f=1; cp++;
1609 } else if (cp[0] == 'B') {
1610 unicode_bom_f=2; cp++;
1612 unicode_bom_f=1; cp++;
1615 } else if (cp[0] == '8') {
1616 output_conv = w_oconv; cp++;
1619 unicode_bom_f=1; cp++;
1622 output_conv = w_oconv;
1625 #ifdef UTF8_INPUT_ENABLE
1626 case 'W': /* UTF-8 input */
1627 if ('1'== cp[0] && '6'==cp[1]) {
1628 input_f = UTF16BE_INPUT;
1629 utf16_mode = UTF16BE_INPUT;
1633 input_f = UTF16LE_INPUT;
1634 utf16_mode = UTF16LE_INPUT;
1635 } else if (cp[0] == 'B') {
1637 input_f = UTF16BE_INPUT;
1638 utf16_mode = UTF16BE_INPUT;
1640 } else if (cp[0] == '8') {
1642 input_f = UTF8_INPUT;
1644 input_f = UTF8_INPUT;
1647 /* Input code assumption */
1648 case 'J': /* JIS input */
1649 case 'E': /* AT&T EUC input */
1650 input_f = JIS_INPUT;
1652 case 'S': /* MS Kanji input */
1653 input_f = SJIS_INPUT;
1654 if (x0201_f==NO_X0201) x0201_f=TRUE;
1656 case 'Z': /* Convert X0208 alphabet to ASCII */
1657 /* bit:0 Convert X0208
1658 bit:1 Convert Kankaku to one space
1659 bit:2 Convert Kankaku to two spaces
1660 bit:3 Convert HTML Entity
1662 if ('9'>= *cp && *cp>='0')
1663 alpha_f |= 1<<(*cp++ -'0');
1667 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1668 x0201_f = FALSE; /* No X0201->X0208 conversion */
1670 ESC-(-I in JIS, EUC, MS Kanji
1671 SI/SO in JIS, EUC, MS Kanji
1672 SSO in EUC, JIS, not in MS Kanji
1673 MS Kanji (0xa0-0xdf)
1675 ESC-(-I in JIS (0x20-0x5f)
1676 SSO in EUC (0xa0-0xdf)
1677 0xa0-0xd in MS Kanji (0xa0-0xdf)
1680 case 'X': /* Assume X0201 kana */
1681 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1684 case 'F': /* preserve new lines */
1685 fold_preserve_f = TRUE;
1686 case 'f': /* folding -f60 or -f */
1689 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1691 fold_len += *cp++ - '0';
1693 if (!(0<fold_len && fold_len<BUFSIZ))
1694 fold_len = DEFAULT_FOLD;
1698 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1700 fold_margin += *cp++ - '0';
1704 case 'm': /* MIME support */
1705 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1706 if (*cp=='B'||*cp=='Q') {
1707 mime_decode_mode = *cp++;
1708 mimebuf_f = FIXED_MIME;
1709 } else if (*cp=='N') {
1710 mime_f = TRUE; cp++;
1711 } else if (*cp=='S') {
1712 mime_f = STRICT_MIME; cp++;
1713 } else if (*cp=='0') {
1714 mime_decode_f = FALSE;
1715 mime_f = FALSE; cp++;
1718 case 'M': /* MIME output */
1721 mimeout_f = FIXED_MIME; cp++;
1722 } else if (*cp=='Q') {
1724 mimeout_f = FIXED_MIME; cp++;
1729 case 'B': /* Broken JIS support */
1731 bit:1 allow any x on ESC-(-x or ESC-$-x
1732 bit:2 reset to ascii on NL
1734 if ('9'>= *cp && *cp>='0')
1735 broken_f |= 1<<(*cp++ -'0');
1740 case 'O':/* for Output file */
1744 case 'c':/* add cr code */
1747 case 'd':/* delete cr code */
1750 case 'I': /* ISO-2022-JP output */
1753 case 'L': /* line mode */
1754 if (*cp=='u') { /* unix */
1755 crmode_f = NL; cp++;
1756 } else if (*cp=='m') { /* mac */
1757 crmode_f = CR; cp++;
1758 } else if (*cp=='w') { /* windows */
1759 crmode_f = CRLF; cp++;
1760 } else if (*cp=='0') { /* no conversion */
1770 /* multiple options in one string are allowed, for the Perl module */
1771 while(*cp && *cp++!='-');
1774 /* bogus option but ignored */
1780 #ifdef ANSI_C_PROTOTYPE
1781 struct input_code * find_inputcode_byfunc(int (*iconv_func)(int c2,int c1,int c0))
1783 struct input_code * find_inputcode_byfunc(iconv_func)
1784 int (*iconv_func)();
1788 struct input_code *p = input_code_list;
1790 if (iconv_func == p->iconv_func){
1799 #ifdef ANSI_C_PROTOTYPE
1800 void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
1802 void set_iconv(f, iconv_func)
1804 int (*iconv_func)();
1807 #ifdef INPUT_CODE_FIX
1815 #ifdef INPUT_CODE_FIX
1816 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1822 if (estab_f && iconv_for_check != iconv){
1823 struct input_code *p = find_inputcode_byfunc(iconv);
1825 set_input_codename(p->name);
1826 debug(input_codename);
1828 iconv_for_check = iconv;
1833 #define SCORE_L2 (1) /* JIS level-2 kanji */
1834 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1835 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1836 #ifdef SHIFTJIS_CP932
1837 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character reinterpreted via CP932 */
1838 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1840 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1842 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1843 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1845 #define SCORE_INIT (SCORE_iMIME)
1847 const int score_table_A0[] = {
1850 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1851 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1854 const int score_table_F0[] = {
1855 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1856 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1857 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1858 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
1861 void set_code_score(ptr, score)
1862 struct input_code *ptr;
1866 ptr->score |= score;
1870 void clr_code_score(ptr, score)
1871 struct input_code *ptr;
1875 ptr->score &= ~score;
1879 void code_score(ptr)
1880 struct input_code *ptr;
1882 int c2 = ptr->buf[0];
1883 #ifdef UTF8_OUTPUT_ENABLE
1884 int c1 = ptr->buf[1];
1887 set_code_score(ptr, SCORE_ERROR);
1888 }else if (c2 == SSO){
1889 set_code_score(ptr, SCORE_KANA);
1890 #ifdef UTF8_OUTPUT_ENABLE
1891 }else if (!e2w_conv(c2, c1)){
1892 set_code_score(ptr, SCORE_NO_EXIST);
1894 }else if ((c2 & 0x70) == 0x20){
1895 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1896 }else if ((c2 & 0x70) == 0x70){
1897 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1898 }else if ((c2 & 0x70) >= 0x50){
1899 set_code_score(ptr, SCORE_L2);
1903 void status_disable(ptr)
1904 struct input_code *ptr;
1909 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
1912 void status_push_ch(ptr, c)
1913 struct input_code *ptr;
1916 ptr->buf[ptr->index++] = c;
1919 void status_clear(ptr)
1920 struct input_code *ptr;
1926 void status_reset(ptr)
1927 struct input_code *ptr;
1930 ptr->score = SCORE_INIT;
1933 void status_reinit(ptr)
1934 struct input_code *ptr;
1937 ptr->_file_stat = 0;
1940 void status_check(ptr, c)
1941 struct input_code *ptr;
1944 if (c <= DEL && estab_f){
1949 void s_status(ptr, c)
1950 struct input_code *ptr;
1955 status_check(ptr, c);
1960 #ifdef NUMCHAR_OPTION
1961 }else if ((c & CLASS_MASK) == CLASS_UTF16){
1964 }else if (0xa1 <= c && c <= 0xdf){
1965 status_push_ch(ptr, SSO);
1966 status_push_ch(ptr, c);
1969 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
1971 status_push_ch(ptr, c);
1972 #ifdef SHIFTJIS_CP932
1974 && CP932_TABLE_BEGIN <= c && c <= CP932_TABLE_END){
1976 status_push_ch(ptr, c);
1977 #endif /* SHIFTJIS_CP932 */
1979 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
1981 status_push_ch(ptr, c);
1982 #endif /* X0212_ENABLE */
1984 status_disable(ptr);
1988 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
1989 status_push_ch(ptr, c);
1990 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
1994 status_disable(ptr);
1998 #ifdef SHIFTJIS_CP932
1999 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2000 status_push_ch(ptr, c);
2001 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2002 set_code_score(ptr, SCORE_CP932);
2007 #endif /* SHIFTJIS_CP932 */
2008 #ifndef X0212_ENABLE
2009 status_disable(ptr);
2015 void e_status(ptr, c)
2016 struct input_code *ptr;
2021 status_check(ptr, c);
2026 #ifdef NUMCHAR_OPTION
2027 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2030 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2032 status_push_ch(ptr, c);
2034 }else if (0x8f == c){
2036 status_push_ch(ptr, c);
2037 #endif /* X0212_ENABLE */
2039 status_disable(ptr);
2043 if (0xa1 <= c && c <= 0xfe){
2044 status_push_ch(ptr, c);
2048 status_disable(ptr);
2053 if (0xa1 <= c && c <= 0xfe){
2055 status_push_ch(ptr, c);
2057 status_disable(ptr);
2059 #endif /* X0212_ENABLE */
2063 #ifdef UTF8_INPUT_ENABLE
2064 void w16_status(ptr, c)
2065 struct input_code *ptr;
2072 if (ptr->_file_stat == 0){
2073 if (c == 0xfe || c == 0xff){
2075 status_push_ch(ptr, c);
2076 ptr->_file_stat = 1;
2078 status_disable(ptr);
2079 ptr->_file_stat = -1;
2081 }else if (ptr->_file_stat > 0){
2083 status_push_ch(ptr, c);
2084 }else if (ptr->_file_stat < 0){
2085 status_disable(ptr);
2091 status_disable(ptr);
2092 ptr->_file_stat = -1;
2094 status_push_ch(ptr, c);
2101 if (ptr->stat != c && (c == 0xfe || c == 0xff)){
2102 status_push_ch(ptr, c);
2105 status_disable(ptr);
2106 ptr->_file_stat = -1;
2112 void w_status(ptr, c)
2113 struct input_code *ptr;
2118 status_check(ptr, c);
2123 #ifdef NUMCHAR_OPTION
2124 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2127 }else if (0xc0 <= c && c <= 0xdf){
2129 status_push_ch(ptr, c);
2130 }else if (0xe0 <= c && c <= 0xef){
2132 status_push_ch(ptr, c);
2134 status_disable(ptr);
2139 if (0x80 <= c && c <= 0xbf){
2140 status_push_ch(ptr, c);
2141 if (ptr->index > ptr->stat){
2142 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2143 && ptr->buf[2] == 0xbf);
2144 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2145 &ptr->buf[0], &ptr->buf[1]);
2152 status_disable(ptr);
2163 int action_flag = 1;
2164 struct input_code *result = 0;
2165 struct input_code *p = input_code_list;
2167 (p->status_func)(p, c);
2170 }else if(p->stat == 0){
2181 if (result && !estab_f){
2182 set_iconv(TRUE, result->iconv_func);
2183 }else if (c <= DEL){
2184 struct input_code *ptr = input_code_list;
2199 return std_gc_buf[--std_gc_ndx];
2210 if (std_gc_ndx == STD_GC_BUFSIZE){
2213 std_gc_buf[std_gc_ndx++] = c;
2227 #if !defined(PERL_XS) && !defined(WIN32DLL)
2234 while ((c = (*i_getc)(f)) != EOF)
2243 oconv = output_conv;
2246 /* replace continuation module, from output side */
2248 /* output redirection */
2250 if (noout_f || guess_f){
2257 if (mimeout_f == TRUE) {
2258 o_base64conv = oconv; oconv = base64_conv;
2260 /* base64_count = 0; */
2264 o_crconv = oconv; oconv = cr_conv;
2267 o_rot_conv = oconv; oconv = rot_conv;
2270 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2273 o_hira_conv = oconv; oconv = hira_conv;
2276 o_fconv = oconv; oconv = fold_conv;
2279 if (alpha_f || x0201_f) {
2280 o_zconv = oconv; oconv = z_conv;
2284 i_ungetc = std_ungetc;
2285 /* input redirection */
2288 i_cgetc = i_getc; i_getc = cap_getc;
2289 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2292 i_ugetc = i_getc; i_getc = url_getc;
2293 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2296 #ifdef NUMCHAR_OPTION
2298 i_ngetc = i_getc; i_getc = numchar_getc;
2299 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2302 #ifdef UNICODE_NORMALIZATION
2303 if (nfc_f && input_f == UTF8_INPUT){
2304 i_nfc_getc = i_getc; i_getc = nfc_getc;
2305 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2308 if (mime_f && mimebuf_f==FIXED_MIME) {
2309 i_mgetc = i_getc; i_getc = mime_getc;
2310 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2313 i_bgetc = i_getc; i_getc = broken_getc;
2314 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2316 if (input_f == JIS_INPUT || input_f == LATIN1_INPUT) {
2317 set_iconv(-TRUE, e_iconv);
2318 } else if (input_f == SJIS_INPUT) {
2319 set_iconv(-TRUE, s_iconv);
2320 #ifdef UTF8_INPUT_ENABLE
2321 } else if (input_f == UTF8_INPUT) {
2322 set_iconv(-TRUE, w_iconv);
2323 } else if (input_f == UTF16BE_INPUT) {
2324 set_iconv(-TRUE, w_iconv16);
2325 } else if (input_f == UTF16LE_INPUT) {
2326 set_iconv(-TRUE, w_iconv16);
2329 set_iconv(FALSE, e_iconv);
2333 struct input_code *p = input_code_list;
2341 Conversion main loop. Code detection only.
2350 int is_8bit = FALSE;
2352 module_connection();
2355 if(input_f == SJIS_INPUT
2356 #ifdef UTF8_INPUT_ENABLE
2357 || input_f == UTF8_INPUT || input_f == UTF16BE_INPUT || input_f == UTF16LE_INPUT
2365 output_mode = ASCII;
2368 #define NEXT continue /* no output, get next */
2369 #define SEND ; /* output c1 and c2, get next */
2370 #define LAST break /* end of loop, go closing */
2372 while ((c1 = (*i_getc)(f)) != EOF) {
2373 #ifdef INPUT_CODE_FIX
2380 /* in case of 8th bit is on */
2381 if (!estab_f&&!mime_decode_mode) {
2382 /* in case of not established yet */
2383 /* It is still ambiguous */
2384 if (h_conv(f, c2, c1)==EOF)
2390 /* in case of already established */
2392 /* ignore bogus code */
2398 /* second byte, 7 bit code */
2399 /* it might be kanji shifted */
2400 if ((c1 == DEL) || (c1 <= SPACE)) {
2401 /* ignore bogus first code */
2409 #ifdef UTF8_INPUT_ENABLE
2418 #ifdef NUMCHAR_OPTION
2419 } else if ((c1 & CLASS_MASK) == CLASS_UTF16){
2422 } else if (c1 > DEL) {
2424 if (!estab_f && !iso8859_f) {
2425 /* not established yet */
2426 if (!is_8bit) is_8bit = TRUE;
2429 } else { /* estab_f==TRUE */
2434 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2435 /* SJIS X0201 Case... */
2436 if(iso2022jp_f && x0201_f==NO_X0201) {
2437 (*oconv)(GETA1, GETA2);
2444 } else if (c1==SSO && iconv != s_iconv) {
2445 /* EUC X0201 Case */
2446 c1 = (*i_getc)(f); /* skip SSO */
2448 if (SSP<=c1 && c1<0xe0) {
2449 if(iso2022jp_f && x0201_f==NO_X0201) {
2450 (*oconv)(GETA1, GETA2);
2457 } else { /* bogus code, skip SSO and one byte */
2461 /* already established */
2466 } else if ((c1 > SPACE) && (c1 != DEL)) {
2467 /* in case of Roman characters */
2469 /* output 1 shifted byte */
2473 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2474 /* output 1 shifted byte */
2475 if(iso2022jp_f && x0201_f==NO_X0201) {
2476 (*oconv)(GETA1, GETA2);
2483 /* look like bogus code */
2486 } else if (input_mode == X0208) {
2487 /* in case of Kanji shifted */
2490 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2491 /* Check MIME code */
2492 if ((c1 = (*i_getc)(f)) == EOF) {
2495 } else if (c1 == '?') {
2496 /* =? is mime conversion start sequence */
2497 if(mime_f == STRICT_MIME) {
2498 /* check in real detail */
2499 if (mime_begin_strict(f) == EOF)
2503 } else if (mime_begin(f) == EOF)
2513 /* normal ASCII code */
2516 } else if (!is_8bit && c1 == SI) {
2519 } else if (!is_8bit && c1 == SO) {
2522 } else if (!is_8bit && c1 == ESC ) {
2523 if ((c1 = (*i_getc)(f)) == EOF) {
2524 /* (*oconv)(0, ESC); don't send bogus code */
2526 } else if (c1 == '$') {
2527 if ((c1 = (*i_getc)(f)) == EOF) {
2529 (*oconv)(0, ESC); don't send bogus code
2530 (*oconv)(0, '$'); */
2532 } else if (c1 == '@'|| c1 == 'B') {
2533 /* This is kanji introduction */
2536 set_input_codename("ISO-2022-JP");
2538 debug(input_codename);
2541 } else if (c1 == '(') {
2542 if ((c1 = (*i_getc)(f)) == EOF) {
2543 /* don't send bogus code
2549 } else if (c1 == '@'|| c1 == 'B') {
2550 /* This is kanji introduction */
2555 } else if (c1 == 'D'){
2559 #endif /* X0212_ENABLE */
2561 /* could be some special code */
2568 } else if (broken_f&0x2) {
2569 /* accept any ESC-(-x as broken code ... */
2579 } else if (c1 == '(') {
2580 if ((c1 = (*i_getc)(f)) == EOF) {
2581 /* don't send bogus code
2583 (*oconv)(0, '('); */
2587 /* This is X0201 kana introduction */
2588 input_mode = X0201; shift_mode = X0201;
2590 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2591 /* This is ASCII / JIS-Roman introduction (back to single-byte mode) */
2592 input_mode = ASCII; shift_mode = FALSE;
2594 } else if (broken_f&0x2) {
2595 input_mode = ASCII; shift_mode = FALSE;
2600 /* maintain various input_mode here */
2604 } else if ( c1 == 'N' || c1 == 'n' ){
2606 c3 = (*i_getc)(f); /* skip SS2 */
2607 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2622 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2623 input_mode = ASCII; set_iconv(FALSE, 0);
2625 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2626 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2634 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2635 if ((c1=(*i_getc)(f))!=EOF) {
2639 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2655 if (input_mode == X0208)
2656 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2658 else if (input_mode == X0212)
2659 (*oconv)((0x8f << 8) | c2, c1);
2660 #endif /* X0212_ENABLE */
2661 else if (input_mode)
2662 (*oconv)(input_mode, c1); /* other special case */
2663 else if ((*iconv)(c2, c1, 0) < 0){ /* can be EUC/SJIS */
2664 int c0 = (*i_getc)(f);
2667 (*iconv)(c2, c1, c0);
2673 /* goto next_word */
2677 (*iconv)(EOF, 0, 0);
2678 if (!is_inputcode_set)
2681 struct input_code *p = input_code_list;
2682 struct input_code *result = p;
2684 if (p->score < result->score) result = p;
2687 set_input_codename(result->name);
2702 /** it must NOT be in the kanji shifted sequence */
2703 /** it must NOT be written in JIS7 */
2704 /** and it must be after 2 byte 8bit code */
2711 while ((c1 = (*i_getc)(f)) != EOF) {
2717 if (push_hold_buf(c1) == EOF || estab_f){
2723 struct input_code *p = input_code_list;
2724 struct input_code *result = p;
2729 if (p->score < result->score){
2734 set_iconv(FALSE, result->iconv_func);
2739 ** 1) EOF is detected, or
2740 ** 2) Code is established, or
2741 ** 3) Buffer is FULL (but last word is pushed)
2743 ** in 1) and 3) cases, we continue to use
2744 ** Kanji codes by oconv and leave estab_f unchanged.
2749 while (wc < hold_count){
2750 c2 = hold_buf[wc++];
2752 #ifdef NUMCHAR_OPTION
2753 || (c2 & CLASS_MASK) == CLASS_UTF16
2758 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2759 (*iconv)(X0201, c2, 0);
2762 if (wc < hold_count){
2763 c1 = hold_buf[wc++];
2772 if ((*iconv)(c2, c1, 0) < 0){
2774 if (wc < hold_count){
2775 c0 = hold_buf[wc++];
2784 (*iconv)(c2, c1, c0);
2797 if (hold_count >= HOLD_SIZE*2)
2799 hold_buf[hold_count++] = c2;
2800 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
2803 const int shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
2805 int s2e_conv(c2, c1, p2, p1)
2809 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
2812 #ifdef SHIFTJIS_CP932
2813 if (cp51932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
2814 extern const unsigned short shiftjis_cp932[3][189];
2815 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
2821 #endif /* SHIFTJIS_CP932 */
2823 if (!x0213_f && x0212_f && 0xfa <= c2 && c2 <= 0xfc){
2824 extern const unsigned short shiftjis_x0212[3][189];
2825 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
2828 c2 = (0x8f << 8) | (val >> 8);
2841 if(x0213_f && c2 >= 0xF0){
2842 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
2843 c2 = 0x8F20 + shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
2844 }else{ /* 78<=k<=94 */
2845 c2 = 0x8F00 | (c2 * 2 - 0x17B);
2846 if (0x9E < c1) c2++;
2849 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
2850 if (0x9E < c1) c2++;
2853 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
2861 c2 = x0212_unshift(c2);
2876 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2879 int ret = s2e_conv(c2, c1, &c2, &c1);
2880 if (ret) return ret;
2894 }else if (c2 == 0x8f){
2898 c2 = (c2 << 8) | (c1 & 0x7f);
2900 #ifdef SHIFTJIS_CP932
2903 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2904 s2e_conv(s2, s1, &c2, &c1);
2905 if ((c2 & 0xff00) == 0){
2911 #endif /* SHIFTJIS_CP932 */
2912 #endif /* X0212_ENABLE */
2913 } else if (c2 == SSO){
2916 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2926 #ifdef UTF8_INPUT_ENABLE
2928 w2e_conv(c2, c1, c0, p2, p1)
2937 }else if (0xc0 <= c2 && c2 <= 0xef) {
2938 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2939 #ifdef NUMCHAR_OPTION
2942 if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
2957 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
2958 if(ignore_zwnbsp_f){
2959 ignore_zwnbsp_f = FALSE;
2960 if(c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
2964 if (c2 == 0) /* 0x00-0x7f */
2965 c1 &= 0x7F; /* 1byte */
2967 if ((c2 & 0xe0) == 0xc0){ /* 0xc0-0xdf */
2969 if((c2 & 0xFE) == 0xC0 || c1 < 0x80 || 0xBF < c1) return 0;
2970 }else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
2971 return -1; /* 3bytes */
2973 else if (0xf0 <= c2)
2974 return 0; /* 4,5,6bytes */
2975 else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
2976 return 0; /* trail byte */
2980 /* must be 3bytes */
2982 if(c1 < 0xA0 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
2984 }else if(c2 == 0xED){
2985 if(c1 < 0x80 || 0x9F < c1 || c0 < 0x80 || 0xBF < c0)
2987 }else if((c2 & 0xf0) == 0xe0){
2988 if(c1 < 0x80 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
2992 if (c2 == 0 || c2 == EOF){
2993 #ifdef UTF8_OUTPUT_ENABLE
2994 } else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
2995 unsigned short val = 0;
3000 val = ww16_conv(c2, c1, c0);
3001 c2 = (val >> 8) & 0xff;
3005 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3014 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3016 w16w_conv(val, p2, p1, p0)
3024 }else if (val < 0x800){
3025 *p2 = 0xc0 | (val >> 6);
3026 *p1 = 0x80 | (val & 0x3f);
3029 *p2 = 0xe0 | (val >> 12);
3030 *p1 = 0x80 | ((val >> 6) & 0x3f);
3031 *p0 = 0x80 | (val & 0x3f);
3036 #ifdef UTF8_INPUT_ENABLE
3038 ww16_conv(c2, c1, c0)
3044 }else if (c2 >= 0xe0){
3045 val = (c2 & 0x0f) << 12;
3046 val |= (c1 & 0x3f) << 6;
3048 }else if (c2 >= 0xc0){
3049 val = (c2 & 0x1f) << 6;
3058 w16e_conv(val, p2, p1)
3069 w16w_conv(val, &c2, &c1, &c0);
3070 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3071 #ifdef NUMCHAR_OPTION
3074 *p1 = CLASS_UTF16 | val;
3083 #ifdef UTF8_INPUT_ENABLE
3085 w_iconv16(c2, c1, c0)
3090 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
3091 if(ignore_zwnbsp_f){
3092 ignore_zwnbsp_f = FALSE;
3093 if (c2==0376 && c1==0377){
3094 utf16_mode = UTF16BE_INPUT;
3096 }else if(c2==0377 && c1==0376){
3097 utf16_mode = UTF16LE_INPUT;
3101 if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
3103 tmp=c1; c1=c2; c2=tmp;
3105 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3108 }else if((c2>>3)==27){ /* surrogate pair */
3110 #ifdef UTF8_OUTPUT_ENABLE
3111 }else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
3113 }else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
3114 if (ret) return ret;
3120 unicode_to_jis_common(c2, c1, c0, p2, p1)
3124 extern const unsigned short *const utf8_to_euc_2bytes[];
3125 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3126 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3127 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3128 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3129 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3130 const unsigned short *const *pp;
3131 const unsigned short *const *const *ppp;
3132 STATIC const int no_best_fit_chars_table_C2[] =
3133 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3134 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3135 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
3136 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0};
3137 STATIC const int no_best_fit_chars_table_C2_ascii[] =
3138 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3139 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3140 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3141 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3142 STATIC const int no_best_fit_chars_table_932_C2[] =
3143 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3144 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3145 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3146 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3147 STATIC const int no_best_fit_chars_table_932_C3[] =
3148 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3149 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3150 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3151 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3157 }else if(c2 < 0xe0){
3158 if(no_best_fit_chars_f){
3159 if(ms_ucs_map_f == UCS_MAP_CP932){
3162 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3165 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3168 }else if(cp51932_f){
3169 if(c2 == 0xC2 && no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3171 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ascii[c1&0x3F]) return 1;
3175 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3176 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3178 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3180 if(no_best_fit_chars_f){
3181 if(ms_ucs_map_f == UCS_MAP_CP932){
3182 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3183 }else if(ms_ucs_map_f == UCS_MAP_MS){
3188 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3191 if(c0 == 0x92) return 1;
3196 if(c1 == 0x80 || c0 == 0x9C) return 1;
3204 if(c0 == 0x95) return 1;
3207 if(c0 == 0xA5) return 1;
3214 if(c0 == 0x8D) return 1;
3217 if(c0 == 0x9E && cp51932_f) return 1;
3220 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3228 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3229 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3231 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3237 w_iconv_common(c1, c0, pp, psize, p2, p1)
3239 const unsigned short *const *pp;
3244 const unsigned short *p;
3247 if (pp == 0) return 1;
3250 if (c1 < 0 || psize <= c1) return 1;
3252 if (p == 0) return 1;
3255 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3257 if (val == 0) return 1;
3258 if (no_cp932ext_f && (
3259 (val>>8) == 0x2D || /* NEC special characters */
3260 val > 0xF300 /* NEC special characters */
3268 if (c2 == SO) c2 = X0201;
3276 nkf_each_char_to_hex(f, c)
3277 void (*f)PROTO((int c2,int c1));
3280 const char *hex = "0123456789ABCDEF";
3286 (*f)(0, hex[(c>>shift)&0xF]);
3297 encode_fallback_html(c)
3304 (*oconv)(0, 0x30+(c/1000000)%10);
3306 (*oconv)(0, 0x30+(c/100000 )%10);
3308 (*oconv)(0, 0x30+(c/10000 )%10);
3310 (*oconv)(0, 0x30+(c/1000 )%10);
3312 (*oconv)(0, 0x30+(c/100 )%10);
3314 (*oconv)(0, 0x30+(c/10 )%10);
3316 (*oconv)(0, 0x30+ c %10);
3322 encode_fallback_xml(c)
3328 nkf_each_char_to_hex(oconv, c);
3334 encode_fallback_java(c)
3337 const char *hex = "0123456789ABCDEF";
3339 if((c&0x00FFFFFF) > 0xFFFF){
3343 (*oconv)(0, hex[(c>>20)&0xF]);
3344 (*oconv)(0, hex[(c>>16)&0xF]);
3348 (*oconv)(0, hex[(c>>12)&0xF]);
3349 (*oconv)(0, hex[(c>> 8)&0xF]);
3350 (*oconv)(0, hex[(c>> 4)&0xF]);
3351 (*oconv)(0, hex[ c &0xF]);
3356 encode_fallback_perl(c)
3362 nkf_each_char_to_hex(oconv, c);
3368 encode_fallback_subchar(c)
3371 c = unicode_subchar;
3372 (*oconv)((c>>8)&0xFF, c&0xFF);
3378 (*oconv)(0, (c>>shift)&0xFF);
3389 #ifdef UTF8_OUTPUT_ENABLE
3394 extern const unsigned short euc_to_utf8_1byte[];
3395 extern const unsigned short *const euc_to_utf8_2bytes[];
3396 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3397 const unsigned short *p;
3400 p = euc_to_utf8_1byte;
3402 } else if (c2 >> 8 == 0x8f){
3403 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == 0x8F22 && c1 == 0x43){
3406 extern const unsigned short *const x0212_to_utf8_2bytes[];
3407 c2 = (c2&0x7f) - 0x21;
3408 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3409 p = x0212_to_utf8_2bytes[c2];
3415 c2 = (c2&0x7f) - 0x21;
3416 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3417 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3422 c1 = (c1 & 0x7f) - 0x21;
3423 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3440 if (unicode_bom_f==2) {
3447 #ifdef NUMCHAR_OPTION
3448 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3449 w16w_conv(c1, &c2, &c1, &c0);
3453 if (c0) (*o_putc)(c0);
3460 output_mode = ASCII;
3462 } else if (c2 == ISO8859_1) {
3463 output_mode = ISO8859_1;
3464 (*o_putc)(c1 | 0x080);
3467 #ifdef UTF8_INPUT_ENABLE
3468 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16))
3469 val = ((c2<<8)&0xff00) + c1;
3472 val = e2w_conv(c2, c1);
3474 w16w_conv(val, &c2, &c1, &c0);
3478 if (c0) (*o_putc)(c0);
3494 if (unicode_bom_f==2) {
3496 (*o_putc)((unsigned char)'\377');
3500 (*o_putc)((unsigned char)'\377');
3505 #ifdef UTF8_INPUT_ENABLE
3506 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16)){
3509 if (c2 == ISO8859_1) {
3512 #ifdef NUMCHAR_OPTION
3513 } else if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16) {
3514 c2 = (c1 >> 8) & 0xff;
3518 unsigned short val = e2w_conv(c2, c1);
3519 c2 = (val >> 8) & 0xff;
3538 #ifdef NUMCHAR_OPTION
3539 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3540 w16e_conv(c1, &c2, &c1);
3541 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3542 if(encode_fallback)(*encode_fallback)(c1);
3550 } else if (c2 == 0) {
3551 output_mode = ASCII;
3553 } else if (c2 == X0201) {
3554 output_mode = JAPANESE_EUC;
3555 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3556 } else if (c2 == ISO8859_1) {
3557 output_mode = ISO8859_1;
3558 (*o_putc)(c1 | 0x080);
3560 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3561 output_mode = JAPANESE_EUC;
3562 #ifdef SHIFTJIS_CP932
3565 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3566 s2e_conv(s2, s1, &c2, &c1);
3571 output_mode = ASCII;
3573 }else if ((c2 & 0xff00) >> 8 == 0x8f){
3576 (*o_putc)((c2 & 0x7f) | 0x080);
3577 (*o_putc)(c1 | 0x080);
3580 (*o_putc)((c2 & 0x7f) | 0x080);
3581 (*o_putc)(c1 | 0x080);
3585 if ((c1<0x21 || 0x7e<c1) ||
3586 (c2<0x21 || 0x7e<c2)) {
3587 set_iconv(FALSE, 0);
3588 return; /* too late to rescue this char */
3590 output_mode = JAPANESE_EUC;
3591 (*o_putc)(c2 | 0x080);
3592 (*o_putc)(c1 | 0x080);
3602 if ((ret & 0xff00) == 0x8f00){
3603 if (0x75 <= c && c <= 0x7f){
3604 ret = c + (0x109 - 0x75);
3607 if (0x75 <= c && c <= 0x7f){
3608 ret = c + (0x113 - 0x75);
3615 int x0212_unshift(c)
3619 if (0x7f <= c && c <= 0x88){
3620 ret = c + (0x75 - 0x7f);
3621 }else if (0x89 <= c && c <= 0x92){
3622 ret = (0x8f << 8) | 0x80 | (c + (0x75 - 0x89));
3626 #endif /* X0212_ENABLE */
3629 e2s_conv(c2, c1, p2, p1)
3630 int c2, c1, *p2, *p1;
3633 if ((c2 & 0xff00) == 0x8f00){
3636 if((0x21 <= ndx && ndx <= 0x2F)){
3637 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3638 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3640 }else if(0x6E <= ndx && ndx <= 0x7E){
3641 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3642 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3648 else if(0x21 <= ndx && ndx <= 0x7e){
3650 const unsigned short *ptr;
3651 extern const unsigned short *const x0212_shiftjis[];
3653 ptr = x0212_shiftjis[ndx - 0x21];
3655 val = ptr[(c1 & 0x7f) - 0x21];
3664 c2 = x0212_shift(c2);
3666 #endif /* X0212_ENABLE */
3668 if(0x7F < c2) return 1;
3669 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3670 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3679 #ifdef NUMCHAR_OPTION
3680 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3681 w16e_conv(c1, &c2, &c1);
3682 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3683 if(encode_fallback)(*encode_fallback)(c1);
3691 } else if (c2 == 0) {
3692 output_mode = ASCII;
3694 } else if (c2 == X0201) {
3695 output_mode = SHIFT_JIS;
3697 } else if (c2 == ISO8859_1) {
3698 output_mode = ISO8859_1;
3699 (*o_putc)(c1 | 0x080);
3701 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3702 output_mode = SHIFT_JIS;
3703 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3709 if ((c1<0x20 || 0x7e<c1) ||
3710 (c2<0x20 || 0x7e<c2)) {
3711 set_iconv(FALSE, 0);
3712 return; /* too late to rescue this char */
3714 output_mode = SHIFT_JIS;
3715 e2s_conv(c2, c1, &c2, &c1);
3717 #ifdef SHIFTJIS_CP932
3719 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3720 extern const unsigned short cp932inv[2][189];
3721 int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3727 #endif /* SHIFTJIS_CP932 */
3730 if (prefix_table[(unsigned char)c1]){
3731 (*o_putc)(prefix_table[(unsigned char)c1]);
3742 #ifdef NUMCHAR_OPTION
3743 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3744 w16e_conv(c1, &c2, &c1);
3745 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3746 if(encode_fallback)(*encode_fallback)(c1);
3752 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3755 (*o_putc)(ascii_intro);
3756 output_mode = ASCII;
3760 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3762 if(output_mode!=X0213_2){
3763 output_mode = X0213_2;
3766 if(output_mode!=X0212){
3767 output_mode = X0212;
3773 (*o_putc)(output_mode & 0x7F);
3774 (*o_putc)(c2 & 0x7f);
3777 } else if (c2==X0201) {
3778 if (output_mode!=X0201) {
3779 output_mode = X0201;
3785 } else if (c2==ISO8859_1) {
3786 /* iso8859 introduction, or 8th bit on */
3787 /* Can we convert in 7bit form using ESC-'-'-A ?
3789 output_mode = ISO8859_1;
3791 } else if (c2 == 0) {
3792 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3795 (*o_putc)(ascii_intro);
3796 output_mode = ASCII;
3801 if (output_mode!=X0213_1) {
3802 output_mode = X0213_1;
3806 (*o_putc)(output_mode & 0x7F);
3808 }else if (output_mode != X0208) {
3809 output_mode = X0208;
3812 (*o_putc)(kanji_intro);
3814 if (c1<0x20 || 0x7e<c1)
3816 if (c2<0x20 || 0x7e<c2)
3828 mime_prechar(c2, c1);
3829 (*o_base64conv)(c2,c1);
3833 STATIC int broken_buf[3]; /* pushback slots; the reader below pops them last-in first-out */
3834 STATIC int broken_counter = 0; /* number of characters currently held in broken_buf */
3835 STATIC int broken_last = 0; /* compared against ESC by the reader; presumably the previously returned character -- TODO confirm */
3842 if (broken_counter>0) {
3843 return broken_buf[--broken_counter];
3846 if (c=='$' && broken_last != ESC
3847 && (input_mode==ASCII || input_mode==X0201)) {
3850 if (c1=='@'|| c1=='B') {
3851 broken_buf[0]=c1; broken_buf[1]=c;
3858 } else if (c=='(' && broken_last != ESC
3859 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
3862 if (c1=='J'|| c1=='B') {
3863 broken_buf[0]=c1; broken_buf[1]=c;
3881 if (broken_counter<2)
3882 broken_buf[broken_counter++]=c;
3886 STATIC int prev_cr = 0;
3894 if (! (c2==0&&c1==NL) ) {
3900 } else if (c1=='\r') {
3902 } else if (c1=='\n') {
3903 if (crmode_f==CRLF) {
3904 (*o_crconv)(0,'\r');
3905 } else if (crmode_f==CR) {
3906 (*o_crconv)(0,'\r');
3910 } else if (c1!='\032' || crmode_f!=NL){
3916 Return value of fold_conv()
3918 \n add newline and output char
3919 \r add newline and output nothing
3922 1 (or else) normal output
3924 fold state in prev (previous character)
3926 >0x80 Japanese (X0208/X0201)
3931 This fold algorithm does not preserve leading space in a line.
3932 This is the main difference from fmt.
3935 #define char_size(c2,c1) (c2?2:1) /* width added to f_line per character: 2 when c2 is non-zero, else 1; c1 is unused and c2 is expanded unparenthesized */
3944 if (c1== '\r' && !fold_preserve_f) {
3945 fold_state=0; /* ignore cr */
3946 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
3948 fold_state=0; /* ignore cr */
3949 } else if (c1== BS) {
3950 if (f_line>0) f_line--;
3952 } else if (c2==EOF && f_line != 0) { /* close open last line */
3954 } else if ((c1=='\n' && !fold_preserve_f)
3955 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
3956 && fold_preserve_f)) {
3958 if (fold_preserve_f) {
3962 } else if ((f_prev == c1 && !fold_preserve_f)
3963 || (f_prev == '\n' && fold_preserve_f)
3964 ) { /* duplicate newline */
3967 fold_state = '\n'; /* output two newline */
3973 if (f_prev&0x80) { /* Japanese? */
3975 fold_state = 0; /* ignore given single newline */
3976 } else if (f_prev==' ') {
3980 if (++f_line<=fold_len)
3984 fold_state = '\r'; /* fold and output nothing */
3988 } else if (c1=='\f') {
3993 fold_state = '\n'; /* output newline and clear */
3994 } else if ( (c2==0 && c1==' ')||
3995 (c2==0 && c1=='\t')||
3996 (c2=='!'&& c1=='!')) {
3997 /* X0208 kankaku or ascii space */
3998 if (f_prev == ' ') {
3999 fold_state = 0; /* remove duplicate spaces */
4002 if (++f_line<=fold_len)
4003 fold_state = ' '; /* output ASCII space only */
4005 f_prev = ' '; f_line = 0;
4006 fold_state = '\r'; /* fold and output nothing */
4010 prev0 = f_prev; /* we still need this one... , but almost done */
4012 if (c2 || c2==X0201)
4013 f_prev |= 0x80; /* this is Japanese */
4014 f_line += char_size(c2,c1);
4015 if (f_line<=fold_len) { /* normal case */
4018 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4019 f_line = char_size(c2,c1);
4020 fold_state = '\n'; /* We can't wait, do fold now */
4021 } else if (c2==X0201) {
4022 /* simple kinsoku rules return 1 means no folding */
4023 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4024 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4025 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4026 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4027 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4028 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4029 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4031 fold_state = '\n';/* add one new f_line before this character */
4034 fold_state = '\n';/* add one new f_line before this character */
4037 /* kinsoku point in ASCII */
4038 if ( c1==')'|| /* { [ ( */
4049 /* just after special */
4050 } else if (!is_alnum(prev0)) {
4051 f_line = char_size(c2,c1);
4053 } else if ((prev0==' ') || /* ignored new f_line */
4054 (prev0=='\n')|| /* ignored new f_line */
4055 (prev0&0x80)) { /* X0208 - ASCII */
4056 f_line = char_size(c2,c1);
4057 fold_state = '\n';/* add one new f_line before this character */
4059 fold_state = 1; /* default no fold in ASCII */
4063 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4064 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4065 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4066 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4067 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4068 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4069 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4070 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4071 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4072 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4073 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4074 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4075 /* default no fold in kinsoku */
4078 f_line = char_size(c2,c1);
4079 /* add one new f_line before this character */
4082 f_line = char_size(c2,c1);
4084 /* add one new f_line before this character */
4089 /* terminator process */
4090 switch(fold_state) {
4109 int z_prev2=0,z_prev1=0;
4116 /* if (c2) c1 &= 0x7f; assertion */
4118 if (x0201_f && z_prev2==X0201) { /* X0201 */
4119 if (c1==(0xde&0x7f)) { /*
\e$BByE@
\e(B */
4121 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4123 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /*
\e$BH>ByE@
\e(B */
4125 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4129 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4138 if (x0201_f && c2==X0201) {
4139 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4140 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4141 z_prev1 = c1; z_prev2 = c2;
4144 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4149 /* JISX0208 Alphabet */
4150 if (alpha_f && c2 == 0x23 ) {
4152 } else if (alpha_f && c2 == 0x21 ) {
4153 /* JISX0208 Kigou */
4158 } else if (alpha_f&0x4) {
4163 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4169 case '>': entity = ">"; break;
4170 case '<': entity = "<"; break;
4171 case '\"': entity = """; break;
4172 case '&': entity = "&"; break;
4175 while (*entity) (*o_zconv)(0, *entity++);
4185 #define rot13(c) ( \
4187 (c <= 'M') ? (c + 13): \
4188 (c <= 'Z') ? (c - 13): \
4190 (c <= 'm') ? (c + 13): \
4191 (c <= 'z') ? (c - 13): \
4195 #define rot47(c) ( \
4197 ( c <= 'O' ) ? (c + 47) : \
4198 ( c <= '~' ) ? (c - 47) : \
4206 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4212 (*o_rot_conv)(c2,c1);
4219 if ((hira_f & 1) && c2==0x25 && 0x20<c1 && c1<0x74) {
4221 } else if ((hira_f & 2) && c2==0x24 && 0x20<c1 && c1<0x74) {
4224 (*o_hira_conv)(c2,c1);
4229 iso2022jp_check_conv(c2,c1)
4232 STATIC const int range[RANGE_NUM_MAX][2] = {
4255 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4259 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4264 for (i = 0; i < RANGE_NUM_MAX; i++) {
4265 start = range[i][0];
4268 if (c >= start && c <= end) {
4273 (*o_iso2022jp_check_conv)(c2,c1);
4277 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4279 const unsigned char *mime_pattern[] = {
4280 (const unsigned char *)"\075?EUC-JP?B?",
4281 (const unsigned char *)"\075?SHIFT_JIS?B?",
4282 (const unsigned char *)"\075?ISO-8859-1?Q?",
4283 (const unsigned char *)"\075?ISO-8859-1?B?",
4284 (const unsigned char *)"\075?ISO-2022-JP?B?",
4285 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4286 #if defined(UTF8_INPUT_ENABLE)
4287 (const unsigned char *)"\075?UTF-8?B?",
4288 (const unsigned char *)"\075?UTF-8?Q?",
4290 (const unsigned char *)"\075?US-ASCII?Q?",
4295 /*
\e$B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u
\e(B */
4296 int (*mime_priority_func[])PROTO((int c2, int c1, int c0)) = {
4297 e_iconv, s_iconv, 0, 0, 0, 0,
4298 #if defined(UTF8_INPUT_ENABLE)
4304 const int mime_encode[] = {
4305 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4306 #if defined(UTF8_INPUT_ENABLE)
4313 const int mime_encode_method[] = {
4314 'B', 'B','Q', 'B', 'B', 'Q',
4315 #if defined(UTF8_INPUT_ENABLE)
4323 #define MAXRECOVER 20 /* size of the MIME recovery buffer; also caps how many preamble characters are scanned */
4328 if (i_getc!=mime_getc) {
4329 i_mgetc = i_getc; i_getc = mime_getc;
4330 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4331 if(mime_f==STRICT_MIME) {
4332 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4333 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4339 unswitch_mime_getc()
4341 if(mime_f==STRICT_MIME) {
4342 i_mgetc = i_mgetc_buf;
4343 i_mungetc = i_mungetc_buf;
4346 i_ungetc = i_mungetc;
4347 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4348 mime_iconv_back = NULL;
4352 mime_begin_strict(f)
4357 const unsigned char *p,*q;
4358 int r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4360 mime_decode_mode = FALSE;
4361 /* =? has been checked */
4363 p = mime_pattern[j];
4366 for(i=2;p[i]>' ';i++) { /* start at =? */
4367 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4368 /* pattern fails, try next one */
4370 while ((p = mime_pattern[++j])) {
4371 for(k=2;k<i;k++) /* assume length(p) > i */
4372 if (p[k]!=q[k]) break;
4373 if (k==i && nkf_toupper(c1)==p[k]) break;
4375 if (p) continue; /* found next one, continue */
4376 /* all fails, output from recovery buffer */
4384 mime_decode_mode = p[i-2];
4386 mime_iconv_back = iconv;
4387 set_iconv(FALSE, mime_priority_func[j]);
4388 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4390 if (mime_decode_mode=='B') {
4391 mimebuf_f = unbuf_f;
4393 /* do MIME integrity check */
4394 return mime_integrity(f,mime_pattern[j]);
4406 /* we don't keep eof of Fifo, because it contains ?= as
4407 a terminator. It was checked in mime_integrity. */
4408 return ((mimebuf_f)?
4409 (*i_mgetc_buf)(f):Fifo(mime_input++));
4413 mime_ungetc_buf(c,f)
4418 (*i_mungetc_buf)(c,f);
4420 Fifo(--mime_input)=c;
4431 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4432 /* re-read and convert again from mime_buffer. */
4434 /* =? has been checked */
4436 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4437 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4438 /* We accept any character type even if it is broken by new lines */
4439 c1 = (*i_getc)(f); Fifo(mime_last++)= c1 ;
4440 if (c1=='\n'||c1==' '||c1=='\r'||
4441 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4443 /* Failed. But this could be another MIME preamble */
4451 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
4452 if (!(++i<MAXRECOVER) || c1==EOF) break;
4453 if (c1=='b'||c1=='B') {
4454 mime_decode_mode = 'B';
4455 } else if (c1=='q'||c1=='Q') {
4456 mime_decode_mode = 'Q';
4460 c1 = (*i_getc)(f); Fifo(mime_last++) = c1;
4461 if (!(++i<MAXRECOVER) || c1==EOF) break;
4463 mime_decode_mode = FALSE;
4469 if (!mime_decode_mode) {
4470 /* false MIME preamble, restart from mime_buffer */
4471 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4472 /* Since we are in MIME mode until buffer becomes empty, */
4473 /* we never go into mime_begin again for a while. */
4476 /* discard mime preamble, and go to MIME mode */
4478 /* do no MIME integrity check */
4479 return c1; /* used only for checking EOF */
4494 fprintf(stderr, "%s\n", str);
4500 set_input_codename (codename)
4505 strcmp(codename, "") != 0 &&
4506 strcmp(codename, input_codename) != 0)
4508 is_inputcode_mixed = TRUE;
4510 input_codename = codename;
4511 is_inputcode_set = TRUE;
4514 #if !defined(PERL_XS) && !defined(WIN32DLL)
4516 print_guessed_code (filename)
4519 char *codename = "BINARY";
4520 if (!is_inputcode_mixed) {
4521 if (strcmp(input_codename, "") == 0) {
4524 codename = input_codename;
4527 if (filename != NULL) printf("%s:", filename);
4528 printf("%s\n", codename);
4534 #ifdef ANSI_C_PROTOTYPE
4535 int hex_getc(int ch, FILE *f, int (*g)(FILE *f), int (*u)(int c, FILE *f))
4538 hex_getc(ch, f, g, u)
4551 if (!nkf_isxdigit(c2)){
4556 if (!nkf_isxdigit(c3)){
4561 return (hex2bin(c2) << 4) | hex2bin(c3);
4568 return hex_getc(':', f, i_cgetc, i_cungetc);
4576 return (*i_cungetc)(c, f);
4583 return hex_getc('%', f, i_ugetc, i_uungetc);
4591 return (*i_uungetc)(c, f);
4595 #ifdef NUMCHAR_OPTION
4600 int (*g)() = i_ngetc;
4601 int (*u)() = i_nungetc;
4612 if (buf[i] == 'x' || buf[i] == 'X'){
4613 for (j = 0; j < 5; j++){
4615 if (!nkf_isxdigit(buf[i])){
4622 c |= hex2bin(buf[i]);
4625 for (j = 0; j < 6; j++){
4629 if (!nkf_isdigit(buf[i])){
4636 c += hex2bin(buf[i]);
4642 return CLASS_UTF16 | c;
4652 numchar_ungetc(c, f)
4656 return (*i_nungetc)(c, f);
4660 #ifdef UNICODE_NORMALIZATION
4662 /* Normalization Form C */
4667 int (*g)() = i_nfc_getc;
4668 int (*u)() = i_nfc_ungetc;
4669 int i=0, j, k=1, lower, upper;
4671 const int *array = NULL;
4672 extern const struct normalization_pair normalization_table[];
4675 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4676 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4677 while (upper >= lower) {
4678 j = (lower+upper) / 2;
4679 array = normalization_table[j].nfd;
4680 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4681 if (array[k] != buf[k]){
4682 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4689 array = normalization_table[j].nfc;
4690 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4707 return (*i_nfc_ungetc)(c, f);
4709 #endif /* UNICODE_NORMALIZATION */
4716 int c1, c2, c3, c4, cc;
4717 int t1, t2, t3, t4, mode, exit_mode;
4721 int lwsp_size = 128;
4723 if (mime_top != mime_last) { /* Something is in FIFO */
4724 return Fifo(mime_top++);
4726 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4727 mime_decode_mode=FALSE;
4728 unswitch_mime_getc();
4729 return (*i_getc)(f);
4732 if (mimebuf_f == FIXED_MIME)
4733 exit_mode = mime_decode_mode;
4736 if (mime_decode_mode == 'Q') {
4737 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4739 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
4740 if (c1<=' ' || DEL<=c1) {
4741 mime_decode_mode = exit_mode; /* prepare for quit */
4744 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4748 mime_decode_mode = exit_mode; /* prepare for quit */
4749 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4750 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4751 /* end Q encoding */
4752 input_mode = exit_mode;
4754 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4755 if (lwsp_buf==NULL) {
4756 perror("can't malloc");
4759 while ((c1=(*i_getc)(f))!=EOF) {
4764 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4772 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
4773 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4788 lwsp_buf[lwsp_count] = c1;
4789 if (lwsp_count++>lwsp_size){
4791 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4792 if (lwsp_buf_new==NULL) {
4795 perror("can't realloc");
4798 lwsp_buf = lwsp_buf_new;
4804 if (lwsp_count > 0) {
4805 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4809 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4810 i_ungetc(lwsp_buf[lwsp_count],f);
4818 if (c1=='='&&c2<' ') { /* this is soft wrap */
4819 while((c1 = (*i_mgetc)(f)) <=' ') {
4820 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4822 mime_decode_mode = 'Q'; /* still in MIME */
4823 goto restart_mime_q;
4826 mime_decode_mode = 'Q'; /* still in MIME */
4830 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4831 if (c2<=' ') return c2;
4832 mime_decode_mode = 'Q'; /* still in MIME */
4833 #define hex(c) (('0'<=c&&c<='9')?(c-'0'):\
4834 ('A'<=c&&c<='F')?(c-'A'+10):('a'<=c&&c<='f')?(c-'a'+10):0)
4835 return ((hex(c2)<<4) + hex(c3));
4838 if (mime_decode_mode != 'B') {
4839 mime_decode_mode = FALSE;
4840 return (*i_mgetc)(f);
4844 /* Base64 encoding */
4846 MIME allows line breaks in the middle of
4847 Base64, but we are very pessimistic in decoding
4848 in unbuf mode because MIME encoded code may be broken by
4849 less or an editor's control sequence (such as ESC-[-K) in unbuffered
4850 mode. Ignore incomplete MIME.
4852 mode = mime_decode_mode;
4853 mime_decode_mode = exit_mode; /* prepare for quit */
4855 while ((c1 = (*i_mgetc)(f))<=' ') {
4860 if ((c2 = (*i_mgetc)(f))<=' ') {
4863 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4864 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4867 if ((c1 == '?') && (c2 == '=')) {
4870 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4871 if (lwsp_buf==NULL) {
4872 perror("can't malloc");
4875 while ((c1=(*i_getc)(f))!=EOF) {
4880 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4888 if ((c1=(*i_getc)(f))!=EOF) {
4892 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4907 lwsp_buf[lwsp_count] = c1;
4908 if (lwsp_count++>lwsp_size){
4910 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4911 if (lwsp_buf_new==NULL) {
4914 perror("can't realloc");
4917 lwsp_buf = lwsp_buf_new;
4923 if (lwsp_count > 0) {
4924 if (c1=='=' && (lwsp_buf[lwsp_count-1]==SPACE||lwsp_buf[lwsp_count-1]==TAB)) {
4928 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4929 i_ungetc(lwsp_buf[lwsp_count],f);
4938 if ((c3 = (*i_mgetc)(f))<=' ') {
4941 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4942 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4946 if ((c4 = (*i_mgetc)(f))<=' ') {
4949 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4950 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4954 mime_decode_mode = mode; /* still in MIME sigh... */
4956 /* BASE 64 decoding */
4958 t1 = 0x3f & base64decode(c1);
4959 t2 = 0x3f & base64decode(c2);
4960 t3 = 0x3f & base64decode(c3);
4961 t4 = 0x3f & base64decode(c4);
4962 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4964 Fifo(mime_last++) = cc;
4965 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4967 Fifo(mime_last++) = cc;
4968 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4970 Fifo(mime_last++) = cc;
4975 return Fifo(mime_top++);
4983 Fifo(--mime_top) = c;
4990 const unsigned char *p;
4994 /* In buffered mode, read until =? or NL or buffer full
4996 mime_input = mime_top;
4997 mime_last = mime_top;
4999 while(*p) Fifo(mime_input++) = *p++;
5002 while((c=(*i_getc)(f))!=EOF) {
5003 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5004 break; /* buffer full */
5006 if (c=='=' && d=='?') {
5007 /* checked. skip header, start decode */
5008 Fifo(mime_input++) = c;
5009 /* mime_last_input = mime_input; */
5014 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5016 /* Should we check length mod 4? */
5017 Fifo(mime_input++) = c;
5020 /* In case of Incomplete MIME, no MIME decode */
5021 Fifo(mime_input++) = c;
5022 mime_last = mime_input; /* point undecoded buffer */
5023 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5024 switch_mime_getc(); /* anyway we need buffered getc */
5035 i = c - 'A'; /* A..Z 0-25 */
5037 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5039 } else if (c > '/') {
5040 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5041 } else if (c == '+') {
5042 i = '>' /* 62 */ ; /* + 62 */
5044 i = '?' /* 63 */ ; /* / 63 */
5049 STATIC const char basis_64[] = /* Base64 alphabet: index 0..63 -> output character */
5050 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
5053 #define MIMEOUT_BUF_LENGTH (60) /* fill limit for mimeout_buf; the output code drains the buffer once the count exceeds it */
5054 char mimeout_buf[MIMEOUT_BUF_LENGTH+1]; /* characters held back before they are written out or MIME-encoded */
5055 int mimeout_buf_count = 0; /* number of characters currently stored in mimeout_buf */
5056 int mimeout_preserve_space = 0; /* flag tested, then reset to FALSE, by the code that emits the MIME header -- exact meaning TODO confirm */
5057 #define itoh4(c) (c>=10?c+'A'-10:c+'0') /* 0..15 -> uppercase hex digit; c is expanded unparenthesized, so pass a parenthesized expression */
5063 const unsigned char *p;
5066 p = mime_pattern[0];
5067 for(i=0;mime_encode[i];i++) {
5068 if (mode == mime_encode[i]) {
5069 p = mime_pattern[i];
5073 mimeout_mode = mime_encode_method[i];
5076 if (base64_count>45) {
5077 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5078 (*o_mputc)(mimeout_buf[i]);
5084 if (!mimeout_preserve_space && mimeout_buf_count>0
5085 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5086 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5090 if (!mimeout_preserve_space) {
5091 for (;i<mimeout_buf_count;i++) {
5092 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5093 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5094 (*o_mputc)(mimeout_buf[i]);
5101 mimeout_preserve_space = FALSE;
5107 j = mimeout_buf_count;
5108 mimeout_buf_count = 0;
5110 mime_putc(mimeout_buf[i]);
5126 switch(mimeout_mode) {
5131 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5137 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5143 if (mimeout_f!=FIXED_MIME) {
5145 } else if (mimeout_mode != 'Q')
5154 switch(mimeout_mode) {
5159 } else if(!nkf_isalnum(c)) {
5161 (*o_mputc)(itoh4(((c>>4)&0xf)));
5162 (*o_mputc)(itoh4((c&0xf)));
5171 (*o_mputc)(basis_64[c>>2]);
5176 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5182 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5183 (*o_mputc)(basis_64[c & 0x3F]);
5194 int mime_lastchar2, mime_lastchar1;
5196 void mime_prechar(c2, c1)
5201 if (base64_count + mimeout_buf_count/3*4> 66){
5202 (*o_base64conv)(EOF,0);
5203 (*o_base64conv)(0,NL);
5204 (*o_base64conv)(0,SPACE);
5206 }/*else if (mime_lastchar2){
5207 if (c1 <=DEL && !nkf_isspace(c1)){
5208 (*o_base64conv)(0,SPACE);
5212 if (c2 && mime_lastchar2 == 0
5213 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5214 (*o_base64conv)(0,SPACE);
5217 mime_lastchar2 = c2;
5218 mime_lastchar1 = c1;
5229 if (mimeout_f == FIXED_MIME){
5230 if (mimeout_mode == 'Q'){
5231 if (base64_count > 71){
5232 if (c!=CR && c!=NL) {
5239 if (base64_count > 71){
5244 if (c == EOF) { /* c==EOF */
5248 if (c != EOF) { /* c!=EOF */
5254 /* mimeout_f != FIXED_MIME */
5256 if (c == EOF) { /* c==EOF */
5257 j = mimeout_buf_count;
5258 mimeout_buf_count = 0;
5261 /*if (nkf_isspace(mimeout_buf[i])){
5264 mimeout_addchar(mimeout_buf[i]);
5268 (*o_mputc)(mimeout_buf[i]);
5274 if (mimeout_mode=='Q') {
5275 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5287 if (mimeout_buf_count > 0){
5288 lastchar = mimeout_buf[mimeout_buf_count - 1];
5293 if (!mimeout_mode) {
5294 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5295 if (nkf_isspace(c)) {
5296 if (c==CR || c==NL) {
5299 for (i=0;i<mimeout_buf_count;i++) {
5300 (*o_mputc)(mimeout_buf[i]);
5301 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5308 mimeout_buf_count = 1;
5310 if (base64_count > 1
5311 && base64_count + mimeout_buf_count > 76){
5314 if (!nkf_isspace(mimeout_buf[0])){
5319 mimeout_buf[mimeout_buf_count++] = c;
5320 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5321 open_mime(output_mode);
5326 if (lastchar==CR || lastchar == NL){
5327 for (i=0;i<mimeout_buf_count;i++) {
5328 (*o_mputc)(mimeout_buf[i]);
5331 mimeout_buf_count = 0;
5333 if (lastchar==SPACE) {
5334 for (i=0;i<mimeout_buf_count-1;i++) {
5335 (*o_mputc)(mimeout_buf[i]);
5338 mimeout_buf[0] = SPACE;
5339 mimeout_buf_count = 1;
5341 open_mime(output_mode);
5344 /* mimeout_mode == 'B', 1, 2 */
5345 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5346 if (lastchar == CR || lastchar == NL){
5347 if (nkf_isblank(c)) {
5348 for (i=0;i<mimeout_buf_count;i++) {
5349 mimeout_addchar(mimeout_buf[i]);
5351 mimeout_buf_count = 0;
5352 } else if (SPACE<c && c<DEL) {
5354 for (i=0;i<mimeout_buf_count;i++) {
5355 (*o_mputc)(mimeout_buf[i]);
5358 mimeout_buf_count = 0;
5361 if (c==SPACE || c==TAB || c==CR || c==NL) {
5362 for (i=0;i<mimeout_buf_count;i++) {
5363 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5365 for (i=0;i<mimeout_buf_count;i++) {
5366 (*o_mputc)(mimeout_buf[i]);
5369 mimeout_buf_count = 0;
5372 mimeout_buf[mimeout_buf_count++] = c;
5373 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5375 for (i=0;i<mimeout_buf_count;i++) {
5376 (*o_mputc)(mimeout_buf[i]);
5379 mimeout_buf_count = 0;
5383 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5384 mimeout_buf[mimeout_buf_count++] = c;
5385 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5386 j = mimeout_buf_count;
5387 mimeout_buf_count = 0;
5389 mimeout_addchar(mimeout_buf[i]);
5396 if (mimeout_buf_count>0) {
5397 j = mimeout_buf_count;
5398 mimeout_buf_count = 0;
5400 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5402 mimeout_addchar(mimeout_buf[i]);
5408 (*o_mputc)(mimeout_buf[i]);
5410 open_mime(output_mode);
5417 #if defined(PERL_XS) || defined(WIN32DLL)
5422 struct input_code *p = input_code_list;
5435 mime_f = STRICT_MIME;
5436 mime_decode_f = FALSE;
5441 #if defined(MSDOS) || defined(__OS2__)
5446 iso2022jp_f = FALSE;
5447 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5448 ms_ucs_map_f = UCS_MAP_ASCII;
5450 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5451 internal_unicode_f = FALSE;
5453 #ifdef UTF8_INPUT_ENABLE
5454 no_cp932ext_f = FALSE;
5455 ignore_zwnbsp_f = TRUE;
5456 no_best_fit_chars_f = FALSE;
5457 encode_fallback = NULL;
5458 unicode_subchar = '?';
5460 #ifdef UTF8_OUTPUT_ENABLE
5464 #ifdef UNICODE_NORMALIZATION
5477 is_inputcode_mixed = FALSE;
5478 is_inputcode_set = FALSE;
5482 #ifdef SHIFTJIS_CP932
5492 for (i = 0; i < 256; i++){
5493 prefix_table[i] = 0;
5496 #ifdef UTF8_INPUT_ENABLE
5497 utf16_mode = UTF16BE_INPUT;
5499 mimeout_buf_count = 0;
5504 fold_preserve_f = FALSE;
5507 kanji_intro = DEFAULT_J;
5508 ascii_intro = DEFAULT_R;
5509 fold_margin = FOLD_MARGIN;
5510 output_conv = DEFAULT_CONV;
5511 oconv = DEFAULT_CONV;
5512 o_zconv = no_connection;
5513 o_fconv = no_connection;
5514 o_crconv = no_connection;
5515 o_rot_conv = no_connection;
5516 o_hira_conv = no_connection;
5517 o_base64conv = no_connection;
5518 o_iso2022jp_check_conv = no_connection;
5521 i_ungetc = std_ungetc;
5523 i_bungetc = std_ungetc;
5526 i_mungetc = std_ungetc;
5527 i_mgetc_buf = std_getc;
5528 i_mungetc_buf = std_ungetc;
5529 output_mode = ASCII;
5532 mime_decode_mode = FALSE;
5538 z_prev2=0,z_prev1=0;
5540 iconv_for_check = 0;
5542 input_codename = "";
5550 no_connection(c2,c1)
5553 no_connection2(c2,c1,0);
5557 no_connection2(c2,c1,c0)
5560 fprintf(stderr,"nkf internal module connection failure.\n");
5562 return 0; /* LINT */
5567 #define fprintf dllprintf
5572 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5573 fprintf(stderr,"Flags:\n");
5574 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5575 #ifdef DEFAULT_CODE_SJIS
5576 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5578 #ifdef DEFAULT_CODE_JIS
5579 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5581 #ifdef DEFAULT_CODE_EUC
5582 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5584 #ifdef DEFAULT_CODE_UTF8
5585 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5587 #ifdef UTF8_OUTPUT_ENABLE
5588 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5590 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5591 #ifdef UTF8_INPUT_ENABLE
5592 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5594 fprintf(stderr,"t no conversion\n");
5595 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5596 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5597 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5598 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5599 fprintf(stderr,"v Show this usage. V: show version\n");
5600 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5601 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5602 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5603 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5604 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5605 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5606 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5607 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5609 fprintf(stderr,"T Text mode output\n");
5611 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5612 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5613 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5614 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5615 fprintf(stderr,"\n");
5616 fprintf(stderr,"Long name options\n");
5617 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5618 fprintf(stderr," Specify the input or output codeset\n");
5619 fprintf(stderr," --fj --unix --mac --windows\n");
5620 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5621 fprintf(stderr," Convert for the system or code\n");
5622 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5623 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5624 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5626 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5628 #ifdef NUMCHAR_OPTION
5629 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5631 #ifdef UTF8_INPUT_ENABLE
5632 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5633 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5636 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5637 fprintf(stderr," Overwrite original listed files by filtered result\n");
5638 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5640 fprintf(stderr," -g --guess Guess the input code\n");
5641 fprintf(stderr," --help --version Show this help/the version\n");
5642 fprintf(stderr," For more information, see also man nkf\n");
5643 fprintf(stderr,"\n");
5650 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5651 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__)
5654 #if defined(MSDOS) && defined(__WIN16__)
5657 #if defined(MSDOS) && defined(__WIN32__)
5663 ,NKF_VERSION,NKF_RELEASE_DATE);
5664 fprintf(stderr,"\n%s\n",CopyRight);
5669 **
\e$B%Q%C%A@):n<T
\e(B
5670 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5671 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5672 ** ohta@src.ricoh.co.jp (Junn Ohta)
5673 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5674 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5675 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5676 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5677 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5678 ** GHG00637@nifty-serve.or.jp (COW)