1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.83 2005/11/20 23:04:23 naruse Exp $ */
43 #define NKF_VERSION "2.0.5"
44 #define NKF_RELEASE_DATE "2005-11-21"
48 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW, 2002-2005 Kono, Furukawa, Naruse"
55 ** USAGE: nkf [flags] [file]
58 ** b Output is buffered (DEFAULT)
59 ** u Output is unbuffered
63 ** j Output code is JIS 7 bit (DEFAULT SELECT)
64 ** s Output code is MS Kanji (DEFAULT SELECT)
65 ** e Output code is AT&T JIS (DEFAULT SELECT)
66 ** w Output code is UTF-8 (DEFAULT SELECT)
67 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
69 ** m MIME conversion for ISO-2022-JP
70 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
71 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
72 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
73 ** M MIME output conversion
75 ** r {de/en}crypt ROT13/47
79 ** T Text mode output (for MS-DOS)
81 ** x Do not convert X0201 kana into X0208
82 ** Z Convert X0208 alphabet to ASCII
87 ** B try to fix broken JIS, missing Escape
88 ** B[1-9] broken level
90 ** O Output to 'nkf.out' file or last file name
91 ** d Delete \r in line feed
92 ** c Add \r in line feed
93 ** -- other long option
94 ** -- ignore following option (don't use with -O )
98 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__)) && !defined(MSDOS)
100 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
116 #if defined(MSDOS) || defined(__OS2__)
123 #define setbinmode(fp) fsetbin(fp)
124 #else /* Microsoft C, Turbo C */
125 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
127 #else /* UNIX,OS/2 */
128 #define setbinmode(fp)
131 #ifdef _IOFBF /* SysV and MSDOS, Windows */
132 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
134 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
137 /*Borland C++ 4.5 EasyWin*/
138 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
147 /* added by satoru@isoternet.org */
148 #include <sys/stat.h>
149 #ifndef MSDOS /* UNIX, OS/2 */
152 #else /* defined(MSDOS) */
154 #ifdef __BORLANDC__ /* BCC32 */
156 #else /* !defined(__BORLANDC__) */
157 #include <sys/utime.h>
158 #endif /* (__BORLANDC__) */
159 #else /* !defined(__WIN32__) */
160 #if defined(_MSC_VER) || defined(__MINGW32__) /* VC++, MinGW */
161 #include <sys/utime.h>
162 #elif defined(__TURBOC__) /* BCC */
164 #elif defined(LSI_C) /* LSI C */
165 #endif /* (__WIN32__) */
177 /* state of output_mode and input_mode
195 /* Input Assumption */
199 #define LATIN1_INPUT 6
201 #define STRICT_MIME 8
206 #define JAPANESE_EUC 10
210 #define UTF8_INPUT 13
211 #define UTF16BE_INPUT 14
212 #define UTF16LE_INPUT 15
/*
 * ASCII-only character classification helpers.
 *
 * These are function-like macros, so every use of a parameter is wrapped in
 * parentheses: a compound argument such as `flag ? 'a' : 'b'` or `c & 0x7f`
 * must be treated as a single operand instead of being re-associated with
 * the comparison and arithmetic operators inside the macro.
 *
 * The argument is still evaluated more than once; do not pass expressions
 * with side effects (e.g. `*p++`).
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
/* Numeric value of one digit character: '0'-'9' map to 0-9, letters map to
 * 10 and up ('a'/'A' -> 10).  No range check is done on the argument. */
#define hex2bin(x) ( nkf_isdigit(x) ? (x) - '0' : nkf_toupper(x) - 'A' + 10)
246 #define HOLD_SIZE 1024
247 #define IOBUF_SIZE 16384
249 #define DEFAULT_J 'B'
250 #define DEFAULT_R 'B'
252 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
253 #define SJ6394 0x0161 /* 63 - 94 ku offset */
255 #define RANGE_NUM_MAX 18
260 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
261 #define sizeof_euc_utf8 94
262 #define sizeof_euc_to_utf8_1byte 94
263 #define sizeof_euc_to_utf8_2bytes 94
264 #define sizeof_utf8_to_euc_C2 64
265 #define sizeof_utf8_to_euc_E5B8 64
266 #define sizeof_utf8_to_euc_2bytes 112
267 #define sizeof_utf8_to_euc_3bytes 16
270 /* MIME preprocessor */
272 #ifdef EASYWIN /*Easy Win */
273 extern POINT _BufferSize;
276 /* function prototype */
278 #ifdef ANSI_C_PROTOTYPE
280 #define STATIC static
294 void (*status_func)PROTO((struct input_code *, int));
295 int (*iconv_func)PROTO((int c2, int c1, int c0));
299 STATIC char *input_codename = "";
302 STATIC const char *CopyRight = COPY_RIGHT;
304 #if !defined(PERL_XS) && !defined(WIN32DLL)
305 STATIC int noconvert PROTO((FILE *f));
307 STATIC int kanji_convert PROTO((FILE *f));
308 STATIC int h_conv PROTO((FILE *f,int c2,int c1));
309 STATIC int push_hold_buf PROTO((int c2));
310 STATIC void set_iconv PROTO((int f, int (*iconv_func)(int c2,int c1,int c0)));
311 STATIC int s_iconv PROTO((int c2,int c1,int c0));
312 STATIC int s2e_conv PROTO((int c2, int c1, int *p2, int *p1));
313 STATIC int e_iconv PROTO((int c2,int c1,int c0));
314 #ifdef UTF8_INPUT_ENABLE
315 STATIC int strict_mapping_f = TRUE;
316 STATIC int disable_cp932ext_f = FALSE;
317 STATIC void encode_fallback_html PROTO((int c));
318 STATIC void encode_fallback_xml PROTO((int c));
319 STATIC void encode_fallback_java PROTO((int c));
320 STATIC void encode_fallback_perl PROTO((int c));
321 STATIC void encode_fallback_subchar PROTO((int c));
322 STATIC void (*encode_fallback)PROTO((int c)) = NULL;
323 STATIC int w2e_conv PROTO((int c2,int c1,int c0,int *p2,int *p1));
324 STATIC int w_iconv PROTO((int c2,int c1,int c0));
325 STATIC int w_iconv16 PROTO((int c2,int c1,int c0));
326 STATIC int unicode_to_jis_common PROTO((int c2,int c1,int c0,int *p2,int *p1));
327 STATIC int w_iconv_common PROTO((int c1,int c0,const unsigned short *const *pp,int psize,int *p2,int *p1));
328 STATIC int ww16_conv PROTO((int c2, int c1, int c0));
329 STATIC int w16e_conv PROTO((unsigned short val,int *p2,int *p1));
331 #ifdef UTF8_OUTPUT_ENABLE
332 STATIC int e2w_conv PROTO((int c2,int c1));
333 STATIC void w_oconv PROTO((int c2,int c1));
334 STATIC void w_oconv16 PROTO((int c2,int c1));
336 STATIC void e_oconv PROTO((int c2,int c1));
337 STATIC int e2s_conv PROTO((int c2, int c1, int *p2, int *p1));
338 STATIC void s_oconv PROTO((int c2,int c1));
339 STATIC void j_oconv PROTO((int c2,int c1));
340 STATIC void fold_conv PROTO((int c2,int c1));
341 STATIC void cr_conv PROTO((int c2,int c1));
342 STATIC void z_conv PROTO((int c2,int c1));
343 STATIC void rot_conv PROTO((int c2,int c1));
344 STATIC void hira_conv PROTO((int c2,int c1));
345 STATIC void base64_conv PROTO((int c2,int c1));
346 STATIC void iso2022jp_check_conv PROTO((int c2,int c1));
347 STATIC void no_connection PROTO((int c2,int c1));
348 STATIC int no_connection2 PROTO((int c2,int c1,int c0));
350 STATIC void code_score PROTO((struct input_code *ptr));
351 STATIC void code_status PROTO((int c));
353 STATIC void std_putc PROTO((int c));
354 STATIC int std_getc PROTO((FILE *f));
355 STATIC int std_ungetc PROTO((int c,FILE *f));
357 STATIC int broken_getc PROTO((FILE *f));
358 STATIC int broken_ungetc PROTO((int c,FILE *f));
360 STATIC int mime_begin PROTO((FILE *f));
361 STATIC int mime_getc PROTO((FILE *f));
362 STATIC int mime_ungetc PROTO((int c,FILE *f));
364 STATIC int mime_begin_strict PROTO((FILE *f));
365 STATIC int mime_getc_buf PROTO((FILE *f));
366 STATIC int mime_ungetc_buf PROTO((int c,FILE *f));
367 STATIC int mime_integrity PROTO((FILE *f,const unsigned char *p));
369 STATIC int base64decode PROTO((int c));
370 STATIC void mime_prechar PROTO((int c2, int c1));
371 STATIC void mime_putc PROTO((int c));
372 STATIC void open_mime PROTO((int c));
373 STATIC void close_mime PROTO(());
375 STATIC void usage PROTO(());
376 STATIC void version PROTO(());
378 STATIC void options PROTO((unsigned char *c));
379 #if defined(PERL_XS) || defined(WIN32DLL)
380 STATIC void reinit PROTO(());
385 #if !defined(PERL_XS) && !defined(WIN32DLL)
386 STATIC unsigned char stdibuf[IOBUF_SIZE];
387 STATIC unsigned char stdobuf[IOBUF_SIZE];
389 STATIC unsigned char hold_buf[HOLD_SIZE*2];
390 STATIC int hold_count;
392 /* MIME preprocessor fifo */
394 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
395 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
396 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
397 STATIC unsigned char mime_buf[MIME_BUF_SIZE];
398 STATIC unsigned int mime_top = 0;
399 STATIC unsigned int mime_last = 0; /* decoded */
400 STATIC unsigned int mime_input = 0; /* undecoded */
401 STATIC int (*mime_iconv_back)PROTO((int c2,int c1,int c0)) = NULL;
404 STATIC int unbuf_f = FALSE;
405 STATIC int estab_f = FALSE;
406 STATIC int nop_f = FALSE;
407 STATIC int binmode_f = TRUE; /* binary mode */
408 STATIC int rot_f = FALSE; /* ROT13/47 mode */
409 STATIC int hira_f = FALSE; /* hiragana/katakana conversion */
410 STATIC int input_f = FALSE; /* non fixed input code */
411 STATIC int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
412 STATIC int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
413 STATIC int mime_decode_f = FALSE; /* mime decode is explicitly on */
414 STATIC int mimebuf_f = FALSE; /* MIME buffered input */
415 STATIC int broken_f = FALSE; /* convert ESC-less broken JIS */
416 STATIC int iso8859_f = FALSE; /* ISO8859 through */
417 STATIC int mimeout_f = FALSE; /* base64 mode */
418 #if defined(MSDOS) || defined(__OS2__)
419 STATIC int x0201_f = TRUE; /* Assume JISX0201 kana */
421 STATIC int x0201_f = NO_X0201; /* Assume NO JISX0201 */
423 STATIC int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
424 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
425 STATIC int internal_unicode_f = FALSE; /* Internal Unicode Processing */
427 #ifdef UTF8_OUTPUT_ENABLE
428 STATIC int unicode_bom_f= 0; /* Output Unicode BOM */
429 STATIC int w_oconv16_LE = 0; /* utf-16 little endian */
430 STATIC int ms_ucs_map_f = FALSE; /* Microsoft UCS Mapping Compatible */
431 STATIC int unicode_subchar = '?'; /* the regular substitution character */
434 #ifdef UNICODE_NORMALIZATION
435 STATIC int nfc_f = FALSE;
436 STATIC int (*i_nfc_getc)PROTO((FILE *)) = std_getc; /* input of nfc_getc */
437 STATIC int (*i_nfc_ungetc)PROTO((int c ,FILE *f)) = std_ungetc;
438 STATIC int nfc_getc PROTO((FILE *f));
439 STATIC int nfc_ungetc PROTO((int c,FILE *f));
443 STATIC int cap_f = FALSE;
444 STATIC int (*i_cgetc)PROTO((FILE *)) = std_getc; /* input of cgetc */
445 STATIC int (*i_cungetc)PROTO((int c ,FILE *f)) = std_ungetc;
446 STATIC int cap_getc PROTO((FILE *f));
447 STATIC int cap_ungetc PROTO((int c,FILE *f));
449 STATIC int url_f = FALSE;
450 STATIC int (*i_ugetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
451 STATIC int (*i_uungetc)PROTO((int c ,FILE *f)) = std_ungetc;
452 STATIC int url_getc PROTO((FILE *f));
453 STATIC int url_ungetc PROTO((int c,FILE *f));
456 #ifdef NUMCHAR_OPTION
457 #define CLASS_MASK 0x0f000000
458 #define CLASS_UTF16 0x01000000
459 STATIC int numchar_f = FALSE;
460 STATIC int (*i_ngetc)PROTO((FILE *)) = std_getc; /* input of ngetc */
461 STATIC int (*i_nungetc)PROTO((int c ,FILE *f)) = std_ungetc;
462 STATIC int numchar_getc PROTO((FILE *f));
463 STATIC int numchar_ungetc PROTO((int c,FILE *f));
467 STATIC int noout_f = FALSE;
468 STATIC void no_putc PROTO((int c));
469 STATIC int debug_f = FALSE;
470 STATIC void debug PROTO((const char *str));
471 STATIC int (*iconv_for_check)() = 0;
474 STATIC int guess_f = FALSE;
476 STATIC void print_guessed_code PROTO((char *filename));
478 STATIC void set_input_codename PROTO((char *codename));
479 STATIC int is_inputcode_mixed = FALSE;
480 STATIC int is_inputcode_set = FALSE;
483 STATIC int exec_f = 0;
486 #ifdef SHIFTJIS_CP932
487 /* invert IBM extended characters to others
488 and controls some UCS mapping for Microsoft Code Page */
489 STATIC int cp51932_f = TRUE;
490 #define CP932_TABLE_BEGIN (0xfa)
491 #define CP932_TABLE_END (0xfc)
493 /* invert NEC-selected IBM extended characters to IBM extended characters */
494 STATIC int cp932inv_f = TRUE;
495 #define CP932INV_TABLE_BEGIN (0xed)
496 #define CP932INV_TABLE_END (0xee)
498 /* STATIC int cp932_conv PROTO((int c2, int c1)); */
499 #endif /* SHIFTJIS_CP932 */
502 STATIC int x0212_f = FALSE;
503 STATIC int x0212_shift PROTO((int c));
504 STATIC int x0212_unshift PROTO((int c));
507 STATIC unsigned char prefix_table[256];
509 STATIC void e_status PROTO((struct input_code *, int));
510 STATIC void s_status PROTO((struct input_code *, int));
512 #ifdef UTF8_INPUT_ENABLE
513 STATIC void w_status PROTO((struct input_code *, int));
514 STATIC void w16_status PROTO((struct input_code *, int));
515 STATIC int utf16_mode = UTF16BE_INPUT;
518 struct input_code input_code_list[] = {
519 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
520 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
521 #ifdef UTF8_INPUT_ENABLE
522 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
523 {"UTF-16", 0, 0, 0, {0, 0, 0}, w16_status, w_iconv16, 0},
528 STATIC int mimeout_mode = 0;
529 STATIC int base64_count = 0;
531 /* X0208 -> ASCII converter */
534 STATIC int f_line = 0; /* chars in line */
535 STATIC int f_prev = 0;
536 STATIC int fold_preserve_f = FALSE; /* preserve new lines */
537 STATIC int fold_f = FALSE;
538 STATIC int fold_len = 0;
541 STATIC unsigned char kanji_intro = DEFAULT_J;
542 STATIC unsigned char ascii_intro = DEFAULT_R;
546 #define FOLD_MARGIN 10
547 #define DEFAULT_FOLD 60
549 STATIC int fold_margin = FOLD_MARGIN;
553 #ifdef DEFAULT_CODE_JIS
554 # define DEFAULT_CONV j_oconv
556 #ifdef DEFAULT_CODE_SJIS
557 # define DEFAULT_CONV s_oconv
559 #ifdef DEFAULT_CODE_EUC
560 # define DEFAULT_CONV e_oconv
562 #ifdef DEFAULT_CODE_UTF8
563 # define DEFAULT_CONV w_oconv
566 /* process default */
567 STATIC void (*output_conv)PROTO((int c2,int c1)) = DEFAULT_CONV;
569 STATIC void (*oconv)PROTO((int c2,int c1)) = no_connection;
570 /* s_iconv or oconv */
571 STATIC int (*iconv)PROTO((int c2,int c1,int c0)) = no_connection2;
573 STATIC void (*o_zconv)PROTO((int c2,int c1)) = no_connection;
574 STATIC void (*o_fconv)PROTO((int c2,int c1)) = no_connection;
575 STATIC void (*o_crconv)PROTO((int c2,int c1)) = no_connection;
576 STATIC void (*o_rot_conv)PROTO((int c2,int c1)) = no_connection;
577 STATIC void (*o_hira_conv)PROTO((int c2,int c1)) = no_connection;
578 STATIC void (*o_base64conv)PROTO((int c2,int c1)) = no_connection;
579 STATIC void (*o_iso2022jp_check_conv)PROTO((int c2,int c1)) = no_connection;
581 /* STATIC redirections */
583 STATIC void (*o_putc)PROTO((int c)) = std_putc;
585 STATIC int (*i_getc)PROTO((FILE *f)) = std_getc; /* general input */
586 STATIC int (*i_ungetc)PROTO((int c,FILE *f)) =std_ungetc;
588 STATIC int (*i_bgetc)PROTO((FILE *)) = std_getc; /* input of bgetc */
589 STATIC int (*i_bungetc)PROTO((int c ,FILE *f)) = std_ungetc;
591 STATIC void (*o_mputc)PROTO((int c)) = std_putc ; /* output of mputc */
593 STATIC int (*i_mgetc)PROTO((FILE *)) = std_getc; /* input of mgetc */
594 STATIC int (*i_mungetc)PROTO((int c ,FILE *f)) = std_ungetc;
596 /* for strict mime */
597 STATIC int (*i_mgetc_buf)PROTO((FILE *)) = std_getc; /* input of mgetc_buf */
598 STATIC int (*i_mungetc_buf)PROTO((int c,FILE *f)) = std_ungetc;
601 STATIC int output_mode = ASCII, /* output kanji mode */
602 input_mode = ASCII, /* input kanji mode */
603 shift_mode = FALSE; /* TRUE shift out, or X0201 */
604 STATIC int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
606 /* X0201 / X0208 conversion tables */
608 /* X0201 kana conversion table */
611 unsigned char cv[]= {
612 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
613 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
614 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
615 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
616 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
617 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
618 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
619 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
620 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
621 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
622 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
623 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
624 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
625 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
626 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
627 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
631 /* X0201 kana conversion table for dakuten */
634 unsigned char dv[]= {
635 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
636 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
638 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
639 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
640 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
641 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
642 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
643 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
644 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
645 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
646 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
648 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
653 /* X0201 kana conversion table for han-dakuten */
656 unsigned char ev[]= {
657 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
658 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
668 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 /* X0208 kigou conversion table */
677 /* 0x8140 - 0x819e */
679 unsigned char fv[] = {
681 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
682 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
683 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
684 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
685 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
686 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
687 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
688 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
689 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
690 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
691 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
692 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
698 STATIC int file_out = FALSE;
700 STATIC int overwrite = FALSE;
703 STATIC int crmode_f = 0; /* CR, NL, CRLF */
704 #ifdef EASYWIN /*Easy Win */
705 STATIC int end_check;
708 #define STD_GC_BUFSIZE (256)
709 int std_gc_buf[STD_GC_BUFSIZE];
713 #include "nkf32dll.c"
714 #elif defined(PERL_XS)
724 char *outfname = NULL;
727 #ifdef EASYWIN /*Easy Win */
728 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
731 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
732 cp = (unsigned char *)*argv;
737 if (pipe(fds) < 0 || (pid = fork()) < 0){
748 execvp(argv[1], &argv[1]);
762 if(x0201_f == WISH_TRUE)
763 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
765 if (binmode_f == TRUE)
767 if (freopen("","wb",stdout) == NULL)
774 setbuf(stdout, (char *) NULL);
776 setvbuffer(stdout, stdobuf, IOBUF_SIZE);
779 if (binmode_f == TRUE)
781 if (freopen("","rb",stdin) == NULL) return (-1);
785 setvbuffer(stdin, stdibuf, IOBUF_SIZE);
789 kanji_convert(stdin);
790 if (guess_f) print_guessed_code(NULL);
795 is_inputcode_mixed = FALSE;
796 is_inputcode_set = FALSE;
801 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
810 /* reopen file for stdout */
811 if (file_out == TRUE) {
814 outfname = malloc(strlen(origfname)
815 + strlen(".nkftmpXXXXXX")
821 strcpy(outfname, origfname);
825 for (i = strlen(outfname); i; --i){
826 if (outfname[i - 1] == '/'
827 || outfname[i - 1] == '\\'){
833 strcat(outfname, "ntXXXXXX");
835 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC,
838 strcat(outfname, ".nkftmpXXXXXX");
839 fd = mkstemp(outfname);
842 || (fd_backup = dup(fileno(stdout))) < 0
843 || dup2(fd, fileno(stdout)) < 0
854 outfname = "nkf.out";
857 if(freopen(outfname, "w", stdout) == NULL) {
861 if (binmode_f == TRUE) {
863 if (freopen("","wb",stdout) == NULL)
870 if (binmode_f == TRUE)
872 if (freopen("","rb",fin) == NULL)
877 setvbuffer(fin, stdibuf, IOBUF_SIZE);
881 char *filename = NULL;
883 if (nfiles > 1) filename = origfname;
884 if (guess_f) print_guessed_code(filename);
890 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
898 if (dup2(fd_backup, fileno(stdout)) < 0){
901 if (stat(origfname, &sb)) {
902 fprintf(stderr, "Can't stat %s\n", origfname);
904 /* Restore the original file's permissions */
905 if (chmod(outfname, sb.st_mode)) {
906 fprintf(stderr, "Can't set permission %s\n", outfname);
909 /* Restore the original file's timestamp */
910 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
911 tb[0] = tb[1] = sb.st_mtime;
912 if (utime(outfname, tb)) {
913 fprintf(stderr, "Can't set timestamp %s\n", outfname);
916 tb.actime = sb.st_atime;
917 tb.modtime = sb.st_mtime;
918 if (utime(outfname, &tb)) {
919 fprintf(stderr, "Can't set timestamp %s\n", outfname);
923 if (unlink(origfname)){
927 if (rename(outfname, origfname)) {
929 fprintf(stderr, "Can't rename %s to %s\n",
930 outfname, origfname);
938 #ifdef EASYWIN /*Easy Win */
939 if (file_out == FALSE)
940 scanf("%d",&end_check);
943 #else /* for Other OS */
944 if (file_out == TRUE)
949 #endif /* WIN32DLL */
976 {"katakana-hiragana","h3"},
983 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
984 {"internal-unicode", ""},
986 #ifdef UTF8_OUTPUT_ENABLE
998 #ifdef UTF8_INPUT_ENABLE
1000 {"utf16-input", "W16"},
1001 {"disable-cp932ext", ""},
1002 {"strict-mapping", ""},
1004 #ifdef UNICODE_NORMALIZATION
1005 {"utf8mac-input", ""},
1014 #ifdef NUMCHAR_OPTION
1015 {"numchar-input", ""},
1021 #ifdef SHIFTJIS_CP932
1031 STATIC int option_mode = 0;
1038 unsigned char *p = NULL;
1039 unsigned char *cp_back = NULL;
1040 unsigned char codeset[32];
1044 while(*cp && *cp++!='-');
1045 while (*cp || cp_back) {
1053 case '-': /* literal options */
1054 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1058 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1059 p = (unsigned char *)long_option[i].name;
1060 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1061 if (*p == cp[j] || cp[j] == ' '){
1068 while(*cp && *cp != SPACE && cp++);
1069 if (long_option[i].alias[0]){
1071 cp = (unsigned char *)long_option[i].alias;
1073 if (strcmp(long_option[i].name, "ic=") == 0){
1074 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1075 codeset[i] = nkf_toupper(p[i]);
1078 if(strcmp(codeset, "ISO-2022-JP") == 0){
1079 input_f = JIS_INPUT;
1080 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1081 input_f = SJIS_INPUT;
1082 if (x0201_f==NO_X0201) x0201_f=TRUE;
1083 }else if(strcmp(codeset, "CP932") == 0){
1084 input_f = SJIS_INPUT;
1086 #ifdef SHIFTJIS_CP932
1090 #ifdef UTF8_OUTPUT_ENABLE
1091 ms_ucs_map_f = TRUE;
1093 }else if(strcmp(codeset, "EUCJP") == 0 ||
1094 strcmp(codeset, "EUC-JP") == 0){
1095 input_f = JIS_INPUT;
1096 }else if(strcmp(codeset, "CP51932") == 0){
1097 input_f = JIS_INPUT;
1099 #ifdef SHIFTJIS_CP932
1103 #ifdef UTF8_OUTPUT_ENABLE
1104 ms_ucs_map_f = TRUE;
1106 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1107 strcmp(codeset, "EUCJP-MS") == 0){
1108 input_f = JIS_INPUT;
1110 #ifdef SHIFTJIS_CP932
1114 #ifdef UTF8_OUTPUT_ENABLE
1115 ms_ucs_map_f = TRUE;
1117 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1118 strcmp(codeset, "EUCJP-ASCII") == 0){
1119 input_f = JIS_INPUT;
1121 #ifdef SHIFTJIS_CP932
1125 #ifdef UTF8_OUTPUT_ENABLE
1126 ms_ucs_map_f = FALSE;
1128 #ifdef UTF8_INPUT_ENABLE
1129 }else if(strcmp(codeset, "UTF-8") == 0 ||
1130 strcmp(codeset, "UTF-8N") == 0 ||
1131 strcmp(codeset, "UTF-8-BOM") == 0){
1132 input_f = UTF8_INPUT;
1133 #ifdef UNICODE_NORMALIZATION
1134 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1135 strcmp(codeset, "UTF-8-MAC") == 0){
1136 input_f = UTF8_INPUT;
1139 }else if(strcmp(codeset, "UTF-16") == 0){
1140 input_f = UTF16BE_INPUT;
1141 utf16_mode = UTF16BE_INPUT;
1142 }else if(strcmp(codeset, "UTF-16BE") == 0 ||
1143 strcmp(codeset, "UTF-16BE-BOM") == 0){
1144 input_f = UTF16BE_INPUT;
1145 utf16_mode = UTF16BE_INPUT;
1146 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1147 strcmp(codeset, "UTF-16LE-BOM") == 0){
1148 input_f = UTF16LE_INPUT;
1149 utf16_mode = UTF16LE_INPUT;
1154 if (strcmp(long_option[i].name, "oc=") == 0){
1155 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1156 codeset[i] = nkf_toupper(p[i]);
1159 if(strcmp(codeset, "ISO-2022-JP") == 0){
1160 output_conv = j_oconv;
1161 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1162 output_conv = s_oconv;
1163 }else if(strcmp(codeset, "CP932") == 0){
1164 output_conv = s_oconv;
1166 #ifdef SHIFTJIS_CP932
1170 #ifdef UTF8_OUTPUT_ENABLE
1171 ms_ucs_map_f = TRUE;
1173 }else if(strcmp(codeset, "EUCJP") == 0 ||
1174 strcmp(codeset, "EUC-JP") == 0){
1175 output_conv = e_oconv;
1176 }else if(strcmp(codeset, "CP51932") == 0){
1177 output_conv = e_oconv;
1179 #ifdef SHIFTJIS_CP932
1183 #ifdef UTF8_OUTPUT_ENABLE
1184 ms_ucs_map_f = TRUE;
1186 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1187 strcmp(codeset, "EUCJP-MS") == 0){
1188 output_conv = e_oconv;
1191 #ifdef SHIFTJIS_CP932
1194 #ifdef UTF8_OUTPUT_ENABLE
1195 ms_ucs_map_f = TRUE;
1197 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1198 strcmp(codeset, "EUCJP-ASCII") == 0){
1199 output_conv = e_oconv;
1202 #ifdef SHIFTJIS_CP932
1205 #ifdef UTF8_OUTPUT_ENABLE
1206 ms_ucs_map_f = FALSE;
1208 #ifdef UTF8_OUTPUT_ENABLE
1209 }else if(strcmp(codeset, "UTF-8") == 0){
1210 output_conv = w_oconv;
1211 }else if(strcmp(codeset, "UTF-8N") == 0){
1212 output_conv = w_oconv;
1214 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1215 output_conv = w_oconv;
1217 }else if(strcmp(codeset, "UTF-16BE") == 0){
1218 output_conv = w_oconv16;
1220 }else if(strcmp(codeset, "UTF-16") == 0 ||
1221 strcmp(codeset, "UTF-16BE-BOM") == 0){
1222 output_conv = w_oconv16;
1224 }else if(strcmp(codeset, "UTF-16LE") == 0){
1225 output_conv = w_oconv16;
1228 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1229 output_conv = w_oconv16;
1237 if (strcmp(long_option[i].name, "overwrite") == 0){
1244 if (strcmp(long_option[i].name, "cap-input") == 0){
1248 if (strcmp(long_option[i].name, "url-input") == 0){
1253 #ifdef NUMCHAR_OPTION
1254 if (strcmp(long_option[i].name, "numchar-input") == 0){
1260 if (strcmp(long_option[i].name, "no-output") == 0){
1264 if (strcmp(long_option[i].name, "debug") == 0){
1269 if (strcmp(long_option[i].name, "cp932") == 0){
1270 #ifdef SHIFTJIS_CP932
1274 #ifdef UTF8_OUTPUT_ENABLE
1275 ms_ucs_map_f = TRUE;
1279 if (strcmp(long_option[i].name, "no-cp932") == 0){
1280 #ifdef SHIFTJIS_CP932
1284 #ifdef UTF8_OUTPUT_ENABLE
1285 ms_ucs_map_f = FALSE;
1289 #ifdef SHIFTJIS_CP932
1290 if (strcmp(long_option[i].name, "cp932inv") == 0){
1297 if (strcmp(long_option[i].name, "x0212") == 0){
1304 if (strcmp(long_option[i].name, "exec-in") == 0){
1308 if (strcmp(long_option[i].name, "exec-out") == 0){
1313 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1314 if (strcmp(long_option[i].name, "internal-unicode") == 0){
1315 internal_unicode_f = TRUE;
1318 if (strcmp(long_option[i].name, "disable-cp932ext") == 0){
1319 disable_cp932ext_f = TRUE;
1322 if (strcmp(long_option[i].name, "fb-skip") == 0){
1323 encode_fallback = NULL;
1326 if (strcmp(long_option[i].name, "fb-html") == 0){
1327 encode_fallback = encode_fallback_html;
1330 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1331 encode_fallback = encode_fallback_xml;
1334 if (strcmp(long_option[i].name, "fb-java") == 0){
1335 encode_fallback = encode_fallback_java;
1338 if (strcmp(long_option[i].name, "fb-perl") == 0){
1339 encode_fallback = encode_fallback_perl;
1342 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1343 encode_fallback = encode_fallback_subchar;
1346 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1347 encode_fallback = encode_fallback_subchar;
1348 unicode_subchar = 0;
1350 /* decimal number */
1351 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1352 unicode_subchar *= 10;
1353 unicode_subchar += hex2bin(p[i]);
1355 }else if(p[1] == 'x' || p[1] == 'X'){
1356 /* hexadecimal number */
1357 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1358 unicode_subchar <<= 4;
1359 unicode_subchar |= hex2bin(p[i]);
1363 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1364 unicode_subchar *= 8;
1365 unicode_subchar += hex2bin(p[i]);
1368 w16e_conv(unicode_subchar, &i, &j);
1369 unicode_subchar = i<<8 | j;
1373 #ifdef UTF8_OUTPUT_ENABLE
1374 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1375 ms_ucs_map_f = TRUE;
1379 #ifdef UNICODE_NORMALIZATION
1380 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1381 input_f = UTF8_INPUT;
1386 if (strcmp(long_option[i].name, "prefix=") == 0){
1387 if (' ' < p[0] && p[0] < 128){
1388 for (i = 1; ' ' < p[i] && p[i] < 128; i++){
1389 prefix_table[p[i]] = p[0];
1396 case 'b': /* buffered mode */
1399 case 'u': /* non bufferd mode */
1402 case 't': /* transparent mode */
1405 case 'j': /* JIS output */
1407 output_conv = j_oconv;
1409 case 'e': /* AT&T EUC output */
1410 output_conv = e_oconv;
1412 case 's': /* SJIS output */
1413 output_conv = s_oconv;
1415 case 'l': /* ISO8859 Latin-1 support, no conversion */
1416 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1417 input_f = LATIN1_INPUT;
1419 case 'i': /* Kanji IN ESC-$-@/B */
1420 if (*cp=='@'||*cp=='B')
1421 kanji_intro = *cp++;
1423 case 'o': /* ASCII IN ESC-(-J/B */
1424 if (*cp=='J'||*cp=='B'||*cp=='H')
1425 ascii_intro = *cp++;
1429 bit:1 katakana->hiragana
1430 bit:2 hiragana->katakana
1432 if ('9'>= *cp && *cp>='0')
1433 hira_f |= (*cp++ -'0');
1440 #if defined(MSDOS) || defined(__OS2__)
1455 #ifdef UTF8_OUTPUT_ENABLE
1456 case 'w': /* UTF-8 output */
1457 if ('1'== cp[0] && '6'==cp[1]) {
1458 output_conv = w_oconv16; cp+=2;
1460 unicode_bom_f=2; cp++;
1463 unicode_bom_f=1; cp++;
1465 } else if (cp[0] == 'B') {
1466 unicode_bom_f=2; cp++;
1468 unicode_bom_f=1; cp++;
1471 } else if (cp[0] == '8') {
1472 output_conv = w_oconv; cp++;
1475 unicode_bom_f=1; cp++;
1478 output_conv = w_oconv;
1481 #ifdef UTF8_INPUT_ENABLE
1482 case 'W': /* UTF-8 input */
1483 if ('1'== cp[0] && '6'==cp[1]) {
1484 input_f = UTF16BE_INPUT;
1485 utf16_mode = UTF16BE_INPUT;
1489 input_f = UTF16LE_INPUT;
1490 utf16_mode = UTF16LE_INPUT;
1491 } else if (cp[0] == 'B') {
1493 input_f = UTF16BE_INPUT;
1494 utf16_mode = UTF16BE_INPUT;
1496 } else if (cp[0] == '8') {
1498 input_f = UTF8_INPUT;
1500 input_f = UTF8_INPUT;
1503 /* Input code assumption */
1504 case 'J': /* JIS input */
1505 case 'E': /* AT&T EUC input */
1506 input_f = JIS_INPUT;
1508 case 'S': /* MS Kanji input */
1509 input_f = SJIS_INPUT;
1510 if (x0201_f==NO_X0201) x0201_f=TRUE;
1512 case 'Z': /* Convert X0208 alphabet to asii */
1513 /* bit:0 Convert X0208
1514 bit:1 Convert Kankaku to one space
1515 bit:2 Convert Kankaku to two spaces
1516 bit:3 Convert HTML Entity
1518 if ('9'>= *cp && *cp>='0')
1519 alpha_f |= 1<<(*cp++ -'0');
1523 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1524 x0201_f = FALSE; /* No X0201->X0208 conversion */
1526 ESC-(-I in JIS, EUC, MS Kanji
1527 SI/SO in JIS, EUC, MS Kanji
1528 SSO in EUC, JIS, not in MS Kanji
1529 MS Kanji (0xa0-0xdf)
1531 ESC-(-I in JIS (0x20-0x5f)
1532 SSO in EUC (0xa0-0xdf)
1533 0xa0-0xd in MS Kanji (0xa0-0xdf)
1536 case 'X': /* Assume X0201 kana */
1537 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1540 case 'F': /* prserve new lines */
1541 fold_preserve_f = TRUE;
1542 case 'f': /* folding -f60 or -f */
1545 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1547 fold_len += *cp++ - '0';
1549 if (!(0<fold_len && fold_len<BUFSIZ))
1550 fold_len = DEFAULT_FOLD;
1554 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1556 fold_margin += *cp++ - '0';
1560 case 'm': /* MIME support */
1561 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1562 if (*cp=='B'||*cp=='Q') {
1563 mime_decode_mode = *cp++;
1564 mimebuf_f = FIXED_MIME;
1565 } else if (*cp=='N') {
1566 mime_f = TRUE; cp++;
1567 } else if (*cp=='S') {
1568 mime_f = STRICT_MIME; cp++;
1569 } else if (*cp=='0') {
1570 mime_decode_f = FALSE;
1571 mime_f = FALSE; cp++;
1574 case 'M': /* MIME output */
1577 mimeout_f = FIXED_MIME; cp++;
1578 } else if (*cp=='Q') {
1580 mimeout_f = FIXED_MIME; cp++;
1585 case 'B': /* Broken JIS support */
1587 bit:1 allow any x on ESC-(-x or ESC-$-x
1588 bit:2 reset to ascii on NL
1590 if ('9'>= *cp && *cp>='0')
1591 broken_f |= 1<<(*cp++ -'0');
1596 case 'O':/* for Output file */
1600 case 'c':/* add cr code */
1603 case 'd':/* delete cr code */
1606 case 'I': /* ISO-2022-JP output */
1609 case 'L': /* line mode */
1610 if (*cp=='u') { /* unix */
1611 crmode_f = NL; cp++;
1612 } else if (*cp=='m') { /* mac */
1613 crmode_f = CR; cp++;
1614 } else if (*cp=='w') { /* windows */
1615 crmode_f = CRLF; cp++;
1616 } else if (*cp=='0') { /* no conversion */
1626 /* multiple options in one string are allowed, for the Perl module */
1627 while(*cp && *cp++!='-');
1630 /* bogus option but ignored */
/*
 * find_inputcode_byfunc -- look up an entry of input_code_list by its
 * converter function.
 *
 * iconv_func is compared against the iconv_func member of the entry being
 * examined.  The header is written twice: an ANSI prototype form selected by
 * ANSI_C_PROTOTYPE and an old-style (K&R) form.
 *
 * NOTE(review): the #else/#endif lines, the iteration over the list and the
 * return statements are not visible in this excerpt -- confirm against the
 * full file.
 */
1636 #ifdef ANSI_C_PROTOTYPE
1637 struct input_code * find_inputcode_byfunc(int (*iconv_func)(int c2,int c1,int c0))
1639 struct input_code * find_inputcode_byfunc(iconv_func)
1640 int (*iconv_func)();
/* start at the head of input_code_list */
1644 struct input_code *p = input_code_list;
/* an entry matches when its converter pointer equals the argument */
1646 if (iconv_func == p->iconv_func){
/*
 * set_iconv -- takes a flag f and a converter function iconv_func
 * (called as (c2, c1, c0)).
 *
 * f == -TRUE means "FORCE" (see the INPUT_CODE_FIX condition below).
 * Callers in this file pass TRUE, FALSE or -TRUE.
 *
 * NOTE(review): the lines that actually assign from f / iconv_func are not
 * visible in this excerpt; presumably they update estab_f and iconv --
 * confirm against the full file.
 */
1655 #ifdef ANSI_C_PROTOTYPE
1656 void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
1658 void set_iconv(f, iconv_func)
1660 int (*iconv_func)();
1663 #ifdef INPUT_CODE_FIX
1671 #ifdef INPUT_CODE_FIX
/* with INPUT_CODE_FIX: only when forced, or when no input code was requested (input_f is zero) */
1672 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
/*
 * Once the code is established and the active converter differs from the one
 * last reported, look up its table entry, record its name and report it
 * through debug(); iconv_for_check then remembers the converter.
 */
1678 if (estab_f && iconv_for_check != iconv){
1679 struct input_code *p = find_inputcode_byfunc(iconv);
1681 set_input_codename(p->name);
1682 debug(input_codename);
1684 iconv_for_check = iconv;
/*
 * Score flags for input-code detection.  Each constant is the previous one
 * shifted left by one bit, so they can be OR-ed together in
 * input_code.score (see set_code_score / clr_code_score).
 */
1689 #define SCORE_L2 (1) /* JIS level-2 kanji */
1690 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1691 #define SCORE_DEPEND (SCORE_KANA << 1) /* machine-dependent characters */
1692 #ifdef SHIFTJIS_CP932
1693 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character remapped via CP932 */
1694 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
/* NOTE(review): the #else / #endif lines around the next definition are not visible in this excerpt */
1696 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1698 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1699 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* initial score of a candidate (assigned in status_reset) */
1701 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score table indexed by the low nibble of c2; code_score() consults it
 * when (c2 & 0x70) == 0x20.
 * NOTE(review): only part of the initializer is visible in this excerpt.
 */
1703 const int score_table_A0[] = {
1706 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1707 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score table indexed by the low nibble of c2; code_score() consults it
 * when (c2 & 0x70) == 0x70.  The closing brace of the initializer is not
 * visible in this excerpt.
 */
1710 const int score_table_F0[] = {
1711 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1712 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1713 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1714 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score -- OR the given score bit(s) into ptr->score.
 * NOTE(review): remaining lines of the body are not visible in this excerpt.
 */
1717 void set_code_score(ptr, score)
1718 struct input_code *ptr;
1722 ptr->score |= score;
/*
 * clr_code_score -- clear the given score bit(s) in ptr->score.
 * NOTE(review): remaining lines of the body are not visible in this excerpt.
 */
1726 void clr_code_score(ptr, score)
1727 struct input_code *ptr;
1731 ptr->score &= ~score;
/*
 * code_score -- add score flags to ptr based on the character held in
 * ptr->buf (buf[0] is used as c2, buf[1] as c1).
 *
 * NOTE(review): the first condition of the if/else chain and the closing
 * lines are not visible in this excerpt.
 */
1735 void code_score(ptr)
1736 struct input_code *ptr;
1738 int c2 = ptr->buf[0];
/* c1 is only needed for the e2w_conv() lookup below */
1739 #ifdef UTF8_OUTPUT_ENABLE
1740 int c1 = ptr->buf[1];
1743 set_code_score(ptr, SCORE_ERROR);
/* c2 == SSO: scored as kana */
1744 }else if (c2 == SSO){
1745 set_code_score(ptr, SCORE_KANA);
1746 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv() returned zero for this pair: scored as nonexistent */
1747 }else if (!e2w_conv(c2, c1)){
1748 set_code_score(ptr, SCORE_NO_EXIST);
/* bits 4-6 of c2 select a table; the low nibble of c2 indexes it */
1750 }else if ((c2 & 0x70) == 0x20){
1751 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1752 }else if ((c2 & 0x70) == 0x70){
1753 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
/* remaining values with (c2 & 0x70) >= 0x50 are scored as level-2 kanji */
1754 }else if ((c2 & 0x70) >= 0x50){
1755 set_code_score(ptr, SCORE_L2);
/*
 * status_disable -- called by the *_status functions when a byte does not
 * fit the candidate encoding.
 * NOTE(review): lines between the header and the statement below are not
 * visible in this excerpt.
 */
1759 void status_disable(ptr)
1760 struct input_code *ptr;
/* if this candidate's converter is the current iconv, call set_iconv(FALSE, 0) */
1765 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch -- store c at ptr->buf[ptr->index] and advance the index.
 * No bounds check is visible in this excerpt.
 */
1768 void status_push_ch(ptr, c)
1769 struct input_code *ptr;
1772 ptr->buf[ptr->index++] = c;
/*
 * status_clear -- takes one input_code entry.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * resets the per-character state of the entry -- confirm against the full file.
 */
1775 void status_clear(ptr)
1776 struct input_code *ptr;
/*
 * status_reset -- put the candidate's score back to SCORE_INIT.
 * NOTE(review): other lines of the body are not visible in this excerpt.
 */
1782 void status_reset(ptr)
1783 struct input_code *ptr;
1786 ptr->score = SCORE_INIT;
/*
 * status_reinit -- zero the candidate's _file_stat field.
 * NOTE(review): other lines of the body are not visible in this excerpt.
 */
1789 void status_reinit(ptr)
1790 struct input_code *ptr;
1793 ptr->_file_stat = 0;
/*
 * status_check -- called first by s_status, e_status and w_status for each
 * byte c.
 * NOTE(review): the action taken inside the condition is not visible in
 * this excerpt.
 */
1796 void status_check(ptr, c)
1797 struct input_code *ptr;
/* only acts on c <= DEL once the input code is established */
1800 if (c <= DEL && estab_f){
/*
 * s_status -- per-byte state tracker for one detection candidate;
 * presumably the Shift_JIS one (it converts with s2e_conv) -- confirm.
 *
 * Bytes are collected in ptr->buf with status_push_ch(); a byte that does
 * not fit leads to status_disable().
 *
 * NOTE(review): the switch/case lines, several branch heads and the
 * closing braces are not visible in this excerpt.
 */
1805 void s_status(ptr, c)
1806 struct input_code *ptr;
1811 status_check(ptr, c);
1816 #ifdef NUMCHAR_OPTION
1817 }else if ((c & CLASS_MASK) == CLASS_UTF16){
/* 0xa1..0xdf: pushed as the pair (SSO, c) */
1820 }else if (0xa1 <= c && c <= 0xdf){
1821 status_push_ch(ptr, SSO);
1822 status_push_ch(ptr, c);
/* 0x81..0x9f or 0xe0..0xef: pushed and kept pending */
1825 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
1827 status_push_ch(ptr, c);
1828 #ifdef SHIFTJIS_CP932
1830 && CP932_TABLE_BEGIN <= c && c <= CP932_TABLE_END){
1832 status_push_ch(ptr, c);
1833 #endif /* SHIFTJIS_CP932 */
/* 0xf0..0xfc is accepted only when x0212_f is set */
1835 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
1837 status_push_ch(ptr, c);
1838 #endif /* X0212_ENABLE */
1840 status_disable(ptr);
/* a following byte must be 0x40..0x7e or 0x80..0xfc; the pair is converted in place by s2e_conv */
1844 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
1845 status_push_ch(ptr, c);
1846 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
1850 status_disable(ptr);
1854 #ifdef SHIFTJIS_CP932
1855 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
1856 status_push_ch(ptr, c);
/* s2e_conv returning 0 here marks the candidate with SCORE_CP932 */
1857 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
1858 set_code_score(ptr, SCORE_CP932);
1863 #endif /* SHIFTJIS_CP932 */
1864 #ifndef X0212_ENABLE
1865 status_disable(ptr);
/*
 * e_status -- per-byte state tracker for one detection candidate;
 * presumably the EUC-JP one -- confirm.
 *
 * Accepted bytes are pushed with status_push_ch(); anything else leads to
 * status_disable().
 *
 * NOTE(review): the switch/case lines, several branch heads and the
 * closing braces are not visible in this excerpt.
 */
1871 void e_status(ptr, c)
1872 struct input_code *ptr;
1877 status_check(ptr, c);
1882 #ifdef NUMCHAR_OPTION
1883 }else if ((c & CLASS_MASK) == CLASS_UTF16){
/* SSO or 0xa1..0xfe is pushed */
1886 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
1888 status_push_ch(ptr, c);
/* 0x8f is pushed as well (the matching #ifdef line is not visible; the #endif below names X0212_ENABLE) */
1890 }else if (0x8f == c){
1892 status_push_ch(ptr, c);
1893 #endif /* X0212_ENABLE */
1895 status_disable(ptr);
/* a following byte must be 0xa1..0xfe */
1899 if (0xa1 <= c && c <= 0xfe){
1900 status_push_ch(ptr, c);
1904 status_disable(ptr);
1909 if (0xa1 <= c && c <= 0xfe){
1911 status_push_ch(ptr, c);
1913 status_disable(ptr);
1915 #endif /* X0212_ENABLE */
1919 #ifdef UTF8_INPUT_ENABLE
/*
 * w16_status -- per-byte state tracker for one detection candidate;
 * presumably the UTF-16 one -- confirm.
 *
 * ptr->_file_stat drives the decision:
 *    0  nothing seen yet: only 0xfe or 0xff is accepted (and _file_stat
 *       becomes 1); otherwise the candidate is disabled and _file_stat
 *       becomes -1
 *   >0  bytes are pushed
 *   <0  the candidate stays disabled
 *
 * NOTE(review): many lines (switch/case, else branches, closing braces) are
 * not visible in this excerpt, so the nesting of the later statements cannot
 * be determined from here.
 */
1920 void w16_status(ptr, c)
1921 struct input_code *ptr;
1928 if (ptr->_file_stat == 0){
1929 if (c == 0xfe || c == 0xff){
1931 status_push_ch(ptr, c);
1932 ptr->_file_stat = 1;
1934 status_disable(ptr);
1935 ptr->_file_stat = -1;
1937 }else if (ptr->_file_stat > 0){
1939 status_push_ch(ptr, c);
1940 }else if (ptr->_file_stat < 0){
1941 status_disable(ptr);
1947 status_disable(ptr);
1948 ptr->_file_stat = -1;
1950 status_push_ch(ptr, c);
/* a second mark byte is accepted only if it is 0xfe/0xff and differs from ptr->stat */
1957 if (ptr->stat != c && (c == 0xfe || c == 0xff)){
1958 status_push_ch(ptr, c);
1961 status_disable(ptr);
1962 ptr->_file_stat = -1;
/*
 * w_status -- per-byte state tracker for one detection candidate;
 * presumably the UTF-8 one (it converts with w2e_conv) -- confirm.
 *
 * NOTE(review): the switch/case lines, the statements that set ptr->stat
 * and the closing braces are not visible in this excerpt.
 */
1968 void w_status(ptr, c)
1969 struct input_code *ptr;
1974 status_check(ptr, c);
1979 #ifdef NUMCHAR_OPTION
1980 }else if ((c & CLASS_MASK) == CLASS_UTF16){
/* 0xc0..0xdf and 0xe0..0xef are pushed as the first byte of a sequence */
1983 }else if (0xc0 <= c && c <= 0xdf){
1985 status_push_ch(ptr, c);
1986 }else if (0xe0 <= c && c <= 0xef){
1988 status_push_ch(ptr, c);
1990 status_disable(ptr);
/* following bytes must be 0x80..0xbf */
1995 if (0x80 <= c && c <= 0xbf){
1996 status_push_ch(ptr, c);
/* enough bytes collected (index exceeds stat): convert in place with w2e_conv */
1997 if (ptr->index > ptr->stat){
/* bom is true when the three bytes are EF BB BF */
1998 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
1999 && ptr->buf[2] == 0xbf);
2000 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2001 &ptr->buf[0], &ptr->buf[1]);
2008 status_disable(ptr);
2019 int action_flag = 1;
2020 struct input_code *result = 0;
2021 struct input_code *p = input_code_list;
2023 (p->status_func)(p, c);
2026 }else if(p->stat == 0){
2037 if (result && !estab_f){
2038 set_iconv(TRUE, result->iconv_func);
2039 }else if (c <= DEL){
2040 struct input_code *ptr = input_code_list;
2055 return std_gc_buf[--std_gc_ndx];
2066 if (std_gc_ndx == STD_GC_BUFSIZE){
2069 std_gc_buf[std_gc_ndx++] = c;
2083 #if !defined(PERL_XS) && !defined(WIN32DLL)
2090 while ((c = (*i_getc)(f)) != EOF)
2099 oconv = output_conv;
2102 /* replace continuation module, from output side */
2104 /* output redirection */
2106 if (noout_f || guess_f){
2113 if (mimeout_f == TRUE) {
2114 o_base64conv = oconv; oconv = base64_conv;
2116 /* base64_count = 0; */
2120 o_crconv = oconv; oconv = cr_conv;
2123 o_rot_conv = oconv; oconv = rot_conv;
2126 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2129 o_hira_conv = oconv; oconv = hira_conv;
2132 o_fconv = oconv; oconv = fold_conv;
2135 if (alpha_f || x0201_f) {
2136 o_zconv = oconv; oconv = z_conv;
2140 i_ungetc = std_ungetc;
2141 /* input redirection */
2144 i_cgetc = i_getc; i_getc = cap_getc;
2145 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2148 i_ugetc = i_getc; i_getc = url_getc;
2149 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2152 #ifdef NUMCHAR_OPTION
2154 i_ngetc = i_getc; i_getc = numchar_getc;
2155 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2158 #ifdef UNICODE_NORMALIZATION
2159 if (nfc_f && input_f == UTF8_INPUT){
2160 i_nfc_getc = i_getc; i_getc = nfc_getc;
2161 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2164 if (mime_f && mimebuf_f==FIXED_MIME) {
2165 i_mgetc = i_getc; i_getc = mime_getc;
2166 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2169 i_bgetc = i_getc; i_getc = broken_getc;
2170 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2172 if (input_f == JIS_INPUT || input_f == LATIN1_INPUT) {
2173 set_iconv(-TRUE, e_iconv);
2174 } else if (input_f == SJIS_INPUT) {
2175 set_iconv(-TRUE, s_iconv);
2176 #ifdef UTF8_INPUT_ENABLE
2177 } else if (input_f == UTF8_INPUT) {
2178 set_iconv(-TRUE, w_iconv);
2179 } else if (input_f == UTF16BE_INPUT) {
2180 set_iconv(-TRUE, w_iconv16);
2181 } else if (input_f == UTF16LE_INPUT) {
2182 set_iconv(-TRUE, w_iconv16);
2185 set_iconv(FALSE, e_iconv);
2189 struct input_code *p = input_code_list;
2197 Conversion main loop. Code detection only.
2206 int is_8bit = FALSE;
2208 module_connection();
2213 output_mode = ASCII;
2216 #define NEXT continue /* no output, get next */
2217 #define SEND ; /* output c1 and c2, get next */
2218 #define LAST break /* end of loop, go closing */
2220 while ((c1 = (*i_getc)(f)) != EOF) {
2225 /* in case of 8th bit is on */
2226 if (!estab_f&&!mime_decode_mode) {
2227 /* in case of not established yet */
2228 /* It is still ambiguous */
2229 if (h_conv(f, c2, c1)==EOF)
2235 /* in case of already established */
2237 /* ignore bogus code */
2243 /* second byte, 7 bit code */
2244 /* it might be kanji shifted */
2245 if ((c1 == DEL) || (c1 <= SPACE)) {
2246 /* ignore bogus first code */
2254 #ifdef UTF8_INPUT_ENABLE
2263 #ifdef NUMCHAR_OPTION
2264 } else if ((c1 & CLASS_MASK) == CLASS_UTF16){
2267 } else if (c1 > DEL) {
2269 if (!estab_f && !iso8859_f) {
2270 /* not established yet */
2271 if (!is_8bit) is_8bit = TRUE;
2274 } else { /* estab_f==TRUE */
2279 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2280 /* SJIS X0201 Case... */
2281 if(iso2022jp_f && x0201_f==NO_X0201) {
2282 (*oconv)(GETA1, GETA2);
2289 } else if (c1==SSO && iconv != s_iconv) {
2290 /* EUC X0201 Case */
2291 c1 = (*i_getc)(f); /* skip SSO */
2293 if (SSP<=c1 && c1<0xe0) {
2294 if(iso2022jp_f && x0201_f==NO_X0201) {
2295 (*oconv)(GETA1, GETA2);
2302 } else { /* bogus code, skip SSO and one byte */
2306 /* already established */
2311 } else if ((c1 > SPACE) && (c1 != DEL)) {
2312 /* in case of Roman characters */
2314 /* output 1 shifted byte */
2318 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2319 /* output 1 shifted byte */
2320 if(iso2022jp_f && x0201_f==NO_X0201) {
2321 (*oconv)(GETA1, GETA2);
2328 /* look like bogus code */
2331 } else if (input_mode == X0208) {
2332 /* in case of Kanji shifted */
2335 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2336 /* Check MIME code */
2337 if ((c1 = (*i_getc)(f)) == EOF) {
2340 } else if (c1 == '?') {
2341 /* =? is mime conversion start sequence */
2342 if(mime_f == STRICT_MIME) {
2343 /* check in real detail */
2344 if (mime_begin_strict(f) == EOF)
2348 } else if (mime_begin(f) == EOF)
2358 /* normal ASCII code */
2361 } else if (!is_8bit && c1 == SI) {
2364 } else if (!is_8bit && c1 == SO) {
2367 } else if (!is_8bit && c1 == ESC ) {
2368 if ((c1 = (*i_getc)(f)) == EOF) {
2369 /* (*oconv)(0, ESC); don't send bogus code */
2371 } else if (c1 == '$') {
2372 if ((c1 = (*i_getc)(f)) == EOF) {
2374 (*oconv)(0, ESC); don't send bogus code
2375 (*oconv)(0, '$'); */
2377 } else if (c1 == '@'|| c1 == 'B') {
2378 /* This is kanji introduction */
2381 set_input_codename("ISO-2022-JP");
2383 debug(input_codename);
2386 } else if (c1 == '(') {
2387 if ((c1 = (*i_getc)(f)) == EOF) {
2388 /* don't send bogus code
2394 } else if (c1 == '@'|| c1 == 'B') {
2395 /* This is kanji introduction */
2400 } else if (c1 == 'D'){
2404 #endif /* X0212_ENABLE */
2406 /* could be some special code */
2413 } else if (broken_f&0x2) {
2414 /* accept any ESC-(-x as broken code ... */
2424 } else if (c1 == '(') {
2425 if ((c1 = (*i_getc)(f)) == EOF) {
2426 /* don't send bogus code
2428 (*oconv)(0, '('); */
2432 /* This is X0201 kana introduction */
2433 input_mode = X0201; shift_mode = X0201;
2435 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2436 /* This is X0208 kanji introduction */
2437 input_mode = ASCII; shift_mode = FALSE;
2439 } else if (broken_f&0x2) {
2440 input_mode = ASCII; shift_mode = FALSE;
2445 /* maintain various input_mode here */
2449 } else if ( c1 == 'N' || c1 == 'n' ){
2451 c3 = (*i_getc)(f); /* skip SS2 */
2452 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2467 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2468 input_mode = ASCII; set_iconv(FALSE, 0);
2470 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2471 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2479 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2480 if ((c1=(*i_getc)(f))!=EOF) {
2484 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2500 if (input_mode == X0208)
2501 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2503 else if (input_mode == X0212)
2504 (*oconv)((0x8f << 8) | c2, c1);
2505 #endif /* X0212_ENABLE */
2506 else if (input_mode)
2507 (*oconv)(input_mode, c1); /* other special case */
2508 else if ((*iconv)(c2, c1, 0) < 0){ /* can be EUC/SJIS */
2509 int c0 = (*i_getc)(f);
2512 (*iconv)(c2, c1, c0);
2518 /* goto next_word */
2522 (*iconv)(EOF, 0, 0);
2523 if (!is_inputcode_set)
2526 struct input_code *p = input_code_list;
2527 struct input_code *result = p;
2529 if (p->score < result->score) result = p;
2532 set_input_codename(result->name);
2547 /** it must NOT be in the kanji shift sequence */
2548 /** it must NOT be written in JIS7 */
2549 /** and it must be after 2 byte 8bit code */
2556 while ((c1 = (*i_getc)(f)) != EOF) {
2562 if (push_hold_buf(c1) == EOF || estab_f){
2568 struct input_code *p = input_code_list;
2569 struct input_code *result = p;
2574 if (p->score < result->score){
2579 set_iconv(FALSE, result->iconv_func);
2584 ** 1) EOF is detected, or
2585 ** 2) Code is established, or
2586 ** 3) Buffer is FULL (but last word is pushed)
2588 ** in 1) and 3) cases, we continue to use
2589 ** Kanji codes by oconv and leave estab_f unchanged.
2594 while (wc < hold_count){
2595 c2 = hold_buf[wc++];
2597 #ifdef NUMCHAR_OPTION
2598 || (c2 & CLASS_MASK) == CLASS_UTF16
2603 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2604 (*iconv)(X0201, c2, 0);
2607 if (wc < hold_count){
2608 c1 = hold_buf[wc++];
2617 if ((*iconv)(c2, c1, 0) < 0){
2619 if (wc < hold_count){
2620 c0 = hold_buf[wc++];
2629 (*iconv)(c2, c1, c0);
2642 if (hold_count >= HOLD_SIZE*2)
2644 hold_buf[hold_count++] = c2;
2645 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv -- convert the byte pair (c2, c1) and hand the result back
 * through p2 / p1.  Judging by the name and by its use in s_status this is
 * the Shift_JIS -> EUC conversion -- confirm.
 *
 * NOTE(review): parameter declarations, the handling of val after the table
 * lookups, the stores through p2/p1 and the return statements are not
 * visible in this excerpt.
 */
2648 int s2e_conv(c2, c1, p2, p1)
2652 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
2655 #ifdef SHIFTJIS_CP932
/* with cp51932_f set, rows CP932_TABLE_BEGIN..CP932_TABLE_END are looked up in shiftjis_cp932 */
2656 if (cp51932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
2657 extern const unsigned short shiftjis_cp932[3][189];
/* column index is c1 - 0x40; the table has 189 columns -- assumes callers keep c1 within 0x40..0xfc */
2658 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
2664 #endif /* SHIFTJIS_CP932 */
/* with x0212_f set, rows 0xfa..0xfc are looked up in shiftjis_x0212 */
2666 if (x0212_f && 0xfa <= c2 && c2 <= 0xfc){
2667 extern const unsigned short shiftjis_x0212[3][189];
2668 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
/* result carries the 0x8f marker in the high byte of c2 */
2671 c2 = (0x8f << 8) | (val >> 8);
/* arithmetic conversion of the pair */
2684 c2 = c2 + c2 - ((c2 <= 0x9f) ? SJ0162 : SJ6394);
2686 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1f);
2695 c2 = x0212_unshift(c2);
2710 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2713 int ret = s2e_conv(c2, c1, &c2, &c1);
2714 if (ret) return ret;
2728 }else if (c2 == 0x8f){
2732 c2 = (c2 << 8) | (c1 & 0x7f);
2734 #ifdef SHIFTJIS_CP932
2737 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2738 s2e_conv(s2, s1, &c2, &c1);
2739 if ((c2 & 0xff00) == 0){
2745 #endif /* SHIFTJIS_CP932 */
2746 #endif /* X0212_ENABLE */
2747 } else if (c2 == SSO){
2750 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2760 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv -- convert the byte sequence (c2, c1, c0) through
 * unicode_to_jis_common() and return its result via p2 / p1.
 *
 * NOTE(review): the return type, parameter declarations, the first branch
 * and the return statement are not visible in this excerpt.
 */
2762 w2e_conv(c2, c1, c0, p2, p1)
/* first byte 0xc0..0xef: table conversion */
2771 }else if (0xc0 <= c2 && c2 <= 0xef) {
2772 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2773 #ifdef NUMCHAR_OPTION
/* store the decoded 16-bit value tagged with CLASS_UTF16 (the guarding condition is not visible here) */
2776 if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
2791 if (c2 == 0) /* 0x00-0x7f */
2792 c1 &= 0x7F; /* 1byte */
2794 if ((c2 & 0xe0) == 0xc0){ /* 0xc0-0xdf */
2796 if((c2 & 0xFE) == 0xC0 || c1 < 0x80 || 0xBF < c1) return 0;
2797 }else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
2798 return -1; /* 3bytes */
2800 else if (0xf0 <= c2)
2801 return 0; /* 4,5,6bytes */
2802 else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
2803 return 0; /* trail byte */
2807 /* must be 3bytes */
2809 if(c1 < 0xA0 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
2811 }else if(c2 == 0xED){
2812 if(c1 < 0x80 || 0x9F < c1 || c0 < 0x80 || 0xBF < c0)
2814 }else if((c2 & 0xf0) == 0xe0){
2815 if(c1 < 0x80 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
2819 if (c2 == 0 || c2 == EOF);
2820 else if (c2 == 0xef && c1 == 0xbb && c0 == 0xbf) {
2821 return 0; /* throw BOM */
2822 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
2823 } else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
2824 unsigned short val = 0;
2829 val = ww16_conv(c2, c1, c0);
2830 c2 = (val >> 8) & 0xff;
2834 ret = w2e_conv(c2, c1, c0, &c2, &c1);
/*
 * w16w_conv -- encode val as UTF-8 bytes stored through p2, p1 and p0.
 *
 *   val < 0x800 : two bytes   110xxxxx 10xxxxxx            (p2, p1)
 *   otherwise   : three bytes 1110xxxx 10xxxxxx 10xxxxxx   (p2, p1, p0)
 *
 * NOTE(review): the first branch (before the "else if") and the lines
 * between the visible assignments are not visible in this excerpt.
 */
2843 w16w_conv(val, p2, p1, p0)
2851 }else if (val < 0x800){
2852 *p2 = 0xc0 | (val >> 6);
2853 *p1 = 0x80 | (val & 0x3f);
2856 *p2 = 0xe0 | (val >> 12);
2857 *p1 = 0x80 | ((val >> 6) & 0x3f);
2858 *p0 = 0x80 | (val & 0x3f);
/*
 * ww16_conv -- decode the UTF-8 byte sequence (c2, c1, c0) into a value val.
 *
 *   c2 >= 0xe0 : three-byte form, low 4 bits of c2 become bits 12-15
 *   c2 >= 0xc0 : two-byte form, low 5 bits of c2 become bits 6-10
 *
 * NOTE(review): the first branch, the lines that merge the last byte and
 * the return statement are not visible in this excerpt.
 */
2863 ww16_conv(c2, c1, c0)
2869 }else if (c2 >= 0xe0){
2870 val = (c2 & 0x0f) << 12;
2871 val |= (c1 & 0x3f) << 6;
2873 }else if (c2 >= 0xc0){
2874 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv -- convert the value val by first expanding it to UTF-8 bytes
 * with w16w_conv() and then running unicode_to_jis_common(); the result is
 * returned through p2 / p1.
 *
 * NOTE(review): declarations, any early branches, the condition guarding
 * the NUMCHAR_OPTION store and the return statement are not visible in this
 * excerpt.
 */
2883 w16e_conv(val, p2, p1)
2909 w16w_conv(val, &c2, &c1, &c0);
2910 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2911 #ifdef NUMCHAR_OPTION
/* hand back the raw value tagged with CLASS_UTF16 */
2914 *p1 = CLASS_UTF16 | val;
/*
 * w_iconv16 -- input converter installed for UTF16BE_INPUT and
 * UTF16LE_INPUT; called with a byte pair (c2, c1).
 *
 * NOTE(review): declarations, the bodies of several branches and the final
 * output call are not visible in this excerpt.
 */
2923 w_iconv16(c2, c1, c0)
/* byte-order mark: FE FF selects big-endian, FF FE selects little-endian */
2928 if (c2==0376 && c1==0377){
2929 utf16_mode = UTF16BE_INPUT;
2931 } else if (c2==0377 && c1==0376){
2932 utf16_mode = UTF16LE_INPUT;
/* in little-endian mode swap the two bytes so that c2 is the high byte */
2935 if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
2937 tmp=c1; c1=c2; c2=tmp;
/* values below 0x80 and EOF are handled separately (branch body not visible) */
2939 if ((c2==0 && c1 < 0x80) || c2==EOF) {
2943 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
/* empty statement: no table conversion when internal_unicode_f is set and the output is w_oconv / w_oconv16 */
2944 if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16));
2946 else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
2947 if (ret) return ret;
/*
 * unicode_to_jis_common -- map the UTF-8 byte sequence (c2, c1, c0) through
 * the utf8_to_euc_2bytes / utf8_to_euc_3bytes tables using
 * w_iconv_common(); the result is delivered through p2 / p1 and the status
 * is kept in ret.
 *
 * Special cases are filtered before the lookup depending on ms_ucs_map_f,
 * cp51932_f and strict_mapping_f.
 *
 * NOTE(review): declarations, switch heads, the actions taken for the
 * listed cases and the return statement are not visible in this excerpt.
 */
2953 unicode_to_jis_common(c2, c1, c0, p2, p1)
2957 extern const unsigned short *const utf8_to_euc_2bytes[];
2958 extern const unsigned short *const *const utf8_to_euc_3bytes[];
2962 if (ms_ucs_map_f && cp51932_f){
2963 /* CP932/CP51932: U+00A6 (BROKEN BAR) -> not 0x8fa2c3, but 0x7c */
2964 if(c2 == 0xC2 && c1 == 0xA6){
2969 }else if(strict_mapping_f){
2973 case 0xAB: case 0xAD: case 0xB2: case 0xB3:
2974 case 0xB5: case 0xB7: case 0xB9: case 0xBB:
/* two-byte sequences: lookup keyed by (c2, c1) */
2986 ret = w_iconv_common(c2, c1, utf8_to_euc_2bytes, sizeof_utf8_to_euc_2bytes, p2, p1);
/* E2 80 BE is U+203E */
2990 if(c2 == 0xE2 && c1 == 0x80 && c0 == 0xBE){
/* EF BD 9E is U+FF5E */
2994 }else if(c2 == 0xEF && c1 == 0xBD && c0 == 0x9E){
2995 if (p2) *p2 = 0x8F22;
/* empty statement: nothing is filtered when strict mapping is off */
3000 if(!strict_mapping_f);
3001 else if(ms_ucs_map_f && cp51932_f){
3002 /* Microsoft Code Page */
3008 case 0x94: case 0x96: case 0xBE:
/* E3 82 94 is U+3094 */
3029 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94)
/* three-byte sequences: c2 - 0xE0 selects the sub-table, lookup keyed by (c1, c0) */
3032 ret = w_iconv_common(c1, c0, utf8_to_euc_3bytes[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
/*
 * w_iconv_common -- two-level table lookup keyed by (c1, c0).
 *
 *   pp     table of row pointers
 *   psize  upper bound checked against c1
 *   p2, p1 receive the converted pair
 *
 * Returns 1 when there is no usable entry: pp or the row is null, an index
 * is out of range, or the table value is 0.
 *
 * NOTE(review): the index adjustments of c1/c0, the row fetch, the body of
 * the disable_cp932ext_f test, the final stores and the successful return
 * are not visible in this excerpt.
 */
3038 w_iconv_common(c1, c0, pp, psize, p2, p1)
3040 const unsigned short *const *pp;
3045 const unsigned short *p;
3048 if (pp == 0) return 1;
3051 if (c1 < 0 || psize <= c1) return 1;
3053 if (p == 0) return 1;
3056 if (c0 < 0 || sizeof_utf8_to_euc_E5B8 <= c0) return 1;
3058 if (val == 0) return 1;
3059 if (disable_cp932ext_f && (
3060 (val>>8) == 0x2D || /* disable NEC special characters */
3061 val > 0xF300 /* disable NEC special characters */
/* a high byte equal to SO is reported as X0201 */
3069 if (c2 == SO) c2 = X0201;
3078 #ifdef UTF8_OUTPUT_ENABLE
/*
 * nkf_each_char_to_hex -- pass the upper-case hexadecimal digits of c, one
 * nibble at a time, to the output function f as (0, digit).
 *
 * NOTE(review): the loop that drives `shift` (and any suppression of
 * leading zeros) is not visible in this excerpt.
 */
3080 nkf_each_char_to_hex(f, c)
3081 void (*f)PROTO((int c2,int c1));
3084 const char *hex = "0123456789ABCDEF";
3090 (*f)(0, hex[(c>>shift)&0xF]);
/*
 * encode_fallback_html -- fallback output for a character c that cannot be
 * encoded: the decimal digits of c are sent through oconv as ASCII
 * (0x30 + digit), from the millions digit down to the units digit.
 *
 * NOTE(review): the surrounding prefix/suffix output and the conditions
 * that skip leading digits are not visible in this excerpt; presumably the
 * result is an HTML numeric character reference -- confirm.
 */
3101 encode_fallback_html(c)
3108 (*oconv)(0, 0x30+(c/1000000)%10);
3110 (*oconv)(0, 0x30+(c/100000 )%10);
3112 (*oconv)(0, 0x30+(c/10000 )%10);
3114 (*oconv)(0, 0x30+(c/1000 )%10);
3116 (*oconv)(0, 0x30+(c/100 )%10);
3118 (*oconv)(0, 0x30+(c/10 )%10);
3120 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml -- fallback output for a character c that cannot be
 * encoded: the hexadecimal digits of c are sent through oconv.
 *
 * NOTE(review): the prefix/suffix output around the digits is not visible
 * in this excerpt.
 */
3126 encode_fallback_xml(c)
3132 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java -- fallback output for a character c that cannot be
 * encoded: hexadecimal digits of c are sent through oconv.
 *
 * The low 24 bits of c decide the form: when they exceed 0xFFFF the nibbles
 * at bits 20 and 16 are emitted as well; the nibbles at bits 12, 8, 4 and 0
 * are emitted below.
 *
 * NOTE(review): the escape prefix characters and the else branch are not
 * visible in this excerpt.
 */
3138 encode_fallback_java(c)
3141 const char *hex = "0123456789ABCDEF";
3143 if((c&0x00FFFFFF) > 0xFFFF){
3147 (*oconv)(0, hex[(c>>20)&0xF]);
3148 (*oconv)(0, hex[(c>>16)&0xF]);
3152 (*oconv)(0, hex[(c>>12)&0xF]);
3153 (*oconv)(0, hex[(c>> 8)&0xF]);
3154 (*oconv)(0, hex[(c>> 4)&0xF]);
3155 (*oconv)(0, hex[ c &0xF]);
/*
 * encode_fallback_perl -- fallback output for a character c that cannot be
 * encoded: the hexadecimal digits of c are sent through oconv.
 *
 * NOTE(review): the prefix/suffix output around the digits is not visible
 * in this excerpt.
 */
3160 encode_fallback_perl(c)
3166 nkf_each_char_to_hex(oconv, c);
3172 encode_fallback_subchar(c)
3175 c = unicode_subchar;
3176 (*oconv)((c>>8)&0xFF, c&0xFF);
3182 (*oconv)(0, (c>>shift)&0xFF);
3196 extern const unsigned short euc_to_utf8_1byte[];
3197 extern const unsigned short *const euc_to_utf8_2bytes[];
3198 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3199 const unsigned short *p;
3202 p = euc_to_utf8_1byte;
3204 } else if (c2 >> 8 == 0x8f){
3205 if(!ms_ucs_map_f && c2 == 0x8F22 && c1 == 0x43){
3208 extern const unsigned short *const x0212_to_utf8_2bytes[];
3209 c2 = (c2&0x7f) - 0x21;
3210 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3211 p = x0212_to_utf8_2bytes[c2];
3217 c2 = (c2&0x7f) - 0x21;
3218 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3219 p = ms_ucs_map_f ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3224 c1 = (c1 & 0x7f) - 0x21;
3225 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3242 if (unicode_bom_f==2) {
3249 #ifdef NUMCHAR_OPTION
3250 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3251 w16w_conv(c1, &c2, &c1, &c0);
3255 if (c0) (*o_putc)(c0);
3262 output_mode = ASCII;
3264 } else if (c2 == ISO8859_1) {
3265 output_mode = ISO8859_1;
3266 (*o_putc)(c1 | 0x080);
3269 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16))
3270 val = ((c2<<8)&0xff00) + c1;
3271 else val = e2w_conv(c2, c1);
3273 w16w_conv(val, &c2, &c1, &c0);
3277 if (c0) (*o_putc)(c0);
3293 if (unicode_bom_f==2) {
3295 (*o_putc)((unsigned char)'\377');
3299 (*o_putc)((unsigned char)'\377');
3304 if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16)){
3305 } else if (c2 == ISO8859_1) {
3308 #ifdef NUMCHAR_OPTION
3309 } else if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16) {
3310 c2 = (c1 >> 8) & 0xff;
3314 unsigned short val = e2w_conv(c2, c1);
3315 c2 = (val >> 8) & 0xff;
3334 #ifdef NUMCHAR_OPTION
3335 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3336 w16e_conv(c1, &c2, &c1);
3337 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3338 if(encode_fallback)(*encode_fallback)(c1);
3346 } else if (c2 == 0) {
3347 output_mode = ASCII;
3349 } else if (c2 == X0201) {
3350 output_mode = JAPANESE_EUC;
3351 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3352 } else if (c2 == ISO8859_1) {
3353 output_mode = ISO8859_1;
3354 (*o_putc)(c1 | 0x080);
3356 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3357 output_mode = JAPANESE_EUC;
3358 #ifdef SHIFTJIS_CP932
3361 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3362 s2e_conv(s2, s1, &c2, &c1);
3367 output_mode = ASCII;
3369 }else if ((c2 & 0xff00) >> 8 == 0x8f){
3372 (*o_putc)((c2 & 0x7f) | 0x080);
3373 (*o_putc)(c1 | 0x080);
3376 (*o_putc)((c2 & 0x7f) | 0x080);
3377 (*o_putc)(c1 | 0x080);
3381 if ((c1<0x21 || 0x7e<c1) ||
3382 (c2<0x21 || 0x7e<c2)) {
3383 set_iconv(FALSE, 0);
3384 return; /* too late to rescue this char */
3386 output_mode = JAPANESE_EUC;
3387 (*o_putc)(c2 | 0x080);
3388 (*o_putc)(c1 | 0x080);
3398 if ((ret & 0xff00) == 0x8f00){
3399 if (0x75 <= c && c <= 0x7f){
3400 ret = c + (0x109 - 0x75);
3403 if (0x75 <= c && c <= 0x7f){
3404 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift -- remap a row value c:
 *
 *   0x7f..0x88 -> 0x75..0x7e
 *   0x89..0x92 -> 0x75..0x7e with 0x80 set and 0x8f placed in the high byte
 *
 * NOTE(review): the declaration/initial value of ret and the return
 * statement are not visible in this excerpt.
 */
3411 int x0212_unshift(c)
3415 if (0x7f <= c && c <= 0x88){
3416 ret = c + (0x75 - 0x7f);
3417 }else if (0x89 <= c && c <= 0x92){
3418 ret = (0x8f << 8) | 0x80 | (c + (0x75 - 0x89));
3422 #endif /* X0212_ENABLE */
/*
 * e2s_conv -- convert the pair (c2, c1) and store the result through
 * p2 / p1 (each store is skipped when the pointer is null).  Judging by the
 * name this is the EUC -> Shift_JIS direction -- confirm.
 *
 * A c2 whose high byte is 0x8f is first looked up in x0212_shiftjis.
 *
 * NOTE(review): the computation of ndx, the handling of val, the return
 * statements and some preprocessor lines are not visible in this excerpt.
 */
3425 e2s_conv(c2, c1, p2, p1)
3426 int c2, c1, *p2, *p1;
3430 const unsigned short *ptr;
3432 extern const unsigned short *const x0212_shiftjis[];
3433 if ((c2 & 0xff00) == 0x8f00){
/* row index must be 0x21..0x7e; the table is indexed from 0 */
3435 if (0x21 <= ndx && ndx <= 0x7e){
3436 ptr = x0212_shiftjis[ndx - 0x21];
3438 val = ptr[(c1 & 0x7f) - 0x21];
3448 c2 = x0212_shift(c2);
3450 #endif /* X0212_ENABLE */
3451 if ((c2 & 0xff00) == 0x8f00){
/* arithmetic conversion: two rows share one lead byte; odd and even rows use different trail-byte offsets */
3454 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3455 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3464 #ifdef NUMCHAR_OPTION
3465 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3466 w16e_conv(c1, &c2, &c1);
3467 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3468 if(encode_fallback)(*encode_fallback)(c1);
3476 } else if (c2 == 0) {
3477 output_mode = ASCII;
3479 } else if (c2 == X0201) {
3480 output_mode = SHIFT_JIS;
3482 } else if (c2 == ISO8859_1) {
3483 output_mode = ISO8859_1;
3484 (*o_putc)(c1 | 0x080);
3486 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3487 output_mode = SHIFT_JIS;
3488 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3494 if ((c1<0x20 || 0x7e<c1) ||
3495 (c2<0x20 || 0x7e<c2)) {
3496 set_iconv(FALSE, 0);
3497 return; /* too late to rescue this char */
3499 output_mode = SHIFT_JIS;
3500 e2s_conv(c2, c1, &c2, &c1);
3502 #ifdef SHIFTJIS_CP932
3504 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3505 extern const unsigned short cp932inv[2][189];
3506 int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3512 #endif /* SHIFTJIS_CP932 */
3515 if (prefix_table[(unsigned char)c1]){
3516 (*o_putc)(prefix_table[(unsigned char)c1]);
3527 #ifdef NUMCHAR_OPTION
3528 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3529 w16e_conv(c1, &c2, &c1);
3530 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3531 if(encode_fallback)(*encode_fallback)(c1);
3537 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3540 (*o_putc)(ascii_intro);
3541 output_mode = ASCII;
3545 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3546 if (output_mode!=X0212) {
3547 output_mode = X0212;
3553 (*o_putc)(c2 & 0x7f);
3556 } else if (c2==X0201) {
3557 if (output_mode!=X0201) {
3558 output_mode = X0201;
3564 } else if (c2==ISO8859_1) {
3565 /* iso8859 introduction, or 8th bit on */
3566 /* Can we convert in 7bit form using ESC-'-'-A ?
3568 output_mode = ISO8859_1;
3570 } else if (c2 == 0) {
3571 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3574 (*o_putc)(ascii_intro);
3575 output_mode = ASCII;
3579 if (output_mode != X0208) {
3580 output_mode = X0208;
3583 (*o_putc)(kanji_intro);
3585 if (c1<0x20 || 0x7e<c1)
3587 if (c2<0x20 || 0x7e<c2)
3599 mime_prechar(c2, c1);
3600 (*o_base64conv)(c2,c1);
3604 STATIC int broken_buf[3];
3605 STATIC int broken_counter = 0;
3606 STATIC int broken_last = 0;
3613 if (broken_counter>0) {
3614 return broken_buf[--broken_counter];
3617 if (c=='$' && broken_last != ESC
3618 && (input_mode==ASCII || input_mode==X0201)) {
3621 if (c1=='@'|| c1=='B') {
3622 broken_buf[0]=c1; broken_buf[1]=c;
3629 } else if (c=='(' && broken_last != ESC
3630 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
3633 if (c1=='J'|| c1=='B') {
3634 broken_buf[0]=c1; broken_buf[1]=c;
3652 if (broken_counter<2)
3653 broken_buf[broken_counter++]=c;
3657 STATIC int prev_cr = 0;
3665 if (! (c2==0&&c1==NL) ) {
3671 } else if (c1=='\r') {
3673 } else if (c1=='\n') {
3674 if (crmode_f==CRLF) {
3675 (*o_crconv)(0,'\r');
3676 } else if (crmode_f==CR) {
3677 (*o_crconv)(0,'\r');
3681 } else if (c1!='\032' || crmode_f!=NL){
3687 Return value of fold_conv()
3689 \n add newline and output char
3690 \r add newline and output nothing
3693 1 (or else) normal output
3695 fold state in prev (previous character)
3697 >0x80 Japanese (X0208/X0201)
3702 This fold algorithm does not preserve leading space in a line.
3703 This is the main difference from fmt.
3706 #define char_size(c2,c1) (c2?2:1)
3715 if (c1== '\r' && !fold_preserve_f) {
3716 fold_state=0; /* ignore cr */
3717 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
3719 fold_state=0; /* ignore cr */
3720 } else if (c1== BS) {
3721 if (f_line>0) f_line--;
3723 } else if (c2==EOF && f_line != 0) { /* close open last line */
3725 } else if ((c1=='\n' && !fold_preserve_f)
3726 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
3727 && fold_preserve_f)) {
3729 if (fold_preserve_f) {