1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 ** 連絡先： （株）富士通研究所　ソフト３研　市川　至
5 ** （E-Mail Address: ichikawa@flab.fujitsu.co.jp）
6 ** Copyright (C) 1996,1998
8 ** 連絡先： 琉球大学情報工学科 河野 真治 mime/X0208 support
9 ** （E-Mail Address: kono@ie.u-ryukyu.ac.jp）
10 ** 連絡先： COW for DOS & Win16 & Win32 & OS/2
11 ** （E-Mail Address: GHG00637@niftyserve.or.p）
13 ** このソースのいかなる複写，改変，修正も許諾します。ただし、
14 ** その際には、誰が貢献したを示すこの部分を残すこと。
15 ** 再配布や雑誌の付録などの問い合わせも必要ありません。
16 ** 営利利用も上記に反しない範囲で許可します。
17 ** バイナリの配布の際には version message を保存することを条件とします。
18 ** このプログラムについては特に何の保証もしない、悪しからず。
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8 サポートについて
31 ** 従来の nkf と入れかえてそのまま使えるようになっています
32 ** nkf -e などとして起動すると、自動判別で UTF-8 と判定されれば、
33 ** そのまま euc-jp に変換されます
35 ** まだバグがある可能性が高いです。
36 ** (特に自動判別、コード混在、エラー処理系)
38 ** 何か問題を見つけたら、
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 ** まで御連絡をお願いします。
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.96 2006/04/01 16:29:01 naruse Exp $ */
43 #define NKF_VERSION "2.0.6"
44 #define NKF_RELEASE_DATE "2006-03-26"
48 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
49 " 2002-2006 Kono, Furukawa, Naruse, mastodon"
56 ** USAGE: nkf [flags] [file]
59 ** b Output is buffered (DEFAULT)
60 ** u Output is unbuffered
64 ** j Output code is JIS 7 bit (DEFAULT SELECT)
65 ** s Output code is MS Kanji (DEFAULT SELECT)
66 ** e Output code is AT&T JIS (DEFAULT SELECT)
67 ** w Output code is UTF-8 (DEFAULT SELECT)
68 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
70 ** m MIME conversion for ISO-2022-JP
71 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
72 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
73 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
74 ** M MIME output conversion
76 ** r {de/en}crypt ROT13/47
80 ** T Text mode output (for MS-DOS)
82 ** x Do not convert X0201 kana into X0208
83 ** Z Convert X0208 alphabet to ASCII
88 ** B try to fix broken JIS, missing Escape
89 ** B[1-9] broken level
91 ** O Output to 'nkf.out' file or last file name
92 ** d Delete \r in line feed
93 ** c Add \r in line feed
94 ** -- other long option
95 ** -- ignore following option (don't use with -O )
99 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__)) && !defined(MSDOS)
101 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
117 #if defined(MSDOS) || defined(__OS2__)
124 #define setbinmode(fp) fsetbin(fp)
125 #else /* Microsoft C, Turbo C */
126 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
128 #else /* UNIX,OS/2 */
129 #define setbinmode(fp)
132 #ifdef _IOFBF /* SysV and MSDOS, Windows */
133 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
135 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
138 /*Borland C++ 4.5 EasyWin*/
139 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
148 /* added by satoru@isoternet.org */
149 #include <sys/stat.h>
150 #ifndef MSDOS /* UNIX, OS/2 */
153 #else /* defined(MSDOS) */
155 #ifdef __BORLANDC__ /* BCC32 */
157 #else /* !defined(__BORLANDC__) */
158 #include <sys/utime.h>
159 #endif /* (__BORLANDC__) */
160 #else /* !defined(__WIN32__) */
161 #if defined(_MSC_VER) || defined(__MINGW32__) /* VC++, MinGW */
162 #include <sys/utime.h>
163 #elif defined(__TURBOC__) /* BCC */
165 #elif defined(LSI_C) /* LSI C */
166 #endif /* (__WIN32__) */
174 /* state of output_mode and input_mode
191 #define X0213_1 0x284F
192 #define X0213_2 0x2850
194 /* Input Assumption */
198 #define LATIN1_INPUT 6
200 #define STRICT_MIME 8
205 #define JAPANESE_EUC 10
209 #define UTF8_INPUT 13
210 #define UTF16BE_INPUT 14
211 #define UTF16LE_INPUT 15
231 #define is_alnum(c) \
232 (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9'))
234 /* I don't trust portability of toupper */
235 #define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c)
236 #define nkf_isoctal(c) ('0'<=c && c<='7')
237 #define nkf_isdigit(c) ('0'<=c && c<='9')
238 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=c && c<='f') || ('A'<=c && c <= 'F'))
239 #define nkf_isblank(c) (c == SPACE || c == TAB)
240 #define nkf_isspace(c) (nkf_isblank(c) || c == CR || c == NL)
241 #define nkf_isalpha(c) (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
242 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
243 #define hex2bin(x) ( nkf_isdigit(x) ? x - '0' : nkf_toupper(x) - 'A' + 10)
245 #define HOLD_SIZE 1024
246 #define IOBUF_SIZE 16384
248 #define DEFAULT_J 'B'
249 #define DEFAULT_R 'B'
251 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
252 #define SJ6394 0x0161 /* 63 - 94 ku offset */
254 #define RANGE_NUM_MAX 18
259 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
260 #define sizeof_euc_to_utf8_1byte 94
261 #define sizeof_euc_to_utf8_2bytes 94
262 #define sizeof_utf8_to_euc_C2 64
263 #define sizeof_utf8_to_euc_E5B8 64
264 #define sizeof_utf8_to_euc_2bytes 112
265 #define sizeof_utf8_to_euc_3bytes 16
268 /* MIME preprocessor */
270 #ifdef EASYWIN /*Easy Win */
271 extern POINT _BufferSize;
280 void (*status_func)(struct input_code *, int);
281 int (*iconv_func)(int c2, int c1, int c0);
285 static char *input_codename = "";
288 static const char *CopyRight = COPY_RIGHT;
290 #if !defined(PERL_XS) && !defined(WIN32DLL)
291 static int noconvert(FILE *f);
293 static void module_connection(void);
294 static int kanji_convert(FILE *f);
295 static int h_conv(FILE *f,int c2,int c1);
296 static int push_hold_buf(int c2);
297 static void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0));
298 static int s_iconv(int c2,int c1,int c0);
299 static int s2e_conv(int c2, int c1, int *p2, int *p1);
300 static int e_iconv(int c2,int c1,int c0);
301 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
303 * 0: Shift_JIS, eucJP-ascii
307 #define UCS_MAP_ASCII 0
309 #define UCS_MAP_CP932 2
310 static int ms_ucs_map_f = UCS_MAP_ASCII;
312 #ifdef UTF8_INPUT_ENABLE
313 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
314 static int no_cp932ext_f = FALSE;
315 /* ignore ZERO WIDTH NO-BREAK SPACE */
316 static int ignore_zwnbsp_f = TRUE;
317 static int no_best_fit_chars_f = FALSE;
318 static int unicode_subchar = '?'; /* the regular substitution character */
319 static void nkf_each_char_to_hex(void (*f)(int c2,int c1), int c);
320 static void encode_fallback_html(int c);
321 static void encode_fallback_xml(int c);
322 static void encode_fallback_java(int c);
323 static void encode_fallback_perl(int c);
324 static void encode_fallback_subchar(int c);
325 static void (*encode_fallback)(int c) = NULL;
326 static int w2e_conv(int c2,int c1,int c0,int *p2,int *p1);
327 static int w_iconv(int c2,int c1,int c0);
328 static int w_iconv16(int c2,int c1,int c0);
329 static int unicode_to_jis_common(int c2,int c1,int c0,int *p2,int *p1);
330 static int w_iconv_common(int c1,int c0,const unsigned short *const *pp,int psize,int *p2,int *p1);
331 static void w16w_conv(int val, int *p2, int *p1, int *p0);
332 static int ww16_conv(int c2, int c1, int c0);
333 static int w16e_conv(int val,int *p2,int *p1);
335 #ifdef UTF8_OUTPUT_ENABLE
336 static int unicode_bom_f= 0; /* Output Unicode BOM */
337 static int w_oconv16_LE = 0; /* utf-16 little endian */
338 static int e2w_conv(int c2,int c1);
339 static void w_oconv(int c2,int c1);
340 static void w_oconv16(int c2,int c1);
342 static void e_oconv(int c2,int c1);
343 static int e2s_conv(int c2, int c1, int *p2, int *p1);
344 static void s_oconv(int c2,int c1);
345 static void j_oconv(int c2,int c1);
346 static void fold_conv(int c2,int c1);
347 static void cr_conv(int c2,int c1);
348 static void z_conv(int c2,int c1);
349 static void rot_conv(int c2,int c1);
350 static void hira_conv(int c2,int c1);
351 static void base64_conv(int c2,int c1);
352 static void iso2022jp_check_conv(int c2,int c1);
353 static void no_connection(int c2,int c1);
354 static int no_connection2(int c2,int c1,int c0);
356 static void code_score(struct input_code *ptr);
357 static void code_status(int c);
359 static void std_putc(int c);
360 static int std_getc(FILE *f);
361 static int std_ungetc(int c,FILE *f);
363 static int broken_getc(FILE *f);
364 static int broken_ungetc(int c,FILE *f);
366 static int mime_begin(FILE *f);
367 static int mime_getc(FILE *f);
368 static int mime_ungetc(int c,FILE *f);
370 static void switch_mime_getc(void);
371 static void unswitch_mime_getc(void);
372 static int mime_begin_strict(FILE *f);
373 static int mime_getc_buf(FILE *f);
374 static int mime_ungetc_buf(int c,FILE *f);
375 static int mime_integrity(FILE *f,const unsigned char *p);
377 static int base64decode(int c);
378 static void mime_prechar(int c2, int c1);
379 static void mime_putc(int c);
380 static void open_mime(int c);
381 static void close_mime(void);
382 static void eof_mime(void);
383 static void mimeout_addchar(int c);
385 static void usage(void);
386 static void version(void);
388 static void options(unsigned char *c);
389 #if defined(PERL_XS) || defined(WIN32DLL)
390 static void reinit(void);
395 #if !defined(PERL_XS) && !defined(WIN32DLL)
396 static unsigned char stdibuf[IOBUF_SIZE];
397 static unsigned char stdobuf[IOBUF_SIZE];
399 static unsigned char hold_buf[HOLD_SIZE*2];
400 static int hold_count;
402 /* MIME preprocessor fifo */
404 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
405 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
406 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
407 static unsigned char mime_buf[MIME_BUF_SIZE];
408 static unsigned int mime_top = 0;
409 static unsigned int mime_last = 0; /* decoded */
410 static unsigned int mime_input = 0; /* undecoded */
411 static int (*mime_iconv_back)(int c2,int c1,int c0) = NULL;
414 static int unbuf_f = FALSE;
415 static int estab_f = FALSE;
416 static int nop_f = FALSE;
417 static int binmode_f = TRUE; /* binary mode */
418 static int rot_f = FALSE; /* rot13/47 mode */
419 static int hira_f = FALSE; /* hiragana/katakana conversion */
420 static int input_f = FALSE; /* non fixed input code */
421 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
422 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
423 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
424 static int mimebuf_f = FALSE; /* MIME buffered input */
425 static int broken_f = FALSE; /* convert ESC-less broken JIS */
426 static int iso8859_f = FALSE; /* ISO8859 through */
427 static int mimeout_f = FALSE; /* base64 mode */
428 #if defined(MSDOS) || defined(__OS2__)
429 static int x0201_f = TRUE; /* Assume JISX0201 kana */
431 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
433 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
435 #ifdef UNICODE_NORMALIZATION
436 static int nfc_f = FALSE;
437 static int (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
438 static int (*i_nfc_ungetc)(int c ,FILE *f) = std_ungetc;
439 static int nfc_getc(FILE *f);
440 static int nfc_ungetc(int c,FILE *f);
444 static int cap_f = FALSE;
445 static int (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
446 static int (*i_cungetc)(int c ,FILE *f) = std_ungetc;
447 static int cap_getc(FILE *f);
448 static int cap_ungetc(int c,FILE *f);
450 static int url_f = FALSE;
451 static int (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
452 static int (*i_uungetc)(int c ,FILE *f) = std_ungetc;
453 static int url_getc(FILE *f);
454 static int url_ungetc(int c,FILE *f);
457 #ifdef NUMCHAR_OPTION
458 #define CLASS_MASK 0x0f000000
459 #define CLASS_UTF16 0x01000000
460 static int numchar_f = FALSE;
461 static int (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
462 static int (*i_nungetc)(int c ,FILE *f) = std_ungetc;
463 static int numchar_getc(FILE *f);
464 static int numchar_ungetc(int c,FILE *f);
468 static int noout_f = FALSE;
469 static void no_putc(int c);
470 static int debug_f = FALSE;
471 static void debug(const char *str);
472 static int (*iconv_for_check)(int c2,int c1,int c0) = 0;
475 static int guess_f = FALSE;
477 static void print_guessed_code(char *filename);
479 static void set_input_codename(char *codename);
480 static int is_inputcode_mixed = FALSE;
481 static int is_inputcode_set = FALSE;
484 static int exec_f = 0;
487 #ifdef SHIFTJIS_CP932
488 /* invert IBM extended characters to others */
489 static int cp51932_f = TRUE;
490 #define CP932_TABLE_BEGIN (0xfa)
491 #define CP932_TABLE_END (0xfc)
493 /* invert NEC-selected IBM extended characters to IBM extended characters */
494 static int cp932inv_f = TRUE;
495 #define CP932INV_TABLE_BEGIN (0xed)
496 #define CP932INV_TABLE_END (0xee)
498 /* static int cp932_conv(int c2, int c1); */
499 #endif /* SHIFTJIS_CP932 */
502 static int x0212_f = FALSE;
503 static int x0212_shift(int c);
504 static int x0212_unshift(int c);
506 static int x0213_f = FALSE;
508 static unsigned char prefix_table[256];
510 static void set_code_score(struct input_code *ptr, int score);
511 static void clr_code_score(struct input_code *ptr, int score);
512 static void status_disable(struct input_code *ptr);
513 static void status_push_ch(struct input_code *ptr, int c);
514 static void status_clear(struct input_code *ptr);
515 static void status_reset(struct input_code *ptr);
516 static void status_reinit(struct input_code *ptr);
517 static void status_check(struct input_code *ptr, int c);
518 static void e_status(struct input_code *, int);
519 static void s_status(struct input_code *, int);
521 #ifdef UTF8_INPUT_ENABLE
522 static void w_status(struct input_code *, int);
523 static void w16_status(struct input_code *, int);
524 static int utf16_mode = UTF16BE_INPUT;
527 struct input_code input_code_list[] = {
528 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
529 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
530 #ifdef UTF8_INPUT_ENABLE
531 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
532 {"UTF-16", 0, 0, 0, {0, 0, 0}, w16_status, w_iconv16, 0},
537 static int mimeout_mode = 0;
538 static int base64_count = 0;
540 /* X0208 -> ASCII converter */
543 static int f_line = 0; /* chars in line */
544 static int f_prev = 0;
545 static int fold_preserve_f = FALSE; /* preserve new lines */
546 static int fold_f = FALSE;
547 static int fold_len = 0;
550 static unsigned char kanji_intro = DEFAULT_J;
551 static unsigned char ascii_intro = DEFAULT_R;
555 #define FOLD_MARGIN 10
556 #define DEFAULT_FOLD 60
558 static int fold_margin = FOLD_MARGIN;
562 #ifdef DEFAULT_CODE_JIS
563 # define DEFAULT_CONV j_oconv
565 #ifdef DEFAULT_CODE_SJIS
566 # define DEFAULT_CONV s_oconv
568 #ifdef DEFAULT_CODE_EUC
569 # define DEFAULT_CONV e_oconv
571 #ifdef DEFAULT_CODE_UTF8
572 # define DEFAULT_CONV w_oconv
575 /* process default */
576 static void (*output_conv)(int c2,int c1) = DEFAULT_CONV;
578 static void (*oconv)(int c2,int c1) = no_connection;
579 /* s_iconv or oconv */
580 static int (*iconv)(int c2,int c1,int c0) = no_connection2;
582 static void (*o_zconv)(int c2,int c1) = no_connection;
583 static void (*o_fconv)(int c2,int c1) = no_connection;
584 static void (*o_crconv)(int c2,int c1) = no_connection;
585 static void (*o_rot_conv)(int c2,int c1) = no_connection;
586 static void (*o_hira_conv)(int c2,int c1) = no_connection;
587 static void (*o_base64conv)(int c2,int c1) = no_connection;
588 static void (*o_iso2022jp_check_conv)(int c2,int c1) = no_connection;
590 /* static redirections */
592 static void (*o_putc)(int c) = std_putc;
594 static int (*i_getc)(FILE *f) = std_getc; /* general input */
595 static int (*i_ungetc)(int c,FILE *f) =std_ungetc;
597 static int (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
598 static int (*i_bungetc)(int c ,FILE *f) = std_ungetc;
600 static void (*o_mputc)(int c) = std_putc ; /* output of mputc */
602 static int (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
603 static int (*i_mungetc)(int c ,FILE *f) = std_ungetc;
605 /* for strict mime */
606 static int (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
607 static int (*i_mungetc_buf)(int c,FILE *f) = std_ungetc;
610 static int output_mode = ASCII, /* output kanji mode */
611 input_mode = ASCII, /* input kanji mode */
612 shift_mode = FALSE; /* TRUE shift out, or X0201 */
613 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
615 /* X0201 / X0208 conversion tables */
617 /* X0201 kana conversion table */
620 unsigned char cv[]= {
621 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
622 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
623 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
624 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
625 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
626 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
627 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
628 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
629 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
630 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
631 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
632 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
633 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
634 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
635 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
636 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
640 /* X0201 kana conversion table for dakuten */
643 unsigned char dv[]= {
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
646 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
648 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
649 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
650 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
651 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
652 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
653 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
655 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
656 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
657 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
658 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 /* X0201 kana conversion table for han-dakuten */
665 unsigned char ev[]= {
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
677 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
678 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
679 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
681 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
685 /* X0208 kigou conversion table */
686 /* 0x8140 - 0x819e */
688 unsigned char fv[] = {
690 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
691 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
692 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
694 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
695 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
696 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
697 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
698 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
707 static int file_out_f = FALSE;
709 static int overwrite_f = FALSE;
710 static int preserve_time_f = FALSE;
711 static int backup_f = FALSE;
712 static char *backup_suffix = "";
713 static char *get_backup_filename(const char *suffix, const char *filename);
716 static int crmode_f = 0; /* CR, NL, CRLF */
717 #ifdef EASYWIN /*Easy Win */
718 static int end_check;
721 #define STD_GC_BUFSIZE (256)
722 int std_gc_buf[STD_GC_BUFSIZE];
726 #include "nkf32dll.c"
727 #elif defined(PERL_XS)
729 int main(int argc, char **argv)
734 char *outfname = NULL;
737 #ifdef EASYWIN /*Easy Win */
738 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
741 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
742 cp = (unsigned char *)*argv;
747 if (pipe(fds) < 0 || (pid = fork()) < 0){
758 execvp(argv[1], &argv[1]);
772 if(x0201_f == WISH_TRUE)
773 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
775 if (binmode_f == TRUE)
777 if (freopen("","wb",stdout) == NULL)
784 setbuf(stdout, (char *) NULL);
786 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
789 if (binmode_f == TRUE)
791 if (freopen("","rb",stdin) == NULL) return (-1);
795 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
799 kanji_convert(stdin);
800 if (guess_f) print_guessed_code(NULL);
805 is_inputcode_mixed = FALSE;
806 is_inputcode_set = FALSE;
811 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
820 /* reopen file for stdout */
821 if (file_out_f == TRUE) {
824 outfname = malloc(strlen(origfname)
825 + strlen(".nkftmpXXXXXX")
831 strcpy(outfname, origfname);
835 for (i = strlen(outfname); i; --i){
836 if (outfname[i - 1] == '/'
837 || outfname[i - 1] == '\\'){
843 strcat(outfname, "ntXXXXXX");
845 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
848 strcat(outfname, ".nkftmpXXXXXX");
849 fd = mkstemp(outfname);
852 || (fd_backup = dup(fileno(stdout))) < 0
853 || dup2(fd, fileno(stdout)) < 0
864 outfname = "nkf.out";
867 if(freopen(outfname, "w", stdout) == NULL) {
871 if (binmode_f == TRUE) {
873 if (freopen("","wb",stdout) == NULL)
880 if (binmode_f == TRUE)
882 if (freopen("","rb",fin) == NULL)
887 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
891 char *filename = NULL;
893 if (nfiles > 1) filename = origfname;
894 if (guess_f) print_guessed_code(filename);
900 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
908 if (dup2(fd_backup, fileno(stdout)) < 0){
911 if (stat(origfname, &sb)) {
912 fprintf(stderr, "Can't stat %s\n", origfname);
914 /* restore the permissions */
915 if (chmod(outfname, sb.st_mode)) {
916 fprintf(stderr, "Can't set permission %s\n", outfname);
919 /* restore the timestamps */
921 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
922 tb[0] = tb[1] = sb.st_mtime;
923 if (utime(outfname, tb)) {
924 fprintf(stderr, "Can't set timestamp %s\n", outfname);
927 tb.actime = sb.st_atime;
928 tb.modtime = sb.st_mtime;
929 if (utime(outfname, &tb)) {
930 fprintf(stderr, "Can't set timestamp %s\n", outfname);
935 char *backup_filename = get_backup_filename(backup_suffix, origfname);
937 unlink(backup_filename);
939 if (rename(origfname, backup_filename)) {
940 perror(backup_filename);
941 fprintf(stderr, "Can't rename %s to %s\n",
942 origfname, backup_filename);
946 if (unlink(origfname)){
951 if (rename(outfname, origfname)) {
953 fprintf(stderr, "Can't rename %s to %s\n",
954 outfname, origfname);
962 #ifdef EASYWIN /*Easy Win */
963 if (file_out_f == FALSE)
964 scanf("%d",&end_check);
967 #else /* for Other OS */
968 if (file_out_f == TRUE)
973 #endif /* WIN32DLL */
/*
 * get_backup_filename - build the backup file name for `filename`.
 *
 * When `suffix` contains one or more '*' characters, every '*' is replaced
 * by `filename` and all other characters of `suffix` are copied verbatim
 * (suffix "bak-*" with filename "a.txt" yields "bak-a.txt").
 * When `suffix` contains no '*', it is appended to `filename`
 * (suffix ".bak" with filename "a.txt" yields "a.txt.bak").
 *
 * Returns a newly malloc()ed, NUL-terminated string owned by the caller,
 * or NULL after reporting through perror() when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t result_length;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') {
            asterisk_count++;
        }
    }

    if (asterisk_count) {
        /* each '*' disappears and is replaced by one copy of filename */
        result_length = suffix_length - asterisk_count
            + asterisk_count * filename_length;
    } else {
        result_length = filename_length + suffix_length;
    }

    /* +1 for the terminating NUL; the size must cover the whole result */
    backup_filename = malloc(result_length + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
    } else {
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
    }
    backup_filename[result_length] = '\0';

    return backup_filename;
}
1041 {"katakana-hiragana","h3"},
1048 #ifdef UTF8_OUTPUT_ENABLE
1058 {"fb-subchar=", ""},
1060 #ifdef UTF8_INPUT_ENABLE
1061 {"utf8-input", "W"},
1062 {"utf16-input", "W16"},
1063 {"no-cp932ext", ""},
1064 {"no-best-fit-chars",""},
1066 #ifdef UNICODE_NORMALIZATION
1067 {"utf8mac-input", ""},
1079 #ifdef NUMCHAR_OPTION
1080 {"numchar-input", ""},
1086 #ifdef SHIFTJIS_CP932
1096 static int option_mode = 0;
1098 void options(unsigned char *cp)
1102 unsigned char *cp_back = NULL;
1107 while(*cp && *cp++!='-');
1108 while (*cp || cp_back) {
1116 case '-': /* literal options */
1117 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1121 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1122 p = (unsigned char *)long_option[i].name;
1123 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1124 if (*p == cp[j] || cp[j] == ' '){
1131 while(*cp && *cp != SPACE && cp++);
1132 if (long_option[i].alias[0]){
1134 cp = (unsigned char *)long_option[i].alias;
1136 if (strcmp(long_option[i].name, "ic=") == 0){
1137 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1138 codeset[i] = nkf_toupper(p[i]);
1141 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1142 strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1143 strcmp(codeset, "CP50220") == 0 ||
1144 strcmp(codeset, "CP50221") == 0 ||
1145 strcmp(codeset, "CP50222") == 0 ||
1146 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1147 input_f = JIS_INPUT;
1148 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1149 input_f = JIS_INPUT;
1153 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1154 input_f = JIS_INPUT;
1159 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1160 input_f = SJIS_INPUT;
1161 if (x0201_f==NO_X0201) x0201_f=TRUE;
1162 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1163 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1164 strcmp(codeset, "CP932") == 0 ||
1165 strcmp(codeset, "MS932") == 0){
1166 input_f = SJIS_INPUT;
1168 #ifdef SHIFTJIS_CP932
1171 #ifdef UTF8_OUTPUT_ENABLE
1172 ms_ucs_map_f = UCS_MAP_CP932;
1174 }else if(strcmp(codeset, "EUCJP") == 0 ||
1175 strcmp(codeset, "EUC-JP") == 0){
1176 input_f = JIS_INPUT;
1177 }else if(strcmp(codeset, "CP51932") == 0){
1178 input_f = JIS_INPUT;
1180 #ifdef SHIFTJIS_CP932
1183 #ifdef UTF8_OUTPUT_ENABLE
1184 ms_ucs_map_f = UCS_MAP_CP932;
1186 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1187 strcmp(codeset, "EUCJP-MS") == 0 ||
1188 strcmp(codeset, "EUCJPMS") == 0){
1189 input_f = JIS_INPUT;
1191 #ifdef SHIFTJIS_CP932
1194 #ifdef UTF8_OUTPUT_ENABLE
1195 ms_ucs_map_f = UCS_MAP_MS;
1197 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1198 strcmp(codeset, "EUCJP-ASCII") == 0){
1199 input_f = JIS_INPUT;
1201 #ifdef SHIFTJIS_CP932
1204 #ifdef UTF8_OUTPUT_ENABLE
1205 ms_ucs_map_f = UCS_MAP_ASCII;
1207 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1208 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1209 input_f = SJIS_INPUT;
1211 #ifdef SHIFTJIS_CP932
1215 if (x0201_f==NO_X0201) x0201_f=TRUE;
1216 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1217 strcmp(codeset, "EUC-JIS-2004") == 0){
1218 input_f = JIS_INPUT;
1221 #ifdef SHIFTJIS_CP932
1225 #ifdef UTF8_INPUT_ENABLE
1226 }else if(strcmp(codeset, "UTF-8") == 0 ||
1227 strcmp(codeset, "UTF-8N") == 0 ||
1228 strcmp(codeset, "UTF-8-BOM") == 0){
1229 input_f = UTF8_INPUT;
1230 #ifdef UNICODE_NORMALIZATION
1231 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1232 strcmp(codeset, "UTF-8-MAC") == 0){
1233 input_f = UTF8_INPUT;
1236 }else if(strcmp(codeset, "UTF-16") == 0){
1237 input_f = UTF16BE_INPUT;
1238 utf16_mode = UTF16BE_INPUT;
1239 }else if(strcmp(codeset, "UTF-16BE") == 0 ||
1240 strcmp(codeset, "UTF-16BE-BOM") == 0){
1241 input_f = UTF16BE_INPUT;
1242 utf16_mode = UTF16BE_INPUT;
1243 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1244 strcmp(codeset, "UTF-16LE-BOM") == 0){
1245 input_f = UTF16LE_INPUT;
1246 utf16_mode = UTF16LE_INPUT;
1251 if (strcmp(long_option[i].name, "oc=") == 0){
1252 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1253 codeset[i] = nkf_toupper(p[i]);
1256 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1257 strcmp(codeset, "CP50220") == 0){
1258 output_conv = j_oconv;
1259 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1260 output_conv = j_oconv;
1261 no_cp932ext_f = TRUE;
1262 }else if(strcmp(codeset, "CP50221") == 0 ||
1263 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1264 output_conv = j_oconv;
1266 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1267 output_conv = j_oconv;
1271 #ifdef SHIFTJIS_CP932
1274 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1275 output_conv = j_oconv;
1280 #ifdef SHIFTJIS_CP932
1283 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1284 output_conv = j_oconv;
1289 #ifdef SHIFTJIS_CP932
1292 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1293 output_conv = s_oconv;
1294 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1295 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1296 strcmp(codeset, "CP932") == 0 ||
1297 strcmp(codeset, "MS932") == 0){
1298 output_conv = s_oconv;
1300 #ifdef SHIFTJIS_CP932
1304 #ifdef UTF8_OUTPUT_ENABLE
1305 ms_ucs_map_f = UCS_MAP_CP932;
1307 }else if(strcmp(codeset, "EUCJP") == 0 ||
1308 strcmp(codeset, "EUC-JP") == 0){
1309 output_conv = e_oconv;
1310 }else if(strcmp(codeset, "CP51932") == 0){
1311 output_conv = e_oconv;
1313 #ifdef SHIFTJIS_CP932
1316 #ifdef UTF8_OUTPUT_ENABLE
1317 ms_ucs_map_f = UCS_MAP_CP932;
1319 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1320 strcmp(codeset, "EUCJP-MS") == 0 ||
1321 strcmp(codeset, "EUCJPMS") == 0){
1322 output_conv = e_oconv;
1327 #ifdef SHIFTJIS_CP932
1330 #ifdef UTF8_OUTPUT_ENABLE
1331 ms_ucs_map_f = UCS_MAP_MS;
1333 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1334 strcmp(codeset, "EUCJP-ASCII") == 0){
1335 output_conv = e_oconv;
1340 #ifdef SHIFTJIS_CP932
1343 #ifdef UTF8_OUTPUT_ENABLE
1344 ms_ucs_map_f = UCS_MAP_ASCII;
1346 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1347 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1348 output_conv = s_oconv;
1350 #ifdef SHIFTJIS_CP932
1353 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1354 strcmp(codeset, "EUC-JIS-2004") == 0){
1355 output_conv = e_oconv;
1360 #ifdef SHIFTJIS_CP932
1363 #ifdef UTF8_OUTPUT_ENABLE
1364 }else if(strcmp(codeset, "UTF-8") == 0){
1365 output_conv = w_oconv;
1366 }else if(strcmp(codeset, "UTF-8N") == 0){
1367 output_conv = w_oconv;
1369 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1370 output_conv = w_oconv;
1372 }else if(strcmp(codeset, "UTF-16BE") == 0){
1373 output_conv = w_oconv16;
1375 }else if(strcmp(codeset, "UTF-16") == 0 ||
1376 strcmp(codeset, "UTF-16BE-BOM") == 0){
1377 output_conv = w_oconv16;
1379 }else if(strcmp(codeset, "UTF-16LE") == 0){
1380 output_conv = w_oconv16;
1383 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1384 output_conv = w_oconv16;
1392 if (strcmp(long_option[i].name, "overwrite") == 0){
1395 preserve_time_f = TRUE;
1398 if (strcmp(long_option[i].name, "overwrite=") == 0){
1401 preserve_time_f = TRUE;
1403 backup_suffix = malloc(strlen((char *) p) + 1);
1404 strcpy(backup_suffix, (char *) p);
1407 if (strcmp(long_option[i].name, "in-place") == 0){
1410 preserve_time_f = FALSE;
1413 if (strcmp(long_option[i].name, "in-place=") == 0){
1416 preserve_time_f = FALSE;
1418 backup_suffix = malloc(strlen((char *) p) + 1);
1419 strcpy(backup_suffix, (char *) p);
1424 if (strcmp(long_option[i].name, "cap-input") == 0){
1428 if (strcmp(long_option[i].name, "url-input") == 0){
1433 #ifdef NUMCHAR_OPTION
1434 if (strcmp(long_option[i].name, "numchar-input") == 0){
1440 if (strcmp(long_option[i].name, "no-output") == 0){
1444 if (strcmp(long_option[i].name, "debug") == 0){
1449 if (strcmp(long_option[i].name, "cp932") == 0){
1450 #ifdef SHIFTJIS_CP932
1454 #ifdef UTF8_OUTPUT_ENABLE
1455 ms_ucs_map_f = UCS_MAP_CP932;
1459 if (strcmp(long_option[i].name, "no-cp932") == 0){
1460 #ifdef SHIFTJIS_CP932
1464 #ifdef UTF8_OUTPUT_ENABLE
1465 ms_ucs_map_f = UCS_MAP_ASCII;
1469 #ifdef SHIFTJIS_CP932
1470 if (strcmp(long_option[i].name, "cp932inv") == 0){
1477 if (strcmp(long_option[i].name, "x0212") == 0){
1484 if (strcmp(long_option[i].name, "exec-in") == 0){
1488 if (strcmp(long_option[i].name, "exec-out") == 0){
1493 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1494 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1495 no_cp932ext_f = TRUE;
1498 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1499 no_best_fit_chars_f = TRUE;
1502 if (strcmp(long_option[i].name, "fb-skip") == 0){
1503 encode_fallback = NULL;
1506 if (strcmp(long_option[i].name, "fb-html") == 0){
1507 encode_fallback = encode_fallback_html;
1510 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1511 encode_fallback = encode_fallback_xml;
1514 if (strcmp(long_option[i].name, "fb-java") == 0){
1515 encode_fallback = encode_fallback_java;
1518 if (strcmp(long_option[i].name, "fb-perl") == 0){
1519 encode_fallback = encode_fallback_perl;
1522 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1523 encode_fallback = encode_fallback_subchar;
1526 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1527 encode_fallback = encode_fallback_subchar;
1528 unicode_subchar = 0;
1530 /* decimal number */
1531 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1532 unicode_subchar *= 10;
1533 unicode_subchar += hex2bin(p[i]);
1535 }else if(p[1] == 'x' || p[1] == 'X'){
1536 /* hexadecimal number */
1537 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1538 unicode_subchar <<= 4;
1539 unicode_subchar |= hex2bin(p[i]);
1543 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1544 unicode_subchar *= 8;
1545 unicode_subchar += hex2bin(p[i]);
1548 w16e_conv(unicode_subchar, &i, &j);
1549 unicode_subchar = i<<8 | j;
1553 #ifdef UTF8_OUTPUT_ENABLE
1554 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1555 ms_ucs_map_f = UCS_MAP_MS;
1559 #ifdef UNICODE_NORMALIZATION
1560 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1561 input_f = UTF8_INPUT;
1566 if (strcmp(long_option[i].name, "prefix=") == 0){
1567 if (' ' < p[0] && p[0] < 128){
1568 for (i = 1; ' ' < p[i] && p[i] < 128; i++){
1569 prefix_table[p[i]] = p[0];
1576 case 'b': /* buffered mode */
1579 case 'u': /* non bufferd mode */
1582 case 't': /* transparent mode */
1585 case 'j': /* JIS output */
1587 output_conv = j_oconv;
1589 case 'e': /* AT&T EUC output */
1590 output_conv = e_oconv;
1592 case 's': /* SJIS output */
1593 output_conv = s_oconv;
1595 case 'l': /* ISO8859 Latin-1 support, no conversion */
1596 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1597 input_f = LATIN1_INPUT;
1599 case 'i': /* Kanji IN ESC-$-@/B */
1600 if (*cp=='@'||*cp=='B')
1601 kanji_intro = *cp++;
1603 case 'o': /* ASCII IN ESC-(-J/B */
1604 if (*cp=='J'||*cp=='B'||*cp=='H')
1605 ascii_intro = *cp++;
1609 bit:1 katakana->hiragana
1610 bit:2 hiragana->katakana
1612 if ('9'>= *cp && *cp>='0')
1613 hira_f |= (*cp++ -'0');
1620 #if defined(MSDOS) || defined(__OS2__)
1635 #ifdef UTF8_OUTPUT_ENABLE
1636 case 'w': /* UTF-8 output */
1637 if ('1'== cp[0] && '6'==cp[1]) {
1638 output_conv = w_oconv16; cp+=2;
1640 unicode_bom_f=2; cp++;
1643 unicode_bom_f=1; cp++;
1645 } else if (cp[0] == 'B') {
1646 unicode_bom_f=2; cp++;
1648 unicode_bom_f=1; cp++;
1651 } else if (cp[0] == '8') {
1652 output_conv = w_oconv; cp++;
1655 unicode_bom_f=1; cp++;
1658 output_conv = w_oconv;
1661 #ifdef UTF8_INPUT_ENABLE
1662 case 'W': /* UTF-8 input */
1663 if ('1'== cp[0] && '6'==cp[1]) {
1664 input_f = UTF16BE_INPUT;
1665 utf16_mode = UTF16BE_INPUT;
1669 input_f = UTF16LE_INPUT;
1670 utf16_mode = UTF16LE_INPUT;
1671 } else if (cp[0] == 'B') {
1673 input_f = UTF16BE_INPUT;
1674 utf16_mode = UTF16BE_INPUT;
1676 } else if (cp[0] == '8') {
1678 input_f = UTF8_INPUT;
1680 input_f = UTF8_INPUT;
1683 /* Input code assumption */
1684 case 'J': /* JIS input */
1685 case 'E': /* AT&T EUC input */
1686 input_f = JIS_INPUT;
1688 case 'S': /* MS Kanji input */
1689 input_f = SJIS_INPUT;
1690 if (x0201_f==NO_X0201) x0201_f=TRUE;
1692 case 'Z': /* Convert X0208 alphabet to asii */
1693 /* bit:0 Convert X0208
1694 bit:1 Convert Kankaku to one space
1695 bit:2 Convert Kankaku to two spaces
1696 bit:3 Convert HTML Entity
1698 if ('9'>= *cp && *cp>='0')
1699 alpha_f |= 1<<(*cp++ -'0');
1703 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1704 x0201_f = FALSE; /* No X0201->X0208 conversion */
1706 ESC-(-I in JIS, EUC, MS Kanji
1707 SI/SO in JIS, EUC, MS Kanji
1708 SSO in EUC, JIS, not in MS Kanji
1709 MS Kanji (0xa0-0xdf)
1711 ESC-(-I in JIS (0x20-0x5f)
1712 SSO in EUC (0xa0-0xdf)
1713 0xa0-0xd in MS Kanji (0xa0-0xdf)
1716 case 'X': /* Assume X0201 kana */
1717 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1720 case 'F': /* prserve new lines */
1721 fold_preserve_f = TRUE;
1722 case 'f': /* folding -f60 or -f */
1725 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1727 fold_len += *cp++ - '0';
1729 if (!(0<fold_len && fold_len<BUFSIZ))
1730 fold_len = DEFAULT_FOLD;
1734 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1736 fold_margin += *cp++ - '0';
1740 case 'm': /* MIME support */
1741 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1742 if (*cp=='B'||*cp=='Q') {
1743 mime_decode_mode = *cp++;
1744 mimebuf_f = FIXED_MIME;
1745 } else if (*cp=='N') {
1746 mime_f = TRUE; cp++;
1747 } else if (*cp=='S') {
1748 mime_f = STRICT_MIME; cp++;
1749 } else if (*cp=='0') {
1750 mime_decode_f = FALSE;
1751 mime_f = FALSE; cp++;
1754 case 'M': /* MIME output */
1757 mimeout_f = FIXED_MIME; cp++;
1758 } else if (*cp=='Q') {
1760 mimeout_f = FIXED_MIME; cp++;
1765 case 'B': /* Broken JIS support */
1767 bit:1 allow any x on ESC-(-x or ESC-$-x
1768 bit:2 reset to ascii on NL
1770 if ('9'>= *cp && *cp>='0')
1771 broken_f |= 1<<(*cp++ -'0');
1776 case 'O':/* for Output file */
1780 case 'c':/* add cr code */
1783 case 'd':/* delete cr code */
1786 case 'I': /* ISO-2022-JP output */
1789 case 'L': /* line mode */
1790 if (*cp=='u') { /* unix */
1791 crmode_f = NL; cp++;
1792 } else if (*cp=='m') { /* mac */
1793 crmode_f = CR; cp++;
1794 } else if (*cp=='w') { /* windows */
1795 crmode_f = CRLF; cp++;
1796 } else if (*cp=='0') { /* no conversion */
1806 /* module multiple options in a string are allowed for Perl module */
1807 while(*cp && *cp++!='-');
1810 /* bogus option but ignored */
/* Look up, starting from input_code_list, the entry whose iconv_func equals
 * the converter passed in.  The signature is written twice: as an ANSI
 * prototype (under ANSI_C_PROTOTYPE) and in K&R style. */
1816 #ifdef ANSI_C_PROTOTYPE
1817 struct input_code * find_inputcode_byfunc(int (*iconv_func)(int c2,int c1,int c0))
1819 struct input_code * find_inputcode_byfunc(iconv_func)
1820 int (*iconv_func)();
1824 struct input_code *p = input_code_list;
1826 if (iconv_func == p->iconv_func){
/* Manage the selected input converter.
 *   f          - mode flag; the value -TRUE means "force" (see below).
 *   iconv_func - the converter being proposed.
 * Once an encoding is established (estab_f) and the current iconv differs
 * from iconv_for_check, the matching input_code entry is looked up, its name
 * is recorded with set_input_codename() and reported through debug(), and
 * iconv_for_check is brought up to date. */
1835 void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
1837 #ifdef INPUT_CODE_FIX
1845 #ifdef INPUT_CODE_FIX
1846 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1852 if (estab_f && iconv_for_check != iconv){
1853 struct input_code *p = find_inputcode_byfunc(iconv);
1855 set_input_codename(p->name);
1856 debug(input_codename);
1858 iconv_for_check = iconv;
/* Penalty bits accumulated in input_code.score; each flag is the previous
 * one shifted left by one bit. */
1863 #define SCORE_L2 (1) /* JIS level-2 kanji */
1864 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1865 #define SCORE_DEPEND (SCORE_KANA << 1) /* machine-dependent characters */
1866 #ifdef SHIFTJIS_CP932
1867 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character re-read (remapped) via CP932 */
1868 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1870 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1872 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
1873 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1875 #define SCORE_INIT (SCORE_iMIME)
/* Score bits indexed by (c2 & 0x0f); code_score() consults this table when
 * (c2 & 0x70) == 0x20. */
1877 const int score_table_A0[] = {
1880 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1881 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/* Score bits indexed by (c2 & 0x0f); code_score() consults this table when
 * (c2 & 0x70) == 0x70. */
1884 const int score_table_F0[] = {
1885 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1886 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1887 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1888 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* OR the given SCORE_* bits into ptr->score. */
1891 void set_code_score(struct input_code *ptr, int score)
1894 ptr->score |= score;
/* Clear the given SCORE_* bits from ptr->score. */
1898 void clr_code_score(struct input_code *ptr, int score)
1901 ptr->score &= ~score;
/* Classify the character held in ptr->buf[0] (and ptr->buf[1] when
 * UTF8_OUTPUT_ENABLE is defined) and add the matching SCORE_* bits to the
 * candidate via set_code_score(). */
1905 void code_score(struct input_code *ptr)
1907 int c2 = ptr->buf[0];
1908 #ifdef UTF8_OUTPUT_ENABLE
1909 int c1 = ptr->buf[1];
1912 set_code_score(ptr, SCORE_ERROR);
1913 }else if (c2 == SSO){
1914 set_code_score(ptr, SCORE_KANA);
1915 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv() returning 0 is treated as "no such character" */
1916 }else if (!e2w_conv(c2, c1)){
1917 set_code_score(ptr, SCORE_NO_EXIST);
/* remaining cases are scored from the first byte alone */
1919 }else if ((c2 & 0x70) == 0x20){
1920 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1921 }else if ((c2 & 0x70) == 0x70){
1922 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1923 }else if ((c2 & 0x70) >= 0x50){
1924 set_code_score(ptr, SCORE_L2);
/* Rule out this candidate.  If its converter is the one currently selected
 * as iconv, the selection is withdrawn with set_iconv(FALSE, 0). */
1928 void status_disable(struct input_code *ptr)
1933 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append c to ptr->buf at ptr->index and advance the index. */
1936 void status_push_ch(struct input_code *ptr, int c)
1938 ptr->buf[ptr->index++] = c;
1941 void status_clear(struct input_code *ptr)
/* Put the candidate's score back to SCORE_INIT. */
1947 void status_reset(struct input_code *ptr)
1950 ptr->score = SCORE_INIT;
/* Re-initialise the candidate, including zeroing its _file_stat field. */
1953 void status_reinit(struct input_code *ptr)
1956 ptr->_file_stat = 0;
/* Hook called by the *_status functions for a byte c; it only acts when
 * c <= DEL and an encoding has already been established (estab_f). */
1959 void status_check(struct input_code *ptr, int c)
1961 if (c <= DEL && estab_f){
/* Detection state machine for the Shift_JIS candidate; consumes one input
 * byte c per call.
 *   - 0xa1..0xdf is buffered as the pair (SSO, c).
 *   - 0x81..0x9f and 0xe0..0xef are buffered as a first byte; further ranges
 *     are accepted under SHIFTJIS_CP932 (CP932_TABLE_BEGIN..CP932_TABLE_END)
 *     and when x0212_f is set (0xf0..0xfc).
 *   - a following byte must lie in 0x40..0x7e or 0x80..0xfc; the buffered
 *     pair is then converted in place with s2e_conv().
 *   - bytes that fit none of the accepted ranges disable the candidate. */
1966 void s_status(struct input_code *ptr, int c)
1970 status_check(ptr, c);
1975 #ifdef NUMCHAR_OPTION
1976 }else if ((c & CLASS_MASK) == CLASS_UTF16){
1979 }else if (0xa1 <= c && c <= 0xdf){
1980 status_push_ch(ptr, SSO);
1981 status_push_ch(ptr, c);
1984 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
1986 status_push_ch(ptr, c);
1987 #ifdef SHIFTJIS_CP932
1989 && CP932_TABLE_BEGIN <= c && c <= CP932_TABLE_END){
1991 status_push_ch(ptr, c);
1992 #endif /* SHIFTJIS_CP932 */
1994 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
1996 status_push_ch(ptr, c);
1997 #endif /* X0212_ENABLE */
1999 status_disable(ptr);
2003 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2004 status_push_ch(ptr, c);
2005 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2009 status_disable(ptr);
2013 #ifdef SHIFTJIS_CP932
2014 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2015 status_push_ch(ptr, c);
/* a successful (zero) conversion here marks the candidate with SCORE_CP932 */
2016 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2017 set_code_score(ptr, SCORE_CP932);
2022 #endif /* SHIFTJIS_CP932 */
2023 #ifndef X0212_ENABLE
2024 status_disable(ptr);
/* Detection state machine for the EUC candidate; consumes one input byte c
 * per call.
 *   - SSO or 0xa1..0xfe is buffered as a first byte.
 *   - 0x8f is buffered as a first byte in the X0212_ENABLE section.
 *   - following bytes must lie in 0xa1..0xfe to be buffered.
 *   - bytes outside the accepted ranges disable the candidate. */
2030 void e_status(struct input_code *ptr, int c)
2034 status_check(ptr, c);
2039 #ifdef NUMCHAR_OPTION
2040 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2043 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2045 status_push_ch(ptr, c);
2047 }else if (0x8f == c){
2049 status_push_ch(ptr, c);
2050 #endif /* X0212_ENABLE */
2052 status_disable(ptr);
2056 if (0xa1 <= c && c <= 0xfe){
2057 status_push_ch(ptr, c);
2061 status_disable(ptr);
2066 if (0xa1 <= c && c <= 0xfe){
2068 status_push_ch(ptr, c);
2070 status_disable(ptr);
2072 #endif /* X0212_ENABLE */
2076 #ifdef UTF8_INPUT_ENABLE
/* Detection state machine for the UTF-16 candidate; consumes one input byte
 * c per call.  ptr->_file_stat records the stream-level verdict:
 *    0  nothing decided yet; a first byte of 0xfe or 0xff is buffered and
 *       moves the state to 1, other bytes disable the candidate (-1);
 *   >0  the candidate is still alive and bytes keep being buffered;
 *   <0  the candidate stays disabled.
 * Later in the function a second 0xfe/0xff byte is accepted only when it
 * differs from ptr->stat. */
2077 void w16_status(struct input_code *ptr, int c)
2083 if (ptr->_file_stat == 0){
2084 if (c == 0xfe || c == 0xff){
2086 status_push_ch(ptr, c);
2087 ptr->_file_stat = 1;
2089 status_disable(ptr);
2090 ptr->_file_stat = -1;
2092 }else if (ptr->_file_stat > 0){
2094 status_push_ch(ptr, c);
2095 }else if (ptr->_file_stat < 0){
2096 status_disable(ptr);
2102 status_disable(ptr);
2103 ptr->_file_stat = -1;
2105 status_push_ch(ptr, c);
2112 if (ptr->stat != c && (c == 0xfe || c == 0xff)){
2113 status_push_ch(ptr, c);
2116 status_disable(ptr);
2117 ptr->_file_stat = -1;
/* Detection state machine for the UTF-8 candidate; consumes one input byte
 * c per call.
 *   - 0xc0..0xdf and 0xe0..0xef are buffered as lead bytes.
 *   - continuation bytes must lie in 0x80..0xbf.
 *   - once more bytes are buffered than ptr->stat, the sequence is checked
 *     for the byte-order mark EF BB BF and converted in place with
 *     w2e_conv().
 *   - bytes outside the accepted ranges disable the candidate. */
2123 void w_status(struct input_code *ptr, int c)
2127 status_check(ptr, c);
2132 #ifdef NUMCHAR_OPTION
2133 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2136 }else if (0xc0 <= c && c <= 0xdf){
2138 status_push_ch(ptr, c);
2139 }else if (0xe0 <= c && c <= 0xef){
2141 status_push_ch(ptr, c);
2143 status_disable(ptr);
2148 if (0x80 <= c && c <= 0xbf){
2149 status_push_ch(ptr, c);
2150 if (ptr->index > ptr->stat){
2151 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2152 && ptr->buf[2] == 0xbf);
2153 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2154 &ptr->buf[0], &ptr->buf[1]);
2161 status_disable(ptr);
/* Feed input byte c to the candidate detectors reachable from
 * input_code_list by calling each entry's status_func.  When a single
 * candidate (result) remains and no encoding is established yet, its
 * converter is selected with set_iconv(TRUE, ...); otherwise, for c <= DEL,
 * the candidate list is walked again. */
2168 void code_status(int c)
2170 int action_flag = 1;
2171 struct input_code *result = 0;
2172 struct input_code *p = input_code_list;
2174 (p->status_func)(p, c);
2177 }else if(p->stat == 0){
2188 if (result && !estab_f){
2189 set_iconv(TRUE, result->iconv_func);
2190 }else if (c <= DEL){
2191 struct input_code *ptr = input_code_list;
/* Base input routine for stream f.  Characters previously pushed back into
 * std_gc_buf are returned first, most recent first. */
2201 int std_getc(FILE *f)
2204 return std_gc_buf[--std_gc_ndx];
/* Push c back so a later std_getc() returns it.  The character is stored on
 * the std_gc_buf stack; the case where the stack already holds
 * STD_GC_BUFSIZE entries is tested before storing. */
2210 int std_ungetc(int c, FILE *f)
2212 if (std_gc_ndx == STD_GC_BUFSIZE){
2215 std_gc_buf[std_gc_ndx++] = c;
2220 void std_putc(int c)
2227 #if !defined(PERL_XS) && !defined(WIN32DLL)
/* Pass-through mode: reads characters from f through i_getc until EOF. */
2228 int noconvert(FILE *f)
2232 while ((c = (*i_getc)(f)) != EOF)
/* Wire up the conversion pipeline from the option flags.
 *
 * Output side: starting from output_conv, each enabled filter saves the
 * current oconv in its own o_* pointer and installs itself as oconv, so the
 * filters form a chain ending at the encoder.
 *
 * Input side: each enabled reader saves the current i_getc / i_ungetc pair
 * in its own i_* pointers and installs itself in their place.
 *
 * Finally the input converter is chosen: an explicit input_f forces the
 * matching converter with set_iconv(-TRUE, ...); otherwise e_iconv is set
 * without forcing. */
2238 void module_connection(void)
2240 oconv = output_conv;
2243 /* replace continuation module, from output side */
2245 /* output redirection */
2247 if (noout_f || guess_f){
2254 if (mimeout_f == TRUE) {
2255 o_base64conv = oconv; oconv = base64_conv;
2257 /* base64_count = 0; */
2261 o_crconv = oconv; oconv = cr_conv;
2264 o_rot_conv = oconv; oconv = rot_conv;
2267 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2270 o_hira_conv = oconv; oconv = hira_conv;
2273 o_fconv = oconv; oconv = fold_conv;
2276 if (alpha_f || x0201_f) {
2277 o_zconv = oconv; oconv = z_conv;
2281 i_ungetc = std_ungetc;
2282 /* input redirection */
2285 i_cgetc = i_getc; i_getc = cap_getc;
2286 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2289 i_ugetc = i_getc; i_getc = url_getc;
2290 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2293 #ifdef NUMCHAR_OPTION
2295 i_ngetc = i_getc; i_getc = numchar_getc;
2296 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2299 #ifdef UNICODE_NORMALIZATION
2300 if (nfc_f && input_f == UTF8_INPUT){
2301 i_nfc_getc = i_getc; i_getc = nfc_getc;
2302 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2305 if (mime_f && mimebuf_f==FIXED_MIME) {
2306 i_mgetc = i_getc; i_getc = mime_getc;
2307 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2310 i_bgetc = i_getc; i_getc = broken_getc;
2311 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2313 if (input_f == JIS_INPUT || input_f == LATIN1_INPUT) {
2314 set_iconv(-TRUE, e_iconv);
2315 } else if (input_f == SJIS_INPUT) {
2316 set_iconv(-TRUE, s_iconv);
2317 #ifdef UTF8_INPUT_ENABLE
2318 } else if (input_f == UTF8_INPUT) {
2319 set_iconv(-TRUE, w_iconv);
2320 } else if (input_f == UTF16BE_INPUT) {
2321 set_iconv(-TRUE, w_iconv16);
2322 } else if (input_f == UTF16LE_INPUT) {
2323 set_iconv(-TRUE, w_iconv16);
2326 set_iconv(FALSE, e_iconv);
2330 struct input_code *p = input_code_list;
2338 Conversion main loop. Code detection only.
/* Main conversion loop for one input stream f.
 *
 * Calls module_connection() to build the filter chain, then reads bytes
 * through i_getc until EOF.  Inside the loop it
 *   - hands ambiguous 8-bit input to h_conv() while no encoding is
 *     established,
 *   - handles single-byte kana (X0201) for both the SJIS and EUC cases,
 *   - recognises the start of a MIME encoded word ("=?"),
 *   - tracks SI/SO and ESC sequences to maintain input_mode / shift_mode,
 *   - passes completed characters on through iconv or oconv.
 * The NEXT / SEND / LAST macros name the three ways an iteration can end.
 * After the loop (*iconv)(EOF, 0, 0) is called, and if no input code name
 * has been set, the name of the candidate with the lowest score is used. */
2341 int kanji_convert(FILE *f)
2345 int is_8bit = FALSE;
2347 module_connection();
2350 if(input_f == SJIS_INPUT
2351 #ifdef UTF8_INPUT_ENABLE
2352 || input_f == UTF8_INPUT || input_f == UTF16BE_INPUT || input_f == UTF16LE_INPUT
2360 output_mode = ASCII;
2363 #define NEXT continue /* no output, get next */
2364 #define SEND ; /* output c1 and c2, get next */
2365 #define LAST break /* end of loop, go closing */
2367 while ((c1 = (*i_getc)(f)) != EOF) {
2368 #ifdef INPUT_CODE_FIX
2375 /* in case of 8th bit is on */
2376 if (!estab_f&&!mime_decode_mode) {
2377 /* in case of not established yet */
2378 /* It is still ambiguous */
2379 if (h_conv(f, c2, c1)==EOF)
2385 /* in case of already established */
2387 /* ignore bogus code */
2393 /* second byte, 7 bit code */
2394 /* it might be kanji shifted */
2395 if ((c1 == DEL) || (c1 <= SPACE)) {
2396 /* ignore bogus first code */
2404 #ifdef UTF8_INPUT_ENABLE
2413 #ifdef NUMCHAR_OPTION
2414 } else if ((c1 & CLASS_MASK) == CLASS_UTF16){
2417 } else if (c1 > DEL) {
2419 if (!estab_f && !iso8859_f) {
2420 /* not established yet */
2421 if (!is_8bit) is_8bit = TRUE;
2424 } else { /* estab_f==TRUE */
2429 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2430 /* SJIS X0201 Case... */
2431 if(iso2022jp_f && x0201_f==NO_X0201) {
2432 (*oconv)(GETA1, GETA2);
2439 } else if (c1==SSO && iconv != s_iconv) {
2440 /* EUC X0201 Case */
2441 c1 = (*i_getc)(f); /* skip SSO */
2443 if (SSP<=c1 && c1<0xe0) {
2444 if(iso2022jp_f && x0201_f==NO_X0201) {
2445 (*oconv)(GETA1, GETA2);
2452 } else { /* bogus code, skip SSO and one byte */
2456 /* already established */
2461 } else if ((c1 > SPACE) && (c1 != DEL)) {
2462 /* in case of Roman characters */
2464 /* output 1 shifted byte */
2468 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2469 /* output 1 shifted byte */
2470 if(iso2022jp_f && x0201_f==NO_X0201) {
2471 (*oconv)(GETA1, GETA2);
2478 /* look like bogus code */
2481 } else if (input_mode == X0208 || input_mode == X0212 ||
2482 input_mode == X0213_1 || input_mode == X0213_2) {
2483 /* in case of Kanji shifted */
2486 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2487 /* Check MIME code */
2488 if ((c1 = (*i_getc)(f)) == EOF) {
2491 } else if (c1 == '?') {
2492 /* =? is mime conversion start sequence */
2493 if(mime_f == STRICT_MIME) {
2494 /* check in real detail */
2495 if (mime_begin_strict(f) == EOF)
2499 } else if (mime_begin(f) == EOF)
2509 /* normal ASCII code */
2512 } else if (!is_8bit && c1 == SI) {
2515 } else if (!is_8bit && c1 == SO) {
2518 } else if (!is_8bit && c1 == ESC ) {
2519 if ((c1 = (*i_getc)(f)) == EOF) {
2520 /* (*oconv)(0, ESC); don't send bogus code */
2522 } else if (c1 == '$') {
2523 if ((c1 = (*i_getc)(f)) == EOF) {
2525 (*oconv)(0, ESC); don't send bogus code
2526 (*oconv)(0, '$'); */
2528 } else if (c1 == '@'|| c1 == 'B') {
2529 /* This is kanji introduction */
2532 set_input_codename("ISO-2022-JP");
2534 debug(input_codename);
2537 } else if (c1 == '(') {
2538 if ((c1 = (*i_getc)(f)) == EOF) {
2539 /* don't send bogus code
2545 } else if (c1 == '@'|| c1 == 'B') {
2546 /* This is kanji introduction */
2551 } else if (c1 == 'D'){
2555 #endif /* X0212_ENABLE */
2556 } else if (c1 == (X0213_1&0x7F)){
2557 input_mode = X0213_1;
2560 } else if (c1 == (X0213_2&0x7F)){
2561 input_mode = X0213_2;
2565 /* could be some special code */
2572 } else if (broken_f&0x2) {
2573 /* accept any ESC-(-x as broken code ... */
2583 } else if (c1 == '(') {
2584 if ((c1 = (*i_getc)(f)) == EOF) {
2585 /* don't send bogus code
2587 (*oconv)(0, '('); */
2591 /* This is X0201 kana introduction */
2592 input_mode = X0201; shift_mode = X0201;
2594 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2595 /* This is X0208 kanji introduction */
2596 input_mode = ASCII; shift_mode = FALSE;
2598 } else if (broken_f&0x2) {
2599 input_mode = ASCII; shift_mode = FALSE;
2604 /* maintain various input_mode here */
2608 } else if ( c1 == 'N' || c1 == 'n' ){
2610 c3 = (*i_getc)(f); /* skip SS2 */
2611 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2626 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2627 input_mode = ASCII; set_iconv(FALSE, 0);
2629 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2630 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2638 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2639 if ((c1=(*i_getc)(f))!=EOF) {
2643 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2661 if ((*iconv)(c2, c1, 0) < 0){ /* can be EUC/SJIS */
2662 int c0 = (*i_getc)(f);
2665 (*iconv)(c2, c1, c0);
2671 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2675 (*oconv)((0x8f << 8) | c2, c1);
2677 #endif /* X0212_ENABLE */
2679 (*oconv)((0x8f << 8) | c2, c1);
2682 (*oconv)(input_mode, c1); /* other special case */
2687 /* goto next_word */
2691 (*iconv)(EOF, 0, 0);
2692 if (!is_inputcode_set)
2695 struct input_code *p = input_code_list;
2696 struct input_code *result = p;
2698 if (p->score < result->score) result = p;
2701 set_input_codename(result->name);
/* Resolve an ambiguous 8-bit sequence using the hold buffer.
 *
 * Further bytes are read through i_getc and stored with push_hold_buf()
 * until EOF, the buffer reports full (EOF from push_hold_buf), or estab_f
 * becomes set.  If the encoding is still open, the candidate with the
 * lowest score is selected via set_iconv(FALSE, ...).  The buffered bytes
 * are then replayed through iconv, one, two or three at a time; under
 * s_iconv a byte in 0xa1..0xdf is sent as a single X0201 character. */
2708 h_conv(FILE *f, int c2, int c1)
2713 /** it must NOT be in the kanji shifted sequence */
2714 /** it must NOT be written in JIS7 */
2715 /** and it must be after 2 byte 8bit code */
2721 while ((c1 = (*i_getc)(f)) != EOF) {
2727 if (push_hold_buf(c1) == EOF || estab_f){
2733 struct input_code *p = input_code_list;
2734 struct input_code *result = p;
2739 if (p->score < result->score){
2744 set_iconv(FALSE, result->iconv_func);
2749 ** 1) EOF is detected, or
2750 ** 2) Code is established, or
2751 ** 3) Buffer is FULL (but last word is pushed)
2753 ** in 1) and 3) cases, we continue to use
2754 ** Kanji codes by oconv and leave estab_f unchanged.
2759 while (wc < hold_count){
2760 c2 = hold_buf[wc++];
2762 #ifdef NUMCHAR_OPTION
2763 || (c2 & CLASS_MASK) == CLASS_UTF16
2768 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2769 (*iconv)(X0201, c2, 0);
2772 if (wc < hold_count){
2773 c1 = hold_buf[wc++];
2782 if ((*iconv)(c2, c1, 0) < 0){
2784 if (wc < hold_count){
2785 c0 = hold_buf[wc++];
2794 (*iconv)(c2, c1, c0);
/* Append c2 (truncated to unsigned char) to hold_buf.  Capacity is
 * HOLD_SIZE*2 entries.  Returns EOF when the buffer has become full after
 * the store, otherwise the new hold_count. */
2803 push_hold_buf(int c2)
2805 if (hold_count >= HOLD_SIZE*2)
2807 hold_buf[hold_count++] = (unsigned char)c2;
2808 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/* Convert a Shift_JIS byte pair (c2 = first byte, c1 = second byte) to the
 * internal two-byte representation; p2 / p1 are the output pointers.
 *
 * Special cases handled before the generic arithmetic:
 *   - with cp51932_f, first bytes in CP932_TABLE_BEGIN..CP932_TABLE_END are
 *     looked up in shiftjis_cp932[][];
 *   - without x0213_f, first bytes 0xfa..0xfc are looked up in
 *     shiftjis_x0212[][] and tagged with the 0x8f prefix;
 *   - with x0213_f, first bytes >= 0xF0 are mapped onto the 0x8Fxx range.
 * Otherwise the first byte is doubled and offset by SJ0162 or SJ6394, and
 * incremented when the second byte is above 0x9E. */
2811 int s2e_conv(int c2, int c1, int *p2, int *p1)
2813 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
2816 static const int shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
2817 #ifdef SHIFTJIS_CP932
2818 if (cp51932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
2819 extern const unsigned short shiftjis_cp932[3][189];
2820 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
2826 #endif /* SHIFTJIS_CP932 */
2828 if (!x0213_f && 0xfa <= c2 && c2 <= 0xfc){
2829 extern const unsigned short shiftjis_x0212[3][189];
2830 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
2833 c2 = (0x8f << 8) | (val >> 8);
2846 if(x0213_f && c2 >= 0xF0){
2847 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
2848 c2 = 0x8F20 + shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
2849 }else{ /* 78<=k<=94 */
2850 c2 = 0x8F00 | (c2 * 2 - 0x17B);
2851 if (0x9E < c1) c2++;
2854 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
2855 if (0x9E < c1) c2++;
2858 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
2865 c2 = x0212_unshift(c2);
/* Input converter used for Shift_JIS input.  Values with c2 equal to EOF or
 * 0, or below SPACE, take a separate branch; other pairs are converted with
 * s2e_conv(), and a nonzero result from it is returned to the caller
 * unchanged. */
2872 int s_iconv(int c2, int c1, int c0)
2876 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2879 int ret = s2e_conv(c2, c1, &c2, &c1);
2880 if (ret) return ret;
/* Input converter used for EUC/JIS input.
 *   - c2 == 0x8f introduces a three-byte character; c2 is rebuilt as
 *     (c2 << 8) | (c1 & 0x7f).  Under SHIFTJIS_CP932 that character is
 *     round-tripped through e2s_conv() / s2e_conv().
 *   - c2 == SSO, and the EOF / 0 / below-SPACE values, each take their own
 *     branch. */
2886 int e_iconv(int c2, int c1, int c0)
2891 }else if (c2 == 0x8f){
2895 c2 = (c2 << 8) | (c1 & 0x7f);
2897 #ifdef SHIFTJIS_CP932
2900 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2901 s2e_conv(s2, s1, &c2, &c1);
2902 if ((c2 & 0xff00) == 0){
2908 #endif /* SHIFTJIS_CP932 */
2909 #endif /* X0212_ENABLE */
2910 } else if (c2 == SSO){
2913 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2923 #ifdef UTF8_INPUT_ENABLE
/* Convert a UTF-8 sequence (c2, c1, c0) to the internal pair written
 * through p2 / p1.  Lead bytes 0xc0..0xef are mapped with
 * unicode_to_jis_common(); under NUMCHAR_OPTION, *p1 can instead receive
 * the raw code point tagged with CLASS_UTF16. */
2924 int w2e_conv(int c2, int c1, int c0, int *p2, int *p1)
2931 }else if (0xc0 <= c2 && c2 <= 0xef) {
2932 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2933 #ifdef NUMCHAR_OPTION
2936 if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
/* Input converter used for UTF-8 input.
 *
 * When ignore_zwnbsp_f is set, the flag is cleared and the sequence is
 * tested for the byte-order mark EF BB BF.  The lead byte c2 then decides
 * the form: 0 means a single 7-bit byte, 0xc0..0xdf a two-byte sequence
 * (overlong C0/C1 leads and bad continuation bytes return 0), 0xe0..0xef
 * returns -1 so the caller supplies the third byte, and leads >= 0xf0 or
 * stray continuation bytes return 0.  Three-byte sequences are range
 * checked per lead byte (0xED is limited to c1 in 0x80..0x9F) before the
 * sequence is converted with w2e_conv(). */
2944 int w_iconv(int c2, int c1, int c0)
2948 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
2949 if(ignore_zwnbsp_f){
2950 ignore_zwnbsp_f = FALSE;
2951 if(c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
2955 if (c2 == 0) /* 0x00-0x7f */
2956 c1 &= 0x7F; /* 1byte */
2958 if ((c2 & 0xe0) == 0xc0){ /* 0xc0-0xdf */
2960 if((c2 & 0xFE) == 0xC0 || c1 < 0x80 || 0xBF < c1) return 0;
2961 }else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
2962 return -1; /* 3bytes */
2964 else if (0xf0 <= c2)
2965 return 0; /* 4,5,6bytes */
2966 else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
2967 return 0; /* trail byte */
2971 /* must be 3bytes */
2973 if(c1 < 0xA0 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
2975 }else if(c2 == 0xED){
2976 if(c1 < 0x80 || 0x9F < c1 || c0 < 0x80 || 0xBF < c0)
2978 }else if((c2 & 0xf0) == 0xe0){
2979 if(c1 < 0x80 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
2983 if (c2 == 0 || c2 == EOF){
2985 ret = w2e_conv(c2, c1, c0, &c2, &c1);
2994 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/* Encode the 16-bit value val as UTF-8 bytes stored through p2, p1, p0.
 * Values below 0x800 use the two-byte form (110xxxxx 10xxxxxx); the last
 * branch produces the three-byte form (1110xxxx 10xxxxxx 10xxxxxx). */
2995 void w16w_conv(int val, int *p2, int *p1, int *p0)
3002 }else if (val < 0x800){
3003 *p2 = 0xc0 | (val >> 6);
3004 *p1 = 0x80 | (val & 0x3f);
3007 *p2 = 0xe0 | (val >> 12);
3008 *p1 = 0x80 | ((val >> 6) & 0x3f);
3009 *p0 = 0x80 | (val & 0x3f);
3014 #ifdef UTF8_INPUT_ENABLE
/* Decode a UTF-8 sequence (c2 = lead byte, then c1, c0) into its code
 * value: leads >= 0xe0 are treated as three-byte sequences, leads >= 0xc0
 * as two-byte sequences. */
3015 int ww16_conv(int c2, int c1, int c0)
3020 }else if (c2 >= 0xe0){
3021 val = (c2 & 0x0f) << 12;
3022 val |= (c1 & 0x3f) << 6;
3024 }else if (c2 >= 0xc0){
3025 val = (c2 & 0x1f) << 6;
/* Convert the 16-bit Unicode value val to the internal pair written through
 * p2 / p1.  The value is first expanded to UTF-8 with w16w_conv() and then
 * mapped with unicode_to_jis_common().  Under NUMCHAR_OPTION, *p1 may be set
 * to CLASS_UTF16 | val instead (presumably when no mapping exists - confirm
 * against the surrounding condition). */
3033 int w16e_conv(int val, int *p2, int *p1)
3042 w16w_conv(val, &c2, &c1, &c0);
3043 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3044 #ifdef NUMCHAR_OPTION
3047 *p1 = CLASS_UTF16 | val;
3056 #ifdef UTF8_INPUT_ENABLE
/* Input converter used for UTF-16 input; c2 and c1 are the two bytes of one
 * code unit.
 *
 * When ignore_zwnbsp_f is set, the flag is cleared and a byte-order mark
 * selects the byte order: FE FF -> UTF16BE_INPUT, FF FE -> UTF16LE_INPUT.
 * In little-endian mode the two bytes are swapped.  ASCII and EOF take a
 * separate branch, code units whose top five bits are 11011 are treated as
 * surrogates, and everything else is converted with w16e_conv(); a nonzero
 * result is returned to the caller. */
3057 int w_iconv16(int c2, int c1, int c0)
3061 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
3062 if(ignore_zwnbsp_f){
3063 ignore_zwnbsp_f = FALSE;
3064 if (c2==0376 && c1==0377){
3065 utf16_mode = UTF16BE_INPUT;
3067 }else if(c2==0377 && c1==0376){
3068 utf16_mode = UTF16LE_INPUT;
3072 if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
3074 tmp=c1; c1=c2; c2=tmp;
3076 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3079 }else if((c2>>3)==27){ /* surrogate pair */
3081 }else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
3082 if (ret) return ret;
/* Map a UTF-8 sequence (c2, c1, c0) to the internal pair written through
 * p2 / p1, using the utf8_to_euc_* lookup tables.
 *
 * The table set is chosen by ms_ucs_map_f (UCS_MAP_CP932 -> *_932,
 * UCS_MAP_MS -> *_ms, otherwise the plain tables).  Two-byte sequences
 * (c2 < 0xe0) go through the *_2bytes tables, three-byte sequences through
 * the *_3bytes tables indexed by c2 - 0xE0.
 *
 * When no_best_fit_chars_f is set, selected code points are rejected up
 * front with return value 1; the no_best_fit_chars_table_* arrays are
 * indexed by c1 & 0x3F and a nonzero entry means "reject". */
3087 int unicode_to_jis_common(int c2, int c1, int c0, int *p2, int *p1)
3089 extern const unsigned short *const utf8_to_euc_2bytes[];
3090 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3091 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3092 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3093 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3094 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3095 const unsigned short *const *pp;
3096 const unsigned short *const *const *ppp;
3097 static const int no_best_fit_chars_table_C2[] =
3098 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3099 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3100 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3101 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3102 static const int no_best_fit_chars_table_C2_ms[] =
3103 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3104 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3105 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3106 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3107 static const int no_best_fit_chars_table_932_C2[] =
3108 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3110 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3111 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3112 static const int no_best_fit_chars_table_932_C3[] =
3113 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3114 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3115 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3116 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3122 }else if(c2 < 0xe0){
3123 if(no_best_fit_chars_f){
3124 if(ms_ucs_map_f == UCS_MAP_CP932){
3127 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3130 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3133 }else if(cp51932_f){
3136 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3139 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3142 }else if(ms_ucs_map_f == UCS_MAP_MS){
3143 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3145 }else if(!x0212_f && ms_ucs_map_f != UCS_MAP_CP932){
3149 if(no_best_fit_chars_table_C2[c1&0x3F]&1) return 1;
3151 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3155 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3160 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3161 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3163 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3165 if(no_best_fit_chars_f){
3166 if(ms_ucs_map_f == UCS_MAP_CP932){
3167 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3168 }else if(ms_ucs_map_f == UCS_MAP_MS){
3173 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3176 if(c0 == 0x92) return 1;
3181 if(c1 == 0x80 || c0 == 0x9C) return 1;
3189 if(c0 == 0x95) return 1;
3192 if(c0 == 0xA5) return 1;
3199 if(c0 == 0x8D) return 1;
3202 if(c0 == 0x9E && cp51932_f) return 1;
3205 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3213 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3214 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3216 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
/* Two-level table lookup shared by the UTF-8 converters.
 *   c1, c0 - index values for the outer and inner table
 *   pp     - outer table of psize row pointers
 *   p2, p1 - output pointers for the converted pair
 * Returns 1 when the table, the row or the entry is missing, or when an
 * index is out of range.  With no_cp932ext_f set, selected extension
 * values are filtered as well.  A resulting c2 equal to SO is rewritten to
 * X0201. */
3221 int w_iconv_common(int c1, int c0, const unsigned short *const *pp, int psize, int *p2, int *p1)
3224 const unsigned short *p;
3227 if (pp == 0) return 1;
3230 if (c1 < 0 || psize <= c1) return 1;
3232 if (p == 0) return 1;
3235 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3237 if (val == 0) return 1;
3238 if (no_cp932ext_f && (
3239 (val>>8) == 0x2D || /* NEC special characters */
3240 val > 0xF300 /* NEC special characters */
3248 if (c2 == SO) c2 = X0201;
/* Emit c as upper-case hexadecimal digits, one nibble per call of the
 * output callback f (invoked as f(0, digit)). */
3255 void nkf_each_char_to_hex(void (*f)(int c2,int c1), int c)
3257 const char *hex = "0123456789ABCDEF";
3263 (*f)(0, hex[(c>>shift)&0xF]);
/* Fallback for an unencodable character, used by --fb-html: writes the
 * decimal digits of c through oconv, from the millions place down to the
 * units place. */
3273 void encode_fallback_html(int c)
3279 (*oconv)(0, 0x30+(c/1000000)%10);
3281 (*oconv)(0, 0x30+(c/100000 )%10);
3283 (*oconv)(0, 0x30+(c/10000 )%10);
3285 (*oconv)(0, 0x30+(c/1000 )%10);
3287 (*oconv)(0, 0x30+(c/100 )%10);
3289 (*oconv)(0, 0x30+(c/10 )%10);
3291 (*oconv)(0, 0x30+ c %10);
/* Fallback for an unencodable character, used by --fb-xml: writes c in
 * hexadecimal through oconv via nkf_each_char_to_hex(). */
3296 void encode_fallback_xml(int c)
3301 nkf_each_char_to_hex(oconv, c);
/* Fallback for an unencodable character, used by --fb-java: writes c as
 * upper-case hex digits through oconv.  When the low 24 bits exceed 0xFFFF
 * the two extra nibbles (bits 20-23 and 16-19) are emitted; the four
 * nibbles of the low 16 bits are always emitted. */
3306 void encode_fallback_java(int c)
3308 const char *hex = "0123456789ABCDEF";
3310 if((c&0x00FFFFFF) > 0xFFFF){
3314 (*oconv)(0, hex[(c>>20)&0xF]);
3315 (*oconv)(0, hex[(c>>16)&0xF]);
3319 (*oconv)(0, hex[(c>>12)&0xF]);
3320 (*oconv)(0, hex[(c>> 8)&0xF]);
3321 (*oconv)(0, hex[(c>> 4)&0xF]);
3322 (*oconv)(0, hex[ c &0xF]);
/* Fallback for an unencodable character, used by --fb-perl: writes c in
 * hexadecimal through oconv via nkf_each_char_to_hex(). */
3326 void encode_fallback_perl(int c)
3331 nkf_each_char_to_hex(oconv, c);
/* Fallback for an unencodable character, used by --fb-subchar: the
 * argument is ignored and the configured unicode_subchar is output
 * instead, split into its high and low byte. */
3336 void encode_fallback_subchar(int c)
3338 c = unicode_subchar;
3339 (*oconv)((c>>8)&0xFF, c&0xFF);
3344 #ifdef UTF8_OUTPUT_ENABLE
/* Look up the Unicode value for an internal character (c2, c1) in the
 * euc_to_utf8_* tables.
 *   - one table (euc_to_utf8_1byte) serves the single-byte case;
 *   - c2 with high byte 0x8f selects x0212_to_utf8_2bytes[];
 *   - otherwise the row comes from euc_to_utf8_2bytes_ms[] unless
 *     ms_ucs_map_f is UCS_MAP_ASCII, in which case euc_to_utf8_2bytes[] is
 *     used.
 * Row and column indexes are formed as (byte & 0x7f) - 0x21 and range
 * checked before use.  Callers such as code_score() treat a zero result as
 * "no mapping". */
3345 int e2w_conv(int c2, int c1)
3347 extern const unsigned short euc_to_utf8_1byte[];
3348 extern const unsigned short *const euc_to_utf8_2bytes[];
3349 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3350 extern const unsigned short *const x0212_to_utf8_2bytes[];
3351 const unsigned short *p;
3354 p = euc_to_utf8_1byte;
3356 } else if (c2 >> 8 == 0x8f){
3357 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == 0x8F22 && c1 == 0x43){
3360 c2 = (c2&0x7f) - 0x21;
3361 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3362 p = x0212_to_utf8_2bytes[c2];
3368 c2 = (c2&0x7f) - 0x21;
3369 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3370 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3375 c1 = (c1 & 0x7f) - 0x21;
3376 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/* Output converter for UTF-8.
 *   - unicode_bom_f == 2 is tested first;
 *   - under NUMCHAR_OPTION a CLASS_UTF16-tagged c1 is expanded directly with
 *     w16w_conv();
 *   - ISO8859_1 characters are written as the single byte c1 | 0x80;
 *   - other characters are mapped with e2w_conv() and expanded with
 *     w16w_conv(); the third byte c0 is written only when nonzero. */
3381 void w_oconv(int c2, int c1)
3390 if (unicode_bom_f==2) {
3397 #ifdef NUMCHAR_OPTION
3398 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3399 w16w_conv(c1, &c2, &c1, &c0);
3403 if (c0) (*o_putc)(c0);
3410 output_mode = ASCII;
3412 } else if (c2 == ISO8859_1) {
3413 output_mode = ISO8859_1;
3414 (*o_putc)(c1 | 0x080);
3417 val = e2w_conv(c2, c1);
3419 w16w_conv(val, &c2, &c1, &c0);
3423 if (c0) (*o_putc)(c0);
/*
 * Output converter producing two-byte units.
 * Visible behaviour:
 *   - when unicode_bom_f == 2, the byte 0377 (0xFF) is written through
 *     o_putc in two places (presumably the two byte orders of a byte
 *     order mark - TODO confirm, the companion byte is not shown);
 *   - c2 == ISO8859_1 is handled in its own branch;
 *   - with NUMCHAR_OPTION, a CLASS_UTF16 value in c1 is split so that c2
 *     receives bits 8-15;
 *   - otherwise the pair is mapped with e2w_conv and c2 receives bits
 *     8-15 of the result.
 * NOTE(review): the final byte writes are not part of this listing.
 */
3429 void w_oconv16(int c2, int c1)
3436 if (unicode_bom_f==2) {
3438 (*o_putc)((unsigned char)'\377');
3442 (*o_putc)((unsigned char)'\377');
3447 if (c2 == ISO8859_1) {
3450 #ifdef NUMCHAR_OPTION
3451 } else if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16) {
3452 c2 = (c1 >> 8) & 0xff;
3456 int val = e2w_conv(c2, c1);
3457 c2 = (val >> 8) & 0xff;
/*
 * Output converter for EUC-JP (output_mode JAPANESE_EUC), writing through
 * o_putc.  Dispatch on c2 as far as this listing shows:
 *   - NUMCHAR_OPTION: a CLASS_UTF16 value is first converted with
 *     w16e_conv; if it is still CLASS_UTF16 afterwards, encode_fallback is
 *     called when it is set;
 *   - c2 == 0: output_mode becomes ASCII;
 *   - c2 == X0201: SSO followed by c1 with bit 7 set;
 *   - c2 == ISO8859_1: c1 with bit 7 set;
 *   - c2 with 0x8f in bits 8-15: with SHIFTJIS_CP932 the pair may be
 *     round-tripped through e2s_conv / s2e_conv before the bytes are
 *     written with bit 7 set;
 *   - anything else: both bytes must lie in 0x21..0x7e, otherwise
 *     set_iconv(FALSE, 0) is called and the character is dropped;
 *     valid pairs are written with bit 7 set on both bytes.
 * NOTE(review): several lines between the visible ones are missing from
 * this listing, so the exact nesting of the branches should be confirmed.
 */
3471 void e_oconv(int c2, int c1)
3473 #ifdef NUMCHAR_OPTION
3474 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3475 w16e_conv(c1, &c2, &c1);
3476 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3477 if(encode_fallback)(*encode_fallback)(c1);
3485 } else if (c2 == 0) {
3486 output_mode = ASCII;
3488 } else if (c2 == X0201) {
3489 output_mode = JAPANESE_EUC;
3490 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3491 } else if (c2 == ISO8859_1) {
3492 output_mode = ISO8859_1;
3493 (*o_putc)(c1 | 0x080);
3495 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3496 output_mode = JAPANESE_EUC;
3497 #ifdef SHIFTJIS_CP932
3500 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3501 s2e_conv(s2, s1, &c2, &c1);
3506 output_mode = ASCII;
3508 }else if ((c2 & 0xff00) >> 8 == 0x8f){
3511 (*o_putc)((c2 & 0x7f) | 0x080);
3512 (*o_putc)(c1 | 0x080);
3515 (*o_putc)((c2 & 0x7f) | 0x080);
3516 (*o_putc)(c1 | 0x080);
3520 if ((c1<0x21 || 0x7e<c1) ||
3521 (c2<0x21 || 0x7e<c2)) {
3522 set_iconv(FALSE, 0);
3523 return; /* too late to rescue this char */
3525 output_mode = JAPANESE_EUC;
3526 (*o_putc)(c2 | 0x080);
3527 (*o_putc)(c1 | 0x080);
/*
 * Remaps a row value c in the range 0x75..0x7f to a shifted range.
 * The visible code adds (0x109 - 0x75) in the branch taken when ret has
 * 0x8f00 in its upper byte, and (0x113 - 0x75) in the other branch.
 * x0212_unshift contains the matching reverse offsets.
 * NOTE(review): the initialisation of ret, any early return and the
 * final return are not part of this listing.
 */
3532 int x0212_shift(int c)
3536 if ((ret & 0xff00) == 0x8f00){
3537 if (0x75 <= c && c <= 0x7f){
3538 ret = c + (0x109 - 0x75);
3541 if (0x75 <= c && c <= 0x7f){
3542 ret = c + (0x113 - 0x75);
/*
 * Reverse remapping of shifted row values.
 * Values 0x7f..0x88 are moved back by (0x75 - 0x7f); values 0x89..0x92
 * are moved back by (0x75 - 0x89) and then tagged with 0x8f in the upper
 * byte and with bit 7 set.
 * NOTE(review): the initialisation of ret and the return statement are
 * not part of this listing.
 */
3549 int x0212_unshift(int c)
3552 if (0x7f <= c && c <= 0x88){
3553 ret = c + (0x75 - 0x7f);
3554 }else if (0x89 <= c && c <= 0x92){
3555 ret = (0x8f << 8) | 0x80 | (c + (0x75 - 0x89));
3559 #endif /* X0212_ENABLE */
/*
 * Converts the pair (c2, c1) to a Shift_JIS byte pair stored through p2
 * and p1; either pointer may be NULL, in which case that byte is skipped.
 * Callers in this file treat a return value of 0 as success, and the
 * visible code returns 1 when c2 is above 0x7F after the 0x8f handling.
 * For c2 values tagged with 0x8f00 the visible code distinguishes three
 * ranges of ndx: 0x21..0x2F and 0x6E..0x7E are computed arithmetically,
 * and other values in 0x21..0x7e are looked up in x0212_shiftjis.
 * The closing arithmetic is the plain JIS to Shift_JIS mapping: lead byte
 * ((c2 - 1) >> 1) plus 0x71 or 0xb1, trail byte c1 plus an offset that
 * depends on the parity of c2 and on whether c1 is below 0x60.
 * NOTE(review): how ndx is derived and what happens to val after the
 * table lookup is not visible in this listing.
 */
3561 int e2s_conv(int c2, int c1, int *p2, int *p1)
3564 if ((c2 & 0xff00) == 0x8f00){
3567 if((0x21 <= ndx && ndx <= 0x2F)){
3568 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3569 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3571 }else if(0x6E <= ndx && ndx <= 0x7E){
3572 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3573 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3579 else if(0x21 <= ndx && ndx <= 0x7e){
3581 const unsigned short *ptr;
3582 extern const unsigned short *const x0212_shiftjis[];
3583 ptr = x0212_shiftjis[ndx - 0x21];
3585 val = ptr[(c1 & 0x7f) - 0x21];
3594 c2 = x0212_shift(c2);
3596 #endif /* X0212_ENABLE */
3598 if(0x7F < c2) return 1;
3599 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3600 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * Output converter for Shift_JIS (output_mode SHIFT_JIS).
 * Dispatch on c2 as far as this listing shows:
 *   - NUMCHAR_OPTION: a CLASS_UTF16 value is converted with w16e_conv;
 *     if it is still CLASS_UTF16, encode_fallback is called when set;
 *   - c2 == 0: output_mode becomes ASCII;
 *   - c2 == X0201: output_mode becomes SHIFT_JIS;
 *   - c2 == ISO8859_1: c1 is written with bit 7 set;
 *   - c2 tagged with 0x8f: converted in place with e2s_conv, continuing
 *     only when that returns 0;
 *   - anything else: both bytes must lie in 0x20..0x7e, otherwise
 *     set_iconv(FALSE, 0) is called and the character is dropped; valid
 *     pairs are converted in place with e2s_conv.
 * With SHIFTJIS_CP932, lead bytes between CP932INV_TABLE_BEGIN and
 * CP932INV_TABLE_END are looked up in cp932inv.  Before the trail byte is
 * written, a non-zero prefix_table entry for c1 is written first.
 * NOTE(review): the actual writes of c2 and c1 are not in this listing.
 */
3604 void s_oconv(int c2, int c1)
3606 #ifdef NUMCHAR_OPTION
3607 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3608 w16e_conv(c1, &c2, &c1);
3609 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3610 if(encode_fallback)(*encode_fallback)(c1);
3618 } else if (c2 == 0) {
3619 output_mode = ASCII;
3621 } else if (c2 == X0201) {
3622 output_mode = SHIFT_JIS;
3624 } else if (c2 == ISO8859_1) {
3625 output_mode = ISO8859_1;
3626 (*o_putc)(c1 | 0x080);
3628 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3629 output_mode = SHIFT_JIS;
3630 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3636 if ((c1<0x20 || 0x7e<c1) ||
3637 (c2<0x20 || 0x7e<c2)) {
3638 set_iconv(FALSE, 0);
3639 return; /* too late to rescue this char */
3641 output_mode = SHIFT_JIS;
3642 e2s_conv(c2, c1, &c2, &c1);
3644 #ifdef SHIFTJIS_CP932
3646 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3647 extern const unsigned short cp932inv[2][189];
3648 int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3654 #endif /* SHIFTJIS_CP932 */
3657 if (prefix_table[(unsigned char)c1]){
3658 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * Output converter for 7-bit JIS with escape-sequence mode switching.
 * output_mode tracks the currently designated character set, and the
 * visible code only emits a designation when the mode changes:
 *   - back to ASCII (final byte ascii_intro) when leaving a mode other
 *     than ASCII or ISO8859_1;
 *   - X0213_2 or X0212 for c2 values tagged with 0x8f;
 *   - X0201 and ISO8859_1 for those pseudo lead bytes;
 *   - X0213_1 or X0208 (final byte kanji_intro) for ordinary two-byte
 *     characters, after both bytes passed the 0x20..0x7e range check.
 * With NUMCHAR_OPTION a CLASS_UTF16 value is converted with w16e_conv
 * first and handed to encode_fallback when it cannot be converted.
 * NOTE(review): the escape bytes themselves and the character writes are
 * mostly missing from this listing.
 */
3664 void j_oconv(int c2, int c1)
3666 #ifdef NUMCHAR_OPTION
3667 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3668 w16e_conv(c1, &c2, &c1);
3669 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3670 if(encode_fallback)(*encode_fallback)(c1);
3676 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3679 (*o_putc)(ascii_intro);
3680 output_mode = ASCII;
3684 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3686 if(output_mode!=X0213_2){
3687 output_mode = X0213_2;
3691 (*o_putc)(X0213_2&0x7F);
3694 if(output_mode!=X0212){
3695 output_mode = X0212;
3699 (*o_putc)(X0212&0x7F);
3702 (*o_putc)(c2 & 0x7f);
3705 } else if (c2==X0201) {
3706 if (output_mode!=X0201) {
3707 output_mode = X0201;
3713 } else if (c2==ISO8859_1) {
3714 /* iso8859 introduction, or 8th bit on */
3715 /* Can we convert in 7bit form using ESC-'-'-A ?
3717 output_mode = ISO8859_1;
3719 } else if (c2 == 0) {
3720 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3723 (*o_putc)(ascii_intro);
3724 output_mode = ASCII;
3728 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
3730 if (output_mode!=X0213_1) {
3731 output_mode = X0213_1;
3735 (*o_putc)(X0213_1&0x7F);
3737 }else if (output_mode != X0208) {
3738 output_mode = X0208;
3741 (*o_putc)(kanji_intro);
/*
 * Filter stage placed in front of o_base64conv: calls mime_prechar with
 * the character first, then forwards the same pair to o_base64conv.
 */
3748 void base64_conv(int c2, int c1)
3750 mime_prechar(c2, c1);
3751 (*o_base64conv)(c2,c1);
/*
 * Input filter for streams whose escape characters were lost.
 * broken_buf is a small pushback stack with broken_counter entries;
 * pending entries are returned before anything new is examined.
 * A dollar sign not preceded by ESC (broken_last) while input_mode is
 * ASCII or X0201, followed by an at-sign or B, and an opening parenthesis
 * not preceded by ESC while input_mode is X0208 or X0201, followed by J or
 * B, are recognised; the two characters are then queued in broken_buf.
 * NOTE(review): the reads, the value returned in the recognised cases
 * (presumably ESC) and the update of broken_last are not in this listing.
 */
3755 static int broken_buf[3];
3756 static int broken_counter = 0;
3757 static int broken_last = 0;
3758 int broken_getc(FILE *f)
3762 if (broken_counter>0) {
3763 return broken_buf[--broken_counter];
3766 if (c=='$' && broken_last != ESC
3767 && (input_mode==ASCII || input_mode==X0201)) {
3770 if (c1=='@'|| c1=='B') {
3771 broken_buf[0]=c1; broken_buf[1]=c;
3778 } else if (c=='(' && broken_last != ESC
3779 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
3782 if (c1=='J'|| c1=='B') {
3783 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * Pushes c back onto broken_buf so that broken_getc returns it next.
 * The push only happens while fewer than two entries are pending;
 * otherwise c is not stored by the visible code.
 * NOTE(review): the return statement is not part of this listing.
 */
3796 int broken_ungetc(int c, FILE *f)
3798 if (broken_counter<2)
3799 broken_buf[broken_counter++]=c;
/*
 * Line-ending conversion filter in front of o_crconv, driven by crmode_f.
 * As far as this listing shows, a line feed makes the filter emit a
 * carriage return when crmode_f is CRLF or CR, carriage returns are
 * handled in their own branch, and the character 032 (Ctrl-Z) gets
 * special treatment when crmode_f is NL.
 * prev_cr is file-local state for this filter; its use is not visible in
 * the lines shown here.
 */
3803 static int prev_cr = 0;
3805 void cr_conv(int c2, int c1)
3809 if (! (c2==0&&c1==NL) ) {
3815 } else if (c1=='\r') {
3817 } else if (c1=='\n') {
3818 if (crmode_f==CRLF) {
3819 (*o_crconv)(0,'\r');
3820 } else if (crmode_f==CR) {
3821 (*o_crconv)(0,'\r');
3825 } else if (c1!='\032' || crmode_f!=NL){
3831 Return value of fold_conv()
3833 \n add newline and output char
3834 \r add newline and output nothing
3837 1 (or else) normal output
3839 fold state in prev (previous character)
3841 >0x80 Japanese (X0208/X0201)
3846 This fold algorithm does not preserve leading space in a line.
3847 This is the main difference from fmt.
/*
 * Line folding filter.
 * char_size counts a character as two columns when c2 is non-zero and as
 * one column otherwise.  fold_conv keeps the current column in f_line and
 * a summary of the previous character in f_prev (bit 7 set marks a
 * Japanese character), and computes fold_state for every character; the
 * switch at the end of the function acts on fold_state.
 * Visible rules:
 *   - CR, LF and BS are handled first, with different treatment when
 *     fold_preserve_f is set;
 *   - form feed forces a new line;
 *   - ASCII space, tab and the X0208 pair (0x21, 0x21) are treated as
 *     spaces, and duplicate spaces are removed;
 *   - a character that still fits within fold_len is output normally;
 *   - beyond fold_len, a set of closing punctuation characters (kinsoku)
 *     may stay on the line until fold_len + fold_margin is exceeded, at
 *     which point the line is folded unconditionally.
 * NOTE(review): many lines of this function are missing from the listing,
 * so the nesting of the branches below should be confirmed.
 */
3850 #define char_size(c2,c1) (c2?2:1)
3852 void fold_conv(int c2, int c1)
3857 if (c1== '\r' && !fold_preserve_f) {
3858 fold_state=0; /* ignore cr */
3859 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
3861 fold_state=0; /* ignore cr */
3862 } else if (c1== BS) {
3863 if (f_line>0) f_line--;
3865 } else if (c2==EOF && f_line != 0) { /* close open last line */
3867 } else if ((c1=='\n' && !fold_preserve_f)
3868 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
3869 && fold_preserve_f)) {
3871 if (fold_preserve_f) {
3875 } else if ((f_prev == c1 && !fold_preserve_f)
3876 || (f_prev == '\n' && fold_preserve_f)
3877 ) { /* duplicate newline */
3880 fold_state = '\n'; /* output two newline */
3886 if (f_prev&0x80) { /* Japanese? */
3888 fold_state = 0; /* ignore given single newline */
3889 } else if (f_prev==' ') {
3893 if (++f_line<=fold_len)
3897 fold_state = '\r'; /* fold and output nothing */
3901 } else if (c1=='\f') {
3904 fold_state = '\n'; /* output newline and clear */
3905 } else if ( (c2==0 && c1==' ')||
3906 (c2==0 && c1=='\t')||
3907 (c2=='!'&& c1=='!')) {
3908 /* X0208 kankaku or ascii space */
3909 if (f_prev == ' ') {
3910 fold_state = 0; /* remove duplicate spaces */
3913 if (++f_line<=fold_len)
3914 fold_state = ' '; /* output ASCII space only */
3916 f_prev = ' '; f_line = 0;
3917 fold_state = '\r'; /* fold and output nothing */
3921 prev0 = f_prev; /* we still need this one... , but almost done */
3923 if (c2 || c2==X0201)
3924 f_prev |= 0x80; /* this is Japanese */
3925 f_line += char_size(c2,c1);
3926 if (f_line<=fold_len) { /* normal case */
3929 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3930 f_line = char_size(c2,c1);
3931 fold_state = '\n'; /* We can't wait, do fold now */
3932 } else if (c2==X0201) {
3933 /* simple kinsoku rules return 1 means no folding */
3934 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3935 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3936 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3937 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3938 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3939 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3940 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3942 fold_state = '\n';/* add one new f_line before this character */
3945 fold_state = '\n';/* add one new f_line before this character */
3948 /* kinsoku point in ASCII */
3949 if ( c1==')'|| /* { [ ( */
3960 /* just after special */
3961 } else if (!is_alnum(prev0)) {
3962 f_line = char_size(c2,c1);
3964 } else if ((prev0==' ') || /* ignored new f_line */
3965 (prev0=='\n')|| /* ignored new f_line */
3966 (prev0&0x80)) { /* X0208 - ASCII */
3967 f_line = char_size(c2,c1);
3968 fold_state = '\n';/* add one new f_line before this character */
3970 fold_state = 1; /* default no fold in ASCII */
3974 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3975 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3976 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3977 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3978 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3979 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3980 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3981 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3982 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3983 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3984 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3985 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3986 /* default no fold in kinsoku */
3989 f_line = char_size(c2,c1);
3990 /* add one new f_line before this character */
3993 f_line = char_size(c2,c1);
3995 /* add one new f_line before this character */
4000 /* terminator process */
4001 switch(fold_state) {
/*
 * Width and alphabet conversion filter in front of o_zconv.
 *
 * c2, c1: the character to filter (c2 is the lead byte or a pseudo lead
 * byte such as X0201, c1 the trail byte).
 *
 * z_prev2 / z_prev1 hold one delayed X0201 kana: when x0201_f is set and
 * the kana has an entry in dv or ev, it is kept back until the next
 * character shows whether a dakuten (0xde & 0x7f) or handakuten
 * (0xdf & 0x7f) follows, and the combined form from dv or ev is emitted
 * instead of the plain form from cv.
 * When alpha_f is set, JIS X 0208 rows 0x23 (alphabet) and 0x21 (symbols)
 * are converted; with alpha_f & 0x4 the characters greater-than,
 * less-than, double quote and ampersand are written as HTML entities.
 *
 * Fix: the four entity strings had been reduced to the bare characters
 * (the double-quote case was not even a valid string literal).  They are
 * restored to the entity references the HTML-entity option stands for.
 *
 * NOTE(review): this listing omits a number of lines of the function;
 * only the lines shown here were touched.
 */
4020 int z_prev2=0,z_prev1=0;
4022 void z_conv(int c2, int c1)
4025 /* if (c2) c1 &= 0x7f; assertion */
4027 if (x0201_f && z_prev2==X0201) { /* X0201 */
4028 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark) */
4030 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4032 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) */
4034 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4038 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4047 if (x0201_f && c2==X0201) {
4048 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4049 /* wait for dakuten or handakuten */
4050 z_prev1 = c1; z_prev2 = c2;
4053 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4058 /* JISX0208 Alphabet */
4059 if (alpha_f && c2 == 0x23 ) {
4061 } else if (alpha_f && c2 == 0x21 ) {
4062 /* JISX0208 Kigou */
4067 } else if (alpha_f&0x4) {
4072 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4078 case '>': entity = "&gt;"; break;
4079 case '<': entity = "&lt;"; break;
4080 case '\"': entity = "&quot;"; break;
4081 case '&': entity = "&amp;"; break;
4084 while (*entity) (*o_zconv)(0, *entity++);
/*
 * Character rotation macros used by rot_conv.
 * rot13 adds 13 to letters up to M or m and subtracts 13 from letters up
 * to Z or z; rot47 adds 47 to characters up to O and subtracts 47 from
 * characters up to the tilde.
 * NOTE(review): the lower-bound tests and the fall-through value of both
 * macros are on lines that are missing from this listing.  The argument c
 * is expanded several times, so it must be free of side effects.
 */
4094 #define rot13(c) ( \
4096 (c <= 'M') ? (c + 13): \
4097 (c <= 'Z') ? (c - 13): \
4099 (c <= 'm') ? (c + 13): \
4100 (c <= 'z') ? (c - 13): \
4104 #define rot47(c) ( \
4106 ( c <= 'O' ) ? (c + 47) : \
4107 ( c <= '~' ) ? (c - 47) : \
/*
 * Rotation filter in front of o_rot_conv.  Characters whose c2 is 0,
 * X0201 or ISO8859_1 take the first branch; the result is forwarded to
 * o_rot_conv.
 * NOTE(review): the statements that apply rot13 / rot47 are not part of
 * this listing.
 */
4111 void rot_conv(int c2, int c1)
4113 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4119 (*o_rot_conv)(c2,c1);
/*
 * Kana conversion filter in front of o_hira_conv.
 * Bit 0 of hira_f selects characters in row 0x25 and bit 1 selects
 * characters in row 0x24, in both cases only for 0x20 < c1 < 0x74; the
 * resulting pair is forwarded to o_hira_conv.
 * NOTE(review): the assignments that change the row are not part of this
 * listing (presumably the rows are swapped - TODO confirm).
 */
4122 void hira_conv(int c2, int c1)
4124 if ((hira_f & 1) && c2==0x25 && 0x20<c1 && c1<0x74) {
4126 } else if ((hira_f & 2) && c2==0x24 && 0x20<c1 && c1<0x74) {
4129 (*o_hira_conv)(c2,c1);
/*
 * Checking filter in front of o_iso2022jp_check_conv.
 * The visible code tests three things: c2 in 0x00..0x20 with c1 in
 * 0x7f..0xff, c2 in rows 0x29..0x2f or 0x75..0x7e, and whether a combined
 * value c falls inside one of the RANGE_NUM_MAX [start, end] pairs of the
 * local table range.  The pair is then forwarded to the next stage.
 * NOTE(review): what is substituted for a matching character, and how c
 * is built, is not part of this listing.
 */
4133 void iso2022jp_check_conv(int c2, int c1)
4135 static const int range[RANGE_NUM_MAX][2] = {
4158 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4162 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4167 for (i = 0; i < RANGE_NUM_MAX; i++) {
4168 start = range[i][0];
4171 if (c >= start && c <= end) {
4176 (*o_iso2022jp_check_conv)(c2,c1);
/* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * Table of recognised MIME encoded-word prefixes.  Every entry starts
 * with octal 075 (the equals sign) followed by ?charset?encoding?.
 * The entries are positionally parallel to mime_priority_func,
 * mime_encode and mime_encode_method, so the order must be kept in sync
 * across all four tables.  mime_begin_strict walks the table until it
 * reaches a null entry.
 * NOTE(review): the terminating entry is not part of this listing.
 */
4182 const unsigned char *mime_pattern[] = {
4183 (const unsigned char *)"\075?EUC-JP?B?",
4184 (const unsigned char *)"\075?SHIFT_JIS?B?",
4185 (const unsigned char *)"\075?ISO-8859-1?Q?",
4186 (const unsigned char *)"\075?ISO-8859-1?B?",
4187 (const unsigned char *)"\075?ISO-2022-JP?B?",
4188 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4189 #if defined(UTF8_INPUT_ENABLE)
4190 (const unsigned char *)"\075?UTF-8?B?",
4191 (const unsigned char *)"\075?UTF-8?Q?",
4193 (const unsigned char *)"\075?US-ASCII?Q?",
/* Markers used to raise the priority of the matching input code. */
/*
 * One input converter per mime_pattern entry (0 where none applies).
 * mime_begin_strict installs the entry for the matched pattern with
 * set_iconv.
 */
4199 int (*mime_priority_func[])(int c2, int c1, int c0) = {
4200 e_iconv, s_iconv, 0, 0, 0, 0,
4201 #if defined(UTF8_INPUT_ENABLE)
/*
 * Output mode associated with each mime_pattern entry.  open_mime scans
 * this table until it finds a zero entry, comparing each element with the
 * requested mode to pick the pattern to emit.
 */
4207 const int mime_encode[] = {
4208 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4209 #if defined(UTF8_INPUT_ENABLE)
/*
 * Transfer encoding letter (B or Q) for each mime_pattern entry; the
 * letters match the encoding field of the pattern at the same index.
 * open_mime copies the selected entry into mimeout_mode.
 */
4216 const int mime_encode_method[] = {
4217 'B', 'B','Q', 'B', 'B', 'Q',
4218 #if defined(UTF8_INPUT_ENABLE)
/*
 * Size of the recovery buffer in mime_begin_strict and upper bound on the
 * number of preamble characters examined by mime_begin.
 */
4226 #define MAXRECOVER 20
/*
 * Installs the MIME decoding reader.  Unless mime_getc is already active,
 * the current i_getc / i_ungetc are saved in i_mgetc / i_mungetc and
 * replaced by mime_getc / mime_ungetc.  In STRICT_MIME mode a second
 * layer is inserted: the saved readers move to i_mgetc_buf /
 * i_mungetc_buf and i_mgetc / i_mungetc become the buffered variants.
 */
4228 void switch_mime_getc(void)
4230 if (i_getc!=mime_getc) {
4231 i_mgetc = i_getc; i_getc = mime_getc;
4232 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4233 if(mime_f==STRICT_MIME) {
4234 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4235 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * Undoes switch_mime_getc.  In STRICT_MIME mode the buffered layer is
 * removed first, then i_ungetc is restored from i_mungetc.  If an input
 * converter was saved in mime_iconv_back it is reinstalled with set_iconv
 * and the saved pointer is cleared.
 * NOTE(review): the restore of i_getc is on a line that is missing from
 * this listing.
 */
4240 void unswitch_mime_getc(void)
4242 if(mime_f==STRICT_MIME) {
4243 i_mgetc = i_mgetc_buf;
4244 i_mungetc = i_mungetc_buf;
4247 i_ungetc = i_mungetc;
4248 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4249 mime_iconv_back = NULL;
/*
 * Strict recognition of a MIME encoded-word preamble; called after the
 * leading equals sign and question mark have been consumed.
 * Characters are read with i_getc, remembered in the recovery buffer r
 * and compared, after upper-casing, with the current candidate from
 * mime_pattern.  On a mismatch the following patterns are searched for
 * one that shares the prefix read so far and matches the new character.
 * When every pattern fails the characters are replayed from r.
 * On success mime_decode_mode is taken from the pattern (the encoding
 * letter two positions before the end), the current input converter is
 * saved in mime_iconv_back and replaced by the matching
 * mime_priority_func entry, and for B encoding the function hands over to
 * mime_integrity with the matched pattern.
 * NOTE(review): the replay code and the remaining return paths are not
 * part of this listing.
 */
4252 int mime_begin_strict(FILE *f)
4256 const unsigned char *p,*q;
4257 int r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4259 mime_decode_mode = FALSE;
4260 /* =? has been checked */
4262 p = mime_pattern[j];
4265 for(i=2;p[i]>' ';i++) { /* start at =? */
4266 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4267 /* pattern fails, try next one */
4269 while (mime_pattern[++j]) {
4270 p = mime_pattern[j];
4271 for(k=2;k<i;k++) /* assume length(p) > i */
4272 if (p[k]!=q[k]) break;
4273 if (k==i && nkf_toupper(c1)==p[k]) break;
4275 p = mime_pattern[j];
4276 if (p) continue; /* found next one, continue */
4277 /* all fails, output from recovery buffer */
4285 mime_decode_mode = p[i-2];
4287 mime_iconv_back = iconv;
4288 set_iconv(FALSE, mime_priority_func[j]);
4289 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4291 if (mime_decode_mode=='B') {
4292 mimebuf_f = unbuf_f;
4294 /* do MIME integrity check */
4295 return mime_integrity(f,mime_pattern[j]);
/*
 * Reader used as i_mgetc in STRICT_MIME mode.  When mimebuf_f is set the
 * character comes from the saved underlying reader i_mgetc_buf; otherwise
 * it is taken from the Fifo at mime_input, which is then advanced.
 */
4303 int mime_getc_buf(FILE *f)
4305 /* we don't keep eof of Fifo, because it contains ?= as
4306 a terminator. It was checked in mime_integrity. */
4307 return ((mimebuf_f)?
4308 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * Counterpart of mime_getc_buf: pushes c back either to the underlying
 * reader through i_mungetc_buf or into the Fifo by stepping mime_input
 * back one slot.
 * NOTE(review): the condition that selects between the two and the
 * return statement are not part of this listing (presumably mimebuf_f,
 * mirroring mime_getc_buf - TODO confirm).
 */
4311 int mime_ungetc_buf(int c, FILE *f)
4314 (*i_mungetc_buf)(c,f);
4316 Fifo(--mime_input) = (unsigned char)c;
/*
 * Non-strict recognition of a MIME encoded-word preamble.
 * Every character read is also appended to the Fifo so that the input
 * can be replayed if the preamble turns out to be false.  Up to
 * MAXRECOVER characters are accepted for the charset part (alphanumerics,
 * hyphen, underscore, space, CR and LF), then the encoding letter is
 * examined: b or B selects mode B, q or Q selects mode Q.
 * If no valid encoding is found, mime_decode_mode is set to 1, which
 * means no decoding while the buffered characters are re-read.
 * Returns the last character read; per the comment on the return
 * statement the value is only used to detect EOF.
 * NOTE(review): several lines of the function are missing from this
 * listing.
 */
4320 int mime_begin(FILE *f)
4325 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4326 /* re-read and convert again from mime_buffer. */
4328 /* =? has been checked */
4330 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4331 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4332 /* We accept any character type even if it is broken up by new lines */
4333 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4334 if (c1=='\n'||c1==' '||c1=='\r'||
4335 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4337 /* Failed. But this could be another MIME preamble */
4345 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4346 if (!(++i<MAXRECOVER) || c1==EOF) break;
4347 if (c1=='b'||c1=='B') {
4348 mime_decode_mode = 'B';
4349 } else if (c1=='q'||c1=='Q') {
4350 mime_decode_mode = 'Q';
4354 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4355 if (!(++i<MAXRECOVER) || c1==EOF) break;
4357 mime_decode_mode = FALSE;
4363 if (!mime_decode_mode) {
4364 /* false MIME preamble, restart from mime_buffer */
4365 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4366 /* Since we are in MIME mode until buffer becomes empty, */
4367 /* we never go into mime_begin again for a while. */
4370 /* discard mime preamble, and goto MIME mode */
4372 /* do no MIME integrity check */
4373 return c1; /* used only for checking EOF */
/*
 * Writes str followed by a newline to stderr.
 * NOTE(review): any condition guarding the output is on a line that is
 * missing from this listing.
 */
4382 void debug(const char *str)
4385 fprintf(stderr, "%s\n", str);
/*
 * Records codename as the detected input encoding.
 * If a different, non-empty name is given than the one already stored in
 * input_codename, is_inputcode_mixed is set.  The pointer itself is
 * stored (no copy is made) and is_inputcode_set is raised.
 * NOTE(review): the first part of the condition is on a line that is
 * missing from this listing.
 */
4390 void set_input_codename(char *codename)
4394 strcmp(codename, "") != 0 &&
4395 strcmp(codename, input_codename) != 0)
4397 is_inputcode_mixed = TRUE;
4399 input_codename = codename;
4400 is_inputcode_set = TRUE;
4403 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Prints the guessed input encoding on stdout, preceded by the file name
 * and a colon when filename is not NULL.
 * The printed name defaults to BINARY; when the input was not detected as
 * mixed, input_codename is used, with separate handling of the empty
 * name.
 * NOTE(review): the name printed for an empty input_codename is on a line
 * that is missing from this listing.
 */
4404 void print_guessed_code(char *filename)
4406 char *codename = "BINARY";
4407 if (!is_inputcode_mixed) {
4408 if (strcmp(input_codename, "") == 0) {
4411 codename = input_codename;
4414 if (filename != NULL) printf("%s:", filename);
4415 printf("%s\n", codename);
/*
 * Shared helper for cap_getc and url_getc: decodes a two-digit hex escape
 * into one byte.
 *   ch - the escape introducer (callers pass a colon or a percent sign)
 *   f  - input stream
 *   g  - reader used to fetch characters
 *   u  - matching unget function
 * c2 and c3 must both be hex digits; the decoded value is
 * (hex2bin(c2) << 4) | hex2bin(c3).
 * NOTE(review): the reads and the handling of non-hex characters
 * (presumably pushed back through u) are not part of this listing.
 */
4421 int hex_getc(int ch, FILE *f, int (*g)(FILE *f), int (*u)(int c, FILE *f))
4429 if (!nkf_isxdigit(c2)){
4434 if (!nkf_isxdigit(c3)){
4439 return (hex2bin(c2) << 4) | hex2bin(c3);
/*
 * Reader that decodes colon-introduced hex escapes by delegating to
 * hex_getc with the saved reader pair i_cgetc / i_cungetc.
 */
4442 int cap_getc(FILE *f)
4444 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Unget counterpart of cap_getc: forwards c to the saved i_cungetc. */
4447 int cap_ungetc(int c, FILE *f)
4449 return (*i_cungetc)(c, f);
/*
 * Reader that decodes percent-introduced hex escapes by delegating to
 * hex_getc with the saved reader pair i_ugetc / i_uungetc.
 */
4452 int url_getc(FILE *f)
4454 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Unget counterpart of url_getc: forwards c to the saved i_uungetc. */
4457 int url_ungetc(int c, FILE *f)
4459 return (*i_uungetc)(c, f);
4463 #ifdef NUMCHAR_OPTION
/*
 * Reader that recognises numeric character references and returns them
 * as a single value tagged with CLASS_UTF16.
 * After an x or X, up to five hex digits are accumulated; otherwise up to
 * six decimal digits are accumulated.  Characters are collected in buf
 * through the saved reader i_ngetc (g); i_nungetc (u) is the matching
 * unget function.
 * NOTE(review): the recognition of the introducer and terminator, the
 * shifts and multiplications of the accumulator, and the replay of buf on
 * a failed match are not part of this listing.
 */
4464 int numchar_getc(FILE *f)
4466 int (*g)(FILE *) = i_ngetc;
4467 int (*u)(int c ,FILE *f) = i_nungetc;
4478 if (buf[i] == 'x' || buf[i] == 'X'){
4479 for (j = 0; j < 5; j++){
4481 if (!nkf_isxdigit(buf[i])){
4488 c |= hex2bin(buf[i]);
4491 for (j = 0; j < 6; j++){
4495 if (!nkf_isdigit(buf[i])){
4502 c += hex2bin(buf[i]);
4508 return CLASS_UTF16 | c;
/* Unget counterpart of numchar_getc: forwards c to the saved i_nungetc. */
4517 int numchar_ungetc(int c, FILE *f)
4519 return (*i_nungetc)(c, f);
4523 #ifdef UNICODE_NORMALIZATION
/* Normalization Form C */
/*
 * Reader that composes decomposed byte sequences.  The bytes collected in
 * buf are looked up in normalization_table with a binary search over
 * NORMALIZATION_TABLE_LENGTH entries, comparing against the nfd field of
 * each entry; on a match the bytes of the nfc field are taken instead.
 * The search is repeated while k is positive and buf[i] is not a
 * continuation byte (top two bits 10).
 * NOTE(review): the reads into buf, the handling of a match and the
 * return value are on lines that are missing from this listing.
 */
4526 int nfc_getc(FILE *f)
4528 int (*g)(FILE *f) = i_nfc_getc;
4529 int (*u)(int c ,FILE *f) = i_nfc_ungetc;
4530 int i=0, j, k=1, lower, upper;
4533 extern const struct normalization_pair normalization_table[];
4536 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4537 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4538 while (upper >= lower) {
4539 j = (lower+upper) / 2;
4540 array = normalization_table[j].nfd;
4541 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4542 if (array[k] != buf[k]){
4543 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4550 array = normalization_table[j].nfc;
4551 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
/* Unget counterpart of nfc_getc: forwards c to the saved i_nfc_ungetc. */
4563 int nfc_ungetc(int c, FILE *f)
4565 return (*i_nfc_ungetc)(c, f);
4567 #endif /* UNICODE_NORMALIZATION */
/*
 * Body of the MIME decoding reader.
 * NOTE(review): the signature line is missing from this listing; the
 * code is presumably mime_getc(FILE *f), the function installed as
 * i_getc by switch_mime_getc - confirm against the full source.
 *
 * Visible behaviour:
 *   - pending bytes in the Fifo (mime_top != mime_last) are returned
 *     first;
 *   - when mime_decode_mode is 1 or FALSE the reader uninstalls itself
 *     with unswitch_mime_getc and reads from the restored i_getc;
 *   - Q mode: an underscore becomes a space (except in FIXED_MIME), an
 *     equals sign followed by two hex digits is decoded with the local
 *     hex macro, an equals sign followed by a control character is a soft
 *     line break, and a question mark followed by an equals sign ends the
 *     encoded word;
 *   - B mode: four characters c1..c4 are read (whitespace is skipped when
 *     not in STRICT_MIME), each is mapped with base64decode, and the
 *     resulting bytes are queued in the Fifo;
 *   - after the end of an encoded word, following white space is
 *     collected in a heap buffer lwsp_buf, which grows with realloc; the
 *     collected characters are pushed back with i_ungetc unless the next
 *     character is an equals sign directly preceded by a space or tab.
 * Many lines are missing from this listing, including the code that
 * releases lwsp_buf, so memory handling cannot be reviewed from here.
 */
4573 int c1, c2, c3, c4, cc;
4574 int t1, t2, t3, t4, mode, exit_mode;
4578 int lwsp_size = 128;
4580 if (mime_top != mime_last) { /* Something is in FIFO */
4581 return Fifo(mime_top++);
4583 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4584 mime_decode_mode=FALSE;
4585 unswitch_mime_getc();
4586 return (*i_getc)(f);
4589 if (mimebuf_f == FIXED_MIME)
4590 exit_mode = mime_decode_mode;
4593 if (mime_decode_mode == 'Q') {
4594 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4596 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
4597 if (c1<=' ' || DEL<=c1) {
4598 mime_decode_mode = exit_mode; /* prepare for quit */
4601 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4605 mime_decode_mode = exit_mode; /* prepare for quit */
4606 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4607 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4608 /* end Q encoding */
4609 input_mode = exit_mode;
4611 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4612 if (lwsp_buf==NULL) {
4613 perror("can't malloc");
4616 while ((c1=(*i_getc)(f))!=EOF) {
4621 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4629 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
4630 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4645 lwsp_buf[lwsp_count] = (unsigned char)c1;
4646 if (lwsp_count++>lwsp_size){
4648 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4649 if (lwsp_buf_new==NULL) {
4651 perror("can't realloc");
4654 lwsp_buf = lwsp_buf_new;
4660 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
4662 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4663 i_ungetc(lwsp_buf[lwsp_count],f);
4669 if (c1=='='&&c2<' ') { /* this is soft wrap */
4670 while((c1 = (*i_mgetc)(f)) <=' ') {
4671 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4673 mime_decode_mode = 'Q'; /* still in MIME */
4674 goto restart_mime_q;
4677 mime_decode_mode = 'Q'; /* still in MIME */
4681 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4682 if (c2<=' ') return c2;
4683 mime_decode_mode = 'Q'; /* still in MIME */
4684 #define hex(c) (('0'<=c&&c<='9')?(c-'0'):\
4685 ('A'<=c&&c<='F')?(c-'A'+10):('a'<=c&&c<='f')?(c-'a'+10):0)
4686 return ((hex(c2)<<4) + hex(c3));
4689 if (mime_decode_mode != 'B') {
4690 mime_decode_mode = FALSE;
4691 return (*i_mgetc)(f);
4695 /* Base64 encoding */
4697 MIME allows line break in the middle of
4698 Base64, but we are very pessimistic in decoding
4699 in unbuf mode because MIME encoded code may broken by
4700 less or editor's control sequence (such as ESC-[-K in unbuffered
4701 mode. ignore incomplete MIME.
4703 mode = mime_decode_mode;
4704 mime_decode_mode = exit_mode; /* prepare for quit */
4706 while ((c1 = (*i_mgetc)(f))<=' ') {
4711 if ((c2 = (*i_mgetc)(f))<=' ') {
4714 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4715 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4718 if ((c1 == '?') && (c2 == '=')) {
4721 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4722 if (lwsp_buf==NULL) {
4723 perror("can't malloc");
4726 while ((c1=(*i_getc)(f))!=EOF) {
4731 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4739 if ((c1=(*i_getc)(f))!=EOF) {
4743 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4758 lwsp_buf[lwsp_count] = (unsigned char)c1;
4759 if (lwsp_count++>lwsp_size){
4761 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4762 if (lwsp_buf_new==NULL) {
4764 perror("can't realloc");
4767 lwsp_buf = lwsp_buf_new;
4773 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
4775 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4776 i_ungetc(lwsp_buf[lwsp_count],f);
4783 if ((c3 = (*i_mgetc)(f))<=' ') {
4786 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4787 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4791 if ((c4 = (*i_mgetc)(f))<=' ') {
4794 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4795 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4799 mime_decode_mode = mode; /* still in MIME sigh... */
4801 /* BASE 64 decoding */
4803 t1 = 0x3f & base64decode(c1);
4804 t2 = 0x3f & base64decode(c2);
4805 t3 = 0x3f & base64decode(c3);
4806 t4 = 0x3f & base64decode(c4);
4807 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4809 Fifo(mime_last++) = (unsigned char)cc;
4810 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4812 Fifo(mime_last++) = (unsigned char)cc;
4813 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4815 Fifo(mime_last++) = (unsigned char)cc;
4820 return Fifo(mime_top++);
/*
 * Pushes c back into the Fifo by stepping mime_top back one slot and
 * storing the low byte of c there.  The stream argument is not used by
 * the visible code.
 * NOTE(review): the return statement is not part of this listing.
 */
4823 int mime_ungetc(int c, FILE *f)
4825 Fifo(--mime_top) = (unsigned char)c;
/*
 * Pre-reads an encoded word into the Fifo and checks that it is
 * well-formed before decoding starts.
 *   f - input stream
 *   p - the matched preamble from mime_pattern; it is copied into the
 *       Fifo first
 * Characters are read until the buffer is full (distance between
 * mime_input and mime_top wraps under MIME_BUF_MASK), until the
 * terminator (a question mark followed by an equals sign) is seen, or
 * until a character outside the Base64 alphabet plus equals sign and
 * question mark is found.  In the failure case the data is left in the
 * Fifo undecoded: mime_last is set to mime_input, mime_decode_mode is set
 * to 1 and the buffered reader is installed with switch_mime_getc.
 * NOTE(review): the success path and the return values are not part of
 * this listing.
 */
4829 int mime_integrity(FILE *f, const unsigned char *p)
4833 /* In buffered mode, read until =? or NL or buffer full
4835 mime_input = mime_top;
4836 mime_last = mime_top;
4838 while(*p) Fifo(mime_input++) = *p++;
4841 while((c=(*i_getc)(f))!=EOF) {
4842 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
4843 break; /* buffer full */
4845 if (c=='=' && d=='?') {
4846 /* checked. skip header, start decode */
4847 Fifo(mime_input++) = (unsigned char)c;
4848 /* mime_last_input = mime_input; */
4853 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4855 /* Should we check length mod 4? */
4856 Fifo(mime_input++) = (unsigned char)c;
4859 /* In case of Incomplete MIME, no MIME decode */
4860 Fifo(mime_input++) = (unsigned char)c;
4861 mime_last = mime_input; /* point undecoded buffer */
4862 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
4863 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Maps one Base64 alphabet character to its 6-bit value: A-Z to 0-25,
 * a-z to 26-51, 0-9 to 52-61, plus to 62 and slash to 63.
 * The constants are written as character arithmetic; the trailing
 * comments on each line give the numeric meaning.
 * NOTE(review): the range tests for the letter branches and the return
 * statement are on lines that are missing from this listing.
 */
4867 int base64decode(int c)
4872 i = c - 'A'; /* A..Z 0-25 */
4874 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4876 } else if (c > '/') {
4877 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4878 } else if (c == '+') {
4879 i = '>' /* 62 */ ; /* + 62 */
4881 i = '?' /* 63 */ ; /* / 63 */
/*
 * State and helpers for MIME encoding on output.
 *   basis_64               - the Base64 alphabet, indexed by 6-bit value
 *   MIMEOUT_BUF_LENGTH     - capacity of the pending-output buffer
 *   mimeout_buf            - characters held back before they are written
 *                            (one spare slot beyond the capacity)
 *   mimeout_buf_count      - number of characters currently held
 *   mimeout_preserve_space - flag consulted and cleared by open_mime
 *   itoh4                  - maps a value 0..15 to an uppercase hex digit;
 *                            the argument is expanded more than once
 */
4886 static const char basis_64[] =
4887 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4890 #define MIMEOUT_BUF_LENGTH (60)
4891 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
4892 int mimeout_buf_count = 0;
4893 int mimeout_preserve_space = 0;
4894 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/*
 * Starts a MIME encoded word for the given output mode.
 * The preamble is chosen by scanning mime_encode for mode (falling back
 * to the first pattern), and mimeout_mode is set from
 * mime_encode_method.  Depending on base64_count and on
 * mimeout_preserve_space, leading white space held in mimeout_buf is
 * written out unencoded before the preamble; the remaining buffered
 * characters are then re-submitted through mime_putc after the buffer
 * count has been reset.
 * NOTE(review): the writes of the preamble and of the line break are on
 * lines that are missing from this listing.
 */
4896 void open_mime(int mode)
4898 const unsigned char *p;
4901 p = mime_pattern[0];
4902 for(i=0;mime_encode[i];i++) {
4903 if (mode == mime_encode[i]) {
4904 p = mime_pattern[i];
4908 mimeout_mode = mime_encode_method[i];
4911 if (base64_count>45) {
4912 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
4913 (*o_mputc)(mimeout_buf[i]);
4919 if (!mimeout_preserve_space && mimeout_buf_count>0
4920 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4921 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
4925 if (!mimeout_preserve_space) {
4926 for (;i<mimeout_buf_count;i++) {
4927 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4928 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
4929 (*o_mputc)(mimeout_buf[i]);
4936 mimeout_preserve_space = FALSE;
4942 j = mimeout_buf_count;
4943 mimeout_buf_count = 0;
4945 mime_putc(mimeout_buf[i]);
/*
 * Finishes the current MIME encoded word.  Depending on mimeout_mode the
 * bits still pending in b64c are flushed as a final Base64 character
 * (two bits shifted left by four, or four bits shifted left by two).
 * The tail of the function behaves differently for FIXED_MIME and for Q
 * mode.
 * NOTE(review): the padding characters, the terminator and the mode
 * reset are on lines that are missing from this listing.
 */
4949 void close_mime(void)
4959 switch(mimeout_mode) {
4964 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
4970 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
4976 if (mimeout_f!=FIXED_MIME) {
4978 } else if (mimeout_mode != 'Q')
/*
 * Encodes one byte c according to mimeout_mode and writes the result
 * through o_mputc.
 * In the quoted branch a non-alphanumeric byte is written as two
 * uppercase hex digits produced by itoh4.  In the Base64 branches the
 * byte is split over 6-bit groups: the first state writes the top six
 * bits, the second combines two saved bits from b64c with the top four
 * bits of c, and the third combines four saved bits with the top two bits
 * and then writes the low six bits.
 * NOTE(review): the case labels, the escape character of the quoted
 * branch and the updates of b64c and of the state are on lines that are
 * missing from this listing.
 */
4983 void mimeout_addchar(int c)
4985 switch(mimeout_mode) {
4990 } else if(!nkf_isalnum(c)) {
4992 (*o_mputc)(itoh4(((c>>4)&0xf)));
4993 (*o_mputc)(itoh4((c&0xf)));
5002 (*o_mputc)(basis_64[c>>2]);
5007 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5013 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5014 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * Called by base64_conv before each character is forwarded.
 * When the current output column plus the encoded size of the pending
 * buffer (base64_count + mimeout_buf_count/3*4) exceeds 66, the encoded
 * word is closed by sending EOF and a line break followed by a space is
 * sent, all through o_base64conv.  When a two-byte character follows a
 * one-byte character that was not white space, a space is sent first.
 * mime_lastchar2 / mime_lastchar1 remember the previous character for
 * that test and are updated at the end.
 * NOTE(review): the conditions enclosing these statements are on lines
 * that are missing from this listing.
 */
5025 int mime_lastchar2, mime_lastchar1;
5027 void mime_prechar(int c2, int c1)
5031 if (base64_count + mimeout_buf_count/3*4> 66){
5032 (*o_base64conv)(EOF,0);
5033 (*o_base64conv)(0,NL);
5034 (*o_base64conv)(0,SPACE);
5036 }/*else if (mime_lastchar2){
5037 if (c1 <=DEL && !nkf_isspace(c1)){
5038 (*o_base64conv)(0,SPACE);
5042 if (c2 && mime_lastchar2 == 0
5043 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5044 (*o_base64conv)(0,SPACE);
5047 mime_lastchar2 = c2;
5048 mime_lastchar1 = c1;
/*
 * Output stage that wraps characters into MIME encoded words.
 * c is the next output byte, or EOF to flush.
 *
 * Structure as far as this listing shows:
 *   - FIXED_MIME: lines are limited by base64_count (threshold 71), with
 *     separate handling for Q mode and for EOF;
 *   - EOF otherwise: the characters held in mimeout_buf are flushed,
 *     either encoded through mimeout_addchar or raw through o_mputc;
 *   - Q mode has its own branch for plain ASCII / ISO8859_1 output;
 *   - not inside an encoded word (mimeout_mode is zero): plain characters
 *     are held in mimeout_buf and written out at white space or line
 *     breaks; any other character opens an encoded word with open_mime;
 *   - inside an encoded word: plain characters are buffered so that the
 *     word can be closed at a sensible place, and a full buffer
 *     (more than MIMEOUT_BUF_LENGTH characters) is flushed.
 * lastchar is the most recently buffered character, if any.
 * NOTE(review): a large part of this function is missing from the
 * listing, including most calls to close_mime and several conditions, so
 * the nesting of the branches below should be confirmed.
 */
5051 void mime_putc(int c)
5056 if (mimeout_f == FIXED_MIME){
5057 if (mimeout_mode == 'Q'){
5058 if (base64_count > 71){
5059 if (c!=CR && c!=NL) {
5066 if (base64_count > 71){
5071 if (c == EOF) { /* c==EOF */
5075 if (c != EOF) { /* c==EOF */
5081 /* mimeout_f != FIXED_MIME */
5083 if (c == EOF) { /* c==EOF */
5084 j = mimeout_buf_count;
5085 mimeout_buf_count = 0;
5088 /*if (nkf_isspace(mimeout_buf[i])){
5091 mimeout_addchar(mimeout_buf[i]);
5095 (*o_mputc)(mimeout_buf[i]);
5101 if (mimeout_mode=='Q') {
5102 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5114 if (mimeout_buf_count > 0){
5115 lastchar = mimeout_buf[mimeout_buf_count - 1];
5120 if (!mimeout_mode) {
5121 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5122 if (nkf_isspace(c)) {
5123 if (c==CR || c==NL) {
5126 for (i=0;i<mimeout_buf_count;i++) {
5127 (*o_mputc)(mimeout_buf[i]);
5128 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5134 mimeout_buf[0] = (char)c;
5135 mimeout_buf_count = 1;
5137 if (base64_count > 1
5138 && base64_count + mimeout_buf_count > 76){
5141 if (!nkf_isspace(mimeout_buf[0])){
5146 mimeout_buf[mimeout_buf_count++] = (char)c;
5147 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5148 open_mime(output_mode);
5153 if (lastchar==CR || lastchar == NL){
5154 for (i=0;i<mimeout_buf_count;i++) {
5155 (*o_mputc)(mimeout_buf[i]);
5158 mimeout_buf_count = 0;
5160 if (lastchar==SPACE) {
5161 for (i=0;i<mimeout_buf_count-1;i++) {
5162 (*o_mputc)(mimeout_buf[i]);
5165 mimeout_buf[0] = SPACE;
5166 mimeout_buf_count = 1;
5168 open_mime(output_mode);
5171 /* mimeout_mode == 'B', 1, 2 */
5172 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5173 if (lastchar == CR || lastchar == NL){
5174 if (nkf_isblank(c)) {
5175 for (i=0;i<mimeout_buf_count;i++) {
5176 mimeout_addchar(mimeout_buf[i]);
5178 mimeout_buf_count = 0;
5179 } else if (SPACE<c && c<DEL) {
5181 for (i=0;i<mimeout_buf_count;i++) {
5182 (*o_mputc)(mimeout_buf[i]);
5185 mimeout_buf_count = 0;
5188 if (c==SPACE || c==TAB || c==CR || c==NL) {
5189 for (i=0;i<mimeout_buf_count;i++) {
5190 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5192 for (i=0;i<mimeout_buf_count;i++) {
5193 (*o_mputc)(mimeout_buf[i]);
5196 mimeout_buf_count = 0;
5199 mimeout_buf[mimeout_buf_count++] = (char)c;
5200 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5202 for (i=0;i<mimeout_buf_count;i++) {
5203 (*o_mputc)(mimeout_buf[i]);
5206 mimeout_buf_count = 0;
5210 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5211 mimeout_buf[mimeout_buf_count++] = (char)c;
5212 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5213 j = mimeout_buf_count;
5214 mimeout_buf_count = 0;
5216 mimeout_addchar(mimeout_buf[i]);
5223 if (mimeout_buf_count>0) {
5224 j = mimeout_buf_count;
5225 mimeout_buf_count = 0;
5227 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5229 mimeout_addchar(mimeout_buf[i]);
5235 (*o_mputc)(mimeout_buf[i]);
5237 open_mime(output_mode);
/*
 * State reset for embedded builds (only compiled when PERL_XS or
 * WIN32DLL is defined).
 * NOTE(review): the signature line is missing from this listing; this is
 * presumably the routine that restores every option flag, table and
 * function pointer to its start-up default between conversions - confirm
 * the name and the complete list of assignments against the full source.
 * Visible defaults include: strict MIME recognition without decoding,
 * ms_ucs_map_f set to UCS_MAP_ASCII, unicode_subchar set to a question
 * mark, an all-zero prefix_table, an empty mimeout_buf, every filter
 * stage pointing at no_connection, and the std_getc / std_ungetc readers.
 */
5244 #if defined(PERL_XS) || defined(WIN32DLL)
5248 struct input_code *p = input_code_list;
5261 mime_f = STRICT_MIME;
5262 mime_decode_f = FALSE;
5267 #if defined(MSDOS) || defined(__OS2__)
5272 iso2022jp_f = FALSE;
5273 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5274 ms_ucs_map_f = UCS_MAP_ASCII;
5276 #ifdef UTF8_INPUT_ENABLE
5277 no_cp932ext_f = FALSE;
5278 ignore_zwnbsp_f = TRUE;
5279 no_best_fit_chars_f = FALSE;
5280 encode_fallback = NULL;
5281 unicode_subchar = '?';
5283 #ifdef UTF8_OUTPUT_ENABLE
5287 #ifdef UNICODE_NORMALIZATION
5300 is_inputcode_mixed = FALSE;
5301 is_inputcode_set = FALSE;
5305 #ifdef SHIFTJIS_CP932
5315 for (i = 0; i < 256; i++){
5316 prefix_table[i] = 0;
5319 #ifdef UTF8_INPUT_ENABLE
5320 utf16_mode = UTF16BE_INPUT;
5322 mimeout_buf_count = 0;
5327 fold_preserve_f = FALSE;
5330 kanji_intro = DEFAULT_J;
5331 ascii_intro = DEFAULT_R;
5332 fold_margin = FOLD_MARGIN;
5333 output_conv = DEFAULT_CONV;
5334 oconv = DEFAULT_CONV;
5335 o_zconv = no_connection;
5336 o_fconv = no_connection;
5337 o_crconv = no_connection;
5338 o_rot_conv = no_connection;
5339 o_hira_conv = no_connection;
5340 o_base64conv = no_connection;
5341 o_iso2022jp_check_conv = no_connection;
5344 i_ungetc = std_ungetc;
5346 i_bungetc = std_ungetc;
5349 i_mungetc = std_ungetc;
5350 i_mgetc_buf = std_getc;
5351 i_mungetc_buf = std_ungetc;
5352 output_mode = ASCII;
5355 mime_decode_mode = FALSE;
5361 z_prev2=0,z_prev1=0;
5363 iconv_for_check = 0;
5365 input_codename = "";
/*
 * Placeholder for an unconnected two-argument filter stage: forwards to
 * no_connection2 with a zero third argument.
 */
5372 void no_connection(int c2, int c1)
5374 no_connection2(c2,c1,0);
/*
 * Placeholder for an unconnected three-argument stage: reports an
 * internal connection failure on stderr.  The return of 0 exists only to
 * satisfy lint.
 * NOTE(review): the statement between the message and the return is
 * missing from this listing (presumably the program exits there - TODO
 * confirm).
 */
5377 int no_connection2(int c2, int c1, int c0)
5379 fprintf(stderr,"nkf internal module connection failure.\n");
5381 return 0; /* LINT */
5386 #define fprintf dllprintf
5390 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5391 fprintf(stderr,"Flags:\n");
5392 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5393 #ifdef DEFAULT_CODE_SJIS
5394 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5396 #ifdef DEFAULT_CODE_JIS
5397 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5399 #ifdef DEFAULT_CODE_EUC
5400 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5402 #ifdef DEFAULT_CODE_UTF8
5403 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5405 #ifdef UTF8_OUTPUT_ENABLE
5406 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5408 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5409 #ifdef UTF8_INPUT_ENABLE
5410 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5412 fprintf(stderr,"t no conversion\n");
5413 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5414 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5415 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5416 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5417 fprintf(stderr,"v Show this usage. V: show version\n");
5418 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5419 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5420 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5421 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5422 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5423 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5424 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5425 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5427 fprintf(stderr,"T Text mode output\n");
5429 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5430 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5431 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5432 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5433 fprintf(stderr,"\n");
5434 fprintf(stderr,"Long name options\n");
5435 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5436 fprintf(stderr," Specify the input or output codeset\n");
5437 fprintf(stderr," --fj --unix --mac --windows\n");
5438 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5439 fprintf(stderr," Convert for the system or code\n");
5440 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5441 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5442 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5444 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5446 #ifdef NUMCHAR_OPTION
5447 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5449 #ifdef UTF8_INPUT_ENABLE
5450 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5451 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5454 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5455 fprintf(stderr," Overwrite original listed files by filtered result\n");
5456 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5458 fprintf(stderr," -g --guess Guess the input code\n");
5459 fprintf(stderr," --help --version Show this help/the version\n");
5460 fprintf(stderr," For more information, see also man nkf\n");
5461 fprintf(stderr,"\n");
/*
 * Version banner: the first fprintf formats "Network Kanji Filter Version
 * %s (%s) " with NKF_VERSION and NKF_RELEASE_DATE; the second prints the
 * CopyRight string on a line of its own. Both go through fprintf(stderr, ...).
 * The three MSDOS / __WIN16__ / __WIN32__ conditionals sit inside the first
 * call's argument list, between the format literal and the version arguments.
 * NOTE(review): the string pieces those conditionals select are not visible
 * in this listing; presumably they are platform tags concatenated onto the
 * format string. Confirm against the full file.
 */
5467 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5468 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__)
5471 #if defined(MSDOS) && defined(__WIN16__)
5474 #if defined(MSDOS) && defined(__WIN32__)
5480 ,NKF_VERSION,NKF_RELEASE_DATE);
5481 fprintf(stderr,"\n%s\n",CopyRight);
5486 ** パッチ制作者 (patch contributors)
5487 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5488 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5489 ** ohta@src.ricoh.co.jp (Junn Ohta)
5490 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5491 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5492 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5493 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5494 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5495 ** GHG00637@nifty-serve.or.jp (COW)