1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.98 2006/05/01 19:51:31 naruse Exp $ */
43 #define NKF_VERSION "2.0.7"
44 #define NKF_RELEASE_DATE "2006-04-22"
48 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
49 " 2002-2006 Kono, Furukawa, Naruse, mastodon"
56 ** USAGE: nkf [flags] [file]
59 ** b Output is buffered (DEFAULT)
60 ** u Output is unbuffered
64 ** j Output code is JIS 7 bit (DEFAULT SELECT)
65 ** s Output code is MS Kanji (DEFAULT SELECT)
66 ** e Output code is AT&T JIS (DEFAULT SELECT)
67 ** w Output code is UTF-8 (DEFAULT SELECT)
68 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
70 ** m MIME conversion for ISO-2022-JP
71 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
72 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
73 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
74 ** M MIME output conversion
76 ** r {de/en}crypt ROT13/47
80 ** T Text mode output (for MS-DOS)
82 ** x Do not convert X0201 kana into X0208
83 ** Z Convert X0208 alphabet to ASCII
88 ** B try to fix broken JIS, missing Escape
89 ** B[1-9] broken level
91 ** O Output to 'nkf.out' file or last file name
92 ** d Delete \r in line feed
93 ** c Add \r in line feed
94 ** -- other long option
95 ** -- ignore following option (don't use with -O )
99 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__)) && !defined(MSDOS)
101 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
117 #if defined(MSDOS) || defined(__OS2__)
124 #define setbinmode(fp) fsetbin(fp)
125 #else /* Microsoft C, Turbo C */
126 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
128 #else /* UNIX,OS/2 */
129 #define setbinmode(fp)
132 #ifdef _IOFBF /* SysV and MSDOS, Windows */
133 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
135 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
138 /*Borland C++ 4.5 EasyWin*/
139 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
148 /* added by satoru@isoternet.org */
149 #include <sys/stat.h>
150 #ifndef MSDOS /* UNIX, OS/2 */
153 #else /* defined(MSDOS) */
155 #ifdef __BORLANDC__ /* BCC32 */
157 #else /* !defined(__BORLANDC__) */
158 #include <sys/utime.h>
159 #endif /* (__BORLANDC__) */
160 #else /* !defined(__WIN32__) */
161 #if defined(_MSC_VER) || defined(__MINGW32__) /* VC++, MinGW */
162 #include <sys/utime.h>
163 #elif defined(__TURBOC__) /* BCC */
165 #elif defined(LSI_C) /* LSI C */
166 #endif /* (__WIN32__) */
174 /* state of output_mode and input_mode
191 #define X0213_1 0x284F
192 #define X0213_2 0x2850
194 /* Input Assumption */
198 #define LATIN1_INPUT 6
200 #define STRICT_MIME 8
205 #define JAPANESE_EUC 10
209 #define UTF8_INPUT 13
210 #define UTF16BE_INPUT 14
211 #define UTF16LE_INPUT 15
/*
 * ASCII-only character classification and conversion helpers.
 *
 * These are deliberately locale-independent replacements for <ctype.h>.
 * Every macro argument is parenthesized so that an expression argument
 * (e.g. `c & 0x7f`) is classified as a whole instead of being re-associated
 * by operator precedence inside the expansion.
 *
 * NOTE: each macro may evaluate its argument more than once; do not pass
 * expressions with side effects (e.g. `*p++`).
 */

/* Non-zero when c is an ASCII letter or decimal digit. */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
/* Maps 'a'..'z' to 'A'..'Z'; any other value is returned unchanged. */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
/* Non-zero when c is one of '0'..'7'. */
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
/* Non-zero when c is one of '0'..'9'. */
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
/* Non-zero when c is a hexadecimal digit in either case. */
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c) <= 'F'))
/* Non-zero when c equals SPACE or TAB (constants defined elsewhere in this file). */
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
/* Non-zero when c is blank, CR or NL (constants defined elsewhere in this file). */
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
/* Non-zero when c is an ASCII letter. */
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
/* Non-zero when c is an ASCII letter or decimal digit. */
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
/* Value (0..15) of the hexadecimal digit x; x is assumed to satisfy nkf_isxdigit. */
#define hex2bin(x) ( nkf_isdigit(x) ? (x) - '0' : nkf_toupper(x) - 'A' + 10)
245 #define HOLD_SIZE 1024
246 #define IOBUF_SIZE 16384
248 #define DEFAULT_J 'B'
249 #define DEFAULT_R 'B'
251 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
252 #define SJ6394 0x0161 /* 63 - 94 ku offset */
254 #define RANGE_NUM_MAX 18
259 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
260 #define sizeof_euc_to_utf8_1byte 94
261 #define sizeof_euc_to_utf8_2bytes 94
262 #define sizeof_utf8_to_euc_C2 64
263 #define sizeof_utf8_to_euc_E5B8 64
264 #define sizeof_utf8_to_euc_2bytes 112
265 #define sizeof_utf8_to_euc_3bytes 16
268 /* MIME preprocessor */
270 #ifdef EASYWIN /*Easy Win */
271 extern POINT _BufferSize;
280 void (*status_func)(struct input_code *, int);
281 int (*iconv_func)(int c2, int c1, int c0);
285 static char *input_codename = "";
288 static const char *CopyRight = COPY_RIGHT;
290 #if !defined(PERL_XS) && !defined(WIN32DLL)
291 static int noconvert(FILE *f);
293 static void module_connection(void);
294 static int kanji_convert(FILE *f);
295 static int h_conv(FILE *f,int c2,int c1);
296 static int push_hold_buf(int c2);
297 static void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0));
298 static int s_iconv(int c2,int c1,int c0);
299 static int s2e_conv(int c2, int c1, int *p2, int *p1);
300 static int e_iconv(int c2,int c1,int c0);
301 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
303 * 0: Shift_JIS, eucJP-ascii
307 #define UCS_MAP_ASCII 0
309 #define UCS_MAP_CP932 2
310 static int ms_ucs_map_f = UCS_MAP_ASCII;
312 #ifdef UTF8_INPUT_ENABLE
313 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
314 static int no_cp932ext_f = FALSE;
315 /* ignore ZERO WIDTH NO-BREAK SPACE */
316 static int ignore_zwnbsp_f = TRUE;
317 static int no_best_fit_chars_f = FALSE;
318 static int unicode_subchar = '?'; /* the regular substitution character */
319 static void nkf_each_char_to_hex(void (*f)(int c2,int c1), int c);
320 static void encode_fallback_html(int c);
321 static void encode_fallback_xml(int c);
322 static void encode_fallback_java(int c);
323 static void encode_fallback_perl(int c);
324 static void encode_fallback_subchar(int c);
325 static void (*encode_fallback)(int c) = NULL;
326 static int w2e_conv(int c2,int c1,int c0,int *p2,int *p1);
327 static int w_iconv(int c2,int c1,int c0);
328 static int w_iconv16(int c2,int c1,int c0);
329 static int unicode_to_jis_common(int c2,int c1,int c0,int *p2,int *p1);
330 static int w_iconv_common(int c1,int c0,const unsigned short *const *pp,int psize,int *p2,int *p1);
331 static void w16w_conv(int val, int *p2, int *p1, int *p0);
332 static int ww16_conv(int c2, int c1, int c0);
333 static int w16e_conv(int val,int *p2,int *p1);
335 #ifdef UTF8_OUTPUT_ENABLE
336 static int unicode_bom_f= 0; /* Output Unicode BOM */
337 static int w_oconv16_LE = 0; /* utf-16 little endian */
338 static int e2w_conv(int c2,int c1);
339 static void w_oconv(int c2,int c1);
340 static void w_oconv16(int c2,int c1);
342 static void e_oconv(int c2,int c1);
343 static int e2s_conv(int c2, int c1, int *p2, int *p1);
344 static void s_oconv(int c2,int c1);
345 static void j_oconv(int c2,int c1);
346 static void fold_conv(int c2,int c1);
347 static void cr_conv(int c2,int c1);
348 static void z_conv(int c2,int c1);
349 static void rot_conv(int c2,int c1);
350 static void hira_conv(int c2,int c1);
351 static void base64_conv(int c2,int c1);
352 static void iso2022jp_check_conv(int c2,int c1);
353 static void no_connection(int c2,int c1);
354 static int no_connection2(int c2,int c1,int c0);
356 static void code_score(struct input_code *ptr);
357 static void code_status(int c);
359 static void std_putc(int c);
360 static int std_getc(FILE *f);
361 static int std_ungetc(int c,FILE *f);
363 static int broken_getc(FILE *f);
364 static int broken_ungetc(int c,FILE *f);
366 static int mime_begin(FILE *f);
367 static int mime_getc(FILE *f);
368 static int mime_ungetc(int c,FILE *f);
370 static void switch_mime_getc(void);
371 static void unswitch_mime_getc(void);
372 static int mime_begin_strict(FILE *f);
373 static int mime_getc_buf(FILE *f);
374 static int mime_ungetc_buf(int c,FILE *f);
375 static int mime_integrity(FILE *f,const unsigned char *p);
377 static int base64decode(int c);
378 static void mime_prechar(int c2, int c1);
379 static void mime_putc(int c);
380 static void open_mime(int c);
381 static void close_mime(void);
382 static void eof_mime(void);
383 static void mimeout_addchar(int c);
385 static void usage(void);
386 static void version(void);
388 static void options(unsigned char *c);
389 #if defined(PERL_XS) || defined(WIN32DLL)
390 static void reinit(void);
395 #if !defined(PERL_XS) && !defined(WIN32DLL)
396 static unsigned char stdibuf[IOBUF_SIZE];
397 static unsigned char stdobuf[IOBUF_SIZE];
399 static unsigned char hold_buf[HOLD_SIZE*2];
400 static int hold_count;
402 /* MIME preprocessor fifo */
404 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
405 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
406 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
407 static unsigned char mime_buf[MIME_BUF_SIZE];
408 static unsigned int mime_top = 0;
409 static unsigned int mime_last = 0; /* decoded */
410 static unsigned int mime_input = 0; /* undecoded */
411 static int (*mime_iconv_back)(int c2,int c1,int c0) = NULL;
/* File-scope option flags; each starts FALSE unless its initializer says otherwise. */
414 static int unbuf_f = FALSE;
415 static int estab_f = FALSE;
416 static int nop_f = FALSE;
417 static int binmode_f = TRUE; /* binary mode */
418 static int rot_f = FALSE; /* ROT13/47 mode */
419 static int hira_f = FALSE; /* hiragana/katakana conversion */
420 static int input_f = FALSE; /* non fixed input code */
421 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
422 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
423 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
424 static int mimebuf_f = FALSE; /* MIME buffered input */
425 static int broken_f = FALSE; /* convert ESC-less broken JIS */
426 static int iso8859_f = FALSE; /* ISO8859 through */
427 static int mimeout_f = FALSE; /* base64 mode */
428 #if defined(MSDOS) || defined(__OS2__)
429 static int x0201_f = TRUE; /* Assume JISX0201 kana */
431 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
433 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
435 #ifdef UNICODE_NORMALIZATION
436 static int nfc_f = FALSE;
437 static int (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
438 static int (*i_nfc_ungetc)(int c ,FILE *f) = std_ungetc;
439 static int nfc_getc(FILE *f);
440 static int nfc_ungetc(int c,FILE *f);
444 static int cap_f = FALSE;
445 static int (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
446 static int (*i_cungetc)(int c ,FILE *f) = std_ungetc;
447 static int cap_getc(FILE *f);
448 static int cap_ungetc(int c,FILE *f);
450 static int url_f = FALSE;
451 static int (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
452 static int (*i_uungetc)(int c ,FILE *f) = std_ungetc;
453 static int url_getc(FILE *f);
454 static int url_ungetc(int c,FILE *f);
457 #ifdef NUMCHAR_OPTION
458 #define CLASS_MASK 0x0f000000
459 #define CLASS_UTF16 0x01000000
460 static int numchar_f = FALSE;
461 static int (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
462 static int (*i_nungetc)(int c ,FILE *f) = std_ungetc;
463 static int numchar_getc(FILE *f);
464 static int numchar_ungetc(int c,FILE *f);
468 static int noout_f = FALSE;
469 static void no_putc(int c);
470 static int debug_f = FALSE;
471 static void debug(const char *str);
472 static int (*iconv_for_check)(int c2,int c1,int c0) = 0;
475 static int guess_f = FALSE;
477 static void print_guessed_code(char *filename);
479 static void set_input_codename(char *codename);
480 static int is_inputcode_mixed = FALSE;
481 static int is_inputcode_set = FALSE;
484 static int exec_f = 0;
487 #ifdef SHIFTJIS_CP932
488 /* invert IBM extended characters to others */
489 static int cp51932_f = TRUE;
490 #define CP932_TABLE_BEGIN (0xfa)
491 #define CP932_TABLE_END (0xfc)
493 /* invert NEC-selected IBM extended characters to IBM extended characters */
494 static int cp932inv_f = TRUE;
495 #define CP932INV_TABLE_BEGIN (0xed)
496 #define CP932INV_TABLE_END (0xee)
498 /* static int cp932_conv(int c2, int c1); */
499 #endif /* SHIFTJIS_CP932 */
502 static int x0212_f = FALSE;
503 static int x0212_shift(int c);
504 static int x0212_unshift(int c);
506 static int x0213_f = FALSE;
508 static unsigned char prefix_table[256];
510 static void set_code_score(struct input_code *ptr, int score);
511 static void clr_code_score(struct input_code *ptr, int score);
512 static void status_disable(struct input_code *ptr);
513 static void status_push_ch(struct input_code *ptr, int c);
514 static void status_clear(struct input_code *ptr);
515 static void status_reset(struct input_code *ptr);
516 static void status_reinit(struct input_code *ptr);
517 static void status_check(struct input_code *ptr, int c);
518 static void e_status(struct input_code *, int);
519 static void s_status(struct input_code *, int);
521 #ifdef UTF8_INPUT_ENABLE
522 static void w_status(struct input_code *, int);
523 static void w16_status(struct input_code *, int);
524 static int utf16_mode = UTF16BE_INPUT;
527 struct input_code input_code_list[] = {
528 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
529 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
530 #ifdef UTF8_INPUT_ENABLE
531 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
532 {"UTF-16", 0, 0, 0, {0, 0, 0}, w16_status, w_iconv16, 0},
537 static int mimeout_mode = 0;
538 static int base64_count = 0;
540 /* X0208 -> ASCII converter */
543 static int f_line = 0; /* chars in line */
544 static int f_prev = 0;
545 static int fold_preserve_f = FALSE; /* preserve new lines */
546 static int fold_f = FALSE;
547 static int fold_len = 0;
550 static unsigned char kanji_intro = DEFAULT_J;
551 static unsigned char ascii_intro = DEFAULT_R;
555 #define FOLD_MARGIN 10
556 #define DEFAULT_FOLD 60
558 static int fold_margin = FOLD_MARGIN;
562 #ifdef DEFAULT_CODE_JIS
563 # define DEFAULT_CONV j_oconv
565 #ifdef DEFAULT_CODE_SJIS
566 # define DEFAULT_CONV s_oconv
568 #ifdef DEFAULT_CODE_EUC
569 # define DEFAULT_CONV e_oconv
571 #ifdef DEFAULT_CODE_UTF8
572 # define DEFAULT_CONV w_oconv
575 /* process default */
576 static void (*output_conv)(int c2,int c1) = DEFAULT_CONV;
578 static void (*oconv)(int c2,int c1) = no_connection;
579 /* s_iconv or oconv */
580 static int (*iconv)(int c2,int c1,int c0) = no_connection2;
582 static void (*o_zconv)(int c2,int c1) = no_connection;
583 static void (*o_fconv)(int c2,int c1) = no_connection;
584 static void (*o_crconv)(int c2,int c1) = no_connection;
585 static void (*o_rot_conv)(int c2,int c1) = no_connection;
586 static void (*o_hira_conv)(int c2,int c1) = no_connection;
587 static void (*o_base64conv)(int c2,int c1) = no_connection;
588 static void (*o_iso2022jp_check_conv)(int c2,int c1) = no_connection;
590 /* static redirections */
592 static void (*o_putc)(int c) = std_putc;
594 static int (*i_getc)(FILE *f) = std_getc; /* general input */
595 static int (*i_ungetc)(int c,FILE *f) =std_ungetc;
597 static int (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
598 static int (*i_bungetc)(int c ,FILE *f) = std_ungetc;
600 static void (*o_mputc)(int c) = std_putc ; /* output of mputc */
602 static int (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
603 static int (*i_mungetc)(int c ,FILE *f) = std_ungetc;
605 /* for strict mime */
606 static int (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
607 static int (*i_mungetc_buf)(int c,FILE *f) = std_ungetc;
610 static int output_mode = ASCII, /* output kanji mode */
611 input_mode = ASCII, /* input kanji mode */
612 shift_mode = FALSE; /* TRUE shift out, or X0201 */
613 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
615 /* X0201 / X0208 conversion tables */
617 /* X0201 kana conversion table */
620 unsigned char cv[]= {
621 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
622 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
623 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
624 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
625 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
626 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
627 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
628 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
629 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
630 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
631 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
632 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
633 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
634 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
635 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
636 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
640 /* X0201 kana conversion table for dakuten */
643 unsigned char dv[]= {
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
646 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
648 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
649 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
650 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
651 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
652 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
653 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
655 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
656 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
657 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
658 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 /* X0201 kana conversion table for han-dakuten */
665 unsigned char ev[]= {
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
677 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
678 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
679 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
681 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
685 /* X0208 kigou conversion table */
686 /* 0x8140 - 0x819e */
688 unsigned char fv[] = {
690 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
691 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
692 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
694 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
695 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
696 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
697 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
698 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
707 static int file_out_f = FALSE;
709 static int overwrite_f = FALSE;
710 static int preserve_time_f = FALSE;
711 static int backup_f = FALSE;
712 static char *backup_suffix = "";
713 static char *get_backup_filename(const char *suffix, const char *filename);
716 static int crmode_f = 0; /* CR, NL, CRLF */
717 #ifdef EASYWIN /*Easy Win */
718 static int end_check;
721 #define STD_GC_BUFSIZE (256)
722 int std_gc_buf[STD_GC_BUFSIZE];
726 #include "nkf32dll.c"
727 #elif defined(PERL_XS)
729 int main(int argc, char **argv)
734 char *outfname = NULL;
737 #ifdef EASYWIN /*Easy Win */
738 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
741 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
742 cp = (unsigned char *)*argv;
747 if (pipe(fds) < 0 || (pid = fork()) < 0){
758 execvp(argv[1], &argv[1]);
772 if(x0201_f == WISH_TRUE)
773 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
775 if (binmode_f == TRUE)
777 if (freopen("","wb",stdout) == NULL)
784 setbuf(stdout, (char *) NULL);
786 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
789 if (binmode_f == TRUE)
791 if (freopen("","rb",stdin) == NULL) return (-1);
795 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
799 kanji_convert(stdin);
800 if (guess_f) print_guessed_code(NULL);
805 is_inputcode_mixed = FALSE;
806 is_inputcode_set = FALSE;
811 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
820 /* reopen file for stdout */
821 if (file_out_f == TRUE) {
824 outfname = malloc(strlen(origfname)
825 + strlen(".nkftmpXXXXXX")
831 strcpy(outfname, origfname);
835 for (i = strlen(outfname); i; --i){
836 if (outfname[i - 1] == '/'
837 || outfname[i - 1] == '\\'){
843 strcat(outfname, "ntXXXXXX");
845 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
848 strcat(outfname, ".nkftmpXXXXXX");
849 fd = mkstemp(outfname);
852 || (fd_backup = dup(fileno(stdout))) < 0
853 || dup2(fd, fileno(stdout)) < 0
864 outfname = "nkf.out";
867 if(freopen(outfname, "w", stdout) == NULL) {
871 if (binmode_f == TRUE) {
873 if (freopen("","wb",stdout) == NULL)
880 if (binmode_f == TRUE)
882 if (freopen("","rb",fin) == NULL)
887 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
891 char *filename = NULL;
893 if (nfiles > 1) filename = origfname;
894 if (guess_f) print_guessed_code(filename);
900 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
908 if (dup2(fd_backup, fileno(stdout)) < 0){
911 if (stat(origfname, &sb)) {
912 fprintf(stderr, "Can't stat %s\n", origfname);
914 /* Restore the permissions */
915 if (chmod(outfname, sb.st_mode)) {
916 fprintf(stderr, "Can't set permission %s\n", outfname);
919 /* Restore the timestamp */
921 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
922 tb[0] = tb[1] = sb.st_mtime;
923 if (utime(outfname, tb)) {
924 fprintf(stderr, "Can't set timestamp %s\n", outfname);
927 tb.actime = sb.st_atime;
928 tb.modtime = sb.st_mtime;
929 if (utime(outfname, &tb)) {
930 fprintf(stderr, "Can't set timestamp %s\n", outfname);
935 char *backup_filename = get_backup_filename(backup_suffix, origfname);
937 unlink(backup_filename);
939 if (rename(origfname, backup_filename)) {
940 perror(backup_filename);
941 fprintf(stderr, "Can't rename %s to %s\n",
942 origfname, backup_filename);
946 if (unlink(origfname)){
951 if (rename(outfname, origfname)) {
953 fprintf(stderr, "Can't rename %s to %s\n",
954 outfname, origfname);
962 #ifdef EASYWIN /*Easy Win */
963 if (file_out_f == FALSE)
964 scanf("%d",&end_check);
967 #else /* for Other OS */
968 if (file_out_f == TRUE)
973 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename`.
 *
 * If `suffix` contains one or more '*' characters, it is treated as a
 * template: every '*' is replaced by `filename` and all other characters
 * are copied verbatim.  Otherwise the result is `filename` immediately
 * followed by `suffix`.
 *
 * suffix   - NUL-terminated suffix or template; must not be NULL.
 * filename - NUL-terminated original file name; must not be NULL.
 *
 * Returns a newly malloc()ed NUL-terminated string that the caller owns
 * and must free(), or NULL (after reporting via perror) if the
 * allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t total_length;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* every '*' is dropped from the template and replaced by filename */
        total_length = (suffix_length - asterisk_count)
                     + asterisk_count * filename_length;
    } else {
        total_length = filename_length + suffix_length;
    }

    /* +1 for the terminating NUL; both paths size and check the buffer */
    backup_filename = malloc(total_length + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
    } else {
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
    }
    backup_filename[total_length] = '\0';

    return backup_filename;
}
1041 {"katakana-hiragana","h3"},
1048 #ifdef UTF8_OUTPUT_ENABLE
1058 {"fb-subchar=", ""},
1060 #ifdef UTF8_INPUT_ENABLE
1061 {"utf8-input", "W"},
1062 {"utf16-input", "W16"},
1063 {"no-cp932ext", ""},
1064 {"no-best-fit-chars",""},
1066 #ifdef UNICODE_NORMALIZATION
1067 {"utf8mac-input", ""},
1079 #ifdef NUMCHAR_OPTION
1080 {"numchar-input", ""},
1086 #ifdef SHIFTJIS_CP932
1096 static int option_mode = 0;
1098 void options(unsigned char *cp)
1102 unsigned char *cp_back = NULL;
1107 while(*cp && *cp++!='-');
1108 while (*cp || cp_back) {
1116 case '-': /* literal options */
1117 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1121 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1122 p = (unsigned char *)long_option[i].name;
1123 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1124 if (*p == cp[j] || cp[j] == ' '){
1131 while(*cp && *cp != SPACE && cp++);
1132 if (long_option[i].alias[0]){
1134 cp = (unsigned char *)long_option[i].alias;
1136 if (strcmp(long_option[i].name, "ic=") == 0){
1137 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1138 codeset[i] = nkf_toupper(p[i]);
1141 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1142 strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1143 strcmp(codeset, "CP50220") == 0 ||
1144 strcmp(codeset, "CP50221") == 0 ||
1145 strcmp(codeset, "CP50222") == 0 ||
1146 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1147 input_f = JIS_INPUT;
1148 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1149 input_f = JIS_INPUT;
1153 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1154 input_f = JIS_INPUT;
1159 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1160 input_f = SJIS_INPUT;
1161 if (x0201_f==NO_X0201) x0201_f=TRUE;
1162 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1163 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1164 strcmp(codeset, "CP932") == 0 ||
1165 strcmp(codeset, "MS932") == 0){
1166 input_f = SJIS_INPUT;
1168 #ifdef SHIFTJIS_CP932
1171 #ifdef UTF8_OUTPUT_ENABLE
1172 ms_ucs_map_f = UCS_MAP_CP932;
1174 }else if(strcmp(codeset, "EUCJP") == 0 ||
1175 strcmp(codeset, "EUC-JP") == 0){
1176 input_f = JIS_INPUT;
1177 }else if(strcmp(codeset, "CP51932") == 0){
1178 input_f = JIS_INPUT;
1180 #ifdef SHIFTJIS_CP932
1183 #ifdef UTF8_OUTPUT_ENABLE
1184 ms_ucs_map_f = UCS_MAP_CP932;
1186 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1187 strcmp(codeset, "EUCJP-MS") == 0 ||
1188 strcmp(codeset, "EUCJPMS") == 0){
1189 input_f = JIS_INPUT;
1191 #ifdef SHIFTJIS_CP932
1194 #ifdef UTF8_OUTPUT_ENABLE
1195 ms_ucs_map_f = UCS_MAP_MS;
1197 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1198 strcmp(codeset, "EUCJP-ASCII") == 0){
1199 input_f = JIS_INPUT;
1201 #ifdef SHIFTJIS_CP932
1204 #ifdef UTF8_OUTPUT_ENABLE
1205 ms_ucs_map_f = UCS_MAP_ASCII;
1207 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1208 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1209 input_f = SJIS_INPUT;
1211 #ifdef SHIFTJIS_CP932
1215 if (x0201_f==NO_X0201) x0201_f=TRUE;
1216 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1217 strcmp(codeset, "EUC-JIS-2004") == 0){
1218 input_f = JIS_INPUT;
1221 #ifdef SHIFTJIS_CP932
1225 #ifdef UTF8_INPUT_ENABLE
1226 }else if(strcmp(codeset, "UTF-8") == 0 ||
1227 strcmp(codeset, "UTF-8N") == 0 ||
1228 strcmp(codeset, "UTF-8-BOM") == 0){
1229 input_f = UTF8_INPUT;
1230 #ifdef UNICODE_NORMALIZATION
1231 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1232 strcmp(codeset, "UTF-8-MAC") == 0){
1233 input_f = UTF8_INPUT;
1236 }else if(strcmp(codeset, "UTF-16") == 0){
1237 input_f = UTF16BE_INPUT;
1238 utf16_mode = UTF16BE_INPUT;
1239 }else if(strcmp(codeset, "UTF-16BE") == 0 ||
1240 strcmp(codeset, "UTF-16BE-BOM") == 0){
1241 input_f = UTF16BE_INPUT;
1242 utf16_mode = UTF16BE_INPUT;
1243 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1244 strcmp(codeset, "UTF-16LE-BOM") == 0){
1245 input_f = UTF16LE_INPUT;
1246 utf16_mode = UTF16LE_INPUT;
1251 if (strcmp(long_option[i].name, "oc=") == 0){
1252 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1253 codeset[i] = nkf_toupper(p[i]);
1256 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1257 strcmp(codeset, "CP50220") == 0){
1258 output_conv = j_oconv;
1259 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1260 output_conv = j_oconv;
1261 no_cp932ext_f = TRUE;
1262 }else if(strcmp(codeset, "CP50221") == 0 ||
1263 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1264 output_conv = j_oconv;
1266 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1267 output_conv = j_oconv;
1271 #ifdef SHIFTJIS_CP932
1274 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1275 output_conv = j_oconv;
1280 #ifdef SHIFTJIS_CP932
1283 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1284 output_conv = j_oconv;
1289 #ifdef SHIFTJIS_CP932
1292 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1293 output_conv = s_oconv;
1294 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1295 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1296 strcmp(codeset, "CP932") == 0 ||
1297 strcmp(codeset, "MS932") == 0){
1298 output_conv = s_oconv;
1300 #ifdef SHIFTJIS_CP932
1304 #ifdef UTF8_OUTPUT_ENABLE
1305 ms_ucs_map_f = UCS_MAP_CP932;
1307 }else if(strcmp(codeset, "EUCJP") == 0 ||
1308 strcmp(codeset, "EUC-JP") == 0){
1309 output_conv = e_oconv;
1310 }else if(strcmp(codeset, "CP51932") == 0){
1311 output_conv = e_oconv;
1313 #ifdef SHIFTJIS_CP932
1316 #ifdef UTF8_OUTPUT_ENABLE
1317 ms_ucs_map_f = UCS_MAP_CP932;
1319 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1320 strcmp(codeset, "EUCJP-MS") == 0 ||
1321 strcmp(codeset, "EUCJPMS") == 0){
1322 output_conv = e_oconv;
1327 #ifdef SHIFTJIS_CP932
1330 #ifdef UTF8_OUTPUT_ENABLE
1331 ms_ucs_map_f = UCS_MAP_MS;
1333 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1334 strcmp(codeset, "EUCJP-ASCII") == 0){
1335 output_conv = e_oconv;
1340 #ifdef SHIFTJIS_CP932
1343 #ifdef UTF8_OUTPUT_ENABLE
1344 ms_ucs_map_f = UCS_MAP_ASCII;
1346 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1347 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1348 output_conv = s_oconv;
1350 #ifdef SHIFTJIS_CP932
1353 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1354 strcmp(codeset, "EUC-JIS-2004") == 0){
1355 output_conv = e_oconv;
1360 #ifdef SHIFTJIS_CP932
1363 #ifdef UTF8_OUTPUT_ENABLE
1364 }else if(strcmp(codeset, "UTF-8") == 0){
1365 output_conv = w_oconv;
1366 }else if(strcmp(codeset, "UTF-8N") == 0){
1367 output_conv = w_oconv;
1369 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1370 output_conv = w_oconv;
1372 }else if(strcmp(codeset, "UTF-16BE") == 0){
1373 output_conv = w_oconv16;
1375 }else if(strcmp(codeset, "UTF-16") == 0 ||
1376 strcmp(codeset, "UTF-16BE-BOM") == 0){
1377 output_conv = w_oconv16;
1379 }else if(strcmp(codeset, "UTF-16LE") == 0){
1380 output_conv = w_oconv16;
1383 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1384 output_conv = w_oconv16;
1392 if (strcmp(long_option[i].name, "overwrite") == 0){
1395 preserve_time_f = TRUE;
1398 if (strcmp(long_option[i].name, "overwrite=") == 0){
1401 preserve_time_f = TRUE;
1403 backup_suffix = malloc(strlen((char *) p) + 1);
1404 strcpy(backup_suffix, (char *) p);
1407 if (strcmp(long_option[i].name, "in-place") == 0){
1410 preserve_time_f = FALSE;
1413 if (strcmp(long_option[i].name, "in-place=") == 0){
1416 preserve_time_f = FALSE;
1418 backup_suffix = malloc(strlen((char *) p) + 1);
1419 strcpy(backup_suffix, (char *) p);
1424 if (strcmp(long_option[i].name, "cap-input") == 0){
1428 if (strcmp(long_option[i].name, "url-input") == 0){
1433 #ifdef NUMCHAR_OPTION
1434 if (strcmp(long_option[i].name, "numchar-input") == 0){
1440 if (strcmp(long_option[i].name, "no-output") == 0){
1444 if (strcmp(long_option[i].name, "debug") == 0){
1449 if (strcmp(long_option[i].name, "cp932") == 0){
1450 #ifdef SHIFTJIS_CP932
1454 #ifdef UTF8_OUTPUT_ENABLE
1455 ms_ucs_map_f = UCS_MAP_CP932;
1459 if (strcmp(long_option[i].name, "no-cp932") == 0){
1460 #ifdef SHIFTJIS_CP932
1464 #ifdef UTF8_OUTPUT_ENABLE
1465 ms_ucs_map_f = UCS_MAP_ASCII;
1469 #ifdef SHIFTJIS_CP932
1470 if (strcmp(long_option[i].name, "cp932inv") == 0){
1477 if (strcmp(long_option[i].name, "x0212") == 0){
1484 if (strcmp(long_option[i].name, "exec-in") == 0){
1488 if (strcmp(long_option[i].name, "exec-out") == 0){
1493 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1494 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1495 no_cp932ext_f = TRUE;
1498 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1499 no_best_fit_chars_f = TRUE;
1502 if (strcmp(long_option[i].name, "fb-skip") == 0){
1503 encode_fallback = NULL;
1506 if (strcmp(long_option[i].name, "fb-html") == 0){
1507 encode_fallback = encode_fallback_html;
1510 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1511 encode_fallback = encode_fallback_xml;
1514 if (strcmp(long_option[i].name, "fb-java") == 0){
1515 encode_fallback = encode_fallback_java;
1518 if (strcmp(long_option[i].name, "fb-perl") == 0){
1519 encode_fallback = encode_fallback_perl;
1522 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1523 encode_fallback = encode_fallback_subchar;
1526 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1527 encode_fallback = encode_fallback_subchar;
1528 unicode_subchar = 0;
1530 /* decimal number */
1531 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1532 unicode_subchar *= 10;
1533 unicode_subchar += hex2bin(p[i]);
1535 }else if(p[1] == 'x' || p[1] == 'X'){
1536 /* hexadecimal number */
1537 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1538 unicode_subchar <<= 4;
1539 unicode_subchar |= hex2bin(p[i]);
1543 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1544 unicode_subchar *= 8;
1545 unicode_subchar += hex2bin(p[i]);
1548 w16e_conv(unicode_subchar, &i, &j);
1549 unicode_subchar = i<<8 | j;
1553 #ifdef UTF8_OUTPUT_ENABLE
1554 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1555 ms_ucs_map_f = UCS_MAP_MS;
1559 #ifdef UNICODE_NORMALIZATION
1560 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1561 input_f = UTF8_INPUT;
1566 if (strcmp(long_option[i].name, "prefix=") == 0){
1567 if (' ' < p[0] && p[0] < 128){
1568 for (i = 1; ' ' < p[i] && p[i] < 128; i++){
1569 prefix_table[p[i]] = p[0];
1576 case 'b': /* buffered mode */
1579 case 'u': /* non-buffered mode */
1582 case 't': /* transparent mode */
1587 } else if (*cp=='2') {
1591 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1599 case 'j': /* JIS output */
1601 output_conv = j_oconv;
1603 case 'e': /* AT&T EUC output */
1604 output_conv = e_oconv;
1606 case 's': /* SJIS output */
1607 output_conv = s_oconv;
1609 case 'l': /* ISO8859 Latin-1 support, no conversion */
1610 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1611 input_f = LATIN1_INPUT;
1613 case 'i': /* Kanji IN ESC-$-@/B */
1614 if (*cp=='@'||*cp=='B')
1615 kanji_intro = *cp++;
1617 case 'o': /* ASCII IN ESC-(-J/B */
1618 if (*cp=='J'||*cp=='B'||*cp=='H')
1619 ascii_intro = *cp++;
1623 bit:1 katakana->hiragana
1624 bit:2 hiragana->katakana
1626 if ('9'>= *cp && *cp>='0')
1627 hira_f |= (*cp++ -'0');
1634 #if defined(MSDOS) || defined(__OS2__)
1649 #ifdef UTF8_OUTPUT_ENABLE
1650 case 'w': /* UTF-8 output */
1651 if ('1'== cp[0] && '6'==cp[1]) {
1652 output_conv = w_oconv16; cp+=2;
1654 unicode_bom_f=2; cp++;
1657 unicode_bom_f=1; cp++;
1659 } else if (cp[0] == 'B') {
1660 unicode_bom_f=2; cp++;
1662 unicode_bom_f=1; cp++;
1665 } else if (cp[0] == '8') {
1666 output_conv = w_oconv; cp++;
1669 unicode_bom_f=1; cp++;
1672 output_conv = w_oconv;
1675 #ifdef UTF8_INPUT_ENABLE
1676 case 'W': /* UTF-8 input */
1677 if ('1'== cp[0] && '6'==cp[1]) {
1678 input_f = UTF16BE_INPUT;
1679 utf16_mode = UTF16BE_INPUT;
1683 input_f = UTF16LE_INPUT;
1684 utf16_mode = UTF16LE_INPUT;
1685 } else if (cp[0] == 'B') {
1687 input_f = UTF16BE_INPUT;
1688 utf16_mode = UTF16BE_INPUT;
1690 } else if (cp[0] == '8') {
1692 input_f = UTF8_INPUT;
1694 input_f = UTF8_INPUT;
1697 /* Input code assumption */
1698 case 'J': /* JIS input */
1699 case 'E': /* AT&T EUC input */
1700 input_f = JIS_INPUT;
1702 case 'S': /* MS Kanji input */
1703 input_f = SJIS_INPUT;
1704 if (x0201_f==NO_X0201) x0201_f=TRUE;
1706 case 'Z': /* Convert X0208 alphabet to ASCII */
1707 /* bit:0 Convert X0208
1708 bit:1 Convert Kankaku to one space
1709 bit:2 Convert Kankaku to two spaces
1710 bit:3 Convert HTML Entity
1712 if ('9'>= *cp && *cp>='0')
1713 alpha_f |= 1<<(*cp++ -'0');
1717 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1718 x0201_f = FALSE; /* No X0201->X0208 conversion */
1720 ESC-(-I in JIS, EUC, MS Kanji
1721 SI/SO in JIS, EUC, MS Kanji
1722 SSO in EUC, JIS, not in MS Kanji
1723 MS Kanji (0xa0-0xdf)
1725 ESC-(-I in JIS (0x20-0x5f)
1726 SSO in EUC (0xa0-0xdf)
1727 0xa0-0xdf in MS Kanji (0xa0-0xdf)
1730 case 'X': /* Assume X0201 kana */
1731 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1734 case 'F': /* preserve new lines */
1735 fold_preserve_f = TRUE;
1736 case 'f': /* folding -f60 or -f */
1739 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1741 fold_len += *cp++ - '0';
1743 if (!(0<fold_len && fold_len<BUFSIZ))
1744 fold_len = DEFAULT_FOLD;
1748 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1750 fold_margin += *cp++ - '0';
1754 case 'm': /* MIME support */
1755 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1756 if (*cp=='B'||*cp=='Q') {
1757 mime_decode_mode = *cp++;
1758 mimebuf_f = FIXED_MIME;
1759 } else if (*cp=='N') {
1760 mime_f = TRUE; cp++;
1761 } else if (*cp=='S') {
1762 mime_f = STRICT_MIME; cp++;
1763 } else if (*cp=='0') {
1764 mime_decode_f = FALSE;
1765 mime_f = FALSE; cp++;
1768 case 'M': /* MIME output */
1771 mimeout_f = FIXED_MIME; cp++;
1772 } else if (*cp=='Q') {
1774 mimeout_f = FIXED_MIME; cp++;
1779 case 'B': /* Broken JIS support */
1781 bit:1 allow any x on ESC-(-x or ESC-$-x
1782 bit:2 reset to ascii on NL
1784 if ('9'>= *cp && *cp>='0')
1785 broken_f |= 1<<(*cp++ -'0');
1790 case 'O':/* for Output file */
1794 case 'c':/* add cr code */
1797 case 'd':/* delete cr code */
1800 case 'I': /* ISO-2022-JP output */
1803 case 'L': /* line mode */
1804 if (*cp=='u') { /* unix */
1805 crmode_f = NL; cp++;
1806 } else if (*cp=='m') { /* mac */
1807 crmode_f = CR; cp++;
1808 } else if (*cp=='w') { /* windows */
1809 crmode_f = CRLF; cp++;
1810 } else if (*cp=='0') { /* no conversion */
1820 /* module: multiple options in a string are allowed for the Perl module */
1821 while(*cp && *cp++!='-');
1824 /* bogus option but ignored */
1830 #ifdef ANSI_C_PROTOTYPE
1831 struct input_code * find_inputcode_byfunc(int (*iconv_func)(int c2,int c1,int c0))
1833 struct input_code * find_inputcode_byfunc(iconv_func)
1834 int (*iconv_func)();
1838 struct input_code *p = input_code_list;
1840 if (iconv_func == p->iconv_func){
1849 void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
1851 #ifdef INPUT_CODE_FIX
1859 #ifdef INPUT_CODE_FIX
1860 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1866 if (estab_f && iconv_for_check != iconv){
1867 struct input_code *p = find_inputcode_byfunc(iconv);
1869 set_input_codename(p->name);
1870 debug(input_codename);
1872 iconv_for_check = iconv;
1877 #define SCORE_L2 (1) /* JIS level-2 kanji */
1878 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1879 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1880 #ifdef SHIFTJIS_CP932
1881 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character substituted via the CP932 mapping */
1882 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1884 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1886 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
1887 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1889 #define SCORE_INIT (SCORE_iMIME)
1891 const int score_table_A0[] = {
1894 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1895 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1898 const int score_table_F0[] = {
1899 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1900 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1901 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1902 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
1905 void set_code_score(struct input_code *ptr, int score)
1908 ptr->score |= score;
1912 void clr_code_score(struct input_code *ptr, int score)
1915 ptr->score &= ~score;
1919 void code_score(struct input_code *ptr)
1921 int c2 = ptr->buf[0];
1922 #ifdef UTF8_OUTPUT_ENABLE
1923 int c1 = ptr->buf[1];
1926 set_code_score(ptr, SCORE_ERROR);
1927 }else if (c2 == SSO){
1928 set_code_score(ptr, SCORE_KANA);
1929 #ifdef UTF8_OUTPUT_ENABLE
1930 }else if (!e2w_conv(c2, c1)){
1931 set_code_score(ptr, SCORE_NO_EXIST);
1933 }else if ((c2 & 0x70) == 0x20){
1934 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1935 }else if ((c2 & 0x70) == 0x70){
1936 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1937 }else if ((c2 & 0x70) >= 0x50){
1938 set_code_score(ptr, SCORE_L2);
1942 void status_disable(struct input_code *ptr)
1947 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
1950 void status_push_ch(struct input_code *ptr, int c)
1952 ptr->buf[ptr->index++] = c;
1955 void status_clear(struct input_code *ptr)
1961 void status_reset(struct input_code *ptr)
1964 ptr->score = SCORE_INIT;
1967 void status_reinit(struct input_code *ptr)
1970 ptr->_file_stat = 0;
1973 void status_check(struct input_code *ptr, int c)
1975 if (c <= DEL && estab_f){
1980 void s_status(struct input_code *ptr, int c)
1984 status_check(ptr, c);
1989 #ifdef NUMCHAR_OPTION
1990 }else if ((c & CLASS_MASK) == CLASS_UTF16){
1993 }else if (0xa1 <= c && c <= 0xdf){
1994 status_push_ch(ptr, SSO);
1995 status_push_ch(ptr, c);
1998 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2000 status_push_ch(ptr, c);
2001 #ifdef SHIFTJIS_CP932
2003 && CP932_TABLE_BEGIN <= c && c <= CP932_TABLE_END){
2005 status_push_ch(ptr, c);
2006 #endif /* SHIFTJIS_CP932 */
2008 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2010 status_push_ch(ptr, c);
2011 #endif /* X0212_ENABLE */
2013 status_disable(ptr);
2017 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2018 status_push_ch(ptr, c);
2019 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2023 status_disable(ptr);
2027 #ifdef SHIFTJIS_CP932
2028 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2029 status_push_ch(ptr, c);
2030 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2031 set_code_score(ptr, SCORE_CP932);
2036 #endif /* SHIFTJIS_CP932 */
2037 #ifndef X0212_ENABLE
2038 status_disable(ptr);
2044 void e_status(struct input_code *ptr, int c)
2048 status_check(ptr, c);
2053 #ifdef NUMCHAR_OPTION
2054 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2057 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2059 status_push_ch(ptr, c);
2061 }else if (0x8f == c){
2063 status_push_ch(ptr, c);
2064 #endif /* X0212_ENABLE */
2066 status_disable(ptr);
2070 if (0xa1 <= c && c <= 0xfe){
2071 status_push_ch(ptr, c);
2075 status_disable(ptr);
2080 if (0xa1 <= c && c <= 0xfe){
2082 status_push_ch(ptr, c);
2084 status_disable(ptr);
2086 #endif /* X0212_ENABLE */
2090 #ifdef UTF8_INPUT_ENABLE
2091 void w16_status(struct input_code *ptr, int c)
2097 if (ptr->_file_stat == 0){
2098 if (c == 0xfe || c == 0xff){
2100 status_push_ch(ptr, c);
2101 ptr->_file_stat = 1;
2103 status_disable(ptr);
2104 ptr->_file_stat = -1;
2106 }else if (ptr->_file_stat > 0){
2108 status_push_ch(ptr, c);
2109 }else if (ptr->_file_stat < 0){
2110 status_disable(ptr);
2116 status_disable(ptr);
2117 ptr->_file_stat = -1;
2119 status_push_ch(ptr, c);
2126 if (ptr->stat != c && (c == 0xfe || c == 0xff)){
2127 status_push_ch(ptr, c);
2130 status_disable(ptr);
2131 ptr->_file_stat = -1;
2137 void w_status(struct input_code *ptr, int c)
2141 status_check(ptr, c);
2146 #ifdef NUMCHAR_OPTION
2147 }else if ((c & CLASS_MASK) == CLASS_UTF16){
2150 }else if (0xc0 <= c && c <= 0xdf){
2152 status_push_ch(ptr, c);
2153 }else if (0xe0 <= c && c <= 0xef){
2155 status_push_ch(ptr, c);
2157 status_disable(ptr);
2162 if (0x80 <= c && c <= 0xbf){
2163 status_push_ch(ptr, c);
2164 if (ptr->index > ptr->stat){
2165 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2166 && ptr->buf[2] == 0xbf);
2167 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2168 &ptr->buf[0], &ptr->buf[1]);
2175 status_disable(ptr);
2182 void code_status(int c)
2184 int action_flag = 1;
2185 struct input_code *result = 0;
2186 struct input_code *p = input_code_list;
2188 (p->status_func)(p, c);
2191 }else if(p->stat == 0){
2202 if (result && !estab_f){
2203 set_iconv(TRUE, result->iconv_func);
2204 }else if (c <= DEL){
2205 struct input_code *ptr = input_code_list;
2215 int std_getc(FILE *f)
2218 return std_gc_buf[--std_gc_ndx];
2224 int std_ungetc(int c, FILE *f)
2226 if (std_gc_ndx == STD_GC_BUFSIZE){
2229 std_gc_buf[std_gc_ndx++] = c;
2234 void std_putc(int c)
2241 #if !defined(PERL_XS) && !defined(WIN32DLL)
2242 int noconvert(FILE *f)
2247 module_connection();
2248 while ((c = (*i_getc)(f)) != EOF)
2255 void module_connection(void)
2257 oconv = output_conv;
2260 /* replace continuation module, from output side */
2262 /* output redirection */
2264 if (noout_f || guess_f){
2271 if (mimeout_f == TRUE) {
2272 o_base64conv = oconv; oconv = base64_conv;
2274 /* base64_count = 0; */
2278 o_crconv = oconv; oconv = cr_conv;
2281 o_rot_conv = oconv; oconv = rot_conv;
2284 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2287 o_hira_conv = oconv; oconv = hira_conv;
2290 o_fconv = oconv; oconv = fold_conv;
2293 if (alpha_f || x0201_f) {
2294 o_zconv = oconv; oconv = z_conv;
2298 i_ungetc = std_ungetc;
2299 /* input redirection */
2302 i_cgetc = i_getc; i_getc = cap_getc;
2303 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2306 i_ugetc = i_getc; i_getc = url_getc;
2307 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2310 #ifdef NUMCHAR_OPTION
2312 i_ngetc = i_getc; i_getc = numchar_getc;
2313 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2316 #ifdef UNICODE_NORMALIZATION
2317 if (nfc_f && input_f == UTF8_INPUT){
2318 i_nfc_getc = i_getc; i_getc = nfc_getc;
2319 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2322 if (mime_f && mimebuf_f==FIXED_MIME) {
2323 i_mgetc = i_getc; i_getc = mime_getc;
2324 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2327 i_bgetc = i_getc; i_getc = broken_getc;
2328 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2330 if (input_f == JIS_INPUT || input_f == LATIN1_INPUT) {
2331 set_iconv(-TRUE, e_iconv);
2332 } else if (input_f == SJIS_INPUT) {
2333 set_iconv(-TRUE, s_iconv);
2334 #ifdef UTF8_INPUT_ENABLE
2335 } else if (input_f == UTF8_INPUT) {
2336 set_iconv(-TRUE, w_iconv);
2337 } else if (input_f == UTF16BE_INPUT) {
2338 set_iconv(-TRUE, w_iconv16);
2339 } else if (input_f == UTF16LE_INPUT) {
2340 set_iconv(-TRUE, w_iconv16);
2343 set_iconv(FALSE, e_iconv);
2347 struct input_code *p = input_code_list;
2355 Conversion main loop. Code detection only.
2358 int kanji_convert(FILE *f)
2362 int is_8bit = FALSE;
2364 module_connection();
2367 if(input_f == SJIS_INPUT
2368 #ifdef UTF8_INPUT_ENABLE
2369 || input_f == UTF8_INPUT || input_f == UTF16BE_INPUT || input_f == UTF16LE_INPUT
2377 output_mode = ASCII;
2380 #define NEXT continue /* no output, get next */
2381 #define SEND ; /* output c1 and c2, get next */
2382 #define LAST break /* end of loop, go closing */
2384 while ((c1 = (*i_getc)(f)) != EOF) {
2385 #ifdef INPUT_CODE_FIX
2392 /* in case of 8th bit is on */
2393 if (!estab_f&&!mime_decode_mode) {
2394 /* in case of not established yet */
2395 /* It is still ambiguous */
2396 if (h_conv(f, c2, c1)==EOF)
2402 /* in case of already established */
2404 /* ignore bogus code */
2410 /* second byte, 7 bit code */
2411 /* it might be kanji shifted */
2412 if ((c1 == DEL) || (c1 <= SPACE)) {
2413 /* ignore bogus first code */
2421 #ifdef UTF8_INPUT_ENABLE
2430 #ifdef NUMCHAR_OPTION
2431 } else if ((c1 & CLASS_MASK) == CLASS_UTF16){
2434 } else if (c1 > DEL) {
2436 if (!estab_f && !iso8859_f) {
2437 /* not established yet */
2438 if (!is_8bit) is_8bit = TRUE;
2441 } else { /* estab_f==TRUE */
2446 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2447 /* SJIS X0201 Case... */
2448 if(iso2022jp_f && x0201_f==NO_X0201) {
2449 (*oconv)(GETA1, GETA2);
2456 } else if (c1==SSO && iconv != s_iconv) {
2457 /* EUC X0201 Case */
2458 c1 = (*i_getc)(f); /* skip SSO */
2460 if (SSP<=c1 && c1<0xe0) {
2461 if(iso2022jp_f && x0201_f==NO_X0201) {
2462 (*oconv)(GETA1, GETA2);
2469 } else { /* bogus code, skip SSO and one byte */
2473 /* already established */
2478 } else if ((c1 > SPACE) && (c1 != DEL)) {
2479 /* in case of Roman characters */
2481 /* output 1 shifted byte */
2485 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2486 /* output 1 shifted byte */
2487 if(iso2022jp_f && x0201_f==NO_X0201) {
2488 (*oconv)(GETA1, GETA2);
2495 /* look like bogus code */
2498 } else if (input_mode == X0208 || input_mode == X0212 ||
2499 input_mode == X0213_1 || input_mode == X0213_2) {
2500 /* in case of Kanji shifted */
2503 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2504 /* Check MIME code */
2505 if ((c1 = (*i_getc)(f)) == EOF) {
2508 } else if (c1 == '?') {
2509 /* =? is mime conversion start sequence */
2510 if(mime_f == STRICT_MIME) {
2511 /* check in real detail */
2512 if (mime_begin_strict(f) == EOF)
2516 } else if (mime_begin(f) == EOF)
2526 /* normal ASCII code */
2529 } else if (!is_8bit && c1 == SI) {
2532 } else if (!is_8bit && c1 == SO) {
2535 } else if (!is_8bit && c1 == ESC ) {
2536 if ((c1 = (*i_getc)(f)) == EOF) {
2537 /* (*oconv)(0, ESC); don't send bogus code */
2539 } else if (c1 == '$') {
2540 if ((c1 = (*i_getc)(f)) == EOF) {
2542 (*oconv)(0, ESC); don't send bogus code
2543 (*oconv)(0, '$'); */
2545 } else if (c1 == '@'|| c1 == 'B') {
2546 /* This is kanji introduction */
2549 set_input_codename("ISO-2022-JP");
2551 debug(input_codename);
2554 } else if (c1 == '(') {
2555 if ((c1 = (*i_getc)(f)) == EOF) {
2556 /* don't send bogus code
2562 } else if (c1 == '@'|| c1 == 'B') {
2563 /* This is kanji introduction */
2568 } else if (c1 == 'D'){
2572 #endif /* X0212_ENABLE */
2573 } else if (c1 == (X0213_1&0x7F)){
2574 input_mode = X0213_1;
2577 } else if (c1 == (X0213_2&0x7F)){
2578 input_mode = X0213_2;
2582 /* could be some special code */
2589 } else if (broken_f&0x2) {
2590 /* accept any ESC-(-x as broken code ... */
2600 } else if (c1 == '(') {
2601 if ((c1 = (*i_getc)(f)) == EOF) {
2602 /* don't send bogus code
2604 (*oconv)(0, '('); */
2608 /* This is X0201 kana introduction */
2609 input_mode = X0201; shift_mode = X0201;
2611 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2612 /* This is ASCII / JIS-Roman introduction (ESC ( B, J or H) */
2613 input_mode = ASCII; shift_mode = FALSE;
2615 } else if (broken_f&0x2) {
2616 input_mode = ASCII; shift_mode = FALSE;
2621 /* maintain various input_mode here */
2625 } else if ( c1 == 'N' || c1 == 'n' ){
2627 c3 = (*i_getc)(f); /* skip SS2 */
2628 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2643 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2644 input_mode = ASCII; set_iconv(FALSE, 0);
2646 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2647 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2655 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2656 if ((c1=(*i_getc)(f))!=EOF) {
2660 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2678 if ((*iconv)(c2, c1, 0) < 0){ /* can be EUC/SJIS */
2679 int c0 = (*i_getc)(f);
2682 (*iconv)(c2, c1, c0);
2688 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2692 (*oconv)((0x8f << 8) | c2, c1);
2694 #endif /* X0212_ENABLE */
2696 (*oconv)((0x8f << 8) | c2, c1);
2699 (*oconv)(input_mode, c1); /* other special case */
2704 /* goto next_word */
2708 (*iconv)(EOF, 0, 0);
2709 if (!is_inputcode_set)
2712 struct input_code *p = input_code_list;
2713 struct input_code *result = p;
2715 if (p->score < result->score) result = p;
2718 set_input_codename(result->name);
2725 h_conv(FILE *f, int c2, int c1)
2730 /** it must NOT be in the kanji shifted sequence */
2731 /** it must NOT be written in JIS7 */
2732 /** and it must be after 2 byte 8bit code */
2738 while ((c1 = (*i_getc)(f)) != EOF) {
2744 if (push_hold_buf(c1) == EOF || estab_f){
2750 struct input_code *p = input_code_list;
2751 struct input_code *result = p;
2756 if (p->score < result->score){
2761 set_iconv(FALSE, result->iconv_func);
2766 ** 1) EOF is detected, or
2767 ** 2) Code is established, or
2768 ** 3) Buffer is FULL (but last word is pushed)
2770 ** in 1) and 3) cases, we continue to use
2771 ** Kanji codes by oconv and leave estab_f unchanged.
2776 while (wc < hold_count){
2777 c2 = hold_buf[wc++];
2779 #ifdef NUMCHAR_OPTION
2780 || (c2 & CLASS_MASK) == CLASS_UTF16
2785 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2786 (*iconv)(X0201, c2, 0);
2789 if (wc < hold_count){
2790 c1 = hold_buf[wc++];
2799 if ((*iconv)(c2, c1, 0) < 0){
2801 if (wc < hold_count){
2802 c0 = hold_buf[wc++];
2811 (*iconv)(c2, c1, c0);
2820 push_hold_buf(int c2)
2822 if (hold_count >= HOLD_SIZE*2)
2824 hold_buf[hold_count++] = (unsigned char)c2;
2825 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
2828 int s2e_conv(int c2, int c1, int *p2, int *p1)
2830 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
2833 static const int shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
2834 #ifdef SHIFTJIS_CP932
2835 if (cp51932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
2836 extern const unsigned short shiftjis_cp932[3][189];
2837 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
2843 #endif /* SHIFTJIS_CP932 */
2845 if (!x0213_f && 0xfa <= c2 && c2 <= 0xfc){
2846 extern const unsigned short shiftjis_x0212[3][189];
2847 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
2850 c2 = (0x8f << 8) | (val >> 8);
2863 if(x0213_f && c2 >= 0xF0){
2864 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
2865 c2 = 0x8F20 + shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
2866 }else{ /* 78<=k<=94 */
2867 c2 = 0x8F00 | (c2 * 2 - 0x17B);
2868 if (0x9E < c1) c2++;
2871 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
2872 if (0x9E < c1) c2++;
2875 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
2882 c2 = x0212_unshift(c2);
2889 int s_iconv(int c2, int c1, int c0)
2893 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2896 int ret = s2e_conv(c2, c1, &c2, &c1);
2897 if (ret) return ret;
2903 int e_iconv(int c2, int c1, int c0)
2908 }else if (c2 == 0x8f){
2912 c2 = (c2 << 8) | (c1 & 0x7f);
2914 #ifdef SHIFTJIS_CP932
2917 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2918 s2e_conv(s2, s1, &c2, &c1);
2919 if ((c2 & 0xff00) == 0){
2925 #endif /* SHIFTJIS_CP932 */
2926 #endif /* X0212_ENABLE */
2927 } else if (c2 == SSO){
2930 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
2940 #ifdef UTF8_INPUT_ENABLE
2941 int w2e_conv(int c2, int c1, int c0, int *p2, int *p1)
2948 }else if (0xc0 <= c2 && c2 <= 0xef) {
2949 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2950 #ifdef NUMCHAR_OPTION
2953 if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
2961 int w_iconv(int c2, int c1, int c0)
2965 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
2966 if(ignore_zwnbsp_f){
2967 ignore_zwnbsp_f = FALSE;
2968 if(c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
2972 if (c2 == 0) /* 0x00-0x7f */
2973 c1 &= 0x7F; /* 1byte */
2975 if ((c2 & 0xe0) == 0xc0){ /* 0xc0-0xdf */
2977 if((c2 & 0xFE) == 0xC0 || c1 < 0x80 || 0xBF < c1) return 0;
2978 }else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
2979 return -1; /* 3bytes */
2981 else if (0xf0 <= c2)
2982 return 0; /* 4,5,6bytes */
2983 else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
2984 return 0; /* trail byte */
2988 /* must be 3bytes */
2990 if(c1 < 0xA0 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
2992 }else if(c2 == 0xED){
2993 if(c1 < 0x80 || 0x9F < c1 || c0 < 0x80 || 0xBF < c0)
2995 }else if((c2 & 0xf0) == 0xe0){
2996 if(c1 < 0x80 || 0xBF < c1 || c0 < 0x80 || 0xBF < c0)
3000 if (c2 == 0 || c2 == EOF){
3002 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3011 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3012 void w16w_conv(int val, int *p2, int *p1, int *p0)
3019 }else if (val < 0x800){
3020 *p2 = 0xc0 | (val >> 6);
3021 *p1 = 0x80 | (val & 0x3f);
3024 *p2 = 0xe0 | (val >> 12);
3025 *p1 = 0x80 | ((val >> 6) & 0x3f);
3026 *p0 = 0x80 | (val & 0x3f);
3031 #ifdef UTF8_INPUT_ENABLE
3032 int ww16_conv(int c2, int c1, int c0)
3037 }else if (c2 >= 0xe0){
3038 val = (c2 & 0x0f) << 12;
3039 val |= (c1 & 0x3f) << 6;
3041 }else if (c2 >= 0xc0){
3042 val = (c2 & 0x1f) << 6;
3050 int w16e_conv(int val, int *p2, int *p1)
3059 w16w_conv(val, &c2, &c1, &c0);
3060 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3061 #ifdef NUMCHAR_OPTION
3064 *p1 = CLASS_UTF16 | val;
3073 #ifdef UTF8_INPUT_ENABLE
3074 int w_iconv16(int c2, int c1, int c0)
3078 /* throw away ZERO WIDTH NO-BREAK SPACE (U+FEFF) */
3079 if(ignore_zwnbsp_f){
3080 ignore_zwnbsp_f = FALSE;
3081 if (c2==0376 && c1==0377){
3082 utf16_mode = UTF16BE_INPUT;
3084 }else if(c2==0377 && c1==0376){
3085 utf16_mode = UTF16LE_INPUT;
3089 if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
3091 tmp=c1; c1=c2; c2=tmp;
3093 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3096 }else if((c2>>3)==27){ /* surrogate pair */
3098 }else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
3099 if (ret) return ret;
3104 int unicode_to_jis_common(int c2, int c1, int c0, int *p2, int *p1)
3106 extern const unsigned short *const utf8_to_euc_2bytes[];
3107 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3108 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3109 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3110 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3111 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3112 const unsigned short *const *pp;
3113 const unsigned short *const *const *ppp;
3114 static const int no_best_fit_chars_table_C2[] =
3115 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3116 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3117 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3118 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3119 static const int no_best_fit_chars_table_C2_ms[] =
3120 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3121 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3122 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3123 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3124 static const int no_best_fit_chars_table_932_C2[] =
3125 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3127 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3128 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3129 static const int no_best_fit_chars_table_932_C3[] =
3130 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3131 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3132 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3133 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3139 }else if(c2 < 0xe0){
3140 if(no_best_fit_chars_f){
3141 if(ms_ucs_map_f == UCS_MAP_CP932){
3144 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3147 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3150 }else if(cp51932_f){
3153 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3156 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3159 }else if(ms_ucs_map_f == UCS_MAP_MS){
3160 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3164 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3165 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3167 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3169 if(no_best_fit_chars_f){
3170 if(ms_ucs_map_f == UCS_MAP_CP932){
3171 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3172 }else if(ms_ucs_map_f == UCS_MAP_MS){
3177 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3180 if(c0 == 0x92) return 1;
3185 if(c1 == 0x80 || c0 == 0x9C) return 1;
3193 if(c0 == 0x95) return 1;
3196 if(c0 == 0xA5) return 1;
3203 if(c0 == 0x8D) return 1;
3206 if(c0 == 0x9E && cp51932_f) return 1;
3209 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3217 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3218 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3220 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3222 #ifdef SHIFTJIS_CP932
3223 if (!ret && cp51932_f && (*p2 & 0xff00) >> 8 == 0x8f) {
3225 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3226 s2e_conv(s2, s1, p2, p1);
3235 int w_iconv_common(int c1, int c0, const unsigned short *const *pp, int psize, int *p2, int *p1)
3238 const unsigned short *p;
3241 if (pp == 0) return 1;
3244 if (c1 < 0 || psize <= c1) return 1;
3246 if (p == 0) return 1;
3249 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3251 if (val == 0) return 1;
3252 if (no_cp932ext_f && (
3253 (val>>8) == 0x2D || /* NEC special characters */
3254 val > 0xF300 /* NEC special characters */
3262 if (c2 == SO) c2 = X0201;
3269 void nkf_each_char_to_hex(void (*f)(int c2,int c1), int c)
3271 const char *hex = "0123456789ABCDEF";
3277 (*f)(0, hex[(c>>shift)&0xF]);
3287 void encode_fallback_html(int c)
3293 (*oconv)(0, 0x30+(c/1000000)%10);
3295 (*oconv)(0, 0x30+(c/100000 )%10);
3297 (*oconv)(0, 0x30+(c/10000 )%10);
3299 (*oconv)(0, 0x30+(c/1000 )%10);
3301 (*oconv)(0, 0x30+(c/100 )%10);
3303 (*oconv)(0, 0x30+(c/10 )%10);
3305 (*oconv)(0, 0x30+ c %10);
3310 void encode_fallback_xml(int c)
3315 nkf_each_char_to_hex(oconv, c);
3320 void encode_fallback_java(int c)
3322 const char *hex = "0123456789ABCDEF";
3324 if((c&0x00FFFFFF) > 0xFFFF){
3328 (*oconv)(0, hex[(c>>20)&0xF]);
3329 (*oconv)(0, hex[(c>>16)&0xF]);
3333 (*oconv)(0, hex[(c>>12)&0xF]);
3334 (*oconv)(0, hex[(c>> 8)&0xF]);
3335 (*oconv)(0, hex[(c>> 4)&0xF]);
3336 (*oconv)(0, hex[ c &0xF]);
3340 void encode_fallback_perl(int c)
3345 nkf_each_char_to_hex(oconv, c);
3350 void encode_fallback_subchar(int c)
3352 c = unicode_subchar;
3353 (*oconv)((c>>8)&0xFF, c&0xFF);
3358 #ifdef UTF8_OUTPUT_ENABLE
3359 int e2w_conv(int c2, int c1)
3361 extern const unsigned short euc_to_utf8_1byte[];
3362 extern const unsigned short *const euc_to_utf8_2bytes[];
3363 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3364 extern const unsigned short *const x0212_to_utf8_2bytes[];
3365 const unsigned short *p;
3368 p = euc_to_utf8_1byte;
3370 } else if (c2 >> 8 == 0x8f){
3371 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == 0x8F22 && c1 == 0x43){
3374 c2 = (c2&0x7f) - 0x21;
3375 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3376 p = x0212_to_utf8_2bytes[c2];
3382 c2 = (c2&0x7f) - 0x21;
3383 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3384 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3389 c1 = (c1 & 0x7f) - 0x21;
3390 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3395 void w_oconv(int c2, int c1)
3404 if (unicode_bom_f==2) {
3411 #ifdef NUMCHAR_OPTION
3412 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3413 w16w_conv(c1, &c2, &c1, &c0);
3417 if (c0) (*o_putc)(c0);
3424 output_mode = ASCII;
3426 } else if (c2 == ISO8859_1) {
3427 output_mode = ISO8859_1;
3428 (*o_putc)(c1 | 0x080);
3431 val = e2w_conv(c2, c1);
3433 w16w_conv(val, &c2, &c1, &c0);
3437 if (c0) (*o_putc)(c0);
3443 void w_oconv16(int c2, int c1)
3450 if (unicode_bom_f==2) {
3452 (*o_putc)((unsigned char)'\377');
3456 (*o_putc)((unsigned char)'\377');
3461 if (c2 == ISO8859_1) {
3464 #ifdef NUMCHAR_OPTION
3465 } else if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16) {
3466 c2 = (c1 >> 8) & 0xff;
3470 int val = e2w_conv(c2, c1);
3471 c2 = (val >> 8) & 0xff;
3485 void e_oconv(int c2, int c1)
3487 #ifdef NUMCHAR_OPTION
3488 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3489 w16e_conv(c1, &c2, &c1);
3490 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3491 if(encode_fallback)(*encode_fallback)(c1);
3499 } else if (c2 == 0) {
3500 output_mode = ASCII;
3502 } else if (c2 == X0201) {
3503 output_mode = JAPANESE_EUC;
3504 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3505 } else if (c2 == ISO8859_1) {
3506 output_mode = ISO8859_1;
3507 (*o_putc)(c1 | 0x080);
3509 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3510 output_mode = JAPANESE_EUC;
3511 #ifdef SHIFTJIS_CP932
3514 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3515 s2e_conv(s2, s1, &c2, &c1);
3520 output_mode = ASCII;
3522 }else if ((c2 & 0xff00) >> 8 == 0x8f){
3525 (*o_putc)((c2 & 0x7f) | 0x080);
3526 (*o_putc)(c1 | 0x080);
3529 (*o_putc)((c2 & 0x7f) | 0x080);
3530 (*o_putc)(c1 | 0x080);
3534 if ((c1<0x21 || 0x7e<c1) ||
3535 (c2<0x21 || 0x7e<c2)) {
3536 set_iconv(FALSE, 0);
3537 return; /* too late to rescue this char */
3539 output_mode = JAPANESE_EUC;
3540 (*o_putc)(c2 | 0x080);
3541 (*o_putc)(c1 | 0x080);
3546 int x0212_shift(int c)
3550 if ((ret & 0xff00) == 0x8f00){
3551 if (0x75 <= c && c <= 0x7f){
3552 ret = c + (0x109 - 0x75);
3555 if (0x75 <= c && c <= 0x7f){
3556 ret = c + (0x113 - 0x75);
3563 int x0212_unshift(int c)
3566 if (0x7f <= c && c <= 0x88){
3567 ret = c + (0x75 - 0x7f);
3568 }else if (0x89 <= c && c <= 0x92){
3569 ret = (0x8f << 8) | 0x80 | (c + (0x75 - 0x89));
3573 #endif /* X0212_ENABLE */
3575 int e2s_conv(int c2, int c1, int *p2, int *p1)
3578 if ((c2 & 0xff00) == 0x8f00){
3581 if((0x21 <= ndx && ndx <= 0x2F)){
3582 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3583 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3585 }else if(0x6E <= ndx && ndx <= 0x7E){
3586 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3587 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3593 else if(0x21 <= ndx && ndx <= 0x7e){
3595 const unsigned short *ptr;
3596 extern const unsigned short *const x0212_shiftjis[];
3597 ptr = x0212_shiftjis[ndx - 0x21];
3599 val = ptr[(c1 & 0x7f) - 0x21];
3608 c2 = x0212_shift(c2);
3610 #endif /* X0212_ENABLE */
3612 if(0x7F < c2) return 1;
3613 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3614 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3618 void s_oconv(int c2, int c1)
3620 #ifdef NUMCHAR_OPTION
3621 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3622 w16e_conv(c1, &c2, &c1);
3623 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3624 if(encode_fallback)(*encode_fallback)(c1);
3632 } else if (c2 == 0) {
3633 output_mode = ASCII;
3635 } else if (c2 == X0201) {
3636 output_mode = SHIFT_JIS;
3638 } else if (c2 == ISO8859_1) {
3639 output_mode = ISO8859_1;
3640 (*o_putc)(c1 | 0x080);
3642 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3643 output_mode = SHIFT_JIS;
3644 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3650 if ((c1<0x20 || 0x7e<c1) ||
3651 (c2<0x20 || 0x7e<c2)) {
3652 set_iconv(FALSE, 0);
3653 return; /* too late to rescue this char */
3655 output_mode = SHIFT_JIS;
3656 e2s_conv(c2, c1, &c2, &c1);
3658 #ifdef SHIFTJIS_CP932
3660 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3661 extern const unsigned short cp932inv[2][189];
3662 int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3668 #endif /* SHIFTJIS_CP932 */
3671 if (prefix_table[(unsigned char)c1]){
3672 (*o_putc)(prefix_table[(unsigned char)c1]);
3678 void j_oconv(int c2, int c1)
3680 #ifdef NUMCHAR_OPTION
3681 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3682 w16e_conv(c1, &c2, &c1);
3683 if (c2 == 0 && (c1 & CLASS_MASK) == CLASS_UTF16){
3684 if(encode_fallback)(*encode_fallback)(c1);
3690 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3693 (*o_putc)(ascii_intro);
3694 output_mode = ASCII;
3698 } else if ((c2 & 0xff00) >> 8 == 0x8f){
3700 if(output_mode!=X0213_2){
3701 output_mode = X0213_2;
3705 (*o_putc)(X0213_2&0x7F);
3708 if(output_mode!=X0212){
3709 output_mode = X0212;
3713 (*o_putc)(X0212&0x7F);
3716 (*o_putc)(c2 & 0x7f);
3719 } else if (c2==X0201) {
3720 if (output_mode!=X0201) {
3721 output_mode = X0201;
3727 } else if (c2==ISO8859_1) {
3728 /* iso8859 introduction, or 8th bit on */
3729 /* Can we convert in 7bit form using ESC-'-'-A ?
3731 output_mode = ISO8859_1;
3733 } else if (c2 == 0) {
3734 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
3737 (*o_putc)(ascii_intro);
3738 output_mode = ASCII;
3742 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
3744 if (output_mode!=X0213_1) {
3745 output_mode = X0213_1;
3749 (*o_putc)(X0213_1&0x7F);
3751 }else if (output_mode != X0208) {
3752 output_mode = X0208;
3755 (*o_putc)(kanji_intro);
/*
 * base64_conv: output-chain stage used for MIME encoding.  Calls
 * mime_prechar() for the character first, then forwards the unchanged
 * (c2, c1) pair to the next stage, o_base64conv.
 */
3762 void base64_conv(int c2, int c1)
3764 mime_prechar(c2, c1);
3765 (*o_base64conv)(c2,c1);
3769 static int broken_buf[3];
3770 static int broken_counter = 0;
3771 static int broken_last = 0;
3772 int broken_getc(FILE *f)
3776 if (broken_counter>0) {
3777 return broken_buf[--broken_counter];
3780 if (c=='$' && broken_last != ESC
3781 && (input_mode==ASCII || input_mode==X0201)) {
3784 if (c1=='@'|| c1=='B') {
3785 broken_buf[0]=c1; broken_buf[1]=c;
3792 } else if (c=='(' && broken_last != ESC
3793 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
3796 if (c1=='J'|| c1=='B') {
3797 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push c back so that broken_getc() returns it next.
 * The character is stored in broken_buf only while fewer than two
 * characters are pending; in the visible code a third push-back is
 * silently dropped.
 */
3810 int broken_ungetc(int c, FILE *f)
3812 if (broken_counter<2)
3813 broken_buf[broken_counter++]=c;
3817 static int prev_cr = 0;
/*
 * cr_conv: line-ending conversion stage; output goes to o_crconv.
 * For a '\n' input, a '\r' is written when crmode_f is CRLF or CR.
 * The last branch treats '\032' specially when crmode_f is NL.
 * NOTE(review): the handling of prev_cr and the remaining output calls
 * are on lines not visible here.
 */
3819 void cr_conv(int c2, int c1)
3823 if (! (c2==0&&c1==NL) ) {
3829 } else if (c1=='\r') {
3831 } else if (c1=='\n') {
3832 if (crmode_f==CRLF) {
3833 (*o_crconv)(0,'\r');
3834 } else if (crmode_f==CR) {
3835 (*o_crconv)(0,'\r');
3839 } else if (c1!='\032' || crmode_f!=NL){
3845 Return value of fold_conv()
3847 \n add newline and output char
3848 \r add newline and output nothing
3851 1 (or else) normal output
3853 fold state in prev (previous character)
3855 >0x80 Japanese (X0208/X0201)
3860 This fold algorithm does not preserve leading spaces in a line.
3861 This is the main difference from fmt.
/* Width a character adds to the fold line counter: 2 when c2 is non-zero
 * (two-byte character), otherwise 1.  c1 is accepted but not used.
 * The argument is parenthesized so expression arguments expand safely. */
#define char_size(c2,c1) ((c2)?2:1)
3866 void fold_conv(int c2, int c1)
3871 if (c1== '\r' && !fold_preserve_f) {
3872 fold_state=0; /* ignore cr */
3873 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
3875 fold_state=0; /* ignore cr */
3876 } else if (c1== BS) {
3877 if (f_line>0) f_line--;
3879 } else if (c2==EOF && f_line != 0) { /* close open last line */
3881 } else if ((c1=='\n' && !fold_preserve_f)
3882 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
3883 && fold_preserve_f)) {
3885 if (fold_preserve_f) {
3889 } else if ((f_prev == c1 && !fold_preserve_f)
3890 || (f_prev == '\n' && fold_preserve_f)
3891 ) { /* duplicate newline */
3894 fold_state = '\n'; /* output two newline */
3900 if (f_prev&0x80) { /* Japanese? */
3902 fold_state = 0; /* ignore given single newline */
3903 } else if (f_prev==' ') {
3907 if (++f_line<=fold_len)
3911 fold_state = '\r'; /* fold and output nothing */
3915 } else if (c1=='\f') {
3918 fold_state = '\n'; /* output newline and clear */
3919 } else if ( (c2==0 && c1==' ')||
3920 (c2==0 && c1=='\t')||
3921 (c2=='!'&& c1=='!')) {
3922 /* X0208 kankaku or ascii space */
3923 if (f_prev == ' ') {
3924 fold_state = 0; /* remove duplicate spaces */
3927 if (++f_line<=fold_len)
3928 fold_state = ' '; /* output ASCII space only */
3930 f_prev = ' '; f_line = 0;
3931 fold_state = '\r'; /* fold and output nothing */
3935 prev0 = f_prev; /* we still need this one... , but almost done */
3937 if (c2 || c2==X0201)
3938 f_prev |= 0x80; /* this is Japanese */
3939 f_line += char_size(c2,c1);
3940 if (f_line<=fold_len) { /* normal case */
3943 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3944 f_line = char_size(c2,c1);
3945 fold_state = '\n'; /* We can't wait, do fold now */
3946 } else if (c2==X0201) {
3947 /* simple kinsoku rules return 1 means no folding */
3948 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3949 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3950 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3951 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3952 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3953 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3954 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3956 fold_state = '\n';/* add one new f_line before this character */
3959 fold_state = '\n';/* add one new f_line before this character */
3962 /* kinsoku point in ASCII */
3963 if ( c1==')'|| /* { [ ( */
3974 /* just after special */
3975 } else if (!is_alnum(prev0)) {
3976 f_line = char_size(c2,c1);
3978 } else if ((prev0==' ') || /* ignored new f_line */
3979 (prev0=='\n')|| /* ignored new f_line */
3980 (prev0&0x80)) { /* X0208 - ASCII */
3981 f_line = char_size(c2,c1);
3982 fold_state = '\n';/* add one new f_line before this character */
3984 fold_state = 1; /* default no fold in ASCII */
3988 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3989 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3990 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3991 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3992 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3993 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3994 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3995 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3996 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3997 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3998 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3999 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4000 /* default no fold in kinsoku */
4003 f_line = char_size(c2,c1);
4004 /* add one new f_line before this character */
4007 f_line = char_size(c2,c1);
4009 /* add one new f_line before this character */
4014 /* terminator process */
4015 switch(fold_state) {
4034 int z_prev2=0,z_prev1=0;
/*
 * z_conv: character normalisation stage; results are sent to o_zconv.
 *  - With x0201_f set, an X0201 kana is held in z_prev1/z_prev2 so that a
 *    following dakuten (0xde) or handakuten (0xdf) can be combined with
 *    it via the dv[] / ev[] tables; otherwise the cv[] pair is written.
 *  - With alpha_f set, JIS X 0208 rows 0x23 (alphabet) and 0x21 (symbols)
 *    are handled; the alpha_f&0x4 case replaces > < " & by HTML entities.
 * The entity strings must be the escaped forms: the raw characters would
 * make the replacement a no-op, and a bare double quote is not a valid
 * string literal.
 */
4036 void z_conv(int c2, int c1)
4039 /* if (c2) c1 &= 0x7f; assertion */
4041 if (x0201_f && z_prev2==X0201) { /* X0201 */
4042 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark) */
4044 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4046 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) */
4048 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4052 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4061 if (x0201_f && c2==X0201) {
4062 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4063 /* wait for dakuten or handakuten */
4064 z_prev1 = c1; z_prev2 = c2;
4067 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4072 /* JISX0208 Alphabet */
4073 if (alpha_f && c2 == 0x23 ) {
4075 } else if (alpha_f && c2 == 0x21 ) {
4076 /* JISX0208 Kigou */
4081 } else if (alpha_f&0x4) {
4086 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4092 case '>': entity = "&gt;"; break;
4093 case '<': entity = "&lt;"; break;
4094 case '\"': entity = "&quot;"; break;
4095 case '&': entity = "&amp;"; break;
4098 while (*entity) (*o_zconv)(0, *entity++);
4108 #define rot13(c) ( \
4110 (c <= 'M') ? (c + 13): \
4111 (c <= 'Z') ? (c - 13): \
4113 (c <= 'm') ? (c + 13): \
4114 (c <= 'z') ? (c - 13): \
4118 #define rot47(c) ( \
4120 ( c <= 'O' ) ? (c + 47) : \
4121 ( c <= '~' ) ? (c - 47) : \
4125 void rot_conv(int c2, int c1)
4127 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4133 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: kana conversion stage; the (possibly modified) pair is
 * passed on to o_hira_conv.
 * Bit 1 of hira_f selects characters in row 0x25 and bit 2 characters in
 * row 0x24, in both cases only for 0x20 < c1 < 0x74.
 * NOTE(review): presumably these are the katakana and hiragana rows and
 * the hidden lines swap c2 between them -- confirm.
 */
4136 void hira_conv(int c2, int c1)
4138 if ((hira_f & 1) && c2==0x25 && 0x20<c1 && c1<0x74) {
4140 } else if ((hira_f & 2) && c2==0x24 && 0x20<c1 && c1<0x74) {
4143 (*o_hira_conv)(c2,c1);
4147 void iso2022jp_check_conv(int c2, int c1)
4149 static const int range[RANGE_NUM_MAX][2] = {
4172 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4176 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4181 for (i = 0; i < RANGE_NUM_MAX; i++) {
4182 start = range[i][0];
4185 if (c >= start && c <= end) {
4190 (*o_iso2022jp_check_conv)(c2,c1);
4194 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4196 const unsigned char *mime_pattern[] = {
4197 (const unsigned char *)"\075?EUC-JP?B?",
4198 (const unsigned char *)"\075?SHIFT_JIS?B?",
4199 (const unsigned char *)"\075?ISO-8859-1?Q?",
4200 (const unsigned char *)"\075?ISO-8859-1?B?",
4201 (const unsigned char *)"\075?ISO-2022-JP?B?",
4202 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4203 #if defined(UTF8_INPUT_ENABLE)
4204 (const unsigned char *)"\075?UTF-8?B?",
4205 (const unsigned char *)"\075?UTF-8?Q?",
4207 (const unsigned char *)"\075?US-ASCII?Q?",
4212 /* Marker used to raise the priority of the matching input code */
4213 int (*mime_priority_func[])(int c2, int c1, int c0) = {
4214 e_iconv, s_iconv, 0, 0, 0, 0,
4215 #if defined(UTF8_INPUT_ENABLE)
4221 const int mime_encode[] = {
4222 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4223 #if defined(UTF8_INPUT_ENABLE)
4230 const int mime_encode_method[] = {
4231 'B', 'B','Q', 'B', 'B', 'Q',
4232 #if defined(UTF8_INPUT_ENABLE)
4240 #define MAXRECOVER 20
/*
 * switch_mime_getc: install the MIME-decoding reader.
 * Unless mime_getc is already the active i_getc, the current
 * i_getc/i_ungetc pair is saved in i_mgetc/i_mungetc and replaced by
 * mime_getc/mime_ungetc.  In STRICT_MIME mode a second layer is added:
 * the saved pair moves to i_mgetc_buf/i_mungetc_buf and
 * mime_getc_buf/mime_ungetc_buf take its place.
 */
4242 void switch_mime_getc(void)
4244 if (i_getc!=mime_getc) {
4245 i_mgetc = i_getc; i_getc = mime_getc;
4246 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4247 if(mime_f==STRICT_MIME) {
4248 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4249 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc().
 * In STRICT_MIME mode the buffered layer is removed first by restoring
 * i_mgetc/i_mungetc from their *_buf copies; i_ungetc is then restored
 * from i_mungetc.  If an input converter was saved in mime_iconv_back it
 * is re-installed with set_iconv(), and the saved pointer is cleared.
 */
4254 void unswitch_mime_getc(void)
4256 if(mime_f==STRICT_MIME) {
4257 i_mgetc = i_mgetc_buf;
4258 i_mungetc = i_mungetc_buf;
4261 i_ungetc = i_mungetc;
4262 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4263 mime_iconv_back = NULL;
/*
 * mime_begin_strict: called after "=?" has been seen.
 * Reads further characters with i_getc, upper-cases them and compares
 * them with the entries of mime_pattern[].  On a mismatch it searches the
 * following patterns for one that shares the prefix read so far and also
 * matches the current character.  Characters read are kept in r[] so the
 * text can be recovered when no pattern matches.
 * On a match the decode mode is taken from the pattern (p[i-2]), the
 * current iconv is saved in mime_iconv_back, mime_priority_func[j] is
 * installed, and the result of mime_integrity() is returned.
 */
4266 int mime_begin_strict(FILE *f)
4270 const unsigned char *p,*q;
4271 int r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4273 mime_decode_mode = FALSE;
4274 /* =? has been checked */
4276 p = mime_pattern[j];
4279 for(i=2;p[i]>' ';i++) { /* start at =? */
4280 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4281 /* pattern fails, try next one */
4283 while (mime_pattern[++j]) {
4284 p = mime_pattern[j];
4285 for(k=2;k<i;k++) /* assume length(p) > i */
4286 if (p[k]!=q[k]) break;
4287 if (k==i && nkf_toupper(c1)==p[k]) break;
4289 p = mime_pattern[j];
4290 if (p) continue; /* found next one, continue */
4291 /* all fails, output from recovery buffer */
4299 mime_decode_mode = p[i-2];
4301 mime_iconv_back = iconv;
4302 set_iconv(FALSE, mime_priority_func[j]);
4303 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4305 if (mime_decode_mode=='B') {
4306 mimebuf_f = unbuf_f;
4308 /* do MIME integrity check */
4309 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: next raw character for the MIME decoder.
 * When mimebuf_f is non-zero the character comes from the underlying
 * reader i_mgetc_buf; otherwise it is taken from the Fifo at mime_input,
 * which is then advanced.
 */
4317 int mime_getc_buf(FILE *f)
4319 /* we don't keep eof of Fifo, because it contains ?= as
4320 a terminator. It was checked in mime_integrity. */
4321 return ((mimebuf_f)?
4322 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf: counterpart of mime_getc_buf().  Pushes c back either
 * to the underlying reader (i_mungetc_buf) or into the Fifo by stepping
 * mime_input back one slot.
 * NOTE(review): the condition choosing between the two is not visible
 * here; presumably it is mimebuf_f, as in mime_getc_buf.
 */
4325 int mime_ungetc_buf(int c, FILE *f)
4328 (*i_mungetc_buf)(c,f);
4330 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict MIME start, entered after "=?" was consumed.
 * The "=?" is re-created in the Fifo, then up to MAXRECOVER characters
 * are read with i_getc and appended to the Fifo while looking for the
 * encoding letter (B/b or Q/q), which sets mime_decode_mode.
 * If no encoding is recognised, mime_decode_mode is set to 1 so the
 * buffered text is replayed from the Fifo without decoding.
 * Returns the last character read; it is used only to detect EOF.
 */
4334 int mime_begin(FILE *f)
4339 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4340 /* re-read and convert again from mime_buffer. */
4342 /* =? has been checked */
4344 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4345 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4346 /* We accept any character type even if it is broken by new lines */
4347 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4348 if (c1=='\n'||c1==' '||c1=='\r'||
4349 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4351 /* Failed. But this could be another MIME preamble */
4359 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4360 if (!(++i<MAXRECOVER) || c1==EOF) break;
4361 if (c1=='b'||c1=='B') {
4362 mime_decode_mode = 'B';
4363 } else if (c1=='q'||c1=='Q') {
4364 mime_decode_mode = 'Q';
4368 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4369 if (!(++i<MAXRECOVER) || c1==EOF) break;
4371 mime_decode_mode = FALSE;
4377 if (!mime_decode_mode) {
4378 /* false MIME preamble, restart from mime_buffer */
4379 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4380 /* Since we are in MIME mode until buffer becomes empty, */
4381 /* we never go into mime_begin again for a while. */
4384 /* discard mime preamble, and goto MIME mode */
4386 /* do no MIME integrity check */
4387 return c1; /* used only for checking EOF */
/*
 * debug: write str followed by a newline to stderr.
 * NOTE(review): any guard around the fprintf (e.g. a debug flag) is on
 * lines not visible here.
 */
4396 void debug(const char *str)
4399 fprintf(stderr, "%s\n", str);
/*
 * set_input_codename: record the name of the detected input encoding.
 * If the new name is non-empty and differs from the one stored so far
 * (subject to a first condition not visible here), the input is flagged
 * as mixed.  The pointer itself is stored, not a copy, and
 * is_inputcode_set is raised.
 */
4404 void set_input_codename(char *codename)
4408 strcmp(codename, "") != 0 &&
4409 strcmp(codename, input_codename) != 0)
4411 is_inputcode_mixed = TRUE;
4413 input_codename = codename;
4414 is_inputcode_set = TRUE;
4417 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code: print the guessed input encoding on stdout as
 * "<filename>:<codename>" (the "filename:" part is omitted when filename
 * is NULL), followed by a newline.
 * The name defaults to "BINARY"; when the input was not flagged as mixed
 * it is taken from input_codename, with a special case for an empty name
 * whose replacement value is on a line not visible here.
 */
4418 void print_guessed_code(char *filename)
4420 char *codename = "BINARY";
4421 if (!is_inputcode_mixed) {
4422 if (strcmp(input_codename, "") == 0) {
4425 codename = input_codename;
4428 if (filename != NULL) printf("%s:", filename);
4429 printf("%s\n", codename);
/*
 * hex_getc: shared reader for two-hex-digit escapes.
 *   ch - escape introducer (':' from cap_getc, '%' from url_getc)
 *   f  - input stream
 *   g  - underlying getc
 *   u  - underlying ungetc
 * Both characters after the introducer (c2, c3) must be hex digits; the
 * decoded byte (hex2bin(c2) << 4) | hex2bin(c3) is then returned.
 * NOTE(review): the fall-back paths taken when a digit check fails are on
 * lines not visible here.
 */
4435 int hex_getc(int ch, FILE *f, int (*g)(FILE *f), int (*u)(int c, FILE *f))
4443 if (!nkf_isxdigit(c2)){
4448 if (!nkf_isxdigit(c3)){
4453 return (hex2bin(c2) << 4) | hex2bin(c3);
/* cap_getc: read one character, decoding ":XX" hex escapes via hex_getc()
 * on top of the i_cgetc / i_cungetc reader pair. */
4456 int cap_getc(FILE *f)
4458 return hex_getc(':', f, i_cgetc, i_cungetc);
4461 int cap_ungetc(int c, FILE *f)
4463 return (*i_cungetc)(c, f);
/* url_getc: read one character, decoding "%XX" hex escapes via hex_getc()
 * on top of the i_ugetc / i_uungetc reader pair. */
4466 int url_getc(FILE *f)
4468 return hex_getc('%', f, i_ugetc, i_uungetc);
4471 int url_ungetc(int c, FILE *f)
4473 return (*i_uungetc)(c, f);
4477 #ifdef NUMCHAR_OPTION
4478 int numchar_getc(FILE *f)
4480 int (*g)(FILE *) = i_ngetc;
4481 int (*u)(int c ,FILE *f) = i_nungetc;
4492 if (buf[i] == 'x' || buf[i] == 'X'){
4493 for (j = 0; j < 5; j++){
4495 if (!nkf_isxdigit(buf[i])){
4502 c |= hex2bin(buf[i]);
4505 for (j = 0; j < 6; j++){
4509 if (!nkf_isdigit(buf[i])){
4516 c += hex2bin(buf[i]);
4522 return CLASS_UTF16 | c;
4531 int numchar_ungetc(int c, FILE *f)
4533 return (*i_nungetc)(c, f);
4537 #ifdef UNICODE_NORMALIZATION
4539 /* Normalization Form C */
4540 int nfc_getc(FILE *f)
4542 int (*g)(FILE *f) = i_nfc_getc;
4543 int (*u)(int c ,FILE *f) = i_nfc_ungetc;
4544 int i=0, j, k=1, lower, upper;
4547 extern const struct normalization_pair normalization_table[];
4550 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4551 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4552 while (upper >= lower) {
4553 j = (lower+upper) / 2;
4554 array = normalization_table[j].nfd;
4555 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4556 if (array[k] != buf[k]){
4557 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4564 array = normalization_table[j].nfc;
4565 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4577 int nfc_ungetc(int c, FILE *f)
4579 return (*i_nfc_ungetc)(c, f);
4581 #endif /* UNICODE_NORMALIZATION */
4587 int c1, c2, c3, c4, cc;
4588 int t1, t2, t3, t4, mode, exit_mode;
4592 int lwsp_size = 128;
4594 if (mime_top != mime_last) { /* Something is in FIFO */
4595 return Fifo(mime_top++);
4597 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4598 mime_decode_mode=FALSE;
4599 unswitch_mime_getc();
4600 return (*i_getc)(f);
4603 if (mimebuf_f == FIXED_MIME)
4604 exit_mode = mime_decode_mode;
4607 if (mime_decode_mode == 'Q') {
4608 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4610 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
4611 if (c1<=' ' || DEL<=c1) {
4612 mime_decode_mode = exit_mode; /* prepare for quit */
4615 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4619 mime_decode_mode = exit_mode; /* prepare for quit */
4620 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4621 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4622 /* end Q encoding */
4623 input_mode = exit_mode;
4625 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4626 if (lwsp_buf==NULL) {
4627 perror("can't malloc");
4630 while ((c1=(*i_getc)(f))!=EOF) {
4635 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4643 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
4644 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4659 lwsp_buf[lwsp_count] = (unsigned char)c1;
4660 if (lwsp_count++>lwsp_size){
4662 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4663 if (lwsp_buf_new==NULL) {
4665 perror("can't realloc");
4668 lwsp_buf = lwsp_buf_new;
4674 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
4676 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4677 i_ungetc(lwsp_buf[lwsp_count],f);
/* "=" followed by a control character is a quoted-printable soft line
 * break: skip the blanks/newlines that follow and keep decoding. */
4683 if (c1=='='&&c2<' ') { /* this is soft wrap */
4684 while((c1 = (*i_mgetc)(f)) <=' ') {
/* Test the character the loop condition just read.  Reading again here
 * consumed, and lost, every second character of the skipped run. */
4685 if (c1 == EOF) return (EOF);
4687 mime_decode_mode = 'Q'; /* still in MIME */
4688 goto restart_mime_q;
4691 mime_decode_mode = 'Q'; /* still in MIME */
4695 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4696 if (c2<=' ') return c2;
4697 mime_decode_mode = 'Q'; /* still in MIME */
4698 #define hex(c) (('0'<=c&&c<='9')?(c-'0'):\
4699 ('A'<=c&&c<='F')?(c-'A'+10):('a'<=c&&c<='f')?(c-'a'+10):0)
4700 return ((hex(c2)<<4) + hex(c3));
4703 if (mime_decode_mode != 'B') {
4704 mime_decode_mode = FALSE;
4705 return (*i_mgetc)(f);
4709 /* Base64 encoding */
4711 MIME allows line break in the middle of
4712 Base64, but we are very pessimistic in decoding
4713 in unbuf mode because MIME encoded code may broken by
4714 less or editor's control sequence (such as ESC-[-K in unbuffered
4715 mode. ignore incomplete MIME.
4717 mode = mime_decode_mode;
4718 mime_decode_mode = exit_mode; /* prepare for quit */
4720 while ((c1 = (*i_mgetc)(f))<=' ') {
4725 if ((c2 = (*i_mgetc)(f))<=' ') {
4728 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4729 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4732 if ((c1 == '?') && (c2 == '=')) {
4735 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4736 if (lwsp_buf==NULL) {
4737 perror("can't malloc");
4740 while ((c1=(*i_getc)(f))!=EOF) {
4745 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4753 if ((c1=(*i_getc)(f))!=EOF) {
4757 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
4772 lwsp_buf[lwsp_count] = (unsigned char)c1;
4773 if (lwsp_count++>lwsp_size){
4775 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4776 if (lwsp_buf_new==NULL) {
4778 perror("can't realloc");
4781 lwsp_buf = lwsp_buf_new;
4787 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
4789 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4790 i_ungetc(lwsp_buf[lwsp_count],f);
4797 if ((c3 = (*i_mgetc)(f))<=' ') {
4800 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4801 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4805 if ((c4 = (*i_mgetc)(f))<=' ') {
4808 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4809 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4813 mime_decode_mode = mode; /* still in MIME sigh... */
4815 /* BASE 64 decoding */
4817 t1 = 0x3f & base64decode(c1);
4818 t2 = 0x3f & base64decode(c2);
4819 t3 = 0x3f & base64decode(c3);
4820 t4 = 0x3f & base64decode(c4);
4821 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4823 Fifo(mime_last++) = (unsigned char)cc;
4824 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4826 Fifo(mime_last++) = (unsigned char)cc;
4827 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4829 Fifo(mime_last++) = (unsigned char)cc;
4834 return Fifo(mime_top++);
/* mime_ungetc: push c back into the MIME Fifo by stepping mime_top back
 * one slot and storing the character there (truncated to a byte). */
4837 int mime_ungetc(int c, FILE *f)
4839 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: pre-read a MIME encoded word into the Fifo.
 *   f - input stream
 *   p - the header pattern that was matched; it is copied into the Fifo
 *       first, starting at mime_top
 * Characters are then read with i_getc and appended until the buffer is
 * full ((mime_input - mime_top) & MIME_BUF_MASK reaches 0), a "?="
 * terminator is seen, or a character outside the accepted set
 * (+ / = ? and alphanumerics) appears.
 * If the word is incomplete, mime_last is moved to the end of the
 * buffered text, mime_decode_mode is set to 1 (replay without decoding)
 * and switch_mime_getc() is called so the text is read back from the
 * Fifo.
 */
4843 int mime_integrity(FILE *f, const unsigned char *p)
4847 /* In buffered mode, read until =? or NL or buffer full
4849 mime_input = mime_top;
4850 mime_last = mime_top;
4852 while(*p) Fifo(mime_input++) = *p++;
4855 while((c=(*i_getc)(f))!=EOF) {
4856 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
4857 break; /* buffer full */
4859 if (c=='=' && d=='?') {
4860 /* checked. skip header, start decode */
4861 Fifo(mime_input++) = (unsigned char)c;
4862 /* mime_last_input = mime_input; */
4867 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4869 /* Should we check length mod 4? */
4870 Fifo(mime_input++) = (unsigned char)c;
4873 /* In case of Incomplete MIME, no MIME decode */
4874 Fifo(mime_input++) = (unsigned char)c;
4875 mime_last = mime_input; /* point undecoded buffer */
4876 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
4877 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one Base64 alphabet character to its 6-bit value.
 * The constants are written as character literals and rely on ASCII:
 *   'a' - 'G' == 26,  '4' == 52,  '>' == 62,  '?' == 63.
 * NOTE(review): the range tests selecting the first two branches are on
 * lines not visible here.
 */
4881 int base64decode(int c)
4886 i = c - 'A'; /* A..Z 0-25 */
4888 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4890 } else if (c > '/') {
4891 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4892 } else if (c == '+') {
4893 i = '>' /* 62 */ ; /* + 62 */
4895 i = '?' /* 63 */ ; /* / 63 */
4900 static const char basis_64[] =
4901 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4904 #define MIMEOUT_BUF_LENGTH (60)
4905 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
4906 int mimeout_buf_count = 0;
4907 int mimeout_preserve_space = 0;
/* Convert a 4-bit value (0..15) to its upper-case hexadecimal digit.
 * The argument is parenthesized so that expressions such as
 * itoh4(x & 0xf) expand with the intended precedence. */
#define itoh4(c) ((c)>=10?(c)+'A'-10:(c)+'0')
4910 void open_mime(int mode)
4912 const unsigned char *p;
4915 p = mime_pattern[0];
4916 for(i=0;mime_encode[i];i++) {
4917 if (mode == mime_encode[i]) {
4918 p = mime_pattern[i];
4922 mimeout_mode = mime_encode_method[i];
4925 if (base64_count>45) {
4926 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
4927 (*o_mputc)(mimeout_buf[i]);
4933 if (!mimeout_preserve_space && mimeout_buf_count>0
4934 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4935 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
4939 if (!mimeout_preserve_space) {
4940 for (;i<mimeout_buf_count;i++) {
4941 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
4942 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
4943 (*o_mputc)(mimeout_buf[i]);
4950 mimeout_preserve_space = FALSE;
4956 j = mimeout_buf_count;
4957 mimeout_buf_count = 0;
4959 mime_putc(mimeout_buf[i]);
4963 void close_mime(void)
4973 switch(mimeout_mode) {
4978 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
4984 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
4990 if (mimeout_f!=FIXED_MIME) {
4992 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte c according to mimeout_mode and write
 * the result with o_mputc.
 *  - Hex path: a non-alphanumeric byte is written as two upper-case hex
 *    digits (high nibble first) using itoh4.
 *  - Base64 path: output symbols are taken from basis_64[]; bits left
 *    over from the previous byte are carried in b64c and combined with
 *    the high bits of c.
 * NOTE(review): the case labels and the state updates between these
 * writes are on lines not visible here.
 */
4997 void mimeout_addchar(int c)
4999 switch(mimeout_mode) {
5004 } else if(!nkf_isalnum(c)) {
5006 (*o_mputc)(itoh4(((c>>4)&0xf)));
5007 (*o_mputc)(itoh4((c&0xf)));
5016 (*o_mputc)(basis_64[c>>2]);
5021 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5027 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5028 (*o_mputc)(basis_64[c & 0x3F]);
5039 int mime_lastchar2, mime_lastchar1;
5041 void mime_prechar(int c2, int c1)
5045 if (base64_count + mimeout_buf_count/3*4> 66){
5046 (*o_base64conv)(EOF,0);
5047 (*o_base64conv)(0,NL);
5048 (*o_base64conv)(0,SPACE);
5050 }/*else if (mime_lastchar2){
5051 if (c1 <=DEL && !nkf_isspace(c1)){
5052 (*o_base64conv)(0,SPACE);
5056 if (c2 && mime_lastchar2 == 0
5057 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5058 (*o_base64conv)(0,SPACE);
5061 mime_lastchar2 = c2;
5062 mime_lastchar1 = c1;
5065 void mime_putc(int c)
5070 if (mimeout_f == FIXED_MIME){
5071 if (mimeout_mode == 'Q'){
5072 if (base64_count > 71){
5073 if (c!=CR && c!=NL) {
5080 if (base64_count > 71){
5085 if (c == EOF) { /* c==EOF */
5089 if (c != EOF) { /* c==EOF */
5095 /* mimeout_f != FIXED_MIME */
5097 if (c == EOF) { /* c==EOF */
5098 j = mimeout_buf_count;
5099 mimeout_buf_count = 0;
5103 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5106 mimeout_addchar(mimeout_buf[i]);
5110 mimeout_addchar(mimeout_buf[i]);
5114 mimeout_addchar(mimeout_buf[i]);
5120 if (mimeout_mode=='Q') {
5121 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5133 if (mimeout_buf_count > 0){
5134 lastchar = mimeout_buf[mimeout_buf_count - 1];
5139 if (!mimeout_mode) {
5140 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5141 if (nkf_isspace(c)) {
5142 if (c==CR || c==NL) {
5145 for (i=0;i<mimeout_buf_count;i++) {
5146 (*o_mputc)(mimeout_buf[i]);
5147 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5153 mimeout_buf[0] = (char)c;
5154 mimeout_buf_count = 1;
5156 if (base64_count > 1
5157 && base64_count + mimeout_buf_count > 76){
5160 if (!nkf_isspace(mimeout_buf[0])){
5165 mimeout_buf[mimeout_buf_count++] = (char)c;
5166 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5167 open_mime(output_mode);
5172 if (lastchar==CR || lastchar == NL){
5173 for (i=0;i<mimeout_buf_count;i++) {
5174 (*o_mputc)(mimeout_buf[i]);
5177 mimeout_buf_count = 0;
5179 if (lastchar==SPACE) {
5180 for (i=0;i<mimeout_buf_count-1;i++) {
5181 (*o_mputc)(mimeout_buf[i]);
5184 mimeout_buf[0] = SPACE;
5185 mimeout_buf_count = 1;
5187 open_mime(output_mode);
5190 /* mimeout_mode == 'B', 1, 2 */
5191 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5192 if (lastchar == CR || lastchar == NL){
5193 if (nkf_isblank(c)) {
5194 for (i=0;i<mimeout_buf_count;i++) {
5195 mimeout_addchar(mimeout_buf[i]);
5197 mimeout_buf_count = 0;
5198 } else if (SPACE<c && c<DEL) {
5200 for (i=0;i<mimeout_buf_count;i++) {
5201 (*o_mputc)(mimeout_buf[i]);
5204 mimeout_buf_count = 0;
5207 if (c==SPACE || c==TAB || c==CR || c==NL) {
5208 for (i=0;i<mimeout_buf_count;i++) {
5209 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5211 for (i=0;i<mimeout_buf_count;i++) {
5212 (*o_mputc)(mimeout_buf[i]);
5215 mimeout_buf_count = 0;
5218 mimeout_buf[mimeout_buf_count++] = (char)c;
5219 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5221 for (i=0;i<mimeout_buf_count;i++) {
5222 (*o_mputc)(mimeout_buf[i]);
5225 mimeout_buf_count = 0;
5229 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5230 mimeout_buf[mimeout_buf_count++] = (char)c;
5231 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5232 j = mimeout_buf_count;
5233 mimeout_buf_count = 0;
5235 mimeout_addchar(mimeout_buf[i]);
5242 if (mimeout_buf_count>0) {
5243 j = mimeout_buf_count;
5244 mimeout_buf_count = 0;
5246 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5248 mimeout_addchar(mimeout_buf[i]);
5254 (*o_mputc)(mimeout_buf[i]);
5256 open_mime(output_mode);
5263 #if defined(PERL_XS) || defined(WIN32DLL)
5267 struct input_code *p = input_code_list;
5280 mime_f = STRICT_MIME;
5281 mime_decode_f = FALSE;
5286 #if defined(MSDOS) || defined(__OS2__)
5291 iso2022jp_f = FALSE;
5292 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5293 ms_ucs_map_f = UCS_MAP_ASCII;
5295 #ifdef UTF8_INPUT_ENABLE
5296 no_cp932ext_f = FALSE;
5297 ignore_zwnbsp_f = TRUE;
5298 no_best_fit_chars_f = FALSE;
5299 encode_fallback = NULL;
5300 unicode_subchar = '?';
5302 #ifdef UTF8_OUTPUT_ENABLE
5306 #ifdef UNICODE_NORMALIZATION
5319 is_inputcode_mixed = FALSE;
5320 is_inputcode_set = FALSE;
5324 #ifdef SHIFTJIS_CP932
5334 for (i = 0; i < 256; i++){
5335 prefix_table[i] = 0;
5338 #ifdef UTF8_INPUT_ENABLE
5339 utf16_mode = UTF16BE_INPUT;
5341 mimeout_buf_count = 0;
5346 fold_preserve_f = FALSE;
5349 kanji_intro = DEFAULT_J;
5350 ascii_intro = DEFAULT_R;
5351 fold_margin = FOLD_MARGIN;
5352 output_conv = DEFAULT_CONV;
5353 oconv = DEFAULT_CONV;
5354 o_zconv = no_connection;
5355 o_fconv = no_connection;
5356 o_crconv = no_connection;
5357 o_rot_conv = no_connection;
5358 o_hira_conv = no_connection;
5359 o_base64conv = no_connection;
5360 o_iso2022jp_check_conv = no_connection;
5363 i_ungetc = std_ungetc;
5365 i_bungetc = std_ungetc;
5368 i_mungetc = std_ungetc;
5369 i_mgetc_buf = std_getc;
5370 i_mungetc_buf = std_ungetc;
5371 output_mode = ASCII;
5374 mime_decode_mode = FALSE;
5380 z_prev2=0,z_prev1=0;
5382 iconv_for_check = 0;
5384 input_codename = "";
/* no_connection: two-argument form of no_connection2(); forwards the pair
 * with c0 set to 0. */
5391 void no_connection(int c2, int c1)
5393 no_connection2(c2,c1,0);
/*
 * no_connection2: reports "nkf internal module connection failure." on
 * stderr.  The trailing return 0 is marked LINT, i.e. it exists to give
 * the function a return value.
 * NOTE(review): a line between the message and the return is not visible
 * here; presumably it terminates the program.
 */
5396 int no_connection2(int c2, int c1, int c0)
5398 fprintf(stderr,"nkf internal module connection failure.\n");
5400 return 0; /* LINT */
/*
 * From this point on, fprintf expands to dllprintf.
 * NOTE(review): the condition guarding this macro is not visible in this
 * listing -- confirm which builds it applies to.
 */
5405 #define fprintf dllprintf
/*
 * Command-line help text, written line by line to stderr.
 * Which "j,s,e,w" line is emitted depends on the DEFAULT_CODE_* macro that is
 * defined; the UTF-8 and numeric-character-reference lines are emitted only
 * when the corresponding *_ENABLE / NUMCHAR_OPTION macros are defined.
 * Spelling fixes in the emitted text: "Outout" -> "Output" and
 * "charactor" -> "character".
 */
5409 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5410 fprintf(stderr,"Flags:\n");
5411 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5412 #ifdef DEFAULT_CODE_SJIS
5413 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5415 #ifdef DEFAULT_CODE_JIS
5416 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5418 #ifdef DEFAULT_CODE_EUC
5419 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5421 #ifdef DEFAULT_CODE_UTF8
5422 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5424 #ifdef UTF8_OUTPUT_ENABLE
5425 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5427 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5428 #ifdef UTF8_INPUT_ENABLE
5429 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5431 fprintf(stderr,"t no conversion\n");
5432 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5433 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5434 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5435 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5436 fprintf(stderr,"v Show this usage. V: show version\n");
5437 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5438 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5439 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5440 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5441 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5442 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5443 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5444 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5446 fprintf(stderr,"T Text mode output\n");
5448 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5449 fprintf(stderr,"I Convert non ISO-2022-JP character to GETA\n");
5450 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5451 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5452 fprintf(stderr,"\n");
5453 fprintf(stderr,"Long name options\n");
5454 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5455 fprintf(stderr," Specify the input or output codeset\n");
5456 fprintf(stderr," --fj --unix --mac --windows\n");
5457 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5458 fprintf(stderr," Convert for the system or code\n");
5459 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5460 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5461 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5463 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5465 #ifdef NUMCHAR_OPTION
5466 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5468 #ifdef UTF8_INPUT_ENABLE
5469 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5470 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5473 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5474 fprintf(stderr," Overwrite original listed files by filtered result\n");
5475 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5477 fprintf(stderr," -g --guess Guess the input code\n");
5478 fprintf(stderr," --help --version Show this help/the version\n");
5479 fprintf(stderr," For more information, see also man nkf\n");
5480 fprintf(stderr,"\n");
/*
 * Version banner, written to stderr: NKF_VERSION and NKF_RELEASE_DATE fill
 * the two %s conversions, then the CopyRight string is printed with a
 * newline before and after it.
 * The #if lines select on MSDOS / __WIN16__ / __WIN32__.
 * NOTE(review): the string pieces those branches append to the format are
 * not visible in this listing -- confirm against the full source.
 */
5486 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5487 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__)
5490 #if defined(MSDOS) && defined(__WIN16__)
5493 #if defined(MSDOS) && defined(__WIN32__)
5499 ,NKF_VERSION,NKF_RELEASE_DATE);
5500 fprintf(stderr,"\n%s\n",CopyRight);
5505 ** Patch authors (パッチ制作者)
5506 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5507 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5508 ** ohta@src.ricoh.co.jp (Junn Ohta)
5509 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5510 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5511 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5512 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5513 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5514 ** GHG00637@nifty-serve.or.jp (COW)