1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.107 2006/09/15 07:23:20 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2006-09-15"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2006 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification helpers.
 *
 * Every macro argument is wrapped in parentheses so that an expression
 * argument (e.g. `c & 0x7f`) is classified as a whole instead of being
 * torn apart by operator precedence.  The arguments are still evaluated
 * more than once, so do not pass expressions with side effects.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c)  ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (' '<=(c) && (c)<='~')
#define nkf_isgraph(c)  ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; any other character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0 )
/* True when the byte above the low 8 bits of c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)

#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/* True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END]. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
355 #define UCS_MAP_ASCII 0
357 #define UCS_MAP_CP932 2
358 static int ms_ucs_map_f = UCS_MAP_ASCII;
360 #ifdef UTF8_INPUT_ENABLE
361 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
362 static int no_cp932ext_f = FALSE;
363 /* ignore ZERO WIDTH NO-BREAK SPACE */
364 static int no_best_fit_chars_f = FALSE;
365 static int input_endian = ENDIAN_BIG;
366 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
367 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
368 static void encode_fallback_html(nkf_char c);
369 static void encode_fallback_xml(nkf_char c);
370 static void encode_fallback_java(nkf_char c);
371 static void encode_fallback_perl(nkf_char c);
372 static void encode_fallback_subchar(nkf_char c);
373 static void (*encode_fallback)(nkf_char c) = NULL;
374 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
375 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
376 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
377 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
379 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
380 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
381 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
382 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
383 static void w_status(struct input_code *, nkf_char);
385 #ifdef UTF8_OUTPUT_ENABLE
386 static int output_bom_f = FALSE;
387 static int output_endian = ENDIAN_BIG;
388 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
389 static void w_oconv(nkf_char c2,nkf_char c1);
390 static void w_oconv16(nkf_char c2,nkf_char c1);
391 static void w_oconv32(nkf_char c2,nkf_char c1);
393 static void e_oconv(nkf_char c2,nkf_char c1);
394 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
395 static void s_oconv(nkf_char c2,nkf_char c1);
396 static void j_oconv(nkf_char c2,nkf_char c1);
397 static void fold_conv(nkf_char c2,nkf_char c1);
398 static void cr_conv(nkf_char c2,nkf_char c1);
399 static void z_conv(nkf_char c2,nkf_char c1);
400 static void rot_conv(nkf_char c2,nkf_char c1);
401 static void hira_conv(nkf_char c2,nkf_char c1);
402 static void base64_conv(nkf_char c2,nkf_char c1);
403 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
404 static void no_connection(nkf_char c2,nkf_char c1);
405 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
407 static void code_score(struct input_code *ptr);
408 static void code_status(nkf_char c);
410 static void std_putc(nkf_char c);
411 static nkf_char std_getc(FILE *f);
412 static nkf_char std_ungetc(nkf_char c,FILE *f);
414 static nkf_char broken_getc(FILE *f);
415 static nkf_char broken_ungetc(nkf_char c,FILE *f);
417 static nkf_char mime_begin(FILE *f);
418 static nkf_char mime_getc(FILE *f);
419 static nkf_char mime_ungetc(nkf_char c,FILE *f);
421 static void switch_mime_getc(void);
422 static void unswitch_mime_getc(void);
423 static nkf_char mime_begin_strict(FILE *f);
424 static nkf_char mime_getc_buf(FILE *f);
425 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
426 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
428 static nkf_char base64decode(nkf_char c);
429 static void mime_prechar(nkf_char c2, nkf_char c1);
430 static void mime_putc(nkf_char c);
431 static void open_mime(nkf_char c);
432 static void close_mime(void);
433 static void eof_mime(void);
434 static void mimeout_addchar(nkf_char c);
436 static void usage(void);
437 static void version(void);
439 static void options(unsigned char *c);
440 #if defined(PERL_XS) || defined(WIN32DLL)
441 static void reinit(void);
446 #if !defined(PERL_XS) && !defined(WIN32DLL)
447 static unsigned char stdibuf[IOBUF_SIZE];
448 static unsigned char stdobuf[IOBUF_SIZE];
450 static unsigned char hold_buf[HOLD_SIZE*2];
451 static int hold_count = 0;
453 /* MIME preprocessor fifo */
455 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
456 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
457 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
458 static unsigned char mime_buf[MIME_BUF_SIZE];
459 static unsigned int mime_top = 0;
460 static unsigned int mime_last = 0; /* decoded */
461 static unsigned int mime_input = 0; /* undecoded */
462 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
465 static int unbuf_f = FALSE;
466 static int estab_f = FALSE;
467 static int nop_f = FALSE;
468 static int binmode_f = TRUE; /* binary mode */
469 static int rot_f = FALSE; /* rot13/47 mode */
470 static int hira_f = FALSE; /* hiragana/katakana conversion */
471 static int input_f = FALSE; /* non fixed input code */
472 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
473 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
474 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
475 static int mimebuf_f = FALSE; /* MIME buffered input */
476 static int broken_f = FALSE; /* convert ESC-less broken JIS */
477 static int iso8859_f = FALSE; /* ISO8859 through */
478 static int mimeout_f = FALSE; /* base64 mode */
479 #if defined(MSDOS) || defined(__OS2__)
480 static int x0201_f = TRUE; /* Assume JISX0201 kana */
482 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
484 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
486 #ifdef UNICODE_NORMALIZATION
487 static int nfc_f = FALSE;
488 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
489 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
490 static nkf_char nfc_getc(FILE *f);
491 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
495 static int cap_f = FALSE;
496 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
497 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
498 static nkf_char cap_getc(FILE *f);
499 static nkf_char cap_ungetc(nkf_char c,FILE *f);
501 static int url_f = FALSE;
502 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
503 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
504 static nkf_char url_getc(FILE *f);
505 static nkf_char url_ungetc(nkf_char c,FILE *f);
508 #if defined(INT_IS_SHORT)
509 #define NKF_INT32_C(n) (n##L)
511 #define NKF_INT32_C(n) (n)
513 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
514 #define CLASS_MASK NKF_INT32_C(0xFF000000)
515 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
516 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
517 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
518 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
519 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
521 #ifdef NUMCHAR_OPTION
522 static int numchar_f = FALSE;
523 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
524 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
525 static nkf_char numchar_getc(FILE *f);
526 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
530 static int noout_f = FALSE;
531 static void no_putc(nkf_char c);
532 static nkf_char debug_f = FALSE;
533 static void debug(const char *str);
534 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
537 static int guess_f = FALSE;
539 static void print_guessed_code(char *filename);
541 static void set_input_codename(char *codename);
542 static int is_inputcode_mixed = FALSE;
543 static int is_inputcode_set = FALSE;
546 static int exec_f = 0;
549 #ifdef SHIFTJIS_CP932
550 /* invert IBM extended characters to others */
551 static int cp51932_f = TRUE;
553 /* invert NEC-selected IBM extended characters to IBM extended characters */
554 static int cp932inv_f = TRUE;
556 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
557 #endif /* SHIFTJIS_CP932 */
560 static int x0212_f = FALSE;
561 static nkf_char x0212_shift(nkf_char c);
562 static nkf_char x0212_unshift(nkf_char c);
564 static int x0213_f = FALSE;
566 static unsigned char prefix_table[256];
568 static void set_code_score(struct input_code *ptr, nkf_char score);
569 static void clr_code_score(struct input_code *ptr, nkf_char score);
570 static void status_disable(struct input_code *ptr);
571 static void status_push_ch(struct input_code *ptr, nkf_char c);
572 static void status_clear(struct input_code *ptr);
573 static void status_reset(struct input_code *ptr);
574 static void status_reinit(struct input_code *ptr);
575 static void status_check(struct input_code *ptr, nkf_char c);
576 static void e_status(struct input_code *, nkf_char);
577 static void s_status(struct input_code *, nkf_char);
579 struct input_code input_code_list[] = {
580 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
581 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
582 #ifdef UTF8_INPUT_ENABLE
583 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
588 static int mimeout_mode = 0;
589 static int base64_count = 0;
591 /* X0208 -> ASCII converter */
594 static int f_line = 0; /* chars in line */
595 static int f_prev = 0;
596 static int fold_preserve_f = FALSE; /* preserve new lines */
597 static int fold_f = FALSE;
598 static int fold_len = 0;
601 static unsigned char kanji_intro = DEFAULT_J;
602 static unsigned char ascii_intro = DEFAULT_R;
606 #define FOLD_MARGIN 10
607 #define DEFAULT_FOLD 60
609 static int fold_margin = FOLD_MARGIN;
613 #ifdef DEFAULT_CODE_JIS
614 # define DEFAULT_CONV j_oconv
616 #ifdef DEFAULT_CODE_SJIS
617 # define DEFAULT_CONV s_oconv
619 #ifdef DEFAULT_CODE_EUC
620 # define DEFAULT_CONV e_oconv
622 #ifdef DEFAULT_CODE_UTF8
623 # define DEFAULT_CONV w_oconv
626 /* process default */
627 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
629 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
630 /* s_iconv or oconv */
631 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
633 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
634 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
635 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
636 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
637 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
641 /* static redirections */
643 static void (*o_putc)(nkf_char c) = std_putc;
645 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
646 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
648 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
649 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
651 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
653 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
654 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
656 /* for strict mime */
657 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
658 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
661 static int output_mode = ASCII, /* output kanji mode */
662 input_mode = ASCII, /* input kanji mode */
663 shift_mode = FALSE; /* TRUE shift out, or X0201 */
664 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
666 /* X0201 / X0208 conversion tables */
668 /* X0201 kana conversion table */
671 unsigned char cv[]= {
672 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
673 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
674 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
675 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
676 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
677 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
678 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
679 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
680 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
681 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
682 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
683 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
684 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
685 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
686 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
687 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
691 /* X0201 kana conversion table for dakuten */
694 unsigned char dv[]= {
695 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
696 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
700 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
701 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
702 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
703 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
704 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
705 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
706 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
707 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
708 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
713 /* X0201 kana conversion table for handakuten */
716 unsigned char ev[]= {
717 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
718 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
719 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
720 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
728 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
730 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
732 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
736 /* X0208 kigou conversion table */
737 /* 0x8140 - 0x819e */
739 unsigned char fv[] = {
741 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
742 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
743 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
744 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
745 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
746 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
747 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
748 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
749 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
750 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
751 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
752 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
758 static int file_out_f = FALSE;
760 static int overwrite_f = FALSE;
761 static int preserve_time_f = FALSE;
762 static int backup_f = FALSE;
763 static char *backup_suffix = "";
764 static char *get_backup_filename(const char *suffix, const char *filename);
767 static int crmode_f = 0; /* CR, NL, CRLF */
768 #ifdef EASYWIN /*Easy Win */
769 static int end_check;
772 #define STD_GC_BUFSIZE (256)
773 nkf_char std_gc_buf[STD_GC_BUFSIZE];
777 #include "nkf32dll.c"
778 #elif defined(PERL_XS)
780 int main(int argc, char **argv)
785 char *outfname = NULL;
788 #ifdef EASYWIN /*Easy Win */
789 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
792 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
793 cp = (unsigned char *)*argv;
798 if (pipe(fds) < 0 || (pid = fork()) < 0){
809 execvp(argv[1], &argv[1]);
823 if(x0201_f == WISH_TRUE)
824 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
826 if (binmode_f == TRUE)
827 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
828 if (freopen("","wb",stdout) == NULL)
835 setbuf(stdout, (char *) NULL);
837 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
840 if (binmode_f == TRUE)
841 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
842 if (freopen("","rb",stdin) == NULL) return (-1);
846 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
850 kanji_convert(stdin);
851 if (guess_f) print_guessed_code(NULL);
856 is_inputcode_mixed = FALSE;
857 is_inputcode_set = FALSE;
862 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
871 /* reopen file for stdout */
872 if (file_out_f == TRUE) {
875 outfname = malloc(strlen(origfname)
876 + strlen(".nkftmpXXXXXX")
882 strcpy(outfname, origfname);
886 for (i = strlen(outfname); i; --i){
887 if (outfname[i - 1] == '/'
888 || outfname[i - 1] == '\\'){
894 strcat(outfname, "ntXXXXXX");
896 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
899 strcat(outfname, ".nkftmpXXXXXX");
900 fd = mkstemp(outfname);
903 || (fd_backup = dup(fileno(stdout))) < 0
904 || dup2(fd, fileno(stdout)) < 0
915 outfname = "nkf.out";
918 if(freopen(outfname, "w", stdout) == NULL) {
922 if (binmode_f == TRUE) {
923 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
924 if (freopen("","wb",stdout) == NULL)
931 if (binmode_f == TRUE)
932 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
933 if (freopen("","rb",fin) == NULL)
938 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
942 char *filename = NULL;
944 if (nfiles > 1) filename = origfname;
945 if (guess_f) print_guessed_code(filename);
951 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
959 if (dup2(fd_backup, fileno(stdout)) < 0){
962 if (stat(origfname, &sb)) {
963 fprintf(stderr, "Can't stat %s\n", origfname);
965 /* Restore the original file's permissions */
966 if (chmod(outfname, sb.st_mode)) {
967 fprintf(stderr, "Can't set permission %s\n", outfname);
970 /* Restore the original file's timestamps */
972 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
973 tb[0] = tb[1] = sb.st_mtime;
974 if (utime(outfname, tb)) {
975 fprintf(stderr, "Can't set timestamp %s\n", outfname);
978 tb.actime = sb.st_atime;
979 tb.modtime = sb.st_mtime;
980 if (utime(outfname, &tb)) {
981 fprintf(stderr, "Can't set timestamp %s\n", outfname);
986 char *backup_filename = get_backup_filename(backup_suffix, origfname);
988 unlink(backup_filename);
990 if (rename(origfname, backup_filename)) {
991 perror(backup_filename);
992 fprintf(stderr, "Can't rename %s to %s\n",
993 origfname, backup_filename);
997 if (unlink(origfname)){
1002 if (rename(outfname, origfname)) {
1004 fprintf(stderr, "Can't rename %s to %s\n",
1005 outfname, origfname);
1013 #ifdef EASYWIN /*Easy Win */
1014 if (file_out_f == FALSE)
1015 scanf("%d",&end_check);
1018 #else /* for Other OS */
1019 if (file_out_f == TRUE)
1021 #endif /*Easy Win */
1024 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced
 * by `filename` and the remaining characters of `suffix` are copied
 * literally.  Otherwise `suffix` is simply appended to `filename`.
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller, or
 * NULL (after reporting through perror) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t i, j;

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each one-byte '*' expands to filename_length bytes */
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; suffix[i]; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        /*
         * The buffer must hold filename + suffix + NUL.  The previous
         * code called malloc( + 1), i.e. allocated a single byte and
         * then overflowed it with strcpy/strcat.
         */
        j = suffix_length + filename_length;
        backup_filename = malloc(j + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
        backup_filename[j] = '\0';
    }
    return backup_filename;
}
1092 {"katakana-hiragana","h3"},
1099 #ifdef UTF8_OUTPUT_ENABLE
1109 {"fb-subchar=", ""},
1111 #ifdef UTF8_INPUT_ENABLE
1112 {"utf8-input", "W"},
1113 {"utf16-input", "W16"},
1114 {"no-cp932ext", ""},
1115 {"no-best-fit-chars",""},
1117 #ifdef UNICODE_NORMALIZATION
1118 {"utf8mac-input", ""},
1130 #ifdef NUMCHAR_OPTION
1131 {"numchar-input", ""},
1137 #ifdef SHIFTJIS_CP932
1147 static int option_mode = 0;
1149 void options(unsigned char *cp)
1153 unsigned char *cp_back = NULL;
1158 while(*cp && *cp++!='-');
1159 while (*cp || cp_back) {
1167 case '-': /* literal options */
1168 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1172 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1173 p = (unsigned char *)long_option[i].name;
1174 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1175 if (*p == cp[j] || cp[j] == ' '){
1182 while(*cp && *cp != SPACE && cp++);
1183 if (long_option[i].alias[0]){
1185 cp = (unsigned char *)long_option[i].alias;
1187 if (strcmp(long_option[i].name, "ic=") == 0){
1188 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1189 codeset[i] = nkf_toupper(p[i]);
1192 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1193 strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1194 strcmp(codeset, "CP50220") == 0 ||
1195 strcmp(codeset, "CP50221") == 0 ||
1196 strcmp(codeset, "CP50222") == 0 ||
1197 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1198 input_f = JIS_INPUT;
1199 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1200 input_f = JIS_INPUT;
1204 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1205 input_f = JIS_INPUT;
1210 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1211 input_f = SJIS_INPUT;
1212 if (x0201_f==NO_X0201) x0201_f=TRUE;
1213 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1214 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1215 strcmp(codeset, "CP932") == 0 ||
1216 strcmp(codeset, "MS932") == 0){
1217 input_f = SJIS_INPUT;
1219 #ifdef SHIFTJIS_CP932
1222 #ifdef UTF8_OUTPUT_ENABLE
1223 ms_ucs_map_f = UCS_MAP_CP932;
1225 }else if(strcmp(codeset, "EUCJP") == 0 ||
1226 strcmp(codeset, "EUC-JP") == 0){
1227 input_f = EUC_INPUT;
1228 }else if(strcmp(codeset, "CP51932") == 0){
1229 input_f = EUC_INPUT;
1231 #ifdef SHIFTJIS_CP932
1234 #ifdef UTF8_OUTPUT_ENABLE
1235 ms_ucs_map_f = UCS_MAP_CP932;
1237 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1238 strcmp(codeset, "EUCJP-MS") == 0 ||
1239 strcmp(codeset, "EUCJPMS") == 0){
1240 input_f = EUC_INPUT;
1242 #ifdef SHIFTJIS_CP932
1245 #ifdef UTF8_OUTPUT_ENABLE
1246 ms_ucs_map_f = UCS_MAP_MS;
1248 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1249 strcmp(codeset, "EUCJP-ASCII") == 0){
1250 input_f = EUC_INPUT;
1252 #ifdef SHIFTJIS_CP932
1255 #ifdef UTF8_OUTPUT_ENABLE
1256 ms_ucs_map_f = UCS_MAP_ASCII;
1258 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1259 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1260 input_f = SJIS_INPUT;
1262 #ifdef SHIFTJIS_CP932
1266 if (x0201_f==NO_X0201) x0201_f=TRUE;
1267 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1268 strcmp(codeset, "EUC-JIS-2004") == 0){
1269 input_f = EUC_INPUT;
1272 #ifdef SHIFTJIS_CP932
1276 #ifdef UTF8_INPUT_ENABLE
1277 }else if(strcmp(codeset, "UTF-8") == 0 ||
1278 strcmp(codeset, "UTF-8N") == 0 ||
1279 strcmp(codeset, "UTF-8-BOM") == 0){
1280 input_f = UTF8_INPUT;
1281 #ifdef UNICODE_NORMALIZATION
1282 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1283 strcmp(codeset, "UTF-8-MAC") == 0){
1284 input_f = UTF8_INPUT;
1287 }else if(strcmp(codeset, "UTF-16") == 0 ||
1288 strcmp(codeset, "UTF-16BE") == 0 ||
1289 strcmp(codeset, "UTF-16BE-BOM") == 0){
1290 input_f = UTF16_INPUT;
1291 input_endian = ENDIAN_BIG;
1292 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1293 strcmp(codeset, "UTF-16LE-BOM") == 0){
1294 input_f = UTF16_INPUT;
1295 input_endian = ENDIAN_LITTLE;
1296 }else if(strcmp(codeset, "UTF-32") == 0 ||
1297 strcmp(codeset, "UTF-32BE") == 0 ||
1298 strcmp(codeset, "UTF-32BE-BOM") == 0){
1299 input_f = UTF32_INPUT;
1300 input_endian = ENDIAN_BIG;
1301 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1302 strcmp(codeset, "UTF-32LE-BOM") == 0){
1303 input_f = UTF32_INPUT;
1304 input_endian = ENDIAN_LITTLE;
1309 if (strcmp(long_option[i].name, "oc=") == 0){
1310 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1311 codeset[i] = nkf_toupper(p[i]);
1314 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1315 strcmp(codeset, "CP50220") == 0){
1316 output_conv = j_oconv;
1317 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1318 output_conv = j_oconv;
1319 no_cp932ext_f = TRUE;
1320 }else if(strcmp(codeset, "CP50221") == 0 ||
1321 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1322 output_conv = j_oconv;
1324 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1325 output_conv = j_oconv;
1329 #ifdef SHIFTJIS_CP932
1332 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1333 output_conv = j_oconv;
1338 #ifdef SHIFTJIS_CP932
1341 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1342 output_conv = j_oconv;
1347 #ifdef SHIFTJIS_CP932
1350 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1351 output_conv = s_oconv;
1352 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1353 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1354 strcmp(codeset, "CP932") == 0 ||
1355 strcmp(codeset, "MS932") == 0){
1356 output_conv = s_oconv;
1358 #ifdef SHIFTJIS_CP932
1362 #ifdef UTF8_OUTPUT_ENABLE
1363 ms_ucs_map_f = UCS_MAP_CP932;
1365 }else if(strcmp(codeset, "EUCJP") == 0 ||
1366 strcmp(codeset, "EUC-JP") == 0){
1367 output_conv = e_oconv;
1368 }else if(strcmp(codeset, "CP51932") == 0){
1369 output_conv = e_oconv;
1371 #ifdef SHIFTJIS_CP932
1374 #ifdef UTF8_OUTPUT_ENABLE
1375 ms_ucs_map_f = UCS_MAP_CP932;
1377 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1378 strcmp(codeset, "EUCJP-MS") == 0 ||
1379 strcmp(codeset, "EUCJPMS") == 0){
1380 output_conv = e_oconv;
1385 #ifdef SHIFTJIS_CP932
1388 #ifdef UTF8_OUTPUT_ENABLE
1389 ms_ucs_map_f = UCS_MAP_MS;
1391 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1392 strcmp(codeset, "EUCJP-ASCII") == 0){
1393 output_conv = e_oconv;
1398 #ifdef SHIFTJIS_CP932
1401 #ifdef UTF8_OUTPUT_ENABLE
1402 ms_ucs_map_f = UCS_MAP_ASCII;
1404 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1405 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1406 output_conv = s_oconv;
1408 #ifdef SHIFTJIS_CP932
1411 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1412 strcmp(codeset, "EUC-JIS-2004") == 0){
1413 output_conv = e_oconv;
1418 #ifdef SHIFTJIS_CP932
1421 #ifdef UTF8_OUTPUT_ENABLE
1422 }else if(strcmp(codeset, "UTF-8") == 0){
1423 output_conv = w_oconv;
1424 }else if(strcmp(codeset, "UTF-8N") == 0){
1425 output_conv = w_oconv;
1426 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1427 output_conv = w_oconv;
1428 output_bom_f = TRUE;
1429 }else if(strcmp(codeset, "UTF-16BE") == 0){
1430 output_conv = w_oconv16;
1431 }else if(strcmp(codeset, "UTF-16") == 0 ||
1432 strcmp(codeset, "UTF-16BE-BOM") == 0){
1433 output_conv = w_oconv16;
1434 output_bom_f = TRUE;
1435 }else if(strcmp(codeset, "UTF-16LE") == 0){
1436 output_conv = w_oconv16;
1437 output_endian = ENDIAN_LITTLE;
1438 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1439 output_conv = w_oconv16;
1440 output_endian = ENDIAN_LITTLE;
1441 output_bom_f = TRUE;
1442 }else if(strcmp(codeset, "UTF-32") == 0 ||
1443 strcmp(codeset, "UTF-32BE") == 0){
1444 output_conv = w_oconv32;
1445 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1446 output_conv = w_oconv32;
1447 output_bom_f = TRUE;
1448 }else if(strcmp(codeset, "UTF-32LE") == 0){
1449 output_conv = w_oconv32;
1450 output_endian = ENDIAN_LITTLE;
1451 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1452 output_conv = w_oconv32;
1453 output_endian = ENDIAN_LITTLE;
1454 output_bom_f = TRUE;
1460 if (strcmp(long_option[i].name, "overwrite") == 0){
1463 preserve_time_f = TRUE;
1466 if (strcmp(long_option[i].name, "overwrite=") == 0){
1469 preserve_time_f = TRUE;
1471 backup_suffix = malloc(strlen((char *) p) + 1);
1472 strcpy(backup_suffix, (char *) p);
1475 if (strcmp(long_option[i].name, "in-place") == 0){
1478 preserve_time_f = FALSE;
1481 if (strcmp(long_option[i].name, "in-place=") == 0){
1484 preserve_time_f = FALSE;
1486 backup_suffix = malloc(strlen((char *) p) + 1);
1487 strcpy(backup_suffix, (char *) p);
1492 if (strcmp(long_option[i].name, "cap-input") == 0){
1496 if (strcmp(long_option[i].name, "url-input") == 0){
1501 #ifdef NUMCHAR_OPTION
1502 if (strcmp(long_option[i].name, "numchar-input") == 0){
1508 if (strcmp(long_option[i].name, "no-output") == 0){
1512 if (strcmp(long_option[i].name, "debug") == 0){
1517 if (strcmp(long_option[i].name, "cp932") == 0){
1518 #ifdef SHIFTJIS_CP932
1522 #ifdef UTF8_OUTPUT_ENABLE
1523 ms_ucs_map_f = UCS_MAP_CP932;
1527 if (strcmp(long_option[i].name, "no-cp932") == 0){
1528 #ifdef SHIFTJIS_CP932
1532 #ifdef UTF8_OUTPUT_ENABLE
1533 ms_ucs_map_f = UCS_MAP_ASCII;
1537 #ifdef SHIFTJIS_CP932
1538 if (strcmp(long_option[i].name, "cp932inv") == 0){
1545 if (strcmp(long_option[i].name, "x0212") == 0){
1552 if (strcmp(long_option[i].name, "exec-in") == 0){
1556 if (strcmp(long_option[i].name, "exec-out") == 0){
1561 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1562 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1563 no_cp932ext_f = TRUE;
1566 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1567 no_best_fit_chars_f = TRUE;
1570 if (strcmp(long_option[i].name, "fb-skip") == 0){
1571 encode_fallback = NULL;
1574 if (strcmp(long_option[i].name, "fb-html") == 0){
1575 encode_fallback = encode_fallback_html;
1578 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1579 encode_fallback = encode_fallback_xml;
1582 if (strcmp(long_option[i].name, "fb-java") == 0){
1583 encode_fallback = encode_fallback_java;
1586 if (strcmp(long_option[i].name, "fb-perl") == 0){
1587 encode_fallback = encode_fallback_perl;
1590 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1591 encode_fallback = encode_fallback_subchar;
1594 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1595 encode_fallback = encode_fallback_subchar;
1596 unicode_subchar = 0;
1598 /* decimal number */
1599 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1600 unicode_subchar *= 10;
1601 unicode_subchar += hex2bin(p[i]);
1603 }else if(p[1] == 'x' || p[1] == 'X'){
1604 /* hexadecimal number */
1605 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1606 unicode_subchar <<= 4;
1607 unicode_subchar |= hex2bin(p[i]);
1611 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1612 unicode_subchar *= 8;
1613 unicode_subchar += hex2bin(p[i]);
1616 w16e_conv(unicode_subchar, &i, &j);
1617 unicode_subchar = i<<8 | j;
1621 #ifdef UTF8_OUTPUT_ENABLE
1622 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1623 ms_ucs_map_f = UCS_MAP_MS;
1627 #ifdef UNICODE_NORMALIZATION
1628 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1629 input_f = UTF8_INPUT;
1634 if (strcmp(long_option[i].name, "prefix=") == 0){
1635 if (nkf_isgraph(p[0])){
1636 for (i = 1; nkf_isgraph(p[i]); i++){
1637 prefix_table[p[i]] = p[0];
1644 case 'b': /* buffered mode */
1647 case 'u': /* non bufferd mode */
1650 case 't': /* transparent mode */
1655 } else if (*cp=='2') {
1659 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1667 case 'j': /* JIS output */
1669 output_conv = j_oconv;
1671 case 'e': /* AT&T EUC output */
1672 output_conv = e_oconv;
1674 case 's': /* SJIS output */
1675 output_conv = s_oconv;
1677 case 'l': /* ISO8859 Latin-1 support, no conversion */
1678 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1679 input_f = LATIN1_INPUT;
1681 case 'i': /* Kanji IN ESC-$-@/B */
1682 if (*cp=='@'||*cp=='B')
1683 kanji_intro = *cp++;
1685 case 'o': /* ASCII IN ESC-(-J/B */
1686 if (*cp=='J'||*cp=='B'||*cp=='H')
1687 ascii_intro = *cp++;
1691 bit:1 katakana->hiragana
1692 bit:2 hiragana->katakana
1694 if ('9'>= *cp && *cp>='0')
1695 hira_f |= (*cp++ -'0');
1702 #if defined(MSDOS) || defined(__OS2__)
1717 #ifdef UTF8_OUTPUT_ENABLE
1718 case 'w': /* UTF-8 output */
1720 output_conv = w_oconv; cp++;
1724 output_bom_f = TRUE;
1727 if ('1'== cp[0] && '6'==cp[1]) {
1728 output_conv = w_oconv16; cp+=2;
1729 } else if ('3'== cp[0] && '2'==cp[1]) {
1730 output_conv = w_oconv32; cp+=2;
1732 output_conv = w_oconv;
1737 output_endian = ENDIAN_LITTLE;
1738 } else if (cp[0] == 'B') {
1746 output_bom_f = TRUE;
1751 #ifdef UTF8_INPUT_ENABLE
1752 case 'W': /* UTF input */
1755 input_f = UTF8_INPUT;
1757 if ('1'== cp[0] && '6'==cp[1]) {
1759 input_f = UTF16_INPUT;
1760 input_endian = ENDIAN_BIG;
1761 } else if ('3'== cp[0] && '2'==cp[1]) {
1763 input_f = UTF32_INPUT;
1764 input_endian = ENDIAN_BIG;
1766 input_f = UTF8_INPUT;
1771 input_endian = ENDIAN_LITTLE;
1772 } else if (cp[0] == 'B') {
1778 /* Input code assumption */
1779 case 'J': /* JIS input */
1780 input_f = JIS_INPUT;
1782 case 'E': /* AT&T EUC input */
1783 input_f = EUC_INPUT;
1785 case 'S': /* MS Kanji input */
1786 input_f = SJIS_INPUT;
1787 if (x0201_f==NO_X0201) x0201_f=TRUE;
1789 case 'Z': /* Convert X0208 alphabet to asii */
1790 /* bit:0 Convert X0208
1791 bit:1 Convert Kankaku to one space
1792 bit:2 Convert Kankaku to two spaces
1793 bit:3 Convert HTML Entity
1795 if ('9'>= *cp && *cp>='0')
1796 alpha_f |= 1<<(*cp++ -'0');
1800 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1801 x0201_f = FALSE; /* No X0201->X0208 conversion */
1803 ESC-(-I in JIS, EUC, MS Kanji
1804 SI/SO in JIS, EUC, MS Kanji
1805 SSO in EUC, JIS, not in MS Kanji
1806 MS Kanji (0xa0-0xdf)
1808 ESC-(-I in JIS (0x20-0x5f)
1809 SSO in EUC (0xa0-0xdf)
1810 0xa0-0xd in MS Kanji (0xa0-0xdf)
1813 case 'X': /* Assume X0201 kana */
1814 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1817 case 'F': /* prserve new lines */
1818 fold_preserve_f = TRUE;
1819 case 'f': /* folding -f60 or -f */
1822 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1824 fold_len += *cp++ - '0';
1826 if (!(0<fold_len && fold_len<BUFSIZ))
1827 fold_len = DEFAULT_FOLD;
1831 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1833 fold_margin += *cp++ - '0';
1837 case 'm': /* MIME support */
1838 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1839 if (*cp=='B'||*cp=='Q') {
1840 mime_decode_mode = *cp++;
1841 mimebuf_f = FIXED_MIME;
1842 } else if (*cp=='N') {
1843 mime_f = TRUE; cp++;
1844 } else if (*cp=='S') {
1845 mime_f = STRICT_MIME; cp++;
1846 } else if (*cp=='0') {
1847 mime_decode_f = FALSE;
1848 mime_f = FALSE; cp++;
1851 case 'M': /* MIME output */
1854 mimeout_f = FIXED_MIME; cp++;
1855 } else if (*cp=='Q') {
1857 mimeout_f = FIXED_MIME; cp++;
1862 case 'B': /* Broken JIS support */
1864 bit:1 allow any x on ESC-(-x or ESC-$-x
1865 bit:2 reset to ascii on NL
1867 if ('9'>= *cp && *cp>='0')
1868 broken_f |= 1<<(*cp++ -'0');
1873 case 'O':/* for Output file */
1877 case 'c':/* add cr code */
1880 case 'd':/* delete cr code */
1883 case 'I': /* ISO-2022-JP output */
1886 case 'L': /* line mode */
1887 if (*cp=='u') { /* unix */
1888 crmode_f = NL; cp++;
1889 } else if (*cp=='m') { /* mac */
1890 crmode_f = CR; cp++;
1891 } else if (*cp=='w') { /* windows */
1892 crmode_f = CRLF; cp++;
1893 } else if (*cp=='0') { /* no conversion */
1903 /* module muliple options in a string are allowed for Perl moudle */
1904 while(*cp && *cp++!='-');
1907 /* bogus option but ignored */
/*
 * Search the candidate-encoding table for the entry whose iconv_func
 * pointer equals the iconv_func argument.  The scan starts at
 * input_code_list.
 * NOTE(review): the loop and return statements are not visible in this
 * listing; set_iconv dereferences the result (p->name), so confirm what
 * is returned when nothing matches.
 */
1913 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1916 struct input_code *p = input_code_list;
1918 if (iconv_func == p->iconv_func){
/*
 * Select the input conversion routine.
 *   f          - flag; the value -TRUE means "force" (see the inline
 *                comment on the INPUT_CODE_FIX condition).
 *   iconv_func - conversion routine to install.
 * When an encoding is established (estab_f) and the active iconv differs
 * from the one last reported (iconv_for_check), the matching table entry
 * is looked up, its name is handed to set_input_codename and printed via
 * debug, and iconv_for_check is updated.
 */
1927 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1929 #ifdef INPUT_CODE_FIX
1937 #ifdef INPUT_CODE_FIX
1938 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1944 if (estab_f && iconv_for_check != iconv){
1945 struct input_code *p = find_inputcode_byfunc(iconv);
1947 set_input_codename(p->name);
1948 debug(input_codename);
1950 iconv_for_check = iconv;
/*
 * Score bits accumulated per candidate input encoding (see set_code_score).
 * Each flag is the previous one shifted left by one bit.
 */
1955 #define SCORE_L2 (1) /* JIS level-2 kanji */
1956 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1957 #define SCORE_DEPEND (SCORE_KANA << 1) /* machine-dependent characters */
1958 #ifdef SHIFTJIS_CP932
1959 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character substituted via CP932 */
1960 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1962 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1964 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
1965 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1967 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score lookup used by code_score when (c2 & 0x70) == 0x20; the index is
 * the low nibble of c2 (c2 & 0x0f).
 */
1969 const nkf_char score_table_A0[] = {
1972 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1973 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score lookup used by code_score when (c2 & 0x70) == 0x70; the index is
 * the low nibble of c2 (c2 & 0x0f).
 */
1976 const nkf_char score_table_F0[] = {
1977 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1978 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1979 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1980 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* OR the given SCORE_* bit(s) into ptr->score. */
1983 void set_code_score(struct input_code *ptr, nkf_char score)
1986 ptr->score |= score;
/* Clear the given SCORE_* bit(s) from ptr->score. */
1990 void clr_code_score(struct input_code *ptr, nkf_char score)
1993 ptr->score &= ~score;
/*
 * Classify the character currently held in ptr->buf (c2 = buf[0],
 * c1 = buf[1]) and add the matching SCORE_* bits with set_code_score.
 * NOTE(review): the condition guarding the SCORE_ERROR branch is not
 * visible in this listing.
 */
1997 void code_score(struct input_code *ptr)
1999 nkf_char c2 = ptr->buf[0];
2000 #ifdef UTF8_OUTPUT_ENABLE
2001 nkf_char c1 = ptr->buf[1];
2004 set_code_score(ptr, SCORE_ERROR);
2005 }else if (c2 == SSO){
/* buf[0] == SSO marks a kana character (see s_status) */
2006 set_code_score(ptr, SCORE_KANA);
2007 #ifdef UTF8_OUTPUT_ENABLE
2008 }else if (!e2w_conv(c2, c1)){
/* e2w_conv yielded 0 for this pair */
2009 set_code_score(ptr, SCORE_NO_EXIST);
2011 }else if ((c2 & 0x70) == 0x20){
2012 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2013 }else if ((c2 & 0x70) == 0x70){
2014 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2015 }else if ((c2 & 0x70) >= 0x50){
/* only 0x50 and 0x60 reach here; 0x70 was handled above */
2016 set_code_score(ptr, SCORE_L2);
/*
 * Rule this candidate out.  If the globally selected iconv is this
 * candidate's own iconv_func, the selection is reset with
 * set_iconv(FALSE, 0).
 * NOTE(review): other statements of the body are not visible here.
 */
2020 void status_disable(struct input_code *ptr)
2025 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append c to ptr->buf at position ptr->index and advance the index. */
2028 void status_push_ch(struct input_code *ptr, nkf_char c)
2030 ptr->buf[ptr->index++] = c;
/*
 * NOTE(review): only the signature is visible in this listing; presumably
 * this clears the per-character state of ptr -- confirm against the body.
 */
2033 void status_clear(struct input_code *ptr)
/* Reset the candidate; its score goes back to SCORE_INIT. */
2039 void status_reset(struct input_code *ptr)
2042 ptr->score = SCORE_INIT;
/* Re-initialise the candidate; the visible code zeroes ptr->_file_stat. */
2045 void status_reinit(struct input_code *ptr)
2048 ptr->_file_stat = 0;
/*
 * Acts only when c is a 7-bit value (c <= DEL) and an encoding has already
 * been established (estab_f).
 * NOTE(review): the action taken inside the branch is not visible here.
 */
2051 void status_check(struct input_code *ptr, nkf_char c)
2053 if (c <= DEL && estab_f){
/*
 * Status routine for the Shift_JIS candidate: consumes one input byte c
 * and updates ptr (buffer, score, enabled state).
 * NOTE(review): the switch/case framing on the current state is not
 * visible in this listing; the groups below are annotated from the
 * conditions that are shown.
 */
2058 void s_status(struct input_code *ptr, nkf_char c)
2062 status_check(ptr, c);
2067 #ifdef NUMCHAR_OPTION
2068 }else if (is_unicode_capsule(c)){
2071 }else if (0xa1 <= c && c <= 0xdf){
/* single byte in 0xa1-0xdf: recorded as the pair (SSO, c) */
2072 status_push_ch(ptr, SSO);
2073 status_push_ch(ptr, c);
2076 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
/* first byte of a two-byte character */
2078 status_push_ch(ptr, c);
2079 #ifdef SHIFTJIS_CP932
2081 && is_ibmext_in_sjis(c)){
2083 status_push_ch(ptr, c);
2084 #endif /* SHIFTJIS_CP932 */
2086 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2088 status_push_ch(ptr, c);
2089 #endif /* X0212_ENABLE */
/* any other first byte rules this candidate out */
2091 status_disable(ptr);
/* second byte must lie in 0x40-0x7e or 0x80-0xfc */
2095 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2096 status_push_ch(ptr, c);
/* convert the buffered pair in place */
2097 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2101 status_disable(ptr);
2105 #ifdef SHIFTJIS_CP932
2106 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2107 status_push_ch(ptr, c);
/* a zero return from s2e_conv marks the character as CP932-dependent */
2108 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2109 set_code_score(ptr, SCORE_CP932);
2114 #endif /* SHIFTJIS_CP932 */
2115 #ifndef X0212_ENABLE
2116 status_disable(ptr);
/*
 * Status routine for the EUC candidate: consumes one input byte c and
 * updates ptr (buffer, enabled state).
 * NOTE(review): the switch/case framing on the current state is not
 * visible in this listing.
 */
2122 void e_status(struct input_code *ptr, nkf_char c)
2126 status_check(ptr, c);
2131 #ifdef NUMCHAR_OPTION
2132 }else if (is_unicode_capsule(c)){
2135 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
/* SSO or a byte in 0xa1-0xfe starts a multi-byte character */
2137 status_push_ch(ptr, c);
2139 }else if (0x8f == c){
/* 0x8f prefix, handled only when X0212_ENABLE is defined */
2141 status_push_ch(ptr, c);
2142 #endif /* X0212_ENABLE */
2144 status_disable(ptr);
/* following bytes must lie in 0xa1-0xfe, otherwise the candidate is dropped */
2148 if (0xa1 <= c && c <= 0xfe){
2149 status_push_ch(ptr, c);
2153 status_disable(ptr);
2158 if (0xa1 <= c && c <= 0xfe){
2160 status_push_ch(ptr, c);
2162 status_disable(ptr);
2164 #endif /* X0212_ENABLE */
/*
 * Status routine for the UTF-8 candidate: consumes one input byte c and
 * updates ptr (buffer, enabled state).
 * NOTE(review): the switch/case framing and the assignments to ptr->stat
 * are not visible in this listing.
 */
2168 #ifdef UTF8_INPUT_ENABLE
2169 void w_status(struct input_code *ptr, nkf_char c)
2173 status_check(ptr, c);
2178 #ifdef NUMCHAR_OPTION
2179 }else if (is_unicode_capsule(c)){
/* lead bytes: 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 are accepted */
2182 }else if (0xc0 <= c && c <= 0xdf){
2184 status_push_ch(ptr, c);
2185 }else if (0xe0 <= c && c <= 0xef){
2187 status_push_ch(ptr, c);
2188 }else if (0xf0 <= c && c <= 0xf4){
2190 status_push_ch(ptr, c);
2192 status_disable(ptr);
/* continuation bytes must lie in 0x80-0xbf */
2197 if (0x80 <= c && c <= 0xbf){
2198 status_push_ch(ptr, c);
2199 if (ptr->index > ptr->stat){
/* remember whether the buffered bytes are EF BB BF (the UTF-8 BOM) */
2200 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2201 && ptr->buf[2] == 0xbf);
/* convert the buffered bytes in place into buf[0]/buf[1] */
2202 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2203 &ptr->buf[0], &ptr->buf[1]);
2210 status_disable(ptr);
2214 if (0x80 <= c && c <= 0xbf){
2215 if (ptr->index < ptr->stat){
2216 status_push_ch(ptr, c);
2221 status_disable(ptr);
/*
 * Feed one input byte c to the status_func of the candidates reachable
 * from input_code_list.  If a candidate was selected (result) and no
 * encoding is established yet (!estab_f), its iconv_func is installed
 * with set_iconv(TRUE, ...); otherwise, for a 7-bit byte (c <= DEL), the
 * list is visited again starting from input_code_list.
 * NOTE(review): the loop headers and the logic that sets result and
 * action_flag are not visible in this listing.
 */
2228 void code_status(nkf_char c)
2230 int action_flag = 1;
2231 struct input_code *result = 0;
2232 struct input_code *p = input_code_list;
2234 if (!p->status_func)
2236 (p->status_func)(p, c);
2239 }else if(p->stat == 0){
2250 if (result && !estab_f){
2251 set_iconv(TRUE, result->iconv_func);
2252 }else if (c <= DEL){
2253 struct input_code *ptr = input_code_list;
/*
 * Read one character for stream f.  The visible return takes the most
 * recently pushed-back character from std_gc_buf (pre-decrementing
 * std_gc_ndx).
 * NOTE(review): the guard on std_gc_ndx and the fallback read from f are
 * not visible in this listing.
 */
2263 nkf_char std_getc(FILE *f)
2266 return std_gc_buf[--std_gc_ndx];
/*
 * Push c back so that std_getc returns it next: c is stored at
 * std_gc_buf[std_gc_ndx] and the index is advanced.  The buffer-full case
 * (std_gc_ndx == STD_GC_BUFSIZE) is tested first.
 * NOTE(review): the handling of the full case and the return value are
 * not visible in this listing.
 */
2272 nkf_char std_ungetc(nkf_char c, FILE *f)
2274 if (std_gc_ndx == STD_GC_BUFSIZE){
2277 std_gc_buf[std_gc_ndx++] = c;
/*
 * NOTE(review): only the signature is visible in this listing; presumably
 * the default single-character output routine -- confirm against the body.
 */
2282 void std_putc(nkf_char c)
/*
 * Compiled only when neither PERL_XS nor WIN32DLL is defined.
 * Sets up the filter chain with module_connection(), then reads
 * characters from f through i_getc until EOF.
 * NOTE(review): the loop body and return value are not visible here.
 */
2289 #if !defined(PERL_XS) && !defined(WIN32DLL)
2290 nkf_char noconvert(FILE *f)
2295 module_connection();
2296 while ((c = (*i_getc)(f)) != EOF)
/*
 * Build the output and input filter chains from the option flags.
 * Output side: oconv starts as output_conv; each enabled stage saves the
 * current oconv in its own o_* pointer and becomes the new oconv.
 * Input side: each enabled stage saves the current i_getc / i_ungetc in
 * its own pointers and installs its own getc / ungetc.
 * Finally the input conversion routine is chosen from input_f with
 * set_iconv.
 * NOTE(review): several of the guarding conditions are not visible in
 * this listing.
 */
2303 void module_connection(void)
2305 oconv = output_conv;
2308 /* replace continuation module, from output side */
2310 /* output redirection */
2312 if (noout_f || guess_f){
2319 if (mimeout_f == TRUE) {
2320 o_base64conv = oconv; oconv = base64_conv;
2322 /* base64_count = 0; */
2326 o_crconv = oconv; oconv = cr_conv;
2329 o_rot_conv = oconv; oconv = rot_conv;
2332 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2335 o_hira_conv = oconv; oconv = hira_conv;
2338 o_fconv = oconv; oconv = fold_conv;
2341 if (alpha_f || x0201_f) {
2342 o_zconv = oconv; oconv = z_conv;
2346 i_ungetc = std_ungetc;
2347 /* input redirection */
2350 i_cgetc = i_getc; i_getc = cap_getc;
2351 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2354 i_ugetc = i_getc; i_getc = url_getc;
2355 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2358 #ifdef NUMCHAR_OPTION
2360 i_ngetc = i_getc; i_getc = numchar_getc;
2361 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2364 #ifdef UNICODE_NORMALIZATION
2365 if (nfc_f && input_f == UTF8_INPUT){
2366 i_nfc_getc = i_getc; i_getc = nfc_getc;
2367 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2370 if (mime_f && mimebuf_f==FIXED_MIME) {
2371 i_mgetc = i_getc; i_getc = mime_getc;
2372 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2375 i_bgetc = i_getc; i_getc = broken_getc;
2376 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* an explicitly requested input encoding is forced with -TRUE */
2378 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2379 set_iconv(-TRUE, e_iconv);
2380 } else if (input_f == SJIS_INPUT) {
2381 set_iconv(-TRUE, s_iconv);
2382 #ifdef UTF8_INPUT_ENABLE
2383 } else if (input_f == UTF8_INPUT) {
2384 set_iconv(-TRUE, w_iconv);
2385 } else if (input_f == UTF16_INPUT) {
2386 set_iconv(-TRUE, w_iconv16);
2387 } else if (input_f == UTF32_INPUT) {
2388 set_iconv(-TRUE, w_iconv32);
/* otherwise e_iconv is installed without forcing */
2391 set_iconv(FALSE, e_iconv);
2395 struct input_code *p = input_code_list;
2403 * Check and Ignore BOM
/*
 * Inspect the first bytes of f for a byte order mark.  Recognised
 * sequences select the matching converter with set_iconv(TRUE, ...) and,
 * for UTF-16/UTF-32, set input_endian:
 *   00 00 FE FF -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, ENDIAN_2143
 *   EF BB BF    -> w_iconv
 *   FE FF 00 00 -> w_iconv32, ENDIAN_3412
 *   FE FF       -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, ENDIAN_LITTLE
 * Bytes that do not complete a mark are pushed back through i_ungetc.
 * NOTE(review): the case labels and the conditions around set_iconv are
 * not visible in this listing.
 */
2405 void check_bom(FILE *f)
2408 switch(c2 = (*i_getc)(f)){
2410 if((c2 = (*i_getc)(f)) == 0x00){
2411 if((c2 = (*i_getc)(f)) == 0xFE){
2412 if((c2 = (*i_getc)(f)) == 0xFF){
2414 set_iconv(TRUE, w_iconv32);
2416 input_endian = ENDIAN_BIG;
2418 }else (*i_ungetc)(c2,f);
2419 (*i_ungetc)(0xFE,f);
2420 }else if(c2 == 0xFF){
2421 if((c2 = (*i_getc)(f)) == 0xFE){
2423 set_iconv(TRUE, w_iconv32);
2425 input_endian = ENDIAN_2143;
2427 }else (*i_ungetc)(c2,f);
2428 (*i_ungetc)(0xFF,f);
2429 }else (*i_ungetc)(c2,f);
2430 (*i_ungetc)(0x00,f);
2431 }else (*i_ungetc)(c2,f);
2432 (*i_ungetc)(0x00,f);
2435 if((c2 = (*i_getc)(f)) == 0xBB){
2436 if((c2 = (*i_getc)(f)) == 0xBF){
2438 set_iconv(TRUE, w_iconv);
2441 }else (*i_ungetc)(c2,f);
2442 (*i_ungetc)(0xBB,f);
2443 }else (*i_ungetc)(c2,f);
2444 (*i_ungetc)(0xEF,f);
2447 if((c2 = (*i_getc)(f)) == 0xFF){
2448 if((c2 = (*i_getc)(f)) == 0x00){
2449 if((c2 = (*i_getc)(f)) == 0x00){
2451 set_iconv(TRUE, w_iconv32);
2453 input_endian = ENDIAN_3412;
2455 }else (*i_ungetc)(c2,f);
2456 (*i_ungetc)(0x00,f);
2457 }else (*i_ungetc)(c2,f);
2459 set_iconv(TRUE, w_iconv16);
2461 input_endian = ENDIAN_BIG;
2463 }else (*i_ungetc)(c2,f);
2464 (*i_ungetc)(0xFE,f);
2467 if((c2 = (*i_getc)(f)) == 0xFE){
2468 if((c2 = (*i_getc)(f)) == 0x00){
2469 if((c2 = (*i_getc)(f)) == 0x00){
2471 set_iconv(TRUE, w_iconv32);
2473 input_endian = ENDIAN_LITTLE;
2475 }else (*i_ungetc)(c2,f);
2476 (*i_ungetc)(0x00,f);
2477 }else (*i_ungetc)(c2,f);
2479 set_iconv(TRUE, w_iconv16);
2481 input_endian = ENDIAN_LITTLE;
2483 }else (*i_ungetc)(c2,f);
2484 (*i_ungetc)(0xFF,f);
2493 Conversion main loop. Code detection only.
/*
 * Main conversion loop: reads characters from f through i_getc, tracks
 * the input mode (escape sequences, SI/SO, MIME starts, 8-bit bytes),
 * assembles multi-byte characters, passes them to the selected iconv
 * routine and emits results through oconv.  Inside the loop the macros
 * NEXT, SEND and LAST mean "continue", "fall through to output" and
 * "break".  After the loop iconv is called once with EOF and, when no
 * input code name was set, the candidate with the lowest score supplies
 * the name.
 * NOTE(review): many statements of this function are not visible in this
 * listing; the summary covers only what is shown.
 */
2496 nkf_char kanji_convert(FILE *f)
2498 nkf_char c3, c2=0, c1, c0=0;
2499 int is_8bit = FALSE;
2501 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2502 #ifdef UTF8_INPUT_ENABLE
2503 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2510 output_mode = ASCII;
2513 #define NEXT continue /* no output, get next */
2514 #define SEND ; /* output c1 and c2, get next */
2515 #define LAST break /* end of loop, go closing */
2517 module_connection();
2520 while ((c1 = (*i_getc)(f)) != EOF) {
2521 #ifdef INPUT_CODE_FIX
2528 /* in case of 8th bit is on */
2529 if (!estab_f&&!mime_decode_mode) {
2530 /* in case of not established yet */
2531 /* It is still ambiguous */
2532 if (h_conv(f, c2, c1)==EOF)
2538 /* in case of already established */
2540 /* ignore bogus code */
2546 /* second byte, 7 bit code */
2547 /* it might be kanji shifted */
2548 if ((c1 == DEL) || (c1 <= SPACE)) {
2549 /* ignore bogus first code */
2556 #ifdef UTF8_INPUT_ENABLE
2557 if (iconv == w_iconv16) {
2558 if (input_endian == ENDIAN_BIG) {
2560 if ((c1 = (*i_getc)(f)) != EOF) {
2561 if (0xD8 <= c2 && c2 <= 0xDB) {
2562 if ((c0 = (*i_getc)(f)) != EOF) {
2564 if ((c3 = (*i_getc)(f)) != EOF) {
2571 if ((c2 = (*i_getc)(f)) != EOF) {
2572 if (0xD8 <= c2 && c2 <= 0xDB) {
2573 if ((c3 = (*i_getc)(f)) != EOF) {
2574 if ((c0 = (*i_getc)(f)) != EOF) {
2583 } else if(iconv == w_iconv32){
2585 if((c2 = (*i_getc)(f)) != EOF &&
2586 (c1 = (*i_getc)(f)) != EOF &&
2587 (c0 = (*i_getc)(f)) != EOF){
2588 switch(input_endian){
2590 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2593 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2596 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2599 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2609 #ifdef NUMCHAR_OPTION
2610 if (is_unicode_capsule(c1)){
2616 if (!estab_f && !iso8859_f) {
2617 /* not established yet */
2620 } else { /* estab_f==TRUE */
2625 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2626 /* SJIS X0201 Case... */
2627 if(iso2022jp_f && x0201_f==NO_X0201) {
2628 (*oconv)(GETA1, GETA2);
2635 } else if (c1==SSO && iconv != s_iconv) {
2636 /* EUC X0201 Case */
2637 c1 = (*i_getc)(f); /* skip SSO */
2639 if (SSP<=c1 && c1<0xe0) {
2640 if(iso2022jp_f && x0201_f==NO_X0201) {
2641 (*oconv)(GETA1, GETA2);
2648 } else { /* bogus code, skip SSO and one byte */
2652 /* already established */
2657 } else if ((c1 > SPACE) && (c1 != DEL)) {
2658 /* in case of Roman characters */
2660 /* output 1 shifted byte */
2664 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2665 /* output 1 shifted byte */
2666 if(iso2022jp_f && x0201_f==NO_X0201) {
2667 (*oconv)(GETA1, GETA2);
2674 /* look like bogus code */
2677 } else if (input_mode == X0208 || input_mode == X0212 ||
2678 input_mode == X0213_1 || input_mode == X0213_2) {
2679 /* in case of Kanji shifted */
2682 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2683 /* Check MIME code */
2684 if ((c1 = (*i_getc)(f)) == EOF) {
2687 } else if (c1 == '?') {
2688 /* =? is mime conversion start sequence */
2689 if(mime_f == STRICT_MIME) {
2690 /* check in real detail */
2691 if (mime_begin_strict(f) == EOF)
2695 } else if (mime_begin(f) == EOF)
2705 /* normal ASCII code */
2708 } else if (!is_8bit && c1 == SI) {
2711 } else if (!is_8bit && c1 == SO) {
2714 } else if (!is_8bit && c1 == ESC ) {
2715 if ((c1 = (*i_getc)(f)) == EOF) {
2716 /* (*oconv)(0, ESC); don't send bogus code */
2718 } else if (c1 == '$') {
2719 if ((c1 = (*i_getc)(f)) == EOF) {
2721 (*oconv)(0, ESC); don't send bogus code
2722 (*oconv)(0, '$'); */
2724 } else if (c1 == '@'|| c1 == 'B') {
2725 /* This is kanji introduction */
2728 set_input_codename("ISO-2022-JP");
2730 debug(input_codename);
2733 } else if (c1 == '(') {
2734 if ((c1 = (*i_getc)(f)) == EOF) {
2735 /* don't send bogus code
2741 } else if (c1 == '@'|| c1 == 'B') {
2742 /* This is kanji introduction */
2747 } else if (c1 == 'D'){
2751 #endif /* X0212_ENABLE */
2752 } else if (c1 == (X0213_1&0x7F)){
2753 input_mode = X0213_1;
2756 } else if (c1 == (X0213_2&0x7F)){
2757 input_mode = X0213_2;
2761 /* could be some special code */
2768 } else if (broken_f&0x2) {
2769 /* accept any ESC-(-x as broken code ... */
2779 } else if (c1 == '(') {
2780 if ((c1 = (*i_getc)(f)) == EOF) {
2781 /* don't send bogus code
2783 (*oconv)(0, '('); */
2787 /* This is X0201 kana introduction */
2788 input_mode = X0201; shift_mode = X0201;
2790 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2791 /* ESC ( B / J / H: leave kanji mode, input_mode becomes ASCII */
2792 input_mode = ASCII; shift_mode = FALSE;
2794 } else if (broken_f&0x2) {
2795 input_mode = ASCII; shift_mode = FALSE;
2800 /* maintain various input_mode here */
2804 } else if ( c1 == 'N' || c1 == 'n' ){
2806 c3 = (*i_getc)(f); /* skip SS2 */
2807 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2822 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2823 input_mode = ASCII; set_iconv(FALSE, 0);
2825 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2826 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2834 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2835 if ((c1=(*i_getc)(f))!=EOF) {
2839 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2857 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2860 if ((c0 = (*i_getc)(f)) != EOF) {
2863 if ((c3 = (*i_getc)(f)) != EOF) {
2865 (*iconv)(c2, c1, c0|c3);
2870 /* 3 bytes EUC or UTF-8 */
2871 if ((c0 = (*i_getc)(f)) != EOF) {
2873 (*iconv)(c2, c1, c0);
2880 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2884 (*oconv)(PREFIX_EUCG3 | c2, c1);
2886 #endif /* X0212_ENABLE */
2888 (*oconv)(PREFIX_EUCG3 | c2, c1);
2891 (*oconv)(input_mode, c1); /* other special case */
2897 /* goto next_word */
2901 (*iconv)(EOF, 0, 0);
2902 if (!is_inputcode_set)
2905 struct input_code *p = input_code_list;
2906 struct input_code *result = p;
2908 if (p->score < result->score) result = p;
2911 set_input_codename(result->name);
/*
 * Hold-and-decide conversion used while the input encoding is still
 * ambiguous.  Characters read from f are stored with push_hold_buf until
 * EOF, until an encoding is established, or until the buffer is full.
 * If nothing was established, the candidate with the lowest score is
 * installed via set_iconv(FALSE, ...).  The held bytes are then replayed
 * through iconv; bytes needed to complete a character are taken from the
 * hold buffer first and from f afterwards.
 * NOTE(review): the return type line, several branches and the return
 * statements are not visible in this listing.
 */
2918 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2920 nkf_char ret, c3, c0;
2924 /** it must NOT be in the kanji shifted sequence */
2925 /** it must NOT be written in JIS7 */
2926 /** and it must be after 2 byte 8bit code */
2932 while ((c1 = (*i_getc)(f)) != EOF) {
2938 if (push_hold_buf(c1) == EOF || estab_f){
2944 struct input_code *p = input_code_list;
2945 struct input_code *result = p;
2950 if (p->score < result->score){
2955 set_iconv(FALSE, result->iconv_func);
2960 ** 1) EOF is detected, or
2961 ** 2) Code is established, or
2962 ** 3) Buffer is FULL (but last word is pushed)
2964 ** in 1) and 3) cases, we continue to use
2965 ** Kanji codes by oconv and leave estab_f unchanged.
2970 while (hold_index < hold_count){
2971 c2 = hold_buf[hold_index++];
2973 #ifdef NUMCHAR_OPTION
2974 || is_unicode_capsule(c2)
2979 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2980 (*iconv)(X0201, c2, 0);
2983 if (hold_index < hold_count){
2984 c1 = hold_buf[hold_index++];
2994 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
2997 if (hold_index < hold_count){
2998 c0 = hold_buf[hold_index++];
2999 } else if ((c0 = (*i_getc)(f)) == EOF) {
3005 if (hold_index < hold_count){
3006 c3 = hold_buf[hold_index++];
3007 } else if ((c3 = (*i_getc)(f)) == EOF) {
3012 (*iconv)(c2, c1, c0|c3);
3017 /* 3 bytes EUC or UTF-8 */
3018 if (hold_index < hold_count){
3019 c0 = hold_buf[hold_index++];
3020 } else if ((c0 = (*i_getc)(f)) == EOF) {
3026 (*iconv)(c2, c1, c0);
3029 if (c0 == EOF) break;
/*
 * Append c2 (truncated to unsigned char) to hold_buf.
 * Returns EOF once hold_count has reached HOLD_SIZE*2 after the store,
 * otherwise the new hold_count.  A buffer that is already full on entry
 * is tested first.
 * NOTE(review): the statement executed for the full-on-entry case is not
 * visible in this listing.
 */
3034 nkf_char push_hold_buf(nkf_char c2)
3036 if (hold_count >= HOLD_SIZE*2)
3038 hold_buf[hold_count++] = (unsigned char)c2;
3039 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert the Shift_JIS byte pair (c2, c1) to the internal two-byte form.
 * Table lookups (shiftjis_cp932, shiftjis_x0212) handle the IBM extension
 * rows; rows from 0xF0 are mapped into PREFIX_EUCG3 space when x0213_f is
 * set; everything else uses the arithmetic mapping near the end.
 * NOTE(review): the stores through p2/p1 and the return statements are
 * not visible in this listing; callers test the return value against 0.
 */
3042 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3044 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3047 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3048 #ifdef SHIFTJIS_CP932
3049 if (cp51932_f && is_ibmext_in_sjis(c2)){
3051 extern const unsigned short shiftjis_cp932[3][189];
/* row index is relative to CP932_TABLE_BEGIN, column to 0x40 */
3053 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3059 #endif /* SHIFTJIS_CP932 */
3061 if (!x0213_f && is_ibmext_in_sjis(c2)){
3063 extern const unsigned short shiftjis_x0212[3][189];
/* row index is relative to 0xfa, column to 0x40 */
3065 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3068 c2 = PREFIX_EUCG3 | (val >> 8);
3081 if(x0213_f && c2 >= 0xF0){
3082 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
/* the second table index is 1 when c1 is above 0x9E, else 0 */
3083 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3084 }else{ /* 78<=k<=94 */
3085 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3086 if (0x9E < c1) c2++;
/* generic case: double the lead byte and subtract a range-dependent offset */
3089 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3090 if (0x9E < c1) c2++;
3093 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3100 c2 = x0212_unshift(c2);
/*
 * Input conversion routine for Shift_JIS.  Values of c2 that are EOF, 0
 * or below SPACE take a separate branch; otherwise the pair is converted
 * in place with s2e_conv and a non-zero result is returned to the caller.
 * NOTE(review): the first branch and the output call are not visible in
 * this listing; c0 is not used by the visible lines.
 */
3107 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3111 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3114 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3115 if (ret) return ret;
/*
 * Input conversion routine for EUC.  Separate branches handle the 0x8f
 * prefix, SSO, and values of c2 that are EOF, 0 or below SPACE.
 * Under SHIFTJIS_CP932 the pair is round-tripped through e2s_conv and
 * s2e_conv when e2s_conv returns 0.
 * NOTE(review): the first branch, the conditions inside the 0x8f branch
 * and the output calls are not visible in this listing.
 */
3121 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3126 }else if (c2 == 0x8f){
3130 c2 = (c2 << 8) | (c1 & 0x7f);
3132 #ifdef SHIFTJIS_CP932
3135 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3136 s2e_conv(s2, s1, &c2, &c1);
3143 #endif /* SHIFTJIS_CP932 */
3144 #endif /* X0212_ENABLE */
3145 } else if (c2 == SSO){
3148 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
/*
 * Convert a UTF-8 sequence (c2 = first byte, c1, c0 = following bytes)
 * to the internal two-byte form written through p2/p1.  Lead bytes in
 * 0xc0-0xef go through unicode_to_jis_common; under NUMCHAR_OPTION the
 * code point from ww16_conv, tagged with CLASS_UNICODE, can be stored
 * in *p1 instead.
 * NOTE(review): the first branch and the condition for the CLASS_UNICODE
 * store are not visible in this listing.
 */
3158 #ifdef UTF8_INPUT_ENABLE
3159 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3166 }else if (0xc0 <= c2 && c2 <= 0xef) {
3167 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3168 #ifdef NUMCHAR_OPTION
3171 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * Input conversion routine for UTF-8.  The first byte c2 is classified
 * through w_iconv_utf8_1st_byte (indexed by c2 - 0xC0) and the following
 * bytes are range-checked per class.  A return of -1 or -2 is issued
 * while c0 is still 0, i.e. more bytes are required; for four-byte
 * sequences c0 carries two bytes, hence the 0xc0c0 / 0x8080 masks.
 * Valid sequences are converted with w2e_conv, or with ww16_conv tagged
 * CLASS_UNICODE for four-byte forms.
 * NOTE(review): the case labels, the first table row and the returns
 * taken on invalid input are not visible in this listing.
 */
3179 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3182 static const int w_iconv_utf8_1st_byte[] =
3184 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3185 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3186 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3187 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3189 if (c2 < 0 || 0xff < c2) {
3190 }else if (c2 == 0) { /* 0 : 1 byte*/
3192 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3195 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3197 if (c1 < 0x80 || 0xBF < c1) return 0;
3200 if (c0 == 0) return -1;
3201 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3206 if (c0 == 0) return -1;
3207 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3211 if (c0 == 0) return -1;
3212 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3216 if (c0 == 0) return -2;
3217 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3221 if (c0 == 0) return -2;
3222 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3226 if (c0 == 0) return -2;
3227 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3235 if (c2 == 0 || c2 == EOF){
3236 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3237 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3240 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3249 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3250 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3257 }else if (val < 0x800){
3258 *p2 = 0xc0 | (val >> 6);
3259 *p1 = 0x80 | (val & 0x3f);
3261 } else if (val <= NKF_INT32_C(0xFFFF)) {
3262 *p2 = 0xe0 | (val >> 12);
3263 *p1 = 0x80 | ((val >> 6) & 0x3f);
3264 *p0 = 0x80 | (val & 0x3f);
3265 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3266 *p2 = 0xe0 | (val >> 16);
3267 *p1 = 0x80 | ((val >> 12) & 0x3f);
3268 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3277 #ifdef UTF8_INPUT_ENABLE
3278 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3283 } else if (c2 >= 0xf0){
3284 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3285 val = (c2 & 0x0f) << 18;
3286 val |= (c1 & 0x3f) << 12;
3287 val |= (c0 & 0x3f00) >> 2;
3289 }else if (c2 >= 0xe0){
3290 val = (c2 & 0x0f) << 12;
3291 val |= (c1 & 0x3f) << 6;
3293 }else if (c2 >= 0xc0){
3294 val = (c2 & 0x1f) << 6;
3302 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3304 nkf_char c2, c1, c0;
3311 w16w_conv(val, &c2, &c1, &c0);
3312 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3313 #ifdef NUMCHAR_OPTION
3316 *p1 = CLASS_UNICODE | val;
3325 #ifdef UTF8_INPUT_ENABLE
3326 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3329 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3332 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3333 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3335 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3337 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3342 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3343 if (ret) return ret;
3348 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3352 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3353 } else if (is_unicode_bmp(c1)) {
3354 ret = w16e_conv(c1, &c2, &c1);
3357 c1 = CLASS_UNICODE | c1;
3359 if (ret) return ret;
3364 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3367 extern const unsigned short *const utf8_to_euc_2bytes[];
3368 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3369 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3370 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3371 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3372 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3374 const unsigned short *const *pp;
3375 const unsigned short *const *const *ppp;
3376 static const int no_best_fit_chars_table_C2[] =
3377 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3378 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3379 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3380 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3381 static const int no_best_fit_chars_table_C2_ms[] =
3382 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3383 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3384 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3385 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3386 static const int no_best_fit_chars_table_932_C2[] =
3387 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3388 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3389 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3390 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3391 static const int no_best_fit_chars_table_932_C3[] =
3392 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3393 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3394 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3395 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3401 }else if(c2 < 0xe0){
3402 if(no_best_fit_chars_f){
3403 if(ms_ucs_map_f == UCS_MAP_CP932){
3406 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3409 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3412 }else if(cp51932_f){
3415 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3418 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3421 }else if(ms_ucs_map_f == UCS_MAP_MS){
3422 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3426 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3427 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3429 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3430 }else if(c0 < 0xF0){
3431 if(no_best_fit_chars_f){
3432 if(ms_ucs_map_f == UCS_MAP_CP932){
3433 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3434 }else if(ms_ucs_map_f == UCS_MAP_MS){
3439 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3442 if(c0 == 0x92) return 1;
3447 if(c1 == 0x80 || c0 == 0x9C) return 1;
3455 if(c0 == 0x95) return 1;
3458 if(c0 == 0xA5) return 1;
3465 if(c0 == 0x8D) return 1;
3468 if(c0 == 0x9E && cp51932_f) return 1;
3471 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3479 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3480 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3482 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3484 #ifdef SHIFTJIS_CP932
3485 if (!ret && cp51932_f && is_eucg3(*p2)) {
3487 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3488 s2e_conv(s2, s1, p2, p1);
3497 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3500 const unsigned short *p;
3503 if (pp == 0) return 1;
3506 if (c1 < 0 || psize <= c1) return 1;
3508 if (p == 0) return 1;
3511 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3513 if (val == 0) return 1;
3514 if (no_cp932ext_f && (
3515 (val>>8) == 0x2D || /* NEC special characters */
3516 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3524 if (c2 == SO) c2 = X0201;
3531 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3533 const char *hex = "0123456789ABCDEF";
3539 (*f)(0, hex[(c>>shift)&0xF]);
3549 void encode_fallback_html(nkf_char c)
3554 if(c >= NKF_INT32_C(1000000))
3555 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3556 if(c >= NKF_INT32_C(100000))
3557 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3559 (*oconv)(0, 0x30+(c/10000 )%10);
3561 (*oconv)(0, 0x30+(c/1000 )%10);
3563 (*oconv)(0, 0x30+(c/100 )%10);
3565 (*oconv)(0, 0x30+(c/10 )%10);
3567 (*oconv)(0, 0x30+ c %10);
3572 void encode_fallback_xml(nkf_char c)
3577 nkf_each_char_to_hex(oconv, c);
3582 void encode_fallback_java(nkf_char c)
3584 const char *hex = "0123456789ABCDEF";
3587 if(!is_unicode_bmp(c)){
3591 (*oconv)(0, hex[(c>>20)&0xF]);
3592 (*oconv)(0, hex[(c>>16)&0xF]);
3596 (*oconv)(0, hex[(c>>12)&0xF]);
3597 (*oconv)(0, hex[(c>> 8)&0xF]);
3598 (*oconv)(0, hex[(c>> 4)&0xF]);
3599 (*oconv)(0, hex[ c &0xF]);
3603 void encode_fallback_perl(nkf_char c)
3608 nkf_each_char_to_hex(oconv, c);
3613 void encode_fallback_subchar(nkf_char c)
3615 c = unicode_subchar;
3616 (*oconv)((c>>8)&0xFF, c&0xFF);
3621 #ifdef UTF8_OUTPUT_ENABLE
3622 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3625 extern const unsigned short euc_to_utf8_1byte[];
3626 extern const unsigned short *const euc_to_utf8_2bytes[];
3627 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3628 extern const unsigned short *const x0212_to_utf8_2bytes[];
3630 const unsigned short *p;
3633 p = euc_to_utf8_1byte;
3635 } else if (is_eucg3(c2)){
3636 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3639 c2 = (c2&0x7f) - 0x21;
3640 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3641 p = x0212_to_utf8_2bytes[c2];
3647 c2 = (c2&0x7f) - 0x21;
3648 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3649 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3654 c1 = (c1 & 0x7f) - 0x21;
3655 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3660 void w_oconv(nkf_char c2, nkf_char c1)
3666 output_bom_f = FALSE;
3677 #ifdef NUMCHAR_OPTION
3678 if (c2 == 0 && is_unicode_capsule(c1)){
3679 val = c1 & VALUE_MASK;
3682 }else if (val < 0x800){
3683 (*o_putc)(0xC0 | (val >> 6));
3684 (*o_putc)(0x80 | (val & 0x3f));
3685 } else if (val <= NKF_INT32_C(0xFFFF)) {
3686 (*o_putc)(0xE0 | (val >> 12));
3687 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3688 (*o_putc)(0x80 | (val & 0x3f));
3689 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3690 (*o_putc)(0xF0 | ( val>>18));
3691 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3692 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3693 (*o_putc)(0x80 | ( val & 0x3f));
3700 output_mode = ASCII;
3702 } else if (c2 == ISO8859_1) {
3703 output_mode = ISO8859_1;
3704 (*o_putc)(c1 | 0x080);
3707 val = e2w_conv(c2, c1);
3709 w16w_conv(val, &c2, &c1, &c0);
3713 if (c0) (*o_putc)(c0);
3719 void w_oconv16(nkf_char c2, nkf_char c1)
3722 output_bom_f = FALSE;
3723 if (output_endian == ENDIAN_LITTLE){
3724 (*o_putc)((unsigned char)'\377');
3728 (*o_putc)((unsigned char)'\377');
3737 if (c2 == ISO8859_1) {
3740 #ifdef NUMCHAR_OPTION
3741 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3742 if (is_unicode_bmp(c1)) {
3743 c2 = (c1 >> 8) & 0xff;
3747 if (c1 <= UNICODE_MAX) {
3748 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3749 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3750 if (output_endian == ENDIAN_LITTLE){
3751 (*o_putc)(c2 & 0xff);
3752 (*o_putc)((c2 >> 8) & 0xff);
3753 (*o_putc)(c1 & 0xff);
3754 (*o_putc)((c1 >> 8) & 0xff);
3756 (*o_putc)((c2 >> 8) & 0xff);
3757 (*o_putc)(c2 & 0xff);
3758 (*o_putc)((c1 >> 8) & 0xff);
3759 (*o_putc)(c1 & 0xff);
3766 nkf_char val = e2w_conv(c2, c1);
3767 c2 = (val >> 8) & 0xff;
3770 if (output_endian == ENDIAN_LITTLE){
3779 void w_oconv32(nkf_char c2, nkf_char c1)
3782 output_bom_f = FALSE;
3783 if (output_endian == ENDIAN_LITTLE){
3784 (*o_putc)((unsigned char)'\377');
3792 (*o_putc)((unsigned char)'\377');
3801 if (c2 == ISO8859_1) {
3803 #ifdef NUMCHAR_OPTION
3804 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3808 c1 = e2w_conv(c2, c1);
3810 if (output_endian == ENDIAN_LITTLE){
3811 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3812 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3813 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3817 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3818 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3819 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3824 void e_oconv(nkf_char c2, nkf_char c1)
3826 #ifdef NUMCHAR_OPTION
3827 if (c2 == 0 && is_unicode_capsule(c1)){
3828 w16e_conv(c1, &c2, &c1);
3829 if (c2 == 0 && is_unicode_capsule(c1)){
3830 if(encode_fallback)(*encode_fallback)(c1);
3838 } else if (c2 == 0) {
3839 output_mode = ASCII;
3841 } else if (c2 == X0201) {
3842 output_mode = JAPANESE_EUC;
3843 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3844 } else if (c2 == ISO8859_1) {
3845 output_mode = ISO8859_1;
3846 (*o_putc)(c1 | 0x080);
3848 } else if (is_eucg3(c2)){
3849 output_mode = JAPANESE_EUC;
3850 #ifdef SHIFTJIS_CP932
3853 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3854 s2e_conv(s2, s1, &c2, &c1);
3859 output_mode = ASCII;
3861 }else if (is_eucg3(c2)){
3864 (*o_putc)((c2 & 0x7f) | 0x080);
3865 (*o_putc)(c1 | 0x080);
3868 (*o_putc)((c2 & 0x7f) | 0x080);
3869 (*o_putc)(c1 | 0x080);
3873 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3874 set_iconv(FALSE, 0);
3875 return; /* too late to rescue this char */
3877 output_mode = JAPANESE_EUC;
3878 (*o_putc)(c2 | 0x080);
3879 (*o_putc)(c1 | 0x080);
3884 nkf_char x0212_shift(nkf_char c)
3889 if (0x75 <= c && c <= 0x7f){
3890 ret = c + (0x109 - 0x75);
3893 if (0x75 <= c && c <= 0x7f){
3894 ret = c + (0x113 - 0x75);
3901 nkf_char x0212_unshift(nkf_char c)
3904 if (0x7f <= c && c <= 0x88){
3905 ret = c + (0x75 - 0x7f);
3906 }else if (0x89 <= c && c <= 0x92){
3907 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
3911 #endif /* X0212_ENABLE */
3913 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3919 if((0x21 <= ndx && ndx <= 0x2F)){
3920 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3921 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3923 }else if(0x6E <= ndx && ndx <= 0x7E){
3924 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3925 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3931 else if(nkf_isgraph(ndx)){
3933 const unsigned short *ptr;
3935 extern const unsigned short *const x0212_shiftjis[];
3937 ptr = x0212_shiftjis[ndx - 0x21];
3939 val = ptr[(c1 & 0x7f) - 0x21];
3948 c2 = x0212_shift(c2);
3950 #endif /* X0212_ENABLE */
3952 if(0x7F < c2) return 1;
3953 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3954 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3958 void s_oconv(nkf_char c2, nkf_char c1)
3960 #ifdef NUMCHAR_OPTION
3961 if (c2 == 0 && is_unicode_capsule(c1)){
3962 w16e_conv(c1, &c2, &c1);
3963 if (c2 == 0 && is_unicode_capsule(c1)){
3964 if(encode_fallback)(*encode_fallback)(c1);
3972 } else if (c2 == 0) {
3973 output_mode = ASCII;
3975 } else if (c2 == X0201) {
3976 output_mode = SHIFT_JIS;
3978 } else if (c2 == ISO8859_1) {
3979 output_mode = ISO8859_1;
3980 (*o_putc)(c1 | 0x080);
3982 } else if (is_eucg3(c2)){
3983 output_mode = SHIFT_JIS;
3984 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3990 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
3991 set_iconv(FALSE, 0);
3992 return; /* too late to rescue this char */
3994 output_mode = SHIFT_JIS;
3995 e2s_conv(c2, c1, &c2, &c1);
3997 #ifdef SHIFTJIS_CP932
3999 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4001 extern const unsigned short cp932inv[2][189];
4003 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4009 #endif /* SHIFTJIS_CP932 */
4012 if (prefix_table[(unsigned char)c1]){
4013 (*o_putc)(prefix_table[(unsigned char)c1]);
4019 void j_oconv(nkf_char c2, nkf_char c1)
4021 #ifdef NUMCHAR_OPTION
4022 if (c2 == 0 && is_unicode_capsule(c1)){
4023 w16e_conv(c1, &c2, &c1);
4024 if (c2 == 0 && is_unicode_capsule(c1)){
4025 if(encode_fallback)(*encode_fallback)(c1);
4031 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4034 (*o_putc)(ascii_intro);
4035 output_mode = ASCII;
4039 } else if (is_eucg3(c2)){
4041 if(output_mode!=X0213_2){
4042 output_mode = X0213_2;
4046 (*o_putc)(X0213_2&0x7F);
4049 if(output_mode!=X0212){
4050 output_mode = X0212;
4054 (*o_putc)(X0212&0x7F);
4057 (*o_putc)(c2 & 0x7f);
4060 } else if (c2==X0201) {
4061 if (output_mode!=X0201) {
4062 output_mode = X0201;
4068 } else if (c2==ISO8859_1) {
4069 /* iso8859 introduction, or 8th bit on */
4070 /* Can we convert in 7bit form using ESC-'-'-A ?
4072 output_mode = ISO8859_1;
4074 } else if (c2 == 0) {
4075 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4078 (*o_putc)(ascii_intro);
4079 output_mode = ASCII;
4083 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4085 if (output_mode!=X0213_1) {
4086 output_mode = X0213_1;
4090 (*o_putc)(X0213_1&0x7F);
4092 }else if (output_mode != X0208) {
4093 output_mode = X0208;
4096 (*o_putc)(kanji_intro);
4103 void base64_conv(nkf_char c2, nkf_char c1)
4105 mime_prechar(c2, c1);
4106 (*o_base64conv)(c2,c1);
4110 static nkf_char broken_buf[3];
4111 static int broken_counter = 0;
4112 static int broken_last = 0;
4113 nkf_char broken_getc(FILE *f)
4117 if (broken_counter>0) {
4118 return broken_buf[--broken_counter];
4121 if (c=='$' && broken_last != ESC
4122 && (input_mode==ASCII || input_mode==X0201)) {
4125 if (c1=='@'|| c1=='B') {
4126 broken_buf[0]=c1; broken_buf[1]=c;
4133 } else if (c=='(' && broken_last != ESC
4134 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4137 if (c1=='J'|| c1=='B') {
4138 broken_buf[0]=c1; broken_buf[1]=c;
4151 nkf_char broken_ungetc(nkf_char c, FILE *f)
4153 if (broken_counter<2)
4154 broken_buf[broken_counter++]=c;
4158 static nkf_char prev_cr = 0;
4160 void cr_conv(nkf_char c2, nkf_char c1)
4164 if (! (c2==0&&c1==NL) ) {
4170 } else if (c1=='\r') {
4172 } else if (c1=='\n') {
4173 if (crmode_f==CRLF) {
4174 (*o_crconv)(0,'\r');
4175 } else if (crmode_f==CR) {
4176 (*o_crconv)(0,'\r');
4180 } else if (c1!='\032' || crmode_f!=NL){
4186 Return value of fold_conv()
4188 \n add newline and output char
4189 \r add newline and output nothing
4192 1 (or else) normal output
4194 fold state in prev (previous character)
4196 >0x80 Japanese (X0208/X0201)
4201 This fold algorithm does not preserve leading spaces in a line.
4202 This is the main difference from fmt.
4205 #define char_size(c2,c1) (c2?2:1)
4207 void fold_conv(nkf_char c2, nkf_char c1)
4210 nkf_char fold_state;
4212 if (c1== '\r' && !fold_preserve_f) {
4213 fold_state=0; /* ignore cr */
4214 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4216 fold_state=0; /* ignore cr */
4217 } else if (c1== BS) {
4218 if (f_line>0) f_line--;
4220 } else if (c2==EOF && f_line != 0) { /* close open last line */
4222 } else if ((c1=='\n' && !fold_preserve_f)
4223 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4224 && fold_preserve_f)) {
4226 if (fold_preserve_f) {
4230 } else if ((f_prev == c1 && !fold_preserve_f)
4231 || (f_prev == '\n' && fold_preserve_f)
4232 ) { /* duplicate newline */
4235 fold_state = '\n'; /* output two newline */
4241 if (f_prev&0x80) { /* Japanese? */
4243 fold_state = 0; /* ignore given single newline */
4244 } else if (f_prev==' ') {
4248 if (++f_line<=fold_len)
4252 fold_state = '\r'; /* fold and output nothing */
4256 } else if (c1=='\f') {
4259 fold_state = '\n'; /* output newline and clear */
4260 } else if ( (c2==0 && c1==' ')||
4261 (c2==0 && c1=='\t')||
4262 (c2=='!'&& c1=='!')) {
4263 /* X0208 kankaku or ascii space */
4264 if (f_prev == ' ') {
4265 fold_state = 0; /* remove duplicate spaces */
4268 if (++f_line<=fold_len)
4269 fold_state = ' '; /* output ASCII space only */
4271 f_prev = ' '; f_line = 0;
4272 fold_state = '\r'; /* fold and output nothing */
4276 prev0 = f_prev; /* we still need this one... , but almost done */
4278 if (c2 || c2==X0201)
4279 f_prev |= 0x80; /* this is Japanese */
4280 f_line += char_size(c2,c1);
4281 if (f_line<=fold_len) { /* normal case */
4284 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4285 f_line = char_size(c2,c1);
4286 fold_state = '\n'; /* We can't wait, do fold now */
4287 } else if (c2==X0201) {
4288 /* simple kinsoku rules return 1 means no folding */
4289 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4290 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4291 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4292 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4293 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4294 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4295 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4297 fold_state = '\n';/* add one new f_line before this character */
4300 fold_state = '\n';/* add one new f_line before this character */
4303 /* kinsoku point in ASCII */
4304 if ( c1==')'|| /* { [ ( */
4315 /* just after special */
4316 } else if (!is_alnum(prev0)) {
4317 f_line = char_size(c2,c1);
4319 } else if ((prev0==' ') || /* ignored new f_line */
4320 (prev0=='\n')|| /* ignored new f_line */
4321 (prev0&0x80)) { /* X0208 - ASCII */
4322 f_line = char_size(c2,c1);
4323 fold_state = '\n';/* add one new f_line before this character */
4325 fold_state = 1; /* default no fold in ASCII */
4329 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4330 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4331 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4332 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4333 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4334 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4335 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4336 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4337 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4338 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4339 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4340 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4341 /* default no fold in kinsoku */
4344 f_line = char_size(c2,c1);
4345 /* add one new f_line before this character */
4348 f_line = char_size(c2,c1);
4350 /* add one new f_line before this character */
4355 /* terminator process */
4356 switch(fold_state) {
4375 nkf_char z_prev2=0,z_prev1=0;
4377 void z_conv(nkf_char c2, nkf_char c1)
4380 /* if (c2) c1 &= 0x7f; assertion */
4382 if (x0201_f && z_prev2==X0201) { /* X0201 */
4383 if (c1==(0xde&0x7f)) { /*
\e$BByE@
\e(B */
4385 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4387 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /*
\e$BH>ByE@
\e(B */
4389 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4393 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4402 if (x0201_f && c2==X0201) {
4403 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4404 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4405 z_prev1 = c1; z_prev2 = c2;
4408 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4413 /* JISX0208 Alphabet */
4414 if (alpha_f && c2 == 0x23 ) {
4416 } else if (alpha_f && c2 == 0x21 ) {
4417 /* JISX0208 Kigou */
4422 } else if (alpha_f&0x4) {
4427 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4433 case '>': entity = ">"; break;
4434 case '<': entity = "<"; break;
4435 case '\"': entity = """; break;
4436 case '&': entity = "&"; break;
4439 while (*entity) (*o_zconv)(0, *entity++);
4449 #define rot13(c) ( \
4451 (c <= 'M') ? (c + 13): \
4452 (c <= 'Z') ? (c - 13): \
4454 (c <= 'm') ? (c + 13): \
4455 (c <= 'z') ? (c - 13): \
4459 #define rot47(c) ( \
4461 ( c <= 'O' ) ? (c + 47) : \
4462 ( c <= '~' ) ? (c - 47) : \
4466 void rot_conv(nkf_char c2, nkf_char c1)
4468 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4474 (*o_rot_conv)(c2,c1);
4477 void hira_conv(nkf_char c2, nkf_char c1)
4481 if (0x20 < c1 && c1 < 0x74) {
4483 (*o_hira_conv)(c2,c1);
4485 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4487 c1 = CLASS_UNICODE | 0x3094;
4488 (*o_hira_conv)(c2,c1);
4491 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4493 (*o_hira_conv)(c2,c1);
4498 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4501 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4503 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4507 (*o_hira_conv)(c2,c1);
4511 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4513 static const nkf_char range[RANGE_NUM_MAX][2] = {
4534 nkf_char start, end, c;
4536 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4540 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4545 for (i = 0; i < RANGE_NUM_MAX; i++) {
4546 start = range[i][0];
4549 if (c >= start && c <= end) {
4554 (*o_iso2022jp_check_conv)(c2,c1);
4558 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4560 const unsigned char *mime_pattern[] = {
4561 (const unsigned char *)"\075?EUC-JP?B?",
4562 (const unsigned char *)"\075?SHIFT_JIS?B?",
4563 (const unsigned char *)"\075?ISO-8859-1?Q?",
4564 (const unsigned char *)"\075?ISO-8859-1?B?",
4565 (const unsigned char *)"\075?ISO-2022-JP?B?",
4566 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4567 #if defined(UTF8_INPUT_ENABLE)
4568 (const unsigned char *)"\075?UTF-8?B?",
4569 (const unsigned char *)"\075?UTF-8?Q?",
4571 (const unsigned char *)"\075?US-ASCII?Q?",
4576 /*
\e$B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u
\e(B */
4577 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4578 e_iconv, s_iconv, 0, 0, 0, 0,
4579 #if defined(UTF8_INPUT_ENABLE)
4585 const nkf_char mime_encode[] = {
4586 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4587 #if defined(UTF8_INPUT_ENABLE)
4594 const nkf_char mime_encode_method[] = {
4595 'B', 'B','Q', 'B', 'B', 'Q',
4596 #if defined(UTF8_INPUT_ENABLE)
4604 #define MAXRECOVER 20
4606 void switch_mime_getc(void)
4608 if (i_getc!=mime_getc) {
4609 i_mgetc = i_getc; i_getc = mime_getc;
4610 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4611 if(mime_f==STRICT_MIME) {
4612 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4613 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4618 void unswitch_mime_getc(void)
4620 if(mime_f==STRICT_MIME) {
4621 i_mgetc = i_mgetc_buf;
4622 i_mungetc = i_mungetc_buf;
4625 i_ungetc = i_mungetc;
4626 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4627 mime_iconv_back = NULL;
4630 nkf_char mime_begin_strict(FILE *f)
4634 const unsigned char *p,*q;
4635 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4637 mime_decode_mode = FALSE;
4638 /* =? has been checked */
4640 p = mime_pattern[j];
4643 for(i=2;p[i]>' ';i++) { /* start at =? */
4644 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4645 /* pattern fails, try next one */
4647 while (mime_pattern[++j]) {
4648 p = mime_pattern[j];
4649 for(k=2;k<i;k++) /* assume length(p) > i */
4650 if (p[k]!=q[k]) break;
4651 if (k==i && nkf_toupper(c1)==p[k]) break;
4653 p = mime_pattern[j];
4654 if (p) continue; /* found next one, continue */
4655 /* all fails, output from recovery buffer */
4663 mime_decode_mode = p[i-2];
4665 mime_iconv_back = iconv;
4666 set_iconv(FALSE, mime_priority_func[j]);
4667 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4669 if (mime_decode_mode=='B') {
4670 mimebuf_f = unbuf_f;
4672 /* do MIME integrity check */
4673 return mime_integrity(f,mime_pattern[j]);
4681 nkf_char mime_getc_buf(FILE *f)
4683 /* we don't keep eof of Fifo, because it contains ?= as
4684 a terminator. It was checked in mime_integrity. */
4685 return ((mimebuf_f)?
4686 (*i_mgetc_buf)(f):Fifo(mime_input++));
4689 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4692 (*i_mungetc_buf)(c,f);
4694 Fifo(--mime_input) = (unsigned char)c;
4698 nkf_char mime_begin(FILE *f)
4703 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4704 /* re-read and convert again from mime_buffer. */
4706 /* =? has been checked */
4708 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4709 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4710 /* We accept any character type even if it is broken up by newlines */
4711 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4712 if (c1=='\n'||c1==' '||c1=='\r'||
4713 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4715 /* Failed. But this could be another MIME preamble */
4723 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4724 if (!(++i<MAXRECOVER) || c1==EOF) break;
4725 if (c1=='b'||c1=='B') {
4726 mime_decode_mode = 'B';
4727 } else if (c1=='q'||c1=='Q') {
4728 mime_decode_mode = 'Q';
4732 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4733 if (!(++i<MAXRECOVER) || c1==EOF) break;
4735 mime_decode_mode = FALSE;
4741 if (!mime_decode_mode) {
4742 /* false MIME preamble, restart from mime_buffer */
4743 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4744 /* Since we are in MIME mode until buffer becomes empty, */
4745 /* we never go into mime_begin again for a while. */
4748 /* discard MIME preamble, and go to MIME mode */
4750 /* do no MIME integrity check */
4751 return c1; /* used only for checking EOF */
4755 void no_putc(nkf_char c)
4760 void debug(const char *str)
4763 fprintf(stderr, "%s\n", str);
4768 void set_input_codename(char *codename)
4772 strcmp(codename, "") != 0 &&
4773 strcmp(codename, input_codename) != 0)
4775 is_inputcode_mixed = TRUE;
4777 input_codename = codename;
4778 is_inputcode_set = TRUE;
4781 #if !defined(PERL_XS) && !defined(WIN32DLL)
4782 void print_guessed_code(char *filename)
4784 char *codename = "BINARY";
4785 if (!is_inputcode_mixed) {
4786 if (strcmp(input_codename, "") == 0) {
4789 codename = input_codename;
4792 if (filename != NULL) printf("%s:", filename);
4793 printf("%s\n", codename);
4799 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4801 nkf_char c1, c2, c3;
4807 if (!nkf_isxdigit(c2)){
4812 if (!nkf_isxdigit(c3)){
4817 return (hex2bin(c2) << 4) | hex2bin(c3);
4820 nkf_char cap_getc(FILE *f)
4822 return hex_getc(':', f, i_cgetc, i_cungetc);
4825 nkf_char cap_ungetc(nkf_char c, FILE *f)
4827 return (*i_cungetc)(c, f);
4830 nkf_char url_getc(FILE *f)
4832 return hex_getc('%', f, i_ugetc, i_uungetc);
4835 nkf_char url_ungetc(nkf_char c, FILE *f)
4837 return (*i_uungetc)(c, f);
4841 #ifdef NUMCHAR_OPTION
4842 nkf_char numchar_getc(FILE *f)
4844 nkf_char (*g)(FILE *) = i_ngetc;
4845 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4856 if (buf[i] == 'x' || buf[i] == 'X'){
4857 for (j = 0; j < 7; j++){
4859 if (!nkf_isxdigit(buf[i])){
4866 c |= hex2bin(buf[i]);
4869 for (j = 0; j < 8; j++){
4873 if (!nkf_isdigit(buf[i])){
4880 c += hex2bin(buf[i]);
4886 return CLASS_UNICODE | c;
4895 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4897 return (*i_nungetc)(c, f);
4901 #ifdef UNICODE_NORMALIZATION
4903 /* Normalization Form C */
4904 nkf_char nfc_getc(FILE *f)
4906 nkf_char (*g)(FILE *f) = i_nfc_getc;
4907 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4908 int i=0, j, k=1, lower, upper;
4910 const nkf_nfchar *array;
4912 extern const struct normalization_pair normalization_table[];
4916 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4917 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4918 while (upper >= lower) {
4919 j = (lower+upper) / 2;
4920 array = normalization_table[j].nfd;
4921 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4922 if (array[k] != buf[k]){
4923 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4930 array = normalization_table[j].nfc;
4931 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4932 buf[i] = (nkf_char)(array[i]);
4943 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4945 return (*i_nfc_ungetc)(c, f);
4947 #endif /* UNICODE_NORMALIZATION */
4953 nkf_char c1, c2, c3, c4, cc;
4954 nkf_char t1, t2, t3, t4, mode, exit_mode;
4955 nkf_char lwsp_count;
4958 nkf_char lwsp_size = 128;
4960 if (mime_top != mime_last) { /* Something is in FIFO */
4961 return Fifo(mime_top++);
4963 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4964 mime_decode_mode=FALSE;
4965 unswitch_mime_getc();
4966 return (*i_getc)(f);
4969 if (mimebuf_f == FIXED_MIME)
4970 exit_mode = mime_decode_mode;
4973 if (mime_decode_mode == 'Q') {
4974 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4976 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
4977 if (c1<=' ' || DEL<=c1) {
4978 mime_decode_mode = exit_mode; /* prepare for quit */
4981 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4985 mime_decode_mode = exit_mode; /* prepare for quit */
4986 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4987 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4988 /* end Q encoding */
4989 input_mode = exit_mode;
4991 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4992 if (lwsp_buf==NULL) {
4993 perror("can't malloc");
4996 while ((c1=(*i_getc)(f))!=EOF) {
5001 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5009 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5010 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5025 lwsp_buf[lwsp_count] = (unsigned char)c1;
5026 if (lwsp_count++>lwsp_size){
5028 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5029 if (lwsp_buf_new==NULL) {
5031 perror("can't realloc");
5034 lwsp_buf = lwsp_buf_new;
5040 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5042 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5043 i_ungetc(lwsp_buf[lwsp_count],f);
5049 if (c1=='='&&c2<' ') { /* this is soft wrap */
5050 while((c1 = (*i_mgetc)(f)) <=' ') {
5051 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5053 mime_decode_mode = 'Q'; /* still in MIME */
5054 goto restart_mime_q;
5057 mime_decode_mode = 'Q'; /* still in MIME */
5061 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5062 if (c2<=' ') return c2;
5063 mime_decode_mode = 'Q'; /* still in MIME */
5064 return ((hex2bin(c2)<<4) + hex2bin(c3));
5067 if (mime_decode_mode != 'B') {
5068 mime_decode_mode = FALSE;
5069 return (*i_mgetc)(f);
5073 /* Base64 encoding */
5075 MIME allows line breaks in the middle of
5076 Base64, but we are very pessimistic in decoding
5077 in unbuf mode because MIME encoded code may be broken by
5078 less or an editor's control sequence (such as ESC-[-K) in unbuffered
5079 mode. Ignore incomplete MIME.
5081 mode = mime_decode_mode;
5082 mime_decode_mode = exit_mode; /* prepare for quit */
5084 while ((c1 = (*i_mgetc)(f))<=' ') {
5089 if ((c2 = (*i_mgetc)(f))<=' ') {
5092 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5093 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5096 if ((c1 == '?') && (c2 == '=')) {
5099 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5100 if (lwsp_buf==NULL) {
5101 perror("can't malloc");
5104 while ((c1=(*i_getc)(f))!=EOF) {
5109 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5117 if ((c1=(*i_getc)(f))!=EOF) {
5121 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5136 lwsp_buf[lwsp_count] = (unsigned char)c1;
5137 if (lwsp_count++>lwsp_size){
5139 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5140 if (lwsp_buf_new==NULL) {
5142 perror("can't realloc");
5145 lwsp_buf = lwsp_buf_new;
5151 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5153 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5154 i_ungetc(lwsp_buf[lwsp_count],f);
5161 if ((c3 = (*i_mgetc)(f))<=' ') {
5164 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5165 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5169 if ((c4 = (*i_mgetc)(f))<=' ') {
5172 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5173 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5177 mime_decode_mode = mode; /* still in MIME sigh... */
5179 /* BASE 64 decoding */
5181 t1 = 0x3f & base64decode(c1);
5182 t2 = 0x3f & base64decode(c2);
5183 t3 = 0x3f & base64decode(c3);
5184 t4 = 0x3f & base64decode(c4);
5185 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5187 Fifo(mime_last++) = (unsigned char)cc;
5188 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5190 Fifo(mime_last++) = (unsigned char)cc;
5191 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5193 Fifo(mime_last++) = (unsigned char)cc;
5198 return Fifo(mime_top++);
/*
 * mime_ungetc: push character c back onto the MIME FIFO.
 *
 * The visible statement pre-decrements mime_top and stores c, narrowed to
 * unsigned char, in that Fifo slot.  The stream argument f is not referenced
 * in the lines shown here.
 * NOTE(review): the braces and the return statement are not part of this
 * listing, so the returned value cannot be confirmed from here.
 */
5201 nkf_char mime_ungetc(nkf_char c, FILE *f)
5203 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: pre-read an encoded word into the FIFO before decoding.
 *
 *   f  input stream, read one character at a time through (*i_getc).
 *   p  NUL-terminated byte string that is copied into the FIFO first.
 *
 * Steps visible in this listing:
 *   - mime_input and mime_last are both reset to mime_top;
 *   - the bytes of p are appended to the FIFO;
 *   - characters are read until EOF, or until (mime_input - mime_top) masked
 *     with MIME_BUF_MASK becomes 0, which the code treats as "buffer full";
 *   - when c is '=' and d is '?', c is stored and the inline comment marks
 *     the word as checked (d is assigned outside the visible lines --
 *     presumably it holds the previously read character, TODO confirm);
 *   - a character outside '+', '/', '=', '?' and the is_alnum() set is
 *     tested for specially (the action taken is not visible here);
 *   - on the incomplete-MIME path the character is stored, mime_last is set
 *     to mime_input, mime_decode_mode is set to 1 and switch_mime_getc() is
 *     called.
 * NOTE(review): local declarations, braces and the return statements are
 * missing from this listing; return values cannot be documented from here.
 */
5207 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5211 /* In buffered mode, read until =? or NL or buffer full
5213 mime_input = mime_top;
5214 mime_last = mime_top;
5216 while(*p) Fifo(mime_input++) = *p++;
5219 while((c=(*i_getc)(f))!=EOF) {
5220 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5221 break; /* buffer full */
5223 if (c=='=' && d=='?') {
5224 /* checked. skip header, start decode */
5225 Fifo(mime_input++) = (unsigned char)c;
5226 /* mime_last_input = mime_input; */
5231 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5233 /* Should we check length mod 4? */
5234 Fifo(mime_input++) = (unsigned char)c;
5237 /* In case of Incomplete MIME, no MIME decode */
5238 Fifo(mime_input++) = (unsigned char)c;
5239 mime_last = mime_input; /* point undecoded buffer */
5240 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5241 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one Base64 alphabet character c to its 6-bit value.
 *
 * Per the inline comments: A..Z give 0-25, a..z give 26-51, 0..9 give 52-61,
 * '+' gives 62 and '/' gives 63.  The code spells the numeric offsets as
 * character constants; in ASCII, 'G' equals 'a' - 26, '4' equals 52,
 * '>' equals 62 and '?' equals 63.
 * NOTE(review): the range tests that select the two letter branches, the
 * declaration of i and the return statement are not part of this listing.
 */
5245 nkf_char base64decode(nkf_char c)
5250 i = c - 'A'; /* A..Z 0-25 */
5252 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5254 } else if (c > '/') {
5255 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5256 } else if (c == '+') {
5257 i = '>' /* 62 */ ; /* + 62 */
5259 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet: the character at index v is the digit for 6-bit value v. */
5264 static const char basis_64[] =
5265 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/*
 * Byte whose low bits are merged into the next Base64 digit by close_mime()
 * and mimeout_addchar().  Where it is assigned is not visible in this
 * listing -- presumably the previously encoded input byte (TODO confirm).
 */
5267 static nkf_char b64c;
/* Threshold used by mime_putc() to decide when mimeout_buf must be drained. */
5268 #define MIMEOUT_BUF_LENGTH (60)
/*
 * Pending output characters.  The array has one slot more than
 * MIMEOUT_BUF_LENGTH because mime_putc() stores a character first and only
 * then tests mimeout_buf_count > MIMEOUT_BUF_LENGTH.
 */
5269 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of valid characters currently held in mimeout_buf. */
5270 int mimeout_buf_count = 0;
/* When non-zero, open_mime() skips its whitespace handling; open_mime() clears it. */
5271 int mimeout_preserve_space = 0;
/*
 * Convert a value in 0..15 to an upper-case hexadecimal digit character.
 * NOTE(review): the argument is not parenthesized inside the expansion, so
 * callers must pass a parenthesized expression (the visible callers do).
 */
5272 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/*
 * open_mime: begin MIME-encoded output for output mode `mode`.
 *
 * Steps visible in this listing:
 *   - p defaults to mime_pattern[0]; mime_encode[] is scanned up to its zero
 *     terminator and, on a match with mode, p becomes mime_pattern[i];
 *   - mimeout_mode is taken from mime_encode_method[i];
 *   - when base64_count exceeds 45, a blank character at mimeout_buf[i] is
 *     written through (*o_mputc) if the buffer is non-empty;
 *   - unless mimeout_preserve_space is set, SPACE/TAB/CR/NL characters found
 *     in mimeout_buf are written through (*o_mputc);
 *   - mimeout_preserve_space is cleared;
 *   - the buffer count is saved in j, reset to 0, and buffered characters
 *     are passed to mime_putc().
 * NOTE(review): declarations of i and j, the emission of the header pattern
 * p, loop headers and closing braces are not part of this listing, so the
 * exact nesting of the steps above cannot be confirmed from here.
 */
5274 void open_mime(nkf_char mode)
5276 const unsigned char *p;
5279 p = mime_pattern[0];
5280 for(i=0;mime_encode[i];i++) {
5281 if (mode == mime_encode[i]) {
5282 p = mime_pattern[i];
5286 mimeout_mode = mime_encode_method[i];
5289 if (base64_count>45) {
5290 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5291 (*o_mputc)(mimeout_buf[i]);
5297 if (!mimeout_preserve_space && mimeout_buf_count>0
5298 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5299 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5303 if (!mimeout_preserve_space) {
5304 for (;i<mimeout_buf_count;i++) {
5305 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5306 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5307 (*o_mputc)(mimeout_buf[i]);
5314 mimeout_preserve_space = FALSE;
5320 j = mimeout_buf_count;
5321 mimeout_buf_count = 0;
5323 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the MIME-encoded output currently in progress.
 *
 * The visible lines switch on mimeout_mode and, in two of the cases, write
 * one last Base64 digit through (*o_mputc): one built from the low 2 bits of
 * b64c shifted left by 4, the other from the low 4 bits shifted left by 2.
 * A later test distinguishes mimeout_f != FIXED_MIME from the case where
 * mimeout_mode is not 'Q'.
 * NOTE(review): the case labels, any padding output and the statements
 * guarded by the final tests are not part of this listing.
 */
5327 void close_mime(void)
5337 switch(mimeout_mode) {
5342 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5348 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5354 if (mimeout_f!=FIXED_MIME) {
5356 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one character c according to mimeout_mode and
 * write the result through (*o_mputc).
 *
 * Visible behaviour:
 *   - in one branch a character for which nkf_isalnum(c) is false is written
 *     as two hexadecimal digits via itoh4(), high nibble first;
 *   - the Base64 branches index basis_64[] with: the top 6 bits of c; the
 *     low 2 bits of b64c joined with the high nibble of c; the low 4 bits of
 *     b64c joined with the top 2 bits of c; and finally the low 6 bits of c.
 * NOTE(review): case labels, the escape prefix written before the hex
 * digits, updates of b64c and mimeout_mode, and line-length accounting are
 * not part of this listing.
 */
5361 void mimeout_addchar(nkf_char c)
5363 switch(mimeout_mode) {
5368 } else if(!nkf_isalnum(c)) {
5370 (*o_mputc)(itoh4(((c>>4)&0xf)));
5371 (*o_mputc)(itoh4((c&0xf)));
5380 (*o_mputc)(basis_64[c>>2]);
5385 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5391 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5392 (*o_mputc)(basis_64[c & 0x3F]);
/* Character pair most recently passed to mime_prechar(), stored at its end. */
5403 nkf_char mime_lastchar2, mime_lastchar1;
/*
 * mime_prechar: inspect the next character pair (c2, c1) before it is
 * encoded, inserting separators through (*o_base64conv) where needed.
 *
 * Visible behaviour:
 *   - when base64_count plus mimeout_buf_count/3*4 exceeds 66, the calls
 *     (EOF,0), (0,NL) and (0,SPACE) are issued in that order -- presumably
 *     to close the encoded word and continue on a folded line (TODO confirm);
 *   - an alternative branch is commented out in the source;
 *   - when c2 is non-zero, mime_lastchar2 is 0 and mime_lastchar1 is a
 *     non-zero, non-space character, (0,SPACE) is issued;
 *   - finally c2 and c1 are saved in mime_lastchar2 and mime_lastchar1.
 * NOTE(review): the conditions enclosing these tests and the end of the
 * commented-out branch are not part of this listing.
 */
5405 void mime_prechar(nkf_char c2, nkf_char c1)
5409 if (base64_count + mimeout_buf_count/3*4> 66){
5410 (*o_base64conv)(EOF,0);
5411 (*o_base64conv)(0,NL);
5412 (*o_base64conv)(0,SPACE);
5414 }/*else if (mime_lastchar2){
5415 if (c1 <=DEL && !nkf_isspace(c1)){
5416 (*o_base64conv)(0,SPACE);
5420 if (c2 && mime_lastchar2 == 0
5421 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5422 (*o_base64conv)(0,SPACE);
5425 mime_lastchar2 = c2;
5426 mime_lastchar1 = c1;
/*
 * mime_putc: output-side MIME encoder; accepts one character c, and the
 * code has separate handling for c == EOF.
 *
 * Structure visible in this listing:
 *   - mimeout_f == FIXED_MIME: base64_count > 71 is tested separately for
 *     mimeout_mode 'Q' (with a check that c is neither CR nor NL) and for
 *     the other modes, followed by distinct handling of c == EOF and
 *     c != EOF.
 *   - otherwise, c == EOF: the buffer count is saved in j and reset to 0,
 *     and buffered characters are passed to mimeout_addchar().
 *   - mimeout_mode == 'Q': characters with c <= DEL are tested for when
 *     output_mode is ASCII or ISO8859_1.
 *   - lastchar is taken from the last buffered character when the buffer is
 *     not empty.
 *   - !mimeout_mode: for c <= DEL in ASCII/ISO8859_1 output, characters are
 *     collected in mimeout_buf; the buffer is written raw through
 *     (*o_mputc) around line breaks and open_mime(output_mode) is called
 *     when the buffer grows past MIMEOUT_BUF_LENGTH.  Further down, the
 *     buffer is flushed raw depending on lastchar (CR/NL or SPACE) and
 *     open_mime(output_mode) is called.
 *   - remaining modes (source comment: 'B', 1, 2): buffered characters are
 *     either encoded with mimeout_addchar() or written raw through
 *     (*o_mputc), depending on lastchar, on whether c is blank or printable,
 *     and on the buffer contents.
 * NOTE(review): local declarations, else branches, loop headers, returns
 * and closing braces are missing from this listing, so the nesting of the
 * fragments above is approximate and must be confirmed against the full
 * source before relying on it.
 */
5429 void mime_putc(nkf_char c)
5434 if (mimeout_f == FIXED_MIME){
5435 if (mimeout_mode == 'Q'){
5436 if (base64_count > 71){
5437 if (c!=CR && c!=NL) {
5444 if (base64_count > 71){
5449 if (c == EOF) { /* c==EOF */
5453 if (c != EOF) { /* c!=EOF */
5459 /* mimeout_f != FIXED_MIME */
5461 if (c == EOF) { /* c==EOF */
5462 j = mimeout_buf_count;
5463 mimeout_buf_count = 0;
5467 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5470 mimeout_addchar(mimeout_buf[i]);
5474 mimeout_addchar(mimeout_buf[i]);
5478 mimeout_addchar(mimeout_buf[i]);
5484 if (mimeout_mode=='Q') {
5485 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5497 if (mimeout_buf_count > 0){
5498 lastchar = mimeout_buf[mimeout_buf_count - 1];
5503 if (!mimeout_mode) {
5504 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5505 if (nkf_isspace(c)) {
5506 if (c==CR || c==NL) {
5509 for (i=0;i<mimeout_buf_count;i++) {
5510 (*o_mputc)(mimeout_buf[i]);
5511 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5517 mimeout_buf[0] = (char)c;
5518 mimeout_buf_count = 1;
5520 if (base64_count > 1
5521 && base64_count + mimeout_buf_count > 76){
5524 if (!nkf_isspace(mimeout_buf[0])){
5529 mimeout_buf[mimeout_buf_count++] = (char)c;
5530 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5531 open_mime(output_mode);
5536 if (lastchar==CR || lastchar == NL){
5537 for (i=0;i<mimeout_buf_count;i++) {
5538 (*o_mputc)(mimeout_buf[i]);
5541 mimeout_buf_count = 0;
5543 if (lastchar==SPACE) {
5544 for (i=0;i<mimeout_buf_count-1;i++) {
5545 (*o_mputc)(mimeout_buf[i]);
5548 mimeout_buf[0] = SPACE;
5549 mimeout_buf_count = 1;
5551 open_mime(output_mode);
5554 /* mimeout_mode == 'B', 1, 2 */
5555 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5556 if (lastchar == CR || lastchar == NL){
5557 if (nkf_isblank(c)) {
5558 for (i=0;i<mimeout_buf_count;i++) {
5559 mimeout_addchar(mimeout_buf[i]);
5561 mimeout_buf_count = 0;
5562 } else if (SPACE<c && c<DEL) {
5564 for (i=0;i<mimeout_buf_count;i++) {
5565 (*o_mputc)(mimeout_buf[i]);
5568 mimeout_buf_count = 0;
5571 if (c==SPACE || c==TAB || c==CR || c==NL) {
5572 for (i=0;i<mimeout_buf_count;i++) {
5573 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5575 for (i=0;i<mimeout_buf_count;i++) {
5576 (*o_mputc)(mimeout_buf[i]);
5579 mimeout_buf_count = 0;
5582 mimeout_buf[mimeout_buf_count++] = (char)c;
5583 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5585 for (i=0;i<mimeout_buf_count;i++) {
5586 (*o_mputc)(mimeout_buf[i]);
5589 mimeout_buf_count = 0;
5593 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5594 mimeout_buf[mimeout_buf_count++] = (char)c;
5595 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5596 j = mimeout_buf_count;
5597 mimeout_buf_count = 0;
5599 mimeout_addchar(mimeout_buf[i]);
5606 if (mimeout_buf_count>0) {
5607 j = mimeout_buf_count;
5608 mimeout_buf_count = 0;
5610 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5612 mimeout_addchar(mimeout_buf[i]);
5618 (*o_mputc)(mimeout_buf[i]);
5620 open_mime(output_mode);
/*
 * State reset compiled only when PERL_XS or WIN32DLL is defined.
 * NOTE(review): the function header is not part of this listing --
 * presumably this is the body of reinit(); confirm against the full source.
 *
 * The visible assignments put the converter's global state back to fixed
 * values:
 *   - MIME input: mime_f = STRICT_MIME, mime_decode_f and mime_decode_mode
 *     cleared;
 *   - Unicode options (under the UTF8_* macros): map selection, fallback
 *     handler, substitution character '?', big-endian input/output, no BOM;
 *   - input-code detection flags and input_codename ("");
 *   - all 256 entries of prefix_table zeroed, mimeout_buf_count zeroed;
 *   - escape-sequence and folding defaults (DEFAULT_J, DEFAULT_R,
 *     FOLD_MARGIN) and the default output converter (DEFAULT_CONV);
 *   - every optional output-filter hook pointed at no_connection;
 *   - the unget/get hooks pointed at std_ungetc / std_getc;
 *   - output_mode set to ASCII.
 * Many further assignments, and the matching #endif lines, are not shown.
 */
5627 #if defined(PERL_XS) || defined(WIN32DLL)
5631 struct input_code *p = input_code_list;
5644 mime_f = STRICT_MIME;
5645 mime_decode_f = FALSE;
5650 #if defined(MSDOS) || defined(__OS2__)
5655 iso2022jp_f = FALSE;
5656 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5657 ms_ucs_map_f = UCS_MAP_ASCII;
5659 #ifdef UTF8_INPUT_ENABLE
5660 no_cp932ext_f = FALSE;
5661 no_best_fit_chars_f = FALSE;
5662 encode_fallback = NULL;
5663 unicode_subchar = '?';
5664 input_endian = ENDIAN_BIG;
5666 #ifdef UTF8_OUTPUT_ENABLE
5667 output_bom_f = FALSE;
5668 output_endian = ENDIAN_BIG;
5670 #ifdef UNICODE_NORMALIZATION
5683 is_inputcode_mixed = FALSE;
5684 is_inputcode_set = FALSE;
5688 #ifdef SHIFTJIS_CP932
5698 for (i = 0; i < 256; i++){
5699 prefix_table[i] = 0;
5703 mimeout_buf_count = 0;
5708 fold_preserve_f = FALSE;
5711 kanji_intro = DEFAULT_J;
5712 ascii_intro = DEFAULT_R;
5713 fold_margin = FOLD_MARGIN;
5714 output_conv = DEFAULT_CONV;
5715 oconv = DEFAULT_CONV;
5716 o_zconv = no_connection;
5717 o_fconv = no_connection;
5718 o_crconv = no_connection;
5719 o_rot_conv = no_connection;
5720 o_hira_conv = no_connection;
5721 o_base64conv = no_connection;
5722 o_iso2022jp_check_conv = no_connection;
5725 i_ungetc = std_ungetc;
5727 i_bungetc = std_ungetc;
5730 i_mungetc = std_ungetc;
5731 i_mgetc_buf = std_getc;
5732 i_mungetc_buf = std_ungetc;
5733 output_mode = ASCII;
5736 mime_decode_mode = FALSE;
5742 z_prev2=0,z_prev1=0;
5744 iconv_for_check = 0;
5746 input_codename = "";
/*
 * no_connection: two-argument placeholder used as the default target of the
 * output-filter hooks.  It forwards c2 and c1 to no_connection2() with 0 as
 * the third argument and ignores the result.
 */
5753 void no_connection(nkf_char c2, nkf_char c1)
5755 no_connection2(c2,c1,0);
/*
 * no_connection2: report that a conversion hook was invoked without a real
 * converter attached.  Writes a fixed diagnostic with fprintf(stderr, ...);
 * c2, c1 and c0 are not used in the visible lines.  The trailing return 0
 * is marked LINT in the source.
 * NOTE(review): the line between the message and the return is not part of
 * this listing, so whether control actually reaches the return cannot be
 * confirmed from here.
 */
5758 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5760 fprintf(stderr,"nkf internal module connection failure.\n");
5762 return 0; /* LINT */
/*
 * Route the fprintf calls below through dllprintf.
 * NOTE(review): the preprocessor guard around this #define is not part of
 * this listing -- presumably it applies to the DLL build only; confirm.
 */
5767 #define fprintf dllprintf
/*
 * Command-line help text, written line by line with fprintf(stderr, ...).
 * NOTE(review): the function header is not part of this listing --
 * presumably this is the body of usage(); confirm against the full source.
 *
 * The "j,s,e,w" line is selected by whichever DEFAULT_CODE_* macro is
 * defined, so the "(DEFAULT)" marker follows the build's default output
 * code.  Lines describing UTF-8 options, numeric character references and
 * fallback handling appear only when the corresponding feature macro
 * (UTF8_OUTPUT_ENABLE, UTF8_INPUT_ENABLE, NUMCHAR_OPTION) is defined.
 * The matching #endif lines are not shown here.
 */
5771 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5772 fprintf(stderr,"Flags:\n");
5773 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5774 #ifdef DEFAULT_CODE_SJIS
5775 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5777 #ifdef DEFAULT_CODE_JIS
5778 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5780 #ifdef DEFAULT_CODE_EUC
5781 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5783 #ifdef DEFAULT_CODE_UTF8
5784 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5786 #ifdef UTF8_OUTPUT_ENABLE
5787 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5789 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5790 #ifdef UTF8_INPUT_ENABLE
5791 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5793 fprintf(stderr,"t no conversion\n");
5794 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5795 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5796 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5797 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5798 fprintf(stderr,"v Show this usage. V: show version\n");
5799 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5800 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5801 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5802 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5803 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5804 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5805 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5806 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5808 fprintf(stderr,"T Text mode output\n");
5810 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
/* NOTE(review): "charactor" in the next message is a misspelling of "character"; it is user-visible text and is left unchanged here. */
5811 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5812 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5813 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5814 fprintf(stderr,"\n");
5815 fprintf(stderr,"Long name options\n");
5816 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5817 fprintf(stderr," Specify the input or output codeset\n");
5818 fprintf(stderr," --fj --unix --mac --windows\n");
5819 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5820 fprintf(stderr," Convert for the system or code\n");
5821 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5822 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5823 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5825 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5827 #ifdef NUMCHAR_OPTION
5828 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5830 #ifdef UTF8_INPUT_ENABLE
5831 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5832 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5835 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5836 fprintf(stderr," Overwrite original listed files by filtered result\n");
5837 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5839 fprintf(stderr," -g --guess Guess the input code\n");
5840 fprintf(stderr," --help --version Show this help/the version\n");
5841 fprintf(stderr," For more information, see also man nkf\n");
5842 fprintf(stderr,"\n");
/*
 * Version banner: formats NKF_VERSION and NKF_RELEASE_DATE into the
 * "Network Kanji Filter Version ..." line, then prints CopyRight on its own
 * line, both via fprintf(stderr, ...).
 * NOTE(review): the function header is not part of this listing --
 * presumably this is the body of version().  The three #if tests sit between
 * pieces of the first format string; the text each one contributes and the
 * matching #endif lines are not shown here.
 */
5848 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5849 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
5852 #if defined(MSDOS) && defined(__WIN16__)
5855 #if defined(MSDOS) && defined(__WIN32__)
5861 ,NKF_VERSION,NKF_RELEASE_DATE);
5862 fprintf(stderr,"\n%s\n",CopyRight);
5867 ** Patch authors
5868 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5869 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5870 ** ohta@src.ricoh.co.jp (Junn Ohta)
5871 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5872 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5873 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5874 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5875 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5876 ** GHG00637@nifty-serve.or.jp (COW)