1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** About UTF-8 support
31 ** This version can be used as a drop-in replacement for the previous nkf.
32 ** When it is started as, for example, nkf -e, input that automatic detection judges to be UTF-8
33 ** is converted to euc-jp as it is.
35 ** It is still quite likely to contain bugs
36 ** (especially in automatic detection, mixed encodings and error handling).
38 ** If you find any problem, please report it to
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 ** (the address given on the line above).
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.110 2006/09/15 11:04:36 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2006-09-15"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2006 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
264 #define is_alnum(c) \
265 (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9'))
267 /* I don't trust portability of toupper. NOTE: the macros in this group expand their argument unparenthesized and, in most cases, more than once, so pass only simple, side-effect-free operands. */
268 #define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c)
269 #define nkf_isoctal(c) ('0'<=c && c<='7')
270 #define nkf_isdigit(c) ('0'<=c && c<='9')
271 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=c && c<='f') || ('A'<=c && c <= 'F'))
272 #define nkf_isblank(c) (c == SPACE || c == TAB)
273 #define nkf_isspace(c) (nkf_isblank(c) || c == CR || c == NL)
274 #define nkf_isalpha(c) (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
275 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
276 #define nkf_isprint(c) (' '<=c && c<='~')
277 #define nkf_isgraph(c) ('!'<=c && c<='~')
278 #define hex2bin(c) (('0'<=c&&c<='9') ? (c-'0') : \
279 ('A'<=c&&c<='F') ? (c-'A'+10) : \
280 ('a'<=c&&c<='f') ? (c-'a'+10) : 0 )
281 #define is_eucg3(c2) (((unsigned short)c2 >> 8) == SS3)
283 #define CP932_TABLE_BEGIN 0xFA
284 #define CP932_TABLE_END 0xFC
285 #define CP932INV_TABLE_BEGIN 0xED
286 #define CP932INV_TABLE_END 0xEE
287 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
355 #define UCS_MAP_ASCII 0
357 #define UCS_MAP_CP932 2
358 static int ms_ucs_map_f = UCS_MAP_ASCII;
360 #ifdef UTF8_INPUT_ENABLE
361 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
362 static int no_cp932ext_f = FALSE;
363 /* ignore ZERO WIDTH NO-BREAK SPACE */
364 static int no_best_fit_chars_f = FALSE;
365 static int input_endian = ENDIAN_BIG;
366 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
367 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
368 static void encode_fallback_html(nkf_char c);
369 static void encode_fallback_xml(nkf_char c);
370 static void encode_fallback_java(nkf_char c);
371 static void encode_fallback_perl(nkf_char c);
372 static void encode_fallback_subchar(nkf_char c);
373 static void (*encode_fallback)(nkf_char c) = NULL;
374 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
375 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
376 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
377 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
379 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
380 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
381 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
382 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
383 static void w_status(struct input_code *, nkf_char);
385 #ifdef UTF8_OUTPUT_ENABLE
386 static int output_bom_f = FALSE;
387 static int output_endian = ENDIAN_BIG;
388 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
389 static void w_oconv(nkf_char c2,nkf_char c1);
390 static void w_oconv16(nkf_char c2,nkf_char c1);
391 static void w_oconv32(nkf_char c2,nkf_char c1);
393 static void e_oconv(nkf_char c2,nkf_char c1);
394 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
395 static void s_oconv(nkf_char c2,nkf_char c1);
396 static void j_oconv(nkf_char c2,nkf_char c1);
397 static void fold_conv(nkf_char c2,nkf_char c1);
398 static void cr_conv(nkf_char c2,nkf_char c1);
399 static void z_conv(nkf_char c2,nkf_char c1);
400 static void rot_conv(nkf_char c2,nkf_char c1);
401 static void hira_conv(nkf_char c2,nkf_char c1);
402 static void base64_conv(nkf_char c2,nkf_char c1);
403 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
404 static void no_connection(nkf_char c2,nkf_char c1);
405 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
407 static void code_score(struct input_code *ptr);
408 static void code_status(nkf_char c);
410 static void std_putc(nkf_char c);
411 static nkf_char std_getc(FILE *f);
412 static nkf_char std_ungetc(nkf_char c,FILE *f);
414 static nkf_char broken_getc(FILE *f);
415 static nkf_char broken_ungetc(nkf_char c,FILE *f);
417 static nkf_char mime_begin(FILE *f);
418 static nkf_char mime_getc(FILE *f);
419 static nkf_char mime_ungetc(nkf_char c,FILE *f);
421 static void switch_mime_getc(void);
422 static void unswitch_mime_getc(void);
423 static nkf_char mime_begin_strict(FILE *f);
424 static nkf_char mime_getc_buf(FILE *f);
425 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
426 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
428 static nkf_char base64decode(nkf_char c);
429 static void mime_prechar(nkf_char c2, nkf_char c1);
430 static void mime_putc(nkf_char c);
431 static void open_mime(nkf_char c);
432 static void close_mime(void);
433 static void eof_mime(void);
434 static void mimeout_addchar(nkf_char c);
436 static void usage(void);
437 static void version(void);
439 static void options(unsigned char *c);
440 #if defined(PERL_XS) || defined(WIN32DLL)
441 static void reinit(void);
446 #if !defined(PERL_XS) && !defined(WIN32DLL)
447 static unsigned char stdibuf[IOBUF_SIZE];
448 static unsigned char stdobuf[IOBUF_SIZE];
450 static unsigned char hold_buf[HOLD_SIZE*2];
451 static int hold_count = 0;
453 /* MIME preprocessor fifo */
455 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
456 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
457 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
458 static unsigned char mime_buf[MIME_BUF_SIZE];
459 static unsigned int mime_top = 0;
460 static unsigned int mime_last = 0; /* decoded */
461 static unsigned int mime_input = 0; /* undecoded */
462 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
465 static int unbuf_f = FALSE;
466 static int estab_f = FALSE;
467 static int nop_f = FALSE;
468 static int binmode_f = TRUE; /* binary mode */
469 static int rot_f = FALSE; /* ROT13/47 mode */
470 static int hira_f = FALSE; /* hiragana/katakana conversion */
471 static int input_f = FALSE; /* non fixed input code */
472 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
473 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
474 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
475 static int mimebuf_f = FALSE; /* MIME buffered input */
476 static int broken_f = FALSE; /* convert ESC-less broken JIS */
477 static int iso8859_f = FALSE; /* ISO8859 through */
478 static int mimeout_f = FALSE; /* base64 mode */
479 #if defined(MSDOS) || defined(__OS2__)
480 static int x0201_f = TRUE; /* Assume JISX0201 kana */
482 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
484 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
486 #ifdef UNICODE_NORMALIZATION
487 static int nfc_f = FALSE;
488 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc */
489 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
490 static nkf_char nfc_getc(FILE *f);
491 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
495 static int cap_f = FALSE;
496 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
497 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
498 static nkf_char cap_getc(FILE *f);
499 static nkf_char cap_ungetc(nkf_char c,FILE *f);
501 static int url_f = FALSE;
502 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
503 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
504 static nkf_char url_getc(FILE *f);
505 static nkf_char url_ungetc(nkf_char c,FILE *f);
508 #if defined(INT_IS_SHORT)
509 #define NKF_INT32_C(n) (n##L)
511 #define NKF_INT32_C(n) (n)
513 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
514 #define CLASS_MASK NKF_INT32_C(0xFF000000)
515 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
516 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
517 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
518 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
519 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
521 #ifdef NUMCHAR_OPTION
522 static int numchar_f = FALSE;
523 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ngetc */
524 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
525 static nkf_char numchar_getc(FILE *f);
526 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
530 static int noout_f = FALSE;
531 static void no_putc(nkf_char c);
532 static nkf_char debug_f = FALSE;
533 static void debug(const char *str);
534 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
537 static int guess_f = FALSE;
539 static void print_guessed_code(char *filename);
541 static void set_input_codename(char *codename);
542 static int is_inputcode_mixed = FALSE;
543 static int is_inputcode_set = FALSE;
546 static int exec_f = 0;
549 #ifdef SHIFTJIS_CP932
550 /* invert IBM extended characters to others */
551 static int cp51932_f = TRUE;
553 /* invert NEC-selected IBM extended characters to IBM extended characters */
554 static int cp932inv_f = TRUE;
556 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
557 #endif /* SHIFTJIS_CP932 */
560 static int x0212_f = FALSE;
561 static nkf_char x0212_shift(nkf_char c);
562 static nkf_char x0212_unshift(nkf_char c);
564 static int x0213_f = FALSE;
566 static unsigned char prefix_table[256];
568 static void set_code_score(struct input_code *ptr, nkf_char score);
569 static void clr_code_score(struct input_code *ptr, nkf_char score);
570 static void status_disable(struct input_code *ptr);
571 static void status_push_ch(struct input_code *ptr, nkf_char c);
572 static void status_clear(struct input_code *ptr);
573 static void status_reset(struct input_code *ptr);
574 static void status_reinit(struct input_code *ptr);
575 static void status_check(struct input_code *ptr, nkf_char c);
576 static void e_status(struct input_code *, nkf_char);
577 static void s_status(struct input_code *, nkf_char);
579 struct input_code input_code_list[] = {
580 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
581 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
582 #ifdef UTF8_INPUT_ENABLE
583 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
584 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
\r
585 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
\r
590 static int mimeout_mode = 0;
591 static int base64_count = 0;
593 /* X0208 -> ASCII converter */
596 static int f_line = 0; /* chars in line */
597 static int f_prev = 0;
598 static int fold_preserve_f = FALSE; /* preserve new lines */
599 static int fold_f = FALSE;
600 static int fold_len = 0;
603 static unsigned char kanji_intro = DEFAULT_J;
604 static unsigned char ascii_intro = DEFAULT_R;
608 #define FOLD_MARGIN 10
609 #define DEFAULT_FOLD 60
611 static int fold_margin = FOLD_MARGIN;
615 #ifdef DEFAULT_CODE_JIS
616 # define DEFAULT_CONV j_oconv
618 #ifdef DEFAULT_CODE_SJIS
619 # define DEFAULT_CONV s_oconv
621 #ifdef DEFAULT_CODE_EUC
622 # define DEFAULT_CONV e_oconv
624 #ifdef DEFAULT_CODE_UTF8
625 # define DEFAULT_CONV w_oconv
628 /* process default */
629 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
631 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
632 /* s_iconv or oconv */
633 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
635 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
636 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
637 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
643 /* static redirections */
645 static void (*o_putc)(nkf_char c) = std_putc;
647 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
648 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
650 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of bgetc */
651 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
653 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
655 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
656 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
658 /* for strict mime */
659 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
660 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
663 static int output_mode = ASCII, /* output kanji mode */
664 input_mode = ASCII, /* input kanji mode */
665 shift_mode = FALSE; /* TRUE shift out, or X0201 */
666 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
668 /* X0201 / X0208 conversion tables */
670 /* X0201 kana conversion table */
673 unsigned char cv[]= {
674 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
675 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
676 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
677 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
678 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
679 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
680 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
681 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
682 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
683 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
684 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
685 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
686 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
687 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
688 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
689 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
693 /* X0201 kana conversion table for dakuten (voiced sound mark) */
696 unsigned char dv[]= {
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
702 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
703 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
704 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
705 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
706 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
707 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
708 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
715 /* X0201 kana conversion table for han-dakuten (semi-voiced sound mark) */
718 unsigned char ev[]= {
719 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
720 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
730 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
732 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
738 /* X0208 kigou conversion table */
739 /* 0x8140 - 0x819e */
741 unsigned char fv[] = {
743 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
744 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
745 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
746 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
747 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
748 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
749 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
750 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
751 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
752 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
753 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
754 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
760 static int file_out_f = FALSE;
762 static int overwrite_f = FALSE;
763 static int preserve_time_f = FALSE;
764 static int backup_f = FALSE;
765 static char *backup_suffix = "";
766 static char *get_backup_filename(const char *suffix, const char *filename);
769 static int crmode_f = 0; /* CR, NL, CRLF */
770 #ifdef EASYWIN /*Easy Win */
771 static int end_check;
774 #define STD_GC_BUFSIZE (256)
775 nkf_char std_gc_buf[STD_GC_BUFSIZE];
779 #include "nkf32dll.c"
780 #elif defined(PERL_XS)
782 int main(int argc, char **argv)
787 char *outfname = NULL;
790 #ifdef EASYWIN /*Easy Win */
791 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
794 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
795 cp = (unsigned char *)*argv;
800 if (pipe(fds) < 0 || (pid = fork()) < 0){
811 execvp(argv[1], &argv[1]);
825 if(x0201_f == WISH_TRUE)
826 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
828 if (binmode_f == TRUE)
829 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
830 if (freopen("","wb",stdout) == NULL)
837 setbuf(stdout, (char *) NULL);
839 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
842 if (binmode_f == TRUE)
843 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
844 if (freopen("","rb",stdin) == NULL) return (-1);
848 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
852 kanji_convert(stdin);
853 if (guess_f) print_guessed_code(NULL);
858 is_inputcode_mixed = FALSE;
859 is_inputcode_set = FALSE;
864 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
873 /* reopen file for stdout */
874 if (file_out_f == TRUE) {
877 outfname = malloc(strlen(origfname)
878 + strlen(".nkftmpXXXXXX")
884 strcpy(outfname, origfname);
888 for (i = strlen(outfname); i; --i){
889 if (outfname[i - 1] == '/'
890 || outfname[i - 1] == '\\'){
896 strcat(outfname, "ntXXXXXX");
898 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
901 strcat(outfname, ".nkftmpXXXXXX");
902 fd = mkstemp(outfname);
905 || (fd_backup = dup(fileno(stdout))) < 0
906 || dup2(fd, fileno(stdout)) < 0
917 outfname = "nkf.out";
920 if(freopen(outfname, "w", stdout) == NULL) {
924 if (binmode_f == TRUE) {
925 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
926 if (freopen("","wb",stdout) == NULL)
933 if (binmode_f == TRUE)
934 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
935 if (freopen("","rb",fin) == NULL)
940 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
944 char *filename = NULL;
946 if (nfiles > 1) filename = origfname;
947 if (guess_f) print_guessed_code(filename);
953 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
961 if (dup2(fd_backup, fileno(stdout)) < 0){
964 if (stat(origfname, &sb)) {
965 fprintf(stderr, "Can't stat %s\n", origfname);
967 /* restore the original file's permissions */
968 if (chmod(outfname, sb.st_mode)) {
969 fprintf(stderr, "Can't set permission %s\n", outfname);
972 /* restore the original file's timestamps */
974 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
975 tb[0] = tb[1] = sb.st_mtime;
976 if (utime(outfname, tb)) {
977 fprintf(stderr, "Can't set timestamp %s\n", outfname);
980 tb.actime = sb.st_atime;
981 tb.modtime = sb.st_mtime;
982 if (utime(outfname, &tb)) {
983 fprintf(stderr, "Can't set timestamp %s\n", outfname);
988 char *backup_filename = get_backup_filename(backup_suffix, origfname);
990 unlink(backup_filename);
992 if (rename(origfname, backup_filename)) {
993 perror(backup_filename);
994 fprintf(stderr, "Can't rename %s to %s\n",
995 origfname, backup_filename);
999 if (unlink(origfname)){
1004 if (rename(outfname, origfname)) {
1006 fprintf(stderr, "Can't rename %s to %s\n",
1007 outfname, origfname);
1015 #ifdef EASYWIN /*Easy Win */
1016 if (file_out_f == FALSE)
1017 scanf("%d",&end_check);
1020 #else /* for Other OS */
1021 if (file_out_f == TRUE)
1023 #endif /*Easy Win */
1026 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename` from `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters are copied literally (suffix "*.orig"
 * with filename "a.txt" gives "a.txt.orig"; "old_*" gives "old_a.txt").
 * If `suffix` contains no '*', it is simply appended to `filename`.
 *
 * Returns a malloc()ed, NUL-terminated string that the caller owns, or NULL
 * (after reporting through perror()) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t total;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* every '*' disappears and is replaced by one copy of filename */
        total = (suffix_length - asterisk_count)
              + asterisk_count * filename_length;
    } else {
        total = filename_length + suffix_length;
    }

    /*
     * The no-asterisk branch used to call malloc( + 1), i.e. it allocated a
     * single byte and then copied the whole name into it.  Both branches now
     * share one correctly sized, checked allocation.
     */
    backup_filename = malloc(total + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
    } else {
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
    }
    backup_filename[total] = '\0';

    return backup_filename;
}
1094 {"katakana-hiragana","h3"},
1101 #ifdef UTF8_OUTPUT_ENABLE
1111 {"fb-subchar=", ""},
1113 #ifdef UTF8_INPUT_ENABLE
1114 {"utf8-input", "W"},
1115 {"utf16-input", "W16"},
1116 {"no-cp932ext", ""},
1117 {"no-best-fit-chars",""},
1119 #ifdef UNICODE_NORMALIZATION
1120 {"utf8mac-input", ""},
1132 #ifdef NUMCHAR_OPTION
1133 {"numchar-input", ""},
1139 #ifdef SHIFTJIS_CP932
1149 static int option_mode = 0;
1151 void options(unsigned char *cp)
1155 unsigned char *cp_back = NULL;
1160 while(*cp && *cp++!='-');
1161 while (*cp || cp_back) {
1169 case '-': /* literal options */
1170 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1174 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1175 p = (unsigned char *)long_option[i].name;
1176 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1177 if (*p == cp[j] || cp[j] == ' '){
1184 while(*cp && *cp != SPACE && cp++);
1185 if (long_option[i].alias[0]){
1187 cp = (unsigned char *)long_option[i].alias;
1189 if (strcmp(long_option[i].name, "ic=") == 0){
1190 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1191 codeset[i] = nkf_toupper(p[i]);
1194 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1195 strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1196 strcmp(codeset, "CP50220") == 0 ||
1197 strcmp(codeset, "CP50221") == 0 ||
1198 strcmp(codeset, "CP50222") == 0 ||
1199 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1200 input_f = JIS_INPUT;
1201 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1202 input_f = JIS_INPUT;
1206 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1207 input_f = JIS_INPUT;
1212 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1213 input_f = SJIS_INPUT;
1214 if (x0201_f==NO_X0201) x0201_f=TRUE;
1215 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1216 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1217 strcmp(codeset, "CP932") == 0 ||
1218 strcmp(codeset, "MS932") == 0){
1219 input_f = SJIS_INPUT;
1221 #ifdef SHIFTJIS_CP932
1224 #ifdef UTF8_OUTPUT_ENABLE
1225 ms_ucs_map_f = UCS_MAP_CP932;
1227 }else if(strcmp(codeset, "EUCJP") == 0 ||
1228 strcmp(codeset, "EUC-JP") == 0){
1229 input_f = EUC_INPUT;
1230 }else if(strcmp(codeset, "CP51932") == 0){
1231 input_f = EUC_INPUT;
1233 #ifdef SHIFTJIS_CP932
1236 #ifdef UTF8_OUTPUT_ENABLE
1237 ms_ucs_map_f = UCS_MAP_CP932;
1239 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1240 strcmp(codeset, "EUCJP-MS") == 0 ||
1241 strcmp(codeset, "EUCJPMS") == 0){
1242 input_f = EUC_INPUT;
1244 #ifdef SHIFTJIS_CP932
1247 #ifdef UTF8_OUTPUT_ENABLE
1248 ms_ucs_map_f = UCS_MAP_MS;
1250 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1251 strcmp(codeset, "EUCJP-ASCII") == 0){
1252 input_f = EUC_INPUT;
1254 #ifdef SHIFTJIS_CP932
1257 #ifdef UTF8_OUTPUT_ENABLE
1258 ms_ucs_map_f = UCS_MAP_ASCII;
1260 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1261 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1262 input_f = SJIS_INPUT;
1264 #ifdef SHIFTJIS_CP932
1268 if (x0201_f==NO_X0201) x0201_f=TRUE;
1269 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1270 strcmp(codeset, "EUC-JIS-2004") == 0){
1271 input_f = EUC_INPUT;
1274 #ifdef SHIFTJIS_CP932
1278 #ifdef UTF8_INPUT_ENABLE
1279 }else if(strcmp(codeset, "UTF-8") == 0 ||
1280 strcmp(codeset, "UTF-8N") == 0 ||
1281 strcmp(codeset, "UTF-8-BOM") == 0){
1282 input_f = UTF8_INPUT;
1283 #ifdef UNICODE_NORMALIZATION
1284 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1285 strcmp(codeset, "UTF-8-MAC") == 0){
1286 input_f = UTF8_INPUT;
1289 }else if(strcmp(codeset, "UTF-16") == 0 ||
1290 strcmp(codeset, "UTF-16BE") == 0 ||
1291 strcmp(codeset, "UTF-16BE-BOM") == 0){
1292 input_f = UTF16_INPUT;
1293 input_endian = ENDIAN_BIG;
1294 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1295 strcmp(codeset, "UTF-16LE-BOM") == 0){
1296 input_f = UTF16_INPUT;
1297 input_endian = ENDIAN_LITTLE;
1298 }else if(strcmp(codeset, "UTF-32") == 0 ||
1299 strcmp(codeset, "UTF-32BE") == 0 ||
1300 strcmp(codeset, "UTF-32BE-BOM") == 0){
1301 input_f = UTF32_INPUT;
1302 input_endian = ENDIAN_BIG;
1303 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1304 strcmp(codeset, "UTF-32LE-BOM") == 0){
1305 input_f = UTF32_INPUT;
1306 input_endian = ENDIAN_LITTLE;
1311 if (strcmp(long_option[i].name, "oc=") == 0){
1312 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1313 codeset[i] = nkf_toupper(p[i]);
1316 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1317 strcmp(codeset, "CP50220") == 0){
1318 output_conv = j_oconv;
1319 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1320 output_conv = j_oconv;
1321 no_cp932ext_f = TRUE;
1322 }else if(strcmp(codeset, "CP50221") == 0 ||
1323 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1324 output_conv = j_oconv;
1326 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1327 output_conv = j_oconv;
1331 #ifdef SHIFTJIS_CP932
1334 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1335 output_conv = j_oconv;
1340 #ifdef SHIFTJIS_CP932
1343 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1344 output_conv = j_oconv;
1349 #ifdef SHIFTJIS_CP932
1352 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1353 output_conv = s_oconv;
1354 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1355 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1356 strcmp(codeset, "CP932") == 0 ||
1357 strcmp(codeset, "MS932") == 0){
1358 output_conv = s_oconv;
1360 #ifdef SHIFTJIS_CP932
1364 #ifdef UTF8_OUTPUT_ENABLE
1365 ms_ucs_map_f = UCS_MAP_CP932;
1367 }else if(strcmp(codeset, "EUCJP") == 0 ||
1368 strcmp(codeset, "EUC-JP") == 0){
1369 output_conv = e_oconv;
1370 }else if(strcmp(codeset, "CP51932") == 0){
1371 output_conv = e_oconv;
1373 #ifdef SHIFTJIS_CP932
1376 #ifdef UTF8_OUTPUT_ENABLE
1377 ms_ucs_map_f = UCS_MAP_CP932;
1379 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1380 strcmp(codeset, "EUCJP-MS") == 0 ||
1381 strcmp(codeset, "EUCJPMS") == 0){
1382 output_conv = e_oconv;
1387 #ifdef SHIFTJIS_CP932
1390 #ifdef UTF8_OUTPUT_ENABLE
1391 ms_ucs_map_f = UCS_MAP_MS;
1393 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1394 strcmp(codeset, "EUCJP-ASCII") == 0){
1395 output_conv = e_oconv;
1400 #ifdef SHIFTJIS_CP932
1403 #ifdef UTF8_OUTPUT_ENABLE
1404 ms_ucs_map_f = UCS_MAP_ASCII;
1406 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1407 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1408 output_conv = s_oconv;
1410 #ifdef SHIFTJIS_CP932
1413 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1414 strcmp(codeset, "EUC-JIS-2004") == 0){
1415 output_conv = e_oconv;
1420 #ifdef SHIFTJIS_CP932
1423 #ifdef UTF8_OUTPUT_ENABLE
1424 }else if(strcmp(codeset, "UTF-8") == 0){
1425 output_conv = w_oconv;
1426 }else if(strcmp(codeset, "UTF-8N") == 0){
1427 output_conv = w_oconv;
1428 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1429 output_conv = w_oconv;
1430 output_bom_f = TRUE;
1431 }else if(strcmp(codeset, "UTF-16BE") == 0){
1432 output_conv = w_oconv16;
1433 }else if(strcmp(codeset, "UTF-16") == 0 ||
1434 strcmp(codeset, "UTF-16BE-BOM") == 0){
1435 output_conv = w_oconv16;
1436 output_bom_f = TRUE;
1437 }else if(strcmp(codeset, "UTF-16LE") == 0){
1438 output_conv = w_oconv16;
1439 output_endian = ENDIAN_LITTLE;
1440 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1441 output_conv = w_oconv16;
1442 output_endian = ENDIAN_LITTLE;
1443 output_bom_f = TRUE;
1444 }else if(strcmp(codeset, "UTF-32") == 0 ||
1445 strcmp(codeset, "UTF-32BE") == 0){
1446 output_conv = w_oconv32;
1447 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1448 output_conv = w_oconv32;
1449 output_bom_f = TRUE;
1450 }else if(strcmp(codeset, "UTF-32LE") == 0){
1451 output_conv = w_oconv32;
1452 output_endian = ENDIAN_LITTLE;
1453 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1454 output_conv = w_oconv32;
1455 output_endian = ENDIAN_LITTLE;
1456 output_bom_f = TRUE;
1462 if (strcmp(long_option[i].name, "overwrite") == 0){
1465 preserve_time_f = TRUE;
1468 if (strcmp(long_option[i].name, "overwrite=") == 0){
1471 preserve_time_f = TRUE;
1473 backup_suffix = malloc(strlen((char *) p) + 1);
1474 strcpy(backup_suffix, (char *) p);
1477 if (strcmp(long_option[i].name, "in-place") == 0){
1480 preserve_time_f = FALSE;
1483 if (strcmp(long_option[i].name, "in-place=") == 0){
1486 preserve_time_f = FALSE;
1488 backup_suffix = malloc(strlen((char *) p) + 1);
1489 strcpy(backup_suffix, (char *) p);
1494 if (strcmp(long_option[i].name, "cap-input") == 0){
1498 if (strcmp(long_option[i].name, "url-input") == 0){
1503 #ifdef NUMCHAR_OPTION
1504 if (strcmp(long_option[i].name, "numchar-input") == 0){
1510 if (strcmp(long_option[i].name, "no-output") == 0){
1514 if (strcmp(long_option[i].name, "debug") == 0){
1519 if (strcmp(long_option[i].name, "cp932") == 0){
1520 #ifdef SHIFTJIS_CP932
1524 #ifdef UTF8_OUTPUT_ENABLE
1525 ms_ucs_map_f = UCS_MAP_CP932;
1529 if (strcmp(long_option[i].name, "no-cp932") == 0){
1530 #ifdef SHIFTJIS_CP932
1534 #ifdef UTF8_OUTPUT_ENABLE
1535 ms_ucs_map_f = UCS_MAP_ASCII;
1539 #ifdef SHIFTJIS_CP932
1540 if (strcmp(long_option[i].name, "cp932inv") == 0){
1547 if (strcmp(long_option[i].name, "x0212") == 0){
1554 if (strcmp(long_option[i].name, "exec-in") == 0){
1558 if (strcmp(long_option[i].name, "exec-out") == 0){
1563 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1564 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1565 no_cp932ext_f = TRUE;
1568 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1569 no_best_fit_chars_f = TRUE;
1572 if (strcmp(long_option[i].name, "fb-skip") == 0){
1573 encode_fallback = NULL;
1576 if (strcmp(long_option[i].name, "fb-html") == 0){
1577 encode_fallback = encode_fallback_html;
1580 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1581 encode_fallback = encode_fallback_xml;
1584 if (strcmp(long_option[i].name, "fb-java") == 0){
1585 encode_fallback = encode_fallback_java;
1588 if (strcmp(long_option[i].name, "fb-perl") == 0){
1589 encode_fallback = encode_fallback_perl;
1592 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1593 encode_fallback = encode_fallback_subchar;
1596 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1597 encode_fallback = encode_fallback_subchar;
1598 unicode_subchar = 0;
1600 /* decimal number */
1601 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1602 unicode_subchar *= 10;
1603 unicode_subchar += hex2bin(p[i]);
1605 }else if(p[1] == 'x' || p[1] == 'X'){
1606 /* hexadecimal number */
1607 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1608 unicode_subchar <<= 4;
1609 unicode_subchar |= hex2bin(p[i]);
1613 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1614 unicode_subchar *= 8;
1615 unicode_subchar += hex2bin(p[i]);
1618 w16e_conv(unicode_subchar, &i, &j);
1619 unicode_subchar = i<<8 | j;
1623 #ifdef UTF8_OUTPUT_ENABLE
1624 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1625 ms_ucs_map_f = UCS_MAP_MS;
1629 #ifdef UNICODE_NORMALIZATION
1630 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1631 input_f = UTF8_INPUT;
1636 if (strcmp(long_option[i].name, "prefix=") == 0){
1637 if (nkf_isgraph(p[0])){
1638 for (i = 1; nkf_isgraph(p[i]); i++){
1639 prefix_table[p[i]] = p[0];
1646 case 'b': /* buffered mode */
1649 case 'u': /* non bufferd mode */
1652 case 't': /* transparent mode */
1657 } else if (*cp=='2') {
1661 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1669 case 'j': /* JIS output */
1671 output_conv = j_oconv;
1673 case 'e': /* AT&T EUC output */
1674 output_conv = e_oconv;
1676 case 's': /* SJIS output */
1677 output_conv = s_oconv;
1679 case 'l': /* ISO8859 Latin-1 support, no conversion */
1680 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1681 input_f = LATIN1_INPUT;
1683 case 'i': /* Kanji IN ESC-$-@/B */
1684 if (*cp=='@'||*cp=='B')
1685 kanji_intro = *cp++;
1687 case 'o': /* ASCII IN ESC-(-J/B */
1688 if (*cp=='J'||*cp=='B'||*cp=='H')
1689 ascii_intro = *cp++;
1693 bit:1 katakana->hiragana
1694 bit:2 hiragana->katakana
1696 if ('9'>= *cp && *cp>='0')
1697 hira_f |= (*cp++ -'0');
1704 #if defined(MSDOS) || defined(__OS2__)
1719 #ifdef UTF8_OUTPUT_ENABLE
1720 case 'w': /* UTF-8 output */
1722 output_conv = w_oconv; cp++;
1726 output_bom_f = TRUE;
1729 if ('1'== cp[0] && '6'==cp[1]) {
1730 output_conv = w_oconv16; cp+=2;
1731 } else if ('3'== cp[0] && '2'==cp[1]) {
1732 output_conv = w_oconv32; cp+=2;
1734 output_conv = w_oconv;
1739 output_endian = ENDIAN_LITTLE;
1740 } else if (cp[0] == 'B') {
1748 output_bom_f = TRUE;
1753 #ifdef UTF8_INPUT_ENABLE
1754 case 'W': /* UTF input */
1757 input_f = UTF8_INPUT;
1759 if ('1'== cp[0] && '6'==cp[1]) {
1761 input_f = UTF16_INPUT;
1762 input_endian = ENDIAN_BIG;
1763 } else if ('3'== cp[0] && '2'==cp[1]) {
1765 input_f = UTF32_INPUT;
1766 input_endian = ENDIAN_BIG;
1768 input_f = UTF8_INPUT;
1773 input_endian = ENDIAN_LITTLE;
1774 } else if (cp[0] == 'B') {
1780 /* Input code assumption */
1781 case 'J': /* JIS input */
1782 input_f = JIS_INPUT;
1784 case 'E': /* AT&T EUC input */
1785 input_f = EUC_INPUT;
1787 case 'S': /* MS Kanji input */
1788 input_f = SJIS_INPUT;
1789 if (x0201_f==NO_X0201) x0201_f=TRUE;
1791 case 'Z': /* Convert X0208 alphabet to asii */
1792 /* bit:0 Convert X0208
1793 bit:1 Convert Kankaku to one space
1794 bit:2 Convert Kankaku to two spaces
1795 bit:3 Convert HTML Entity
1797 if ('9'>= *cp && *cp>='0')
1798 alpha_f |= 1<<(*cp++ -'0');
1802 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1803 x0201_f = FALSE; /* No X0201->X0208 conversion */
1805 ESC-(-I in JIS, EUC, MS Kanji
1806 SI/SO in JIS, EUC, MS Kanji
1807 SSO in EUC, JIS, not in MS Kanji
1808 MS Kanji (0xa0-0xdf)
1810 ESC-(-I in JIS (0x20-0x5f)
1811 SSO in EUC (0xa0-0xdf)
1812 0xa0-0xd in MS Kanji (0xa0-0xdf)
1815 case 'X': /* Assume X0201 kana */
1816 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1819 case 'F': /* prserve new lines */
1820 fold_preserve_f = TRUE;
1821 case 'f': /* folding -f60 or -f */
1824 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1826 fold_len += *cp++ - '0';
1828 if (!(0<fold_len && fold_len<BUFSIZ))
1829 fold_len = DEFAULT_FOLD;
1833 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1835 fold_margin += *cp++ - '0';
1839 case 'm': /* MIME support */
1840 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1841 if (*cp=='B'||*cp=='Q') {
1842 mime_decode_mode = *cp++;
1843 mimebuf_f = FIXED_MIME;
1844 } else if (*cp=='N') {
1845 mime_f = TRUE; cp++;
1846 } else if (*cp=='S') {
1847 mime_f = STRICT_MIME; cp++;
1848 } else if (*cp=='0') {
1849 mime_decode_f = FALSE;
1850 mime_f = FALSE; cp++;
1853 case 'M': /* MIME output */
1856 mimeout_f = FIXED_MIME; cp++;
1857 } else if (*cp=='Q') {
1859 mimeout_f = FIXED_MIME; cp++;
1864 case 'B': /* Broken JIS support */
1866 bit:1 allow any x on ESC-(-x or ESC-$-x
1867 bit:2 reset to ascii on NL
1869 if ('9'>= *cp && *cp>='0')
1870 broken_f |= 1<<(*cp++ -'0');
1875 case 'O':/* for Output file */
1879 case 'c':/* add cr code */
1882 case 'd':/* delete cr code */
1885 case 'I': /* ISO-2022-JP output */
1888 case 'L': /* line mode */
1889 if (*cp=='u') { /* unix */
1890 crmode_f = NL; cp++;
1891 } else if (*cp=='m') { /* mac */
1892 crmode_f = CR; cp++;
1893 } else if (*cp=='w') { /* windows */
1894 crmode_f = CRLF; cp++;
1895 } else if (*cp=='0') { /* no conversion */
1905 /* module: multiple options in a string are allowed for the Perl module */
1906 while(*cp && *cp++!='-');
1909 /* bogus option but ignored */
1915 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1918 struct input_code *p = input_code_list;
1920 if (iconv_func == p->iconv_func){
1929 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1931 #ifdef INPUT_CODE_FIX
1939 #ifdef INPUT_CODE_FIX
1940 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1946 if (estab_f && iconv_for_check != iconv){
1947 struct input_code *p = find_inputcode_byfunc(iconv);
1949 set_input_codename(p->name);
1950 debug(input_codename);
1952 iconv_for_check = iconv;
1957 #define SCORE_L2 (1) /* JIS level-2 kanji */
1958 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1959 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1960 #ifdef SHIFTJIS_CP932
1961 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character reinterpreted via CP932 */
1962 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1964 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1966 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified via MIME */
1967 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1969 #define SCORE_INIT (SCORE_iMIME)
1971 const nkf_char score_table_A0[] = {
1974 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1975 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1978 const nkf_char score_table_F0[] = {
1979 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1980 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1981 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1982 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
1985 void set_code_score(struct input_code *ptr, nkf_char score)
1988 ptr->score |= score;
1992 void clr_code_score(struct input_code *ptr, nkf_char score)
1995 ptr->score &= ~score;
1999 void code_score(struct input_code *ptr)
2001 nkf_char c2 = ptr->buf[0];
2002 #ifdef UTF8_OUTPUT_ENABLE
2003 nkf_char c1 = ptr->buf[1];
2006 set_code_score(ptr, SCORE_ERROR);
2007 }else if (c2 == SSO){
2008 set_code_score(ptr, SCORE_KANA);
2009 #ifdef UTF8_OUTPUT_ENABLE
2010 }else if (!e2w_conv(c2, c1)){
2011 set_code_score(ptr, SCORE_NO_EXIST);
2013 }else if ((c2 & 0x70) == 0x20){
2014 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2015 }else if ((c2 & 0x70) == 0x70){
2016 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2017 }else if ((c2 & 0x70) >= 0x50){
2018 set_code_score(ptr, SCORE_L2);
2022 void status_disable(struct input_code *ptr)
2027 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2030 void status_push_ch(struct input_code *ptr, nkf_char c)
2032 ptr->buf[ptr->index++] = c;
2035 void status_clear(struct input_code *ptr)
2041 void status_reset(struct input_code *ptr)
2044 ptr->score = SCORE_INIT;
2047 void status_reinit(struct input_code *ptr)
2050 ptr->_file_stat = 0;
2053 void status_check(struct input_code *ptr, nkf_char c)
2055 if (c <= DEL && estab_f){
2060 void s_status(struct input_code *ptr, nkf_char c)
2064 status_check(ptr, c);
2069 #ifdef NUMCHAR_OPTION
2070 }else if (is_unicode_capsule(c)){
2073 }else if (0xa1 <= c && c <= 0xdf){
2074 status_push_ch(ptr, SSO);
2075 status_push_ch(ptr, c);
2078 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2080 status_push_ch(ptr, c);
2081 #ifdef SHIFTJIS_CP932
2083 && is_ibmext_in_sjis(c)){
2085 status_push_ch(ptr, c);
2086 #endif /* SHIFTJIS_CP932 */
2088 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2090 status_push_ch(ptr, c);
2091 #endif /* X0212_ENABLE */
2093 status_disable(ptr);
2097 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2098 status_push_ch(ptr, c);
2099 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2103 status_disable(ptr);
2107 #ifdef SHIFTJIS_CP932
2108 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2109 status_push_ch(ptr, c);
2110 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2111 set_code_score(ptr, SCORE_CP932);
2116 #endif /* SHIFTJIS_CP932 */
2117 #ifndef X0212_ENABLE
2118 status_disable(ptr);
2124 void e_status(struct input_code *ptr, nkf_char c)
2128 status_check(ptr, c);
2133 #ifdef NUMCHAR_OPTION
2134 }else if (is_unicode_capsule(c)){
2137 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2139 status_push_ch(ptr, c);
2141 }else if (0x8f == c){
2143 status_push_ch(ptr, c);
2144 #endif /* X0212_ENABLE */
2146 status_disable(ptr);
2150 if (0xa1 <= c && c <= 0xfe){
2151 status_push_ch(ptr, c);
2155 status_disable(ptr);
2160 if (0xa1 <= c && c <= 0xfe){
2162 status_push_ch(ptr, c);
2164 status_disable(ptr);
2166 #endif /* X0212_ENABLE */
2170 #ifdef UTF8_INPUT_ENABLE
2171 void w_status(struct input_code *ptr, nkf_char c)
2175 status_check(ptr, c);
2180 #ifdef NUMCHAR_OPTION
2181 }else if (is_unicode_capsule(c)){
2184 }else if (0xc0 <= c && c <= 0xdf){
2186 status_push_ch(ptr, c);
2187 }else if (0xe0 <= c && c <= 0xef){
2189 status_push_ch(ptr, c);
2190 }else if (0xf0 <= c && c <= 0xf4){
2192 status_push_ch(ptr, c);
2194 status_disable(ptr);
2199 if (0x80 <= c && c <= 0xbf){
2200 status_push_ch(ptr, c);
2201 if (ptr->index > ptr->stat){
2202 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2203 && ptr->buf[2] == 0xbf);
2204 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2205 &ptr->buf[0], &ptr->buf[1]);
2212 status_disable(ptr);
2216 if (0x80 <= c && c <= 0xbf){
2217 if (ptr->index < ptr->stat){
2218 status_push_ch(ptr, c);
2223 status_disable(ptr);
2230 void code_status(nkf_char c)
2232 int action_flag = 1;
2233 struct input_code *result = 0;
2234 struct input_code *p = input_code_list;
2236 if (!p->status_func) {
\r
2240 if (!p->status_func)
2242 (p->status_func)(p, c);
2245 }else if(p->stat == 0){
2256 if (result && !estab_f){
2257 set_iconv(TRUE, result->iconv_func);
2258 }else if (c <= DEL){
2259 struct input_code *ptr = input_code_list;
2269 nkf_char std_getc(FILE *f)
2272 return std_gc_buf[--std_gc_ndx];
2278 nkf_char std_ungetc(nkf_char c, FILE *f)
2280 if (std_gc_ndx == STD_GC_BUFSIZE){
2283 std_gc_buf[std_gc_ndx++] = c;
2288 void std_putc(nkf_char c)
2295 #if !defined(PERL_XS) && !defined(WIN32DLL)
2296 nkf_char noconvert(FILE *f)
2301 module_connection();
2302 while ((c = (*i_getc)(f)) != EOF)
2309 void module_connection(void)
2311 oconv = output_conv;
2314 /* replace continuation module, from output side */
2316 /* output redirection */
2318 if (noout_f || guess_f){
2325 if (mimeout_f == TRUE) {
2326 o_base64conv = oconv; oconv = base64_conv;
2328 /* base64_count = 0; */
2332 o_crconv = oconv; oconv = cr_conv;
2335 o_rot_conv = oconv; oconv = rot_conv;
2338 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2341 o_hira_conv = oconv; oconv = hira_conv;
2344 o_fconv = oconv; oconv = fold_conv;
2347 if (alpha_f || x0201_f) {
2348 o_zconv = oconv; oconv = z_conv;
2352 i_ungetc = std_ungetc;
2353 /* input redirection */
2356 i_cgetc = i_getc; i_getc = cap_getc;
2357 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2360 i_ugetc = i_getc; i_getc = url_getc;
2361 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2364 #ifdef NUMCHAR_OPTION
2366 i_ngetc = i_getc; i_getc = numchar_getc;
2367 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2370 #ifdef UNICODE_NORMALIZATION
2371 if (nfc_f && input_f == UTF8_INPUT){
2372 i_nfc_getc = i_getc; i_getc = nfc_getc;
2373 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2376 if (mime_f && mimebuf_f==FIXED_MIME) {
2377 i_mgetc = i_getc; i_getc = mime_getc;
2378 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2381 i_bgetc = i_getc; i_getc = broken_getc;
2382 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2384 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2385 set_iconv(-TRUE, e_iconv);
2386 } else if (input_f == SJIS_INPUT) {
2387 set_iconv(-TRUE, s_iconv);
2388 #ifdef UTF8_INPUT_ENABLE
2389 } else if (input_f == UTF8_INPUT) {
2390 set_iconv(-TRUE, w_iconv);
2391 } else if (input_f == UTF16_INPUT) {
2392 set_iconv(-TRUE, w_iconv16);
2393 } else if (input_f == UTF32_INPUT) {
2394 set_iconv(-TRUE, w_iconv32);
2397 set_iconv(FALSE, e_iconv);
2401 struct input_code *p = input_code_list;
2409 * Check and Ignore BOM
2411 void check_bom(FILE *f)
2414 switch(c2 = (*i_getc)(f)){
2416 if((c2 = (*i_getc)(f)) == 0x00){
2417 if((c2 = (*i_getc)(f)) == 0xFE){
2418 if((c2 = (*i_getc)(f)) == 0xFF){
2420 set_iconv(TRUE, w_iconv32);
2422 if (iconv == w_iconv32) {
2423 input_endian = ENDIAN_BIG;
2426 (*i_ungetc)(0xFF,f);
2427 }else (*i_ungetc)(c2,f);
2428 (*i_ungetc)(0xFE,f);
2429 }else if(c2 == 0xFF){
2430 if((c2 = (*i_getc)(f)) == 0xFE){
2432 set_iconv(TRUE, w_iconv32);
2434 if (iconv == w_iconv32) {
2435 input_endian = ENDIAN_2143;
2438 (*i_ungetc)(0xFF,f);
2439 }else (*i_ungetc)(c2,f);
2440 (*i_ungetc)(0xFF,f);
2441 }else (*i_ungetc)(c2,f);
2442 (*i_ungetc)(0x00,f);
2443 }else (*i_ungetc)(c2,f);
2444 (*i_ungetc)(0x00,f);
2447 if((c2 = (*i_getc)(f)) == 0xBB){
2448 if((c2 = (*i_getc)(f)) == 0xBF){
2450 set_iconv(TRUE, w_iconv);
2452 if (iconv == w_iconv) {
2455 (*i_ungetc)(0xBF,f);
2456 }else (*i_ungetc)(c2,f);
2457 (*i_ungetc)(0xBB,f);
2458 }else (*i_ungetc)(c2,f);
2459 (*i_ungetc)(0xEF,f);
2462 if((c2 = (*i_getc)(f)) == 0xFF){
2463 if((c2 = (*i_getc)(f)) == 0x00){
2464 if((c2 = (*i_getc)(f)) == 0x00){
2466 set_iconv(TRUE, w_iconv32);
2468 if (iconv == w_iconv32) {
2469 input_endian = ENDIAN_3412;
2472 (*i_ungetc)(0x00,f);
2473 }else (*i_ungetc)(c2,f);
2474 (*i_ungetc)(0x00,f);
2475 }else (*i_ungetc)(c2,f);
2477 set_iconv(TRUE, w_iconv16);
2479 if (iconv == w_iconv16) {
2480 input_endian = ENDIAN_BIG;
2483 (*i_ungetc)(0xFF,f);
2484 }else (*i_ungetc)(c2,f);
2485 (*i_ungetc)(0xFE,f);
2488 if((c2 = (*i_getc)(f)) == 0xFE){
2489 if((c2 = (*i_getc)(f)) == 0x00){
2490 if((c2 = (*i_getc)(f)) == 0x00){
2492 set_iconv(TRUE, w_iconv32);
2494 if (iconv == w_iconv32) {
2495 input_endian = ENDIAN_LITTLE;
2498 (*i_ungetc)(0x00,f);
2499 }else (*i_ungetc)(c2,f);
2500 (*i_ungetc)(0x00,f);
2501 }else (*i_ungetc)(c2,f);
2503 set_iconv(TRUE, w_iconv16);
2505 if (iconv == w_iconv16) {
2506 input_endian = ENDIAN_LITTLE;
2509 (*i_ungetc)(0xFE,f);
2510 }else (*i_ungetc)(c2,f);
2511 (*i_ungetc)(0xFF,f);
2520 Conversion main loop. Code detection only.
2523 nkf_char kanji_convert(FILE *f)
2525 nkf_char c3, c2=0, c1, c0=0;
2526 int is_8bit = FALSE;
2528 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2529 #ifdef UTF8_INPUT_ENABLE
2530 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2537 output_mode = ASCII;
2540 #define NEXT continue /* no output, get next */
2541 #define SEND ; /* output c1 and c2, get next */
2542 #define LAST break /* end of loop, go closing */
2544 module_connection();
2547 while ((c1 = (*i_getc)(f)) != EOF) {
2548 #ifdef INPUT_CODE_FIX
2555 /* in case of 8th bit is on */
2556 if (!estab_f&&!mime_decode_mode) {
2557 /* in case of not established yet */
2558 /* It is still ambiguous */
2559 if (h_conv(f, c2, c1)==EOF)
2565 /* in case of already established */
2567 /* ignore bogus code */
2573 /* second byte, 7 bit code */
2574 /* it might be kanji shifted */
2575 if ((c1 == DEL) || (c1 <= SPACE)) {
2576 /* ignore bogus first code */
2583 #ifdef UTF8_INPUT_ENABLE
2584 if (iconv == w_iconv16) {
2585 if (input_endian == ENDIAN_BIG) {
2587 if ((c1 = (*i_getc)(f)) != EOF) {
2588 if (0xD8 <= c2 && c2 <= 0xDB) {
2589 if ((c0 = (*i_getc)(f)) != EOF) {
2591 if ((c3 = (*i_getc)(f)) != EOF) {
2598 if ((c2 = (*i_getc)(f)) != EOF) {
2599 if (0xD8 <= c2 && c2 <= 0xDB) {
2600 if ((c3 = (*i_getc)(f)) != EOF) {
2601 if ((c0 = (*i_getc)(f)) != EOF) {
2610 } else if(iconv == w_iconv32){
2612 if((c2 = (*i_getc)(f)) != EOF &&
2613 (c1 = (*i_getc)(f)) != EOF &&
2614 (c0 = (*i_getc)(f)) != EOF){
2615 switch(input_endian){
2617 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2620 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2623 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2626 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2636 #ifdef NUMCHAR_OPTION
2637 if (is_unicode_capsule(c1)){
2643 if (!estab_f && !iso8859_f) {
2644 /* not established yet */
2647 } else { /* estab_f==TRUE */
2652 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2653 /* SJIS X0201 Case... */
2654 if(iso2022jp_f && x0201_f==NO_X0201) {
2655 (*oconv)(GETA1, GETA2);
2662 } else if (c1==SSO && iconv != s_iconv) {
2663 /* EUC X0201 Case */
2664 c1 = (*i_getc)(f); /* skip SSO */
2666 if (SSP<=c1 && c1<0xe0) {
2667 if(iso2022jp_f && x0201_f==NO_X0201) {
2668 (*oconv)(GETA1, GETA2);
2675 } else { /* bogus code, skip SSO and one byte */
2679 /* already established */
2684 } else if ((c1 > SPACE) && (c1 != DEL)) {
2685 /* in case of Roman characters */
2687 /* output 1 shifted byte */
2691 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2692 /* output 1 shifted byte */
2693 if(iso2022jp_f && x0201_f==NO_X0201) {
2694 (*oconv)(GETA1, GETA2);
2701 /* look like bogus code */
2704 } else if (input_mode == X0208 || input_mode == X0212 ||
2705 input_mode == X0213_1 || input_mode == X0213_2) {
2706 /* in case of Kanji shifted */
2709 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2710 /* Check MIME code */
2711 if ((c1 = (*i_getc)(f)) == EOF) {
2714 } else if (c1 == '?') {
2715 /* =? is mime conversion start sequence */
2716 if(mime_f == STRICT_MIME) {
2717 /* check in real detail */
2718 if (mime_begin_strict(f) == EOF)
2722 } else if (mime_begin(f) == EOF)
2732 /* normal ASCII code */
2735 } else if (!is_8bit && c1 == SI) {
2738 } else if (!is_8bit && c1 == SO) {
2741 } else if (!is_8bit && c1 == ESC ) {
2742 if ((c1 = (*i_getc)(f)) == EOF) {
2743 /* (*oconv)(0, ESC); don't send bogus code */
2745 } else if (c1 == '$') {
2746 if ((c1 = (*i_getc)(f)) == EOF) {
2748 (*oconv)(0, ESC); don't send bogus code
2749 (*oconv)(0, '$'); */
2751 } else if (c1 == '@'|| c1 == 'B') {
2752 /* This is kanji introduction */
2755 set_input_codename("ISO-2022-JP");
2757 debug(input_codename);
2760 } else if (c1 == '(') {
2761 if ((c1 = (*i_getc)(f)) == EOF) {
2762 /* don't send bogus code
2768 } else if (c1 == '@'|| c1 == 'B') {
2769 /* This is kanji introduction */
2774 } else if (c1 == 'D'){
2778 #endif /* X0212_ENABLE */
2779 } else if (c1 == (X0213_1&0x7F)){
2780 input_mode = X0213_1;
2783 } else if (c1 == (X0213_2&0x7F)){
2784 input_mode = X0213_2;
2788 /* could be some special code */
2795 } else if (broken_f&0x2) {
2796 /* accept any ESC-(-x as broken code ... */
2806 } else if (c1 == '(') {
2807 if ((c1 = (*i_getc)(f)) == EOF) {
2808 /* don't send bogus code
2810 (*oconv)(0, '('); */
2814 /* This is X0201 kana introduction */
2815 input_mode = X0201; shift_mode = X0201;
2817 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2818 /* This is X0208 kanji introduction */
2819 input_mode = ASCII; shift_mode = FALSE;
2821 } else if (broken_f&0x2) {
2822 input_mode = ASCII; shift_mode = FALSE;
2827 /* maintain various input_mode here */
2831 } else if ( c1 == 'N' || c1 == 'n' ){
2833 c3 = (*i_getc)(f); /* skip SS2 */
2834 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2849 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2850 input_mode = ASCII; set_iconv(FALSE, 0);
2852 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2853 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2861 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2862 if ((c1=(*i_getc)(f))!=EOF) {
2866 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2884 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2887 if ((c0 = (*i_getc)(f)) != EOF) {
2890 if ((c3 = (*i_getc)(f)) != EOF) {
2892 (*iconv)(c2, c1, c0|c3);
2897 /* 3 bytes EUC or UTF-8 */
2898 if ((c0 = (*i_getc)(f)) != EOF) {
2900 (*iconv)(c2, c1, c0);
2907 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2911 (*oconv)(PREFIX_EUCG3 | c2, c1);
2913 #endif /* X0212_ENABLE */
2915 (*oconv)(PREFIX_EUCG3 | c2, c1);
2918 (*oconv)(input_mode, c1); /* other special case */
2924 /* goto next_word */
2928 (*iconv)(EOF, 0, 0);
2929 if (!is_inputcode_set)
2932 struct input_code *p = input_code_list;
2933 struct input_code *result = p;
2935 if (p->score < result->score) result = p;
2938 set_input_codename(result->name);
2945 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2947 nkf_char ret, c3, c0;
2951 /** it must NOT be in the kanji shifted sequence */
2952 /** it must NOT be written in JIS7 */
2953 /** and it must be after 2 byte 8bit code */
2959 while ((c1 = (*i_getc)(f)) != EOF) {
2965 if (push_hold_buf(c1) == EOF || estab_f){
2971 struct input_code *p = input_code_list;
2972 struct input_code *result = p;
2977 if (p->score < result->score){
2982 set_iconv(FALSE, result->iconv_func);
2987 ** 1) EOF is detected, or
2988 ** 2) Code is established, or
2989 ** 3) Buffer is FULL (but last word is pushed)
2991 ** in 1) and 3) cases, we continue to use
2992 ** Kanji codes by oconv and leave estab_f unchanged.
2997 while (hold_index < hold_count){
2998 c2 = hold_buf[hold_index++];
3000 #ifdef NUMCHAR_OPTION
3001 || is_unicode_capsule(c2)
3006 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3007 (*iconv)(X0201, c2, 0);
3010 if (hold_index < hold_count){
3011 c1 = hold_buf[hold_index++];
3021 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3024 if (hold_index < hold_count){
3025 c0 = hold_buf[hold_index++];
3026 } else if ((c0 = (*i_getc)(f)) == EOF) {
3032 if (hold_index < hold_count){
3033 c3 = hold_buf[hold_index++];
3034 } else if ((c3 = (*i_getc)(f)) == EOF) {
3039 (*iconv)(c2, c1, c0|c3);
3044 /* 3 bytes EUC or UTF-8 */
3045 if (hold_index < hold_count){
3046 c0 = hold_buf[hold_index++];
3047 } else if ((c0 = (*i_getc)(f)) == EOF) {
3053 (*iconv)(c2, c1, c0);
3056 if (c0 == EOF) break;
3061 nkf_char push_hold_buf(nkf_char c2)
3063 if (hold_count >= HOLD_SIZE*2)
3065 hold_buf[hold_count++] = (unsigned char)c2;
3066 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3069 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3071 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3074 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3075 #ifdef SHIFTJIS_CP932
3076 if (cp51932_f && is_ibmext_in_sjis(c2)){
3078 extern const unsigned short shiftjis_cp932[3][189];
3080 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3086 #endif /* SHIFTJIS_CP932 */
3088 if (!x0213_f && is_ibmext_in_sjis(c2)){
3090 extern const unsigned short shiftjis_x0212[3][189];
3092 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3095 c2 = PREFIX_EUCG3 | (val >> 8);
3108 if(x0213_f && c2 >= 0xF0){
3109 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3110 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3111 }else{ /* 78<=k<=94 */
3112 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3113 if (0x9E < c1) c2++;
3116 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3117 if (0x9E < c1) c2++;
3120 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3127 c2 = x0212_unshift(c2);
3134 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3138 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3141 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3142 if (ret) return ret;
3148 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3153 }else if (c2 == 0x8f){
3157 c2 = (c2 << 8) | (c1 & 0x7f);
3159 #ifdef SHIFTJIS_CP932
3162 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3163 s2e_conv(s2, s1, &c2, &c1);
3170 #endif /* SHIFTJIS_CP932 */
3171 #endif /* X0212_ENABLE */
3172 } else if (c2 == SSO){
3175 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3185 #ifdef UTF8_INPUT_ENABLE
3186 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3193 }else if (0xc0 <= c2 && c2 <= 0xef) {
3194 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3195 #ifdef NUMCHAR_OPTION
3198 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3206 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3209 static const int w_iconv_utf8_1st_byte[] =
3211 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3212 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3213 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3214 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3216 if (c2 < 0 || 0xff < c2) {
3217 }else if (c2 == 0) { /* 0 : 1 byte*/
3219 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3222 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3224 if (c1 < 0x80 || 0xBF < c1) return 0;
3227 if (c0 == 0) return -1;
3228 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3233 if (c0 == 0) return -1;
3234 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3238 if (c0 == 0) return -1;
3239 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3243 if (c0 == 0) return -2;
3244 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3248 if (c0 == 0) return -2;
3249 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3253 if (c0 == 0) return -2;
3254 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3262 if (c2 == 0 || c2 == EOF){
3263 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3264 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3267 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3276 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/* w16w_conv: split the code point `val` into UTF-8 bytes stored through
   p2 (1st byte), p1 (2nd byte) and p0.  For a 4-byte sequence p0 carries
   the 3rd and 4th bytes packed as (3rd << 8) | 4th, which is the layout
   ww16_conv reads back. */
3277 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3284 }else if (val < 0x800){
3285 *p2 = 0xc0 | (val >> 6);
3286 *p1 = 0x80 | (val & 0x3f);
3288 } else if (val <= NKF_INT32_C(0xFFFF)) {
3289 *p2 = 0xe0 | (val >> 12);
3290 *p1 = 0x80 | ((val >> 6) & 0x3f);
3291 *p0 = 0x80 | (val & 0x3f);
3292 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3293 *p2 = 0xf0 | (val >> 18); /* 4-byte lead is 11110xxx with bits 18..20; was 0xe0 | (val >> 16) */
3294 *p1 = 0x80 | ((val >> 12) & 0x3f);
3295 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3304 #ifdef UTF8_INPUT_ENABLE
3305 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3310 } else if (c2 >= 0xf0){
3311 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3312 val = (c2 & 0x0f) << 18;
3313 val |= (c1 & 0x3f) << 12;
3314 val |= (c0 & 0x3f00) >> 2;
3316 }else if (c2 >= 0xe0){
3317 val = (c2 & 0x0f) << 12;
3318 val |= (c1 & 0x3f) << 6;
3320 }else if (c2 >= 0xc0){
3321 val = (c2 & 0x1f) << 6;
3329 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3331 nkf_char c2, c1, c0;
3338 w16w_conv(val, &c2, &c1, &c0);
3339 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3340 #ifdef NUMCHAR_OPTION
3343 *p1 = CLASS_UNICODE | val;
3352 #ifdef UTF8_INPUT_ENABLE
3353 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3356 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3359 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3360 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3362 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3364 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3369 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3370 if (ret) return ret;
3375 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3379 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3380 } else if (is_unicode_bmp(c1)) {
3381 ret = w16e_conv(c1, &c2, &c1);
3384 c1 = CLASS_UNICODE | c1;
3386 if (ret) return ret;
3391 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3394 extern const unsigned short *const utf8_to_euc_2bytes[];
3395 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3396 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3397 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3398 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3399 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3401 const unsigned short *const *pp;
3402 const unsigned short *const *const *ppp;
3403 static const int no_best_fit_chars_table_C2[] =
3404 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3405 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3406 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3407 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3408 static const int no_best_fit_chars_table_C2_ms[] =
3409 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3410 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3411 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3412 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3413 static const int no_best_fit_chars_table_932_C2[] =
3414 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3415 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3416 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3417 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3418 static const int no_best_fit_chars_table_932_C3[] =
3419 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3420 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3421 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3422 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3428 }else if(c2 < 0xe0){
3429 if(no_best_fit_chars_f){
3430 if(ms_ucs_map_f == UCS_MAP_CP932){
3433 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3436 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3439 }else if(cp51932_f){
3442 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3445 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3448 }else if(ms_ucs_map_f == UCS_MAP_MS){
3449 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3453 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3454 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3456 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3457 }else if(c0 < 0xF0){
3458 if(no_best_fit_chars_f){
3459 if(ms_ucs_map_f == UCS_MAP_CP932){
3460 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3461 }else if(ms_ucs_map_f == UCS_MAP_MS){
3466 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3469 if(c0 == 0x92) return 1;
3474 if(c1 == 0x80 || c0 == 0x9C) return 1;
3482 if(c0 == 0x95) return 1;
3485 if(c0 == 0xA5) return 1;
3492 if(c0 == 0x8D) return 1;
3495 if(c0 == 0x9E && cp51932_f) return 1;
3498 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3506 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3507 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3509 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3511 #ifdef SHIFTJIS_CP932
3512 if (!ret && cp51932_f && is_eucg3(*p2)) {
3514 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3515 s2e_conv(s2, s1, p2, p1);
3524 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3527 const unsigned short *p;
3530 if (pp == 0) return 1;
3533 if (c1 < 0 || psize <= c1) return 1;
3535 if (p == 0) return 1;
3538 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3540 if (val == 0) return 1;
3541 if (no_cp932ext_f && (
3542 (val>>8) == 0x2D || /* NEC special characters */
3543 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3551 if (c2 == SO) c2 = X0201;
3558 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3560 const char *hex = "0123456789ABCDEF";
3566 (*f)(0, hex[(c>>shift)&0xF]);
3576 void encode_fallback_html(nkf_char c)
3581 if(c >= NKF_INT32_C(1000000))
3582 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3583 if(c >= NKF_INT32_C(100000))
3584 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3586 (*oconv)(0, 0x30+(c/10000 )%10);
3588 (*oconv)(0, 0x30+(c/1000 )%10);
3590 (*oconv)(0, 0x30+(c/100 )%10);
3592 (*oconv)(0, 0x30+(c/10 )%10);
3594 (*oconv)(0, 0x30+ c %10);
3599 void encode_fallback_xml(nkf_char c)
3604 nkf_each_char_to_hex(oconv, c);
3609 void encode_fallback_java(nkf_char c)
3611 const char *hex = "0123456789ABCDEF";
3614 if(!is_unicode_bmp(c)){
3618 (*oconv)(0, hex[(c>>20)&0xF]);
3619 (*oconv)(0, hex[(c>>16)&0xF]);
3623 (*oconv)(0, hex[(c>>12)&0xF]);
3624 (*oconv)(0, hex[(c>> 8)&0xF]);
3625 (*oconv)(0, hex[(c>> 4)&0xF]);
3626 (*oconv)(0, hex[ c &0xF]);
3630 void encode_fallback_perl(nkf_char c)
3635 nkf_each_char_to_hex(oconv, c);
3640 void encode_fallback_subchar(nkf_char c)
3642 c = unicode_subchar;
3643 (*oconv)((c>>8)&0xFF, c&0xFF);
3648 #ifdef UTF8_OUTPUT_ENABLE
3649 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3652 extern const unsigned short euc_to_utf8_1byte[];
3653 extern const unsigned short *const euc_to_utf8_2bytes[];
3654 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3655 extern const unsigned short *const x0212_to_utf8_2bytes[];
3657 const unsigned short *p;
3660 p = euc_to_utf8_1byte;
3662 } else if (is_eucg3(c2)){
3663 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3666 c2 = (c2&0x7f) - 0x21;
3667 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3668 p = x0212_to_utf8_2bytes[c2];
3674 c2 = (c2&0x7f) - 0x21;
3675 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3676 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3681 c1 = (c1 & 0x7f) - 0x21;
3682 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3687 void w_oconv(nkf_char c2, nkf_char c1)
3693 output_bom_f = FALSE;
3704 #ifdef NUMCHAR_OPTION
3705 if (c2 == 0 && is_unicode_capsule(c1)){
3706 val = c1 & VALUE_MASK;
3709 }else if (val < 0x800){
3710 (*o_putc)(0xC0 | (val >> 6));
3711 (*o_putc)(0x80 | (val & 0x3f));
3712 } else if (val <= NKF_INT32_C(0xFFFF)) {
3713 (*o_putc)(0xE0 | (val >> 12));
3714 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3715 (*o_putc)(0x80 | (val & 0x3f));
3716 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3717 (*o_putc)(0xF0 | ( val>>18));
3718 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3719 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3720 (*o_putc)(0x80 | ( val & 0x3f));
3727 output_mode = ASCII;
3729 } else if (c2 == ISO8859_1) {
3730 output_mode = ISO8859_1;
3731 (*o_putc)(c1 | 0x080);
3734 val = e2w_conv(c2, c1);
3736 w16w_conv(val, &c2, &c1, &c0);
3740 if (c0) (*o_putc)(c0);
3746 void w_oconv16(nkf_char c2, nkf_char c1)
3749 output_bom_f = FALSE;
3750 if (output_endian == ENDIAN_LITTLE){
3751 (*o_putc)((unsigned char)'\377');
3755 (*o_putc)((unsigned char)'\377');
3764 if (c2 == ISO8859_1) {
3767 #ifdef NUMCHAR_OPTION
3768 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3769 if (is_unicode_bmp(c1)) {
3770 c2 = (c1 >> 8) & 0xff;
3774 if (c1 <= UNICODE_MAX) {
3775 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3776 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3777 if (output_endian == ENDIAN_LITTLE){
3778 (*o_putc)(c2 & 0xff);
3779 (*o_putc)((c2 >> 8) & 0xff);
3780 (*o_putc)(c1 & 0xff);
3781 (*o_putc)((c1 >> 8) & 0xff);
3783 (*o_putc)((c2 >> 8) & 0xff);
3784 (*o_putc)(c2 & 0xff);
3785 (*o_putc)((c1 >> 8) & 0xff);
3786 (*o_putc)(c1 & 0xff);
3793 nkf_char val = e2w_conv(c2, c1);
3794 c2 = (val >> 8) & 0xff;
3797 if (output_endian == ENDIAN_LITTLE){
3806 void w_oconv32(nkf_char c2, nkf_char c1)
3809 output_bom_f = FALSE;
3810 if (output_endian == ENDIAN_LITTLE){
3811 (*o_putc)((unsigned char)'\377');
3819 (*o_putc)((unsigned char)'\377');
3828 if (c2 == ISO8859_1) {
3830 #ifdef NUMCHAR_OPTION
3831 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3835 c1 = e2w_conv(c2, c1);
3837 if (output_endian == ENDIAN_LITTLE){
3838 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3839 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3840 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3844 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3845 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3846 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3851 void e_oconv(nkf_char c2, nkf_char c1)
3853 #ifdef NUMCHAR_OPTION
3854 if (c2 == 0 && is_unicode_capsule(c1)){
3855 w16e_conv(c1, &c2, &c1);
3856 if (c2 == 0 && is_unicode_capsule(c1)){
3857 if(encode_fallback)(*encode_fallback)(c1);
3865 } else if (c2 == 0) {
3866 output_mode = ASCII;
3868 } else if (c2 == X0201) {
3869 output_mode = JAPANESE_EUC;
3870 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3871 } else if (c2 == ISO8859_1) {
3872 output_mode = ISO8859_1;
3873 (*o_putc)(c1 | 0x080);
3875 } else if (is_eucg3(c2)){
3876 output_mode = JAPANESE_EUC;
3877 #ifdef SHIFTJIS_CP932
3880 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3881 s2e_conv(s2, s1, &c2, &c1);
3886 output_mode = ASCII;
3888 }else if (is_eucg3(c2)){
3891 (*o_putc)((c2 & 0x7f) | 0x080);
3892 (*o_putc)(c1 | 0x080);
3895 (*o_putc)((c2 & 0x7f) | 0x080);
3896 (*o_putc)(c1 | 0x080);
3900 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3901 set_iconv(FALSE, 0);
3902 return; /* too late to rescue this char */
3904 output_mode = JAPANESE_EUC;
3905 (*o_putc)(c2 | 0x080);
3906 (*o_putc)(c1 | 0x080);
3911 nkf_char x0212_shift(nkf_char c)
3916 if (0x75 <= c && c <= 0x7f){
3917 ret = c + (0x109 - 0x75);
3920 if (0x75 <= c && c <= 0x7f){
3921 ret = c + (0x113 - 0x75);
3928 nkf_char x0212_unshift(nkf_char c)
3931 if (0x7f <= c && c <= 0x88){
3932 ret = c + (0x75 - 0x7f);
3933 }else if (0x89 <= c && c <= 0x92){
3934 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
3938 #endif /* X0212_ENABLE */
3940 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3946 if((0x21 <= ndx && ndx <= 0x2F)){
3947 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3948 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3950 }else if(0x6E <= ndx && ndx <= 0x7E){
3951 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3952 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3958 else if(nkf_isgraph(ndx)){
3960 const unsigned short *ptr;
3962 extern const unsigned short *const x0212_shiftjis[];
3964 ptr = x0212_shiftjis[ndx - 0x21];
3966 val = ptr[(c1 & 0x7f) - 0x21];
3975 c2 = x0212_shift(c2);
3977 #endif /* X0212_ENABLE */
3979 if(0x7F < c2) return 1;
3980 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3981 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3985 void s_oconv(nkf_char c2, nkf_char c1)
3987 #ifdef NUMCHAR_OPTION
3988 if (c2 == 0 && is_unicode_capsule(c1)){
3989 w16e_conv(c1, &c2, &c1);
3990 if (c2 == 0 && is_unicode_capsule(c1)){
3991 if(encode_fallback)(*encode_fallback)(c1);
3999 } else if (c2 == 0) {
4000 output_mode = ASCII;
4002 } else if (c2 == X0201) {
4003 output_mode = SHIFT_JIS;
4005 } else if (c2 == ISO8859_1) {
4006 output_mode = ISO8859_1;
4007 (*o_putc)(c1 | 0x080);
4009 } else if (is_eucg3(c2)){
4010 output_mode = SHIFT_JIS;
4011 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4017 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4018 set_iconv(FALSE, 0);
4019 return; /* too late to rescue this char */
4021 output_mode = SHIFT_JIS;
4022 e2s_conv(c2, c1, &c2, &c1);
4024 #ifdef SHIFTJIS_CP932
4026 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4028 extern const unsigned short cp932inv[2][189];
4030 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4036 #endif /* SHIFTJIS_CP932 */
4039 if (prefix_table[(unsigned char)c1]){
4040 (*o_putc)(prefix_table[(unsigned char)c1]);
4046 void j_oconv(nkf_char c2, nkf_char c1)
4048 #ifdef NUMCHAR_OPTION
4049 if (c2 == 0 && is_unicode_capsule(c1)){
4050 w16e_conv(c1, &c2, &c1);
4051 if (c2 == 0 && is_unicode_capsule(c1)){
4052 if(encode_fallback)(*encode_fallback)(c1);
4058 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4061 (*o_putc)(ascii_intro);
4062 output_mode = ASCII;
4066 } else if (is_eucg3(c2)){
4068 if(output_mode!=X0213_2){
4069 output_mode = X0213_2;
4073 (*o_putc)(X0213_2&0x7F);
4076 if(output_mode!=X0212){
4077 output_mode = X0212;
4081 (*o_putc)(X0212&0x7F);
4084 (*o_putc)(c2 & 0x7f);
4087 } else if (c2==X0201) {
4088 if (output_mode!=X0201) {
4089 output_mode = X0201;
4095 } else if (c2==ISO8859_1) {
4096 /* iso8859 introduction, or 8th bit on */
4097 /* Can we convert in 7bit form using ESC-'-'-A ?
4099 output_mode = ISO8859_1;
4101 } else if (c2 == 0) {
4102 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4105 (*o_putc)(ascii_intro);
4106 output_mode = ASCII;
4110 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4112 if (output_mode!=X0213_1) {
4113 output_mode = X0213_1;
4117 (*o_putc)(X0213_1&0x7F);
4119 }else if (output_mode != X0208) {
4120 output_mode = X0208;
4123 (*o_putc)(kanji_intro);
4130 void base64_conv(nkf_char c2, nkf_char c1)
4132 mime_prechar(c2, c1);
4133 (*o_base64conv)(c2,c1);
4137 static nkf_char broken_buf[3];
4138 static int broken_counter = 0;
4139 static int broken_last = 0;
4140 nkf_char broken_getc(FILE *f)
4144 if (broken_counter>0) {
4145 return broken_buf[--broken_counter];
4148 if (c=='$' && broken_last != ESC
4149 && (input_mode==ASCII || input_mode==X0201)) {
4152 if (c1=='@'|| c1=='B') {
4153 broken_buf[0]=c1; broken_buf[1]=c;
4160 } else if (c=='(' && broken_last != ESC
4161 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4164 if (c1=='J'|| c1=='B') {
4165 broken_buf[0]=c1; broken_buf[1]=c;
4178 nkf_char broken_ungetc(nkf_char c, FILE *f)
4180 if (broken_counter<2)
4181 broken_buf[broken_counter++]=c;
4185 static nkf_char prev_cr = 0;
4187 void cr_conv(nkf_char c2, nkf_char c1)
4191 if (! (c2==0&&c1==NL) ) {
4197 } else if (c1=='\r') {
4199 } else if (c1=='\n') {
4200 if (crmode_f==CRLF) {
4201 (*o_crconv)(0,'\r');
4202 } else if (crmode_f==CR) {
4203 (*o_crconv)(0,'\r');
4207 } else if (c1!='\032' || crmode_f!=NL){
4213 Return value of fold_conv()
4215 \n add newline and output char
4216 \r add newline and output nothing
4219 1 (or else) normal output
4221 fold state in prev (previous character)
4223 >0x80 Japanese (X0208/X0201)
4228 This fold algorithm does not preserve leading space in a line.
4229 This is the main difference from fmt.
4232 #define char_size(c2,c1) (c2?2:1)
/* fold_conv: line-folding stage.  Derives fold_state for the incoming
   character (c2,c1) from f_prev, f_line, fold_len and fold_margin, then
   acts on it in the terminator switch at the end. */
4234 void fold_conv(nkf_char c2, nkf_char c1)
4237 nkf_char fold_state;
4239 if (c1== '\r' && !fold_preserve_f) {
4240 fold_state=0; /* ignore cr */
4241 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4243 fold_state=0; /* ignore cr */
4244 } else if (c1== BS) {
4245 if (f_line>0) f_line--;
4247 } else if (c2==EOF && f_line != 0) { /* close open last line */
4249 } else if ((c1=='\n' && !fold_preserve_f)
4250 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4251 && fold_preserve_f)) {
4253 if (fold_preserve_f) {
4257 } else if ((f_prev == c1 && !fold_preserve_f)
4258 || (f_prev == '\n' && fold_preserve_f)
4259 ) { /* duplicate newline */
4262 fold_state = '\n'; /* output two newlines */
4268 if (f_prev&0x80) { /* Japanese? */
4270 fold_state = 0; /* ignore given single newline */
4271 } else if (f_prev==' ') {
4275 if (++f_line<=fold_len)
4279 fold_state = '\r'; /* fold and output nothing */
4283 } else if (c1=='\f') {
4286 fold_state = '\n'; /* output newline and clear */
4287 } else if ( (c2==0 && c1==' ')||
4288 (c2==0 && c1=='\t')||
4289 (c2=='!'&& c1=='!')) {
4290 /* X0208 kankaku (blank) or ascii space */
4291 if (f_prev == ' ') {
4292 fold_state = 0; /* remove duplicate spaces */
4295 if (++f_line<=fold_len)
4296 fold_state = ' '; /* output ASCII space only */
4298 f_prev = ' '; f_line = 0;
4299 fold_state = '\r'; /* fold and output nothing */
4303 prev0 = f_prev; /* we still need this one... , but almost done */
4305 if (c2 || c2==X0201)
4306 f_prev |= 0x80; /* this is Japanese */
4307 f_line += char_size(c2,c1);
4308 if (f_line<=fold_len) { /* normal case */
4311 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspensions */
4312 f_line = char_size(c2,c1);
4313 fold_state = '\n'; /* We can't wait, do fold now */
4314 } else if (c2==X0201) {
4315 /* simple kinsoku rules: fold_state 1 means no folding */
4316 if (c1==(0xde&0x7f)) fold_state = 1; /* original note: voiced sound mark (dakuten) */
4317 else if (c1==(0xdf&0x7f)) fold_state = 1; /* original note: semi-voiced sound mark (handakuten) */
4318 else if (c1==(0xa4&0x7f)) fold_state = 1; /* original note: ideographic full stop; NOTE(review): JIS X 0201 assigns 0xA4 to the comma -- confirm */
4319 else if (c1==(0xa3&0x7f)) fold_state = 1; /* original note: comma; NOTE(review): JIS X 0201 assigns 0xA3 to the closing corner bracket -- confirm */
4320 else if (c1==(0xa1&0x7f)) fold_state = 1; /* original note: closing corner bracket; NOTE(review): JIS X 0201 assigns 0xA1 to the full stop -- confirm */
4321 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4322 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4324 fold_state = '\n';/* add one new f_line before this character */
4327 fold_state = '\n';/* add one new f_line before this character */
4330 /* kinsoku point in ASCII */
4331 if ( c1==')'|| /* { [ ( */
4342 /* just after special */
4343 } else if (!is_alnum(prev0)) {
4344 f_line = char_size(c2,c1);
4346 } else if ((prev0==' ') || /* ignored new f_line */
4347 (prev0=='\n')|| /* ignored new f_line */
4348 (prev0&0x80)) { /* X0208 - ASCII */
4349 f_line = char_size(c2,c1);
4350 fold_state = '\n';/* add one new f_line before this character */
4352 fold_state = 1; /* default no fold in ASCII */
4356 if (c1=='"') fold_state = 1; /* ideographic comma (JIS X 0208 0x2122) */
4357 else if (c1=='#') fold_state = 1; /* ideographic full stop (0x2123) */
4358 else if (c1=='W') fold_state = 1; /* closing corner bracket (0x2157) */
4359 else if (c1=='K') fold_state = 1; /* closing parenthesis (0x214B) */
4360 else if (c1=='$') fold_state = 1; /* comma (0x2124) */
4361 else if (c1=='%') fold_state = 1; /* full stop (0x2125) */
4362 else if (c1=='\'') fold_state = 1; /* colon (0x2127); original note is garbled -- confirm */
4363 else if (c1=='(') fold_state = 1; /* semicolon (0x2128) */
4364 else if (c1==')') fold_state = 1; /* question mark (0x2129) */
4365 else if (c1=='*') fold_state = 1; /* exclamation mark (0x212A) */
4366 else if (c1=='+') fold_state = 1; /* voiced sound mark, dakuten (0x212B) */
4367 else if (c1==',') fold_state = 1; /* semi-voiced sound mark, handakuten (0x212C) */
4368 /* default no fold in kinsoku */
4371 f_line = char_size(c2,c1);
4372 /* add one new f_line before this character */
4375 f_line = char_size(c2,c1);
4377 /* add one new f_line before this character */
4382 /* terminator process */
4383 switch(fold_state) {
4402 nkf_char z_prev2=0,z_prev1=0;
/* z_conv: conversion stage controlled by x0201_f and alpha_f.  X0201
   characters are mapped through the cv/dv/ev tables (a pending character
   is held in z_prev1/z_prev2 until the following mark is known); results
   are forwarded to o_zconv. */
4404 void z_conv(nkf_char c2, nkf_char c1)
4407 /* if (c2) c1 &= 0x7f; assertion */
4409 if (x0201_f && z_prev2==X0201) { /* X0201 */
4410 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark) */
4412 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4414 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) */
4416 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4420 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4429 if (x0201_f && c2==X0201) {
4430 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4431 /* wait for dakuten or handakuten */
4432 z_prev1 = c1; z_prev2 = c2;
4435 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4440 /* JISX0208 Alphabet */
4441 if (alpha_f && c2 == 0x23 ) {
4443 } else if (alpha_f && c2 == 0x21 ) {
4444 /* JISX0208 Kigou (symbols) */
4449 } else if (alpha_f&0x4) {
4454 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4460 case '>': entity = "&gt;"; break; /* entity names restored; the listing had them decoded to the bare character */
4461 case '<': entity = "&lt;"; break;
4462 case '\"': entity = "&quot;"; break;
4463 case '&': entity = "&amp;"; break;
4466 while (*entity) (*o_zconv)(0, *entity++);
4476 #define rot13(c) ( \
4478 (c <= 'M') ? (c + 13): \
4479 (c <= 'Z') ? (c - 13): \
4481 (c <= 'm') ? (c + 13): \
4482 (c <= 'z') ? (c - 13): \
4486 #define rot47(c) ( \
4488 ( c <= 'O' ) ? (c + 47) : \
4489 ( c <= '~' ) ? (c - 47) : \
4493 void rot_conv(nkf_char c2, nkf_char c1)
4495 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4501 (*o_rot_conv)(c2,c1);
4504 void hira_conv(nkf_char c2, nkf_char c1)
4508 if (0x20 < c1 && c1 < 0x74) {
4510 (*o_hira_conv)(c2,c1);
4512 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4514 c1 = CLASS_UNICODE | 0x3094;
4515 (*o_hira_conv)(c2,c1);
4518 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4520 (*o_hira_conv)(c2,c1);
4525 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4528 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4530 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4534 (*o_hira_conv)(c2,c1);
4538 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4540 static const nkf_char range[RANGE_NUM_MAX][2] = {
4561 nkf_char start, end, c;
4563 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4567 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4572 for (i = 0; i < RANGE_NUM_MAX; i++) {
4573 start = range[i][0];
4576 if (c >= start && c <= end) {
4581 (*o_iso2022jp_check_conv)(c2,c1);
4585 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4587 const unsigned char *mime_pattern[] = {
4588 (const unsigned char *)"\075?EUC-JP?B?",
4589 (const unsigned char *)"\075?SHIFT_JIS?B?",
4590 (const unsigned char *)"\075?ISO-8859-1?Q?",
4591 (const unsigned char *)"\075?ISO-8859-1?B?",
4592 (const unsigned char *)"\075?ISO-2022-JP?B?",
4593 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4594 #if defined(UTF8_INPUT_ENABLE)
4595 (const unsigned char *)"\075?UTF-8?B?",
4596 (const unsigned char *)"\075?UTF-8?Q?",
4598 (const unsigned char *)"\075?US-ASCII?Q?",
4603 /* Markers used to raise the priority of the matching input code */
4604 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4605 e_iconv, s_iconv, 0, 0, 0, 0,
4606 #if defined(UTF8_INPUT_ENABLE)
4612 const nkf_char mime_encode[] = {
4613 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4614 #if defined(UTF8_INPUT_ENABLE)
4621 const nkf_char mime_encode_method[] = {
4622 'B', 'B','Q', 'B', 'B', 'Q',
4623 #if defined(UTF8_INPUT_ENABLE)
4631 #define MAXRECOVER 20
4633 void switch_mime_getc(void)
4635 if (i_getc!=mime_getc) {
4636 i_mgetc = i_getc; i_getc = mime_getc;
4637 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4638 if(mime_f==STRICT_MIME) {
4639 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4640 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4645 void unswitch_mime_getc(void)
4647 if(mime_f==STRICT_MIME) {
4648 i_mgetc = i_mgetc_buf;
4649 i_mungetc = i_mungetc_buf;
4652 i_ungetc = i_mungetc;
4653 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4654 mime_iconv_back = NULL;
4657 nkf_char mime_begin_strict(FILE *f)
4661 const unsigned char *p,*q;
4662 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4664 mime_decode_mode = FALSE;
4665 /* =? has been checked */
4667 p = mime_pattern[j];
4670 for(i=2;p[i]>' ';i++) { /* start at =? */
4671 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4672 /* pattern fails, try next one */
4674 while (mime_pattern[++j]) {
4675 p = mime_pattern[j];
4676 for(k=2;k<i;k++) /* assume length(p) > i */
4677 if (p[k]!=q[k]) break;
4678 if (k==i && nkf_toupper(c1)==p[k]) break;
4680 p = mime_pattern[j];
4681 if (p) continue; /* found next one, continue */
4682 /* all fails, output from recovery buffer */
4690 mime_decode_mode = p[i-2];
4692 mime_iconv_back = iconv;
4693 set_iconv(FALSE, mime_priority_func[j]);
4694 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4696 if (mime_decode_mode=='B') {
4697 mimebuf_f = unbuf_f;
4699 /* do MIME integrity check */
4700 return mime_integrity(f,mime_pattern[j]);
/* mime_getc_buf: return the next MIME input character, from
   (*i_mgetc_buf)(f) when mimebuf_f is set, otherwise from the Fifo at
   mime_input (which is then advanced). */
4708 nkf_char mime_getc_buf(FILE *f)
4710 /* we don't keep eof of Fifo, because it contains ?= as
4711 a terminator. It was checked in mime_integrity. */
4712 return ((mimebuf_f)?
4713 (*i_mgetc_buf)(f):Fifo(mime_input++));
4716 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4719 (*i_mungetc_buf)(c,f);
4721 Fifo(--mime_input) = (unsigned char)c;
/* mime_begin: non-strict MIME preamble scan.  Characters read after "=?"
   are recorded in the Fifo while looking for the B/Q encoding letter;
   mime_decode_mode is set to 'B', 'Q', or 1 (replay the buffer without
   decoding).  Returns the last character read. */
4725 nkf_char mime_begin(FILE *f)
4730 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4731 /* re-read and convert again from mime_buffer. */
4733 /* =? has been checked */
4735 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4736 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4737 /* We accept any character type even if it is broken by new lines */
4738 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4739 if (c1=='\n'||c1==' '||c1=='\r'||
4740 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4742 /* Failed. But this could be another MIME preamble */
4750 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4751 if (!(++i<MAXRECOVER) || c1==EOF) break;
4752 if (c1=='b'||c1=='B') {
4753 mime_decode_mode = 'B';
4754 } else if (c1=='q'||c1=='Q') {
4755 mime_decode_mode = 'Q';
4759 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4760 if (!(++i<MAXRECOVER) || c1==EOF) break;
4762 mime_decode_mode = FALSE;
4768 if (!mime_decode_mode) {
4769 /* false MIME preamble, restart from mime_buffer */
4770 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4771 /* Since we are in MIME mode until buffer becomes empty, */
4772 /* we never go into mime_begin again for a while. */
4775 /* discard MIME preamble, and go to MIME mode */
4777 /* do no MIME integrity check */
4778 return c1; /* used only for checking EOF */
4782 void no_putc(nkf_char c)
4787 void debug(const char *str)
4790 fprintf(stderr, "%s\n", str);
4795 void set_input_codename(char *codename)
4799 strcmp(codename, "") != 0 &&
4800 strcmp(codename, input_codename) != 0)
4802 is_inputcode_mixed = TRUE;
4804 input_codename = codename;
4805 is_inputcode_set = TRUE;
4808 #if !defined(PERL_XS) && !defined(WIN32DLL)
4809 void print_guessed_code(char *filename)
4811 char *codename = "BINARY";
4812 if (!is_inputcode_mixed) {
4813 if (strcmp(input_codename, "") == 0) {
4816 codename = input_codename;
4819 if (filename != NULL) printf("%s:", filename);
4820 printf("%s\n", codename);
4826 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4828 nkf_char c1, c2, c3;
4834 if (!nkf_isxdigit(c2)){
4839 if (!nkf_isxdigit(c3)){
4844 return (hex2bin(c2) << 4) | hex2bin(c3);
4847 nkf_char cap_getc(FILE *f)
4849 return hex_getc(':', f, i_cgetc, i_cungetc);
4852 nkf_char cap_ungetc(nkf_char c, FILE *f)
4854 return (*i_cungetc)(c, f);
4857 nkf_char url_getc(FILE *f)
4859 return hex_getc('%', f, i_ugetc, i_uungetc);
4862 nkf_char url_ungetc(nkf_char c, FILE *f)
4864 return (*i_uungetc)(c, f);
4868 #ifdef NUMCHAR_OPTION
4869 nkf_char numchar_getc(FILE *f)
4871 nkf_char (*g)(FILE *) = i_ngetc;
4872 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4883 if (buf[i] == 'x' || buf[i] == 'X'){
4884 for (j = 0; j < 7; j++){
4886 if (!nkf_isxdigit(buf[i])){
4893 c |= hex2bin(buf[i]);
4896 for (j = 0; j < 8; j++){
4900 if (!nkf_isdigit(buf[i])){
4907 c += hex2bin(buf[i]);
4913 return CLASS_UNICODE | c;
4922 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4924 return (*i_nungetc)(c, f);
4928 #ifdef UNICODE_NORMALIZATION
4930 /* Normalization Form C */
4931 nkf_char nfc_getc(FILE *f)
4933 nkf_char (*g)(FILE *f) = i_nfc_getc;
4934 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4935 int i=0, j, k=1, lower, upper;
4937 const nkf_nfchar *array;
4939 extern const struct normalization_pair normalization_table[];
4943 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4944 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4945 while (upper >= lower) {
4946 j = (lower+upper) / 2;
4947 array = normalization_table[j].nfd;
4948 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4949 if (array[k] != buf[k]){
4950 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4957 array = normalization_table[j].nfc;
4958 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4959 buf[i] = (nkf_char)(array[i]);
4970 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4972 return (*i_nfc_ungetc)(c, f);
4974 #endif /* UNICODE_NORMALIZATION */
4980 nkf_char c1, c2, c3, c4, cc;
4981 nkf_char t1, t2, t3, t4, mode, exit_mode;
4982 nkf_char lwsp_count;
4985 nkf_char lwsp_size = 128;
4987 if (mime_top != mime_last) { /* Something is in FIFO */
4988 return Fifo(mime_top++);
4990 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4991 mime_decode_mode=FALSE;
4992 unswitch_mime_getc();
4993 return (*i_getc)(f);
4996 if (mimebuf_f == FIXED_MIME)
4997 exit_mode = mime_decode_mode;
5000 if (mime_decode_mode == 'Q') {
5001 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5003 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
5004 if (c1<=' ' || DEL<=c1) {
5005 mime_decode_mode = exit_mode; /* prepare for quit */
5008 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5012 mime_decode_mode = exit_mode; /* prepare for quit */
5013 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5014 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5015 /* end Q encoding */
5016 input_mode = exit_mode;
5018 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5019 if (lwsp_buf==NULL) {
5020 perror("can't malloc");
5023 while ((c1=(*i_getc)(f))!=EOF) {
5028 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5036 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5037 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5052 lwsp_buf[lwsp_count] = (unsigned char)c1;
5053 if (lwsp_count++>lwsp_size){
5055 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5056 if (lwsp_buf_new==NULL) {
5058 perror("can't realloc");
5061 lwsp_buf = lwsp_buf_new;
5067 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5069 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5070 i_ungetc(lwsp_buf[lwsp_count],f);
5076 if (c1=='='&&c2<' ') { /* this is soft wrap */
5077 while((c1 = (*i_mgetc)(f)) <=' ') {
5078 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5080 mime_decode_mode = 'Q'; /* still in MIME */
5081 goto restart_mime_q;
5084 mime_decode_mode = 'Q'; /* still in MIME */
5088 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5089 if (c2<=' ') return c2;
5090 mime_decode_mode = 'Q'; /* still in MIME */
5091 return ((hex2bin(c2)<<4) + hex2bin(c3));
5094 if (mime_decode_mode != 'B') {
5095 mime_decode_mode = FALSE;
5096 return (*i_mgetc)(f);
5100 /* Base64 encoding */
5102 MIME allows line break in the middle of
5103 Base64, but we are very pessimistic in decoding
5104 in unbuf mode because MIME encoded code may be broken by
5105 less or editor's control sequence (such as ESC-[-K) in unbuffered
5106 mode. ignore incomplete MIME.
5108 mode = mime_decode_mode;
5109 mime_decode_mode = exit_mode; /* prepare for quit */
5111 while ((c1 = (*i_mgetc)(f))<=' ') {
5116 if ((c2 = (*i_mgetc)(f))<=' ') {
5119 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5120 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5123 if ((c1 == '?') && (c2 == '=')) {
5126 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5127 if (lwsp_buf==NULL) {
5128 perror("can't malloc");
5131 while ((c1=(*i_getc)(f))!=EOF) {
5136 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5144 if ((c1=(*i_getc)(f))!=EOF) {
5148 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5163 lwsp_buf[lwsp_count] = (unsigned char)c1;
5164 if (lwsp_count++>lwsp_size){
5166 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5167 if (lwsp_buf_new==NULL) {
5169 perror("can't realloc");
5172 lwsp_buf = lwsp_buf_new;
5178 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5180 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5181 i_ungetc(lwsp_buf[lwsp_count],f);
5188 if ((c3 = (*i_mgetc)(f))<=' ') {
5191 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5192 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5196 if ((c4 = (*i_mgetc)(f))<=' ') {
5199 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5200 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5204 mime_decode_mode = mode; /* still in MIME sigh... */
5206 /* BASE 64 decoding */
5208 t1 = 0x3f & base64decode(c1);
5209 t2 = 0x3f & base64decode(c2);
5210 t3 = 0x3f & base64decode(c3);
5211 t4 = 0x3f & base64decode(c4);
5212 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5214 Fifo(mime_last++) = (unsigned char)cc;
5215 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5217 Fifo(mime_last++) = (unsigned char)cc;
5218 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5220 Fifo(mime_last++) = (unsigned char)cc;
5225 return Fifo(mime_top++);
/*
 * mime_ungetc: push the character c back onto the MIME Fifo.
 * mime_top is decremented first and c, narrowed to unsigned char, is
 * stored in the slot it then designates.  The stream argument f is not
 * used by the visible statement.
 * NOTE(review): the braces and the return statement are not visible in
 * this listing; presumably c is returned -- confirm against the full file.
 */
5228 nkf_char mime_ungetc(nkf_char c, FILE *f)
5230 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: read ahead into the MIME Fifo to check an encoded
 * section before it is decoded.
 *
 *   f - input stream, read one character at a time through i_getc
 *   p - NUL-terminated byte string that is copied into the Fifo first
 *
 * mime_input and mime_last are both reset to mime_top, the bytes of p are
 * appended, and characters are then read until EOF or until
 * (mime_input - mime_top) & MIME_BUF_MASK becomes 0 (buffer full).
 * Reading '=' while d is '?' counts as a successful check.  The final
 * statements cover the incomplete case: the pending character is stored,
 * mime_last is moved up to mime_input, mime_decode_mode is set to 1 and
 * switch_mime_getc() is called.
 * NOTE(review): the declarations of c and d, the return statements and
 * several braces are not visible in this listing; d presumably holds the
 * previously read character -- confirm against the full file.
 */
5234 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5238 /* In buffered mode, read until =? or NL or buffer full
5240 mime_input = mime_top;
5241 mime_last = mime_top;
5243 while(*p) Fifo(mime_input++) = *p++;
5246 while((c=(*i_getc)(f))!=EOF) {
5247 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5248 break; /* buffer full */
5250 if (c=='=' && d=='?') {
5251 /* checked. skip header, start decode */
5252 Fifo(mime_input++) = (unsigned char)c;
5253 /* mime_last_input = mime_input; */
/* Characters other than + / = ? and alphanumerics are singled out here;
   the statement controlled by this test is not visible in this listing. */
5258 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5260 /* Should we check length mod 4? */
5261 Fifo(mime_input++) = (unsigned char)c;
5264 /* In case of Incomplete MIME, no MIME decode */
5265 Fifo(mime_input++) = (unsigned char)c;
5266 mime_last = mime_input; /* point undecoded buffer */
5267 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5268 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: translate one character of the Base64 alphabet into its
 * numeric value, held in i.
 * Character constants are used as small integers, as the inline comments
 * spell out: 'G' stands in for ('a' - 26), '4' for 52, '>' for 62 and
 * '?' for 63.  These equalities hold for ASCII code values.
 * NOTE(review): the conditions guarding the first two assignments, the
 * declaration of i and the return statement are not visible in this
 * listing.
 */
5272 nkf_char base64decode(nkf_char c)
5277 i = c - 'A'; /* A..Z 0-25 */
5279 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5281 } else if (c > '/') {
5282 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5283 } else if (c == '+') {
5284 i = '>' /* 62 */ ; /* + 62 */
5286 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet: the character at index v is the symbol for value v. */
5291 static const char basis_64[] =
5292 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Byte whose low bits are merged into the next Base64 symbol by
   mimeout_addchar and close_mime. */
5294 static nkf_char b64c;
/* Capacity of the pending-output buffer used by the MIME encoder; the
   array itself has one extra element. */
5295 #define MIMEOUT_BUF_LENGTH (60)
5296 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5297 int mimeout_buf_count = 0;
/* When non-zero, open_mime skips its leading-whitespace handling and then
   resets the flag to FALSE. */
5298 int mimeout_preserve_space = 0;
/* Map a value 0..15 to an uppercase hexadecimal digit character.
   NOTE(review): c is not parenthesized in the expansion, so an argument
   containing a low-precedence operator would expand incorrectly; the
   callers visible in this listing wrap their arguments in parentheses. */
5299 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/*
 * open_mime: begin a MIME encoded section for the given output mode.
 *
 *   mode - output mode; it is looked up in mime_encode to choose the
 *          matching entry of mime_pattern (entry 0 is the default) and
 *          the matching mime_encode_method, which becomes mimeout_mode.
 *
 * Whitespace held at the front of mimeout_buf is written through o_mputc
 * unless mimeout_preserve_space is set; the flag is then cleared.  The
 * buffered characters are finally handed back to mime_putc after
 * mimeout_buf_count has been reset to 0.
 * NOTE(review): the declarations of i and j, the code that writes the
 * pattern p, and most braces and else branches are not visible in this
 * listing, so the exact control flow must be confirmed against the full
 * file.
 */
5301 void open_mime(nkf_char mode)
5303 const unsigned char *p;
5306 p = mime_pattern[0];
/* mime_encode is scanned up to its zero terminator for an entry equal to mode. */
5307 for(i=0;mime_encode[i];i++) {
5308 if (mode == mime_encode[i]) {
5309 p = mime_pattern[i];
5313 mimeout_mode = mime_encode_method[i];
/* Branch taken when base64_count is already above 45. */
5316 if (base64_count>45) {
5317 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5318 (*o_mputc)(mimeout_buf[i]);
5324 if (!mimeout_preserve_space && mimeout_buf_count>0
5325 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5326 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
/* Buffered SPACE, TAB, CR and NL characters are passed through unencoded. */
5330 if (!mimeout_preserve_space) {
5331 for (;i<mimeout_buf_count;i++) {
5332 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5333 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5334 (*o_mputc)(mimeout_buf[i]);
5341 mimeout_preserve_space = FALSE;
/* The count is saved in j and cleared before the buffered characters are
   passed to mime_putc again. */
5347 j = mimeout_buf_count;
5348 mimeout_buf_count = 0;
5350 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the current MIME encoded section.
 * Depending on mimeout_mode, the bits still held in b64c are written as a
 * final Base64 symbol: either the low 2 bits shifted left by 4, or the
 * low 4 bits shifted left by 2.
 * NOTE(review): the case labels, any padding or terminator output and the
 * bodies of the final if/else are not visible in this listing.
 */
5354 void close_mime(void)
5364 switch(mimeout_mode) {
/* Two bits left over: they form the top of the last 6-bit group. */
5369 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
/* Four bits left over: they form the top of the last 6-bit group. */
5375 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5381 if (mimeout_f!=FIXED_MIME) {
5383 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: write one byte c in the encoding selected by
 * mimeout_mode, sending the result through o_mputc.
 * The visible statements show two forms of output: two hexadecimal
 * digits produced with itoh4 for a byte that is not alphanumeric, and
 * Base64 symbols built from c together with bits kept in b64c.
 * NOTE(review): the case labels, the updates of b64c, mimeout_mode and
 * base64_count, and the branch preceding the nkf_isalnum test are not
 * visible in this listing.
 */
5388 void mimeout_addchar(nkf_char c)
5390 switch(mimeout_mode) {
5395 } else if(!nkf_isalnum(c)) {
/* High nibble first, then low nibble. */
5397 (*o_mputc)(itoh4(((c>>4)&0xf)));
5398 (*o_mputc)(itoh4((c&0xf)));
/* Top 6 bits of c. */
5407 (*o_mputc)(basis_64[c>>2]);
/* Low 2 bits of b64c followed by the top 4 bits of c. */
5412 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
/* Low 4 bits of b64c followed by the top 2 bits of c, then the low 6 bits of c. */
5418 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5419 (*o_mputc)(basis_64[c & 0x3F]);
/* The (c2, c1) pair most recently passed to mime_prechar. */
5430 nkf_char mime_lastchar2, mime_lastchar1;
/*
 * mime_prechar: called with a character pair (c2, c1) before it is
 * MIME encoded.
 * When base64_count + mimeout_buf_count/3*4 exceeds 66, o_base64conv is
 * called with (EOF,0), then (0,NL), then (0,SPACE).  When c2 is non-zero,
 * the previous c2 was 0 and the previous c1 was a non-zero, non-space
 * character, a SPACE is sent through o_base64conv.  The pair is then
 * recorded in mime_lastchar2 and mime_lastchar1.
 * The "else if (mime_lastchar2)" branch below is disabled code kept in a
 * comment.
 * NOTE(review): the enclosing conditions and closing braces, including
 * the end of the disabled branch, are not visible in this listing.
 */
5432 void mime_prechar(nkf_char c2, nkf_char c1)
5436 if (base64_count + mimeout_buf_count/3*4> 66){
5437 (*o_base64conv)(EOF,0);
5438 (*o_base64conv)(0,NL);
5439 (*o_base64conv)(0,SPACE);
5441 }/*else if (mime_lastchar2){
5442 if (c1 <=DEL && !nkf_isspace(c1)){
5443 (*o_base64conv)(0,SPACE);
5447 if (c2 && mime_lastchar2 == 0
5448 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5449 (*o_base64conv)(0,SPACE);
5452 mime_lastchar2 = c2;
5453 mime_lastchar1 = c1;
/*
 * mime_putc: MIME-encoding output stage for a single character.
 *
 *   c - character to output, or EOF
 *
 * Characters are collected in mimeout_buf.  Depending on mimeout_f,
 * mimeout_mode, output_mode and the last buffered character, the buffer
 * is written unencoded through o_mputc, written encoded through
 * mimeout_addchar, or an encoded section is started with
 * open_mime(output_mode).
 * NOTE(review): many lines of this function (declarations of i, j and
 * lastchar, line-folding output, else branches, returns and braces) are
 * not visible in this listing; the comments below describe only the
 * visible statements.
 */
5456 void mime_putc(nkf_char c)
/* Fixed MIME mode: the tests compare base64_count with 71 and c with EOF. */
5461 if (mimeout_f == FIXED_MIME){
5462 if (mimeout_mode == 'Q'){
5463 if (base64_count > 71){
5464 if (c!=CR && c!=NL) {
5471 if (base64_count > 71){
5476 if (c == EOF) { /* c==EOF */
5480 if (c != EOF) { /* c!=EOF */
5486 /* mimeout_f != FIXED_MIME */
/* On EOF the buffer count is saved in j and cleared, and buffered
   characters are passed to mimeout_addchar. */
5488 if (c == EOF) { /* c==EOF */
5489 j = mimeout_buf_count;
5490 mimeout_buf_count = 0;
5494 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5497 mimeout_addchar(mimeout_buf[i]);
5501 mimeout_addchar(mimeout_buf[i]);
5505 mimeout_addchar(mimeout_buf[i]);
5511 if (mimeout_mode=='Q') {
5512 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
/* lastchar is the most recently buffered character, when there is one. */
5524 if (mimeout_buf_count > 0){
5525 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* No encoded section is open (mimeout_mode is zero). */
5530 if (!mimeout_mode) {
5531 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5532 if (nkf_isspace(c)) {
5533 if (c==CR || c==NL) {
5536 for (i=0;i<mimeout_buf_count;i++) {
5537 (*o_mputc)(mimeout_buf[i]);
5538 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5544 mimeout_buf[0] = (char)c;
5545 mimeout_buf_count = 1;
5547 if (base64_count > 1
5548 && base64_count + mimeout_buf_count > 76){
5551 if (!nkf_isspace(mimeout_buf[0])){
/* c is appended; once the buffer holds more than MIMEOUT_BUF_LENGTH
   characters an encoded section is opened. */
5556 mimeout_buf[mimeout_buf_count++] = (char)c;
5557 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5558 open_mime(output_mode);
5563 if (lastchar==CR || lastchar == NL){
5564 for (i=0;i<mimeout_buf_count;i++) {
5565 (*o_mputc)(mimeout_buf[i]);
5568 mimeout_buf_count = 0;
/* With a trailing SPACE, everything before it is written and the SPACE
   alone stays buffered. */
5570 if (lastchar==SPACE) {
5571 for (i=0;i<mimeout_buf_count-1;i++) {
5572 (*o_mputc)(mimeout_buf[i]);
5575 mimeout_buf[0] = SPACE;
5576 mimeout_buf_count = 1;
5578 open_mime(output_mode);
5581 /* mimeout_mode == 'B', 1, 2 */
5582 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5583 if (lastchar == CR || lastchar == NL){
5584 if (nkf_isblank(c)) {
5585 for (i=0;i<mimeout_buf_count;i++) {
5586 mimeout_addchar(mimeout_buf[i]);
5588 mimeout_buf_count = 0;
5589 } else if (SPACE<c && c<DEL) {
5591 for (i=0;i<mimeout_buf_count;i++) {
5592 (*o_mputc)(mimeout_buf[i]);
5595 mimeout_buf_count = 0;
5598 if (c==SPACE || c==TAB || c==CR || c==NL) {
5599 for (i=0;i<mimeout_buf_count;i++) {
5600 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5602 for (i=0;i<mimeout_buf_count;i++) {
5603 (*o_mputc)(mimeout_buf[i]);
5606 mimeout_buf_count = 0;
5609 mimeout_buf[mimeout_buf_count++] = (char)c;
5610 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5612 for (i=0;i<mimeout_buf_count;i++) {
5613 (*o_mputc)(mimeout_buf[i]);
5616 mimeout_buf_count = 0;
5620 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5621 mimeout_buf[mimeout_buf_count++] = (char)c;
5622 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5623 j = mimeout_buf_count;
5624 mimeout_buf_count = 0;
5626 mimeout_addchar(mimeout_buf[i]);
5633 if (mimeout_buf_count>0) {
5634 j = mimeout_buf_count;
5635 mimeout_buf_count = 0;
5637 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5639 mimeout_addchar(mimeout_buf[i]);
5645 (*o_mputc)(mimeout_buf[i]);
5647 open_mime(output_mode);
/*
 * Option re-initialisation code that follows a PERL_XS / WIN32DLL
 * preprocessor test.
 * The visible statements assign fixed starting values to option flags,
 * clear prefix_table and mimeout_buf_count, set every o_*conv hook to
 * no_connection, set the unget hooks to std_ungetc and i_mgetc_buf to
 * std_getc, and reset output_mode to ASCII and mime_decode_mode to FALSE.
 * NOTE(review): the function header, the declaration of i, the matching
 * #endif lines and many assignments are not visible in this listing;
 * presumably this is the body of the routine that restores defaults
 * between conversions -- confirm against the full file.
 */
5654 #if defined(PERL_XS) || defined(WIN32DLL)
5658 struct input_code *p = input_code_list;
5671 mime_f = STRICT_MIME;
5672 mime_decode_f = FALSE;
5677 #if defined(MSDOS) || defined(__OS2__)
5682 iso2022jp_f = FALSE;
5683 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5684 ms_ucs_map_f = UCS_MAP_ASCII;
5686 #ifdef UTF8_INPUT_ENABLE
5687 no_cp932ext_f = FALSE;
5688 no_best_fit_chars_f = FALSE;
5689 encode_fallback = NULL;
5690 unicode_subchar = '?';
5691 input_endian = ENDIAN_BIG;
5693 #ifdef UTF8_OUTPUT_ENABLE
5694 output_bom_f = FALSE;
5695 output_endian = ENDIAN_BIG;
5697 #ifdef UNICODE_NORMALIZATION
5710 is_inputcode_mixed = FALSE;
5711 is_inputcode_set = FALSE;
5715 #ifdef SHIFTJIS_CP932
/* Every one of the 256 prefix_table entries is cleared. */
5725 for (i = 0; i < 256; i++){
5726 prefix_table[i] = 0;
5730 mimeout_buf_count = 0;
5735 fold_preserve_f = FALSE;
5738 kanji_intro = DEFAULT_J;
5739 ascii_intro = DEFAULT_R;
5740 fold_margin = FOLD_MARGIN;
5741 output_conv = DEFAULT_CONV;
5742 oconv = DEFAULT_CONV;
/* Optional output stages point at no_connection until they are set up. */
5743 o_zconv = no_connection;
5744 o_fconv = no_connection;
5745 o_crconv = no_connection;
5746 o_rot_conv = no_connection;
5747 o_hira_conv = no_connection;
5748 o_base64conv = no_connection;
5749 o_iso2022jp_check_conv = no_connection;
/* Input hooks go back to the plain std_getc / std_ungetc routines. */
5752 i_ungetc = std_ungetc;
5754 i_bungetc = std_ungetc;
5757 i_mungetc = std_ungetc;
5758 i_mgetc_buf = std_getc;
5759 i_mungetc_buf = std_ungetc;
5760 output_mode = ASCII;
5763 mime_decode_mode = FALSE;
5769 z_prev2=0,z_prev1=0;
5771 iconv_for_check = 0;
5773 input_codename = "";
/*
 * no_connection: two-argument form of no_connection2.
 * It forwards c2 and c1 to no_connection2 with 0 as the third argument
 * and discards the result.
 */
5780 void no_connection(nkf_char c2, nkf_char c1)
5782 no_connection2(c2,c1,0);
/*
 * no_connection2: report an internal module connection failure.
 * A fixed diagnostic is written to stderr; the arguments c2, c1 and c0
 * are not used by the visible statements.  The trailing "return 0" is
 * marked as being there for lint.
 * NOTE(review): a line between the fprintf and the return is not visible
 * in this listing; presumably the program exits there -- confirm against
 * the full file.
 */
5785 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5787 fprintf(stderr,"nkf internal module connection failure.\n");
5789 return 0; /* LINT */
/* From here on fprintf expands to dllprintf.
   NOTE(review): the preprocessor condition guarding this define is not
   visible in this listing. */
5794 #define fprintf dllprintf
/*
 * Command-line help text, written to stderr one line per fprintf call.
 * Which "Output code" line is printed depends on the DEFAULT_CODE_*
 * macro that is defined; the UTF-8 related lines depend on
 * UTF8_OUTPUT_ENABLE / UTF8_INPUT_ENABLE and the numeric character
 * reference line on NUMCHAR_OPTION.
 * NOTE(review): the function header, the #endif lines and a few other
 * guards are not visible in this listing.
 */
5798 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5799 fprintf(stderr,"Flags:\n");
5800 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5801 #ifdef DEFAULT_CODE_SJIS
5802 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5804 #ifdef DEFAULT_CODE_JIS
5805 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5807 #ifdef DEFAULT_CODE_EUC
5808 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5810 #ifdef DEFAULT_CODE_UTF8
5811 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5813 #ifdef UTF8_OUTPUT_ENABLE
5814 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5816 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5817 #ifdef UTF8_INPUT_ENABLE
5818 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5820 fprintf(stderr,"t no conversion\n");
5821 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5822 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5823 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5824 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5825 fprintf(stderr,"v Show this usage. V: show version\n");
5826 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5827 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5828 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5829 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5830 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5831 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5832 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5833 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5835 fprintf(stderr,"T Text mode output\n");
5837 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
/* NOTE(review): "charactor" in the next message is a misspelling of
   "character"; it is left unchanged here because it is user-visible
   program output. */
5838 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5839 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5840 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5841 fprintf(stderr,"\n");
5842 fprintf(stderr,"Long name options\n");
5843 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5844 fprintf(stderr," Specify the input or output codeset\n");
5845 fprintf(stderr," --fj --unix --mac --windows\n");
5846 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5847 fprintf(stderr," Convert for the system or code\n");
5848 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5849 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5850 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
/* "%%" in the format string prints a single percent sign. */
5852 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5854 #ifdef NUMCHAR_OPTION
5855 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5857 #ifdef UTF8_INPUT_ENABLE
5858 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5859 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5862 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5863 fprintf(stderr," Overwrite original listed files by filtered result\n");
5864 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5866 fprintf(stderr," -g --guess Guess the input code\n");
5867 fprintf(stderr," --help --version Show this help/the version\n");
5868 fprintf(stderr," For more information, see also man nkf\n");
5869 fprintf(stderr,"\n");
/*
 * Version banner, written to stderr.
 * The first fprintf formats NKF_VERSION and NKF_RELEASE_DATE into a
 * format string that is assembled from adjacent string literals; the
 * MSDOS / __WIN16__ / __WIN32__ / __OS2__ tests select platform-specific
 * pieces.  The second fprintf prints the CopyRight string between
 * newlines.
 * NOTE(review): the function header, the platform-specific literals and
 * the #endif lines are not visible in this listing.
 */
5875 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5876 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
5879 #if defined(MSDOS) && defined(__WIN16__)
5882 #if defined(MSDOS) && defined(__WIN32__)
5888 ,NKF_VERSION,NKF_RELEASE_DATE);
5889 fprintf(stderr,"\n%s\n",CopyRight);
5894 ** Patch authors
5895 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5896 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5897 ** ohta@src.ricoh.co.jp (Junn Ohta)
5898 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5899 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5900 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5901 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5902 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5903 ** GHG00637@nifty-serve.or.jp (COW)