1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.106 2006/09/14 19:30:03 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2006-09-15"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2006 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * Locale-independent ASCII character classification helpers.
 *
 * Every use of a macro parameter is parenthesized so that an expression
 * argument (for example `cond ? a : b`) keeps its meaning after expansion.
 * The parameters are still evaluated more than once, so callers must not
 * pass expressions with side effects (e.g. `*p++`).
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c) <= 'F'))
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (' '<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit; any non-hex character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0 )
/* True when the second-lowest byte of c2 (taken as unsigned short) equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
283 #define CP932_TABLE_BEGIN 0xFA
284 #define CP932_TABLE_END 0xFC
285 #define CP932INV_TABLE_BEGIN 0xED
286 #define CP932INV_TABLE_END 0xEE
287 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
355 #define UCS_MAP_ASCII 0
357 #define UCS_MAP_CP932 2
358 static int ms_ucs_map_f = UCS_MAP_ASCII;
360 #ifdef UTF8_INPUT_ENABLE
361 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
362 static int no_cp932ext_f = FALSE;
363 /* ignore ZERO WIDTH NO-BREAK SPACE */
364 static int no_best_fit_chars_f = FALSE;
365 static int input_endian = ENDIAN_BIG;
366 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
367 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
368 static void encode_fallback_html(nkf_char c);
369 static void encode_fallback_xml(nkf_char c);
370 static void encode_fallback_java(nkf_char c);
371 static void encode_fallback_perl(nkf_char c);
372 static void encode_fallback_subchar(nkf_char c);
373 static void (*encode_fallback)(nkf_char c) = NULL;
374 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
375 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
376 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
377 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
379 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
380 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
\r
381 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
382 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
383 static void w_status(struct input_code *, nkf_char);
385 #ifdef UTF8_OUTPUT_ENABLE
386 static int output_bom_f = FALSE;
387 static int output_endian = ENDIAN_BIG;
388 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
389 static void w_oconv(nkf_char c2,nkf_char c1);
390 static void w_oconv16(nkf_char c2,nkf_char c1);
391 static void w_oconv32(nkf_char c2,nkf_char c1);
393 static void e_oconv(nkf_char c2,nkf_char c1);
394 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
395 static void s_oconv(nkf_char c2,nkf_char c1);
396 static void j_oconv(nkf_char c2,nkf_char c1);
397 static void fold_conv(nkf_char c2,nkf_char c1);
398 static void cr_conv(nkf_char c2,nkf_char c1);
399 static void z_conv(nkf_char c2,nkf_char c1);
400 static void rot_conv(nkf_char c2,nkf_char c1);
401 static void hira_conv(nkf_char c2,nkf_char c1);
402 static void base64_conv(nkf_char c2,nkf_char c1);
403 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
404 static void no_connection(nkf_char c2,nkf_char c1);
405 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
407 static void code_score(struct input_code *ptr);
408 static void code_status(nkf_char c);
410 static void std_putc(nkf_char c);
411 static nkf_char std_getc(FILE *f);
412 static nkf_char std_ungetc(nkf_char c,FILE *f);
414 static nkf_char broken_getc(FILE *f);
415 static nkf_char broken_ungetc(nkf_char c,FILE *f);
417 static nkf_char mime_begin(FILE *f);
418 static nkf_char mime_getc(FILE *f);
419 static nkf_char mime_ungetc(nkf_char c,FILE *f);
421 static void switch_mime_getc(void);
422 static void unswitch_mime_getc(void);
423 static nkf_char mime_begin_strict(FILE *f);
424 static nkf_char mime_getc_buf(FILE *f);
425 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
426 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
428 static nkf_char base64decode(nkf_char c);
429 static void mime_prechar(nkf_char c2, nkf_char c1);
430 static void mime_putc(nkf_char c);
431 static void open_mime(nkf_char c);
432 static void close_mime(void);
433 static void eof_mime(void);
434 static void mimeout_addchar(nkf_char c);
436 static void usage(void);
437 static void version(void);
439 static void options(unsigned char *c);
440 #if defined(PERL_XS) || defined(WIN32DLL)
441 static void reinit(void);
446 #if !defined(PERL_XS) && !defined(WIN32DLL)
447 static unsigned char stdibuf[IOBUF_SIZE];
448 static unsigned char stdobuf[IOBUF_SIZE];
450 static unsigned char hold_buf[HOLD_SIZE*2];
451 static int hold_count = 0;
453 /* MIME preprocessor fifo */
455 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
456 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
457 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
458 static unsigned char mime_buf[MIME_BUF_SIZE];
459 static unsigned int mime_top = 0;
460 static unsigned int mime_last = 0; /* decoded */
461 static unsigned int mime_input = 0; /* undecoded */
462 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Conversion option flags and their initial values. */
465 static int unbuf_f = FALSE; /* presumably the -u (unbuffered output) switch -- TODO confirm */
466 static int estab_f = FALSE;
467 static int nop_f = FALSE;
468 static int binmode_f = TRUE; /* binary mode */
469 static int rot_f = FALSE; /* ROT13/47 mode */
470 static int hira_f = FALSE; /* hira/kata henkan (hiragana/katakana conversion) */
471 static int input_f = FALSE; /* non fixed input code */
472 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
473 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
474 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
475 static int mimebuf_f = FALSE; /* MIME buffered input */
476 static int broken_f = FALSE; /* convert ESC-less broken JIS */
477 static int iso8859_f = FALSE; /* ISO8859 through */
478 static int mimeout_f = FALSE; /* base64 mode */
479 #if defined(MSDOS) || defined(__OS2__)
480 static int x0201_f = TRUE; /* Assume JISX0201 kana */
482 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
484 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
486 #ifdef UNICODE_NORMALIZATION
487 static int nfc_f = FALSE;
488 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
489 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
490 static nkf_char nfc_getc(FILE *f);
491 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
495 static int cap_f = FALSE;
496 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
497 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
498 static nkf_char cap_getc(FILE *f);
499 static nkf_char cap_ungetc(nkf_char c,FILE *f);
501 static int url_f = FALSE;
502 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
503 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
504 static nkf_char url_getc(FILE *f);
505 static nkf_char url_ungetc(nkf_char c,FILE *f);
508 #if defined(INT_IS_SHORT)
509 #define NKF_INT32_C(n) (n##L)
511 #define NKF_INT32_C(n) (n)
513 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
514 #define CLASS_MASK NKF_INT32_C(0xFF000000)
515 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
516 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
517 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
518 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
519 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
521 #ifdef NUMCHAR_OPTION
522 static int numchar_f = FALSE;
523 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
524 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
525 static nkf_char numchar_getc(FILE *f);
526 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
530 static int noout_f = FALSE;
531 static void no_putc(nkf_char c);
532 static nkf_char debug_f = FALSE;
533 static void debug(const char *str);
534 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
537 static int guess_f = FALSE;
539 static void print_guessed_code(char *filename);
541 static void set_input_codename(char *codename);
542 static int is_inputcode_mixed = FALSE;
543 static int is_inputcode_set = FALSE;
546 static int exec_f = 0;
549 #ifdef SHIFTJIS_CP932
550 /* invert IBM extended characters to others */
551 static int cp51932_f = TRUE;
553 /* invert NEC-selected IBM extended characters to IBM extended characters */
554 static int cp932inv_f = TRUE;
556 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
557 #endif /* SHIFTJIS_CP932 */
560 static int x0212_f = FALSE;
561 static nkf_char x0212_shift(nkf_char c);
562 static nkf_char x0212_unshift(nkf_char c);
564 static int x0213_f = FALSE;
566 static unsigned char prefix_table[256];
568 static void set_code_score(struct input_code *ptr, nkf_char score);
569 static void clr_code_score(struct input_code *ptr, nkf_char score);
570 static void status_disable(struct input_code *ptr);
571 static void status_push_ch(struct input_code *ptr, nkf_char c);
572 static void status_clear(struct input_code *ptr);
573 static void status_reset(struct input_code *ptr);
574 static void status_reinit(struct input_code *ptr);
575 static void status_check(struct input_code *ptr, nkf_char c);
576 static void e_status(struct input_code *, nkf_char);
577 static void s_status(struct input_code *, nkf_char);
579 struct input_code input_code_list[] = {
580 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
581 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
582 #ifdef UTF8_INPUT_ENABLE
583 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
588 static int mimeout_mode = 0;
589 static int base64_count = 0;
591 /* X0208 -> ASCII converter */
594 static int f_line = 0; /* chars in line */
595 static int f_prev = 0;
596 static int fold_preserve_f = FALSE; /* preserve new lines */
597 static int fold_f = FALSE;
598 static int fold_len = 0;
601 static unsigned char kanji_intro = DEFAULT_J;
602 static unsigned char ascii_intro = DEFAULT_R;
606 #define FOLD_MARGIN 10
607 #define DEFAULT_FOLD 60
609 static int fold_margin = FOLD_MARGIN;
613 #ifdef DEFAULT_CODE_JIS
614 # define DEFAULT_CONV j_oconv
616 #ifdef DEFAULT_CODE_SJIS
617 # define DEFAULT_CONV s_oconv
619 #ifdef DEFAULT_CODE_EUC
620 # define DEFAULT_CONV e_oconv
622 #ifdef DEFAULT_CODE_UTF8
623 # define DEFAULT_CONV w_oconv
626 /* process default */
627 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
629 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
630 /* s_iconv or oconv */
631 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
633 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
634 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
635 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
636 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
637 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
641 /* static redirections */
643 static void (*o_putc)(nkf_char c) = std_putc;
645 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
646 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
648 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
649 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
651 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
653 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
654 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
656 /* for strict mime */
657 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
658 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
661 static int output_mode = ASCII, /* output kanji mode */
662 input_mode = ASCII, /* input kanji mode */
663 shift_mode = FALSE; /* TRUE shift out, or X0201 */
664 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
666 /* X0201 / X0208 conversion tables */
668 /* X0201 kana conversion table */
671 unsigned char cv[]= {
672 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
673 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
674 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
675 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
676 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
677 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
678 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
679 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
680 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
681 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
682 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
683 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
684 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
685 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
686 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
687 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
691 /* X0201 kana conversion table for dakuten */
694 unsigned char dv[]= {
695 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
696 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
700 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
701 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
702 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
703 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
704 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
705 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
706 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
707 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
708 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
713 /* X0201 kana conversion table for han-dakuten */
716 unsigned char ev[]= {
717 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
718 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
719 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
720 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
728 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
730 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
732 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
736 /* X0208 kigou conversion table */
737 /* 0x8140 - 0x819e */
739 unsigned char fv[] = {
741 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
742 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
743 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
744 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
745 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
746 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
747 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
748 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
749 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
750 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
751 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
752 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
758 static int file_out_f = FALSE;
760 static int overwrite_f = FALSE;
761 static int preserve_time_f = FALSE;
762 static int backup_f = FALSE;
763 static char *backup_suffix = "";
764 static char *get_backup_filename(const char *suffix, const char *filename);
767 static int crmode_f = 0; /* CR, NL, CRLF */
768 #ifdef EASYWIN /*Easy Win */
769 static int end_check;
772 #define STD_GC_BUFSIZE (256)
773 nkf_char std_gc_buf[STD_GC_BUFSIZE];
777 #include "nkf32dll.c"
778 #elif defined(PERL_XS)
780 int main(int argc, char **argv)
785 char *outfname = NULL;
788 #ifdef EASYWIN /*Easy Win */
789 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
792 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
793 cp = (unsigned char *)*argv;
798 if (pipe(fds) < 0 || (pid = fork()) < 0){
809 execvp(argv[1], &argv[1]);
823 if(x0201_f == WISH_TRUE)
824 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
826 if (binmode_f == TRUE)
827 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
828 if (freopen("","wb",stdout) == NULL)
835 setbuf(stdout, (char *) NULL);
837 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
840 if (binmode_f == TRUE)
841 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
842 if (freopen("","rb",stdin) == NULL) return (-1);
846 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
850 kanji_convert(stdin);
851 if (guess_f) print_guessed_code(NULL);
856 is_inputcode_mixed = FALSE;
857 is_inputcode_set = FALSE;
862 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
871 /* reopen file for stdout */
872 if (file_out_f == TRUE) {
875 outfname = malloc(strlen(origfname)
876 + strlen(".nkftmpXXXXXX")
882 strcpy(outfname, origfname);
886 for (i = strlen(outfname); i; --i){
887 if (outfname[i - 1] == '/'
888 || outfname[i - 1] == '\\'){
894 strcat(outfname, "ntXXXXXX");
896 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
899 strcat(outfname, ".nkftmpXXXXXX");
900 fd = mkstemp(outfname);
903 || (fd_backup = dup(fileno(stdout))) < 0
904 || dup2(fd, fileno(stdout)) < 0
915 outfname = "nkf.out";
918 if(freopen(outfname, "w", stdout) == NULL) {
922 if (binmode_f == TRUE) {
923 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
924 if (freopen("","wb",stdout) == NULL)
931 if (binmode_f == TRUE)
932 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
933 if (freopen("","rb",fin) == NULL)
938 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
942 char *filename = NULL;
944 if (nfiles > 1) filename = origfname;
945 if (guess_f) print_guessed_code(filename);
951 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
959 if (dup2(fd_backup, fileno(stdout)) < 0){
962 if (stat(origfname, &sb)) {
963 fprintf(stderr, "Can't stat %s\n", origfname);
965 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
966 if (chmod(outfname, sb.st_mode)) {
967 fprintf(stderr, "Can't set permission %s\n", outfname);
970 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
972 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
973 tb[0] = tb[1] = sb.st_mtime;
974 if (utime(outfname, tb)) {
975 fprintf(stderr, "Can't set timestamp %s\n", outfname);
978 tb.actime = sb.st_atime;
979 tb.modtime = sb.st_mtime;
980 if (utime(outfname, &tb)) {
981 fprintf(stderr, "Can't set timestamp %s\n", outfname);
986 char *backup_filename = get_backup_filename(backup_suffix, origfname);
988 unlink(backup_filename);
990 if (rename(origfname, backup_filename)) {
991 perror(backup_filename);
992 fprintf(stderr, "Can't rename %s to %s\n",
993 origfname, backup_filename);
997 if (unlink(origfname)){
1002 if (rename(outfname, origfname)) {
1004 fprintf(stderr, "Can't rename %s to %s\n",
1005 outfname, origfname);
1013 #ifdef EASYWIN /*Easy Win */
1014 if (file_out_f == FALSE)
1015 scanf("%d",&end_check);
1018 #else /* for Other OS */
1019 if (file_out_f == TRUE)
1021 #endif /*Easy Win */
1024 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename`.
 *
 * suffix   - backup pattern.  When it contains one or more '*' characters,
 *            each '*' is replaced by `filename` and every other character
 *            is copied verbatim.  When it contains no '*', the result is
 *            `filename` followed by `suffix`.
 * filename - name of the file being backed up.
 *
 * Returns a newly malloc()ed, NUL-terminated string owned by the caller,
 * or NULL (after reporting through perror) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    const size_t suffix_length = strlen(suffix);
    const size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each '*' gives up its own byte and takes filename_length bytes */
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
    } else {
        /* room for filename + suffix + terminator (was malloc( + 1): 1 byte) */
        backup_filename = malloc(filename_length + suffix_length + 1);
    }
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
    } else {
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
        j = filename_length + suffix_length;
    }
    backup_filename[j] = '\0';

    return backup_filename;
}
1092 {"katakana-hiragana","h3"},
1099 #ifdef UTF8_OUTPUT_ENABLE
1109 {"fb-subchar=", ""},
1111 #ifdef UTF8_INPUT_ENABLE
1112 {"utf8-input", "W"},
1113 {"utf16-input", "W16"},
1114 {"no-cp932ext", ""},
1115 {"no-best-fit-chars",""},
1117 #ifdef UNICODE_NORMALIZATION
1118 {"utf8mac-input", ""},
1130 #ifdef NUMCHAR_OPTION
1131 {"numchar-input", ""},
1137 #ifdef SHIFTJIS_CP932
1147 static int option_mode = 0;
1149 void options(unsigned char *cp)
1153 unsigned char *cp_back = NULL;
1158 while(*cp && *cp++!='-');
1159 while (*cp || cp_back) {
1167 case '-': /* literal options */
1168 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1172 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1173 p = (unsigned char *)long_option[i].name;
1174 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1175 if (*p == cp[j] || cp[j] == ' '){
1182 while(*cp && *cp != SPACE && cp++);
1183 if (long_option[i].alias[0]){
1185 cp = (unsigned char *)long_option[i].alias;
1187 if (strcmp(long_option[i].name, "ic=") == 0){
1188 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1189 codeset[i] = nkf_toupper(p[i]);
1192 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1193 strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1194 strcmp(codeset, "CP50220") == 0 ||
1195 strcmp(codeset, "CP50221") == 0 ||
1196 strcmp(codeset, "CP50222") == 0 ||
1197 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1198 input_f = JIS_INPUT;
1199 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1200 input_f = JIS_INPUT;
1204 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1205 input_f = JIS_INPUT;
1210 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1211 input_f = SJIS_INPUT;
1212 if (x0201_f==NO_X0201) x0201_f=TRUE;
1213 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1214 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1215 strcmp(codeset, "CP932") == 0 ||
1216 strcmp(codeset, "MS932") == 0){
1217 input_f = SJIS_INPUT;
1219 #ifdef SHIFTJIS_CP932
1222 #ifdef UTF8_OUTPUT_ENABLE
1223 ms_ucs_map_f = UCS_MAP_CP932;
1225 }else if(strcmp(codeset, "EUCJP") == 0 ||
1226 strcmp(codeset, "EUC-JP") == 0){
1227 input_f = EUC_INPUT;
1228 }else if(strcmp(codeset, "CP51932") == 0){
1229 input_f = EUC_INPUT;
1231 #ifdef SHIFTJIS_CP932
1234 #ifdef UTF8_OUTPUT_ENABLE
1235 ms_ucs_map_f = UCS_MAP_CP932;
1237 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1238 strcmp(codeset, "EUCJP-MS") == 0 ||
1239 strcmp(codeset, "EUCJPMS") == 0){
1240 input_f = EUC_INPUT;
1242 #ifdef SHIFTJIS_CP932
1245 #ifdef UTF8_OUTPUT_ENABLE
1246 ms_ucs_map_f = UCS_MAP_MS;
1248 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1249 strcmp(codeset, "EUCJP-ASCII") == 0){
1250 input_f = EUC_INPUT;
1252 #ifdef SHIFTJIS_CP932
1255 #ifdef UTF8_OUTPUT_ENABLE
1256 ms_ucs_map_f = UCS_MAP_ASCII;
1258 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1259 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1260 input_f = SJIS_INPUT;
1262 #ifdef SHIFTJIS_CP932
1266 if (x0201_f==NO_X0201) x0201_f=TRUE;
1267 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1268 strcmp(codeset, "EUC-JIS-2004") == 0){
1269 input_f = EUC_INPUT;
1272 #ifdef SHIFTJIS_CP932
1276 #ifdef UTF8_INPUT_ENABLE
1277 }else if(strcmp(codeset, "UTF-8") == 0 ||
1278 strcmp(codeset, "UTF-8N") == 0 ||
1279 strcmp(codeset, "UTF-8-BOM") == 0){
1280 input_f = UTF8_INPUT;
1281 #ifdef UNICODE_NORMALIZATION
1282 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1283 strcmp(codeset, "UTF-8-MAC") == 0){
1284 input_f = UTF8_INPUT;
1287 }else if(strcmp(codeset, "UTF-16") == 0 ||
1288 strcmp(codeset, "UTF-16BE") == 0 ||
1289 strcmp(codeset, "UTF-16BE-BOM") == 0){
1290 input_f = UTF16_INPUT;
1291 input_endian = ENDIAN_BIG;
1292 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1293 strcmp(codeset, "UTF-16LE-BOM") == 0){
1294 input_f = UTF16_INPUT;
1295 input_endian = ENDIAN_LITTLE;
1300 if (strcmp(long_option[i].name, "oc=") == 0){
1301 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1302 codeset[i] = nkf_toupper(p[i]);
1305 if(strcmp(codeset, "ISO-2022-JP") == 0 ||
1306 strcmp(codeset, "CP50220") == 0){
1307 output_conv = j_oconv;
1308 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1309 output_conv = j_oconv;
1310 no_cp932ext_f = TRUE;
1311 }else if(strcmp(codeset, "CP50221") == 0 ||
1312 strcmp(codeset, "ISO-2022-JP-MS") == 0){
1313 output_conv = j_oconv;
1315 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1316 output_conv = j_oconv;
1320 #ifdef SHIFTJIS_CP932
1323 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1324 output_conv = j_oconv;
1329 #ifdef SHIFTJIS_CP932
1332 }else if(strcmp(codeset, "ISO-2022-JP-MS") == 0){
1333 output_conv = j_oconv;
1338 #ifdef SHIFTJIS_CP932
1341 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1342 output_conv = s_oconv;
1343 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1344 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1345 strcmp(codeset, "CP932") == 0 ||
1346 strcmp(codeset, "MS932") == 0){
1347 output_conv = s_oconv;
1349 #ifdef SHIFTJIS_CP932
1353 #ifdef UTF8_OUTPUT_ENABLE
1354 ms_ucs_map_f = UCS_MAP_CP932;
1356 }else if(strcmp(codeset, "EUCJP") == 0 ||
1357 strcmp(codeset, "EUC-JP") == 0){
1358 output_conv = e_oconv;
1359 }else if(strcmp(codeset, "CP51932") == 0){
1360 output_conv = e_oconv;
1362 #ifdef SHIFTJIS_CP932
1365 #ifdef UTF8_OUTPUT_ENABLE
1366 ms_ucs_map_f = UCS_MAP_CP932;
1368 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1369 strcmp(codeset, "EUCJP-MS") == 0 ||
1370 strcmp(codeset, "EUCJPMS") == 0){
1371 output_conv = e_oconv;
1376 #ifdef SHIFTJIS_CP932
1379 #ifdef UTF8_OUTPUT_ENABLE
1380 ms_ucs_map_f = UCS_MAP_MS;
1382 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1383 strcmp(codeset, "EUCJP-ASCII") == 0){
1384 output_conv = e_oconv;
1389 #ifdef SHIFTJIS_CP932
1392 #ifdef UTF8_OUTPUT_ENABLE
1393 ms_ucs_map_f = UCS_MAP_ASCII;
1395 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1396 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1397 output_conv = s_oconv;
1399 #ifdef SHIFTJIS_CP932
1402 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1403 strcmp(codeset, "EUC-JIS-2004") == 0){
1404 output_conv = e_oconv;
1409 #ifdef SHIFTJIS_CP932
1412 #ifdef UTF8_OUTPUT_ENABLE
1413 }else if(strcmp(codeset, "UTF-8") == 0){
1414 output_conv = w_oconv;
1415 }else if(strcmp(codeset, "UTF-8N") == 0){
1416 output_conv = w_oconv;
1417 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1418 output_conv = w_oconv;
1419 output_bom_f = TRUE;
1420 }else if(strcmp(codeset, "UTF-16BE") == 0){
1421 output_conv = w_oconv16;
1422 }else if(strcmp(codeset, "UTF-16") == 0 ||
1423 strcmp(codeset, "UTF-16BE-BOM") == 0){
1424 output_conv = w_oconv16;
1425 output_bom_f = TRUE;
1426 }else if(strcmp(codeset, "UTF-16LE") == 0){
1427 output_conv = w_oconv16;
1428 output_endian = ENDIAN_LITTLE;
1429 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1430 output_conv = w_oconv16;
1431 output_endian = ENDIAN_LITTLE;
1432 output_bom_f = TRUE;
1433 }else if(strcmp(codeset, "UTF-32") == 0 ||
1434 strcmp(codeset, "UTF-32BE") == 0){
1435 output_conv = w_oconv32;
1436 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1437 output_conv = w_oconv32;
1438 output_bom_f = TRUE;
1439 }else if(strcmp(codeset, "UTF-32LE") == 0){
1440 output_conv = w_oconv32;
1441 output_endian = ENDIAN_LITTLE;
1442 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1443 output_conv = w_oconv32;
1444 output_endian = ENDIAN_LITTLE;
1445 output_bom_f = TRUE;
1451 if (strcmp(long_option[i].name, "overwrite") == 0){
1454 preserve_time_f = TRUE;
1457 if (strcmp(long_option[i].name, "overwrite=") == 0){
1460 preserve_time_f = TRUE;
1462 backup_suffix = malloc(strlen((char *) p) + 1);
1463 strcpy(backup_suffix, (char *) p);
1466 if (strcmp(long_option[i].name, "in-place") == 0){
1469 preserve_time_f = FALSE;
1472 if (strcmp(long_option[i].name, "in-place=") == 0){
1475 preserve_time_f = FALSE;
1477 backup_suffix = malloc(strlen((char *) p) + 1);
1478 strcpy(backup_suffix, (char *) p);
1483 if (strcmp(long_option[i].name, "cap-input") == 0){
1487 if (strcmp(long_option[i].name, "url-input") == 0){
1492 #ifdef NUMCHAR_OPTION
1493 if (strcmp(long_option[i].name, "numchar-input") == 0){
1499 if (strcmp(long_option[i].name, "no-output") == 0){
1503 if (strcmp(long_option[i].name, "debug") == 0){
1508 if (strcmp(long_option[i].name, "cp932") == 0){
1509 #ifdef SHIFTJIS_CP932
1513 #ifdef UTF8_OUTPUT_ENABLE
1514 ms_ucs_map_f = UCS_MAP_CP932;
1518 if (strcmp(long_option[i].name, "no-cp932") == 0){
1519 #ifdef SHIFTJIS_CP932
1523 #ifdef UTF8_OUTPUT_ENABLE
1524 ms_ucs_map_f = UCS_MAP_ASCII;
1528 #ifdef SHIFTJIS_CP932
1529 if (strcmp(long_option[i].name, "cp932inv") == 0){
1536 if (strcmp(long_option[i].name, "x0212") == 0){
1543 if (strcmp(long_option[i].name, "exec-in") == 0){
1547 if (strcmp(long_option[i].name, "exec-out") == 0){
1552 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1553 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1554 no_cp932ext_f = TRUE;
1557 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1558 no_best_fit_chars_f = TRUE;
1561 if (strcmp(long_option[i].name, "fb-skip") == 0){
1562 encode_fallback = NULL;
1565 if (strcmp(long_option[i].name, "fb-html") == 0){
1566 encode_fallback = encode_fallback_html;
1569 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1570 encode_fallback = encode_fallback_xml;
1573 if (strcmp(long_option[i].name, "fb-java") == 0){
1574 encode_fallback = encode_fallback_java;
1577 if (strcmp(long_option[i].name, "fb-perl") == 0){
1578 encode_fallback = encode_fallback_perl;
1581 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1582 encode_fallback = encode_fallback_subchar;
1585 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1586 encode_fallback = encode_fallback_subchar;
1587 unicode_subchar = 0;
1589 /* decimal number */
1590 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1591 unicode_subchar *= 10;
1592 unicode_subchar += hex2bin(p[i]);
1594 }else if(p[1] == 'x' || p[1] == 'X'){
1595 /* hexadecimal number */
1596 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1597 unicode_subchar <<= 4;
1598 unicode_subchar |= hex2bin(p[i]);
1602 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1603 unicode_subchar *= 8;
1604 unicode_subchar += hex2bin(p[i]);
1607 w16e_conv(unicode_subchar, &i, &j);
1608 unicode_subchar = i<<8 | j;
1612 #ifdef UTF8_OUTPUT_ENABLE
1613 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1614 ms_ucs_map_f = UCS_MAP_MS;
1618 #ifdef UNICODE_NORMALIZATION
1619 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1620 input_f = UTF8_INPUT;
1625 if (strcmp(long_option[i].name, "prefix=") == 0){
1626 if (nkf_isgraph(p[0])){
1627 for (i = 1; nkf_isgraph(p[i]); i++){
1628 prefix_table[p[i]] = p[0];
1635 case 'b': /* buffered mode */
1638 case 'u': /* non bufferd mode */
1641 case 't': /* transparent mode */
1646 } else if (*cp=='2') {
1650 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1658 case 'j': /* JIS output */
1660 output_conv = j_oconv;
1662 case 'e': /* AT&T EUC output */
1663 output_conv = e_oconv;
1665 case 's': /* SJIS output */
1666 output_conv = s_oconv;
1668 case 'l': /* ISO8859 Latin-1 support, no conversion */
1669 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1670 input_f = LATIN1_INPUT;
1672 case 'i': /* Kanji IN ESC-$-@/B */
1673 if (*cp=='@'||*cp=='B')
1674 kanji_intro = *cp++;
1676 case 'o': /* ASCII IN ESC-(-J/B */
1677 if (*cp=='J'||*cp=='B'||*cp=='H')
1678 ascii_intro = *cp++;
1682 bit:1 katakana->hiragana
1683 bit:2 hiragana->katakana
1685 if ('9'>= *cp && *cp>='0')
1686 hira_f |= (*cp++ -'0');
1693 #if defined(MSDOS) || defined(__OS2__)
1708 #ifdef UTF8_OUTPUT_ENABLE
1709 case 'w': /* UTF-8 output */
1711 output_conv = w_oconv; cp++;
1715 output_bom_f = TRUE;
1718 if ('1'== cp[0] && '6'==cp[1]) {
1719 output_conv = w_oconv16; cp+=2;
1720 } else if ('3'== cp[0] && '2'==cp[1]) {
1721 output_conv = w_oconv32; cp+=2;
1723 output_conv = w_oconv;
1728 output_endian = ENDIAN_LITTLE;
1729 } else if (cp[0] == 'B') {
1737 output_bom_f = TRUE;
1742 #ifdef UTF8_INPUT_ENABLE
1743 case 'W': /* UTF input */
1746 input_f = UTF8_INPUT;
1748 if ('1'== cp[0] && '6'==cp[1]) {
1750 input_f = UTF16_INPUT;
1751 input_endian = ENDIAN_BIG;
1752 } else if ('3'== cp[0] && '2'==cp[1]) {
1754 input_f = UTF32_INPUT;
1755 input_endian = ENDIAN_BIG;
1757 input_f = UTF8_INPUT;
1762 input_endian = ENDIAN_LITTLE;
1763 } else if (cp[0] == 'B') {
1769 /* Input code assumption */
1770 case 'J': /* JIS input */
1771 input_f = JIS_INPUT;
1773 case 'E': /* AT&T EUC input */
1774 input_f = EUC_INPUT;
1776 case 'S': /* MS Kanji input */
1777 input_f = SJIS_INPUT;
1778 if (x0201_f==NO_X0201) x0201_f=TRUE;
1780 case 'Z': /* Convert X0208 alphabet to asii */
1781 /* bit:0 Convert X0208
1782 bit:1 Convert Kankaku to one space
1783 bit:2 Convert Kankaku to two spaces
1784 bit:3 Convert HTML Entity
1786 if ('9'>= *cp && *cp>='0')
1787 alpha_f |= 1<<(*cp++ -'0');
1791 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1792 x0201_f = FALSE; /* No X0201->X0208 conversion */
1794 ESC-(-I in JIS, EUC, MS Kanji
1795 SI/SO in JIS, EUC, MS Kanji
1796 SSO in EUC, JIS, not in MS Kanji
1797 MS Kanji (0xa0-0xdf)
1799 ESC-(-I in JIS (0x20-0x5f)
1800 SSO in EUC (0xa0-0xdf)
1801 0xa0-0xd in MS Kanji (0xa0-0xdf)
1804 case 'X': /* Assume X0201 kana */
1805 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1808 case 'F': /* prserve new lines */
1809 fold_preserve_f = TRUE;
1810 case 'f': /* folding -f60 or -f */
1813 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1815 fold_len += *cp++ - '0';
1817 if (!(0<fold_len && fold_len<BUFSIZ))
1818 fold_len = DEFAULT_FOLD;
1822 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1824 fold_margin += *cp++ - '0';
1828 case 'm': /* MIME support */
1829 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1830 if (*cp=='B'||*cp=='Q') {
1831 mime_decode_mode = *cp++;
1832 mimebuf_f = FIXED_MIME;
1833 } else if (*cp=='N') {
1834 mime_f = TRUE; cp++;
1835 } else if (*cp=='S') {
1836 mime_f = STRICT_MIME; cp++;
1837 } else if (*cp=='0') {
1838 mime_decode_f = FALSE;
1839 mime_f = FALSE; cp++;
1842 case 'M': /* MIME output */
1845 mimeout_f = FIXED_MIME; cp++;
1846 } else if (*cp=='Q') {
1848 mimeout_f = FIXED_MIME; cp++;
1853 case 'B': /* Broken JIS support */
1855 bit:1 allow any x on ESC-(-x or ESC-$-x
1856 bit:2 reset to ascii on NL
1858 if ('9'>= *cp && *cp>='0')
1859 broken_f |= 1<<(*cp++ -'0');
1864 case 'O':/* for Output file */
1868 case 'c':/* add cr code */
1871 case 'd':/* delete cr code */
1874 case 'I': /* ISO-2022-JP output */
1877 case 'L': /* line mode */
1878 if (*cp=='u') { /* unix */
1879 crmode_f = NL; cp++;
1880 } else if (*cp=='m') { /* mac */
1881 crmode_f = CR; cp++;
1882 } else if (*cp=='w') { /* windows */
1883 crmode_f = CRLF; cp++;
1884 } else if (*cp=='0') { /* no conversion */
1894 /* module: multiple options in a string are allowed for the Perl module */
1895 while(*cp && *cp++!='-');
1898 /* bogus option but ignored */
/*
 * Look through input_code_list for the entry whose iconv_func is the given
 * converter function.  The signature appears twice: as an ANSI prototype
 * under ANSI_C_PROTOTYPE and in K&R (identifier-list) form as the
 * alternative.
 * NOTE(review): presumably yields the matching entry -- confirm against the
 * return statements.
 */
1904 #ifdef ANSI_C_PROTOTYPE
1905 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1907 struct input_code * find_inputcode_byfunc(iconv_func)
1908 nkf_char (*iconv_func)();
1912 struct input_code *p = input_code_list;
1914 if (iconv_func == p->iconv_func){
/*
 * Select the input converter.
 *   f          - establishment flag; -TRUE means "force" (see the inline note
 *                in the INPUT_CODE_FIX condition).
 *   iconv_func - converter to install.
 * Once the encoding is established (estab_f) and the active iconv differs
 * from iconv_for_check, the matching input_code entry is looked up, its name
 * is recorded with set_input_codename() and reported through debug(), and
 * iconv_for_check is brought in line with iconv.
 */
1923 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1925 #ifdef INPUT_CODE_FIX
1933 #ifdef INPUT_CODE_FIX
1934 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1940 if (estab_f && iconv_for_check != iconv){
1941 struct input_code *p = find_inputcode_byfunc(iconv);
1943 set_input_codename(p->name);
1944 debug(input_codename);
1946 iconv_for_check = iconv;
/*
 * Score bits OR-ed into an input_code score (see set_code_score).  Each flag
 * is the previous one shifted left by one bit.
 */
1951 #define SCORE_L2 (1) /* JIS level-2 kanji */
1952 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1953 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1954 #ifdef SHIFTJIS_CP932
1955 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character reinterpreted via CP932 */
1956 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* non-existent character */
1958 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* non-existent character */
1960 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
1961 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1963 #define SCORE_INIT (SCORE_iMIME) /* score assigned by status_reset */
/*
 * Score lookup used by code_score when (c2 & 0x70) == 0x20; the index is the
 * low nibble of c2.
 */
1965 const nkf_char score_table_A0[] = {
1968 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1969 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score lookup used by code_score when (c2 & 0x70) == 0x70; the index is the
 * low nibble of c2.
 */
1972 const nkf_char score_table_F0[] = {
1973 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1974 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1975 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1976 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* OR the given SCORE_* bits into ptr->score. */
1979 void set_code_score(struct input_code *ptr, nkf_char score)
1982 ptr->score |= score;
/* Clear the given SCORE_* bits from ptr->score. */
1986 void clr_code_score(struct input_code *ptr, nkf_char score)
1989 ptr->score &= ~score;
/*
 * Update ptr->score from the character held in ptr->buf (buf[0] is read as
 * c2, buf[1] as c1 in UTF8_OUTPUT_ENABLE builds):
 *   - an SSO lead adds SCORE_KANA;
 *   - with UTF8_OUTPUT_ENABLE, a pair for which e2w_conv() returns 0 adds
 *     SCORE_NO_EXIST;
 *   - otherwise the row selected by (c2 & 0x70) picks a value from
 *     score_table_A0 / score_table_F0, or SCORE_L2 for 0x50 and above.
 */
1993 void code_score(struct input_code *ptr)
1995 nkf_char c2 = ptr->buf[0];
1996 #ifdef UTF8_OUTPUT_ENABLE
1997 nkf_char c1 = ptr->buf[1];
2000 set_code_score(ptr, SCORE_ERROR);
2001 }else if (c2 == SSO){
2002 set_code_score(ptr, SCORE_KANA);
2003 #ifdef UTF8_OUTPUT_ENABLE
2004 }else if (!e2w_conv(c2, c1)){
2005 set_code_score(ptr, SCORE_NO_EXIST);
2007 }else if ((c2 & 0x70) == 0x20){
2008 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2009 }else if ((c2 & 0x70) == 0x70){
2010 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2011 }else if ((c2 & 0x70) >= 0x50){
2012 set_code_score(ptr, SCORE_L2);
/*
 * Disable one candidate encoding.  If its converter is the one currently
 * installed in iconv, the selection is reset with set_iconv(FALSE, 0).
 */
2016 void status_disable(struct input_code *ptr)
2021 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * Append c to ptr->buf and advance ptr->index.
 * NOTE(review): no bounds check is made here -- callers are assumed to keep
 * index within buf; confirm.
 */
2024 void status_push_ch(struct input_code *ptr, nkf_char c)
2026 ptr->buf[ptr->index++] = c;
/* Clear the per-character state of one candidate -- TODO confirm which fields are reset. */
2029 void status_clear(struct input_code *ptr)
/* Reset one candidate; its score goes back to SCORE_INIT. */
2035 void status_reset(struct input_code *ptr)
2038 ptr->score = SCORE_INIT;
/* Re-initialise one candidate; its _file_stat is set back to 0. */
2041 void status_reinit(struct input_code *ptr)
2044 ptr->_file_stat = 0;
/*
 * Acts only when c is at or below DEL and the input encoding is already
 * established (estab_f).
 */
2047 void status_check(struct input_code *ptr, nkf_char c)
2049 if (c <= DEL && estab_f){
/*
 * Feed one input byte c to the Shift_JIS candidate ptr.
 *   - 0xa1..0xdf is pushed as the pair SSO, c.
 *   - Lead bytes 0x81..0x9f and 0xe0..0xef are pushed to await a trail byte;
 *     with SHIFTJIS_CP932 the IBM-extension leads (is_ibmext_in_sjis) and,
 *     when x0212_f is set, 0xf0..0xfc are accepted as well.
 *   - A trail byte must lie in 0x40..0x7e or 0x80..0xfc; the completed pair
 *     is rewritten in place in ptr->buf by s2e_conv().  In the CP932 path a
 *     zero return from s2e_conv() adds SCORE_CP932.
 *   - Other bytes disable the candidate via status_disable().
 */
2054 void s_status(struct input_code *ptr, nkf_char c)
2058 status_check(ptr, c);
2063 #ifdef NUMCHAR_OPTION
2064 }else if (is_unicode_capsule(c)){
2067 }else if (0xa1 <= c && c <= 0xdf){
2068 status_push_ch(ptr, SSO);
2069 status_push_ch(ptr, c);
2072 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2074 status_push_ch(ptr, c);
2075 #ifdef SHIFTJIS_CP932
2077 && is_ibmext_in_sjis(c)){
2079 status_push_ch(ptr, c);
2080 #endif /* SHIFTJIS_CP932 */
2082 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2084 status_push_ch(ptr, c);
2085 #endif /* X0212_ENABLE */
2087 status_disable(ptr);
2091 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2092 status_push_ch(ptr, c);
2093 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2097 status_disable(ptr);
2101 #ifdef SHIFTJIS_CP932
2102 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2103 status_push_ch(ptr, c);
2104 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2105 set_code_score(ptr, SCORE_CP932);
2110 #endif /* SHIFTJIS_CP932 */
2111 #ifndef X0212_ENABLE
2112 status_disable(ptr);
/*
 * Feed one input byte c to the EUC candidate ptr.
 *   - SSO or a byte in 0xa1..0xfe starts a character and is pushed.
 *   - 0x8f is accepted as a lead in the X0212_ENABLE section.
 *   - Following bytes must lie in 0xa1..0xfe to be pushed.
 *   - Other bytes disable the candidate via status_disable().
 */
2118 void e_status(struct input_code *ptr, nkf_char c)
2122 status_check(ptr, c);
2127 #ifdef NUMCHAR_OPTION
2128 }else if (is_unicode_capsule(c)){
2131 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2133 status_push_ch(ptr, c);
2135 }else if (0x8f == c){
2137 status_push_ch(ptr, c);
2138 #endif /* X0212_ENABLE */
2140 status_disable(ptr);
2144 if (0xa1 <= c && c <= 0xfe){
2145 status_push_ch(ptr, c);
2149 status_disable(ptr);
2154 if (0xa1 <= c && c <= 0xfe){
2156 status_push_ch(ptr, c);
2158 status_disable(ptr);
2160 #endif /* X0212_ENABLE */
/*
 * Feed one input byte c to the UTF-8 candidate ptr.
 *   - Lead bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 are pushed.
 *   - Continuation bytes must lie in 0x80..0xbf.
 *   - When the pushed count exceeds ptr->stat the sequence is checked for
 *     the UTF-8 BOM (EF BB BF) and the first three bytes are passed to
 *     w2e_conv(), which writes its result back into ptr->buf[0..1].
 *   - Other bytes disable the candidate via status_disable().
 */
2164 #ifdef UTF8_INPUT_ENABLE
2165 void w_status(struct input_code *ptr, nkf_char c)
2169 status_check(ptr, c);
2174 #ifdef NUMCHAR_OPTION
2175 }else if (is_unicode_capsule(c)){
2178 }else if (0xc0 <= c && c <= 0xdf){
2180 status_push_ch(ptr, c);
2181 }else if (0xe0 <= c && c <= 0xef){
2183 status_push_ch(ptr, c);
2184 }else if (0xf0 <= c && c <= 0xf4){
2186 status_push_ch(ptr, c);
2188 status_disable(ptr);
2193 if (0x80 <= c && c <= 0xbf){
2194 status_push_ch(ptr, c);
2195 if (ptr->index > ptr->stat){
2196 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2197 && ptr->buf[2] == 0xbf);
2198 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2199 &ptr->buf[0], &ptr->buf[1]);
2206 status_disable(ptr);
2210 if (0x80 <= c && c <= 0xbf){
2211 if (ptr->index < ptr->stat){
2212 status_push_ch(ptr, c);
2217 status_disable(ptr);
/*
 * Feed byte c to the status_func of the candidates in input_code_list.
 * If a result candidate was chosen and the encoding is not yet established
 * (estab_f is false), its converter is installed with set_iconv(TRUE, ...).
 * Otherwise, for c at or below DEL, the candidate list is visited again from
 * its head.
 */
2224 void code_status(nkf_char c)
2226 int action_flag = 1;
2227 struct input_code *result = 0;
2228 struct input_code *p = input_code_list;
2230 (p->status_func)(p, c);
2233 }else if(p->stat == 0){
2244 if (result && !estab_f){
2245 set_iconv(TRUE, result->iconv_func);
2246 }else if (c <= DEL){
2247 struct input_code *ptr = input_code_list;
/*
 * Character input for stream f.  Pushed-back characters are taken from the
 * top of std_gc_buf (last pushed, first returned).
 */
2257 nkf_char std_getc(FILE *f)
2260 return std_gc_buf[--std_gc_ndx];
/*
 * Push c back onto std_gc_buf.  The full-buffer case
 * (std_gc_ndx == STD_GC_BUFSIZE) is tested before the store.
 */
2266 nkf_char std_ungetc(nkf_char c, FILE *f)
2268 if (std_gc_ndx == STD_GC_BUFSIZE){
2271 std_gc_buf[std_gc_ndx++] = c;
/* Character output primitive for c -- TODO confirm the destination stream. */
2276 void std_putc(nkf_char c)
/*
 * Built only when neither PERL_XS nor WIN32DLL is defined.  Sets up the
 * filter chain with module_connection() and then reads f through i_getc
 * until EOF.
 */
2283 #if !defined(PERL_XS) && !defined(WIN32DLL)
2284 nkf_char noconvert(FILE *f)
2289 module_connection();
2290 while ((c = (*i_getc)(f)) != EOF)
/*
 * Build the output and input filter chains.
 * Output side: starting from output_conv, each enabled stage saves the
 * current oconv in its own o_* slot and installs itself as oconv, so the
 * stage installed last runs first.
 * Input side: each enabled stage saves i_getc / i_ungetc in its own slots
 * and installs its getc / ungetc pair in their place.
 * Finally the input converter is chosen from input_f: a known input type is
 * forced with set_iconv(-TRUE, ...), otherwise e_iconv is installed with
 * set_iconv(FALSE, ...).
 */
2297 void module_connection(void)
2299 oconv = output_conv;
2302 /* replace continuation module, from output side */
2304 /* output redirection */
2306 if (noout_f || guess_f){
2313 if (mimeout_f == TRUE) {
2314 o_base64conv = oconv; oconv = base64_conv;
2316 /* base64_count = 0; */
2320 o_crconv = oconv; oconv = cr_conv;
2323 o_rot_conv = oconv; oconv = rot_conv;
2326 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2329 o_hira_conv = oconv; oconv = hira_conv;
2332 o_fconv = oconv; oconv = fold_conv;
2335 if (alpha_f || x0201_f) {
2336 o_zconv = oconv; oconv = z_conv;
2340 i_ungetc = std_ungetc;
2341 /* input redirection */
2344 i_cgetc = i_getc; i_getc = cap_getc;
2345 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2348 i_ugetc = i_getc; i_getc = url_getc;
2349 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2352 #ifdef NUMCHAR_OPTION
2354 i_ngetc = i_getc; i_getc = numchar_getc;
2355 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2358 #ifdef UNICODE_NORMALIZATION
2359 if (nfc_f && input_f == UTF8_INPUT){
2360 i_nfc_getc = i_getc; i_getc = nfc_getc;
2361 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2364 if (mime_f && mimebuf_f==FIXED_MIME) {
2365 i_mgetc = i_getc; i_getc = mime_getc;
2366 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2369 i_bgetc = i_getc; i_getc = broken_getc;
2370 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2372 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2373 set_iconv(-TRUE, e_iconv);
2374 } else if (input_f == SJIS_INPUT) {
2375 set_iconv(-TRUE, s_iconv);
2376 #ifdef UTF8_INPUT_ENABLE
2377 } else if (input_f == UTF8_INPUT) {
2378 set_iconv(-TRUE, w_iconv);
2379 } else if (input_f == UTF16_INPUT) {
2380 set_iconv(-TRUE, w_iconv16);
2381 } else if (input_f == UTF32_INPUT) {
2382 set_iconv(-TRUE, w_iconv32);
2385 set_iconv(FALSE, e_iconv);
2389 struct input_code *p = input_code_list;
2397 * Check and Ignore BOM
/*
 * Inspect the first bytes of f for a byte-order mark and select the matching
 * converter and input_endian:
 *   00 00 FE FF -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, ENDIAN_2143
 *   EF BB BF    -> w_iconv (UTF-8)
 *   FE FF 00 00 -> w_iconv32, ENDIAN_3412
 *   FE FF       -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, ENDIAN_LITTLE
 * Bytes that turn out not to belong to a BOM are returned to the stream with
 * i_ungetc, most recently read byte first.
 */
2399 void check_bom(FILE *f)
\r
2402 switch(c2 = (*i_getc)(f)){
2404 if((c2 = (*i_getc)(f)) == 0x00){
2405 if((c2 = (*i_getc)(f)) == 0xFE){
2406 if((c2 = (*i_getc)(f)) == 0xFF){
2408 set_iconv(TRUE, w_iconv32);
2410 input_endian = ENDIAN_BIG;
2412 }else (*i_ungetc)(c2,f);
2413 (*i_ungetc)(0xFE,f);
2414 }else if(c2 == 0xFF){
2415 if((c2 = (*i_getc)(f)) == 0xFE){
2417 set_iconv(TRUE, w_iconv32);
2419 input_endian = ENDIAN_2143;
2421 }else (*i_ungetc)(c2,f);
2422 (*i_ungetc)(0xFF,f);
2423 }else (*i_ungetc)(c2,f);
2424 (*i_ungetc)(0x00,f);
2425 }else (*i_ungetc)(c2,f);
2426 (*i_ungetc)(0x00,f);
2429 if((c2 = (*i_getc)(f)) == 0xBB){
2430 if((c2 = (*i_getc)(f)) == 0xBF){
2432 set_iconv(TRUE, w_iconv);
2435 }else (*i_ungetc)(c2,f);
2436 (*i_ungetc)(0xBB,f);
2437 }else (*i_ungetc)(c2,f);
2438 (*i_ungetc)(0xEF,f);
2441 if((c2 = (*i_getc)(f)) == 0xFF){
2442 if((c2 = (*i_getc)(f)) == 0x00){
2443 if((c2 = (*i_getc)(f)) == 0x00){
2445 set_iconv(TRUE, w_iconv32);
2447 input_endian = ENDIAN_3412;
2449 }else (*i_ungetc)(c2,f);
2450 (*i_ungetc)(0x00,f);
2451 }else (*i_ungetc)(c2,f);
2453 set_iconv(TRUE, w_iconv16);
2455 input_endian = ENDIAN_BIG;
2457 }else (*i_ungetc)(c2,f);
2458 (*i_ungetc)(0xFE,f);
2461 if((c2 = (*i_getc)(f)) == 0xFE){
2462 if((c2 = (*i_getc)(f)) == 0x00){
2463 if((c2 = (*i_getc)(f)) == 0x00){
2465 set_iconv(TRUE, w_iconv32);
2467 input_endian = ENDIAN_LITTLE;
2469 }else (*i_ungetc)(c2,f);
2470 (*i_ungetc)(0x00,f);
2471 }else (*i_ungetc)(c2,f);
2473 set_iconv(TRUE, w_iconv16);
2475 input_endian = ENDIAN_LITTLE;
2477 }else (*i_ungetc)(c2,f);
2478 (*i_ungetc)(0xFF,f);
2487 Conversion main loop. Code detection only.
/*
 * Main conversion loop for stream f.
 * After module_connection() has built the filter chains, bytes are read
 * through i_getc until EOF.  The loop tracks the input mode (ESC sequences,
 * SI / SO, MIME introducers, UTF-16 / UTF-32 code units), hands ambiguous
 * 8-bit input to h_conv(), and passes characters to iconv / oconv.
 * The NEXT, SEND and LAST macros defined below name the three ways an
 * iteration can finish.
 * After the loop iconv is called with EOF and, when no input code name was
 * set (is_inputcode_set is false), the candidate with the lowest score
 * supplies the name via set_input_codename().
 */
2490 nkf_char kanji_convert(FILE *f)
2492 nkf_char c3, c2=0, c1, c0=0;
2493 int is_8bit = FALSE;
2495 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2496 #ifdef UTF8_INPUT_ENABLE
2497 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2504 output_mode = ASCII;
2507 #define NEXT continue /* no output, get next */
2508 #define SEND ; /* output c1 and c2, get next */
2509 #define LAST break /* end of loop, go closing */
2511 module_connection();
2514 while ((c1 = (*i_getc)(f)) != EOF) {
2515 #ifdef INPUT_CODE_FIX
2522 /* in case of 8th bit is on */
2523 if (!estab_f&&!mime_decode_mode) {
2524 /* in case of not established yet */
2525 /* It is still ambiguous */
2526 if (h_conv(f, c2, c1)==EOF)
2532 /* in case of already established */
2534 /* ignore bogus code */
2540 /* second byte, 7 bit code */
2541 /* it might be kanji shifted */
2542 if ((c1 == DEL) || (c1 <= SPACE)) {
2543 /* ignore bogus first code */
2550 #ifdef UTF8_INPUT_ENABLE
2551 if (iconv == w_iconv16) {
2552 if (input_endian == ENDIAN_BIG) {
2554 if ((c1 = (*i_getc)(f)) != EOF) {
2555 if (0xD8 <= c2 && c2 <= 0xDB) {
2556 if ((c0 = (*i_getc)(f)) != EOF) {
2558 if ((c3 = (*i_getc)(f)) != EOF) {
2565 if ((c2 = (*i_getc)(f)) != EOF) {
2566 if (0xD8 <= c2 && c2 <= 0xDB) {
2567 if ((c3 = (*i_getc)(f)) != EOF) {
2569 if ((c0 = (*i_getc)(f)) != EOF) {
2577 } else if(iconv == w_iconv32){
2579 if((c2 = (*i_getc)(f)) != EOF &&
2580 (c1 = (*i_getc)(f)) != EOF &&
2581 (c0 = (*i_getc)(f)) != EOF){
2582 switch(input_endian){
2584 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2587 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2590 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2593 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2603 #ifdef NUMCHAR_OPTION
2604 if (is_unicode_capsule(c1)){
2610 if (!estab_f && !iso8859_f) {
2611 /* not established yet */
2614 } else { /* estab_f==TRUE */
2619 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2620 /* SJIS X0201 Case... */
2621 if(iso2022jp_f && x0201_f==NO_X0201) {
2622 (*oconv)(GETA1, GETA2);
2629 } else if (c1==SSO && iconv != s_iconv) {
2630 /* EUC X0201 Case */
2631 c1 = (*i_getc)(f); /* skip SSO */
2633 if (SSP<=c1 && c1<0xe0) {
2634 if(iso2022jp_f && x0201_f==NO_X0201) {
2635 (*oconv)(GETA1, GETA2);
2642 } else { /* bogus code, skip SSO and one byte */
2646 /* already established */
2651 } else if ((c1 > SPACE) && (c1 != DEL)) {
2652 /* in case of Roman characters */
2654 /* output 1 shifted byte */
2658 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2659 /* output 1 shifted byte */
2660 if(iso2022jp_f && x0201_f==NO_X0201) {
2661 (*oconv)(GETA1, GETA2);
2668 /* look like bogus code */
2671 } else if (input_mode == X0208 || input_mode == X0212 ||
2672 input_mode == X0213_1 || input_mode == X0213_2) {
2673 /* in case of Kanji shifted */
2676 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2677 /* Check MIME code */
2678 if ((c1 = (*i_getc)(f)) == EOF) {
2681 } else if (c1 == '?') {
2682 /* =? is mime conversion start sequence */
2683 if(mime_f == STRICT_MIME) {
2684 /* check in real detail */
2685 if (mime_begin_strict(f) == EOF)
2689 } else if (mime_begin(f) == EOF)
2699 /* normal ASCII code */
2702 } else if (!is_8bit && c1 == SI) {
2705 } else if (!is_8bit && c1 == SO) {
2708 } else if (!is_8bit && c1 == ESC ) {
2709 if ((c1 = (*i_getc)(f)) == EOF) {
2710 /* (*oconv)(0, ESC); don't send bogus code */
2712 } else if (c1 == '$') {
2713 if ((c1 = (*i_getc)(f)) == EOF) {
2715 (*oconv)(0, ESC); don't send bogus code
2716 (*oconv)(0, '$'); */
2718 } else if (c1 == '@'|| c1 == 'B') {
2719 /* This is kanji introduction */
2722 set_input_codename("ISO-2022-JP");
2724 debug(input_codename);
2727 } else if (c1 == '(') {
2728 if ((c1 = (*i_getc)(f)) == EOF) {
2729 /* don't send bogus code
2735 } else if (c1 == '@'|| c1 == 'B') {
2736 /* This is kanji introduction */
2741 } else if (c1 == 'D'){
2745 #endif /* X0212_ENABLE */
2746 } else if (c1 == (X0213_1&0x7F)){
2747 input_mode = X0213_1;
2750 } else if (c1 == (X0213_2&0x7F)){
2751 input_mode = X0213_2;
2755 /* could be some special code */
2762 } else if (broken_f&0x2) {
2763 /* accept any ESC-(-x as broken code ... */
2773 } else if (c1 == '(') {
2774 if ((c1 = (*i_getc)(f)) == EOF) {
2775 /* don't send bogus code
2777 (*oconv)(0, '('); */
2781 /* This is X0201 kana introduction */
2782 input_mode = X0201; shift_mode = X0201;
2784 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2785 /* This is X0208 kanji introduction */
2786 input_mode = ASCII; shift_mode = FALSE;
2788 } else if (broken_f&0x2) {
2789 input_mode = ASCII; shift_mode = FALSE;
2794 /* maintain various input_mode here */
2798 } else if ( c1 == 'N' || c1 == 'n' ){
2800 c3 = (*i_getc)(f); /* skip SS2 */
2801 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2816 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2817 input_mode = ASCII; set_iconv(FALSE, 0);
2819 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2820 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2828 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2829 if ((c1=(*i_getc)(f))!=EOF) {
2833 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2851 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2854 if ((c0 = (*i_getc)(f)) != EOF) {
2857 if ((c3 = (*i_getc)(f)) != EOF) {
2859 (*iconv)(c2, c1, c0|c3);
2864 /* 3 bytes EUC or UTF-8 */
2865 if ((c0 = (*i_getc)(f)) != EOF) {
2867 (*iconv)(c2, c1, c0);
2874 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2878 (*oconv)(PREFIX_EUCG3 | c2, c1);
2880 #endif /* X0212_ENABLE */
2882 (*oconv)(PREFIX_EUCG3 | c2, c1);
2885 (*oconv)(input_mode, c1); /* other special case */
2891 /* goto next_word */
2895 (*iconv)(EOF, 0, 0);
2896 if (!is_inputcode_set)
2899 struct input_code *p = input_code_list;
2900 struct input_code *result = p;
2902 if (p->score < result->score) result = p;
2905 set_input_codename(result->name);
/*
 * Resolve input whose encoding is still undecided.
 *   f      - input stream.
 *   c2, c1 - the bytes already read by the caller.
 * Further bytes are read through i_getc and stored with push_hold_buf()
 * until EOF, until the encoding is established, or until the hold buffer is
 * full.  The candidate with the lowest score is then installed with
 * set_iconv(FALSE, ...), and the held bytes are replayed through iconv.
 * While replaying, additional bytes needed to finish a multi-byte character
 * are taken from the hold buffer first and from the stream afterwards.
 */
2912 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2914 nkf_char ret, c3, c0;
2918 /** it must NOT be in the kanji shifted sequence */
2919 /** it must NOT be written in JIS7 */
2920 /** and it must be after 2 byte 8bit code */
2926 while ((c1 = (*i_getc)(f)) != EOF) {
2932 if (push_hold_buf(c1) == EOF || estab_f){
2938 struct input_code *p = input_code_list;
2939 struct input_code *result = p;
2944 if (p->score < result->score){
2949 set_iconv(FALSE, result->iconv_func);
2954 ** 1) EOF is detected, or
2955 ** 2) Code is established, or
2956 ** 3) Buffer is FULL (but last word is pushed)
2958 ** in 1) and 3) cases, we continue to use
2959 ** Kanji codes by oconv and leave estab_f unchanged.
2964 while (hold_index < hold_count){
2965 c2 = hold_buf[hold_index++];
2967 #ifdef NUMCHAR_OPTION
2968 || is_unicode_capsule(c2)
2973 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
2974 (*iconv)(X0201, c2, 0);
2977 if (hold_index < hold_count){
2978 c1 = hold_buf[hold_index++];
2988 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
2991 if (hold_index < hold_count){
2992 c0 = hold_buf[hold_index++];
2993 } else if ((c0 = (*i_getc)(f)) == EOF) {
2999 if (hold_index < hold_count){
3000 c3 = hold_buf[hold_index++];
3001 } else if ((c3 = (*i_getc)(f)) == EOF) {
3006 (*iconv)(c2, c1, c0|c3);
3011 /* 3 bytes EUC or UTF-8 */
3012 if (hold_index < hold_count){
3013 c0 = hold_buf[hold_index++];
3014 } else if ((c0 = (*i_getc)(f)) == EOF) {
3020 (*iconv)(c2, c1, c0);
3023 if (c0 == EOF) break;
/*
 * Append c2 (stored as unsigned char) to hold_buf.
 * Returns EOF once hold_count has reached HOLD_SIZE*2, otherwise the new
 * hold_count.  A full buffer is also tested before the store.
 */
3028 nkf_char push_hold_buf(nkf_char c2)
3030 if (hold_count >= HOLD_SIZE*2)
3032 hold_buf[hold_count++] = (unsigned char)c2;
3033 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair (c2 lead, c1 trail) to the internal EUC-style
 * pair; the pointers p2 / p1 are the output slots.
 *   - With SHIFTJIS_CP932 and cp51932_f set, IBM-extension leads are first
 *     remapped through the shiftjis_cp932 table.
 *   - When x0213_f is off, IBM-extension leads are mapped through
 *     shiftjis_x0212 and tagged with PREFIX_EUCG3.
 *   - When x0213_f is on and c2 >= 0xF0, the row is taken from
 *     shift_jisx0213_s1a3_table (or computed for the higher rows) and tagged
 *     with PREFIX_EUCG3.
 *   - Otherwise the standard arithmetic applies: c2 is doubled and offset by
 *     SJ0162 or SJ6394, and incremented when c1 is above 0x9E.
 * NOTE(review): callers treat a zero return as success (see the == 0 test in
 * s_status) -- confirm the full return contract.
 */
3036 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3038 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3041 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3042 #ifdef SHIFTJIS_CP932
3043 if (cp51932_f && is_ibmext_in_sjis(c2)){
3045 extern const unsigned short shiftjis_cp932[3][189];
3047 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3053 #endif /* SHIFTJIS_CP932 */
3055 if (!x0213_f && is_ibmext_in_sjis(c2)){
3057 extern const unsigned short shiftjis_x0212[3][189];
3059 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3062 c2 = PREFIX_EUCG3 | (val >> 8);
3075 if(x0213_f && c2 >= 0xF0){
3076 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3077 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3078 }else{ /* 78<=k<=94 */
3079 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3080 if (0x9E < c1) c2++;
3083 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3084 if (0x9E < c1) c2++;
3087 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3094 c2 = x0212_unshift(c2);
/*
 * Input converter for Shift_JIS.  Control values (EOF, 0, or c2 below SPACE)
 * take their own branch; a real byte pair goes through s2e_conv(), and a
 * non-zero result from it is returned to the caller unchanged.
 */
3101 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3105 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3108 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3109 if (ret) return ret;
/*
 * Input converter for EUC.
 *   - A 0x8f lead is handled in its own branch, which folds c1 into c2 as
 *     (c2 << 8) | (c1 & 0x7f); with SHIFTJIS_CP932 the pair is round-tripped
 *     through e2s_conv() and s2e_conv() when e2s_conv() returns 0.
 *   - SSO and control values (EOF, 0, or c2 below SPACE) have dedicated
 *     branches.
 */
3115 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3120 }else if (c2 == 0x8f){
3124 c2 = (c2 << 8) | (c1 & 0x7f);
3126 #ifdef SHIFTJIS_CP932
3129 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3130 s2e_conv(s2, s1, &c2, &c1);
3137 #endif /* SHIFTJIS_CP932 */
3138 #endif /* X0212_ENABLE */
3139 } else if (c2 == SSO){
3142 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
/*
 * Convert a UTF-8 sequence (c2 first byte, c1 second, c0 the rest) to the
 * internal pair written through p2 / p1.  Leads 0xc0..0xef go through
 * unicode_to_jis_common(); in NUMCHAR_OPTION builds *p1 can instead receive
 * the code point from ww16_conv() tagged with CLASS_UNICODE.
 */
3152 #ifdef UTF8_INPUT_ENABLE
3153 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3160 }else if (0xc0 <= c2 && c2 <= 0xef) {
3161 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3162 #ifdef NUMCHAR_OPTION
3165 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * Input converter for UTF-8.
 *   c2 - first byte, c1 - second byte, c0 - remaining byte(s); for 4-byte
 *        sequences c0 carries two bytes, which is why it is masked with
 *        0xc0c0 and compared with 0x8080.
 * w_iconv_utf8_1st_byte classifies lead bytes and is indexed by c2 - 0xC0.
 * Depending on the class, c1 is range-checked and c0 must consist of
 * continuation bytes.
 * A zero c0 makes the function return -1 or -2.
 * NOTE(review): the callers appear to read one (-1) or two (-2) further
 * bytes and call again -- confirm against the switch in kanji_convert.
 * Valid 4-byte sequences become a CLASS_UNICODE value via ww16_conv();
 * shorter ones go through w2e_conv().
 */
3173 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3176 static const int w_iconv_utf8_1st_byte[] =
3178 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3179 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3180 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3181 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3183 if (c2 < 0 || 0xff < c2) {
3184 }else if (c2 == 0) { /* 0 : 1 byte*/
3186 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3189 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3191 if (c1 < 0x80 || 0xBF < c1) return 0;
3194 if (c0 == 0) return -1;
3195 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3200 if (c0 == 0) return -1;
3201 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3205 if (c0 == 0) return -1;
3206 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3210 if (c0 == 0) return -2;
3211 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3215 if (c0 == 0) return -2;
3216 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3220 if (c0 == 0) return -2;
3221 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3229 if (c2 == 0 || c2 == EOF){
3230 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3231 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3234 ret = w2e_conv(c2, c1, c0, &c2, &c1);
/*
 * Encode the Unicode code point val as UTF-8.
 *   *p2 - first byte
 *   *p1 - second byte
 *   *p0 - remaining byte(s); for a 4-byte sequence the third and fourth
 *         bytes are packed as (third << 8) | fourth.
 * Ranges handled here: below 0x800 (2 bytes), up to 0xFFFF (3 bytes) and up
 * to 0x10FFFF (4 bytes).
 * The 4-byte lead byte is 0xf0 | (val >> 18), i.e. the 11110xxx form, which
 * is what ww16_conv() expects when it decodes the lead with a shift of 18.
 */
3243 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3244 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3251 }else if (val < 0x800){
3252 *p2 = 0xc0 | (val >> 6);
3253 *p1 = 0x80 | (val & 0x3f);
3255 } else if (val <= NKF_INT32_C(0xFFFF)) {
3256 *p2 = 0xe0 | (val >> 12);
3257 *p1 = 0x80 | ((val >> 6) & 0x3f);
3258 *p0 = 0x80 | (val & 0x3f);
3259 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3260 *p2 = 0xf0 | (val >> 18);
3261 *p1 = 0x80 | ((val >> 12) & 0x3f);
3262 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3271 #ifdef UTF8_INPUT_ENABLE
3272 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3277 } else if (c2 >= 0xf0){
3278 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3279 val = (c2 & 0x0f) << 18;
3280 val |= (c1 & 0x3f) << 12;
3281 val |= (c0 & 0x3f00) >> 2;
3283 }else if (c2 >= 0xe0){
3284 val = (c2 & 0x0f) << 12;
3285 val |= (c1 & 0x3f) << 6;
3287 }else if (c2 >= 0xc0){
3288 val = (c2 & 0x1f) << 6;
3296 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3298 nkf_char c2, c1, c0;
3305 w16w_conv(val, &c2, &c1, &c0);
3306 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3307 #ifdef NUMCHAR_OPTION
3310 *p1 = CLASS_UNICODE | val;
3319 #ifdef UTF8_INPUT_ENABLE
3320 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3323 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3326 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3327 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3329 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3331 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3336 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3337 if (ret) return ret;
3342 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3346 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3347 } else if (is_unicode_bmp(c1)) {
3348 ret = w16e_conv(c1, &c2, &c1);
3351 c1 = CLASS_UNICODE | c1;
3353 if (ret) return ret;
3358 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3361 extern const unsigned short *const utf8_to_euc_2bytes[];
3362 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3363 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3364 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3365 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3366 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3368 const unsigned short *const *pp;
3369 const unsigned short *const *const *ppp;
3370 static const int no_best_fit_chars_table_C2[] =
3371 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3372 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3373 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3374 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3375 static const int no_best_fit_chars_table_C2_ms[] =
3376 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3377 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3378 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3379 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3380 static const int no_best_fit_chars_table_932_C2[] =
3381 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3382 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3383 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3384 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3385 static const int no_best_fit_chars_table_932_C3[] =
3386 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3387 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3388 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3389 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3395 }else if(c2 < 0xe0){
3396 if(no_best_fit_chars_f){
3397 if(ms_ucs_map_f == UCS_MAP_CP932){
3400 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3403 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3406 }else if(cp51932_f){
3409 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3412 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3415 }else if(ms_ucs_map_f == UCS_MAP_MS){
3416 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3420 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3421 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3423 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3424 }else if(c0 < 0xF0){
3425 if(no_best_fit_chars_f){
3426 if(ms_ucs_map_f == UCS_MAP_CP932){
3427 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3428 }else if(ms_ucs_map_f == UCS_MAP_MS){
3433 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3436 if(c0 == 0x92) return 1;
3441 if(c1 == 0x80 || c0 == 0x9C) return 1;
3449 if(c0 == 0x95) return 1;
3452 if(c0 == 0xA5) return 1;
3459 if(c0 == 0x8D) return 1;
3462 if(c0 == 0x9E && cp51932_f) return 1;
3465 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3473 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3474 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3476 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3478 #ifdef SHIFTJIS_CP932
3479 if (!ret && cp51932_f && is_eucg3(*p2)) {
3481 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3482 s2e_conv(s2, s1, p2, p1);
3491 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3494 const unsigned short *p;
3497 if (pp == 0) return 1;
3500 if (c1 < 0 || psize <= c1) return 1;
3502 if (p == 0) return 1;
3505 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3507 if (val == 0) return 1;
3508 if (no_cp932ext_f && (
3509 (val>>8) == 0x2D || /* NEC special characters */
3510 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3518 if (c2 == SO) c2 = X0201;
3525 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3527 const char *hex = "0123456789ABCDEF";
3533 (*f)(0, hex[(c>>shift)&0xF]);
3543 void encode_fallback_html(nkf_char c)
3548 if(c >= NKF_INT32_C(1000000))
3549 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3550 if(c >= NKF_INT32_C(100000))
3551 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3553 (*oconv)(0, 0x30+(c/10000 )%10);
3555 (*oconv)(0, 0x30+(c/1000 )%10);
3557 (*oconv)(0, 0x30+(c/100 )%10);
3559 (*oconv)(0, 0x30+(c/10 )%10);
3561 (*oconv)(0, 0x30+ c %10);
3566 void encode_fallback_xml(nkf_char c)
3571 nkf_each_char_to_hex(oconv, c);
3576 void encode_fallback_java(nkf_char c)
3578 const char *hex = "0123456789ABCDEF";
3581 if(!is_unicode_bmp(c)){
3585 (*oconv)(0, hex[(c>>20)&0xF]);
3586 (*oconv)(0, hex[(c>>16)&0xF]);
3590 (*oconv)(0, hex[(c>>12)&0xF]);
3591 (*oconv)(0, hex[(c>> 8)&0xF]);
3592 (*oconv)(0, hex[(c>> 4)&0xF]);
3593 (*oconv)(0, hex[ c &0xF]);
3597 void encode_fallback_perl(nkf_char c)
3602 nkf_each_char_to_hex(oconv, c);
3607 void encode_fallback_subchar(nkf_char c)
3609 c = unicode_subchar;
3610 (*oconv)((c>>8)&0xFF, c&0xFF);
3615 #ifdef UTF8_OUTPUT_ENABLE
3616 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3619 extern const unsigned short euc_to_utf8_1byte[];
3620 extern const unsigned short *const euc_to_utf8_2bytes[];
3621 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3622 extern const unsigned short *const x0212_to_utf8_2bytes[];
3624 const unsigned short *p;
3627 p = euc_to_utf8_1byte;
3629 } else if (is_eucg3(c2)){
3630 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3633 c2 = (c2&0x7f) - 0x21;
3634 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3635 p = x0212_to_utf8_2bytes[c2];
3641 c2 = (c2&0x7f) - 0x21;
3642 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3643 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3648 c1 = (c1 & 0x7f) - 0x21;
3649 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3654 void w_oconv(nkf_char c2, nkf_char c1)
3660 output_bom_f = FALSE;
3671 #ifdef NUMCHAR_OPTION
3672 if (c2 == 0 && is_unicode_capsule(c1)){
3673 val = c1 & VALUE_MASK;
3676 }else if (val < 0x800){
3677 (*o_putc)(0xC0 | (val >> 6));
3678 (*o_putc)(0x80 | (val & 0x3f));
3679 } else if (val <= NKF_INT32_C(0xFFFF)) {
3680 (*o_putc)(0xE0 | (val >> 12));
3681 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3682 (*o_putc)(0x80 | (val & 0x3f));
3683 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3684 (*o_putc)(0xF0 | ( val>>18));
3685 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3686 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3687 (*o_putc)(0x80 | ( val & 0x3f));
3694 output_mode = ASCII;
3696 } else if (c2 == ISO8859_1) {
3697 output_mode = ISO8859_1;
3698 (*o_putc)(c1 | 0x080);
3701 val = e2w_conv(c2, c1);
3703 w16w_conv(val, &c2, &c1, &c0);
3707 if (c0) (*o_putc)(c0);
3713 void w_oconv16(nkf_char c2, nkf_char c1)
3716 output_bom_f = FALSE;
3717 if (output_endian == ENDIAN_LITTLE){
3718 (*o_putc)((unsigned char)'\377');
3722 (*o_putc)((unsigned char)'\377');
3731 if (c2 == ISO8859_1) {
3734 #ifdef NUMCHAR_OPTION
3735 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3736 if (is_unicode_bmp(c1)) {
3737 c2 = (c1 >> 8) & 0xff;
3741 if (c1 <= UNICODE_MAX) {
3742 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3743 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3744 if (output_endian == ENDIAN_LITTLE){
3745 (*o_putc)(c2 & 0xff);
3746 (*o_putc)((c2 >> 8) & 0xff);
3747 (*o_putc)(c1 & 0xff);
3748 (*o_putc)((c1 >> 8) & 0xff);
3750 (*o_putc)((c2 >> 8) & 0xff);
3751 (*o_putc)(c2 & 0xff);
3752 (*o_putc)((c1 >> 8) & 0xff);
3753 (*o_putc)(c1 & 0xff);
3760 nkf_char val = e2w_conv(c2, c1);
3761 c2 = (val >> 8) & 0xff;
3764 if (output_endian == ENDIAN_LITTLE){
3773 void w_oconv32(nkf_char c2, nkf_char c1)
3776 output_bom_f = FALSE;
3777 if (output_endian == ENDIAN_LITTLE){
3778 (*o_putc)((unsigned char)'\377');
3786 (*o_putc)((unsigned char)'\377');
3795 if (c2 == ISO8859_1) {
3797 #ifdef NUMCHAR_OPTION
3798 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3802 c1 = e2w_conv(c2, c1);
3804 if (output_endian == ENDIAN_LITTLE){
3805 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3806 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3807 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3811 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3812 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3813 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3818 void e_oconv(nkf_char c2, nkf_char c1)
3820 #ifdef NUMCHAR_OPTION
3821 if (c2 == 0 && is_unicode_capsule(c1)){
3822 w16e_conv(c1, &c2, &c1);
3823 if (c2 == 0 && is_unicode_capsule(c1)){
3824 if(encode_fallback)(*encode_fallback)(c1);
3832 } else if (c2 == 0) {
3833 output_mode = ASCII;
3835 } else if (c2 == X0201) {
3836 output_mode = JAPANESE_EUC;
3837 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3838 } else if (c2 == ISO8859_1) {
3839 output_mode = ISO8859_1;
3840 (*o_putc)(c1 | 0x080);
3842 } else if (is_eucg3(c2)){
3843 output_mode = JAPANESE_EUC;
3844 #ifdef SHIFTJIS_CP932
3847 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3848 s2e_conv(s2, s1, &c2, &c1);
3853 output_mode = ASCII;
3855 }else if (is_eucg3(c2)){
3858 (*o_putc)((c2 & 0x7f) | 0x080);
3859 (*o_putc)(c1 | 0x080);
3862 (*o_putc)((c2 & 0x7f) | 0x080);
3863 (*o_putc)(c1 | 0x080);
3867 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3868 set_iconv(FALSE, 0);
3869 return; /* too late to rescue this char */
3871 output_mode = JAPANESE_EUC;
3872 (*o_putc)(c2 | 0x080);
3873 (*o_putc)(c1 | 0x080);
3878 nkf_char x0212_shift(nkf_char c)
3883 if (0x75 <= c && c <= 0x7f){
3884 ret = c + (0x109 - 0x75);
3887 if (0x75 <= c && c <= 0x7f){
3888 ret = c + (0x113 - 0x75);
3895 nkf_char x0212_unshift(nkf_char c)
3898 if (0x7f <= c && c <= 0x88){
3899 ret = c + (0x75 - 0x7f);
3900 }else if (0x89 <= c && c <= 0x92){
3901 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
3905 #endif /* X0212_ENABLE */
3907 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3913 if((0x21 <= ndx && ndx <= 0x2F)){
3914 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
3915 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3917 }else if(0x6E <= ndx && ndx <= 0x7E){
3918 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
3919 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3925 else if(nkf_isgraph(ndx)){
3927 const unsigned short *ptr;
3929 extern const unsigned short *const x0212_shiftjis[];
3931 ptr = x0212_shiftjis[ndx - 0x21];
3933 val = ptr[(c1 & 0x7f) - 0x21];
3942 c2 = x0212_shift(c2);
3944 #endif /* X0212_ENABLE */
3946 if(0x7F < c2) return 1;
3947 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
3948 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
3952 void s_oconv(nkf_char c2, nkf_char c1)
3954 #ifdef NUMCHAR_OPTION
3955 if (c2 == 0 && is_unicode_capsule(c1)){
3956 w16e_conv(c1, &c2, &c1);
3957 if (c2 == 0 && is_unicode_capsule(c1)){
3958 if(encode_fallback)(*encode_fallback)(c1);
3966 } else if (c2 == 0) {
3967 output_mode = ASCII;
3969 } else if (c2 == X0201) {
3970 output_mode = SHIFT_JIS;
3972 } else if (c2 == ISO8859_1) {
3973 output_mode = ISO8859_1;
3974 (*o_putc)(c1 | 0x080);
3976 } else if (is_eucg3(c2)){
3977 output_mode = SHIFT_JIS;
3978 if (e2s_conv(c2, c1, &c2, &c1) == 0){
3984 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
3985 set_iconv(FALSE, 0);
3986 return; /* too late to rescue this char */
3988 output_mode = SHIFT_JIS;
3989 e2s_conv(c2, c1, &c2, &c1);
3991 #ifdef SHIFTJIS_CP932
3993 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3995 extern const unsigned short cp932inv[2][189];
3997 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4003 #endif /* SHIFTJIS_CP932 */
4006 if (prefix_table[(unsigned char)c1]){
4007 (*o_putc)(prefix_table[(unsigned char)c1]);
4013 void j_oconv(nkf_char c2, nkf_char c1)
4015 #ifdef NUMCHAR_OPTION
4016 if (c2 == 0 && is_unicode_capsule(c1)){
4017 w16e_conv(c1, &c2, &c1);
4018 if (c2 == 0 && is_unicode_capsule(c1)){
4019 if(encode_fallback)(*encode_fallback)(c1);
4025 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4028 (*o_putc)(ascii_intro);
4029 output_mode = ASCII;
4033 } else if (is_eucg3(c2)){
4035 if(output_mode!=X0213_2){
4036 output_mode = X0213_2;
4040 (*o_putc)(X0213_2&0x7F);
4043 if(output_mode!=X0212){
4044 output_mode = X0212;
4048 (*o_putc)(X0212&0x7F);
4051 (*o_putc)(c2 & 0x7f);
4054 } else if (c2==X0201) {
4055 if (output_mode!=X0201) {
4056 output_mode = X0201;
4062 } else if (c2==ISO8859_1) {
4063 /* iso8859 introduction, or 8th bit on */
4064 /* Can we convert in 7bit form using ESC-'-'-A ?
4066 output_mode = ISO8859_1;
4068 } else if (c2 == 0) {
4069 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4072 (*o_putc)(ascii_intro);
4073 output_mode = ASCII;
4077 if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4079 if (output_mode!=X0213_1) {
4080 output_mode = X0213_1;
4084 (*o_putc)(X0213_1&0x7F);
4086 }else if (output_mode != X0208) {
4087 output_mode = X0208;
4090 (*o_putc)(kanji_intro);
4097 void base64_conv(nkf_char c2, nkf_char c1)
4099 mime_prechar(c2, c1);
4100 (*o_base64conv)(c2,c1);
4104 static nkf_char broken_buf[3];
4105 static int broken_counter = 0;
4106 static int broken_last = 0;
4107 nkf_char broken_getc(FILE *f)
4111 if (broken_counter>0) {
4112 return broken_buf[--broken_counter];
4115 if (c=='$' && broken_last != ESC
4116 && (input_mode==ASCII || input_mode==X0201)) {
4119 if (c1=='@'|| c1=='B') {
4120 broken_buf[0]=c1; broken_buf[1]=c;
4127 } else if (c=='(' && broken_last != ESC
4128 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4131 if (c1=='J'|| c1=='B') {
4132 broken_buf[0]=c1; broken_buf[1]=c;
4145 nkf_char broken_ungetc(nkf_char c, FILE *f)
4147 if (broken_counter<2)
4148 broken_buf[broken_counter++]=c;
4152 static nkf_char prev_cr = 0;
4154 void cr_conv(nkf_char c2, nkf_char c1)
4158 if (! (c2==0&&c1==NL) ) {
4164 } else if (c1=='\r') {
4166 } else if (c1=='\n') {
4167 if (crmode_f==CRLF) {
4168 (*o_crconv)(0,'\r');
4169 } else if (crmode_f==CR) {
4170 (*o_crconv)(0,'\r');
4174 } else if (c1!='\032' || crmode_f!=NL){
4180 Return value of fold_conv()
4182 \n add newline and output char
4183 \r add newline and output nothing
4186 1 (or else) normal output
4188 fold state in prev (previous character)
4190 >0x80 Japanese (X0208/X0201)
4195 This fold algorithm does not preserve heading space in a line.
4196 This is the main difference from fmt.
4199 #define char_size(c2,c1) (c2?2:1)
4201 void fold_conv(nkf_char c2, nkf_char c1)
4204 nkf_char fold_state;
4206 if (c1== '\r' && !fold_preserve_f) {
4207 fold_state=0; /* ignore cr */
4208 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4210 fold_state=0; /* ignore cr */
4211 } else if (c1== BS) {
4212 if (f_line>0) f_line--;
4214 } else if (c2==EOF && f_line != 0) { /* close open last line */
4216 } else if ((c1=='\n' && !fold_preserve_f)
4217 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4218 && fold_preserve_f)) {
4220 if (fold_preserve_f) {
4224 } else if ((f_prev == c1 && !fold_preserve_f)
4225 || (f_prev == '\n' && fold_preserve_f)
4226 ) { /* duplicate newline */
4229 fold_state = '\n'; /* output two newline */
4235 if (f_prev&0x80) { /* Japanese? */
4237 fold_state = 0; /* ignore given single newline */
4238 } else if (f_prev==' ') {
4242 if (++f_line<=fold_len)
4246 fold_state = '\r'; /* fold and output nothing */
4250 } else if (c1=='\f') {
4253 fold_state = '\n'; /* output newline and clear */
4254 } else if ( (c2==0 && c1==' ')||
4255 (c2==0 && c1=='\t')||
4256 (c2=='!'&& c1=='!')) {
4257 /* X0208 kankaku or ascii space */
4258 if (f_prev == ' ') {
4259 fold_state = 0; /* remove duplicate spaces */
4262 if (++f_line<=fold_len)
4263 fold_state = ' '; /* output ASCII space only */
4265 f_prev = ' '; f_line = 0;
4266 fold_state = '\r'; /* fold and output nothing */
4270 prev0 = f_prev; /* we still need this one... , but almost done */
4272 if (c2 || c2==X0201)
4273 f_prev |= 0x80; /* this is Japanese */
4274 f_line += char_size(c2,c1);
4275 if (f_line<=fold_len) { /* normal case */
4278 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4279 f_line = char_size(c2,c1);
4280 fold_state = '\n'; /* We can't wait, do fold now */
4281 } else if (c2==X0201) {
4282 /* simple kinsoku rules return 1 means no folding */
4283 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4284 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4285 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4286 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4287 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4288 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4289 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4291 fold_state = '\n';/* add one new f_line before this character */
4294 fold_state = '\n';/* add one new f_line before this character */
4297 /* kinsoku point in ASCII */
4298 if ( c1==')'|| /* { [ ( */
4309 /* just after special */
4310 } else if (!is_alnum(prev0)) {
4311 f_line = char_size(c2,c1);
4313 } else if ((prev0==' ') || /* ignored new f_line */
4314 (prev0=='\n')|| /* ignored new f_line */
4315 (prev0&0x80)) { /* X0208 - ASCII */
4316 f_line = char_size(c2,c1);
4317 fold_state = '\n';/* add one new f_line before this character */
4319 fold_state = 1; /* default no fold in ASCII */
4323 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4324 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4325 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4326 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4327 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4328 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4329 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4330 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4331 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4332 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4333 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4334 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4335 /* default no fold in kinsoku */
4338 f_line = char_size(c2,c1);
4339 /* add one new f_line before this character */
4342 f_line = char_size(c2,c1);
4344 /* add one new f_line before this character */
4349 /* terminator process */
4350 switch(fold_state) {
4369 nkf_char z_prev2=0,z_prev1=0;
4371 void z_conv(nkf_char c2, nkf_char c1)
4374 /* if (c2) c1 &= 0x7f; assertion */
4376 if (x0201_f && z_prev2==X0201) { /* X0201 */
4377 if (c1==(0xde&0x7f)) { /*
\e$BByE@
\e(B */
4379 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4381 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /*
\e$BH>ByE@
\e(B */
4383 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4387 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4396 if (x0201_f && c2==X0201) {
4397 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4398 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4399 z_prev1 = c1; z_prev2 = c2;
4402 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4407 /* JISX0208 Alphabet */
4408 if (alpha_f && c2 == 0x23 ) {
4410 } else if (alpha_f && c2 == 0x21 ) {
4411 /* JISX0208 Kigou */
4416 } else if (alpha_f&0x4) {
4421 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4427 case '>': entity = ">"; break;
4428 case '<': entity = "<"; break;
4429 case '\"': entity = """; break;
4430 case '&': entity = "&"; break;
4433 while (*entity) (*o_zconv)(0, *entity++);
4443 #define rot13(c) ( \
4445 (c <= 'M') ? (c + 13): \
4446 (c <= 'Z') ? (c - 13): \
4448 (c <= 'm') ? (c + 13): \
4449 (c <= 'z') ? (c - 13): \
4453 #define rot47(c) ( \
4455 ( c <= 'O' ) ? (c + 47) : \
4456 ( c <= '~' ) ? (c - 47) : \
4460 void rot_conv(nkf_char c2, nkf_char c1)
4462 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4468 (*o_rot_conv)(c2,c1);
4471 void hira_conv(nkf_char c2, nkf_char c1)
4475 if (0x20 < c1 && c1 < 0x74) {
4477 (*o_hira_conv)(c2,c1);
4479 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4481 c1 = CLASS_UNICODE | 0x3094;
4482 (*o_hira_conv)(c2,c1);
4485 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4487 (*o_hira_conv)(c2,c1);
4492 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4495 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4497 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4501 (*o_hira_conv)(c2,c1);
4505 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4507 static const nkf_char range[RANGE_NUM_MAX][2] = {
4528 nkf_char start, end, c;
4530 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4534 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4539 for (i = 0; i < RANGE_NUM_MAX; i++) {
4540 start = range[i][0];
4543 if (c >= start && c <= end) {
4548 (*o_iso2022jp_check_conv)(c2,c1);
4552 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4554 const unsigned char *mime_pattern[] = {
4555 (const unsigned char *)"\075?EUC-JP?B?",
4556 (const unsigned char *)"\075?SHIFT_JIS?B?",
4557 (const unsigned char *)"\075?ISO-8859-1?Q?",
4558 (const unsigned char *)"\075?ISO-8859-1?B?",
4559 (const unsigned char *)"\075?ISO-2022-JP?B?",
4560 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4561 #if defined(UTF8_INPUT_ENABLE)
4562 (const unsigned char *)"\075?UTF-8?B?",
4563 (const unsigned char *)"\075?UTF-8?Q?",
4565 (const unsigned char *)"\075?US-ASCII?Q?",
4570 /*
\e$B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u
\e(B */
4571 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4572 e_iconv, s_iconv, 0, 0, 0, 0,
4573 #if defined(UTF8_INPUT_ENABLE)
4579 const nkf_char mime_encode[] = {
4580 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4581 #if defined(UTF8_INPUT_ENABLE)
4588 const nkf_char mime_encode_method[] = {
4589 'B', 'B','Q', 'B', 'B', 'Q',
4590 #if defined(UTF8_INPUT_ENABLE)
4598 #define MAXRECOVER 20
4600 void switch_mime_getc(void)
4602 if (i_getc!=mime_getc) {
4603 i_mgetc = i_getc; i_getc = mime_getc;
4604 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4605 if(mime_f==STRICT_MIME) {
4606 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4607 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4612 void unswitch_mime_getc(void)
4614 if(mime_f==STRICT_MIME) {
4615 i_mgetc = i_mgetc_buf;
4616 i_mungetc = i_mungetc_buf;
4619 i_ungetc = i_mungetc;
4620 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4621 mime_iconv_back = NULL;
4624 nkf_char mime_begin_strict(FILE *f)
4628 const unsigned char *p,*q;
4629 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4631 mime_decode_mode = FALSE;
4632 /* =? has been checked */
4634 p = mime_pattern[j];
4637 for(i=2;p[i]>' ';i++) { /* start at =? */
4638 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4639 /* pattern fails, try next one */
4641 while (mime_pattern[++j]) {
4642 p = mime_pattern[j];
4643 for(k=2;k<i;k++) /* assume length(p) > i */
4644 if (p[k]!=q[k]) break;
4645 if (k==i && nkf_toupper(c1)==p[k]) break;
4647 p = mime_pattern[j];
4648 if (p) continue; /* found next one, continue */
4649 /* all fails, output from recovery buffer */
4657 mime_decode_mode = p[i-2];
4659 mime_iconv_back = iconv;
4660 set_iconv(FALSE, mime_priority_func[j]);
4661 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4663 if (mime_decode_mode=='B') {
4664 mimebuf_f = unbuf_f;
4666 /* do MIME integrity check */
4667 return mime_integrity(f,mime_pattern[j]);
4675 nkf_char mime_getc_buf(FILE *f)
4677 /* we don't keep eof of Fifo, because it contains ?= as
4678 a terminator. It was checked in mime_integrity. */
4679 return ((mimebuf_f)?
4680 (*i_mgetc_buf)(f):Fifo(mime_input++));
4683 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4686 (*i_mungetc_buf)(c,f);
4688 Fifo(--mime_input) = (unsigned char)c;
4692 nkf_char mime_begin(FILE *f)
4697 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4698 /* re-read and convert again from mime_buffer. */
4700 /* =? has been checked */
4702 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4703 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4704 /* We accept any character type even if it is broken by new lines */
4705 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4706 if (c1=='\n'||c1==' '||c1=='\r'||
4707 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4709 /* Failed. But this could be another MIME preamble */
4717 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4718 if (!(++i<MAXRECOVER) || c1==EOF) break;
4719 if (c1=='b'||c1=='B') {
4720 mime_decode_mode = 'B';
4721 } else if (c1=='q'||c1=='Q') {
4722 mime_decode_mode = 'Q';
4726 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4727 if (!(++i<MAXRECOVER) || c1==EOF) break;
4729 mime_decode_mode = FALSE;
4735 if (!mime_decode_mode) {
4736 /* false MIME preamble, restart from mime_buffer */
4737 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4738 /* Since we are in MIME mode until buffer becomes empty, */
4739 /* we never go into mime_begin again for a while. */
4742 /* discard mime preamble, and goto MIME mode */
4744 /* do no MIME integrity check */
4745 return c1; /* used only for checking EOF */
4749 void no_putc(nkf_char c)
4754 void debug(const char *str)
4757 fprintf(stderr, "%s\n", str);
4762 void set_input_codename(char *codename)
4766 strcmp(codename, "") != 0 &&
4767 strcmp(codename, input_codename) != 0)
4769 is_inputcode_mixed = TRUE;
4771 input_codename = codename;
4772 is_inputcode_set = TRUE;
4775 #if !defined(PERL_XS) && !defined(WIN32DLL)
4776 void print_guessed_code(char *filename)
4778 char *codename = "BINARY";
4779 if (!is_inputcode_mixed) {
4780 if (strcmp(input_codename, "") == 0) {
4783 codename = input_codename;
4786 if (filename != NULL) printf("%s:", filename);
4787 printf("%s\n", codename);
4793 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4795 nkf_char c1, c2, c3;
4801 if (!nkf_isxdigit(c2)){
4806 if (!nkf_isxdigit(c3)){
4811 return (hex2bin(c2) << 4) | hex2bin(c3);
4814 nkf_char cap_getc(FILE *f)
4816 return hex_getc(':', f, i_cgetc, i_cungetc);
4819 nkf_char cap_ungetc(nkf_char c, FILE *f)
4821 return (*i_cungetc)(c, f);
4824 nkf_char url_getc(FILE *f)
4826 return hex_getc('%', f, i_ugetc, i_uungetc);
4829 nkf_char url_ungetc(nkf_char c, FILE *f)
4831 return (*i_uungetc)(c, f);
4835 #ifdef NUMCHAR_OPTION
4836 nkf_char numchar_getc(FILE *f)
4838 nkf_char (*g)(FILE *) = i_ngetc;
4839 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4850 if (buf[i] == 'x' || buf[i] == 'X'){
4851 for (j = 0; j < 7; j++){
4853 if (!nkf_isxdigit(buf[i])){
4860 c |= hex2bin(buf[i]);
4863 for (j = 0; j < 8; j++){
4867 if (!nkf_isdigit(buf[i])){
4874 c += hex2bin(buf[i]);
4880 return CLASS_UNICODE | c;
4889 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4891 return (*i_nungetc)(c, f);
4895 #ifdef UNICODE_NORMALIZATION
4897 /* Normalization Form C */
4898 nkf_char nfc_getc(FILE *f)
4900 nkf_char (*g)(FILE *f) = i_nfc_getc;
4901 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4902 int i=0, j, k=1, lower, upper;
4904 const nkf_nfchar *array;
4906 extern const struct normalization_pair normalization_table[];
4910 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4911 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4912 while (upper >= lower) {
4913 j = (lower+upper) / 2;
4914 array = normalization_table[j].nfd;
4915 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4916 if (array[k] != buf[k]){
4917 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4924 array = normalization_table[j].nfc;
4925 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4926 buf[i] = (nkf_char)(array[i]);
4937 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4939 return (*i_nfc_ungetc)(c, f);
4941 #endif /* UNICODE_NORMALIZATION */
4947 nkf_char c1, c2, c3, c4, cc;
4948 nkf_char t1, t2, t3, t4, mode, exit_mode;
4949 nkf_char lwsp_count;
4952 nkf_char lwsp_size = 128;
4954 if (mime_top != mime_last) { /* Something is in FIFO */
4955 return Fifo(mime_top++);
4957 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4958 mime_decode_mode=FALSE;
4959 unswitch_mime_getc();
4960 return (*i_getc)(f);
4963 if (mimebuf_f == FIXED_MIME)
4964 exit_mode = mime_decode_mode;
4967 if (mime_decode_mode == 'Q') {
4968 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4970 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
4971 if (c1<=' ' || DEL<=c1) {
4972 mime_decode_mode = exit_mode; /* prepare for quit */
4975 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4979 mime_decode_mode = exit_mode; /* prepare for quit */
4980 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4981 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4982 /* end Q encoding */
4983 input_mode = exit_mode;
4985 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4986 if (lwsp_buf==NULL) {
4987 perror("can't malloc");
4990 while ((c1=(*i_getc)(f))!=EOF) {
4995 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5003 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5004 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5019 lwsp_buf[lwsp_count] = (unsigned char)c1;
5020 if (lwsp_count++>lwsp_size){
5022 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5023 if (lwsp_buf_new==NULL) {
5025 perror("can't realloc");
5028 lwsp_buf = lwsp_buf_new;
5034 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5036 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5037 i_ungetc(lwsp_buf[lwsp_count],f);
5043 if (c1=='='&&c2<' ') { /* this is soft wrap */
5044 while((c1 = (*i_mgetc)(f)) <=' ') {
5045 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5047 mime_decode_mode = 'Q'; /* still in MIME */
5048 goto restart_mime_q;
5051 mime_decode_mode = 'Q'; /* still in MIME */
5055 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5056 if (c2<=' ') return c2;
5057 mime_decode_mode = 'Q'; /* still in MIME */
5058 return ((hex2bin(c2)<<4) + hex2bin(c3));
5061 if (mime_decode_mode != 'B') {
5062 mime_decode_mode = FALSE;
5063 return (*i_mgetc)(f);
5067 /* Base64 encoding */
5069 MIME allows line break in the middle of
5070 Base64, but we are very pessimistic in decoding
5071 in unbuf mode because MIME encoded code may be broken by
5072 less or editor's control sequence (such as ESC-[-K in unbuffered
5073 mode. ignore incomplete MIME.
5075 mode = mime_decode_mode;
5076 mime_decode_mode = exit_mode; /* prepare for quit */
5078 while ((c1 = (*i_mgetc)(f))<=' ') {
5083 if ((c2 = (*i_mgetc)(f))<=' ') {
5086 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5087 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5090 if ((c1 == '?') && (c2 == '=')) {
5093 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5094 if (lwsp_buf==NULL) {
5095 perror("can't malloc");
5098 while ((c1=(*i_getc)(f))!=EOF) {
5103 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5111 if ((c1=(*i_getc)(f))!=EOF) {
5115 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5130 lwsp_buf[lwsp_count] = (unsigned char)c1;
5131 if (lwsp_count++>lwsp_size){
5133 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5134 if (lwsp_buf_new==NULL) {
5136 perror("can't realloc");
5139 lwsp_buf = lwsp_buf_new;
5145 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5147 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5148 i_ungetc(lwsp_buf[lwsp_count],f);
5155 if ((c3 = (*i_mgetc)(f))<=' ') {
5158 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5159 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5163 if ((c4 = (*i_mgetc)(f))<=' ') {
5166 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5167 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5171 mime_decode_mode = mode; /* still in MIME sigh... */
5173 /* BASE 64 decoding */
5175 t1 = 0x3f & base64decode(c1);
5176 t2 = 0x3f & base64decode(c2);
5177 t3 = 0x3f & base64decode(c3);
5178 t4 = 0x3f & base64decode(c4);
5179 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5181 Fifo(mime_last++) = (unsigned char)cc;
5182 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5184 Fifo(mime_last++) = (unsigned char)cc;
5185 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5187 Fifo(mime_last++) = (unsigned char)cc;
5192 return Fifo(mime_top++);
/*
 * mime_ungetc: push character c back for the MIME reader.
 *
 * mime_top is decremented first and c (truncated to unsigned char) is
 * stored in the Fifo slot it then designates.
 * f is not referenced by the statement visible in this excerpt.
 * NOTE(review): the return statement is not part of this excerpt.
 */
5195 nkf_char mime_ungetc(nkf_char c, FILE *f)
5197 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: read ahead into the Fifo buffer to check a candidate
 * MIME encoded-word before it is decoded.
 *
 *   f - input stream, read through the i_getc hook
 *   p - NUL-terminated bytes that are copied into the Fifo first
 *
 * Visible behaviour (several lines are missing from this excerpt):
 *   - mime_input and mime_last are both reset to mime_top, then the
 *     bytes of p are appended at mime_input.
 *   - Input is read until EOF, or until (mime_input - mime_top) masked
 *     with MIME_BUF_MASK becomes 0, which the code treats as buffer full.
 *   - When c is '=' and d is '?', c is stored and the word counts as
 *     checked (d presumably holds the previous character; its assignment
 *     is not shown here).
 *   - Each character is tested against '+', '/', '=', '?' and is_alnum()
 *     before being stored.
 *   - For an incomplete MIME word nothing is decoded: c is stored,
 *     mime_last is moved up to mime_input, mime_decode_mode is set to 1
 *     and switch_mime_getc() is called so input keeps going through the
 *     buffered getc.
 * NOTE(review): the return statements are not part of this excerpt.
 */
5201 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5205 /* In buffered mode, read until =? or NL or buffer full
5207 mime_input = mime_top;
5208 mime_last = mime_top;
5210 while(*p) Fifo(mime_input++) = *p++;
5213 while((c=(*i_getc)(f))!=EOF) {
5214 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5215 break; /* buffer full */
5217 if (c=='=' && d=='?') {
5218 /* checked. skip header, start decode */
5219 Fifo(mime_input++) = (unsigned char)c;
5220 /* mime_last_input = mime_input; */
5225 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5227 /* Should we check length mod 4? */
5228 Fifo(mime_input++) = (unsigned char)c;
5231 /* In case of Incomplete MIME, no MIME decode */
5232 Fifo(mime_input++) = (unsigned char)c;
5233 mime_last = mime_input; /* point undecoded buffer */
5234 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5235 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: translate one Base64 alphabet character c into its
 * 6-bit value (0..63), following the mapping spelled out in the
 * per-line comments below.
 *
 * The character constants used as offsets depend on ASCII code values:
 * 'G' equals 'a' - 26, '4' equals 52, '>' equals 62 and '?' equals 63.
 * NOTE(review): the conditions guarding the first two assignments and
 * the return statement are not part of this excerpt.
 */
5239 nkf_char base64decode(nkf_char c)
5244 i = c - 'A'; /* A..Z 0-25 */
5246 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5248 } else if (c > '/') {
5249 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5250 } else if (c == '+') {
5251 i = '>' /* 62 */ ; /* + 62 */
5253 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet: basis_64[v] is the output character for the 6-bit value v. */
5258 static const char basis_64[] =
5259 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Value whose low bits (mask 0x3 or 0xF) are merged into the next Base64
   output character by mimeout_addchar() and close_mime().
   NOTE(review): presumably the previously encoded byte; the assignment is
   not part of this excerpt. */
5261 static nkf_char b64c;
/* Capacity of mimeout_buf, the buffer of characters waiting to be emitted
   either raw or MIME-encoded; the array has one extra element. */
5262 #define MIMEOUT_BUF_LENGTH (60)
5263 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5264 int mimeout_buf_count = 0;
/* When non-zero, open_mime() skips its handling of whitespace held in
   mimeout_buf; open_mime() resets the flag to FALSE. */
5265 int mimeout_preserve_space = 0;
/* Convert a value 0..15 to an upper-case hexadecimal digit.
   The argument is expanded several times and without parentheses, so pass
   a fully parenthesised, side-effect-free expression. */
5266 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/*
 * open_mime: begin a MIME encoded-word for the given output mode.
 *
 *   mode - value looked up in the mime_encode[] table
 *
 * Visible behaviour (several lines are missing from this excerpt):
 *   - p defaults to mime_pattern[0]; when mode matches mime_encode[i],
 *     p becomes mime_pattern[i] and mimeout_mode is taken from
 *     mime_encode_method[i].
 *   - When base64_count exceeds 45, a blank held in mimeout_buf is
 *     written through o_mputc.
 *   - Unless mimeout_preserve_space is set, whitespace (SPACE, TAB, CR,
 *     NL) held in mimeout_buf is written out unencoded through o_mputc.
 *   - mimeout_preserve_space is cleared.
 *   - The buffered characters are then fed back through mime_putc()
 *     after mimeout_buf_count has been saved in j and reset to 0.
 */
5268 void open_mime(nkf_char mode)
5270 const unsigned char *p;
5273 p = mime_pattern[0];
/* look up the header pattern and encoding method for this mode */
5274 for(i=0;mime_encode[i];i++) {
5275 if (mode == mime_encode[i]) {
5276 p = mime_pattern[i];
5280 mimeout_mode = mime_encode_method[i];
5283 if (base64_count>45) {
5284 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5285 (*o_mputc)(mimeout_buf[i]);
5291 if (!mimeout_preserve_space && mimeout_buf_count>0
5292 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5293 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5297 if (!mimeout_preserve_space) {
5298 for (;i<mimeout_buf_count;i++) {
5299 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5300 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5301 (*o_mputc)(mimeout_buf[i]);
5308 mimeout_preserve_space = FALSE;
/* the count is reset before the buffered characters are passed to mime_putc() */
5314 j = mimeout_buf_count;
5315 mimeout_buf_count = 0;
5317 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the MIME encoded-word currently being written.
 *
 * Dispatches on mimeout_mode. The two visible output statements emit a
 * final Base64 character built only from the low 2 bits (shifted left
 * by 4) or the low 4 bits (shifted left by 2) of b64c, i.e. the bits
 * still pending from the last byte. The remaining handling depends on
 * whether mimeout_f is FIXED_MIME and whether mimeout_mode is 'Q'.
 * NOTE(review): case labels, padding output and the closing sequence
 * are not part of this excerpt.
 */
5321 void close_mime(void)
5331 switch(mimeout_mode) {
5336 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5342 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5348 if (mimeout_f!=FIXED_MIME) {
5350 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one character c according to mimeout_mode and
 * write the result through o_mputc.
 *
 * Visible behaviour (case labels are missing from this excerpt):
 *   - For a non-alphanumeric c, the high and low nibbles of c are
 *     written as two hex digits via itoh4().
 *   - In the Base64 branches, output characters are looked up in
 *     basis_64[]: the top 6 bits of c; the low 2 bits of b64c combined
 *     with the high 4 bits of c; the low 4 bits of b64c combined with
 *     the top 2 bits of c, followed by the low 6 bits of c.
 */
5355 void mimeout_addchar(nkf_char c)
5357 switch(mimeout_mode) {
5362 } else if(!nkf_isalnum(c)) {
5364 (*o_mputc)(itoh4(((c>>4)&0xf)));
5365 (*o_mputc)(itoh4((c&0xf)));
5374 (*o_mputc)(basis_64[c>>2]);
5379 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5385 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5386 (*o_mputc)(basis_64[c & 0x3F]);
/* The (c2, c1) pair passed to the most recent mime_prechar() call. */
5397 nkf_char mime_lastchar2, mime_lastchar1;
/*
 * mime_prechar: called with the next character pair (c2, c1) to insert
 * line folding or a separating space into the MIME output.
 *
 * Visible behaviour (several lines are missing from this excerpt):
 *   - When base64_count plus the encoded size estimate of the buffered
 *     characters (mimeout_buf_count/3*4) exceeds 66, EOF is sent through
 *     o_base64conv, followed by NL and SPACE.
 *   - An "else if (mime_lastchar2)" alternative is commented out.
 *   - When c2 is non-zero, the previous c2 was 0 and the previous c1 was
 *     a non-space character, a SPACE is sent through o_base64conv.
 *   - c2 and c1 are recorded in mime_lastchar2 and mime_lastchar1.
 */
5399 void mime_prechar(nkf_char c2, nkf_char c1)
5403 if (base64_count + mimeout_buf_count/3*4> 66){
5404 (*o_base64conv)(EOF,0);
5405 (*o_base64conv)(0,NL);
5406 (*o_base64conv)(0,SPACE);
5408 }/*else if (mime_lastchar2){
5409 if (c1 <=DEL && !nkf_isspace(c1)){
5410 (*o_base64conv)(0,SPACE);
5414 if (c2 && mime_lastchar2 == 0
5415 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5416 (*o_base64conv)(0,SPACE);
5419 mime_lastchar2 = c2;
5420 mime_lastchar1 = c1;
/*
 * mime_putc: output-side MIME encoder entry point; accepts one output
 * character c, or EOF to flush.
 *
 * Summary of the statements visible in this excerpt (many lines are
 * missing, so this is not a complete description):
 *   - mimeout_f == FIXED_MIME: line length is checked against 71 columns
 *     (base64_count), with a separate test for 'Q' mode and non-newline
 *     characters; EOF and non-EOF input are handled separately.
 *   - otherwise, on EOF: the buffered characters in mimeout_buf are
 *     drained (count saved in j, then reset) and passed to
 *     mimeout_addchar().
 *   - 'Q' mode: characters <= DEL are special-cased when output_mode is
 *     ASCII or ISO8859_1.
 *   - lastchar is the last character held in mimeout_buf, if any.
 *   - no encoded-word open (!mimeout_mode): characters <= DEL in
 *     ASCII/ISO8859_1 output are collected in mimeout_buf; on CR/NL the
 *     buffer is written raw through o_mputc; a line-length test against
 *     76 columns is made; once the buffer exceeds MIMEOUT_BUF_LENGTH,
 *     open_mime(output_mode) is called. For other characters the buffer
 *     is written raw depending on lastchar and open_mime(output_mode) is
 *     called.
 *   - encoded-word open (mimeout_mode 'B', 1, 2): characters <= DEL in
 *     ASCII/ISO8859_1 output are buffered, and the buffer is emitted
 *     either raw through o_mputc or encoded through mimeout_addchar()
 *     depending on lastchar, c and the buffer contents; other characters
 *     cause the buffer to be drained through mimeout_addchar().
 */
5423 void mime_putc(nkf_char c)
5428 if (mimeout_f == FIXED_MIME){
5429 if (mimeout_mode == 'Q'){
5430 if (base64_count > 71){
5431 if (c!=CR && c!=NL) {
5438 if (base64_count > 71){
5443 if (c == EOF) { /* c==EOF */
5447 if (c != EOF) { /* c!=EOF */
5453 /* mimeout_f != FIXED_MIME */
5455 if (c == EOF) { /* c==EOF */
5456 j = mimeout_buf_count;
5457 mimeout_buf_count = 0;
5461 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5464 mimeout_addchar(mimeout_buf[i]);
5468 mimeout_addchar(mimeout_buf[i]);
5472 mimeout_addchar(mimeout_buf[i]);
5478 if (mimeout_mode=='Q') {
5479 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5491 if (mimeout_buf_count > 0){
5492 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* no encoded-word is currently open */
5497 if (!mimeout_mode) {
5498 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5499 if (nkf_isspace(c)) {
5500 if (c==CR || c==NL) {
5503 for (i=0;i<mimeout_buf_count;i++) {
5504 (*o_mputc)(mimeout_buf[i]);
5505 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5511 mimeout_buf[0] = (char)c;
5512 mimeout_buf_count = 1;
5514 if (base64_count > 1
5515 && base64_count + mimeout_buf_count > 76){
5518 if (!nkf_isspace(mimeout_buf[0])){
5523 mimeout_buf[mimeout_buf_count++] = (char)c;
5524 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5525 open_mime(output_mode);
5530 if (lastchar==CR || lastchar == NL){
5531 for (i=0;i<mimeout_buf_count;i++) {
5532 (*o_mputc)(mimeout_buf[i]);
5535 mimeout_buf_count = 0;
5537 if (lastchar==SPACE) {
5538 for (i=0;i<mimeout_buf_count-1;i++) {
5539 (*o_mputc)(mimeout_buf[i]);
5542 mimeout_buf[0] = SPACE;
5543 mimeout_buf_count = 1;
5545 open_mime(output_mode);
5548 /* mimeout_mode == 'B', 1, 2 */
5549 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5550 if (lastchar == CR || lastchar == NL){
5551 if (nkf_isblank(c)) {
5552 for (i=0;i<mimeout_buf_count;i++) {
5553 mimeout_addchar(mimeout_buf[i]);
5555 mimeout_buf_count = 0;
5556 } else if (SPACE<c && c<DEL) {
5558 for (i=0;i<mimeout_buf_count;i++) {
5559 (*o_mputc)(mimeout_buf[i]);
5562 mimeout_buf_count = 0;
5565 if (c==SPACE || c==TAB || c==CR || c==NL) {
5566 for (i=0;i<mimeout_buf_count;i++) {
5567 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5569 for (i=0;i<mimeout_buf_count;i++) {
5570 (*o_mputc)(mimeout_buf[i]);
5573 mimeout_buf_count = 0;
5576 mimeout_buf[mimeout_buf_count++] = (char)c;
5577 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5579 for (i=0;i<mimeout_buf_count;i++) {
5580 (*o_mputc)(mimeout_buf[i]);
5583 mimeout_buf_count = 0;
5587 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5588 mimeout_buf[mimeout_buf_count++] = (char)c;
5589 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5590 j = mimeout_buf_count;
5591 mimeout_buf_count = 0;
5593 mimeout_addchar(mimeout_buf[i]);
5600 if (mimeout_buf_count>0) {
5601 j = mimeout_buf_count;
5602 mimeout_buf_count = 0;
5604 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5606 mimeout_addchar(mimeout_buf[i]);
5612 (*o_mputc)(mimeout_buf[i]);
5614 open_mime(output_mode);
/*
 * Re-initialisation of the converter's global state.
 * NOTE(review): the function signature is not part of this excerpt;
 * presumably this is the body of a reinit-style routine that is compiled
 * when PERL_XS or WIN32DLL is defined - confirm against the full file.
 *
 * The visible assignments restore defaults for: the MIME options
 * (mime_f, mime_decode_f), ISO-2022-JP and UCS mapping flags, the
 * UTF-8 input/output options (guarded by UTF8_INPUT_ENABLE and
 * UTF8_OUTPUT_ENABLE), input-code detection flags, prefix_table[]
 * (all 256 entries zeroed), the MIME output buffer count, folding
 * settings, the escape-sequence intro characters, the output converter
 * chain (each o_*conv hook reset to no_connection), the unget/getc
 * hooks (reset to std_ungetc / std_getc) and the output and MIME
 * decode modes.
 */
5621 #if defined(PERL_XS) || defined(WIN32DLL)
5625 struct input_code *p = input_code_list;
5638 mime_f = STRICT_MIME;
5639 mime_decode_f = FALSE;
5644 #if defined(MSDOS) || defined(__OS2__)
5649 iso2022jp_f = FALSE;
5650 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5651 ms_ucs_map_f = UCS_MAP_ASCII;
5653 #ifdef UTF8_INPUT_ENABLE
5654 no_cp932ext_f = FALSE;
5655 no_best_fit_chars_f = FALSE;
5656 encode_fallback = NULL;
5657 unicode_subchar = '?';
5658 input_endian = ENDIAN_BIG;
5660 #ifdef UTF8_OUTPUT_ENABLE
5661 output_bom_f = FALSE;
5662 output_endian = ENDIAN_BIG;
5664 #ifdef UNICODE_NORMALIZATION
5677 is_inputcode_mixed = FALSE;
5678 is_inputcode_set = FALSE;
5682 #ifdef SHIFTJIS_CP932
5692 for (i = 0; i < 256; i++){
5693 prefix_table[i] = 0;
5697 mimeout_buf_count = 0;
5702 fold_preserve_f = FALSE;
5705 kanji_intro = DEFAULT_J;
5706 ascii_intro = DEFAULT_R;
5707 fold_margin = FOLD_MARGIN;
5708 output_conv = DEFAULT_CONV;
5709 oconv = DEFAULT_CONV;
/* every optional output filter stage is disconnected */
5710 o_zconv = no_connection;
5711 o_fconv = no_connection;
5712 o_crconv = no_connection;
5713 o_rot_conv = no_connection;
5714 o_hira_conv = no_connection;
5715 o_base64conv = no_connection;
5716 o_iso2022jp_check_conv = no_connection;
/* input hooks go back to the plain stdio-level getc/ungetc */
5719 i_ungetc = std_ungetc;
5721 i_bungetc = std_ungetc;
5724 i_mungetc = std_ungetc;
5725 i_mgetc_buf = std_getc;
5726 i_mungetc_buf = std_ungetc;
5727 output_mode = ASCII;
5730 mime_decode_mode = FALSE;
5736 z_prev2=0,z_prev1=0;
5738 iconv_for_check = 0;
5740 input_codename = "";
/*
 * no_connection: placeholder for an unconnected two-argument converter
 * hook. Forwards (c2, c1) to no_connection2() with 0 as the third
 * argument and discards the result.
 */
5747 void no_connection(nkf_char c2, nkf_char c1)
5749 no_connection2(c2,c1,0);
/*
 * no_connection2: placeholder for an unconnected three-argument
 * converter hook. Reports an internal module connection failure on
 * stderr; the trailing "return 0" exists only to satisfy lint.
 * NOTE(review): the statement between the message and the return
 * (original line 5755) is not part of this excerpt.
 */
5752 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5754 fprintf(stderr,"nkf internal module connection failure.\n");
5756 return 0; /* LINT */
/*
 * Usage/help text printer.
 * NOTE(review): the function signature is not part of this excerpt;
 * presumably this is the body of the usage routine.
 *
 * Every line of the help text is written to stderr with fprintf. The
 * #define below redirects fprintf to dllprintf (the guarding condition
 * is not part of this excerpt). Which "j,s,e,w" line is printed depends
 * on the DEFAULT_CODE_* macro that is defined; other lines depend on
 * UTF8_OUTPUT_ENABLE, UTF8_INPUT_ENABLE and NUMCHAR_OPTION.
 */
5761 #define fprintf dllprintf
5765 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5766 fprintf(stderr,"Flags:\n");
5767 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5768 #ifdef DEFAULT_CODE_SJIS
5769 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5771 #ifdef DEFAULT_CODE_JIS
5772 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5774 #ifdef DEFAULT_CODE_EUC
5775 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5777 #ifdef DEFAULT_CODE_UTF8
5778 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5780 #ifdef UTF8_OUTPUT_ENABLE
5781 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5783 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5784 #ifdef UTF8_INPUT_ENABLE
5785 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5787 fprintf(stderr,"t no conversion\n");
5788 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5789 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5790 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5791 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5792 fprintf(stderr,"v Show this usage. V: show version\n");
5793 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5794 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5795 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5796 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5797 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5798 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5799 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5800 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5802 fprintf(stderr,"T Text mode output\n");
5804 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5805 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5806 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5807 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5808 fprintf(stderr,"\n");
/* long (double-dash) options */
5809 fprintf(stderr,"Long name options\n");
5810 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5811 fprintf(stderr," Specify the input or output codeset\n");
5812 fprintf(stderr," --fj --unix --mac --windows\n");
5813 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5814 fprintf(stderr," Convert for the system or code\n");
5815 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5816 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5817 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5819 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5821 #ifdef NUMCHAR_OPTION
5822 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5824 #ifdef UTF8_INPUT_ENABLE
5825 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5826 fprintf(stderr," Specify how nkf handles unassigned characters\n");
5829 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
5830 fprintf(stderr," Overwrite original listed files by filtered result\n");
5831 fprintf(stderr," --overwrite preserves timestamp of original files\n");
5833 fprintf(stderr," -g --guess Guess the input code\n");
5834 fprintf(stderr," --help --version Show this help/the version\n");
5835 fprintf(stderr," For more information, see also man nkf\n");
5836 fprintf(stderr,"\n");
/*
 * Version banner printer.
 * NOTE(review): the function signature is not part of this excerpt;
 * presumably this is the body of the version routine.
 *
 * Writes "Network Kanji Filter Version <NKF_VERSION> (<NKF_RELEASE_DATE>) "
 * to stderr; the format string is continued by platform-specific pieces
 * selected by the MSDOS / __WIN16__ / __WIN32__ / __OS2__ tests (those
 * string fragments are not part of this excerpt). A second fprintf then
 * prints the CopyRight string on its own line.
 */
5842 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
5843 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
5846 #if defined(MSDOS) && defined(__WIN16__)
5849 #if defined(MSDOS) && defined(__WIN32__)
5855 ,NKF_VERSION,NKF_RELEASE_DATE);
5856 fprintf(stderr,"\n%s\n",CopyRight);
5861 ** パッチ制作者 (patch contributors)
5862 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
5863 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
5864 ** ohta@src.ricoh.co.jp (Junn Ohta)
5865 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
5866 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
5867 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
5868 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
5869 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
5870 ** GHG00637@nifty-serve.or.jp (COW)