1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 * nkf is currently maintained on SourceForge:
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.139 2007/10/01 21:52:14 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-10-02"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
42 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
44 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
60 #if defined(MSDOS) || defined(__OS2__)
63 #if defined(_MSC_VER) || defined(__WATCOMC__)
64 #define mktemp _mktemp
70 #define setbinmode(fp) fsetbin(fp)
71 #elif defined(__DJGPP__)
72 #include <libc/dosio.h>
73 #define setbinmode(fp) djgpp_setbinmode(fp)
74 #else /* Microsoft C, Turbo C */
75 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
78 #define setbinmode(fp)
81 #if defined(__DJGPP__)
82 void djgpp_setbinmode(FILE *fp)
84 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
87 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
88 __file_handle_set(fd, m);
92 #ifdef _IOFBF /* SysV and MSDOS, Windows */
93 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
95 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
98 /*Borland C++ 4.5 EasyWin*/
99 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
108 /* added by satoru@isoternet.org */
110 #include <sys/types.h>
112 #include <sys/stat.h>
113 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
115 #if defined(__WATCOMC__)
116 #include <sys/utime.h>
120 #else /* defined(MSDOS) */
122 #ifdef __BORLANDC__ /* BCC32 */
124 #else /* !defined(__BORLANDC__) */
125 #include <sys/utime.h>
126 #endif /* (__BORLANDC__) */
127 #else /* !defined(__WIN32__) */
128 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
129 #include <sys/utime.h>
130 #elif defined(__TURBOC__) /* BCC */
132 #elif defined(LSI_C) /* LSI C */
133 #endif /* (__WIN32__) */
141 /* state of output_mode and input_mode
158 #define X0213_1 0x284F
159 #define X0213_2 0x2850
161 /* Input Assumption */
166 #define LATIN1_INPUT 6
168 #define STRICT_MIME 8
173 #define JAPANESE_EUC 10
177 #define UTF8_INPUT 13
178 #define UTF16_INPUT 1015
179 #define UTF32_INPUT 1017
183 #define ENDIAN_BIG 1234
184 #define ENDIAN_LITTLE 4321
185 #define ENDIAN_2143 2143
186 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification and conversion helpers.
 *
 * These are macros, so an argument may be evaluated more than once: do not
 * pass expressions with side effects (e.g. `*p++`).  Every use of an argument
 * is parenthesized so that operator expressions such as `a | b` or
 * `x ? y : z` expand with the intended grouping.
 *
 * SP, TAB, CR, LF and SS3 are expected to be defined elsewhere in this file.
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; 0 for any non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low 4 bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the second-lowest byte of c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
227 #define CP932_TABLE_BEGIN 0xFA
228 #define CP932_TABLE_END 0xFC
229 #define CP932INV_TABLE_BEGIN 0xED
230 #define CP932INV_TABLE_END 0xEE
231 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END)
233 #define HOLD_SIZE 1024
234 #if defined(INT_IS_SHORT)
235 #define IOBUF_SIZE 2048
237 #define IOBUF_SIZE 16384
240 #define DEFAULT_J 'B'
241 #define DEFAULT_R 'B'
243 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
244 #define SJ6394 0x0161 /* 63 - 94 ku offset */
246 #define RANGE_NUM_MAX 18
251 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
252 #define sizeof_euc_to_utf8_1byte 94
253 #define sizeof_euc_to_utf8_2bytes 94
254 #define sizeof_utf8_to_euc_C2 64
255 #define sizeof_utf8_to_euc_E5B8 64
256 #define sizeof_utf8_to_euc_2bytes 112
257 #define sizeof_utf8_to_euc_3bytes 16
260 /* MIME preprocessor */
262 #ifdef EASYWIN /*Easy Win */
263 extern POINT _BufferSize;
272 void (*status_func)(struct input_code *, nkf_char);
273 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
277 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
280 static const char *CopyRight = COPY_RIGHT;
282 #if !defined(PERL_XS) && !defined(WIN32DLL)
283 static nkf_char noconvert(FILE *f);
285 static void module_connection(void);
286 static nkf_char kanji_convert(FILE *f);
287 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
288 static nkf_char push_hold_buf(nkf_char c2);
289 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
290 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
291 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
292 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
293 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
295 * 0: Shift_JIS, eucJP-ascii
300 #define UCS_MAP_ASCII 0
302 #define UCS_MAP_CP932 2
303 #define UCS_MAP_CP10001 3
304 static int ms_ucs_map_f = UCS_MAP_ASCII;
306 #ifdef UTF8_INPUT_ENABLE
307 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
308 static int no_cp932ext_f = FALSE;
309 /* ignore ZERO WIDTH NO-BREAK SPACE */
310 static int no_best_fit_chars_f = FALSE;
311 static int input_endian = ENDIAN_BIG;
312 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
313 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
314 static void encode_fallback_html(nkf_char c);
315 static void encode_fallback_xml(nkf_char c);
316 static void encode_fallback_java(nkf_char c);
317 static void encode_fallback_perl(nkf_char c);
318 static void encode_fallback_subchar(nkf_char c);
319 static void (*encode_fallback)(nkf_char c) = NULL;
320 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
321 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
322 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
323 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
324 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
325 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
326 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
327 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
328 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
329 static void w_status(struct input_code *, nkf_char);
331 #ifdef UTF8_OUTPUT_ENABLE
332 static int output_bom_f = FALSE;
333 static int output_endian = ENDIAN_BIG;
334 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
335 static void w_oconv(nkf_char c2,nkf_char c1);
336 static void w_oconv16(nkf_char c2,nkf_char c1);
337 static void w_oconv32(nkf_char c2,nkf_char c1);
339 static void e_oconv(nkf_char c2,nkf_char c1);
340 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
341 static void s_oconv(nkf_char c2,nkf_char c1);
342 static void j_oconv(nkf_char c2,nkf_char c1);
343 static void fold_conv(nkf_char c2,nkf_char c1);
344 static void nl_conv(nkf_char c2,nkf_char c1);
345 static void z_conv(nkf_char c2,nkf_char c1);
346 static void rot_conv(nkf_char c2,nkf_char c1);
347 static void hira_conv(nkf_char c2,nkf_char c1);
348 static void base64_conv(nkf_char c2,nkf_char c1);
349 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
350 static void no_connection(nkf_char c2,nkf_char c1);
351 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
353 static void code_score(struct input_code *ptr);
354 static void code_status(nkf_char c);
356 static void std_putc(nkf_char c);
357 static nkf_char std_getc(FILE *f);
358 static nkf_char std_ungetc(nkf_char c,FILE *f);
360 static nkf_char broken_getc(FILE *f);
361 static nkf_char broken_ungetc(nkf_char c,FILE *f);
363 static nkf_char mime_begin(FILE *f);
364 static nkf_char mime_getc(FILE *f);
365 static nkf_char mime_ungetc(nkf_char c,FILE *f);
367 static void switch_mime_getc(void);
368 static void unswitch_mime_getc(void);
369 static nkf_char mime_begin_strict(FILE *f);
370 static nkf_char mime_getc_buf(FILE *f);
371 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
372 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
374 static nkf_char base64decode(nkf_char c);
375 static void mime_prechar(nkf_char c2, nkf_char c1);
376 static void mime_putc(nkf_char c);
377 static void open_mime(nkf_char c);
378 static void close_mime(void);
379 static void eof_mime(void);
380 static void mimeout_addchar(nkf_char c);
382 static void usage(void);
383 static void version(void);
385 static void options(unsigned char *c);
386 #if defined(PERL_XS) || defined(WIN32DLL)
387 static void reinit(void);
392 #if !defined(PERL_XS) && !defined(WIN32DLL)
393 static unsigned char stdibuf[IOBUF_SIZE];
394 static unsigned char stdobuf[IOBUF_SIZE];
396 static unsigned char hold_buf[HOLD_SIZE*2];
397 static int hold_count = 0;
399 /* MIME preprocessor fifo */
401 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
402 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
403 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
404 static unsigned char mime_buf[MIME_BUF_SIZE];
405 static unsigned int mime_top = 0;
406 static unsigned int mime_last = 0; /* decoded */
407 static unsigned int mime_input = 0; /* undecoded */
408 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
411 static int unbuf_f = FALSE;
412 static int estab_f = FALSE;
413 static int nop_f = FALSE;
414 static int binmode_f = TRUE; /* binary mode */
415 static int rot_f = FALSE; /* rot14/43 mode */
416 static int hira_f = FALSE; /* hira/kata henkan */
417 static int input_f = FALSE; /* non fixed input code */
418 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
419 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
420 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
421 static int mimebuf_f = FALSE; /* MIME buffered input */
422 static int broken_f = FALSE; /* convert ESC-less broken JIS */
423 static int iso8859_f = FALSE; /* ISO8859 through */
424 static int mimeout_f = FALSE; /* base64 mode */
425 #if defined(MSDOS) || defined(__OS2__)
426 static int x0201_f = TRUE; /* Assume JISX0201 kana */
428 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
430 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
432 #ifdef UNICODE_NORMALIZATION
433 static int nfc_f = FALSE;
434 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc */
435 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
436 static nkf_char nfc_getc(FILE *f);
437 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
441 static int cap_f = FALSE;
442 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
443 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
444 static nkf_char cap_getc(FILE *f);
445 static nkf_char cap_ungetc(nkf_char c,FILE *f);
447 static int url_f = FALSE;
448 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
449 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
450 static nkf_char url_getc(FILE *f);
451 static nkf_char url_ungetc(nkf_char c,FILE *f);
454 #if defined(INT_IS_SHORT)
455 #define NKF_INT32_C(n) (n##L)
457 #define NKF_INT32_C(n) (n)
459 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
460 #define CLASS_MASK NKF_INT32_C(0xFF000000)
461 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
462 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
463 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
464 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
465 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
467 #ifdef NUMCHAR_OPTION
468 static int numchar_f = FALSE;
469 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ngetc (numchar_getc) */
470 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
471 static nkf_char numchar_getc(FILE *f);
472 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
476 static int noout_f = FALSE;
477 static void no_putc(nkf_char c);
478 static nkf_char debug_f = FALSE;
479 static void debug(const char *str);
480 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
483 static int guess_f = FALSE;
485 static void print_guessed_code(char *filename);
487 static void set_input_codename(char *codename);
490 static int exec_f = 0;
493 #ifdef SHIFTJIS_CP932
494 /* invert IBM extended characters to others */
495 static int cp51932_f = FALSE;
497 /* invert NEC-selected IBM extended characters to IBM extended characters */
498 static int cp932inv_f = TRUE;
500 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
501 #endif /* SHIFTJIS_CP932 */
504 static int x0212_f = FALSE;
505 static nkf_char x0212_shift(nkf_char c);
506 static nkf_char x0212_unshift(nkf_char c);
508 static int x0213_f = FALSE;
510 static unsigned char prefix_table[256];
512 static void set_code_score(struct input_code *ptr, nkf_char score);
513 static void clr_code_score(struct input_code *ptr, nkf_char score);
514 static void status_disable(struct input_code *ptr);
515 static void status_push_ch(struct input_code *ptr, nkf_char c);
516 static void status_clear(struct input_code *ptr);
517 static void status_reset(struct input_code *ptr);
518 static void status_reinit(struct input_code *ptr);
519 static void status_check(struct input_code *ptr, nkf_char c);
520 static void e_status(struct input_code *, nkf_char);
521 static void s_status(struct input_code *, nkf_char);
523 struct input_code input_code_list[] = {
524 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
525 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
526 #ifdef UTF8_INPUT_ENABLE
527 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
528 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
529 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
534 static int mimeout_mode = 0;
535 static int base64_count = 0;
537 /* X0208 -> ASCII converter */
540 static int f_line = 0; /* chars in line */
541 static int f_prev = 0;
542 static int fold_preserve_f = FALSE; /* preserve new lines */
543 static int fold_f = FALSE;
544 static int fold_len = 0;
547 static unsigned char kanji_intro = DEFAULT_J;
548 static unsigned char ascii_intro = DEFAULT_R;
552 #define FOLD_MARGIN 10
553 #define DEFAULT_FOLD 60
555 static int fold_margin = FOLD_MARGIN;
559 #ifdef DEFAULT_CODE_JIS
560 # define DEFAULT_CONV j_oconv
562 #ifdef DEFAULT_CODE_SJIS
563 # define DEFAULT_CONV s_oconv
565 #ifdef DEFAULT_CODE_EUC
566 # define DEFAULT_CONV e_oconv
568 #ifdef DEFAULT_CODE_UTF8
569 # define DEFAULT_CONV w_oconv
572 /* process default */
573 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
575 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
576 /* s_iconv or oconv */
577 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
579 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
580 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
581 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
582 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
584 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
585 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
587 /* static redirections */
589 static void (*o_putc)(nkf_char c) = std_putc;
591 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
592 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
594 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of bgetc (broken_getc) */
595 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
597 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
599 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
600 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
602 /* for strict mime */
603 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
604 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
607 static int output_mode = ASCII, /* output kanji mode */
608 input_mode = ASCII, /* input kanji mode */
609 shift_mode = FALSE; /* TRUE shift out, or X0201 */
610 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
612 /* X0201 / X0208 conversion tables */
614 /* X0201 kana conversion table */
616 static const unsigned char cv[]= {
617 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
618 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
619 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
620 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
621 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
622 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
623 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
624 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
625 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
626 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
627 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
628 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
629 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
630 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
631 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
632 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
636 /* X0201 kana conversion table for dakuten */
638 static const unsigned char dv[]= {
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
644 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
645 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
646 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
647 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
648 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
649 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
650 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
657 /* X0201 kana conversion table for han-dakuten */
659 static const unsigned char ev[]= {
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
671 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
679 /* X0208 kigou conversion table */
680 /* 0x8140 - 0x819e */
681 static const unsigned char fv[] = {
683 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
684 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
685 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
686 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
687 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
688 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
689 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
690 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
691 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
692 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
693 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
694 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
699 static int file_out_f = FALSE;
701 static int overwrite_f = FALSE;
702 static int preserve_time_f = FALSE;
703 static int backup_f = FALSE;
704 static char *backup_suffix = "";
705 static char *get_backup_filename(const char *suffix, const char *filename);
708 static int nlmode_f = 0; /* CR, LF, CRLF */
709 static int input_nextline = 0; /* 0: unestablished, EOF: MIXED */
710 static nkf_char prev_cr = 0; /* CR or 0 */
711 #ifdef EASYWIN /*Easy Win */
712 static int end_check;
715 #define STD_GC_BUFSIZE (256)
716 nkf_char std_gc_buf[STD_GC_BUFSIZE];
720 #include "nkf32dll.c"
721 #elif defined(PERL_XS)
723 int main(int argc, char **argv)
728 char *outfname = NULL;
731 #ifdef EASYWIN /*Easy Win */
732 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
735 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
736 cp = (unsigned char *)*argv;
741 if (pipe(fds) < 0 || (pid = fork()) < 0){
752 execvp(argv[1], &argv[1]);
766 if(x0201_f == WISH_TRUE)
767 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
769 if (binmode_f == TRUE)
770 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
771 if (freopen("","wb",stdout) == NULL)
778 setbuf(stdout, (char *) NULL);
780 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
783 if (binmode_f == TRUE)
784 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
785 if (freopen("","rb",stdin) == NULL) return (-1);
789 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
793 kanji_convert(stdin);
794 if (guess_f) print_guessed_code(NULL);
798 int is_argument_error = FALSE;
800 input_codename = NULL;
804 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
807 is_argument_error = TRUE;
815 /* reopen file for stdout */
816 if (file_out_f == TRUE) {
819 outfname = malloc(strlen(origfname)
820 + strlen(".nkftmpXXXXXX")
826 strcpy(outfname, origfname);
830 for (i = strlen(outfname); i; --i){
831 if (outfname[i - 1] == '/'
832 || outfname[i - 1] == '\\'){
838 strcat(outfname, "ntXXXXXX");
840 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
843 strcat(outfname, ".nkftmpXXXXXX");
844 fd = mkstemp(outfname);
847 || (fd_backup = dup(fileno(stdout))) < 0
848 || dup2(fd, fileno(stdout)) < 0
859 outfname = "nkf.out";
862 if(freopen(outfname, "w", stdout) == NULL) {
866 if (binmode_f == TRUE) {
867 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
868 if (freopen("","wb",stdout) == NULL)
875 if (binmode_f == TRUE)
876 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
877 if (freopen("","rb",fin) == NULL)
882 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
886 char *filename = NULL;
888 if (nfiles > 1) filename = origfname;
889 if (guess_f) print_guessed_code(filename);
895 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
903 if (dup2(fd_backup, fileno(stdout)) < 0){
906 if (stat(origfname, &sb)) {
907 fprintf(stderr, "Can't stat %s\n", origfname);
909 /* restore permissions */
910 if (chmod(outfname, sb.st_mode)) {
911 fprintf(stderr, "Can't set permission %s\n", outfname);
914 /* restore timestamps */
916 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
917 tb[0] = tb[1] = sb.st_mtime;
918 if (utime(outfname, tb)) {
919 fprintf(stderr, "Can't set timestamp %s\n", outfname);
922 tb.actime = sb.st_atime;
923 tb.modtime = sb.st_mtime;
924 if (utime(outfname, &tb)) {
925 fprintf(stderr, "Can't set timestamp %s\n", outfname);
930 char *backup_filename = get_backup_filename(backup_suffix, origfname);
932 unlink(backup_filename);
934 if (rename(origfname, backup_filename)) {
935 perror(backup_filename);
936 fprintf(stderr, "Can't rename %s to %s\n",
937 origfname, backup_filename);
941 if (unlink(origfname)){
946 if (rename(outfname, origfname)) {
948 fprintf(stderr, "Can't rename %s to %s\n",
949 outfname, origfname);
956 if (is_argument_error)
959 #ifdef EASYWIN /*Easy Win */
960 if (file_out_f == FALSE)
961 scanf("%d",&end_check);
964 #else /* for Other OS */
965 if (file_out_f == TRUE)
970 #endif /* WIN32DLL */
/**
 * Build the backup file name for `filename` from the backup `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied verbatim
 * (e.g. suffix "bak/*~" and filename "x" give "bak/x~").  Otherwise the
 * result is `filename` immediately followed by `suffix`.
 *
 * @param suffix    NUL-terminated backup suffix/pattern (must not be NULL).
 * @param filename  NUL-terminated original file name (must not be NULL).
 * @return a newly malloc()ed string owned by the caller, or NULL after
 *         reporting through perror() when memory cannot be allocated.
 *
 * NOTE(review): the value the original returned on allocation failure is not
 * visible in this listing; NULL is used here -- confirm callers check for it.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    const size_t suffix_length = strlen(suffix);
    const size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    /*
     * Each '*' disappears and is replaced by a copy of filename; without any
     * '*' the file name is simply prepended once.  The previous code sized
     * the no-asterisk buffer as `malloc( + 1)` (a single byte) and then
     * overflowed it with strcpy/strcat.
     */
    if (asterisk_count) {
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
    } else {
        backup_filename = malloc(filename_length + suffix_length + 1);
    }
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (!asterisk_count) {
        memcpy(backup_filename, filename, filename_length);
        /* + 1 also copies the terminating NUL of suffix */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
        return backup_filename;
    }

    for (i = 0, j = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') {
            memcpy(backup_filename + j, filename, filename_length);
            j += filename_length;
        } else {
            backup_filename[j++] = suffix[i];
        }
    }
    backup_filename[j] = '\0';

    return backup_filename;
}
1013 static const struct {
1037 {"katakana-hiragana","h3"},
1044 #ifdef UTF8_OUTPUT_ENABLE
1054 {"fb-subchar=", ""},
1056 #ifdef UTF8_INPUT_ENABLE
1057 {"utf8-input", "W"},
1058 {"utf16-input", "W16"},
1059 {"no-cp932ext", ""},
1060 {"no-best-fit-chars",""},
1062 #ifdef UNICODE_NORMALIZATION
1063 {"utf8mac-input", ""},
1075 #ifdef NUMCHAR_OPTION
1076 {"numchar-input", ""},
1082 #ifdef SHIFTJIS_CP932
1092 static int option_mode = 0;
1094 void options(unsigned char *cp)
1098 unsigned char *cp_back = NULL;
1103 while(*cp && *cp++!='-');
1104 while (*cp || cp_back) {
1112 case '-': /* literal options */
1113 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1117 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1118 p = (unsigned char *)long_option[i].name;
1119 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1120 if (*p == cp[j] || cp[j] == SP){
1127 while(*cp && *cp != SP && cp++);
1128 if (long_option[i].alias[0]){
1130 cp = (unsigned char *)long_option[i].alias;
1132 if (strcmp(long_option[i].name, "ic=") == 0){
1133 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1134 codeset[i] = nkf_toupper(p[i]);
1137 if(strcmp(codeset, "ISO-2022-JP") == 0){
1138 input_f = JIS_INPUT;
1139 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1140 strcmp(codeset, "CP50220") == 0 ||
1141 strcmp(codeset, "CP50221") == 0 ||
1142 strcmp(codeset, "CP50222") == 0){
1143 input_f = JIS_INPUT;
1144 #ifdef SHIFTJIS_CP932
1147 #ifdef UTF8_OUTPUT_ENABLE
1148 ms_ucs_map_f = UCS_MAP_CP932;
1150 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1151 input_f = JIS_INPUT;
1155 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1156 input_f = JIS_INPUT;
1161 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1162 input_f = SJIS_INPUT;
1163 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1164 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1165 strcmp(codeset, "CP932") == 0 ||
1166 strcmp(codeset, "MS932") == 0){
1167 input_f = SJIS_INPUT;
1168 #ifdef SHIFTJIS_CP932
1171 #ifdef UTF8_OUTPUT_ENABLE
1172 ms_ucs_map_f = UCS_MAP_CP932;
1174 }else if(strcmp(codeset, "CP10001") == 0){
1175 input_f = SJIS_INPUT;
1176 #ifdef SHIFTJIS_CP932
1179 #ifdef UTF8_OUTPUT_ENABLE
1180 ms_ucs_map_f = UCS_MAP_CP10001;
1182 }else if(strcmp(codeset, "EUCJP") == 0 ||
1183 strcmp(codeset, "EUC-JP") == 0){
1184 input_f = EUC_INPUT;
1185 }else if(strcmp(codeset, "CP51932") == 0){
1186 input_f = EUC_INPUT;
1187 #ifdef SHIFTJIS_CP932
1190 #ifdef UTF8_OUTPUT_ENABLE
1191 ms_ucs_map_f = UCS_MAP_CP932;
1193 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1194 strcmp(codeset, "EUCJP-MS") == 0 ||
1195 strcmp(codeset, "EUCJPMS") == 0){
1196 input_f = EUC_INPUT;
1197 #ifdef SHIFTJIS_CP932
1200 #ifdef UTF8_OUTPUT_ENABLE
1201 ms_ucs_map_f = UCS_MAP_MS;
1203 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1204 strcmp(codeset, "EUCJP-ASCII") == 0){
1205 input_f = EUC_INPUT;
1206 #ifdef SHIFTJIS_CP932
1209 #ifdef UTF8_OUTPUT_ENABLE
1210 ms_ucs_map_f = UCS_MAP_ASCII;
1212 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1213 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1214 input_f = SJIS_INPUT;
1216 #ifdef SHIFTJIS_CP932
1219 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1220 strcmp(codeset, "EUC-JIS-2004") == 0){
1221 input_f = EUC_INPUT;
1223 #ifdef SHIFTJIS_CP932
1226 #ifdef UTF8_INPUT_ENABLE
1227 }else if(strcmp(codeset, "UTF-8") == 0 ||
1228 strcmp(codeset, "UTF-8N") == 0 ||
1229 strcmp(codeset, "UTF-8-BOM") == 0){
1230 input_f = UTF8_INPUT;
1231 #ifdef UNICODE_NORMALIZATION
1232 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1233 strcmp(codeset, "UTF-8-MAC") == 0){
1234 input_f = UTF8_INPUT;
1237 }else if(strcmp(codeset, "UTF-16") == 0 ||
1238 strcmp(codeset, "UTF-16BE") == 0 ||
1239 strcmp(codeset, "UTF-16BE-BOM") == 0){
1240 input_f = UTF16_INPUT;
1241 input_endian = ENDIAN_BIG;
1242 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1243 strcmp(codeset, "UTF-16LE-BOM") == 0){
1244 input_f = UTF16_INPUT;
1245 input_endian = ENDIAN_LITTLE;
1246 }else if(strcmp(codeset, "UTF-32") == 0 ||
1247 strcmp(codeset, "UTF-32BE") == 0 ||
1248 strcmp(codeset, "UTF-32BE-BOM") == 0){
1249 input_f = UTF32_INPUT;
1250 input_endian = ENDIAN_BIG;
1251 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1252 strcmp(codeset, "UTF-32LE-BOM") == 0){
1253 input_f = UTF32_INPUT;
1254 input_endian = ENDIAN_LITTLE;
1259 if (strcmp(long_option[i].name, "oc=") == 0){
1261 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1262 codeset[i] = nkf_toupper(p[i]);
1265 if(strcmp(codeset, "ISO-2022-JP") == 0){
1266 output_conv = j_oconv;
1267 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1268 output_conv = j_oconv;
1269 no_cp932ext_f = TRUE;
1270 #ifdef SHIFTJIS_CP932
1273 #ifdef UTF8_OUTPUT_ENABLE
1274 ms_ucs_map_f = UCS_MAP_CP932;
1276 }else if(strcmp(codeset, "CP50220") == 0){
1277 output_conv = j_oconv;
1279 #ifdef SHIFTJIS_CP932
1282 #ifdef UTF8_OUTPUT_ENABLE
1283 ms_ucs_map_f = UCS_MAP_CP932;
1285 }else if(strcmp(codeset, "CP50221") == 0){
1286 output_conv = j_oconv;
1287 #ifdef SHIFTJIS_CP932
1290 #ifdef UTF8_OUTPUT_ENABLE
1291 ms_ucs_map_f = UCS_MAP_CP932;
1293 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1294 output_conv = j_oconv;
1298 #ifdef SHIFTJIS_CP932
1301 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1302 output_conv = j_oconv;
1307 #ifdef SHIFTJIS_CP932
1310 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1311 output_conv = s_oconv;
1312 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1313 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1314 strcmp(codeset, "CP932") == 0 ||
1315 strcmp(codeset, "MS932") == 0){
1316 output_conv = s_oconv;
1317 #ifdef UTF8_OUTPUT_ENABLE
1318 ms_ucs_map_f = UCS_MAP_CP932;
1320 }else if(strcmp(codeset, "CP10001") == 0){
1321 output_conv = s_oconv;
1322 #ifdef UTF8_OUTPUT_ENABLE
1323 ms_ucs_map_f = UCS_MAP_CP10001;
1325 }else if(strcmp(codeset, "EUCJP") == 0 ||
1326 strcmp(codeset, "EUC-JP") == 0){
1327 output_conv = e_oconv;
1328 }else if(strcmp(codeset, "CP51932") == 0){
1329 output_conv = e_oconv;
1330 #ifdef SHIFTJIS_CP932
1333 #ifdef UTF8_OUTPUT_ENABLE
1334 ms_ucs_map_f = UCS_MAP_CP932;
1336 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1337 strcmp(codeset, "EUCJP-MS") == 0 ||
1338 strcmp(codeset, "EUCJPMS") == 0){
1339 output_conv = e_oconv;
1343 #ifdef UTF8_OUTPUT_ENABLE
1344 ms_ucs_map_f = UCS_MAP_MS;
1346 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1347 strcmp(codeset, "EUCJP-ASCII") == 0){
1348 output_conv = e_oconv;
1352 #ifdef UTF8_OUTPUT_ENABLE
1353 ms_ucs_map_f = UCS_MAP_ASCII;
1355 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1356 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1357 output_conv = s_oconv;
1359 #ifdef SHIFTJIS_CP932
1362 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1363 strcmp(codeset, "EUC-JIS-2004") == 0){
1364 output_conv = e_oconv;
1369 #ifdef SHIFTJIS_CP932
1372 #ifdef UTF8_OUTPUT_ENABLE
1373 }else if(strcmp(codeset, "UTF-8") == 0){
1374 output_conv = w_oconv;
1375 }else if(strcmp(codeset, "UTF-8N") == 0){
1376 output_conv = w_oconv;
1377 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1378 output_conv = w_oconv;
1379 output_bom_f = TRUE;
1380 }else if(strcmp(codeset, "UTF-16BE") == 0){
1381 output_conv = w_oconv16;
1382 }else if(strcmp(codeset, "UTF-16") == 0 ||
1383 strcmp(codeset, "UTF-16BE-BOM") == 0){
1384 output_conv = w_oconv16;
1385 output_bom_f = TRUE;
1386 }else if(strcmp(codeset, "UTF-16LE") == 0){
1387 output_conv = w_oconv16;
1388 output_endian = ENDIAN_LITTLE;
1389 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1390 output_conv = w_oconv16;
1391 output_endian = ENDIAN_LITTLE;
1392 output_bom_f = TRUE;
1393 }else if(strcmp(codeset, "UTF-32") == 0 ||
1394 strcmp(codeset, "UTF-32BE") == 0){
1395 output_conv = w_oconv32;
1396 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1397 output_conv = w_oconv32;
1398 output_bom_f = TRUE;
1399 }else if(strcmp(codeset, "UTF-32LE") == 0){
1400 output_conv = w_oconv32;
1401 output_endian = ENDIAN_LITTLE;
1402 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1403 output_conv = w_oconv32;
1404 output_endian = ENDIAN_LITTLE;
1405 output_bom_f = TRUE;
1411 if (strcmp(long_option[i].name, "overwrite") == 0){
1414 preserve_time_f = TRUE;
1417 if (strcmp(long_option[i].name, "overwrite=") == 0){
1420 preserve_time_f = TRUE;
1422 backup_suffix = malloc(strlen((char *) p) + 1);
1423 strcpy(backup_suffix, (char *) p);
1426 if (strcmp(long_option[i].name, "in-place") == 0){
1429 preserve_time_f = FALSE;
1432 if (strcmp(long_option[i].name, "in-place=") == 0){
1435 preserve_time_f = FALSE;
1437 backup_suffix = malloc(strlen((char *) p) + 1);
1438 strcpy(backup_suffix, (char *) p);
1443 if (strcmp(long_option[i].name, "cap-input") == 0){
1447 if (strcmp(long_option[i].name, "url-input") == 0){
1452 #ifdef NUMCHAR_OPTION
1453 if (strcmp(long_option[i].name, "numchar-input") == 0){
1459 if (strcmp(long_option[i].name, "no-output") == 0){
1463 if (strcmp(long_option[i].name, "debug") == 0){
1468 if (strcmp(long_option[i].name, "cp932") == 0){
1469 #ifdef SHIFTJIS_CP932
1473 #ifdef UTF8_OUTPUT_ENABLE
1474 ms_ucs_map_f = UCS_MAP_CP932;
1478 if (strcmp(long_option[i].name, "no-cp932") == 0){
1479 #ifdef SHIFTJIS_CP932
1483 #ifdef UTF8_OUTPUT_ENABLE
1484 ms_ucs_map_f = UCS_MAP_ASCII;
1488 #ifdef SHIFTJIS_CP932
1489 if (strcmp(long_option[i].name, "cp932inv") == 0){
1496 if (strcmp(long_option[i].name, "x0212") == 0){
1503 if (strcmp(long_option[i].name, "exec-in") == 0){
1507 if (strcmp(long_option[i].name, "exec-out") == 0){
1512 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1513 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1514 no_cp932ext_f = TRUE;
1517 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1518 no_best_fit_chars_f = TRUE;
1521 if (strcmp(long_option[i].name, "fb-skip") == 0){
1522 encode_fallback = NULL;
1525 if (strcmp(long_option[i].name, "fb-html") == 0){
1526 encode_fallback = encode_fallback_html;
1529 if (strcmp(long_option[i].name, "fb-xml") == 0){
1530 encode_fallback = encode_fallback_xml;
1533 if (strcmp(long_option[i].name, "fb-java") == 0){
1534 encode_fallback = encode_fallback_java;
1537 if (strcmp(long_option[i].name, "fb-perl") == 0){
1538 encode_fallback = encode_fallback_perl;
1541 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1542 encode_fallback = encode_fallback_subchar;
1545 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1546 encode_fallback = encode_fallback_subchar;
1547 unicode_subchar = 0;
1549 /* decimal number */
1550 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1551 unicode_subchar *= 10;
1552 unicode_subchar += hex2bin(p[i]);
1554 }else if(p[1] == 'x' || p[1] == 'X'){
1555 /* hexadecimal number */
1556 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1557 unicode_subchar <<= 4;
1558 unicode_subchar |= hex2bin(p[i]);
1562 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1563 unicode_subchar *= 8;
1564 unicode_subchar += hex2bin(p[i]);
1567 w16e_conv(unicode_subchar, &i, &j);
1568 unicode_subchar = i<<8 | j;
1572 #ifdef UTF8_OUTPUT_ENABLE
1573 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1574 ms_ucs_map_f = UCS_MAP_MS;
1578 #ifdef UNICODE_NORMALIZATION
1579 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1580 input_f = UTF8_INPUT;
1585 if (strcmp(long_option[i].name, "prefix=") == 0){
1586 if (nkf_isgraph(p[0])){
1587 for (i = 1; nkf_isgraph(p[i]); i++){
1588 prefix_table[p[i]] = p[0];
1595 case 'b': /* buffered mode */
1598 case 'u': /* non bufferd mode */
1601 case 't': /* transparent mode */
1606 } else if (*cp=='2') {
1610 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1618 case 'j': /* JIS output */
1620 output_conv = j_oconv;
1622 case 'e': /* AT&T EUC output */
1623 output_conv = e_oconv;
1626 case 's': /* SJIS output */
1627 output_conv = s_oconv;
1629 case 'l': /* ISO8859 Latin-1 support, no conversion */
1630 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1631 input_f = LATIN1_INPUT;
1633 case 'i': /* Kanji IN ESC-$-@/B */
1634 if (*cp=='@'||*cp=='B')
1635 kanji_intro = *cp++;
1637 case 'o': /* ASCII IN ESC-(-J/B */
1638 if (*cp=='J'||*cp=='B'||*cp=='H')
1639 ascii_intro = *cp++;
1643 bit:1 katakana->hiragana
1644 bit:2 hiragana->katakana
1646 if ('9'>= *cp && *cp>='0')
1647 hira_f |= (*cp++ -'0');
1654 #if defined(MSDOS) || defined(__OS2__)
1669 #ifdef UTF8_OUTPUT_ENABLE
1670 case 'w': /* UTF-8 output */
1672 output_conv = w_oconv; cp++;
1676 output_bom_f = TRUE;
1679 if ('1'== cp[0] && '6'==cp[1]) {
1680 output_conv = w_oconv16; cp+=2;
1681 } else if ('3'== cp[0] && '2'==cp[1]) {
1682 output_conv = w_oconv32; cp+=2;
1684 output_conv = w_oconv;
1689 output_endian = ENDIAN_LITTLE;
1690 } else if (cp[0] == 'B') {
1698 output_bom_f = TRUE;
1703 #ifdef UTF8_INPUT_ENABLE
1704 case 'W': /* UTF input */
1707 input_f = UTF8_INPUT;
1709 if ('1'== cp[0] && '6'==cp[1]) {
1711 input_f = UTF16_INPUT;
1712 input_endian = ENDIAN_BIG;
1713 } else if ('3'== cp[0] && '2'==cp[1]) {
1715 input_f = UTF32_INPUT;
1716 input_endian = ENDIAN_BIG;
1718 input_f = UTF8_INPUT;
1723 input_endian = ENDIAN_LITTLE;
1724 } else if (cp[0] == 'B') {
1730 /* Input code assumption */
1731 case 'J': /* JIS input */
1732 input_f = JIS_INPUT;
1734 case 'E': /* AT&T EUC input */
1735 input_f = EUC_INPUT;
1737 case 'S': /* MS Kanji input */
1738 input_f = SJIS_INPUT;
1739 if (x0201_f==NO_X0201) x0201_f=TRUE;
1741 case 'Z': /* Convert X0208 alphabet to asii */
1743 bit:0 Convert JIS X 0208 Alphabet to ASCII
1744 bit:1 Convert Kankaku to one space
1745 bit:2 Convert Kankaku to two spaces
1746 bit:3 Convert HTML Entity
1747 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1749 while ('0'<= *cp && *cp <='9') {
1750 alpha_f |= 1 << (*cp++ - '0');
1752 if (!alpha_f) alpha_f = 1;
1754 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1755 x0201_f = FALSE; /* No X0201->X0208 conversion */
1757 ESC-(-I in JIS, EUC, MS Kanji
1758 SI/SO in JIS, EUC, MS Kanji
1759 SSO in EUC, JIS, not in MS Kanji
1760 MS Kanji (0xa0-0xdf)
1762 ESC-(-I in JIS (0x20-0x5f)
1763 SSO in EUC (0xa0-0xdf)
1764 0xa0-0xd in MS Kanji (0xa0-0xdf)
1767 case 'X': /* Assume X0201 kana */
1768 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1771 case 'F': /* prserve new lines */
1772 fold_preserve_f = TRUE;
1773 case 'f': /* folding -f60 or -f */
1776 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1778 fold_len += *cp++ - '0';
1780 if (!(0<fold_len && fold_len<BUFSIZ))
1781 fold_len = DEFAULT_FOLD;
1785 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1787 fold_margin += *cp++ - '0';
1791 case 'm': /* MIME support */
1792 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1793 if (*cp=='B'||*cp=='Q') {
1794 mime_decode_mode = *cp++;
1795 mimebuf_f = FIXED_MIME;
1796 } else if (*cp=='N') {
1797 mime_f = TRUE; cp++;
1798 } else if (*cp=='S') {
1799 mime_f = STRICT_MIME; cp++;
1800 } else if (*cp=='0') {
1801 mime_decode_f = FALSE;
1802 mime_f = FALSE; cp++;
1805 case 'M': /* MIME output */
1808 mimeout_f = FIXED_MIME; cp++;
1809 } else if (*cp=='Q') {
1811 mimeout_f = FIXED_MIME; cp++;
1816 case 'B': /* Broken JIS support */
1818 bit:1 allow any x on ESC-(-x or ESC-$-x
1819 bit:2 reset to ascii on NL
1821 if ('9'>= *cp && *cp>='0')
1822 broken_f |= 1<<(*cp++ -'0');
1827 case 'O':/* for Output file */
1831 case 'c':/* add cr code */
1834 case 'd':/* delete cr code */
1837 case 'I': /* ISO-2022-JP output */
1840 case 'L': /* line mode */
1841 if (*cp=='u') { /* unix */
1842 nlmode_f = LF; cp++;
1843 } else if (*cp=='m') { /* mac */
1844 nlmode_f = CR; cp++;
1845 } else if (*cp=='w') { /* windows */
1846 nlmode_f = CRLF; cp++;
1847 } else if (*cp=='0') { /* no conversion */
1857 /* multiple options in a single string are allowed (for the Perl module) */
1858 while(*cp && *cp++!='-');
1861 /* bogus option but ignored */
/*
 * Look up an input_code entry by its converter function: starting at
 * input_code_list, an entry's iconv_func member is compared with the
 * iconv_func argument.
 * NOTE(review): presumably the matching entry is returned; confirm
 * against the complete function body.
 */
1867 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1870 struct input_code *p = input_code_list;
1872 if (iconv_func == p->iconv_func){
1881 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1883 #ifdef INPUT_CODE_FIX
1891 #ifdef INPUT_CODE_FIX
1892 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1898 if (estab_f && iconv_for_check != iconv){
1899 struct input_code *p = find_inputcode_byfunc(iconv);
1901 set_input_codename(p->name);
1904 iconv_for_check = iconv;
/*
 * Bit flags that set_code_score() ORs into an input_code's score.
 * Each flag is the previous one shifted left by one bit.
 */
1909 #define SCORE_L2 (1) /* JIS level-2 kanji */
1910 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1911 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1912 #ifdef SHIFTJIS_CP932
1913 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character remapped through CP932 */
1914 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* character that does not exist */
1916 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* character that does not exist */
1918 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* designated by MIME */
1919 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Score value that status_reset() stores into an input_code. */
1921 #define SCORE_INIT (SCORE_iMIME)
/*
 * SCORE_* flags consulted by code_score() when (c2 & 0x70) == 0x20;
 * the table is indexed by c2 & 0x0f.
 */
1923 static const char score_table_A0[] = {
1926 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1927 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * SCORE_* flags consulted by code_score() when (c2 & 0x70) == 0x70;
 * the table is indexed by c2 & 0x0f.
 */
1930 static const char score_table_F0[] = {
1931 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1932 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1933 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1934 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* Turn on the bits given in score within ptr->score. */
1937 void set_code_score(struct input_code *ptr, nkf_char score)
1940 ptr->score |= score;
/* Turn off the bits given in score within ptr->score. */
1944 void clr_code_score(struct input_code *ptr, nkf_char score)
1947 ptr->score &= ~score;
/*
 * Inspect the character stored in ptr->buf[0] (and ptr->buf[1] when
 * UTF8_OUTPUT_ENABLE is defined) and record a matching SCORE_* flag on
 * ptr through set_code_score().
 */
1951 void code_score(struct input_code *ptr)
1953 nkf_char c2 = ptr->buf[0];
1954 #ifdef UTF8_OUTPUT_ENABLE
1955 nkf_char c1 = ptr->buf[1];
1958 set_code_score(ptr, SCORE_ERROR);
1959 }else if (c2 == SSO){
1960 set_code_score(ptr, SCORE_KANA); /* first byte is SSO: scored as kana */
1961 #ifdef UTF8_OUTPUT_ENABLE
1962 }else if (!e2w_conv(c2, c1)){
1963 set_code_score(ptr, SCORE_NO_EXIST); /* e2w_conv() returned 0 for this pair */
1965 }else if ((c2 & 0x70) == 0x20){
1966 set_code_score(ptr, score_table_A0[c2 & 0x0f]); /* flag looked up by the low nibble of c2 */
1967 }else if ((c2 & 0x70) == 0x70){
1968 set_code_score(ptr, score_table_F0[c2 & 0x0f]); /* flag looked up by the low nibble of c2 */
1969 }else if ((c2 & 0x70) >= 0x50){
1970 set_code_score(ptr, SCORE_L2);
/*
 * Disable the input_code entry ptr. As the visible step, when ptr's
 * converter is the currently selected iconv, the selection is reset with
 * set_iconv(FALSE, 0).
 */
1974 void status_disable(struct input_code *ptr)
1979 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Store c into ptr->buf at position ptr->index, then advance ptr->index. */
1982 void status_push_ch(struct input_code *ptr, nkf_char c)
1984 ptr->buf[ptr->index++] = c;
1987 void status_clear(struct input_code *ptr)
/* Reset ptr; the visible step puts ptr->score back to SCORE_INIT. */
1993 void status_reset(struct input_code *ptr)
1996 ptr->score = SCORE_INIT;
1999 void status_reinit(struct input_code *ptr)
2002 ptr->_file_stat = 0;
2005 void status_check(struct input_code *ptr, nkf_char c)
2007 if (c <= DEL && estab_f){
2012 void s_status(struct input_code *ptr, nkf_char c)
2016 status_check(ptr, c);
2021 #ifdef NUMCHAR_OPTION
2022 }else if (is_unicode_capsule(c)){
2025 }else if (0xa1 <= c && c <= 0xdf){
2026 status_push_ch(ptr, SSO);
2027 status_push_ch(ptr, c);
2030 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2032 status_push_ch(ptr, c);
2033 #ifdef SHIFTJIS_CP932
2035 && is_ibmext_in_sjis(c)){
2037 status_push_ch(ptr, c);
2038 #endif /* SHIFTJIS_CP932 */
2040 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2042 status_push_ch(ptr, c);
2043 #endif /* X0212_ENABLE */
2045 status_disable(ptr);
2049 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2050 status_push_ch(ptr, c);
2051 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2055 status_disable(ptr);
2059 #ifdef SHIFTJIS_CP932
2060 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2061 status_push_ch(ptr, c);
2062 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2063 set_code_score(ptr, SCORE_CP932);
2068 #endif /* SHIFTJIS_CP932 */
2069 #ifndef X0212_ENABLE
2070 status_disable(ptr);
2076 void e_status(struct input_code *ptr, nkf_char c)
2080 status_check(ptr, c);
2085 #ifdef NUMCHAR_OPTION
2086 }else if (is_unicode_capsule(c)){
2089 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2091 status_push_ch(ptr, c);
2093 }else if (0x8f == c){
2095 status_push_ch(ptr, c);
2096 #endif /* X0212_ENABLE */
2098 status_disable(ptr);
2102 if (0xa1 <= c && c <= 0xfe){
2103 status_push_ch(ptr, c);
2107 status_disable(ptr);
2112 if (0xa1 <= c && c <= 0xfe){
2114 status_push_ch(ptr, c);
2116 status_disable(ptr);
2118 #endif /* X0212_ENABLE */
2122 #ifdef UTF8_INPUT_ENABLE
2123 void w_status(struct input_code *ptr, nkf_char c)
2127 status_check(ptr, c);
2132 #ifdef NUMCHAR_OPTION
2133 }else if (is_unicode_capsule(c)){
2136 }else if (0xc0 <= c && c <= 0xdf){
2138 status_push_ch(ptr, c);
2139 }else if (0xe0 <= c && c <= 0xef){
2141 status_push_ch(ptr, c);
2142 }else if (0xf0 <= c && c <= 0xf4){
2144 status_push_ch(ptr, c);
2146 status_disable(ptr);
2151 if (0x80 <= c && c <= 0xbf){
2152 status_push_ch(ptr, c);
2153 if (ptr->index > ptr->stat){
2154 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2155 && ptr->buf[2] == 0xbf);
2156 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2157 &ptr->buf[0], &ptr->buf[1]);
2164 status_disable(ptr);
2168 if (0x80 <= c && c <= 0xbf){
2169 if (ptr->index < ptr->stat){
2170 status_push_ch(ptr, c);
2175 status_disable(ptr);
2182 void code_status(nkf_char c)
2184 int action_flag = 1;
2185 struct input_code *result = 0;
2186 struct input_code *p = input_code_list;
2188 if (!p->status_func) {
2192 if (!p->status_func)
2194 (p->status_func)(p, c);
2197 }else if(p->stat == 0){
2208 if (result && !estab_f){
2209 set_iconv(TRUE, result->iconv_func);
2210 }else if (c <= DEL){
2211 struct input_code *ptr = input_code_list;
/*
 * Input primitive paired with std_ungetc(): the visible return path pops
 * the most recently pushed-back value off std_gc_buf.
 */
2221 nkf_char std_getc(FILE *f)
2224 return std_gc_buf[--std_gc_ndx];
/*
 * Push c back onto std_gc_buf so that std_getc() can hand it out later.
 * std_gc_ndx is compared with STD_GC_BUFSIZE before the value is stored.
 */
2230 nkf_char std_ungetc(nkf_char c, FILE *f)
2232 if (std_gc_ndx == STD_GC_BUFSIZE){
2235 std_gc_buf[std_gc_ndx++] = c;
2240 void std_putc(nkf_char c)
2247 #if !defined(PERL_XS) && !defined(WIN32DLL)
2248 nkf_char noconvert(FILE *f)
2253 module_connection();
2254 while ((c = (*i_getc)(f)) != EOF)
2261 void module_connection(void)
2263 oconv = output_conv;
2266 /* replace continuation module, from output side */
2268 /* output redirection */
2270 if (noout_f || guess_f){
2277 if (mimeout_f == TRUE) {
2278 o_base64conv = oconv; oconv = base64_conv;
2280 /* base64_count = 0; */
2283 if (nlmode_f || guess_f) {
2284 o_nlconv = oconv; oconv = nl_conv;
2287 o_rot_conv = oconv; oconv = rot_conv;
2290 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2293 o_hira_conv = oconv; oconv = hira_conv;
2296 o_fconv = oconv; oconv = fold_conv;
2299 if (alpha_f || x0201_f) {
2300 o_zconv = oconv; oconv = z_conv;
2304 i_ungetc = std_ungetc;
2305 /* input redirection */
2308 i_cgetc = i_getc; i_getc = cap_getc;
2309 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2312 i_ugetc = i_getc; i_getc = url_getc;
2313 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2316 #ifdef NUMCHAR_OPTION
2318 i_ngetc = i_getc; i_getc = numchar_getc;
2319 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2322 #ifdef UNICODE_NORMALIZATION
2323 if (nfc_f && input_f == UTF8_INPUT){
2324 i_nfc_getc = i_getc; i_getc = nfc_getc;
2325 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2328 if (mime_f && mimebuf_f==FIXED_MIME) {
2329 i_mgetc = i_getc; i_getc = mime_getc;
2330 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2333 i_bgetc = i_getc; i_getc = broken_getc;
2334 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2336 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2337 set_iconv(-TRUE, e_iconv);
2338 } else if (input_f == SJIS_INPUT) {
2339 set_iconv(-TRUE, s_iconv);
2340 #ifdef UTF8_INPUT_ENABLE
2341 } else if (input_f == UTF8_INPUT) {
2342 set_iconv(-TRUE, w_iconv);
2343 } else if (input_f == UTF16_INPUT) {
2344 set_iconv(-TRUE, w_iconv16);
2345 } else if (input_f == UTF32_INPUT) {
2346 set_iconv(-TRUE, w_iconv32);
2349 set_iconv(FALSE, e_iconv);
2353 struct input_code *p = input_code_list;
2361 * Check and Ignore BOM
2363 void check_bom(FILE *f)
2366 switch(c2 = (*i_getc)(f)){
2368 if((c2 = (*i_getc)(f)) == 0x00){
2369 if((c2 = (*i_getc)(f)) == 0xFE){
2370 if((c2 = (*i_getc)(f)) == 0xFF){
2372 set_iconv(TRUE, w_iconv32);
2374 if (iconv == w_iconv32) {
2375 input_endian = ENDIAN_BIG;
2378 (*i_ungetc)(0xFF,f);
2379 }else (*i_ungetc)(c2,f);
2380 (*i_ungetc)(0xFE,f);
2381 }else if(c2 == 0xFF){
2382 if((c2 = (*i_getc)(f)) == 0xFE){
2384 set_iconv(TRUE, w_iconv32);
2386 if (iconv == w_iconv32) {
2387 input_endian = ENDIAN_2143;
2390 (*i_ungetc)(0xFF,f);
2391 }else (*i_ungetc)(c2,f);
2392 (*i_ungetc)(0xFF,f);
2393 }else (*i_ungetc)(c2,f);
2394 (*i_ungetc)(0x00,f);
2395 }else (*i_ungetc)(c2,f);
2396 (*i_ungetc)(0x00,f);
2399 if((c2 = (*i_getc)(f)) == 0xBB){
2400 if((c2 = (*i_getc)(f)) == 0xBF){
2402 set_iconv(TRUE, w_iconv);
2404 if (iconv == w_iconv) {
2407 (*i_ungetc)(0xBF,f);
2408 }else (*i_ungetc)(c2,f);
2409 (*i_ungetc)(0xBB,f);
2410 }else (*i_ungetc)(c2,f);
2411 (*i_ungetc)(0xEF,f);
2414 if((c2 = (*i_getc)(f)) == 0xFF){
2415 if((c2 = (*i_getc)(f)) == 0x00){
2416 if((c2 = (*i_getc)(f)) == 0x00){
2418 set_iconv(TRUE, w_iconv32);
2420 if (iconv == w_iconv32) {
2421 input_endian = ENDIAN_3412;
2424 (*i_ungetc)(0x00,f);
2425 }else (*i_ungetc)(c2,f);
2426 (*i_ungetc)(0x00,f);
2427 }else (*i_ungetc)(c2,f);
2429 set_iconv(TRUE, w_iconv16);
2431 if (iconv == w_iconv16) {
2432 input_endian = ENDIAN_BIG;
2435 (*i_ungetc)(0xFF,f);
2436 }else (*i_ungetc)(c2,f);
2437 (*i_ungetc)(0xFE,f);
2440 if((c2 = (*i_getc)(f)) == 0xFE){
2441 if((c2 = (*i_getc)(f)) == 0x00){
2442 if((c2 = (*i_getc)(f)) == 0x00){
2444 set_iconv(TRUE, w_iconv32);
2446 if (iconv == w_iconv32) {
2447 input_endian = ENDIAN_LITTLE;
2450 (*i_ungetc)(0x00,f);
2451 }else (*i_ungetc)(c2,f);
2452 (*i_ungetc)(0x00,f);
2453 }else (*i_ungetc)(c2,f);
2455 set_iconv(TRUE, w_iconv16);
2457 if (iconv == w_iconv16) {
2458 input_endian = ENDIAN_LITTLE;
2461 (*i_ungetc)(0xFE,f);
2462 }else (*i_ungetc)(c2,f);
2463 (*i_ungetc)(0xFF,f);
2472 Conversion main loop. Code detection only.
2475 nkf_char kanji_convert(FILE *f)
2477 nkf_char c3, c2=0, c1, c0=0;
2478 int is_8bit = FALSE;
2480 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2481 #ifdef UTF8_INPUT_ENABLE
2482 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2489 output_mode = ASCII;
2492 #define NEXT continue /* no output, get next */
2493 #define SEND ; /* output c1 and c2, get next */
2494 #define LAST break /* end of loop, go closing */
2496 module_connection();
2499 while ((c1 = (*i_getc)(f)) != EOF) {
2500 #ifdef INPUT_CODE_FIX
2506 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2507 /* in case of 8th bit is on */
2508 if (!estab_f&&!mime_decode_mode) {
2509 /* in case of not established yet */
2510 /* It is still ambiguious */
2511 if (h_conv(f, c2, c1)==EOF)
2517 /* in case of already established */
2519 /* ignore bogus code and not CP5022x UCD */
2527 /* second byte, 7 bit code */
2528 /* it might be kanji shifted */
2529 if ((c1 == DEL) || (c1 <= SP)) {
2530 /* ignore bogus first code */
2537 #ifdef UTF8_INPUT_ENABLE
2538 if (iconv == w_iconv16) {
2539 if (input_endian == ENDIAN_BIG) {
2541 if ((c1 = (*i_getc)(f)) != EOF) {
2542 if (0xD8 <= c2 && c2 <= 0xDB) {
2543 if ((c0 = (*i_getc)(f)) != EOF) {
2545 if ((c3 = (*i_getc)(f)) != EOF) {
2552 if ((c2 = (*i_getc)(f)) != EOF) {
2553 if (0xD8 <= c2 && c2 <= 0xDB) {
2554 if ((c3 = (*i_getc)(f)) != EOF) {
2555 if ((c0 = (*i_getc)(f)) != EOF) {
2564 } else if(iconv == w_iconv32){
2566 if((c2 = (*i_getc)(f)) != EOF &&
2567 (c1 = (*i_getc)(f)) != EOF &&
2568 (c0 = (*i_getc)(f)) != EOF){
2569 switch(input_endian){
2571 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2574 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2577 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2580 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2590 #ifdef NUMCHAR_OPTION
2591 if (is_unicode_capsule(c1)){
2595 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2597 if (!estab_f && !iso8859_f) {
2598 /* not established yet */
2601 } else { /* estab_f==TRUE */
2606 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2607 /* SJIS X0201 Case... */
2608 if(iso2022jp_f && x0201_f==NO_X0201) {
2609 (*oconv)(GETA1, GETA2);
2616 } else if (c1==SSO && iconv != s_iconv) {
2617 /* EUC X0201 Case */
2618 c1 = (*i_getc)(f); /* skip SSO */
2620 if (SSP<=c1 && c1<0xe0) {
2621 if(iso2022jp_f && x0201_f==NO_X0201) {
2622 (*oconv)(GETA1, GETA2);
2629 } else { /* bogus code, skip SSO and one byte */
2632 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2633 (c1 == 0xFD || c1 == 0xFE)) {
2639 /* already established */
2644 } else if ((c1 > SP) && (c1 != DEL)) {
2645 /* in case of Roman characters */
2647 /* output 1 shifted byte */
2651 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2652 /* output 1 shifted byte */
2653 if(iso2022jp_f && x0201_f==NO_X0201) {
2654 (*oconv)(GETA1, GETA2);
2661 /* look like bogus code */
2664 } else if (input_mode == X0208 || input_mode == X0212 ||
2665 input_mode == X0213_1 || input_mode == X0213_2) {
2666 /* in case of Kanji shifted */
2669 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2670 /* Check MIME code */
2671 if ((c1 = (*i_getc)(f)) == EOF) {
2674 } else if (c1 == '?') {
2675 /* =? is mime conversion start sequence */
2676 if(mime_f == STRICT_MIME) {
2677 /* check in real detail */
2678 if (mime_begin_strict(f) == EOF)
2682 } else if (mime_begin(f) == EOF)
2692 /* normal ASCII code */
2695 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2698 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2701 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2702 if ((c1 = (*i_getc)(f)) == EOF) {
2703 /* (*oconv)(0, ESC); don't send bogus code */
2705 } else if (c1 == '$') {
2706 if ((c1 = (*i_getc)(f)) == EOF) {
2708 (*oconv)(0, ESC); don't send bogus code
2709 (*oconv)(0, '$'); */
2711 } else if (c1 == '@'|| c1 == 'B') {
2712 /* This is kanji introduction */
2715 set_input_codename("ISO-2022-JP");
2717 debug("ISO-2022-JP");
2720 } else if (c1 == '(') {
2721 if ((c1 = (*i_getc)(f)) == EOF) {
2722 /* don't send bogus code
2728 } else if (c1 == '@'|| c1 == 'B') {
2729 /* This is kanji introduction */
2734 } else if (c1 == 'D'){
2738 #endif /* X0212_ENABLE */
2739 } else if (c1 == (X0213_1&0x7F)){
2740 input_mode = X0213_1;
2743 } else if (c1 == (X0213_2&0x7F)){
2744 input_mode = X0213_2;
2748 /* could be some special code */
2755 } else if (broken_f&0x2) {
2756 /* accept any ESC-(-x as broken code ... */
2766 } else if (c1 == '(') {
2767 if ((c1 = (*i_getc)(f)) == EOF) {
2768 /* don't send bogus code
2770 (*oconv)(0, '('); */
2774 /* This is X0201 kana introduction */
2775 input_mode = X0201; shift_mode = X0201;
2777 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2778 /* This is X0208 kanji introduction */
2779 input_mode = ASCII; shift_mode = FALSE;
2781 } else if (broken_f&0x2) {
2782 input_mode = ASCII; shift_mode = FALSE;
2787 /* maintain various input_mode here */
2791 } else if ( c1 == 'N' || c1 == 'n'){
2793 c3 = (*i_getc)(f); /* skip SS2 */
2794 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2809 } else if (c1 == ESC && iconv == s_iconv) {
2810 /* ESC in Shift_JIS */
2811 if ((c1 = (*i_getc)(f)) == EOF) {
2812 /* (*oconv)(0, ESC); don't send bogus code */
2814 } else if (c1 == '$') {
2816 if ((c1 = (*i_getc)(f)) == EOF) {
2818 (*oconv)(0, ESC); don't send bogus code
2819 (*oconv)(0, '$'); */
2822 if (('E' <= c1 && c1 <= 'G') ||
2823 ('O' <= c1 && c1 <= 'Q')) {
2831 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2832 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2833 while ((c1 = (*i_getc)(f)) != EOF) {
2834 if (SP <= c1 && c1 <= 'z') {
2835 (*oconv)(0, c1 + c0);
2836 } else break; /* c1 == SO */
2840 if (c1 == EOF) LAST;
2847 } else if (c1 == LF || c1 == CR) {
2849 input_mode = ASCII; set_iconv(FALSE, 0);
2851 } else if (mime_decode_f && !mime_decode_mode){
2853 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2861 } else { /* if (c1 == CR)*/
2862 if ((c1=(*i_getc)(f))!=EOF) {
2866 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2880 } else if (c1 == DEL && input_mode == X0208) {
2890 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2893 if ((c0 = (*i_getc)(f)) != EOF) {
2896 if ((c3 = (*i_getc)(f)) != EOF) {
2898 (*iconv)(c2, c1, c0|c3);
2903 /* 3 bytes EUC or UTF-8 */
2904 if ((c0 = (*i_getc)(f)) != EOF) {
2906 (*iconv)(c2, c1, c0);
2914 0x7F <= c2 && c2 <= 0x92 &&
2915 0x21 <= c1 && c1 <= 0x7E) {
2917 if(c1 == 0x7F) return 0;
2918 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2921 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2925 (*oconv)(PREFIX_EUCG3 | c2, c1);
2927 #endif /* X0212_ENABLE */
2929 (*oconv)(PREFIX_EUCG3 | c2, c1);
2932 (*oconv)(input_mode, c1); /* other special case */
2938 /* goto next_word */
2942 (*iconv)(EOF, 0, 0);
2943 if (!input_codename)
2946 struct input_code *p = input_code_list;
2947 struct input_code *result = p;
2949 if (p->score < result->score) result = p;
2952 set_input_codename(result->name);
2954 debug(result->name);
2962 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2964 nkf_char ret, c3, c0;
2968 /** it must NOT be in the kanji shift sequence */
2969 /** it must NOT be written in JIS7 */
2970 /** and it must be after 2 byte 8bit code */
2976 while ((c1 = (*i_getc)(f)) != EOF) {
2982 if (push_hold_buf(c1) == EOF || estab_f){
2988 struct input_code *p = input_code_list;
2989 struct input_code *result = p;
2994 if (p->status_func && p->score < result->score){
2999 set_iconv(TRUE, result->iconv_func);
3004 ** 1) EOF is detected, or
3005 ** 2) Code is established, or
3006 ** 3) Buffer is FULL (but last word is pushed)
3008 ** in 1) and 3) cases, we continue to use
3009 ** Kanji codes by oconv and leave estab_f unchanged.
3014 while (hold_index < hold_count){
3015 c2 = hold_buf[hold_index++];
3017 #ifdef NUMCHAR_OPTION
3018 || is_unicode_capsule(c2)
3023 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3024 (*iconv)(X0201, c2, 0);
3027 if (hold_index < hold_count){
3028 c1 = hold_buf[hold_index++];
3038 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3041 if (hold_index < hold_count){
3042 c0 = hold_buf[hold_index++];
3043 } else if ((c0 = (*i_getc)(f)) == EOF) {
3049 if (hold_index < hold_count){
3050 c3 = hold_buf[hold_index++];
3051 } else if ((c3 = (*i_getc)(f)) == EOF) {
3056 (*iconv)(c2, c1, c0|c3);
3061 /* 3 bytes EUC or UTF-8 */
3062 if (hold_index < hold_count){
3063 c0 = hold_buf[hold_index++];
3064 } else if ((c0 = (*i_getc)(f)) == EOF) {
3070 (*iconv)(c2, c1, c0);
3073 if (c0 == EOF) break;
/*
 * Append c2, narrowed to unsigned char, to hold_buf.
 * After the store, returns EOF once hold_count has reached HOLD_SIZE*2,
 * otherwise the updated hold_count.
 */
3078 nkf_char push_hold_buf(nkf_char c2)
3080 if (hold_count >= HOLD_SIZE*2) /* hold_count is already at the HOLD_SIZE*2 limit */
3082 hold_buf[hold_count++] = (unsigned char)c2;
3083 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3086 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3088 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3091 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3092 #ifdef SHIFTJIS_CP932
3093 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3094 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3101 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3102 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3108 #endif /* SHIFTJIS_CP932 */
3110 if (!x0213_f && is_ibmext_in_sjis(c2)){
3111 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3114 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3127 if(x0213_f && c2 >= 0xF0){
3128 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3129 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3130 }else{ /* 78<=k<=94 */
3131 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3132 if (0x9E < c1) c2++;
3135 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3136 if (0x9E < c1) c2++;
3139 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3146 c2 = x0212_unshift(c2);
3153 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3157 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3159 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3161 if(c1 == 0x7F) return 0;
3162 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3165 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3166 if (ret) return ret;
3172 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3177 }else if (c2 == 0x8f){
3181 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3182 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3183 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3186 c2 = (c2 << 8) | (c1 & 0x7f);
3188 #ifdef SHIFTJIS_CP932
3191 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3192 s2e_conv(s2, s1, &c2, &c1);
3199 #endif /* SHIFTJIS_CP932 */
3201 #endif /* X0212_ENABLE */
3202 } else if (c2 == SSO){
3205 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3208 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3209 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3210 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3215 #ifdef SHIFTJIS_CP932
3216 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3218 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3219 s2e_conv(s2, s1, &c2, &c1);
3226 #endif /* SHIFTJIS_CP932 */
3233 #ifdef UTF8_INPUT_ENABLE
3234 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3241 }else if (0xc0 <= c2 && c2 <= 0xef) {
3242 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3243 #ifdef NUMCHAR_OPTION
3246 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3254 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3257 static const char w_iconv_utf8_1st_byte[] =
3259 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3260 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3261 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3262 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3264 if (c2 < 0 || 0xff < c2) {
3265 }else if (c2 == 0) { /* 0 : 1 byte*/
3267 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3270 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3272 if (c1 < 0x80 || 0xBF < c1) return 0;
3275 if (c0 == 0) return -1;
3276 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3281 if (c0 == 0) return -1;
3282 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3286 if (c0 == 0) return -1;
3287 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3291 if (c0 == 0) return -2;
3292 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3296 if (c0 == 0) return -2;
3297 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3301 if (c0 == 0) return -2;
3302 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3310 if (c2 == 0 || c2 == EOF){
3311 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3312 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3315 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3324 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3325 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3332 }else if (val < 0x800){
3333 *p2 = 0xc0 | (val >> 6);
3334 *p1 = 0x80 | (val & 0x3f);
3336 } else if (val <= NKF_INT32_C(0xFFFF)) {
3337 *p2 = 0xe0 | (val >> 12);
3338 *p1 = 0x80 | ((val >> 6) & 0x3f);
3339 *p0 = 0x80 | (val & 0x3f);
3340 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3341 *p2 = 0xe0 | (val >> 16);
3342 *p1 = 0x80 | ((val >> 12) & 0x3f);
3343 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3352 #ifdef UTF8_INPUT_ENABLE
3353 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3358 } else if (c2 >= 0xf0){
3359 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3360 val = (c2 & 0x0f) << 18;
3361 val |= (c1 & 0x3f) << 12;
3362 val |= (c0 & 0x3f00) >> 2;
3364 }else if (c2 >= 0xe0){
3365 val = (c2 & 0x0f) << 12;
3366 val |= (c1 & 0x3f) << 6;
3368 }else if (c2 >= 0xc0){
3369 val = (c2 & 0x1f) << 6;
3377 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3379 nkf_char c2, c1, c0;
3386 w16w_conv(val, &c2, &c1, &c0);
3387 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3388 #ifdef NUMCHAR_OPTION
3391 *p1 = CLASS_UNICODE | val;
3400 #ifdef UTF8_INPUT_ENABLE
3401 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3404 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3407 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3408 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3410 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3412 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3417 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3418 if (ret) return ret;
3423 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3427 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3428 } else if (is_unicode_bmp(c1)) {
3429 ret = w16e_conv(c1, &c2, &c1);
3432 c1 = CLASS_UNICODE | c1;
3434 if (ret) return ret;
3439 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3441 const unsigned short *const *pp;
3442 const unsigned short *const *const *ppp;
/* Lookup tables consulted by the no_best_fit_chars_f checks further down in
 * this function.  Each table has 64 entries and is indexed with (c1 & 0x3F);
 * in the uses visible here a non-zero entry makes the function return 1.
 * NOTE(review): some of the surrounding conditions are outside this view, so
 * the exact branch structure should be confirmed against the full function. */

/* Tested next to the `!cp932inv_f` branch.  Contains the value 2 as well as
 * 0/1; the visible test only checks for non-zero. */
3443 static const char no_best_fit_chars_table_C2[] =
3444 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3445 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3446 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3447 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
/* Tested when ms_ucs_map_f == UCS_MAP_MS and c2 == 0xC2. */
3448 static const char no_best_fit_chars_table_C2_ms[] =
3449 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3450 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3451 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3452 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
/* Tested next to the ms_ucs_map_f == UCS_MAP_CP932 branch. */
3453 static const char no_best_fit_chars_table_932_C2[] =
3454 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3455 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3456 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3457 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
/* Tested in two places below: once after the CP932 check and once after the
 * `!cp932inv_f` check. */
3458 static const char no_best_fit_chars_table_932_C3[] =
3459 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3460 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3461 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3462 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3468 }else if(c2 < 0xe0){
3469 if(no_best_fit_chars_f){
3470 if(ms_ucs_map_f == UCS_MAP_CP932){
3473 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3476 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3479 }else if(!cp932inv_f){
3482 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3485 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3488 }else if(ms_ucs_map_f == UCS_MAP_MS){
3489 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3490 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3508 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3509 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3510 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3512 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3513 }else if(c0 < 0xF0){
3514 if(no_best_fit_chars_f){
3515 if(ms_ucs_map_f == UCS_MAP_CP932){
3516 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3517 }else if(ms_ucs_map_f == UCS_MAP_MS){
3522 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3525 if(c0 == 0x92) return 1;
3530 if(c1 == 0x80 || c0 == 0x9C) return 1;
3533 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3538 if(c0 == 0x94) return 1;
3541 if(c0 == 0xBB) return 1;
3551 if(c0 == 0x95) return 1;
3554 if(c0 == 0xA5) return 1;
3561 if(c0 == 0x8D) return 1;
3564 if(c0 == 0x9E && !cp932inv_f) return 1;
3567 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3575 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3576 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3577 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3579 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3581 #ifdef SHIFTJIS_CP932
3582 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3584 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3585 s2e_conv(s2, s1, p2, p1);
3594 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3597 const unsigned short *p;
3600 if (pp == 0) return 1;
3603 if (c1 < 0 || psize <= c1) return 1;
3605 if (p == 0) return 1;
3608 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3610 if (val == 0) return 1;
3611 if (no_cp932ext_f && (
3612 (val>>8) == 0x2D || /* NEC special characters */
3613 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3621 if (c2 == SO) c2 = X0201;
3628 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3635 (*f)(0, bin2hex(c>>shift));
3645 void encode_fallback_html(nkf_char c)
3650 if(c >= NKF_INT32_C(1000000))
3651 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3652 if(c >= NKF_INT32_C(100000))
3653 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3655 (*oconv)(0, 0x30+(c/10000 )%10);
3657 (*oconv)(0, 0x30+(c/1000 )%10);
3659 (*oconv)(0, 0x30+(c/100 )%10);
3661 (*oconv)(0, 0x30+(c/10 )%10);
3663 (*oconv)(0, 0x30+ c %10);
3668 void encode_fallback_xml(nkf_char c)
3673 nkf_each_char_to_hex(oconv, c);
3678 void encode_fallback_java(nkf_char c)
3682 if(!is_unicode_bmp(c)){
3686 (*oconv)(0, bin2hex(c>>20));
3687 (*oconv)(0, bin2hex(c>>16));
3691 (*oconv)(0, bin2hex(c>>12));
3692 (*oconv)(0, bin2hex(c>> 8));
3693 (*oconv)(0, bin2hex(c>> 4));
3694 (*oconv)(0, bin2hex(c ));
3698 void encode_fallback_perl(nkf_char c)
3703 nkf_each_char_to_hex(oconv, c);
3708 void encode_fallback_subchar(nkf_char c)
3710 c = unicode_subchar;
3711 (*oconv)((c>>8)&0xFF, c&0xFF);
3716 #ifdef UTF8_OUTPUT_ENABLE
3717 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3719 const unsigned short *p;
3722 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3730 p = euc_to_utf8_1byte;
3732 } else if (is_eucg3(c2)){
3733 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3736 c2 = (c2&0x7f) - 0x21;
3737 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3738 p = x0212_to_utf8_2bytes[c2];
3744 c2 = (c2&0x7f) - 0x21;
3745 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3747 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3748 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3749 euc_to_utf8_2bytes_ms[c2];
3754 c1 = (c1 & 0x7f) - 0x21;
3755 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3760 void w_oconv(nkf_char c2, nkf_char c1)
3766 output_bom_f = FALSE;
3777 #ifdef NUMCHAR_OPTION
3778 if (c2 == 0 && is_unicode_capsule(c1)){
3779 val = c1 & VALUE_MASK;
3782 }else if (val < 0x800){
3783 (*o_putc)(0xC0 | (val >> 6));
3784 (*o_putc)(0x80 | (val & 0x3f));
3785 } else if (val <= NKF_INT32_C(0xFFFF)) {
3786 (*o_putc)(0xE0 | (val >> 12));
3787 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3788 (*o_putc)(0x80 | (val & 0x3f));
3789 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3790 (*o_putc)(0xF0 | ( val>>18));
3791 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3792 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3793 (*o_putc)(0x80 | ( val & 0x3f));
3800 output_mode = ASCII;
3802 } else if (c2 == ISO8859_1) {
3803 output_mode = ISO8859_1;
3804 (*o_putc)(c1 | 0x080);
3807 val = e2w_conv(c2, c1);
3809 w16w_conv(val, &c2, &c1, &c0);
3813 if (c0) (*o_putc)(c0);
3819 void w_oconv16(nkf_char c2, nkf_char c1)
3822 output_bom_f = FALSE;
3823 if (output_endian == ENDIAN_LITTLE){
3824 (*o_putc)((unsigned char)'\377');
3828 (*o_putc)((unsigned char)'\377');
3837 if (c2 == ISO8859_1) {
3840 #ifdef NUMCHAR_OPTION
3841 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3842 if (is_unicode_bmp(c1)) {
3843 c2 = (c1 >> 8) & 0xff;
3847 if (c1 <= UNICODE_MAX) {
3848 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3849 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3850 if (output_endian == ENDIAN_LITTLE){
3851 (*o_putc)(c2 & 0xff);
3852 (*o_putc)((c2 >> 8) & 0xff);
3853 (*o_putc)(c1 & 0xff);
3854 (*o_putc)((c1 >> 8) & 0xff);
3856 (*o_putc)((c2 >> 8) & 0xff);
3857 (*o_putc)(c2 & 0xff);
3858 (*o_putc)((c1 >> 8) & 0xff);
3859 (*o_putc)(c1 & 0xff);
3866 nkf_char val = e2w_conv(c2, c1);
3867 c2 = (val >> 8) & 0xff;
3871 if (output_endian == ENDIAN_LITTLE){
3880 void w_oconv32(nkf_char c2, nkf_char c1)
3883 output_bom_f = FALSE;
3884 if (output_endian == ENDIAN_LITTLE){
3885 (*o_putc)((unsigned char)'\377');
3893 (*o_putc)((unsigned char)'\377');
3902 if (c2 == ISO8859_1) {
3904 #ifdef NUMCHAR_OPTION
3905 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3909 c1 = e2w_conv(c2, c1);
3912 if (output_endian == ENDIAN_LITTLE){
3913 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3914 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3915 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3919 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3920 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3921 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3926 void e_oconv(nkf_char c2, nkf_char c1)
3928 #ifdef NUMCHAR_OPTION
3929 if (c2 == 0 && is_unicode_capsule(c1)){
3930 w16e_conv(c1, &c2, &c1);
3931 if (c2 == 0 && is_unicode_capsule(c1)){
3932 c2 = c1 & VALUE_MASK;
3933 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
3937 c2 += c2 < 10 ? 0x75 : 0x8FEB;
3938 c1 = 0x21 + c1 % 94;
3941 (*o_putc)((c2 & 0x7f) | 0x080);
3942 (*o_putc)(c1 | 0x080);
3944 (*o_putc)((c2 & 0x7f) | 0x080);
3945 (*o_putc)(c1 | 0x080);
3949 if (encode_fallback) (*encode_fallback)(c1);
3958 } else if (c2 == 0) {
3959 output_mode = ASCII;
3961 } else if (c2 == X0201) {
3962 output_mode = JAPANESE_EUC;
3963 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3964 } else if (c2 == ISO8859_1) {
3965 output_mode = ISO8859_1;
3966 (*o_putc)(c1 | 0x080);
3968 } else if (is_eucg3(c2)){
3969 output_mode = JAPANESE_EUC;
3970 #ifdef SHIFTJIS_CP932
3973 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3974 s2e_conv(s2, s1, &c2, &c1);
3979 output_mode = ASCII;
3981 }else if (is_eucg3(c2)){
3984 (*o_putc)((c2 & 0x7f) | 0x080);
3985 (*o_putc)(c1 | 0x080);
3988 (*o_putc)((c2 & 0x7f) | 0x080);
3989 (*o_putc)(c1 | 0x080);
3993 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3994 set_iconv(FALSE, 0);
3995 return; /* too late to rescue this char */
3997 output_mode = JAPANESE_EUC;
3998 (*o_putc)(c2 | 0x080);
3999 (*o_putc)(c1 | 0x080);
4004 nkf_char x0212_shift(nkf_char c)
4009 if (0x75 <= c && c <= 0x7f){
4010 ret = c + (0x109 - 0x75);
4013 if (0x75 <= c && c <= 0x7f){
4014 ret = c + (0x113 - 0x75);
4021 nkf_char x0212_unshift(nkf_char c)
4024 if (0x7f <= c && c <= 0x88){
4025 ret = c + (0x75 - 0x7f);
4026 }else if (0x89 <= c && c <= 0x92){
4027 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4031 #endif /* X0212_ENABLE */
4033 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4039 if((0x21 <= ndx && ndx <= 0x2F)){
4040 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4041 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4043 }else if(0x6E <= ndx && ndx <= 0x7E){
4044 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4045 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4051 else if(nkf_isgraph(ndx)){
4053 const unsigned short *ptr;
4054 ptr = x0212_shiftjis[ndx - 0x21];
4056 val = ptr[(c1 & 0x7f) - 0x21];
4065 c2 = x0212_shift(c2);
4067 #endif /* X0212_ENABLE */
4069 if(0x7F < c2) return 1;
4070 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4071 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4075 void s_oconv(nkf_char c2, nkf_char c1)
4077 #ifdef NUMCHAR_OPTION
4078 if (c2 == 0 && is_unicode_capsule(c1)){
4079 w16e_conv(c1, &c2, &c1);
4080 if (c2 == 0 && is_unicode_capsule(c1)){
4081 c2 = c1 & VALUE_MASK;
4082 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4085 c2 = c1 / 188 + 0xF0;
4087 c1 += 0x40 + (c1 > 0x3e);
4092 if(encode_fallback)(*encode_fallback)(c1);
4101 } else if (c2 == 0) {
4102 output_mode = ASCII;
4104 } else if (c2 == X0201) {
4105 output_mode = SHIFT_JIS;
4107 } else if (c2 == ISO8859_1) {
4108 output_mode = ISO8859_1;
4109 (*o_putc)(c1 | 0x080);
4111 } else if (is_eucg3(c2)){
4112 output_mode = SHIFT_JIS;
4113 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4119 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4120 set_iconv(FALSE, 0);
4121 return; /* too late to rescue this char */
4123 output_mode = SHIFT_JIS;
4124 e2s_conv(c2, c1, &c2, &c1);
4126 #ifdef SHIFTJIS_CP932
4128 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4129 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4135 #endif /* SHIFTJIS_CP932 */
4138 if (prefix_table[(unsigned char)c1]){
4139 (*o_putc)(prefix_table[(unsigned char)c1]);
4145 void j_oconv(nkf_char c2, nkf_char c1)
4147 #ifdef NUMCHAR_OPTION
4148 if (c2 == 0 && is_unicode_capsule(c1)){
4149 w16e_conv(c1, &c2, &c1);
4150 if (c2 == 0 && is_unicode_capsule(c1)){
4151 c2 = c1 & VALUE_MASK;
4152 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4155 c2 = 0x7F + c1 / 94;
4156 c1 = 0x21 + c1 % 94;
4158 if (encode_fallback) (*encode_fallback)(c1);
4165 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4168 (*o_putc)(ascii_intro);
4169 output_mode = ASCII;
4173 } else if (is_eucg3(c2)){
4175 if(output_mode!=X0213_2){
4176 output_mode = X0213_2;
4180 (*o_putc)(X0213_2&0x7F);
4183 if(output_mode!=X0212){
4184 output_mode = X0212;
4188 (*o_putc)(X0212&0x7F);
4191 (*o_putc)(c2 & 0x7f);
4194 } else if (c2==X0201) {
4195 if (output_mode!=X0201) {
4196 output_mode = X0201;
4202 } else if (c2==ISO8859_1) {
4203 /* iso8859 introduction, or 8th bit on */
4204 /* Can we convert in 7bit form using ESC-'-'-A ?
4206 output_mode = ISO8859_1;
4208 } else if (c2 == 0) {
4209 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4212 (*o_putc)(ascii_intro);
4213 output_mode = ASCII;
4218 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4219 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4221 if (output_mode!=X0213_1) {
4222 output_mode = X0213_1;
4226 (*o_putc)(X0213_1&0x7F);
4228 }else if (output_mode != X0208) {
4229 output_mode = X0208;
4232 (*o_putc)(kanji_intro);
4239 void base64_conv(nkf_char c2, nkf_char c1)
4241 mime_prechar(c2, c1);
4242 (*o_base64conv)(c2,c1);
/* State shared by broken_getc() and broken_ungetc().
 * broken_buf and broken_counter form a small pushback stack: broken_getc()
 * returns broken_buf[--broken_counter] while broken_counter > 0, and
 * broken_ungetc() stores a character only while broken_counter < 2.
 * broken_last is compared against ESC before broken_getc() gives '$' or '('
 * special treatment; presumably it holds the previously read character, but
 * the assignment is not visible in this chunk. */
4246 static nkf_char broken_buf[3];
4247 static int broken_counter = 0;
4248 static int broken_last = 0;
4249 nkf_char broken_getc(FILE *f)
4253 if (broken_counter>0) {
4254 return broken_buf[--broken_counter];
4257 if (c=='$' && broken_last != ESC
4258 && (input_mode==ASCII || input_mode==X0201)) {
4261 if (c1=='@'|| c1=='B') {
4262 broken_buf[0]=c1; broken_buf[1]=c;
4269 } else if (c=='(' && broken_last != ESC
4270 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4273 if (c1=='J'|| c1=='B') {
4274 broken_buf[0]=c1; broken_buf[1]=c;
4287 nkf_char broken_ungetc(nkf_char c, FILE *f)
4289 if (broken_counter<2)
4290 broken_buf[broken_counter++]=c;
4294 void nl_conv(nkf_char c2, nkf_char c1)
4296 if (guess_f && input_nextline != EOF) {
4297 if (c2 == 0 && c1 == LF) {
4298 if (!input_nextline) input_nextline = prev_cr ? CRLF : LF;
4299 else if (input_nextline != (prev_cr ? CRLF : LF)) input_nextline = EOF;
4300 } else if (c2 == 0 && c1 == CR && input_nextline == LF) input_nextline = EOF;
4302 else if (!input_nextline) input_nextline = CR;
4303 else if (input_nextline != CR) input_nextline = EOF;
4305 if (prev_cr || c2 == 0 && c1 == LF) {
4307 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4308 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4310 if (c2 == 0 && c1 == CR) prev_cr = CR;
4311 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4315 Return value of fold_conv()
4317 LF add newline and output char
4318 CR add newline and output nothing
4321 1 (or else) normal output
4323 fold state in prev (previous character)
4325 >0x80 Japanese (X0208/X0201)
4330 This fold algorithm does not preserve leading spaces in a line.
4331 This is the main difference from fmt.
/* Width added to fold_conv's f_line counter for one character: 2 when c2 is
 * non-zero, otherwise 1.  The c1 argument is accepted but not used.
 * NOTE(review): c2 is expanded without parentheses; every call visible in
 * this chunk passes the plain identifier c2, so precedence is not an issue
 * at those call sites. */
4334 #define char_size(c2,c1) (c2?2:1)
4336 void fold_conv(nkf_char c2, nkf_char c1)
4339 nkf_char fold_state;
4341 if (c1== CR && !fold_preserve_f) {
4342 fold_state=0; /* ignore cr */
4343 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4345 fold_state=0; /* ignore cr */
4346 } else if (c1== BS) {
4347 if (f_line>0) f_line--;
4349 } else if (c2==EOF && f_line != 0) { /* close open last line */
4351 } else if ((c1==LF && !fold_preserve_f)
4352 || ((c1==CR||(c1==LF&&f_prev!=CR))
4353 && fold_preserve_f)) {
4355 if (fold_preserve_f) {
4359 } else if ((f_prev == c1 && !fold_preserve_f)
4360 || (f_prev == LF && fold_preserve_f)
4361 ) { /* duplicate newline */
4364 fold_state = LF; /* output two newline */
4370 if (f_prev&0x80) { /* Japanese? */
4372 fold_state = 0; /* ignore given single newline */
4373 } else if (f_prev==SP) {
4377 if (++f_line<=fold_len)
4381 fold_state = CR; /* fold and output nothing */
4385 } else if (c1=='\f') {
4388 fold_state = LF; /* output newline and clear */
4389 } else if ( (c2==0 && c1==SP)||
4390 (c2==0 && c1==TAB)||
4391 (c2=='!'&& c1=='!')) {
4392 /* X0208 kankaku or ascii space */
4394 fold_state = 0; /* remove duplicate spaces */
4397 if (++f_line<=fold_len)
4398 fold_state = SP; /* output ASCII space only */
4400 f_prev = SP; f_line = 0;
4401 fold_state = CR; /* fold and output nothing */
4405 prev0 = f_prev; /* we still need this one... , but almost done */
4407 if (c2 || c2==X0201)
4408 f_prev |= 0x80; /* this is Japanese */
4409 f_line += char_size(c2,c1);
4410 if (f_line<=fold_len) { /* normal case */
4413 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4414 f_line = char_size(c2,c1);
4415 fold_state = LF; /* We can't wait, do fold now */
4416 } else if (c2==X0201) {
4417 /* simple kinsoku rules return 1 means no folding */
4418 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4419 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4420 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4421 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4422 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4423 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4424 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4426 fold_state = LF;/* add one new f_line before this character */
4429 fold_state = LF;/* add one new f_line before this character */
4432 /* kinsoku point in ASCII */
4433 if ( c1==')'|| /* { [ ( */
4444 /* just after special */
4445 } else if (!is_alnum(prev0)) {
4446 f_line = char_size(c2,c1);
4448 } else if ((prev0==SP) || /* ignored new f_line */
4449 (prev0==LF)|| /* ignored new f_line */
4450 (prev0&0x80)) { /* X0208 - ASCII */
4451 f_line = char_size(c2,c1);
4452 fold_state = LF;/* add one new f_line before this character */
4454 fold_state = 1; /* default no fold in ASCII */
4458 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4459 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4460 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4461 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4462 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4463 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4464 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4465 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4466 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4467 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4468 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4469 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4470 /* default no fold in kinsoku */
4473 f_line = char_size(c2,c1);
4474 /* add one new f_line before this character */
4477 f_line = char_size(c2,c1);
4479 /* add one new f_line before this character */
4484 /* terminator process */
4485 switch(fold_state) {
/* Character pair remembered by z_conv() across calls.  In the visible code
 * z_prev2 is tested against X0201 and (z_prev1 - SP) indexes the dv/ev/cv
 * tables when the next character arrives.
 * NOTE(review): the assignments to these variables are outside this chunk. */
4504 nkf_char z_prev2=0,z_prev1=0;
4506 void z_conv(nkf_char c2, nkf_char c1)
4509 /* if (c2) c1 &= 0x7f; assertion */
4511 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4517 if (z_prev2 == X0201) {
4519 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4521 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4523 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4525 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4530 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4533 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4534 /* wait for dakuten (voiced sound mark) or handakuten (semi-voiced sound mark) */
4539 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4550 if (alpha_f&1 && c2 == 0x23) {
4551 /* JISX0208 Alphabet */
4553 } else if (c2 == 0x21) {
4554 /* JISX0208 Kigou */
4559 } else if (alpha_f&4) {
4564 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4570 if (alpha_f&8 && c2 == 0) {
4574 case '>': entity = ">"; break;
4575 case '<': entity = "<"; break;
4576 case '\"': entity = """; break;
4577 case '&': entity = "&"; break;
4580 while (*entity) (*o_zconv)(0, *entity++);
4586 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4591 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4595 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4599 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4603 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4607 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4611 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4615 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4619 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4624 (*o_zconv)(X0201, c);
4627 } else if (c2 == 0x25) {
4628 /* JISX0208 Katakana */
4629 static const int fullwidth_to_halfwidth[] =
4631 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4632 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4633 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4634 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4635 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4636 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4637 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4638 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4639 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4640 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4641 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4642 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4644 if (fullwidth_to_halfwidth[c1-0x20]){
4645 c2 = fullwidth_to_halfwidth[c1-0x20];
4646 (*o_zconv)(X0201, c2>>8);
4648 (*o_zconv)(X0201, c2&0xFF);
4658 #define rot13(c) ( \
4660 (c <= 'M') ? (c + 13): \
4661 (c <= 'Z') ? (c - 13): \
4663 (c <= 'm') ? (c + 13): \
4664 (c <= 'z') ? (c - 13): \
4668 #define rot47(c) ( \
4670 ( c <= 'O') ? (c + 47) : \
4671 ( c <= '~') ? (c - 47) : \
4675 void rot_conv(nkf_char c2, nkf_char c1)
4677 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4683 (*o_rot_conv)(c2,c1);
4686 void hira_conv(nkf_char c2, nkf_char c1)
4690 if (0x20 < c1 && c1 < 0x74) {
4692 (*o_hira_conv)(c2,c1);
4694 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4696 c1 = CLASS_UNICODE | 0x3094;
4697 (*o_hira_conv)(c2,c1);
4700 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4702 (*o_hira_conv)(c2,c1);
4707 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4710 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4712 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4716 (*o_hira_conv)(c2,c1);
4720 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4722 static const nkf_char range[RANGE_NUM_MAX][2] = {
4743 nkf_char start, end, c;
4745 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4749 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4754 for (i = 0; i < RANGE_NUM_MAX; i++) {
4755 start = range[i][0];
4758 if (c >= start && c <= end) {
4763 (*o_iso2022jp_check_conv)(c2,c1);
4767 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4769 static const unsigned char *mime_pattern[] = {
4770 (const unsigned char *)"\075?EUC-JP?B?",
4771 (const unsigned char *)"\075?SHIFT_JIS?B?",
4772 (const unsigned char *)"\075?ISO-8859-1?Q?",
4773 (const unsigned char *)"\075?ISO-8859-1?B?",
4774 (const unsigned char *)"\075?ISO-2022-JP?B?",
4775 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4776 #if defined(UTF8_INPUT_ENABLE)
4777 (const unsigned char *)"\075?UTF-8?B?",
4778 (const unsigned char *)"\075?UTF-8?Q?",
4780 (const unsigned char *)"\075?US-ASCII?Q?",
4785 /* Marker used to raise the priority of the corresponding input code */
4786 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4787 e_iconv, s_iconv, 0, 0, 0, 0,
4788 #if defined(UTF8_INPUT_ENABLE)
4794 static const nkf_char mime_encode[] = {
4795 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4796 #if defined(UTF8_INPUT_ENABLE)
4803 static const nkf_char mime_encode_method[] = {
4804 'B', 'B','Q', 'B', 'B', 'Q',
4805 #if defined(UTF8_INPUT_ENABLE)
/* Upper bound on how many characters of a MIME preamble are examined: it
 * sizes the recovery buffer r[] in mime_begin_strict() and limits the scan
 * loops in mime_begin(). */
4813 #define MAXRECOVER 20
4815 void switch_mime_getc(void)
4817 if (i_getc!=mime_getc) {
4818 i_mgetc = i_getc; i_getc = mime_getc;
4819 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4820 if(mime_f==STRICT_MIME) {
4821 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4822 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4827 void unswitch_mime_getc(void)
4829 if(mime_f==STRICT_MIME) {
4830 i_mgetc = i_mgetc_buf;
4831 i_mungetc = i_mungetc_buf;
4834 i_ungetc = i_mungetc;
4835 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4836 mime_iconv_back = NULL;
4839 nkf_char mime_begin_strict(FILE *f)
4843 const unsigned char *p,*q;
4844 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4846 mime_decode_mode = FALSE;
4847 /* =? has been checked */
4849 p = mime_pattern[j];
4852 for(i=2;p[i]>SP;i++) { /* start at =? */
4853 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4854 /* pattern fails, try next one */
4856 while (mime_pattern[++j]) {
4857 p = mime_pattern[j];
4858 for(k=2;k<i;k++) /* assume length(p) > i */
4859 if (p[k]!=q[k]) break;
4860 if (k==i && nkf_toupper(c1)==p[k]) break;
4862 p = mime_pattern[j];
4863 if (p) continue; /* found next one, continue */
4864 /* all fails, output from recovery buffer */
4872 mime_decode_mode = p[i-2];
4874 mime_iconv_back = iconv;
4875 set_iconv(FALSE, mime_priority_func[j]);
4876 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4878 if (mime_decode_mode=='B') {
4879 mimebuf_f = unbuf_f;
4881 /* do MIME integrity check */
4882 return mime_integrity(f,mime_pattern[j]);
4890 nkf_char mime_getc_buf(FILE *f)
4892 /* we don't keep eof of Fifo, because it contains ?= as
4893 a terminator. It was checked in mime_integrity. */
4894 return ((mimebuf_f)?
4895 (*i_mgetc_buf)(f):Fifo(mime_input++));
4898 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4901 (*i_mungetc_buf)(c,f);
4903 Fifo(--mime_input) = (unsigned char)c;
4907 nkf_char mime_begin(FILE *f)
4912 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4913 /* re-read and convert again from mime_buffer. */
4915 /* =? has been checked */
4917 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4918 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4919 /* We accept any character type even if it is broken up by newlines */
4920 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4921 if (c1==LF||c1==SP||c1==CR||
4922 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4924 /* Failed. But this could be another MIME preamble */
4932 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4933 if (!(++i<MAXRECOVER) || c1==EOF) break;
4934 if (c1=='b'||c1=='B') {
4935 mime_decode_mode = 'B';
4936 } else if (c1=='q'||c1=='Q') {
4937 mime_decode_mode = 'Q';
4941 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4942 if (!(++i<MAXRECOVER) || c1==EOF) break;
4944 mime_decode_mode = FALSE;
4950 if (!mime_decode_mode) {
4951 /* false MIME preamble, restart from mime_buffer */
4952 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4953 /* Since we are in MIME mode until buffer becomes empty, */
4954 /* we never go into mime_begin again for a while. */
4957 /* discard MIME preamble, and go to MIME mode */
4959 /* do no MIME integrity check */
4960 return c1; /* used only for checking EOF */
4964 void no_putc(nkf_char c)
4969 void debug(const char *str)
4972 fprintf(stderr, "%s\n", str ? str : "NULL");
4977 void set_input_codename(char *codename)
4979 if (!input_codename) {
4980 input_codename = codename;
4981 } else if (strcmp(codename, input_codename) != 0) {
4982 input_codename = "";
4986 #if !defined(PERL_XS) && !defined(WIN32DLL)
4987 void print_guessed_code(char *filename)
4989 char *codename = "BINARY";
4990 char *str_nlmode = NULL;
4991 if (filename != NULL) printf("%s: ", filename);
4992 if (input_codename && !*input_codename) {
4996 (input_codename ? input_codename : "ASCII"),
4997 input_nextline == CR ? " (CR)" :
4998 input_nextline == LF ? " (LF)" :
4999 input_nextline == CRLF ? " (CRLF)" :
5000 input_nextline == EOF ? " (MIXED NL)" :
/* Decode a two-hex-digit escape sequence.
   ch   -- escape introducer (callers in this file pass ':' and '%')
   f    -- input stream
   g, u -- getc/ungetc-style callbacks; callers pass i_cgetc/i_cungetc and
           i_ugetc/i_uungetc
   When c2 and c3 both satisfy nkf_isxdigit the result is
   (hex2bin(c2) << 4) | hex2bin(c3).  How c1..c3 are read and what happens
   when a digit test fails is on lines absent from this listing. */
5008 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5010 nkf_char c1, c2, c3;
5016 if (!nkf_isxdigit(c2)){
5021 if (!nkf_isxdigit(c3)){
/* both digits valid: combine high and low nibble */
5026 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Read one character through hex_getc, using ':' as the escape introducer
   and the i_cgetc/i_cungetc callbacks for reading and push-back. */
5029 nkf_char cap_getc(FILE *f)
5031 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Push c back onto f by delegating to the i_cungetc callback; its result
   is returned unchanged. */
5034 nkf_char cap_ungetc(nkf_char c, FILE *f)
5036 return (*i_cungetc)(c, f);
/* Read one character through hex_getc, using '%' as the escape introducer
   and the i_ugetc/i_uungetc callbacks for reading and push-back. */
5039 nkf_char url_getc(FILE *f)
5041 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Push c back onto f by delegating to the i_uungetc callback; its result
   is returned unchanged. */
5044 nkf_char url_ungetc(nkf_char c, FILE *f)
5046 return (*i_uungetc)(c, f);
5050 #ifdef NUMCHAR_OPTION
/* Read a numeric character reference using the i_ngetc/i_nungetc
   callbacks.  After an 'x' or 'X' the hexadecimal form is scanned for at
   most 7 positions (each checked with nkf_isxdigit); the other visible
   loop scans at most 8 positions checked with nkf_isdigit.  On success the
   accumulated value c is returned with CLASS_UNICODE OR-ed in.  The prefix
   matching, the scaling of c between digits and the failure path are on
   lines absent from this listing. */
5051 nkf_char numchar_getc(FILE *f)
5053 nkf_char (*g)(FILE *) = i_ngetc;
5054 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5065 if (buf[i] == 'x' || buf[i] == 'X'){
5066 for (j = 0; j < 7; j++){
5068 if (!nkf_isxdigit(buf[i])){
5075 c |= hex2bin(buf[i]);
5078 for (j = 0; j < 8; j++){
5082 if (!nkf_isdigit(buf[i])){
5089 c += hex2bin(buf[i]);
5095 return CLASS_UNICODE | c;
/* Push c back onto f by delegating to the i_nungetc callback; its result
   is returned unchanged. */
5104 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5106 return (*i_nungetc)(c, f);
5110 #ifdef UNICODE_NORMALIZATION
5112 /* Normalization Form C */
/* Input filter built on the i_nfc_getc/i_nfc_ungetc callbacks.
   normalization_table is binary-searched (bounds lower/upper, midpoint j):
   each candidate entry's nfd sequence is compared with buf element by
   element, and a mismatch moves the search to the upper or lower half.
   The last visible loop copies the selected entry's nfc sequence over the
   start of buf.  The reads that fill buf, the match test and the return
   path are on lines absent from this listing. */
5113 nkf_char nfc_getc(FILE *f)
5115 nkf_char (*g)(FILE *f) = i_nfc_getc;
5116 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5117 int i=0, j, k=1, lower, upper;
5119 const nkf_nfchar *array;
/* the outer loop only runs while buf[i] is not of the form 10xxxxxx */
5122 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5123 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5124 while (upper >= lower) {
5125 j = (lower+upper) / 2;
5126 array = normalization_table[j].nfd;
5127 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5128 if (array[k] != buf[k]){
/* table entry sorts below the input: search upward, otherwise downward */
5129 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5136 array = normalization_table[j].nfc;
5137 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5138 buf[i] = (nkf_char)(array[i]);
/* Push c back onto f by delegating to the i_nfc_ungetc callback; its
   result is returned unchanged. */
5149 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5151 return (*i_nfc_ungetc)(c, f);
5153 #endif /* UNICODE_NORMALIZATION */
5159 nkf_char c1, c2, c3, c4, cc;
5160 nkf_char t1, t2, t3, t4, mode, exit_mode;
5161 nkf_char lwsp_count;
5164 nkf_char lwsp_size = 128;
5166 if (mime_top != mime_last) { /* Something is in FIFO */
5167 return Fifo(mime_top++);
5169 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5170 mime_decode_mode=FALSE;
5171 unswitch_mime_getc();
5172 return (*i_getc)(f);
5175 if (mimebuf_f == FIXED_MIME)
5176 exit_mode = mime_decode_mode;
5179 if (mime_decode_mode == 'Q') {
5180 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5182 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5183 if (c1<=SP || DEL<=c1) {
5184 mime_decode_mode = exit_mode; /* prepare for quit */
5187 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5191 mime_decode_mode = exit_mode; /* prepare for quit */
5192 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5193 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5194 /* end Q encoding */
5195 input_mode = exit_mode;
5197 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5198 if (lwsp_buf==NULL) {
5199 perror("can't malloc");
5202 while ((c1=(*i_getc)(f))!=EOF) {
5207 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5215 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5216 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5231 lwsp_buf[lwsp_count] = (unsigned char)c1;
5232 if (lwsp_count++>lwsp_size){
5234 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5235 if (lwsp_buf_new==NULL) {
5237 perror("can't realloc");
5240 lwsp_buf = lwsp_buf_new;
5246 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5248 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5249 i_ungetc(lwsp_buf[lwsp_count],f);
5255 if (c1=='='&&c2<SP) { /* this is soft wrap */
5256 while((c1 = (*i_mgetc)(f)) <=SP) {
5257 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5259 mime_decode_mode = 'Q'; /* still in MIME */
5260 goto restart_mime_q;
5263 mime_decode_mode = 'Q'; /* still in MIME */
5267 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5268 if (c2<=SP) return c2;
5269 mime_decode_mode = 'Q'; /* still in MIME */
5270 return ((hex2bin(c2)<<4) + hex2bin(c3));
5273 if (mime_decode_mode != 'B') {
5274 mime_decode_mode = FALSE;
5275 return (*i_mgetc)(f);
5279 /* Base64 encoding */
5281 MIME allows line breaks in the middle of
5282 Base64, but we are very pessimistic when decoding
5283 in unbuffered mode because MIME-encoded text may be broken by
5284 less or an editor's control sequences (such as ESC-[-K) in unbuffered
5285 mode. Ignore incomplete MIME.
5287 mode = mime_decode_mode;
5288 mime_decode_mode = exit_mode; /* prepare for quit */
5290 while ((c1 = (*i_mgetc)(f))<=SP) {
5295 if ((c2 = (*i_mgetc)(f))<=SP) {
5298 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5299 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5302 if ((c1 == '?') && (c2 == '=')) {
5305 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5306 if (lwsp_buf==NULL) {
5307 perror("can't malloc");
5310 while ((c1=(*i_getc)(f))!=EOF) {
5315 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5323 if ((c1=(*i_getc)(f))!=EOF) {
5327 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5342 lwsp_buf[lwsp_count] = (unsigned char)c1;
5343 if (lwsp_count++>lwsp_size){
5345 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5346 if (lwsp_buf_new==NULL) {
5348 perror("can't realloc");
5351 lwsp_buf = lwsp_buf_new;
5357 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5359 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5360 i_ungetc(lwsp_buf[lwsp_count],f);
5367 if ((c3 = (*i_mgetc)(f))<=SP) {
5370 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5371 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5375 if ((c4 = (*i_mgetc)(f))<=SP) {
5378 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5379 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5383 mime_decode_mode = mode; /* still in MIME sigh... */
5385 /* BASE 64 decoding */
5387 t1 = 0x3f & base64decode(c1);
5388 t2 = 0x3f & base64decode(c2);
5389 t3 = 0x3f & base64decode(c3);
5390 t4 = 0x3f & base64decode(c4);
5391 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5393 Fifo(mime_last++) = (unsigned char)cc;
5394 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5396 Fifo(mime_last++) = (unsigned char)cc;
5397 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5399 Fifo(mime_last++) = (unsigned char)cc;
5404 return Fifo(mime_top++);
/* Push c back for the MIME reader: mime_top is decremented and c, narrowed
   to unsigned char, is stored in that Fifo slot.  f is not used on the
   visible line; the return statement is absent from this listing. */
5407 nkf_char mime_ungetc(nkf_char c, FILE *f)
5409 Fifo(--mime_top) = (unsigned char)c;
/* Pre-scan an encoded word into the Fifo before committing to decoding.
   f -- input stream, read through i_getc
   p -- NUL-terminated bytes that are copied into the Fifo first
   mime_input and mime_last both start at mime_top.  Input is appended
   until EOF or until (mime_input - mime_top) & MIME_BUF_MASK becomes 0,
   which the code treats as "buffer full".  The accepting case is
   c == '=' with d == '?' (d presumably holds the previous character; its
   assignment is not shown).  Characters other than '+', '/', '=', '?' and
   alphanumerics are tested separately.  The visible tail handles an
   incomplete word: mime_last is set to mime_input, mime_decode_mode to 1
   (serve the buffer without decoding) and switch_mime_getc() is called.
   NOTE(review): the comment opened on listing line 5417 has no terminator
   in this listing (line 5418 is absent). */
5413 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5417 /* In buffered mode, read until =? or NL or buffer full
5419 mime_input = mime_top;
5420 mime_last = mime_top;
5422 while(*p) Fifo(mime_input++) = *p++;
5425 while((c=(*i_getc)(f))!=EOF) {
5426 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5427 break; /* buffer full */
5429 if (c=='=' && d=='?') {
5430 /* checked. skip header, start decode */
5431 Fifo(mime_input++) = (unsigned char)c;
5432 /* mime_last_input = mime_input; */
5437 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5439 /* Should we check length mod 4? */
5440 Fifo(mime_input++) = (unsigned char)c;
5443 /* In case of Incomplete MIME, no MIME decode */
5444 Fifo(mime_input++) = (unsigned char)c;
5445 mime_last = mime_input; /* point undecoded buffer */
5446 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5447 switch_mime_getc(); /* anyway we need buffered getc */
/* Map a Base64 alphabet character to its 6-bit value (0-63).
   Character constants are used as plain numbers, as the inline comments
   note: 'G' stands for 'a' - 26, '4' for 52, '>' for 62 and '?' for 63
   (ASCII values).  The conditions selecting the two letter ranges, the
   condition of the last branch and the return statement are on lines
   absent from this listing. */
5451 nkf_char base64decode(nkf_char c)
5456 i = c - 'A'; /* A..Z 0-25 */
5458 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5460 } else if (c > '/') {
5461 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5462 } else if (c == '+') {
5463 i = '>' /* 62 */ ; /* + 62 */
5465 i = '?' /* 63 */ ; /* / 63 */
/* The 64-character Base64 alphabet, indexed by 6-bit value. */
5470 static const char basis_64[] =
5471 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Bits carried over between Base64 output steps; read by close_mime and
   mimeout_addchar. */
5473 static nkf_char b64c;
/* Threshold for the MIME output look-ahead buffer; the array itself has
   one extra element. */
5474 #define MIMEOUT_BUF_LENGTH (60)
5475 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5476 int mimeout_buf_count = 0; /* number of characters held in mimeout_buf */
5477 int mimeout_preserve_space = 0; /* when set, open_mime skips its leading-whitespace handling */
/* Hex digit ('0'-'9', 'A'-'F') for a value 0-15.  The argument is not
   parenthesized in the expansion and is evaluated more than once; the
   uses in mimeout_addchar pass fully parenthesized, side-effect-free
   expressions. */
5478 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/* Begin a MIME encoded word for the given output mode.
   The pattern p defaults to mime_pattern[0] and is replaced by the entry
   whose mime_encode value equals mode; mimeout_mode is taken from
   mime_encode_method at the resulting index.  Whitespace (SP, TAB, CR, LF)
   at the front of mimeout_buf is written straight through o_mputc unless
   mimeout_preserve_space is set, and that flag is then cleared.  Finally
   the buffered count is saved in j, the buffer is marked empty and the
   buffered characters are fed back through mime_putc.  Emission of the
   pattern itself and the loop headers are on lines absent from this
   listing. */
5480 void open_mime(nkf_char mode)
5482 const unsigned char *p;
5485 p = mime_pattern[0];
5486 for(i=0;mime_pattern[i];i++) {
5487 if (mode == mime_encode[i]) {
5488 p = mime_pattern[i];
5492 mimeout_mode = mime_encode_method[i];
/* line already long: special handling of a leading blank */
5495 if (base64_count>45) {
5496 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5497 (*o_mputc)(mimeout_buf[i]);
5503 if (!mimeout_preserve_space && mimeout_buf_count>0
5504 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5505 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5509 if (!mimeout_preserve_space) {
5510 for (;i<mimeout_buf_count;i++) {
5511 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5512 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5513 (*o_mputc)(mimeout_buf[i]);
5520 mimeout_preserve_space = FALSE;
/* snapshot the count and empty the buffer before re-feeding its contents */
5526 j = mimeout_buf_count;
5527 mimeout_buf_count = 0;
5529 mime_putc(mimeout_buf[i]);
/* Finish the current MIME encoded word.
   Depending on mimeout_mode, the bits still held in b64c are flushed as a
   final Base64 character: either the low 2 bits shifted left by 4 or the
   low 4 bits shifted left by 2.  The case labels, any padding output and
   the bodies of the trailing mimeout_f / mimeout_mode tests are on lines
   absent from this listing. */
5533 void close_mime(void)
5543 switch(mimeout_mode) {
5548 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5554 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5560 if (mimeout_f!=FIXED_MIME) {
5562 } else if (mimeout_mode != 'Q')
/* Encode one character c according to mimeout_mode and write the result
   through o_mputc.
   In the branch guarded by !nkf_isalnum(c) the character is written as two
   hex digits (high nibble, then low nibble) via itoh4.  The Base64 output
   lines emit, in turn: the top 6 bits of c; the low 2 bits of b64c joined
   with the top 4 bits of c; the low 4 bits of b64c joined with the top 2
   bits of c, followed by the low 6 bits of c.  The case labels and the
   updates of b64c and mimeout_mode are on lines absent from this listing. */
5567 void mimeout_addchar(nkf_char c)
5569 switch(mimeout_mode) {
5574 } else if(!nkf_isalnum(c)) {
5576 (*o_mputc)(itoh4(((c>>4)&0xf)));
5577 (*o_mputc)(itoh4((c&0xf)));
5586 (*o_mputc)(basis_64[c>>2]);
5591 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5597 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5598 (*o_mputc)(basis_64[c & 0x3F]);
5609 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/* Line-length check made before a character (c2, c1) is encoded.
   The projected width is base64_count + mimeout_buf_count/3*4.  When it
   exceeds 73 in one branch, or 66 in the other, the sequence (EOF,0),
   (0,LF), (0,SP) is sent through o_base64conv.  The conditions that
   choose between the two thresholds are on lines absent from this listing.
   Everything from the trailing "else if" onward is commented-out code that
   refers to mime_lastchar1/mime_lastchar2. */
5611 void mime_prechar(nkf_char c2, nkf_char c1)
5615 if (base64_count + mimeout_buf_count/3*4> 73){
5616 (*o_base64conv)(EOF,0);
5617 (*o_base64conv)(0,LF);
5618 (*o_base64conv)(0,SP);
5621 if (base64_count + mimeout_buf_count/3*4> 66){
5622 (*o_base64conv)(EOF,0);
5623 (*o_base64conv)(0,LF);
5624 (*o_base64conv)(0,SP);
5626 }/*else if (mime_lastchar2){
5627 if (c1 <=DEL && !nkf_isspace(c1)){
5628 (*o_base64conv)(0,SP);
5632 if (c2 && mime_lastchar2 == 0
5633 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5634 (*o_base64conv)(0,SP);
5637 /*mime_lastchar2 = c2;
5638 mime_lastchar1 = c1;*/
/* Feed one character, or EOF, to the MIME output encoder.
   With mimeout_f == FIXED_MIME the visible tests compare base64_count
   with 71, both for mode 'Q' and for the other mode, and treat EOF
   separately.
   Otherwise characters pass through mimeout_buf:
   - EOF saves the count in j, empties the buffer and passes the buffered
     characters to mimeout_addchar.
   - In mode 'Q', characters <= DEL in ASCII/ISO8859_1 output are classified
     as CR/LF, other characters <= SP, or the rest.
   - While mimeout_mode is zero, characters <= DEL in ASCII/ISO8859_1 output
     are buffered or written directly through o_mputc; open_mime(output_mode)
     is called once the buffer exceeds MIMEOUT_BUF_LENGTH and on further
     paths whose conditions are not shown.
   - With an encoded word open (modes 'B', 1, 2) buffered characters are
     either encoded through mimeout_addchar or written raw through o_mputc,
     depending on the whitespace and line-break tests.
   Many interior lines (closing braces, else branches, loop headers) are
   absent from this listing, so the nesting of the visible tests cannot be
   reconstructed from here. */
5641 void mime_putc(nkf_char c)
5646 if (mimeout_f == FIXED_MIME){
5647 if (mimeout_mode == 'Q'){
5648 if (base64_count > 71){
5649 if (c!=CR && c!=LF) {
5656 if (base64_count > 71){
5661 if (c == EOF) { /* c==EOF */
5665 if (c != EOF) { /* c!=EOF */
5671 /* mimeout_f != FIXED_MIME */
5673 if (c == EOF) { /* c==EOF */
5674 j = mimeout_buf_count;
5675 mimeout_buf_count = 0;
5678 if (!nkf_isblank(mimeout_buf[j-1])) {
5680 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5683 mimeout_addchar(mimeout_buf[i]);
5687 mimeout_addchar(mimeout_buf[i]);
5691 mimeout_addchar(mimeout_buf[i]);
5697 mimeout_addchar(mimeout_buf[i]);
5703 if (mimeout_mode=='Q') {
5704 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5705 if (c == CR || c == LF) {
5710 } else if (c <= SP) {
5712 if (base64_count > 70) {
5716 if (!nkf_isblank(c)) {
5727 if (mimeout_buf_count > 0){
5728 lastchar = mimeout_buf[mimeout_buf_count - 1];
5733 if (!mimeout_mode) {
5734 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5735 if (nkf_isspace(c)) {
5736 if (c==CR || c==LF) {
5739 for (i=0;i<mimeout_buf_count;i++) {
5740 (*o_mputc)(mimeout_buf[i]);
5741 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
5747 mimeout_buf[0] = (char)c;
5748 mimeout_buf_count = 1;
5750 if (base64_count > 1
5751 && base64_count + mimeout_buf_count > 76
5752 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5755 if (!nkf_isspace(mimeout_buf[0])){
5760 mimeout_buf[mimeout_buf_count++] = (char)c;
5761 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5762 open_mime(output_mode);
5767 if (lastchar==CR || lastchar == LF){
5768 for (i=0;i<mimeout_buf_count;i++) {
5769 (*o_mputc)(mimeout_buf[i]);
5772 mimeout_buf_count = 0;
5775 for (i=0;i<mimeout_buf_count-1;i++) {
5776 (*o_mputc)(mimeout_buf[i]);
5779 mimeout_buf[0] = SP;
5780 mimeout_buf_count = 1;
5782 open_mime(output_mode);
5785 /* mimeout_mode == 'B', 1, 2 */
5786 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5787 if (lastchar == CR || lastchar == LF){
5788 if (nkf_isblank(c)) {
5789 for (i=0;i<mimeout_buf_count;i++) {
5790 mimeout_addchar(mimeout_buf[i]);
5792 mimeout_buf_count = 0;
5793 } else if (SP<c && c<DEL) {
5795 for (i=0;i<mimeout_buf_count;i++) {
5796 (*o_mputc)(mimeout_buf[i]);
5799 mimeout_buf_count = 0;
5802 if (c==SP || c==TAB || c==CR || c==LF) {
5803 for (i=0;i<mimeout_buf_count;i++) {
5804 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5806 for (i=0;i<mimeout_buf_count;i++) {
5807 (*o_mputc)(mimeout_buf[i]);
5810 mimeout_buf_count = 0;
5813 mimeout_buf[mimeout_buf_count++] = (char)c;
5814 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5816 for (i=0;i<mimeout_buf_count;i++) {
5817 (*o_mputc)(mimeout_buf[i]);
5820 mimeout_buf_count = 0;
5824 if (mimeout_buf_count>0 && SP<c && c!='=') {
5825 mimeout_buf[mimeout_buf_count++] = (char)c;
5826 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5827 j = mimeout_buf_count;
5828 mimeout_buf_count = 0;
5830 mimeout_addchar(mimeout_buf[i]);
5837 if (mimeout_buf_count>0) {
5838 j = mimeout_buf_count;
5839 mimeout_buf_count = 0;
5841 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
5843 mimeout_addchar(mimeout_buf[i]);
5849 (*o_mputc)(mimeout_buf[i]);
5851 open_mime(output_mode);
5858 #if defined(PERL_XS) || defined(WIN32DLL)
5862 struct input_code *p = input_code_list;
5875 mime_f = STRICT_MIME;
5876 mime_decode_f = FALSE;
5881 #if defined(MSDOS) || defined(__OS2__)
5886 iso2022jp_f = FALSE;
5887 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5888 ms_ucs_map_f = UCS_MAP_ASCII;
5890 #ifdef UTF8_INPUT_ENABLE
5891 no_cp932ext_f = FALSE;
5892 no_best_fit_chars_f = FALSE;
5893 encode_fallback = NULL;
5894 unicode_subchar = '?';
5895 input_endian = ENDIAN_BIG;
5897 #ifdef UTF8_OUTPUT_ENABLE
5898 output_bom_f = FALSE;
5899 output_endian = ENDIAN_BIG;
5901 #ifdef UNICODE_NORMALIZATION
5917 #ifdef SHIFTJIS_CP932
5927 for (i = 0; i < 256; i++){
5928 prefix_table[i] = 0;
5932 mimeout_buf_count = 0;
5937 fold_preserve_f = FALSE;
5940 kanji_intro = DEFAULT_J;
5941 ascii_intro = DEFAULT_R;
5942 fold_margin = FOLD_MARGIN;
5943 output_conv = DEFAULT_CONV;
5944 oconv = DEFAULT_CONV;
5945 o_zconv = no_connection;
5946 o_fconv = no_connection;
5947 o_nlconv = no_connection;
5948 o_rot_conv = no_connection;
5949 o_hira_conv = no_connection;
5950 o_base64conv = no_connection;
5951 o_iso2022jp_check_conv = no_connection;
5954 i_ungetc = std_ungetc;
5956 i_bungetc = std_ungetc;
5959 i_mungetc = std_ungetc;
5960 i_mgetc_buf = std_getc;
5961 i_mungetc_buf = std_ungetc;
5962 output_mode = ASCII;
5965 mime_decode_mode = FALSE;
5973 z_prev2=0,z_prev1=0;
5975 iconv_for_check = 0;
5977 input_codename = NULL;
/* Two-argument form of the "module not connected" handler: forwards c2
   and c1 to no_connection2 with 0 as the third argument and ignores the
   result.  In this file it is the initial value assigned to the o_*conv
   hooks. */
5984 void no_connection(nkf_char c2, nkf_char c1)
5986 no_connection2(c2,c1,0);
/* Report an internal wiring error: writes
   "nkf internal module connection failure." to stderr.  None of the three
   arguments is used on the visible lines.  Listing line 5992 is absent, so
   whatever happens between the message and the return is not visible; the
   trailing return 0 is marked as being there for lint. */
5989 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5991 fprintf(stderr,"nkf internal module connection failure.\n");
5993 return 0; /* LINT */
5998 #define fprintf dllprintf
6002 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6003 fprintf(stderr,"Flags:\n");
6004 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6005 #ifdef DEFAULT_CODE_SJIS
6006 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6008 #ifdef DEFAULT_CODE_JIS
6009 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6011 #ifdef DEFAULT_CODE_EUC
6012 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6014 #ifdef DEFAULT_CODE_UTF8
6015 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6017 #ifdef UTF8_OUTPUT_ENABLE
6018 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6020 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6021 #ifdef UTF8_INPUT_ENABLE
6022 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6024 fprintf(stderr,"t no conversion\n");
6025 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6026 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6027 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6028 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6029 fprintf(stderr,"v Show this usage. V: show version\n");
6030 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6031 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6032 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6033 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6034 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6035 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6036 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6037 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6038 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6040 fprintf(stderr,"T Text mode output\n");
6042 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6043 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6044 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6045 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6046 fprintf(stderr,"\n");
6047 fprintf(stderr,"Long name options\n");
6048 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6049 fprintf(stderr," Specify the input or output codeset\n");
6050 fprintf(stderr," --fj --unix --mac --windows\n");
6051 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6052 fprintf(stderr," Convert for the system or code\n");
6053 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6054 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6055 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6057 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6059 #ifdef NUMCHAR_OPTION
6060 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6062 #ifdef UTF8_INPUT_ENABLE
6063 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6064 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6067 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6068 fprintf(stderr," Overwrite original listed files by filtered result\n");
6069 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6071 fprintf(stderr," -g --guess Guess the input code\n");
6072 fprintf(stderr," --help --version Show this help/the version\n");
6073 fprintf(stderr," For more information, see also man nkf\n");
6074 fprintf(stderr,"\n");
6080 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6081 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6084 #if defined(MSDOS) && defined(__WIN16__)
6087 #if defined(MSDOS) && defined(__WIN32__)
6093 ,NKF_VERSION,NKF_RELEASE_DATE);
6094 fprintf(stderr,"\n%s\n",CopyRight);