1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SorceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.142 2007/10/05 10:57:50 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-10-05"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
42 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
44 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
60 #if defined(MSDOS) || defined(__OS2__)
63 #if defined(_MSC_VER) || defined(__WATCOMC__)
64 #define mktemp _mktemp
70 #define setbinmode(fp) fsetbin(fp)
71 #elif defined(__DJGPP__)
72 #include <libc/dosio.h>
73 #define setbinmode(fp) djgpp_setbinmode(fp)
74 #else /* Microsoft C, Turbo C */
75 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
78 #define setbinmode(fp)
81 #if defined(__DJGPP__)
82 void djgpp_setbinmode(FILE *fp)
84 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
87 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
88 __file_handle_set(fd, m);
92 #ifdef _IOFBF /* SysV and MSDOS, Windows */
93 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
95 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
98 /*Borland C++ 4.5 EasyWin*/
99 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
108 /* added by satoru@isoternet.org */
110 #include <sys/types.h>
112 #include <sys/stat.h>
113 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
115 #if defined(__WATCOMC__)
116 #include <sys/utime.h>
120 #else /* defined(MSDOS) */
122 #ifdef __BORLANDC__ /* BCC32 */
124 #else /* !defined(__BORLANDC__) */
125 #include <sys/utime.h>
126 #endif /* (__BORLANDC__) */
127 #else /* !defined(__WIN32__) */
128 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
129 #include <sys/utime.h>
130 #elif defined(__TURBOC__) /* BCC */
132 #elif defined(LSI_C) /* LSI C */
133 #endif /* (__WIN32__) */
141 /* state of output_mode and input_mode
158 #define X0213_1 0x284F
159 #define X0213_2 0x2850
161 /* Input Assumption */
166 #define LATIN1_INPUT 6
168 #define STRICT_MIME 8
173 #define JAPANESE_EUC 10
177 #define UTF8_INPUT 13
178 #define UTF16_INPUT 1015
179 #define UTF32_INPUT 1017
183 #define ENDIAN_BIG 1234
184 #define ENDIAN_LITTLE 4321
185 #define ENDIAN_2143 2143
186 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification and conversion helpers.
 *
 * These deliberately avoid <ctype.h> so the result never depends on the
 * current locale.  Every parameter is parenthesized so that a compound
 * argument (e.g. `a | b` or `cond ? x : y`) expands with the intended
 * precedence.  The argument is still evaluated more than once, so never
 * pass an expression with side effects (`*p++`, `getc(f)`, ...).
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; any other character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the upper byte of the low 16 bits of c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)

/* Inclusive bounds used by is_ibmext_in_sjis() and the CP932 tables. */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/* True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END]. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
233 #define HOLD_SIZE 1024
234 #if defined(INT_IS_SHORT)
235 #define IOBUF_SIZE 2048
237 #define IOBUF_SIZE 16384
240 #define DEFAULT_J 'B'
241 #define DEFAULT_R 'B'
243 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
244 #define SJ6394 0x0161 /* 63 - 94 ku offset */
246 #define RANGE_NUM_MAX 18
251 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
252 #define sizeof_euc_to_utf8_1byte 94
253 #define sizeof_euc_to_utf8_2bytes 94
254 #define sizeof_utf8_to_euc_C2 64
255 #define sizeof_utf8_to_euc_E5B8 64
256 #define sizeof_utf8_to_euc_2bytes 112
257 #define sizeof_utf8_to_euc_3bytes 16
260 /* MIME preprocessor */
262 #ifdef EASYWIN /*Easy Win */
263 extern POINT _BufferSize;
272 void (*status_func)(struct input_code *, nkf_char);
273 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
277 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
280 static const char *CopyRight = COPY_RIGHT;
282 #if !defined(PERL_XS) && !defined(WIN32DLL)
283 static nkf_char noconvert(FILE *f);
285 static void module_connection(void);
286 static nkf_char kanji_convert(FILE *f);
287 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
288 static nkf_char push_hold_buf(nkf_char c2);
289 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
290 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
291 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
292 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
293 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
295 * 0: Shift_JIS, eucJP-ascii
300 #define UCS_MAP_ASCII 0
302 #define UCS_MAP_CP932 2
303 #define UCS_MAP_CP10001 3
304 static int ms_ucs_map_f = UCS_MAP_ASCII;
306 #ifdef UTF8_INPUT_ENABLE
307 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
308 static int no_cp932ext_f = FALSE;
309 /* ignore ZERO WIDTH NO-BREAK SPACE */
310 static int no_best_fit_chars_f = FALSE;
311 static int input_endian = ENDIAN_BIG;
312 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
313 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
314 static void encode_fallback_html(nkf_char c);
315 static void encode_fallback_xml(nkf_char c);
316 static void encode_fallback_java(nkf_char c);
317 static void encode_fallback_perl(nkf_char c);
318 static void encode_fallback_subchar(nkf_char c);
319 static void (*encode_fallback)(nkf_char c) = NULL;
320 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
321 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
322 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
323 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
324 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
325 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
326 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
327 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
328 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
329 static void w_status(struct input_code *, nkf_char);
331 #ifdef UTF8_OUTPUT_ENABLE
332 static int output_bom_f = FALSE;
333 static int output_endian = ENDIAN_BIG;
334 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
335 static void w_oconv(nkf_char c2,nkf_char c1);
336 static void w_oconv16(nkf_char c2,nkf_char c1);
337 static void w_oconv32(nkf_char c2,nkf_char c1);
339 static void e_oconv(nkf_char c2,nkf_char c1);
340 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
341 static void s_oconv(nkf_char c2,nkf_char c1);
342 static void j_oconv(nkf_char c2,nkf_char c1);
343 static void fold_conv(nkf_char c2,nkf_char c1);
344 static void nl_conv(nkf_char c2,nkf_char c1);
345 static void z_conv(nkf_char c2,nkf_char c1);
346 static void rot_conv(nkf_char c2,nkf_char c1);
347 static void hira_conv(nkf_char c2,nkf_char c1);
348 static void base64_conv(nkf_char c2,nkf_char c1);
349 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
350 static void no_connection(nkf_char c2,nkf_char c1);
351 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
353 static void code_score(struct input_code *ptr);
354 static void code_status(nkf_char c);
356 static void std_putc(nkf_char c);
357 static nkf_char std_getc(FILE *f);
358 static nkf_char std_ungetc(nkf_char c,FILE *f);
360 static nkf_char broken_getc(FILE *f);
361 static nkf_char broken_ungetc(nkf_char c,FILE *f);
363 static nkf_char mime_begin(FILE *f);
364 static nkf_char mime_getc(FILE *f);
365 static nkf_char mime_ungetc(nkf_char c,FILE *f);
367 static void switch_mime_getc(void);
368 static void unswitch_mime_getc(void);
369 static nkf_char mime_begin_strict(FILE *f);
370 static nkf_char mime_getc_buf(FILE *f);
371 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
372 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
374 static nkf_char base64decode(nkf_char c);
375 static void mime_prechar(nkf_char c2, nkf_char c1);
376 static void mime_putc(nkf_char c);
377 static void open_mime(nkf_char c);
378 static void close_mime(void);
379 static void eof_mime(void);
380 static void mimeout_addchar(nkf_char c);
382 static void usage(void);
383 static void version(void);
385 static void options(unsigned char *c);
386 static void reinit(void);
390 #if !defined(PERL_XS) && !defined(WIN32DLL)
391 static unsigned char stdibuf[IOBUF_SIZE];
392 static unsigned char stdobuf[IOBUF_SIZE];
394 static unsigned char hold_buf[HOLD_SIZE*2];
395 static int hold_count = 0;
397 /* MIME preprocessor fifo */
399 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
400 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
401 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
402 static unsigned char mime_buf[MIME_BUF_SIZE];
403 static unsigned int mime_top = 0;
404 static unsigned int mime_last = 0; /* decoded */
405 static unsigned int mime_input = 0; /* undecoded */
406 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
409 static int unbuf_f = FALSE;
410 static int estab_f = FALSE;
411 static int nop_f = FALSE;
412 static int binmode_f = TRUE; /* binary mode */
413 static int rot_f = FALSE; /* rot14/43 mode */
414 static int hira_f = FALSE; /* hira/kata henkan */
415 static int input_f = FALSE; /* non fixed input code */
416 static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */
417 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
418 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
419 static int mimebuf_f = FALSE; /* MIME buffered input */
420 static int broken_f = FALSE; /* convert ESC-less broken JIS */
421 static int iso8859_f = FALSE; /* ISO8859 through */
422 static int mimeout_f = FALSE; /* base64 mode */
423 #if defined(MSDOS) || defined(__OS2__)
424 static int x0201_f = TRUE; /* Assume JISX0201 kana */
426 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
428 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
430 #ifdef UNICODE_NORMALIZATION
431 static int nfc_f = FALSE;
432 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
433 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
434 static nkf_char nfc_getc(FILE *f);
435 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
439 static int cap_f = FALSE;
440 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
441 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
442 static nkf_char cap_getc(FILE *f);
443 static nkf_char cap_ungetc(nkf_char c,FILE *f);
445 static int url_f = FALSE;
446 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
447 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
448 static nkf_char url_getc(FILE *f);
449 static nkf_char url_ungetc(nkf_char c,FILE *f);
452 #if defined(INT_IS_SHORT)
453 #define NKF_INT32_C(n) (n##L)
455 #define NKF_INT32_C(n) (n)
/*
 * Bit layout constants for nkf_char values.
 *
 * CLASS_MASK selects the top 8 bits and VALUE_MASK the low 24 bits of a
 * 32-bit value.  A value whose top byte equals CLASS_UNICODE (0x01) is
 * treated by is_unicode_capsule() as carrying a Unicode code point in its
 * low 24 bits; is_unicode_bmp() tests whether that code point is <= 0xFFFF.
 * NOTE(review): PREFIX_EUCG3 looks like the tag for EUC G3 characters
 * (high byte 0x8F) -- confirm against its users.
 *
 * The macro parameters are parenthesized so that compound arguments such as
 * `CLASS_UNICODE | cp` expand with the intended precedence.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
465 #ifdef NUMCHAR_OPTION
466 static int numchar_f = FALSE;
467 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
468 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
469 static nkf_char numchar_getc(FILE *f);
470 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
474 static int noout_f = FALSE;
475 static void no_putc(nkf_char c);
476 static int debug_f = FALSE;
477 static void debug(const char *str);
478 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
481 static int guess_f = FALSE;
483 static void print_guessed_code(char *filename);
485 static void set_input_codename(char *codename);
488 static int exec_f = 0;
491 #ifdef SHIFTJIS_CP932
492 /* invert IBM extended characters to others */
493 static int cp51932_f = FALSE;
495 /* invert NEC-selected IBM extended characters to IBM extended characters */
496 static int cp932inv_f = TRUE;
498 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
499 #endif /* SHIFTJIS_CP932 */
502 static int x0212_f = FALSE;
503 static nkf_char x0212_shift(nkf_char c);
504 static nkf_char x0212_unshift(nkf_char c);
506 static int x0213_f = FALSE;
508 static unsigned char prefix_table[256];
510 static void set_code_score(struct input_code *ptr, nkf_char score);
511 static void clr_code_score(struct input_code *ptr, nkf_char score);
512 static void status_disable(struct input_code *ptr);
513 static void status_push_ch(struct input_code *ptr, nkf_char c);
514 static void status_clear(struct input_code *ptr);
515 static void status_reset(struct input_code *ptr);
516 static void status_reinit(struct input_code *ptr);
517 static void status_check(struct input_code *ptr, nkf_char c);
518 static void e_status(struct input_code *, nkf_char);
519 static void s_status(struct input_code *, nkf_char);
521 struct input_code input_code_list[] = {
522 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
523 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
524 #ifdef UTF8_INPUT_ENABLE
525 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
526 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
527 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
532 static int mimeout_mode = 0;
533 static int base64_count = 0;
535 /* X0208 -> ASCII converter */
538 static int f_line = 0; /* chars in line */
539 static int f_prev = 0;
540 static int fold_preserve_f = FALSE; /* preserve new lines */
541 static int fold_f = FALSE;
542 static int fold_len = 0;
545 static unsigned char kanji_intro = DEFAULT_J;
546 static unsigned char ascii_intro = DEFAULT_R;
550 #define FOLD_MARGIN 10
551 #define DEFAULT_FOLD 60
553 static int fold_margin = FOLD_MARGIN;
557 #ifdef DEFAULT_CODE_JIS
558 # define DEFAULT_CONV j_oconv
560 #ifdef DEFAULT_CODE_SJIS
561 # define DEFAULT_CONV s_oconv
563 #ifdef DEFAULT_CODE_EUC
564 # define DEFAULT_CONV e_oconv
566 #ifdef DEFAULT_CODE_UTF8
567 # define DEFAULT_CONV w_oconv
570 /* process default */
571 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
573 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
574 /* s_iconv or oconv */
575 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
577 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
578 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
579 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
580 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
581 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
582 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
585 /* static redirections */
587 static void (*o_putc)(nkf_char c) = std_putc;
589 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
590 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
592 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
593 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
595 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
597 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
598 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
600 /* for strict mime */
601 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
602 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
605 static int output_mode = ASCII, /* output kanji mode */
606 input_mode = ASCII, /* input kanji mode */
607 shift_mode = FALSE; /* TRUE shift out, or X0201 */
608 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
610 /* X0201 / X0208 conversion tables */
612 /* X0201 kana conversion table */
614 static const unsigned char cv[]= {
615 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
616 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
617 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
618 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
619 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
620 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
621 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
622 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
623 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
624 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
625 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
626 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
627 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
628 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
629 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
630 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
/* X0201 kana conversion table for dakuten */
636 static const unsigned char dv[]= {
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
638 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
642 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
643 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
644 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
645 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
646 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
647 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
648 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* X0201 kana conversion table for han-dakuten */
657 static const unsigned char ev[]= {
658 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
669 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
677 /* X0208 kigou conversion table */
678 /* 0x8140 - 0x819e */
679 static const unsigned char fv[] = {
681 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
682 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
683 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
684 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
685 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
686 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
687 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
688 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
689 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
690 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
691 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
692 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
697 static int file_out_f = FALSE;
699 static int overwrite_f = FALSE;
700 static int preserve_time_f = FALSE;
701 static int backup_f = FALSE;
702 static char *backup_suffix = "";
703 static char *get_backup_filename(const char *suffix, const char *filename);
706 static int nlmode_f = 0; /* CR, LF, CRLF */
707 static int input_nextline = 0; /* 0: unestablished, EOF: MIXED */
708 static nkf_char prev_cr = 0; /* CR or 0 */
709 #ifdef EASYWIN /*Easy Win */
710 static int end_check;
713 #define STD_GC_BUFSIZE (256)
714 nkf_char std_gc_buf[STD_GC_BUFSIZE];
718 #include "nkf32dll.c"
719 #elif defined(PERL_XS)
721 int main(int argc, char **argv)
726 char *outfname = NULL;
729 #ifdef EASYWIN /*Easy Win */
730 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
733 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
734 cp = (unsigned char *)*argv;
738 int debug_f_back = debug_f;
741 int exec_f_back = exec_f;
744 int x0212_f_back = x0212_f;
747 int x0213_f_back = x0213_f;
753 debug_f = debug_f_back;
756 exec_f = exec_f_back;
759 x0212_f = x0212_f_back;
762 x0213_f = x0213_f_back;
768 if (pipe(fds) < 0 || (pid = fork()) < 0){
779 execvp(argv[1], &argv[1]);
793 if(x0201_f == WISH_TRUE)
794 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
796 if (binmode_f == TRUE)
797 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
798 if (freopen("","wb",stdout) == NULL)
805 setbuf(stdout, (char *) NULL);
807 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
810 if (binmode_f == TRUE)
811 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
812 if (freopen("","rb",stdin) == NULL) return (-1);
816 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
820 kanji_convert(stdin);
821 if (guess_f) print_guessed_code(NULL);
825 int is_argument_error = FALSE;
827 input_codename = NULL;
832 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
835 is_argument_error = TRUE;
843 /* reopen file for stdout */
844 if (file_out_f == TRUE) {
847 outfname = malloc(strlen(origfname)
848 + strlen(".nkftmpXXXXXX")
854 strcpy(outfname, origfname);
858 for (i = strlen(outfname); i; --i){
859 if (outfname[i - 1] == '/'
860 || outfname[i - 1] == '\\'){
866 strcat(outfname, "ntXXXXXX");
868 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
871 strcat(outfname, ".nkftmpXXXXXX");
872 fd = mkstemp(outfname);
875 || (fd_backup = dup(fileno(stdout))) < 0
876 || dup2(fd, fileno(stdout)) < 0
887 outfname = "nkf.out";
890 if(freopen(outfname, "w", stdout) == NULL) {
894 if (binmode_f == TRUE) {
895 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
896 if (freopen("","wb",stdout) == NULL)
903 if (binmode_f == TRUE)
904 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
905 if (freopen("","rb",fin) == NULL)
910 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
914 char *filename = NULL;
916 if (nfiles > 1) filename = origfname;
917 if (guess_f) print_guessed_code(filename);
923 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
931 if (dup2(fd_backup, fileno(stdout)) < 0){
934 if (stat(origfname, &sb)) {
935 fprintf(stderr, "Can't stat %s\n", origfname);
937 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
938 if (chmod(outfname, sb.st_mode)) {
939 fprintf(stderr, "Can't set permission %s\n", outfname);
942 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
944 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
945 tb[0] = tb[1] = sb.st_mtime;
946 if (utime(outfname, tb)) {
947 fprintf(stderr, "Can't set timestamp %s\n", outfname);
950 tb.actime = sb.st_atime;
951 tb.modtime = sb.st_mtime;
952 if (utime(outfname, &tb)) {
953 fprintf(stderr, "Can't set timestamp %s\n", outfname);
958 char *backup_filename = get_backup_filename(backup_suffix, origfname);
960 unlink(backup_filename);
962 if (rename(origfname, backup_filename)) {
963 perror(backup_filename);
964 fprintf(stderr, "Can't rename %s to %s\n",
965 origfname, backup_filename);
969 if (unlink(origfname)){
974 if (rename(outfname, origfname)) {
976 fprintf(stderr, "Can't rename %s to %s\n",
977 outfname, origfname);
984 if (is_argument_error)
987 #ifdef EASYWIN /*Easy Win */
988 if (file_out_f == FALSE)
989 scanf("%d",&end_check);
992 #else /* for Other OS */
993 if (file_out_f == TRUE)
998 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename`.
 *
 * If `suffix` contains one or more '*' characters it is used as a template:
 * every '*' is replaced by `filename` and all other characters are copied
 * verbatim.  Otherwise `suffix` is simply appended to `filename`.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller owns,
 * or NULL (after reporting through perror) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t total_length;
    size_t i, j;
    char *backup_filename;

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each one-byte '*' is replaced by the whole file name */
        total_length = suffix_length - asterisk_count
            + asterisk_count * filename_length;
    } else {
        total_length = filename_length + suffix_length;
    }

    backup_filename = malloc(total_length + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; suffix[i]; i++) {
            if (suffix[i] == '*') {
                /* the NUL written here is overwritten by the next copy
                   or by the final terminator below */
                strcpy(backup_filename + j, filename);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        strcpy(backup_filename, filename);
        strcat(backup_filename, suffix);
    }
    return backup_filename;
}
1041 static const struct {
1065 {"katakana-hiragana","h3"},
1072 #ifdef UTF8_OUTPUT_ENABLE
1082 {"fb-subchar=", ""},
1084 #ifdef UTF8_INPUT_ENABLE
1085 {"utf8-input", "W"},
1086 {"utf16-input", "W16"},
1087 {"no-cp932ext", ""},
1088 {"no-best-fit-chars",""},
1090 #ifdef UNICODE_NORMALIZATION
1091 {"utf8mac-input", ""},
1103 #ifdef NUMCHAR_OPTION
1104 {"numchar-input", ""},
1110 #ifdef SHIFTJIS_CP932
1120 static int option_mode = 0;
1122 void options(unsigned char *cp)
1126 unsigned char *cp_back = NULL;
1131 while(*cp && *cp++!='-');
1132 while (*cp || cp_back) {
1140 case '-': /* literal options */
1141 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1145 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1146 p = (unsigned char *)long_option[i].name;
1147 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1148 if (*p == cp[j] || cp[j] == SP){
1155 fprintf(stderr, "unknown long option: --%s\n", cp);
1158 while(*cp && *cp != SP && cp++);
1159 if (long_option[i].alias[0]){
1161 cp = (unsigned char *)long_option[i].alias;
1163 if (strcmp(long_option[i].name, "ic=") == 0){
1164 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1165 codeset[i] = nkf_toupper(p[i]);
1168 if(strcmp(codeset, "ISO-2022-JP") == 0){
1169 input_f = JIS_INPUT;
1170 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1171 strcmp(codeset, "CP50220") == 0 ||
1172 strcmp(codeset, "CP50221") == 0 ||
1173 strcmp(codeset, "CP50222") == 0){
1174 input_f = JIS_INPUT;
1175 #ifdef SHIFTJIS_CP932
1178 #ifdef UTF8_OUTPUT_ENABLE
1179 ms_ucs_map_f = UCS_MAP_CP932;
1181 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1182 input_f = JIS_INPUT;
1186 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1187 input_f = JIS_INPUT;
1192 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1193 input_f = SJIS_INPUT;
1194 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1195 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1196 strcmp(codeset, "CP932") == 0 ||
1197 strcmp(codeset, "MS932") == 0){
1198 input_f = SJIS_INPUT;
1199 #ifdef SHIFTJIS_CP932
1202 #ifdef UTF8_OUTPUT_ENABLE
1203 ms_ucs_map_f = UCS_MAP_CP932;
1205 }else if(strcmp(codeset, "CP10001") == 0){
1206 input_f = SJIS_INPUT;
1207 #ifdef SHIFTJIS_CP932
1210 #ifdef UTF8_OUTPUT_ENABLE
1211 ms_ucs_map_f = UCS_MAP_CP10001;
1213 }else if(strcmp(codeset, "EUCJP") == 0 ||
1214 strcmp(codeset, "EUC-JP") == 0){
1215 input_f = EUC_INPUT;
1216 }else if(strcmp(codeset, "CP51932") == 0){
1217 input_f = EUC_INPUT;
1218 #ifdef SHIFTJIS_CP932
1221 #ifdef UTF8_OUTPUT_ENABLE
1222 ms_ucs_map_f = UCS_MAP_CP932;
1224 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1225 strcmp(codeset, "EUCJP-MS") == 0 ||
1226 strcmp(codeset, "EUCJPMS") == 0){
1227 input_f = EUC_INPUT;
1228 #ifdef SHIFTJIS_CP932
1231 #ifdef UTF8_OUTPUT_ENABLE
1232 ms_ucs_map_f = UCS_MAP_MS;
1234 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1235 strcmp(codeset, "EUCJP-ASCII") == 0){
1236 input_f = EUC_INPUT;
1237 #ifdef SHIFTJIS_CP932
1240 #ifdef UTF8_OUTPUT_ENABLE
1241 ms_ucs_map_f = UCS_MAP_ASCII;
1243 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1244 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1245 input_f = SJIS_INPUT;
1247 #ifdef SHIFTJIS_CP932
1250 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1251 strcmp(codeset, "EUC-JIS-2004") == 0){
1252 input_f = EUC_INPUT;
1254 #ifdef SHIFTJIS_CP932
1257 #ifdef UTF8_INPUT_ENABLE
1258 }else if(strcmp(codeset, "UTF-8") == 0 ||
1259 strcmp(codeset, "UTF-8N") == 0 ||
1260 strcmp(codeset, "UTF-8-BOM") == 0){
1261 input_f = UTF8_INPUT;
1262 #ifdef UNICODE_NORMALIZATION
1263 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1264 strcmp(codeset, "UTF-8-MAC") == 0){
1265 input_f = UTF8_INPUT;
1268 }else if(strcmp(codeset, "UTF-16") == 0 ||
1269 strcmp(codeset, "UTF-16BE") == 0 ||
1270 strcmp(codeset, "UTF-16BE-BOM") == 0){
1271 input_f = UTF16_INPUT;
1272 input_endian = ENDIAN_BIG;
1273 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1274 strcmp(codeset, "UTF-16LE-BOM") == 0){
1275 input_f = UTF16_INPUT;
1276 input_endian = ENDIAN_LITTLE;
1277 }else if(strcmp(codeset, "UTF-32") == 0 ||
1278 strcmp(codeset, "UTF-32BE") == 0 ||
1279 strcmp(codeset, "UTF-32BE-BOM") == 0){
1280 input_f = UTF32_INPUT;
1281 input_endian = ENDIAN_BIG;
1282 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1283 strcmp(codeset, "UTF-32LE-BOM") == 0){
1284 input_f = UTF32_INPUT;
1285 input_endian = ENDIAN_LITTLE;
1288 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1292 if (strcmp(long_option[i].name, "oc=") == 0){
1294 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1295 codeset[i] = nkf_toupper(p[i]);
1298 if(strcmp(codeset, "ISO-2022-JP") == 0){
1299 output_conv = j_oconv;
1300 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1301 output_conv = j_oconv;
1302 no_cp932ext_f = TRUE;
1303 #ifdef SHIFTJIS_CP932
1306 #ifdef UTF8_OUTPUT_ENABLE
1307 ms_ucs_map_f = UCS_MAP_CP932;
1309 }else if(strcmp(codeset, "CP50220") == 0){
1310 output_conv = j_oconv;
1312 #ifdef SHIFTJIS_CP932
1315 #ifdef UTF8_OUTPUT_ENABLE
1316 ms_ucs_map_f = UCS_MAP_CP932;
1318 }else if(strcmp(codeset, "CP50221") == 0){
1319 output_conv = j_oconv;
1320 #ifdef SHIFTJIS_CP932
1323 #ifdef UTF8_OUTPUT_ENABLE
1324 ms_ucs_map_f = UCS_MAP_CP932;
1326 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1327 output_conv = j_oconv;
1331 #ifdef SHIFTJIS_CP932
1334 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1335 output_conv = j_oconv;
1340 #ifdef SHIFTJIS_CP932
1343 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1344 output_conv = s_oconv;
1345 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1346 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1347 strcmp(codeset, "CP932") == 0 ||
1348 strcmp(codeset, "MS932") == 0){
1349 output_conv = s_oconv;
1350 #ifdef UTF8_OUTPUT_ENABLE
1351 ms_ucs_map_f = UCS_MAP_CP932;
1353 }else if(strcmp(codeset, "CP10001") == 0){
1354 output_conv = s_oconv;
1355 #ifdef UTF8_OUTPUT_ENABLE
1356 ms_ucs_map_f = UCS_MAP_CP10001;
1358 }else if(strcmp(codeset, "EUCJP") == 0 ||
1359 strcmp(codeset, "EUC-JP") == 0){
1360 output_conv = e_oconv;
1361 }else if(strcmp(codeset, "CP51932") == 0){
1362 output_conv = e_oconv;
1363 #ifdef SHIFTJIS_CP932
1366 #ifdef UTF8_OUTPUT_ENABLE
1367 ms_ucs_map_f = UCS_MAP_CP932;
1369 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1370 strcmp(codeset, "EUCJP-MS") == 0 ||
1371 strcmp(codeset, "EUCJPMS") == 0){
1372 output_conv = e_oconv;
1376 #ifdef UTF8_OUTPUT_ENABLE
1377 ms_ucs_map_f = UCS_MAP_MS;
1379 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1380 strcmp(codeset, "EUCJP-ASCII") == 0){
1381 output_conv = e_oconv;
1385 #ifdef UTF8_OUTPUT_ENABLE
1386 ms_ucs_map_f = UCS_MAP_ASCII;
1388 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1389 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1390 output_conv = s_oconv;
1392 #ifdef SHIFTJIS_CP932
1395 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1396 strcmp(codeset, "EUC-JIS-2004") == 0){
1397 output_conv = e_oconv;
1402 #ifdef SHIFTJIS_CP932
1405 #ifdef UTF8_OUTPUT_ENABLE
1406 }else if(strcmp(codeset, "UTF-8") == 0){
1407 output_conv = w_oconv;
1408 }else if(strcmp(codeset, "UTF-8N") == 0){
1409 output_conv = w_oconv;
1410 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1411 output_conv = w_oconv;
1412 output_bom_f = TRUE;
1413 }else if(strcmp(codeset, "UTF-16BE") == 0){
1414 output_conv = w_oconv16;
1415 }else if(strcmp(codeset, "UTF-16") == 0 ||
1416 strcmp(codeset, "UTF-16BE-BOM") == 0){
1417 output_conv = w_oconv16;
1418 output_bom_f = TRUE;
1419 }else if(strcmp(codeset, "UTF-16LE") == 0){
1420 output_conv = w_oconv16;
1421 output_endian = ENDIAN_LITTLE;
1422 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1423 output_conv = w_oconv16;
1424 output_endian = ENDIAN_LITTLE;
1425 output_bom_f = TRUE;
1426 }else if(strcmp(codeset, "UTF-32") == 0 ||
1427 strcmp(codeset, "UTF-32BE") == 0){
1428 output_conv = w_oconv32;
1429 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1430 output_conv = w_oconv32;
1431 output_bom_f = TRUE;
1432 }else if(strcmp(codeset, "UTF-32LE") == 0){
1433 output_conv = w_oconv32;
1434 output_endian = ENDIAN_LITTLE;
1435 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1436 output_conv = w_oconv32;
1437 output_endian = ENDIAN_LITTLE;
1438 output_bom_f = TRUE;
1441 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1446 if (strcmp(long_option[i].name, "overwrite") == 0){
1449 preserve_time_f = TRUE;
1452 if (strcmp(long_option[i].name, "overwrite=") == 0){
1455 preserve_time_f = TRUE;
1457 backup_suffix = malloc(strlen((char *) p) + 1);
1458 strcpy(backup_suffix, (char *) p);
1461 if (strcmp(long_option[i].name, "in-place") == 0){
1464 preserve_time_f = FALSE;
1467 if (strcmp(long_option[i].name, "in-place=") == 0){
1470 preserve_time_f = FALSE;
1472 backup_suffix = malloc(strlen((char *) p) + 1);
1473 strcpy(backup_suffix, (char *) p);
1478 if (strcmp(long_option[i].name, "cap-input") == 0){
1482 if (strcmp(long_option[i].name, "url-input") == 0){
1487 #ifdef NUMCHAR_OPTION
1488 if (strcmp(long_option[i].name, "numchar-input") == 0){
1494 if (strcmp(long_option[i].name, "no-output") == 0){
1498 if (strcmp(long_option[i].name, "debug") == 0){
1503 if (strcmp(long_option[i].name, "cp932") == 0){
1504 #ifdef SHIFTJIS_CP932
1508 #ifdef UTF8_OUTPUT_ENABLE
1509 ms_ucs_map_f = UCS_MAP_CP932;
1513 if (strcmp(long_option[i].name, "no-cp932") == 0){
1514 #ifdef SHIFTJIS_CP932
1518 #ifdef UTF8_OUTPUT_ENABLE
1519 ms_ucs_map_f = UCS_MAP_ASCII;
1523 #ifdef SHIFTJIS_CP932
1524 if (strcmp(long_option[i].name, "cp932inv") == 0){
1531 if (strcmp(long_option[i].name, "x0212") == 0){
1538 if (strcmp(long_option[i].name, "exec-in") == 0){
1542 if (strcmp(long_option[i].name, "exec-out") == 0){
1547 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1548 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1549 no_cp932ext_f = TRUE;
1552 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1553 no_best_fit_chars_f = TRUE;
1556 if (strcmp(long_option[i].name, "fb-skip") == 0){
1557 encode_fallback = NULL;
1560 if (strcmp(long_option[i].name, "fb-html") == 0){
1561 encode_fallback = encode_fallback_html;
1564 if (strcmp(long_option[i].name, "fb-xml") == 0){
1565 encode_fallback = encode_fallback_xml;
1568 if (strcmp(long_option[i].name, "fb-java") == 0){
1569 encode_fallback = encode_fallback_java;
1572 if (strcmp(long_option[i].name, "fb-perl") == 0){
1573 encode_fallback = encode_fallback_perl;
1576 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1577 encode_fallback = encode_fallback_subchar;
1580 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1581 encode_fallback = encode_fallback_subchar;
1582 unicode_subchar = 0;
1584 /* decimal number */
1585 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1586 unicode_subchar *= 10;
1587 unicode_subchar += hex2bin(p[i]);
1589 }else if(p[1] == 'x' || p[1] == 'X'){
1590 /* hexadecimal number */
1591 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1592 unicode_subchar <<= 4;
1593 unicode_subchar |= hex2bin(p[i]);
1597 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1598 unicode_subchar *= 8;
1599 unicode_subchar += hex2bin(p[i]);
1602 w16e_conv(unicode_subchar, &i, &j);
1603 unicode_subchar = i<<8 | j;
1607 #ifdef UTF8_OUTPUT_ENABLE
1608 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1609 ms_ucs_map_f = UCS_MAP_MS;
1613 #ifdef UNICODE_NORMALIZATION
1614 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1615 input_f = UTF8_INPUT;
1620 if (strcmp(long_option[i].name, "prefix=") == 0){
1621 if (nkf_isgraph(p[0])){
1622 for (i = 1; nkf_isgraph(p[i]); i++){
1623 prefix_table[p[i]] = p[0];
1630 case 'b': /* buffered mode */
1633 case 'u': /* non bufferd mode */
1636 case 't': /* transparent mode */
1641 } else if (*cp=='2') {
1645 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1653 case 'j': /* JIS output */
1655 output_conv = j_oconv;
1657 case 'e': /* AT&T EUC output */
1658 output_conv = e_oconv;
1661 case 's': /* SJIS output */
1662 output_conv = s_oconv;
1664 case 'l': /* ISO8859 Latin-1 support, no conversion */
1665 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1666 input_f = LATIN1_INPUT;
1668 case 'i': /* Kanji IN ESC-$-@/B */
1669 if (*cp=='@'||*cp=='B')
1670 kanji_intro = *cp++;
1672 case 'o': /* ASCII IN ESC-(-J/B */
1673 if (*cp=='J'||*cp=='B'||*cp=='H')
1674 ascii_intro = *cp++;
1678 bit:1 katakana->hiragana
1679 bit:2 hiragana->katakana
1681 if ('9'>= *cp && *cp>='0')
1682 hira_f |= (*cp++ -'0');
1689 #if defined(MSDOS) || defined(__OS2__)
1704 #ifdef UTF8_OUTPUT_ENABLE
1705 case 'w': /* UTF-8 output */
1707 output_conv = w_oconv; cp++;
1711 output_bom_f = TRUE;
1714 if ('1'== cp[0] && '6'==cp[1]) {
1715 output_conv = w_oconv16; cp+=2;
1716 } else if ('3'== cp[0] && '2'==cp[1]) {
1717 output_conv = w_oconv32; cp+=2;
1719 output_conv = w_oconv;
1724 output_endian = ENDIAN_LITTLE;
1725 } else if (cp[0] == 'B') {
1733 output_bom_f = TRUE;
1738 #ifdef UTF8_INPUT_ENABLE
1739 case 'W': /* UTF input */
1742 input_f = UTF8_INPUT;
1744 if ('1'== cp[0] && '6'==cp[1]) {
1746 input_f = UTF16_INPUT;
1747 input_endian = ENDIAN_BIG;
1748 } else if ('3'== cp[0] && '2'==cp[1]) {
1750 input_f = UTF32_INPUT;
1751 input_endian = ENDIAN_BIG;
1753 input_f = UTF8_INPUT;
1758 input_endian = ENDIAN_LITTLE;
1759 } else if (cp[0] == 'B') {
1765 /* Input code assumption */
1766 case 'J': /* JIS input */
1767 input_f = JIS_INPUT;
1769 case 'E': /* AT&T EUC input */
1770 input_f = EUC_INPUT;
1772 case 'S': /* MS Kanji input */
1773 input_f = SJIS_INPUT;
1774 if (x0201_f==NO_X0201) x0201_f=TRUE;
1776 case 'Z': /* Convert X0208 alphabet to asii */
1778 bit:0 Convert JIS X 0208 Alphabet to ASCII
1779 bit:1 Convert Kankaku to one space
1780 bit:2 Convert Kankaku to two spaces
1781 bit:3 Convert HTML Entity
1782 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1784 while ('0'<= *cp && *cp <='9') {
1785 alpha_f |= 1 << (*cp++ - '0');
1787 if (!alpha_f) alpha_f = 1;
1789 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1790 x0201_f = FALSE; /* No X0201->X0208 conversion */
1792 ESC-(-I in JIS, EUC, MS Kanji
1793 SI/SO in JIS, EUC, MS Kanji
1794 SSO in EUC, JIS, not in MS Kanji
1795 MS Kanji (0xa0-0xdf)
1797 ESC-(-I in JIS (0x20-0x5f)
1798 SSO in EUC (0xa0-0xdf)
1799 0xa0-0xd in MS Kanji (0xa0-0xdf)
1802 case 'X': /* Assume X0201 kana */
1803 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1806 case 'F': /* prserve new lines */
1807 fold_preserve_f = TRUE;
1808 case 'f': /* folding -f60 or -f */
1811 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1813 fold_len += *cp++ - '0';
1815 if (!(0<fold_len && fold_len<BUFSIZ))
1816 fold_len = DEFAULT_FOLD;
1820 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1822 fold_margin += *cp++ - '0';
1826 case 'm': /* MIME support */
1827 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1828 if (*cp=='B'||*cp=='Q') {
1829 mime_decode_mode = *cp++;
1830 mimebuf_f = FIXED_MIME;
1831 } else if (*cp=='N') {
1832 mime_f = TRUE; cp++;
1833 } else if (*cp=='S') {
1834 mime_f = STRICT_MIME; cp++;
1835 } else if (*cp=='0') {
1836 mime_decode_f = FALSE;
1837 mime_f = FALSE; cp++;
1840 case 'M': /* MIME output */
1843 mimeout_f = FIXED_MIME; cp++;
1844 } else if (*cp=='Q') {
1846 mimeout_f = FIXED_MIME; cp++;
1851 case 'B': /* Broken JIS support */
1853 bit:1 allow any x on ESC-(-x or ESC-$-x
1854 bit:2 reset to ascii on NL
1856 if ('9'>= *cp && *cp>='0')
1857 broken_f |= 1<<(*cp++ -'0');
1862 case 'O':/* for Output file */
1866 case 'c':/* add cr code */
1869 case 'd':/* delete cr code */
1872 case 'I': /* ISO-2022-JP output */
1875 case 'L': /* line mode */
1876 if (*cp=='u') { /* unix */
1877 nlmode_f = LF; cp++;
1878 } else if (*cp=='m') { /* mac */
1879 nlmode_f = CR; cp++;
1880 } else if (*cp=='w') { /* windows */
1881 nlmode_f = CRLF; cp++;
1882 } else if (*cp=='0') { /* no conversion */
1892 /* module multiple options in a string are allowed for Perl module */
1893 while(*cp && *cp++!='-');
1896 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
1897 /* bogus option but ignored */
/*
 * Look up the input_code_list entry that belongs to a given input converter.
 *
 * iconv_func: converter function pointer to search for.
 *
 * The search starts at the head of input_code_list and compares each
 * entry's iconv_func member with the argument.
 * NOTE(review): the loop and the return statements are not visible in this
 * excerpt; presumably the matching entry is returned, or a null pointer
 * when nothing matches -- confirm.
 */
1903 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1906 struct input_code *p = input_code_list;
1908 if (iconv_func == p->iconv_func){
/*
 * Select the input converter and keep the reported input code name in sync.
 *
 * f:          establishment flag; callers in this file pass TRUE, FALSE or
 *             -TRUE (the latter meaning FORCE, see the inline comment).
 * iconv_func: converter to install.
 *
 * NOTE(review): the statements that store f and iconv_func are not visible
 * in this excerpt; only the INPUT_CODE_FIX guard fragment and the code name
 * bookkeeping below can be seen.
 */
1917 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1919 #ifdef INPUT_CODE_FIX
1927 #ifdef INPUT_CODE_FIX
1928 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
/* Once a code is established and the converter changed since the last
   check, publish the name of the entry that owns the current converter. */
1934 if (estab_f && iconv_for_check != iconv){
1935 struct input_code *p = find_inputcode_byfunc(iconv);
1937 set_input_codename(p->name);
/* Remember which converter was last checked. */
1940 iconv_for_check = iconv;
/*
 * Penalty bits used while guessing the input encoding.  Each flag is the
 * previous one shifted left by one bit; set_code_score() ORs them into an
 * input_code's score.
 */
1945 #define SCORE_L2 (1) /* JIS level-2 kanji */
1946 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1947 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1948 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped via CP932 (IBM extended characters) */
1949 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* character that does not exist */
1950 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1951 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Score assigned by status_reset(). */
1953 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score flags looked up by code_score() with index (c2 & 0x0f) when
 * (c2 & 0x70) == 0x20.
 * NOTE(review): the first rows of the initializer are not visible in this
 * excerpt.
 */
1955 static const char score_table_A0[] = {
1958 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1959 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score flags looked up by code_score() with index (c2 & 0x0f) when
 * (c2 & 0x70) == 0x70.  Sixteen entries, one per low nibble of c2.
 */
1962 static const char score_table_F0[] = {
1963 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1964 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1965 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
1966 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * Add penalty flags to a candidate encoding.
 *
 * ptr:   candidate entry whose score is updated.
 * score: SCORE_* bits to OR into ptr->score.
 */
1969 void set_code_score(struct input_code *ptr, nkf_char score)
1972 ptr->score |= score;
/*
 * Remove penalty flags from a candidate encoding.
 *
 * ptr:   candidate entry whose score is updated.
 * score: SCORE_* bits to clear from ptr->score.
 */
1976 void clr_code_score(struct input_code *ptr, nkf_char score)
1979 ptr->score &= ~score;
/*
 * Classify the character currently held in ptr->buf and OR the matching
 * SCORE_* flag into ptr->score.
 *
 * ptr: candidate entry; buf[0] is used as c2 and, when UTF8_OUTPUT_ENABLE
 *      is defined, buf[1] as c1.
 */
1983 void code_score(struct input_code *ptr)
1985 nkf_char c2 = ptr->buf[0];
1986 #ifdef UTF8_OUTPUT_ENABLE
1987 nkf_char c1 = ptr->buf[1];
/* NOTE(review): the condition guarding this error case is not visible in
   this excerpt. */
1990 set_code_score(ptr, SCORE_ERROR);
1991 }else if (c2 == SSO){
1992 set_code_score(ptr, SCORE_KANA);
1993 #ifdef UTF8_OUTPUT_ENABLE
/* A zero result from e2w_conv is scored as a nonexistent character. */
1994 }else if (!e2w_conv(c2, c1)){
1995 set_code_score(ptr, SCORE_NO_EXIST);
/* The remaining cases dispatch on bits 4-6 of c2; the low nibble indexes
   the score tables. */
1997 }else if ((c2 & 0x70) == 0x20){
1998 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1999 }else if ((c2 & 0x70) == 0x70){
2000 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2001 }else if ((c2 & 0x70) >= 0x50){
2002 set_code_score(ptr, SCORE_L2);
/*
 * Rule out a candidate encoding.
 *
 * If the currently selected converter belongs to this candidate, the
 * selection is withdrawn through set_iconv(FALSE, 0).
 * NOTE(review): the statements that mark ptr itself as disabled are not
 * visible in this excerpt.
 */
2006 void status_disable(struct input_code *ptr)
2011 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * Append one character to the candidate's pending buffer.
 *
 * Stores c at ptr->buf[ptr->index] and advances ptr->index.  No bounds
 * check is visible here.
 */
2014 void status_push_ch(struct input_code *ptr, nkf_char c)
2016 ptr->buf[ptr->index++] = c;
/*
 * Clear the per-character state of a candidate.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * resets the stat and index members -- confirm.
 */
2019 void status_clear(struct input_code *ptr)
/*
 * Reset a candidate for a fresh detection run.
 *
 * Sets ptr->score back to SCORE_INIT.
 * NOTE(review): other statements of the body are not visible in this
 * excerpt.
 */
2025 void status_reset(struct input_code *ptr)
2028 ptr->score = SCORE_INIT;
/*
 * Reinitialize a candidate, including its per-file state.
 *
 * Sets ptr->_file_stat to 0.
 * NOTE(review): other statements of the body are not visible in this
 * excerpt.
 */
2031 void status_reinit(struct input_code *ptr)
2034 ptr->_file_stat = 0;
/*
 * Hook run by the *_status functions before they examine a character.
 *
 * Acts only when c is at most DEL and an encoding is already established
 * (estab_f).
 * NOTE(review): the action taken inside the branch is not visible in this
 * excerpt.
 */
2037 void status_check(struct input_code *ptr, nkf_char c)
2039 if (c <= DEL && estab_f){
/*
 * Shift_JIS detection step: feed one input character to the candidate.
 *
 * ptr: Shift_JIS candidate state.
 * c:   next input character.
 *
 * NOTE(review): the switch on ptr->stat and its case labels are not visible
 * in this excerpt; the grouping below is inferred from the visible branches.
 */
2044 void s_status(struct input_code *ptr, nkf_char c)
2048 status_check(ptr, c);
2053 #ifdef NUMCHAR_OPTION
2054 }else if (is_unicode_capsule(c)){
/* 0xa1-0xdf: stored as the pair SSO, c. */
2057 }else if (0xa1 <= c && c <= 0xdf){
2058 status_push_ch(ptr, SSO);
2059 status_push_ch(ptr, c);
/* 0x81-0x9f and 0xe0-0xef: stored as a lead byte. */
2062 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2064 status_push_ch(ptr, c);
2065 #ifdef SHIFTJIS_CP932
2066 }else if (is_ibmext_in_sjis(c)){
2068 status_push_ch(ptr, c);
2069 #endif /* SHIFTJIS_CP932 */
2071 }else if (0xf0 <= c && c <= 0xfc){
2073 status_push_ch(ptr, c);
2074 #endif /* X0212_ENABLE */
/* Any other byte rules the candidate out. */
2076 status_disable(ptr);
/* Trail byte 0x40-0x7e or 0x80-0xfc: convert the buffered pair in place
   with s2e_conv. */
2080 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2081 status_push_ch(ptr, c);
2082 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2086 status_disable(ptr);
2090 #ifdef SHIFTJIS_CP932
/* Same trail byte range, but here the s2e_conv result is tested against 0. */
2091 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2092 status_push_ch(ptr, c);
2093 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2099 #endif /* SHIFTJIS_CP932 */
2100 status_disable(ptr);
/*
 * EUC detection step: feed one input character to the candidate.
 *
 * ptr: EUC candidate state.
 * c:   next input character.
 *
 * NOTE(review): the switch on ptr->stat and its case labels are not visible
 * in this excerpt; the grouping below is inferred from the visible branches.
 */
2105 void e_status(struct input_code *ptr, nkf_char c)
2109 status_check(ptr, c);
2114 #ifdef NUMCHAR_OPTION
2115 }else if (is_unicode_capsule(c)){
/* SSO or 0xa1-0xfe: stored as the first byte of a character. */
2118 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2120 status_push_ch(ptr, c);
/* 0x8f: also stored as a first byte (X0212_ENABLE builds). */
2122 }else if (0x8f == c){
2124 status_push_ch(ptr, c);
2125 #endif /* X0212_ENABLE */
/* Any other byte rules the candidate out. */
2127 status_disable(ptr);
/* Following bytes must lie in 0xa1-0xfe, otherwise the candidate is
   disabled. */
2131 if (0xa1 <= c && c <= 0xfe){
2132 status_push_ch(ptr, c);
2136 status_disable(ptr);
2141 if (0xa1 <= c && c <= 0xfe){
2143 status_push_ch(ptr, c);
2145 status_disable(ptr);
2147 #endif /* X0212_ENABLE */
2151 #ifdef UTF8_INPUT_ENABLE
/*
 * UTF-8 detection step: feed one input character to the candidate.
 *
 * ptr: UTF-8 candidate state.
 * c:   next input character.
 *
 * NOTE(review): the switch on ptr->stat and its case labels are not visible
 * in this excerpt; the grouping below is inferred from the visible branches.
 */
2152 void w_status(struct input_code *ptr, nkf_char c)
2156 status_check(ptr, c);
2161 #ifdef NUMCHAR_OPTION
2162 }else if (is_unicode_capsule(c)){
/* Lead bytes are accepted in three ranges: 0xc0-0xdf, 0xe0-0xef and
   0xf0-0xf4. */
2165 }else if (0xc0 <= c && c <= 0xdf){
2167 status_push_ch(ptr, c);
2168 }else if (0xe0 <= c && c <= 0xef){
2170 status_push_ch(ptr, c);
2171 }else if (0xf0 <= c && c <= 0xf4){
2173 status_push_ch(ptr, c);
/* Any other byte rules the candidate out. */
2175 status_disable(ptr);
/* Continuation bytes must lie in 0x80-0xbf. */
2180 if (0x80 <= c && c <= 0xbf){
2181 status_push_ch(ptr, c);
2182 if (ptr->index > ptr->stat){
/* bom is set when the buffered bytes are EF BB BF. */
2183 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2184 && ptr->buf[2] == 0xbf);
/* Convert the buffered sequence in place. */
2185 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2186 &ptr->buf[0], &ptr->buf[1]);
2193 status_disable(ptr);
2197 if (0x80 <= c && c <= 0xbf){
2198 if (ptr->index < ptr->stat){
2199 status_push_ch(ptr, c);
2204 status_disable(ptr);
/*
 * Run one input character through the encoding candidates.
 *
 * c: next input character.
 *
 * Walks input_code_list from its head and calls the status_func of entries
 * that have one.  If a result entry was chosen and no encoding is
 * established yet (estab_f is false), its converter is installed with
 * set_iconv(TRUE, ...).
 * NOTE(review): the loop structure, the use of action_flag and the rule
 * that selects result are not visible in this excerpt.
 */
2211 void code_status(nkf_char c)
2213 int action_flag = 1;
2214 struct input_code *result = 0;
2215 struct input_code *p = input_code_list;
2217 if (!p->status_func) {
2221 if (!p->status_func)
2223 (p->status_func)(p, c);
2226 }else if(p->stat == 0){
2237 if (result && !estab_f){
2238 set_iconv(TRUE, result->iconv_func);
2239 }else if (c <= DEL){
2240 struct input_code *ptr = input_code_list;
/*
 * Default character reader.
 *
 * The visible statement returns the most recently pushed-back character
 * from std_gc_buf, decrementing std_gc_ndx first.
 * NOTE(review): the condition guarding that return and the path that reads
 * from f are not visible in this excerpt.
 */
2250 nkf_char std_getc(FILE *f)
2253 return std_gc_buf[--std_gc_ndx];
/*
 * Default push-back routine: stores c in std_gc_buf for a later read.
 *
 * The buffer-full case (std_gc_ndx == STD_GC_BUFSIZE) is tested first.
 * NOTE(review): what happens when the buffer is full, and the return value,
 * are not visible in this excerpt.
 */
2259 nkf_char std_ungetc(nkf_char c, FILE *f)
2261 if (std_gc_ndx == STD_GC_BUFSIZE){
2264 std_gc_buf[std_gc_ndx++] = c;
/*
 * Default character writer.
 * NOTE(review): the body is not visible in this excerpt.
 */
2269 void std_putc(nkf_char c)
2276 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Pass-through mode: read every character from f without kanji conversion.
 *
 * Sets up the processing chain with module_connection() and then reads
 * through i_getc until EOF.
 * NOTE(review): the loop body and the return value are not visible in this
 * excerpt.
 */
2277 nkf_char noconvert(FILE *f)
2282 module_connection();
2283 while ((c = (*i_getc)(f)) != EOF)
/*
 * Build the output and input processing chains from the current option
 * flags.
 *
 * Output side: starting from output_conv, each enabled filter saves the
 * current oconv in its own o_* pointer and then takes its place, so the
 * filter installed last runs first.
 * Input side: each enabled reader likewise saves i_getc / i_ungetc and
 * replaces them with its own pair.
 * Finally the input converter is chosen from input_f.
 *
 * NOTE(review): several guarding conditions are not visible in this
 * excerpt; only the assignments they guard can be seen.
 */
2290 void module_connection(void)
2292 oconv = output_conv;
2295 /* replace continuation module, from output side */
2297 /* output redirection */
2299 if (noout_f || guess_f){
2306 if (mimeout_f == TRUE) {
2307 o_base64conv = oconv; oconv = base64_conv;
2309 /* base64_count = 0; */
2312 if (nlmode_f || guess_f) {
2313 o_nlconv = oconv; oconv = nl_conv;
2316 o_rot_conv = oconv; oconv = rot_conv;
2319 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2322 o_hira_conv = oconv; oconv = hira_conv;
2325 o_fconv = oconv; oconv = fold_conv;
2328 if (alpha_f || x0201_f) {
2329 o_zconv = oconv; oconv = z_conv;
2333 i_ungetc = std_ungetc;
2334 /* input redirection */
2337 i_cgetc = i_getc; i_getc = cap_getc;
2338 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2341 i_ugetc = i_getc; i_getc = url_getc;
2342 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2345 #ifdef NUMCHAR_OPTION
2347 i_ngetc = i_getc; i_getc = numchar_getc;
2348 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2351 #ifdef UNICODE_NORMALIZATION
/* The NFC reader is installed only for UTF-8 input. */
2352 if (nfc_f && input_f == UTF8_INPUT){
2353 i_nfc_getc = i_getc; i_getc = nfc_getc;
2354 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2357 if (mime_f && mimebuf_f==FIXED_MIME) {
2358 i_mgetc = i_getc; i_getc = mime_getc;
2359 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2362 i_bgetc = i_getc; i_getc = broken_getc;
2363 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* An explicit input encoding forces its converter (-TRUE); JIS, EUC and
   Latin-1 input all use e_iconv. */
2365 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2366 set_iconv(-TRUE, e_iconv);
2367 } else if (input_f == SJIS_INPUT) {
2368 set_iconv(-TRUE, s_iconv);
2369 #ifdef UTF8_INPUT_ENABLE
2370 } else if (input_f == UTF8_INPUT) {
2371 set_iconv(-TRUE, w_iconv);
2372 } else if (input_f == UTF16_INPUT) {
2373 set_iconv(-TRUE, w_iconv16);
2374 } else if (input_f == UTF32_INPUT) {
2375 set_iconv(-TRUE, w_iconv32);
/* Otherwise e_iconv is set without establishing the encoding. */
2378 set_iconv(FALSE, e_iconv);
2382 struct input_code *p = input_code_list;
2390 * Check and Ignore BOM
/*
 * Inspect the first bytes of f for a byte order mark.
 *
 * Up to four bytes are read through i_getc.  Byte sequences recognised
 * (as shown by the comparisons and the bytes pushed back below):
 *   00 00 FE FF  -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, ENDIAN_2143
 *   EF BB BF     -> w_iconv
 *   FE FF 00 00  -> w_iconv32, ENDIAN_3412
 *   FE FF        -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, ENDIAN_LITTLE
 * On a mismatch the bytes already read are returned with i_ungetc, last
 * read first, so the stream is left as it was.
 *
 * NOTE(review): the case labels, the guards around the set_iconv(TRUE, ...)
 * calls and the statements following each endian assignment are not visible
 * in this excerpt.
 */
2392 void check_bom(FILE *f)
2395 switch(c2 = (*i_getc)(f)){
/* First byte 0x00: UTF-32 candidates. */
2397 if((c2 = (*i_getc)(f)) == 0x00){
2398 if((c2 = (*i_getc)(f)) == 0xFE){
2399 if((c2 = (*i_getc)(f)) == 0xFF){
2401 set_iconv(TRUE, w_iconv32);
2403 if (iconv == w_iconv32) {
2404 input_endian = ENDIAN_BIG;
2407 (*i_ungetc)(0xFF,f);
2408 }else (*i_ungetc)(c2,f);
2409 (*i_ungetc)(0xFE,f);
2410 }else if(c2 == 0xFF){
2411 if((c2 = (*i_getc)(f)) == 0xFE){
2413 set_iconv(TRUE, w_iconv32);
2415 if (iconv == w_iconv32) {
2416 input_endian = ENDIAN_2143;
2419 (*i_ungetc)(0xFF,f);
2420 }else (*i_ungetc)(c2,f);
2421 (*i_ungetc)(0xFF,f);
2422 }else (*i_ungetc)(c2,f);
2423 (*i_ungetc)(0x00,f);
2424 }else (*i_ungetc)(c2,f);
2425 (*i_ungetc)(0x00,f);
/* First byte 0xEF: UTF-8 signature EF BB BF. */
2428 if((c2 = (*i_getc)(f)) == 0xBB){
2429 if((c2 = (*i_getc)(f)) == 0xBF){
2431 set_iconv(TRUE, w_iconv);
2433 if (iconv == w_iconv) {
2436 (*i_ungetc)(0xBF,f);
2437 }else (*i_ungetc)(c2,f);
2438 (*i_ungetc)(0xBB,f);
2439 }else (*i_ungetc)(c2,f);
2440 (*i_ungetc)(0xEF,f);
/* First byte 0xFE: FE FF 00 00 is tried before plain FE FF. */
2443 if((c2 = (*i_getc)(f)) == 0xFF){
2444 if((c2 = (*i_getc)(f)) == 0x00){
2445 if((c2 = (*i_getc)(f)) == 0x00){
2447 set_iconv(TRUE, w_iconv32);
2449 if (iconv == w_iconv32) {
2450 input_endian = ENDIAN_3412;
2453 (*i_ungetc)(0x00,f);
2454 }else (*i_ungetc)(c2,f);
2455 (*i_ungetc)(0x00,f);
2456 }else (*i_ungetc)(c2,f);
2458 set_iconv(TRUE, w_iconv16);
2460 if (iconv == w_iconv16) {
2461 input_endian = ENDIAN_BIG;
2464 (*i_ungetc)(0xFF,f);
2465 }else (*i_ungetc)(c2,f);
2466 (*i_ungetc)(0xFE,f);
/* First byte 0xFF: FF FE 00 00 is tried before plain FF FE. */
2469 if((c2 = (*i_getc)(f)) == 0xFE){
2470 if((c2 = (*i_getc)(f)) == 0x00){
2471 if((c2 = (*i_getc)(f)) == 0x00){
2473 set_iconv(TRUE, w_iconv32);
2475 if (iconv == w_iconv32) {
2476 input_endian = ENDIAN_LITTLE;
2479 (*i_ungetc)(0x00,f);
2480 }else (*i_ungetc)(c2,f);
2481 (*i_ungetc)(0x00,f);
2482 }else (*i_ungetc)(c2,f);
2484 set_iconv(TRUE, w_iconv16);
2486 if (iconv == w_iconv16) {
2487 input_endian = ENDIAN_LITTLE;
2490 (*i_ungetc)(0xFE,f);
2491 }else (*i_ungetc)(c2,f);
2492 (*i_ungetc)(0xFF,f);
2501 Conversion main loop. Code detection only.
/*
 * Main conversion loop for one input stream.
 *
 * f: input stream, read one character at a time through i_getc.
 *
 * After module_connection() has wired the processing chain, every
 * character is dispatched on the current state: a pending first byte in
 * c2, UTF-16 / UTF-32 input, 8-bit bytes, escape sequences that switch
 * input_mode, SI / SO, MIME encoded-word starts, and line ends.  Decoded
 * characters are handed to iconv or directly to oconv.
 * Within the loop the macros mean: NEXT = continue, SEND = fall through to
 * the output step, LAST = leave the loop.
 * After the loop (*iconv)(EOF, 0, 0) is called; if no input code name has
 * been recorded, the input_code_list entry with the lowest score supplies
 * the name.
 *
 * NOTE(review): many lines of the body, including the final return, are
 * not visible in this excerpt.
 */
2504 nkf_char kanji_convert(FILE *f)
2506 nkf_char c3, c2=0, c1, c0=0;
2507 int is_8bit = FALSE;
2509 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2510 #ifdef UTF8_INPUT_ENABLE
2511 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2518 output_mode = ASCII;
2521 #define NEXT continue /* no output, get next */
2522 #define SEND ; /* output c1 and c2, get next */
2523 #define LAST break /* end of loop, go closing */
2525 module_connection();
2528 while ((c1 = (*i_getc)(f)) != EOF) {
2529 #ifdef INPUT_CODE_FIX
2535 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2536 /* in case of 8th bit is on */
2537 if (!estab_f&&!mime_decode_mode) {
2538 /* in case of not established yet */
2539 /* It is still ambiguous */
2540 if (h_conv(f, c2, c1)==EOF)
2546 /* in case of already established */
2548 /* ignore bogus code and not CP5022x UCD */
2556 /* second byte, 7 bit code */
2557 /* it might be kanji shifted */
2558 if ((c1 == DEL) || (c1 <= SP)) {
2559 /* ignore bogus first code */
2566 #ifdef UTF8_INPUT_ENABLE
2567 if (iconv == w_iconv16) {
2568 if (input_endian == ENDIAN_BIG) {
2570 if ((c1 = (*i_getc)(f)) != EOF) {
2571 if (0xD8 <= c2 && c2 <= 0xDB) {
2572 if ((c0 = (*i_getc)(f)) != EOF) {
2574 if ((c3 = (*i_getc)(f)) != EOF) {
2581 if ((c2 = (*i_getc)(f)) != EOF) {
2582 if (0xD8 <= c2 && c2 <= 0xDB) {
2583 if ((c3 = (*i_getc)(f)) != EOF) {
2584 if ((c0 = (*i_getc)(f)) != EOF) {
2593 } else if(iconv == w_iconv32){
2595 if((c2 = (*i_getc)(f)) != EOF &&
2596 (c1 = (*i_getc)(f)) != EOF &&
2597 (c0 = (*i_getc)(f)) != EOF){
2598 switch(input_endian){
2600 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2603 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2606 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2609 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2619 #ifdef NUMCHAR_OPTION
2620 if (is_unicode_capsule(c1)){
2624 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2626 if (!estab_f && !iso8859_f) {
2627 /* not established yet */
2630 } else { /* estab_f==TRUE */
2635 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2636 /* SJIS X0201 Case... */
2637 if(iso2022jp_f && x0201_f==NO_X0201) {
2638 (*oconv)(GETA1, GETA2);
2645 } else if (c1==SSO && iconv != s_iconv) {
2646 /* EUC X0201 Case */
2647 c1 = (*i_getc)(f); /* skip SSO */
2649 if (SSP<=c1 && c1<0xe0) {
2650 if(iso2022jp_f && x0201_f==NO_X0201) {
2651 (*oconv)(GETA1, GETA2);
2658 } else { /* bogus code, skip SSO and one byte */
2661 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2662 (c1 == 0xFD || c1 == 0xFE)) {
2668 /* already established */
2673 } else if ((c1 > SP) && (c1 != DEL)) {
2674 /* in case of Roman characters */
2676 /* output 1 shifted byte */
2680 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2681 /* output 1 shifted byte */
2682 if(iso2022jp_f && x0201_f==NO_X0201) {
2683 (*oconv)(GETA1, GETA2);
2690 /* look like bogus code */
2693 } else if (input_mode == X0208 || input_mode == X0212 ||
2694 input_mode == X0213_1 || input_mode == X0213_2) {
2695 /* in case of Kanji shifted */
2698 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2699 /* Check MIME code */
2700 if ((c1 = (*i_getc)(f)) == EOF) {
2703 } else if (c1 == '?') {
2704 /* =? is mime conversion start sequence */
2705 if(mime_f == STRICT_MIME) {
2706 /* check in real detail */
2707 if (mime_begin_strict(f) == EOF)
2711 } else if (mime_begin(f) == EOF)
2721 /* normal ASCII code */
2724 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2727 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2730 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2731 if ((c1 = (*i_getc)(f)) == EOF) {
2732 /* (*oconv)(0, ESC); don't send bogus code */
2734 } else if (c1 == '$') {
2735 if ((c1 = (*i_getc)(f)) == EOF) {
2737 (*oconv)(0, ESC); don't send bogus code
2738 (*oconv)(0, '$'); */
2740 } else if (c1 == '@'|| c1 == 'B') {
2741 /* This is kanji introduction */
2744 set_input_codename("ISO-2022-JP");
2746 debug("ISO-2022-JP");
2749 } else if (c1 == '(') {
2750 if ((c1 = (*i_getc)(f)) == EOF) {
2751 /* don't send bogus code
2757 } else if (c1 == '@'|| c1 == 'B') {
2758 /* This is kanji introduction */
2763 } else if (c1 == 'D'){
2767 #endif /* X0212_ENABLE */
2768 } else if (c1 == (X0213_1&0x7F)){
2769 input_mode = X0213_1;
2772 } else if (c1 == (X0213_2&0x7F)){
2773 input_mode = X0213_2;
2777 /* could be some special code */
2784 } else if (broken_f&0x2) {
2785 /* accept any ESC-(-x as broken code ... */
2795 } else if (c1 == '(') {
2796 if ((c1 = (*i_getc)(f)) == EOF) {
2797 /* don't send bogus code
2799 (*oconv)(0, '('); */
2803 /* This is X0201 kana introduction */
2804 input_mode = X0201; shift_mode = X0201;
2806 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2807 /* This is X0208 kanji introduction */
2808 input_mode = ASCII; shift_mode = FALSE;
2810 } else if (broken_f&0x2) {
2811 input_mode = ASCII; shift_mode = FALSE;
2816 /* maintain various input_mode here */
2820 } else if ( c1 == 'N' || c1 == 'n'){
2822 c3 = (*i_getc)(f); /* skip SS2 */
2823 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2838 } else if (c1 == ESC && iconv == s_iconv) {
2839 /* ESC in Shift_JIS */
2840 if ((c1 = (*i_getc)(f)) == EOF) {
2841 /* (*oconv)(0, ESC); don't send bogus code */
2843 } else if (c1 == '$') {
2845 if ((c1 = (*i_getc)(f)) == EOF) {
2847 (*oconv)(0, ESC); don't send bogus code
2848 (*oconv)(0, '$'); */
2851 if (('E' <= c1 && c1 <= 'G') ||
2852 ('O' <= c1 && c1 <= 'Q')) {
2860 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2861 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2862 while ((c1 = (*i_getc)(f)) != EOF) {
2863 if (SP <= c1 && c1 <= 'z') {
2864 (*oconv)(0, c1 + c0);
2865 } else break; /* c1 == SO */
2869 if (c1 == EOF) LAST;
2876 } else if (c1 == LF || c1 == CR) {
2878 input_mode = ASCII; set_iconv(FALSE, 0);
2880 } else if (mime_decode_f && !mime_decode_mode){
2882 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2890 } else { /* if (c1 == CR)*/
2891 if ((c1=(*i_getc)(f))!=EOF) {
2895 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2909 } else if (c1 == DEL && input_mode == X0208) {
2919 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2922 if ((c0 = (*i_getc)(f)) != EOF) {
2925 if ((c3 = (*i_getc)(f)) != EOF) {
2927 (*iconv)(c2, c1, c0|c3);
2932 /* 3 bytes EUC or UTF-8 */
2933 if ((c0 = (*i_getc)(f)) != EOF) {
2935 (*iconv)(c2, c1, c0);
2943 0x7F <= c2 && c2 <= 0x92 &&
2944 0x21 <= c1 && c1 <= 0x7E) {
2946 if(c1 == 0x7F) return 0;
2947 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2950 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2954 (*oconv)(PREFIX_EUCG3 | c2, c1);
2956 #endif /* X0212_ENABLE */
2958 (*oconv)(PREFIX_EUCG3 | c2, c1);
2961 (*oconv)(input_mode, c1); /* other special case */
2967 /* goto next_word */
2971 (*iconv)(EOF, 0, 0);
2972 if (!input_codename)
2975 struct input_code *p = input_code_list;
2976 struct input_code *result = p;
2978 if (p->score < result->score) result = p;
2981 set_input_codename(result->name);
2983 debug(result->name);
/*
 * Hold-and-guess conversion, called from kanji_convert() for an 8-bit
 * character while no input encoding is established.
 *
 * f:      input stream.
 * c2, c1: the two characters that triggered the call.
 *
 * Phase 1 buffers further input with push_hold_buf() until EOF, until an
 * encoding is established (estab_f), or until the buffer is full.  If
 * needed, the candidate with the lowest score that has a status_func is
 * installed with set_iconv(TRUE, ...).
 * Phase 2 replays the held characters through iconv, fetching extra bytes
 * from the hold buffer first and from f once the buffer is exhausted.
 *
 * NOTE(review): the return type line, several conditions and the return
 * statements are not visible in this excerpt; kanji_convert() compares the
 * result against EOF.
 */
2991 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2993 nkf_char ret, c3, c0;
2997 /** it must NOT be in the kanji shifted sequence */
2998 /** it must NOT be written in JIS7 */
2999 /** and it must be after 2 byte 8bit code */
3005 while ((c1 = (*i_getc)(f)) != EOF) {
/* Stop buffering when the hold buffer is full or the code is settled. */
3011 if (push_hold_buf(c1) == EOF || estab_f){
3017 struct input_code *p = input_code_list;
3018 struct input_code *result = p;
/* Lower score wins; entries without a status_func are ignored. */
3023 if (p->status_func && p->score < result->score){
3028 set_iconv(TRUE, result->iconv_func);
3033 ** 1) EOF is detected, or
3034 ** 2) Code is established, or
3035 ** 3) Buffer is FULL (but last word is pushed)
3037 ** in 1) and 3) cases, we continue to use
3038 ** Kanji codes by oconv and leave estab_f unchanged.
/* Replay the held characters. */
3043 while (hold_index < hold_count){
3044 c2 = hold_buf[hold_index++];
3046 #ifdef NUMCHAR_OPTION
3047 || is_unicode_capsule(c2)
/* With the Shift_JIS converter, a single byte 0xa1-0xdf is passed on as
   X0201. */
3052 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3053 (*iconv)(X0201, c2, 0);
3056 if (hold_index < hold_count){
3057 c1 = hold_buf[hold_index++];
3067 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3070 if (hold_index < hold_count){
3071 c0 = hold_buf[hold_index++];
3072 } else if ((c0 = (*i_getc)(f)) == EOF) {
3078 if (hold_index < hold_count){
3079 c3 = hold_buf[hold_index++];
3080 } else if ((c3 = (*i_getc)(f)) == EOF) {
3085 (*iconv)(c2, c1, c0|c3);
3090 /* 3 bytes EUC or UTF-8 */
3091 if (hold_index < hold_count){
3092 c0 = hold_buf[hold_index++];
3093 } else if ((c0 = (*i_getc)(f)) == EOF) {
3099 (*iconv)(c2, c1, c0);
3102 if (c0 == EOF) break;
/*
 * Append one character to the hold buffer used by h_conv().
 *
 * c2: character to store; it is narrowed to unsigned char.
 *
 * Returns EOF when the buffer has reached HOLD_SIZE*2 entries after the
 * store, otherwise the new hold_count.
 * NOTE(review): the action taken when the buffer is already full on entry
 * is not visible in this excerpt.
 */
3107 nkf_char push_hold_buf(nkf_char c2)
3109 if (hold_count >= HOLD_SIZE*2)
3111 hold_buf[hold_count++] = (unsigned char)c2;
3112 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair into the internal two-byte code.
 *
 * c2, c1: Shift_JIS lead and trail byte.
 * p2, p1: output locations.
 *
 * NOTE(review): the stores through p2 / p1 and the return statements are
 * not visible in this excerpt; s_iconv() treats a nonzero return as a
 * status to pass on -- confirm the exact contract.
 */
3115 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3117 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
/* Row pairs for lead bytes 0xF0-0xF4 in x0213 mode; the second index is
   (0x9E < c1). */
3120 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3121 #ifdef SHIFTJIS_CP932
/* IBM extension lead bytes are looked up in shiftjis_cp932 unless
   cp932inv_f is set. */
3122 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3123 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3130 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3131 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3137 #endif /* SHIFTJIS_CP932 */
/* Outside x0213 mode the same lead bytes are looked up in shiftjis_x0212
   and tagged with PREFIX_EUCG3. */
3139 if (!x0213_f && is_ibmext_in_sjis(c2)){
3140 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3143 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3156 if(x0213_f && c2 >= 0xF0){
3157 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3158 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3159 }else{ /* 78<=k<=94 */
3160 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3161 if (0x9E < c1) c2++;
/* General case: each lead byte covers two rows; a trail byte above 0x9E
   selects the second row. */
3164 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3165 if (0x9E < c1) c2++;
/* Trail bytes above DEL are shifted by SP, the others by 0x1F. */
3168 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3175 c2 = x0212_unshift(c2);
/*
 * Input converter for Shift_JIS.
 *
 * c2, c1: lead and trail byte (c2 may be a special value such as EOF or 0).
 * c0:     extra argument of the common converter signature; no use of it
 *         is visible in this excerpt.
 *
 * NOTE(review): the first branch, the call to oconv and the final return
 * are not visible in this excerpt.
 */
3182 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3186 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
/* Outside x0213 mode, lead bytes 0xF0-0xF9 with trail bytes 0x40-0xFC are
   mapped to code points starting at 0xE000, 188 per lead byte, tagged with
   CLASS_UNICODE. */
3188 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
/* Trail byte 0x7F ends the call with 0. */
3190 if(c1 == 0x7F) return 0;
3191 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
/* Everything else goes through s2e_conv; a nonzero result is returned
   as is. */
3194 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3195 if (ret) return ret;
/*
 * Input converter for EUC (module_connection() also selects it for JIS and
 * Latin-1 input).
 *
 * c2, c1, c0: up to three bytes of one character; c0 is used in the 0x8f
 *             (three-byte) case.
 *
 * NOTE(review): the first branch, the calls to oconv and the return
 * statements are not visible in this excerpt.
 */
3201 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3206 }else if (c2 == 0x8f){
3210 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3211 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3212 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
/* Otherwise the first two bytes are packed into c2. */
3215 c2 = (c2 << 8) | (c1 & 0x7f);
3217 #ifdef SHIFTJIS_CP932
/* Round trip through Shift_JIS when e2s_conv reports success (0). */
3220 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3221 s2e_conv(s2, s1, &c2, &c1);
3228 #endif /* SHIFTJIS_CP932 */
3230 #endif /* X0212_ENABLE */
3231 } else if (c2 == SSO){
3234 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3237 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3238 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3239 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3244 #ifdef SHIFTJIS_CP932
/* In CP51932 mode, c2 values 0x79-0x7c take the same Shift_JIS round
   trip. */
3245 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3247 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3248 s2e_conv(s2, s1, &c2, &c1);
3255 #endif /* SHIFTJIS_CP932 */
3262 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv: convert UTF-8 bytes (c2 = first, c1 = second, c0 = third) and
 * report the result through p2/p1.  For first bytes 0xc0..0xef the work is
 * delegated to unicode_to_jis_common; under NUMCHAR_OPTION *p1 can instead
 * receive the code point from ww16_conv tagged with CLASS_UNICODE (the
 * condition guarding that store is not in this listing).
 */
3263 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3270 }else if (0xc0 <= c2 && c2 <= 0xef) {
3271 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3272 #ifdef NUMCHAR_OPTION
3275 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * w_iconv: input-side converter for UTF-8.  c2 is the first byte, c1 the
 * second; c0 holds the third byte, or the third and fourth bytes packed
 * into 16 bits for four-byte sequences (see the 0xc0c0/0x8080 masks).
 * The first byte is classified through w_iconv_utf8_1st_byte and each
 * class checks the allowed range of the following bytes.  Four-byte
 * sequences are turned into a CLASS_UNICODE value via ww16_conv; shorter
 * ones go through w2e_conv.
 * NOTE(review): the returns of -1 and -2 when c0 is 0 presumably ask the
 * caller for more input bytes - confirm against the caller.
 */
3283 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* Class code per first byte, indexed by c2 - 0xC0 (one row of the table is missing from this listing). */
3286 static const char w_iconv_utf8_1st_byte[] =
3288 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3289 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3290 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3291 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3293 if (c2 < 0 || 0xff < c2) {
3294 }else if (c2 == 0) { /* 0 : 1 byte*/
3296 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3299 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3301 if (c1 < 0x80 || 0xBF < c1) return 0;
3304 if (c0 == 0) return -1;
3305 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3310 if (c0 == 0) return -1;
3311 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3315 if (c0 == 0) return -1;
3316 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3320 if (c0 == 0) return -2;
3321 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3325 if (c0 == 0) return -2;
3326 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3330 if (c0 == 0) return -2;
3331 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3339 if (c2 == 0 || c2 == EOF){
3340 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3341 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3344 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3353 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the code point val as UTF-8 bytes written to *p2
 * (first byte), *p1 (second byte) and *p0 (third byte; for four-byte
 * sequences the third and fourth bytes packed as 0x8080 | hi << 8 | lo).
 * The branch for val below 0x80 and the final fallback are not part of
 * this listing.
 */
3354 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3361 }else if (val < 0x800){
3362 *p2 = 0xc0 | (val >> 6);
3363 *p1 = 0x80 | (val & 0x3f);
3365 } else if (val <= NKF_INT32_C(0xFFFF)) {
3366 *p2 = 0xe0 | (val >> 12);
3367 *p1 = 0x80 | ((val >> 6) & 0x3f);
3368 *p0 = 0x80 | (val & 0x3f);
3369 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/*
 * NOTE(review): a four-byte UTF-8 lead byte is 0xF0 | (val >> 18).
 * ww16_conv decodes it as (c2 & 0x0f) << 18 and w_oconv emits
 * 0xF0 | (val>>18), so the 0xe0 | (val >> 16) below does not round-trip
 * with either.  Looks like a bug - confirm and fix.
 */
3370 *p2 = 0xe0 | (val >> 16);
3371 *p1 = 0x80 | ((val >> 12) & 0x3f);
3372 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3381 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: decode UTF-8 bytes into a code point.  c2 is the first byte,
 * c1 the second, c0 the third (or third and fourth packed, for first bytes
 * >= 0xf0).  Only the shifts visible below are documented; the lines that
 * merge the lowest bits and return the value are missing from this listing.
 */
3382 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3387 } else if (c2 >= 0xf0){
3388 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3389 val = (c2 & 0x0f) << 18;
3390 val |= (c1 & 0x3f) << 12;
/* High byte of c0 is the third UTF-8 byte; its six payload bits land at bits 6..11. */
3391 val |= (c0 & 0x3f00) >> 2;
3393 }else if (c2 >= 0xe0){
3394 val = (c2 & 0x0f) << 12;
3395 val |= (c1 & 0x3f) << 6;
3397 }else if (c2 >= 0xc0){
3398 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv: convert the code point val and report the result through
 * p2/p1.  The visible path re-encodes val as UTF-8 with w16w_conv and
 * looks it up with unicode_to_jis_common; under NUMCHAR_OPTION *p1 can be
 * set to val tagged with CLASS_UNICODE (the guarding condition is not in
 * this listing).
 */
3406 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3408 nkf_char c2, c1, c0;
3415 w16w_conv(val, &c2, &c1, &c0);
3416 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3417 #ifdef NUMCHAR_OPTION
3420 *p1 = CLASS_UNICODE | val;
3429 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: input-side converter for UTF-16 code units.  c2 and c1 are
 * the high and low bytes of a unit; for a high surrogate (c2 in
 * 0xD8..0xDB) c0 carries the following unit, which must lie in
 * 0xDC00..0xDFFF.  Other units go through w16e_conv and a non-zero result
 * is returned unchanged.
 */
3430 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3433 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3436 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3437 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/*
 * Surrogate pair to code point: (c2 << 18) + (c1 << 10) is the high
 * surrogate times 0x400; the constant folds together
 * 0xD800 * 0x400 + 0xDC00 - 0x10000.
 */
3439 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3441 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3446 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3447 if (ret) return ret;
/*
 * w_iconv32: input-side converter for UTF-32, where c1 carries the code
 * point.  Values accepted by is_unicode_bmp go through w16e_conv; another
 * branch tags c1 with CLASS_UNICODE.  A non-zero ret is returned unchanged.
 */
3452 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3456 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3457 } else if (is_unicode_bmp(c1)) {
3458 ret = w16e_conv(c1, &c2, &c1);
3461 c1 = CLASS_UNICODE | c1;
3463 if (ret) return ret;
/*
 * unicode_to_jis_common: look up a UTF-8 sequence (c2 = first byte,
 * c1 = second, c0 = third) in the utf8_to_euc_* tables and store the
 * result through p2/p1 via w_iconv_common.
 *  - The table set is selected by ms_ucs_map_f (CP932, MS, CP10001, or a
 *    default that is not visible in this listing).
 *  - When no_best_fit_chars_f is set, selected code points return 1
 *    before any table lookup; the no_best_fit_chars_table_* arrays are
 *    indexed by the low six bits of the second byte.
 *  - Under SHIFTJIS_CP932, a successful EUC G3 result with cp932inv_f
 *    clear is passed through e2s_conv and back through s2e_conv.
 */
3468 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3470 const unsigned short *const *pp;
3471 const unsigned short *const *const *ppp;
3472 static const char no_best_fit_chars_table_C2[] =
3473 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3474 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3475 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3476 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3477 static const char no_best_fit_chars_table_C2_ms[] =
3478 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3479 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3480 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3481 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3482 static const char no_best_fit_chars_table_932_C2[] =
3483 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3484 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3485 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3486 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3487 static const char no_best_fit_chars_table_932_C3[] =
3488 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3489 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3490 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3491 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Two-byte sequences (first byte below 0xe0). */
3497 }else if(c2 < 0xe0){
3498 if(no_best_fit_chars_f){
3499 if(ms_ucs_map_f == UCS_MAP_CP932){
3502 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3505 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3508 }else if(!cp932inv_f){
3511 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3514 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3517 }else if(ms_ucs_map_f == UCS_MAP_MS){
3518 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3519 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3537 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3538 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3539 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3541 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/*
 * Three-byte sequences.
 * NOTE(review): this tests c0 where the two-byte branch tests c2.  The
 * lookup below indexes ppp[c2 - 0xE0], so the first byte is what bounds
 * the table.  Callers in this file pack bytes three and four of a
 * four-byte sequence into c0 (at least 0x8080), which would still be
 * excluded by this test - confirm whether c2 was intended.
 */
3542 }else if(c0 < 0xF0){
3543 if(no_best_fit_chars_f){
3544 if(ms_ucs_map_f == UCS_MAP_CP932){
3545 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3546 }else if(ms_ucs_map_f == UCS_MAP_MS){
3551 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3554 if(c0 == 0x92) return 1;
3559 if(c1 == 0x80 || c0 == 0x9C) return 1;
3562 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3567 if(c0 == 0x94) return 1;
3570 if(c0 == 0xBB) return 1;
3580 if(c0 == 0x95) return 1;
3583 if(c0 == 0xA5) return 1;
3590 if(c0 == 0x8D) return 1;
3593 if(c0 == 0x9E && !cp932inv_f) return 1;
3596 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3604 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3605 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3606 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3608 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3610 #ifdef SHIFTJIS_CP932
3611 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3613 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3614 s2e_conv(s2, s1, p2, p1);
/*
 * w_iconv_common: two-level table lookup shared by the UTF-8 converters.
 * pp is a table of psize row pointers selected by c1; c0 selects the cell
 * within a row of sizeof_utf8_to_euc_C2 entries.  Returns 1 when the
 * table, the row or the cell is missing or an index is out of range.
 * With no_cp932ext_f set, NEC special and IBM extended values are also
 * filtered.  The index adjustments and the stores through p2/p1 are not
 * part of this listing.
 */
3623 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3626 const unsigned short *p;
3629 if (pp == 0) return 1;
3632 if (c1 < 0 || psize <= c1) return 1;
3634 if (p == 0) return 1;
3637 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3639 if (val == 0) return 1;
3640 if (no_cp932ext_f && (
3641 (val>>8) == 0x2D || /* NEC special characters */
3642 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* A result row equal to SO is reported as X0201. */
3650 if (c2 == SO) c2 = X0201;
/*
 * nkf_each_char_to_hex: feed the hexadecimal digits of c to the callback
 * f, one digit per call, as f(0, digit).  The loop that steps shift is not
 * part of this listing.
 */
3657 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3664 (*f)(0, bin2hex(c>>shift));
/*
 * encode_fallback_html: emit the decimal digits of c through oconv, most
 * significant first.  Leading digits are only written when c is large
 * enough to need them.  The surrounding prefix and suffix output is not
 * part of this listing.
 */
3674 void encode_fallback_html(nkf_char c)
3679 if(c >= NKF_INT32_C(1000000))
3680 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3681 if(c >= NKF_INT32_C(100000))
3682 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3684 (*oconv)(0, 0x30+(c/10000 )%10);
3686 (*oconv)(0, 0x30+(c/1000 )%10);
3688 (*oconv)(0, 0x30+(c/100 )%10);
3690 (*oconv)(0, 0x30+(c/10 )%10);
3692 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml: emit c as hexadecimal digits through oconv using
 * nkf_each_char_to_hex.  The surrounding prefix and suffix output is not
 * part of this listing.
 */
3697 void encode_fallback_xml(nkf_char c)
3702 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: emit c as hexadecimal digits through oconv.
 * Code points rejected by is_unicode_bmp get two extra leading digits
 * (bits 20 and up, then bits 16..19); every code point then gets the four
 * digits for bits 12, 8, 4 and 0.  The escape prefix is not part of this
 * listing.
 */
3707 void encode_fallback_java(nkf_char c)
3711 if(!is_unicode_bmp(c)){
3715 (*oconv)(0, bin2hex(c>>20));
3716 (*oconv)(0, bin2hex(c>>16));
3720 (*oconv)(0, bin2hex(c>>12));
3721 (*oconv)(0, bin2hex(c>> 8));
3722 (*oconv)(0, bin2hex(c>> 4));
3723 (*oconv)(0, bin2hex(c ));
/*
 * encode_fallback_perl: emit c as hexadecimal digits through oconv using
 * nkf_each_char_to_hex.  The surrounding prefix and suffix output is not
 * part of this listing.
 */
3727 void encode_fallback_perl(nkf_char c)
3732 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: ignore the incoming code point and output the
 * configured unicode_subchar instead, split into its high and low bytes.
 */
3737 void encode_fallback_subchar(nkf_char c)
3739 c = unicode_subchar;
3740 (*oconv)((c>>8)&0xFF, c&0xFF);
3745 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: map an internal code (c2 = row or charset tag, c1 = cell) to a
 * Unicode value using the euc_to_utf8_* and x0212_to_utf8_2bytes tables.
 * The two-byte table is selected by ms_ucs_map_f.  Row and cell are
 * rebased with (x & 0x7f) - 0x21 and range-checked before indexing.
 * The final table read and the return are not part of this listing.
 */
3746 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3748 const unsigned short *p;
3751 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3759 p = euc_to_utf8_1byte;
3761 } else if (is_eucg3(c2)){
3762 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3765 c2 = (c2&0x7f) - 0x21;
/* NOTE(review): the x0212 table is bounded with sizeof_euc_to_utf8_2bytes - confirm both tables have the same row count. */
3766 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3767 p = x0212_to_utf8_2bytes[c2];
3773 c2 = (c2&0x7f) - 0x21;
3774 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3776 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3777 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3778 euc_to_utf8_2bytes_ms[c2];
3783 c1 = (c1 & 0x7f) - 0x21;
3784 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv: output converter that writes one character as UTF-8 through
 * o_putc.
 *  - output_bom_f is cleared on a path whose condition is not in this
 *    listing.
 *  - Under NUMCHAR_OPTION, a c1 accepted by is_unicode_capsule is masked
 *    with VALUE_MASK and encoded directly as 2, 3 or 4 bytes.
 *  - ISO8859_1 input is written as a single byte with the high bit set.
 *  - Other codes are converted with e2w_conv and encoded with w16w_conv;
 *    the third byte is written only when it is non-zero.
 */
3789 void w_oconv(nkf_char c2, nkf_char c1)
3795 output_bom_f = FALSE;
3806 #ifdef NUMCHAR_OPTION
3807 if (c2 == 0 && is_unicode_capsule(c1)){
3808 val = c1 & VALUE_MASK;
3811 }else if (val < 0x800){
3812 (*o_putc)(0xC0 | (val >> 6));
3813 (*o_putc)(0x80 | (val & 0x3f));
3814 } else if (val <= NKF_INT32_C(0xFFFF)) {
3815 (*o_putc)(0xE0 | (val >> 12));
3816 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3817 (*o_putc)(0x80 | (val & 0x3f));
3818 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3819 (*o_putc)(0xF0 | ( val>>18));
3820 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3821 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3822 (*o_putc)(0x80 | ( val & 0x3f));
3829 output_mode = ASCII;
3831 } else if (c2 == ISO8859_1) {
3832 output_mode = ISO8859_1;
3833 (*o_putc)(c1 | 0x080);
3836 val = e2w_conv(c2, c1);
3838 w16w_conv(val, &c2, &c1, &c0);
3842 if (c0) (*o_putc)(c0);
/*
 * w_oconv16: output converter that writes one character as UTF-16 through
 * o_putc, in the byte order selected by output_endian.
 *  - output_bom_f is cleared and a 0xFF byte is written in either byte
 *    order; presumably this is the byte order mark - the other byte is not
 *    in this listing.
 *  - Under NUMCHAR_OPTION, a c1 accepted by is_unicode_capsule is written
 *    as one unit when is_unicode_bmp accepts it, or as a surrogate pair
 *    when it is at most UNICODE_MAX.
 *  - Other codes are converted with e2w_conv first.
 */
3848 void w_oconv16(nkf_char c2, nkf_char c1)
3851 output_bom_f = FALSE;
3852 if (output_endian == ENDIAN_LITTLE){
3853 (*o_putc)((unsigned char)'\377');
3857 (*o_putc)((unsigned char)'\377');
3866 if (c2 == ISO8859_1) {
3869 #ifdef NUMCHAR_OPTION
3870 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3871 if (is_unicode_bmp(c1)) {
3872 c2 = (c1 >> 8) & 0xff;
3876 if (c1 <= UNICODE_MAX) {
/* 0xD7C0 is 0xD800 minus (0x10000 >> 10), so the 0x10000 offset is removed by the shift. */
3877 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3878 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3879 if (output_endian == ENDIAN_LITTLE){
3880 (*o_putc)(c2 & 0xff);
3881 (*o_putc)((c2 >> 8) & 0xff);
3882 (*o_putc)(c1 & 0xff);
3883 (*o_putc)((c1 >> 8) & 0xff);
3885 (*o_putc)((c2 >> 8) & 0xff);
3886 (*o_putc)(c2 & 0xff);
3887 (*o_putc)((c1 >> 8) & 0xff);
3888 (*o_putc)(c1 & 0xff);
3895 nkf_char val = e2w_conv(c2, c1);
3896 c2 = (val >> 8) & 0xff;
3900 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output converter that writes one character as UTF-32 through
 * o_putc, in the byte order selected by output_endian.  output_bom_f is
 * cleared and a 0xFF byte is written in either byte order (presumably
 * part of the byte order mark; the other bytes are not in this listing).
 * Codes that are neither ISO8859_1 nor a Unicode capsule are converted
 * with e2w_conv before being split into bytes.
 */
3909 void w_oconv32(nkf_char c2, nkf_char c1)
3912 output_bom_f = FALSE;
3913 if (output_endian == ENDIAN_LITTLE){
3914 (*o_putc)((unsigned char)'\377');
3922 (*o_putc)((unsigned char)'\377');
3931 if (c2 == ISO8859_1) {
3933 #ifdef NUMCHAR_OPTION
3934 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3938 c1 = e2w_conv(c2, c1);
3941 if (output_endian == ENDIAN_LITTLE){
3942 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3943 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3944 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3948 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3949 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3950 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * e_oconv: output converter that writes one character as EUC-JP through
 * o_putc and records the charset in output_mode.
 *  - Under NUMCHAR_OPTION, a Unicode capsule is first converted with
 *    w16e_conv.  If it is still a capsule, values 0xE000..0xE757 are
 *    mapped to EUC rows when x0212_f is set; otherwise encode_fallback is
 *    called if it is set.
 *  - X0201 is written as SSO followed by the byte with the high bit set.
 *  - ISO8859_1 is written as a single byte with the high bit set.
 *  - EUC G3 codes are, under SHIFTJIS_CP932, round-tripped through
 *    e2s_conv and s2e_conv before output.
 *  - Plain two-byte codes must pass nkf_isgraph for both bytes; otherwise
 *    set_iconv(FALSE, 0) is called and the character is dropped.
 */
3955 void e_oconv(nkf_char c2, nkf_char c1)
3957 #ifdef NUMCHAR_OPTION
3958 if (c2 == 0 && is_unicode_capsule(c1)){
3959 w16e_conv(c1, &c2, &c1);
3960 if (c2 == 0 && is_unicode_capsule(c1)){
3961 c2 = c1 & VALUE_MASK;
3962 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
3966 c2 += c2 < 10 ? 0x75 : 0x8FEB;
3967 c1 = 0x21 + c1 % 94;
3970 (*o_putc)((c2 & 0x7f) | 0x080);
3971 (*o_putc)(c1 | 0x080);
3973 (*o_putc)((c2 & 0x7f) | 0x080);
3974 (*o_putc)(c1 | 0x080);
3978 if (encode_fallback) (*encode_fallback)(c1);
3987 } else if (c2 == 0) {
3988 output_mode = ASCII;
3990 } else if (c2 == X0201) {
3991 output_mode = JAPANESE_EUC;
3992 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3993 } else if (c2 == ISO8859_1) {
3994 output_mode = ISO8859_1;
3995 (*o_putc)(c1 | 0x080);
3997 } else if (is_eucg3(c2)){
3998 output_mode = JAPANESE_EUC;
3999 #ifdef SHIFTJIS_CP932
4002 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4003 s2e_conv(s2, s1, &c2, &c1);
4008 output_mode = ASCII;
4010 }else if (is_eucg3(c2)){
4013 (*o_putc)((c2 & 0x7f) | 0x080);
4014 (*o_putc)(c1 | 0x080);
4017 (*o_putc)((c2 & 0x7f) | 0x080);
4018 (*o_putc)(c1 | 0x080);
4022 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4023 set_iconv(FALSE, 0);
4024 return; /* too late to rescue this char */
4026 output_mode = JAPANESE_EUC;
4027 (*o_putc)(c2 | 0x080);
4028 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: move rows 0x75..0x7f of c to a higher row range.  Two
 * variants are visible, rebasing to 0x109 and to 0x113; the condition
 * that chooses between them is not part of this listing.
 */
4033 nkf_char x0212_shift(nkf_char c)
4038 if (0x75 <= c && c <= 0x7f){
4039 ret = c + (0x109 - 0x75);
4042 if (0x75 <= c && c <= 0x7f){
4043 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map a shifted row back to the 0x75-based range.
 * Rows 0x7f..0x88 are rebased directly; rows 0x89..0x92 are rebased and
 * additionally tagged with PREFIX_EUCG3 | 0x80.
 */
4050 nkf_char x0212_unshift(nkf_char c)
4053 if (0x7f <= c && c <= 0x88){
4054 ret = c + (0x75 - 0x7f);
4055 }else if (0x89 <= c && c <= 0x92){
4056 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4060 #endif /* X0212_ENABLE */
/*
 * e2s_conv: convert an internal EUC/JIS code (c2 = row, c1 = cell) to a
 * Shift_JIS byte pair stored through p2 (lead) and p1 (trail); either
 * pointer may be null.  Callers in this file treat a return of 0 as
 * success, and a row above 0x7F returns 1.
 * Special row ranges (0x21..0x2F and 0x6E..0x7E of ndx) and a table lookup
 * through x0212_shiftjis are handled before the generic formula; the
 * conditions selecting those paths are not part of this listing.
 */
4062 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4068 if((0x21 <= ndx && ndx <= 0x2F)){
4069 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4070 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4072 }else if(0x6E <= ndx && ndx <= 0x7E){
4073 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4074 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4080 else if(nkf_isgraph(ndx)){
4082 const unsigned short *ptr;
4083 ptr = x0212_shiftjis[ndx - 0x21];
4085 val = ptr[(c1 & 0x7f) - 0x21];
4094 c2 = x0212_shift(c2);
4096 #endif /* X0212_ENABLE */
4098 if(0x7F < c2) return 1;
/* Two rows share one lead byte; odd rows use the lower trail range, even rows the upper one. */
4099 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4100 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s_oconv: output converter that writes one character as Shift_JIS and
 * records the charset in output_mode.
 *  - Under NUMCHAR_OPTION, a Unicode capsule is first converted with
 *    w16e_conv.  If it is still a capsule, values 0xE000..0xE757 are
 *    mapped to lead bytes from 0xF0 (188 cells each) when x0213_f is
 *    clear; otherwise encode_fallback is called if it is set.
 *  - EUC G3 codes and plain two-byte codes are converted with e2s_conv.
 *    Plain codes must pass nkf_isprint for both bytes; otherwise
 *    set_iconv(FALSE, 0) is called and the character is dropped.
 *  - Under SHIFTJIS_CP932 the result may be remapped through cp932inv.
 *  - A non-zero prefix_table entry for the trail byte is written before it.
 */
4104 void s_oconv(nkf_char c2, nkf_char c1)
4106 #ifdef NUMCHAR_OPTION
4107 if (c2 == 0 && is_unicode_capsule(c1)){
4108 w16e_conv(c1, &c2, &c1);
4109 if (c2 == 0 && is_unicode_capsule(c1)){
4110 c2 = c1 & VALUE_MASK;
4111 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4114 c2 = c1 / 188 + 0xF0;
/* Skip the unused trail byte 0x7F: offsets above 0x3e move up by one. */
4116 c1 += 0x40 + (c1 > 0x3e);
4121 if(encode_fallback)(*encode_fallback)(c1);
4130 } else if (c2 == 0) {
4131 output_mode = ASCII;
4133 } else if (c2 == X0201) {
4134 output_mode = SHIFT_JIS;
4136 } else if (c2 == ISO8859_1) {
4137 output_mode = ISO8859_1;
4138 (*o_putc)(c1 | 0x080);
4140 } else if (is_eucg3(c2)){
4141 output_mode = SHIFT_JIS;
4142 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4148 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4149 set_iconv(FALSE, 0);
4150 return; /* too late to rescue this char */
4152 output_mode = SHIFT_JIS;
4153 e2s_conv(c2, c1, &c2, &c1);
4155 #ifdef SHIFTJIS_CP932
4157 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4158 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4164 #endif /* SHIFTJIS_CP932 */
4167 if (prefix_table[(unsigned char)c1]){
4168 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * j_oconv: output converter for the JIS (escape-sequence) family.  It
 * tracks the active charset in output_mode and writes designator bytes
 * through o_putc whenever the charset of the next character differs
 * (ASCII, X0201, X0208, X0212, X0213 plane 1 and plane 2).
 *  - Under NUMCHAR_OPTION, a Unicode capsule is first converted with
 *    w16e_conv.  If it is still a capsule, values 0xE000..0xE757 are
 *    mapped to rows from 0x7F when ms_ucs_map_f is set; otherwise
 *    encode_fallback is called if it is set.
 *  - Two-byte codes outside the accepted row and cell ranges are dropped
 *    by the early return near the end.
 * Several statements, including the escape bytes themselves, are missing
 * from this listing.
 */
4174 void j_oconv(nkf_char c2, nkf_char c1)
4176 #ifdef NUMCHAR_OPTION
4177 if (c2 == 0 && is_unicode_capsule(c1)){
4178 w16e_conv(c1, &c2, &c1);
4179 if (c2 == 0 && is_unicode_capsule(c1)){
4180 c2 = c1 & VALUE_MASK;
4181 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4184 c2 = 0x7F + c1 / 94;
4185 c1 = 0x21 + c1 % 94;
4187 if (encode_fallback) (*encode_fallback)(c1);
4194 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4197 (*o_putc)(ascii_intro);
4198 output_mode = ASCII;
4202 } else if (is_eucg3(c2)){
4204 if(output_mode!=X0213_2){
4205 output_mode = X0213_2;
4209 (*o_putc)(X0213_2&0x7F);
4212 if(output_mode!=X0212){
4213 output_mode = X0212;
4217 (*o_putc)(X0212&0x7F);
4220 (*o_putc)(c2 & 0x7f);
4223 } else if (c2==X0201) {
4224 if (output_mode!=X0201) {
4225 output_mode = X0201;
4231 } else if (c2==ISO8859_1) {
4232 /* iso8859 introduction, or 8th bit on */
4233 /* Can we convert in 7bit form using ESC-'-'-A ?
4235 output_mode = ISO8859_1;
4237 } else if (c2 == 0) {
4238 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4241 (*o_putc)(ascii_intro);
4242 output_mode = ASCII;
4247 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4248 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4250 if (output_mode!=X0213_1) {
4251 output_mode = X0213_1;
4255 (*o_putc)(X0213_1&0x7F);
4257 }else if (output_mode != X0208) {
4258 output_mode = X0208;
4261 (*o_putc)(kanji_intro);
/*
 * base64_conv: filter stage that calls mime_prechar for the character and
 * then forwards it unchanged to the next stage, o_base64conv.
 */
4268 void base64_conv(nkf_char c2, nkf_char c1)
4270 mime_prechar(c2, c1);
4271 (*o_base64conv)(c2,c1);
/*
 * Push-back state shared by broken_getc and broken_ungetc: broken_buf is
 * a small stack of pending characters, broken_counter its depth, and
 * broken_last is compared against ESC below.
 */
4275 static nkf_char broken_buf[3];
4276 static int broken_counter = 0;
4277 static int broken_last = 0;
/*
 * broken_getc: input filter.  Pending characters in broken_buf are
 * returned first, last pushed first.  Otherwise, a dollar sign or an open
 * parenthesis that was not preceded by ESC, seen in the listed input
 * modes, triggers a look at the next character; when that is one of the
 * known designator finals (@ or B after the dollar sign, J or B after the
 * parenthesis) both characters are stashed in broken_buf.
 * NOTE(review): presumably the function then returns ESC so that a
 * stripped escape sequence is restored - that statement is not part of
 * this listing, confirm.
 */
4278 nkf_char broken_getc(FILE *f)
4282 if (broken_counter>0) {
4283 return broken_buf[--broken_counter];
4286 if (c=='$' && broken_last != ESC
4287 && (input_mode==ASCII || input_mode==X0201)) {
4290 if (c1=='@'|| c1=='B') {
4291 broken_buf[0]=c1; broken_buf[1]=c;
4298 } else if (c=='(' && broken_last != ESC
4299 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4302 if (c1=='J'|| c1=='B') {
4303 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc: push c back onto broken_buf so that broken_getc returns
 * it next.  The push is skipped when two characters are already pending.
 * The FILE argument is not used by the visible lines.
 */
4316 nkf_char broken_ungetc(nkf_char c, FILE *f)
4318 if (broken_counter<2)
4319 broken_buf[broken_counter++]=c;
/*
 * nl_conv: newline filter stage.
 *  - When guess_f is set it records the newline style seen on input in
 *    input_nextline (CR, LF or CRLF), switching to EOF as soon as two
 *    different styles have been seen.
 *  - On output, a pending CR or an LF is replaced by the newline selected
 *    by nlmode_f: CR is written unless nlmode_f is LF, LF is written
 *    unless nlmode_f is CR.
 *  - A lone CR is held in prev_cr until the next character arrives; other
 *    characters are forwarded to o_nlconv.
 */
4323 void nl_conv(nkf_char c2, nkf_char c1)
4325 if (guess_f && input_nextline != EOF) {
4326 if (c2 == 0 && c1 == LF) {
4327 if (!input_nextline) input_nextline = prev_cr ? CRLF : LF;
4328 else if (input_nextline != (prev_cr ? CRLF : LF)) input_nextline = EOF;
4329 } else if (c2 == 0 && c1 == CR && input_nextline == LF) input_nextline = EOF;
4331 else if (!input_nextline) input_nextline = CR;
4332 else if (input_nextline != CR) input_nextline = EOF;
/* Operator precedence makes this prev_cr || (c2 == 0 && c1 == LF). */
4334 if (prev_cr || c2 == 0 && c1 == LF) {
4336 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4337 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4339 if (c2 == 0 && c1 == CR) prev_cr = CR;
4340 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4344 Return value of fold_conv()
4346 LF add newline and output char
4347 CR add newline and output nothing
4350 1 (or else) normal output
4352 fold state in prev (previous character)
4354 >0x80 Japanese (X0208/X0201)
4359 This fold algorithm does not preserve leading space in a line.
4360 This is the main difference from fmt.
/* Column width of a character: 2 when c2 is non-zero, otherwise 1.  c1 is unused. */
4363 #define char_size(c2,c1) (c2?2:1)
/*
 * fold_conv: line-folding filter stage.  For each character it computes
 * fold_state (0 = drop, LF = newline then character, CR = newline only,
 * SP = single space, 1 = plain output) from the current column f_line,
 * the previous character class f_prev, the limit fold_len and the slack
 * fold_margin, then acts on it in the final switch.
 * Characters listed in the kinsoku sections are kept on the current line
 * (fold_state = 1) instead of starting a new one.
 * Many statements are missing from this listing; only visible ones are
 * annotated.
 */
4365 void fold_conv(nkf_char c2, nkf_char c1)
4368 nkf_char fold_state;
4370 if (c1== CR && !fold_preserve_f) {
4371 fold_state=0; /* ignore cr */
4372 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4374 fold_state=0; /* ignore cr */
4375 } else if (c1== BS) {
4376 if (f_line>0) f_line--;
4378 } else if (c2==EOF && f_line != 0) { /* close open last line */
4380 } else if ((c1==LF && !fold_preserve_f)
4381 || ((c1==CR||(c1==LF&&f_prev!=CR))
4382 && fold_preserve_f)) {
4384 if (fold_preserve_f) {
4388 } else if ((f_prev == c1 && !fold_preserve_f)
4389 || (f_prev == LF && fold_preserve_f)
4390 ) { /* duplicate newline */
4393 fold_state = LF; /* output two newline */
4399 if (f_prev&0x80) { /* Japanese? */
4401 fold_state = 0; /* ignore given single newline */
4402 } else if (f_prev==SP) {
4406 if (++f_line<=fold_len)
4410 fold_state = CR; /* fold and output nothing */
4414 } else if (c1=='\f') {
4417 fold_state = LF; /* output newline and clear */
4418 } else if ( (c2==0 && c1==SP)||
4419 (c2==0 && c1==TAB)||
4420 (c2=='!'&& c1=='!')) {
4421 /* X0208 kankaku or ascii space */
4423 fold_state = 0; /* remove duplicate spaces */
4426 if (++f_line<=fold_len)
4427 fold_state = SP; /* output ASCII space only */
4429 f_prev = SP; f_line = 0;
4430 fold_state = CR; /* fold and output nothing */
4434 prev0 = f_prev; /* we still need this one... , but almost done */
/* NOTE(review): c2==X0201 is redundant here if X0201 is non-zero, since c2 alone is already true - confirm. */
4436 if (c2 || c2==X0201)
4437 f_prev |= 0x80; /* this is Japanese */
4438 f_line += char_size(c2,c1);
4439 if (f_line<=fold_len) { /* normal case */
4442 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4443 f_line = char_size(c2,c1);
4444 fold_state = LF; /* We can't wait, do fold now */
4445 } else if (c2==X0201) {
4446 /* simple kinsoku rules return 1 means no folding */
4447 if (c1==(0xde&0x7f)) fold_state = 1; /* halfwidth voiced sound mark */
4448 else if (c1==(0xdf&0x7f)) fold_state = 1; /* halfwidth semi-voiced sound mark */
4449 else if (c1==(0xa4&0x7f)) fold_state = 1; /* halfwidth ideographic comma */
4450 else if (c1==(0xa3&0x7f)) fold_state = 1; /* halfwidth right corner bracket */
4451 else if (c1==(0xa1&0x7f)) fold_state = 1; /* halfwidth ideographic full stop */
4452 else if (c1==(0xb0&0x7f)) fold_state = 1; /* halfwidth prolonged sound mark */
4453 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4455 fold_state = LF;/* add one new f_line before this character */
4458 fold_state = LF;/* add one new f_line before this character */
4461 /* kinsoku point in ASCII */
4462 if ( c1==')'|| /* { [ ( */
4473 /* just after special */
4474 } else if (!is_alnum(prev0)) {
4475 f_line = char_size(c2,c1);
4477 } else if ((prev0==SP) || /* ignored new f_line */
4478 (prev0==LF)|| /* ignored new f_line */
4479 (prev0&0x80)) { /* X0208 - ASCII */
4480 f_line = char_size(c2,c1);
4481 fold_state = LF;/* add one new f_line before this character */
4483 fold_state = 1; /* default no fold in ASCII */
/* The labels below are the JIS X 0208 codes the original comments encoded (row 0x21, cell c1). */
4487 if (c1=='"') fold_state = 1; /* JIS X 0208 0x2122, ideographic comma */
4488 else if (c1=='#') fold_state = 1; /* JIS X 0208 0x2123, ideographic full stop */
4489 else if (c1=='W') fold_state = 1; /* JIS X 0208 0x2157, right corner bracket */
4490 else if (c1=='K') fold_state = 1; /* JIS X 0208 0x214B, fullwidth right parenthesis */
4491 else if (c1=='$') fold_state = 1; /* JIS X 0208 0x2124, fullwidth comma */
4492 else if (c1=='%') fold_state = 1; /* JIS X 0208 0x2125, fullwidth full stop */
4493 else if (c1=='\'') fold_state = 1; /* NOTE(review): the original comment bytes decode to JIS X 0208 0x215C, which does not match the cell 0x27 tested here - confirm the intended character */
4494 else if (c1=='(') fold_state = 1; /* JIS X 0208 0x2128, fullwidth semicolon */
4495 else if (c1==')') fold_state = 1; /* JIS X 0208 0x2129, fullwidth question mark */
4496 else if (c1=='*') fold_state = 1; /* JIS X 0208 0x212A, fullwidth exclamation mark */
4497 else if (c1=='+') fold_state = 1; /* JIS X 0208 0x212B, voiced sound mark */
4498 else if (c1==',') fold_state = 1; /* JIS X 0208 0x212C, semi-voiced sound mark */
4499 /* default no fold in kinsoku */
4502 f_line = char_size(c2,c1);
4503 /* add one new f_line before this character */
4506 f_line = char_size(c2,c1);
4508 /* add one new f_line before this character */
4513 /* terminator process */
4514 switch(fold_state) {
/* One-character look-behind used by z_conv to combine an X0201 kana with a following sound mark. */
4533 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: width-conversion filter stage, forwarding to o_zconv.
 *  - X0201 kana are held in z_prev2/z_prev1; if the next character is a
 *    voiced or semi-voiced sound mark the combined form from the dv or ev
 *    table is sent, otherwise the plain form from cv.
 *  - Depending on the bits of alpha_f, JIS X 0208 alphabet (row 0x23) and
 *    symbol (row 0x21) characters are converted, and with bit 8 some ASCII
 *    characters are replaced by the strings assigned to entity.
 *  - JIS X 0208 katakana (row 0x25) and some row 0x21 marks are converted
 *    to X0201 through fullwidth_to_halfwidth; the low byte of an entry,
 *    when present, is a sound mark sent as a second X0201 character.
 * NOTE(review): the four entity strings further down each map a character
 * to itself and the one for the double quote is three quote characters,
 * which is not valid C.  They look like HTML entities that were unescaped
 * when this listing was produced - confirm against the real source.
 */
4535 void z_conv(nkf_char c2, nkf_char c1)
4538 /* if (c2) c1 &= 0x7f; assertion */
4540 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4546 if (z_prev2 == X0201) {
4548 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
4550 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4552 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
4554 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4559 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4562 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4563 /* wait for dakuten or handakuten */
4568 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4579 if (alpha_f&1 && c2 == 0x23) {
4580 /* JISX0208 Alphabet */
4582 } else if (c2 == 0x21) {
4583 /* JISX0208 Kigou */
4588 } else if (alpha_f&4) {
4593 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4599 if (alpha_f&8 && c2 == 0) {
4603 case '>': entity = ">"; break;
4604 case '<': entity = "<"; break;
4605 case '\"': entity = """; break;
4606 case '&': entity = "&"; break;
4609 while (*entity) (*o_zconv)(0, *entity++);
4615 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4620 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4624 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4628 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4632 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4636 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4640 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4644 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4648 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4653 (*o_zconv)(X0201, c);
4656 } else if (c2 == 0x25) {
4657 /* JISX0208 Katakana */
4658 static const int fullwidth_to_halfwidth[] =
4660 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4661 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4662 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4663 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4664 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4665 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4666 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4667 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4668 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4669 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4670 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4671 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4673 if (fullwidth_to_halfwidth[c1-0x20]){
4674 c2 = fullwidth_to_halfwidth[c1-0x20];
4675 (*o_zconv)(X0201, c2>>8);
4677 (*o_zconv)(X0201, c2&0xFF);
/*
 * rot13(c): rotate a letter by 13 places within its own case.  Letters up
 * to M or m move forward, the remaining letters move back.  The range
 * guards between the visible lines are not part of this listing.  The
 * argument is evaluated more than once.
 */
4687 #define rot13(c) ( \
4689 (c <= 'M') ? (c + 13): \
4690 (c <= 'Z') ? (c - 13): \
4692 (c <= 'm') ? (c + 13): \
4693 (c <= 'z') ? (c - 13): \
/*
 * rot47(c): rotate by 47 places; values up to the letter O move forward,
 * values up to the tilde move back.  The lower range guard is not part of
 * this listing.  The argument is evaluated more than once.
 */
4697 #define rot47(c) ( \
4699 ( c <= 'O') ? (c + 47) : \
4700 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv: rotation filter stage.  Single-byte classes (c2 of 0, X0201
 * or ISO8859_1) take one branch and everything else another; the
 * transformed pair is forwarded to o_rot_conv.  The transformations
 * themselves are not part of this listing.
 */
4704 void rot_conv(nkf_char c2, nkf_char c1)
4706 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4712 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: hiragana/katakana conversion filter stage, forwarding to
 * o_hira_conv.  Visible cases:
 *  - cells 0x21..0x73 of a kana row are converted and forwarded;
 *  - cell 0x74 is sent as the Unicode capsule for U+3094, but only when
 *    the output converter is w_oconv or w_oconv16;
 *  - row 0x21 cells 0x33/0x34 and 0x35/0x36 are handled as pairs;
 *  - the opposite direction starts from the U+3094 capsule or row 0x24.
 * The flag tests choosing the direction are not part of this listing.
 */
4715 void hira_conv(nkf_char c2, nkf_char c1)
4719 if (0x20 < c1 && c1 < 0x74) {
4721 (*o_hira_conv)(c2,c1);
4723 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4725 c1 = CLASS_UNICODE | 0x3094;
4726 (*o_hira_conv)(c2,c1);
4729 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4731 (*o_hira_conv)(c2,c1);
4736 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4739 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4741 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4745 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: filter stage that inspects each character before
 * forwarding it to o_iso2022jp_check_conv.  Three tests are visible:
 * c2 in 0x00..0x20 with c1 in 0x7f..0xff, rows 0x29..0x2f or 0x75..0x7e,
 * and membership of c in one of the start/end pairs of the range table.
 * What is done for a matching character is not part of this listing.
 */
4749 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4751 static const nkf_char range[RANGE_NUM_MAX][2] = {
4772 nkf_char start, end, c;
4774 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4778 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4783 for (i = 0; i < RANGE_NUM_MAX; i++) {
4784 start = range[i][0];
4787 if (c >= start && c <= end) {
4792 (*o_iso2022jp_check_conv)(c2,c1);
4796 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * MIME encoded-word prefixes matched by mime_begin_strict.  \075 is the
 * octal escape for the equals sign.  The entry index is shared with
 * mime_priority_func, which mime_begin_strict indexes with the same j;
 * mime_encode and mime_encode_method list matching values in the same
 * order.
 */
4798 static const unsigned char *mime_pattern[] = {
4799 (const unsigned char *)"\075?EUC-JP?B?",
4800 (const unsigned char *)"\075?SHIFT_JIS?B?",
4801 (const unsigned char *)"\075?ISO-8859-1?Q?",
4802 (const unsigned char *)"\075?ISO-8859-1?B?",
4803 (const unsigned char *)"\075?ISO-2022-JP?B?",
4804 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4805 #if defined(UTF8_INPUT_ENABLE)
4806 (const unsigned char *)"\075?UTF-8?B?",
4807 (const unsigned char *)"\075?UTF-8?Q?",
4809 (const unsigned char *)"\075?US-ASCII?Q?",
4814 /* Marker used to raise the priority of the corresponding input code */
/*
 * Input converter to install for each mime_pattern entry (same index);
 * 0 means no converter is associated with that pattern.
 */
4815 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4816 e_iconv, s_iconv, 0, 0, 0, 0,
4817 #if defined(UTF8_INPUT_ENABLE)
/* Charset constant for each mime_pattern entry, listed in the same order as the patterns. */
4823 static const nkf_char mime_encode[] = {
4824 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4825 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding letter (B or Q) for each mime_pattern entry, in the same order as the patterns. */
4832 static const nkf_char mime_encode_method[] = {
4833 'B', 'B','Q', 'B', 'B', 'Q',
4834 #if defined(UTF8_INPUT_ENABLE)
/* Size of the recovery buffer in mime_begin_strict and the scan limit in mime_begin. */
4842 #define MAXRECOVER 20
/*
 * switch_mime_getc: install mime_getc/mime_ungetc as the active input
 * functions, saving the previous ones in i_mgetc/i_mungetc.  In
 * STRICT_MIME mode a second layer is added: the saved functions move to
 * i_mgetc_buf/i_mungetc_buf and the buffered variants take their place.
 * Does nothing if mime_getc is already installed.
 */
4844 void switch_mime_getc(void)
4846 if (i_getc!=mime_getc) {
4847 i_mgetc = i_getc; i_getc = mime_getc;
4848 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4849 if(mime_f==STRICT_MIME) {
4850 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4851 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc.  The STRICT_MIME buffer
 * layer is removed first, then the saved unget function is restored (the
 * matching restore of i_getc is not part of this listing).  If an input
 * converter was saved in mime_iconv_back it is reinstalled with set_iconv
 * and the saved pointer is cleared.
 */
4856 void unswitch_mime_getc(void)
4858 if(mime_f==STRICT_MIME) {
4859 i_mgetc = i_mgetc_buf;
4860 i_mungetc = i_mungetc_buf;
4863 i_ungetc = i_mungetc;
4864 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4865 mime_iconv_back = NULL;
/*
 * mime_begin_strict: called after the two leading characters of an
 * encoded word have been seen.  It reads further characters and matches
 * them, case-insensitively through nkf_toupper, against the entries of
 * mime_pattern.  When the current pattern stops matching, the following
 * patterns are tried; one qualifies if it agrees with the characters
 * already accepted and with the character just read.
 * On success mime_decode_mode is set from the pattern (the character two
 * positions before the end, i.e. the encoding letter), the current
 * converter is saved in mime_iconv_back and replaced by
 * mime_priority_func[j], and for B encoding the function hands over to
 * mime_integrity.  If no pattern matches, the characters read so far are
 * replayed from the recovery buffer r (replay code not in this listing).
 */
4868 nkf_char mime_begin_strict(FILE *f)
4872 const unsigned char *p,*q;
4873 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4875 mime_decode_mode = FALSE;
4876 /* =? has been checked */
4878 p = mime_pattern[j];
4881 for(i=2;p[i]>SP;i++) { /* start at =? */
4882 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4883 /* pattern fails, try next one */
4885 while (mime_pattern[++j]) {
4886 p = mime_pattern[j];
4887 for(k=2;k<i;k++) /* assume length(p) > i */
4888 if (p[k]!=q[k]) break;
4889 if (k==i && nkf_toupper(c1)==p[k]) break;
4891 p = mime_pattern[j];
4892 if (p) continue; /* found next one, continue */
4893 /* all fails, output from recovery buffer */
4901 mime_decode_mode = p[i-2];
4903 mime_iconv_back = iconv;
4904 set_iconv(FALSE, mime_priority_func[j]);
4905 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4907 if (mime_decode_mode=='B') {
4908 mimebuf_f = unbuf_f;
4910 /* do MIME integrity check */
4911 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: read the next character either from the underlying input
 * function (when mimebuf_f is set) or from the MIME FIFO at mime_input,
 * advancing mime_input.
 */
4919 nkf_char mime_getc_buf(FILE *f)
4921 /* we do not keep the EOF of the Fifo, because it contains ?= as
4922 a terminator. It was checked in mime_integrity. */
4923 return ((mimebuf_f)?
4924 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf: counterpart of mime_getc_buf.  The character is either
 * handed back to the underlying unget function or written back into the
 * MIME FIFO after stepping mime_input back by one.  The condition choosing
 * between the two is not part of this listing.
 */
4927 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4930 (*i_mungetc_buf)(c,f);
4932 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict detection of a MIME encoded-word header.  Every
 * character read is also appended to the MIME FIFO so that the input can
 * be replayed if the header turns out to be false.
 * After the two leading characters, up to MAXRECOVER characters of the
 * charset name are skipped (newline, space, CR, hyphen, underscore and
 * alphanumerics), then the encoding letter selects mime_decode_mode B or
 * Q.  If no valid header is found, mime_decode_mode is set to 1, which
 * means no decoding while input is still taken from the buffer.
 * Returns the last character read; the caller only uses it to detect EOF.
 */
4936 nkf_char mime_begin(FILE *f)
4941 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4942 /* re-read and convert again from mime_buffer. */
4944 /* =? has been checked */
4946 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4947 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4948 /* We accept any character type even if it is broken by new lines */
4949 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4950 if (c1==LF||c1==SP||c1==CR||
4951 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4953 /* Failed. But this could be another MIME preamble */
4961 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4962 if (!(++i<MAXRECOVER) || c1==EOF) break;
4963 if (c1=='b'||c1=='B') {
4964 mime_decode_mode = 'B';
4965 } else if (c1=='q'||c1=='Q') {
4966 mime_decode_mode = 'Q';
4970 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4971 if (!(++i<MAXRECOVER) || c1==EOF) break;
4973 mime_decode_mode = FALSE;
4979 if (!mime_decode_mode) {
4980 /* false MIME preamble, restart from mime_buffer */
4981 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4982 /* Since we are in MIME mode until buffer becomes empty, */
4983 /* we never go into mime_begin again for a while. */
4986 /* discard mime preamble, and goto MIME mode */
4988 /* do no MIME integrity check */
4989 return c1; /* used only for checking EOF */
/* no_putc: output function taking one character; NOTE(review): by its name presumably discards it - the body is not part of this listing. */
4993 void no_putc(nkf_char c)
/*
 * debug: print str followed by a newline to stderr; a null pointer is
 * printed as the text NULL.  Any condition guarding the print is not part
 * of this listing.
 */
4998 void debug(const char *str)
5001 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * set_input_codename: remember the name of the detected input encoding.
 * The first call stores the pointer itself (no copy is made).  A later
 * call with a different name replaces it with the empty string, which
 * print_guessed_code tests for.
 */
5006 void set_input_codename(char *codename)
5008 if (!input_codename) {
5009 input_codename = codename;
5010 } else if (strcmp(codename, input_codename) != 0) {
5011 input_codename = "";
5015 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code: print the result of encoding detection to stdout.
 * A non-null filename is printed first, followed by a colon and a space.
 * An input_codename that is set but empty takes its own branch (body not
 * in this listing).  Otherwise the line is built from:
 *  - input_codename, or ASCII when none was recorded;
 *  - a plus sign when the score of the active converter has any of
 *    SCORE_DEPEND, SCORE_CP932 or SCORE_NO_EXIST set;
 *  - the newline style recorded in input_nextline, with EOF reported as
 *    MIXED NL.
 */
5016 void print_guessed_code(char *filename)
5018 char *codename = "BINARY";
5019 char *str_nlmode = NULL;
5020 if (filename != NULL) printf("%s: ", filename);
5021 if (input_codename && !*input_codename) {
5024 struct input_code *p = find_inputcode_byfunc(iconv);
5026 (input_codename ? input_codename : "ASCII"),
5027 ((p->score & (SCORE_DEPEND|SCORE_CP932|SCORE_NO_EXIST)) ? "+" : ""),
5028 input_nextline == CR ? " (CR)" :
5029 input_nextline == LF ? " (LF)" :
5030 input_nextline == CRLF ? " (CRLF)" :
5031 input_nextline == EOF ? " (MIXED NL)" :
/*
 * hex_getc: shared reader for "<ch>XX" hexadecimal escapes (used below with
 * ':' for CAP input and '%' for URL input).
 *   ch - the escape introducer character
 *   f  - input stream
 *   g  - underlying getc function
 *   u  - underlying ungetc function
 * When both characters after the introducer are hex digits, returns the byte
 * they encode (c2 is the high nibble, c3 the low nibble).
 * NOTE(review): handling of a non-hex c2/c3 is not shown here; presumably the
 * characters are pushed back through u -- confirm.
 */
5039 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5041 nkf_char c1, c2, c3;
5047 if (!nkf_isxdigit(c2)){
5052 if (!nkf_isxdigit(c3)){
5057 return (hex2bin(c2) << 4) | hex2bin(c3);
/* cap_getc: getc stage that decodes ":XX" hex escapes via hex_getc, reading
 * through i_cgetc / i_cungetc. */
5060 nkf_char cap_getc(FILE *f)
5062 return hex_getc(':', f, i_cgetc, i_cungetc);
/* cap_ungetc: push c back by delegating to the i_cungetc hook; returns that
 * hook's result. */
5065 nkf_char cap_ungetc(nkf_char c, FILE *f)
5067 return (*i_cungetc)(c, f);
/* url_getc: getc stage that decodes "%XX" hex escapes via hex_getc, reading
 * through i_ugetc / i_uungetc. */
5070 nkf_char url_getc(FILE *f)
5072 return hex_getc('%', f, i_ugetc, i_uungetc);
/* url_ungetc: push c back by delegating to the i_uungetc hook; returns that
 * hook's result. */
5075 nkf_char url_ungetc(nkf_char c, FILE *f)
5077 return (*i_uungetc)(c, f);
5081 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: getc stage for numeric character references, reading through
 * i_ngetc / i_nungetc.  Characters are collected in buf; an 'x' or 'X'
 * selects the hexadecimal form (up to 7 digits), otherwise the decimal form
 * (up to 8 digits) is parsed.  On success the accumulated code point is
 * returned tagged with CLASS_UNICODE.
 * NOTE(review): the fallback for a malformed reference is not shown here;
 * presumably the collected characters are pushed back -- confirm.
 */
5082 nkf_char numchar_getc(FILE *f)
5084 nkf_char (*g)(FILE *) = i_ngetc;
5085 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
/* Hexadecimal form. */
5096 if (buf[i] == 'x' || buf[i] == 'X'){
5097 for (j = 0; j < 7; j++){
5099 if (!nkf_isxdigit(buf[i])){
5106 c |= hex2bin(buf[i]);
/* Decimal form. */
5109 for (j = 0; j < 8; j++){
5113 if (!nkf_isdigit(buf[i])){
5120 c += hex2bin(buf[i]);
5126 return CLASS_UNICODE | c;
/* numchar_ungetc: push c back by delegating to the i_nungetc hook; returns
 * that hook's result. */
5135 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5137 return (*i_nungetc)(c, f);
5141 #ifdef UNICODE_NORMALIZATION
5143 /* Normalization Form C */
/*
 * nfc_getc: getc stage that composes decomposed sequences, reading through
 * i_nfc_getc / i_nfc_ungetc.  The bytes collected in buf are looked up by
 * binary search in normalization_table (compared against each entry's .nfd
 * sequence); when an entry matches, buf is overwritten with that entry's
 * .nfc sequence.
 */
5144 nkf_char nfc_getc(FILE *f)
5146 nkf_char (*g)(FILE *f) = i_nfc_getc;
5147 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5148 int i=0, j, k=1, lower, upper;
5150 const nkf_nfchar *array;
/* Repeat while the last lookup matched and buf[i] is not a 10xxxxxx byte. */
5153 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5154 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
/* Binary search over the table. */
5155 while (upper >= lower) {
5156 j = (lower+upper) / 2;
5157 array = normalization_table[j].nfd;
5158 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5159 if (array[k] != buf[k]){
/* First differing element decides which half to search next. */
5160 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* Replace the buffered sequence with the composed form of entry j. */
5167 array = normalization_table[j].nfc;
5168 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5169 buf[i] = (nkf_char)(array[i]);
/* nfc_ungetc: push c back by delegating to the i_nfc_ungetc hook; returns
 * that hook's result. */
5180 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5182 return (*i_nfc_ungetc)(c, f);
5184 #endif /* UNICODE_NORMALIZATION */
/*
 * Body of the MIME-decoding getc stage (referred to elsewhere in this file as
 * mime_getc).  Order of work, as far as the statements below show:
 *   1. if the FIFO holds unread bytes, return the next one;
 *   2. if mime_decode_mode is 1 or FALSE, leave MIME mode
 *      (unswitch_mime_getc) and read directly from i_getc;
 *   3. 'Q' mode: quoted-printable decoding -- '_' becomes SP outside
 *      FIXED_MIME, "=XX" becomes one byte, "?=" ends the encoded word;
 *   4. 'B' mode: four base64 characters are read, the three decoded bytes are
 *      appended to the FIFO and the first of them is returned.
 * exit_mode is the mode restored when decoding stops; mode remembers the
 * current mode while a base64 quantum is being read.
 */
5190 nkf_char c1, c2, c3, c4, cc;
5191 nkf_char t1, t2, t3, t4, mode, exit_mode;
5192 nkf_char lwsp_count;
5195 nkf_char lwsp_size = 128;
5197 if (mime_top != mime_last) { /* Something is in FIFO */
5198 return Fifo(mime_top++);
5200 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5201 mime_decode_mode=FALSE;
5202 unswitch_mime_getc();
5203 return (*i_getc)(f);
/* With a fixed MIME buffer, leaving a word keeps the current decode mode. */
5206 if (mimebuf_f == FIXED_MIME)
5207 exit_mode = mime_decode_mode;
5210 if (mime_decode_mode == 'Q') {
5211 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5213 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
/* SP, control characters and DEL or above are not part of Q text. */
5214 if (c1<=SP || DEL<=c1) {
5215 mime_decode_mode = exit_mode; /* prepare for quit */
5218 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5222 mime_decode_mode = exit_mode; /* prepare for quit */
5223 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5224 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5225 /* end Q encoding */
5226 input_mode = exit_mode;
/* lwsp_buf collects the white space / line breaks read after "?=";
 * it is allocated with 5 bytes of slack beyond lwsp_size. */
5228 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5229 if (lwsp_buf==NULL) {
5230 perror("can't malloc");
5233 while ((c1=(*i_getc)(f))!=EOF) {
5238 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5246 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5247 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5262 lwsp_buf[lwsp_count] = (unsigned char)c1;
5263 if (lwsp_count++>lwsp_size){
/* realloc result goes to a temporary so lwsp_buf stays valid on failure. */
5265 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5266 if (lwsp_buf_new==NULL) {
5268 perror("can't realloc");
5271 lwsp_buf = lwsp_buf_new;
/* Unless the collected run ended in SP/TAB and is followed by '=', the
 * collected characters are pushed back (in reverse order). */
5277 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5279 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5280 i_ungetc(lwsp_buf[lwsp_count],f);
5286 if (c1=='='&&c2<SP) { /* this is soft wrap */
/* NOTE(review): the loop condition already reads a character into c1; the
 * read inside the body then overwrites it, so a non-EOF character appears to
 * be dropped on every iteration -- confirm whether testing c1 == EOF without
 * a second read was intended. */
5287 while((c1 = (*i_mgetc)(f)) <=SP) {
5288 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5290 mime_decode_mode = 'Q'; /* still in MIME */
5291 goto restart_mime_q;
5294 mime_decode_mode = 'Q'; /* still in MIME */
5298 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5299 if (c2<=SP) return c2;
5300 mime_decode_mode = 'Q'; /* still in MIME */
/* "=XX": c2 is the high nibble, c3 the low nibble. */
5301 return ((hex2bin(c2)<<4) + hex2bin(c3));
/* Neither Q nor B: drop out of MIME decoding. */
5304 if (mime_decode_mode != 'B') {
5305 mime_decode_mode = FALSE;
5306 return (*i_mgetc)(f);
5310 /* Base64 encoding */
5312 MIME allows line break in the middle of
5313 Base64, but we are very pessimistic in decoding
5314 in unbuf mode because MIME encoded code may broken by
5315 less or editor's control sequence (such as ESC-[-K in unbuffered
5316 mode. ignore incomplete MIME.
5318 mode = mime_decode_mode;
5319 mime_decode_mode = exit_mode; /* prepare for quit */
5321 while ((c1 = (*i_mgetc)(f))<=SP) {
5326 if ((c2 = (*i_mgetc)(f))<=SP) {
5329 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5330 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5333 if ((c1 == '?') && (c2 == '=')) {
5336 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5337 if (lwsp_buf==NULL) {
5338 perror("can't malloc");
5341 while ((c1=(*i_getc)(f))!=EOF) {
5346 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5354 if ((c1=(*i_getc)(f))!=EOF) {
5358 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5373 lwsp_buf[lwsp_count] = (unsigned char)c1;
5374 if (lwsp_count++>lwsp_size){
5376 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5377 if (lwsp_buf_new==NULL) {
5379 perror("can't realloc");
5382 lwsp_buf = lwsp_buf_new;
5388 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5390 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5391 i_ungetc(lwsp_buf[lwsp_count],f);
5398 if ((c3 = (*i_mgetc)(f))<=SP) {
5401 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5402 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5406 if ((c4 = (*i_mgetc)(f))<=SP) {
5409 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5410 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5414 mime_decode_mode = mode; /* still in MIME sigh... */
5416 /* BASE 64 decoding */
5418 t1 = 0x3f & base64decode(c1);
5419 t2 = 0x3f & base64decode(c2);
5420 t3 = 0x3f & base64decode(c3);
5421 t4 = 0x3f & base64decode(c4);
5422 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5424 Fifo(mime_last++) = (unsigned char)cc;
5425 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5427 Fifo(mime_last++) = (unsigned char)cc;
5428 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5430 Fifo(mime_last++) = (unsigned char)cc;
5435 return Fifo(mime_top++);
/* mime_ungetc: push c back onto the MIME FIFO by stepping mime_top back one
 * slot and storing c (truncated to unsigned char) there.  f is not used by
 * the statement shown. */
5438 nkf_char mime_ungetc(nkf_char c, FILE *f)
5440 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: look-ahead check used in buffered mode.
 *   f - input stream
 *   p - NUL-terminated header pattern that has already been matched; it is
 *       copied into the FIFO first
 * Input is then copied into the FIFO until the terminating "?=" is seen, the
 * FIFO is full, EOF is reached, or a character outside the set
 * '+', '/', '=', '?', alphanumerics is met.  When "?=" is found the header is
 * skipped and decoding starts.  Otherwise the word is treated as incomplete:
 * mime_last is set to mime_input, mime_decode_mode becomes 1 and
 * switch_mime_getc() is called so that the buffered bytes are replayed
 * without decoding.
 */
5444 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5448 /* In buffered mode, read until =? or NL or buffer full
5450 mime_input = mime_top;
5451 mime_last = mime_top;
5453 while(*p) Fifo(mime_input++) = *p++;
5456 while((c=(*i_getc)(f))!=EOF) {
5457 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5458 break; /* buffer full */
5460 if (c=='=' && d=='?') {
5461 /* checked. skip header, start decode */
5462 Fifo(mime_input++) = (unsigned char)c;
5463 /* mime_last_input = mime_input; */
5468 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5470 /* Should we check length mod 4? */
5471 Fifo(mime_input++) = (unsigned char)c;
5474 /* In case of Incomplete MIME, no MIME decode */
5475 Fifo(mime_input++) = (unsigned char)c;
5476 mime_last = mime_input; /* point undecoded buffer */
5477 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5478 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one base64 alphabet character to its 6-bit value:
 * 'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62 and
 * '/' -> 63.  The arithmetic uses ASCII character constants as numbers
 * ('G' is 'a' - 26, '4' is 52, '>' is 62, '?' is 63).
 * NOTE(review): no rejection of characters outside the alphabet is visible in
 * the statements shown; callers mask the result with 0x3f.
 */
5482 nkf_char base64decode(nkf_char c)
5487 i = c - 'A'; /* A..Z 0-25 */
5489 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5491 } else if (c > '/') {
5492 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5493 } else if (c == '+') {
5494 i = '>' /* 62 */ ; /* + 62 */
5496 i = '?' /* 63 */ ; /* / 63 */
/* The 64-character base64 alphabet, indexed by 6-bit value. */
5501 static const char basis_64[] =
5502 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Bits carried over between calls while base64-encoding (see the
 * "b64c & 0x3" / "b64c & 0xF" uses in the encoder below). */
5504 static nkf_char b64c;
/* Capacity of the pending-output buffer used by the MIME encoder; the array
 * has one extra slot because the fill test is "count > MIMEOUT_BUF_LENGTH". */
5505 #define MIMEOUT_BUF_LENGTH (60)
5506 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5507 int mimeout_buf_count = 0;
/* When non-zero, open_mime does not emit/skip leading white space of the
 * buffer; open_mime resets it to FALSE. */
5508 int mimeout_preserve_space = 0;
/*
 * itoh4: map a 4-bit value (0..15) to its upper-case hexadecimal digit
 * character ('0'..'9', 'A'..'F').
 *
 * Every use of the argument is parenthesised so that an argument containing
 * a low-precedence operator, e.g. itoh4(c & 0xf), expands correctly.  The
 * argument is still evaluated more than once, so do not pass an expression
 * with side effects.
 */
#define itoh4(c) ((c)>=10?(c)+'A'-10:(c)+'0')
/*
 * open_mime: start a MIME encoded word for output mode `mode`.
 * Picks the header pattern whose mime_encode[] entry equals mode (falling
 * back to mime_pattern[0]) and sets mimeout_mode from mime_encode_method[].
 * White space held at the front of mimeout_buf is written out unencoded
 * (unless mimeout_preserve_space is set, which is then cleared), and the
 * remaining buffered characters are fed back through mime_putc after
 * mimeout_buf_count has been reset to 0.
 */
5511 void open_mime(nkf_char mode)
5513 const unsigned char *p;
5516 p = mime_pattern[0];
5517 for(i=0;mime_pattern[i];i++) {
5518 if (mode == mime_encode[i]) {
5519 p = mime_pattern[i];
5523 mimeout_mode = mime_encode_method[i];
/* Output line already long: handle a leading blank before the header. */
5526 if (base64_count>45) {
5527 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5528 (*o_mputc)(mimeout_buf[i]);
5534 if (!mimeout_preserve_space && mimeout_buf_count>0
5535 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5536 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5540 if (!mimeout_preserve_space) {
/* Emit buffered SP/TAB/CR/LF characters as they are. */
5541 for (;i<mimeout_buf_count;i++) {
5542 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5543 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5544 (*o_mputc)(mimeout_buf[i]);
5551 mimeout_preserve_space = FALSE;
/* Snapshot the count and empty the buffer before replaying its contents. */
5557 j = mimeout_buf_count;
5558 mimeout_buf_count = 0;
5560 mime_putc(mimeout_buf[i]);
/* close_mime: ends the current MIME encoded word on output (it is invoked
 * when mimeout_mode is set and mimeout_f is not FIXED_MIME).
 * NOTE(review): presumably writes the closing "?=" -- confirm against the body. */
5564 void close_mime(void)
/*
 * End-of-data flush for the MIME encoder.
 * NOTE(review): the embedded numbering jumps from 5564 to 5574, so these
 * statements probably belong to the routine following close_mime rather than
 * to close_mime itself -- confirm.
 * Depending on mimeout_mode, the bits still pending in b64c are written as a
 * final base64 digit (two leftover bits shifted left by 4, or four leftover
 * bits shifted left by 2).  Afterwards, outside FIXED_MIME the word is
 * closed; in FIXED_MIME a non-'Q' mode is handled separately.
 */
5574 switch(mimeout_mode) {
5579 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5585 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5591 if (mimeout_f!=FIXED_MIME) {
5593 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one character c into the open encoded word
 * according to mimeout_mode.
 *   Q encoding: a non-alphanumeric c is written as two upper-case hex digits
 *               (high nibble first).
 *   base64:     c is combined with the bits carried in b64c; the digits
 *               written are c>>2, then (carry<<4 | c>>4), then
 *               (carry<<2 | c>>6) followed by c & 0x3F, depending on the
 *               position within the 3-byte group.
 */
5598 void mimeout_addchar(nkf_char c)
5600 switch(mimeout_mode) {
5605 } else if(!nkf_isalnum(c)) {
5607 (*o_mputc)(itoh4(((c>>4)&0xf)));
5608 (*o_mputc)(itoh4((c&0xf)));
/* First byte of a group: top six bits. */
5617 (*o_mputc)(basis_64[c>>2]);
/* Second byte: two carried bits plus top four bits of c. */
5622 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
/* Third byte: four carried bits plus top two bits of c, then low six bits. */
5628 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5629 (*o_mputc)(basis_64[c & 0x3F]);
5640 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * mime_prechar: line-length check made before a character (c2, c1) is passed
 * to the MIME encoder.  The projected width is base64_count plus the base64
 * expansion of the buffered characters (mimeout_buf_count/3*4).  When it
 * exceeds the limit (73 in one branch, 66 in the other) the current word is
 * ended by sending EOF through o_base64conv, followed by LF and SP to start a
 * continuation line.
 */
5642 void mime_prechar(nkf_char c2, nkf_char c1)
5646 if (base64_count + mimeout_buf_count/3*4> 73){
5647 (*o_base64conv)(EOF,0);
5648 (*o_base64conv)(0,LF);
5649 (*o_base64conv)(0,SP);
5652 if (base64_count + mimeout_buf_count/3*4> 66){
5653 (*o_base64conv)(EOF,0);
5654 (*o_base64conv)(0,LF);
5655 (*o_base64conv)(0,SP);
5657 }/*else if (mime_lastchar2){
5658 if (c1 <=DEL && !nkf_isspace(c1)){
5659 (*o_base64conv)(0,SP);
5663 if (c2 && mime_lastchar2 == 0
5664 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5665 (*o_base64conv)(0,SP);
5668 /*mime_lastchar2 = c2;
5669 mime_lastchar1 = c1;*/
/*
 * mime_putc: MIME-encoding output stage.  c is the next output character, or
 * EOF to flush.
 *
 * FIXED_MIME: every character is encoded; a line is wrapped once
 * base64_count exceeds 71, and EOF is not passed to the encoder.
 *
 * Otherwise characters are staged in mimeout_buf so that plain
 * ASCII/ISO-8859-1 text can be written unencoded: while no encoded word is
 * open (mimeout_mode == 0) such text is buffered and written through
 * o_mputc, and open_mime(output_mode) is called when a word has to be
 * started; while a word is open, buffered text is either encoded with
 * mimeout_addchar or written raw, depending on the white space and line
 * breaks seen.
 */
5672 void mime_putc(nkf_char c)
5677 if (mimeout_f == FIXED_MIME){
5678 if (mimeout_mode == 'Q'){
5679 if (base64_count > 71){
5680 if (c!=CR && c!=LF) {
5687 if (base64_count > 71){
5692 if (c == EOF) { /* c==EOF */
5696 if (c != EOF) { /* c!=EOF */
5702 /* mimeout_f != FIXED_MIME */
5704 if (c == EOF) { /* c==EOF */
/* Flush: snapshot the count and empty the buffer first. */
5705 j = mimeout_buf_count;
5706 mimeout_buf_count = 0;
5709 if (!nkf_isblank(mimeout_buf[j-1])) {
5711 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5714 mimeout_addchar(mimeout_buf[i]);
5718 mimeout_addchar(mimeout_buf[i]);
5722 mimeout_addchar(mimeout_buf[i]);
5728 mimeout_addchar(mimeout_buf[i]);
5734 if (mimeout_mode=='Q') {
5735 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5736 if (c == CR || c == LF) {
5741 } else if (c <= SP) {
5743 if (base64_count > 70) {
5747 if (!nkf_isblank(c)) {
/* lastchar: most recently buffered character, when the buffer is not empty. */
5758 if (mimeout_buf_count > 0){
5759 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* No encoded word is open. */
5764 if (!mimeout_mode) {
5765 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5766 if (nkf_isspace(c)) {
5767 if (c==CR || c==LF) {
/* Write the buffered plain text out as it is. */
5770 for (i=0;i<mimeout_buf_count;i++) {
5771 (*o_mputc)(mimeout_buf[i]);
5772 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
/* Restart the buffer with c as its only content. */
5778 mimeout_buf[0] = (char)c;
5779 mimeout_buf_count = 1;
5781 if (base64_count > 1
5782 && base64_count + mimeout_buf_count > 76
5783 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5786 if (!nkf_isspace(mimeout_buf[0])){
5791 mimeout_buf[mimeout_buf_count++] = (char)c;
/* Buffer exceeded its capacity: start an encoded word. */
5792 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5793 open_mime(output_mode);
5798 if (lastchar==CR || lastchar == LF){
5799 for (i=0;i<mimeout_buf_count;i++) {
5800 (*o_mputc)(mimeout_buf[i]);
5803 mimeout_buf_count = 0;
/* Write all but the last buffered character, then keep a single SP. */
5806 for (i=0;i<mimeout_buf_count-1;i++) {
5807 (*o_mputc)(mimeout_buf[i]);
5810 mimeout_buf[0] = SP;
5811 mimeout_buf_count = 1;
5813 open_mime(output_mode);
5816 /* mimeout_mode == 'B', 1, 2 */
5817 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5818 if (lastchar == CR || lastchar == LF){
5819 if (nkf_isblank(c)) {
5820 for (i=0;i<mimeout_buf_count;i++) {
5821 mimeout_addchar(mimeout_buf[i]);
5823 mimeout_buf_count = 0;
5824 } else if (SP<c && c<DEL) {
5826 for (i=0;i<mimeout_buf_count;i++) {
5827 (*o_mputc)(mimeout_buf[i]);
5830 mimeout_buf_count = 0;
5833 if (c==SP || c==TAB || c==CR || c==LF) {
5834 for (i=0;i<mimeout_buf_count;i++) {
5835 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5837 for (i=0;i<mimeout_buf_count;i++) {
5838 (*o_mputc)(mimeout_buf[i]);
5841 mimeout_buf_count = 0;
5844 mimeout_buf[mimeout_buf_count++] = (char)c;
5845 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5847 for (i=0;i<mimeout_buf_count;i++) {
5848 (*o_mputc)(mimeout_buf[i]);
5851 mimeout_buf_count = 0;
5855 if (mimeout_buf_count>0 && SP<c && c!='=') {
5856 mimeout_buf[mimeout_buf_count++] = (char)c;
5857 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5858 j = mimeout_buf_count;
5859 mimeout_buf_count = 0;
5861 mimeout_addchar(mimeout_buf[i]);
5868 if (mimeout_buf_count>0) {
5869 j = mimeout_buf_count;
5870 mimeout_buf_count = 0;
5872 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
5874 mimeout_addchar(mimeout_buf[i]);
5880 (*o_mputc)(mimeout_buf[i]);
5882 open_mime(output_mode);
/*
 * State reset: assigns default values to the option flags, the converter
 * hooks and the encoder/decoder state used by this file, so that a new
 * conversion starts from a known configuration.  Sections that depend on
 * optional features are compiled only when the corresponding macro is
 * defined.
 */
5892 struct input_code *p = input_code_list;
/* MIME decoding defaults. */
5905 mime_f = STRICT_MIME;
5906 mime_decode_f = FALSE;
5911 #if defined(MSDOS) || defined(__OS2__)
5916 iso2022jp_f = FALSE;
5917 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5918 ms_ucs_map_f = UCS_MAP_ASCII;
5920 #ifdef UTF8_INPUT_ENABLE
5921 no_cp932ext_f = FALSE;
5922 no_best_fit_chars_f = FALSE;
5923 encode_fallback = NULL;
5924 unicode_subchar = '?';
5925 input_endian = ENDIAN_BIG;
5927 #ifdef UTF8_OUTPUT_ENABLE
5928 output_bom_f = FALSE;
5929 output_endian = ENDIAN_BIG;
5931 #ifdef UNICODE_NORMALIZATION
5947 #ifdef SHIFTJIS_CP932
/* Clear the per-byte prefix table. */
5957 for (i = 0; i < 256; i++){
5958 prefix_table[i] = 0;
5962 mimeout_buf_count = 0;
5967 fold_preserve_f = FALSE;
5970 kanji_intro = DEFAULT_J;
5971 ascii_intro = DEFAULT_R;
5972 fold_margin = FOLD_MARGIN;
5973 output_conv = DEFAULT_CONV;
5974 oconv = DEFAULT_CONV;
/* Optional output stages start out unconnected; no_connection reports an
 * internal error if one of them is called. */
5975 o_zconv = no_connection;
5976 o_fconv = no_connection;
5977 o_nlconv = no_connection;
5978 o_rot_conv = no_connection;
5979 o_hira_conv = no_connection;
5980 o_base64conv = no_connection;
5981 o_iso2022jp_check_conv = no_connection;
/* Input hooks default to the std_* implementations. */
5984 i_ungetc = std_ungetc;
5986 i_bungetc = std_ungetc;
5989 i_mungetc = std_ungetc;
5990 i_mgetc_buf = std_getc;
5991 i_mungetc_buf = std_ungetc;
5992 output_mode = ASCII;
5995 mime_decode_mode = FALSE;
6003 z_prev2=0,z_prev1=0;
6005 iconv_for_check = 0;
6007 input_codename = NULL;
/* no_connection: placeholder installed in converter hooks that have not been
 * connected; forwards to no_connection2 with a third argument of 0. */
6013 void no_connection(nkf_char c2, nkf_char c1)
6015 no_connection2(c2,c1,0);
/*
 * no_connection2: reports "nkf internal module connection failure." on
 * stderr.  The arguments are not used by the statements shown.
 * NOTE(review): presumably the process is terminated after the message, the
 * return statement existing only to satisfy lint -- confirm.
 */
6018 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6020 fprintf(stderr,"nkf internal module connection failure.\n");
6022 return 0; /* LINT */
6027 #define fprintf dllprintf
/*
 * Usage text: writes the command-line option summary to stderr.  The line
 * describing the output code marks the compiled-in default, selected by the
 * DEFAULT_CODE_SJIS / _JIS / _EUC / _UTF8 macros; entries for optional
 * features appear only when the corresponding feature macro is defined.
 */
6031 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6032 fprintf(stderr,"Flags:\n");
6033 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6034 #ifdef DEFAULT_CODE_SJIS
6035 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6037 #ifdef DEFAULT_CODE_JIS
6038 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6040 #ifdef DEFAULT_CODE_EUC
6041 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6043 #ifdef DEFAULT_CODE_UTF8
6044 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6046 #ifdef UTF8_OUTPUT_ENABLE
6047 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6049 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6050 #ifdef UTF8_INPUT_ENABLE
6051 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6053 fprintf(stderr,"t no conversion\n");
6054 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6055 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6056 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6057 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6058 fprintf(stderr,"v Show this usage. V: show version\n");
6059 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6060 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6061 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6062 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6063 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6064 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6065 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6066 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6067 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6069 fprintf(stderr,"T Text mode output\n");
6071 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
/* NOTE(review): "charactor" in the next message is a misspelling of
 * "character"; it is user-visible text and is left unchanged here. */
6072 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6073 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6074 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6075 fprintf(stderr,"\n");
6076 fprintf(stderr,"Long name options\n");
6077 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6078 fprintf(stderr," Specify the input or output codeset\n");
6079 fprintf(stderr," --fj --unix --mac --windows\n");
6080 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6081 fprintf(stderr," Convert for the system or code\n");
6082 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6083 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6084 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
/* "%%" prints a single literal percent sign. */
6086 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6088 #ifdef NUMCHAR_OPTION
6089 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6091 #ifdef UTF8_INPUT_ENABLE
6092 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6093 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6096 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6097 fprintf(stderr," Overwrite original listed files by filtered result\n");
6098 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6100 fprintf(stderr," -g --guess Guess the input code\n");
6101 fprintf(stderr," --help --version Show this help/the version\n");
6102 fprintf(stderr," For more information, see also man nkf\n");
6103 fprintf(stderr,"\n");
/*
 * Version banner: writes "Network Kanji Filter Version <NKF_VERSION>
 * (<NKF_RELEASE_DATE>) " to stderr, with a platform-dependent continuation
 * of the format string selected by the MSDOS / __WIN16__ / __WIN32__ /
 * __OS2__ conditionals, followed by the CopyRight text on its own line.
 */
6109 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6110 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6113 #if defined(MSDOS) && defined(__WIN16__)
6116 #if defined(MSDOS) && defined(__WIN32__)
6122 ,NKF_VERSION,NKF_RELEASE_DATE);
6123 fprintf(stderr,"\n%s\n",CopyRight);