1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.148 2007/11/06 12:09:44 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-11-06"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
42 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
44 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
60 #if defined(MSDOS) || defined(__OS2__)
63 #if defined(_MSC_VER) || defined(__WATCOMC__)
64 #define mktemp _mktemp
70 #define setbinmode(fp) fsetbin(fp)
71 #elif defined(__DJGPP__)
72 #include <libc/dosio.h>
73 #define setbinmode(fp) djgpp_setbinmode(fp)
74 #else /* Microsoft C, Turbo C */
75 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
78 #define setbinmode(fp)
81 #if defined(__DJGPP__)
82 void djgpp_setbinmode(FILE *fp)
84 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
87 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
88 __file_handle_set(fd, m);
92 #ifdef _IOFBF /* SysV and MSDOS, Windows */
93 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
95 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
98 /*Borland C++ 4.5 EasyWin*/
99 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
108 /* added by satoru@isoternet.org */
110 #include <sys/types.h>
112 #include <sys/stat.h>
113 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
115 #if defined(__WATCOMC__)
116 #include <sys/utime.h>
120 #else /* defined(MSDOS) */
122 #ifdef __BORLANDC__ /* BCC32 */
124 #else /* !defined(__BORLANDC__) */
125 #include <sys/utime.h>
126 #endif /* (__BORLANDC__) */
127 #else /* !defined(__WIN32__) */
128 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
129 #include <sys/utime.h>
130 #elif defined(__TURBOC__) /* BCC */
132 #elif defined(LSI_C) /* LSI C */
133 #endif /* (__WIN32__) */
141 /* state of output_mode and input_mode
158 #define X0213_1 0x284F
159 #define X0213_2 0x2850
161 /* Input Assumption */
166 #define LATIN1_INPUT 6
168 #define STRICT_MIME 8
173 #define JAPANESE_EUC 10
177 #define UTF8_INPUT 13
178 #define UTF16_INPUT 1015
179 #define UTF32_INPUT 1017
183 #define ENDIAN_BIG 1234
184 #define ENDIAN_LITTLE 4321
185 #define ENDIAN_2143 2143
186 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification and conversion helpers.
 *
 * Every macro parameter is wrapped in parentheses so that a compound
 * argument (e.g. `a | b` or `x ? y : z`) is treated as a single operand.
 * The parameter is still expanded more than once in most of these macros,
 * so do not pass an expression with side effects (such as `*p++`).
 *
 * SP, TAB, CR, LF, DEL and SS3 are not defined in this group; they are
 * expected to be provided elsewhere in the file.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the second-lowest byte of c2 (truncated to unsigned short) equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, or a character strictly between SP and DEL other than ? = _ ( ) . and '"'. */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
/*
 * Inclusive byte ranges used by the CP932 conversion tables.
 * NOTE(review): judging by the macro name below, 0xFA-0xFC is presumably the
 * Shift_JIS lead-byte range of the IBM extended characters - confirm against
 * the table users, which are not visible here.
 */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END 0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END 0xEE
/* True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END]; c2 is expanded twice. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
236 #define HOLD_SIZE 1024
237 #if defined(INT_IS_SHORT)
238 #define IOBUF_SIZE 2048
240 #define IOBUF_SIZE 16384
243 #define DEFAULT_J 'B'
244 #define DEFAULT_R 'B'
246 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
247 #define SJ6394 0x0161 /* 63 - 94 ku offset */
249 #define RANGE_NUM_MAX 18
254 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
255 #define sizeof_euc_to_utf8_1byte 94
256 #define sizeof_euc_to_utf8_2bytes 94
257 #define sizeof_utf8_to_euc_C2 64
258 #define sizeof_utf8_to_euc_E5B8 64
259 #define sizeof_utf8_to_euc_2bytes 112
260 #define sizeof_utf8_to_euc_3bytes 16
263 /* MIME preprocessor */
265 #ifdef EASYWIN /*Easy Win */
266 extern POINT _BufferSize;
275 void (*status_func)(struct input_code *, nkf_char);
276 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
280 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
283 static const char *CopyRight = COPY_RIGHT;
285 #if !defined(PERL_XS) && !defined(WIN32DLL)
286 static nkf_char noconvert(FILE *f);
288 static void module_connection(void);
289 static nkf_char kanji_convert(FILE *f);
290 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
291 static nkf_char push_hold_buf(nkf_char c2);
292 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
293 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
294 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
295 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
296 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
298 * 0: Shift_JIS, eucJP-ascii
303 #define UCS_MAP_ASCII 0
305 #define UCS_MAP_CP932 2
306 #define UCS_MAP_CP10001 3
307 static int ms_ucs_map_f = UCS_MAP_ASCII;
309 #ifdef UTF8_INPUT_ENABLE
310 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
311 static int no_cp932ext_f = FALSE;
312 /* ignore ZERO WIDTH NO-BREAK SPACE */
313 static int no_best_fit_chars_f = FALSE;
314 static int input_endian = ENDIAN_BIG;
315 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
316 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
317 static void encode_fallback_html(nkf_char c);
318 static void encode_fallback_xml(nkf_char c);
319 static void encode_fallback_java(nkf_char c);
320 static void encode_fallback_perl(nkf_char c);
321 static void encode_fallback_subchar(nkf_char c);
322 static void (*encode_fallback)(nkf_char c) = NULL;
323 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
324 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
325 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
326 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
327 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
328 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
329 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
330 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
331 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
332 static void w_status(struct input_code *, nkf_char);
334 #ifdef UTF8_OUTPUT_ENABLE
335 static int output_bom_f = FALSE;
336 static int output_endian = ENDIAN_BIG;
337 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
338 static void w_oconv(nkf_char c2,nkf_char c1);
339 static void w_oconv16(nkf_char c2,nkf_char c1);
340 static void w_oconv32(nkf_char c2,nkf_char c1);
342 static void e_oconv(nkf_char c2,nkf_char c1);
343 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
344 static void s_oconv(nkf_char c2,nkf_char c1);
345 static void j_oconv(nkf_char c2,nkf_char c1);
346 static void fold_conv(nkf_char c2,nkf_char c1);
347 static void nl_conv(nkf_char c2,nkf_char c1);
348 static void z_conv(nkf_char c2,nkf_char c1);
349 static void rot_conv(nkf_char c2,nkf_char c1);
350 static void hira_conv(nkf_char c2,nkf_char c1);
351 static void base64_conv(nkf_char c2,nkf_char c1);
352 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
353 static void no_connection(nkf_char c2,nkf_char c1);
354 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
356 static void code_score(struct input_code *ptr);
357 static void code_status(nkf_char c);
359 static void std_putc(nkf_char c);
360 static nkf_char std_getc(FILE *f);
361 static nkf_char std_ungetc(nkf_char c,FILE *f);
363 static nkf_char broken_getc(FILE *f);
364 static nkf_char broken_ungetc(nkf_char c,FILE *f);
366 static nkf_char mime_begin(FILE *f);
367 static nkf_char mime_getc(FILE *f);
368 static nkf_char mime_ungetc(nkf_char c,FILE *f);
370 static void switch_mime_getc(void);
371 static void unswitch_mime_getc(void);
372 static nkf_char mime_begin_strict(FILE *f);
373 static nkf_char mime_getc_buf(FILE *f);
374 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
375 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
377 static nkf_char base64decode(nkf_char c);
378 static void mime_prechar(nkf_char c2, nkf_char c1);
379 static void mime_putc(nkf_char c);
380 static void open_mime(nkf_char c);
381 static void close_mime(void);
382 static void eof_mime(void);
383 static void mimeout_addchar(nkf_char c);
385 static void usage(void);
386 static void version(void);
388 static void options(unsigned char *c);
389 static void reinit(void);
393 #if !defined(PERL_XS) && !defined(WIN32DLL)
394 static unsigned char stdibuf[IOBUF_SIZE];
395 static unsigned char stdobuf[IOBUF_SIZE];
397 static unsigned char hold_buf[HOLD_SIZE*2];
398 static int hold_count = 0;
400 /* MIME preprocessor fifo */
402 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
403 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
404 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
405 static unsigned char mime_buf[MIME_BUF_SIZE];
406 static unsigned int mime_top = 0;
407 static unsigned int mime_last = 0; /* decoded */
408 static unsigned int mime_input = 0; /* undecoded */
409 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* File-scope option/state flags, shown with their initial values. */
412 static int unbuf_f = FALSE;
413 static int estab_f = FALSE;
414 static int nop_f = FALSE;
415 static int binmode_f = TRUE; /* binary mode */
416 static int rot_f = FALSE; /* rot14/43 mode (sic) - NOTE(review): presumably ROT13/47; confirm against rot_conv */
417 static int hira_f = FALSE; /* hiragana/katakana conversion */
418 static int input_f = FALSE; /* non-fixed input code */
419 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
420 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
421 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
422 static int mimebuf_f = FALSE; /* MIME buffered input */
423 static int broken_f = FALSE; /* convert ESC-less broken JIS */
424 static int iso8859_f = FALSE; /* ISO8859 through */
425 static int mimeout_f = FALSE; /* base64 mode */
426 #if defined(MSDOS) || defined(__OS2__)
427 static int x0201_f = TRUE; /* Assume JISX0201 kana */
429 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
431 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
433 #ifdef UNICODE_NORMALIZATION
434 static int nfc_f = FALSE;
435 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
436 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
437 static nkf_char nfc_getc(FILE *f);
438 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
442 static int cap_f = FALSE;
443 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
444 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
445 static nkf_char cap_getc(FILE *f);
446 static nkf_char cap_ungetc(nkf_char c,FILE *f);
448 static int url_f = FALSE;
449 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
450 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
451 static nkf_char url_getc(FILE *f);
452 static nkf_char url_ungetc(nkf_char c,FILE *f);
455 #if defined(INT_IS_SHORT)
456 #define NKF_INT32_C(n) (n##L)
458 #define NKF_INT32_C(n) (n)
/*
 * Constants and predicates for values that carry a class tag in the top
 * byte (CLASS_MASK) and a payload in the low 24 bits (VALUE_MASK).
 * NKF_INT32_C() is defined just above this group and adds an L suffix when
 * int is only 16 bits wide.
 *
 * The macro parameter is parenthesized so that a compound argument such as
 * `CLASS_UNICODE | code` is masked as a whole rather than only its last operand.
 */
#define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
#define CLASS_MASK NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE NKF_INT32_C(0x01000000)
#define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
/* True when the class byte of c equals CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the 24-bit payload of c does not exceed 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
468 #ifdef NUMCHAR_OPTION
469 static int numchar_f = FALSE;
470 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
471 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
472 static nkf_char numchar_getc(FILE *f);
473 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
477 static int noout_f = FALSE;
478 static void no_putc(nkf_char c);
479 static int debug_f = FALSE;
480 static void debug(const char *str);
481 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
484 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
486 static void print_guessed_code(char *filename);
488 static void set_input_codename(char *codename);
491 static int exec_f = 0;
494 #ifdef SHIFTJIS_CP932
495 /* invert IBM extended characters to others */
496 static int cp51932_f = FALSE;
498 /* invert NEC-selected IBM extended characters to IBM extended characters */
499 static int cp932inv_f = TRUE;
501 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
502 #endif /* SHIFTJIS_CP932 */
505 static int x0212_f = FALSE;
506 static nkf_char x0212_shift(nkf_char c);
507 static nkf_char x0212_unshift(nkf_char c);
509 static int x0213_f = FALSE;
511 static unsigned char prefix_table[256];
513 static void set_code_score(struct input_code *ptr, nkf_char score);
514 static void clr_code_score(struct input_code *ptr, nkf_char score);
515 static void status_disable(struct input_code *ptr);
516 static void status_push_ch(struct input_code *ptr, nkf_char c);
517 static void status_clear(struct input_code *ptr);
518 static void status_reset(struct input_code *ptr);
519 static void status_reinit(struct input_code *ptr);
520 static void status_check(struct input_code *ptr, nkf_char c);
521 static void e_status(struct input_code *, nkf_char);
522 static void s_status(struct input_code *, nkf_char);
524 struct input_code input_code_list[] = {
525 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
526 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
527 #ifdef UTF8_INPUT_ENABLE
528 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
529 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
530 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
535 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
536 static int base64_count = 0;
538 /* X0208 -> ASCII converter */
541 static int f_line = 0; /* chars in line */
542 static int f_prev = 0;
543 static int fold_preserve_f = FALSE; /* preserve new lines */
544 static int fold_f = FALSE;
545 static int fold_len = 0;
548 static unsigned char kanji_intro = DEFAULT_J;
549 static unsigned char ascii_intro = DEFAULT_R;
553 #define FOLD_MARGIN 10
554 #define DEFAULT_FOLD 60
556 static int fold_margin = FOLD_MARGIN;
560 #ifdef DEFAULT_CODE_JIS
561 # define DEFAULT_CONV j_oconv
563 #ifdef DEFAULT_CODE_SJIS
564 # define DEFAULT_CONV s_oconv
566 #ifdef DEFAULT_CODE_EUC
567 # define DEFAULT_CONV e_oconv
569 #ifdef DEFAULT_CODE_UTF8
570 # define DEFAULT_CONV w_oconv
573 /* process default */
574 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
576 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
577 /* s_iconv or oconv */
578 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
580 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
581 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
582 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
584 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
585 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
586 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
588 /* static redirections */
590 static void (*o_putc)(nkf_char c) = std_putc;
592 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
593 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
595 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
596 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
598 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
600 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
601 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
603 /* for strict mime */
604 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
605 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
608 static int output_mode = ASCII, /* output kanji mode */
609 input_mode = ASCII, /* input kanji mode */
610 shift_mode = FALSE; /* TRUE shift out, or X0201 */
611 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
613 /* X0201 / X0208 conversion tables */
615 /* X0201 kana conversion table */
617 static const unsigned char cv[]= {
618 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
619 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
620 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
621 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
622 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
623 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
624 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
625 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
626 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
627 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
628 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
629 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
630 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
631 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
632 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
633 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
637 /* X0201 kana conversion table for dakuten */
639 static const unsigned char dv[]= {
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
645 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
646 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
647 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
648 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
649 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
651 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
658 /* X0201 kana conversion table for handakuten */
660 static const unsigned char ev[]= {
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
672 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 /* X0208 kigou conversion table */
681 /* 0x8140 - 0x819e */
682 static const unsigned char fv[] = {
684 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
685 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
686 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
687 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
688 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
689 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
690 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
691 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
692 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
694 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
695 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
700 static int file_out_f = FALSE;
702 static int overwrite_f = FALSE;
703 static int preserve_time_f = FALSE;
704 static int backup_f = FALSE;
705 static char *backup_suffix = "";
706 static char *get_backup_filename(const char *suffix, const char *filename);
709 static int nlmode_f = 0; /* CR, LF, CRLF */
710 static int input_nextline = 0; /* 0: unestablished, EOF: MIXED */
711 static nkf_char prev_cr = 0; /* CR or 0 */
712 #ifdef EASYWIN /*Easy Win */
713 static int end_check;
716 #define STD_GC_BUFSIZE (256)
717 nkf_char std_gc_buf[STD_GC_BUFSIZE];
721 #include "nkf32dll.c"
722 #elif defined(PERL_XS)
724 int main(int argc, char **argv)
729 char *outfname = NULL;
732 #ifdef EASYWIN /*Easy Win */
733 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
736 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
737 cp = (unsigned char *)*argv;
741 int debug_f_back = debug_f;
744 int exec_f_back = exec_f;
747 int x0212_f_back = x0212_f;
750 int x0213_f_back = x0213_f;
752 int guess_f_back = guess_f;
754 guess_f = guess_f_back;
757 debug_f = debug_f_back;
760 exec_f = exec_f_back;
763 x0212_f = x0212_f_back;
766 x0213_f = x0213_f_back;
772 if (pipe(fds) < 0 || (pid = fork()) < 0){
783 execvp(argv[1], &argv[1]);
797 if(x0201_f == WISH_TRUE)
798 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
800 if (binmode_f == TRUE)
801 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
802 if (freopen("","wb",stdout) == NULL)
809 setbuf(stdout, (char *) NULL);
811 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
814 if (binmode_f == TRUE)
815 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
816 if (freopen("","rb",stdin) == NULL) return (-1);
820 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
824 kanji_convert(stdin);
825 if (guess_f) print_guessed_code(NULL);
829 int is_argument_error = FALSE;
831 input_codename = NULL;
836 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
839 is_argument_error = TRUE;
847 /* reopen file for stdout */
848 if (file_out_f == TRUE) {
851 outfname = malloc(strlen(origfname)
852 + strlen(".nkftmpXXXXXX")
858 strcpy(outfname, origfname);
862 for (i = strlen(outfname); i; --i){
863 if (outfname[i - 1] == '/'
864 || outfname[i - 1] == '\\'){
870 strcat(outfname, "ntXXXXXX");
872 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
875 strcat(outfname, ".nkftmpXXXXXX");
876 fd = mkstemp(outfname);
879 || (fd_backup = dup(fileno(stdout))) < 0
880 || dup2(fd, fileno(stdout)) < 0
891 outfname = "nkf.out";
894 if(freopen(outfname, "w", stdout) == NULL) {
898 if (binmode_f == TRUE) {
899 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
900 if (freopen("","wb",stdout) == NULL)
907 if (binmode_f == TRUE)
908 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
909 if (freopen("","rb",fin) == NULL)
914 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
918 char *filename = NULL;
920 if (nfiles > 1) filename = origfname;
921 if (guess_f) print_guessed_code(filename);
927 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
935 if (dup2(fd_backup, fileno(stdout)) < 0){
938 if (stat(origfname, &sb)) {
939 fprintf(stderr, "Can't stat %s\n", origfname);
941 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
942 if (chmod(outfname, sb.st_mode)) {
943 fprintf(stderr, "Can't set permission %s\n", outfname);
946 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
948 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
949 tb[0] = tb[1] = sb.st_mtime;
950 if (utime(outfname, tb)) {
951 fprintf(stderr, "Can't set timestamp %s\n", outfname);
954 tb.actime = sb.st_atime;
955 tb.modtime = sb.st_mtime;
956 if (utime(outfname, &tb)) {
957 fprintf(stderr, "Can't set timestamp %s\n", outfname);
962 char *backup_filename = get_backup_filename(backup_suffix, origfname);
964 unlink(backup_filename);
966 if (rename(origfname, backup_filename)) {
967 perror(backup_filename);
968 fprintf(stderr, "Can't rename %s to %s\n",
969 origfname, backup_filename);
973 if (unlink(origfname)){
978 if (rename(outfname, origfname)) {
980 fprintf(stderr, "Can't rename %s to %s\n",
981 outfname, origfname);
988 if (is_argument_error)
991 #ifdef EASYWIN /*Easy Win */
992 if (file_out_f == FALSE)
993 scanf("%d",&end_check);
996 #else /* for Other OS */
997 if (file_out_f == TRUE)
1002 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename` from the template `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied literally.
 * Otherwise `suffix` is appended to `filename`.
 *
 * Returns a newly malloc()ed, NUL-terminated string owned by the caller, or
 * NULL when the allocation fails (a diagnostic is written with perror()).
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t i, j;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* Each one-character '*' grows into filename_length characters. */
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; suffix[i];) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                i++;
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i++];
            }
        }
        backup_filename[j] = '\0';
    } else {
        /* The buffer must hold filename + suffix + NUL, not a single byte. */
        j = suffix_length + filename_length;
        backup_filename = malloc(j + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
        backup_filename[j] = '\0';
    }
    return backup_filename;
}
1045 static const struct {
1069 {"katakana-hiragana","h3"},
1077 #ifdef UTF8_OUTPUT_ENABLE
1087 {"fb-subchar=", ""},
1089 #ifdef UTF8_INPUT_ENABLE
1090 {"utf8-input", "W"},
1091 {"utf16-input", "W16"},
1092 {"no-cp932ext", ""},
1093 {"no-best-fit-chars",""},
1095 #ifdef UNICODE_NORMALIZATION
1096 {"utf8mac-input", ""},
1108 #ifdef NUMCHAR_OPTION
1109 {"numchar-input", ""},
1115 #ifdef SHIFTJIS_CP932
1125 static int option_mode = 0;
1127 void options(unsigned char *cp)
1131 unsigned char *cp_back = NULL;
1136 while(*cp && *cp++!='-');
1137 while (*cp || cp_back) {
1145 case '-': /* literal options */
1146 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1150 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1151 p = (unsigned char *)long_option[i].name;
1152 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1153 if (*p == cp[j] || cp[j] == SP){
1160 fprintf(stderr, "unknown long option: --%s\n", cp);
1163 while(*cp && *cp != SP && cp++);
1164 if (long_option[i].alias[0]){
1166 cp = (unsigned char *)long_option[i].alias;
1168 if (strcmp(long_option[i].name, "ic=") == 0){
1169 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1170 codeset[i] = nkf_toupper(p[i]);
1173 if(strcmp(codeset, "ISO-2022-JP") == 0){
1174 input_f = JIS_INPUT;
1175 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1176 strcmp(codeset, "CP50220") == 0 ||
1177 strcmp(codeset, "CP50221") == 0 ||
1178 strcmp(codeset, "CP50222") == 0){
1179 input_f = JIS_INPUT;
1180 #ifdef SHIFTJIS_CP932
1183 #ifdef UTF8_OUTPUT_ENABLE
1184 ms_ucs_map_f = UCS_MAP_CP932;
1186 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1187 input_f = JIS_INPUT;
1191 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1192 input_f = JIS_INPUT;
1197 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1198 input_f = SJIS_INPUT;
1199 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1200 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1201 strcmp(codeset, "CP932") == 0 ||
1202 strcmp(codeset, "MS932") == 0){
1203 input_f = SJIS_INPUT;
1204 #ifdef SHIFTJIS_CP932
1207 #ifdef UTF8_OUTPUT_ENABLE
1208 ms_ucs_map_f = UCS_MAP_CP932;
1210 }else if(strcmp(codeset, "CP10001") == 0){
1211 input_f = SJIS_INPUT;
1212 #ifdef SHIFTJIS_CP932
1215 #ifdef UTF8_OUTPUT_ENABLE
1216 ms_ucs_map_f = UCS_MAP_CP10001;
1218 }else if(strcmp(codeset, "EUCJP") == 0 ||
1219 strcmp(codeset, "EUC-JP") == 0){
1220 input_f = EUC_INPUT;
1221 }else if(strcmp(codeset, "CP51932") == 0){
1222 input_f = EUC_INPUT;
1223 #ifdef SHIFTJIS_CP932
1226 #ifdef UTF8_OUTPUT_ENABLE
1227 ms_ucs_map_f = UCS_MAP_CP932;
1229 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1230 strcmp(codeset, "EUCJP-MS") == 0 ||
1231 strcmp(codeset, "EUCJPMS") == 0){
1232 input_f = EUC_INPUT;
1233 #ifdef SHIFTJIS_CP932
1236 #ifdef UTF8_OUTPUT_ENABLE
1237 ms_ucs_map_f = UCS_MAP_MS;
1239 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1240 strcmp(codeset, "EUCJP-ASCII") == 0){
1241 input_f = EUC_INPUT;
1242 #ifdef SHIFTJIS_CP932
1245 #ifdef UTF8_OUTPUT_ENABLE
1246 ms_ucs_map_f = UCS_MAP_ASCII;
1248 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1249 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1250 input_f = SJIS_INPUT;
1252 #ifdef SHIFTJIS_CP932
1255 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1256 strcmp(codeset, "EUC-JIS-2004") == 0){
1257 input_f = EUC_INPUT;
1259 #ifdef SHIFTJIS_CP932
1262 #ifdef UTF8_INPUT_ENABLE
1263 }else if(strcmp(codeset, "UTF-8") == 0 ||
1264 strcmp(codeset, "UTF-8N") == 0 ||
1265 strcmp(codeset, "UTF-8-BOM") == 0){
1266 input_f = UTF8_INPUT;
1267 #ifdef UNICODE_NORMALIZATION
1268 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1269 strcmp(codeset, "UTF-8-MAC") == 0){
1270 input_f = UTF8_INPUT;
1273 }else if(strcmp(codeset, "UTF-16") == 0 ||
1274 strcmp(codeset, "UTF-16BE") == 0 ||
1275 strcmp(codeset, "UTF-16BE-BOM") == 0){
1276 input_f = UTF16_INPUT;
1277 input_endian = ENDIAN_BIG;
1278 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1279 strcmp(codeset, "UTF-16LE-BOM") == 0){
1280 input_f = UTF16_INPUT;
1281 input_endian = ENDIAN_LITTLE;
1282 }else if(strcmp(codeset, "UTF-32") == 0 ||
1283 strcmp(codeset, "UTF-32BE") == 0 ||
1284 strcmp(codeset, "UTF-32BE-BOM") == 0){
1285 input_f = UTF32_INPUT;
1286 input_endian = ENDIAN_BIG;
1287 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1288 strcmp(codeset, "UTF-32LE-BOM") == 0){
1289 input_f = UTF32_INPUT;
1290 input_endian = ENDIAN_LITTLE;
1293 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1297 if (strcmp(long_option[i].name, "oc=") == 0){
1299 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1300 codeset[i] = nkf_toupper(p[i]);
1303 if(strcmp(codeset, "ISO-2022-JP") == 0){
1304 output_conv = j_oconv;
1305 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1306 output_conv = j_oconv;
1307 no_cp932ext_f = TRUE;
1308 #ifdef SHIFTJIS_CP932
1311 #ifdef UTF8_OUTPUT_ENABLE
1312 ms_ucs_map_f = UCS_MAP_CP932;
1314 }else if(strcmp(codeset, "CP50220") == 0){
1315 output_conv = j_oconv;
1317 #ifdef SHIFTJIS_CP932
1320 #ifdef UTF8_OUTPUT_ENABLE
1321 ms_ucs_map_f = UCS_MAP_CP932;
1323 }else if(strcmp(codeset, "CP50221") == 0){
1324 output_conv = j_oconv;
1325 #ifdef SHIFTJIS_CP932
1328 #ifdef UTF8_OUTPUT_ENABLE
1329 ms_ucs_map_f = UCS_MAP_CP932;
1331 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1332 output_conv = j_oconv;
1336 #ifdef SHIFTJIS_CP932
1339 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1340 output_conv = j_oconv;
1345 #ifdef SHIFTJIS_CP932
1348 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1349 output_conv = s_oconv;
1350 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1351 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1352 strcmp(codeset, "CP932") == 0 ||
1353 strcmp(codeset, "MS932") == 0){
1354 output_conv = s_oconv;
1355 #ifdef UTF8_OUTPUT_ENABLE
1356 ms_ucs_map_f = UCS_MAP_CP932;
1358 }else if(strcmp(codeset, "CP10001") == 0){
1359 output_conv = s_oconv;
1360 #ifdef UTF8_OUTPUT_ENABLE
1361 ms_ucs_map_f = UCS_MAP_CP10001;
1363 }else if(strcmp(codeset, "EUCJP") == 0 ||
1364 strcmp(codeset, "EUC-JP") == 0){
1365 output_conv = e_oconv;
1366 }else if(strcmp(codeset, "CP51932") == 0){
1367 output_conv = e_oconv;
1368 #ifdef SHIFTJIS_CP932
1371 #ifdef UTF8_OUTPUT_ENABLE
1372 ms_ucs_map_f = UCS_MAP_CP932;
1374 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1375 strcmp(codeset, "EUCJP-MS") == 0 ||
1376 strcmp(codeset, "EUCJPMS") == 0){
1377 output_conv = e_oconv;
1381 #ifdef UTF8_OUTPUT_ENABLE
1382 ms_ucs_map_f = UCS_MAP_MS;
1384 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1385 strcmp(codeset, "EUCJP-ASCII") == 0){
1386 output_conv = e_oconv;
1390 #ifdef UTF8_OUTPUT_ENABLE
1391 ms_ucs_map_f = UCS_MAP_ASCII;
1393 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1394 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1395 output_conv = s_oconv;
1397 #ifdef SHIFTJIS_CP932
1400 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1401 strcmp(codeset, "EUC-JIS-2004") == 0){
1402 output_conv = e_oconv;
1407 #ifdef SHIFTJIS_CP932
1410 #ifdef UTF8_OUTPUT_ENABLE
1411 }else if(strcmp(codeset, "UTF-8") == 0){
1412 output_conv = w_oconv;
1413 }else if(strcmp(codeset, "UTF-8N") == 0){
1414 output_conv = w_oconv;
1415 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1416 output_conv = w_oconv;
1417 output_bom_f = TRUE;
1418 }else if(strcmp(codeset, "UTF-16BE") == 0){
1419 output_conv = w_oconv16;
1420 }else if(strcmp(codeset, "UTF-16") == 0 ||
1421 strcmp(codeset, "UTF-16BE-BOM") == 0){
1422 output_conv = w_oconv16;
1423 output_bom_f = TRUE;
1424 }else if(strcmp(codeset, "UTF-16LE") == 0){
1425 output_conv = w_oconv16;
1426 output_endian = ENDIAN_LITTLE;
1427 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1428 output_conv = w_oconv16;
1429 output_endian = ENDIAN_LITTLE;
1430 output_bom_f = TRUE;
1431 }else if(strcmp(codeset, "UTF-32") == 0 ||
1432 strcmp(codeset, "UTF-32BE") == 0){
1433 output_conv = w_oconv32;
1434 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1435 output_conv = w_oconv32;
1436 output_bom_f = TRUE;
1437 }else if(strcmp(codeset, "UTF-32LE") == 0){
1438 output_conv = w_oconv32;
1439 output_endian = ENDIAN_LITTLE;
1440 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1441 output_conv = w_oconv32;
1442 output_endian = ENDIAN_LITTLE;
1443 output_bom_f = TRUE;
1446 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1450 if (strcmp(long_option[i].name, "guess=") == 0){
1459 if (strcmp(long_option[i].name, "overwrite") == 0){
1462 preserve_time_f = TRUE;
1465 if (strcmp(long_option[i].name, "overwrite=") == 0){
1468 preserve_time_f = TRUE;
1470 backup_suffix = malloc(strlen((char *) p) + 1);
1471 strcpy(backup_suffix, (char *) p);
1474 if (strcmp(long_option[i].name, "in-place") == 0){
1477 preserve_time_f = FALSE;
1480 if (strcmp(long_option[i].name, "in-place=") == 0){
1483 preserve_time_f = FALSE;
1485 backup_suffix = malloc(strlen((char *) p) + 1);
1486 strcpy(backup_suffix, (char *) p);
1491 if (strcmp(long_option[i].name, "cap-input") == 0){
1495 if (strcmp(long_option[i].name, "url-input") == 0){
1500 #ifdef NUMCHAR_OPTION
1501 if (strcmp(long_option[i].name, "numchar-input") == 0){
1507 if (strcmp(long_option[i].name, "no-output") == 0){
1511 if (strcmp(long_option[i].name, "debug") == 0){
1516 if (strcmp(long_option[i].name, "cp932") == 0){
1517 #ifdef SHIFTJIS_CP932
1521 #ifdef UTF8_OUTPUT_ENABLE
1522 ms_ucs_map_f = UCS_MAP_CP932;
1526 if (strcmp(long_option[i].name, "no-cp932") == 0){
1527 #ifdef SHIFTJIS_CP932
1531 #ifdef UTF8_OUTPUT_ENABLE
1532 ms_ucs_map_f = UCS_MAP_ASCII;
1536 #ifdef SHIFTJIS_CP932
1537 if (strcmp(long_option[i].name, "cp932inv") == 0){
1544 if (strcmp(long_option[i].name, "x0212") == 0){
1551 if (strcmp(long_option[i].name, "exec-in") == 0){
1555 if (strcmp(long_option[i].name, "exec-out") == 0){
1560 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1561 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1562 no_cp932ext_f = TRUE;
1565 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1566 no_best_fit_chars_f = TRUE;
1569 if (strcmp(long_option[i].name, "fb-skip") == 0){
1570 encode_fallback = NULL;
1573 if (strcmp(long_option[i].name, "fb-html") == 0){
1574 encode_fallback = encode_fallback_html;
1577 if (strcmp(long_option[i].name, "fb-xml") == 0){
1578 encode_fallback = encode_fallback_xml;
1581 if (strcmp(long_option[i].name, "fb-java") == 0){
1582 encode_fallback = encode_fallback_java;
1585 if (strcmp(long_option[i].name, "fb-perl") == 0){
1586 encode_fallback = encode_fallback_perl;
1589 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1590 encode_fallback = encode_fallback_subchar;
1593 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1594 encode_fallback = encode_fallback_subchar;
1595 unicode_subchar = 0;
1597 /* decimal number */
1598 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1599 unicode_subchar *= 10;
1600 unicode_subchar += hex2bin(p[i]);
1602 }else if(p[1] == 'x' || p[1] == 'X'){
1603 /* hexadecimal number */
1604 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1605 unicode_subchar <<= 4;
1606 unicode_subchar |= hex2bin(p[i]);
1610 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1611 unicode_subchar *= 8;
1612 unicode_subchar += hex2bin(p[i]);
1615 w16e_conv(unicode_subchar, &i, &j);
1616 unicode_subchar = i<<8 | j;
1620 #ifdef UTF8_OUTPUT_ENABLE
1621 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1622 ms_ucs_map_f = UCS_MAP_MS;
1626 #ifdef UNICODE_NORMALIZATION
1627 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1628 input_f = UTF8_INPUT;
1633 if (strcmp(long_option[i].name, "prefix=") == 0){
1634 if (nkf_isgraph(p[0])){
1635 for (i = 1; nkf_isgraph(p[i]); i++){
1636 prefix_table[p[i]] = p[0];
1643 case 'b': /* buffered mode */
1646 case 'u': /* non bufferd mode */
1649 case 't': /* transparent mode */
1654 } else if (*cp=='2') {
1658 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1666 case 'j': /* JIS output */
1668 output_conv = j_oconv;
1670 case 'e': /* AT&T EUC output */
1671 output_conv = e_oconv;
1674 case 's': /* SJIS output */
1675 output_conv = s_oconv;
1677 case 'l': /* ISO8859 Latin-1 support, no conversion */
1678 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1679 input_f = LATIN1_INPUT;
1681 case 'i': /* Kanji IN ESC-$-@/B */
1682 if (*cp=='@'||*cp=='B')
1683 kanji_intro = *cp++;
1685 case 'o': /* ASCII IN ESC-(-J/B */
1686 if (*cp=='J'||*cp=='B'||*cp=='H')
1687 ascii_intro = *cp++;
1691 bit:1 katakana->hiragana
1692 bit:2 hiragana->katakana
1694 if ('9'>= *cp && *cp>='0')
1695 hira_f |= (*cp++ -'0');
1702 #if defined(MSDOS) || defined(__OS2__)
1717 #ifdef UTF8_OUTPUT_ENABLE
1718 case 'w': /* UTF-8 output */
1720 output_conv = w_oconv; cp++;
1724 output_bom_f = TRUE;
1727 if ('1'== cp[0] && '6'==cp[1]) {
1728 output_conv = w_oconv16; cp+=2;
1729 } else if ('3'== cp[0] && '2'==cp[1]) {
1730 output_conv = w_oconv32; cp+=2;
1732 output_conv = w_oconv;
1737 output_endian = ENDIAN_LITTLE;
1738 } else if (cp[0] == 'B') {
1746 output_bom_f = TRUE;
1751 #ifdef UTF8_INPUT_ENABLE
1752 case 'W': /* UTF input */
1755 input_f = UTF8_INPUT;
1757 if ('1'== cp[0] && '6'==cp[1]) {
1759 input_f = UTF16_INPUT;
1760 input_endian = ENDIAN_BIG;
1761 } else if ('3'== cp[0] && '2'==cp[1]) {
1763 input_f = UTF32_INPUT;
1764 input_endian = ENDIAN_BIG;
1766 input_f = UTF8_INPUT;
1771 input_endian = ENDIAN_LITTLE;
1772 } else if (cp[0] == 'B') {
1778 /* Input code assumption */
1779 case 'J': /* JIS input */
1780 input_f = JIS_INPUT;
1782 case 'E': /* AT&T EUC input */
1783 input_f = EUC_INPUT;
1785 case 'S': /* MS Kanji input */
1786 input_f = SJIS_INPUT;
1787 if (x0201_f==NO_X0201) x0201_f=TRUE;
1789 case 'Z': /* Convert X0208 alphabet to asii */
1791 bit:0 Convert JIS X 0208 Alphabet to ASCII
1792 bit:1 Convert Kankaku to one space
1793 bit:2 Convert Kankaku to two spaces
1794 bit:3 Convert HTML Entity
1795 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1797 while ('0'<= *cp && *cp <='9') {
1798 alpha_f |= 1 << (*cp++ - '0');
1800 if (!alpha_f) alpha_f = 1;
1802 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1803 x0201_f = FALSE; /* No X0201->X0208 conversion */
1805 ESC-(-I in JIS, EUC, MS Kanji
1806 SI/SO in JIS, EUC, MS Kanji
1807 SSO in EUC, JIS, not in MS Kanji
1808 MS Kanji (0xa0-0xdf)
1810 ESC-(-I in JIS (0x20-0x5f)
1811 SSO in EUC (0xa0-0xdf)
1812 0xa0-0xd in MS Kanji (0xa0-0xdf)
1815 case 'X': /* Assume X0201 kana */
1816 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1819 case 'F': /* prserve new lines */
1820 fold_preserve_f = TRUE;
1821 case 'f': /* folding -f60 or -f */
1824 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1826 fold_len += *cp++ - '0';
1828 if (!(0<fold_len && fold_len<BUFSIZ))
1829 fold_len = DEFAULT_FOLD;
1833 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1835 fold_margin += *cp++ - '0';
1839 case 'm': /* MIME support */
1840 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1841 if (*cp=='B'||*cp=='Q') {
1842 mime_decode_mode = *cp++;
1843 mimebuf_f = FIXED_MIME;
1844 } else if (*cp=='N') {
1845 mime_f = TRUE; cp++;
1846 } else if (*cp=='S') {
1847 mime_f = STRICT_MIME; cp++;
1848 } else if (*cp=='0') {
1849 mime_decode_f = FALSE;
1850 mime_f = FALSE; cp++;
1853 case 'M': /* MIME output */
1856 mimeout_f = FIXED_MIME; cp++;
1857 } else if (*cp=='Q') {
1859 mimeout_f = FIXED_MIME; cp++;
1864 case 'B': /* Broken JIS support */
1866 bit:1 allow any x on ESC-(-x or ESC-$-x
1867 bit:2 reset to ascii on NL
1869 if ('9'>= *cp && *cp>='0')
1870 broken_f |= 1<<(*cp++ -'0');
1875 case 'O':/* for Output file */
1879 case 'c':/* add cr code */
1882 case 'd':/* delete cr code */
1885 case 'I': /* ISO-2022-JP output */
1888 case 'L': /* line mode */
1889 if (*cp=='u') { /* unix */
1890 nlmode_f = LF; cp++;
1891 } else if (*cp=='m') { /* mac */
1892 nlmode_f = CR; cp++;
1893 } else if (*cp=='w') { /* windows */
1894 nlmode_f = CRLF; cp++;
1895 } else if (*cp=='0') { /* no conversion */
1904 } else if (*cp == '0') {
1913 /* multiple options in one string are allowed for the Perl module */
1914 while(*cp && *cp++!='-');
1917 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
1918 /* bogus option but ignored */
/*
 * find_inputcode_byfunc: look up the input_code_list entry whose iconv_func
 * member equals the given converter function pointer.
 * NOTE(review): the loop and the return statements are not present in this
 * extract; presumably the matching entry is returned and 0 when none
 * matches -- confirm against the full file.
 */
1924 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
/* Scanning starts at the head of the table of candidate input encodings. */
1927 struct input_code *p = input_code_list;
1929 if (iconv_func == p->iconv_func){
/*
 * set_iconv: choose the input converter.
 *   f          - establishment flag; per the inline note, -TRUE means "FORCE".
 *   iconv_func - candidate converter taking (c2, c1, c0).
 * NOTE(review): the statements that assign estab_f and iconv are not present
 * in this extract; only the INPUT_CODE_FIX guard and the code-name
 * bookkeeping are visible.
 */
1938 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1940 #ifdef INPUT_CODE_FIX
1948 #ifdef INPUT_CODE_FIX
1949 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
/* When estab_f is set and the active converter differs from the one seen at
   the previous check, record the name of the matching table entry. */
1955 if (estab_f && iconv_for_check != iconv){
1956 struct input_code *p = find_inputcode_byfunc(iconv);
1958 set_input_codename(p->name);
/* Remember the converter that was just checked. */
1961 iconv_for_check = iconv;
/*
 * Score bits accumulated per candidate encoding while guessing the input
 * code. Each constant is a single bit, doubling from SCORE_L2 upward;
 * the selection loops in this file pick the candidate with the smallest score.
 */
1966 #define SCORE_L2 (1) /* JIS level-2 kanji */
1967 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1968 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1969 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped by CP932 (IBM extended characters) */
1970 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
1971 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* nonexistent characters */
1972 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1973 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Score assigned to a candidate by status_reset(). */
1975 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score lookup used by code_score() when (c2 & 0x70) == 0x20; the index is
 * c2 & 0x0f.
 * NOTE(review): only part of the initializer is present in this extract.
 */
1977 static const char score_table_A0[] = {
1980 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1981 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score lookup used by code_score() when (c2 & 0x70) == 0x70; the index is
 * c2 & 0x0f (sixteen entries are visible below).
 * NOTE(review): the closing brace of the initializer is not present in this
 * extract.
 */
1984 static const char score_table_F0[] = {
1985 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1986 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1987 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
1988 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score: OR the given SCORE_* bit(s) into ptr->score.
 * NOTE(review): any guard around the update is not present in this extract.
 */
1991 void set_code_score(struct input_code *ptr, nkf_char score)
1994 ptr->score |= score;
/*
 * clr_code_score: clear the given SCORE_* bit(s) from ptr->score.
 * NOTE(review): any guard around the update is not present in this extract.
 */
1998 void clr_code_score(struct input_code *ptr, nkf_char score)
2001 ptr->score &= ~score;
/*
 * code_score: classify the character held in ptr->buf and add the matching
 * SCORE_* bit to the candidate via set_code_score().
 * The first byte (c2) selects the category; c1 is only read when
 * UTF8_OUTPUT_ENABLE is defined.
 */
2005 void code_score(struct input_code *ptr)
2007 nkf_char c2 = ptr->buf[0];
2008 #ifdef UTF8_OUTPUT_ENABLE
2009 nkf_char c1 = ptr->buf[1];
/* NOTE(review): the condition guarding SCORE_ERROR is not present in this
   extract. */
2012 set_code_score(ptr, SCORE_ERROR);
2013 }else if (c2 == SSO){
2014 set_code_score(ptr, SCORE_KANA);
2015 }else if (c2 == 0x8f){
2016 set_code_score(ptr, SCORE_X0212);
2017 #ifdef UTF8_OUTPUT_ENABLE
/* A zero result from e2w_conv() is scored as a nonexistent character. */
2018 }else if (!e2w_conv(c2, c1)){
2019 set_code_score(ptr, SCORE_NO_EXIST);
/* Remaining cases are decided by bits 4-6 of c2; the low nibble indexes the
   per-range score tables. */
2021 }else if ((c2 & 0x70) == 0x20){
2022 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2023 }else if ((c2 & 0x70) == 0x70){
2024 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2025 }else if ((c2 & 0x70) >= 0x50){
2026 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: take a candidate out of the running.
 * NOTE(review): the statements that mark the candidate itself are not present
 * in this extract; only the converter reset below is visible.
 */
2030 void status_disable(struct input_code *ptr)
/* If this candidate's converter is the active one, drop it. */
2035 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: append byte c to the candidate's pending-character buffer
 * (ptr->buf) and advance ptr->index. No bounds check is visible here.
 */
2038 void status_push_ch(struct input_code *ptr, nkf_char c)
2040 ptr->buf[ptr->index++] = c;
/*
 * status_clear: NOTE(review): the body of this function is not present in
 * this extract; presumably it resets the per-character state of the
 * candidate -- confirm against the full file.
 */
2043 void status_clear(struct input_code *ptr)
/*
 * status_reset: put the candidate's score back to SCORE_INIT.
 * NOTE(review): other statements of the body are not present in this extract.
 */
2049 void status_reset(struct input_code *ptr)
2052 ptr->score = SCORE_INIT;
/*
 * status_reinit: zero the candidate's _file_stat field.
 * NOTE(review): other statements of the body are not present in this extract.
 */
2055 void status_reinit(struct input_code *ptr)
2058 ptr->_file_stat = 0;
/*
 * status_check: called by the *_status functions for each byte c.
 * Acts only when c <= DEL and an input code is already established (estab_f).
 * NOTE(review): the action taken inside the branch is not present in this
 * extract.
 */
2061 void status_check(struct input_code *ptr, nkf_char c)
2063 if (c <= DEL && estab_f){
/*
 * s_status: per-byte status function that feeds byte c to the candidate
 * state in ptr, using Shift_JIS-style lead/trail byte ranges and s2e_conv().
 * A byte that fits the current state is pushed with status_push_ch(); a byte
 * that does not fit disables the candidate via status_disable().
 * NOTE(review): the switch on ptr->stat, the state assignments and the
 * code_score()/status_clear() calls are not present in this extract.
 */
2068 void s_status(struct input_code *ptr, nkf_char c)
2072 status_check(ptr, c);
2077 #ifdef NUMCHAR_OPTION
2078 }else if (is_unicode_capsule(c)){
/* 0xa1..0xdf: stored as SSO followed by the byte itself. */
2081 }else if (0xa1 <= c && c <= 0xdf){
2082 status_push_ch(ptr, SSO);
2083 status_push_ch(ptr, c);
/* 0x81..0x9f and 0xe0..0xea: first byte of a two-byte character. */
2086 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2088 status_push_ch(ptr, c);
2089 }else if (0xed <= c && c <= 0xee){
2091 status_push_ch(ptr, c);
2092 #ifdef SHIFTJIS_CP932
2093 }else if (is_ibmext_in_sjis(c)){
2095 status_push_ch(ptr, c);
2096 #endif /* SHIFTJIS_CP932 */
2098 }else if (0xf0 <= c && c <= 0xfc){
2100 status_push_ch(ptr, c);
2101 #endif /* X0212_ENABLE */
2103 status_disable(ptr);
/* Second byte must be 0x40..0x7e or 0x80..0xfc; the pair is then converted
   in place inside ptr->buf by s2e_conv(). */
2107 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2108 status_push_ch(ptr, c);
2109 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2113 status_disable(ptr);
2117 #ifdef SHIFTJIS_CP932
2118 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2119 status_push_ch(ptr, c);
/* A zero return from s2e_conv() here adds the CP932 score bit. */
2120 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2121 set_code_score(ptr, SCORE_CP932);
2126 #endif /* SHIFTJIS_CP932 */
2127 status_disable(ptr);
2130 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2131 status_push_ch(ptr, c);
2132 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2133 set_code_score(ptr, SCORE_CP932);
2136 status_disable(ptr);
/*
 * e_status: per-byte status function that feeds byte c to the candidate
 * state in ptr, using EUC-style byte ranges (SSO or 0xa1..0xfe as the first
 * byte, 0x8f as a prefix, 0xa1..0xfe as following bytes).
 * A byte that fits the current state is pushed with status_push_ch(); a byte
 * that does not fit disables the candidate via status_disable().
 * NOTE(review): the switch on ptr->stat and the state assignments are not
 * present in this extract.
 */
2142 void e_status(struct input_code *ptr, nkf_char c)
2146 status_check(ptr, c);
2151 #ifdef NUMCHAR_OPTION
2152 }else if (is_unicode_capsule(c)){
2155 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2157 status_push_ch(ptr, c);
2159 }else if (0x8f == c){
2161 status_push_ch(ptr, c);
2162 #endif /* X0212_ENABLE */
2164 status_disable(ptr);
/* Following byte of a multi-byte character: must be 0xa1..0xfe. */
2168 if (0xa1 <= c && c <= 0xfe){
2169 status_push_ch(ptr, c);
2173 status_disable(ptr);
2178 if (0xa1 <= c && c <= 0xfe){
2180 status_push_ch(ptr, c);
2182 status_disable(ptr);
2184 #endif /* X0212_ENABLE */
2188 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: per-byte status function that feeds byte c to the candidate
 * state in ptr, using UTF-8 byte ranges: 0xc0..0xdf, 0xe0..0xef and
 * 0xf0..0xf4 as first bytes, 0x80..0xbf as continuation bytes.
 * A byte that fits the current state is pushed with status_push_ch(); a byte
 * that does not fit disables the candidate via status_disable().
 * NOTE(review): the switch on ptr->stat and the state assignments are not
 * present in this extract.
 */
2189 void w_status(struct input_code *ptr, nkf_char c)
2193 status_check(ptr, c);
2198 #ifdef NUMCHAR_OPTION
2199 }else if (is_unicode_capsule(c)){
2202 }else if (0xc0 <= c && c <= 0xdf){
2204 status_push_ch(ptr, c);
2205 }else if (0xe0 <= c && c <= 0xef){
2207 status_push_ch(ptr, c);
2208 }else if (0xf0 <= c && c <= 0xf4){
2210 status_push_ch(ptr, c);
2212 status_disable(ptr);
2217 if (0x80 <= c && c <= 0xbf){
2218 status_push_ch(ptr, c);
/* Once more bytes are buffered than ptr->stat, the sequence is converted in
   place with w2e_conv(); EF BB BF (the UTF-8 BOM) is detected first. */
2219 if (ptr->index > ptr->stat){
2220 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2221 && ptr->buf[2] == 0xbf);
2222 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2223 &ptr->buf[0], &ptr->buf[1]);
2230 status_disable(ptr);
2234 if (0x80 <= c && c <= 0xbf){
2235 if (ptr->index < ptr->stat){
2236 status_push_ch(ptr, c);
2241 status_disable(ptr);
/*
 * code_status: offer byte c to the status function of the candidates in
 * input_code_list and, when a single result emerges while no code is
 * established yet, install that candidate's converter with set_iconv(TRUE, ..).
 * NOTE(review): the loop over the list and the bookkeeping of action_flag /
 * result are not present in this extract.
 */
2248 void code_status(nkf_char c)
2250 int action_flag = 1;
2251 struct input_code *result = 0;
2252 struct input_code *p = input_code_list;
2254 if (!p->status_func) {
2258 if (!p->status_func)
/* Let the candidate examine the byte. */
2260 (p->status_func)(p, c);
2263 }else if(p->stat == 0){
2274 if (result && !estab_f){
2275 set_iconv(TRUE, result->iconv_func);
2276 }else if (c <= DEL){
2277 struct input_code *ptr = input_code_list;
/*
 * std_getc: base input function. The visible statement pops the most recently
 * pushed-back character from std_gc_buf.
 * NOTE(review): the condition on std_gc_ndx and the read from f are not
 * present in this extract.
 */
2287 nkf_char std_getc(FILE *f)
2290 return std_gc_buf[--std_gc_ndx];
/*
 * std_ungetc: push character c back onto the std_gc_buf stack.
 * NOTE(review): the handling of a full buffer (std_gc_ndx ==
 * STD_GC_BUFSIZE) and the return statements are not present in this extract.
 */
2296 nkf_char std_ungetc(nkf_char c, FILE *f)
2298 if (std_gc_ndx == STD_GC_BUFSIZE){
2301 std_gc_buf[std_gc_ndx++] = c;
/*
 * std_putc: NOTE(review): the body of this function is not present in this
 * extract; presumably it writes c to the output stream -- confirm against
 * the full file.
 */
2306 void std_putc(nkf_char c)
2313 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * noconvert: set up the filter chain with module_connection() and then read
 * f through i_getc until EOF.
 * NOTE(review): the loop body and the return value are not present in this
 * extract.
 */
2314 nkf_char noconvert(FILE *f)
2319 module_connection();
2320 while ((c = (*i_getc)(f)) != EOF)
/*
 * module_connection: assemble the conversion pipeline from the option flags.
 * Output side: starting from output_conv, each enabled filter saves the
 * current oconv in its own o_* pointer and installs itself as oconv.
 * Input side: each enabled filter saves the current i_getc/i_ungetc pair in
 * its own pointers and installs itself in their place.
 * Finally the input converter is selected from input_f with set_iconv().
 * NOTE(review): several of the enabling conditions and the initial i_getc
 * assignment are not present in this extract.
 */
2327 void module_connection(void)
2329 oconv = output_conv;
2332 /* replace continuation module, from output side */
2334 /* output redirection */
2336 if (noout_f || guess_f){
2343 if (mimeout_f == TRUE) {
2344 o_base64conv = oconv; oconv = base64_conv;
2346 /* base64_count = 0; */
2349 if (nlmode_f || guess_f) {
2350 o_nlconv = oconv; oconv = nl_conv;
2353 o_rot_conv = oconv; oconv = rot_conv;
2356 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2359 o_hira_conv = oconv; oconv = hira_conv;
2362 o_fconv = oconv; oconv = fold_conv;
2365 if (alpha_f || x0201_f) {
2366 o_zconv = oconv; oconv = z_conv;
2370 i_ungetc = std_ungetc;
2371 /* input redirection */
2374 i_cgetc = i_getc; i_getc = cap_getc;
2375 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2378 i_ugetc = i_getc; i_getc = url_getc;
2379 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2382 #ifdef NUMCHAR_OPTION
2384 i_ngetc = i_getc; i_getc = numchar_getc;
2385 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2388 #ifdef UNICODE_NORMALIZATION
2389 if (nfc_f && input_f == UTF8_INPUT){
2390 i_nfc_getc = i_getc; i_getc = nfc_getc;
2391 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2394 if (mime_f && mimebuf_f==FIXED_MIME) {
2395 i_mgetc = i_getc; i_getc = mime_getc;
2396 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2399 i_bgetc = i_getc; i_getc = broken_getc;
2400 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* An explicitly requested input code forces its converter (-TRUE); JIS, EUC
   and Latin-1 input all go through e_iconv. */
2402 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2403 set_iconv(-TRUE, e_iconv);
2404 } else if (input_f == SJIS_INPUT) {
2405 set_iconv(-TRUE, s_iconv);
2406 #ifdef UTF8_INPUT_ENABLE
2407 } else if (input_f == UTF8_INPUT) {
2408 set_iconv(-TRUE, w_iconv);
2409 } else if (input_f == UTF16_INPUT) {
2410 set_iconv(-TRUE, w_iconv16);
2411 } else if (input_f == UTF32_INPUT) {
2412 set_iconv(-TRUE, w_iconv32);
/* Otherwise e_iconv is installed without marking the code as established. */
2415 set_iconv(FALSE, e_iconv);
2419 struct input_code *p = input_code_list;
2427 * Check and Ignore BOM
/*
 * check_bom: read the leading bytes of f through i_getc and look for a
 * Unicode byte-order mark. Byte sequences recognised below:
 *   00 00 FE FF -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, ENDIAN_2143
 *   EF BB BF    -> w_iconv
 *   FE FF 00 00 -> w_iconv32, ENDIAN_3412
 *   FE FF       -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, ENDIAN_LITTLE
 * When a sequence does not complete, the bytes already read are pushed back
 * with i_ungetc in reverse order so the stream is unchanged.
 * NOTE(review): the case labels, the conditions guarding set_iconv() and the
 * early returns are not present in this extract.
 */
2429 void check_bom(FILE *f)
2432 switch(c2 = (*i_getc)(f)){
/* First byte 0x00: candidates are the UTF-32 marks 00 00 FE FF / 00 00 FF FE. */
2434 if((c2 = (*i_getc)(f)) == 0x00){
2435 if((c2 = (*i_getc)(f)) == 0xFE){
2436 if((c2 = (*i_getc)(f)) == 0xFF){
2438 set_iconv(TRUE, w_iconv32);
2440 if (iconv == w_iconv32) {
2441 input_endian = ENDIAN_BIG;
2444 (*i_ungetc)(0xFF,f);
2445 }else (*i_ungetc)(c2,f);
2446 (*i_ungetc)(0xFE,f);
2447 }else if(c2 == 0xFF){
2448 if((c2 = (*i_getc)(f)) == 0xFE){
2450 set_iconv(TRUE, w_iconv32);
2452 if (iconv == w_iconv32) {
2453 input_endian = ENDIAN_2143;
2456 (*i_ungetc)(0xFF,f);
2457 }else (*i_ungetc)(c2,f);
2458 (*i_ungetc)(0xFF,f);
2459 }else (*i_ungetc)(c2,f);
2460 (*i_ungetc)(0x00,f);
2461 }else (*i_ungetc)(c2,f);
2462 (*i_ungetc)(0x00,f);
/* First byte 0xEF: candidate is the UTF-8 mark EF BB BF. */
2465 if((c2 = (*i_getc)(f)) == 0xBB){
2466 if((c2 = (*i_getc)(f)) == 0xBF){
2468 set_iconv(TRUE, w_iconv);
2470 if (iconv == w_iconv) {
2473 (*i_ungetc)(0xBF,f);
2474 }else (*i_ungetc)(c2,f);
2475 (*i_ungetc)(0xBB,f);
2476 }else (*i_ungetc)(c2,f);
2477 (*i_ungetc)(0xEF,f);
/* First byte 0xFE: FE FF 00 00 (UTF-32) is tried before FE FF (UTF-16). */
2480 if((c2 = (*i_getc)(f)) == 0xFF){
2481 if((c2 = (*i_getc)(f)) == 0x00){
2482 if((c2 = (*i_getc)(f)) == 0x00){
2484 set_iconv(TRUE, w_iconv32);
2486 if (iconv == w_iconv32) {
2487 input_endian = ENDIAN_3412;
2490 (*i_ungetc)(0x00,f);
2491 }else (*i_ungetc)(c2,f);
2492 (*i_ungetc)(0x00,f);
2493 }else (*i_ungetc)(c2,f);
2495 set_iconv(TRUE, w_iconv16);
2497 if (iconv == w_iconv16) {
2498 input_endian = ENDIAN_BIG;
2501 (*i_ungetc)(0xFF,f);
2502 }else (*i_ungetc)(c2,f);
2503 (*i_ungetc)(0xFE,f);
/* First byte 0xFF: FF FE 00 00 (UTF-32) is tried before FF FE (UTF-16). */
2506 if((c2 = (*i_getc)(f)) == 0xFE){
2507 if((c2 = (*i_getc)(f)) == 0x00){
2508 if((c2 = (*i_getc)(f)) == 0x00){
2510 set_iconv(TRUE, w_iconv32);
2512 if (iconv == w_iconv32) {
2513 input_endian = ENDIAN_LITTLE;
2516 (*i_ungetc)(0x00,f);
2517 }else (*i_ungetc)(c2,f);
2518 (*i_ungetc)(0x00,f);
2519 }else (*i_ungetc)(c2,f);
2521 set_iconv(TRUE, w_iconv16);
2523 if (iconv == w_iconv16) {
2524 input_endian = ENDIAN_LITTLE;
2527 (*i_ungetc)(0xFE,f);
2528 }else (*i_ungetc)(c2,f);
2529 (*i_ungetc)(0xFF,f);
2538 Conversion main loop. Code detection only.
/*
 * kanji_convert: main conversion loop for one input stream f.
 * Reads bytes through i_getc, tracks escape sequences / shift state
 * (input_mode, shift_mode), hands undecided 8-bit input to h_conv(), and
 * passes completed characters to iconv or oconv. At end of input iconv is
 * called with EOF and, if no input code name was recorded, the candidate
 * with the smallest score is reported.
 * NOTE(review): many lines of this function (labels, braces, several
 * statements and the return value) are not present in this extract, so the
 * notes below describe only the visible statements.
 */
2541 nkf_char kanji_convert(FILE *f)
2543 nkf_char c3, c2=0, c1, c0=0;
2544 int is_8bit = FALSE;
2546 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2547 #ifdef UTF8_INPUT_ENABLE
2548 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2555 output_mode = ASCII;
/* Loop-control shorthands used throughout the body. */
2558 #define NEXT continue /* no output, get next */
2559 #define SEND ; /* output c1 and c2, get next */
2560 #define LAST break /* end of loop, go closing */
/* Build the input/output filter chain before the first read. */
2562 module_connection();
2565 while ((c1 = (*i_getc)(f)) != EOF) {
2566 #ifdef INPUT_CODE_FIX
2572 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2573 /* in case of 8th bit is on */
2574 if (!estab_f&&!mime_decode_mode) {
2575 /* in case of not established yet */
2576 /* It is still ambiguous */
2577 if (h_conv(f, c2, c1)==EOF)
2583 /* in case of already established */
2585 /* ignore bogus code and not CP5022x UCD */
2593 /* second byte, 7 bit code */
2594 /* it might be kanji shifted */
2595 if ((c1 == DEL) || (c1 <= SP)) {
2596 /* ignore bogus first code */
2603 #ifdef UTF8_INPUT_ENABLE
/* UTF-16 input: further bytes are read per code unit; a high byte in
   0xD8..0xDB triggers reading two more bytes. */
2604 if (iconv == w_iconv16) {
2605 if (input_endian == ENDIAN_BIG) {
2607 if ((c1 = (*i_getc)(f)) != EOF) {
2608 if (0xD8 <= c2 && c2 <= 0xDB) {
2609 if ((c0 = (*i_getc)(f)) != EOF) {
2611 if ((c3 = (*i_getc)(f)) != EOF) {
2618 if ((c2 = (*i_getc)(f)) != EOF) {
2619 if (0xD8 <= c2 && c2 <= 0xDB) {
2620 if ((c3 = (*i_getc)(f)) != EOF) {
2621 if ((c0 = (*i_getc)(f)) != EOF) {
/* UTF-32 input: three more bytes are read and combined into c1 in the byte
   order selected by input_endian. */
2630 } else if(iconv == w_iconv32){
2632 if((c2 = (*i_getc)(f)) != EOF &&
2633 (c1 = (*i_getc)(f)) != EOF &&
2634 (c0 = (*i_getc)(f)) != EOF){
2635 switch(input_endian){
2637 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2640 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2643 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2646 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2656 #ifdef NUMCHAR_OPTION
2657 if (is_unicode_capsule(c1)){
2661 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2663 if (!estab_f && !iso8859_f) {
2664 /* not established yet */
2667 } else { /* estab_f==TRUE */
2672 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2673 /* SJIS X0201 Case... */
2674 if(iso2022jp_f && x0201_f==NO_X0201) {
2675 (*oconv)(GETA1, GETA2);
2682 } else if (c1==SSO && iconv != s_iconv) {
2683 /* EUC X0201 Case */
2684 c1 = (*i_getc)(f); /* skip SSO */
2686 if (SSP<=c1 && c1<0xe0) {
2687 if(iso2022jp_f && x0201_f==NO_X0201) {
2688 (*oconv)(GETA1, GETA2);
2695 } else { /* bogus code, skip SSO and one byte */
2698 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2699 (c1 == 0xFD || c1 == 0xFE)) {
2705 /* already established */
2710 } else if ((c1 > SP) && (c1 != DEL)) {
2711 /* in case of Roman characters */
2713 /* output 1 shifted byte */
2717 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2718 /* output 1 shifted byte */
2719 if(iso2022jp_f && x0201_f==NO_X0201) {
2720 (*oconv)(GETA1, GETA2);
2727 /* look like bogus code */
2730 } else if (input_mode == X0208 || input_mode == X0212 ||
2731 input_mode == X0213_1 || input_mode == X0213_2) {
2732 /* in case of Kanji shifted */
2735 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2736 /* Check MIME code */
2737 if ((c1 = (*i_getc)(f)) == EOF) {
2740 } else if (c1 == '?') {
2741 /* =? is mime conversion start sequence */
2742 if(mime_f == STRICT_MIME) {
2743 /* check in real detail */
2744 if (mime_begin_strict(f) == EOF)
2748 } else if (mime_begin(f) == EOF)
2758 /* normal ASCII code */
2761 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2764 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2767 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2768 if ((c1 = (*i_getc)(f)) == EOF) {
2769 /* (*oconv)(0, ESC); don't send bogus code */
2771 } else if (c1 == '$') {
2772 if ((c1 = (*i_getc)(f)) == EOF) {
2774 (*oconv)(0, ESC); don't send bogus code
2775 (*oconv)(0, '$'); */
2777 } else if (c1 == '@'|| c1 == 'B') {
2778 /* This is kanji introduction */
2781 set_input_codename("ISO-2022-JP");
2783 debug("ISO-2022-JP");
2786 } else if (c1 == '(') {
2787 if ((c1 = (*i_getc)(f)) == EOF) {
2788 /* don't send bogus code
2794 } else if (c1 == '@'|| c1 == 'B') {
2795 /* This is kanji introduction */
2800 } else if (c1 == 'D'){
2804 #endif /* X0212_ENABLE */
2805 } else if (c1 == (X0213_1&0x7F)){
2806 input_mode = X0213_1;
2809 } else if (c1 == (X0213_2&0x7F)){
2810 input_mode = X0213_2;
2814 /* could be some special code */
2821 } else if (broken_f&0x2) {
2822 /* accept any ESC-(-x as broken code ... */
2832 } else if (c1 == '(') {
2833 if ((c1 = (*i_getc)(f)) == EOF) {
2834 /* don't send bogus code
2836 (*oconv)(0, '('); */
2840 /* This is X0201 kana introduction */
2841 input_mode = X0201; shift_mode = X0201;
2843 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2844 /* This is X0208 kanji introduction */
2845 input_mode = ASCII; shift_mode = FALSE;
2847 } else if (broken_f&0x2) {
2848 input_mode = ASCII; shift_mode = FALSE;
2853 /* maintain various input_mode here */
2857 } else if ( c1 == 'N' || c1 == 'n'){
2859 c3 = (*i_getc)(f); /* skip SS2 */
2860 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2875 } else if (c1 == ESC && iconv == s_iconv) {
2876 /* ESC in Shift_JIS */
2877 if ((c1 = (*i_getc)(f)) == EOF) {
2878 /* (*oconv)(0, ESC); don't send bogus code */
2880 } else if (c1 == '$') {
2882 if ((c1 = (*i_getc)(f)) == EOF) {
2884 (*oconv)(0, ESC); don't send bogus code
2885 (*oconv)(0, '$'); */
2888 if (('E' <= c1 && c1 <= 'G') ||
2889 ('O' <= c1 && c1 <= 'Q')) {
2897 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2898 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2899 while ((c1 = (*i_getc)(f)) != EOF) {
2900 if (SP <= c1 && c1 <= 'z') {
2901 (*oconv)(0, c1 + c0);
2902 } else break; /* c1 == SO */
2906 if (c1 == EOF) LAST;
2913 } else if (c1 == LF || c1 == CR) {
2915 input_mode = ASCII; set_iconv(FALSE, 0);
2917 } else if (mime_decode_f && !mime_decode_mode){
2919 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2927 } else { /* if (c1 == CR)*/
2928 if ((c1=(*i_getc)(f))!=EOF) {
2932 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2946 } else if (c1 == DEL && input_mode == X0208) {
2956 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2959 if ((c0 = (*i_getc)(f)) != EOF) {
2962 if ((c3 = (*i_getc)(f)) != EOF) {
2964 (*iconv)(c2, c1, c0|c3);
2969 /* 3 bytes EUC or UTF-8 */
2970 if ((c0 = (*i_getc)(f)) != EOF) {
2972 (*iconv)(c2, c1, c0);
2980 0x7F <= c2 && c2 <= 0x92 &&
2981 0x21 <= c1 && c1 <= 0x7E) {
2983 if(c1 == 0x7F) return 0;
2984 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2987 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2991 (*oconv)(PREFIX_EUCG3 | c2, c1);
2993 #endif /* X0212_ENABLE */
2995 (*oconv)(PREFIX_EUCG3 | c2, c1);
2998 (*oconv)(input_mode, c1); /* other special case */
3004 /* goto next_word */
3008 (*iconv)(EOF, 0, 0);
3009 if (!input_codename)
3012 struct input_code *p = input_code_list;
3013 struct input_code *result = p;
3015 if (p->score < result->score) result = p;
3018 set_input_codename(result->name);
3020 debug(result->name);
/*
 * h_conv: called from kanji_convert() while the input code is still
 * undecided. Bytes are read through i_getc and collected with
 * push_hold_buf() until that call returns EOF or estab_f becomes set. If
 * needed the candidate with the smallest score is installed via
 * set_iconv(TRUE, ..). The held bytes are then replayed from hold_buf
 * through iconv, falling back to i_getc when a multi-byte character runs
 * past the held data.
 * NOTE(review): the return type line, several statements and the return
 * value are not present in this extract.
 */
3028 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3030 nkf_char ret, c3, c0;
3034 /** it must NOT be in the kanji shifted sequence */
3035 /** it must NOT be written in JIS7 */
3036 /** and it must be after 2 byte 8bit code */
3042 while ((c1 = (*i_getc)(f)) != EOF) {
3048 if (push_hold_buf(c1) == EOF || estab_f){
3054 struct input_code *p = input_code_list;
3055 struct input_code *result = p;
3060 if (p->status_func && p->score < result->score){
3065 set_iconv(TRUE, result->iconv_func);
3070 ** 1) EOF is detected, or
3071 ** 2) Code is established, or
3072 ** 3) Buffer is FULL (but last word is pushed)
3074 ** in 1) and 3) cases, we continue to use
3075 ** Kanji codes by oconv and leave estab_f unchanged.
/* Replay everything that was held back. */
3080 while (hold_index < hold_count){
3081 c2 = hold_buf[hold_index++];
3083 #ifdef NUMCHAR_OPTION
3084 || is_unicode_capsule(c2)
3089 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3090 (*iconv)(X0201, c2, 0);
3093 if (hold_index < hold_count){
3094 c1 = hold_buf[hold_index++];
3104 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3107 if (hold_index < hold_count){
3108 c0 = hold_buf[hold_index++];
3109 } else if ((c0 = (*i_getc)(f)) == EOF) {
3115 if (hold_index < hold_count){
3116 c3 = hold_buf[hold_index++];
3117 } else if ((c3 = (*i_getc)(f)) == EOF) {
3122 (*iconv)(c2, c1, c0|c3);
3127 /* 3 bytes EUC or UTF-8 */
3128 if (hold_index < hold_count){
3129 c0 = hold_buf[hold_index++];
3130 } else if ((c0 = (*i_getc)(f)) == EOF) {
3136 (*iconv)(c2, c1, c0);
3139 if (c0 == EOF) break;
/*
 * push_hold_buf: append one byte (c2 truncated to unsigned char) to hold_buf.
 * Returns EOF when the buffer has reached HOLD_SIZE*2 entries after the
 * store, otherwise the new hold_count.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not present in this extract.
 */
3144 nkf_char push_hold_buf(nkf_char c2)
3146 if (hold_count >= HOLD_SIZE*2)
3148 hold_buf[hold_count++] = (unsigned char)c2;
3149 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv: convert a Shift_JIS byte pair (c2 = first byte, c1 = second
 * byte) to the internal two-byte form; p2/p1 are the output locations.
 * Optional table lookups (CP932 extensions, CP932 inverse mapping, JIS X 0212
 * / JIS X 0213 rows) are applied before the arithmetic conversion.
 * Callers in this file treat a nonzero return as "stop" (s_iconv returns it
 * unchanged) and a zero return as success.
 * NOTE(review): the stores through p2/p1 and the return statements are not
 * present in this extract.
 */
3152 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3154 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3157 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3158 #ifdef SHIFTJIS_CP932
/* IBM extension rows are remapped through shiftjis_cp932 unless the inverse
   mapping (cp932inv_f) is active. */
3159 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3160 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3167 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3168 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3174 #endif /* SHIFTJIS_CP932 */
3176 if (!x0213_f && is_ibmext_in_sjis(c2)){
3177 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3180 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3193 if(x0213_f && c2 >= 0xF0){
3194 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3195 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3196 }else{ /* 78<=k<=94 */
3197 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3198 if (0x9E < c1) c2++;
/* Ordinary rows: each first byte covers two rows; a second byte above 0x9E
   selects the second of the pair. */
3201 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3202 if (0x9E < c1) c2++;
3205 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3212 c2 = x0212_unshift(c2);
/*
 * Input converter for Shift_JIS.
 * c2/c1 are the lead and trail bytes; c0 is not used by the visible lines.
 * A non-zero result from s2e_conv is returned to the caller unchanged.
 */
3219 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3223 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
/* Without JIS X 0213, lead bytes 0xF0..0xF9 are mapped to code points starting at U+E000, 188 cells per lead byte. */
3225 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
/* 0x7F is not a valid trail byte. */
3227 if(c1 == 0x7F) return 0;
3228 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3231 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3232 if (ret) return ret;
/*
 * Input converter for EUC-JP.
 * c2 is the first byte (0x8f introduces a three-byte sequence whose
 * remaining bytes are c1 and c0, SSO a single-shift sequence); otherwise
 * c2/c1 form a two-byte character.
 */
3238 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3243 }else if (c2 == 0x8f){
3247 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3248 /* encoding is eucJP-ms, so invert to Unicode Private Use Area */
3249 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
/* Fold the 0x8f prefix and the 7-bit row into a single value. */
3252 c2 = (c2 << 8) | (c1 & 0x7f);
3254 #ifdef SHIFTJIS_CP932
/* Round trip through Shift_JIS (e2s_conv then s2e_conv). */
3257 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3258 s2e_conv(s2, s1, &c2, &c1);
3265 #endif /* SHIFTJIS_CP932 */
3267 #endif /* X0212_ENABLE */
3268 } else if (c2 == SSO){
3271 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3274 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3275 /* encoding is eucJP-ms, so invert to Unicode Private Use Area */
3276 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3281 #ifdef SHIFTJIS_CP932
3282 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3284 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3285 s2e_conv(s2, s1, &c2, &c1);
3292 #endif /* SHIFTJIS_CP932 */
3299 #ifdef UTF8_INPUT_ENABLE
/*
 * Convert a UTF-8 sequence (c2 = lead byte, c1 = second byte, c0 =
 * remaining byte(s)) to a JIS code written through p2/p1.
 * Lead bytes 0xc0..0xef are handed to unicode_to_jis_common; with
 * NUMCHAR_OPTION the code point itself, tagged with CLASS_UNICODE, can be
 * stored in *p1 instead.
 */
3300 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3307 }else if (0xc0 <= c2 && c2 <= 0xef) {
3308 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3309 #ifdef NUMCHAR_OPTION
3312 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * Input converter for UTF-8.
 * c2 is the lead byte, c1 the second byte and c0 the remaining byte(s);
 * for four-byte sequences c0 carries two bytes, which is why it is tested
 * against the 0xc0c0 / 0x8080 masks below.
 * NOTE(review): the returns of -1 and -2 when c0 is still 0 presumably ask
 * the caller for one or two more bytes -- confirm against the caller.
 */
3320 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * Class of each lead byte 0xC0..0xFF, indexed by c2 - 0xC0.  The tens
 * digit equals the sequence length announced by the lead byte; the units
 * digit distinguishes lead bytes within that length.
 */
3323 static const char w_iconv_utf8_1st_byte[] =
3325 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3326 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3327 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3328 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3330 if (c2 < 0 || 0xff < c2) {
3331 }else if (c2 == 0) { /* 0 : 1 byte*/
3333 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3336 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
/* Each check below limits the second byte to the range allowed for the lead-byte class and requires 10xxxxxx trail bytes. */
3338 if (c1 < 0x80 || 0xBF < c1) return 0;
3341 if (c0 == 0) return -1;
3342 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3347 if (c0 == 0) return -1;
3348 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3352 if (c0 == 0) return -1;
3353 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3357 if (c0 == 0) return -2;
3358 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3362 if (c0 == 0) return -2;
3363 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3367 if (c0 == 0) return -2;
3368 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3376 if (c2 == 0 || c2 == EOF){
3377 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3378 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3381 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3390 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * Encode the Unicode value val as UTF-8.
 * *p2 receives the lead byte and *p1 the second byte.  For three-byte
 * sequences *p0 receives the third byte; for four-byte sequences *p0
 * carries the third and fourth bytes packed as (third << 8) | fourth,
 * the same layout ww16_conv decodes.
 */
3391 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3398 }else if (val < 0x800){
3399 *p2 = 0xc0 | (val >> 6);
3400 *p1 = 0x80 | (val & 0x3f);
3402 } else if (val <= NKF_INT32_C(0xFFFF)) {
3403 *p2 = 0xe0 | (val >> 12);
3404 *p1 = 0x80 | ((val >> 6) & 0x3f);
3405 *p0 = 0x80 | (val & 0x3f);
3406 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/*
 * A four-byte lead byte is 11110xxx and carries bits 18..20, exactly as
 * w_oconv emits it and ww16_conv decodes it.  The former expression
 * 0xe0 | (val >> 16) produced a three-byte style lead byte.
 */
3407 *p2 = 0xf0 | (val >> 18);
3408 *p1 = 0x80 | ((val >> 12) & 0x3f);
3409 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3418 #ifdef UTF8_INPUT_ENABLE
/*
 * Decode a UTF-8 sequence into a Unicode value.
 * The branch is selected by the lead byte c2 (>= 0xf0: four bytes,
 * >= 0xe0: three bytes, >= 0xc0: two bytes); the payload bits of each byte
 * are shifted into place and OR-ed together.
 */
3419 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3424 } else if (c2 >= 0xf0){
3425 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3426 val = (c2 & 0x0f) << 18;
3427 val |= (c1 & 0x3f) << 12;
/* The third byte sits in the upper half of c0. */
3428 val |= (c0 & 0x3f00) >> 2;
3430 }else if (c2 >= 0xe0){
3431 val = (c2 & 0x0f) << 12;
3432 val |= (c1 & 0x3f) << 6;
3434 }else if (c2 >= 0xc0){
3435 val = (c2 & 0x1f) << 6;
/*
 * Convert the Unicode value val to a JIS code written through p2/p1.
 * The value is first re-encoded as UTF-8 bytes with w16w_conv and then
 * looked up by unicode_to_jis_common.  With NUMCHAR_OPTION the value
 * itself, tagged with CLASS_UNICODE, can be stored in *p1 instead
 * (presumably when the lookup fails -- the condition is not shown here).
 */
3443 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3445 nkf_char c2, c1, c0;
3452 w16w_conv(val, &c2, &c1, &c0);
3453 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3454 #ifdef NUMCHAR_OPTION
3457 *p1 = CLASS_UNICODE | val;
3466 #ifdef UTF8_INPUT_ENABLE
/*
 * Input converter for UTF-16.
 * c2 and c1 are the high and low byte of a 16-bit unit; when that unit is
 * a high surrogate (c2 in 0xD8..0xDB), c0 holds the following 16-bit unit.
 * A non-zero conversion result is returned to the caller unchanged.
 */
3467 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3470 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3473 }else if (0xD8 <= c2 && c2 <= 0xDB) {
/* The second unit must be a low surrogate. */
3474 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/* Combine the pair: 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000. */
3476 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3478 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3483 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3484 if (ret) return ret;
/*
 * Input converter for UTF-32.
 * c1 carries the code point.  Values in the Basic Multilingual Plane go
 * through w16e_conv; the other visible branch tags c1 with CLASS_UNICODE.
 * A non-zero conversion result is returned to the caller unchanged.
 */
3489 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3493 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3494 } else if (is_unicode_bmp(c1)) {
3495 ret = w16e_conv(c1, &c2, &c1);
3498 c1 = CLASS_UNICODE | c1;
3500 if (ret) return ret;
/*
 * Look up a UTF-8 sequence (c2 = lead byte, c1 = second byte, c0 = third
 * byte) in the utf8_to_euc_* tables selected by ms_ucs_map_f and write the
 * result through p2/p1.
 * When no_best_fit_chars_f is set, a number of code points are rejected
 * with return value 1 before the lookup, depending on ms_ucs_map_f and
 * cp932inv_f.
 */
3505 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3507 const unsigned short *const *pp;
3508 const unsigned short *const *const *ppp;
/*
 * The four tables below are indexed by c1 & 0x3F (the payload of the
 * second byte).  Where the code tests an entry, a non-zero value makes
 * the function return 1.
 */
3509 static const char no_best_fit_chars_table_C2[] =
3510 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3511 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3512 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3513 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3514 static const char no_best_fit_chars_table_C2_ms[] =
3515 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3516 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3517 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3518 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3519 static const char no_best_fit_chars_table_932_C2[] =
3520 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3521 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3522 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3523 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3524 static const char no_best_fit_chars_table_932_C3[] =
3525 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3526 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3527 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3528 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Two-byte sequences (lead byte below 0xe0). */
3534 }else if(c2 < 0xe0){
3535 if(no_best_fit_chars_f){
3536 if(ms_ucs_map_f == UCS_MAP_CP932){
3539 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3542 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3545 }else if(!cp932inv_f){
3548 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3551 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3554 }else if(ms_ucs_map_f == UCS_MAP_MS){
3555 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3556 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3574 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3575 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3576 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3578 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/*
 * Three-byte sequences.
 * NOTE(review): the test is on c0, not c2.  Presumably it separates a
 * single trail byte from the packed two-byte c0 of four-byte sequences --
 * confirm.
 */
3579 }else if(c0 < 0xF0){
3580 if(no_best_fit_chars_f){
3581 if(ms_ucs_map_f == UCS_MAP_CP932){
3582 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3583 }else if(ms_ucs_map_f == UCS_MAP_MS){
3588 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3591 if(c0 == 0x92) return 1;
3596 if(c1 == 0x80 || c0 == 0x9C) return 1;
3599 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3604 if(c0 == 0x94) return 1;
3607 if(c0 == 0xBB) return 1;
3617 if(c0 == 0x95) return 1;
3620 if(c0 == 0xA5) return 1;
3627 if(c0 == 0x8D) return 1;
3630 if(c0 == 0x9E && !cp932inv_f) return 1;
3633 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3641 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3642 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3643 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
/* The lead byte selects the second-level table; c1 and c0 index into it. */
3645 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3647 #ifdef SHIFTJIS_CP932
/* A successful G3 result is round-tripped through Shift_JIS unless cp932inv_f is set. */
3648 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3650 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3651 s2e_conv(s2, s1, p2, p1);
/*
 * Two-level table lookup shared by the UTF-8 input paths.
 * pp is a table of psize row pointers indexed via c1, each row being
 * indexed via c0.  Returns 1 when the table, the row or the entry is
 * missing, when an index is out of range, or when the entry is filtered
 * out by no_cp932ext_f.
 * NOTE(review): how c1/c0 are reduced to indices and how the value is
 * split into *p2/*p1 is not shown here.
 */
3660 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3663 const unsigned short *p;
3666 if (pp == 0) return 1;
3669 if (c1 < 0 || psize <= c1) return 1;
3671 if (p == 0) return 1;
3674 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3676 if (val == 0) return 1;
3677 if (no_cp932ext_f && (
3678 (val>>8) == 0x2D || /* NEC special characters */
3679 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* A row value equal to SO denotes JIS X 0201. */
3687 if (c2 == SO) c2 = X0201;
/*
 * Feed c to the callback f one hexadecimal digit at a time: each call
 * passes 0 as the first argument and bin2hex(c >> shift) as the second.
 * NOTE(review): the loop that steps shift is not shown here.
 */
3694 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3701 (*f)(0, bin2hex(c>>shift));
/*
 * Fallback output for an unconvertible character: emits the decimal
 * digits of c through oconv, most significant digit first (0x30 is the
 * ASCII digit zero).  The magnitude guards suppress leading zeros.
 * NOTE(review): presumably the digits are wrapped in an HTML numeric
 * character reference by lines not shown here -- confirm.
 */
3711 void encode_fallback_html(nkf_char c)
3716 if(c >= NKF_INT32_C(1000000))
3717 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3718 if(c >= NKF_INT32_C(100000))
3719 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3721 (*oconv)(0, 0x30+(c/10000 )%10);
3723 (*oconv)(0, 0x30+(c/1000 )%10);
3725 (*oconv)(0, 0x30+(c/100 )%10);
3727 (*oconv)(0, 0x30+(c/10 )%10);
3729 (*oconv)(0, 0x30+ c %10);
/*
 * Fallback output for an unconvertible character: emits the hexadecimal
 * digits of c through oconv via nkf_each_char_to_hex.
 * NOTE(review): the surrounding prefix/suffix output is not shown here.
 */
3734 void encode_fallback_xml(nkf_char c)
3739 nkf_each_char_to_hex(oconv, c);
/*
 * Fallback output for an unconvertible character: emits c as hexadecimal
 * digits through oconv.  Values outside the Basic Multilingual Plane get
 * two extra leading digits (bits 20 and up, then bits 16..19) before the
 * four digits for the low 16 bits.
 * NOTE(review): the escape prefix output is not shown here.
 */
3744 void encode_fallback_java(nkf_char c)
3748 if(!is_unicode_bmp(c)){
3752 (*oconv)(0, bin2hex(c>>20));
3753 (*oconv)(0, bin2hex(c>>16));
3757 (*oconv)(0, bin2hex(c>>12));
3758 (*oconv)(0, bin2hex(c>> 8));
3759 (*oconv)(0, bin2hex(c>> 4));
3760 (*oconv)(0, bin2hex(c ));
/*
 * Fallback output for an unconvertible character: emits the hexadecimal
 * digits of c through oconv via nkf_each_char_to_hex.
 * NOTE(review): the surrounding prefix/suffix output is not shown here.
 */
3764 void encode_fallback_perl(nkf_char c)
3769 nkf_each_char_to_hex(oconv, c);
/*
 * Fallback output for an unconvertible character: ignores the incoming
 * value and emits the configured unicode_subchar instead, passing its
 * second-lowest byte and lowest byte to oconv.
 */
3774 void encode_fallback_subchar(nkf_char c)
3776 c = unicode_subchar;
3777 (*oconv)((c>>8)&0xFF, c&0xFF);
3782 #ifdef UTF8_OUTPUT_ENABLE
/*
 * Map a JIS/EUC code (c2 = row, c1 = cell) to a Unicode value through the
 * euc_to_utf8_* / x0212_to_utf8_* tables chosen by ms_ucs_map_f.
 * Row and cell are reduced to zero-based indices by masking to 7 bits and
 * subtracting 0x21, and are range-checked before use.
 */
3783 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3785 const unsigned short *p;
3788 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3796 p = euc_to_utf8_1byte;
/* G3 codes use the JIS X 0212 tables. */
3798 } else if (is_eucg3(c2)){
3799 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3802 c2 = (c2&0x7f) - 0x21;
3803 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3804 p = x0212_to_utf8_2bytes[c2];
3810 c2 = (c2&0x7f) - 0x21;
3811 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3813 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3814 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3815 euc_to_utf8_2bytes_ms[c2];
3820 c1 = (c1 & 0x7f) - 0x21;
3821 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * Output converter for UTF-8.
 * With NUMCHAR_OPTION, a c1 tagged as a Unicode capsule is written
 * directly as a 2-, 3- or 4-byte UTF-8 sequence; other characters are
 * mapped with e2w_conv and encoded with w16w_conv.
 */
3826 void w_oconv(nkf_char c2, nkf_char c1)
3832 output_bom_f = FALSE;
3843 #ifdef NUMCHAR_OPTION
3844 if (c2 == 0 && is_unicode_capsule(c1)){
/* Strip the class tag to obtain the code point. */
3845 val = c1 & VALUE_MASK;
3848 }else if (val < 0x800){
3849 (*o_putc)(0xC0 | (val >> 6));
3850 (*o_putc)(0x80 | (val & 0x3f));
3851 } else if (val <= NKF_INT32_C(0xFFFF)) {
3852 (*o_putc)(0xE0 | (val >> 12));
3853 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3854 (*o_putc)(0x80 | (val & 0x3f));
3855 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3856 (*o_putc)(0xF0 | ( val>>18));
3857 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3858 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3859 (*o_putc)(0x80 | ( val & 0x3f));
3866 output_mode = ASCII;
3868 } else if (c2 == ISO8859_1) {
3870 (*o_putc)(c1 | 0x080);
3873 val = e2w_conv(c2, c1);
3875 w16w_conv(val, &c2, &c1, &c0);
/* A zero third byte means the sequence has fewer than three bytes. */
3879 if (c0) (*o_putc)(c0);
/*
 * Output converter for UTF-16.
 * Byte order follows output_endian.  Code points above the Basic
 * Multilingual Plane (up to UNICODE_MAX) are written as a surrogate pair.
 */
3885 void w_oconv16(nkf_char c2, nkf_char c1)
/* The byte-order mark is written only once. */
3888 output_bom_f = FALSE;
3889 if (output_endian == ENDIAN_LITTLE){
3890 (*o_putc)((unsigned char)'\377');
3894 (*o_putc)((unsigned char)'\377');
3903 if (c2 == ISO8859_1) {
3906 #ifdef NUMCHAR_OPTION
3907 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3908 if (is_unicode_bmp(c1)) {
3909 c2 = (c1 >> 8) & 0xff;
3913 if (c1 <= UNICODE_MAX) {
/* 0xD7C0 == 0xD800 - (0x10000 >> 10), so the 0x10000 offset is removed implicitly. */
3914 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3915 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3916 if (output_endian == ENDIAN_LITTLE){
3917 (*o_putc)(c2 & 0xff);
3918 (*o_putc)((c2 >> 8) & 0xff);
3919 (*o_putc)(c1 & 0xff);
3920 (*o_putc)((c1 >> 8) & 0xff);
3922 (*o_putc)((c2 >> 8) & 0xff);
3923 (*o_putc)(c2 & 0xff);
3924 (*o_putc)((c1 >> 8) & 0xff);
3925 (*o_putc)(c1 & 0xff);
3932 nkf_char val = e2w_conv(c2, c1);
3933 c2 = (val >> 8) & 0xff;
3937 if (output_endian == ENDIAN_LITTLE){
/*
 * Output converter for UTF-32.
 * The code point ends up in c1 (from e2w_conv for ordinary characters) and
 * is written byte by byte in the order selected by output_endian.
 */
3946 void w_oconv32(nkf_char c2, nkf_char c1)
/* The byte-order mark is written only once. */
3949 output_bom_f = FALSE;
3950 if (output_endian == ENDIAN_LITTLE){
3951 (*o_putc)((unsigned char)'\377');
3959 (*o_putc)((unsigned char)'\377');
3968 if (c2 == ISO8859_1) {
3970 #ifdef NUMCHAR_OPTION
3971 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3975 c1 = e2w_conv(c2, c1);
3978 if (output_endian == ENDIAN_LITTLE){
3979 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3980 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3981 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3985 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3986 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3987 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * Output converter for EUC-JP.
 * Dispatches on c2: Unicode capsules (NUMCHAR_OPTION), plain single bytes
 * (c2 == 0), JIS X 0201 via SSO, ISO 8859-1, G3 codes, and ordinary
 * two-byte characters written with the high bit set on both bytes.
 */
3992 void e_oconv(nkf_char c2, nkf_char c1)
3994 #ifdef NUMCHAR_OPTION
3995 if (c2 == 0 && is_unicode_capsule(c1)){
/* Try to map the code point to JIS first; it stays a capsule if that fails. */
3996 w16e_conv(c1, &c2, &c1);
3997 if (c2 == 0 && is_unicode_capsule(c1)){
3998 c2 = c1 & VALUE_MASK;
/* U+E000..U+E757 is mapped back to user-defined rows, 94 cells per row. */
3999 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4003 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4004 c1 = 0x21 + c1 % 94;
4007 (*o_putc)((c2 & 0x7f) | 0x080);
4008 (*o_putc)(c1 | 0x080);
4010 (*o_putc)((c2 & 0x7f) | 0x080);
4011 (*o_putc)(c1 | 0x080);
4015 if (encode_fallback) (*encode_fallback)(c1);
4024 } else if (c2 == 0) {
4025 output_mode = ASCII;
4027 } else if (c2 == X0201) {
4028 output_mode = JAPANESE_EUC;
4029 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4030 } else if (c2 == ISO8859_1) {
4031 output_mode = ISO8859_1;
4032 (*o_putc)(c1 | 0x080);
4034 } else if (is_eucg3(c2)){
4035 output_mode = JAPANESE_EUC;
4036 #ifdef SHIFTJIS_CP932
/* Round trip through Shift_JIS (e2s_conv then s2e_conv). */
4039 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4040 s2e_conv(s2, s1, &c2, &c1);
4045 output_mode = ASCII;
4047 }else if (is_eucg3(c2)){
4050 (*o_putc)((c2 & 0x7f) | 0x080);
4051 (*o_putc)(c1 | 0x080);
4054 (*o_putc)((c2 & 0x7f) | 0x080);
4055 (*o_putc)(c1 | 0x080);
/* Both bytes must be graphic characters; otherwise the input code guess is reset. */
4059 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4060 set_iconv(FALSE, 0);
4061 return; /* too late to rescue this char */
4063 output_mode = JAPANESE_EUC;
4064 (*o_putc)(c2 | 0x080);
4065 (*o_putc)(c1 | 0x080);
/*
 * Relocate rows 0x75..0x7f to a range starting at 0x109 or at 0x113.
 * NOTE(review): the conditions choosing between the two ranges are not
 * shown here.  x0212_unshift applies the opposite offsets.
 */
4070 nkf_char x0212_shift(nkf_char c)
4075 if (0x75 <= c && c <= 0x7f){
4076 ret = c + (0x109 - 0x75);
4079 if (0x75 <= c && c <= 0x7f){
4080 ret = c + (0x113 - 0x75);
/*
 * Map shifted row numbers back: 0x7f..0x88 become rows starting at 0x75,
 * and 0x89..0x92 become rows starting at 0x75 tagged with
 * PREFIX_EUCG3 | 0x80.
 */
4087 nkf_char x0212_unshift(nkf_char c)
4090 if (0x7f <= c && c <= 0x88){
4091 ret = c + (0x75 - 0x7f);
4092 }else if (0x89 <= c && c <= 0x92){
4093 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4097 #endif /* X0212_ENABLE */
4097 #endif /* X0212_ENABLE */
/*
 * Convert an EUC/JIS code (c2 = row, c1 = cell) to Shift_JIS bytes stored
 * through p2/p1 (either pointer may be null).
 * Returns 1 when the row is above 0x7F after shifting.  In the arithmetic
 * branches the trail-byte offset depends on whether the row is odd or
 * even.
 */
4099 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4105 if((0x21 <= ndx && ndx <= 0x2F)){
4106 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4107 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4109 }else if(0x6E <= ndx && ndx <= 0x7E){
4110 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4111 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/* Other graphic rows are looked up in the x0212_shiftjis table. */
4117 else if(nkf_isgraph(ndx)){
4119 const unsigned short *ptr;
4120 ptr = x0212_shiftjis[ndx - 0x21];
4122 val = ptr[(c1 & 0x7f) - 0x21];
4131 c2 = x0212_shift(c2);
4133 #endif /* X0212_ENABLE */
4135 if(0x7F < c2) return 1;
4136 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4137 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * Output converter for Shift_JIS.
 * Dispatches on c2: Unicode capsules (NUMCHAR_OPTION), plain single bytes
 * (c2 == 0), JIS X 0201, ISO 8859-1, G3 codes, and ordinary two-byte
 * characters converted with e2s_conv.
 */
4141 void s_oconv(nkf_char c2, nkf_char c1)
4143 #ifdef NUMCHAR_OPTION
4144 if (c2 == 0 && is_unicode_capsule(c1)){
/* Try to map the code point to JIS first; it stays a capsule if that fails. */
4145 w16e_conv(c1, &c2, &c1);
4146 if (c2 == 0 && is_unicode_capsule(c1)){
4147 c2 = c1 & VALUE_MASK;
/* U+E000..U+E757 is mapped to lead bytes from 0xF0 up, 188 cells per lead byte. */
4148 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4151 c2 = c1 / 188 + 0xF0;
4153 c1 += 0x40 + (c1 > 0x3e);
4158 if(encode_fallback)(*encode_fallback)(c1);
4167 } else if (c2 == 0) {
4168 output_mode = ASCII;
4170 } else if (c2 == X0201) {
4171 output_mode = SHIFT_JIS;
4173 } else if (c2 == ISO8859_1) {
4174 output_mode = ISO8859_1;
4175 (*o_putc)(c1 | 0x080);
4177 } else if (is_eucg3(c2)){
4178 output_mode = SHIFT_JIS;
4179 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* Both bytes must be printable; otherwise the input code guess is reset. */
4185 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4186 set_iconv(FALSE, 0);
4187 return; /* too late to rescue this char */
4189 output_mode = SHIFT_JIS;
4190 e2s_conv(c2, c1, &c2, &c1);
4192 #ifdef SHIFTJIS_CP932
4194 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4195 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4201 #endif /* SHIFTJIS_CP932 */
/* If prefix_table has an entry for the trail byte, that entry is written first. */
4204 if (prefix_table[(unsigned char)c1]){
4205 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * Output converter for ISO-2022-JP.
 * output_mode tracks the character set currently designated; a new
 * designation is emitted only when the set of the next character differs
 * from output_mode.
 */
4211 void j_oconv(nkf_char c2, nkf_char c1)
4213 #ifdef NUMCHAR_OPTION
4214 if (c2 == 0 && is_unicode_capsule(c1)){
/* Try to map the code point to JIS first; it stays a capsule if that fails. */
4215 w16e_conv(c1, &c2, &c1);
4216 if (c2 == 0 && is_unicode_capsule(c1)){
4217 c2 = c1 & VALUE_MASK;
/* U+E000..U+E757 is mapped to rows from 0x7F up, 94 cells per row. */
4218 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4221 c2 = 0x7F + c1 / 94;
4222 c1 = 0x21 + c1 % 94;
4224 if (encode_fallback) (*encode_fallback)(c1);
4231 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4234 (*o_putc)(ascii_intro);
4235 output_mode = ASCII;
4239 } else if (is_eucg3(c2)){
4241 if(output_mode!=X0213_2){
4242 output_mode = X0213_2;
4246 (*o_putc)(X0213_2&0x7F);
4249 if(output_mode!=X0212){
4250 output_mode = X0212;
4254 (*o_putc)(X0212&0x7F);
4257 (*o_putc)(c2 & 0x7f);
4260 } else if (c2==X0201) {
4261 if (output_mode!=X0201) {
4262 output_mode = X0201;
4268 } else if (c2==ISO8859_1) {
4269 /* iso8859 introduction, or 8th bit on */
4270 /* Can we convert in 7bit form using ESC-'-'-A ?
4272 output_mode = ISO8859_1;
4274 } else if (c2 == 0) {
4275 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4278 (*o_putc)(ascii_intro);
4279 output_mode = ASCII;
4284 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4285 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4287 if (output_mode!=X0213_1) {
4288 output_mode = X0213_1;
4292 (*o_putc)(X0213_1&0x7F);
4294 }else if (output_mode != X0208) {
4295 output_mode = X0208;
4298 (*o_putc)(kanji_intro);
/*
 * Filter stage placed in front of o_base64conv: calls mime_prechar for the
 * character and then passes it on unchanged.
 */
4305 void base64_conv(nkf_char c2, nkf_char c1)
4307 mime_prechar(c2, c1);
4308 (*o_base64conv)(c2,c1);
/* Push-back stack for broken_getc / broken_ungetc; broken_counter is the number of stacked characters. */
4312 static nkf_char broken_buf[3];
4313 static int broken_counter = 0;
/* NOTE(review): presumably the previously returned character; it is compared against ESC below. */
4314 static int broken_last = 0;
/*
 * Input filter that looks for the tails of escape sequences ('$' followed
 * by '@' or 'B', '(' followed by 'J' or 'B') when the preceding character
 * was not ESC.
 * NOTE(review): presumably this re-inserts the missing ESC -- the lines
 * doing so are not shown here.
 */
4315 nkf_char broken_getc(FILE *f)
/* Stacked characters are returned first, last pushed first. */
4319 if (broken_counter>0) {
4320 return broken_buf[--broken_counter];
4323 if (c=='$' && broken_last != ESC
4324 && (input_mode==ASCII || input_mode==X0201)) {
4327 if (c1=='@'|| c1=='B') {
/* Stack both characters so that c is read before c1. */
4328 broken_buf[0]=c1; broken_buf[1]=c;
4335 } else if (c=='(' && broken_last != ESC
4336 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4339 if (c1=='J'|| c1=='B') {
4340 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * Push c back onto the broken_buf stack.  The push is skipped when two
 * characters are already stacked.
 */
4353 nkf_char broken_ungetc(nkf_char c, FILE *f)
4355 if (broken_counter<2)
4356 broken_buf[broken_counter++]=c;
/*
 * Newline filter.
 * In guess mode it records the newline convention seen in the input in
 * input_nextline (CR, LF or CRLF; EOF once conventions are mixed).
 * For output, a pending CR or a bare LF is replaced by the newline
 * selected with nlmode_f; a CR is held back in prev_cr until the next
 * character arrives.
 */
4360 void nl_conv(nkf_char c2, nkf_char c1)
4362 if (guess_f && input_nextline != EOF) {
4363 if (c2 == 0 && c1 == LF) {
4364 if (!input_nextline) input_nextline = prev_cr ? CRLF : LF;
4365 else if (input_nextline != (prev_cr ? CRLF : LF)) input_nextline = EOF;
4366 } else if (c2 == 0 && c1 == CR && input_nextline == LF) input_nextline = EOF;
4368 else if (!input_nextline) input_nextline = CR;
4369 else if (input_nextline != CR) input_nextline = EOF;
/* && binds tighter than ||: this is prev_cr || (c2 == 0 && c1 == LF). */
4371 if (prev_cr || c2 == 0 && c1 == LF) {
4373 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4374 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4376 if (c2 == 0 && c1 == CR) prev_cr = CR;
4377 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4381 Return value of fold_conv()
4383 LF add newline and output char
4384 CR add newline and output nothing
4387 1 (or else) normal output
4389 fold state in prev (previous character)
4391 >0x80 Japanese (X0208/X0201)
4396 This fold algorithm does not preserve heading space in a line.
4397 This is the main difference from fmt.
/* Column width used by fold_conv: 2 when c2 is non-zero, otherwise 1 (c1 is unused). */
4400 #define char_size(c2,c1) (c2?2:1)
/*
 * Line folding filter.
 * For every character it computes fold_state (0, LF, CR, SP or 1) from
 * the current column f_line, the previous character f_prev and simple
 * kinsoku (line-breaking prohibition) rules; the final switch acts on
 * that state.
 */
4402 void fold_conv(nkf_char c2, nkf_char c1)
4405 nkf_char fold_state;
4407 if (c1== CR && !fold_preserve_f) {
4408 fold_state=0; /* ignore cr */
4409 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4411 fold_state=0; /* ignore cr */
4412 } else if (c1== BS) {
4413 if (f_line>0) f_line--;
4415 } else if (c2==EOF && f_line != 0) { /* close open last line */
4417 } else if ((c1==LF && !fold_preserve_f)
4418 || ((c1==CR||(c1==LF&&f_prev!=CR))
4419 && fold_preserve_f)) {
4421 if (fold_preserve_f) {
4425 } else if ((f_prev == c1 && !fold_preserve_f)
4426 || (f_prev == LF && fold_preserve_f)
4427 ) { /* duplicate newline */
4430 fold_state = LF; /* output two newline */
4436 if (f_prev&0x80) { /* Japanese? */
4438 fold_state = 0; /* ignore given single newline */
4439 } else if (f_prev==SP) {
4443 if (++f_line<=fold_len)
4447 fold_state = CR; /* fold and output nothing */
4451 } else if (c1=='\f') {
4454 fold_state = LF; /* output newline and clear */
4455 } else if ( (c2==0 && c1==SP)||
4456 (c2==0 && c1==TAB)||
4457 (c2=='!'&& c1=='!')) {
4458 /* X0208 kankaku or ascii space */
4460 fold_state = 0; /* remove duplicate spaces */
4463 if (++f_line<=fold_len)
4464 fold_state = SP; /* output ASCII space only */
4466 f_prev = SP; f_line = 0;
4467 fold_state = CR; /* fold and output nothing */
4471 prev0 = f_prev; /* we still need this one... , but almost done */
/* NOTE(review): the second operand is redundant whenever X0201 is non-zero. */
4473 if (c2 || c2==X0201)
4474 f_prev |= 0x80; /* this is Japanese */
4475 f_line += char_size(c2,c1);
4476 if (f_line<=fold_len) { /* normal case */
4479 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4480 f_line = char_size(c2,c1);
4481 fold_state = LF; /* We can't wait, do fold now */
4482 } else if (c2==X0201) {
4483 /* simple kinsoku rules return 1 means no folding */
4484 if (c1==(0xde&0x7f)) fold_state = 1; /* halfwidth voiced sound mark */
4485 else if (c1==(0xdf&0x7f)) fold_state = 1; /* halfwidth semi-voiced sound mark */
4486 else if (c1==(0xa4&0x7f)) fold_state = 1; /* halfwidth ideographic comma */
4487 else if (c1==(0xa3&0x7f)) fold_state = 1; /* halfwidth right corner bracket */
4488 else if (c1==(0xa1&0x7f)) fold_state = 1; /* halfwidth ideographic full stop */
4489 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4490 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4492 fold_state = LF;/* add one new f_line before this character */
4495 fold_state = LF;/* add one new f_line before this character */
4498 /* kinsoku point in ASCII */
4499 if ( c1==')'|| /* { [ ( */
4510 /* just after special */
4511 } else if (!is_alnum(prev0)) {
4512 f_line = char_size(c2,c1);
4514 } else if ((prev0==SP) || /* ignored new f_line */
4515 (prev0==LF)|| /* ignored new f_line */
4516 (prev0&0x80)) { /* X0208 - ASCII */
4517 f_line = char_size(c2,c1);
4518 fold_state = LF;/* add one new f_line before this character */
4520 fold_state = 1; /* default no fold in ASCII */
4524 if (c1=='"') fold_state = 1; /* ideographic comma */
4525 else if (c1=='#') fold_state = 1; /* ideographic full stop */
4526 else if (c1=='W') fold_state = 1; /* right corner bracket */
4527 else if (c1=='K') fold_state = 1; /* fullwidth right parenthesis */
4528 else if (c1=='$') fold_state = 1; /* fullwidth comma */
4529 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
4530 else if (c1=='\'') fold_state = 1; /* NOTE(review): the original comment decodes to a fullwidth plus sign, which does not match this cell -- confirm the intended character */
4531 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
4532 else if (c1==')') fold_state = 1; /* fullwidth question mark */
4533 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
4534 else if (c1=='+') fold_state = 1; /* voiced sound mark */
4535 else if (c1==',') fold_state = 1; /* semi-voiced sound mark */
4536 /* default no fold in kinsoku */
4539 f_line = char_size(c2,c1);
4540 /* add one new f_line before this character */
4543 f_line = char_size(c2,c1);
4545 /* add one new f_line before this character */
4550 /* terminator process */
4551 switch(fold_state) {
/* One-character look-behind used by z_conv to combine a JIS X 0201 kana with a following sound mark. */
4570 nkf_char z_prev2=0,z_prev1=0;
/*
 * Width/alphabet conversion filter.
 * - JIS X 0201 kana are converted through the cv/dv/ev tables (plain,
 *   voiced and semi-voiced forms); a kana that can take a sound mark is
 *   held in z_prev2/z_prev1 until the next character is known.
 * - alpha_f is tested bit by bit (1, 4 and 8 in the visible lines) to
 *   select conversions of JIS X 0208 alphabet and symbols, and the
 *   replacement of the ASCII characters > < " & by HTML entities.
 * - JIS X 0208 katakana (row 0x25) can be converted to JIS X 0201 kana
 *   through fullwidth_to_halfwidth.
 */
4572 void z_conv(nkf_char c2, nkf_char c1)
4575 /* if (c2) c1 &= 0x7f; assertion */
4577 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4583 if (z_prev2 == X0201) {
4585 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
4587 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4589 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
4591 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4596 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4599 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4600 /* wait for dakuten or handakuten */
4605 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4616 if (alpha_f&1 && c2 == 0x23) {
4617 /* JISX0208 Alphabet */
4619 } else if (c2 == 0x21) {
4620 /* JISX0208 Kigou */
4625 } else if (alpha_f&4) {
4630 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4636 if (alpha_f&8 && c2 == 0) {
/* HTML entity for each of the four special characters; it is emitted character by character below. */
4640 case '>': entity = "&gt;"; break;
4641 case '<': entity = "&lt;"; break;
4642 case '\"': entity = "&quot;"; break;
4643 case '&': entity = "&amp;"; break;
4646 while (*entity) (*o_zconv)(0, *entity++);
4652 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4657 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4661 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4665 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4669 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4673 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4677 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4681 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4685 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4690 (*o_zconv)(X0201, c);
4693 } else if (c2 == 0x25) {
4694 /* JISX0208 Katakana */
/*
 * Indexed by c1 - 0x20.  The high byte is the JIS X 0201 kana code; a
 * low byte of 0x5E or 0x5F is the voiced or semi-voiced sound mark
 * (0xde & 0x7f, 0xdf & 0x7f).  A zero entry means no conversion.
 */
4695 static const int fullwidth_to_halfwidth[] =
4697 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4698 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4699 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4700 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4701 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4702 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4703 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4704 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4705 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4706 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4707 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4708 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4710 if (fullwidth_to_halfwidth[c1-0x20]){
4711 c2 = fullwidth_to_halfwidth[c1-0x20];
4712 (*o_zconv)(X0201, c2>>8);
4714 (*o_zconv)(X0201, c2&0xFF);
/* ROT13: letters up to M (or m) are shifted forward by 13, the remaining letters up to Z (or z) back by 13. */
4724 #define rot13(c) ( \
4726 (c <= 'M') ? (c + 13): \
4727 (c <= 'Z') ? (c - 13): \
4729 (c <= 'm') ? (c + 13): \
4730 (c <= 'z') ? (c - 13): \
/* ROT47: characters up to O are shifted forward by 47, the remaining characters up to tilde back by 47. */
4734 #define rot47(c) ( \
4736 ( c <= 'O') ? (c + 47) : \
4737 ( c <= '~') ? (c - 47) : \
/*
 * Rotation filter placed in front of o_rot_conv.  Single-byte sets
 * (c2 == 0, X0201, ISO8859_1) take one branch, everything else the other.
 * NOTE(review): presumably rot13 is applied to the former and rot47 to
 * two-byte characters -- those lines are not shown here.
 */
4741 void rot_conv(nkf_char c2, nkf_char c1)
4743 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4749 (*o_rot_conv)(c2,c1);
/*
 * Hiragana/katakana conversion filter placed in front of o_hira_conv.
 * Cells 0x21..0x73 are common to the hiragana row (0x24) and the katakana
 * row; cell 0x74 is emitted as U+3094 when the output is UTF-8 or UTF-16.
 * A few iteration marks in row 0x21 (0x33/0x34 and 0x35/0x36) are handled
 * as well.
 */
4752 void hira_conv(nkf_char c2, nkf_char c1)
4756 if (0x20 < c1 && c1 < 0x74) {
4758 (*o_hira_conv)(c2,c1);
4760 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4762 c1 = CLASS_UNICODE | 0x3094;
4763 (*o_hira_conv)(c2,c1);
4766 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4768 (*o_hira_conv)(c2,c1);
4773 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4776 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4778 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4782 (*o_hira_conv)(c2,c1);
/*
 * Filter placed in front of o_iso2022jp_check_conv.  It tests the
 * character against single-byte values 0x7f..0xff, rows 0x29..0x2f and
 * 0x75..0x7e, and the start/end pairs listed in range[].
 * NOTE(review): presumably matching characters are replaced before being
 * passed on -- the replacement lines are not shown here.
 */
4786 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
/* Each entry is a pair {start, end}. */
4788 static const nkf_char range[RANGE_NUM_MAX][2] = {
4809 nkf_char start, end, c;
4811 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4815 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4820 for (i = 0; i < RANGE_NUM_MAX; i++) {
4821 start = range[i][0];
4824 if (c >= start && c <= end) {
4829 (*o_iso2022jp_check_conv)(c2,c1);
4833 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * Recognised encoded-word prefixes (\075 is the equals sign).  The tables
 * mime_priority_func, mime_encode and mime_encode_method are laid out in
 * the same order, one entry per pattern.
 */
4835 static const unsigned char *mime_pattern[] = {
4836 (const unsigned char *)"\075?EUC-JP?B?",
4837 (const unsigned char *)"\075?SHIFT_JIS?B?",
4838 (const unsigned char *)"\075?ISO-8859-1?Q?",
4839 (const unsigned char *)"\075?ISO-8859-1?B?",
4840 (const unsigned char *)"\075?ISO-2022-JP?B?",
4841 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4842 #if defined(UTF8_INPUT_ENABLE)
4843 (const unsigned char *)"\075?UTF-8?B?",
4844 (const unsigned char *)"\075?UTF-8?Q?",
4846 (const unsigned char *)"\075?US-ASCII?Q?",
4851 /* Markers used to raise the priority of the matching input code */
/* One entry per mime_pattern element; mime_begin_strict passes the entry to set_iconv. */
4852 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4853 e_iconv, s_iconv, 0, 0, 0, 0,
4854 #if defined(UTF8_INPUT_ENABLE)
/* Character set of each mime_pattern element, in the same order. */
4860 static const nkf_char mime_encode[] = {
4861 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4862 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding letter (B or Q) of each mime_pattern element, in the same order. */
4869 static const nkf_char mime_encode_method[] = {
4870 'B', 'B','Q', 'B', 'B', 'Q',
4871 #if defined(UTF8_INPUT_ENABLE)
/* Size of the recovery buffer in mime_begin_strict and look-ahead limit of mime_begin. */
4879 #define MAXRECOVER 20
/*
 * Install mime_getc / mime_ungetc as the current input functions, saving
 * the previous pair in i_mgetc / i_mungetc.  In STRICT_MIME mode a second
 * layer (mime_getc_buf / mime_ungetc_buf) is inserted below it.  Nothing
 * is done when mime_getc is already installed.
 */
4881 void switch_mime_getc(void)
4883 if (i_getc!=mime_getc) {
4884 i_mgetc = i_getc; i_getc = mime_getc;
4885 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4886 if(mime_f==STRICT_MIME) {
4887 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4888 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * Undo switch_mime_getc: restore the saved input functions and, if an
 * input converter was saved in mime_iconv_back, reinstall it.
 */
4893 void unswitch_mime_getc(void)
4895 if(mime_f==STRICT_MIME) {
4896 i_mgetc = i_mgetc_buf;
4897 i_mungetc = i_mungetc_buf;
4900 i_ungetc = i_mungetc;
4901 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4902 mime_iconv_back = NULL;
/*
 * Strict recognition of an encoded-word prefix.
 * The input is matched case-insensitively against the entries of
 * mime_pattern.  On a mismatch the next pattern sharing the already
 * matched prefix is tried.  On success the decode mode is taken from the
 * pattern, the input converter for that pattern is installed, and the
 * result of mime_integrity is returned.
 */
4905 nkf_char mime_begin_strict(FILE *f)
4909 const unsigned char *p,*q;
4910 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4912 mime_decode_mode = FALSE;
4913 /* =? has been checked */
4915 p = mime_pattern[j];
4918 for(i=2;p[i]>SP;i++) { /* start at =? */
4919 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4920 /* pattern fails, try next one */
4922 while (mime_pattern[++j]) {
4923 p = mime_pattern[j];
4924 for(k=2;k<i;k++) /* assume length(p) > i */
4925 if (p[k]!=q[k]) break;
4926 if (k==i && nkf_toupper(c1)==p[k]) break;
4928 p = mime_pattern[j];
4929 if (p) continue; /* found next one, continue */
4930 /* all fails, output from recovery buffer */
/* p[i] is the terminator and p[i-1] the closing question mark, so p[i-2] is the encoding letter. */
4938 mime_decode_mode = p[i-2];
/* Remember the current converter so that unswitch_mime_getc can restore it. */
4940 mime_iconv_back = iconv;
4941 set_iconv(FALSE, mime_priority_func[j]);
4942 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4944 if (mime_decode_mode=='B') {
4945 mimebuf_f = unbuf_f;
4947 /* do MIME integrity check */
4948 return mime_integrity(f,mime_pattern[j]);
/*
 * Read one character for the MIME decoder: from the underlying input
 * function when mimebuf_f is set, otherwise from the Fifo at mime_input.
 */
4956 nkf_char mime_getc_buf(FILE *f)
4958 /* we don't keep eof of Fifo, because it contains ?= as
4959 a terminator. It was checked in mime_integrity. */
4960 return ((mimebuf_f)?
4961 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * Push c back, either through the underlying unget function or by
 * stepping mime_input back and storing c in the Fifo.
 * NOTE(review): the condition selecting between the two is not shown
 * here; presumably mimebuf_f, as in mime_getc_buf.
 */
4964 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4967 (*i_mungetc_buf)(c,f);
4969 Fifo(--mime_input) = (unsigned char)c;
/*
 * Non-strict recognition of an encoded-word prefix.
 * Every character read is also stored in the Fifo so that the text can be
 * re-read if it turns out not to be a MIME preamble.  At most MAXRECOVER
 * characters are examined.  Returns the last character read.
 */
4973 nkf_char mime_begin(FILE *f)
4978 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4979 /* re-read and convert again from mime_buffer. */
4981 /* =? has been checked */
4983 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4984 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4985 /* We accept any character type even if it is broken by new lines */
4986 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4987 if (c1==LF||c1==SP||c1==CR||
4988 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4990 /* Failed. But this could be another MIME preamble */
4998 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4999 if (!(++i<MAXRECOVER) || c1==EOF) break;
/* The encoding letter selects the decode mode, case-insensitively. */
5000 if (c1=='b'||c1=='B') {
5001 mime_decode_mode = 'B';
5002 } else if (c1=='q'||c1=='Q') {
5003 mime_decode_mode = 'Q';
5007 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5008 if (!(++i<MAXRECOVER) || c1==EOF) break;
5010 mime_decode_mode = FALSE;
5016 if (!mime_decode_mode) {
5017 /* false MIME preamble, restart from mime_buffer */
5018 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5019 /* Since we are in MIME mode until buffer becomes empty, */
5020 /* we never go into mime_begin again for a while. */
5023 /* discard mime preamble, and goto MIME mode */
5025 /* do no MIME integrity check */
5026 return c1; /* used only for checking EOF */
/*
 * Output routine taking a single character.
 * NOTE(review): the body is not visible in this excerpt; the name suggests
 * that c is discarded. Confirm against the full file.
 */
5030 void no_putc(nkf_char c)
/*
 * Write str followed by a newline to stderr; a NULL str is printed as "NULL".
 * NOTE(review): any guard around the fprintf is not visible in this excerpt.
 */
5035 void debug(const char *str)
5038 fprintf(stderr, "%s\n", str ? str : "NULL");
5043 void set_input_codename(char *codename)
5045 if (!input_codename) {
5046 input_codename = codename;
5047 } else if (strcmp(codename, input_codename) != 0) {
5048 input_codename = "";
5052 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Print the guessed input encoding to stdout, prefixed by "filename: " when
 * filename is not NULL.
 *
 * An empty input_codename is handled as a separate case first. Otherwise the
 * generic names "Shift_JIS", "EUC-JP" and "ISO-2022-JP" may be refined to a
 * vendor variant (CP932, EUCJP-MS, CP51932, CP50221, CP50220) according to
 * the score bits of the input_code entry found for the current iconv, and a
 * line-ending suffix is chosen from input_nextline.
 * NOTE(review): several lines (including the printf calls that frame the
 * output) are not visible in this excerpt.
 */
5053 void print_guessed_code(char *filename)
5055 char *codename = "BINARY";
5056 char *str_nlmode = NULL;
5057 if (filename != NULL) printf("%s: ", filename);
/* empty (but non-NULL) name: set_input_codename stores "" on conflicting names */
5058 if (input_codename && !*input_codename) {
/* NOTE(review): p is dereferenced below without a visible NULL check; confirm
   that find_inputcode_byfunc cannot return NULL for the active iconv. */
5061 struct input_code *p = find_inputcode_byfunc(iconv);
5063 printf("%s\n", input_codename ? input_codename : "ASCII");
/* no name recorded at all is reported as ASCII */
5065 if (!input_codename) {
5066 input_codename = "ASCII";
5067 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5068 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5069 input_codename = "CP932";
5070 } else if (strcmp(input_codename, "EUC-JP") == 0) {
5071 if (p->score & (SCORE_X0212))
5072 input_codename = "EUCJP-MS";
5073 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5074 input_codename = "CP51932";
5075 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
5076 if (p->score & (SCORE_KANA))
5077 input_codename = "CP50221";
5078 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5079 input_codename = "CP50220";
5083 input_nextline == CR ? " (CR)" :
5084 input_nextline == LF ? " (LF)" :
5085 input_nextline == CRLF ? " (CRLF)" :
5086 input_nextline == EOF ? " (MIXED NL)" :
/*
 * Shared helper for prefix + two-hex-digit input escapes.
 *
 *   ch  the escape prefix character
 *   f   input stream handed to g and u
 *   g   function used to read a character
 *   u   function used to push a character back
 *
 * c2 and c3 are each tested with nkf_isxdigit; when both are hex digits the
 * decoded byte (hex2bin(c2) << 4) | hex2bin(c3) is returned.
 * NOTE(review): the reads into c1/c2/c3 and the fallback paths taken when a
 * test fails are not visible in this excerpt.
 */
5095 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5097 nkf_char c1, c2, c3;
5103 if (!nkf_isxdigit(c2)){
5108 if (!nkf_isxdigit(c3)){
5113 return (hex2bin(c2) << 4) | hex2bin(c3);
5116 nkf_char cap_getc(FILE *f)
5118 return hex_getc(':', f, i_cgetc, i_cungetc);
5121 nkf_char cap_ungetc(nkf_char c, FILE *f)
5123 return (*i_cungetc)(c, f);
5126 nkf_char url_getc(FILE *f)
5128 return hex_getc('%', f, i_ugetc, i_uungetc);
5131 nkf_char url_ungetc(nkf_char c, FILE *f)
5133 return (*i_uungetc)(c, f);
5137 #ifdef NUMCHAR_OPTION
/*
 * Input filter for numeric character references, reading through the
 * i_ngetc / i_nungetc hooks.
 *
 * After an 'x' or 'X' marker up to 7 characters are examined as hexadecimal
 * digits; otherwise up to 8 characters are examined as decimal digits. The
 * accumulated value c is returned tagged with CLASS_UNICODE.
 * NOTE(review): the declaration of buf, the handling of the leading
 * characters and the failure paths are not visible in this excerpt. Confirm
 * that buf is large enough for the longest accepted reference (the prefix
 * characters plus the digit loops below).
 */
5138 nkf_char numchar_getc(FILE *f)
5140 nkf_char (*g)(FILE *) = i_ngetc;
5141 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5152 if (buf[i] == 'x' || buf[i] == 'X'){
/* hexadecimal form */
5153 for (j = 0; j < 7; j++){
5155 if (!nkf_isxdigit(buf[i])){
5162 c |= hex2bin(buf[i]);
/* decimal form; hex2bin is also used to convert a decimal digit */
5165 for (j = 0; j < 8; j++){
5169 if (!nkf_isdigit(buf[i])){
5176 c += hex2bin(buf[i]);
5182 return CLASS_UNICODE | c;
5191 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5193 return (*i_nungetc)(c, f);
5197 #ifdef UNICODE_NORMALIZATION
5199 /* Normalization Form C */
/*
 * Input filter that rewrites a decomposed byte sequence into its composed
 * form, reading through the i_nfc_getc / i_nfc_ungetc hooks.
 *
 * normalization_table is searched by bisection (lower/upper) comparing each
 * entry's .nfd sequence with the bytes collected in buf; the .nfc sequence
 * of the selected entry is then copied into buf.
 * NOTE(review): the declaration and filling of buf, the match test and the
 * return path are not visible in this excerpt.
 */
5200 nkf_char nfc_getc(FILE *f)
5202 nkf_char (*g)(FILE *f) = i_nfc_getc;
5203 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5204 int i=0, j, k=1, lower, upper;
5206 const nkf_nfchar *array;
/* (buf[i] & 0xc0) != 0x80 rejects bytes of the form 10xxxxxx */
5209 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5210 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5211 while (upper >= lower) {
5212 j = (lower+upper) / 2;
5213 array = normalization_table[j].nfd;
5214 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5215 if (array[k] != buf[k]){
/* first differing byte decides which half of the table to keep */
5216 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5223 array = normalization_table[j].nfc;
5224 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5225 buf[i] = (nkf_char)(array[i]);
5236 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5238 return (*i_nfc_ungetc)(c, f);
5240 #endif /* UNICODE_NORMALIZATION */
/*
 * NOTE(review): the function header is not visible in this excerpt; this
 * body appears to be the MIME-decoding getc (mime_integrity refers to it as
 * mime_getc). Confirm against the full file.
 *
 * Behaviour visible here:
 *  - bytes pending in the Fifo (mime_top != mime_last) are returned first;
 *  - when mime_decode_mode is 1 or FALSE, MIME mode is left through
 *    unswitch_mime_getc() and a raw character is read with i_getc;
 *  - in 'Q' mode, '_' yields SP (unless FIXED_MIME), "?=" ends the
 *    encoded-word and an '=' followed by two characters yields
 *    (hex2bin(c2)<<4) + hex2bin(c3);
 *  - in 'B' mode four characters c1..c4 are decoded with base64decode into
 *    up to three bytes that are queued in the Fifo;
 *  - after "?=" the following white space is collected in lwsp_buf (malloc,
 *    grown with realloc) and may be pushed back with i_ungetc.
 */
5246 nkf_char c1, c2, c3, c4, cc;
5247 nkf_char t1, t2, t3, t4, mode, exit_mode;
5248 nkf_char lwsp_count;
5251 nkf_char lwsp_size = 128;
5253 if (mime_top != mime_last) { /* Something is in FIFO */
5254 return Fifo(mime_top++);
/* buffer drained and nothing to decode: leave MIME mode */
5256 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5257 mime_decode_mode=FALSE;
5258 unswitch_mime_getc();
5259 return (*i_getc)(f);
5262 if (mimebuf_f == FIXED_MIME)
5263 exit_mode = mime_decode_mode;
5266 if (mime_decode_mode == 'Q') {
5267 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5269 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5270 if (c1<=SP || DEL<=c1) {
5271 mime_decode_mode = exit_mode; /* prepare for quit */
5274 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5278 mime_decode_mode = exit_mode; /* prepare for quit */
5279 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5280 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5281 /* end Q encoding */
5282 input_mode = exit_mode;
/* 5 bytes of slack beyond lwsp_size; the growth check below runs after the store */
5284 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5285 if (lwsp_buf==NULL) {
5286 perror("can't malloc");
5289 while ((c1=(*i_getc)(f))!=EOF) {
5294 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5302 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5303 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5318 lwsp_buf[lwsp_count] = (unsigned char)c1;
5319 if (lwsp_count++>lwsp_size){
/* realloc result goes to a temporary so lwsp_buf stays valid on failure */
5321 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5322 if (lwsp_buf_new==NULL) {
5324 perror("can't realloc");
5327 lwsp_buf = lwsp_buf_new;
/* push the collected white space back unless an '=' follows directly after SP/TAB */
5333 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5335 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5336 i_ungetc(lwsp_buf[lwsp_count],f);
5342 if (c1=='='&&c2<SP) { /* this is soft wrap */
/* NOTE(review): the loop condition already reads a character into c1 and the
   next line reads another one, so characters are consumed in pairs here.
   Confirm that this is intended. */
5343 while((c1 = (*i_mgetc)(f)) <=SP) {
5344 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5346 mime_decode_mode = 'Q'; /* still in MIME */
5347 goto restart_mime_q;
5350 mime_decode_mode = 'Q'; /* still in MIME */
5354 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5355 if (c2<=SP) return c2;
5356 mime_decode_mode = 'Q'; /* still in MIME */
5357 return ((hex2bin(c2)<<4) + hex2bin(c3));
/* neither 'Q' nor 'B': stop decoding and pass the character through */
5360 if (mime_decode_mode != 'B') {
5361 mime_decode_mode = FALSE;
5362 return (*i_mgetc)(f);
5366 /* Base64 encoding */
5368 MIME allows line break in the middle of
5369 Base64, but we are very pessimistic in decoding
5370 in unbuf mode because MIME encoded code may broken by
5371 less or editor's control sequence (such as ESC-[-K in unbuffered
5372 mode. ignore incomplete MIME.
5374 mode = mime_decode_mode;
5375 mime_decode_mode = exit_mode; /* prepare for quit */
5377 while ((c1 = (*i_mgetc)(f))<=SP) {
5382 if ((c2 = (*i_mgetc)(f))<=SP) {
5385 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5386 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5389 if ((c1 == '?') && (c2 == '=')) {
5392 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5393 if (lwsp_buf==NULL) {
5394 perror("can't malloc");
5397 while ((c1=(*i_getc)(f))!=EOF) {
5402 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5410 if ((c1=(*i_getc)(f))!=EOF) {
5414 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5429 lwsp_buf[lwsp_count] = (unsigned char)c1;
5430 if (lwsp_count++>lwsp_size){
5432 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5433 if (lwsp_buf_new==NULL) {
5435 perror("can't realloc");
5438 lwsp_buf = lwsp_buf_new;
5444 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5446 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5447 i_ungetc(lwsp_buf[lwsp_count],f);
5454 if ((c3 = (*i_mgetc)(f))<=SP) {
5457 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5458 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5462 if ((c4 = (*i_mgetc)(f))<=SP) {
5465 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5466 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5470 mime_decode_mode = mode; /* still in MIME sigh... */
5472 /* BASE 64 decoding */
5474 t1 = 0x3f & base64decode(c1);
5475 t2 = 0x3f & base64decode(c2);
5476 t3 = 0x3f & base64decode(c3);
5477 t4 = 0x3f & base64decode(c4);
5478 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5480 Fifo(mime_last++) = (unsigned char)cc;
5481 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5483 Fifo(mime_last++) = (unsigned char)cc;
5484 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5486 Fifo(mime_last++) = (unsigned char)cc;
5491 return Fifo(mime_top++);
/*
 * Push character c back into the MIME Fifo by stepping mime_top back one
 * slot and storing c there (truncated to unsigned char). The stream f is not
 * used by the visible code.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
5494 nkf_char mime_ungetc(nkf_char c, FILE *f)
5496 Fifo(--mime_top) = (unsigned char)c;
/*
 * Pre-read a candidate MIME encoded-word into the Fifo and check it.
 *
 *   f  input stream, read with i_getc
 *   p  the header pattern text that was already matched; it is copied into
 *      the Fifo first
 *
 * Reading stops at EOF, when the Fifo is full, when the closing "?=" is seen
 * (c is '=' and the previous character d is '?'), or on a character outside
 * '+', '/', '=', '?' and alphanumerics. If the word turns out to be
 * incomplete, mime_last is set to mark the undecoded text, mime_decode_mode
 * becomes 1 (replay without decoding) and switch_mime_getc() is called so
 * the buffered text is read back.
 * NOTE(review): declarations, the maintenance of d and the return
 * statements are not visible in this excerpt.
 */
5500 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5504 /* In buffered mode, read until =? or NL or buffer full
5506 mime_input = mime_top;
5507 mime_last = mime_top;
5509 while(*p) Fifo(mime_input++) = *p++;
5512 while((c=(*i_getc)(f))!=EOF) {
5513 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5514 break; /* buffer full */
5516 if (c=='=' && d=='?') {
5517 /* checked. skip header, start decode */
5518 Fifo(mime_input++) = (unsigned char)c;
5519 /* mime_last_input = mime_input; */
5524 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5526 /* Should we check length mod 4? */
5527 Fifo(mime_input++) = (unsigned char)c;
5530 /* In case of Incomplete MIME, no MIME decode */
5531 Fifo(mime_input++) = (unsigned char)c;
5532 mime_last = mime_input; /* point undecoded buffer */
5533 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5534 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Map one Base64 character to its 6-bit value:
 *   A..Z -> 0-25, a..z -> 26-51, 0..9 -> 52-61,
 *   '+' and '-' -> 62, '/' and '_' -> 63.
 * The constants are written as character literals ('G', '4', '>', '?') whose
 * codes equal the numeric offsets given in the inline comments.
 * NOTE(review): the outer range tests and the return statement are not
 * visible in this excerpt; the visible branches do not range-check, so
 * callers mask the result (mime_getc uses 0x3f & base64decode(c)).
 */
5538 nkf_char base64decode(nkf_char c)
5543 i = c - 'A'; /* A..Z 0-25 */
5544 } else if (c == '_') {
5545 i = '?' /* 63 */ ; /* _ 63 */
5547 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5549 } else if (c > '/') {
5550 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5551 } else if (c == '+' || c == '-') {
5552 i = '>' /* 62 */ ; /* + and - 62 */
5554 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet indexed by 6-bit value; used by the MIME encoder below. */
5559 static const char basis_64[] =
5560 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Carry between calls of the Base64 encoder: close_mime and mimeout_addchar
   combine its low bits with the next input byte. */
5562 static nkf_char b64c;
/* Pending output characters are held in mimeout_buf until the encoder decides
   whether to emit them raw or encoded; mimeout_buf_count is the fill level.
   The array has one element more than MIMEOUT_BUF_LENGTH. */
5563 #define MIMEOUT_BUF_LENGTH (60)
5564 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5565 int mimeout_buf_count = 0;
/*
 * Start a MIME encoded-word for output mode `mode`.
 *
 * The header pattern is looked up in mime_pattern by matching mode against
 * mime_encode (defaulting to mime_pattern[0]) and mimeout_mode is set from
 * mime_encode_method. Leading white space held in mimeout_buf is written
 * through o_mputc, and the remaining buffered characters are re-submitted
 * through mime_putc after mimeout_buf_count has been reset.
 * NOTE(review): the emission of the header text itself and several branch
 * bodies are not visible in this excerpt.
 */
5567 void open_mime(nkf_char mode)
5569 const unsigned char *p;
5572 p = mime_pattern[0];
5573 for(i=0;mime_pattern[i];i++) {
5574 if (mode == mime_encode[i]) {
5575 p = mime_pattern[i];
5579 mimeout_mode = mime_encode_method[i];
/* base64_count is compared with 45 here - presumably the current output
   column; confirm against the full file */
5581 if (base64_count>45) {
5582 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5583 (*o_mputc)(mimeout_buf[i]);
5589 if (mimeout_buf_count>0
5590 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5591 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5595 for (;i<mimeout_buf_count;i++) {
5596 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5597 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5598 (*o_mputc)(mimeout_buf[i]);
/* remember the fill level, then empty the buffer before replaying it */
5608 j = mimeout_buf_count;
5609 mimeout_buf_count = 0;
5611 mime_putc(mimeout_buf[i]);
/*
 * Finish the current MIME encoded-word.
 *
 * Depending on mimeout_mode, the bits still held in b64c are flushed as a
 * final Base64 character (low 2 bits shifted left by 4, or low 4 bits
 * shifted left by 2). The mode is then updated, with different handling for
 * FIXED_MIME output.
 * NOTE(review): the case labels, padding and terminator output are not
 * visible in this excerpt.
 */
5615 void close_mime(void)
5625 switch(mimeout_mode) {
5630 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5636 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5641 if (mimeout_mode > 0) {
5642 if (mimeout_f!=FIXED_MIME) {
5644 } else if (mimeout_mode != 'Q')
/*
 * Encode one character c according to mimeout_mode and write the result
 * through o_mputc.
 *
 * Visible here: characters failing nkf_isalnum are written as two hex digits
 * (bin2hex of the high and low nibble), and the Base64 path emits
 * basis_64[] characters built from c and the bits carried in b64c, covering
 * the three byte positions of a Base64 group.
 * NOTE(review): the case labels, the escape prefix output and the b64c
 * updates are not visible in this excerpt.
 */
5649 void mimeout_addchar(nkf_char c)
5651 switch(mimeout_mode) {
5656 } else if(!nkf_isalnum(c)) {
5658 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5659 (*o_mputc)(bin2hex((c&0xf)));
/* first byte of a group: top 6 bits */
5668 (*o_mputc)(basis_64[c>>2]);
/* second byte: 2 carried bits + top 4 bits of c */
5673 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
/* third byte: 4 carried bits + top 2 bits of c, then the low 6 bits of c */
5679 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5680 (*o_mputc)(basis_64[c & 0x3F]);
5691 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * Line-length control called before a character (c2, c1) is MIME-encoded.
 *
 * base64_count plus the encoded size of the pending buffer
 * (mimeout_buf_count/3*4) is compared with a limit (73, 66 or 60 depending
 * on the branch). When the limit is exceeded the current word is closed and
 * the line folded by sending (EOF,0), (0,LF) and (0,SP) through
 * o_base64conv. In the last branch a new encoded-word is opened first, using
 * 'Q' for ASCII / ISO8859_1 output and 'B' otherwise.
 * NOTE(review): the conditions selecting between the branches are not
 * visible in this excerpt.
 */
5693 void mime_prechar(nkf_char c2, nkf_char c1)
5695 if (mimeout_mode > 0){
5697 if (base64_count + mimeout_buf_count/3*4> 73){
5698 (*o_base64conv)(EOF,0);
5699 (*o_base64conv)(0,LF);
5700 (*o_base64conv)(0,SP);
5704 if (base64_count + mimeout_buf_count/3*4> 66) {
5705 (*o_base64conv)(EOF,0);
5706 (*o_base64conv)(0,LF);
5707 (*o_base64conv)(0,SP);
5713 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5714 mimeout_mode = (output_mode==ASCII ||output_mode == ISO8859_1) ? 'Q' : 'B';
5715 open_mime(output_mode);
5716 (*o_base64conv)(EOF,0);
5717 (*o_base64conv)(0,LF);
5718 (*o_base64conv)(0,SP);
/*
 * MIME-encoding output stage: receives output characters one at a time
 * (c == EOF flushes).
 *
 * Characters are collected in mimeout_buf; depending on mimeout_f,
 * mimeout_mode, output_mode and the character class they are either written
 * raw through o_mputc or encoded through mimeout_addchar, and open_mime is
 * called when an encoded-word has to be started. base64_count is compared
 * against limits (70, 71, 76) - presumably the current output column;
 * confirm against the full file.
 * NOTE(review): many lines of this function (declarations, loop headers,
 * returns, close_mime calls) are not visible in this excerpt.
 */
5725 void mime_putc(nkf_char c)
/* FIXED_MIME: the whole output is encoded */
5730 if (mimeout_f == FIXED_MIME){
5731 if (mimeout_mode == 'Q'){
5732 if (base64_count > 71){
5733 if (c!=CR && c!=LF) {
5740 if (base64_count > 71){
5745 if (c == EOF) { /* c==EOF */
5749 if (c != EOF) { /* c!=EOF */
5755 /* mimeout_f != FIXED_MIME */
5757 if (c == EOF) { /* c==EOF */
5758 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
/* take over the pending characters and empty the buffer before flushing */
5759 j = mimeout_buf_count;
5760 mimeout_buf_count = 0;
5762 if (mimeout_mode > 0) {
5763 if (!nkf_isblank(mimeout_buf[j-1])) {
5765 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5768 mimeout_addchar(mimeout_buf[i]);
5772 mimeout_addchar(mimeout_buf[i]);
5776 mimeout_addchar(mimeout_buf[i]);
5782 mimeout_addchar(mimeout_buf[i]);
/* lastchar: most recently buffered character, used below to detect line ends */
5788 if (mimeout_buf_count > 0){
5789 lastchar = mimeout_buf[mimeout_buf_count - 1];
5794 if (mimeout_mode=='Q') {
5795 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5796 if (c == CR || c == LF) {
5801 } else if (c <= SP) {
5803 if (base64_count > 70) {
5807 if (!nkf_isblank(c)) {
5812 if (base64_count > 70) {
5817 open_mime(output_mode);
5819 if (!nkf_noescape_mime(c)) {
/* not inside an encoded-word */
5830 if (mimeout_mode <= 0) {
5831 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5832 if (nkf_isspace(c)) {
5834 if (mimeout_mode == -1) {
5837 if (c==CR || c==LF) {
5839 open_mime(output_mode);
5845 for (i=0;i<mimeout_buf_count;i++) {
5846 (*o_mputc)(mimeout_buf[i]);
5847 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
5858 mimeout_buf[0] = (char)c;
5859 mimeout_buf_count = 1;
5861 if (base64_count > 1
5862 && base64_count + mimeout_buf_count > 76
5863 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5866 if (!nkf_isspace(mimeout_buf[0])){
5871 mimeout_buf[mimeout_buf_count++] = (char)c;
/* buffer is full: start an encoded-word */
5872 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5873 open_mime(output_mode);
5878 if (lastchar==CR || lastchar == LF){
5879 for (i=0;i<mimeout_buf_count;i++) {
5880 (*o_mputc)(mimeout_buf[i]);
5883 mimeout_buf_count = 0;
5886 for (i=0;i<mimeout_buf_count-1;i++) {
5887 (*o_mputc)(mimeout_buf[i]);
5890 mimeout_buf[0] = SP;
5891 mimeout_buf_count = 1;
5893 open_mime(output_mode);
5896 /* mimeout_mode == 'B', 1, 2 */
5897 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5898 if (lastchar == CR || lastchar == LF){
5899 if (nkf_isblank(c)) {
5900 for (i=0;i<mimeout_buf_count;i++) {
5901 mimeout_addchar(mimeout_buf[i]);
5903 mimeout_buf_count = 0;
5904 } else if (SP<c && c<DEL) {
5906 for (i=0;i<mimeout_buf_count;i++) {
5907 (*o_mputc)(mimeout_buf[i]);
5910 mimeout_buf_count = 0;
5912 mimeout_buf[mimeout_buf_count++] = (char)c;
5915 if (c==SP || c==TAB || c==CR || c==LF) {
5916 for (i=0;i<mimeout_buf_count;i++) {
5917 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5919 for (i=0;i<mimeout_buf_count;i++) {
5920 (*o_mputc)(mimeout_buf[i]);
5923 mimeout_buf_count = 0;
5926 mimeout_buf[mimeout_buf_count++] = (char)c;
5927 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5929 for (i=0;i<mimeout_buf_count;i++) {
5930 (*o_mputc)(mimeout_buf[i]);
5933 mimeout_buf_count = 0;
5937 if (mimeout_buf_count>0 && SP<c && c!='=') {
5938 mimeout_buf[mimeout_buf_count++] = (char)c;
5939 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5940 j = mimeout_buf_count;
5941 mimeout_buf_count = 0;
5943 mimeout_addchar(mimeout_buf[i]);
5950 if (mimeout_buf_count>0) {
5951 j = mimeout_buf_count;
5952 mimeout_buf_count = 0;
5954 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
5956 mimeout_addchar(mimeout_buf[i]);
5962 (*o_mputc)(mimeout_buf[i]);
5964 open_mime(output_mode);
/*
 * NOTE(review): the function header is not visible in this excerpt; these
 * lines look like the body of the routine that resets nkf's global state.
 *
 * Visible effects: option flags are set back to their defaults, the
 * prefix_table is cleared, the MIME output buffer is emptied, the output
 * conversion hooks (o_zconv, o_fconv, ...) are pointed at no_connection, the
 * unget hooks are pointed at std_ungetc, and the decoder state
 * (output_mode, mime_decode_mode, input_codename, ...) is reset.
 */
5974 struct input_code *p = input_code_list;
5987 mime_f = STRICT_MIME;
5988 mime_decode_f = FALSE;
5993 #if defined(MSDOS) || defined(__OS2__)
5998 iso2022jp_f = FALSE;
5999 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
6000 ms_ucs_map_f = UCS_MAP_ASCII;
6002 #ifdef UTF8_INPUT_ENABLE
6003 no_cp932ext_f = FALSE;
6004 no_best_fit_chars_f = FALSE;
6005 encode_fallback = NULL;
6006 unicode_subchar = '?';
6007 input_endian = ENDIAN_BIG;
6009 #ifdef UTF8_OUTPUT_ENABLE
6010 output_bom_f = FALSE;
6011 output_endian = ENDIAN_BIG;
6013 #ifdef UNICODE_NORMALIZATION
6029 #ifdef SHIFTJIS_CP932
/* clear all 256 prefix_table entries */
6039 for (i = 0; i < 256; i++){
6040 prefix_table[i] = 0;
6044 mimeout_buf_count = 0;
6049 fold_preserve_f = FALSE;
6052 kanji_intro = DEFAULT_J;
6053 ascii_intro = DEFAULT_R;
6054 fold_margin = FOLD_MARGIN;
6055 output_conv = DEFAULT_CONV;
6056 oconv = DEFAULT_CONV;
/* every optional output stage starts out unconnected */
6057 o_zconv = no_connection;
6058 o_fconv = no_connection;
6059 o_nlconv = no_connection;
6060 o_rot_conv = no_connection;
6061 o_hira_conv = no_connection;
6062 o_base64conv = no_connection;
6063 o_iso2022jp_check_conv = no_connection;
6066 i_ungetc = std_ungetc;
6068 i_bungetc = std_ungetc;
6071 i_mungetc = std_ungetc;
6072 i_mgetc_buf = std_getc;
6073 i_mungetc_buf = std_ungetc;
6074 output_mode = ASCII;
6077 mime_decode_mode = FALSE;
6085 z_prev2=0,z_prev1=0;
6087 iconv_for_check = 0;
6089 input_codename = NULL;
6095 void no_connection(nkf_char c2, nkf_char c1)
6097 no_connection2(c2,c1,0);
/*
 * Three-argument placeholder for an unconnected conversion hook: reports
 * "nkf internal module connection failure." on stderr. The arguments are not
 * used by the visible code.
 * NOTE(review): a line between the fprintf and the return is not visible in
 * this excerpt; the "LINT" remark suggests the return exists only to satisfy
 * the declared return type. Confirm against the full file.
 */
6100 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6102 fprintf(stderr,"nkf internal module connection failure.\n");
6104 return 0; /* LINT */
6109 #define fprintf dllprintf
/*
 * NOTE(review): the function header is not visible in this excerpt; these
 * lines look like the body of the usage/help printer.
 *
 * Writes the option summary to stderr line by line. Which description of the
 * default output code is printed, and whether the UTF-8, numchar and
 * fallback options are listed, depends on compile-time macros
 * (DEFAULT_CODE_*, UTF8_INPUT_ENABLE, UTF8_OUTPUT_ENABLE, NUMCHAR_OPTION).
 */
6113 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6114 fprintf(stderr,"Flags:\n");
6115 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6116 #ifdef DEFAULT_CODE_SJIS
6117 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6119 #ifdef DEFAULT_CODE_JIS
6120 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6122 #ifdef DEFAULT_CODE_EUC
6123 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6125 #ifdef DEFAULT_CODE_UTF8
6126 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6128 #ifdef UTF8_OUTPUT_ENABLE
6129 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6131 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6132 #ifdef UTF8_INPUT_ENABLE
6133 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6135 fprintf(stderr,"t no conversion\n");
6136 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6137 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6138 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6139 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6140 fprintf(stderr,"v Show this usage. V: show version\n");
6141 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6142 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6143 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6144 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6145 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6146 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6147 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6148 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6149 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6151 fprintf(stderr,"T Text mode output\n");
6153 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6154 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6155 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6156 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6157 fprintf(stderr,"\n");
6158 fprintf(stderr,"Long name options\n");
6159 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6160 fprintf(stderr," Specify the input or output codeset\n");
6161 fprintf(stderr," --fj --unix --mac --windows\n");
6162 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6163 fprintf(stderr," Convert for the system or code\n");
6164 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6165 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6166 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6168 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6170 #ifdef NUMCHAR_OPTION
6171 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6173 #ifdef UTF8_INPUT_ENABLE
6174 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6175 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6178 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6179 fprintf(stderr," Overwrite original listed files by filtered result\n");
6180 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6182 fprintf(stderr," -g --guess Guess the input code\n");
6183 fprintf(stderr," --help --version Show this help/the version\n");
6184 fprintf(stderr," For more information, see also man nkf\n");
6185 fprintf(stderr,"\n");
6191 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6192 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6195 #if defined(MSDOS) && defined(__WIN16__)
6198 #if defined(MSDOS) && defined(__WIN32__)
6204 ,NKF_VERSION,NKF_RELEASE_DATE);
6205 fprintf(stderr,"\n%s\n",CopyRight);