1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SorceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.138 2007/10/01 19:55:25 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-10-01"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
42 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
44 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
60 #if defined(MSDOS) || defined(__OS2__)
63 #if defined(_MSC_VER) || defined(__WATCOMC__)
64 #define mktemp _mktemp
70 #define setbinmode(fp) fsetbin(fp)
71 #elif defined(__DJGPP__)
72 #include <libc/dosio.h>
73 #define setbinmode(fp) djgpp_setbinmode(fp)
74 #else /* Microsoft C, Turbo C */
75 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
78 #define setbinmode(fp)
81 #if defined(__DJGPP__)
82 void djgpp_setbinmode(FILE *fp)
84 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
87 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
88 __file_handle_set(fd, m);
92 #ifdef _IOFBF /* SysV and MSDOS, Windows */
93 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
95 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
98 /*Borland C++ 4.5 EasyWin*/
99 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
108 /* added by satoru@isoternet.org */
110 #include <sys/types.h>
112 #include <sys/stat.h>
113 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
115 #if defined(__WATCOMC__)
116 #include <sys/utime.h>
120 #else /* defined(MSDOS) */
122 #ifdef __BORLANDC__ /* BCC32 */
124 #else /* !defined(__BORLANDC__) */
125 #include <sys/utime.h>
126 #endif /* (__BORLANDC__) */
127 #else /* !defined(__WIN32__) */
128 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
129 #include <sys/utime.h>
130 #elif defined(__TURBOC__) /* BCC */
132 #elif defined(LSI_C) /* LSI C */
133 #endif /* (__WIN32__) */
141 /* state of output_mode and input_mode
158 #define X0213_1 0x284F
159 #define X0213_2 0x2850
161 /* Input Assumption */
166 #define LATIN1_INPUT 6
168 #define STRICT_MIME 8
173 #define JAPANESE_EUC 10
177 #define UTF8_INPUT 13
178 #define UTF16_INPUT 1015
179 #define UTF32_INPUT 1017
183 #define ENDIAN_BIG 1234
184 #define ENDIAN_LITTLE 4321
185 #define ENDIAN_2143 2143
186 #define ENDIAN_3412 3412
/*
 * Locale-independent character classification and conversion helpers.
 * They compare against character literals only, so they do not depend on
 * <ctype.h> or the current locale.
 *
 * Every use of a macro parameter is parenthesized so that expression
 * arguments (conditional, bitwise, arithmetic, ...) expand as intended.
 * The argument is still evaluated more than once, so never pass an
 * expression with side effects (e.g. `*p++`).
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit character for the low 4 bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the byte above the low 8 bits of c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)

#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/* True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END]. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
233 #define HOLD_SIZE 1024
234 #if defined(INT_IS_SHORT)
235 #define IOBUF_SIZE 2048
237 #define IOBUF_SIZE 16384
240 #define DEFAULT_J 'B'
241 #define DEFAULT_R 'B'
243 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
244 #define SJ6394 0x0161 /* 63 - 94 ku offset */
246 #define RANGE_NUM_MAX 18
251 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
252 #define sizeof_euc_to_utf8_1byte 94
253 #define sizeof_euc_to_utf8_2bytes 94
254 #define sizeof_utf8_to_euc_C2 64
255 #define sizeof_utf8_to_euc_E5B8 64
256 #define sizeof_utf8_to_euc_2bytes 112
257 #define sizeof_utf8_to_euc_3bytes 16
260 /* MIME preprocessor */
262 #ifdef EASYWIN /*Easy Win */
263 extern POINT _BufferSize;
272 void (*status_func)(struct input_code *, nkf_char);
273 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
277 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
280 static const char *CopyRight = COPY_RIGHT;
282 #if !defined(PERL_XS) && !defined(WIN32DLL)
283 static nkf_char noconvert(FILE *f);
285 static void module_connection(void);
286 static nkf_char kanji_convert(FILE *f);
287 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
288 static nkf_char push_hold_buf(nkf_char c2);
289 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
290 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
291 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
292 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
293 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
295 * 0: Shift_JIS, eucJP-ascii
300 #define UCS_MAP_ASCII 0
302 #define UCS_MAP_CP932 2
303 #define UCS_MAP_CP10001 3
304 static int ms_ucs_map_f = UCS_MAP_ASCII;
306 #ifdef UTF8_INPUT_ENABLE
307 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
308 static int no_cp932ext_f = FALSE;
309 /* ignore ZERO WIDTH NO-BREAK SPACE */
310 static int no_best_fit_chars_f = FALSE;
311 static int input_endian = ENDIAN_BIG;
312 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
313 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
314 static void encode_fallback_html(nkf_char c);
315 static void encode_fallback_xml(nkf_char c);
316 static void encode_fallback_java(nkf_char c);
317 static void encode_fallback_perl(nkf_char c);
318 static void encode_fallback_subchar(nkf_char c);
319 static void (*encode_fallback)(nkf_char c) = NULL;
320 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
321 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
322 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
323 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
324 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
325 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
326 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
327 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
328 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
329 static void w_status(struct input_code *, nkf_char);
331 #ifdef UTF8_OUTPUT_ENABLE
332 static int output_bom_f = FALSE;
333 static int output_endian = ENDIAN_BIG;
334 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
335 static void w_oconv(nkf_char c2,nkf_char c1);
336 static void w_oconv16(nkf_char c2,nkf_char c1);
337 static void w_oconv32(nkf_char c2,nkf_char c1);
339 static void e_oconv(nkf_char c2,nkf_char c1);
340 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
341 static void s_oconv(nkf_char c2,nkf_char c1);
342 static void j_oconv(nkf_char c2,nkf_char c1);
343 static void fold_conv(nkf_char c2,nkf_char c1);
344 static void nl_conv(nkf_char c2,nkf_char c1);
345 static void z_conv(nkf_char c2,nkf_char c1);
346 static void rot_conv(nkf_char c2,nkf_char c1);
347 static void hira_conv(nkf_char c2,nkf_char c1);
348 static void base64_conv(nkf_char c2,nkf_char c1);
349 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
350 static void no_connection(nkf_char c2,nkf_char c1);
351 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
353 static void code_score(struct input_code *ptr);
354 static void code_status(nkf_char c);
356 static void std_putc(nkf_char c);
357 static nkf_char std_getc(FILE *f);
358 static nkf_char std_ungetc(nkf_char c,FILE *f);
360 static nkf_char broken_getc(FILE *f);
361 static nkf_char broken_ungetc(nkf_char c,FILE *f);
363 static nkf_char mime_begin(FILE *f);
364 static nkf_char mime_getc(FILE *f);
365 static nkf_char mime_ungetc(nkf_char c,FILE *f);
367 static void switch_mime_getc(void);
368 static void unswitch_mime_getc(void);
369 static nkf_char mime_begin_strict(FILE *f);
370 static nkf_char mime_getc_buf(FILE *f);
371 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
372 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
374 static nkf_char base64decode(nkf_char c);
375 static void mime_prechar(nkf_char c2, nkf_char c1);
376 static void mime_putc(nkf_char c);
377 static void open_mime(nkf_char c);
378 static void close_mime(void);
379 static void eof_mime(void);
380 static void mimeout_addchar(nkf_char c);
382 static void usage(void);
383 static void version(void);
385 static void options(unsigned char *c);
386 #if defined(PERL_XS) || defined(WIN32DLL)
387 static void reinit(void);
392 #if !defined(PERL_XS) && !defined(WIN32DLL)
393 static unsigned char stdibuf[IOBUF_SIZE];
394 static unsigned char stdobuf[IOBUF_SIZE];
396 static unsigned char hold_buf[HOLD_SIZE*2];
397 static int hold_count = 0;
399 /* MIME preprocessor fifo */
401 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
402 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
403 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
404 static unsigned char mime_buf[MIME_BUF_SIZE];
405 static unsigned int mime_top = 0;
406 static unsigned int mime_last = 0; /* decoded */
407 static unsigned int mime_input = 0; /* undecoded */
408 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
411 static int unbuf_f = FALSE;
412 static int estab_f = FALSE;
413 static int nop_f = FALSE;
414 static int binmode_f = TRUE; /* binary mode */
415 static int rot_f = FALSE; /* rot14/43 mode */
416 static int hira_f = FALSE; /* hira/kata henkan */
417 static int input_f = FALSE; /* non fixed input code */
418 static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */
419 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
420 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
421 static int mimebuf_f = FALSE; /* MIME buffered input */
422 static int broken_f = FALSE; /* convert ESC-less broken JIS */
423 static int iso8859_f = FALSE; /* ISO8859 through */
424 static int mimeout_f = FALSE; /* base64 mode */
425 #if defined(MSDOS) || defined(__OS2__)
426 static int x0201_f = TRUE; /* Assume JISX0201 kana */
428 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
430 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
432 #ifdef UNICODE_NORMALIZATION
433 static int nfc_f = FALSE;
434 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
435 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
436 static nkf_char nfc_getc(FILE *f);
437 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
441 static int cap_f = FALSE;
442 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
443 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
444 static nkf_char cap_getc(FILE *f);
445 static nkf_char cap_ungetc(nkf_char c,FILE *f);
447 static int url_f = FALSE;
448 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
449 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
450 static nkf_char url_getc(FILE *f);
451 static nkf_char url_ungetc(nkf_char c,FILE *f);
454 #if defined(INT_IS_SHORT)
455 #define NKF_INT32_C(n) (n##L)
457 #define NKF_INT32_C(n) (n)
459 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
460 #define CLASS_MASK NKF_INT32_C(0xFF000000)
461 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
462 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
463 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
464 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
465 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
467 #ifdef NUMCHAR_OPTION
468 static int numchar_f = FALSE;
469 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
470 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
471 static nkf_char numchar_getc(FILE *f);
472 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
476 static int noout_f = FALSE;
477 static void no_putc(nkf_char c);
478 static nkf_char debug_f = FALSE;
479 static void debug(const char *str);
480 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
483 static int guess_f = FALSE;
485 static void print_guessed_code(char *filename);
487 static void set_input_codename(char *codename);
488 static int is_inputcode_mixed = FALSE;
491 static int exec_f = 0;
494 #ifdef SHIFTJIS_CP932
495 /* invert IBM extended characters to others */
496 static int cp51932_f = FALSE;
498 /* invert NEC-selected IBM extended characters to IBM extended characters */
499 static int cp932inv_f = TRUE;
501 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
502 #endif /* SHIFTJIS_CP932 */
505 static int x0212_f = FALSE;
506 static nkf_char x0212_shift(nkf_char c);
507 static nkf_char x0212_unshift(nkf_char c);
509 static int x0213_f = FALSE;
511 static unsigned char prefix_table[256];
513 static void set_code_score(struct input_code *ptr, nkf_char score);
514 static void clr_code_score(struct input_code *ptr, nkf_char score);
515 static void status_disable(struct input_code *ptr);
516 static void status_push_ch(struct input_code *ptr, nkf_char c);
517 static void status_clear(struct input_code *ptr);
518 static void status_reset(struct input_code *ptr);
519 static void status_reinit(struct input_code *ptr);
520 static void status_check(struct input_code *ptr, nkf_char c);
521 static void e_status(struct input_code *, nkf_char);
522 static void s_status(struct input_code *, nkf_char);
524 struct input_code input_code_list[] = {
525 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
526 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
527 #ifdef UTF8_INPUT_ENABLE
528 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
529 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
530 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
535 static int mimeout_mode = 0;
536 static int base64_count = 0;
538 /* X0208 -> ASCII converter */
541 static int f_line = 0; /* chars in line */
542 static int f_prev = 0;
543 static int fold_preserve_f = FALSE; /* preserve new lines */
544 static int fold_f = FALSE;
545 static int fold_len = 0;
548 static unsigned char kanji_intro = DEFAULT_J;
549 static unsigned char ascii_intro = DEFAULT_R;
553 #define FOLD_MARGIN 10
554 #define DEFAULT_FOLD 60
556 static int fold_margin = FOLD_MARGIN;
560 #ifdef DEFAULT_CODE_JIS
561 # define DEFAULT_CONV j_oconv
563 #ifdef DEFAULT_CODE_SJIS
564 # define DEFAULT_CONV s_oconv
566 #ifdef DEFAULT_CODE_EUC
567 # define DEFAULT_CONV e_oconv
569 #ifdef DEFAULT_CODE_UTF8
570 # define DEFAULT_CONV w_oconv
573 /* process default */
574 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
576 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
577 /* s_iconv or oconv */
578 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
580 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
581 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
582 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
584 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
585 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
586 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
588 /* static redirections */
590 static void (*o_putc)(nkf_char c) = std_putc;
592 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
593 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
595 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
596 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
598 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
600 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
601 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
603 /* for strict mime */
604 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
605 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
608 static int output_mode = ASCII, /* output kanji mode */
609 input_mode = ASCII, /* input kanji mode */
610 shift_mode = FALSE; /* TRUE shift out, or X0201 */
611 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
613 /* X0201 / X0208 conversion tables */
615 /* X0201 kana conversion table */
617 static const unsigned char cv[]= {
618 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
619 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
620 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
621 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
622 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
623 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
624 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
625 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
626 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
627 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
628 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
629 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
630 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
631 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
632 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
633 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
637 /* X0201 kana conversion table for dakuten */
639 static const unsigned char dv[]= {
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
645 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
646 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
647 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
648 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
649 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
651 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
658 /* X0201 kana conversion table for handakuten */
660 static const unsigned char ev[]= {
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
672 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 /* X0208 kigou conversion table */
681 /* 0x8140 - 0x819e */
682 static const unsigned char fv[] = {
684 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
685 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
686 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
687 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
688 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
689 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
690 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
691 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
692 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
694 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
695 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
700 static int file_out_f = FALSE;
702 static int overwrite_f = FALSE;
703 static int preserve_time_f = FALSE;
704 static int backup_f = FALSE;
705 static char *backup_suffix = "";
706 static char *get_backup_filename(const char *suffix, const char *filename);
709 static int nlmode_f = 0; /* CR, LF, CRLF */
710 static int input_nextline = 0; /* 0: unestablished, EOF: MIXED */
711 static nkf_char prev_cr = 0; /* CR or 0 */
712 #ifdef EASYWIN /*Easy Win */
713 static int end_check;
716 #define STD_GC_BUFSIZE (256)
717 nkf_char std_gc_buf[STD_GC_BUFSIZE];
721 #include "nkf32dll.c"
722 #elif defined(PERL_XS)
724 int main(int argc, char **argv)
729 char *outfname = NULL;
732 #ifdef EASYWIN /*Easy Win */
733 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
736 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
737 cp = (unsigned char *)*argv;
742 if (pipe(fds) < 0 || (pid = fork()) < 0){
753 execvp(argv[1], &argv[1]);
767 if(x0201_f == WISH_TRUE)
768 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
770 if (binmode_f == TRUE)
771 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
772 if (freopen("","wb",stdout) == NULL)
779 setbuf(stdout, (char *) NULL);
781 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
784 if (binmode_f == TRUE)
785 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
786 if (freopen("","rb",stdin) == NULL) return (-1);
790 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
794 kanji_convert(stdin);
795 if (guess_f) print_guessed_code(NULL);
799 int is_argument_error = FALSE;
801 is_inputcode_mixed = FALSE;
802 input_codename = NULL;
806 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
809 is_argument_error = TRUE;
817 /* reopen file for stdout */
818 if (file_out_f == TRUE) {
821 outfname = malloc(strlen(origfname)
822 + strlen(".nkftmpXXXXXX")
828 strcpy(outfname, origfname);
832 for (i = strlen(outfname); i; --i){
833 if (outfname[i - 1] == '/'
834 || outfname[i - 1] == '\\'){
840 strcat(outfname, "ntXXXXXX");
842 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
845 strcat(outfname, ".nkftmpXXXXXX");
846 fd = mkstemp(outfname);
849 || (fd_backup = dup(fileno(stdout))) < 0
850 || dup2(fd, fileno(stdout)) < 0
861 outfname = "nkf.out";
864 if(freopen(outfname, "w", stdout) == NULL) {
868 if (binmode_f == TRUE) {
869 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
870 if (freopen("","wb",stdout) == NULL)
877 if (binmode_f == TRUE)
878 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
879 if (freopen("","rb",fin) == NULL)
884 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
888 char *filename = NULL;
890 if (nfiles > 1) filename = origfname;
891 if (guess_f) print_guessed_code(filename);
897 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
905 if (dup2(fd_backup, fileno(stdout)) < 0){
908 if (stat(origfname, &sb)) {
909 fprintf(stderr, "Can't stat %s\n", origfname);
911 /* restore the permissions */
912 if (chmod(outfname, sb.st_mode)) {
913 fprintf(stderr, "Can't set permission %s\n", outfname);
916 /* restore the timestamp */
918 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
919 tb[0] = tb[1] = sb.st_mtime;
920 if (utime(outfname, tb)) {
921 fprintf(stderr, "Can't set timestamp %s\n", outfname);
924 tb.actime = sb.st_atime;
925 tb.modtime = sb.st_mtime;
926 if (utime(outfname, &tb)) {
927 fprintf(stderr, "Can't set timestamp %s\n", outfname);
932 char *backup_filename = get_backup_filename(backup_suffix, origfname);
934 unlink(backup_filename);
936 if (rename(origfname, backup_filename)) {
937 perror(backup_filename);
938 fprintf(stderr, "Can't rename %s to %s\n",
939 origfname, backup_filename);
943 if (unlink(origfname)){
948 if (rename(outfname, origfname)) {
950 fprintf(stderr, "Can't rename %s to %s\n",
951 outfname, origfname);
958 if (is_argument_error)
961 #ifdef EASYWIN /*Easy Win */
962 if (file_out_f == FALSE)
963 scanf("%d",&end_check);
966 #else /* for Other OS */
967 if (file_out_f == TRUE)
972 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from the pattern `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied verbatim.
 * Otherwise `suffix` is simply appended to `filename`.
 *
 * Returns a newly malloc()ed NUL-terminated string (the caller is
 * responsible for free()ing it), or NULL after reporting through perror()
 * when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    const size_t suffix_length = strlen(suffix);
    const size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each one-byte '*' expands into a full copy of filename */
        backup_filename = malloc((suffix_length - asterisk_count)
                                 + (asterisk_count * filename_length) + 1);
    } else {
        /* plain concatenation: filename followed by suffix */
        backup_filename = malloc(filename_length + suffix_length + 1);
    }
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    j = 0;
    if (asterisk_count) {
        for (i = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
    } else {
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
        j = filename_length + suffix_length;
    }
    backup_filename[j] = '\0';

    return backup_filename;
}
1015 static const struct {
1039 {"katakana-hiragana","h3"},
1046 #ifdef UTF8_OUTPUT_ENABLE
1056 {"fb-subchar=", ""},
1058 #ifdef UTF8_INPUT_ENABLE
1059 {"utf8-input", "W"},
1060 {"utf16-input", "W16"},
1061 {"no-cp932ext", ""},
1062 {"no-best-fit-chars",""},
1064 #ifdef UNICODE_NORMALIZATION
1065 {"utf8mac-input", ""},
1077 #ifdef NUMCHAR_OPTION
1078 {"numchar-input", ""},
1084 #ifdef SHIFTJIS_CP932
1094 static int option_mode = 0;
1096 void options(unsigned char *cp)
1100 unsigned char *cp_back = NULL;
1105 while(*cp && *cp++!='-');
1106 while (*cp || cp_back) {
1114 case '-': /* literal options */
1115 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1119 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1120 p = (unsigned char *)long_option[i].name;
1121 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1122 if (*p == cp[j] || cp[j] == SP){
1129 while(*cp && *cp != SP && cp++);
1130 if (long_option[i].alias[0]){
1132 cp = (unsigned char *)long_option[i].alias;
1134 if (strcmp(long_option[i].name, "ic=") == 0){
1135 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1136 codeset[i] = nkf_toupper(p[i]);
1139 if(strcmp(codeset, "ISO-2022-JP") == 0){
1140 input_f = JIS_INPUT;
1141 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1142 strcmp(codeset, "CP50220") == 0 ||
1143 strcmp(codeset, "CP50221") == 0 ||
1144 strcmp(codeset, "CP50222") == 0){
1145 input_f = JIS_INPUT;
1146 #ifdef SHIFTJIS_CP932
1149 #ifdef UTF8_OUTPUT_ENABLE
1150 ms_ucs_map_f = UCS_MAP_CP932;
1152 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1153 input_f = JIS_INPUT;
1157 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1158 input_f = JIS_INPUT;
1163 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1164 input_f = SJIS_INPUT;
1165 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1166 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1167 strcmp(codeset, "CP932") == 0 ||
1168 strcmp(codeset, "MS932") == 0){
1169 input_f = SJIS_INPUT;
1170 #ifdef SHIFTJIS_CP932
1173 #ifdef UTF8_OUTPUT_ENABLE
1174 ms_ucs_map_f = UCS_MAP_CP932;
1176 }else if(strcmp(codeset, "CP10001") == 0){
1177 input_f = SJIS_INPUT;
1178 #ifdef SHIFTJIS_CP932
1181 #ifdef UTF8_OUTPUT_ENABLE
1182 ms_ucs_map_f = UCS_MAP_CP10001;
1184 }else if(strcmp(codeset, "EUCJP") == 0 ||
1185 strcmp(codeset, "EUC-JP") == 0){
1186 input_f = EUC_INPUT;
1187 }else if(strcmp(codeset, "CP51932") == 0){
1188 input_f = EUC_INPUT;
1189 #ifdef SHIFTJIS_CP932
1192 #ifdef UTF8_OUTPUT_ENABLE
1193 ms_ucs_map_f = UCS_MAP_CP932;
1195 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1196 strcmp(codeset, "EUCJP-MS") == 0 ||
1197 strcmp(codeset, "EUCJPMS") == 0){
1198 input_f = EUC_INPUT;
1199 #ifdef SHIFTJIS_CP932
1202 #ifdef UTF8_OUTPUT_ENABLE
1203 ms_ucs_map_f = UCS_MAP_MS;
1205 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1206 strcmp(codeset, "EUCJP-ASCII") == 0){
1207 input_f = EUC_INPUT;
1208 #ifdef SHIFTJIS_CP932
1211 #ifdef UTF8_OUTPUT_ENABLE
1212 ms_ucs_map_f = UCS_MAP_ASCII;
1214 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1215 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1216 input_f = SJIS_INPUT;
1218 #ifdef SHIFTJIS_CP932
1221 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1222 strcmp(codeset, "EUC-JIS-2004") == 0){
1223 input_f = EUC_INPUT;
1225 #ifdef SHIFTJIS_CP932
1228 #ifdef UTF8_INPUT_ENABLE
1229 }else if(strcmp(codeset, "UTF-8") == 0 ||
1230 strcmp(codeset, "UTF-8N") == 0 ||
1231 strcmp(codeset, "UTF-8-BOM") == 0){
1232 input_f = UTF8_INPUT;
1233 #ifdef UNICODE_NORMALIZATION
1234 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1235 strcmp(codeset, "UTF-8-MAC") == 0){
1236 input_f = UTF8_INPUT;
1239 }else if(strcmp(codeset, "UTF-16") == 0 ||
1240 strcmp(codeset, "UTF-16BE") == 0 ||
1241 strcmp(codeset, "UTF-16BE-BOM") == 0){
1242 input_f = UTF16_INPUT;
1243 input_endian = ENDIAN_BIG;
1244 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1245 strcmp(codeset, "UTF-16LE-BOM") == 0){
1246 input_f = UTF16_INPUT;
1247 input_endian = ENDIAN_LITTLE;
1248 }else if(strcmp(codeset, "UTF-32") == 0 ||
1249 strcmp(codeset, "UTF-32BE") == 0 ||
1250 strcmp(codeset, "UTF-32BE-BOM") == 0){
1251 input_f = UTF32_INPUT;
1252 input_endian = ENDIAN_BIG;
1253 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1254 strcmp(codeset, "UTF-32LE-BOM") == 0){
1255 input_f = UTF32_INPUT;
1256 input_endian = ENDIAN_LITTLE;
1261 if (strcmp(long_option[i].name, "oc=") == 0){
1263 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1264 codeset[i] = nkf_toupper(p[i]);
1267 if(strcmp(codeset, "ISO-2022-JP") == 0){
1268 output_conv = j_oconv;
1269 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1270 output_conv = j_oconv;
1271 no_cp932ext_f = TRUE;
1272 #ifdef SHIFTJIS_CP932
1275 #ifdef UTF8_OUTPUT_ENABLE
1276 ms_ucs_map_f = UCS_MAP_CP932;
1278 }else if(strcmp(codeset, "CP50220") == 0){
1279 output_conv = j_oconv;
1281 #ifdef SHIFTJIS_CP932
1284 #ifdef UTF8_OUTPUT_ENABLE
1285 ms_ucs_map_f = UCS_MAP_CP932;
1287 }else if(strcmp(codeset, "CP50221") == 0){
1288 output_conv = j_oconv;
1289 #ifdef SHIFTJIS_CP932
1292 #ifdef UTF8_OUTPUT_ENABLE
1293 ms_ucs_map_f = UCS_MAP_CP932;
1295 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1296 output_conv = j_oconv;
1300 #ifdef SHIFTJIS_CP932
1303 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1304 output_conv = j_oconv;
1309 #ifdef SHIFTJIS_CP932
1312 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1313 output_conv = s_oconv;
1314 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1315 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1316 strcmp(codeset, "CP932") == 0 ||
1317 strcmp(codeset, "MS932") == 0){
1318 output_conv = s_oconv;
1319 #ifdef UTF8_OUTPUT_ENABLE
1320 ms_ucs_map_f = UCS_MAP_CP932;
1322 }else if(strcmp(codeset, "CP10001") == 0){
1323 output_conv = s_oconv;
1324 #ifdef UTF8_OUTPUT_ENABLE
1325 ms_ucs_map_f = UCS_MAP_CP10001;
1327 }else if(strcmp(codeset, "EUCJP") == 0 ||
1328 strcmp(codeset, "EUC-JP") == 0){
1329 output_conv = e_oconv;
1330 }else if(strcmp(codeset, "CP51932") == 0){
1331 output_conv = e_oconv;
1332 #ifdef SHIFTJIS_CP932
1335 #ifdef UTF8_OUTPUT_ENABLE
1336 ms_ucs_map_f = UCS_MAP_CP932;
1338 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1339 strcmp(codeset, "EUCJP-MS") == 0 ||
1340 strcmp(codeset, "EUCJPMS") == 0){
1341 output_conv = e_oconv;
1345 #ifdef UTF8_OUTPUT_ENABLE
1346 ms_ucs_map_f = UCS_MAP_MS;
1348 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1349 strcmp(codeset, "EUCJP-ASCII") == 0){
1350 output_conv = e_oconv;
1354 #ifdef UTF8_OUTPUT_ENABLE
1355 ms_ucs_map_f = UCS_MAP_ASCII;
1357 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1358 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1359 output_conv = s_oconv;
1361 #ifdef SHIFTJIS_CP932
1364 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1365 strcmp(codeset, "EUC-JIS-2004") == 0){
1366 output_conv = e_oconv;
1371 #ifdef SHIFTJIS_CP932
1374 #ifdef UTF8_OUTPUT_ENABLE
1375 }else if(strcmp(codeset, "UTF-8") == 0){
1376 output_conv = w_oconv;
1377 }else if(strcmp(codeset, "UTF-8N") == 0){
1378 output_conv = w_oconv;
1379 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1380 output_conv = w_oconv;
1381 output_bom_f = TRUE;
1382 }else if(strcmp(codeset, "UTF-16BE") == 0){
1383 output_conv = w_oconv16;
1384 }else if(strcmp(codeset, "UTF-16") == 0 ||
1385 strcmp(codeset, "UTF-16BE-BOM") == 0){
1386 output_conv = w_oconv16;
1387 output_bom_f = TRUE;
1388 }else if(strcmp(codeset, "UTF-16LE") == 0){
1389 output_conv = w_oconv16;
1390 output_endian = ENDIAN_LITTLE;
1391 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1392 output_conv = w_oconv16;
1393 output_endian = ENDIAN_LITTLE;
1394 output_bom_f = TRUE;
1395 }else if(strcmp(codeset, "UTF-32") == 0 ||
1396 strcmp(codeset, "UTF-32BE") == 0){
1397 output_conv = w_oconv32;
1398 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1399 output_conv = w_oconv32;
1400 output_bom_f = TRUE;
1401 }else if(strcmp(codeset, "UTF-32LE") == 0){
1402 output_conv = w_oconv32;
1403 output_endian = ENDIAN_LITTLE;
1404 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1405 output_conv = w_oconv32;
1406 output_endian = ENDIAN_LITTLE;
1407 output_bom_f = TRUE;
1413 if (strcmp(long_option[i].name, "overwrite") == 0){
1416 preserve_time_f = TRUE;
1419 if (strcmp(long_option[i].name, "overwrite=") == 0){
1422 preserve_time_f = TRUE;
1424 backup_suffix = malloc(strlen((char *) p) + 1);
1425 strcpy(backup_suffix, (char *) p);
1428 if (strcmp(long_option[i].name, "in-place") == 0){
1431 preserve_time_f = FALSE;
1434 if (strcmp(long_option[i].name, "in-place=") == 0){
1437 preserve_time_f = FALSE;
1439 backup_suffix = malloc(strlen((char *) p) + 1);
1440 strcpy(backup_suffix, (char *) p);
1445 if (strcmp(long_option[i].name, "cap-input") == 0){
1449 if (strcmp(long_option[i].name, "url-input") == 0){
1454 #ifdef NUMCHAR_OPTION
1455 if (strcmp(long_option[i].name, "numchar-input") == 0){
1461 if (strcmp(long_option[i].name, "no-output") == 0){
1465 if (strcmp(long_option[i].name, "debug") == 0){
1470 if (strcmp(long_option[i].name, "cp932") == 0){
1471 #ifdef SHIFTJIS_CP932
1475 #ifdef UTF8_OUTPUT_ENABLE
1476 ms_ucs_map_f = UCS_MAP_CP932;
1480 if (strcmp(long_option[i].name, "no-cp932") == 0){
1481 #ifdef SHIFTJIS_CP932
1485 #ifdef UTF8_OUTPUT_ENABLE
1486 ms_ucs_map_f = UCS_MAP_ASCII;
1490 #ifdef SHIFTJIS_CP932
1491 if (strcmp(long_option[i].name, "cp932inv") == 0){
1498 if (strcmp(long_option[i].name, "x0212") == 0){
1505 if (strcmp(long_option[i].name, "exec-in") == 0){
1509 if (strcmp(long_option[i].name, "exec-out") == 0){
1514 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1515 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1516 no_cp932ext_f = TRUE;
1519 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1520 no_best_fit_chars_f = TRUE;
1523 if (strcmp(long_option[i].name, "fb-skip") == 0){
1524 encode_fallback = NULL;
1527 if (strcmp(long_option[i].name, "fb-html") == 0){
1528 encode_fallback = encode_fallback_html;
1531 if (strcmp(long_option[i].name, "fb-xml") == 0){
1532 encode_fallback = encode_fallback_xml;
1535 if (strcmp(long_option[i].name, "fb-java") == 0){
1536 encode_fallback = encode_fallback_java;
1539 if (strcmp(long_option[i].name, "fb-perl") == 0){
1540 encode_fallback = encode_fallback_perl;
1543 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1544 encode_fallback = encode_fallback_subchar;
1547 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1548 encode_fallback = encode_fallback_subchar;
1549 unicode_subchar = 0;
1551 /* decimal number */
1552 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1553 unicode_subchar *= 10;
1554 unicode_subchar += hex2bin(p[i]);
1556 }else if(p[1] == 'x' || p[1] == 'X'){
1557 /* hexadecimal number */
1558 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1559 unicode_subchar <<= 4;
1560 unicode_subchar |= hex2bin(p[i]);
1564 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1565 unicode_subchar *= 8;
1566 unicode_subchar += hex2bin(p[i]);
1569 w16e_conv(unicode_subchar, &i, &j);
1570 unicode_subchar = i<<8 | j;
1574 #ifdef UTF8_OUTPUT_ENABLE
1575 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1576 ms_ucs_map_f = UCS_MAP_MS;
1580 #ifdef UNICODE_NORMALIZATION
1581 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1582 input_f = UTF8_INPUT;
1587 if (strcmp(long_option[i].name, "prefix=") == 0){
1588 if (nkf_isgraph(p[0])){
1589 for (i = 1; nkf_isgraph(p[i]); i++){
1590 prefix_table[p[i]] = p[0];
1597 case 'b': /* buffered mode */
1600 case 'u': /* non-buffered mode */
1603 case 't': /* transparent mode */
1608 } else if (*cp=='2') {
1612 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1620 case 'j': /* JIS output */
1622 output_conv = j_oconv;
1624 case 'e': /* AT&T EUC output */
1625 output_conv = e_oconv;
1628 case 's': /* SJIS output */
1629 output_conv = s_oconv;
1631 case 'l': /* ISO8859 Latin-1 support, no conversion */
1632 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1633 input_f = LATIN1_INPUT;
1635 case 'i': /* Kanji IN ESC-$-@/B */
1636 if (*cp=='@'||*cp=='B')
1637 kanji_intro = *cp++;
1639 case 'o': /* ASCII IN ESC-(-J/B */
1640 if (*cp=='J'||*cp=='B'||*cp=='H')
1641 ascii_intro = *cp++;
1645 bit:1 katakana->hiragana
1646 bit:2 hiragana->katakana
1648 if ('9'>= *cp && *cp>='0')
1649 hira_f |= (*cp++ -'0');
1656 #if defined(MSDOS) || defined(__OS2__)
1671 #ifdef UTF8_OUTPUT_ENABLE
1672 case 'w': /* UTF-8 output */
1674 output_conv = w_oconv; cp++;
1678 output_bom_f = TRUE;
1681 if ('1'== cp[0] && '6'==cp[1]) {
1682 output_conv = w_oconv16; cp+=2;
1683 } else if ('3'== cp[0] && '2'==cp[1]) {
1684 output_conv = w_oconv32; cp+=2;
1686 output_conv = w_oconv;
1691 output_endian = ENDIAN_LITTLE;
1692 } else if (cp[0] == 'B') {
1700 output_bom_f = TRUE;
1705 #ifdef UTF8_INPUT_ENABLE
1706 case 'W': /* UTF input */
1709 input_f = UTF8_INPUT;
1711 if ('1'== cp[0] && '6'==cp[1]) {
1713 input_f = UTF16_INPUT;
1714 input_endian = ENDIAN_BIG;
1715 } else if ('3'== cp[0] && '2'==cp[1]) {
1717 input_f = UTF32_INPUT;
1718 input_endian = ENDIAN_BIG;
1720 input_f = UTF8_INPUT;
1725 input_endian = ENDIAN_LITTLE;
1726 } else if (cp[0] == 'B') {
1732 /* Input code assumption */
1733 case 'J': /* JIS input */
1734 input_f = JIS_INPUT;
1736 case 'E': /* AT&T EUC input */
1737 input_f = EUC_INPUT;
1739 case 'S': /* MS Kanji input */
1740 input_f = SJIS_INPUT;
1741 if (x0201_f==NO_X0201) x0201_f=TRUE;
1743 case 'Z': /* Convert X0208 alphabet to ASCII */
1745 bit:0 Convert JIS X 0208 Alphabet to ASCII
1746 bit:1 Convert Kankaku to one space
1747 bit:2 Convert Kankaku to two spaces
1748 bit:3 Convert HTML Entity
1749 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1751 while ('0'<= *cp && *cp <='9') {
1752 alpha_f |= 1 << (*cp++ - '0');
1754 if (!alpha_f) alpha_f = 1;
1756 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1757 x0201_f = FALSE; /* No X0201->X0208 conversion */
1759 ESC-(-I in JIS, EUC, MS Kanji
1760 SI/SO in JIS, EUC, MS Kanji
1761 SSO in EUC, JIS, not in MS Kanji
1762 MS Kanji (0xa0-0xdf)
1764 ESC-(-I in JIS (0x20-0x5f)
1765 SSO in EUC (0xa0-0xdf)
1766 0xa0-0xd in MS Kanji (0xa0-0xdf)
1769 case 'X': /* Assume X0201 kana */
1770 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1773 case 'F': /* preserve new lines */
1774 fold_preserve_f = TRUE;
1775 case 'f': /* folding -f60 or -f */
1778 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1780 fold_len += *cp++ - '0';
1782 if (!(0<fold_len && fold_len<BUFSIZ))
1783 fold_len = DEFAULT_FOLD;
1787 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1789 fold_margin += *cp++ - '0';
1793 case 'm': /* MIME support */
1794 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1795 if (*cp=='B'||*cp=='Q') {
1796 mime_decode_mode = *cp++;
1797 mimebuf_f = FIXED_MIME;
1798 } else if (*cp=='N') {
1799 mime_f = TRUE; cp++;
1800 } else if (*cp=='S') {
1801 mime_f = STRICT_MIME; cp++;
1802 } else if (*cp=='0') {
1803 mime_decode_f = FALSE;
1804 mime_f = FALSE; cp++;
1807 case 'M': /* MIME output */
1810 mimeout_f = FIXED_MIME; cp++;
1811 } else if (*cp=='Q') {
1813 mimeout_f = FIXED_MIME; cp++;
1818 case 'B': /* Broken JIS support */
1820 bit:1 allow any x on ESC-(-x or ESC-$-x
1821 bit:2 reset to ascii on NL
1823 if ('9'>= *cp && *cp>='0')
1824 broken_f |= 1<<(*cp++ -'0');
1829 case 'O':/* for Output file */
1833 case 'c':/* add cr code */
1836 case 'd':/* delete cr code */
1839 case 'I': /* ISO-2022-JP output */
1842 case 'L': /* line mode */
1843 if (*cp=='u') { /* unix */
1844 nlmode_f = LF; cp++;
1845 } else if (*cp=='m') { /* mac */
1846 nlmode_f = CR; cp++;
1847 } else if (*cp=='w') { /* windows */
1848 nlmode_f = CRLF; cp++;
1849 } else if (*cp=='0') { /* no conversion */
1859 /* multiple options in a single string are allowed, for the Perl module */
1860 while(*cp && *cp++!='-');
1863 /* bogus option but ignored */
1869 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1872 struct input_code *p = input_code_list;
1874 if (iconv_func == p->iconv_func){
1883 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1885 #ifdef INPUT_CODE_FIX
1893 #ifdef INPUT_CODE_FIX
1894 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1900 if (estab_f && iconv_for_check != iconv){
1901 struct input_code *p = find_inputcode_byfunc(iconv);
1903 set_input_codename(p->name);
1906 iconv_for_check = iconv;
1911 #define SCORE_L2 (1) /* JIS level-2 kanji */
1912 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1913 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1914 #ifdef SHIFTJIS_CP932
1915 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character remapped via CP932 */
1916 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* non-existent character */
1918 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* non-existent character */
1920 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1921 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1923 #define SCORE_INIT (SCORE_iMIME)
1925 static const char score_table_A0[] = {
1928 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1929 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
1932 static const char score_table_F0[] = {
1933 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1934 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1935 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1936 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
1939 void set_code_score(struct input_code *ptr, nkf_char score)
1942 ptr->score |= score;
1946 void clr_code_score(struct input_code *ptr, nkf_char score)
1949 ptr->score &= ~score;
1953 void code_score(struct input_code *ptr)
1955 nkf_char c2 = ptr->buf[0];
1956 #ifdef UTF8_OUTPUT_ENABLE
1957 nkf_char c1 = ptr->buf[1];
1960 set_code_score(ptr, SCORE_ERROR);
1961 }else if (c2 == SSO){
1962 set_code_score(ptr, SCORE_KANA);
1963 #ifdef UTF8_OUTPUT_ENABLE
1964 }else if (!e2w_conv(c2, c1)){
1965 set_code_score(ptr, SCORE_NO_EXIST);
1967 }else if ((c2 & 0x70) == 0x20){
1968 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
1969 }else if ((c2 & 0x70) == 0x70){
1970 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
1971 }else if ((c2 & 0x70) >= 0x50){
1972 set_code_score(ptr, SCORE_L2);
1976 void status_disable(struct input_code *ptr)
1981 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
1984 void status_push_ch(struct input_code *ptr, nkf_char c)
1986 ptr->buf[ptr->index++] = c;
1989 void status_clear(struct input_code *ptr)
1995 void status_reset(struct input_code *ptr)
1998 ptr->score = SCORE_INIT;
2001 void status_reinit(struct input_code *ptr)
2004 ptr->_file_stat = 0;
2007 void status_check(struct input_code *ptr, nkf_char c)
2009 if (c <= DEL && estab_f){
2014 void s_status(struct input_code *ptr, nkf_char c)
2018 status_check(ptr, c);
2023 #ifdef NUMCHAR_OPTION
2024 }else if (is_unicode_capsule(c)){
2027 }else if (0xa1 <= c && c <= 0xdf){
2028 status_push_ch(ptr, SSO);
2029 status_push_ch(ptr, c);
2032 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2034 status_push_ch(ptr, c);
2035 #ifdef SHIFTJIS_CP932
2037 && is_ibmext_in_sjis(c)){
2039 status_push_ch(ptr, c);
2040 #endif /* SHIFTJIS_CP932 */
2042 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2044 status_push_ch(ptr, c);
2045 #endif /* X0212_ENABLE */
2047 status_disable(ptr);
2051 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2052 status_push_ch(ptr, c);
2053 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2057 status_disable(ptr);
2061 #ifdef SHIFTJIS_CP932
2062 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2063 status_push_ch(ptr, c);
2064 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2065 set_code_score(ptr, SCORE_CP932);
2070 #endif /* SHIFTJIS_CP932 */
2071 #ifndef X0212_ENABLE
2072 status_disable(ptr);
2078 void e_status(struct input_code *ptr, nkf_char c)
2082 status_check(ptr, c);
2087 #ifdef NUMCHAR_OPTION
2088 }else if (is_unicode_capsule(c)){
2091 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2093 status_push_ch(ptr, c);
2095 }else if (0x8f == c){
2097 status_push_ch(ptr, c);
2098 #endif /* X0212_ENABLE */
2100 status_disable(ptr);
2104 if (0xa1 <= c && c <= 0xfe){
2105 status_push_ch(ptr, c);
2109 status_disable(ptr);
2114 if (0xa1 <= c && c <= 0xfe){
2116 status_push_ch(ptr, c);
2118 status_disable(ptr);
2120 #endif /* X0212_ENABLE */
2124 #ifdef UTF8_INPUT_ENABLE
2125 void w_status(struct input_code *ptr, nkf_char c)
2129 status_check(ptr, c);
2134 #ifdef NUMCHAR_OPTION
2135 }else if (is_unicode_capsule(c)){
2138 }else if (0xc0 <= c && c <= 0xdf){
2140 status_push_ch(ptr, c);
2141 }else if (0xe0 <= c && c <= 0xef){
2143 status_push_ch(ptr, c);
2144 }else if (0xf0 <= c && c <= 0xf4){
2146 status_push_ch(ptr, c);
2148 status_disable(ptr);
2153 if (0x80 <= c && c <= 0xbf){
2154 status_push_ch(ptr, c);
2155 if (ptr->index > ptr->stat){
2156 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2157 && ptr->buf[2] == 0xbf);
2158 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2159 &ptr->buf[0], &ptr->buf[1]);
2166 status_disable(ptr);
2170 if (0x80 <= c && c <= 0xbf){
2171 if (ptr->index < ptr->stat){
2172 status_push_ch(ptr, c);
2177 status_disable(ptr);
2184 void code_status(nkf_char c)
2186 int action_flag = 1;
2187 struct input_code *result = 0;
2188 struct input_code *p = input_code_list;
2190 if (!p->status_func) {
2194 if (!p->status_func)
2196 (p->status_func)(p, c);
2199 }else if(p->stat == 0){
2210 if (result && !estab_f){
2211 set_iconv(TRUE, result->iconv_func);
2212 }else if (c <= DEL){
2213 struct input_code *ptr = input_code_list;
2223 nkf_char std_getc(FILE *f)
2226 return std_gc_buf[--std_gc_ndx];
2232 nkf_char std_ungetc(nkf_char c, FILE *f)
2234 if (std_gc_ndx == STD_GC_BUFSIZE){
2237 std_gc_buf[std_gc_ndx++] = c;
2242 void std_putc(nkf_char c)
2249 #if !defined(PERL_XS) && !defined(WIN32DLL)
2250 nkf_char noconvert(FILE *f)
2255 module_connection();
2256 while ((c = (*i_getc)(f)) != EOF)
2263 void module_connection(void)
2265 oconv = output_conv;
2268 /* replace continuation module, from output side */
2270 /* output redirection */
2272 if (noout_f || guess_f){
2279 if (mimeout_f == TRUE) {
2280 o_base64conv = oconv; oconv = base64_conv;
2282 /* base64_count = 0; */
2285 if (nlmode_f || guess_f) {
2286 o_nlconv = oconv; oconv = nl_conv;
2289 o_rot_conv = oconv; oconv = rot_conv;
2292 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2295 o_hira_conv = oconv; oconv = hira_conv;
2298 o_fconv = oconv; oconv = fold_conv;
2301 if (alpha_f || x0201_f) {
2302 o_zconv = oconv; oconv = z_conv;
2306 i_ungetc = std_ungetc;
2307 /* input redirection */
2310 i_cgetc = i_getc; i_getc = cap_getc;
2311 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2314 i_ugetc = i_getc; i_getc = url_getc;
2315 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2318 #ifdef NUMCHAR_OPTION
2320 i_ngetc = i_getc; i_getc = numchar_getc;
2321 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2324 #ifdef UNICODE_NORMALIZATION
2325 if (nfc_f && input_f == UTF8_INPUT){
2326 i_nfc_getc = i_getc; i_getc = nfc_getc;
2327 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2330 if (mime_f && mimebuf_f==FIXED_MIME) {
2331 i_mgetc = i_getc; i_getc = mime_getc;
2332 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2335 i_bgetc = i_getc; i_getc = broken_getc;
2336 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2338 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2339 set_iconv(-TRUE, e_iconv);
2340 } else if (input_f == SJIS_INPUT) {
2341 set_iconv(-TRUE, s_iconv);
2342 #ifdef UTF8_INPUT_ENABLE
2343 } else if (input_f == UTF8_INPUT) {
2344 set_iconv(-TRUE, w_iconv);
2345 } else if (input_f == UTF16_INPUT) {
2346 set_iconv(-TRUE, w_iconv16);
2347 } else if (input_f == UTF32_INPUT) {
2348 set_iconv(-TRUE, w_iconv32);
2351 set_iconv(FALSE, e_iconv);
2355 struct input_code *p = input_code_list;
2363 * Check and Ignore BOM
2365 void check_bom(FILE *f)
2368 switch(c2 = (*i_getc)(f)){
2370 if((c2 = (*i_getc)(f)) == 0x00){
2371 if((c2 = (*i_getc)(f)) == 0xFE){
2372 if((c2 = (*i_getc)(f)) == 0xFF){
2374 set_iconv(TRUE, w_iconv32);
2376 if (iconv == w_iconv32) {
2377 input_endian = ENDIAN_BIG;
2380 (*i_ungetc)(0xFF,f);
2381 }else (*i_ungetc)(c2,f);
2382 (*i_ungetc)(0xFE,f);
2383 }else if(c2 == 0xFF){
2384 if((c2 = (*i_getc)(f)) == 0xFE){
2386 set_iconv(TRUE, w_iconv32);
2388 if (iconv == w_iconv32) {
2389 input_endian = ENDIAN_2143;
2392 (*i_ungetc)(0xFF,f);
2393 }else (*i_ungetc)(c2,f);
2394 (*i_ungetc)(0xFF,f);
2395 }else (*i_ungetc)(c2,f);
2396 (*i_ungetc)(0x00,f);
2397 }else (*i_ungetc)(c2,f);
2398 (*i_ungetc)(0x00,f);
2401 if((c2 = (*i_getc)(f)) == 0xBB){
2402 if((c2 = (*i_getc)(f)) == 0xBF){
2404 set_iconv(TRUE, w_iconv);
2406 if (iconv == w_iconv) {
2409 (*i_ungetc)(0xBF,f);
2410 }else (*i_ungetc)(c2,f);
2411 (*i_ungetc)(0xBB,f);
2412 }else (*i_ungetc)(c2,f);
2413 (*i_ungetc)(0xEF,f);
2416 if((c2 = (*i_getc)(f)) == 0xFF){
2417 if((c2 = (*i_getc)(f)) == 0x00){
2418 if((c2 = (*i_getc)(f)) == 0x00){
2420 set_iconv(TRUE, w_iconv32);
2422 if (iconv == w_iconv32) {
2423 input_endian = ENDIAN_3412;
2426 (*i_ungetc)(0x00,f);
2427 }else (*i_ungetc)(c2,f);
2428 (*i_ungetc)(0x00,f);
2429 }else (*i_ungetc)(c2,f);
2431 set_iconv(TRUE, w_iconv16);
2433 if (iconv == w_iconv16) {
2434 input_endian = ENDIAN_BIG;
2437 (*i_ungetc)(0xFF,f);
2438 }else (*i_ungetc)(c2,f);
2439 (*i_ungetc)(0xFE,f);
2442 if((c2 = (*i_getc)(f)) == 0xFE){
2443 if((c2 = (*i_getc)(f)) == 0x00){
2444 if((c2 = (*i_getc)(f)) == 0x00){
2446 set_iconv(TRUE, w_iconv32);
2448 if (iconv == w_iconv32) {
2449 input_endian = ENDIAN_LITTLE;
2452 (*i_ungetc)(0x00,f);
2453 }else (*i_ungetc)(c2,f);
2454 (*i_ungetc)(0x00,f);
2455 }else (*i_ungetc)(c2,f);
2457 set_iconv(TRUE, w_iconv16);
2459 if (iconv == w_iconv16) {
2460 input_endian = ENDIAN_LITTLE;
2463 (*i_ungetc)(0xFE,f);
2464 }else (*i_ungetc)(c2,f);
2465 (*i_ungetc)(0xFF,f);
2474 Conversion main loop. Code detection only.
2477 nkf_char kanji_convert(FILE *f)
2479 nkf_char c3, c2=0, c1, c0=0;
2480 int is_8bit = FALSE;
2482 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2483 #ifdef UTF8_INPUT_ENABLE
2484 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2491 output_mode = ASCII;
2494 #define NEXT continue /* no output, get next */
2495 #define SEND ; /* output c1 and c2, get next */
2496 #define LAST break /* end of loop, go closing */
2498 module_connection();
2501 while ((c1 = (*i_getc)(f)) != EOF) {
2502 #ifdef INPUT_CODE_FIX
2508 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2509 /* in case of 8th bit is on */
2510 if (!estab_f&&!mime_decode_mode) {
2511 /* in case of not established yet */
2512 /* It is still ambiguous */
2513 if (h_conv(f, c2, c1)==EOF)
2519 /* in case of already established */
2521 /* ignore bogus code and not CP5022x UCD */
2529 /* second byte, 7 bit code */
2530 /* it might be kanji shifted */
2531 if ((c1 == DEL) || (c1 <= SP)) {
2532 /* ignore bogus first code */
2539 #ifdef UTF8_INPUT_ENABLE
2540 if (iconv == w_iconv16) {
2541 if (input_endian == ENDIAN_BIG) {
2543 if ((c1 = (*i_getc)(f)) != EOF) {
2544 if (0xD8 <= c2 && c2 <= 0xDB) {
2545 if ((c0 = (*i_getc)(f)) != EOF) {
2547 if ((c3 = (*i_getc)(f)) != EOF) {
2554 if ((c2 = (*i_getc)(f)) != EOF) {
2555 if (0xD8 <= c2 && c2 <= 0xDB) {
2556 if ((c3 = (*i_getc)(f)) != EOF) {
2557 if ((c0 = (*i_getc)(f)) != EOF) {
2566 } else if(iconv == w_iconv32){
2568 if((c2 = (*i_getc)(f)) != EOF &&
2569 (c1 = (*i_getc)(f)) != EOF &&
2570 (c0 = (*i_getc)(f)) != EOF){
2571 switch(input_endian){
2573 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2576 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2579 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2582 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2592 #ifdef NUMCHAR_OPTION
2593 if (is_unicode_capsule(c1)){
2597 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2599 if (!estab_f && !iso8859_f) {
2600 /* not established yet */
2603 } else { /* estab_f==TRUE */
2608 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2609 /* SJIS X0201 Case... */
2610 if(iso2022jp_f && x0201_f==NO_X0201) {
2611 (*oconv)(GETA1, GETA2);
2618 } else if (c1==SSO && iconv != s_iconv) {
2619 /* EUC X0201 Case */
2620 c1 = (*i_getc)(f); /* skip SSO */
2622 if (SSP<=c1 && c1<0xe0) {
2623 if(iso2022jp_f && x0201_f==NO_X0201) {
2624 (*oconv)(GETA1, GETA2);
2631 } else { /* bogus code, skip SSO and one byte */
2634 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2635 (c1 == 0xFD || c1 == 0xFE)) {
2641 /* already established */
2646 } else if ((c1 > SP) && (c1 != DEL)) {
2647 /* in case of Roman characters */
2649 /* output 1 shifted byte */
2653 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2654 /* output 1 shifted byte */
2655 if(iso2022jp_f && x0201_f==NO_X0201) {
2656 (*oconv)(GETA1, GETA2);
2663 /* look like bogus code */
2666 } else if (input_mode == X0208 || input_mode == X0212 ||
2667 input_mode == X0213_1 || input_mode == X0213_2) {
2668 /* in case of Kanji shifted */
2671 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2672 /* Check MIME code */
2673 if ((c1 = (*i_getc)(f)) == EOF) {
2676 } else if (c1 == '?') {
2677 /* =? is mime conversion start sequence */
2678 if(mime_f == STRICT_MIME) {
2679 /* check in real detail */
2680 if (mime_begin_strict(f) == EOF)
2684 } else if (mime_begin(f) == EOF)
2694 /* normal ASCII code */
2697 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2700 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2703 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2704 if ((c1 = (*i_getc)(f)) == EOF) {
2705 /* (*oconv)(0, ESC); don't send bogus code */
2707 } else if (c1 == '$') {
2708 if ((c1 = (*i_getc)(f)) == EOF) {
2710 (*oconv)(0, ESC); don't send bogus code
2711 (*oconv)(0, '$'); */
2713 } else if (c1 == '@'|| c1 == 'B') {
2714 /* This is kanji introduction */
2717 set_input_codename("ISO-2022-JP");
2719 debug("ISO-2022-JP");
2722 } else if (c1 == '(') {
2723 if ((c1 = (*i_getc)(f)) == EOF) {
2724 /* don't send bogus code
2730 } else if (c1 == '@'|| c1 == 'B') {
2731 /* This is kanji introduction */
2736 } else if (c1 == 'D'){
2740 #endif /* X0212_ENABLE */
2741 } else if (c1 == (X0213_1&0x7F)){
2742 input_mode = X0213_1;
2745 } else if (c1 == (X0213_2&0x7F)){
2746 input_mode = X0213_2;
2750 /* could be some special code */
2757 } else if (broken_f&0x2) {
2758 /* accept any ESC-(-x as broken code ... */
2768 } else if (c1 == '(') {
2769 if ((c1 = (*i_getc)(f)) == EOF) {
2770 /* don't send bogus code
2772 (*oconv)(0, '('); */
2776 /* This is X0201 kana introduction */
2777 input_mode = X0201; shift_mode = X0201;
2779 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2780 /* This is X0208 kanji introduction */
2781 input_mode = ASCII; shift_mode = FALSE;
2783 } else if (broken_f&0x2) {
2784 input_mode = ASCII; shift_mode = FALSE;
2789 /* maintain various input_mode here */
2793 } else if ( c1 == 'N' || c1 == 'n'){
2795 c3 = (*i_getc)(f); /* skip SS2 */
2796 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2811 } else if (c1 == ESC && iconv == s_iconv) {
2812 /* ESC in Shift_JIS */
2813 if ((c1 = (*i_getc)(f)) == EOF) {
2814 /* (*oconv)(0, ESC); don't send bogus code */
2816 } else if (c1 == '$') {
2818 if ((c1 = (*i_getc)(f)) == EOF) {
2820 (*oconv)(0, ESC); don't send bogus code
2821 (*oconv)(0, '$'); */
2824 if (('E' <= c1 && c1 <= 'G') ||
2825 ('O' <= c1 && c1 <= 'Q')) {
2833 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2834 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2835 while ((c1 = (*i_getc)(f)) != EOF) {
2836 if (SP <= c1 && c1 <= 'z') {
2837 (*oconv)(0, c1 + c0);
2838 } else break; /* c1 == SO */
2842 if (c1 == EOF) LAST;
2849 } else if (c1 == LF || c1 == CR) {
2851 input_mode = ASCII; set_iconv(FALSE, 0);
2853 } else if (mime_decode_f && !mime_decode_mode){
2855 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2863 } else { /* if (c1 == CR)*/
2864 if ((c1=(*i_getc)(f))!=EOF) {
2868 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2882 } else if (c1 == DEL && input_mode == X0208) {
2892 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2895 if ((c0 = (*i_getc)(f)) != EOF) {
2898 if ((c3 = (*i_getc)(f)) != EOF) {
2900 (*iconv)(c2, c1, c0|c3);
2905 /* 3 bytes EUC or UTF-8 */
2906 if ((c0 = (*i_getc)(f)) != EOF) {
2908 (*iconv)(c2, c1, c0);
2916 0x7F <= c2 && c2 <= 0x92 &&
2917 0x21 <= c1 && c1 <= 0x7E) {
2919 if(c1 == 0x7F) return 0;
2920 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2923 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2927 (*oconv)(PREFIX_EUCG3 | c2, c1);
2929 #endif /* X0212_ENABLE */
2931 (*oconv)(PREFIX_EUCG3 | c2, c1);
2934 (*oconv)(input_mode, c1); /* other special case */
2940 /* goto next_word */
2944 (*iconv)(EOF, 0, 0);
2945 if (!input_codename)
2948 struct input_code *p = input_code_list;
2949 struct input_code *result = p;
2951 if (p->score < result->score) result = p;
2954 set_input_codename(result->name);
2956 debug(result->name);
2964 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2966 nkf_char ret, c3, c0;
2970 /** it must NOT be in the kanji shifted sequence */
2971 /** it must NOT be written in JIS7 */
2972 /** and it must be after 2 byte 8bit code */
2978 while ((c1 = (*i_getc)(f)) != EOF) {
2984 if (push_hold_buf(c1) == EOF || estab_f){
2990 struct input_code *p = input_code_list;
2991 struct input_code *result = p;
2996 if (p->status_func && p->score < result->score){
3001 set_iconv(TRUE, result->iconv_func);
3006 ** 1) EOF is detected, or
3007 ** 2) Code is established, or
3008 ** 3) Buffer is FULL (but last word is pushed)
3010 ** in 1) and 3) cases, we continue to use
3011 ** Kanji codes by oconv and leave estab_f unchanged.
3016 while (hold_index < hold_count){
3017 c2 = hold_buf[hold_index++];
3019 #ifdef NUMCHAR_OPTION
3020 || is_unicode_capsule(c2)
3025 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3026 (*iconv)(X0201, c2, 0);
3029 if (hold_index < hold_count){
3030 c1 = hold_buf[hold_index++];
3040 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3043 if (hold_index < hold_count){
3044 c0 = hold_buf[hold_index++];
3045 } else if ((c0 = (*i_getc)(f)) == EOF) {
3051 if (hold_index < hold_count){
3052 c3 = hold_buf[hold_index++];
3053 } else if ((c3 = (*i_getc)(f)) == EOF) {
3058 (*iconv)(c2, c1, c0|c3);
3063 /* 3 bytes EUC or UTF-8 */
3064 if (hold_index < hold_count){
3065 c0 = hold_buf[hold_index++];
3066 } else if ((c0 = (*i_getc)(f)) == EOF) {
3072 (*iconv)(c2, c1, c0);
3075 if (c0 == EOF) break;
3080 nkf_char push_hold_buf(nkf_char c2)
3082 if (hold_count >= HOLD_SIZE*2)
3084 hold_buf[hold_count++] = (unsigned char)c2;
3085 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3088 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3090 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3093 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3094 #ifdef SHIFTJIS_CP932
3095 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3096 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3103 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3104 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3110 #endif /* SHIFTJIS_CP932 */
3112 if (!x0213_f && is_ibmext_in_sjis(c2)){
3113 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3116 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3129 if(x0213_f && c2 >= 0xF0){
3130 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3131 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3132 }else{ /* 78<=k<=94 */
3133 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3134 if (0x9E < c1) c2++;
3137 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3138 if (0x9E < c1) c2++;
3141 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3148 c2 = x0212_unshift(c2);
3155 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3159 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3161 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3163 if(c1 == 0x7F) return 0;
3164 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3167 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3168 if (ret) return ret;
3174 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3179 }else if (c2 == 0x8f){
3183 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3184 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3185 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3188 c2 = (c2 << 8) | (c1 & 0x7f);
3190 #ifdef SHIFTJIS_CP932
3193 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3194 s2e_conv(s2, s1, &c2, &c1);
3201 #endif /* SHIFTJIS_CP932 */
3203 #endif /* X0212_ENABLE */
3204 } else if (c2 == SSO){
3207 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3210 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3211 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3212 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3217 #ifdef SHIFTJIS_CP932
3218 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3220 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3221 s2e_conv(s2, s1, &c2, &c1);
3228 #endif /* SHIFTJIS_CP932 */
3235 #ifdef UTF8_INPUT_ENABLE
3236 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3243 }else if (0xc0 <= c2 && c2 <= 0xef) {
3244 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3245 #ifdef NUMCHAR_OPTION
3248 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3256 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3259 static const char w_iconv_utf8_1st_byte[] =
3261 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3262 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3263 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3264 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3266 if (c2 < 0 || 0xff < c2) {
3267 }else if (c2 == 0) { /* 0 : 1 byte*/
3269 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3272 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3274 if (c1 < 0x80 || 0xBF < c1) return 0;
3277 if (c0 == 0) return -1;
3278 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3283 if (c0 == 0) return -1;
3284 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3288 if (c0 == 0) return -1;
3289 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3293 if (c0 == 0) return -2;
3294 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3298 if (c0 == 0) return -2;
3299 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3303 if (c0 == 0) return -2;
3304 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3312 if (c2 == 0 || c2 == EOF){
3313 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3314 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3317 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3326 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3327 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3334 }else if (val < 0x800){
3335 *p2 = 0xc0 | (val >> 6);
3336 *p1 = 0x80 | (val & 0x3f);
3338 } else if (val <= NKF_INT32_C(0xFFFF)) {
3339 *p2 = 0xe0 | (val >> 12);
3340 *p1 = 0x80 | ((val >> 6) & 0x3f);
3341 *p0 = 0x80 | (val & 0x3f);
3342 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3343 *p2 = 0xe0 | (val >> 16);
3344 *p1 = 0x80 | ((val >> 12) & 0x3f);
3345 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3354 #ifdef UTF8_INPUT_ENABLE
3355 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3360 } else if (c2 >= 0xf0){
3361 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3362 val = (c2 & 0x0f) << 18;
3363 val |= (c1 & 0x3f) << 12;
3364 val |= (c0 & 0x3f00) >> 2;
3366 }else if (c2 >= 0xe0){
3367 val = (c2 & 0x0f) << 12;
3368 val |= (c1 & 0x3f) << 6;
3370 }else if (c2 >= 0xc0){
3371 val = (c2 & 0x1f) << 6;
3379 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3381 nkf_char c2, c1, c0;
3388 w16w_conv(val, &c2, &c1, &c0);
3389 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3390 #ifdef NUMCHAR_OPTION
3393 *p1 = CLASS_UNICODE | val;
3402 #ifdef UTF8_INPUT_ENABLE
3403 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3406 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3409 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3410 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3412 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3414 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3419 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3420 if (ret) return ret;
3425 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3429 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3430 } else if (is_unicode_bmp(c1)) {
3431 ret = w16e_conv(c1, &c2, &c1);
3434 c1 = CLASS_UNICODE | c1;
3436 if (ret) return ret;
3441 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3443 const unsigned short *const *pp;
3444 const unsigned short *const *const *ppp;
3445 static const char no_best_fit_chars_table_C2[] =
3446 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3447 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3448 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3449 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3450 static const char no_best_fit_chars_table_C2_ms[] =
3451 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3452 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3453 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3454 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3455 static const char no_best_fit_chars_table_932_C2[] =
3456 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3457 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3458 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3459 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3460 static const char no_best_fit_chars_table_932_C3[] =
3461 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3462 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3463 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3464 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3470 }else if(c2 < 0xe0){
3471 if(no_best_fit_chars_f){
3472 if(ms_ucs_map_f == UCS_MAP_CP932){
3475 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3478 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3481 }else if(!cp932inv_f){
3484 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3487 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3490 }else if(ms_ucs_map_f == UCS_MAP_MS){
3491 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3492 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3510 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3511 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3512 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3514 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3515 }else if(c0 < 0xF0){
3516 if(no_best_fit_chars_f){
3517 if(ms_ucs_map_f == UCS_MAP_CP932){
3518 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3519 }else if(ms_ucs_map_f == UCS_MAP_MS){
3524 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3527 if(c0 == 0x92) return 1;
3532 if(c1 == 0x80 || c0 == 0x9C) return 1;
3535 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3540 if(c0 == 0x94) return 1;
3543 if(c0 == 0xBB) return 1;
3553 if(c0 == 0x95) return 1;
3556 if(c0 == 0xA5) return 1;
3563 if(c0 == 0x8D) return 1;
3566 if(c0 == 0x9E && !cp932inv_f) return 1;
3569 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3577 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3578 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3579 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3581 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3583 #ifdef SHIFTJIS_CP932
3584 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3586 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3587 s2e_conv(s2, s1, p2, p1);
3596 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3599 const unsigned short *p;
3602 if (pp == 0) return 1;
3605 if (c1 < 0 || psize <= c1) return 1;
3607 if (p == 0) return 1;
3610 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3612 if (val == 0) return 1;
3613 if (no_cp932ext_f && (
3614 (val>>8) == 0x2D || /* NEC special characters */
3615 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3623 if (c2 == SO) c2 = X0201;
3630 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3637 (*f)(0, bin2hex(c>>shift));
3647 void encode_fallback_html(nkf_char c)
3652 if(c >= NKF_INT32_C(1000000))
3653 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3654 if(c >= NKF_INT32_C(100000))
3655 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3657 (*oconv)(0, 0x30+(c/10000 )%10);
3659 (*oconv)(0, 0x30+(c/1000 )%10);
3661 (*oconv)(0, 0x30+(c/100 )%10);
3663 (*oconv)(0, 0x30+(c/10 )%10);
3665 (*oconv)(0, 0x30+ c %10);
3670 void encode_fallback_xml(nkf_char c)
3675 nkf_each_char_to_hex(oconv, c);
3680 void encode_fallback_java(nkf_char c)
3684 if(!is_unicode_bmp(c)){
3688 (*oconv)(0, bin2hex(c>>20));
3689 (*oconv)(0, bin2hex(c>>16));
3693 (*oconv)(0, bin2hex(c>>12));
3694 (*oconv)(0, bin2hex(c>> 8));
3695 (*oconv)(0, bin2hex(c>> 4));
3696 (*oconv)(0, bin2hex(c ));
3700 void encode_fallback_perl(nkf_char c)
3705 nkf_each_char_to_hex(oconv, c);
3710 void encode_fallback_subchar(nkf_char c)
3712 c = unicode_subchar;
3713 (*oconv)((c>>8)&0xFF, c&0xFF);
3718 #ifdef UTF8_OUTPUT_ENABLE
3719 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3721 const unsigned short *p;
3724 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3732 p = euc_to_utf8_1byte;
3734 } else if (is_eucg3(c2)){
3735 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3738 c2 = (c2&0x7f) - 0x21;
3739 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3740 p = x0212_to_utf8_2bytes[c2];
3746 c2 = (c2&0x7f) - 0x21;
3747 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3749 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3750 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3751 euc_to_utf8_2bytes_ms[c2];
3756 c1 = (c1 & 0x7f) - 0x21;
3757 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3762 void w_oconv(nkf_char c2, nkf_char c1)
3768 output_bom_f = FALSE;
3779 #ifdef NUMCHAR_OPTION
3780 if (c2 == 0 && is_unicode_capsule(c1)){
3781 val = c1 & VALUE_MASK;
3784 }else if (val < 0x800){
3785 (*o_putc)(0xC0 | (val >> 6));
3786 (*o_putc)(0x80 | (val & 0x3f));
3787 } else if (val <= NKF_INT32_C(0xFFFF)) {
3788 (*o_putc)(0xE0 | (val >> 12));
3789 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3790 (*o_putc)(0x80 | (val & 0x3f));
3791 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3792 (*o_putc)(0xF0 | ( val>>18));
3793 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3794 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3795 (*o_putc)(0x80 | ( val & 0x3f));
3802 output_mode = ASCII;
3804 } else if (c2 == ISO8859_1) {
3805 output_mode = ISO8859_1;
3806 (*o_putc)(c1 | 0x080);
3809 val = e2w_conv(c2, c1);
3811 w16w_conv(val, &c2, &c1, &c0);
3815 if (c0) (*o_putc)(c0);
3821 void w_oconv16(nkf_char c2, nkf_char c1)
3824 output_bom_f = FALSE;
3825 if (output_endian == ENDIAN_LITTLE){
3826 (*o_putc)((unsigned char)'\377');
3830 (*o_putc)((unsigned char)'\377');
3839 if (c2 == ISO8859_1) {
3842 #ifdef NUMCHAR_OPTION
3843 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3844 if (is_unicode_bmp(c1)) {
3845 c2 = (c1 >> 8) & 0xff;
3849 if (c1 <= UNICODE_MAX) {
3850 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3851 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3852 if (output_endian == ENDIAN_LITTLE){
3853 (*o_putc)(c2 & 0xff);
3854 (*o_putc)((c2 >> 8) & 0xff);
3855 (*o_putc)(c1 & 0xff);
3856 (*o_putc)((c1 >> 8) & 0xff);
3858 (*o_putc)((c2 >> 8) & 0xff);
3859 (*o_putc)(c2 & 0xff);
3860 (*o_putc)((c1 >> 8) & 0xff);
3861 (*o_putc)(c1 & 0xff);
3868 nkf_char val = e2w_conv(c2, c1);
3869 c2 = (val >> 8) & 0xff;
3873 if (output_endian == ENDIAN_LITTLE){
3882 void w_oconv32(nkf_char c2, nkf_char c1)
3885 output_bom_f = FALSE;
3886 if (output_endian == ENDIAN_LITTLE){
3887 (*o_putc)((unsigned char)'\377');
3895 (*o_putc)((unsigned char)'\377');
3904 if (c2 == ISO8859_1) {
3906 #ifdef NUMCHAR_OPTION
3907 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3911 c1 = e2w_conv(c2, c1);
3914 if (output_endian == ENDIAN_LITTLE){
3915 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3916 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3917 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3921 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3922 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3923 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3928 void e_oconv(nkf_char c2, nkf_char c1)
3930 #ifdef NUMCHAR_OPTION
3931 if (c2 == 0 && is_unicode_capsule(c1)){
3932 w16e_conv(c1, &c2, &c1);
3933 if (c2 == 0 && is_unicode_capsule(c1)){
3934 c2 = c1 & VALUE_MASK;
3935 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
3939 c2 += c2 < 10 ? 0x75 : 0x8FEB;
3940 c1 = 0x21 + c1 % 94;
3943 (*o_putc)((c2 & 0x7f) | 0x080);
3944 (*o_putc)(c1 | 0x080);
3946 (*o_putc)((c2 & 0x7f) | 0x080);
3947 (*o_putc)(c1 | 0x080);
3951 if (encode_fallback) (*encode_fallback)(c1);
3960 } else if (c2 == 0) {
3961 output_mode = ASCII;
3963 } else if (c2 == X0201) {
3964 output_mode = JAPANESE_EUC;
3965 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3966 } else if (c2 == ISO8859_1) {
3967 output_mode = ISO8859_1;
3968 (*o_putc)(c1 | 0x080);
3970 } else if (is_eucg3(c2)){
3971 output_mode = JAPANESE_EUC;
3972 #ifdef SHIFTJIS_CP932
3975 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3976 s2e_conv(s2, s1, &c2, &c1);
3981 output_mode = ASCII;
3983 }else if (is_eucg3(c2)){
3986 (*o_putc)((c2 & 0x7f) | 0x080);
3987 (*o_putc)(c1 | 0x080);
3990 (*o_putc)((c2 & 0x7f) | 0x080);
3991 (*o_putc)(c1 | 0x080);
3995 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
3996 set_iconv(FALSE, 0);
3997 return; /* too late to rescue this char */
3999 output_mode = JAPANESE_EUC;
4000 (*o_putc)(c2 | 0x080);
4001 (*o_putc)(c1 | 0x080);
4006 nkf_char x0212_shift(nkf_char c)
4011 if (0x75 <= c && c <= 0x7f){
4012 ret = c + (0x109 - 0x75);
4015 if (0x75 <= c && c <= 0x7f){
4016 ret = c + (0x113 - 0x75);
4023 nkf_char x0212_unshift(nkf_char c)
4026 if (0x7f <= c && c <= 0x88){
4027 ret = c + (0x75 - 0x7f);
4028 }else if (0x89 <= c && c <= 0x92){
4029 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4033 #endif /* X0212_ENABLE */
4035 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4041 if((0x21 <= ndx && ndx <= 0x2F)){
4042 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4043 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4045 }else if(0x6E <= ndx && ndx <= 0x7E){
4046 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4047 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4053 else if(nkf_isgraph(ndx)){
4055 const unsigned short *ptr;
4056 ptr = x0212_shiftjis[ndx - 0x21];
4058 val = ptr[(c1 & 0x7f) - 0x21];
4067 c2 = x0212_shift(c2);
4069 #endif /* X0212_ENABLE */
4071 if(0x7F < c2) return 1;
4072 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4073 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4077 void s_oconv(nkf_char c2, nkf_char c1)
4079 #ifdef NUMCHAR_OPTION
4080 if (c2 == 0 && is_unicode_capsule(c1)){
4081 w16e_conv(c1, &c2, &c1);
4082 if (c2 == 0 && is_unicode_capsule(c1)){
4083 c2 = c1 & VALUE_MASK;
4084 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4087 c2 = c1 / 188 + 0xF0;
4089 c1 += 0x40 + (c1 > 0x3e);
4094 if(encode_fallback)(*encode_fallback)(c1);
4103 } else if (c2 == 0) {
4104 output_mode = ASCII;
4106 } else if (c2 == X0201) {
4107 output_mode = SHIFT_JIS;
4109 } else if (c2 == ISO8859_1) {
4110 output_mode = ISO8859_1;
4111 (*o_putc)(c1 | 0x080);
4113 } else if (is_eucg3(c2)){
4114 output_mode = SHIFT_JIS;
4115 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4121 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4122 set_iconv(FALSE, 0);
4123 return; /* too late to rescue this char */
4125 output_mode = SHIFT_JIS;
4126 e2s_conv(c2, c1, &c2, &c1);
4128 #ifdef SHIFTJIS_CP932
4130 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4131 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4137 #endif /* SHIFTJIS_CP932 */
4140 if (prefix_table[(unsigned char)c1]){
4141 (*o_putc)(prefix_table[(unsigned char)c1]);
4147 void j_oconv(nkf_char c2, nkf_char c1)
4149 #ifdef NUMCHAR_OPTION
4150 if (c2 == 0 && is_unicode_capsule(c1)){
4151 w16e_conv(c1, &c2, &c1);
4152 if (c2 == 0 && is_unicode_capsule(c1)){
4153 c2 = c1 & VALUE_MASK;
4154 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4157 c2 = 0x7F + c1 / 94;
4158 c1 = 0x21 + c1 % 94;
4160 if (encode_fallback) (*encode_fallback)(c1);
4167 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4170 (*o_putc)(ascii_intro);
4171 output_mode = ASCII;
4175 } else if (is_eucg3(c2)){
4177 if(output_mode!=X0213_2){
4178 output_mode = X0213_2;
4182 (*o_putc)(X0213_2&0x7F);
4185 if(output_mode!=X0212){
4186 output_mode = X0212;
4190 (*o_putc)(X0212&0x7F);
4193 (*o_putc)(c2 & 0x7f);
4196 } else if (c2==X0201) {
4197 if (output_mode!=X0201) {
4198 output_mode = X0201;
4204 } else if (c2==ISO8859_1) {
4205 /* iso8859 introduction, or 8th bit on */
4206 /* Can we convert in 7bit form using ESC-'-'-A ?
4208 output_mode = ISO8859_1;
4210 } else if (c2 == 0) {
4211 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4214 (*o_putc)(ascii_intro);
4215 output_mode = ASCII;
4220 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4221 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4223 if (output_mode!=X0213_1) {
4224 output_mode = X0213_1;
4228 (*o_putc)(X0213_1&0x7F);
4230 }else if (output_mode != X0208) {
4231 output_mode = X0208;
4234 (*o_putc)(kanji_intro);
4241 void base64_conv(nkf_char c2, nkf_char c1)
4243 mime_prechar(c2, c1);
4244 (*o_base64conv)(c2,c1);
4248 static nkf_char broken_buf[3];
4249 static int broken_counter = 0;
4250 static int broken_last = 0;
4251 nkf_char broken_getc(FILE *f)
4255 if (broken_counter>0) {
4256 return broken_buf[--broken_counter];
4259 if (c=='$' && broken_last != ESC
4260 && (input_mode==ASCII || input_mode==X0201)) {
4263 if (c1=='@'|| c1=='B') {
4264 broken_buf[0]=c1; broken_buf[1]=c;
4271 } else if (c=='(' && broken_last != ESC
4272 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4275 if (c1=='J'|| c1=='B') {
4276 broken_buf[0]=c1; broken_buf[1]=c;
4289 nkf_char broken_ungetc(nkf_char c, FILE *f)
4291 if (broken_counter<2)
4292 broken_buf[broken_counter++]=c;
4296 void nl_conv(nkf_char c2, nkf_char c1)
4298 if (guess_f && input_nextline != EOF) {
4299 if (c2 == 0 && c1 == LF) {
4300 if (!input_nextline) input_nextline = prev_cr ? CRLF : LF;
4301 else if (input_nextline != (prev_cr ? CRLF : LF)) input_nextline = EOF;
4302 } else if (c2 == 0 && c1 == CR && input_nextline == LF) input_nextline = EOF;
4304 else if (!input_nextline) input_nextline = CR;
4305 else if (input_nextline != CR) input_nextline = EOF;
4307 if (prev_cr || c2 == 0 && c1 == LF) {
4309 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4310 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4312 if (c2 == 0 && c1 == CR) prev_cr = CR;
4313 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4317 Return value of fold_conv()
4319 LF add newline and output char
4320 CR add newline and output nothing
4323 1 (or else) normal output
4325 fold state in prev (previous character)
4327 >0x80 Japanese (X0208/X0201)
4332 This fold algorithm does not preserve leading spaces in a line.
4333 This is the main difference from fmt.
/* Width counted toward f_line for one character in fold_conv():
 * 2 when c2 is non-zero, otherwise 1.  c1 is accepted but not used.
 * The c2 argument is parenthesised so that any expression (including an
 * assignment) can be passed without changing how the ?: binds. */
#define char_size(c2,c1) ((c2)?2:1)
4338 void fold_conv(nkf_char c2, nkf_char c1)
4341 nkf_char fold_state;
4343 if (c1== CR && !fold_preserve_f) {
4344 fold_state=0; /* ignore cr */
4345 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4347 fold_state=0; /* ignore cr */
4348 } else if (c1== BS) {
4349 if (f_line>0) f_line--;
4351 } else if (c2==EOF && f_line != 0) { /* close open last line */
4353 } else if ((c1==LF && !fold_preserve_f)
4354 || ((c1==CR||(c1==LF&&f_prev!=CR))
4355 && fold_preserve_f)) {
4357 if (fold_preserve_f) {
4361 } else if ((f_prev == c1 && !fold_preserve_f)
4362 || (f_prev == LF && fold_preserve_f)
4363 ) { /* duplicate newline */
4366 fold_state = LF; /* output two newline */
4372 if (f_prev&0x80) { /* Japanese? */
4374 fold_state = 0; /* ignore given single newline */
4375 } else if (f_prev==SP) {
4379 if (++f_line<=fold_len)
4383 fold_state = CR; /* fold and output nothing */
4387 } else if (c1=='\f') {
4390 fold_state = LF; /* output newline and clear */
4391 } else if ( (c2==0 && c1==SP)||
4392 (c2==0 && c1==TAB)||
4393 (c2=='!'&& c1=='!')) {
4394 /* X0208 kankaku or ascii space */
4396 fold_state = 0; /* remove duplicate spaces */
4399 if (++f_line<=fold_len)
4400 fold_state = SP; /* output ASCII space only */
4402 f_prev = SP; f_line = 0;
4403 fold_state = CR; /* fold and output nothing */
4407 prev0 = f_prev; /* we still need this one... , but almost done */
4409 if (c2 || c2==X0201)
4410 f_prev |= 0x80; /* this is Japanese */
4411 f_line += char_size(c2,c1);
4412 if (f_line<=fold_len) { /* normal case */
4415 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4416 f_line = char_size(c2,c1);
4417 fold_state = LF; /* We can't wait, do fold now */
4418 } else if (c2==X0201) {
4419 /* simple kinsoku rules return 1 means no folding */
4420 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4421 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4422 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4423 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4424 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4425 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4426 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4428 fold_state = LF;/* add one new f_line before this character */
4431 fold_state = LF;/* add one new f_line before this character */
4434 /* kinsoku point in ASCII */
4435 if ( c1==')'|| /* { [ ( */
4446 /* just after special */
4447 } else if (!is_alnum(prev0)) {
4448 f_line = char_size(c2,c1);
4450 } else if ((prev0==SP) || /* ignored new f_line */
4451 (prev0==LF)|| /* ignored new f_line */
4452 (prev0&0x80)) { /* X0208 - ASCII */
4453 f_line = char_size(c2,c1);
4454 fold_state = LF;/* add one new f_line before this character */
4456 fold_state = 1; /* default no fold in ASCII */
4460 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4461 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4462 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4463 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4464 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4465 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4466 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4467 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4468 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4469 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4470 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4471 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4472 /* default no fold in kinsoku */
4475 f_line = char_size(c2,c1);
4476 /* add one new f_line before this character */
4479 f_line = char_size(c2,c1);
4481 /* add one new f_line before this character */
4486 /* terminator process */
4487 switch(fold_state) {
4506 nkf_char z_prev2=0,z_prev1=0;
4508 void z_conv(nkf_char c2, nkf_char c1)
4511 /* if (c2) c1 &= 0x7f; assertion */
4513 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4519 if (z_prev2 == X0201) {
4521 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4523 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4525 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4527 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4532 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4535 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4536 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4541 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4552 if (alpha_f&1 && c2 == 0x23) {
4553 /* JISX0208 Alphabet */
4555 } else if (c2 == 0x21) {
4556 /* JISX0208 Kigou */
4561 } else if (alpha_f&4) {
4566 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4572 if (alpha_f&8 && c2 == 0) {
4576 case '>': entity = ">"; break;
4577 case '<': entity = "<"; break;
4578 case '\"': entity = """; break;
4579 case '&': entity = "&"; break;
4582 while (*entity) (*o_zconv)(0, *entity++);
4588 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4593 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4597 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4601 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4605 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4609 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4613 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4617 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4621 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4626 (*o_zconv)(X0201, c);
4629 } else if (c2 == 0x25) {
4630 /* JISX0208 Katakana */
4631 static const int fullwidth_to_halfwidth[] =
4633 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4634 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4635 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4636 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4637 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4638 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4639 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4640 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4641 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4642 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4643 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4644 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4646 if (fullwidth_to_halfwidth[c1-0x20]){
4647 c2 = fullwidth_to_halfwidth[c1-0x20];
4648 (*o_zconv)(X0201, c2>>8);
4650 (*o_zconv)(X0201, c2&0xFF);
4660 #define rot13(c) ( \
4662 (c <= 'M') ? (c + 13): \
4663 (c <= 'Z') ? (c - 13): \
4665 (c <= 'm') ? (c + 13): \
4666 (c <= 'z') ? (c - 13): \
4670 #define rot47(c) ( \
4672 ( c <= 'O') ? (c + 47) : \
4673 ( c <= '~') ? (c - 47) : \
4677 void rot_conv(nkf_char c2, nkf_char c1)
4679 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4685 (*o_rot_conv)(c2,c1);
4688 void hira_conv(nkf_char c2, nkf_char c1)
4692 if (0x20 < c1 && c1 < 0x74) {
4694 (*o_hira_conv)(c2,c1);
4696 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4698 c1 = CLASS_UNICODE | 0x3094;
4699 (*o_hira_conv)(c2,c1);
4702 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4704 (*o_hira_conv)(c2,c1);
4709 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4712 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4714 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4718 (*o_hira_conv)(c2,c1);
4722 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4724 static const nkf_char range[RANGE_NUM_MAX][2] = {
4745 nkf_char start, end, c;
4747 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4751 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4756 for (i = 0; i < RANGE_NUM_MAX; i++) {
4757 start = range[i][0];
4760 if (c >= start && c <= end) {
4765 (*o_iso2022jp_check_conv)(c2,c1);
4769 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4771 static const unsigned char *mime_pattern[] = {
4772 (const unsigned char *)"\075?EUC-JP?B?",
4773 (const unsigned char *)"\075?SHIFT_JIS?B?",
4774 (const unsigned char *)"\075?ISO-8859-1?Q?",
4775 (const unsigned char *)"\075?ISO-8859-1?B?",
4776 (const unsigned char *)"\075?ISO-2022-JP?B?",
4777 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4778 #if defined(UTF8_INPUT_ENABLE)
4779 (const unsigned char *)"\075?UTF-8?B?",
4780 (const unsigned char *)"\075?UTF-8?Q?",
4782 (const unsigned char *)"\075?US-ASCII?Q?",
4787 /*
\e$B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u
\e(B */
4788 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4789 e_iconv, s_iconv, 0, 0, 0, 0,
4790 #if defined(UTF8_INPUT_ENABLE)
4796 static const nkf_char mime_encode[] = {
4797 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4798 #if defined(UTF8_INPUT_ENABLE)
4805 static const nkf_char mime_encode_method[] = {
4806 'B', 'B','Q', 'B', 'B', 'Q',
4807 #if defined(UTF8_INPUT_ENABLE)
4815 #define MAXRECOVER 20
4817 void switch_mime_getc(void)
4819 if (i_getc!=mime_getc) {
4820 i_mgetc = i_getc; i_getc = mime_getc;
4821 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4822 if(mime_f==STRICT_MIME) {
4823 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4824 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4829 void unswitch_mime_getc(void)
4831 if(mime_f==STRICT_MIME) {
4832 i_mgetc = i_mgetc_buf;
4833 i_mungetc = i_mungetc_buf;
4836 i_ungetc = i_mungetc;
4837 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4838 mime_iconv_back = NULL;
4841 nkf_char mime_begin_strict(FILE *f)
4845 const unsigned char *p,*q;
4846 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4848 mime_decode_mode = FALSE;
4849 /* =? has been checked */
4851 p = mime_pattern[j];
4854 for(i=2;p[i]>SP;i++) { /* start at =? */
4855 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4856 /* pattern fails, try next one */
4858 while (mime_pattern[++j]) {
4859 p = mime_pattern[j];
4860 for(k=2;k<i;k++) /* assume length(p) > i */
4861 if (p[k]!=q[k]) break;
4862 if (k==i && nkf_toupper(c1)==p[k]) break;
4864 p = mime_pattern[j];
4865 if (p) continue; /* found next one, continue */
4866 /* all fails, output from recovery buffer */
4874 mime_decode_mode = p[i-2];
4876 mime_iconv_back = iconv;
4877 set_iconv(FALSE, mime_priority_func[j]);
4878 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4880 if (mime_decode_mode=='B') {
4881 mimebuf_f = unbuf_f;
4883 /* do MIME integrity check */
4884 return mime_integrity(f,mime_pattern[j]);
4892 nkf_char mime_getc_buf(FILE *f)
4894 /* we don't keep EOF of Fifo, because it contains ?= as
4895 a terminator. It was checked in mime_integrity. */
4896 return ((mimebuf_f)?
4897 (*i_mgetc_buf)(f):Fifo(mime_input++));
4900 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4903 (*i_mungetc_buf)(c,f);
4905 Fifo(--mime_input) = (unsigned char)c;
4909 nkf_char mime_begin(FILE *f)
4914 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4915 /* re-read and convert again from mime_buffer. */
4917 /* =? has been checked */
4919 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4920 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4921 /* We accept any character type even if it is broken by new lines */
4922 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4923 if (c1==LF||c1==SP||c1==CR||
4924 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4926 /* Failed. But this could be another MIME preamble */
4934 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4935 if (!(++i<MAXRECOVER) || c1==EOF) break;
4936 if (c1=='b'||c1=='B') {
4937 mime_decode_mode = 'B';
4938 } else if (c1=='q'||c1=='Q') {
4939 mime_decode_mode = 'Q';
4943 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4944 if (!(++i<MAXRECOVER) || c1==EOF) break;
4946 mime_decode_mode = FALSE;
4952 if (!mime_decode_mode) {
4953 /* false MIME preamble, restart from mime_buffer */
4954 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4955 /* Since we are in MIME mode until buffer becomes empty, */
4956 /* we never go into mime_begin again for a while. */
4959 /* discard mime preamble, and go to MIME mode */
4961 /* do no MIME integrity check */
4962 return c1; /* used only for checking EOF */
4966 void no_putc(nkf_char c)
4971 void debug(const char *str)
4974 fprintf(stderr, "%s\n", str ? str : "NULL");
4979 void set_input_codename(char *codename)
4981 if (!input_codename) {
4982 input_codename = codename;
4983 } else if (strcmp(codename, input_codename) != 0) {
4984 is_inputcode_mixed = TRUE;
4985 input_codename = "";
4989 #if !defined(PERL_XS) && !defined(WIN32DLL)
4990 void print_guessed_code(char *filename)
4992 char *codename = "BINARY";
4993 char *str_nlmode = NULL;
4994 if (filename != NULL) printf("%s: ", filename);
4995 if (input_codename && !*input_codename) {
4999 (input_codename ? input_codename : "ASCII"),
5000 input_nextline == CR ? " (CR)" :
5001 input_nextline == LF ? " (LF)" :
5002 input_nextline == CRLF ? " (CRLF)" :
5003 input_nextline == EOF ? " (MIXED NL)" :
/*
 * Decode a two-digit hexadecimal escape read from f.
 *   ch - presumably the escape introducer (the wrappers in this file pass
 *        a colon and a percent sign) -- TODO confirm against the full body
 *   g  - getc-style reader used on f
 *   u  - ungetc-style push-back used on f
 * When both c2 and c3 pass nkf_isxdigit, the value
 * (hex2bin(c2) << 4) | hex2bin(c3) is returned.
 */
5011 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5013 nkf_char c1, c2, c3;
/* check that the first digit is hexadecimal */
5019 if (!nkf_isxdigit(c2)){
/* check that the second digit is hexadecimal */
5024 if (!nkf_isxdigit(c3)){
/* c2 supplies the high nibble, c3 the low nibble */
5029 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Read one character from f via hex_getc, passing a colon as ch and i_cgetc / i_cungetc as the reader and push-back. */
5032 nkf_char cap_getc(FILE *f)
5034 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Push c back on f by forwarding to the function stored in i_cungetc; its result is returned unchanged. */
5037 nkf_char cap_ungetc(nkf_char c, FILE *f)
5039 return (*i_cungetc)(c, f);
/* Read one character from f via hex_getc, passing a percent sign as ch and i_ugetc / i_uungetc as the reader and push-back. */
5042 nkf_char url_getc(FILE *f)
5044 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Push c back on f by forwarding to the function stored in i_uungetc; its result is returned unchanged. */
5047 nkf_char url_ungetc(nkf_char c, FILE *f)
5049 return (*i_uungetc)(c, f);
5053 #ifdef NUMCHAR_OPTION
/*
 * Reader that decodes a numeric character reference (only built with
 * NUMCHAR_OPTION).  Characters are read through i_ngetc and pushed back
 * through i_nungetc.  An x or X in buf selects the hexadecimal form, scanned
 * for at most 7 digits; otherwise the decimal form is scanned for at most
 * 8 digits.  A decoded value is returned with CLASS_UNICODE or-ed in.
 * NOTE(review): how c is scaled between digits is not visible in these
 * lines -- confirm against the full body.
 */
5054 nkf_char numchar_getc(FILE *f)
5056 nkf_char (*g)(FILE *) = i_ngetc;
5057 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
/* hexadecimal form */
5068 if (buf[i] == 'x' || buf[i] == 'X'){
5069 for (j = 0; j < 7; j++){
5071 if (!nkf_isxdigit(buf[i])){
/* merge the value of this hex digit into c */
5078 c |= hex2bin(buf[i]);
/* decimal form */
5081 for (j = 0; j < 8; j++){
5085 if (!nkf_isdigit(buf[i])){
/* hex2bin is also used to get the value of a decimal digit */
5092 c += hex2bin(buf[i]);
5098 return CLASS_UNICODE | c;
/* Push c back on f by forwarding to the function stored in i_nungetc; its result is returned unchanged. */
5107 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5109 return (*i_nungetc)(c, f);
5113 #ifdef UNICODE_NORMALIZATION
5115 /* Normalization Form C */
/*
 * Reader that rewrites input bytes using normalization_table (only built
 * with UNICODE_NORMALIZATION).  Characters are read through i_nfc_getc and
 * pushed back through i_nfc_ungetc.  The inner while loop is a binary search:
 * entry j is the midpoint of [lower, upper], its nfd sequence is compared
 * element by element with buf, and the first mismatch moves lower or upper.
 * After the search the nfc sequence of entry j is copied over buf.
 * NOTE(review): the exact match / no-match handling is not visible in these
 * lines -- confirm against the full body.
 */
5116 nkf_char nfc_getc(FILE *f)
5118 nkf_char (*g)(FILE *f) = i_nfc_getc;
5119 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5120 int i=0, j, k=1, lower, upper;
5122 const nkf_nfchar *array;
/* loop while k is positive and buf[i] does not have the bit pattern 10xxxxxx */
5125 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5126 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5127 while (upper >= lower) {
5128 j = (lower+upper) / 2;
5129 array = normalization_table[j].nfd;
/* compare the table nfd sequence (zero terminated, at most NORMALIZATION_TABLE_NFD_LENGTH long) with buf */
5130 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5131 if (array[k] != buf[k]){
/* table entry sorts lower than buf: search the upper half, otherwise the lower half */
5132 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* replace the contents of buf with the nfc sequence of entry j */
5139 array = normalization_table[j].nfc;
5140 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5141 buf[i] = (nkf_char)(array[i]);
/* Push c back on f by forwarding to the function stored in i_nfc_ungetc; its result is returned unchanged. */
5152 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5154 return (*i_nfc_ungetc)(c, f);
5156 #endif /* UNICODE_NORMALIZATION */
/*
 * Locals and body of the MIME decoding reader (mime_getc, judging by the
 * unswitch_mime_getc() call and the FIFO handling below -- TODO confirm; the
 * function header is not part of these lines).
 * Visible behaviour:
 *   - bytes still pending in the FIFO are returned first;
 *   - with mime_decode_mode equal to 1 or FALSE the reader unswitches itself
 *     and reads straight from i_getc;
 *   - mode Q decodes the quoted form: = followed by two hex digits, and an
 *     underscore as a space unless FIXED_MIME is in effect;
 *   - mode B reads four base64 characters and queues up to three decoded
 *     bytes in the FIFO;
 *   - after the closing ?= of an encoded word, following white space is
 *     collected in lwsp_buf and pushed back through i_ungetc when the
 *     condition on c1 and the last buffered character holds.
 */
5162 nkf_char c1, c2, c3, c4, cc;
5163 nkf_char t1, t2, t3, t4, mode, exit_mode;
5164 nkf_char lwsp_count;
/* initial capacity of lwsp_buf; the allocations below add 5 spare bytes */
5167 nkf_char lwsp_size = 128;
5169 if (mime_top != mime_last) { /* Something is in FIFO */
5170 return Fifo(mime_top++);
/* nothing to decode: leave MIME mode and read directly from the input */
5172 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5173 mime_decode_mode=FALSE;
5174 unswitch_mime_getc();
5175 return (*i_getc)(f);
5178 if (mimebuf_f == FIXED_MIME)
5179 exit_mode = mime_decode_mode;
/* ---- Q (quoted) encoding ---- */
5182 if (mime_decode_mode == 'Q') {
5183 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
/* underscore stands for a space unless FIXED_MIME is in effect */
5185 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5186 if (c1<=SP || DEL<=c1) {
5187 mime_decode_mode = exit_mode; /* prepare for quit */
/* neither an = escape nor a possible ?= terminator */
5190 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5194 mime_decode_mode = exit_mode; /* prepare for quit */
5195 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5196 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5197 /* end Q encoding */
5198 input_mode = exit_mode;
/* buffer for the white space that follows the encoded word */
5200 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5201 if (lwsp_buf==NULL) {
5202 perror("can't malloc");
5205 while ((c1=(*i_getc)(f))!=EOF) {
5210 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5218 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5219 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5234 lwsp_buf[lwsp_count] = (unsigned char)c1;
/* post-increment compare: growth is triggered only after lwsp_count has passed lwsp_size; the 5 spare bytes cover the store above */
5235 if (lwsp_count++>lwsp_size){
/* realloc result goes to a temporary so lwsp_buf stays valid on failure */
5237 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5238 if (lwsp_buf_new==NULL) {
5240 perror("can't realloc");
5243 lwsp_buf = lwsp_buf_new;
5249 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
/* push the buffered characters back in reverse order; index 0 is not pushed by this loop */
5251 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5252 i_ungetc(lwsp_buf[lwsp_count],f);
5258 if (c1=='='&&c2<SP) { /* this is soft wrap */
/* NOTE(review): the loop condition and the loop body each call i_mgetc, so two characters are consumed per iteration -- verify this is intended */
5259 while((c1 = (*i_mgetc)(f)) <=SP) {
5260 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5262 mime_decode_mode = 'Q'; /* still in MIME */
5263 goto restart_mime_q;
5266 mime_decode_mode = 'Q'; /* still in MIME */
5270 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5271 if (c2<=SP) return c2;
5272 mime_decode_mode = 'Q'; /* still in MIME */
/* combine the two hex digits c2 and c3 into one byte */
5273 return ((hex2bin(c2)<<4) + hex2bin(c3));
/* neither Q nor B: drop out of MIME mode and read through i_mgetc */
5276 if (mime_decode_mode != 'B') {
5277 mime_decode_mode = FALSE;
5278 return (*i_mgetc)(f);
5282 /* Base64 encoding */
5284 MIME allows line break in the middle of
5285 Base64, but we are very pessimistic in decoding
5286 in unbuf mode because MIME encoded code may broken by
5287 less or editor's control sequence (such as ESC-[-K in unbuffered
5288 mode. ignore incomplete MIME.
5290 mode = mime_decode_mode;
5291 mime_decode_mode = exit_mode; /* prepare for quit */
5293 while ((c1 = (*i_mgetc)(f))<=SP) {
5298 if ((c2 = (*i_mgetc)(f))<=SP) {
5301 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5302 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5305 if ((c1 == '?') && (c2 == '=')) {
5308 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5309 if (lwsp_buf==NULL) {
5310 perror("can't malloc");
5313 while ((c1=(*i_getc)(f))!=EOF) {
5318 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5326 if ((c1=(*i_getc)(f))!=EOF) {
5330 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5345 lwsp_buf[lwsp_count] = (unsigned char)c1;
5346 if (lwsp_count++>lwsp_size){
5348 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5349 if (lwsp_buf_new==NULL) {
5351 perror("can't realloc");
5354 lwsp_buf = lwsp_buf_new;
5360 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5362 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5363 i_ungetc(lwsp_buf[lwsp_count],f);
5370 if ((c3 = (*i_mgetc)(f))<=SP) {
5373 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5374 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5378 if ((c4 = (*i_mgetc)(f))<=SP) {
5381 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5382 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5386 mime_decode_mode = mode; /* still in MIME sigh... */
5388 /* BASE 64 decoding */
5390 t1 = 0x3f & base64decode(c1);
5391 t2 = 0x3f & base64decode(c2);
5392 t3 = 0x3f & base64decode(c3);
5393 t4 = 0x3f & base64decode(c4);
5394 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5396 Fifo(mime_last++) = (unsigned char)cc;
5397 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5399 Fifo(mime_last++) = (unsigned char)cc;
5400 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5402 Fifo(mime_last++) = (unsigned char)cc;
5407 return Fifo(mime_top++);
/* Push c back into the MIME FIFO: mime_top is stepped back by one and c is stored there as an unsigned char. */
5410 nkf_char mime_ungetc(nkf_char c, FILE *f)
5412 Fifo(--mime_top) = (unsigned char)c;
/*
 * Pre-scan a candidate MIME encoded word into the FIFO.
 * The FIFO cursors mime_input and mime_last are reset to mime_top, the bytes
 * of p are copied in, and further characters are read from f through i_getc
 * until EOF or until the FIFO is full.  Seeing a ? followed by = marks the
 * word as checked.  On the incomplete path at the end, the data stays in the
 * FIFO undecoded: mime_last is set to mime_input, mime_decode_mode is set
 * to 1 and switch_mime_getc() is called so the buffered bytes are read back.
 */
5416 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5420 /* In buffered mode, read until =? or NL or buffer full
5422 mime_input = mime_top;
5423 mime_last = mime_top;
5425 while(*p) Fifo(mime_input++) = *p++;
5428 while((c=(*i_getc)(f))!=EOF) {
5429 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5430 break; /* buffer full */
5432 if (c=='=' && d=='?') {
5433 /* checked. skip header, start decode */
5434 Fifo(mime_input++) = (unsigned char)c;
5435 /* mime_last_input = mime_input; */
/* test for a character outside the set of + / = ? and alphanumerics */
5440 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5442 /* Should we check length mod 4? */
5443 Fifo(mime_input++) = (unsigned char)c;
5446 /* In case of Incomplete MIME, no MIME decode */
5447 Fifo(mime_input++) = (unsigned char)c;
5448 mime_last = mime_input; /* point undecoded buffer */
5449 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5450 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Map one base64 alphabet character to its 6-bit value.
 * The arithmetic uses character constants as plain ASCII numbers, as the
 * existing inline comments spell out: G is a minus 26, 4 has code 52,
 * the greater-than sign has code 62 and the question mark has code 63.
 * The final branch assigns 63 to whatever did not match an earlier test.
 */
5454 nkf_char base64decode(nkf_char c)
5459 i = c - 'A'; /* A..Z 0-25 */
5461 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5463 } else if (c > '/') {
5464 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5465 } else if (c == '+') {
5466 i = '>' /* 62 */ ; /* + 62 */
5468 i = '?' /* 63 */ ; /* / 63 */
/* The 64 output characters of base64, indexed by 6-bit value. */
5473 static const char basis_64[] =
5474 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Bits carried over between calls of the base64 encoder (read in close_mime and mimeout_addchar). */
5476 static nkf_char b64c;
/* Capacity of the pending-output buffer; the array has one extra element. */
5477 #define MIMEOUT_BUF_LENGTH (60)
5478 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5479 int mimeout_buf_count = 0;
5480 int mimeout_preserve_space = 0;
/* Convert a value 0..15 to an upper-case hex digit.  NOTE(review): the argument is not parenthesized and is evaluated more than once, so pass only simple, side-effect-free expressions. */
5481 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/*
 * Begin MIME-encoded output for the given output mode.
 * mode is looked up in mime_encode[]; the matching mime_pattern[] entry is
 * selected as p (entry 0 is the fallback) and mimeout_mode is taken from
 * mime_encode_method[].  White space at the front of mimeout_buf is written
 * through o_mputc without encoding, and the characters still held in
 * mimeout_buf are then replayed through mime_putc.
 */
5483 void open_mime(nkf_char mode)
5485 const unsigned char *p;
/* default to the first pattern, then search for the one matching mode */
5488 p = mime_pattern[0];
5489 for(i=0;mime_pattern[i];i++) {
5490 if (mode == mime_encode[i]) {
5491 p = mime_pattern[i];
5495 mimeout_mode = mime_encode_method[i];
/* the current line already holds more than 45 columns */
5498 if (base64_count>45) {
5499 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5500 (*o_mputc)(mimeout_buf[i]);
5506 if (!mimeout_preserve_space && mimeout_buf_count>0
5507 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5508 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5512 if (!mimeout_preserve_space) {
/* write buffered white space characters out as they are */
5513 for (;i<mimeout_buf_count;i++) {
5514 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5515 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5516 (*o_mputc)(mimeout_buf[i]);
5523 mimeout_preserve_space = FALSE;
/* remember how many characters are buffered, empty the buffer, then feed them through mime_putc */
5529 j = mimeout_buf_count;
5530 mimeout_buf_count = 0;
5532 mime_putc(mimeout_buf[i]);
/*
 * Finish MIME-encoded output.  Depending on mimeout_mode, the bits left over
 * in b64c are written as one last base64 character: the low 2 bits shifted
 * left by 4, or the low 4 bits shifted left by 2.  Further handling depends
 * on mimeout_f and on whether the mode is Q.
 */
5536 void close_mime(void)
5546 switch(mimeout_mode) {
/* two bits pending */
5551 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
/* four bits pending */
5557 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5563 if (mimeout_f!=FIXED_MIME) {
5565 } else if (mimeout_mode != 'Q')
/*
 * Encode one character c according to mimeout_mode and write the result
 * through o_mputc.  Visible cases: a non-alphanumeric character is written
 * as two hex digits (high nibble first, via itoh4); for base64 the output
 * characters are formed from the top 6 bits of c, from 2 carried bits plus
 * the top 4 bits of c, and from 4 carried bits plus the top 2 bits of c
 * followed by the low 6 bits of c.  Carried bits come from b64c.
 */
5570 void mimeout_addchar(nkf_char c)
5572 switch(mimeout_mode) {
5577 } else if(!nkf_isalnum(c)) {
/* high nibble, then low nibble */
5579 (*o_mputc)(itoh4(((c>>4)&0xf)));
5580 (*o_mputc)(itoh4((c&0xf)));
/* first byte of a group: top 6 bits */
5589 (*o_mputc)(basis_64[c>>2]);
/* second byte: 2 carried bits + top 4 bits of c */
5594 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
/* third byte: 4 carried bits + top 2 bits of c, then the low 6 bits of c */
5600 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5601 (*o_mputc)(basis_64[c & 0x3F]);
5612 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * Line-length check made with a character pair (c2, c1).
 * The projected length is base64_count plus the base64 size of the buffered
 * characters (mimeout_buf_count/3*4).  When it exceeds the threshold (73 in
 * one branch, 66 in the other; the condition choosing between them is not
 * visible in these lines) EOF, LF and SP are sent through o_base64conv in
 * that order.  The remainder is commented-out code.
 */
5614 void mime_prechar(nkf_char c2, nkf_char c1)
5618 if (base64_count + mimeout_buf_count/3*4> 73){
5619 (*o_base64conv)(EOF,0);
5620 (*o_base64conv)(0,LF);
5621 (*o_base64conv)(0,SP);
5624 if (base64_count + mimeout_buf_count/3*4> 66){
5625 (*o_base64conv)(EOF,0);
5626 (*o_base64conv)(0,LF);
5627 (*o_base64conv)(0,SP);
5629 }/*else if (mime_lastchar2){
5630 if (c1 <=DEL && !nkf_isspace(c1)){
5631 (*o_base64conv)(0,SP);
5635 if (c2 && mime_lastchar2 == 0
5636 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5637 (*o_base64conv)(0,SP);
5640 /*mime_lastchar2 = c2;
5641 mime_lastchar1 = c1;*/
/*
 * MIME-encoding output stage for one character c (EOF flushes).
 * Visible structure:
 *   - with mimeout_f == FIXED_MIME the line length in base64_count is
 *     compared against 71 and EOF / non-EOF are handled separately;
 *   - otherwise, on EOF the characters held in mimeout_buf are taken over
 *     and passed to mimeout_addchar;
 *   - while mimeout_mode is Q, characters up to DEL in ASCII or ISO8859_1
 *     output mode get separate treatment for CR/LF and other controls;
 *   - while no encoded word is open, such characters are collected in
 *     mimeout_buf and written raw through o_mputc at line breaks; open_mime
 *     is called once the buffer holds more than MIMEOUT_BUF_LENGTH
 *     characters or another character class arrives;
 *   - while an encoded word is open (B, 1, 2), buffered characters are
 *     either encoded through mimeout_addchar or written raw through o_mputc
 *     depending on c and on the last buffered character.
 */
5644 void mime_putc(nkf_char c)
/* fixed-MIME output path */
5649 if (mimeout_f == FIXED_MIME){
5650 if (mimeout_mode == 'Q'){
5651 if (base64_count > 71){
5652 if (c!=CR && c!=LF) {
5659 if (base64_count > 71){
5664 if (c == EOF) { /* c==EOF */
5668 if (c != EOF) { /* c!=EOF */
5674 /* mimeout_f != FIXED_MIME */
5676 if (c == EOF) { /* c==EOF */
/* take over the j buffered characters and mark the buffer empty */
5677 j = mimeout_buf_count;
5678 mimeout_buf_count = 0;
5681 if (!nkf_isblank(mimeout_buf[j-1])) {
5683 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5686 mimeout_addchar(mimeout_buf[i]);
5690 mimeout_addchar(mimeout_buf[i]);
5694 mimeout_addchar(mimeout_buf[i]);
5700 mimeout_addchar(mimeout_buf[i]);
/* Q encoding currently active */
5706 if (mimeout_mode=='Q') {
5707 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5708 if (c == CR || c == LF) {
5713 } else if (c <= SP) {
5715 if (base64_count > 70) {
5719 if (!nkf_isblank(c)) {
/* remember the most recently buffered character */
5730 if (mimeout_buf_count > 0){
5731 lastchar = mimeout_buf[mimeout_buf_count - 1];
/* no encoded word is open */
5736 if (!mimeout_mode) {
5737 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5738 if (nkf_isspace(c)) {
5739 if (c==CR || c==LF) {
/* write the buffered characters out unencoded */
5742 for (i=0;i<mimeout_buf_count;i++) {
5743 (*o_mputc)(mimeout_buf[i]);
5744 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
/* restart the buffer with c as its only character */
5750 mimeout_buf[0] = (char)c;
5751 mimeout_buf_count = 1;
5753 if (base64_count > 1
5754 && base64_count + mimeout_buf_count > 76
5755 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5758 if (!nkf_isspace(mimeout_buf[0])){
/* keep buffering; once more than MIMEOUT_BUF_LENGTH characters are held, start an encoded word */
5763 mimeout_buf[mimeout_buf_count++] = (char)c;
5764 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5765 open_mime(output_mode);
5770 if (lastchar==CR || lastchar == LF){
5771 for (i=0;i<mimeout_buf_count;i++) {
5772 (*o_mputc)(mimeout_buf[i]);
5775 mimeout_buf_count = 0;
/* write all buffered characters except the last one */
5778 for (i=0;i<mimeout_buf_count-1;i++) {
5779 (*o_mputc)(mimeout_buf[i]);
5782 mimeout_buf[0] = SP;
5783 mimeout_buf_count = 1;
5785 open_mime(output_mode);
5788 /* mimeout_mode == 'B', 1, 2 */
5789 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5790 if (lastchar == CR || lastchar == LF){
5791 if (nkf_isblank(c)) {
/* encode what is buffered */
5792 for (i=0;i<mimeout_buf_count;i++) {
5793 mimeout_addchar(mimeout_buf[i]);
5795 mimeout_buf_count = 0;
5796 } else if (SP<c && c<DEL) {
/* write what is buffered unencoded */
5798 for (i=0;i<mimeout_buf_count;i++) {
5799 (*o_mputc)(mimeout_buf[i]);
5802 mimeout_buf_count = 0;
5805 if (c==SP || c==TAB || c==CR || c==LF) {
/* look for a printable character in the buffer */
5806 for (i=0;i<mimeout_buf_count;i++) {
5807 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5809 for (i=0;i<mimeout_buf_count;i++) {
5810 (*o_mputc)(mimeout_buf[i]);
5813 mimeout_buf_count = 0;
5816 mimeout_buf[mimeout_buf_count++] = (char)c;
5817 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5819 for (i=0;i<mimeout_buf_count;i++) {
5820 (*o_mputc)(mimeout_buf[i]);
5823 mimeout_buf_count = 0;
/* something is buffered and c is a printable character other than = */
5827 if (mimeout_buf_count>0 && SP<c && c!='=') {
5828 mimeout_buf[mimeout_buf_count++] = (char)c;
5829 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5830 j = mimeout_buf_count;
5831 mimeout_buf_count = 0;
5833 mimeout_addchar(mimeout_buf[i]);
5840 if (mimeout_buf_count>0) {
5841 j = mimeout_buf_count;
5842 mimeout_buf_count = 0;
5844 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
5846 mimeout_addchar(mimeout_buf[i]);
5852 (*o_mputc)(mimeout_buf[i]);
5854 open_mime(output_mode);
5861 #if defined(PERL_XS) || defined(WIN32DLL)
/*
 * State reset, compiled only for PERL_XS or WIN32DLL builds (the function
 * header is not part of these lines).  The visible statements put option
 * flags, converter hooks and decoder state back to fixed starting values:
 * flags go to FALSE or their DEFAULT_* / named constants, the o_*conv hooks
 * go to no_connection, the ungetc-style hooks go to std_ungetc, and the
 * buffers and counters are cleared.
 */
5865 struct input_code *p = input_code_list;
5878 mime_f = STRICT_MIME;
5879 mime_decode_f = FALSE;
5884 #if defined(MSDOS) || defined(__OS2__)
5889 iso2022jp_f = FALSE;
5890 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5891 ms_ucs_map_f = UCS_MAP_ASCII;
5893 #ifdef UTF8_INPUT_ENABLE
5894 no_cp932ext_f = FALSE;
5895 no_best_fit_chars_f = FALSE;
5896 encode_fallback = NULL;
5897 unicode_subchar = '?';
5898 input_endian = ENDIAN_BIG;
5900 #ifdef UTF8_OUTPUT_ENABLE
5901 output_bom_f = FALSE;
5902 output_endian = ENDIAN_BIG;
5904 #ifdef UNICODE_NORMALIZATION
5917 is_inputcode_mixed = FALSE;
5921 #ifdef SHIFTJIS_CP932
/* clear all 256 entries of prefix_table */
5931 for (i = 0; i < 256; i++){
5932 prefix_table[i] = 0;
5936 mimeout_buf_count = 0;
5941 fold_preserve_f = FALSE;
5944 kanji_intro = DEFAULT_J;
5945 ascii_intro = DEFAULT_R;
5946 fold_margin = FOLD_MARGIN;
5947 output_conv = DEFAULT_CONV;
5948 oconv = DEFAULT_CONV;
/* every optional output stage points at no_connection until it is wired up again */
5949 o_zconv = no_connection;
5950 o_fconv = no_connection;
5951 o_nlconv = no_connection;
5952 o_rot_conv = no_connection;
5953 o_hira_conv = no_connection;
5954 o_base64conv = no_connection;
5955 o_iso2022jp_check_conv = no_connection;
/* input hooks back to the std_* implementations */
5958 i_ungetc = std_ungetc;
5960 i_bungetc = std_ungetc;
5963 i_mungetc = std_ungetc;
5964 i_mgetc_buf = std_getc;
5965 i_mungetc_buf = std_ungetc;
5966 output_mode = ASCII;
5969 mime_decode_mode = FALSE;
5975 z_prev2=0,z_prev1=0;
5977 iconv_for_check = 0;
5979 input_codename = NULL;
/* Two-argument placeholder converter: forwards c2 and c1 to no_connection2 with 0 as the third argument. */
5986 void no_connection(nkf_char c2, nkf_char c1)
5988 no_connection2(c2,c1,0);
/* Three-argument placeholder converter: reports an internal module connection failure on stderr.  The arguments are not used in the visible lines; the return of 0 is marked as being there for lint. */
5991 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5993 fprintf(stderr,"nkf internal module connection failure.\n");
5995 return 0; /* LINT */
/* Route the fprintf calls that follow through dllprintf (the condition guarding this define is not part of these lines). */
6000 #define fprintf dllprintf
/*
 * Body of the usage message: every line below is written to stderr.
 * Exactly which lines appear depends on compile-time macros: one of the
 * DEFAULT_CODE_* variants selects the line naming the default output code,
 * and the UTF8_INPUT_ENABLE, UTF8_OUTPUT_ENABLE and NUMCHAR_OPTION sections
 * add their option descriptions.
 */
6004 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6005 fprintf(stderr,"Flags:\n");
6006 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
/* the next four sections differ only in where the DEFAULT marker is placed */
6007 #ifdef DEFAULT_CODE_SJIS
6008 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6010 #ifdef DEFAULT_CODE_JIS
6011 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6013 #ifdef DEFAULT_CODE_EUC
6014 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6016 #ifdef DEFAULT_CODE_UTF8
6017 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6019 #ifdef UTF8_OUTPUT_ENABLE
6020 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6022 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6023 #ifdef UTF8_INPUT_ENABLE
6024 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6026 fprintf(stderr,"t no conversion\n");
6027 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6028 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6029 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6030 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6031 fprintf(stderr,"v Show this usage. V: show version\n");
6032 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6033 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6034 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6035 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6036 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6037 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6038 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6039 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6040 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6042 fprintf(stderr,"T Text mode output\n");
6044 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6045 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6046 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6047 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6048 fprintf(stderr,"\n");
/* long option summary */
6049 fprintf(stderr,"Long name options\n");
6050 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6051 fprintf(stderr," Specify the input or output codeset\n");
6052 fprintf(stderr," --fj --unix --mac --windows\n");
6053 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6054 fprintf(stderr," Convert for the system or code\n");
6055 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6056 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6057 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
/* the doubled percent sign prints as a single one */
6059 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6061 #ifdef NUMCHAR_OPTION
6062 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6064 #ifdef UTF8_INPUT_ENABLE
6065 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6066 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6069 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6070 fprintf(stderr," Overwrite original listed files by filtered result\n");
6071 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6073 fprintf(stderr," -g --guess Guess the input code\n");
6074 fprintf(stderr," --help --version Show this help/the version\n");
6075 fprintf(stderr," For more information, see also man nkf\n");
6076 fprintf(stderr,"\n");
6082 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6083 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6086 #if defined(MSDOS) && defined(__WIN16__)
6089 #if defined(MSDOS) && defined(__WIN32__)
6095 ,NKF_VERSION,NKF_RELEASE_DATE);
6096 fprintf(stderr,"\n%s\n",CopyRight);