1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
 * nkf is currently maintained on SourceForge:
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.151 2007/12/06 20:13:58 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-12-07"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
43 #ifndef MIME_DECODE_DEFAULT
44 #define MIME_DECODE_DEFAULT STRICT_MIME
47 #define X0201_DEFAULT TRUE
50 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
52 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
68 #if defined(MSDOS) || defined(__OS2__)
71 #if defined(_MSC_VER) || defined(__WATCOMC__)
72 #define mktemp _mktemp
78 #define setbinmode(fp) fsetbin(fp)
79 #elif defined(__DJGPP__)
80 #include <libc/dosio.h>
81 #define setbinmode(fp) djgpp_setbinmode(fp)
82 #else /* Microsoft C, Turbo C */
83 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
86 #define setbinmode(fp)
89 #if defined(__DJGPP__)
90 void djgpp_setbinmode(FILE *fp)
92 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
95 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
96 __file_handle_set(fd, m);
100 #ifdef _IOFBF /* SysV and MSDOS, Windows */
101 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
103 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
106 /*Borland C++ 4.5 EasyWin*/
107 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
116 /* added by satoru@isoternet.org */
118 #include <sys/types.h>
120 #include <sys/stat.h>
121 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
123 #if defined(__WATCOMC__)
124 #include <sys/utime.h>
128 #else /* defined(MSDOS) */
130 #ifdef __BORLANDC__ /* BCC32 */
132 #else /* !defined(__BORLANDC__) */
133 #include <sys/utime.h>
134 #endif /* (__BORLANDC__) */
135 #else /* !defined(__WIN32__) */
136 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
137 #include <sys/utime.h>
138 #elif defined(__TURBOC__) /* BCC */
140 #elif defined(LSI_C) /* LSI C */
141 #endif /* (__WIN32__) */
149 /* state of output_mode and input_mode
165 #define X0213_1 0x284F
166 #define X0213_2 0x2850
168 /* Input Assumption */
173 #define LATIN1_INPUT 6
175 #define STRICT_MIME 8
180 #define JAPANESE_EUC 10
184 #define UTF8_INPUT 13
185 #define UTF16_INPUT 1015
186 #define UTF32_INPUT 1017
190 #define ENDIAN_BIG 1234
191 #define ENDIAN_LITTLE 4321
192 #define ENDIAN_2143 2143
193 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification and conversion helpers.
 *
 * Every use of a macro parameter is parenthesized so that an argument
 * which is itself an expression (e.g. `a | b` or `x ? y : z`) is treated
 * as a single operand.  Arguments are still evaluated more than once, so
 * callers must not pass expressions with side effects.
 *
 * SP, TAB, CR, LF, DEL and SS3 are expected to be defined elsewhere in
 * this file.
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit; any non-hex character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
#define is_eucg3(c2) ((((unsigned short)(c2)) >> 8) == SS3)
/* True for CR, LF and for characters strictly between SP and DEL other
   than ? = _ ( ) . and the double quote (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
/* Inclusive bounds used by is_ibmext_in_sjis() below. */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
/* Inclusive bounds of the CP932INV range (consumers are outside this view). */
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/* True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END].  The parameter
   is parenthesized so expression arguments are compared as a whole; it is
   evaluated twice, so avoid side effects in the argument. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
241 #define HOLD_SIZE 1024
242 #if defined(INT_IS_SHORT)
243 #define IOBUF_SIZE 2048
245 #define IOBUF_SIZE 16384
248 #define DEFAULT_J 'B'
249 #define DEFAULT_R 'B'
251 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
252 #define SJ6394 0x0161 /* 63 - 94 ku offset */
254 #define RANGE_NUM_MAX 18
259 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
260 #define sizeof_euc_to_utf8_1byte 94
261 #define sizeof_euc_to_utf8_2bytes 94
262 #define sizeof_utf8_to_euc_C2 64
263 #define sizeof_utf8_to_euc_E5B8 64
264 #define sizeof_utf8_to_euc_2bytes 112
265 #define sizeof_utf8_to_euc_3bytes 16
268 /* MIME preprocessor */
270 #ifdef EASYWIN /*Easy Win */
271 extern POINT _BufferSize;
280 void (*status_func)(struct input_code *, nkf_char);
281 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
285 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
288 static const char *CopyRight = COPY_RIGHT;
290 #if !defined(PERL_XS) && !defined(WIN32DLL)
291 static nkf_char noconvert(FILE *f);
293 static void module_connection(void);
294 static nkf_char kanji_convert(FILE *f);
295 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
296 static nkf_char push_hold_buf(nkf_char c2);
297 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
298 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
299 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
300 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
301 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
303 * 0: Shift_JIS, eucJP-ascii
308 #define UCS_MAP_ASCII 0
310 #define UCS_MAP_CP932 2
311 #define UCS_MAP_CP10001 3
312 static int ms_ucs_map_f = UCS_MAP_ASCII;
314 #ifdef UTF8_INPUT_ENABLE
315 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
316 static int no_cp932ext_f = FALSE;
317 /* ignore ZERO WIDTH NO-BREAK SPACE */
318 static int no_best_fit_chars_f = FALSE;
319 static int input_endian = ENDIAN_BIG;
320 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
321 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
322 static void encode_fallback_html(nkf_char c);
323 static void encode_fallback_xml(nkf_char c);
324 static void encode_fallback_java(nkf_char c);
325 static void encode_fallback_perl(nkf_char c);
326 static void encode_fallback_subchar(nkf_char c);
327 static void (*encode_fallback)(nkf_char c) = NULL;
328 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
329 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
330 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
331 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
332 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
333 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
334 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
335 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
336 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
337 static void w_status(struct input_code *, nkf_char);
339 #ifdef UTF8_OUTPUT_ENABLE
340 static int output_bom_f = FALSE;
341 static int output_endian = ENDIAN_BIG;
342 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
343 static void w_oconv(nkf_char c2,nkf_char c1);
344 static void w_oconv16(nkf_char c2,nkf_char c1);
345 static void w_oconv32(nkf_char c2,nkf_char c1);
347 static void e_oconv(nkf_char c2,nkf_char c1);
348 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
349 static void s_oconv(nkf_char c2,nkf_char c1);
350 static void j_oconv(nkf_char c2,nkf_char c1);
351 static void fold_conv(nkf_char c2,nkf_char c1);
352 static void nl_conv(nkf_char c2,nkf_char c1);
353 static void z_conv(nkf_char c2,nkf_char c1);
354 static void rot_conv(nkf_char c2,nkf_char c1);
355 static void hira_conv(nkf_char c2,nkf_char c1);
356 static void base64_conv(nkf_char c2,nkf_char c1);
357 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
358 static void no_connection(nkf_char c2,nkf_char c1);
359 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
361 static void code_score(struct input_code *ptr);
362 static void code_status(nkf_char c);
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_begin(FILE *f);
372 static nkf_char mime_getc(FILE *f);
373 static nkf_char mime_ungetc(nkf_char c,FILE *f);
375 static void switch_mime_getc(void);
376 static void unswitch_mime_getc(void);
377 static nkf_char mime_begin_strict(FILE *f);
378 static nkf_char mime_getc_buf(FILE *f);
379 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
380 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
382 static nkf_char base64decode(nkf_char c);
383 static void mime_prechar(nkf_char c2, nkf_char c1);
384 static void mime_putc(nkf_char c);
385 static void open_mime(nkf_char c);
386 static void close_mime(void);
387 static void eof_mime(void);
388 static void mimeout_addchar(nkf_char c);
390 static void usage(void);
391 static void version(void);
392 static void show_configuration(void);
394 static void options(unsigned char *c);
395 static void reinit(void);
399 #if !defined(PERL_XS) && !defined(WIN32DLL)
400 static unsigned char stdibuf[IOBUF_SIZE];
401 static unsigned char stdobuf[IOBUF_SIZE];
403 static unsigned char hold_buf[HOLD_SIZE*2];
404 static int hold_count = 0;
406 /* MIME preprocessor fifo */
408 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
409 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
410 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
411 static unsigned char mime_buf[MIME_BUF_SIZE];
412 static unsigned int mime_top = 0;
413 static unsigned int mime_last = 0; /* decoded */
414 static unsigned int mime_input = 0; /* undecoded */
415 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
418 static int unbuf_f = FALSE;
419 static int estab_f = FALSE;
420 static int nop_f = FALSE;
421 static int binmode_f = TRUE; /* binary mode */
422 static int rot_f = FALSE; /* rot14/43 mode */
423 static int hira_f = FALSE; /* hira/kata henkan */
424 static int input_f = FALSE; /* non fixed input code */
425 static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */
426 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
427 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
428 static int mimebuf_f = FALSE; /* MIME buffered input */
429 static int broken_f = FALSE; /* convert ESC-less broken JIS */
430 static int iso8859_f = FALSE; /* ISO8859 through */
431 static int mimeout_f = FALSE; /* base64 mode */
432 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
433 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
435 #ifdef UNICODE_NORMALIZATION
436 static int nfc_f = FALSE;
437 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
438 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
439 static nkf_char nfc_getc(FILE *f);
440 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
444 static int cap_f = FALSE;
445 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
446 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
447 static nkf_char cap_getc(FILE *f);
448 static nkf_char cap_ungetc(nkf_char c,FILE *f);
450 static int url_f = FALSE;
451 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
452 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
453 static nkf_char url_getc(FILE *f);
454 static nkf_char url_ungetc(nkf_char c,FILE *f);
457 #if defined(INT_IS_SHORT)
458 #define NKF_INT32_C(n) (n##L)
460 #define NKF_INT32_C(n) (n)
/*
 * Constants and predicates for values that carry a class tag in the top
 * eight bits (CLASS_MASK) and a payload in the low 24 bits (VALUE_MASK).
 * NKF_INT32_C is defined just above, depending on INT_IS_SHORT.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
/* True when the class bits of c equal CLASS_UNICODE.  The parameter is
   parenthesized so that arguments such as `a | b` are masked as a whole. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the payload bits of c do not exceed 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
470 #ifdef NUMCHAR_OPTION
471 static int numchar_f = FALSE;
472 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
473 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
474 static nkf_char numchar_getc(FILE *f);
475 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
479 static int noout_f = FALSE;
480 static void no_putc(nkf_char c);
481 static int debug_f = FALSE;
482 static void debug(const char *str);
483 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
486 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
488 static void print_guessed_code(char *filename);
490 static void set_input_codename(char *codename);
493 static int exec_f = 0;
496 #ifdef SHIFTJIS_CP932
497 /* invert IBM extended characters to others */
498 static int cp51932_f = FALSE;
500 /* invert NEC-selected IBM extended characters to IBM extended characters */
501 static int cp932inv_f = TRUE;
503 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
504 #endif /* SHIFTJIS_CP932 */
507 static int x0212_f = FALSE;
508 static nkf_char x0212_shift(nkf_char c);
509 static nkf_char x0212_unshift(nkf_char c);
511 static int x0213_f = FALSE;
513 static unsigned char prefix_table[256];
515 static void set_code_score(struct input_code *ptr, nkf_char score);
516 static void clr_code_score(struct input_code *ptr, nkf_char score);
517 static void status_disable(struct input_code *ptr);
518 static void status_push_ch(struct input_code *ptr, nkf_char c);
519 static void status_clear(struct input_code *ptr);
520 static void status_reset(struct input_code *ptr);
521 static void status_reinit(struct input_code *ptr);
522 static void status_check(struct input_code *ptr, nkf_char c);
523 static void e_status(struct input_code *, nkf_char);
524 static void s_status(struct input_code *, nkf_char);
526 struct input_code input_code_list[] = {
527 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
528 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
529 #ifdef UTF8_INPUT_ENABLE
530 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
531 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
532 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
537 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
538 static int base64_count = 0;
540 /* X0208 -> ASCII converter */
543 static int f_line = 0; /* chars in line */
544 static int f_prev = 0;
545 static int fold_preserve_f = FALSE; /* preserve new lines */
546 static int fold_f = FALSE;
547 static int fold_len = 0;
550 static unsigned char kanji_intro = DEFAULT_J;
551 static unsigned char ascii_intro = DEFAULT_R;
555 #define FOLD_MARGIN 10
556 #define DEFAULT_FOLD 60
558 static int fold_margin = FOLD_MARGIN;
562 #ifdef DEFAULT_CODE_JIS
563 # define DEFAULT_CONV j_oconv
565 #ifdef DEFAULT_CODE_SJIS
566 # define DEFAULT_CONV s_oconv
568 #ifdef DEFAULT_CODE_EUC
569 # define DEFAULT_CONV e_oconv
571 #ifdef DEFAULT_CODE_UTF8
572 # define DEFAULT_CONV w_oconv
575 /* process default */
576 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
578 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
579 /* s_iconv or oconv */
580 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
582 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
584 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
585 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
586 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
587 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
588 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
590 /* static redirections */
592 static void (*o_putc)(nkf_char c) = std_putc;
594 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
595 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
597 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
598 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
600 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
602 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
603 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
605 /* for strict mime */
606 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
607 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
610 static int output_mode = ASCII, /* output kanji mode */
611 input_mode = ASCII, /* input kanji mode */
612 shift_mode = FALSE; /* TRUE shift out, or X0201 */
613 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
615 /* X0201 / X0208 conversion tables */
617 /* X0201 kana conversion table */
619 static const unsigned char cv[]= {
620 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
621 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
622 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
623 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
624 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
625 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
626 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
627 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
628 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
629 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
630 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
631 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
632 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
633 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
634 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
635 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
/* X0201 kana conversion table for dakuten */
641 static const unsigned char dv[]= {
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
646 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
647 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
648 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
649 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
650 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
651 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
653 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
656 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
657 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
/* X0201 kana conversion table for han-dakuten */
662 static const unsigned char ev[]= {
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
674 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
677 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
678 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
682 /* X0208 kigou conversion table */
683 /* 0x8140 - 0x819e */
684 static const unsigned char fv[] = {
686 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
687 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
688 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
689 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
690 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
691 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
692 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
694 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
695 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
696 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
702 static int file_out_f = FALSE;
704 static int overwrite_f = FALSE;
705 static int preserve_time_f = FALSE;
706 static int backup_f = FALSE;
707 static char *backup_suffix = "";
708 static char *get_backup_filename(const char *suffix, const char *filename);
711 static int nlmode_f = 0; /* CR, LF, CRLF */
712 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
713 static nkf_char prev_cr = 0; /* CR or 0 */
714 #ifdef EASYWIN /*Easy Win */
715 static int end_check;
718 #define STD_GC_BUFSIZE (256)
719 nkf_char std_gc_buf[STD_GC_BUFSIZE];
723 #include "nkf32dll.c"
724 #elif defined(PERL_XS)
726 int main(int argc, char **argv)
731 char *outfname = NULL;
734 #ifdef EASYWIN /*Easy Win */
735 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
738 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
739 cp = (unsigned char *)*argv;
743 int debug_f_back = debug_f;
746 int exec_f_back = exec_f;
749 int x0212_f_back = x0212_f;
751 int x0213_f_back = x0213_f;
752 int guess_f_back = guess_f;
754 guess_f = guess_f_back;
757 debug_f = debug_f_back;
760 exec_f = exec_f_back;
763 x0212_f = x0212_f_back;
765 x0213_f = x0213_f_back;
770 if (pipe(fds) < 0 || (pid = fork()) < 0){
781 execvp(argv[1], &argv[1]);
796 if (binmode_f == TRUE)
797 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
798 if (freopen("","wb",stdout) == NULL)
805 setbuf(stdout, (char *) NULL);
807 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
810 if (binmode_f == TRUE)
811 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
812 if (freopen("","rb",stdin) == NULL) return (-1);
816 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
820 kanji_convert(stdin);
821 if (guess_f) print_guessed_code(NULL);
825 int is_argument_error = FALSE;
827 input_codename = NULL;
832 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
835 is_argument_error = TRUE;
843 /* reopen file for stdout */
844 if (file_out_f == TRUE) {
847 outfname = malloc(strlen(origfname)
848 + strlen(".nkftmpXXXXXX")
854 strcpy(outfname, origfname);
858 for (i = strlen(outfname); i; --i){
859 if (outfname[i - 1] == '/'
860 || outfname[i - 1] == '\\'){
866 strcat(outfname, "ntXXXXXX");
868 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
871 strcat(outfname, ".nkftmpXXXXXX");
872 fd = mkstemp(outfname);
875 || (fd_backup = dup(fileno(stdout))) < 0
876 || dup2(fd, fileno(stdout)) < 0
887 outfname = "nkf.out";
890 if(freopen(outfname, "w", stdout) == NULL) {
894 if (binmode_f == TRUE) {
895 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
896 if (freopen("","wb",stdout) == NULL)
903 if (binmode_f == TRUE)
904 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
905 if (freopen("","rb",fin) == NULL)
910 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
914 char *filename = NULL;
916 if (nfiles > 1) filename = origfname;
917 if (guess_f) print_guessed_code(filename);
923 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
931 if (dup2(fd_backup, fileno(stdout)) < 0){
934 if (stat(origfname, &sb)) {
935 fprintf(stderr, "Can't stat %s\n", origfname);
/* restore permissions */
938 if (chmod(outfname, sb.st_mode)) {
939 fprintf(stderr, "Can't set permission %s\n", outfname);
/* restore timestamps */
944 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
945 tb[0] = tb[1] = sb.st_mtime;
946 if (utime(outfname, tb)) {
947 fprintf(stderr, "Can't set timestamp %s\n", outfname);
950 tb.actime = sb.st_atime;
951 tb.modtime = sb.st_mtime;
952 if (utime(outfname, &tb)) {
953 fprintf(stderr, "Can't set timestamp %s\n", outfname);
958 char *backup_filename = get_backup_filename(backup_suffix, origfname);
960 unlink(backup_filename);
962 if (rename(origfname, backup_filename)) {
963 perror(backup_filename);
964 fprintf(stderr, "Can't rename %s to %s\n",
965 origfname, backup_filename);
969 if (unlink(origfname)){
974 if (rename(outfname, origfname)) {
976 fprintf(stderr, "Can't rename %s to %s\n",
977 outfname, origfname);
984 if (is_argument_error)
987 #ifdef EASYWIN /*Easy Win */
988 if (file_out_f == FALSE)
989 scanf("%d",&end_check);
992 #else /* for Other OS */
993 if (file_out_f == TRUE)
998 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from the pattern `suffix`.
 *
 * - If `suffix` contains one or more '*' characters, every '*' is replaced
 *   by `filename` and the other characters are copied unchanged
 *   (e.g. suffix "bk_*~", filename "a.c" -> "bk_a.c~").
 * - Otherwise `suffix` is appended to `filename`
 *   (e.g. suffix ".bak", filename "a.c" -> "a.c.bak").
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller must
 * free(), or NULL (after reporting through perror) when allocation fails.
 *
 * The no-asterisk branch previously called `malloc( + 1)`, i.e. allocated a
 * single byte before strcpy/strcat wrote the whole name into it; the buffer
 * is now sized for filename + suffix + terminator and the result is checked.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t i, j;

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* Each one-byte '*' expands to filename_length bytes. */
        backup_filename = malloc(suffix_length - asterisk_count
                                 + asterisk_count * filename_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; suffix[i]; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        backup_filename = malloc(filename_length + suffix_length + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        strcpy(backup_filename, filename);
        strcat(backup_filename, suffix);
    }
    return backup_filename;
}
1041 static const struct {
1065 {"katakana-hiragana","h3"},
1073 #ifdef UTF8_OUTPUT_ENABLE
1083 {"fb-subchar=", ""},
1085 #ifdef UTF8_INPUT_ENABLE
1086 {"utf8-input", "W"},
1087 {"utf16-input", "W16"},
1088 {"no-cp932ext", ""},
1089 {"no-best-fit-chars",""},
1091 #ifdef UNICODE_NORMALIZATION
1092 {"utf8mac-input", ""},
1104 #ifdef NUMCHAR_OPTION
1105 {"numchar-input", ""},
1111 #ifdef SHIFTJIS_CP932
1121 static int option_mode = 0;
1123 void options(unsigned char *cp)
1127 unsigned char *cp_back = NULL;
1132 while(*cp && *cp++!='-');
1133 while (*cp || cp_back) {
1141 case '-': /* literal options */
1142 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1146 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1147 p = (unsigned char *)long_option[i].name;
1148 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1149 if (*p == cp[j] || cp[j] == SP){
1156 fprintf(stderr, "unknown long option: --%s\n", cp);
1159 while(*cp && *cp != SP && cp++);
1160 if (long_option[i].alias[0]){
1162 cp = (unsigned char *)long_option[i].alias;
1164 if (strcmp(long_option[i].name, "ic=") == 0){
1165 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1166 codeset[i] = nkf_toupper(p[i]);
1169 if(strcmp(codeset, "ISO-2022-JP") == 0){
1170 input_f = JIS_INPUT;
1171 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1172 strcmp(codeset, "CP50220") == 0 ||
1173 strcmp(codeset, "CP50221") == 0 ||
1174 strcmp(codeset, "CP50222") == 0){
1175 input_f = JIS_INPUT;
1176 #ifdef SHIFTJIS_CP932
1179 #ifdef UTF8_OUTPUT_ENABLE
1180 ms_ucs_map_f = UCS_MAP_CP932;
1182 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1183 input_f = JIS_INPUT;
1187 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1188 input_f = JIS_INPUT;
1193 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1194 input_f = SJIS_INPUT;
1195 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1196 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1197 strcmp(codeset, "CP932") == 0 ||
1198 strcmp(codeset, "MS932") == 0){
1199 input_f = SJIS_INPUT;
1200 #ifdef SHIFTJIS_CP932
1203 #ifdef UTF8_OUTPUT_ENABLE
1204 ms_ucs_map_f = UCS_MAP_CP932;
1206 }else if(strcmp(codeset, "CP10001") == 0){
1207 input_f = SJIS_INPUT;
1208 #ifdef SHIFTJIS_CP932
1211 #ifdef UTF8_OUTPUT_ENABLE
1212 ms_ucs_map_f = UCS_MAP_CP10001;
1214 }else if(strcmp(codeset, "EUCJP") == 0 ||
1215 strcmp(codeset, "EUC-JP") == 0){
1216 input_f = EUC_INPUT;
1217 }else if(strcmp(codeset, "CP51932") == 0){
1218 input_f = EUC_INPUT;
1219 #ifdef SHIFTJIS_CP932
1222 #ifdef UTF8_OUTPUT_ENABLE
1223 ms_ucs_map_f = UCS_MAP_CP932;
1225 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1226 strcmp(codeset, "EUCJP-MS") == 0 ||
1227 strcmp(codeset, "EUCJPMS") == 0){
1228 input_f = EUC_INPUT;
1229 #ifdef SHIFTJIS_CP932
1232 #ifdef UTF8_OUTPUT_ENABLE
1233 ms_ucs_map_f = UCS_MAP_MS;
1235 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1236 strcmp(codeset, "EUCJP-ASCII") == 0){
1237 input_f = EUC_INPUT;
1238 #ifdef SHIFTJIS_CP932
1241 #ifdef UTF8_OUTPUT_ENABLE
1242 ms_ucs_map_f = UCS_MAP_ASCII;
1244 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1245 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1246 input_f = SJIS_INPUT;
1248 #ifdef SHIFTJIS_CP932
1251 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1252 strcmp(codeset, "EUC-JIS-2004") == 0){
1253 input_f = EUC_INPUT;
1255 #ifdef SHIFTJIS_CP932
1258 #ifdef UTF8_INPUT_ENABLE
1259 }else if(strcmp(codeset, "UTF-8") == 0 ||
1260 strcmp(codeset, "UTF-8N") == 0 ||
1261 strcmp(codeset, "UTF-8-BOM") == 0){
1262 input_f = UTF8_INPUT;
1263 #ifdef UNICODE_NORMALIZATION
1264 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1265 strcmp(codeset, "UTF-8-MAC") == 0){
1266 input_f = UTF8_INPUT;
1269 }else if(strcmp(codeset, "UTF-16") == 0 ||
1270 strcmp(codeset, "UTF-16BE") == 0 ||
1271 strcmp(codeset, "UTF-16BE-BOM") == 0){
1272 input_f = UTF16_INPUT;
1273 input_endian = ENDIAN_BIG;
1274 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1275 strcmp(codeset, "UTF-16LE-BOM") == 0){
1276 input_f = UTF16_INPUT;
1277 input_endian = ENDIAN_LITTLE;
1278 }else if(strcmp(codeset, "UTF-32") == 0 ||
1279 strcmp(codeset, "UTF-32BE") == 0 ||
1280 strcmp(codeset, "UTF-32BE-BOM") == 0){
1281 input_f = UTF32_INPUT;
1282 input_endian = ENDIAN_BIG;
1283 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1284 strcmp(codeset, "UTF-32LE-BOM") == 0){
1285 input_f = UTF32_INPUT;
1286 input_endian = ENDIAN_LITTLE;
1289 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1293 if (strcmp(long_option[i].name, "oc=") == 0){
1295 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1296 codeset[i] = nkf_toupper(p[i]);
1299 if(strcmp(codeset, "ISO-2022-JP") == 0){
1300 output_conv = j_oconv;
1301 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1302 output_conv = j_oconv;
1303 no_cp932ext_f = TRUE;
1304 #ifdef SHIFTJIS_CP932
1307 #ifdef UTF8_OUTPUT_ENABLE
1308 ms_ucs_map_f = UCS_MAP_CP932;
1310 }else if(strcmp(codeset, "CP50220") == 0){
1311 output_conv = j_oconv;
1313 #ifdef SHIFTJIS_CP932
1316 #ifdef UTF8_OUTPUT_ENABLE
1317 ms_ucs_map_f = UCS_MAP_CP932;
1319 }else if(strcmp(codeset, "CP50221") == 0){
1320 output_conv = j_oconv;
1321 #ifdef SHIFTJIS_CP932
1324 #ifdef UTF8_OUTPUT_ENABLE
1325 ms_ucs_map_f = UCS_MAP_CP932;
1327 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1328 output_conv = j_oconv;
1332 #ifdef SHIFTJIS_CP932
1335 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1336 output_conv = j_oconv;
1341 #ifdef SHIFTJIS_CP932
1344 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1345 output_conv = s_oconv;
1346 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1347 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1348 strcmp(codeset, "CP932") == 0 ||
1349 strcmp(codeset, "MS932") == 0){
1350 output_conv = s_oconv;
1351 #ifdef UTF8_OUTPUT_ENABLE
1352 ms_ucs_map_f = UCS_MAP_CP932;
1354 }else if(strcmp(codeset, "CP10001") == 0){
1355 output_conv = s_oconv;
1356 #ifdef UTF8_OUTPUT_ENABLE
1357 ms_ucs_map_f = UCS_MAP_CP10001;
1359 }else if(strcmp(codeset, "EUCJP") == 0 ||
1360 strcmp(codeset, "EUC-JP") == 0){
1361 output_conv = e_oconv;
1362 }else if(strcmp(codeset, "CP51932") == 0){
1363 output_conv = e_oconv;
1364 #ifdef SHIFTJIS_CP932
1367 #ifdef UTF8_OUTPUT_ENABLE
1368 ms_ucs_map_f = UCS_MAP_CP932;
1370 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1371 strcmp(codeset, "EUCJP-MS") == 0 ||
1372 strcmp(codeset, "EUCJPMS") == 0){
1373 output_conv = e_oconv;
1377 #ifdef UTF8_OUTPUT_ENABLE
1378 ms_ucs_map_f = UCS_MAP_MS;
1380 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1381 strcmp(codeset, "EUCJP-ASCII") == 0){
1382 output_conv = e_oconv;
1386 #ifdef UTF8_OUTPUT_ENABLE
1387 ms_ucs_map_f = UCS_MAP_ASCII;
1389 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1390 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1391 output_conv = s_oconv;
1393 #ifdef SHIFTJIS_CP932
1396 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1397 strcmp(codeset, "EUC-JIS-2004") == 0){
1398 output_conv = e_oconv;
1403 #ifdef SHIFTJIS_CP932
1406 #ifdef UTF8_OUTPUT_ENABLE
1407 }else if(strcmp(codeset, "UTF-8") == 0){
1408 output_conv = w_oconv;
1409 }else if(strcmp(codeset, "UTF-8N") == 0){
1410 output_conv = w_oconv;
1411 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1412 output_conv = w_oconv;
1413 output_bom_f = TRUE;
1414 }else if(strcmp(codeset, "UTF-16BE") == 0){
1415 output_conv = w_oconv16;
1416 }else if(strcmp(codeset, "UTF-16") == 0 ||
1417 strcmp(codeset, "UTF-16BE-BOM") == 0){
1418 output_conv = w_oconv16;
1419 output_bom_f = TRUE;
1420 }else if(strcmp(codeset, "UTF-16LE") == 0){
1421 output_conv = w_oconv16;
1422 output_endian = ENDIAN_LITTLE;
1423 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1424 output_conv = w_oconv16;
1425 output_endian = ENDIAN_LITTLE;
1426 output_bom_f = TRUE;
1427 }else if(strcmp(codeset, "UTF-32") == 0 ||
1428 strcmp(codeset, "UTF-32BE") == 0){
1429 output_conv = w_oconv32;
1430 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1431 output_conv = w_oconv32;
1432 output_bom_f = TRUE;
1433 }else if(strcmp(codeset, "UTF-32LE") == 0){
1434 output_conv = w_oconv32;
1435 output_endian = ENDIAN_LITTLE;
1436 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1437 output_conv = w_oconv32;
1438 output_endian = ENDIAN_LITTLE;
1439 output_bom_f = TRUE;
1442 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1446 if (strcmp(long_option[i].name, "guess=") == 0){
1455 if (strcmp(long_option[i].name, "overwrite") == 0){
1458 preserve_time_f = TRUE;
1461 if (strcmp(long_option[i].name, "overwrite=") == 0){
1464 preserve_time_f = TRUE;
1466 backup_suffix = malloc(strlen((char *) p) + 1);
1467 strcpy(backup_suffix, (char *) p);
1470 if (strcmp(long_option[i].name, "in-place") == 0){
1473 preserve_time_f = FALSE;
1476 if (strcmp(long_option[i].name, "in-place=") == 0){
1479 preserve_time_f = FALSE;
1481 backup_suffix = malloc(strlen((char *) p) + 1);
1482 strcpy(backup_suffix, (char *) p);
1487 if (strcmp(long_option[i].name, "cap-input") == 0){
1491 if (strcmp(long_option[i].name, "url-input") == 0){
1496 #ifdef NUMCHAR_OPTION
1497 if (strcmp(long_option[i].name, "numchar-input") == 0){
1503 if (strcmp(long_option[i].name, "no-output") == 0){
1507 if (strcmp(long_option[i].name, "debug") == 0){
1512 if (strcmp(long_option[i].name, "cp932") == 0){
1513 #ifdef SHIFTJIS_CP932
1517 #ifdef UTF8_OUTPUT_ENABLE
1518 ms_ucs_map_f = UCS_MAP_CP932;
1522 if (strcmp(long_option[i].name, "no-cp932") == 0){
1523 #ifdef SHIFTJIS_CP932
1527 #ifdef UTF8_OUTPUT_ENABLE
1528 ms_ucs_map_f = UCS_MAP_ASCII;
1532 #ifdef SHIFTJIS_CP932
1533 if (strcmp(long_option[i].name, "cp932inv") == 0){
1540 if (strcmp(long_option[i].name, "x0212") == 0){
1547 if (strcmp(long_option[i].name, "exec-in") == 0){
1551 if (strcmp(long_option[i].name, "exec-out") == 0){
1556 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1557 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1558 no_cp932ext_f = TRUE;
1561 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1562 no_best_fit_chars_f = TRUE;
1565 if (strcmp(long_option[i].name, "fb-skip") == 0){
1566 encode_fallback = NULL;
1569 if (strcmp(long_option[i].name, "fb-html") == 0){
1570 encode_fallback = encode_fallback_html;
1573 if (strcmp(long_option[i].name, "fb-xml") == 0){
1574 encode_fallback = encode_fallback_xml;
1577 if (strcmp(long_option[i].name, "fb-java") == 0){
1578 encode_fallback = encode_fallback_java;
1581 if (strcmp(long_option[i].name, "fb-perl") == 0){
1582 encode_fallback = encode_fallback_perl;
1585 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1586 encode_fallback = encode_fallback_subchar;
1589 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1590 encode_fallback = encode_fallback_subchar;
1591 unicode_subchar = 0;
1593 /* decimal number */
1594 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1595 unicode_subchar *= 10;
1596 unicode_subchar += hex2bin(p[i]);
1598 }else if(p[1] == 'x' || p[1] == 'X'){
1599 /* hexadecimal number */
1600 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1601 unicode_subchar <<= 4;
1602 unicode_subchar |= hex2bin(p[i]);
1606 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1607 unicode_subchar *= 8;
1608 unicode_subchar += hex2bin(p[i]);
1611 w16e_conv(unicode_subchar, &i, &j);
1612 unicode_subchar = i<<8 | j;
1616 #ifdef UTF8_OUTPUT_ENABLE
1617 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1618 ms_ucs_map_f = UCS_MAP_MS;
1622 #ifdef UNICODE_NORMALIZATION
1623 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1624 input_f = UTF8_INPUT;
1629 if (strcmp(long_option[i].name, "prefix=") == 0){
1630 if (nkf_isgraph(p[0])){
1631 for (i = 1; nkf_isgraph(p[i]); i++){
1632 prefix_table[p[i]] = p[0];
1639 case 'b': /* buffered mode */
1642 case 'u': /* non bufferd mode */
1645 case 't': /* transparent mode */
1650 } else if (*cp=='2') {
1654 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1662 case 'j': /* JIS output */
1664 output_conv = j_oconv;
1666 case 'e': /* AT&T EUC output */
1667 output_conv = e_oconv;
1670 case 's': /* SJIS output */
1671 output_conv = s_oconv;
1673 case 'l': /* ISO8859 Latin-1 support, no conversion */
1674 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1675 input_f = LATIN1_INPUT;
1677 case 'i': /* Kanji IN ESC-$-@/B */
1678 if (*cp=='@'||*cp=='B')
1679 kanji_intro = *cp++;
1681 case 'o': /* ASCII IN ESC-(-J/B */
1682 if (*cp=='J'||*cp=='B'||*cp=='H')
1683 ascii_intro = *cp++;
1687 bit:1 katakana->hiragana
1688 bit:2 hiragana->katakana
1690 if ('9'>= *cp && *cp>='0')
1691 hira_f |= (*cp++ -'0');
1698 #if defined(MSDOS) || defined(__OS2__)
1705 show_configuration();
1713 #ifdef UTF8_OUTPUT_ENABLE
1714 case 'w': /* UTF-8 output */
1716 output_conv = w_oconv; cp++;
1720 output_bom_f = TRUE;
1723 if ('1'== cp[0] && '6'==cp[1]) {
1724 output_conv = w_oconv16; cp+=2;
1725 } else if ('3'== cp[0] && '2'==cp[1]) {
1726 output_conv = w_oconv32; cp+=2;
1728 output_conv = w_oconv;
1733 output_endian = ENDIAN_LITTLE;
1734 } else if (cp[0] == 'B') {
1742 output_bom_f = TRUE;
1747 #ifdef UTF8_INPUT_ENABLE
1748 case 'W': /* UTF input */
1751 input_f = UTF8_INPUT;
1753 if ('1'== cp[0] && '6'==cp[1]) {
1755 input_f = UTF16_INPUT;
1756 input_endian = ENDIAN_BIG;
1757 } else if ('3'== cp[0] && '2'==cp[1]) {
1759 input_f = UTF32_INPUT;
1760 input_endian = ENDIAN_BIG;
1762 input_f = UTF8_INPUT;
1767 input_endian = ENDIAN_LITTLE;
1768 } else if (cp[0] == 'B') {
1774 /* Input code assumption */
1775 case 'J': /* JIS input */
1776 input_f = JIS_INPUT;
1778 case 'E': /* AT&T EUC input */
1779 input_f = EUC_INPUT;
1781 case 'S': /* MS Kanji input */
1782 input_f = SJIS_INPUT;
1784 case 'Z': /* Convert X0208 alphabet to asii */
1786 bit:0 Convert JIS X 0208 Alphabet to ASCII
1787 bit:1 Convert Kankaku to one space
1788 bit:2 Convert Kankaku to two spaces
1789 bit:3 Convert HTML Entity
1790 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1792 while ('0'<= *cp && *cp <='9') {
1793 alpha_f |= 1 << (*cp++ - '0');
1795 if (!alpha_f) alpha_f = 1;
1797 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1798 x0201_f = FALSE; /* No X0201->X0208 conversion */
1800 ESC-(-I in JIS, EUC, MS Kanji
1801 SI/SO in JIS, EUC, MS Kanji
1802 SSO in EUC, JIS, not in MS Kanji
1803 MS Kanji (0xa0-0xdf)
1805 ESC-(-I in JIS (0x20-0x5f)
1806 SSO in EUC (0xa0-0xdf)
1807 0xa0-0xd in MS Kanji (0xa0-0xdf)
1810 case 'X': /* Convert X0201 kana to X0208 */
1813 case 'F': /* prserve new lines */
1814 fold_preserve_f = TRUE;
1815 case 'f': /* folding -f60 or -f */
1818 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1820 fold_len += *cp++ - '0';
1822 if (!(0<fold_len && fold_len<BUFSIZ))
1823 fold_len = DEFAULT_FOLD;
1827 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1829 fold_margin += *cp++ - '0';
1833 case 'm': /* MIME support */
1834 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1835 if (*cp=='B'||*cp=='Q') {
1836 mime_decode_mode = *cp++;
1837 mimebuf_f = FIXED_MIME;
1838 } else if (*cp=='N') {
1839 mime_f = TRUE; cp++;
1840 } else if (*cp=='S') {
1841 mime_f = STRICT_MIME; cp++;
1842 } else if (*cp=='0') {
1843 mime_decode_f = FALSE;
1844 mime_f = FALSE; cp++;
1847 case 'M': /* MIME output */
1850 mimeout_f = FIXED_MIME; cp++;
1851 } else if (*cp=='Q') {
1853 mimeout_f = FIXED_MIME; cp++;
1858 case 'B': /* Broken JIS support */
1860 bit:1 allow any x on ESC-(-x or ESC-$-x
1861 bit:2 reset to ascii on NL
1863 if ('9'>= *cp && *cp>='0')
1864 broken_f |= 1<<(*cp++ -'0');
1869 case 'O':/* for Output file */
1873 case 'c':/* add cr code */
1876 case 'd':/* delete cr code */
1879 case 'I': /* ISO-2022-JP output */
1882 case 'L': /* line mode */
1883 if (*cp=='u') { /* unix */
1884 nlmode_f = LF; cp++;
1885 } else if (*cp=='m') { /* mac */
1886 nlmode_f = CR; cp++;
1887 } else if (*cp=='w') { /* windows */
1888 nlmode_f = CRLF; cp++;
1889 } else if (*cp=='0') { /* no conversion */
1898 } else if (*cp == '0') {
1907 /* module muliple options in a string are allowed for Perl moudle */
1908 while(*cp && *cp++!='-');
1911 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
1912 /* bogus option but ignored */
/*
 * find_inputcode_byfunc: search input_code_list for the entry whose
 * iconv_func pointer equals the given converter function.
 *
 * iconv_func: converter function to look for.
 *
 * NOTE(review): the loop and the return statements are not visible in this
 * excerpt; presumably the matching entry is returned, and a null pointer
 * when nothing matches -- confirm against the full source.
 */
1918 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1921 struct input_code *p = input_code_list;
1923 if (iconv_func == p->iconv_func){
/*
 * set_iconv: update the input-converter selection state.
 *
 * f:          establishment flag; the special value -TRUE means "FORCE"
 *             (only consulted that way when INPUT_CODE_FIX is defined).
 * iconv_func: converter function being selected.
 *
 * Once estab_f is set and the current iconv differs from iconv_for_check,
 * the matching input_code entry is looked up, its name is recorded with
 * set_input_codename(), and iconv is remembered in iconv_for_check.
 *
 * NOTE(review): the statements that assign estab_f and iconv are not
 * visible in this excerpt -- confirm their exact conditions.
 */
1932 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1934 #ifdef INPUT_CODE_FIX
1942 #ifdef INPUT_CODE_FIX
1943 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1949 if (estab_f && iconv_for_check != iconv){
1950 struct input_code *p = find_inputcode_byfunc(iconv);
1952 set_input_codename(p->name);
1955 iconv_for_check = iconv;
/*
 * Score flags collected per candidate input encoding (struct input_code).
 * Each flag is the previous one shifted left by one bit, so they can be
 * OR-ed together in a candidate's score field.
 */
1960 #define SCORE_L2 (1) /* JIS level-2 kanji */
1961 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1962 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1963 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped via CP932 (IBM extended characters) */
1964 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
1965 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* nonexistent characters */
1966 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1967 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Initial score given to a candidate by status_reset(). */
1969 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score flags looked up by code_score() with index (c2 & 0x0f) when
 * (c2 & 0x70) == 0x20.
 * NOTE(review): the first rows of the initializer and the closing brace
 * are not visible in this excerpt.
 */
1971 static const char score_table_A0[] = {
1974 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1975 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score flags looked up by code_score() with index (c2 & 0x0f) when
 * (c2 & 0x70) == 0x70.
 */
1978 static const char score_table_F0[] = {
1979 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1980 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1981 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
1982 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score: OR the given score flag(s) into ptr->score.
 * NOTE(review): any guard around the assignment is not visible in this
 * excerpt.
 */
1985 void set_code_score(struct input_code *ptr, nkf_char score)
1988 ptr->score |= score;
/*
 * clr_code_score: clear the given score flag(s) from ptr->score.
 * NOTE(review): any guard around the assignment is not visible in this
 * excerpt.
 */
1992 void clr_code_score(struct input_code *ptr, nkf_char score)
1995 ptr->score &= ~score;
/*
 * code_score: classify the character currently held in ptr->buf and add
 * the corresponding score flag to the candidate.
 *
 * The branches are tested in order on the first buffered value c2:
 *   - (condition not visible in this excerpt) -> SCORE_ERROR
 *   - c2 == SSO                               -> SCORE_KANA
 *   - c2 == 0x8f                              -> SCORE_X0212
 *   - e2w_conv(c2, c1) returns 0 (only with UTF8_OUTPUT_ENABLE)
 *                                             -> SCORE_NO_EXIST
 *   - (c2 & 0x70) == 0x20                     -> score_table_A0[c2 & 0x0f]
 *   - (c2 & 0x70) == 0x70                     -> score_table_F0[c2 & 0x0f]
 *   - (c2 & 0x70) >= 0x50                     -> SCORE_L2
 */
1999 void code_score(struct input_code *ptr)
2001 nkf_char c2 = ptr->buf[0];
2002 #ifdef UTF8_OUTPUT_ENABLE
2003 nkf_char c1 = ptr->buf[1];
2006 set_code_score(ptr, SCORE_ERROR);
2007 }else if (c2 == SSO){
2008 set_code_score(ptr, SCORE_KANA);
2009 }else if (c2 == 0x8f){
2010 set_code_score(ptr, SCORE_X0212);
2011 #ifdef UTF8_OUTPUT_ENABLE
2012 }else if (!e2w_conv(c2, c1)){
2013 set_code_score(ptr, SCORE_NO_EXIST);
2015 }else if ((c2 & 0x70) == 0x20){
2016 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2017 }else if ((c2 & 0x70) == 0x70){
2018 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2019 }else if ((c2 & 0x70) >= 0x50){
2020 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: called when the input cannot belong to this candidate
 * encoding. If the active converter is this candidate's iconv_func, the
 * selection is reset with set_iconv(FALSE, 0).
 * NOTE(review): the statements that mark ptr itself as disabled are not
 * visible in this excerpt.
 */
2024 void status_disable(struct input_code *ptr)
2029 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: append c to the candidate's buffer (ptr->buf) and
 * advance ptr->index. No bounds check is visible here.
 */
2032 void status_push_ch(struct input_code *ptr, nkf_char c)
2034 ptr->buf[ptr->index++] = c;
/*
 * status_clear:
 * NOTE(review): the body is not visible in this excerpt; judging by the
 * name it presumably clears the per-character detection state of ptr --
 * confirm against the full source.
 */
2037 void status_clear(struct input_code *ptr)
/*
 * status_reset: put the candidate's score back to SCORE_INIT.
 * NOTE(review): other statements of the body are not visible in this
 * excerpt.
 */
2043 void status_reset(struct input_code *ptr)
2046 ptr->score = SCORE_INIT;
/*
 * status_reinit: zero the candidate's _file_stat field.
 * NOTE(review): other statements of the body are not visible in this
 * excerpt.
 */
2049 void status_reinit(struct input_code *ptr)
2052 ptr->_file_stat = 0;
/*
 * status_check: acts only when c is at most DEL and an input encoding has
 * already been established (estab_f).
 * NOTE(review): the action taken under that condition is not visible in
 * this excerpt.
 */
2055 void status_check(struct input_code *ptr, nkf_char c)
2057 if (c <= DEL && estab_f){
/*
 * s_status: feed one input value c to the Shift_JIS candidate ptr.
 *
 * Visible handling of a first byte:
 *   - 0xa1..0xdf is stored as SSO followed by the byte itself;
 *   - 0x81..0x9f, 0xe0..0xea, 0xed..0xee, IBM-extension lead bytes
 *     (is_ibmext_in_sjis, SHIFTJIS_CP932 only) and 0xf0..0xfc are pushed
 *     to the buffer;
 *   - anything else disables the candidate.
 * Visible handling of a second byte: it must be 0x40..0x7e or 0x80..0xfc,
 * otherwise the candidate is disabled; an accepted pair is converted in
 * place by s2e_conv(), and some paths add SCORE_CP932.
 *
 * NOTE(review): the switch/case lines and the state assignments are not
 * visible in this excerpt, so which state each branch belongs to must be
 * confirmed against the full source.
 */
2062 void s_status(struct input_code *ptr, nkf_char c)
2066 status_check(ptr, c);
2071 #ifdef NUMCHAR_OPTION
2072 }else if (is_unicode_capsule(c)){
2075 }else if (0xa1 <= c && c <= 0xdf){
2076 status_push_ch(ptr, SSO);
2077 status_push_ch(ptr, c);
2080 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2082 status_push_ch(ptr, c);
2083 }else if (0xed <= c && c <= 0xee){
2085 status_push_ch(ptr, c);
2086 #ifdef SHIFTJIS_CP932
2087 }else if (is_ibmext_in_sjis(c)){
2089 status_push_ch(ptr, c);
2090 #endif /* SHIFTJIS_CP932 */
2092 }else if (0xf0 <= c && c <= 0xfc){
2094 status_push_ch(ptr, c);
2095 #endif /* X0212_ENABLE */
2097 status_disable(ptr);
2101 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2102 status_push_ch(ptr, c);
2103 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2107 status_disable(ptr);
2111 #ifdef SHIFTJIS_CP932
2112 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2113 status_push_ch(ptr, c);
2114 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2115 set_code_score(ptr, SCORE_CP932);
2120 #endif /* SHIFTJIS_CP932 */
2121 status_disable(ptr);
2124 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2125 status_push_ch(ptr, c);
2126 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2127 set_code_score(ptr, SCORE_CP932);
2130 status_disable(ptr);
/*
 * e_status: feed one input value c to the EUC candidate ptr.
 *
 * Visible handling of a first byte: SSO or 0xa1..0xfe is pushed to the
 * buffer, 0x8f is pushed as well (that branch ends at an
 * "#endif X0212_ENABLE" marker), anything else disables the candidate.
 * Following bytes must be 0xa1..0xfe, otherwise the candidate is disabled.
 *
 * NOTE(review): the switch/case lines and the state assignments are not
 * visible in this excerpt -- confirm the state transitions against the
 * full source.
 */
2136 void e_status(struct input_code *ptr, nkf_char c)
2140 status_check(ptr, c);
2145 #ifdef NUMCHAR_OPTION
2146 }else if (is_unicode_capsule(c)){
2149 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2151 status_push_ch(ptr, c);
2153 }else if (0x8f == c){
2155 status_push_ch(ptr, c);
2156 #endif /* X0212_ENABLE */
2158 status_disable(ptr);
2162 if (0xa1 <= c && c <= 0xfe){
2163 status_push_ch(ptr, c);
2167 status_disable(ptr);
2172 if (0xa1 <= c && c <= 0xfe){
2174 status_push_ch(ptr, c);
2176 status_disable(ptr);
2178 #endif /* X0212_ENABLE */
2182 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: feed one input value c to the UTF-8 candidate ptr.
 *
 * Visible handling of a first byte: 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4
 * are pushed to the buffer as lead bytes; anything else disables the
 * candidate. Continuation bytes must be 0x80..0xbf, otherwise the
 * candidate is disabled. When the buffered sequence is complete
 * (ptr->index > ptr->stat) it is checked against EF BB BF (the "bom"
 * flag) and converted in place with w2e_conv().
 *
 * NOTE(review): the switch/case lines, the state assignments and the use
 * made of "bom" are not visible in this excerpt.
 */
2183 void w_status(struct input_code *ptr, nkf_char c)
2187 status_check(ptr, c);
2192 #ifdef NUMCHAR_OPTION
2193 }else if (is_unicode_capsule(c)){
2196 }else if (0xc0 <= c && c <= 0xdf){
2198 status_push_ch(ptr, c);
2199 }else if (0xe0 <= c && c <= 0xef){
2201 status_push_ch(ptr, c);
2202 }else if (0xf0 <= c && c <= 0xf4){
2204 status_push_ch(ptr, c);
2206 status_disable(ptr);
2211 if (0x80 <= c && c <= 0xbf){
2212 status_push_ch(ptr, c);
2213 if (ptr->index > ptr->stat){
2214 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2215 && ptr->buf[2] == 0xbf);
2216 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2217 &ptr->buf[0], &ptr->buf[1]);
2224 status_disable(ptr);
2228 if (0x80 <= c && c <= 0xbf){
2229 if (ptr->index < ptr->stat){
2230 status_push_ch(ptr, c);
2235 status_disable(ptr);
/*
 * code_status: feed one input value c to the candidates in
 * input_code_list by calling each entry's status_func.
 *
 * When a candidate has been picked as "result" and no encoding is
 * established yet, its converter is selected with set_iconv(TRUE, ...).
 * Otherwise, when c is at most DEL, the list is walked once more.
 *
 * NOTE(review): the loop headers, the rule that picks "result" (it
 * involves p->stat == 0 and action_flag) and the body of the second walk
 * are not visible in this excerpt.
 */
2242 void code_status(nkf_char c)
2244 int action_flag = 1;
2245 struct input_code *result = 0;
2246 struct input_code *p = input_code_list;
2248 if (!p->status_func) {
2252 if (!p->status_func)
2254 (p->status_func)(p, c);
2257 }else if(p->stat == 0){
2268 if (result && !estab_f){
2269 set_iconv(TRUE, result->iconv_func);
2270 }else if (c <= DEL){
2271 struct input_code *ptr = input_code_list;
/*
 * std_getc: read one value for stream f. The visible path pops the most
 * recently pushed-back value from std_gc_buf (see std_ungetc).
 * NOTE(review): the condition guarding that path and the fallback read
 * from f are not visible in this excerpt.
 */
2281 nkf_char std_getc(FILE *f)
2284 return std_gc_buf[--std_gc_ndx];
/*
 * std_ungetc: push c back by storing it at the top of std_gc_buf.
 * A full buffer (std_gc_ndx == STD_GC_BUFSIZE) is detected first.
 * NOTE(review): what happens when the buffer is full and the return
 * value are not visible in this excerpt.
 */
2290 nkf_char std_ungetc(nkf_char c, FILE *f)
2292 if (std_gc_ndx == STD_GC_BUFSIZE){
2295 std_gc_buf[std_gc_ndx++] = c;
/*
 * std_putc:
 * NOTE(review): the body is not visible in this excerpt; judging by the
 * name it presumably writes c to the standard output path -- confirm
 * against the full source.
 */
2300 void std_putc(nkf_char c)
2307 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * noconvert: set up the filter chain with module_connection() and then
 * read values from f through i_getc until EOF.
 * NOTE(review): the loop body and the return value are not visible in
 * this excerpt.
 */
2308 nkf_char noconvert(FILE *f)
2313 module_connection();
2314 while ((c = (*i_getc)(f)) != EOF)
/*
 * module_connection: wire up the output and input filter chains according
 * to the option flags.
 *
 * Output side: oconv starts as output_conv; each enabled filter saves the
 * current oconv in its own o_* pointer and installs itself as the new
 * oconv, so the filter installed last runs first.
 * Input side: i_getc/i_ungetc are wrapped the same way by the enabled
 * input filters.
 * Finally the input converter is chosen from input_f: a known input
 * encoding is forced with set_iconv(-TRUE, ...); the call
 * set_iconv(FALSE, e_iconv) is also visible, presumably as the default
 * when no encoding was requested -- confirm.
 *
 * NOTE(review): several of the "if" conditions guarding the filters are
 * not visible in this excerpt.
 */
2321 void module_connection(void)
2323 oconv = output_conv;
2326 /* replace continuation module, from output side */
2328 /* output redirection */
2330 if (noout_f || guess_f){
2337 if (mimeout_f == TRUE) {
2338 o_base64conv = oconv; oconv = base64_conv;
2340 /* base64_count = 0; */
2343 if (nlmode_f || guess_f) {
2344 o_nlconv = oconv; oconv = nl_conv;
2347 o_rot_conv = oconv; oconv = rot_conv;
2350 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2353 o_hira_conv = oconv; oconv = hira_conv;
2356 o_fconv = oconv; oconv = fold_conv;
2359 if (alpha_f || x0201_f) {
2360 o_zconv = oconv; oconv = z_conv;
2364 i_ungetc = std_ungetc;
2365 /* input redirection */
2368 i_cgetc = i_getc; i_getc = cap_getc;
2369 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2372 i_ugetc = i_getc; i_getc = url_getc;
2373 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2376 #ifdef NUMCHAR_OPTION
2378 i_ngetc = i_getc; i_getc = numchar_getc;
2379 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2382 #ifdef UNICODE_NORMALIZATION
2383 if (nfc_f && input_f == UTF8_INPUT){
2384 i_nfc_getc = i_getc; i_getc = nfc_getc;
2385 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2388 if (mime_f && mimebuf_f==FIXED_MIME) {
2389 i_mgetc = i_getc; i_getc = mime_getc;
2390 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2393 i_bgetc = i_getc; i_getc = broken_getc;
2394 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2396 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2397 set_iconv(-TRUE, e_iconv);
2398 } else if (input_f == SJIS_INPUT) {
2399 set_iconv(-TRUE, s_iconv);
2400 #ifdef UTF8_INPUT_ENABLE
2401 } else if (input_f == UTF8_INPUT) {
2402 set_iconv(-TRUE, w_iconv);
2403 } else if (input_f == UTF16_INPUT) {
2404 set_iconv(-TRUE, w_iconv16);
2405 } else if (input_f == UTF32_INPUT) {
2406 set_iconv(-TRUE, w_iconv32);
2409 set_iconv(FALSE, e_iconv);
2413 struct input_code *p = input_code_list;
2421 * Check and Ignore BOM
/*
 * check_bom: look for a byte order mark at the start of f.
 *
 * Up to four bytes are read with i_getc and compared with these marks:
 *   00 00 FE FF   -> w_iconv32, input_endian = ENDIAN_BIG
 *   00 00 FF FE   -> w_iconv32, input_endian = ENDIAN_2143
 *   EF BB BF      -> w_iconv
 *   FE FF 00 00   -> w_iconv32, input_endian = ENDIAN_3412
 *   FE FF         -> w_iconv16, input_endian = ENDIAN_BIG
 *   FF FE 00 00   -> w_iconv32, input_endian = ENDIAN_LITTLE
 *   FF FE         -> w_iconv16, input_endian = ENDIAN_LITTLE
 * On a mismatch the bytes read so far are handed back with i_ungetc,
 * last byte first, so the stream is left as it was.
 *
 * NOTE(review): the "case" labels, the conditions guarding the
 * set_iconv(TRUE, ...) calls and the statements that follow a successful
 * match are not visible in this excerpt.
 */
2423 void check_bom(FILE *f)
2426 switch(c2 = (*i_getc)(f)){
2428 if((c2 = (*i_getc)(f)) == 0x00){
2429 if((c2 = (*i_getc)(f)) == 0xFE){
2430 if((c2 = (*i_getc)(f)) == 0xFF){
2432 set_iconv(TRUE, w_iconv32);
2434 if (iconv == w_iconv32) {
2435 input_endian = ENDIAN_BIG;
2438 (*i_ungetc)(0xFF,f);
2439 }else (*i_ungetc)(c2,f);
2440 (*i_ungetc)(0xFE,f);
2441 }else if(c2 == 0xFF){
2442 if((c2 = (*i_getc)(f)) == 0xFE){
2444 set_iconv(TRUE, w_iconv32);
2446 if (iconv == w_iconv32) {
2447 input_endian = ENDIAN_2143;
2450 (*i_ungetc)(0xFF,f);
2451 }else (*i_ungetc)(c2,f);
2452 (*i_ungetc)(0xFF,f);
2453 }else (*i_ungetc)(c2,f);
2454 (*i_ungetc)(0x00,f);
2455 }else (*i_ungetc)(c2,f);
2456 (*i_ungetc)(0x00,f);
2459 if((c2 = (*i_getc)(f)) == 0xBB){
2460 if((c2 = (*i_getc)(f)) == 0xBF){
2462 set_iconv(TRUE, w_iconv);
2464 if (iconv == w_iconv) {
2467 (*i_ungetc)(0xBF,f);
2468 }else (*i_ungetc)(c2,f);
2469 (*i_ungetc)(0xBB,f);
2470 }else (*i_ungetc)(c2,f);
2471 (*i_ungetc)(0xEF,f);
2474 if((c2 = (*i_getc)(f)) == 0xFF){
2475 if((c2 = (*i_getc)(f)) == 0x00){
2476 if((c2 = (*i_getc)(f)) == 0x00){
2478 set_iconv(TRUE, w_iconv32);
2480 if (iconv == w_iconv32) {
2481 input_endian = ENDIAN_3412;
2484 (*i_ungetc)(0x00,f);
2485 }else (*i_ungetc)(c2,f);
2486 (*i_ungetc)(0x00,f);
2487 }else (*i_ungetc)(c2,f);
2489 set_iconv(TRUE, w_iconv16);
2491 if (iconv == w_iconv16) {
2492 input_endian = ENDIAN_BIG;
2495 (*i_ungetc)(0xFF,f);
2496 }else (*i_ungetc)(c2,f);
2497 (*i_ungetc)(0xFE,f);
2500 if((c2 = (*i_getc)(f)) == 0xFE){
2501 if((c2 = (*i_getc)(f)) == 0x00){
2502 if((c2 = (*i_getc)(f)) == 0x00){
2504 set_iconv(TRUE, w_iconv32);
2506 if (iconv == w_iconv32) {
2507 input_endian = ENDIAN_LITTLE;
2510 (*i_ungetc)(0x00,f);
2511 }else (*i_ungetc)(c2,f);
2512 (*i_ungetc)(0x00,f);
2513 }else (*i_ungetc)(c2,f);
2515 set_iconv(TRUE, w_iconv16);
2517 if (iconv == w_iconv16) {
2518 input_endian = ENDIAN_LITTLE;
2521 (*i_ungetc)(0xFE,f);
2522 }else (*i_ungetc)(c2,f);
2523 (*i_ungetc)(0xFF,f);
2532 Conversion main loop. Code detection only.
/*
 * kanji_convert: main conversion loop for one input stream f.
 *
 * After module_connection() has wired the filter chains, values are read
 * through i_getc until EOF. Inside the loop the visible code:
 *   - handles 8-bit first bytes, calling h_conv() while no encoding is
 *     established yet;
 *   - reads whole UTF-16 / UTF-32 code units when the active converter
 *     is w_iconv16 / w_iconv32, honouring input_endian;
 *   - recognises SI, SO and ESC sequences that switch input_mode and
 *     shift_mode, and MIME "=?" starts;
 *   - hands completed characters to (*iconv)() or directly to (*oconv)().
 * After the loop (*iconv)(EOF, 0, 0) is called and, when no input code
 * name has been recorded, the candidate with the smallest score in
 * input_code_list is reported through set_input_codename().
 *
 * The local macros NEXT, SEND and LAST stand for "continue", an empty
 * statement and "break".
 *
 * NOTE(review): many lines of this function (braces, labels, several
 * comment delimiters, the return statements) are not visible in this
 * excerpt, so the summary above covers only what can be read here.
 */
2535 nkf_char kanji_convert(FILE *f)
2537 nkf_char c3, c2=0, c1, c0=0;
2538 int is_8bit = FALSE;
2540 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2541 #ifdef UTF8_INPUT_ENABLE
2542 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2549 output_mode = ASCII;
2552 #define NEXT continue /* no output, get next */
2553 #define SEND ; /* output c1 and c2, get next */
2554 #define LAST break /* end of loop, go closing */
2556 module_connection();
2559 while ((c1 = (*i_getc)(f)) != EOF) {
2560 #ifdef INPUT_CODE_FIX
2566 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2567 /* in case of 8th bit is on */
2568 if (!estab_f&&!mime_decode_mode) {
2569 /* in case of not established yet */
2570 /* It is still ambiguous */
2571 if (h_conv(f, c2, c1)==EOF)
2577 /* in case of already established */
2579 /* ignore bogus code and not CP5022x UCD */
2587 /* second byte, 7 bit code */
2588 /* it might be kanji shifted */
2589 if ((c1 == DEL) || (c1 <= SP)) {
2590 /* ignore bogus first code */
2597 #ifdef UTF8_INPUT_ENABLE
2598 if (iconv == w_iconv16) {
2599 if (input_endian == ENDIAN_BIG) {
2601 if ((c1 = (*i_getc)(f)) != EOF) {
2602 if (0xD8 <= c2 && c2 <= 0xDB) {
2603 if ((c0 = (*i_getc)(f)) != EOF) {
2605 if ((c3 = (*i_getc)(f)) != EOF) {
2612 if ((c2 = (*i_getc)(f)) != EOF) {
2613 if (0xD8 <= c2 && c2 <= 0xDB) {
2614 if ((c3 = (*i_getc)(f)) != EOF) {
2615 if ((c0 = (*i_getc)(f)) != EOF) {
2624 } else if(iconv == w_iconv32){
2626 if((c2 = (*i_getc)(f)) != EOF &&
2627 (c1 = (*i_getc)(f)) != EOF &&
2628 (c0 = (*i_getc)(f)) != EOF){
2629 switch(input_endian){
2631 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2634 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2637 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2640 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2650 #ifdef NUMCHAR_OPTION
2651 if (is_unicode_capsule(c1)){
2655 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2657 if (!estab_f && !iso8859_f) {
2658 /* not established yet */
2661 } else { /* estab_f==TRUE */
2666 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2667 /* SJIS X0201 Case... */
2668 if (iso2022jp_f && !x0201_f) {
2669 (*oconv)(GETA1, GETA2);
2676 } else if (c1==SSO && iconv != s_iconv) {
2677 /* EUC X0201 Case */
2678 c1 = (*i_getc)(f); /* skip SSO */
2680 if (SSP<=c1 && c1<0xe0) {
2681 if (iso2022jp_f && !x0201_f) {
2682 (*oconv)(GETA1, GETA2);
2689 } else { /* bogus code, skip SSO and one byte */
2692 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2693 (c1 == 0xFD || c1 == 0xFE)) {
2699 /* already established */
2704 } else if ((c1 > SP) && (c1 != DEL)) {
2705 /* in case of Roman characters */
2707 /* output 1 shifted byte */
2711 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2712 /* output 1 shifted byte */
2713 if (iso2022jp_f && !x0201_f) {
2714 (*oconv)(GETA1, GETA2);
2721 /* look like bogus code */
2724 } else if (input_mode == X0208 || input_mode == X0212 ||
2725 input_mode == X0213_1 || input_mode == X0213_2) {
2726 /* in case of Kanji shifted */
2729 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2730 /* Check MIME code */
2731 if ((c1 = (*i_getc)(f)) == EOF) {
2734 } else if (c1 == '?') {
2735 /* =? is mime conversion start sequence */
2736 if(mime_f == STRICT_MIME) {
2737 /* check in real detail */
2738 if (mime_begin_strict(f) == EOF)
2742 } else if (mime_begin(f) == EOF)
2752 /* normal ASCII code */
2755 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2758 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2761 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2762 if ((c1 = (*i_getc)(f)) == EOF) {
2763 /* (*oconv)(0, ESC); don't send bogus code */
2765 } else if (c1 == '$') {
2766 if ((c1 = (*i_getc)(f)) == EOF) {
2768 (*oconv)(0, ESC); don't send bogus code
2769 (*oconv)(0, '$'); */
2771 } else if (c1 == '@'|| c1 == 'B') {
2772 /* This is kanji introduction */
2775 set_input_codename("ISO-2022-JP");
2777 debug("ISO-2022-JP");
2780 } else if (c1 == '(') {
2781 if ((c1 = (*i_getc)(f)) == EOF) {
2782 /* don't send bogus code
2788 } else if (c1 == '@'|| c1 == 'B') {
2789 /* This is kanji introduction */
2794 } else if (c1 == 'D'){
2798 #endif /* X0212_ENABLE */
2799 } else if (c1 == (X0213_1&0x7F)){
2800 input_mode = X0213_1;
2803 } else if (c1 == (X0213_2&0x7F)){
2804 input_mode = X0213_2;
2808 /* could be some special code */
2815 } else if (broken_f&0x2) {
2816 /* accept any ESC-(-x as broken code ... */
2826 } else if (c1 == '(') {
2827 if ((c1 = (*i_getc)(f)) == EOF) {
2828 /* don't send bogus code
2830 (*oconv)(0, '('); */
2834 /* This is X0201 kana introduction */
2835 input_mode = X0201; shift_mode = X0201;
2837 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2838 /* This is X0208 kanji introduction */
2839 input_mode = ASCII; shift_mode = FALSE;
2841 } else if (broken_f&0x2) {
2842 input_mode = ASCII; shift_mode = FALSE;
2847 /* maintain various input_mode here */
2851 } else if ( c1 == 'N' || c1 == 'n'){
2853 c3 = (*i_getc)(f); /* skip SS2 */
2854 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2869 } else if (c1 == ESC && iconv == s_iconv) {
2870 /* ESC in Shift_JIS */
2871 if ((c1 = (*i_getc)(f)) == EOF) {
2872 /* (*oconv)(0, ESC); don't send bogus code */
2874 } else if (c1 == '$') {
2876 if ((c1 = (*i_getc)(f)) == EOF) {
2878 (*oconv)(0, ESC); don't send bogus code
2879 (*oconv)(0, '$'); */
2882 if (('E' <= c1 && c1 <= 'G') ||
2883 ('O' <= c1 && c1 <= 'Q')) {
2891 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2892 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2893 while ((c1 = (*i_getc)(f)) != EOF) {
2894 if (SP <= c1 && c1 <= 'z') {
2895 (*oconv)(0, c1 + c0);
2896 } else break; /* c1 == SO */
2900 if (c1 == EOF) LAST;
2907 } else if (c1 == LF || c1 == CR) {
2909 input_mode = ASCII; set_iconv(FALSE, 0);
2911 } else if (mime_decode_f && !mime_decode_mode){
2913 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2921 } else { /* if (c1 == CR)*/
2922 if ((c1=(*i_getc)(f))!=EOF) {
2926 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2940 } else if (c1 == DEL && input_mode == X0208) {
2950 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2953 if ((c0 = (*i_getc)(f)) != EOF) {
2956 if ((c3 = (*i_getc)(f)) != EOF) {
2958 (*iconv)(c2, c1, c0|c3);
2963 /* 3 bytes EUC or UTF-8 */
2964 if ((c0 = (*i_getc)(f)) != EOF) {
2966 (*iconv)(c2, c1, c0);
2974 0x7F <= c2 && c2 <= 0x92 &&
2975 0x21 <= c1 && c1 <= 0x7E) {
2977 if(c1 == 0x7F) return 0;
2978 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2981 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2985 (*oconv)(PREFIX_EUCG3 | c2, c1);
2987 #endif /* X0212_ENABLE */
2989 (*oconv)(PREFIX_EUCG3 | c2, c1);
2992 (*oconv)(input_mode, c1); /* other special case */
2998 /* goto next_word */
3002 (*iconv)(EOF, 0, 0);
3003 if (!input_codename)
3006 struct input_code *p = input_code_list;
3007 struct input_code *result = p;
3009 if (p->score < result->score) result = p;
3012 set_input_codename(result->name);
3014 debug(result->name);
/*
 * h_conv: used by kanji_convert() while the input encoding is still
 * undecided. c2 and c1 are the two values already read from f.
 *
 * Phase 1: further input is read with i_getc and stored with
 * push_hold_buf() until EOF, the hold buffer fills up, or an encoding is
 * established (estab_f). If none was established, the candidate of
 * input_code_list that has a status_func and the smallest score is
 * selected with set_iconv(TRUE, ...).
 * Phase 2: the held values are replayed through (*iconv)(); when a
 * character needs more bytes than are held, the rest is read from f.
 *
 * NOTE(review): the return type line, the return statements and several
 * branch bodies are not visible in this excerpt.
 */
3022 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3024 nkf_char ret, c3, c0;
3028 /** it must NOT be in the kanji shifted sequence */
3029 /** it must NOT be written in JIS7 */
3030 /** and it must be after 2 byte 8bit code */
3036 while ((c1 = (*i_getc)(f)) != EOF) {
3042 if (push_hold_buf(c1) == EOF || estab_f){
3048 struct input_code *p = input_code_list;
3049 struct input_code *result = p;
3054 if (p->status_func && p->score < result->score){
3059 set_iconv(TRUE, result->iconv_func);
3064 ** 1) EOF is detected, or
3065 ** 2) Code is established, or
3066 ** 3) Buffer is FULL (but last word is pushed)
3068 ** in 1) and 3) cases, we continue to use
3069 ** Kanji codes by oconv and leave estab_f unchanged.
3074 while (hold_index < hold_count){
3075 c2 = hold_buf[hold_index++];
3077 #ifdef NUMCHAR_OPTION
3078 || is_unicode_capsule(c2)
3083 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3084 (*iconv)(X0201, c2, 0);
3087 if (hold_index < hold_count){
3088 c1 = hold_buf[hold_index++];
3098 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3101 if (hold_index < hold_count){
3102 c0 = hold_buf[hold_index++];
3103 } else if ((c0 = (*i_getc)(f)) == EOF) {
3109 if (hold_index < hold_count){
3110 c3 = hold_buf[hold_index++];
3111 } else if ((c3 = (*i_getc)(f)) == EOF) {
3116 (*iconv)(c2, c1, c0|c3);
3121 /* 3 bytes EUC or UTF-8 */
3122 if (hold_index < hold_count){
3123 c0 = hold_buf[hold_index++];
3124 } else if ((c0 = (*i_getc)(f)) == EOF) {
3130 (*iconv)(c2, c1, c0);
3133 if (c0 == EOF) break;
/*
 * push_hold_buf: append c2, truncated to unsigned char, to hold_buf.
 *
 * Returns EOF when the buffer has just become full (hold_count has
 * reached HOLD_SIZE*2), otherwise the new hold_count.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not visible in this excerpt (presumably "return EOF").
 */
3138 nkf_char push_hold_buf(nkf_char c2)
3140 if (hold_count >= HOLD_SIZE*2)
3142 hold_buf[hold_count++] = (unsigned char)c2;
3143 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv: convert the Shift_JIS byte pair (c2, c1); callers pass p2/p1
 * to receive the converted pair.
 *
 * Visible steps:
 *   - SHIFTJIS_CP932: IBM-extension lead bytes are looked up in
 *     shiftjis_cp932 (unless cp932inv_f), and lead bytes inside
 *     CP932INV_TABLE_BEGIN..CP932INV_TABLE_END in cp932inv;
 *   - without x0213_f, IBM-extension lead bytes are looked up in
 *     shiftjis_x0212 and tagged with PREFIX_EUCG3;
 *   - with x0213_f, lead bytes from 0xF0 up are mapped with
 *     shift_jisx0213_s1a3_table or by arithmetic, also tagged with
 *     PREFIX_EUCG3;
 *   - otherwise the row is computed as 2*c2 minus SJ0162 or SJ6394
 *     (plus one when c1 > 0x9E) and c1 is reduced by SP or 0x1F;
 *   - x0212_unshift() is applied to c2 on one path.
 *
 * NOTE(review): the stores through p2/p1, the return values and several
 * conditions are not visible in this excerpt; callers treat a return
 * value of 0 as success.
 */
3146 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3148 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3151 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3152 #ifdef SHIFTJIS_CP932
3153 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3154 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3161 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3162 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3168 #endif /* SHIFTJIS_CP932 */
3170 if (!x0213_f && is_ibmext_in_sjis(c2)){
3171 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3174 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3187 if(x0213_f && c2 >= 0xF0){
3188 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3189 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3190 }else{ /* 78<=k<=94 */
3191 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3192 if (0x9E < c1) c2++;
3195 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3196 if (0x9E < c1) c2++;
3199 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3206 c2 = x0212_unshift(c2);
3213 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3217 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3219 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3221 if(c1 == 0x7F) return 0;
3222 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3225 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3226 if (ret) return ret;
3232 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3237 }else if (c2 == 0x8f){
3241 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3242 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3243 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3246 c2 = (c2 << 8) | (c1 & 0x7f);
3248 #ifdef SHIFTJIS_CP932
3251 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3252 s2e_conv(s2, s1, &c2, &c1);
3259 #endif /* SHIFTJIS_CP932 */
3261 #endif /* X0212_ENABLE */
3262 } else if (c2 == SSO){
3265 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3268 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3269 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3270 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3275 #ifdef SHIFTJIS_CP932
3276 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3278 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3279 s2e_conv(s2, s1, &c2, &c1);
3286 #endif /* SHIFTJIS_CP932 */
3293 #ifdef UTF8_INPUT_ENABLE
3294 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3301 }else if (0xc0 <= c2 && c2 <= 0xef) {
3302 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3303 #ifdef NUMCHAR_OPTION
3306 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3314 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3317 static const char w_iconv_utf8_1st_byte[] =
3319 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3320 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3321 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3322 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3324 if (c2 < 0 || 0xff < c2) {
3325 }else if (c2 == 0) { /* 0 : 1 byte*/
3327 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3330 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3332 if (c1 < 0x80 || 0xBF < c1) return 0;
3335 if (c0 == 0) return -1;
3336 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3341 if (c0 == 0) return -1;
3342 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3346 if (c0 == 0) return -1;
3347 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3351 if (c0 == 0) return -2;
3352 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3356 if (c0 == 0) return -2;
3357 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3361 if (c0 == 0) return -2;
3362 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3370 if (c2 == 0 || c2 == EOF){
3371 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3372 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3375 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3384 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3385 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3392 }else if (val < 0x800){
3393 *p2 = 0xc0 | (val >> 6);
3394 *p1 = 0x80 | (val & 0x3f);
3396 } else if (val <= NKF_INT32_C(0xFFFF)) {
3397 *p2 = 0xe0 | (val >> 12);
3398 *p1 = 0x80 | ((val >> 6) & 0x3f);
3399 *p0 = 0x80 | (val & 0x3f);
3400 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3401 *p2 = 0xe0 | (val >> 16);
3402 *p1 = 0x80 | ((val >> 12) & 0x3f);
3403 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3412 #ifdef UTF8_INPUT_ENABLE
3413 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3418 } else if (c2 >= 0xf0){
3419 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3420 val = (c2 & 0x0f) << 18;
3421 val |= (c1 & 0x3f) << 12;
3422 val |= (c0 & 0x3f00) >> 2;
3424 }else if (c2 >= 0xe0){
3425 val = (c2 & 0x0f) << 12;
3426 val |= (c1 & 0x3f) << 6;
3428 }else if (c2 >= 0xc0){
3429 val = (c2 & 0x1f) << 6;
3437 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3439 nkf_char c2, c1, c0;
3446 w16w_conv(val, &c2, &c1, &c0);
3447 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3448 #ifdef NUMCHAR_OPTION
3451 *p1 = CLASS_UNICODE | val;
3460 #ifdef UTF8_INPUT_ENABLE
3461 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3464 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3467 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3468 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3470 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3472 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3477 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3478 if (ret) return ret;
3483 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3487 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3488 } else if (is_unicode_bmp(c1)) {
3489 ret = w16e_conv(c1, &c2, &c1);
3492 c1 = CLASS_UNICODE | c1;
3494 if (ret) return ret;
3499 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3501 const unsigned short *const *pp;
3502 const unsigned short *const *const *ppp;
3503 static const char no_best_fit_chars_table_C2[] =
3504 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3505 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3506 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3507 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3508 static const char no_best_fit_chars_table_C2_ms[] =
3509 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3510 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3511 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3512 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3513 static const char no_best_fit_chars_table_932_C2[] =
3514 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3515 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3516 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3517 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3518 static const char no_best_fit_chars_table_932_C3[] =
3519 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3520 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3521 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3522 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3528 }else if(c2 < 0xe0){
3529 if(no_best_fit_chars_f){
3530 if(ms_ucs_map_f == UCS_MAP_CP932){
3533 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3536 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3539 }else if(!cp932inv_f){
3542 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3545 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3548 }else if(ms_ucs_map_f == UCS_MAP_MS){
3549 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3550 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3568 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3569 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3570 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3572 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3573 }else if(c0 < 0xF0){
3574 if(no_best_fit_chars_f){
3575 if(ms_ucs_map_f == UCS_MAP_CP932){
3576 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3577 }else if(ms_ucs_map_f == UCS_MAP_MS){
3582 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3585 if(c0 == 0x92) return 1;
3590 if(c1 == 0x80 || c0 == 0x9C) return 1;
3593 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3598 if(c0 == 0x94) return 1;
3601 if(c0 == 0xBB) return 1;
3611 if(c0 == 0x95) return 1;
3614 if(c0 == 0xA5) return 1;
3621 if(c0 == 0x8D) return 1;
3624 if(c0 == 0x9E && !cp932inv_f) return 1;
3627 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3635 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3636 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3637 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3639 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3641 #ifdef SHIFTJIS_CP932
3642 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3644 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3645 s2e_conv(s2, s1, p2, p1);
3654 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3657 const unsigned short *p;
3660 if (pp == 0) return 1;
3663 if (c1 < 0 || psize <= c1) return 1;
3665 if (p == 0) return 1;
3668 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3670 if (val == 0) return 1;
3671 if (no_cp932ext_f && (
3672 (val>>8) == 0x2D || /* NEC special characters */
3673 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3681 if (c2 == SO) c2 = X0201;
3688 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3695 (*f)(0, bin2hex(c>>shift));
3705 void encode_fallback_html(nkf_char c)
3710 if(c >= NKF_INT32_C(1000000))
3711 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3712 if(c >= NKF_INT32_C(100000))
3713 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3715 (*oconv)(0, 0x30+(c/10000 )%10);
3717 (*oconv)(0, 0x30+(c/1000 )%10);
3719 (*oconv)(0, 0x30+(c/100 )%10);
3721 (*oconv)(0, 0x30+(c/10 )%10);
3723 (*oconv)(0, 0x30+ c %10);
3728 void encode_fallback_xml(nkf_char c)
3733 nkf_each_char_to_hex(oconv, c);
3738 void encode_fallback_java(nkf_char c)
3742 if(!is_unicode_bmp(c)){
3746 (*oconv)(0, bin2hex(c>>20));
3747 (*oconv)(0, bin2hex(c>>16));
3751 (*oconv)(0, bin2hex(c>>12));
3752 (*oconv)(0, bin2hex(c>> 8));
3753 (*oconv)(0, bin2hex(c>> 4));
3754 (*oconv)(0, bin2hex(c ));
3758 void encode_fallback_perl(nkf_char c)
3763 nkf_each_char_to_hex(oconv, c);
3768 void encode_fallback_subchar(nkf_char c)
3770 c = unicode_subchar;
3771 (*oconv)((c>>8)&0xFF, c&0xFF);
3776 #ifdef UTF8_OUTPUT_ENABLE
3777 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3779 const unsigned short *p;
3782 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3790 p = euc_to_utf8_1byte;
3792 } else if (is_eucg3(c2)){
3793 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3796 c2 = (c2&0x7f) - 0x21;
3797 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3798 p = x0212_to_utf8_2bytes[c2];
3804 c2 = (c2&0x7f) - 0x21;
3805 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3807 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3808 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3809 euc_to_utf8_2bytes_ms[c2];
3814 c1 = (c1 & 0x7f) - 0x21;
3815 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3820 void w_oconv(nkf_char c2, nkf_char c1)
3826 output_bom_f = FALSE;
3837 #ifdef NUMCHAR_OPTION
3838 if (c2 == 0 && is_unicode_capsule(c1)){
3839 val = c1 & VALUE_MASK;
3842 }else if (val < 0x800){
3843 (*o_putc)(0xC0 | (val >> 6));
3844 (*o_putc)(0x80 | (val & 0x3f));
3845 } else if (val <= NKF_INT32_C(0xFFFF)) {
3846 (*o_putc)(0xE0 | (val >> 12));
3847 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3848 (*o_putc)(0x80 | (val & 0x3f));
3849 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3850 (*o_putc)(0xF0 | ( val>>18));
3851 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3852 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3853 (*o_putc)(0x80 | ( val & 0x3f));
3860 output_mode = ASCII;
3862 } else if (c2 == ISO8859_1) {
3864 (*o_putc)(c1 | 0x080);
3867 val = e2w_conv(c2, c1);
3869 w16w_conv(val, &c2, &c1, &c0);
3873 if (c0) (*o_putc)(c0);
3879 void w_oconv16(nkf_char c2, nkf_char c1)
3882 output_bom_f = FALSE;
3883 if (output_endian == ENDIAN_LITTLE){
3884 (*o_putc)((unsigned char)'\377');
3888 (*o_putc)((unsigned char)'\377');
3897 if (c2 == ISO8859_1) {
3900 #ifdef NUMCHAR_OPTION
3901 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3902 if (is_unicode_bmp(c1)) {
3903 c2 = (c1 >> 8) & 0xff;
3907 if (c1 <= UNICODE_MAX) {
3908 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3909 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3910 if (output_endian == ENDIAN_LITTLE){
3911 (*o_putc)(c2 & 0xff);
3912 (*o_putc)((c2 >> 8) & 0xff);
3913 (*o_putc)(c1 & 0xff);
3914 (*o_putc)((c1 >> 8) & 0xff);
3916 (*o_putc)((c2 >> 8) & 0xff);
3917 (*o_putc)(c2 & 0xff);
3918 (*o_putc)((c1 >> 8) & 0xff);
3919 (*o_putc)(c1 & 0xff);
3926 nkf_char val = e2w_conv(c2, c1);
3927 c2 = (val >> 8) & 0xff;
3931 if (output_endian == ENDIAN_LITTLE){
3940 void w_oconv32(nkf_char c2, nkf_char c1)
3943 output_bom_f = FALSE;
3944 if (output_endian == ENDIAN_LITTLE){
3945 (*o_putc)((unsigned char)'\377');
3953 (*o_putc)((unsigned char)'\377');
3962 if (c2 == ISO8859_1) {
3964 #ifdef NUMCHAR_OPTION
3965 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3969 c1 = e2w_conv(c2, c1);
3972 if (output_endian == ENDIAN_LITTLE){
3973 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3974 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3975 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3979 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3980 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3981 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3986 void e_oconv(nkf_char c2, nkf_char c1)
3988 #ifdef NUMCHAR_OPTION
3989 if (c2 == 0 && is_unicode_capsule(c1)){
3990 w16e_conv(c1, &c2, &c1);
3991 if (c2 == 0 && is_unicode_capsule(c1)){
3992 c2 = c1 & VALUE_MASK;
3993 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
3997 c2 += c2 < 10 ? 0x75 : 0x8FEB;
3998 c1 = 0x21 + c1 % 94;
4001 (*o_putc)((c2 & 0x7f) | 0x080);
4002 (*o_putc)(c1 | 0x080);
4004 (*o_putc)((c2 & 0x7f) | 0x080);
4005 (*o_putc)(c1 | 0x080);
4009 if (encode_fallback) (*encode_fallback)(c1);
4018 } else if (c2 == 0) {
4019 output_mode = ASCII;
4021 } else if (c2 == X0201) {
4022 output_mode = JAPANESE_EUC;
4023 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4024 } else if (c2 == ISO8859_1) {
4025 output_mode = ISO8859_1;
4026 (*o_putc)(c1 | 0x080);
4028 } else if (is_eucg3(c2)){
4029 output_mode = JAPANESE_EUC;
4030 #ifdef SHIFTJIS_CP932
4033 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4034 s2e_conv(s2, s1, &c2, &c1);
4039 output_mode = ASCII;
4041 }else if (is_eucg3(c2)){
4044 (*o_putc)((c2 & 0x7f) | 0x080);
4045 (*o_putc)(c1 | 0x080);
4048 (*o_putc)((c2 & 0x7f) | 0x080);
4049 (*o_putc)(c1 | 0x080);
4053 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4054 set_iconv(FALSE, 0);
4055 return; /* too late to rescue this char */
4057 output_mode = JAPANESE_EUC;
4058 (*o_putc)(c2 | 0x080);
4059 (*o_putc)(c1 | 0x080);
4064 nkf_char x0212_shift(nkf_char c)
4069 if (0x75 <= c && c <= 0x7f){
4070 ret = c + (0x109 - 0x75);
4073 if (0x75 <= c && c <= 0x7f){
4074 ret = c + (0x113 - 0x75);
4081 nkf_char x0212_unshift(nkf_char c)
4084 if (0x7f <= c && c <= 0x88){
4085 ret = c + (0x75 - 0x7f);
4086 }else if (0x89 <= c && c <= 0x92){
4087 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4091 #endif /* X0212_ENABLE */
4093 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4099 if((0x21 <= ndx && ndx <= 0x2F)){
4100 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4101 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4103 }else if(0x6E <= ndx && ndx <= 0x7E){
4104 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4105 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4111 else if(nkf_isgraph(ndx)){
4113 const unsigned short *ptr;
4114 ptr = x0212_shiftjis[ndx - 0x21];
4116 val = ptr[(c1 & 0x7f) - 0x21];
4125 c2 = x0212_shift(c2);
4127 #endif /* X0212_ENABLE */
4129 if(0x7F < c2) return 1;
4130 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4131 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4135 void s_oconv(nkf_char c2, nkf_char c1)
4137 #ifdef NUMCHAR_OPTION
4138 if (c2 == 0 && is_unicode_capsule(c1)){
4139 w16e_conv(c1, &c2, &c1);
4140 if (c2 == 0 && is_unicode_capsule(c1)){
4141 c2 = c1 & VALUE_MASK;
4142 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4145 c2 = c1 / 188 + 0xF0;
4147 c1 += 0x40 + (c1 > 0x3e);
4152 if(encode_fallback)(*encode_fallback)(c1);
4161 } else if (c2 == 0) {
4162 output_mode = ASCII;
4164 } else if (c2 == X0201) {
4165 output_mode = SHIFT_JIS;
4167 } else if (c2 == ISO8859_1) {
4168 output_mode = ISO8859_1;
4169 (*o_putc)(c1 | 0x080);
4171 } else if (is_eucg3(c2)){
4172 output_mode = SHIFT_JIS;
4173 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4179 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4180 set_iconv(FALSE, 0);
4181 return; /* too late to rescue this char */
4183 output_mode = SHIFT_JIS;
4184 e2s_conv(c2, c1, &c2, &c1);
4186 #ifdef SHIFTJIS_CP932
4188 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4189 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4195 #endif /* SHIFTJIS_CP932 */
4198 if (prefix_table[(unsigned char)c1]){
4199 (*o_putc)(prefix_table[(unsigned char)c1]);
4205 void j_oconv(nkf_char c2, nkf_char c1)
4207 #ifdef NUMCHAR_OPTION
4208 if (c2 == 0 && is_unicode_capsule(c1)){
4209 w16e_conv(c1, &c2, &c1);
4210 if (c2 == 0 && is_unicode_capsule(c1)){
4211 c2 = c1 & VALUE_MASK;
4212 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4215 c2 = 0x7F + c1 / 94;
4216 c1 = 0x21 + c1 % 94;
4218 if (encode_fallback) (*encode_fallback)(c1);
4225 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4228 (*o_putc)(ascii_intro);
4229 output_mode = ASCII;
4233 } else if (is_eucg3(c2)){
4235 if(output_mode!=X0213_2){
4236 output_mode = X0213_2;
4240 (*o_putc)(X0213_2&0x7F);
4243 if(output_mode!=X0212){
4244 output_mode = X0212;
4248 (*o_putc)(X0212&0x7F);
4251 (*o_putc)(c2 & 0x7f);
4254 } else if (c2==X0201) {
4255 if (output_mode!=X0201) {
4256 output_mode = X0201;
4262 } else if (c2==ISO8859_1) {
4263 /* iso8859 introduction, or 8th bit on */
4264 /* Can we convert in 7bit form using ESC-'-'-A ?
4266 output_mode = ISO8859_1;
4268 } else if (c2 == 0) {
4269 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4272 (*o_putc)(ascii_intro);
4273 output_mode = ASCII;
4278 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4279 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4281 if (output_mode!=X0213_1) {
4282 output_mode = X0213_1;
4286 (*o_putc)(X0213_1&0x7F);
4288 }else if (output_mode != X0208) {
4289 output_mode = X0208;
4292 (*o_putc)(kanji_intro);
4299 void base64_conv(nkf_char c2, nkf_char c1)
4301 mime_prechar(c2, c1);
4302 (*o_base64conv)(c2,c1);
4306 static nkf_char broken_buf[3];
4307 static int broken_counter = 0;
4308 static int broken_last = 0;
4309 nkf_char broken_getc(FILE *f)
4313 if (broken_counter>0) {
4314 return broken_buf[--broken_counter];
4317 if (c=='$' && broken_last != ESC
4318 && (input_mode==ASCII || input_mode==X0201)) {
4321 if (c1=='@'|| c1=='B') {
4322 broken_buf[0]=c1; broken_buf[1]=c;
4329 } else if (c=='(' && broken_last != ESC
4330 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4333 if (c1=='J'|| c1=='B') {
4334 broken_buf[0]=c1; broken_buf[1]=c;
4347 nkf_char broken_ungetc(nkf_char c, FILE *f)
4349 if (broken_counter<2)
4350 broken_buf[broken_counter++]=c;
4354 void nl_conv(nkf_char c2, nkf_char c1)
4356 if (guess_f && input_newline != EOF) {
4357 if (c2 == 0 && c1 == LF) {
4358 if (!input_newline) input_newline = prev_cr ? CRLF : LF;
4359 else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF;
4360 } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF;
4362 else if (!input_newline) input_newline = CR;
4363 else if (input_newline != CR) input_newline = EOF;
4365 if (prev_cr || c2 == 0 && c1 == LF) {
4367 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4368 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4370 if (c2 == 0 && c1 == CR) prev_cr = CR;
4371 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4375 Return value of fold_conv()
4377 LF add newline and output char
4378 CR add newline and output nothing
4381 1 (or else) normal output
4383 fold state in prev (previous character)
4385 >0x80 Japanese (X0208/X0201)
4390 This fold algorithm does not preserve leading spaces in a line.
4391 This is the main difference from fmt.
4394 #define char_size(c2,c1) (c2?2:1)
4396 void fold_conv(nkf_char c2, nkf_char c1)
4399 nkf_char fold_state;
4401 if (c1== CR && !fold_preserve_f) {
4402 fold_state=0; /* ignore cr */
4403 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4405 fold_state=0; /* ignore cr */
4406 } else if (c1== BS) {
4407 if (f_line>0) f_line--;
4409 } else if (c2==EOF && f_line != 0) { /* close open last line */
4411 } else if ((c1==LF && !fold_preserve_f)
4412 || ((c1==CR||(c1==LF&&f_prev!=CR))
4413 && fold_preserve_f)) {
4415 if (fold_preserve_f) {
4419 } else if ((f_prev == c1 && !fold_preserve_f)
4420 || (f_prev == LF && fold_preserve_f)
4421 ) { /* duplicate newline */
4424 fold_state = LF; /* output two newline */
4430 if (f_prev&0x80) { /* Japanese? */
4432 fold_state = 0; /* ignore given single newline */
4433 } else if (f_prev==SP) {
4437 if (++f_line<=fold_len)
4441 fold_state = CR; /* fold and output nothing */
4445 } else if (c1=='\f') {
4448 fold_state = LF; /* output newline and clear */
4449 } else if ( (c2==0 && c1==SP)||
4450 (c2==0 && c1==TAB)||
4451 (c2=='!'&& c1=='!')) {
4452 /* X0208 kankaku or ascii space */
4454 fold_state = 0; /* remove duplicate spaces */
4457 if (++f_line<=fold_len)
4458 fold_state = SP; /* output ASCII space only */
4460 f_prev = SP; f_line = 0;
4461 fold_state = CR; /* fold and output nothing */
4465 prev0 = f_prev; /* we still need this one... , but almost done */
4467 if (c2 || c2==X0201)
4468 f_prev |= 0x80; /* this is Japanese */
4469 f_line += char_size(c2,c1);
4470 if (f_line<=fold_len) { /* normal case */
4473 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4474 f_line = char_size(c2,c1);
4475 fold_state = LF; /* We can't wait, do fold now */
4476 } else if (c2==X0201) {
4477 /* simple kinsoku rules return 1 means no folding */
4478 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4479 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4480 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4481 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4482 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4483 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4484 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4486 fold_state = LF;/* add one new f_line before this character */
4489 fold_state = LF;/* add one new f_line before this character */
4492 /* kinsoku point in ASCII */
4493 if ( c1==')'|| /* { [ ( */
4504 /* just after special */
4505 } else if (!is_alnum(prev0)) {
4506 f_line = char_size(c2,c1);
4508 } else if ((prev0==SP) || /* ignored new f_line */
4509 (prev0==LF)|| /* ignored new f_line */
4510 (prev0&0x80)) { /* X0208 - ASCII */
4511 f_line = char_size(c2,c1);
4512 fold_state = LF;/* add one new f_line before this character */
4514 fold_state = 1; /* default no fold in ASCII */
4518 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4519 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4520 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4521 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4522 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4523 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4524 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4525 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4526 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4527 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4528 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4529 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4530 /* default no fold in kinsoku */
4533 f_line = char_size(c2,c1);
4534 /* add one new f_line before this character */
4537 f_line = char_size(c2,c1);
4539 /* add one new f_line before this character */
4544 /* terminator process */
4545 switch(fold_state) {
4564 nkf_char z_prev2=0,z_prev1=0;
4566 void z_conv(nkf_char c2, nkf_char c1)
4569 /* if (c2) c1 &= 0x7f; assertion */
4571 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4577 if (z_prev2 == X0201) {
4579 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4581 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4583 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4585 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4590 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4593 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4594 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4599 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4610 if (alpha_f&1 && c2 == 0x23) {
4611 /* JISX0208 Alphabet */
4613 } else if (c2 == 0x21) {
4614 /* JISX0208 Kigou */
4619 } else if (alpha_f&4) {
4624 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4630 if (alpha_f&8 && c2 == 0) {
4634 case '>': entity = ">"; break;
4635 case '<': entity = "<"; break;
4636 case '\"': entity = """; break;
4637 case '&': entity = "&"; break;
4640 while (*entity) (*o_zconv)(0, *entity++);
4646 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4651 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4655 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4659 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4663 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4667 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4671 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4675 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4679 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4684 (*o_zconv)(X0201, c);
4687 } else if (c2 == 0x25) {
4688 /* JISX0208 Katakana */
4689 static const int fullwidth_to_halfwidth[] =
4691 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4692 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4693 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4694 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4695 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4696 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4697 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4698 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4699 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4700 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4701 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4702 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4704 if (fullwidth_to_halfwidth[c1-0x20]){
4705 c2 = fullwidth_to_halfwidth[c1-0x20];
4706 (*o_zconv)(X0201, c2>>8);
4708 (*o_zconv)(X0201, c2&0xFF);
4718 #define rot13(c) ( \
4720 (c <= 'M') ? (c + 13): \
4721 (c <= 'Z') ? (c - 13): \
4723 (c <= 'm') ? (c + 13): \
4724 (c <= 'z') ? (c - 13): \
4728 #define rot47(c) ( \
4730 ( c <= 'O') ? (c + 47) : \
4731 ( c <= '~') ? (c - 47) : \
4735 void rot_conv(nkf_char c2, nkf_char c1)
4737 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4743 (*o_rot_conv)(c2,c1);
4746 void hira_conv(nkf_char c2, nkf_char c1)
4750 if (0x20 < c1 && c1 < 0x74) {
4752 (*o_hira_conv)(c2,c1);
4754 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4756 c1 = CLASS_UNICODE | 0x3094;
4757 (*o_hira_conv)(c2,c1);
4760 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4762 (*o_hira_conv)(c2,c1);
4767 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4770 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4772 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4776 (*o_hira_conv)(c2,c1);
4780 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4782 static const nkf_char range[RANGE_NUM_MAX][2] = {
4803 nkf_char start, end, c;
4805 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4809 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4814 for (i = 0; i < RANGE_NUM_MAX; i++) {
4815 start = range[i][0];
4818 if (c >= start && c <= end) {
4823 (*o_iso2022jp_check_conv)(c2,c1);
4827 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4829 static const unsigned char *mime_pattern[] = {
4830 (const unsigned char *)"\075?EUC-JP?B?",
4831 (const unsigned char *)"\075?SHIFT_JIS?B?",
4832 (const unsigned char *)"\075?ISO-8859-1?Q?",
4833 (const unsigned char *)"\075?ISO-8859-1?B?",
4834 (const unsigned char *)"\075?ISO-2022-JP?B?",
4835 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4836 #if defined(UTF8_INPUT_ENABLE)
4837 (const unsigned char *)"\075?UTF-8?B?",
4838 (const unsigned char *)"\075?UTF-8?Q?",
4840 (const unsigned char *)"\075?US-ASCII?Q?",
4845 /* Markers used to raise the priority of the corresponding input code */
4846 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4847 e_iconv, s_iconv, 0, 0, 0, 0,
4848 #if defined(UTF8_INPUT_ENABLE)
4854 static const nkf_char mime_encode[] = {
4855 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4856 #if defined(UTF8_INPUT_ENABLE)
4863 static const nkf_char mime_encode_method[] = {
4864 'B', 'B','Q', 'B', 'B', 'Q',
4865 #if defined(UTF8_INPUT_ENABLE)
4873 #define MAXRECOVER 20
4875 void switch_mime_getc(void)
4877 if (i_getc!=mime_getc) {
4878 i_mgetc = i_getc; i_getc = mime_getc;
4879 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4880 if(mime_f==STRICT_MIME) {
4881 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4882 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4887 void unswitch_mime_getc(void)
4889 if(mime_f==STRICT_MIME) {
4890 i_mgetc = i_mgetc_buf;
4891 i_mungetc = i_mungetc_buf;
4894 i_ungetc = i_mungetc;
4895 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4896 mime_iconv_back = NULL;
4899 nkf_char mime_begin_strict(FILE *f)
4903 const unsigned char *p,*q;
4904 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4906 mime_decode_mode = FALSE;
4907 /* =? has been checked */
4909 p = mime_pattern[j];
4912 for(i=2;p[i]>SP;i++) { /* start at =? */
4913 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4914 /* pattern fails, try next one */
4916 while (mime_pattern[++j]) {
4917 p = mime_pattern[j];
4918 for(k=2;k<i;k++) /* assume length(p) > i */
4919 if (p[k]!=q[k]) break;
4920 if (k==i && nkf_toupper(c1)==p[k]) break;
4922 p = mime_pattern[j];
4923 if (p) continue; /* found next one, continue */
4924 /* all fails, output from recovery buffer */
4932 mime_decode_mode = p[i-2];
4934 mime_iconv_back = iconv;
4935 set_iconv(FALSE, mime_priority_func[j]);
4936 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4938 if (mime_decode_mode=='B') {
4939 mimebuf_f = unbuf_f;
4941 /* do MIME integrity check */
4942 return mime_integrity(f,mime_pattern[j]);
4950 nkf_char mime_getc_buf(FILE *f)
4952 /* we don't keep eof of Fifo, because it contains ?= as
4953 a terminator. It was checked in mime_integrity. */
4954 return ((mimebuf_f)?
4955 (*i_mgetc_buf)(f):Fifo(mime_input++));
4958 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4961 (*i_mungetc_buf)(c,f);
4963 Fifo(--mime_input) = (unsigned char)c;
4967 nkf_char mime_begin(FILE *f)
4972 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4973 /* re-read and convert again from mime_buffer. */
4975 /* =? has been checked */
4977 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4978 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4979 /* We accept any character type even if it is broken up by new lines */
4980 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4981 if (c1==LF||c1==SP||c1==CR||
4982 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4984 /* Failed. But this could be another MIME preamble */
4992 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4993 if (!(++i<MAXRECOVER) || c1==EOF) break;
4994 if (c1=='b'||c1=='B') {
4995 mime_decode_mode = 'B';
4996 } else if (c1=='q'||c1=='Q') {
4997 mime_decode_mode = 'Q';
5001 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5002 if (!(++i<MAXRECOVER) || c1==EOF) break;
5004 mime_decode_mode = FALSE;
5010 if (!mime_decode_mode) {
5011 /* false MIME preamble, restart from mime_buffer */
5012 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5013 /* Since we are in MIME mode until buffer becomes empty, */
5014 /* we never go into mime_begin again for a while. */
5017 /* discard mime preemble, and goto MIME mode */
5019 /* do no MIME integrity check */
5020 return c1; /* used only for checking EOF */
/*
 * Output routine taking a single character c and returning nothing.
 * NOTE(review): the body is not visible in this listing; the name suggests
 * the character is discarded -- confirm.
 */
5024 void no_putc(nkf_char c)
/*
 * Write a diagnostic string followed by a newline to stderr.
 * A NULL str is printed as the literal text "NULL" rather than being
 * passed to fprintf.
 * NOTE(review): lines between the signature and the fprintf are not visible
 * here, so any guarding condition is unknown.
 */
5029 void debug(const char *str)
5032 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Record the name of the detected input encoding in input_codename.
 * The first call stores codename as-is (the pointer is kept, not copied).
 * A later call with a different name (per strcmp) replaces the stored value
 * with the empty string "", marking that conflicting encodings were seen.
 */
5037 void set_input_codename(char *codename)
5039 if (!input_codename) {
5040 input_codename = codename;
5041 } else if (strcmp(codename, input_codename) != 0) {
5042 input_codename = "";
5046 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Print the guessed input encoding to stdout.
 * When filename is non-NULL the output is prefixed with "filename: ".
 * The generic name stored in input_codename is refined using score bits of
 * the struct input_code found for the current iconv function, and a
 * newline-style suffix is chosen from input_newline.
 * NOTE(review): several lines of this function are not visible in this
 * listing, including how codename ("BINARY") and str_nlmode are finally
 * printed.
 */
5047 void print_guessed_code(char *filename)
5049 char *codename = "BINARY";
5050 char *str_nlmode = NULL;
5051 if (filename != NULL) printf("%s: ", filename);
/* an empty (non-NULL) input_codename is what set_input_codename stores on conflict */
5052 if (input_codename && !*input_codename) {
5055 struct input_code *p = find_inputcode_byfunc(iconv);
5057 printf("%s\n", input_codename ? input_codename : "ASCII");
5059 if (!input_codename) {
5060 input_codename = "ASCII";
5061 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5062 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5063 input_codename = "CP932";
5064 } else if (strcmp(input_codename, "EUC-JP") == 0) {
/* SCORE_X0212 is tested first, so it takes precedence over DEPEND/CP932 */
5065 if (p->score & (SCORE_X0212))
5066 input_codename = "EUCJP-MS";
5067 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5068 input_codename = "CP51932";
5069 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
/* SCORE_KANA is tested first, so it takes precedence over DEPEND/CP932 */
5070 if (p->score & (SCORE_KANA))
5071 input_codename = "CP50221";
5072 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5073 input_codename = "CP50220";
/* newline-style suffix; EOF in input_newline is reported as mixed newlines */
5077 input_newline == CR ? " (CR)" :
5078 input_newline == LF ? " (LF)" :
5079 input_newline == CRLF ? " (CRLF)" :
5080 input_newline == EOF ? " (MIXED NL)" :
/*
 * Shared decoder for "<ch><hex><hex>" escapes (used by cap_getc with ':'
 * and url_getc with '%').
 *   ch - escape introducer character
 *   f  - input stream
 *   g  - getc-style function used to read characters
 *   u  - ungetc-style function; presumably used to push characters back when
 *        the sequence is not a valid escape -- the calls are not visible in
 *        this listing, confirm
 * On a valid escape, returns the byte formed from the two hex digits.
 * NOTE(review): the reads into c1/c2/c3 and the fallback returns for
 * non-hex input are not visible here.
 */
5089 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5091 nkf_char c1, c2, c3;
5097 if (!nkf_isxdigit(c2)){
5102 if (!nkf_isxdigit(c3)){
/* high nibble from c2, low nibble from c3 */
5107 return (hex2bin(c2) << 4) | hex2bin(c3);
/*
 * Read one character with ':'-introduced hex escapes decoded.
 * Delegates to hex_getc using i_cgetc / i_cungetc as the underlying
 * read / push-back functions.
 */
5110 nkf_char cap_getc(FILE *f)
5112 return hex_getc(':', f, i_cgetc, i_cungetc);
/*
 * Push c back for the ':'-escape input layer by forwarding to the
 * function currently installed in i_cungetc; its result is returned.
 */
5115 nkf_char cap_ungetc(nkf_char c, FILE *f)
5117 return (*i_cungetc)(c, f);
/*
 * Read one character with '%'-introduced hex escapes decoded.
 * Delegates to hex_getc using i_ugetc / i_uungetc as the underlying
 * read / push-back functions.
 */
5120 nkf_char url_getc(FILE *f)
5122 return hex_getc('%', f, i_ugetc, i_uungetc);
/*
 * Push c back for the '%'-escape input layer by forwarding to the
 * function currently installed in i_uungetc; its result is returned.
 */
5125 nkf_char url_ungetc(nkf_char c, FILE *f)
5127 return (*i_uungetc)(c, f);
5131 #ifdef NUMCHAR_OPTION
/*
 * Read one character, decoding a numeric character reference into a
 * Unicode code point tagged with CLASS_UNICODE.
 * Two forms are visible: a hexadecimal form introduced by 'x' or 'X'
 * (the loop examines up to 7 digits) and a decimal form (up to 8 digits).
 * Reading and push-back go through i_ngetc / i_nungetc.
 * NOTE(review): recognition of the reference prefix and terminator, the
 * filling of buf, and the fallback path for malformed input are not visible
 * in this listing.
 */
5132 nkf_char numchar_getc(FILE *f)
5134 nkf_char (*g)(FILE *) = i_ngetc;
5135 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5146 if (buf[i] == 'x' || buf[i] == 'X'){
5147 for (j = 0; j < 7; j++){
5149 if (!nkf_isxdigit(buf[i])){
5156 c |= hex2bin(buf[i]);
5159 for (j = 0; j < 8; j++){
5163 if (!nkf_isdigit(buf[i])){
/* hex2bin is also used for the value of a decimal digit */
5170 c += hex2bin(buf[i]);
5176 return CLASS_UNICODE | c;
/*
 * Push c back for the numeric-character-reference input layer by
 * forwarding to the function currently installed in i_nungetc.
 */
5185 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5187 return (*i_nungetc)(c, f);
5191 #ifdef UNICODE_NORMALIZATION
5193 /* Normalization Form C */
/*
 * Read one character, replacing a decomposed (NFD) sequence found in
 * normalization_table with its composed (NFC) counterpart.
 * The table is binary-searched: buf is compared element by element with
 * each candidate .nfd entry, and on a match the matching entry's .nfc
 * sequence is copied over the start of buf.
 * Reading and push-back go through i_nfc_getc / i_nfc_ungetc.
 * NOTE(review): how buf is filled from the input, the match test after the
 * comparison loop, and the return / push-back of buf contents are not
 * visible in this listing.
 */
5194 nkf_char nfc_getc(FILE *f)
5196 nkf_char (*g)(FILE *f) = i_nfc_getc;
5197 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5198 int i=0, j, k=1, lower, upper;
5200 const nkf_nfchar *array;
/* (buf[i] & 0xc0) != 0x80 excludes bytes of the form 10xxxxxx */
5203 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5204 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5205 while (upper >= lower) {
5206 j = (lower+upper) / 2;
5207 array = normalization_table[j].nfd;
5208 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5209 if (array[k] != buf[k]){
/* first differing element decides which half of the table to search next */
5210 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5217 array = normalization_table[j].nfc;
5218 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5219 buf[i] = (nkf_char)(array[i]);
/*
 * Push c back for the normalization input layer by forwarding to the
 * function currently installed in i_nfc_ungetc.
 */
5230 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5232 return (*i_nfc_ungetc)(c, f);
5234 #endif /* UNICODE_NORMALIZATION */
5240 nkf_char c1, c2, c3, c4, cc;
5241 nkf_char t1, t2, t3, t4, mode, exit_mode;
5242 nkf_char lwsp_count;
5245 nkf_char lwsp_size = 128;
5247 if (mime_top != mime_last) { /* Something is in FIFO */
5248 return Fifo(mime_top++);
5250 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5251 mime_decode_mode=FALSE;
5252 unswitch_mime_getc();
5253 return (*i_getc)(f);
5256 if (mimebuf_f == FIXED_MIME)
5257 exit_mode = mime_decode_mode;
5260 if (mime_decode_mode == 'Q') {
5261 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5263 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5264 if (c1<=SP || DEL<=c1) {
5265 mime_decode_mode = exit_mode; /* prepare for quit */
5268 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5272 mime_decode_mode = exit_mode; /* prepare for quit */
5273 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5274 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5275 /* end Q encoding */
5276 input_mode = exit_mode;
5278 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5279 if (lwsp_buf==NULL) {
5280 perror("can't malloc");
5283 while ((c1=(*i_getc)(f))!=EOF) {
5288 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5296 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5297 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5312 lwsp_buf[lwsp_count] = (unsigned char)c1;
5313 if (lwsp_count++>lwsp_size){
5315 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5316 if (lwsp_buf_new==NULL) {
5318 perror("can't realloc");
5321 lwsp_buf = lwsp_buf_new;
5327 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5329 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5330 i_ungetc(lwsp_buf[lwsp_count],f);
5336 if (c1=='='&&c2<SP) { /* this is soft wrap */
5337 while((c1 = (*i_mgetc)(f)) <=SP) {
5338 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5340 mime_decode_mode = 'Q'; /* still in MIME */
5341 goto restart_mime_q;
5344 mime_decode_mode = 'Q'; /* still in MIME */
5348 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5349 if (c2<=SP) return c2;
5350 mime_decode_mode = 'Q'; /* still in MIME */
5351 return ((hex2bin(c2)<<4) + hex2bin(c3));
5354 if (mime_decode_mode != 'B') {
5355 mime_decode_mode = FALSE;
5356 return (*i_mgetc)(f);
5360 /* Base64 encoding */
5362 MIME allows line break in the middle of
5363 Base64, but we are very pessimistic in decoding
5364 in unbuf mode because MIME encoded code may broken by
5365 less or editor's control sequence (such as ESC-[-K in unbuffered
5366 mode. ignore incomplete MIME.
5368 mode = mime_decode_mode;
5369 mime_decode_mode = exit_mode; /* prepare for quit */
5371 while ((c1 = (*i_mgetc)(f))<=SP) {
5376 if ((c2 = (*i_mgetc)(f))<=SP) {
5379 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5380 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5383 if ((c1 == '?') && (c2 == '=')) {
5386 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5387 if (lwsp_buf==NULL) {
5388 perror("can't malloc");
5391 while ((c1=(*i_getc)(f))!=EOF) {
5396 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5404 if ((c1=(*i_getc)(f))!=EOF) {
5408 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5423 lwsp_buf[lwsp_count] = (unsigned char)c1;
5424 if (lwsp_count++>lwsp_size){
5426 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5427 if (lwsp_buf_new==NULL) {
5429 perror("can't realloc");
5432 lwsp_buf = lwsp_buf_new;
5438 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5440 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5441 i_ungetc(lwsp_buf[lwsp_count],f);
5448 if ((c3 = (*i_mgetc)(f))<=SP) {
5451 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5452 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5456 if ((c4 = (*i_mgetc)(f))<=SP) {
5459 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5460 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5464 mime_decode_mode = mode; /* still in MIME sigh... */
5466 /* BASE 64 decoding */
5468 t1 = 0x3f & base64decode(c1);
5469 t2 = 0x3f & base64decode(c2);
5470 t3 = 0x3f & base64decode(c3);
5471 t4 = 0x3f & base64decode(c4);
5472 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5474 Fifo(mime_last++) = (unsigned char)cc;
5475 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5477 Fifo(mime_last++) = (unsigned char)cc;
5478 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5480 Fifo(mime_last++) = (unsigned char)cc;
5485 return Fifo(mime_top++);
/*
 * Push c back onto the MIME Fifo: mime_top is stepped back by one and c
 * (truncated to unsigned char) is stored there.
 * The stream argument f is not used by the visible statement.
 * NOTE(review): the return statement is not visible in this listing.
 */
5488 nkf_char mime_ungetc(nkf_char c, FILE *f)
5490 Fifo(--mime_top) = (unsigned char)c;
/*
 * Pre-read a MIME encoded word into the Fifo and check that it is complete.
 *   f - input stream, read through (*i_getc)
 *   p - NUL-terminated preamble bytes, copied into the Fifo first
 * The visible logic resets mime_input and mime_last to mime_top, copies p,
 * then reads characters until EOF or until the Fifo is full
 * (((mime_input-mime_top)&MIME_BUF_MASK)==0). Only '+', '/', '=', '?' and
 * alphanumerics are accepted as body characters. If the word is incomplete,
 * mime_decode_mode is set to 1 so the buffered text is replayed undecoded,
 * and switch_mime_getc() is called.
 * NOTE(review): the return values, the maintenance of d, and the success
 * path after the terminator is found are not visible in this listing.
 */
5494 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5498 /* In buffered mode, read until =? or NL or buffer full
5500 mime_input = mime_top;
5501 mime_last = mime_top;
5503 while(*p) Fifo(mime_input++) = *p++;
5506 while((c=(*i_getc)(f))!=EOF) {
5507 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5508 break; /* buffer full */
5510 if (c=='=' && d=='?') {
5511 /* checked. skip header, start decode */
5512 Fifo(mime_input++) = (unsigned char)c;
5513 /* mime_last_input = mime_input; */
5518 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5520 /* Should we check length mod 4? */
5521 Fifo(mime_input++) = (unsigned char)c;
5524 /* In case of Incomplete MIME, no MIME decode */
5525 Fifo(mime_input++) = (unsigned char)c;
5526 mime_last = mime_input; /* point undecoded buffer */
5527 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5528 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Map one Base64 alphabet character to its 6-bit value.
 * Per the inline comments: A..Z -> 0-25, a..z -> 26-51, 0..9 -> 52-61,
 * '+' and '-' -> 62, '/' and '_' -> 63. Character constants stand in for
 * the numbers: 'G' is 'a' - 26, '4' is 52, '>' is 62 and '?' is 63 in ASCII.
 * The final branch yields 63 for anything not matched earlier.
 * NOTE(review): the opening range tests and the return statement are not
 * visible in this listing.
 */
5532 nkf_char base64decode(nkf_char c)
5537 i = c - 'A'; /* A..Z 0-25 */
5538 } else if (c == '_') {
5539 i = '?' /* 63 */ ; /* _ 63 */
5541 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5543 } else if (c > '/') {
5544 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5545 } else if (c == '+' || c == '-') {
5546 i = '>' /* 62 */ ; /* + and - 62 */
5548 i = '?' /* 63 */ ; /* / 63 */
/* The 64-character Base64 output alphabet, indexed by 6-bit value. */
5553 static const char basis_64[] =
5554 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Previous input byte of the Base64 encoder; its low bits are carried into
   the next output symbol by mimeout_addchar and close_mime. */
5556 static nkf_char b64c;
/* Pending characters held back by the MIME encoder: mimeout_buf_count
   entries of mimeout_buf are in use (one slot beyond MIMEOUT_BUF_LENGTH is
   reserved). */
5557 #define MIMEOUT_BUF_LENGTH (60)
5558 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
5559 int mimeout_buf_count = 0;
/*
 * Begin a MIME encoded word for output mode `mode`.
 * The preamble is taken from the mime_pattern entry whose mime_encode value
 * equals mode (mime_pattern[0] if none matches), and mimeout_mode is set
 * from mime_encode_method. Leading whitespace held in mimeout_buf is written
 * out unencoded via (*o_mputc); the remaining buffered characters are fed
 * back through mime_putc after the buffer has been reset.
 * NOTE(review): the preamble output, line-folding output and the loop
 * headers around some statements are not visible in this listing.
 */
5561 void open_mime(nkf_char mode)
5563 const unsigned char *p;
5566 p = mime_pattern[0];
5567 for(i=0;mime_pattern[i];i++) {
5568 if (mode == mime_encode[i]) {
5569 p = mime_pattern[i];
5573 mimeout_mode = mime_encode_method[i];
5575 if (base64_count>45) {
5576 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5577 (*o_mputc)(mimeout_buf[i]);
5583 if (mimeout_buf_count>0
5584 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5585 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5589 for (;i<mimeout_buf_count;i++) {
5590 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5591 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5592 (*o_mputc)(mimeout_buf[i]);
/* take a snapshot of the count and empty the buffer before re-feeding it */
5602 j = mimeout_buf_count;
5603 mimeout_buf_count = 0;
5605 mime_putc(mimeout_buf[i]);
/*
 * Finish the current MIME encoded word.
 * Depending on mimeout_mode, the bits still pending in b64c are flushed as
 * one final Base64 symbol (2 leftover bits shifted left by 4, or 4 leftover
 * bits shifted left by 2).
 * NOTE(review): the case labels, padding / terminator output and the
 * updates of mimeout_mode are not visible in this listing.
 */
5609 void close_mime(void)
5619 switch(mimeout_mode) {
5624 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5630 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5635 if (mimeout_mode > 0) {
5636 if (mimeout_f!=FIXED_MIME) {
5638 } else if (mimeout_mode != 'Q')
/*
 * Emit one character c in the encoding selected by mimeout_mode.
 * One branch writes non-alphanumeric characters as two hex digits (high
 * nibble first) via bin2hex. The Base64 branches write basis_64 symbols
 * built from the bits carried over in b64c combined with the upper bits
 * of c.
 * NOTE(review): the case labels, the escape introducer for the hex form,
 * and the updates of b64c / mimeout_mode / base64_count are not visible in
 * this listing.
 */
5643 void mimeout_addchar(nkf_char c)
5645 switch(mimeout_mode) {
5650 } else if(!nkf_isalnum(c)) {
5652 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5653 (*o_mputc)(bin2hex((c&0xf)));
/* first byte of a group: top 6 bits */
5662 (*o_mputc)(basis_64[c>>2]);
/* second byte: 2 carried bits + top 4 bits of c */
5667 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
/* third byte: 4 carried bits + top 2 bits of c, then the low 6 bits of c */
5673 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5674 (*o_mputc)(basis_64[c & 0x3F]);
5685 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * Line-length check performed before the character (c2, c1) is output in
 * MIME mode.
 * base64_count + mimeout_buf_count/3*4 is compared against a limit; when it
 * is exceeded, the sequence (EOF,0), (0,LF), (0,SP) is sent through
 * (*o_base64conv).
 * While an encoded word is open (mimeout_mode > 0) the limits 73 and 66 are
 * used. Otherwise, for c2 != EOF and a limit of 60, a new encoded word is
 * opened first ('Q' for ASCII / ISO8859_1 output, 'B' for anything else).
 * NOTE(review): the conditions choosing between the 73 and 66 limits are
 * not visible in this listing.
 */
5687 void mime_prechar(nkf_char c2, nkf_char c1)
5689 if (mimeout_mode > 0){
5691 if (base64_count + mimeout_buf_count/3*4> 73){
5692 (*o_base64conv)(EOF,0);
5693 (*o_base64conv)(0,LF);
5694 (*o_base64conv)(0,SP);
5698 if (base64_count + mimeout_buf_count/3*4> 66) {
5699 (*o_base64conv)(EOF,0);
5700 (*o_base64conv)(0,LF);
5701 (*o_base64conv)(0,SP);
5707 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5708 mimeout_mode = (output_mode==ASCII ||output_mode == ISO8859_1) ? 'Q' : 'B';
5709 open_mime(output_mode);
5710 (*o_base64conv)(EOF,0);
5711 (*o_base64conv)(0,LF);
5712 (*o_base64conv)(0,SP);
/*
 * Output one character c through the MIME encoder; c == EOF requests a
 * flush.
 * Visible structure:
 *  - mimeout_f == FIXED_MIME: line-length checks against base64_count > 71,
 *    with separate handling of c == EOF and c != EOF.
 *  - c == EOF otherwise: if no encoded word is open yet (mimeout_mode == -1)
 *    and more than one character is buffered, open_mime() is called; the
 *    buffer is then drained through mimeout_addchar.
 *  - mimeout_mode == 'Q', mimeout_mode <= 0, and the remaining modes
 *    ('B', 1, 2) each decide whether c is held in mimeout_buf, written
 *    unencoded via (*o_mputc), or encoded via mimeout_addchar.
 *    The buffer is flushed once mimeout_buf_count exceeds
 *    MIMEOUT_BUF_LENGTH.
 * NOTE(review): many lines (loop headers, else branches, returns) are not
 * visible in this listing, so the exact control flow between the visible
 * statements must be checked against the full source.
 */
5719 void mime_putc(nkf_char c)
5724 if (mimeout_f == FIXED_MIME){
5725 if (mimeout_mode == 'Q'){
5726 if (base64_count > 71){
5727 if (c!=CR && c!=LF) {
5734 if (base64_count > 71){
5739 if (c == EOF) { /* c==EOF */
5743 if (c != EOF) { /* c!=EOF */
5749 /* mimeout_f != FIXED_MIME */
5751 if (c == EOF) { /* c==EOF */
5752 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
5753 j = mimeout_buf_count;
5754 mimeout_buf_count = 0;
5756 if (mimeout_mode > 0) {
5757 if (!nkf_isblank(mimeout_buf[j-1])) {
5759 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5762 mimeout_addchar(mimeout_buf[i]);
5766 mimeout_addchar(mimeout_buf[i]);
5770 mimeout_addchar(mimeout_buf[i]);
5776 mimeout_addchar(mimeout_buf[i]);
5782 if (mimeout_buf_count > 0){
5783 lastchar = mimeout_buf[mimeout_buf_count - 1];
5788 if (mimeout_mode=='Q') {
5789 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5790 if (c == CR || c == LF) {
5795 } else if (c <= SP) {
5797 if (base64_count > 70) {
5801 if (!nkf_isblank(c)) {
5806 if (base64_count > 70) {
5811 open_mime(output_mode);
5813 if (!nkf_noescape_mime(c)) {
5824 if (mimeout_mode <= 0) {
5825 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5826 if (nkf_isspace(c)) {
5828 if (mimeout_mode == -1) {
5831 if (c==CR || c==LF) {
5833 open_mime(output_mode);
5839 for (i=0;i<mimeout_buf_count;i++) {
5840 (*o_mputc)(mimeout_buf[i]);
5841 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
5852 mimeout_buf[0] = (char)c;
5853 mimeout_buf_count = 1;
5855 if (base64_count > 1
5856 && base64_count + mimeout_buf_count > 76
5857 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5860 if (!nkf_isspace(mimeout_buf[0])){
5865 mimeout_buf[mimeout_buf_count++] = (char)c;
5866 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5867 open_mime(output_mode);
5872 if (lastchar==CR || lastchar == LF){
5873 for (i=0;i<mimeout_buf_count;i++) {
5874 (*o_mputc)(mimeout_buf[i]);
5877 mimeout_buf_count = 0;
5880 for (i=0;i<mimeout_buf_count-1;i++) {
5881 (*o_mputc)(mimeout_buf[i]);
5884 mimeout_buf[0] = SP;
5885 mimeout_buf_count = 1;
5887 open_mime(output_mode);
5890 /* mimeout_mode == 'B', 1, 2 */
5891 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5892 if (lastchar == CR || lastchar == LF){
5893 if (nkf_isblank(c)) {
5894 for (i=0;i<mimeout_buf_count;i++) {
5895 mimeout_addchar(mimeout_buf[i]);
5897 mimeout_buf_count = 0;
5898 } else if (SP<c && c<DEL) {
5900 for (i=0;i<mimeout_buf_count;i++) {
5901 (*o_mputc)(mimeout_buf[i]);
5904 mimeout_buf_count = 0;
5906 mimeout_buf[mimeout_buf_count++] = (char)c;
5909 if (c==SP || c==TAB || c==CR || c==LF) {
5910 for (i=0;i<mimeout_buf_count;i++) {
5911 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5913 for (i=0;i<mimeout_buf_count;i++) {
5914 (*o_mputc)(mimeout_buf[i]);
5917 mimeout_buf_count = 0;
5920 mimeout_buf[mimeout_buf_count++] = (char)c;
5921 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5923 for (i=0;i<mimeout_buf_count;i++) {
5924 (*o_mputc)(mimeout_buf[i]);
5927 mimeout_buf_count = 0;
5931 if (mimeout_buf_count>0 && SP<c && c!='=') {
5932 mimeout_buf[mimeout_buf_count++] = (char)c;
5933 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5934 j = mimeout_buf_count;
5935 mimeout_buf_count = 0;
5937 mimeout_addchar(mimeout_buf[i]);
5944 if (mimeout_buf_count>0) {
5945 j = mimeout_buf_count;
5946 mimeout_buf_count = 0;
5948 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
5950 mimeout_addchar(mimeout_buf[i]);
5956 (*o_mputc)(mimeout_buf[i]);
5958 open_mime(output_mode);
5968 struct input_code *p = input_code_list;
5981 mime_f = MIME_DECODE_DEFAULT;
5982 mime_decode_f = FALSE;
5987 x0201_f = X0201_DEFAULT;
5988 iso2022jp_f = FALSE;
5989 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5990 ms_ucs_map_f = UCS_MAP_ASCII;
5992 #ifdef UTF8_INPUT_ENABLE
5993 no_cp932ext_f = FALSE;
5994 no_best_fit_chars_f = FALSE;
5995 encode_fallback = NULL;
5996 unicode_subchar = '?';
5997 input_endian = ENDIAN_BIG;
5999 #ifdef UTF8_OUTPUT_ENABLE
6000 output_bom_f = FALSE;
6001 output_endian = ENDIAN_BIG;
6003 #ifdef UNICODE_NORMALIZATION
6019 #ifdef SHIFTJIS_CP932
6029 for (i = 0; i < 256; i++){
6030 prefix_table[i] = 0;
6034 mimeout_buf_count = 0;
6039 fold_preserve_f = FALSE;
6042 kanji_intro = DEFAULT_J;
6043 ascii_intro = DEFAULT_R;
6044 fold_margin = FOLD_MARGIN;
6045 output_conv = DEFAULT_CONV;
6046 oconv = DEFAULT_CONV;
6047 o_zconv = no_connection;
6048 o_fconv = no_connection;
6049 o_nlconv = no_connection;
6050 o_rot_conv = no_connection;
6051 o_hira_conv = no_connection;
6052 o_base64conv = no_connection;
6053 o_iso2022jp_check_conv = no_connection;
6056 i_ungetc = std_ungetc;
6058 i_bungetc = std_ungetc;
6061 i_mungetc = std_ungetc;
6062 i_mgetc_buf = std_getc;
6063 i_mungetc_buf = std_ungetc;
6064 output_mode = ASCII;
6067 mime_decode_mode = FALSE;
6075 z_prev2=0,z_prev1=0;
6077 iconv_for_check = 0;
6079 input_codename = NULL;
/*
 * Two-argument form of the "module connection failure" handler: forwards
 * to no_connection2 with 0 as the third argument and discards its result.
 */
6085 void no_connection(nkf_char c2, nkf_char c1)
6087 no_connection2(c2,c1,0);
/*
 * Report an internal module connection failure on stderr.
 * None of the arguments is used by the visible statements.
 * The trailing return 0 exists to keep lint quiet.
 * NOTE(review): the line between the fprintf and the return is not visible
 * in this listing; it may terminate the program -- confirm.
 */
6090 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6092 fprintf(stderr,"nkf internal module connection failure.\n");
6094 return 0; /* LINT */
6099 #define fprintf dllprintf
6103 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6104 fprintf(stderr,"Flags:\n");
6105 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6106 #ifdef DEFAULT_CODE_SJIS
6107 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6109 #ifdef DEFAULT_CODE_JIS
6110 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6112 #ifdef DEFAULT_CODE_EUC
6113 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6115 #ifdef DEFAULT_CODE_UTF8
6116 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6118 #ifdef UTF8_OUTPUT_ENABLE
6119 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6121 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6122 #ifdef UTF8_INPUT_ENABLE
6123 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6125 fprintf(stderr,"t no conversion\n");
6126 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6127 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6128 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6129 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6130 fprintf(stderr,"v Show this usage. V: show version\n");
6131 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6132 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6133 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6134 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6135 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6136 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6137 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6138 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6139 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6141 fprintf(stderr,"T Text mode output\n");
6143 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6144 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6145 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6146 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6147 fprintf(stderr,"\n");
6148 fprintf(stderr,"Long name options\n");
6149 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6150 fprintf(stderr," Specify the input or output codeset\n");
6151 fprintf(stderr," --fj --unix --mac --windows\n");
6152 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6153 fprintf(stderr," Convert for the system or code\n");
6154 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6155 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6156 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6158 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6160 #ifdef NUMCHAR_OPTION
6161 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6163 #ifdef UTF8_INPUT_ENABLE
6164 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6165 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6168 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6169 fprintf(stderr," Overwrite original listed files by filtered result\n");
6170 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6172 fprintf(stderr," -g --guess Guess the input code\n");
6173 fprintf(stderr," --help --version Show this help/the version\n");
6174 fprintf(stderr," For more information, see also man nkf\n");
6175 fprintf(stderr,"\n");
/*
 * Print a summary of the compile-time configuration to stderr: the nkf
 * version and release date, the default encoding selected by whichever
 * DEFAULT_CODE_* macro is defined, and whether MIME decoding and
 * JIS X 0201 Katakana conversion are enabled by default.
 * NOTE(review): the string emitted for each DEFAULT_CODE_* branch is not
 * visible in this listing.
 */
6179 void show_configuration(void)
6181 fprintf(stderr, "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n");
6182 fprintf(stderr, " Compile-time options:\n");
6183 fprintf(stderr, " Default encoding: "
6184 #if defined(DEFAULT_CODE_JIS)
6186 #elif defined(DEFAULT_CODE_SJIS)
6188 #elif defined(DEFAULT_CODE_EUC)
6190 #elif defined(DEFAULT_CODE_UTF8)
6196 fprintf(stderr, " Decode MIME encoded string: %s\n", MIME_DECODE_DEFAULT ? "ON" : "OFF");
6197 fprintf(stderr, " Convert JIS X 0201 Katakana: %s\n", X0201_DEFAULT ? "ON" : "OFF");
6203 fprintf(stderr,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");