1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 * nkf is currently maintained on SourceForge:
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 /* $Id: nkf.c,v 1.147 2007/11/03 08:02:49 naruse Exp $ */
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2007-11-03"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
42 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
44 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
60 #if defined(MSDOS) || defined(__OS2__)
63 #if defined(_MSC_VER) || defined(__WATCOMC__)
64 #define mktemp _mktemp
70 #define setbinmode(fp) fsetbin(fp)
71 #elif defined(__DJGPP__)
72 #include <libc/dosio.h>
73 #define setbinmode(fp) djgpp_setbinmode(fp)
74 #else /* Microsoft C, Turbo C */
75 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
78 #define setbinmode(fp)
81 #if defined(__DJGPP__)
82 void djgpp_setbinmode(FILE *fp)
84 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
87 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
88 __file_handle_set(fd, m);
92 #ifdef _IOFBF /* SysV and MSDOS, Windows */
93 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
95 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
98 /*Borland C++ 4.5 EasyWin*/
99 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
108 /* added by satoru@isoternet.org */
110 #include <sys/types.h>
112 #include <sys/stat.h>
113 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
115 #if defined(__WATCOMC__)
116 #include <sys/utime.h>
120 #else /* defined(MSDOS) */
122 #ifdef __BORLANDC__ /* BCC32 */
124 #else /* !defined(__BORLANDC__) */
125 #include <sys/utime.h>
126 #endif /* (__BORLANDC__) */
127 #else /* !defined(__WIN32__) */
128 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
129 #include <sys/utime.h>
130 #elif defined(__TURBOC__) /* BCC */
132 #elif defined(LSI_C) /* LSI C */
133 #endif /* (__WIN32__) */
141 /* state of output_mode and input_mode
158 #define X0213_1 0x284F
159 #define X0213_2 0x2850
161 /* Input Assumption */
166 #define LATIN1_INPUT 6
168 #define STRICT_MIME 8
173 #define JAPANESE_EUC 10
177 #define UTF8_INPUT 13
178 #define UTF16_INPUT 1015
179 #define UTF32_INPUT 1017
183 #define ENDIAN_BIG 1234
184 #define ENDIAN_LITTLE 4321
185 #define ENDIAN_2143 2143
186 #define ENDIAN_3412 3412
/*
 * Locale-independent ASCII classification and conversion helpers.
 *
 * Every parameter is parenthesized so that an arbitrary expression may be
 * passed (e.g. bin2hex(a | b), hex2bin(x ? y : z)) without operator
 * precedence changing the meaning of the expansion.
 *
 * NOTE: these are still macros and evaluate their argument more than once;
 * do not pass expressions with side effects (c++, getc(), ...).
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the high byte of the 16-bit value c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/*
 * True for CR, LF, and for characters strictly between SP and DEL other
 * than '?', '=', '_', '.' and '"' (0x22).
 */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '.') && ((c) != 0x22)))
/*
 * Lead-byte ranges used with the CP932 conversion tables.
 * CP932_TABLE_BEGIN..CP932_TABLE_END bounds the range tested by
 * is_ibmext_in_sjis(); the CP932INV_* pair bounds the inverse table.
 */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END   0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/*
 * True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END].
 * The argument is parenthesized so expressions such as (v & 0xFF) work;
 * it is evaluated twice, so it must not have side effects.
 */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
236 #define HOLD_SIZE 1024
237 #if defined(INT_IS_SHORT)
238 #define IOBUF_SIZE 2048
240 #define IOBUF_SIZE 16384
243 #define DEFAULT_J 'B'
244 #define DEFAULT_R 'B'
246 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
247 #define SJ6394 0x0161 /* 63 - 94 ku offset */
249 #define RANGE_NUM_MAX 18
254 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
255 #define sizeof_euc_to_utf8_1byte 94
256 #define sizeof_euc_to_utf8_2bytes 94
257 #define sizeof_utf8_to_euc_C2 64
258 #define sizeof_utf8_to_euc_E5B8 64
259 #define sizeof_utf8_to_euc_2bytes 112
260 #define sizeof_utf8_to_euc_3bytes 16
263 /* MIME preprocessor */
265 #ifdef EASYWIN /*Easy Win */
266 extern POINT _BufferSize;
275 void (*status_func)(struct input_code *, nkf_char);
276 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
280 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
283 static const char *CopyRight = COPY_RIGHT;
285 #if !defined(PERL_XS) && !defined(WIN32DLL)
286 static nkf_char noconvert(FILE *f);
288 static void module_connection(void);
289 static nkf_char kanji_convert(FILE *f);
290 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
291 static nkf_char push_hold_buf(nkf_char c2);
292 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
293 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
294 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
295 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
296 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
298 * 0: Shift_JIS, eucJP-ascii
303 #define UCS_MAP_ASCII 0
305 #define UCS_MAP_CP932 2
306 #define UCS_MAP_CP10001 3
307 static int ms_ucs_map_f = UCS_MAP_ASCII;
309 #ifdef UTF8_INPUT_ENABLE
310 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
311 static int no_cp932ext_f = FALSE;
312 /* ignore ZERO WIDTH NO-BREAK SPACE */
313 static int no_best_fit_chars_f = FALSE;
314 static int input_endian = ENDIAN_BIG;
315 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
316 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
317 static void encode_fallback_html(nkf_char c);
318 static void encode_fallback_xml(nkf_char c);
319 static void encode_fallback_java(nkf_char c);
320 static void encode_fallback_perl(nkf_char c);
321 static void encode_fallback_subchar(nkf_char c);
322 static void (*encode_fallback)(nkf_char c) = NULL;
323 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
324 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
325 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
326 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
327 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
328 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
329 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
330 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
331 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
332 static void w_status(struct input_code *, nkf_char);
334 #ifdef UTF8_OUTPUT_ENABLE
335 static int output_bom_f = FALSE;
336 static int output_endian = ENDIAN_BIG;
337 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
338 static void w_oconv(nkf_char c2,nkf_char c1);
339 static void w_oconv16(nkf_char c2,nkf_char c1);
340 static void w_oconv32(nkf_char c2,nkf_char c1);
342 static void e_oconv(nkf_char c2,nkf_char c1);
343 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
344 static void s_oconv(nkf_char c2,nkf_char c1);
345 static void j_oconv(nkf_char c2,nkf_char c1);
346 static void fold_conv(nkf_char c2,nkf_char c1);
347 static void nl_conv(nkf_char c2,nkf_char c1);
348 static void z_conv(nkf_char c2,nkf_char c1);
349 static void rot_conv(nkf_char c2,nkf_char c1);
350 static void hira_conv(nkf_char c2,nkf_char c1);
351 static void base64_conv(nkf_char c2,nkf_char c1);
352 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
353 static void no_connection(nkf_char c2,nkf_char c1);
354 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
356 static void code_score(struct input_code *ptr);
357 static void code_status(nkf_char c);
359 static void std_putc(nkf_char c);
360 static nkf_char std_getc(FILE *f);
361 static nkf_char std_ungetc(nkf_char c,FILE *f);
363 static nkf_char broken_getc(FILE *f);
364 static nkf_char broken_ungetc(nkf_char c,FILE *f);
366 static nkf_char mime_begin(FILE *f);
367 static nkf_char mime_getc(FILE *f);
368 static nkf_char mime_ungetc(nkf_char c,FILE *f);
370 static void switch_mime_getc(void);
371 static void unswitch_mime_getc(void);
372 static nkf_char mime_begin_strict(FILE *f);
373 static nkf_char mime_getc_buf(FILE *f);
374 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
375 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
377 static nkf_char base64decode(nkf_char c);
378 static void mime_prechar(nkf_char c2, nkf_char c1);
379 static void mime_putc(nkf_char c);
380 static void open_mime(nkf_char c);
381 static void close_mime(void);
382 static void eof_mime(void);
383 static void mimeout_addchar(nkf_char c);
385 static void usage(void);
386 static void version(void);
388 static void options(unsigned char *c);
389 static void reinit(void);
393 #if !defined(PERL_XS) && !defined(WIN32DLL)
394 static unsigned char stdibuf[IOBUF_SIZE];
395 static unsigned char stdobuf[IOBUF_SIZE];
397 static unsigned char hold_buf[HOLD_SIZE*2];
398 static int hold_count = 0;
400 /* MIME preprocessor fifo */
/*
 * MIME preprocessor buffer.  Fifo(n) masks its index with MIME_BUF_MASK
 * (MIME_BUF_SIZE-1), so indices wrap around inside mime_buf; this only
 * works while MIME_BUF_SIZE stays a power of two.
 */
402 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
403 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
404 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
405 static unsigned char mime_buf[MIME_BUF_SIZE];
/* NOTE(review): the three counters below look like free-running positions
 * meant to be used through Fifo() -- confirm against mime_getc()/mime_ungetc(). */
406 static unsigned int mime_top = 0;
407 static unsigned int mime_last = 0; /* decoded */
408 static unsigned int mime_input = 0; /* undecoded */
/* NOTE(review): presumably the input converter saved while MIME decoding is
 * switched in (see switch_mime_getc/unswitch_mime_getc) -- confirm. */
409 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/*
 * File-scope option flags with their default values.
 * NOTE(review): flags without an original comment are described only
 * tentatively below; their uses are not visible in this part of the file.
 */
412 static int unbuf_f = FALSE; /* NOTE(review): presumably requests unbuffered output -- confirm */
413 static int estab_f = FALSE; /* NOTE(review): presumably "input code established" -- confirm */
414 static int nop_f = FALSE; /* NOTE(review): presumably "no conversion" -- confirm */
415 static int binmode_f = TRUE; /* binary mode; main() tests this before putting the streams into binary mode */
416 static int rot_f = FALSE; /* rot14/43 mode (sic; NOTE(review): presumably ROT13/47 -- confirm against rot_conv) */
417 static int hira_f = FALSE; /* hiragana/katakana conversion */
418 static int input_f = FALSE; /* input code: FALSE = not fixed; options() stores e.g. JIS_INPUT, SJIS_INPUT, EUC_INPUT for --ic= */
419 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
420 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
421 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
422 static int mimebuf_f = FALSE; /* MIME buffered input */
423 static int broken_f = FALSE; /* convert ESC-less broken JIS */
424 static int iso8859_f = FALSE; /* ISO8859 through */
425 static int mimeout_f = FALSE; /* base64 mode */
426 #if defined(MSDOS) || defined(__OS2__)
427 static int x0201_f = TRUE; /* Assume JISX0201 kana */
429 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
431 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
433 #ifdef UNICODE_NORMALIZATION
434 static int nfc_f = FALSE;
435 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
436 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
437 static nkf_char nfc_getc(FILE *f);
438 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
442 static int cap_f = FALSE;
443 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
444 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
445 static nkf_char cap_getc(FILE *f);
446 static nkf_char cap_ungetc(nkf_char c,FILE *f);
448 static int url_f = FALSE;
449 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
450 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
451 static nkf_char url_getc(FILE *f);
452 static nkf_char url_ungetc(nkf_char c,FILE *f);
455 #if defined(INT_IS_SHORT)
456 #define NKF_INT32_C(n) (n##L)
458 #define NKF_INT32_C(n) (n)
/*
 * Bit layout used when a Unicode code point is carried inside an nkf_char:
 * the bits selected by CLASS_MASK hold a class tag (CLASS_UNICODE) and the
 * bits selected by VALUE_MASK hold the value.  All constants go through
 * NKF_INT32_C so they are wide enough when int is 16 bits.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
/* True when the class bits of c equal CLASS_UNICODE.  The argument is
 * parenthesized so that any expression may be passed safely. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the value bits of c do not exceed 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
468 #ifdef NUMCHAR_OPTION
469 static int numchar_f = FALSE;
470 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
471 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
472 static nkf_char numchar_getc(FILE *f);
473 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
477 static int noout_f = FALSE;
478 static void no_putc(nkf_char c);
479 static int debug_f = FALSE;
480 static void debug(const char *str);
481 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
484 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
486 static void print_guessed_code(char *filename);
488 static void set_input_codename(char *codename);
491 static int exec_f = 0;
494 #ifdef SHIFTJIS_CP932
495 /* invert IBM extended characters to others */
496 static int cp51932_f = FALSE;
498 /* invert NEC-selected IBM extended characters to IBM extended characters */
499 static int cp932inv_f = TRUE;
501 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
502 #endif /* SHIFTJIS_CP932 */
505 static int x0212_f = FALSE;
506 static nkf_char x0212_shift(nkf_char c);
507 static nkf_char x0212_unshift(nkf_char c);
509 static int x0213_f = FALSE;
511 static unsigned char prefix_table[256];
513 static void set_code_score(struct input_code *ptr, nkf_char score);
514 static void clr_code_score(struct input_code *ptr, nkf_char score);
515 static void status_disable(struct input_code *ptr);
516 static void status_push_ch(struct input_code *ptr, nkf_char c);
517 static void status_clear(struct input_code *ptr);
518 static void status_reset(struct input_code *ptr);
519 static void status_reinit(struct input_code *ptr);
520 static void status_check(struct input_code *ptr, nkf_char c);
521 static void e_status(struct input_code *, nkf_char);
522 static void s_status(struct input_code *, nkf_char);
524 struct input_code input_code_list[] = {
525 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
526 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
527 #ifdef UTF8_INPUT_ENABLE
528 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
529 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
530 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
535 static int mimeout_mode = 0;
536 static int base64_count = 0;
538 /* X0208 -> ASCII converter */
541 static int f_line = 0; /* chars in line */
542 static int f_prev = 0;
543 static int fold_preserve_f = FALSE; /* preserve new lines */
544 static int fold_f = FALSE;
545 static int fold_len = 0;
548 static unsigned char kanji_intro = DEFAULT_J;
549 static unsigned char ascii_intro = DEFAULT_R;
553 #define FOLD_MARGIN 10
554 #define DEFAULT_FOLD 60
556 static int fold_margin = FOLD_MARGIN;
560 #ifdef DEFAULT_CODE_JIS
561 # define DEFAULT_CONV j_oconv
563 #ifdef DEFAULT_CODE_SJIS
564 # define DEFAULT_CONV s_oconv
566 #ifdef DEFAULT_CODE_EUC
567 # define DEFAULT_CONV e_oconv
569 #ifdef DEFAULT_CODE_UTF8
570 # define DEFAULT_CONV w_oconv
573 /* process default */
574 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
576 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
577 /* s_iconv or oconv */
578 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
580 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
581 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
582 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
584 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
585 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
586 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
588 /* static redirections */
590 static void (*o_putc)(nkf_char c) = std_putc;
592 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
593 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
595 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
596 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
598 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
600 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
601 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
603 /* for strict mime */
604 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
605 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
608 static int output_mode = ASCII, /* output kanji mode */
609 input_mode = ASCII, /* input kanji mode */
610 shift_mode = FALSE; /* TRUE shift out, or X0201 */
611 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
613 /* X0201 / X0208 conversion tables */
615 /* X0201 kana conversion table */
617 static const unsigned char cv[]= {
618 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
619 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
620 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
621 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
622 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
623 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
624 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
625 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
626 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
627 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
628 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
629 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
630 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
631 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
632 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
633 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
637 /* X0201 kana conversion table for dakuten (voiced sound mark) */
639 static const unsigned char dv[]= {
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
645 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
646 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
647 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
648 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
649 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
650 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
651 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
658 /* X0201 kana conversion table for handakuten (semi-voiced sound mark) */
660 static const unsigned char ev[]= {
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
672 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
674 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
680 /* X0208 kigou conversion table */
681 /* 0x8140 - 0x819e */
682 static const unsigned char fv[] = {
684 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
685 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
686 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
687 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
688 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
689 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
690 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
691 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
692 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
694 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
695 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
700 static int file_out_f = FALSE;
702 static int overwrite_f = FALSE;
703 static int preserve_time_f = FALSE;
704 static int backup_f = FALSE;
705 static char *backup_suffix = "";
706 static char *get_backup_filename(const char *suffix, const char *filename);
709 static int nlmode_f = 0; /* CR, LF, CRLF */
710 static int input_nextline = 0; /* 0: unestablished, EOF: MIXED */
711 static nkf_char prev_cr = 0; /* CR or 0 */
712 #ifdef EASYWIN /*Easy Win */
713 static int end_check;
716 #define STD_GC_BUFSIZE (256)
717 nkf_char std_gc_buf[STD_GC_BUFSIZE];
721 #include "nkf32dll.c"
722 #elif defined(PERL_XS)
724 int main(int argc, char **argv)
729 char *outfname = NULL;
732 #ifdef EASYWIN /*Easy Win */
733 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
736 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
737 cp = (unsigned char *)*argv;
741 int debug_f_back = debug_f;
744 int exec_f_back = exec_f;
747 int x0212_f_back = x0212_f;
750 int x0213_f_back = x0213_f;
752 int guess_f_back = guess_f;
754 guess_f = guess_f_back;
757 debug_f = debug_f_back;
760 exec_f = exec_f_back;
763 x0212_f = x0212_f_back;
766 x0213_f = x0213_f_back;
772 if (pipe(fds) < 0 || (pid = fork()) < 0){
783 execvp(argv[1], &argv[1]);
797 if(x0201_f == WISH_TRUE)
798 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
800 if (binmode_f == TRUE)
801 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
802 if (freopen("","wb",stdout) == NULL)
809 setbuf(stdout, (char *) NULL);
811 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
814 if (binmode_f == TRUE)
815 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
816 if (freopen("","rb",stdin) == NULL) return (-1);
820 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
824 kanji_convert(stdin);
825 if (guess_f) print_guessed_code(NULL);
829 int is_argument_error = FALSE;
831 input_codename = NULL;
836 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
839 is_argument_error = TRUE;
847 /* reopen file for stdout */
848 if (file_out_f == TRUE) {
851 outfname = malloc(strlen(origfname)
852 + strlen(".nkftmpXXXXXX")
858 strcpy(outfname, origfname);
862 for (i = strlen(outfname); i; --i){
863 if (outfname[i - 1] == '/'
864 || outfname[i - 1] == '\\'){
870 strcat(outfname, "ntXXXXXX");
872 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
875 strcat(outfname, ".nkftmpXXXXXX");
876 fd = mkstemp(outfname);
879 || (fd_backup = dup(fileno(stdout))) < 0
880 || dup2(fd, fileno(stdout)) < 0
891 outfname = "nkf.out";
894 if(freopen(outfname, "w", stdout) == NULL) {
898 if (binmode_f == TRUE) {
899 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
900 if (freopen("","wb",stdout) == NULL)
907 if (binmode_f == TRUE)
908 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
909 if (freopen("","rb",fin) == NULL)
914 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
918 char *filename = NULL;
920 if (nfiles > 1) filename = origfname;
921 if (guess_f) print_guessed_code(filename);
927 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
935 if (dup2(fd_backup, fileno(stdout)) < 0){
938 if (stat(origfname, &sb)) {
939 fprintf(stderr, "Can't stat %s\n", origfname);
941 /*
\e$B%Q!<%_%C%7%g%s$rI|85
\e(B */
942 if (chmod(outfname, sb.st_mode)) {
943 fprintf(stderr, "Can't set permission %s\n", outfname);
946 /*
\e$B%?%$%`%9%?%s%W$rI|85
\e(B */
948 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
949 tb[0] = tb[1] = sb.st_mtime;
950 if (utime(outfname, tb)) {
951 fprintf(stderr, "Can't set timestamp %s\n", outfname);
954 tb.actime = sb.st_atime;
955 tb.modtime = sb.st_mtime;
956 if (utime(outfname, &tb)) {
957 fprintf(stderr, "Can't set timestamp %s\n", outfname);
962 char *backup_filename = get_backup_filename(backup_suffix, origfname);
964 unlink(backup_filename);
966 if (rename(origfname, backup_filename)) {
967 perror(backup_filename);
968 fprintf(stderr, "Can't rename %s to %s\n",
969 origfname, backup_filename);
973 if (unlink(origfname)){
978 if (rename(outfname, origfname)) {
980 fprintf(stderr, "Can't rename %s to %s\n",
981 outfname, origfname);
988 if (is_argument_error)
991 #ifdef EASYWIN /*Easy Win */
992 if (file_out_f == FALSE)
993 scanf("%d",&end_check);
996 #else /* for Other OS */
997 if (file_out_f == TRUE)
1002 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied verbatim
 * (suffix "old_*" with filename "a.txt" gives "old_a.txt").  If `suffix`
 * contains no '*', it is appended to `filename` ("a.txt" + ".bak").
 *
 * Returns a newly malloc()ed, NUL-terminated string owned by the caller,
 * or NULL after reporting through perror() when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t result_length;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each one-character '*' expands to filename_length characters */
        result_length = suffix_length - asterisk_count
            + asterisk_count * filename_length;
    } else {
        result_length = filename_length + suffix_length;
    }

    /*
     * One allocation, sized for the whole result plus the terminator.  The
     * plain-suffix case used to allocate a single byte ("malloc( + 1)") and
     * then copy the full name into it.
     */
    backup_filename = malloc(result_length + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
    } else {
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
    }
    backup_filename[result_length] = '\0';

    return backup_filename;
}
1045 static const struct {
1069 {"katakana-hiragana","h3"},
1077 #ifdef UTF8_OUTPUT_ENABLE
1087 {"fb-subchar=", ""},
1089 #ifdef UTF8_INPUT_ENABLE
1090 {"utf8-input", "W"},
1091 {"utf16-input", "W16"},
1092 {"no-cp932ext", ""},
1093 {"no-best-fit-chars",""},
1095 #ifdef UNICODE_NORMALIZATION
1096 {"utf8mac-input", ""},
1108 #ifdef NUMCHAR_OPTION
1109 {"numchar-input", ""},
1115 #ifdef SHIFTJIS_CP932
1125 static int option_mode = 0;
1127 void options(unsigned char *cp)
1131 unsigned char *cp_back = NULL;
1136 while(*cp && *cp++!='-');
1137 while (*cp || cp_back) {
1145 case '-': /* literal options */
1146 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1150 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1151 p = (unsigned char *)long_option[i].name;
1152 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1153 if (*p == cp[j] || cp[j] == SP){
1160 fprintf(stderr, "unknown long option: --%s\n", cp);
1163 while(*cp && *cp != SP && cp++);
1164 if (long_option[i].alias[0]){
1166 cp = (unsigned char *)long_option[i].alias;
1168 if (strcmp(long_option[i].name, "ic=") == 0){
1169 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1170 codeset[i] = nkf_toupper(p[i]);
1173 if(strcmp(codeset, "ISO-2022-JP") == 0){
1174 input_f = JIS_INPUT;
1175 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1176 strcmp(codeset, "CP50220") == 0 ||
1177 strcmp(codeset, "CP50221") == 0 ||
1178 strcmp(codeset, "CP50222") == 0){
1179 input_f = JIS_INPUT;
1180 #ifdef SHIFTJIS_CP932
1183 #ifdef UTF8_OUTPUT_ENABLE
1184 ms_ucs_map_f = UCS_MAP_CP932;
1186 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1187 input_f = JIS_INPUT;
1191 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1192 input_f = JIS_INPUT;
1197 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1198 input_f = SJIS_INPUT;
1199 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1200 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1201 strcmp(codeset, "CP932") == 0 ||
1202 strcmp(codeset, "MS932") == 0){
1203 input_f = SJIS_INPUT;
1204 #ifdef SHIFTJIS_CP932
1207 #ifdef UTF8_OUTPUT_ENABLE
1208 ms_ucs_map_f = UCS_MAP_CP932;
1210 }else if(strcmp(codeset, "CP10001") == 0){
1211 input_f = SJIS_INPUT;
1212 #ifdef SHIFTJIS_CP932
1215 #ifdef UTF8_OUTPUT_ENABLE
1216 ms_ucs_map_f = UCS_MAP_CP10001;
1218 }else if(strcmp(codeset, "EUCJP") == 0 ||
1219 strcmp(codeset, "EUC-JP") == 0){
1220 input_f = EUC_INPUT;
1221 }else if(strcmp(codeset, "CP51932") == 0){
1222 input_f = EUC_INPUT;
1223 #ifdef SHIFTJIS_CP932
1226 #ifdef UTF8_OUTPUT_ENABLE
1227 ms_ucs_map_f = UCS_MAP_CP932;
1229 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1230 strcmp(codeset, "EUCJP-MS") == 0 ||
1231 strcmp(codeset, "EUCJPMS") == 0){
1232 input_f = EUC_INPUT;
1233 #ifdef SHIFTJIS_CP932
1236 #ifdef UTF8_OUTPUT_ENABLE
1237 ms_ucs_map_f = UCS_MAP_MS;
1239 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1240 strcmp(codeset, "EUCJP-ASCII") == 0){
1241 input_f = EUC_INPUT;
1242 #ifdef SHIFTJIS_CP932
1245 #ifdef UTF8_OUTPUT_ENABLE
1246 ms_ucs_map_f = UCS_MAP_ASCII;
1248 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1249 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1250 input_f = SJIS_INPUT;
1252 #ifdef SHIFTJIS_CP932
1255 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1256 strcmp(codeset, "EUC-JIS-2004") == 0){
1257 input_f = EUC_INPUT;
1259 #ifdef SHIFTJIS_CP932
1262 #ifdef UTF8_INPUT_ENABLE
1263 }else if(strcmp(codeset, "UTF-8") == 0 ||
1264 strcmp(codeset, "UTF-8N") == 0 ||
1265 strcmp(codeset, "UTF-8-BOM") == 0){
1266 input_f = UTF8_INPUT;
1267 #ifdef UNICODE_NORMALIZATION
1268 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1269 strcmp(codeset, "UTF-8-MAC") == 0){
1270 input_f = UTF8_INPUT;
1273 }else if(strcmp(codeset, "UTF-16") == 0 ||
1274 strcmp(codeset, "UTF-16BE") == 0 ||
1275 strcmp(codeset, "UTF-16BE-BOM") == 0){
1276 input_f = UTF16_INPUT;
1277 input_endian = ENDIAN_BIG;
1278 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1279 strcmp(codeset, "UTF-16LE-BOM") == 0){
1280 input_f = UTF16_INPUT;
1281 input_endian = ENDIAN_LITTLE;
1282 }else if(strcmp(codeset, "UTF-32") == 0 ||
1283 strcmp(codeset, "UTF-32BE") == 0 ||
1284 strcmp(codeset, "UTF-32BE-BOM") == 0){
1285 input_f = UTF32_INPUT;
1286 input_endian = ENDIAN_BIG;
1287 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1288 strcmp(codeset, "UTF-32LE-BOM") == 0){
1289 input_f = UTF32_INPUT;
1290 input_endian = ENDIAN_LITTLE;
1293 fprintf(stderr, "unknown input encoding: %s\n", codeset);
1297 if (strcmp(long_option[i].name, "oc=") == 0){
1299 for (i=0; i < 16 && SP < p[i] && p[i] < DEL; i++){
1300 codeset[i] = nkf_toupper(p[i]);
1303 if(strcmp(codeset, "ISO-2022-JP") == 0){
1304 output_conv = j_oconv;
1305 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1306 output_conv = j_oconv;
1307 no_cp932ext_f = TRUE;
1308 #ifdef SHIFTJIS_CP932
1311 #ifdef UTF8_OUTPUT_ENABLE
1312 ms_ucs_map_f = UCS_MAP_CP932;
1314 }else if(strcmp(codeset, "CP50220") == 0){
1315 output_conv = j_oconv;
1317 #ifdef SHIFTJIS_CP932
1320 #ifdef UTF8_OUTPUT_ENABLE
1321 ms_ucs_map_f = UCS_MAP_CP932;
1323 }else if(strcmp(codeset, "CP50221") == 0){
1324 output_conv = j_oconv;
1325 #ifdef SHIFTJIS_CP932
1328 #ifdef UTF8_OUTPUT_ENABLE
1329 ms_ucs_map_f = UCS_MAP_CP932;
1331 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1332 output_conv = j_oconv;
1336 #ifdef SHIFTJIS_CP932
1339 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1340 output_conv = j_oconv;
1345 #ifdef SHIFTJIS_CP932
1348 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1349 output_conv = s_oconv;
1350 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1351 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1352 strcmp(codeset, "CP932") == 0 ||
1353 strcmp(codeset, "MS932") == 0){
1354 output_conv = s_oconv;
1355 #ifdef UTF8_OUTPUT_ENABLE
1356 ms_ucs_map_f = UCS_MAP_CP932;
1358 }else if(strcmp(codeset, "CP10001") == 0){
1359 output_conv = s_oconv;
1360 #ifdef UTF8_OUTPUT_ENABLE
1361 ms_ucs_map_f = UCS_MAP_CP10001;
1363 }else if(strcmp(codeset, "EUCJP") == 0 ||
1364 strcmp(codeset, "EUC-JP") == 0){
1365 output_conv = e_oconv;
1366 }else if(strcmp(codeset, "CP51932") == 0){
1367 output_conv = e_oconv;
1368 #ifdef SHIFTJIS_CP932
1371 #ifdef UTF8_OUTPUT_ENABLE
1372 ms_ucs_map_f = UCS_MAP_CP932;
1374 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1375 strcmp(codeset, "EUCJP-MS") == 0 ||
1376 strcmp(codeset, "EUCJPMS") == 0){
1377 output_conv = e_oconv;
1381 #ifdef UTF8_OUTPUT_ENABLE
1382 ms_ucs_map_f = UCS_MAP_MS;
1384 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1385 strcmp(codeset, "EUCJP-ASCII") == 0){
1386 output_conv = e_oconv;
1390 #ifdef UTF8_OUTPUT_ENABLE
1391 ms_ucs_map_f = UCS_MAP_ASCII;
1393 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1394 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1395 output_conv = s_oconv;
1397 #ifdef SHIFTJIS_CP932
1400 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1401 strcmp(codeset, "EUC-JIS-2004") == 0){
1402 output_conv = e_oconv;
1407 #ifdef SHIFTJIS_CP932
1410 #ifdef UTF8_OUTPUT_ENABLE
1411 }else if(strcmp(codeset, "UTF-8") == 0){
1412 output_conv = w_oconv;
1413 }else if(strcmp(codeset, "UTF-8N") == 0){
1414 output_conv = w_oconv;
1415 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1416 output_conv = w_oconv;
1417 output_bom_f = TRUE;
1418 }else if(strcmp(codeset, "UTF-16BE") == 0){
1419 output_conv = w_oconv16;
1420 }else if(strcmp(codeset, "UTF-16") == 0 ||
1421 strcmp(codeset, "UTF-16BE-BOM") == 0){
1422 output_conv = w_oconv16;
1423 output_bom_f = TRUE;
1424 }else if(strcmp(codeset, "UTF-16LE") == 0){
1425 output_conv = w_oconv16;
1426 output_endian = ENDIAN_LITTLE;
1427 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1428 output_conv = w_oconv16;
1429 output_endian = ENDIAN_LITTLE;
1430 output_bom_f = TRUE;
1431 }else if(strcmp(codeset, "UTF-32") == 0 ||
1432 strcmp(codeset, "UTF-32BE") == 0){
1433 output_conv = w_oconv32;
1434 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1435 output_conv = w_oconv32;
1436 output_bom_f = TRUE;
1437 }else if(strcmp(codeset, "UTF-32LE") == 0){
1438 output_conv = w_oconv32;
1439 output_endian = ENDIAN_LITTLE;
1440 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1441 output_conv = w_oconv32;
1442 output_endian = ENDIAN_LITTLE;
1443 output_bom_f = TRUE;
1446 fprintf(stderr, "unknown output encoding: %s\n", codeset);
1450 if (strcmp(long_option[i].name, "guess=") == 0){
1459 if (strcmp(long_option[i].name, "overwrite") == 0){
1462 preserve_time_f = TRUE;
1465 if (strcmp(long_option[i].name, "overwrite=") == 0){
1468 preserve_time_f = TRUE;
1470 backup_suffix = malloc(strlen((char *) p) + 1);
1471 strcpy(backup_suffix, (char *) p);
1474 if (strcmp(long_option[i].name, "in-place") == 0){
1477 preserve_time_f = FALSE;
1480 if (strcmp(long_option[i].name, "in-place=") == 0){
1483 preserve_time_f = FALSE;
1485 backup_suffix = malloc(strlen((char *) p) + 1);
1486 strcpy(backup_suffix, (char *) p);
1491 if (strcmp(long_option[i].name, "cap-input") == 0){
1495 if (strcmp(long_option[i].name, "url-input") == 0){
1500 #ifdef NUMCHAR_OPTION
1501 if (strcmp(long_option[i].name, "numchar-input") == 0){
1507 if (strcmp(long_option[i].name, "no-output") == 0){
1511 if (strcmp(long_option[i].name, "debug") == 0){
1516 if (strcmp(long_option[i].name, "cp932") == 0){
1517 #ifdef SHIFTJIS_CP932
1521 #ifdef UTF8_OUTPUT_ENABLE
1522 ms_ucs_map_f = UCS_MAP_CP932;
1526 if (strcmp(long_option[i].name, "no-cp932") == 0){
1527 #ifdef SHIFTJIS_CP932
1531 #ifdef UTF8_OUTPUT_ENABLE
1532 ms_ucs_map_f = UCS_MAP_ASCII;
1536 #ifdef SHIFTJIS_CP932
1537 if (strcmp(long_option[i].name, "cp932inv") == 0){
1544 if (strcmp(long_option[i].name, "x0212") == 0){
1551 if (strcmp(long_option[i].name, "exec-in") == 0){
1555 if (strcmp(long_option[i].name, "exec-out") == 0){
1560 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1561 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1562 no_cp932ext_f = TRUE;
1565 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1566 no_best_fit_chars_f = TRUE;
1569 if (strcmp(long_option[i].name, "fb-skip") == 0){
1570 encode_fallback = NULL;
1573 if (strcmp(long_option[i].name, "fb-html") == 0){
1574 encode_fallback = encode_fallback_html;
1577 if (strcmp(long_option[i].name, "fb-xml") == 0){
1578 encode_fallback = encode_fallback_xml;
1581 if (strcmp(long_option[i].name, "fb-java") == 0){
1582 encode_fallback = encode_fallback_java;
1585 if (strcmp(long_option[i].name, "fb-perl") == 0){
1586 encode_fallback = encode_fallback_perl;
1589 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1590 encode_fallback = encode_fallback_subchar;
1593 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1594 encode_fallback = encode_fallback_subchar;
1595 unicode_subchar = 0;
1597 /* decimal number */
1598 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1599 unicode_subchar *= 10;
1600 unicode_subchar += hex2bin(p[i]);
1602 }else if(p[1] == 'x' || p[1] == 'X'){
1603 /* hexadecimal number */
1604 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1605 unicode_subchar <<= 4;
1606 unicode_subchar |= hex2bin(p[i]);
1610 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1611 unicode_subchar *= 8;
1612 unicode_subchar += hex2bin(p[i]);
1615 w16e_conv(unicode_subchar, &i, &j);
1616 unicode_subchar = i<<8 | j;
1620 #ifdef UTF8_OUTPUT_ENABLE
1621 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1622 ms_ucs_map_f = UCS_MAP_MS;
1626 #ifdef UNICODE_NORMALIZATION
1627 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1628 input_f = UTF8_INPUT;
1633 if (strcmp(long_option[i].name, "prefix=") == 0){
1634 if (nkf_isgraph(p[0])){
1635 for (i = 1; nkf_isgraph(p[i]); i++){
1636 prefix_table[p[i]] = p[0];
1643 case 'b': /* buffered mode */
1646 case 'u': /* non bufferd mode */
1649 case 't': /* transparent mode */
1654 } else if (*cp=='2') {
1658 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1666 case 'j': /* JIS output */
1668 output_conv = j_oconv;
1670 case 'e': /* AT&T EUC output */
1671 output_conv = e_oconv;
1674 case 's': /* SJIS output */
1675 output_conv = s_oconv;
1677 case 'l': /* ISO8859 Latin-1 support, no conversion */
1678 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1679 input_f = LATIN1_INPUT;
1681 case 'i': /* Kanji IN ESC-$-@/B */
1682 if (*cp=='@'||*cp=='B')
1683 kanji_intro = *cp++;
1685 case 'o': /* ASCII IN ESC-(-J/B */
1686 if (*cp=='J'||*cp=='B'||*cp=='H')
1687 ascii_intro = *cp++;
1691 bit:1 katakana->hiragana
1692 bit:2 hiragana->katakana
1694 if ('9'>= *cp && *cp>='0')
1695 hira_f |= (*cp++ -'0');
1702 #if defined(MSDOS) || defined(__OS2__)
1717 #ifdef UTF8_OUTPUT_ENABLE
1718 case 'w': /* UTF-8 output */
1720 output_conv = w_oconv; cp++;
1724 output_bom_f = TRUE;
1727 if ('1'== cp[0] && '6'==cp[1]) {
1728 output_conv = w_oconv16; cp+=2;
1729 } else if ('3'== cp[0] && '2'==cp[1]) {
1730 output_conv = w_oconv32; cp+=2;
1732 output_conv = w_oconv;
1737 output_endian = ENDIAN_LITTLE;
1738 } else if (cp[0] == 'B') {
1746 output_bom_f = TRUE;
1751 #ifdef UTF8_INPUT_ENABLE
1752 case 'W': /* UTF input */
1755 input_f = UTF8_INPUT;
1757 if ('1'== cp[0] && '6'==cp[1]) {
1759 input_f = UTF16_INPUT;
1760 input_endian = ENDIAN_BIG;
1761 } else if ('3'== cp[0] && '2'==cp[1]) {
1763 input_f = UTF32_INPUT;
1764 input_endian = ENDIAN_BIG;
1766 input_f = UTF8_INPUT;
1771 input_endian = ENDIAN_LITTLE;
1772 } else if (cp[0] == 'B') {
1778 /* Input code assumption */
1779 case 'J': /* JIS input */
1780 input_f = JIS_INPUT;
1782 case 'E': /* AT&T EUC input */
1783 input_f = EUC_INPUT;
1785 case 'S': /* MS Kanji input */
1786 input_f = SJIS_INPUT;
1787 if (x0201_f==NO_X0201) x0201_f=TRUE;
1789 case 'Z': /* Convert X0208 alphabet to asii */
1791 bit:0 Convert JIS X 0208 Alphabet to ASCII
1792 bit:1 Convert Kankaku to one space
1793 bit:2 Convert Kankaku to two spaces
1794 bit:3 Convert HTML Entity
1795 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1797 while ('0'<= *cp && *cp <='9') {
1798 alpha_f |= 1 << (*cp++ - '0');
1800 if (!alpha_f) alpha_f = 1;
1802 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1803 x0201_f = FALSE; /* No X0201->X0208 conversion */
1805 ESC-(-I in JIS, EUC, MS Kanji
1806 SI/SO in JIS, EUC, MS Kanji
1807 SSO in EUC, JIS, not in MS Kanji
1808 MS Kanji (0xa0-0xdf)
1810 ESC-(-I in JIS (0x20-0x5f)
1811 SSO in EUC (0xa0-0xdf)
1812 0xa0-0xd in MS Kanji (0xa0-0xdf)
1815 case 'X': /* Assume X0201 kana */
1816 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1819 case 'F': /* prserve new lines */
1820 fold_preserve_f = TRUE;
1821 case 'f': /* folding -f60 or -f */
1824 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1826 fold_len += *cp++ - '0';
1828 if (!(0<fold_len && fold_len<BUFSIZ))
1829 fold_len = DEFAULT_FOLD;
1833 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1835 fold_margin += *cp++ - '0';
1839 case 'm': /* MIME support */
1840 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1841 if (*cp=='B'||*cp=='Q') {
1842 mime_decode_mode = *cp++;
1843 mimebuf_f = FIXED_MIME;
1844 } else if (*cp=='N') {
1845 mime_f = TRUE; cp++;
1846 } else if (*cp=='S') {
1847 mime_f = STRICT_MIME; cp++;
1848 } else if (*cp=='0') {
1849 mime_decode_f = FALSE;
1850 mime_f = FALSE; cp++;
1853 case 'M': /* MIME output */
1856 mimeout_f = FIXED_MIME; cp++;
1857 } else if (*cp=='Q') {
1859 mimeout_f = FIXED_MIME; cp++;
1864 case 'B': /* Broken JIS support */
1866 bit:1 allow any x on ESC-(-x or ESC-$-x
1867 bit:2 reset to ascii on NL
1869 if ('9'>= *cp && *cp>='0')
1870 broken_f |= 1<<(*cp++ -'0');
1875 case 'O':/* for Output file */
1879 case 'c':/* add cr code */
1882 case 'd':/* delete cr code */
1885 case 'I': /* ISO-2022-JP output */
1888 case 'L': /* line mode */
1889 if (*cp=='u') { /* unix */
1890 nlmode_f = LF; cp++;
1891 } else if (*cp=='m') { /* mac */
1892 nlmode_f = CR; cp++;
1893 } else if (*cp=='w') { /* windows */
1894 nlmode_f = CRLF; cp++;
1895 } else if (*cp=='0') { /* no conversion */
1904 } else if (*cp == '0') {
1913 /* multiple options in one string are allowed for the Perl module */
1914 while(*cp && *cp++!='-');
1917 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
1918 /* bogus option but ignored */
/*
 * Search input_code_list for the entry whose iconv_func equals the given
 * converter.  set_iconv() reads the name field of the result.
 */
1924 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1927 struct input_code *p = input_code_list;
1929 if (iconv_func == p->iconv_func){
/*
 * Select the input converter.
 *   f          - establishment flag; callers in this file pass TRUE, FALSE,
 *                or -TRUE (force)
 *   iconv_func - converter to install (status_disable() passes 0)
 */
1938 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1940 #ifdef INPUT_CODE_FIX
1948 #ifdef INPUT_CODE_FIX
1949 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
/* once established, record the code name whenever the converter changed */
1955 if (estab_f && iconv_for_check != iconv){
1956 struct input_code *p = find_inputcode_byfunc(iconv);
1958 set_input_codename(p->name);
/* remember the converter the name was last reported for */
1961 iconv_for_check = iconv;
/*
 * Bit flags OR-ed into struct input_code.score by set_code_score().
 * Each flag is the previous one shifted left by one bit; when candidates
 * are compared, the one with the smaller score is chosen.
 */
1966 #define SCORE_L2 (1) /* JIS level-2 kanji */
1967 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1968 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1969 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character remapped via CP932 (IBM extended characters) */
1970 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
1971 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* non-existent character */
1972 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
1973 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1975 #define SCORE_INIT (SCORE_iMIME) /* value status_reset() assigns to score */
/*
 * Score flags looked up by code_score() with index (c2 & 0x0f) when
 * (c2 & 0x70) == 0x20.
 */
1977 static const char score_table_A0[] = {
1980 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1981 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score flags looked up by code_score() with index (c2 & 0x0f) when
 * (c2 & 0x70) == 0x70.
 */
1984 static const char score_table_F0[] = {
1985 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1986 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1987 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
1988 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* OR the given SCORE_* flag bits into ptr->score. */
1991 void set_code_score(struct input_code *ptr, nkf_char score)
1994 ptr->score |= score;
/* Clear the given SCORE_* flag bits from ptr->score. */
1998 void clr_code_score(struct input_code *ptr, nkf_char score)
2001 ptr->score &= ~score;
/*
 * Classify the character held in ptr->buf and add the matching SCORE_*
 * flag to ptr->score.  The first buffered byte (c2) drives the decision;
 * the second (c1) is only used for the e2w_conv() existence check.
 */
2005 void code_score(struct input_code *ptr)
2007 nkf_char c2 = ptr->buf[0];
2008 #ifdef UTF8_OUTPUT_ENABLE
2009 nkf_char c1 = ptr->buf[1];
2012 set_code_score(ptr, SCORE_ERROR);
2013 }else if (c2 == SSO){
2014 set_code_score(ptr, SCORE_KANA);
2015 }else if (c2 == 0x8f){
2016 set_code_score(ptr, SCORE_X0212);
2017 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv() returning 0 is scored as a non-existent character */
2018 }else if (!e2w_conv(c2, c1)){
2019 set_code_score(ptr, SCORE_NO_EXIST);
/* remaining cases are keyed on bits 4-6 of c2; low nibble indexes the tables */
2021 }else if ((c2 & 0x70) == 0x20){
2022 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2023 }else if ((c2 & 0x70) == 0x70){
2024 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2025 }else if ((c2 & 0x70) >= 0x50){
2026 set_code_score(ptr, SCORE_L2);
/*
 * Called by the *_status() detectors when a byte does not fit their
 * encoding.  If this candidate's converter is the currently selected
 * iconv, the selection is dropped via set_iconv(FALSE, 0).
 */
2030 void status_disable(struct input_code *ptr)
2035 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append c to ptr->buf and advance ptr->index. */
2038 void status_push_ch(struct input_code *ptr, nkf_char c)
2040 ptr->buf[ptr->index++] = c;
2043 void status_clear(struct input_code *ptr)
/* Reset candidate ptr; the score is put back to SCORE_INIT. */
2049 void status_reset(struct input_code *ptr)
2052 ptr->score = SCORE_INIT;
/* Re-initialise candidate ptr; the _file_stat field is zeroed. */
2055 void status_reinit(struct input_code *ptr)
2058 ptr->_file_stat = 0;
/*
 * Common handling for a byte that ends a character in a detector.
 * The visible branch acts only for c <= DEL while estab_f is set.
 */
2061 void status_check(struct input_code *ptr, nkf_char c)
2063 if (c <= DEL && estab_f){
/*
 * Shift_JIS detector: feed one input byte c to candidate ptr.
 * Lead bytes are sorted into ranges and buffered; a following byte is
 * accepted when it lies in 0x40-0x7e or 0x80-0xfc, after which the
 * buffered pair is rewritten in place by s2e_conv().  Any byte that fits
 * no range disables the candidate.
 */
2068 void s_status(struct input_code *ptr, nkf_char c)
2072 status_check(ptr, c);
2077 #ifdef NUMCHAR_OPTION
2078 }else if (is_unicode_capsule(c)){
/* 0xa1-0xdf: single byte, buffered as SSO + c */
2081 }else if (0xa1 <= c && c <= 0xdf){
2082 status_push_ch(ptr, SSO);
2083 status_push_ch(ptr, c);
/* first byte of a two-byte character */
2086 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2088 status_push_ch(ptr, c);
2089 }else if (0xed <= c && c <= 0xee){
2091 status_push_ch(ptr, c);
2092 #ifdef SHIFTJIS_CP932
2093 }else if (is_ibmext_in_sjis(c)){
2095 status_push_ch(ptr, c);
2096 #endif /* SHIFTJIS_CP932 */
2098 }else if (0xf0 <= c && c <= 0xfc){
2100 status_push_ch(ptr, c);
2101 #endif /* X0212_ENABLE */
2103 status_disable(ptr);
/* second byte: convert the buffered pair in place */
2107 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2108 status_push_ch(ptr, c);
2109 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2113 status_disable(ptr);
2117 #ifdef SHIFTJIS_CP932
2118 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2119 status_push_ch(ptr, c);
/* a successful (0) conversion here is flagged as a CP932 remapping */
2120 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2121 set_code_score(ptr, SCORE_CP932);
2126 #endif /* SHIFTJIS_CP932 */
2127 status_disable(ptr);
2130 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2131 status_push_ch(ptr, c);
2132 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2133 set_code_score(ptr, SCORE_CP932);
2136 status_disable(ptr);
/*
 * EUC detector: feed one input byte c to candidate ptr.
 * SSO, 0x8f and 0xa1-0xfe are accepted as first bytes; following bytes
 * must lie in 0xa1-0xfe.  Anything else disables the candidate.
 */
2142 void e_status(struct input_code *ptr, nkf_char c)
2146 status_check(ptr, c);
2151 #ifdef NUMCHAR_OPTION
2152 }else if (is_unicode_capsule(c)){
2155 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2157 status_push_ch(ptr, c);
/* 0x8f introduces the three-byte form */
2159 }else if (0x8f == c){
2161 status_push_ch(ptr, c);
2162 #endif /* X0212_ENABLE */
2164 status_disable(ptr);
2168 if (0xa1 <= c && c <= 0xfe){
2169 status_push_ch(ptr, c);
2173 status_disable(ptr);
2178 if (0xa1 <= c && c <= 0xfe){
2180 status_push_ch(ptr, c);
2182 status_disable(ptr);
2184 #endif /* X0212_ENABLE */
2188 #ifdef UTF8_INPUT_ENABLE
/*
 * UTF-8 detector: feed one input byte c to candidate ptr.
 * Lead bytes 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 are buffered; following
 * bytes must lie in 0x80-0xbf.  Anything else disables the candidate.
 */
2189 void w_status(struct input_code *ptr, nkf_char c)
2193 status_check(ptr, c);
2198 #ifdef NUMCHAR_OPTION
2199 }else if (is_unicode_capsule(c)){
2202 }else if (0xc0 <= c && c <= 0xdf){
2204 status_push_ch(ptr, c);
2205 }else if (0xe0 <= c && c <= 0xef){
2207 status_push_ch(ptr, c);
2208 }else if (0xf0 <= c && c <= 0xf4){
2210 status_push_ch(ptr, c);
2212 status_disable(ptr);
2217 if (0x80 <= c && c <= 0xbf){
2218 status_push_ch(ptr, c);
/* sequence complete: note whether it is EF BB BF, then convert in place */
2219 if (ptr->index > ptr->stat){
2220 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2221 && ptr->buf[2] == 0xbf);
2222 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2223 &ptr->buf[0], &ptr->buf[1]);
2230 status_disable(ptr);
2234 if (0x80 <= c && c <= 0xbf){
2235 if (ptr->index < ptr->stat){
2236 status_push_ch(ptr, c);
2241 status_disable(ptr);
/*
 * Feed byte c to the status_func of the candidates in input_code_list.
 * When a candidate has been picked as result and no encoding is
 * established yet, its converter is selected with set_iconv(TRUE, ...).
 */
2248 void code_status(nkf_char c)
2250 int action_flag = 1;
2251 struct input_code *result = 0;
2252 struct input_code *p = input_code_list;
2254 if (!p->status_func) {
2258 if (!p->status_func)
2260 (p->status_func)(p, c);
2263 }else if(p->stat == 0){
2274 if (result && !estab_f){
2275 set_iconv(TRUE, result->iconv_func);
2276 }else if (c <= DEL){
2277 struct input_code *ptr = input_code_list;
/*
 * Default input getter.  The visible path returns the most recently
 * pushed-back character from std_gc_buf.
 */
2287 nkf_char std_getc(FILE *f)
2290 return std_gc_buf[--std_gc_ndx];
/*
 * Default push-back: stores c on top of std_gc_buf.  The case where
 * std_gc_ndx has reached STD_GC_BUFSIZE is tested first.
 */
2296 nkf_char std_ungetc(nkf_char c, FILE *f)
2298 if (std_gc_ndx == STD_GC_BUFSIZE){
2301 std_gc_buf[std_gc_ndx++] = c;
2306 void std_putc(nkf_char c)
2313 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Pass-through mode: connects the I/O modules, then reads characters
 * with i_getc until EOF.
 */
2314 nkf_char noconvert(FILE *f)
2319 module_connection();
2320 while ((c = (*i_getc)(f)) != EOF)
/*
 * Build the conversion pipeline from the current option flags.
 * Output side: starting from output_conv, each enabled filter saves the
 * current oconv in its own o_* pointer and installs itself as oconv.
 * Input side: the same chaining is done for the i_getc / i_ungetc pair.
 * Finally the input converter is chosen from input_f.
 */
2327 void module_connection(void)
2329 oconv = output_conv;
2332 /* replace continuation module, from output side */
2334 /* output redirection */
2336 if (noout_f || guess_f){
2343 if (mimeout_f == TRUE) {
2344 o_base64conv = oconv; oconv = base64_conv;
2346 /* base64_count = 0; */
2349 if (nlmode_f || guess_f) {
2350 o_nlconv = oconv; oconv = nl_conv;
2353 o_rot_conv = oconv; oconv = rot_conv;
2356 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2359 o_hira_conv = oconv; oconv = hira_conv;
2362 o_fconv = oconv; oconv = fold_conv;
2365 if (alpha_f || x0201_f) {
2366 o_zconv = oconv; oconv = z_conv;
2370 i_ungetc = std_ungetc;
2371 /* input redirection */
2374 i_cgetc = i_getc; i_getc = cap_getc;
2375 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2378 i_ugetc = i_getc; i_getc = url_getc;
2379 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2382 #ifdef NUMCHAR_OPTION
2384 i_ngetc = i_getc; i_getc = numchar_getc;
2385 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2388 #ifdef UNICODE_NORMALIZATION
2389 if (nfc_f && input_f == UTF8_INPUT){
2390 i_nfc_getc = i_getc; i_getc = nfc_getc;
2391 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2394 if (mime_f && mimebuf_f==FIXED_MIME) {
2395 i_mgetc = i_getc; i_getc = mime_getc;
2396 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2399 i_bgetc = i_getc; i_getc = broken_getc;
2400 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* an explicitly requested input encoding is forced with -TRUE */
2402 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2403 set_iconv(-TRUE, e_iconv);
2404 } else if (input_f == SJIS_INPUT) {
2405 set_iconv(-TRUE, s_iconv);
2406 #ifdef UTF8_INPUT_ENABLE
2407 } else if (input_f == UTF8_INPUT) {
2408 set_iconv(-TRUE, w_iconv);
2409 } else if (input_f == UTF16_INPUT) {
2410 set_iconv(-TRUE, w_iconv16);
2411 } else if (input_f == UTF32_INPUT) {
2412 set_iconv(-TRUE, w_iconv32);
/* otherwise e_iconv is installed without being marked as established */
2415 set_iconv(FALSE, e_iconv);
2419 struct input_code *p = input_code_list;
2427 * Check and Ignore BOM
/*
 * Inspect the first bytes of f for a byte-order mark.  On a match the
 * corresponding UTF converter is selected with set_iconv(TRUE, ...), and
 * input_endian is set when that converter is the active iconv.  Bytes
 * that do not complete a mark are pushed back with i_ungetc in reverse
 * order of reading.
 */
2429 void check_bom(FILE *f)
2432 switch(c2 = (*i_getc)(f)){
/* first byte 0x00: 00 00 FE FF (UTF-32, ENDIAN_BIG) or 00 00 FF FE (ENDIAN_2143) */
2434 if((c2 = (*i_getc)(f)) == 0x00){
2435 if((c2 = (*i_getc)(f)) == 0xFE){
2436 if((c2 = (*i_getc)(f)) == 0xFF){
2438 set_iconv(TRUE, w_iconv32);
2440 if (iconv == w_iconv32) {
2441 input_endian = ENDIAN_BIG;
2444 (*i_ungetc)(0xFF,f);
2445 }else (*i_ungetc)(c2,f);
2446 (*i_ungetc)(0xFE,f);
2447 }else if(c2 == 0xFF){
2448 if((c2 = (*i_getc)(f)) == 0xFE){
2450 set_iconv(TRUE, w_iconv32);
2452 if (iconv == w_iconv32) {
2453 input_endian = ENDIAN_2143;
2456 (*i_ungetc)(0xFF,f);
2457 }else (*i_ungetc)(c2,f);
2458 (*i_ungetc)(0xFF,f);
2459 }else (*i_ungetc)(c2,f);
2460 (*i_ungetc)(0x00,f);
2461 }else (*i_ungetc)(c2,f);
2462 (*i_ungetc)(0x00,f);
/* first byte 0xEF: EF BB BF (UTF-8) */
2465 if((c2 = (*i_getc)(f)) == 0xBB){
2466 if((c2 = (*i_getc)(f)) == 0xBF){
2468 set_iconv(TRUE, w_iconv);
2470 if (iconv == w_iconv) {
2473 (*i_ungetc)(0xBF,f);
2474 }else (*i_ungetc)(c2,f);
2475 (*i_ungetc)(0xBB,f);
2476 }else (*i_ungetc)(c2,f);
2477 (*i_ungetc)(0xEF,f);
/* first byte 0xFE: FE FF 00 00 (UTF-32, ENDIAN_3412) or FE FF (UTF-16, ENDIAN_BIG) */
2480 if((c2 = (*i_getc)(f)) == 0xFF){
2481 if((c2 = (*i_getc)(f)) == 0x00){
2482 if((c2 = (*i_getc)(f)) == 0x00){
2484 set_iconv(TRUE, w_iconv32);
2486 if (iconv == w_iconv32) {
2487 input_endian = ENDIAN_3412;
2490 (*i_ungetc)(0x00,f);
2491 }else (*i_ungetc)(c2,f);
2492 (*i_ungetc)(0x00,f);
2493 }else (*i_ungetc)(c2,f);
2495 set_iconv(TRUE, w_iconv16);
2497 if (iconv == w_iconv16) {
2498 input_endian = ENDIAN_BIG;
2501 (*i_ungetc)(0xFF,f);
2502 }else (*i_ungetc)(c2,f);
2503 (*i_ungetc)(0xFE,f);
/* first byte 0xFF: FF FE 00 00 (UTF-32, ENDIAN_LITTLE) or FF FE (UTF-16, ENDIAN_LITTLE) */
2506 if((c2 = (*i_getc)(f)) == 0xFE){
2507 if((c2 = (*i_getc)(f)) == 0x00){
2508 if((c2 = (*i_getc)(f)) == 0x00){
2510 set_iconv(TRUE, w_iconv32);
2512 if (iconv == w_iconv32) {
2513 input_endian = ENDIAN_LITTLE;
2516 (*i_ungetc)(0x00,f);
2517 }else (*i_ungetc)(c2,f);
2518 (*i_ungetc)(0x00,f);
2519 }else (*i_ungetc)(c2,f);
2521 set_iconv(TRUE, w_iconv16);
2523 if (iconv == w_iconv16) {
2524 input_endian = ENDIAN_LITTLE;
2527 (*i_ungetc)(0xFE,f);
2528 }else (*i_ungetc)(c2,f);
2529 (*i_ungetc)(0xFF,f);
2538 Conversion main loop. Code detection only.
/*
 * Conversion main loop.  Connects the modules, then reads f one byte at
 * a time through i_getc.  8-bit bytes seen before an encoding is
 * established are handed to h_conv(); escape sequences, SI/SO, MIME
 * start sequences and line ends update input_mode / shift_mode; complete
 * characters go through iconv or oconv.  After the loop iconv is called
 * with EOF, and if no input code name has been set the candidate with
 * the lowest score in input_code_list is reported.
 */
2541 nkf_char kanji_convert(FILE *f)
2543 nkf_char c3, c2=0, c1, c0=0;
2544 int is_8bit = FALSE;
2546 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2547 #ifdef UTF8_INPUT_ENABLE
2548 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2555 output_mode = ASCII;
2558 #define NEXT continue /* no output, get next */
2559 #define SEND ; /* output c1 and c2, get next */
2560 #define LAST break /* end of loop, go closing */
2562 module_connection();
2565 while ((c1 = (*i_getc)(f)) != EOF) {
2566 #ifdef INPUT_CODE_FIX
2572 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2573 /* in case of 8th bit is on */
2574 if (!estab_f&&!mime_decode_mode) {
2575 /* in case of not established yet */
2576 /* It is still ambiguous */
2577 if (h_conv(f, c2, c1)==EOF)
2583 /* in case of already established */
2585 /* ignore bogus code and not CP5022x UCD */
2593 /* second byte, 7 bit code */
2594 /* it might be kanji shifted */
2595 if ((c1 == DEL) || (c1 <= SP)) {
2596 /* ignore bogus first code */
2603 #ifdef UTF8_INPUT_ENABLE
2604 if (iconv == w_iconv16) {
2605 if (input_endian == ENDIAN_BIG) {
2607 if ((c1 = (*i_getc)(f)) != EOF) {
2608 if (0xD8 <= c2 && c2 <= 0xDB) {
2609 if ((c0 = (*i_getc)(f)) != EOF) {
2611 if ((c3 = (*i_getc)(f)) != EOF) {
2618 if ((c2 = (*i_getc)(f)) != EOF) {
2619 if (0xD8 <= c2 && c2 <= 0xDB) {
2620 if ((c3 = (*i_getc)(f)) != EOF) {
2621 if ((c0 = (*i_getc)(f)) != EOF) {
2630 } else if(iconv == w_iconv32){
2632 if((c2 = (*i_getc)(f)) != EOF &&
2633 (c1 = (*i_getc)(f)) != EOF &&
2634 (c0 = (*i_getc)(f)) != EOF){
2635 switch(input_endian){
2637 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2640 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2643 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2646 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2656 #ifdef NUMCHAR_OPTION
2657 if (is_unicode_capsule(c1)){
2661 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2663 if (!estab_f && !iso8859_f) {
2664 /* not established yet */
2667 } else { /* estab_f==TRUE */
2672 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2673 /* SJIS X0201 Case... */
2674 if(iso2022jp_f && x0201_f==NO_X0201) {
2675 (*oconv)(GETA1, GETA2);
2682 } else if (c1==SSO && iconv != s_iconv) {
2683 /* EUC X0201 Case */
2684 c1 = (*i_getc)(f); /* skip SSO */
2686 if (SSP<=c1 && c1<0xe0) {
2687 if(iso2022jp_f && x0201_f==NO_X0201) {
2688 (*oconv)(GETA1, GETA2);
2695 } else { /* bogus code, skip SSO and one byte */
2698 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2699 (c1 == 0xFD || c1 == 0xFE)) {
2705 /* already established */
2710 } else if ((c1 > SP) && (c1 != DEL)) {
2711 /* in case of Roman characters */
2713 /* output 1 shifted byte */
2717 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2718 /* output 1 shifted byte */
2719 if(iso2022jp_f && x0201_f==NO_X0201) {
2720 (*oconv)(GETA1, GETA2);
2727 /* look like bogus code */
2730 } else if (input_mode == X0208 || input_mode == X0212 ||
2731 input_mode == X0213_1 || input_mode == X0213_2) {
2732 /* in case of Kanji shifted */
2735 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2736 /* Check MIME code */
2737 if ((c1 = (*i_getc)(f)) == EOF) {
2740 } else if (c1 == '?') {
2741 /* =? is mime conversion start sequence */
2742 if(mime_f == STRICT_MIME) {
2743 /* check in real detail */
2744 if (mime_begin_strict(f) == EOF)
2748 } else if (mime_begin(f) == EOF)
2758 /* normal ASCII code */
2761 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2764 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2767 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2768 if ((c1 = (*i_getc)(f)) == EOF) {
2769 /* (*oconv)(0, ESC); don't send bogus code */
2771 } else if (c1 == '$') {
2772 if ((c1 = (*i_getc)(f)) == EOF) {
2774 (*oconv)(0, ESC); don't send bogus code
2775 (*oconv)(0, '$'); */
2777 } else if (c1 == '@'|| c1 == 'B') {
2778 /* This is kanji introduction */
2781 set_input_codename("ISO-2022-JP");
2783 debug("ISO-2022-JP");
2786 } else if (c1 == '(') {
2787 if ((c1 = (*i_getc)(f)) == EOF) {
2788 /* don't send bogus code
2794 } else if (c1 == '@'|| c1 == 'B') {
2795 /* This is kanji introduction */
2800 } else if (c1 == 'D'){
2804 #endif /* X0212_ENABLE */
2805 } else if (c1 == (X0213_1&0x7F)){
2806 input_mode = X0213_1;
2809 } else if (c1 == (X0213_2&0x7F)){
2810 input_mode = X0213_2;
2814 /* could be some special code */
2821 } else if (broken_f&0x2) {
2822 /* accept any ESC-(-x as broken code ... */
2832 } else if (c1 == '(') {
2833 if ((c1 = (*i_getc)(f)) == EOF) {
2834 /* don't send bogus code
2836 (*oconv)(0, '('); */
2840 /* This is X0201 kana introduction */
2841 input_mode = X0201; shift_mode = X0201;
2843 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2844 /* This is X0208 kanji introduction */
2845 input_mode = ASCII; shift_mode = FALSE;
2847 } else if (broken_f&0x2) {
2848 input_mode = ASCII; shift_mode = FALSE;
2853 /* maintain various input_mode here */
2857 } else if ( c1 == 'N' || c1 == 'n'){
2859 c3 = (*i_getc)(f); /* skip SS2 */
2860 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2875 } else if (c1 == ESC && iconv == s_iconv) {
2876 /* ESC in Shift_JIS */
2877 if ((c1 = (*i_getc)(f)) == EOF) {
2878 /* (*oconv)(0, ESC); don't send bogus code */
2880 } else if (c1 == '$') {
2882 if ((c1 = (*i_getc)(f)) == EOF) {
2884 (*oconv)(0, ESC); don't send bogus code
2885 (*oconv)(0, '$'); */
2888 if (('E' <= c1 && c1 <= 'G') ||
2889 ('O' <= c1 && c1 <= 'Q')) {
2897 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2898 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2899 while ((c1 = (*i_getc)(f)) != EOF) {
2900 if (SP <= c1 && c1 <= 'z') {
2901 (*oconv)(0, c1 + c0);
2902 } else break; /* c1 == SO */
2906 if (c1 == EOF) LAST;
2913 } else if (c1 == LF || c1 == CR) {
2915 input_mode = ASCII; set_iconv(FALSE, 0);
2917 } else if (mime_decode_f && !mime_decode_mode){
2919 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
2927 } else { /* if (c1 == CR)*/
2928 if ((c1=(*i_getc)(f))!=EOF) {
2932 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
2946 } else if (c1 == DEL && input_mode == X0208) {
2956 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2959 if ((c0 = (*i_getc)(f)) != EOF) {
2962 if ((c3 = (*i_getc)(f)) != EOF) {
2964 (*iconv)(c2, c1, c0|c3);
2969 /* 3 bytes EUC or UTF-8 */
2970 if ((c0 = (*i_getc)(f)) != EOF) {
2972 (*iconv)(c2, c1, c0);
2980 0x7F <= c2 && c2 <= 0x92 &&
2981 0x21 <= c1 && c1 <= 0x7E) {
2983 if(c1 == 0x7F) return 0;
2984 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2987 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2991 (*oconv)(PREFIX_EUCG3 | c2, c1);
2993 #endif /* X0212_ENABLE */
2995 (*oconv)(PREFIX_EUCG3 | c2, c1);
2998 (*oconv)(input_mode, c1); /* other special case */
3004 /* goto next_word */
3008 (*iconv)(EOF, 0, 0);
3009 if (!input_codename)
3012 struct input_code *p = input_code_list;
3013 struct input_code *result = p;
3015 if (p->score < result->score) result = p;
3018 set_input_codename(result->name);
3020 debug(result->name);
/*
 * Called from kanji_convert() when an 8-bit byte pair arrives before the
 * input encoding is established.  Further input is collected with
 * push_hold_buf() until EOF, establishment, or a full buffer.  If still
 * undecided, the candidate with the lowest score is selected.  The held
 * bytes are then replayed through iconv, reading extra bytes from f when
 * a character needs more than the buffer holds.
 */
3028 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3030 nkf_char ret, c3, c0;
3034 /** it must NOT be in the kanji shifted sequence */
3035 /** it must NOT be written in JIS7 */
3036 /** and it must be after 2 byte 8bit code */
3042 while ((c1 = (*i_getc)(f)) != EOF) {
3048 if (push_hold_buf(c1) == EOF || estab_f){
3054 struct input_code *p = input_code_list;
3055 struct input_code *result = p;
3060 if (p->status_func && p->score < result->score){
3065 set_iconv(TRUE, result->iconv_func);
3070 ** 1) EOF is detected, or
3071 ** 2) Code is established, or
3072 ** 3) Buffer is FULL (but last word is pushed)
3074 ** in 1) and 3) cases, we continue to use
3075 ** Kanji codes by oconv and leave estab_f unchanged.
3080 while (hold_index < hold_count){
3081 c2 = hold_buf[hold_index++];
3083 #ifdef NUMCHAR_OPTION
3084 || is_unicode_capsule(c2)
3089 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3090 (*iconv)(X0201, c2, 0);
3093 if (hold_index < hold_count){
3094 c1 = hold_buf[hold_index++];
3104 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3107 if (hold_index < hold_count){
3108 c0 = hold_buf[hold_index++];
3109 } else if ((c0 = (*i_getc)(f)) == EOF) {
3115 if (hold_index < hold_count){
3116 c3 = hold_buf[hold_index++];
3117 } else if ((c3 = (*i_getc)(f)) == EOF) {
3122 (*iconv)(c2, c1, c0|c3);
3127 /* 3 bytes EUC or UTF-8 */
3128 if (hold_index < hold_count){
3129 c0 = hold_buf[hold_index++];
3130 } else if ((c0 = (*i_getc)(f)) == EOF) {
3136 (*iconv)(c2, c1, c0);
3139 if (c0 == EOF) break;
/*
 * Append one byte (c2 truncated to unsigned char) to hold_buf.
 * The final return yields EOF once hold_count has reached HOLD_SIZE*2,
 * otherwise the new hold_count.
 */
3144 nkf_char push_hold_buf(nkf_char c2)
3146 if (hold_count >= HOLD_SIZE*2)
3148 hold_buf[hold_count++] = (unsigned char)c2;
3149 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert the Shift_JIS byte pair (c2, c1); callers pass p2/p1 as the
 * destinations for the converted pair.  Table lookups handle the
 * CP932 / IBM extension rows and, with x0213_f, the 0xF0+ lead bytes;
 * everything else uses the arithmetic mapping.  s_status() treats a
 * return of 0 as success and s_iconv() passes a non-zero return on.
 */
3152 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3154 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3157 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3158 #ifdef SHIFTJIS_CP932
3159 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3160 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3167 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3168 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3174 #endif /* SHIFTJIS_CP932 */
3176 if (!x0213_f && is_ibmext_in_sjis(c2)){
3177 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3180 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3193 if(x0213_f && c2 >= 0xF0){
3194 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3195 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3196 }else{ /* 78<=k<=94 */
3197 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3198 if (0x9E < c1) c2++;
/* arithmetic mapping: each lead byte covers two rows, second row when c1 > 0x9E */
3201 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3202 if (0x9E < c1) c2++;
3205 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3212 c2 = x0212_unshift(c2);
/*
 * s_iconv: input converter taking up to three code units (c2, c1, c0).
 * Presumably the Shift_JIS input path, since it delegates to s2e_conv --
 * TODO confirm.  A non-zero result from s2e_conv is returned as is.
 */
3219 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* EOF, zero, or anything below SP in c2 takes its own branch. */
3223 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
/* Without x0213_f, lead bytes 0xF0..0xF9 with trail bytes 0x40..0xFC are mapped arithmetically. */
3225 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
/* 0x7F is rejected as a trail byte. */
3227 if(c1 == 0x7F) return 0;
/*
 * Each lead byte spans 188 cells: trail bytes 0x40..0xFC minus 0x7F, with
 * (0x7E < c1) closing the gap left by 0x7F.  The cell index is offset by
 * 0xE000, the first code point of the Unicode Private Use Area, and
 * tagged with CLASS_UNICODE.
 */
3228 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
/* Remaining pairs are converted in place by s2e_conv. */
3231 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3232 if (ret) return ret;
/*
 * e_iconv: input converter taking up to three code units (c2, c1, c0).
 * Presumably the EUC input path (see the eucJP-ms comments in the body) --
 * TODO confirm.
 */
3238 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* Lead byte 0x8f: three-byte form, with c1 and c0 carrying the code. */
3243 }else if (c2 == 0x8f){
3247 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3248 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
/*
 * 94 cells per row.  The base 0xE3AC equals 0xE000 + 10*94, i.e. this
 * range starts right after the ten rows that the two-byte branch of
 * this function maps from 0xE000.
 */
3249 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
/* Pack the 0x8f prefix and the 7-bit row into c2. */
3252 c2 = (c2 << 8) | (c1 & 0x7f);
3254 #ifdef SHIFTJIS_CP932
/* Round trip through e2s_conv and s2e_conv; presumably normalizes the code via the CP932 tables -- TODO confirm. */
3257 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3258 s2e_conv(s2, s1, &c2, &c1);
3265 #endif /* SHIFTJIS_CP932 */
3267 #endif /* X0212_ENABLE */
/* Lead byte SSO has its own branch. */
3268 } else if (c2 == SSO){
/* EOF, zero, or anything below SP in c2 takes its own branch. */
3271 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3274 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3275 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
/* Rows 0xF5..0xFE, 94 cells each, mapped linearly from 0xE000. */
3276 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3281 #ifdef SHIFTJIS_CP932
/* With cp51932_f, lead values 0x79..0x7c get the same e2s_conv/s2e_conv round trip. */
3282 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3284 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3285 s2e_conv(s2, s1, &c2, &c1);
3292 #endif /* SHIFTJIS_CP932 */
3299 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv: convert a UTF-8 byte sequence (c2 = lead byte, c1, c0 = the
 * following bytes) through unicode_to_jis_common, writing to *p2 / *p1.
 */
3300 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
/* Lead bytes 0xc0..0xef (two- and three-byte sequences) go through the table lookup. */
3307 }else if (0xc0 <= c2 && c2 <= 0xef) {
3308 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3309 #ifdef NUMCHAR_OPTION
/*
 * Stores the decoded code point tagged with CLASS_UNICODE when p1 is
 * non-NULL.  Presumably a fallback for a failed lookup -- the guarding
 * condition is not visible here, TODO confirm.
 */
3312 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * w_iconv: UTF-8 input converter.  c2 is the lead byte, c1 the second
 * byte; c0 holds the remaining byte, or two bytes packed into 16 bits
 * for four-byte sequences (see the 0xc0c0 masks below).
 * Returns -1 or -2 when c0 is still 0 for a three- or four-byte lead;
 * presumably this asks the caller for that many more bytes -- TODO
 * confirm against the caller.
 */
3320 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * Class of each lead byte 0xC0..0xFF, indexed by c2 - 0xC0:
 *   20: C0,C1   21: C2..DF
 *   30: E0      31: E1..EC   32: ED   33: EE,EF
 *   40: F0      41: F1..F3   42: F4   43: F5..F7
 *   50: F8..FB  60: FC,FD    70: FE,FF
 * The classes are consumed by the switch below.
 */
3323 static const char w_iconv_utf8_1st_byte[] =
3325 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3326 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3327 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3328 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
/* Values outside 0..0xff are not single bytes and bypass the byte checks. */
3330 if (c2 < 0 || 0xff < c2) {
3331 }else if (c2 == 0) { /* 0 : 1 byte*/
3333 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3336 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
/* Two-byte form: the second byte must be a trail byte 0x80..0xBF. */
3338 if (c1 < 0x80 || 0xBF < c1) return 0;
/* Three-byte forms: c0 == 0 means the third byte has not been supplied yet. */
3341 if (c0 == 0) return -1;
/* Second byte limited to 0xA0..0xBF; presumably class 30 (lead 0xE0), which excludes overlong forms. */
3342 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3347 if (c0 == 0) return -1;
/* General three-byte case: both following bytes must be trail bytes. */
3348 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3352 if (c0 == 0) return -1;
/* Second byte limited to 0x80..0x9F; presumably class 32 (lead 0xED), which excludes surrogates. */
3353 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
/* Four-byte forms: c0 carries two bytes, so 0xc0c0/0x8080 tests both trail bytes at once. */
3357 if (c0 == 0) return -2;
/* Second byte limited to 0x90..0xBF; presumably class 40 (lead 0xF0). */
3358 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3362 if (c0 == 0) return -2;
/* General four-byte case. */
3363 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3367 if (c0 == 0) return -2;
/* Second byte limited to 0x80..0x8F; presumably class 42 (lead 0xF4), capping at U+10FFFF. */
3368 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
/* After validation: zero and EOF in c2 have their own branch. */
3376 if (c2 == 0 || c2 == EOF){
3377 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
/* Four-byte sequences are decoded to a code point and tagged with CLASS_UNICODE. */
3378 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/* Everything else is converted in place by w2e_conv. */
3381 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3390 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the code point val as UTF-8 bytes.
 *   *p2: lead byte, *p1: second byte,
 *   *p0: third byte, or third and fourth bytes packed as (third << 8) | fourth.
 */
3391 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
/* Below 0x800: two-byte form, 110xxxxx 10xxxxxx. */
3398 }else if (val < 0x800){
3399 *p2 = 0xc0 | (val >> 6);
3400 *p1 = 0x80 | (val & 0x3f);
/* Up to 0xFFFF: three-byte form, 1110xxxx 10xxxxxx 10xxxxxx. */
3402 } else if (val <= NKF_INT32_C(0xFFFF)) {
3403 *p2 = 0xe0 | (val >> 12);
3404 *p1 = 0x80 | ((val >> 6) & 0x3f);
3405 *p0 = 0x80 | (val & 0x3f);
/* Up to 0x10FFFF: four bytes, with the last two packed into *p0. */
3406 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/*
 * NOTE(review): a four-byte UTF-8 lead byte is 0xF0 | (val >> 18), and
 * ww16_conv decodes four-byte input only when c2 >= 0xf0, using
 * (c2 & 0x0f) << 18.  The value built here is 0xe0 | (val >> 16), which
 * does not match either -- confirm whether this is intended.
 */
3407 *p2 = 0xe0 | (val >> 16);
3408 *p1 = 0x80 | ((val >> 12) & 0x3f);
/* (val << 2) & 0x3f00 moves bits 6..11 into the high byte; val & 0x3f is the low byte; 0x8080 marks both as trail bytes. */
3409 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3418 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: decode a UTF-8 byte sequence into a code point value.
 *   c2: lead byte, c1: second byte, c0: third byte, or third and fourth
 *   bytes packed into 16 bits for four-byte sequences.
 */
3419 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3424 } else if (c2 >= 0xf0){
3425 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
/* Payload bits of the lead byte, the second byte and the third byte (high byte of c0). */
3426 val = (c2 & 0x0f) << 18;
3427 val |= (c1 & 0x3f) << 12;
/* Bits 8..13 of c0 are shifted down to bits 6..11 of the result. */
3428 val |= (c0 & 0x3f00) >> 2;
/* Three-byte lead: four payload bits from c2, six from c1. */
3430 }else if (c2 >= 0xe0){
3431 val = (c2 & 0x0f) << 12;
3432 val |= (c1 & 0x3f) << 6;
/* Two-byte lead: five payload bits from c2. */
3434 }else if (c2 >= 0xc0){
3435 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv: convert the code point val by encoding it with w16w_conv and
 * passing the bytes to unicode_to_jis_common, which writes to *p2 / *p1.
 */
3443 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
/* Scratch bytes filled by w16w_conv. */
3445 nkf_char c2, c1, c0;
3452 w16w_conv(val, &c2, &c1, &c0);
3453 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3454 #ifdef NUMCHAR_OPTION
/*
 * Stores the raw code point tagged with CLASS_UNICODE.  Presumably a
 * fallback for a failed lookup -- the guarding condition is not visible
 * here, TODO confirm.
 */
3457 *p1 = CLASS_UNICODE | val;
3466 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: UTF-16 input converter.  c2 and c1 are the high and low
 * bytes of a 16-bit unit; c0 holds the following 16-bit unit when c2/c1
 * form a high surrogate.  A non-zero conversion result is returned as is.
 */
3467 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
/* Values below 0x80 (c2 == 0) and EOF have their own branch. */
3470 if ((c2==0 && c1 < 0x80) || c2==EOF) {
/* High byte 0xD8..0xDB: c2/c1 form a high surrogate. */
3473 }else if (0xD8 <= c2 && c2 <= 0xDB) {
/* The following unit must be a low surrogate, 0xDC00..0xDFFF. */
3474 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/*
 * Combine the pair: with H = (c2 << 8) | c1 the code point is
 * 0x10000 + ((H - 0xD800) << 10) + (c0 - 0xDC00).  Folding the constants
 * gives (0xD800 << 10) - 0x10000 + 0xDC00 = 0x35FDC00.
 */
3476 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
/* c2 >> 3 == 27 covers 0xD8..0xDF; 0xD8..0xDB was handled above, so this catches 0xDC..0xDF. */
3478 }else if ((c2>>3) == 27) { /* unpaired surrogate */
/* Any other unit: rebuild the 16-bit value and convert it. */
3483 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3484 if (ret) return ret;
/*
 * w_iconv32: UTF-32 input converter.  The code point arrives in c1 (it is
 * what gets tested by is_unicode_bmp and passed to w16e_conv).  A non-zero
 * conversion result is returned as is.
 */
3489 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/* Values below 0x80 (c2 == 0) and EOF have their own branch. */
3493 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
/* Code points accepted by is_unicode_bmp are converted in place by w16e_conv. */
3494 } else if (is_unicode_bmp(c1)) {
3495 ret = w16e_conv(c1, &c2, &c1);
/* Otherwise the code point is kept as is and tagged with CLASS_UNICODE. */
3498 c1 = CLASS_UNICODE | c1;
3500 if (ret) return ret;
3505 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3507 const unsigned short *const *pp;
3508 const unsigned short *const *const *ppp;
3509 static const char no_best_fit_chars_table_C2[] =
3510 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3511 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3512 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3513 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3514 static const char no_best_fit_chars_table_C2_ms[] =
3515 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3516 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3517 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3518 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3519 static const char no_best_fit_chars_table_932_C2[] =
3520 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3521 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3522 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3523 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3524 static const char no_best_fit_chars_table_932_C3[] =
3525 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3526 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3527 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3528 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3534 }else if(c2 < 0xe0){
3535 if(no_best_fit_chars_f){
3536 if(ms_ucs_map_f == UCS_MAP_CP932){
3539 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3542 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3545 }else if(!cp932inv_f){
3548 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3551 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3554 }else if(ms_ucs_map_f == UCS_MAP_MS){