1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.127 2007/07/19 20:08:29 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2007-07-20"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * Locale-independent ASCII character classification and conversion macros.
 *
 * Every use of the argument is parenthesized so that expression arguments
 * (e.g. `nkf_isdigit(x & 0x7f)`) are grouped as the caller intended.
 * The argument is still evaluated more than once, so do not pass an
 * expression with side effects (e.g. `nkf_toupper(*p++)`).
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (' '<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit; yields 0 for a non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0 )
/* True when the high byte of the 16-bit value c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* Lead-byte bounds used by the CP932 and inverse-CP932 conversion tables. */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END 0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END 0xEE
/*
 * True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END].
 * The argument is parenthesized so expression arguments group correctly;
 * it is evaluated twice, so it must not have side effects.
 */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
356 #define UCS_MAP_ASCII 0
358 #define UCS_MAP_CP932 2
359 #define UCS_MAP_CP10001 3
360 static int ms_ucs_map_f = UCS_MAP_ASCII;
362 #ifdef UTF8_INPUT_ENABLE
363 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
364 static int no_cp932ext_f = FALSE;
365 /* ignore ZERO WIDTH NO-BREAK SPACE */
366 static int no_best_fit_chars_f = FALSE;
367 static int input_endian = ENDIAN_BIG;
368 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
369 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
370 static void encode_fallback_html(nkf_char c);
371 static void encode_fallback_xml(nkf_char c);
372 static void encode_fallback_java(nkf_char c);
373 static void encode_fallback_perl(nkf_char c);
374 static void encode_fallback_subchar(nkf_char c);
375 static void (*encode_fallback)(nkf_char c) = NULL;
376 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
377 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
379 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
380 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
381 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
382 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
383 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
384 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
385 static void w_status(struct input_code *, nkf_char);
387 #ifdef UTF8_OUTPUT_ENABLE
388 static int output_bom_f = FALSE;
389 static int output_endian = ENDIAN_BIG;
390 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
391 static void w_oconv(nkf_char c2,nkf_char c1);
392 static void w_oconv16(nkf_char c2,nkf_char c1);
393 static void w_oconv32(nkf_char c2,nkf_char c1);
395 static void e_oconv(nkf_char c2,nkf_char c1);
396 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
397 static void s_oconv(nkf_char c2,nkf_char c1);
398 static void j_oconv(nkf_char c2,nkf_char c1);
399 static void fold_conv(nkf_char c2,nkf_char c1);
400 static void cr_conv(nkf_char c2,nkf_char c1);
401 static void z_conv(nkf_char c2,nkf_char c1);
402 static void rot_conv(nkf_char c2,nkf_char c1);
403 static void hira_conv(nkf_char c2,nkf_char c1);
404 static void base64_conv(nkf_char c2,nkf_char c1);
405 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
406 static void no_connection(nkf_char c2,nkf_char c1);
407 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
409 static void code_score(struct input_code *ptr);
410 static void code_status(nkf_char c);
412 static void std_putc(nkf_char c);
413 static nkf_char std_getc(FILE *f);
414 static nkf_char std_ungetc(nkf_char c,FILE *f);
416 static nkf_char broken_getc(FILE *f);
417 static nkf_char broken_ungetc(nkf_char c,FILE *f);
419 static nkf_char mime_begin(FILE *f);
420 static nkf_char mime_getc(FILE *f);
421 static nkf_char mime_ungetc(nkf_char c,FILE *f);
423 static void switch_mime_getc(void);
424 static void unswitch_mime_getc(void);
425 static nkf_char mime_begin_strict(FILE *f);
426 static nkf_char mime_getc_buf(FILE *f);
427 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
428 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
430 static nkf_char base64decode(nkf_char c);
431 static void mime_prechar(nkf_char c2, nkf_char c1);
432 static void mime_putc(nkf_char c);
433 static void open_mime(nkf_char c);
434 static void close_mime(void);
435 static void eof_mime(void);
436 static void mimeout_addchar(nkf_char c);
438 static void usage(void);
439 static void version(void);
441 static void options(unsigned char *c);
442 #if defined(PERL_XS) || defined(WIN32DLL)
443 static void reinit(void);
448 #if !defined(PERL_XS) && !defined(WIN32DLL)
449 static unsigned char stdibuf[IOBUF_SIZE];
450 static unsigned char stdobuf[IOBUF_SIZE];
452 static unsigned char hold_buf[HOLD_SIZE*2];
453 static int hold_count = 0;
455 /* MIME preprocessor fifo */
457 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
458 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
459 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
460 static unsigned char mime_buf[MIME_BUF_SIZE];
461 static unsigned int mime_top = 0;
462 static unsigned int mime_last = 0; /* decoded */
463 static unsigned int mime_input = 0; /* undecoded */
464 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* File-scope option and mode flags with their initial values. */
467 static int unbuf_f = FALSE;
468 static int estab_f = FALSE;
469 static int nop_f = FALSE;
470 static int binmode_f = TRUE; /* binary mode */
471 static int rot_f = FALSE; /* ROT13/47 mode */
472 static int hira_f = FALSE; /* hiragana/katakana conversion */
473 static int input_f = FALSE; /* FALSE: input code not fixed; otherwise one of the *_INPUT codes */
474 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
475 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
476 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
477 static int mimebuf_f = FALSE; /* MIME buffered input */
478 static int broken_f = FALSE; /* convert ESC-less broken JIS */
479 static int iso8859_f = FALSE; /* ISO8859 through */
480 static int mimeout_f = FALSE; /* base64 mode */
481 #if defined(MSDOS) || defined(__OS2__)
482 static int x0201_f = TRUE; /* Assume JISX0201 kana */
484 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
486 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
488 #ifdef UNICODE_NORMALIZATION
489 static int nfc_f = FALSE;
490 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
491 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
492 static nkf_char nfc_getc(FILE *f);
493 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
497 static int cap_f = FALSE;
498 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
499 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
500 static nkf_char cap_getc(FILE *f);
501 static nkf_char cap_ungetc(nkf_char c,FILE *f);
503 static int url_f = FALSE;
504 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
505 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
506 static nkf_char url_getc(FILE *f);
507 static nkf_char url_ungetc(nkf_char c,FILE *f);
510 #if defined(INT_IS_SHORT)
511 #define NKF_INT32_C(n) (n##L)
513 #define NKF_INT32_C(n) (n)
/*
 * Bit layout of a wide nkf_char value: the top byte (CLASS_MASK) is a class
 * tag and the low 24 bits (VALUE_MASK) are the value. CLASS_UNICODE is the
 * tag tested by is_unicode_capsule().
 */
#define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
#define CLASS_MASK NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE NKF_INT32_C(0x01000000)
#define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
/* The argument is parenthesized so expression arguments group correctly. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
523 #ifdef NUMCHAR_OPTION
524 static int numchar_f = FALSE;
525 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
526 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
527 static nkf_char numchar_getc(FILE *f);
528 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
532 static int noout_f = FALSE;
533 static void no_putc(nkf_char c);
534 static nkf_char debug_f = FALSE;
535 static void debug(const char *str);
536 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
539 static int guess_f = FALSE;
541 static void print_guessed_code(char *filename);
543 static void set_input_codename(char *codename);
544 static int is_inputcode_mixed = FALSE;
545 static int is_inputcode_set = FALSE;
548 static int exec_f = 0;
551 #ifdef SHIFTJIS_CP932
552 /* invert IBM extended characters to others */
553 static int cp51932_f = FALSE;
555 /* invert NEC-selected IBM extended characters to IBM extended characters */
556 static int cp932inv_f = TRUE;
558 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
559 #endif /* SHIFTJIS_CP932 */
562 static int x0212_f = FALSE;
563 static nkf_char x0212_shift(nkf_char c);
564 static nkf_char x0212_unshift(nkf_char c);
566 static int x0213_f = FALSE;
568 static unsigned char prefix_table[256];
570 static void set_code_score(struct input_code *ptr, nkf_char score);
571 static void clr_code_score(struct input_code *ptr, nkf_char score);
572 static void status_disable(struct input_code *ptr);
573 static void status_push_ch(struct input_code *ptr, nkf_char c);
574 static void status_clear(struct input_code *ptr);
575 static void status_reset(struct input_code *ptr);
576 static void status_reinit(struct input_code *ptr);
577 static void status_check(struct input_code *ptr, nkf_char c);
578 static void e_status(struct input_code *, nkf_char);
579 static void s_status(struct input_code *, nkf_char);
581 struct input_code input_code_list[] = {
582 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
583 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
584 #ifdef UTF8_INPUT_ENABLE
585 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
586 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
587 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
592 static int mimeout_mode = 0;
593 static int base64_count = 0;
595 /* X0208 -> ASCII converter */
598 static int f_line = 0; /* chars in line */
599 static int f_prev = 0;
600 static int fold_preserve_f = FALSE; /* preserve new lines */
601 static int fold_f = FALSE;
602 static int fold_len = 0;
605 static unsigned char kanji_intro = DEFAULT_J;
606 static unsigned char ascii_intro = DEFAULT_R;
610 #define FOLD_MARGIN 10
611 #define DEFAULT_FOLD 60
613 static int fold_margin = FOLD_MARGIN;
617 #ifdef DEFAULT_CODE_JIS
618 # define DEFAULT_CONV j_oconv
620 #ifdef DEFAULT_CODE_SJIS
621 # define DEFAULT_CONV s_oconv
623 #ifdef DEFAULT_CODE_EUC
624 # define DEFAULT_CONV e_oconv
626 #ifdef DEFAULT_CODE_UTF8
627 # define DEFAULT_CONV w_oconv
630 /* process default */
631 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
633 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
634 /* s_iconv or oconv */
635 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
637 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
642 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
643 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
645 /* static redirections */
647 static void (*o_putc)(nkf_char c) = std_putc;
649 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
650 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
652 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
653 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
655 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
657 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
658 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
660 /* for strict mime */
661 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
662 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
665 static int output_mode = ASCII, /* output kanji mode */
666 input_mode = ASCII, /* input kanji mode */
667 shift_mode = FALSE; /* TRUE shift out, or X0201 */
668 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
670 /* X0201 / X0208 conversion tables */
672 /* X0201 kana conversion table */
675 unsigned char cv[]= {
676 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
677 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
678 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
679 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
680 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
681 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
682 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
683 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
684 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
685 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
686 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
687 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
688 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
689 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
690 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
691 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
695 /* X0201 kana conversion table for dakuten */
698 unsigned char dv[]= {
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
702 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
703 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
704 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
705 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
706 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
707 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
708 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
710 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
713 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
714 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
717 /* X0201 kana conversion table for han-dakuten */
720 unsigned char ev[]= {
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
730 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
732 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
735 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
736 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
740 /* X0208 kigou conversion table */
741 /* 0x8140 - 0x819e */
743 unsigned char fv[] = {
745 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
746 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
747 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
748 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
749 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
750 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
751 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
752 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
753 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
754 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
755 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
756 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
762 static int file_out_f = FALSE;
764 static int overwrite_f = FALSE;
765 static int preserve_time_f = FALSE;
766 static int backup_f = FALSE;
767 static char *backup_suffix = "";
768 static char *get_backup_filename(const char *suffix, const char *filename);
771 static int crmode_f = 0; /* CR, NL, CRLF */
772 static nkf_char prev_cr = 0;
773 #ifdef EASYWIN /*Easy Win */
774 static int end_check;
777 #define STD_GC_BUFSIZE (256)
778 nkf_char std_gc_buf[STD_GC_BUFSIZE];
782 #include "nkf32dll.c"
783 #elif defined(PERL_XS)
785 int main(int argc, char **argv)
790 char *outfname = NULL;
793 #ifdef EASYWIN /*Easy Win */
794 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
797 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
798 cp = (unsigned char *)*argv;
803 if (pipe(fds) < 0 || (pid = fork()) < 0){
814 execvp(argv[1], &argv[1]);
828 if(x0201_f == WISH_TRUE)
829 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
831 if (binmode_f == TRUE)
832 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
833 if (freopen("","wb",stdout) == NULL)
840 setbuf(stdout, (char *) NULL);
842 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
845 if (binmode_f == TRUE)
846 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
847 if (freopen("","rb",stdin) == NULL) return (-1);
851 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
855 kanji_convert(stdin);
856 if (guess_f) print_guessed_code(NULL);
860 int is_argument_error = FALSE;
862 is_inputcode_mixed = FALSE;
863 is_inputcode_set = FALSE;
868 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
871 is_argument_error = TRUE;
879 /* reopen file for stdout */
880 if (file_out_f == TRUE) {
883 outfname = malloc(strlen(origfname)
884 + strlen(".nkftmpXXXXXX")
890 strcpy(outfname, origfname);
894 for (i = strlen(outfname); i; --i){
895 if (outfname[i - 1] == '/'
896 || outfname[i - 1] == '\\'){
902 strcat(outfname, "ntXXXXXX");
904 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
907 strcat(outfname, ".nkftmpXXXXXX");
908 fd = mkstemp(outfname);
911 || (fd_backup = dup(fileno(stdout))) < 0
912 || dup2(fd, fileno(stdout)) < 0
923 outfname = "nkf.out";
926 if(freopen(outfname, "w", stdout) == NULL) {
930 if (binmode_f == TRUE) {
931 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
932 if (freopen("","wb",stdout) == NULL)
939 if (binmode_f == TRUE)
940 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
941 if (freopen("","rb",fin) == NULL)
946 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
950 char *filename = NULL;
952 if (nfiles > 1) filename = origfname;
953 if (guess_f) print_guessed_code(filename);
959 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
967 if (dup2(fd_backup, fileno(stdout)) < 0){
970 if (stat(origfname, &sb)) {
971 fprintf(stderr, "Can't stat %s\n", origfname);
973 /* restore the original file's permissions */
974 if (chmod(outfname, sb.st_mode)) {
975 fprintf(stderr, "Can't set permission %s\n", outfname);
978 /* restore the original file's timestamps */
980 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
981 tb[0] = tb[1] = sb.st_mtime;
982 if (utime(outfname, tb)) {
983 fprintf(stderr, "Can't set timestamp %s\n", outfname);
986 tb.actime = sb.st_atime;
987 tb.modtime = sb.st_mtime;
988 if (utime(outfname, &tb)) {
989 fprintf(stderr, "Can't set timestamp %s\n", outfname);
994 char *backup_filename = get_backup_filename(backup_suffix, origfname);
996 unlink(backup_filename);
998 if (rename(origfname, backup_filename)) {
999 perror(backup_filename);
1000 fprintf(stderr, "Can't rename %s to %s\n",
1001 origfname, backup_filename);
1005 if (unlink(origfname)){
1010 if (rename(outfname, origfname)) {
1012 fprintf(stderr, "Can't rename %s to %s\n",
1013 outfname, origfname);
1020 if (is_argument_error)
1023 #ifdef EASYWIN /*Easy Win */
1024 if (file_out_f == FALSE)
1025 scanf("%d",&end_check);
1028 #else /* for Other OS */
1029 if (file_out_f == TRUE)
1031 #endif /*Easy Win */
1034 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from the template `suffix`.
 *
 * If `suffix` contains one or more '*' characters, each '*' is replaced by
 * `filename` and all other characters are copied verbatim. Otherwise the
 * result is `filename` followed by `suffix`.
 *
 * Returns a newly malloc()ed NUL-terminated string that the caller must
 * free(), or NULL (after reporting via perror) when allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    int asterisk_count = 0;
    int i, j;
    int filename_length = strlen(filename);

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* Each '*' (1 byte) expands to filename_length bytes. */
        backup_filename = malloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; suffix[i];) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                i++;
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i++];
            }
        }
        backup_filename[j] = '\0';
    } else {
        j = strlen(suffix) + filename_length;
        /* Allocate room for both parts plus the terminator (was `malloc( + 1)`). */
        backup_filename = malloc(j + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        strcpy(backup_filename, filename);
        strcat(backup_filename, suffix);
        backup_filename[j] = '\0';
    }
    return backup_filename;
}
1102 {"katakana-hiragana","h3"},
1109 #ifdef UTF8_OUTPUT_ENABLE
1119 {"fb-subchar=", ""},
1121 #ifdef UTF8_INPUT_ENABLE
1122 {"utf8-input", "W"},
1123 {"utf16-input", "W16"},
1124 {"no-cp932ext", ""},
1125 {"no-best-fit-chars",""},
1127 #ifdef UNICODE_NORMALIZATION
1128 {"utf8mac-input", ""},
1140 #ifdef NUMCHAR_OPTION
1141 {"numchar-input", ""},
1147 #ifdef SHIFTJIS_CP932
1157 static int option_mode = 0;
1159 void options(unsigned char *cp)
1163 unsigned char *cp_back = NULL;
1168 while(*cp && *cp++!='-');
1169 while (*cp || cp_back) {
1177 case '-': /* literal options */
1178 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1182 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1183 p = (unsigned char *)long_option[i].name;
1184 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1185 if (*p == cp[j] || cp[j] == ' '){
1192 while(*cp && *cp != SPACE && cp++);
1193 if (long_option[i].alias[0]){
1195 cp = (unsigned char *)long_option[i].alias;
1197 if (strcmp(long_option[i].name, "ic=") == 0){
1198 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1199 codeset[i] = nkf_toupper(p[i]);
1202 if(strcmp(codeset, "ISO-2022-JP") == 0){
1203 input_f = JIS_INPUT;
1204 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1205 strcmp(codeset, "CP50220") == 0 ||
1206 strcmp(codeset, "CP50221") == 0 ||
1207 strcmp(codeset, "CP50222") == 0){
1208 input_f = JIS_INPUT;
1209 #ifdef SHIFTJIS_CP932
1212 #ifdef UTF8_OUTPUT_ENABLE
1213 ms_ucs_map_f = UCS_MAP_CP932;
1215 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1216 input_f = JIS_INPUT;
1220 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1221 input_f = JIS_INPUT;
1226 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1227 input_f = SJIS_INPUT;
1228 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1229 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1230 strcmp(codeset, "CP932") == 0 ||
1231 strcmp(codeset, "MS932") == 0){
1232 input_f = SJIS_INPUT;
1233 #ifdef SHIFTJIS_CP932
1236 #ifdef UTF8_OUTPUT_ENABLE
1237 ms_ucs_map_f = UCS_MAP_CP932;
1239 }else if(strcmp(codeset, "CP10001") == 0){
1240 input_f = SJIS_INPUT;
1241 #ifdef SHIFTJIS_CP932
1244 #ifdef UTF8_OUTPUT_ENABLE
1245 ms_ucs_map_f = UCS_MAP_CP10001;
1247 }else if(strcmp(codeset, "EUCJP") == 0 ||
1248 strcmp(codeset, "EUC-JP") == 0){
1249 input_f = EUC_INPUT;
1250 }else if(strcmp(codeset, "CP51932") == 0){
1251 input_f = EUC_INPUT;
1252 #ifdef SHIFTJIS_CP932
1255 #ifdef UTF8_OUTPUT_ENABLE
1256 ms_ucs_map_f = UCS_MAP_CP932;
1258 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1259 strcmp(codeset, "EUCJP-MS") == 0 ||
1260 strcmp(codeset, "EUCJPMS") == 0){
1261 input_f = EUC_INPUT;
1262 #ifdef SHIFTJIS_CP932
1265 #ifdef UTF8_OUTPUT_ENABLE
1266 ms_ucs_map_f = UCS_MAP_MS;
1268 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1269 strcmp(codeset, "EUCJP-ASCII") == 0){
1270 input_f = EUC_INPUT;
1271 #ifdef SHIFTJIS_CP932
1274 #ifdef UTF8_OUTPUT_ENABLE
1275 ms_ucs_map_f = UCS_MAP_ASCII;
1277 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1278 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1279 input_f = SJIS_INPUT;
1281 #ifdef SHIFTJIS_CP932
1284 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1285 strcmp(codeset, "EUC-JIS-2004") == 0){
1286 input_f = EUC_INPUT;
1288 #ifdef SHIFTJIS_CP932
1291 #ifdef UTF8_INPUT_ENABLE
1292 }else if(strcmp(codeset, "UTF-8") == 0 ||
1293 strcmp(codeset, "UTF-8N") == 0 ||
1294 strcmp(codeset, "UTF-8-BOM") == 0){
1295 input_f = UTF8_INPUT;
1296 #ifdef UNICODE_NORMALIZATION
1297 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1298 strcmp(codeset, "UTF-8-MAC") == 0){
1299 input_f = UTF8_INPUT;
1302 }else if(strcmp(codeset, "UTF-16") == 0 ||
1303 strcmp(codeset, "UTF-16BE") == 0 ||
1304 strcmp(codeset, "UTF-16BE-BOM") == 0){
1305 input_f = UTF16_INPUT;
1306 input_endian = ENDIAN_BIG;
1307 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1308 strcmp(codeset, "UTF-16LE-BOM") == 0){
1309 input_f = UTF16_INPUT;
1310 input_endian = ENDIAN_LITTLE;
1311 }else if(strcmp(codeset, "UTF-32") == 0 ||
1312 strcmp(codeset, "UTF-32BE") == 0 ||
1313 strcmp(codeset, "UTF-32BE-BOM") == 0){
1314 input_f = UTF32_INPUT;
1315 input_endian = ENDIAN_BIG;
1316 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1317 strcmp(codeset, "UTF-32LE-BOM") == 0){
1318 input_f = UTF32_INPUT;
1319 input_endian = ENDIAN_LITTLE;
1324 if (strcmp(long_option[i].name, "oc=") == 0){
1326 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1327 codeset[i] = nkf_toupper(p[i]);
1330 if(strcmp(codeset, "ISO-2022-JP") == 0){
1331 output_conv = j_oconv;
1332 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1333 output_conv = j_oconv;
1334 no_cp932ext_f = TRUE;
1335 #ifdef SHIFTJIS_CP932
1338 #ifdef UTF8_OUTPUT_ENABLE
1339 ms_ucs_map_f = UCS_MAP_CP932;
1341 }else if(strcmp(codeset, "CP50220") == 0){
1342 output_conv = j_oconv;
1344 #ifdef SHIFTJIS_CP932
1347 #ifdef UTF8_OUTPUT_ENABLE
1348 ms_ucs_map_f = UCS_MAP_CP932;
1350 }else if(strcmp(codeset, "CP50221") == 0){
1351 output_conv = j_oconv;
1352 #ifdef SHIFTJIS_CP932
1355 #ifdef UTF8_OUTPUT_ENABLE
1356 ms_ucs_map_f = UCS_MAP_CP932;
1358 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1359 output_conv = j_oconv;
1363 #ifdef SHIFTJIS_CP932
1366 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1367 output_conv = j_oconv;
1372 #ifdef SHIFTJIS_CP932
1375 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1376 output_conv = s_oconv;
1377 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1378 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1379 strcmp(codeset, "CP932") == 0 ||
1380 strcmp(codeset, "MS932") == 0){
1381 output_conv = s_oconv;
1382 #ifdef UTF8_OUTPUT_ENABLE
1383 ms_ucs_map_f = UCS_MAP_CP932;
1385 }else if(strcmp(codeset, "CP10001") == 0){
1386 output_conv = s_oconv;
1387 #ifdef UTF8_OUTPUT_ENABLE
1388 ms_ucs_map_f = UCS_MAP_CP10001;
1390 }else if(strcmp(codeset, "EUCJP") == 0 ||
1391 strcmp(codeset, "EUC-JP") == 0){
1392 output_conv = e_oconv;
1393 }else if(strcmp(codeset, "CP51932") == 0){
1394 output_conv = e_oconv;
1395 #ifdef SHIFTJIS_CP932
1398 #ifdef UTF8_OUTPUT_ENABLE
1399 ms_ucs_map_f = UCS_MAP_CP932;
1401 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1402 strcmp(codeset, "EUCJP-MS") == 0 ||
1403 strcmp(codeset, "EUCJPMS") == 0){
1404 output_conv = e_oconv;
1408 #ifdef UTF8_OUTPUT_ENABLE
1409 ms_ucs_map_f = UCS_MAP_MS;
1411 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1412 strcmp(codeset, "EUCJP-ASCII") == 0){
1413 output_conv = e_oconv;
1417 #ifdef UTF8_OUTPUT_ENABLE
1418 ms_ucs_map_f = UCS_MAP_ASCII;
1420 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1421 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1422 output_conv = s_oconv;
1424 #ifdef SHIFTJIS_CP932
1427 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1428 strcmp(codeset, "EUC-JIS-2004") == 0){
1429 output_conv = e_oconv;
1434 #ifdef SHIFTJIS_CP932
1437 #ifdef UTF8_OUTPUT_ENABLE
1438 }else if(strcmp(codeset, "UTF-8") == 0){
1439 output_conv = w_oconv;
1440 }else if(strcmp(codeset, "UTF-8N") == 0){
1441 output_conv = w_oconv;
1442 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1443 output_conv = w_oconv;
1444 output_bom_f = TRUE;
1445 }else if(strcmp(codeset, "UTF-16BE") == 0){
1446 output_conv = w_oconv16;
1447 }else if(strcmp(codeset, "UTF-16") == 0 ||
1448 strcmp(codeset, "UTF-16BE-BOM") == 0){
1449 output_conv = w_oconv16;
1450 output_bom_f = TRUE;
1451 }else if(strcmp(codeset, "UTF-16LE") == 0){
1452 output_conv = w_oconv16;
1453 output_endian = ENDIAN_LITTLE;
1454 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1455 output_conv = w_oconv16;
1456 output_endian = ENDIAN_LITTLE;
1457 output_bom_f = TRUE;
1458 }else if(strcmp(codeset, "UTF-32") == 0 ||
1459 strcmp(codeset, "UTF-32BE") == 0){
1460 output_conv = w_oconv32;
1461 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1462 output_conv = w_oconv32;
1463 output_bom_f = TRUE;
1464 }else if(strcmp(codeset, "UTF-32LE") == 0){
1465 output_conv = w_oconv32;
1466 output_endian = ENDIAN_LITTLE;
1467 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1468 output_conv = w_oconv32;
1469 output_endian = ENDIAN_LITTLE;
1470 output_bom_f = TRUE;
1476 if (strcmp(long_option[i].name, "overwrite") == 0){
1479 preserve_time_f = TRUE;
1482 if (strcmp(long_option[i].name, "overwrite=") == 0){
1485 preserve_time_f = TRUE;
1487 backup_suffix = malloc(strlen((char *) p) + 1);
1488 strcpy(backup_suffix, (char *) p);
1491 if (strcmp(long_option[i].name, "in-place") == 0){
1494 preserve_time_f = FALSE;
1497 if (strcmp(long_option[i].name, "in-place=") == 0){
1500 preserve_time_f = FALSE;
1502 backup_suffix = malloc(strlen((char *) p) + 1);
1503 strcpy(backup_suffix, (char *) p);
1508 if (strcmp(long_option[i].name, "cap-input") == 0){
1512 if (strcmp(long_option[i].name, "url-input") == 0){
1517 #ifdef NUMCHAR_OPTION
1518 if (strcmp(long_option[i].name, "numchar-input") == 0){
1524 if (strcmp(long_option[i].name, "no-output") == 0){
1528 if (strcmp(long_option[i].name, "debug") == 0){
1533 if (strcmp(long_option[i].name, "cp932") == 0){
1534 #ifdef SHIFTJIS_CP932
1538 #ifdef UTF8_OUTPUT_ENABLE
1539 ms_ucs_map_f = UCS_MAP_CP932;
1543 if (strcmp(long_option[i].name, "no-cp932") == 0){
1544 #ifdef SHIFTJIS_CP932
1548 #ifdef UTF8_OUTPUT_ENABLE
1549 ms_ucs_map_f = UCS_MAP_ASCII;
1553 #ifdef SHIFTJIS_CP932
1554 if (strcmp(long_option[i].name, "cp932inv") == 0){
1561 if (strcmp(long_option[i].name, "x0212") == 0){
1568 if (strcmp(long_option[i].name, "exec-in") == 0){
1572 if (strcmp(long_option[i].name, "exec-out") == 0){
1577 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1578 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1579 no_cp932ext_f = TRUE;
1582 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1583 no_best_fit_chars_f = TRUE;
1586 if (strcmp(long_option[i].name, "fb-skip") == 0){
1587 encode_fallback = NULL;
1590 if (strcmp(long_option[i].name, "fb-html") == 0){
1591 encode_fallback = encode_fallback_html;
1594 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1595 encode_fallback = encode_fallback_xml;
1598 if (strcmp(long_option[i].name, "fb-java") == 0){
1599 encode_fallback = encode_fallback_java;
1602 if (strcmp(long_option[i].name, "fb-perl") == 0){
1603 encode_fallback = encode_fallback_perl;
1606 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1607 encode_fallback = encode_fallback_subchar;
1610 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1611 encode_fallback = encode_fallback_subchar;
1612 unicode_subchar = 0;
1614 /* decimal number */
1615 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1616 unicode_subchar *= 10;
1617 unicode_subchar += hex2bin(p[i]);
1619 }else if(p[1] == 'x' || p[1] == 'X'){
1620 /* hexadecimal number */
1621 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1622 unicode_subchar <<= 4;
1623 unicode_subchar |= hex2bin(p[i]);
1627 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1628 unicode_subchar *= 8;
1629 unicode_subchar += hex2bin(p[i]);
1632 w16e_conv(unicode_subchar, &i, &j);
1633 unicode_subchar = i<<8 | j;
1637 #ifdef UTF8_OUTPUT_ENABLE
1638 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1639 ms_ucs_map_f = UCS_MAP_MS;
1643 #ifdef UNICODE_NORMALIZATION
1644 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1645 input_f = UTF8_INPUT;
1650 if (strcmp(long_option[i].name, "prefix=") == 0){
1651 if (nkf_isgraph(p[0])){
1652 for (i = 1; nkf_isgraph(p[i]); i++){
1653 prefix_table[p[i]] = p[0];
1660 case 'b': /* buffered mode */
1663 case 'u': /* non bufferd mode */
1666 case 't': /* transparent mode */
1671 } else if (*cp=='2') {
1675 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1683 case 'j': /* JIS output */
1685 output_conv = j_oconv;
1687 case 'e': /* AT&T EUC output */
1688 output_conv = e_oconv;
1691 case 's': /* SJIS output */
1692 output_conv = s_oconv;
1694 case 'l': /* ISO8859 Latin-1 support, no conversion */
1695 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1696 input_f = LATIN1_INPUT;
1698 case 'i': /* Kanji IN ESC-$-@/B */
1699 if (*cp=='@'||*cp=='B')
1700 kanji_intro = *cp++;
1702 case 'o': /* ASCII IN ESC-(-J/B */
1703 if (*cp=='J'||*cp=='B'||*cp=='H')
1704 ascii_intro = *cp++;
1708 bit:1 katakana->hiragana
1709 bit:2 hiragana->katakana
1711 if ('9'>= *cp && *cp>='0')
1712 hira_f |= (*cp++ -'0');
1719 #if defined(MSDOS) || defined(__OS2__)
1734 #ifdef UTF8_OUTPUT_ENABLE
1735 case 'w': /* UTF-8 output */
1737 output_conv = w_oconv; cp++;
1741 output_bom_f = TRUE;
1744 if ('1'== cp[0] && '6'==cp[1]) {
1745 output_conv = w_oconv16; cp+=2;
1746 } else if ('3'== cp[0] && '2'==cp[1]) {
1747 output_conv = w_oconv32; cp+=2;
1749 output_conv = w_oconv;
1754 output_endian = ENDIAN_LITTLE;
1755 } else if (cp[0] == 'B') {
1763 output_bom_f = TRUE;
1768 #ifdef UTF8_INPUT_ENABLE
1769 case 'W': /* UTF input */
1772 input_f = UTF8_INPUT;
1774 if ('1'== cp[0] && '6'==cp[1]) {
1776 input_f = UTF16_INPUT;
1777 input_endian = ENDIAN_BIG;
1778 } else if ('3'== cp[0] && '2'==cp[1]) {
1780 input_f = UTF32_INPUT;
1781 input_endian = ENDIAN_BIG;
1783 input_f = UTF8_INPUT;
1788 input_endian = ENDIAN_LITTLE;
1789 } else if (cp[0] == 'B') {
1795 /* Input code assumption */
1796 case 'J': /* JIS input */
1797 input_f = JIS_INPUT;
1799 case 'E': /* AT&T EUC input */
1800 input_f = EUC_INPUT;
1802 case 'S': /* MS Kanji input */
1803 input_f = SJIS_INPUT;
1804 if (x0201_f==NO_X0201) x0201_f=TRUE;
1806 case 'Z': /* Convert X0208 alphabet to asii */
1807 /* bit:0 Convert X0208
1808 bit:1 Convert Kankaku to one space
1809 bit:2 Convert Kankaku to two spaces
1810 bit:3 Convert HTML Entity
1812 if ('9'>= *cp && *cp>='0')
1813 alpha_f |= 1<<(*cp++ -'0');
1817 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1818 x0201_f = FALSE; /* No X0201->X0208 conversion */
1820 ESC-(-I in JIS, EUC, MS Kanji
1821 SI/SO in JIS, EUC, MS Kanji
1822 SSO in EUC, JIS, not in MS Kanji
1823 MS Kanji (0xa0-0xdf)
1825 ESC-(-I in JIS (0x20-0x5f)
1826 SSO in EUC (0xa0-0xdf)
1827 0xa0-0xd in MS Kanji (0xa0-0xdf)
1830 case 'X': /* Assume X0201 kana */
1831 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1834 case 'F': /* prserve new lines */
1835 fold_preserve_f = TRUE;
1836 case 'f': /* folding -f60 or -f */
1839 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1841 fold_len += *cp++ - '0';
1843 if (!(0<fold_len && fold_len<BUFSIZ))
1844 fold_len = DEFAULT_FOLD;
1848 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1850 fold_margin += *cp++ - '0';
1854 case 'm': /* MIME support */
1855 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1856 if (*cp=='B'||*cp=='Q') {
1857 mime_decode_mode = *cp++;
1858 mimebuf_f = FIXED_MIME;
1859 } else if (*cp=='N') {
1860 mime_f = TRUE; cp++;
1861 } else if (*cp=='S') {
1862 mime_f = STRICT_MIME; cp++;
1863 } else if (*cp=='0') {
1864 mime_decode_f = FALSE;
1865 mime_f = FALSE; cp++;
1868 case 'M': /* MIME output */
1871 mimeout_f = FIXED_MIME; cp++;
1872 } else if (*cp=='Q') {
1874 mimeout_f = FIXED_MIME; cp++;
1879 case 'B': /* Broken JIS support */
1881 bit:1 allow any x on ESC-(-x or ESC-$-x
1882 bit:2 reset to ascii on NL
1884 if ('9'>= *cp && *cp>='0')
1885 broken_f |= 1<<(*cp++ -'0');
1890 case 'O':/* for Output file */
1894 case 'c':/* add cr code */
1897 case 'd':/* delete cr code */
1900 case 'I': /* ISO-2022-JP output */
1903 case 'L': /* line mode */
1904 if (*cp=='u') { /* unix */
1905 crmode_f = NL; cp++;
1906 } else if (*cp=='m') { /* mac */
1907 crmode_f = CR; cp++;
1908 } else if (*cp=='w') { /* windows */
1909 crmode_f = CRLF; cp++;
1910 } else if (*cp=='0') { /* no conversion */
1920 /* module muliple options in a string are allowed for Perl moudle */
1921 while(*cp && *cp++!='-');
1924 /* bogus option but ignored */
/*
 * find_inputcode_byfunc: search input_code_list for the entry whose
 * iconv_func equals the converter function pointer passed in.
 * NOTE(review): the loop and the return statements are not visible in
 * this excerpt; the no-match result is presumably a null pointer -- confirm.
 */
1930 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1933 struct input_code *p = input_code_list;
1935 if (iconv_func == p->iconv_func){
/*
 * set_iconv: update the active input converter.
 *   f          - flag value; callers in this file pass TRUE, FALSE or -TRUE
 *   iconv_func - converter to install (callers also pass 0)
 * Under INPUT_CODE_FIX one of the visible conditions additionally requires
 * f == -TRUE (force) or input_f to be unset.
 * When the code is established (estab_f) and iconv differs from
 * iconv_for_check, the matching input_code entry is looked up, its name is
 * recorded with set_input_codename() and reported through debug().
 * NOTE(review): several statements of this function are not visible in this
 * excerpt (including what the first condition guards) -- confirm details.
 */
1944 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1946 #ifdef INPUT_CODE_FIX
1954 #ifdef INPUT_CODE_FIX
1955 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1961 if (estab_f && iconv_for_check != iconv){
1962 struct input_code *p = find_inputcode_byfunc(iconv);
1964 set_input_codename(p->name);
1965 debug(input_codename);
1967 iconv_for_check = iconv;
/*
 * Score flags that are OR-ed into input_code.score during code detection.
 * Each flag is the previous one shifted left by one bit.  When a result is
 * chosen among candidates, the entry with the lowest score is preferred
 * (see the p->score < result->score comparisons in kanji_convert and h_conv).
 */
1972 #define SCORE_L2 (1) /* JIS level-2 kanji */
1973 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1974 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1975 #ifdef SHIFTJIS_CP932
1976 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character remapped via CP932 */
1977 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1979 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1981 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* charset specified by MIME */
1982 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1984 #define SCORE_INIT (SCORE_iMIME)
/*
 * score_table_A0: score flag looked up with (c2 & 0x0f) by code_score()
 * when (c2 & 0x70) == 0x20.
 * NOTE(review): only part of the initializer is visible in this excerpt.
 */
1986 const nkf_char score_table_A0[] = {
1989 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1990 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * score_table_F0: score flag looked up with (c2 & 0x0f) by code_score()
 * when (c2 & 0x70) == 0x70.  The last slot is SCORE_ERROR.
 */
1993 const nkf_char score_table_F0[] = {
1994 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1995 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1996 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1997 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score: OR the given SCORE_* flag(s) into ptr->score.
 * NOTE(review): any guard around the assignment is not visible here.
 */
2000 void set_code_score(struct input_code *ptr, nkf_char score)
2003 ptr->score |= score;
/*
 * clr_code_score: clear the given SCORE_* flag(s) from ptr->score.
 * NOTE(review): any guard around the assignment is not visible here.
 */
2007 void clr_code_score(struct input_code *ptr, nkf_char score)
2010 ptr->score &= ~score;
/*
 * code_score: classify the character held in ptr->buf[0] (and, with
 * UTF8_OUTPUT_ENABLE, ptr->buf[1]) and add the matching SCORE_* flag to
 * the candidate via set_code_score().
 *   - first branch (condition not visible here): SCORE_ERROR
 *   - c2 == SSO: SCORE_KANA
 *   - e2w_conv(c2, c1) returns 0 (UTF8_OUTPUT_ENABLE only): SCORE_NO_EXIST
 *   - (c2 & 0x70) == 0x20: flag taken from score_table_A0
 *   - (c2 & 0x70) == 0x70: flag taken from score_table_F0
 *   - (c2 & 0x70) >= 0x50: SCORE_L2
 */
2014 void code_score(struct input_code *ptr)
2016 nkf_char c2 = ptr->buf[0];
2017 #ifdef UTF8_OUTPUT_ENABLE
2018 nkf_char c1 = ptr->buf[1];
2021 set_code_score(ptr, SCORE_ERROR);
2022 }else if (c2 == SSO){
2023 set_code_score(ptr, SCORE_KANA);
2024 #ifdef UTF8_OUTPUT_ENABLE
2025 }else if (!e2w_conv(c2, c1)){
2026 set_code_score(ptr, SCORE_NO_EXIST);
2028 }else if ((c2 & 0x70) == 0x20){
2029 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2030 }else if ((c2 & 0x70) == 0x70){
2031 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2032 }else if ((c2 & 0x70) >= 0x50){
2033 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: called by the *_status detectors when a byte does not fit
 * the candidate encoding.  If this candidate's converter is the currently
 * selected iconv, the selection is reset with set_iconv(FALSE, 0).
 * NOTE(review): the statements that change the candidate's own state are
 * not visible in this excerpt.
 */
2037 void status_disable(struct input_code *ptr)
2042 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: append c to the candidate's byte buffer (ptr->buf) and
 * advance ptr->index.  No bounds check is visible here.
 */
2045 void status_push_ch(struct input_code *ptr, nkf_char c)
2047 ptr->buf[ptr->index++] = c;
/*
 * status_clear: NOTE(review): the body is not visible in this excerpt;
 * presumably resets the per-character state of the candidate -- confirm.
 */
2050 void status_clear(struct input_code *ptr)
/*
 * status_reset: puts the candidate score back to SCORE_INIT.
 * NOTE(review): other statements of the body are not visible here.
 */
2056 void status_reset(struct input_code *ptr)
2059 ptr->score = SCORE_INIT;
/*
 * status_reinit: zeroes ptr->_file_stat.
 * NOTE(review): other statements of the body are not visible here.
 */
2062 void status_reinit(struct input_code *ptr)
2065 ptr->_file_stat = 0;
/*
 * status_check: acts only when c is at most DEL (a 7-bit value) and the
 * input code is already established (estab_f).
 * NOTE(review): the action taken in that case is not visible here.
 */
2068 void status_check(struct input_code *ptr, nkf_char c)
2070 if (c <= DEL && estab_f){
/*
 * s_status: per-byte detector for the Shift_JIS candidate.
 * Feeds one input byte c into the candidate state in ptr.  Bytes that fit
 * are stored with status_push_ch(); bytes that do not fit disable the
 * candidate with status_disable().
 * NOTE(review): the switch on the candidate state and its case labels are
 * not visible in this excerpt, so the grouping below is inferred from the
 * visible conditions only.
 */
2075 void s_status(struct input_code *ptr, nkf_char c)
2079 status_check(ptr, c);
2084 #ifdef NUMCHAR_OPTION
2085 }else if (is_unicode_capsule(c)){
/* 0xa1-0xdf is a single byte here; it is recorded as SSO followed by c */
2088 }else if (0xa1 <= c && c <= 0xdf){
2089 status_push_ch(ptr, SSO);
2090 status_push_ch(ptr, c);
/* first byte of a two-byte character: 0x81-0x9f or 0xe0-0xef */
2093 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2095 status_push_ch(ptr, c);
2096 #ifdef SHIFTJIS_CP932
2098 && is_ibmext_in_sjis(c)){
2100 status_push_ch(ptr, c);
2101 #endif /* SHIFTJIS_CP932 */
/* with x0212_f set, 0xf0-0xfc is also accepted as a first byte */
2103 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2105 status_push_ch(ptr, c);
2106 #endif /* X0212_ENABLE */
2108 status_disable(ptr);
/* second byte must be 0x40-0x7e or 0x80-0xfc; the pair is then converted
   in place inside ptr->buf by s2e_conv() */
2112 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2113 status_push_ch(ptr, c);
2114 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2118 status_disable(ptr);
2122 #ifdef SHIFTJIS_CP932
2123 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2124 status_push_ch(ptr, c);
/* a zero return from s2e_conv() on this path adds SCORE_CP932 */
2125 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2126 set_code_score(ptr, SCORE_CP932);
2131 #endif /* SHIFTJIS_CP932 */
2132 #ifndef X0212_ENABLE
2133 status_disable(ptr);
/*
 * e_status: per-byte detector for the EUC candidate.
 * Feeds one input byte c into the candidate state in ptr.  Accepted bytes
 * are stored with status_push_ch(); anything else disables the candidate
 * with status_disable().
 * NOTE(review): the switch on the candidate state and its case labels are
 * not visible in this excerpt.
 */
2139 void e_status(struct input_code *ptr, nkf_char c)
2143 status_check(ptr, c);
2148 #ifdef NUMCHAR_OPTION
2149 }else if (is_unicode_capsule(c)){
/* first byte: SSO or 0xa1-0xfe */
2152 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2154 status_push_ch(ptr, c);
/* 0x8f is also accepted as a first byte in the X0212_ENABLE build */
2156 }else if (0x8f == c){
2158 status_push_ch(ptr, c);
2159 #endif /* X0212_ENABLE */
2161 status_disable(ptr);
/* following bytes must be in 0xa1-0xfe */
2165 if (0xa1 <= c && c <= 0xfe){
2166 status_push_ch(ptr, c);
2170 status_disable(ptr);
2175 if (0xa1 <= c && c <= 0xfe){
2177 status_push_ch(ptr, c);
2179 status_disable(ptr);
2181 #endif /* X0212_ENABLE */
2185 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: per-byte detector for the UTF-8 candidate.
 * Lead bytes 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 are accepted and stored;
 * continuation bytes must be in 0x80-0xbf.  Any other byte disables the
 * candidate with status_disable().
 * NOTE(review): the switch on the candidate state and its case labels are
 * not visible in this excerpt.
 */
2186 void w_status(struct input_code *ptr, nkf_char c)
2190 status_check(ptr, c);
2195 #ifdef NUMCHAR_OPTION
2196 }else if (is_unicode_capsule(c)){
2199 }else if (0xc0 <= c && c <= 0xdf){
2201 status_push_ch(ptr, c);
2202 }else if (0xe0 <= c && c <= 0xef){
2204 status_push_ch(ptr, c);
2205 }else if (0xf0 <= c && c <= 0xf4){
2207 status_push_ch(ptr, c);
2209 status_disable(ptr);
2214 if (0x80 <= c && c <= 0xbf){
2215 status_push_ch(ptr, c);
/* once more bytes are buffered than ptr->stat, the sequence is converted */
2216 if (ptr->index > ptr->stat){
/* bom is nonzero when the buffered bytes are EF BB BF */
2217 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2218 && ptr->buf[2] == 0xbf);
2219 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2220 &ptr->buf[0], &ptr->buf[1]);
2227 status_disable(ptr);
2231 if (0x80 <= c && c <= 0xbf){
2232 if (ptr->index < ptr->stat){
2233 status_push_ch(ptr, c);
2238 status_disable(ptr);
/*
 * code_status: feed one input byte c to the status_func of the entries in
 * input_code_list.  If a result entry was chosen and the code is not yet
 * established (estab_f is false), its converter is installed with
 * set_iconv(TRUE, ...).  Otherwise, when c is at most DEL, the list is
 * walked again from the start.
 * NOTE(review): the loop structure, the use of action_flag and the body of
 * the second walk are not visible in this excerpt.
 */
2245 void code_status(nkf_char c)
2247 int action_flag = 1;
2248 struct input_code *result = 0;
2249 struct input_code *p = input_code_list;
2251 if (!p->status_func) {
2255 if (!p->status_func)
2257 (p->status_func)(p, c);
2260 }else if(p->stat == 0){
2271 if (result && !estab_f){
2272 set_iconv(TRUE, result->iconv_func);
2273 }else if (c <= DEL){
2274 struct input_code *ptr = input_code_list;
/*
 * std_getc: input primitive.  The visible statement returns the most
 * recently pushed-back value from std_gc_buf (decrementing std_gc_ndx).
 * NOTE(review): the condition guarding that return and the path that reads
 * from f are not visible in this excerpt.
 */
2284 nkf_char std_getc(FILE *f)
2287 return std_gc_buf[--std_gc_ndx];
/*
 * std_ungetc: push c back so that std_getc returns it next.  The value is
 * stored on top of std_gc_buf.  A full buffer (std_gc_ndx equal to
 * STD_GC_BUFSIZE) is checked first.
 * NOTE(review): the action on a full buffer and the return value are not
 * visible in this excerpt; f is not used in the visible lines.
 */
2293 nkf_char std_ungetc(nkf_char c, FILE *f)
2295 if (std_gc_ndx == STD_GC_BUFSIZE){
2298 std_gc_buf[std_gc_ndx++] = c;
/*
 * std_putc: NOTE(review): the body is not visible in this excerpt;
 * presumably the output counterpart of std_getc -- confirm.
 */
2303 void std_putc(nkf_char c)
2310 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * noconvert: compiled only when neither PERL_XS nor WIN32DLL is defined.
 * Sets up the filter chain with module_connection() and then reads from f
 * through i_getc until EOF.
 * NOTE(review): the loop body and the return value are not visible here.
 */
2311 nkf_char noconvert(FILE *f)
2316 module_connection();
2317 while ((c = (*i_getc)(f)) != EOF)
/*
 * module_connection: wire up the output and input filter chains and pick
 * the initial input converter.
 *
 * Output side: starts from oconv = output_conv.  Each enabled stage saves
 * the current oconv in its own o_* pointer and installs itself as oconv, so
 * the stage installed last is the one called first.  Visible stages, in
 * installation order: base64_conv, cr_conv, rot_conv, iso2022jp_check_conv,
 * hira_conv, fold_conv, z_conv.
 *
 * Input side: starts from i_ungetc = std_ungetc and wraps i_getc/i_ungetc
 * the same way with cap_*, url_*, numchar_*, nfc_*, mime_* and broken_*
 * stages.
 *
 * Finally the converter is forced with set_iconv(-TRUE, ...) according to
 * input_f; the last visible call is set_iconv(FALSE, e_iconv).
 * NOTE(review): most of the enabling conditions are not visible in this
 * excerpt.
 */
2324 void module_connection(void)
2326 oconv = output_conv;
2329 /* replace continuation module, from output side */
2331 /* output redirection */
2333 if (noout_f || guess_f){
2340 if (mimeout_f == TRUE) {
2341 o_base64conv = oconv; oconv = base64_conv;
2343 /* base64_count = 0; */
2347 o_crconv = oconv; oconv = cr_conv;
2350 o_rot_conv = oconv; oconv = rot_conv;
2353 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2356 o_hira_conv = oconv; oconv = hira_conv;
2359 o_fconv = oconv; oconv = fold_conv;
2362 if (alpha_f || x0201_f) {
2363 o_zconv = oconv; oconv = z_conv;
2367 i_ungetc = std_ungetc;
2368 /* input redirection */
2371 i_cgetc = i_getc; i_getc = cap_getc;
2372 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2375 i_ugetc = i_getc; i_getc = url_getc;
2376 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2379 #ifdef NUMCHAR_OPTION
2381 i_ngetc = i_getc; i_getc = numchar_getc;
2382 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2385 #ifdef UNICODE_NORMALIZATION
2386 if (nfc_f && input_f == UTF8_INPUT){
2387 i_nfc_getc = i_getc; i_getc = nfc_getc;
2388 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2391 if (mime_f && mimebuf_f==FIXED_MIME) {
2392 i_mgetc = i_getc; i_getc = mime_getc;
2393 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2396 i_bgetc = i_getc; i_getc = broken_getc;
2397 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* JIS, EUC and Latin-1 input all go through e_iconv */
2399 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2400 set_iconv(-TRUE, e_iconv);
2401 } else if (input_f == SJIS_INPUT) {
2402 set_iconv(-TRUE, s_iconv);
2403 #ifdef UTF8_INPUT_ENABLE
2404 } else if (input_f == UTF8_INPUT) {
2405 set_iconv(-TRUE, w_iconv);
2406 } else if (input_f == UTF16_INPUT) {
2407 set_iconv(-TRUE, w_iconv16);
2408 } else if (input_f == UTF32_INPUT) {
2409 set_iconv(-TRUE, w_iconv32);
2412 set_iconv(FALSE, e_iconv);
2416 struct input_code *p = input_code_list;
2424 * Check and Ignore BOM
/*
 * check_bom: look at the first bytes of f (read through i_getc) for a
 * byte order mark.  Byte patterns recognised by the visible comparisons
 * (the first byte of each pattern is inferred from the value pushed back
 * last in that group):
 *   00 00 FE FF  -> w_iconv32, input_endian = ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, input_endian = ENDIAN_2143
 *   EF BB BF     -> w_iconv
 *   FE FF 00 00  -> w_iconv32, input_endian = ENDIAN_3412
 *   FE FF        -> w_iconv16, input_endian = ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, input_endian = ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, input_endian = ENDIAN_LITTLE
 * On a match the converter is selected with set_iconv(TRUE, ...) and, if
 * iconv is then that converter, input_endian is set.  Bytes that were read
 * but do not complete a pattern are pushed back with i_ungetc in reverse
 * order of reading.
 * NOTE(review): the case labels, the condition guarding each set_iconv call
 * and the statements that decide whether a matched mark is dropped are not
 * visible in this excerpt.
 */
2426 void check_bom(FILE *f)
2429 switch(c2 = (*i_getc)(f)){
2431 if((c2 = (*i_getc)(f)) == 0x00){
2432 if((c2 = (*i_getc)(f)) == 0xFE){
2433 if((c2 = (*i_getc)(f)) == 0xFF){
2435 set_iconv(TRUE, w_iconv32);
2437 if (iconv == w_iconv32) {
2438 input_endian = ENDIAN_BIG;
2441 (*i_ungetc)(0xFF,f);
2442 }else (*i_ungetc)(c2,f);
2443 (*i_ungetc)(0xFE,f);
2444 }else if(c2 == 0xFF){
2445 if((c2 = (*i_getc)(f)) == 0xFE){
2447 set_iconv(TRUE, w_iconv32);
2449 if (iconv == w_iconv32) {
2450 input_endian = ENDIAN_2143;
2453 (*i_ungetc)(0xFF,f);
2454 }else (*i_ungetc)(c2,f);
2455 (*i_ungetc)(0xFF,f);
2456 }else (*i_ungetc)(c2,f);
2457 (*i_ungetc)(0x00,f);
2458 }else (*i_ungetc)(c2,f);
2459 (*i_ungetc)(0x00,f);
2462 if((c2 = (*i_getc)(f)) == 0xBB){
2463 if((c2 = (*i_getc)(f)) == 0xBF){
2465 set_iconv(TRUE, w_iconv);
2467 if (iconv == w_iconv) {
2470 (*i_ungetc)(0xBF,f);
2471 }else (*i_ungetc)(c2,f);
2472 (*i_ungetc)(0xBB,f);
2473 }else (*i_ungetc)(c2,f);
2474 (*i_ungetc)(0xEF,f);
2477 if((c2 = (*i_getc)(f)) == 0xFF){
2478 if((c2 = (*i_getc)(f)) == 0x00){
2479 if((c2 = (*i_getc)(f)) == 0x00){
2481 set_iconv(TRUE, w_iconv32);
2483 if (iconv == w_iconv32) {
2484 input_endian = ENDIAN_3412;
2487 (*i_ungetc)(0x00,f);
2488 }else (*i_ungetc)(c2,f);
2489 (*i_ungetc)(0x00,f);
2490 }else (*i_ungetc)(c2,f);
2492 set_iconv(TRUE, w_iconv16);
2494 if (iconv == w_iconv16) {
2495 input_endian = ENDIAN_BIG;
2498 (*i_ungetc)(0xFF,f);
2499 }else (*i_ungetc)(c2,f);
2500 (*i_ungetc)(0xFE,f);
2503 if((c2 = (*i_getc)(f)) == 0xFE){
2504 if((c2 = (*i_getc)(f)) == 0x00){
2505 if((c2 = (*i_getc)(f)) == 0x00){
2507 set_iconv(TRUE, w_iconv32);
2509 if (iconv == w_iconv32) {
2510 input_endian = ENDIAN_LITTLE;
2513 (*i_ungetc)(0x00,f);
2514 }else (*i_ungetc)(c2,f);
2515 (*i_ungetc)(0x00,f);
2516 }else (*i_ungetc)(c2,f);
2518 set_iconv(TRUE, w_iconv16);
2520 if (iconv == w_iconv16) {
2521 input_endian = ENDIAN_LITTLE;
2524 (*i_ungetc)(0xFE,f);
2525 }else (*i_ungetc)(c2,f);
2526 (*i_ungetc)(0xFF,f);
2535 Conversion main loop. Code detection only.
/*
 * kanji_convert: main conversion loop over the input stream f.
 *
 * Calls module_connection() and then reads characters through i_getc until
 * EOF.  The loop uses the local macros NEXT (continue), SEND (fall through
 * to output) and LAST (break).  Parts visible in this excerpt:
 *   - a pending first byte c2 with the 8th bit set: while the input code is
 *     not established and no MIME decoding is active, detection is deferred
 *     to h_conv()
 *   - UTF-16 and UTF-32 input: further bytes are read and combined
 *     according to input_endian
 *   - X0201 kana in Shift_JIS (SSP up to 0xdf) and in EUC (after SSO)
 *   - "=?" starting a MIME encoded word, via mime_begin() or
 *     mime_begin_strict() when mime_f is STRICT_MIME
 *   - SI, SO and ESC sequences (ESC $ ..., ESC ( ...), which set input_mode
 *     and shift_mode
 *   - ESC $ inside Shift_JIS input, mapped into the 0xE000 area with
 *     CLASS_UNICODE
 *   - NL and CR handling
 *   - the final dispatch through (*iconv)(c2, c1, c0) and (*oconv)(...)
 * After the loop (*iconv)(EOF, 0, 0) is called and, when is_inputcode_set is
 * false, the input_code entry with the lowest score supplies the input code
 * name through set_input_codename().
 * NOTE(review): many lines of this function are not visible in this
 * excerpt, including most return statements -- confirm before relying on
 * this summary.
 */
2538 nkf_char kanji_convert(FILE *f)
2540 nkf_char c3, c2=0, c1, c0=0;
2541 int is_8bit = FALSE;
2543 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2544 #ifdef UTF8_INPUT_ENABLE
2545 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2552 output_mode = ASCII;
2555 #define NEXT continue /* no output, get next */
2556 #define SEND ; /* output c1 and c2, get next */
2557 #define LAST break /* end of loop, go closing */
2559 module_connection();
2562 while ((c1 = (*i_getc)(f)) != EOF) {
2563 #ifdef INPUT_CODE_FIX
2569 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2570 /* in case of 8th bit is on */
2571 if (!estab_f&&!mime_decode_mode) {
2572 /* in case of not established yet */
2573 /* It is still ambiguous */
2574 if (h_conv(f, c2, c1)==EOF)
2580 /* in case of already established */
2582 /* ignore bogus code and not CP5022x UCD */
2590 /* second byte, 7 bit code */
2591 /* it might be kanji shifted */
2592 if ((c1 == DEL) || (c1 <= SPACE)) {
2593 /* ignore bogus first code */
2600 #ifdef UTF8_INPUT_ENABLE
2601 if (iconv == w_iconv16) {
2602 if (input_endian == ENDIAN_BIG) {
2604 if ((c1 = (*i_getc)(f)) != EOF) {
2605 if (0xD8 <= c2 && c2 <= 0xDB) {
2606 if ((c0 = (*i_getc)(f)) != EOF) {
2608 if ((c3 = (*i_getc)(f)) != EOF) {
2615 if ((c2 = (*i_getc)(f)) != EOF) {
2616 if (0xD8 <= c2 && c2 <= 0xDB) {
2617 if ((c3 = (*i_getc)(f)) != EOF) {
2618 if ((c0 = (*i_getc)(f)) != EOF) {
2627 } else if(iconv == w_iconv32){
2629 if((c2 = (*i_getc)(f)) != EOF &&
2630 (c1 = (*i_getc)(f)) != EOF &&
2631 (c0 = (*i_getc)(f)) != EOF){
2632 switch(input_endian){
2634 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2637 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2640 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2643 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2653 #ifdef NUMCHAR_OPTION
2654 if (is_unicode_capsule(c1)){
2658 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2660 if (!estab_f && !iso8859_f) {
2661 /* not established yet */
2664 } else { /* estab_f==TRUE */
2669 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2670 /* SJIS X0201 Case... */
2671 if(iso2022jp_f && x0201_f==NO_X0201) {
2672 (*oconv)(GETA1, GETA2);
2679 } else if (c1==SSO && iconv != s_iconv) {
2680 /* EUC X0201 Case */
2681 c1 = (*i_getc)(f); /* skip SSO */
2683 if (SSP<=c1 && c1<0xe0) {
2684 if(iso2022jp_f && x0201_f==NO_X0201) {
2685 (*oconv)(GETA1, GETA2);
2692 } else { /* bogus code, skip SSO and one byte */
2695 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2696 (c1 == 0xFD || c1 == 0xFE)) {
2702 /* already established */
2707 } else if ((c1 > SPACE) && (c1 != DEL)) {
2708 /* in case of Roman characters */
2710 /* output 1 shifted byte */
2714 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2715 /* output 1 shifted byte */
2716 if(iso2022jp_f && x0201_f==NO_X0201) {
2717 (*oconv)(GETA1, GETA2);
2724 /* look like bogus code */
2727 } else if (input_mode == X0208 || input_mode == X0212 ||
2728 input_mode == X0213_1 || input_mode == X0213_2) {
2729 /* in case of Kanji shifted */
2732 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2733 /* Check MIME code */
2734 if ((c1 = (*i_getc)(f)) == EOF) {
2737 } else if (c1 == '?') {
2738 /* =? is mime conversion start sequence */
2739 if(mime_f == STRICT_MIME) {
2740 /* check in real detail */
2741 if (mime_begin_strict(f) == EOF)
2745 } else if (mime_begin(f) == EOF)
2755 /* normal ASCII code */
2758 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
\r
2761 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
\r
2764 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
\r
2765 if ((c1 = (*i_getc)(f)) == EOF) {
2766 /* (*oconv)(0, ESC); don't send bogus code */
2768 } else if (c1 == '$') {
2769 if ((c1 = (*i_getc)(f)) == EOF) {
2771 (*oconv)(0, ESC); don't send bogus code
2772 (*oconv)(0, '$'); */
2774 } else if (c1 == '@'|| c1 == 'B') {
2775 /* This is kanji introduction */
2778 set_input_codename("ISO-2022-JP");
2780 debug(input_codename);
2783 } else if (c1 == '(') {
2784 if ((c1 = (*i_getc)(f)) == EOF) {
2785 /* don't send bogus code
2791 } else if (c1 == '@'|| c1 == 'B') {
2792 /* This is kanji introduction */
2797 } else if (c1 == 'D'){
2801 #endif /* X0212_ENABLE */
2802 } else if (c1 == (X0213_1&0x7F)){
2803 input_mode = X0213_1;
2806 } else if (c1 == (X0213_2&0x7F)){
2807 input_mode = X0213_2;
2811 /* could be some special code */
2818 } else if (broken_f&0x2) {
2819 /* accept any ESC-(-x as broken code ... */
2829 } else if (c1 == '(') {
2830 if ((c1 = (*i_getc)(f)) == EOF) {
2831 /* don't send bogus code
2833 (*oconv)(0, '('); */
2837 /* This is X0201 kana introduction */
2838 input_mode = X0201; shift_mode = X0201;
2840 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2841 /* This is X0208 kanji introduction */
2842 input_mode = ASCII; shift_mode = FALSE;
2844 } else if (broken_f&0x2) {
2845 input_mode = ASCII; shift_mode = FALSE;
2850 /* maintain various input_mode here */
2854 } else if ( c1 == 'N' || c1 == 'n' ){
2856 c3 = (*i_getc)(f); /* skip SS2 */
2857 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2872 } else if (c1 == ESC && iconv == s_iconv) {
2873 /* ESC in Shift_JIS */
2874 if ((c1 = (*i_getc)(f)) == EOF) {
2875 /* (*oconv)(0, ESC); don't send bogus code */
2877 } else if (c1 == '$') {
2879 if ((c1 = (*i_getc)(f)) == EOF) {
2881 (*oconv)(0, ESC); don't send bogus code
2882 (*oconv)(0, '$'); */
2885 if (('E' <= c1 && c1 <= 'G') ||
2886 ('O' <= c1 && c1 <= 'Q')) {
2894 static const int jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2895 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SPACE + 0xE000 + CLASS_UNICODE;
2896 while ((c1 = (*i_getc)(f)) != EOF) {
2897 if (SPACE <= c1 && c1 <= 'z') {
2898 (*oconv)(0, c1 + c0);
2899 } else break; /* c1 == SO */
2903 if (c1 == EOF) LAST;
2910 } else if (c1 == NL || c1 == CR) {
2912 input_mode = ASCII; set_iconv(FALSE, 0);
2914 } else if (mime_decode_f && !mime_decode_mode){
2916 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2924 } else { /* if (c1 == CR)*/
2925 if ((c1=(*i_getc)(f))!=EOF) {
2929 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2944 if (prev_cr && c1 == NL) crmode_f = CRLF;
2947 } else if (c1 == DEL && input_mode == X0208 ) {
2957 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2960 if ((c0 = (*i_getc)(f)) != EOF) {
2963 if ((c3 = (*i_getc)(f)) != EOF) {
2965 (*iconv)(c2, c1, c0|c3);
2970 /* 3 bytes EUC or UTF-8 */
2971 if ((c0 = (*i_getc)(f)) != EOF) {
2973 (*iconv)(c2, c1, c0);
2981 0x7F <= c2 && c2 <= 0x92 &&
2982 0x21 <= c1 && c1 <= 0x7E) {
2984 if(c1 == 0x7F) return 0;
2985 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2988 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2992 (*oconv)(PREFIX_EUCG3 | c2, c1);
2994 #endif /* X0212_ENABLE */
2996 (*oconv)(PREFIX_EUCG3 | c2, c1);
2999 (*oconv)(input_mode, c1); /* other special case */
3005 /* goto next_word */
3009 (*iconv)(EOF, 0, 0);
3010 if (!is_inputcode_set)
3013 struct input_code *p = input_code_list;
3014 struct input_code *result = p;
3016 if (p->score < result->score) result = p;
3019 set_input_codename(result->name);
/*
 * h_conv: used by kanji_convert while the input code is still ambiguous.
 *   f      - input stream, read through i_getc
 *   c2, c1 - the two bytes that triggered the call
 * Phase 1: keeps reading bytes into the hold buffer with push_hold_buf()
 * until the buffer reports EOF (full) or the code becomes established
 * (estab_f).  The candidate in input_code_list that has a status_func and
 * the lowest score is then installed with set_iconv(TRUE, ...).
 * Phase 2: replays the held bytes through (*iconv).  When a sequence needs
 * more bytes, they are taken from the hold buffer first and from i_getc
 * once the buffer is exhausted.
 * NOTE(review): the return type line, the condition guarding the candidate
 * selection and the return statements are not visible in this excerpt.
 */
3026 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3028 nkf_char ret, c3, c0;
3032 /** it must NOT be in the kanji shifted sequence */
3033 /** it must NOT be written in JIS7 */
3034 /** and it must be after 2 byte 8bit code */
3040 while ((c1 = (*i_getc)(f)) != EOF) {
3046 if (push_hold_buf(c1) == EOF || estab_f){
3052 struct input_code *p = input_code_list;
3053 struct input_code *result = p;
3058 if (p->status_func && p->score < result->score){
3063 set_iconv(TRUE, result->iconv_func);
3068 ** 1) EOF is detected, or
3069 ** 2) Code is established, or
3070 ** 3) Buffer is FULL (but last word is pushed)
3072 ** in 1) and 3) cases, we continue to use
3073 ** Kanji codes by oconv and leave estab_f unchanged.
3078 while (hold_index < hold_count){
3079 c2 = hold_buf[hold_index++];
3081 #ifdef NUMCHAR_OPTION
3082 || is_unicode_capsule(c2)
3087 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3088 (*iconv)(X0201, c2, 0);
3091 if (hold_index < hold_count){
3092 c1 = hold_buf[hold_index++];
3102 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3105 if (hold_index < hold_count){
3106 c0 = hold_buf[hold_index++];
3107 } else if ((c0 = (*i_getc)(f)) == EOF) {
3113 if (hold_index < hold_count){
3114 c3 = hold_buf[hold_index++];
3115 } else if ((c3 = (*i_getc)(f)) == EOF) {
3120 (*iconv)(c2, c1, c0|c3);
3125 /* 3 bytes EUC or UTF-8 */
3126 if (hold_index < hold_count){
3127 c0 = hold_buf[hold_index++];
3128 } else if ((c0 = (*i_getc)(f)) == EOF) {
3134 (*iconv)(c2, c1, c0);
3137 if (c0 == EOF) break;
/*
 * push_hold_buf: append c2, truncated to unsigned char, to hold_buf.
 * Returns EOF when the buffer holds HOLD_SIZE*2 entries after the store,
 * otherwise the new hold_count.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not visible in this excerpt.
 */
3142 nkf_char push_hold_buf(nkf_char c2)
3144 if (hold_count >= HOLD_SIZE*2)
3146 hold_buf[hold_count++] = (unsigned char)c2;
3147 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * s2e_conv: convert the Shift_JIS byte pair (c2, c1); p2 and p1 are the
 * output locations.
 * Visible steps:
 *   - SHIFTJIS_CP932: unless cp932inv_f is set, lead bytes accepted by
 *     is_ibmext_in_sjis() are looked up in shiftjis_cp932; lead bytes in
 *     CP932INV_TABLE_BEGIN..CP932INV_TABLE_END are looked up in cp932inv
 *   - unless x0213_f is set, lead bytes accepted by is_ibmext_in_sjis() are
 *     looked up in shiftjis_x0212 and tagged with PREFIX_EUCG3
 *   - with x0213_f and c2 >= 0xF0, the row is derived from
 *     shift_jisx0213_s1a3_table or arithmetically, tagged with PREFIX_EUCG3
 *   - otherwise c2 is doubled and offset by SJ0162 or SJ6394, incremented
 *     when c1 is above 0x9E, and c1 is reduced by SPACE or 0x1F
 * NOTE(review): the stores through p2/p1 and the return statements are not
 * visible in this excerpt; callers in this file test the result against 0.
 */
3150 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3152 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3155 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3156 #ifdef SHIFTJIS_CP932
3157 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3158 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3165 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3166 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3172 #endif /* SHIFTJIS_CP932 */
3174 if (!x0213_f && is_ibmext_in_sjis(c2)){
3175 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3178 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3191 if(x0213_f && c2 >= 0xF0){
3192 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3193 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3194 }else{ /* 78<=k<=94 */
3195 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3196 if (0x9E < c1) c2++;
3199 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3200 if (0x9E < c1) c2++;
3203 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3210 c2 = x0212_unshift(c2);
3217 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3221 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3223 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3225 if(c1 == 0x7F) return 0;
3226 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3229 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3230 if (ret) return ret;
3236 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3241 }else if (c2 == 0x8f){
3245 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3246 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3247 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3250 c2 = (c2 << 8) | (c1 & 0x7f);
3252 #ifdef SHIFTJIS_CP932
3255 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3256 s2e_conv(s2, s1, &c2, &c1);
3263 #endif /* SHIFTJIS_CP932 */
3265 #endif /* X0212_ENABLE */
3266 } else if (c2 == SSO){
3269 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3272 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3273 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3274 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3279 #ifdef SHIFTJIS_CP932
3280 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3282 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3283 s2e_conv(s2, s1, &c2, &c1);
3290 #endif /* SHIFTJIS_CP932 */
3297 #ifdef UTF8_INPUT_ENABLE
3298 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3305 }else if (0xc0 <= c2 && c2 <= 0xef) {
3306 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3307 #ifdef NUMCHAR_OPTION
3310 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3318 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3321 static const int w_iconv_utf8_1st_byte[] =
3323 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3324 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3325 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3326 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3328 if (c2 < 0 || 0xff < c2) {
3329 }else if (c2 == 0) { /* 0 : 1 byte*/
3331 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3334 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3336 if (c1 < 0x80 || 0xBF < c1) return 0;
3339 if (c0 == 0) return -1;
3340 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3345 if (c0 == 0) return -1;
3346 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3350 if (c0 == 0) return -1;
3351 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3355 if (c0 == 0) return -2;
3356 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3360 if (c0 == 0) return -2;
3361 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3365 if (c0 == 0) return -2;
3366 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3374 if (c2 == 0 || c2 == EOF){
3375 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3376 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3379 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3388 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3389 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3396 }else if (val < 0x800){
3397 *p2 = 0xc0 | (val >> 6);
3398 *p1 = 0x80 | (val & 0x3f);
3400 } else if (val <= NKF_INT32_C(0xFFFF)) {
3401 *p2 = 0xe0 | (val >> 12);
3402 *p1 = 0x80 | ((val >> 6) & 0x3f);
3403 *p0 = 0x80 | (val & 0x3f);
3404 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3405 *p2 = 0xe0 | (val >> 16);
3406 *p1 = 0x80 | ((val >> 12) & 0x3f);
3407 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3416 #ifdef UTF8_INPUT_ENABLE
3417 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3422 } else if (c2 >= 0xf0){
3423 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3424 val = (c2 & 0x0f) << 18;
3425 val |= (c1 & 0x3f) << 12;
3426 val |= (c0 & 0x3f00) >> 2;
3428 }else if (c2 >= 0xe0){
3429 val = (c2 & 0x0f) << 12;
3430 val |= (c1 & 0x3f) << 6;
3432 }else if (c2 >= 0xc0){
3433 val = (c2 & 0x1f) << 6;
3441 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3443 nkf_char c2, c1, c0;
3450 w16w_conv(val, &c2, &c1, &c0);
3451 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3452 #ifdef NUMCHAR_OPTION
3455 *p1 = CLASS_UNICODE | val;
3464 #ifdef UTF8_INPUT_ENABLE
3465 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3468 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3471 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3472 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3474 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3476 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3481 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3482 if (ret) return ret;
3487 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3491 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3492 } else if (is_unicode_bmp(c1)) {
3493 ret = w16e_conv(c1, &c2, &c1);
3496 c1 = CLASS_UNICODE | c1;
3498 if (ret) return ret;
3503 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3505 const unsigned short *const *pp;
3506 const unsigned short *const *const *ppp;
3507 static const int no_best_fit_chars_table_C2[] =
3508 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3509 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3510 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3511 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3512 static const int no_best_fit_chars_table_C2_ms[] =
3513 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3514 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3515 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3516 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3517 static const int no_best_fit_chars_table_932_C2[] =
3518 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3519 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3520 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3521 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3522 static const int no_best_fit_chars_table_932_C3[] =
3523 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3524 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3525 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3526 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3532 }else if(c2 < 0xe0){
3533 if(no_best_fit_chars_f){
3534 if(ms_ucs_map_f == UCS_MAP_CP932){
3537 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3540 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3543 }else if(!cp932inv_f){
3546 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3549 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3552 }else if(ms_ucs_map_f == UCS_MAP_MS){
3553 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3554 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3572 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3573 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3574 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3576 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3577 }else if(c0 < 0xF0){
3578 if(no_best_fit_chars_f){
3579 if(ms_ucs_map_f == UCS_MAP_CP932){
3580 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3581 }else if(ms_ucs_map_f == UCS_MAP_MS){
3586 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3589 if(c0 == 0x92) return 1;
3594 if(c1 == 0x80 || c0 == 0x9C) return 1;
3597 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3602 if(c0 == 0x94) return 1;
3605 if(c0 == 0xBB) return 1;
3615 if(c0 == 0x95) return 1;
3618 if(c0 == 0xA5) return 1;
3625 if(c0 == 0x8D) return 1;
3628 if(c0 == 0x9E && !cp932inv_f) return 1;
3631 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3639 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3640 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3641 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3643 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3645 #ifdef SHIFTJIS_CP932
3646 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3648 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3649 s2e_conv(s2, s1, p2, p1);
3658 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3661 const unsigned short *p;
3664 if (pp == 0) return 1;
3667 if (c1 < 0 || psize <= c1) return 1;
3669 if (p == 0) return 1;
3672 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3674 if (val == 0) return 1;
3675 if (no_cp932ext_f && (
3676 (val>>8) == 0x2D || /* NEC special characters */
3677 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3685 if (c2 == SO) c2 = X0201;
3692 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3694 const char *hex = "0123456789ABCDEF";
3700 (*f)(0, hex[(c>>shift)&0xF]);
3710 void encode_fallback_html(nkf_char c)
3715 if(c >= NKF_INT32_C(1000000))
3716 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3717 if(c >= NKF_INT32_C(100000))
3718 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3720 (*oconv)(0, 0x30+(c/10000 )%10);
3722 (*oconv)(0, 0x30+(c/1000 )%10);
3724 (*oconv)(0, 0x30+(c/100 )%10);
3726 (*oconv)(0, 0x30+(c/10 )%10);
3728 (*oconv)(0, 0x30+ c %10);
3733 void encode_fallback_xml(nkf_char c)
3738 nkf_each_char_to_hex(oconv, c);
3743 void encode_fallback_java(nkf_char c)
3745 const char *hex = "0123456789ABCDEF";
3748 if(!is_unicode_bmp(c)){
3752 (*oconv)(0, hex[(c>>20)&0xF]);
3753 (*oconv)(0, hex[(c>>16)&0xF]);
3757 (*oconv)(0, hex[(c>>12)&0xF]);
3758 (*oconv)(0, hex[(c>> 8)&0xF]);
3759 (*oconv)(0, hex[(c>> 4)&0xF]);
3760 (*oconv)(0, hex[ c &0xF]);
3764 void encode_fallback_perl(nkf_char c)
3769 nkf_each_char_to_hex(oconv, c);
3774 void encode_fallback_subchar(nkf_char c)
3776 c = unicode_subchar;
3777 (*oconv)((c>>8)&0xFF, c&0xFF);
3782 #ifdef UTF8_OUTPUT_ENABLE
3783 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3785 const unsigned short *p;
3788 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3796 p = euc_to_utf8_1byte;
3798 } else if (is_eucg3(c2)){
3799 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3802 c2 = (c2&0x7f) - 0x21;
3803 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3804 p = x0212_to_utf8_2bytes[c2];
3810 c2 = (c2&0x7f) - 0x21;
3811 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3813 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3814 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3815 euc_to_utf8_2bytes_ms[c2];
3820 c1 = (c1 & 0x7f) - 0x21;
3821 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3826 void w_oconv(nkf_char c2, nkf_char c1)
3832 output_bom_f = FALSE;
3843 #ifdef NUMCHAR_OPTION
3844 if (c2 == 0 && is_unicode_capsule(c1)){
3845 val = c1 & VALUE_MASK;
3848 }else if (val < 0x800){
3849 (*o_putc)(0xC0 | (val >> 6));
3850 (*o_putc)(0x80 | (val & 0x3f));
3851 } else if (val <= NKF_INT32_C(0xFFFF)) {
3852 (*o_putc)(0xE0 | (val >> 12));
3853 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3854 (*o_putc)(0x80 | (val & 0x3f));
3855 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3856 (*o_putc)(0xF0 | ( val>>18));
3857 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3858 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3859 (*o_putc)(0x80 | ( val & 0x3f));
3866 output_mode = ASCII;
3868 } else if (c2 == ISO8859_1) {
3869 output_mode = ISO8859_1;
3870 (*o_putc)(c1 | 0x080);
3873 val = e2w_conv(c2, c1);
3875 w16w_conv(val, &c2, &c1, &c0);
3879 if (c0) (*o_putc)(c0);
3885 void w_oconv16(nkf_char c2, nkf_char c1)
3888 output_bom_f = FALSE;
3889 if (output_endian == ENDIAN_LITTLE){
3890 (*o_putc)((unsigned char)'\377');
3894 (*o_putc)((unsigned char)'\377');
3903 if (c2 == ISO8859_1) {
3906 #ifdef NUMCHAR_OPTION
3907 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3908 if (is_unicode_bmp(c1)) {
3909 c2 = (c1 >> 8) & 0xff;
3913 if (c1 <= UNICODE_MAX) {
3914 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3915 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3916 if (output_endian == ENDIAN_LITTLE){
3917 (*o_putc)(c2 & 0xff);
3918 (*o_putc)((c2 >> 8) & 0xff);
3919 (*o_putc)(c1 & 0xff);
3920 (*o_putc)((c1 >> 8) & 0xff);
3922 (*o_putc)((c2 >> 8) & 0xff);
3923 (*o_putc)(c2 & 0xff);
3924 (*o_putc)((c1 >> 8) & 0xff);
3925 (*o_putc)(c1 & 0xff);
3932 nkf_char val = e2w_conv(c2, c1);
3933 c2 = (val >> 8) & 0xff;
3937 if (output_endian == ENDIAN_LITTLE){
3946 void w_oconv32(nkf_char c2, nkf_char c1)
3949 output_bom_f = FALSE;
3950 if (output_endian == ENDIAN_LITTLE){
3951 (*o_putc)((unsigned char)'\377');
3959 (*o_putc)((unsigned char)'\377');
3968 if (c2 == ISO8859_1) {
3970 #ifdef NUMCHAR_OPTION
3971 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3975 c1 = e2w_conv(c2, c1);
3978 if (output_endian == ENDIAN_LITTLE){
3979 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3980 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3981 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3985 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3986 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3987 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3992 void e_oconv(nkf_char c2, nkf_char c1)
3994 #ifdef NUMCHAR_OPTION
3995 if (c2 == 0 && is_unicode_capsule(c1)){
3996 w16e_conv(c1, &c2, &c1);
3997 if (c2 == 0 && is_unicode_capsule(c1)){
3998 c2 = c1 & VALUE_MASK;
3999 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4003 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4004 c1 = 0x21 + c1 % 94;
4007 (*o_putc)((c2 & 0x7f) | 0x080);
4008 (*o_putc)(c1 | 0x080);
4010 (*o_putc)((c2 & 0x7f) | 0x080);
4011 (*o_putc)(c1 | 0x080);
4015 if (encode_fallback) (*encode_fallback)(c1);
4024 } else if (c2 == 0) {
4025 output_mode = ASCII;
4027 } else if (c2 == X0201) {
4028 output_mode = JAPANESE_EUC;
4029 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4030 } else if (c2 == ISO8859_1) {
4031 output_mode = ISO8859_1;
4032 (*o_putc)(c1 | 0x080);
4034 } else if (is_eucg3(c2)){
4035 output_mode = JAPANESE_EUC;
4036 #ifdef SHIFTJIS_CP932
4039 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4040 s2e_conv(s2, s1, &c2, &c1);
4045 output_mode = ASCII;
4047 }else if (is_eucg3(c2)){
4050 (*o_putc)((c2 & 0x7f) | 0x080);
4051 (*o_putc)(c1 | 0x080);
4054 (*o_putc)((c2 & 0x7f) | 0x080);
4055 (*o_putc)(c1 | 0x080);
4059 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4060 set_iconv(FALSE, 0);
4061 return; /* too late to rescue this char */
4063 output_mode = JAPANESE_EUC;
4064 (*o_putc)(c2 | 0x080);
4065 (*o_putc)(c1 | 0x080);
4070 nkf_char x0212_shift(nkf_char c)
4075 if (0x75 <= c && c <= 0x7f){
4076 ret = c + (0x109 - 0x75);
4079 if (0x75 <= c && c <= 0x7f){
4080 ret = c + (0x113 - 0x75);
4087 nkf_char x0212_unshift(nkf_char c)
4090 if (0x7f <= c && c <= 0x88){
4091 ret = c + (0x75 - 0x7f);
4092 }else if (0x89 <= c && c <= 0x92){
4093 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4097 #endif /* X0212_ENABLE */
4099 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4105 if((0x21 <= ndx && ndx <= 0x2F)){
4106 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4107 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4109 }else if(0x6E <= ndx && ndx <= 0x7E){
4110 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4111 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4117 else if(nkf_isgraph(ndx)){
4119 const unsigned short *ptr;
4120 ptr = x0212_shiftjis[ndx - 0x21];
4122 val = ptr[(c1 & 0x7f) - 0x21];
4131 c2 = x0212_shift(c2);
4133 #endif /* X0212_ENABLE */
4135 if(0x7F < c2) return 1;
4136 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4137 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4141 void s_oconv(nkf_char c2, nkf_char c1)
4143 #ifdef NUMCHAR_OPTION
4144 if (c2 == 0 && is_unicode_capsule(c1)){
4145 w16e_conv(c1, &c2, &c1);
4146 if (c2 == 0 && is_unicode_capsule(c1)){
4147 c2 = c1 & VALUE_MASK;
4148 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4151 c2 = c1 / 188 + 0xF0;
4153 c1 += 0x40 + (c1 > 0x3e);
4158 if(encode_fallback)(*encode_fallback)(c1);
4167 } else if (c2 == 0) {
4168 output_mode = ASCII;
4170 } else if (c2 == X0201) {
4171 output_mode = SHIFT_JIS;
4173 } else if (c2 == ISO8859_1) {
4174 output_mode = ISO8859_1;
4175 (*o_putc)(c1 | 0x080);
4177 } else if (is_eucg3(c2)){
4178 output_mode = SHIFT_JIS;
4179 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4185 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4186 set_iconv(FALSE, 0);
4187 return; /* too late to rescue this char */
4189 output_mode = SHIFT_JIS;
4190 e2s_conv(c2, c1, &c2, &c1);
4192 #ifdef SHIFTJIS_CP932
4194 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4195 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4201 #endif /* SHIFTJIS_CP932 */
4204 if (prefix_table[(unsigned char)c1]){
4205 (*o_putc)(prefix_table[(unsigned char)c1]);
4211 void j_oconv(nkf_char c2, nkf_char c1)
4213 #ifdef NUMCHAR_OPTION
4214 if (c2 == 0 && is_unicode_capsule(c1)){
4215 w16e_conv(c1, &c2, &c1);
4216 if (c2 == 0 && is_unicode_capsule(c1)){
4217 c2 = c1 & VALUE_MASK;
4218 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4221 c2 = 0x7F + c1 / 94;
4222 c1 = 0x21 + c1 % 94;
4224 if (encode_fallback) (*encode_fallback)(c1);
4231 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4234 (*o_putc)(ascii_intro);
4235 output_mode = ASCII;
4239 } else if (is_eucg3(c2)){
4241 if(output_mode!=X0213_2){
4242 output_mode = X0213_2;
4246 (*o_putc)(X0213_2&0x7F);
4249 if(output_mode!=X0212){
4250 output_mode = X0212;
4254 (*o_putc)(X0212&0x7F);
4257 (*o_putc)(c2 & 0x7f);
4260 } else if (c2==X0201) {
4261 if (output_mode!=X0201) {
4262 output_mode = X0201;
4268 } else if (c2==ISO8859_1) {
4269 /* iso8859 introduction, or 8th bit on */
4270 /* Can we convert in 7bit form using ESC-'-'-A ?
4272 output_mode = ISO8859_1;
4274 } else if (c2 == 0) {
4275 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4278 (*o_putc)(ascii_intro);
4279 output_mode = ASCII;
4284 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4285 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4287 if (output_mode!=X0213_1) {
4288 output_mode = X0213_1;
4292 (*o_putc)(X0213_1&0x7F);
4294 }else if (output_mode != X0208) {
4295 output_mode = X0208;
4298 (*o_putc)(kanji_intro);
4305 void base64_conv(nkf_char c2, nkf_char c1)
4307 mime_prechar(c2, c1);
4308 (*o_base64conv)(c2,c1);
4312 static nkf_char broken_buf[3];
4313 static int broken_counter = 0;
4314 static int broken_last = 0;
4315 nkf_char broken_getc(FILE *f)
4319 if (broken_counter>0) {
4320 return broken_buf[--broken_counter];
4323 if (c=='$' && broken_last != ESC
4324 && (input_mode==ASCII || input_mode==X0201)) {
4327 if (c1=='@'|| c1=='B') {
4328 broken_buf[0]=c1; broken_buf[1]=c;
4335 } else if (c=='(' && broken_last != ESC
4336 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4339 if (c1=='J'|| c1=='B') {
4340 broken_buf[0]=c1; broken_buf[1]=c;
4353 nkf_char broken_ungetc(nkf_char c, FILE *f)
4355 if (broken_counter<2)
4356 broken_buf[broken_counter++]=c;
4360 void cr_conv(nkf_char c2, nkf_char c1)
4364 if (! (c2==0&&c1==NL) ) {
4370 } else if (c1=='\r') {
4372 } else if (c1=='\n') {
4373 if (crmode_f==CRLF) {
4374 (*o_crconv)(0,'\r');
4375 } else if (crmode_f==CR) {
4376 (*o_crconv)(0,'\r');
4380 } else if (c1!='\032' || crmode_f!=NL){
4386 Return value of fold_conv()
4388 \n add newline and output char
4389 \r add newline and output nothing
4392 1 (or else) normal output
4394 fold state in prev (previous character)
4396 >0x80 Japanese (X0208/X0201)
4401 This fold algorithm does not preserve leading spaces in a line.
4402 This is the main difference from fmt.
4405 #define char_size(c2,c1) (c2?2:1)
4407 void fold_conv(nkf_char c2, nkf_char c1)
4410 nkf_char fold_state;
4412 if (c1== '\r' && !fold_preserve_f) {
4413 fold_state=0; /* ignore cr */
4414 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4416 fold_state=0; /* ignore cr */
4417 } else if (c1== BS) {
4418 if (f_line>0) f_line--;
4420 } else if (c2==EOF && f_line != 0) { /* close open last line */
4422 } else if ((c1=='\n' && !fold_preserve_f)
4423 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4424 && fold_preserve_f)) {
4426 if (fold_preserve_f) {
4430 } else if ((f_prev == c1 && !fold_preserve_f)
4431 || (f_prev == '\n' && fold_preserve_f)
4432 ) { /* duplicate newline */
4435 fold_state = '\n'; /* output two newline */
4441 if (f_prev&0x80) { /* Japanese? */
4443 fold_state = 0; /* ignore given single newline */
4444 } else if (f_prev==' ') {
4448 if (++f_line<=fold_len)
4452 fold_state = '\r'; /* fold and output nothing */
4456 } else if (c1=='\f') {
4459 fold_state = '\n'; /* output newline and clear */
4460 } else if ( (c2==0 && c1==' ')||
4461 (c2==0 && c1=='\t')||
4462 (c2=='!'&& c1=='!')) {
4463 /* X0208 kankaku or ascii space */
4464 if (f_prev == ' ') {
4465 fold_state = 0; /* remove duplicate spaces */
4468 if (++f_line<=fold_len)
4469 fold_state = ' '; /* output ASCII space only */
4471 f_prev = ' '; f_line = 0;
4472 fold_state = '\r'; /* fold and output nothing */
4476 prev0 = f_prev; /* we still need this one... , but almost done */
4478 if (c2 || c2==X0201)
4479 f_prev |= 0x80; /* this is Japanese */
4480 f_line += char_size(c2,c1);
4481 if (f_line<=fold_len) { /* normal case */
4484 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4485 f_line = char_size(c2,c1);
4486 fold_state = '\n'; /* We can't wait, do fold now */
4487 } else if (c2==X0201) {
4488 /* simple kinsoku rules: fold_state = 1 means no folding */
4489 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4490 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4491 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4492 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4493 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4494 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4495 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4497 fold_state = '\n';/* add one new f_line before this character */
4500 fold_state = '\n';/* add one new f_line before this character */
4503 /* kinsoku point in ASCII */
4504 if ( c1==')'|| /* { [ ( */
4515 /* just after special */
4516 } else if (!is_alnum(prev0)) {
4517 f_line = char_size(c2,c1);
4519 } else if ((prev0==' ') || /* ignored new f_line */
4520 (prev0=='\n')|| /* ignored new f_line */
4521 (prev0&0x80)) { /* X0208 - ASCII */
4522 f_line = char_size(c2,c1);
4523 fold_state = '\n';/* add one new f_line before this character */
4525 fold_state = 1; /* default no fold in ASCII */
4529 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4530 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4531 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4532 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4533 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4534 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4535 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4536 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4537 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4538 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4539 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4540 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4541 /* default no fold in kinsoku */
4544 f_line = char_size(c2,c1);
4545 /* add one new f_line before this character */
4548 f_line = char_size(c2,c1);
4550 /* add one new f_line before this character */
4555 /* terminator process */
4556 switch(fold_state) {
4575 nkf_char z_prev2=0,z_prev1=0;
4577 void z_conv(nkf_char c2, nkf_char c1)
4580 /* if (c2) c1 &= 0x7f; assertion */
4582 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4586 if (x0201_f && z_prev2==X0201) { /* X0201 */
4587 if (c1==(0xde&0x7f)) { /*
\e$BByE@
\e(B */
4589 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4591 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /*
\e$BH>ByE@
\e(B */
4593 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4597 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4606 if (x0201_f && c2==X0201) {
4607 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4608 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4609 z_prev1 = c1; z_prev2 = c2;
4612 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4617 /* JISX0208 Alphabet */
4618 if (alpha_f && c2 == 0x23 ) {
4620 } else if (alpha_f && c2 == 0x21 ) {
4621 /* JISX0208 Kigou */
4626 } else if (alpha_f&0x4) {
4631 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4637 case '>': entity = ">"; break;
4638 case '<': entity = "<"; break;
4639 case '\"': entity = """; break;
4640 case '&': entity = "&"; break;
4643 while (*entity) (*o_zconv)(0, *entity++);
4653 #define rot13(c) ( \
4655 (c <= 'M') ? (c + 13): \
4656 (c <= 'Z') ? (c - 13): \
4658 (c <= 'm') ? (c + 13): \
4659 (c <= 'z') ? (c - 13): \
4663 #define rot47(c) ( \
4665 ( c <= 'O' ) ? (c + 47) : \
4666 ( c <= '~' ) ? (c - 47) : \
4670 void rot_conv(nkf_char c2, nkf_char c1)
4672 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4678 (*o_rot_conv)(c2,c1);
4681 void hira_conv(nkf_char c2, nkf_char c1)
4685 if (0x20 < c1 && c1 < 0x74) {
4687 (*o_hira_conv)(c2,c1);
4689 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4691 c1 = CLASS_UNICODE | 0x3094;
4692 (*o_hira_conv)(c2,c1);
4695 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4697 (*o_hira_conv)(c2,c1);
4702 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4705 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4707 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4711 (*o_hira_conv)(c2,c1);
4715 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4717 static const nkf_char range[RANGE_NUM_MAX][2] = {
4738 nkf_char start, end, c;
4740 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4744 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4749 for (i = 0; i < RANGE_NUM_MAX; i++) {
4750 start = range[i][0];
4753 if (c >= start && c <= end) {
4758 (*o_iso2022jp_check_conv)(c2,c1);
4762 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4764 const unsigned char *mime_pattern[] = {
4765 (const unsigned char *)"\075?EUC-JP?B?",
4766 (const unsigned char *)"\075?SHIFT_JIS?B?",
4767 (const unsigned char *)"\075?ISO-8859-1?Q?",
4768 (const unsigned char *)"\075?ISO-8859-1?B?",
4769 (const unsigned char *)"\075?ISO-2022-JP?B?",
4770 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4771 #if defined(UTF8_INPUT_ENABLE)
4772 (const unsigned char *)"\075?UTF-8?B?",
4773 (const unsigned char *)"\075?UTF-8?Q?",
4775 (const unsigned char *)"\075?US-ASCII?Q?",
4780 /*
\e$B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u
\e(B */
4781 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4782 e_iconv, s_iconv, 0, 0, 0, 0,
4783 #if defined(UTF8_INPUT_ENABLE)
4789 const nkf_char mime_encode[] = {
4790 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4791 #if defined(UTF8_INPUT_ENABLE)
4798 const nkf_char mime_encode_method[] = {
4799 'B', 'B','Q', 'B', 'B', 'Q',
4800 #if defined(UTF8_INPUT_ENABLE)
4808 #define MAXRECOVER 20
4810 void switch_mime_getc(void)
4812 if (i_getc!=mime_getc) {
4813 i_mgetc = i_getc; i_getc = mime_getc;
4814 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4815 if(mime_f==STRICT_MIME) {
4816 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4817 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4822 void unswitch_mime_getc(void)
4824 if(mime_f==STRICT_MIME) {
4825 i_mgetc = i_mgetc_buf;
4826 i_mungetc = i_mungetc_buf;
4829 i_ungetc = i_mungetc;
4830 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4831 mime_iconv_back = NULL;
4834 nkf_char mime_begin_strict(FILE *f)
4838 const unsigned char *p,*q;
4839 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4841 mime_decode_mode = FALSE;
4842 /* =? has been checked */
4844 p = mime_pattern[j];
4847 for(i=2;p[i]>' ';i++) { /* start at =? */
4848 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4849 /* pattern fails, try next one */
4851 while (mime_pattern[++j]) {
4852 p = mime_pattern[j];
4853 for(k=2;k<i;k++) /* assume length(p) > i */
4854 if (p[k]!=q[k]) break;
4855 if (k==i && nkf_toupper(c1)==p[k]) break;
4857 p = mime_pattern[j];
4858 if (p) continue; /* found next one, continue */
4859 /* all patterns failed; output from recovery buffer */
4867 mime_decode_mode = p[i-2];
4869 mime_iconv_back = iconv;
4870 set_iconv(FALSE, mime_priority_func[j]);
4871 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4873 if (mime_decode_mode=='B') {
4874 mimebuf_f = unbuf_f;
4876 /* do MIME integrity check */
4877 return mime_integrity(f,mime_pattern[j]);
4885 nkf_char mime_getc_buf(FILE *f)
4887 /* we don't keep eof of Fifo, because it contains ?= as
4888 a terminator. It was checked in mime_integrity. */
4889 return ((mimebuf_f)?
4890 (*i_mgetc_buf)(f):Fifo(mime_input++));
4893 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4896 (*i_mungetc_buf)(c,f);
4898 Fifo(--mime_input) = (unsigned char)c;
4902 nkf_char mime_begin(FILE *f)
4907 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4908 /* re-read and convert again from mime_buffer. */
4910 /* =? has been checked */
4912 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4913 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4914 /* We accept any character type even if it is broken by new lines */
4915 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4916 if (c1=='\n'||c1==' '||c1=='\r'||
4917 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4919 /* Failed. But this could be another MIME preamble */
4927 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4928 if (!(++i<MAXRECOVER) || c1==EOF) break;
4929 if (c1=='b'||c1=='B') {
4930 mime_decode_mode = 'B';
4931 } else if (c1=='q'||c1=='Q') {
4932 mime_decode_mode = 'Q';
4936 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4937 if (!(++i<MAXRECOVER) || c1==EOF) break;
4939 mime_decode_mode = FALSE;
4945 if (!mime_decode_mode) {
4946 /* false MIME preamble, restart from mime_buffer */
4947 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4948 /* Since we are in MIME mode until buffer becomes empty, */
4949 /* we never go into mime_begin again for a while. */
4952 /* discard mime preamble, and go to MIME mode */
4954 /* do no MIME integrity check */
4955 return c1; /* used only for checking EOF */
4959 void no_putc(nkf_char c)
4964 void debug(const char *str)
4967 fprintf(stderr, "%s\n", str);
4972 void set_input_codename(char *codename)
4976 strcmp(codename, "") != 0 &&
4977 strcmp(codename, input_codename) != 0)
4979 is_inputcode_mixed = TRUE;
4981 input_codename = codename;
4982 is_inputcode_set = TRUE;
4985 #if !defined(PERL_XS) && !defined(WIN32DLL)
4986 void print_guessed_code(char *filename)
4988 char *codename = "BINARY";
4989 char *str_crmode = NULL;
4990 if (!is_inputcode_mixed) {
4991 if (strcmp(input_codename, "") == 0) {
4994 codename = input_codename;
4996 if (crmode_f == CR) str_crmode = "CR";
4997 else if (crmode_f == NL) str_crmode = "LF";
4998 else if (crmode_f == CRLF) str_crmode = "CRLF";
5000 if (filename != NULL) printf("%s:", filename);
5001 if (str_crmode != NULL) printf("%s (%s)\n", codename, str_crmode);
5002 else printf("%s\n", codename);
5008 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5010 nkf_char c1, c2, c3;
5016 if (!nkf_isxdigit(c2)){
5021 if (!nkf_isxdigit(c3)){
5026 return (hex2bin(c2) << 4) | hex2bin(c3);
5029 nkf_char cap_getc(FILE *f)
5031 return hex_getc(':', f, i_cgetc, i_cungetc);
5034 nkf_char cap_ungetc(nkf_char c, FILE *f)
5036 return (*i_cungetc)(c, f);
5039 nkf_char url_getc(FILE *f)
5041 return hex_getc('%', f, i_ugetc, i_uungetc);
5044 nkf_char url_ungetc(nkf_char c, FILE *f)
5046 return (*i_uungetc)(c, f);
5050 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: input filter that turns a numeric reference into a single
 * code point (the usage text describes the option as "Convert Unicode
 * Character Reference").
 *
 * Characters are read through i_ngetc and pushed back through i_nungetc.
 * After an 'x' or 'X' marker at most 7 hexadecimal digits are accumulated;
 * otherwise at most 8 decimal digits (hex2bin is used for the digit value
 * in both cases).  The visible return tags the value as
 * CLASS_UNICODE | c.
 * The prefix matching and the failure path are in lines not visible in
 * this excerpt.
 */
5051 nkf_char numchar_getc(FILE *f)
5053 nkf_char (*g)(FILE *) = i_ngetc;
5054 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5065 if (buf[i] == 'x' || buf[i] == 'X'){
5066 for (j = 0; j < 7; j++){
5068 if (!nkf_isxdigit(buf[i])){
5075 c |= hex2bin(buf[i]);
5078 for (j = 0; j < 8; j++){
5082 if (!nkf_isdigit(buf[i])){
5089 c += hex2bin(buf[i]);
5095 return CLASS_UNICODE | c;
/*
 * numchar_ungetc: push `c` back by delegating to the routine stored in
 * i_nungetc; its result is returned unchanged.
 */
5104 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5106 return (*i_nungetc)(c, f);
5110 #ifdef UNICODE_NORMALIZATION
5112 /* Normalization Form C */
/*
 * nfc_getc: input filter that looks the bytes held in buf up in
 * normalization_table and replaces a decomposed (nfd) sequence by the
 * corresponding composed (nfc) sequence.
 *
 * The lookup is a binary search: j is the midpoint of [lower, upper], the
 * nfd entry is compared with buf element by element, and the first
 * mismatch moves lower or upper past j.  The nfc entry is then copied into
 * buf (presumably only after a full match -- the guarding lines are not
 * visible in this excerpt).
 * Filling of buf through i_nfc_getc and the way the result is handed back
 * are likewise in lines not shown here.
 */
5113 nkf_char nfc_getc(FILE *f)
5115 nkf_char (*g)(FILE *f) = i_nfc_getc;
5116 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5117 int i=0, j, k=1, lower, upper;
5119 const nkf_nfchar *array;
5122 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5123 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5124 while (upper >= lower) {
5125 j = (lower+upper) / 2;
5126 array = normalization_table[j].nfd;
5127 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5128 if (array[k] != buf[k]){
5129 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5136 array = normalization_table[j].nfc;
5137 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5138 buf[i] = (nkf_char)(array[i]);
/*
 * nfc_ungetc: push `c` back by delegating to the routine stored in
 * i_nfc_ungetc; its result is returned unchanged.
 */
5149 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5151 return (*i_nfc_ungetc)(c, f);
5153 #endif /* UNICODE_NORMALIZATION */
/*
 * Body of the MIME-decoding input routine.  Its signature line is not part
 * of this excerpt; presumably this is mime_getc -- TODO confirm.
 *
 * Behaviour that the visible lines establish:
 *  - While mime_top != mime_last, already decoded bytes are served from the
 *    FIFO.
 *  - When mime_decode_mode is 1 or FALSE the mode is cleared,
 *    unswitch_mime_getc() is called and the character comes from i_getc.
 *  - Mode 'Q': '_' yields a space unless mimebuf_f is FIXED_MIME; "?="
 *    ends the encoded word (input_mode = exit_mode); a "=" followed by two
 *    more characters yields (hex2bin(c2)<<4) + hex2bin(c3).
 *  - Mode 'B': c1..c4 are read, mapped through base64decode(), and the
 *    resulting bytes (up to three) are appended to the FIFO; the first
 *    queued byte is returned.
 *  - After a closing "?=", following input is collected in a malloc-ed
 *    buffer (lwsp_buf, grown with realloc) and pushed back with i_ungetc
 *    under the condition visible below.
 * Many intermediate lines (labels, braces, error exits) are not visible in
 * this excerpt.
 */
5159 nkf_char c1, c2, c3, c4, cc;
5160 nkf_char t1, t2, t3, t4, mode, exit_mode;
5161 nkf_char lwsp_count;
5164 nkf_char lwsp_size = 128;
5166 if (mime_top != mime_last) { /* Something is in FIFO */
5167 return Fifo(mime_top++);
5169 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5170 mime_decode_mode=FALSE;
5171 unswitch_mime_getc();
5172 return (*i_getc)(f);
5175 if (mimebuf_f == FIXED_MIME)
5176 exit_mode = mime_decode_mode;
5179 if (mime_decode_mode == 'Q') {
5180 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5182 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
5183 if (c1<=' ' || DEL<=c1) {
5184 mime_decode_mode = exit_mode; /* prepare for quit */
5187 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5191 mime_decode_mode = exit_mode; /* prepare for quit */
5192 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5193 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5194 /* end Q encoding */
5195 input_mode = exit_mode;
5197 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5198 if (lwsp_buf==NULL) {
5199 perror("can't malloc");
5202 while ((c1=(*i_getc)(f))!=EOF) {
5207 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5215 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5216 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5231 lwsp_buf[lwsp_count] = (unsigned char)c1;
5232 if (lwsp_count++>lwsp_size){
5234 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5235 if (lwsp_buf_new==NULL) {
5237 perror("can't realloc");
5240 lwsp_buf = lwsp_buf_new;
5246 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5248 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5249 i_ungetc(lwsp_buf[lwsp_count],f);
5255 if (c1=='='&&c2<' ') { /* this is soft wrap */
/* NOTE(review): both the loop condition and the loop body read a
 * character, so every second character is consumed without being tested
 * against the loop condition -- confirm this is intended. */
5256 while((c1 = (*i_mgetc)(f)) <=' ') {
5257 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5259 mime_decode_mode = 'Q'; /* still in MIME */
5260 goto restart_mime_q;
5263 mime_decode_mode = 'Q'; /* still in MIME */
5267 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5268 if (c2<=' ') return c2;
5269 mime_decode_mode = 'Q'; /* still in MIME */
5270 return ((hex2bin(c2)<<4) + hex2bin(c3));
5273 if (mime_decode_mode != 'B') {
5274 mime_decode_mode = FALSE;
5275 return (*i_mgetc)(f);
5279 /* Base64 encoding */
5281 MIME allows line break in the middle of
5282 Base64, but we are very pessimistic in decoding
5283 in unbuf mode because MIME encoded code may broken by
5284 less or editor's control sequence (such as ESC-[-K in unbuffered
5285 mode. ignore incomplete MIME.
5287 mode = mime_decode_mode;
5288 mime_decode_mode = exit_mode; /* prepare for quit */
5290 while ((c1 = (*i_mgetc)(f))<=' ') {
5295 if ((c2 = (*i_mgetc)(f))<=' ') {
5298 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5299 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5302 if ((c1 == '?') && (c2 == '=')) {
5305 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5306 if (lwsp_buf==NULL) {
5307 perror("can't malloc");
5310 while ((c1=(*i_getc)(f))!=EOF) {
5315 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5323 if ((c1=(*i_getc)(f))!=EOF) {
5327 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5342 lwsp_buf[lwsp_count] = (unsigned char)c1;
5343 if (lwsp_count++>lwsp_size){
5345 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5346 if (lwsp_buf_new==NULL) {
5348 perror("can't realloc");
5351 lwsp_buf = lwsp_buf_new;
5357 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5359 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5360 i_ungetc(lwsp_buf[lwsp_count],f);
5367 if ((c3 = (*i_mgetc)(f))<=' ') {
5370 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5371 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5375 if ((c4 = (*i_mgetc)(f))<=' ') {
5378 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5379 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5383 mime_decode_mode = mode; /* still in MIME sigh... */
5385 /* BASE 64 decoding */
5387 t1 = 0x3f & base64decode(c1);
5388 t2 = 0x3f & base64decode(c2);
5389 t3 = 0x3f & base64decode(c3);
5390 t4 = 0x3f & base64decode(c4);
5391 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5393 Fifo(mime_last++) = (unsigned char)cc;
5394 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5396 Fifo(mime_last++) = (unsigned char)cc;
5397 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5399 Fifo(mime_last++) = (unsigned char)cc;
5404 return Fifo(mime_top++);
/*
 * mime_ungetc: push `c` back onto the MIME FIFO by moving mime_top one
 * slot backwards and storing the character there as an unsigned char.
 * The return statement is not visible in this excerpt.
 */
5407 nkf_char mime_ungetc(nkf_char c, FILE *f)
5409 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: buffered-mode check of a MIME encoded word.
 *
 *   f - input stream, read through i_getc
 *   p - NUL-terminated byte pattern that is copied into the FIFO first
 *
 * mime_input and mime_last are reset to mime_top, the pattern is queued,
 * and input is then appended to the FIFO until the buffer is full, a "?="
 * pair is seen (the header is accepted and decoding starts), or a
 * character outside + / = ? and alphanumerics is met.
 * For an incomplete encoded word the data stays undecoded: mime_last is
 * set to mime_input, mime_decode_mode becomes 1 and switch_mime_getc() is
 * called so the buffered bytes are still delivered.
 * Return values are in lines not visible in this excerpt.
 */
5413 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5417 /* In buffered mode, read until =? or NL or buffer full
5419 mime_input = mime_top;
5420 mime_last = mime_top;
5422 while(*p) Fifo(mime_input++) = *p++;
5425 while((c=(*i_getc)(f))!=EOF) {
5426 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5427 break; /* buffer full */
5429 if (c=='=' && d=='?') {
5430 /* checked. skip header, start decode */
5431 Fifo(mime_input++) = (unsigned char)c;
5432 /* mime_last_input = mime_input; */
5437 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5439 /* Should we check length mod 4? */
5440 Fifo(mime_input++) = (unsigned char)c;
5443 /* In case of Incomplete MIME, no MIME decode */
5444 Fifo(mime_input++) = (unsigned char)c;
5445 mime_last = mime_input; /* point undecoded buffer */
5446 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5447 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one base64 alphabet character to its 6-bit value.
 *
 * The arithmetic uses character constants as plain numbers: 'G' equals
 * 'a' - 26, '4' equals 52, '>' equals 62 and '?' equals 63 in ASCII, as
 * the inline comments spell out.
 * The range tests selecting the first two branches, and the return
 * statement, are in lines not visible in this excerpt.
 */
5451 nkf_char base64decode(nkf_char c)
5456 i = c - 'A'; /* A..Z 0-25 */
5458 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5460 } else if (c > '/') {
5461 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5462 } else if (c == '+') {
5463 i = '>' /* 62 */ ; /* + 62 */
5465 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet; the index is the 6-bit value being encoded. */
5470 static const char basis_64[] =
5471 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Left-over bits of the previous input byte; mimeout_addchar() and
 * close_mime() combine or flush them when emitting base64 output. */
5473 static nkf_char b64c;
/* Size limit of the pending-output buffer below (one spare slot is
 * allocated on top of it). */
5474 #define MIMEOUT_BUF_LENGTH (60)
/* Characters held back by mime_putc() before they are emitted or encoded. */
5475 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5476 int mimeout_buf_count = 0;
/* Flag tested by open_mime() before its whitespace handling and reset to
 * FALSE there. */
5477 int mimeout_preserve_space = 0;
/* itoh4: upper-case hexadecimal digit character for a value 0..15.
 * NOTE(review): the argument is not parenthesized inside the expansion;
 * the callers visible in this file pass fully parenthesized expressions. */
5478 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/*
 * open_mime: start a MIME encoded word for output mode `mode`.
 *
 * The header pattern is mime_pattern[i] for the entry whose mime_encode[i]
 * equals `mode`, with mime_pattern[0] as the initial choice, and
 * mimeout_mode is taken from mime_encode_method[i].
 * Unless mimeout_preserve_space is set, whitespace (SPACE, TAB, CR, NL)
 * waiting in mimeout_buf is written through o_mputc; the flag is then
 * cleared.  Finally the buffered characters are re-submitted through
 * mime_putc() after mimeout_buf_count has been reset to 0.
 * Emission of the pattern itself and several branch bodies are in lines
 * not visible in this excerpt.
 */
5480 void open_mime(nkf_char mode)
5482 const unsigned char *p;
5485 p = mime_pattern[0];
5486 for(i=0;mime_pattern[i];i++) {
5487 if (mode == mime_encode[i]) {
5488 p = mime_pattern[i];
5492 mimeout_mode = mime_encode_method[i];
5495 if (base64_count>45) {
5496 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5497 (*o_mputc)(mimeout_buf[i]);
5503 if (!mimeout_preserve_space && mimeout_buf_count>0
5504 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5505 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5509 if (!mimeout_preserve_space) {
5510 for (;i<mimeout_buf_count;i++) {
5511 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5512 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5513 (*o_mputc)(mimeout_buf[i]);
5520 mimeout_preserve_space = FALSE;
5526 j = mimeout_buf_count;
5527 mimeout_buf_count = 0;
5529 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the encoded word currently being written.
 *
 * Depending on mimeout_mode, the bits still held in b64c are flushed as
 * one final base64 character (two low bits shifted left by 4, or four low
 * bits shifted left by 2).  The case labels, any padding output and the
 * trailer handling are in lines not visible in this excerpt.
 */
5533 void close_mime(void)
5543 switch(mimeout_mode) {
5548 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5554 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5560 if (mimeout_f!=FIXED_MIME) {
5562 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one character `c` according to mimeout_mode and
 * write the result through o_mputc.
 *
 * Visible behaviour: a non-alphanumeric character is written as two
 * upper-case hexadecimal digits (high nibble first) via itoh4; in the
 * base64 states the output characters are looked up in basis_64 from the
 * bits of `c` combined with the carry kept in b64c.
 * The case labels and the state updates are in lines not visible in this
 * excerpt.
 */
5567 void mimeout_addchar(nkf_char c)
5569 switch(mimeout_mode) {
5574 } else if(!nkf_isalnum(c)) {
5576 (*o_mputc)(itoh4(((c>>4)&0xf)));
5577 (*o_mputc)(itoh4((c&0xf)));
5586 (*o_mputc)(basis_64[c>>2]);
5591 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5597 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5598 (*o_mputc)(basis_64[c & 0x3F]);
/* The (c2, c1) pair most recently passed to mime_prechar(). */
5609 nkf_char mime_lastchar2, mime_lastchar1;
/*
 * mime_prechar: hook called with a character pair (c2, c1) before it is
 * MIME-encoded.
 *
 * When base64_count + mimeout_buf_count/3*4 exceeds 66, the pairs
 * (EOF,0), (0,NL) and (0,SPACE) are sent through o_base64conv, in that
 * order.  The pair is always remembered in mime_lastchar2 / mime_lastchar1.
 * The "else if" branch that follows opens a comment, so it is disabled
 * code; where that comment ends is not visible in this excerpt, so the
 * status of the later "if (c2 && mime_lastchar2 == 0 ..." test should be
 * confirmed against the full file.
 */
5611 void mime_prechar(nkf_char c2, nkf_char c1)
5615 if (base64_count + mimeout_buf_count/3*4> 66){
5616 (*o_base64conv)(EOF,0);
5617 (*o_base64conv)(0,NL);
5618 (*o_base64conv)(0,SPACE);
5620 }/*else if (mime_lastchar2){
5621 if (c1 <=DEL && !nkf_isspace(c1)){
5622 (*o_base64conv)(0,SPACE);
5626 if (c2 && mime_lastchar2 == 0
5627 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5628 (*o_base64conv)(0,SPACE);
5631 mime_lastchar2 = c2;
5632 mime_lastchar1 = c1;
/*
 * mime_putc: MIME-encoding output stage; receives one character `c`
 * (EOF marks the end of the data).
 *
 * Structure visible in this excerpt:
 *  - mimeout_f == FIXED_MIME: line length is checked against
 *    base64_count > 71, separately for 'Q' and for the other mode.
 *  - Otherwise, on EOF the characters pending in mimeout_buf are drained
 *    (the count is reset to 0 first) through mimeout_addchar().
 *  - mimeout_mode == 'Q': special handling of CR/NL, of characters up to
 *    SPACE, and of base64_count > 70.
 *  - !mimeout_mode (no encoded word open): characters up to DEL in ASCII
 *    or ISO8859_1 output mode are collected in mimeout_buf and written
 *    unencoded through o_mputc; open_mime(output_mode) is called once the
 *    buffer exceeds MIMEOUT_BUF_LENGTH or for the remaining characters.
 *  - Remaining modes ('B', 1, 2): buffered characters are either encoded
 *    with mimeout_addchar() or written raw with o_mputc, depending on the
 *    last buffered character and on `c`.
 * Many branch bodies and closing braces are in lines not visible here, so
 * the exact nesting should be confirmed against the full file.
 */
5635 void mime_putc(nkf_char c)
5640 if (mimeout_f == FIXED_MIME){
5641 if (mimeout_mode == 'Q'){
5642 if (base64_count > 71){
5643 if (c!=CR && c!=NL) {
5650 if (base64_count > 71){
5655 if (c == EOF) { /* c==EOF */
5659 if (c != EOF) { /* c!=EOF */
5665 /* mimeout_f != FIXED_MIME */
5667 if (c == EOF) { /* c==EOF */
5668 j = mimeout_buf_count;
5669 mimeout_buf_count = 0;
5673 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5676 mimeout_addchar(mimeout_buf[i]);
5680 mimeout_addchar(mimeout_buf[i]);
5684 mimeout_addchar(mimeout_buf[i]);
5690 if (mimeout_mode=='Q') {
5691 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5692 if (c == CR || c == NL) {
5697 } else if (c <= SPACE) {
5699 if (base64_count > 70) {
5703 if (!nkf_isblank(c)) {
5714 if (mimeout_buf_count > 0){
5715 lastchar = mimeout_buf[mimeout_buf_count - 1];
5720 if (!mimeout_mode) {
5721 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5722 if (nkf_isspace(c)) {
5723 if (c==CR || c==NL) {
5726 for (i=0;i<mimeout_buf_count;i++) {
5727 (*o_mputc)(mimeout_buf[i]);
5728 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5734 mimeout_buf[0] = (char)c;
5735 mimeout_buf_count = 1;
5737 if (base64_count > 1
5738 && base64_count + mimeout_buf_count > 76
5739 && mimeout_buf[0] != CR && mimeout_buf[0] != NL){
5742 if (!nkf_isspace(mimeout_buf[0])){
5747 mimeout_buf[mimeout_buf_count++] = (char)c;
5748 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5749 open_mime(output_mode);
5754 if (lastchar==CR || lastchar == NL){
5755 for (i=0;i<mimeout_buf_count;i++) {
5756 (*o_mputc)(mimeout_buf[i]);
5759 mimeout_buf_count = 0;
5761 if (lastchar==SPACE) {
5762 for (i=0;i<mimeout_buf_count-1;i++) {
5763 (*o_mputc)(mimeout_buf[i]);
5766 mimeout_buf[0] = SPACE;
5767 mimeout_buf_count = 1;
5769 open_mime(output_mode);
5772 /* mimeout_mode == 'B', 1, 2 */
5773 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5774 if (lastchar == CR || lastchar == NL){
5775 if (nkf_isblank(c)) {
5776 for (i=0;i<mimeout_buf_count;i++) {
5777 mimeout_addchar(mimeout_buf[i]);
5779 mimeout_buf_count = 0;
5780 } else if (SPACE<c && c<DEL) {
5782 for (i=0;i<mimeout_buf_count;i++) {
5783 (*o_mputc)(mimeout_buf[i]);
5786 mimeout_buf_count = 0;
5789 if (c==SPACE || c==TAB || c==CR || c==NL) {
5790 for (i=0;i<mimeout_buf_count;i++) {
5791 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5793 for (i=0;i<mimeout_buf_count;i++) {
5794 (*o_mputc)(mimeout_buf[i]);
5797 mimeout_buf_count = 0;
5800 mimeout_buf[mimeout_buf_count++] = (char)c;
5801 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5803 for (i=0;i<mimeout_buf_count;i++) {
5804 (*o_mputc)(mimeout_buf[i]);
5807 mimeout_buf_count = 0;
5811 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5812 mimeout_buf[mimeout_buf_count++] = (char)c;
5813 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5814 j = mimeout_buf_count;
5815 mimeout_buf_count = 0;
5817 mimeout_addchar(mimeout_buf[i]);
5824 if (mimeout_buf_count>0) {
5825 j = mimeout_buf_count;
5826 mimeout_buf_count = 0;
5828 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5830 mimeout_addchar(mimeout_buf[i]);
5836 (*o_mputc)(mimeout_buf[i]);
5838 open_mime(output_mode);
5845 #if defined(PERL_XS) || defined(WIN32DLL)
/*
 * Body of the re-initialisation routine used by the PERL_XS / WIN32DLL
 * builds.  Its signature line is not part of this excerpt; presumably
 * reinit -- TODO confirm.
 *
 * The visible assignments put option flags, encoding-detection state
 * (is_inputcode_mixed, is_inputcode_set, input_codename), the 256-entry
 * prefix_table, the MIME output buffer count, and the conversion / input
 * hook pointers back to fixed start values.  The output-side hooks are
 * pointed at no_connection and the input-side push-back hooks at
 * std_ungetc.
 * Further resets are in lines not visible here.
 */
5849 struct input_code *p = input_code_list;
5862 mime_f = STRICT_MIME;
5863 mime_decode_f = FALSE;
5868 #if defined(MSDOS) || defined(__OS2__)
5873 iso2022jp_f = FALSE;
5874 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5875 ms_ucs_map_f = UCS_MAP_ASCII;
5877 #ifdef UTF8_INPUT_ENABLE
5878 no_cp932ext_f = FALSE;
5879 no_best_fit_chars_f = FALSE;
5880 encode_fallback = NULL;
5881 unicode_subchar = '?';
5882 input_endian = ENDIAN_BIG;
5884 #ifdef UTF8_OUTPUT_ENABLE
5885 output_bom_f = FALSE;
5886 output_endian = ENDIAN_BIG;
5888 #ifdef UNICODE_NORMALIZATION
5901 is_inputcode_mixed = FALSE;
5902 is_inputcode_set = FALSE;
5906 #ifdef SHIFTJIS_CP932
5916 for (i = 0; i < 256; i++){
5917 prefix_table[i] = 0;
5921 mimeout_buf_count = 0;
5926 fold_preserve_f = FALSE;
5929 kanji_intro = DEFAULT_J;
5930 ascii_intro = DEFAULT_R;
5931 fold_margin = FOLD_MARGIN;
5932 output_conv = DEFAULT_CONV;
5933 oconv = DEFAULT_CONV;
5934 o_zconv = no_connection;
5935 o_fconv = no_connection;
5936 o_crconv = no_connection;
5937 o_rot_conv = no_connection;
5938 o_hira_conv = no_connection;
5939 o_base64conv = no_connection;
5940 o_iso2022jp_check_conv = no_connection;
5943 i_ungetc = std_ungetc;
5945 i_bungetc = std_ungetc;
5948 i_mungetc = std_ungetc;
5949 i_mgetc_buf = std_getc;
5950 i_mungetc_buf = std_ungetc;
5951 output_mode = ASCII;
5954 mime_decode_mode = FALSE;
5960 z_prev2=0,z_prev1=0;
5962 iconv_for_check = 0;
5964 input_codename = "";
/*
 * no_connection: two-argument form of the placeholder converter; forwards
 * (c2, c1) to no_connection2() with 0 as the third argument and ignores
 * its result.
 */
5971 void no_connection(nkf_char c2, nkf_char c1)
5973 no_connection2(c2,c1,0);
/*
 * no_connection2: placeholder converter for a hook that was never
 * connected; reports "nkf internal module connection failure." on stderr.
 * A line between the message and the return is not visible in this
 * excerpt (presumably the program exits there -- TODO confirm); the
 * final return 0 is only there to satisfy lint, as its comment says.
 */
5976 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5978 fprintf(stderr,"nkf internal module connection failure.\n");
5980 return 0; /* LINT */
/*
 * Body of the usage / help printer.  Its signature line is not part of
 * this excerpt; presumably usage -- TODO confirm.
 *
 * Every line of help text is written to stderr with fprintf; the #define
 * directly below redirects fprintf to dllprintf (any surrounding #ifdef is
 * not visible in this excerpt).  Which "DEFAULT" output-code line is
 * printed depends on the DEFAULT_CODE_* macro in effect, and the UTF-8,
 * NUMCHAR and related option lines are compiled in only when the
 * corresponding feature macros are defined.
 * NOTE(review): the help text for option I spells "charactor"; it is
 * user-visible output and is left untouched here.
 */
5985 #define fprintf dllprintf
5989 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5990 fprintf(stderr,"Flags:\n");
5991 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5992 #ifdef DEFAULT_CODE_SJIS
5993 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5995 #ifdef DEFAULT_CODE_JIS
5996 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5998 #ifdef DEFAULT_CODE_EUC
5999 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6001 #ifdef DEFAULT_CODE_UTF8
6002 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6004 #ifdef UTF8_OUTPUT_ENABLE
6005 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6007 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6008 #ifdef UTF8_INPUT_ENABLE
6009 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6011 fprintf(stderr,"t no conversion\n");
6012 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6013 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6014 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6015 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6016 fprintf(stderr,"v Show this usage. V: show version\n");
6017 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6018 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6019 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6020 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6021 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
6022 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
6023 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6024 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6026 fprintf(stderr,"T Text mode output\n");
6028 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6029 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6030 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6031 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6032 fprintf(stderr,"\n");
6033 fprintf(stderr,"Long name options\n");
6034 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6035 fprintf(stderr," Specify the input or output codeset\n");
6036 fprintf(stderr," --fj --unix --mac --windows\n");
6037 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6038 fprintf(stderr," Convert for the system or code\n");
6039 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6040 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6041 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6043 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6045 #ifdef NUMCHAR_OPTION
6046 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6048 #ifdef UTF8_INPUT_ENABLE
6049 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6050 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6053 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6054 fprintf(stderr," Overwrite original listed files by filtered result\n");
6055 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6057 fprintf(stderr," -g --guess Guess the input code\n");
6058 fprintf(stderr," --help --version Show this help/the version\n");
6059 fprintf(stderr," For more information, see also man nkf\n");
6060 fprintf(stderr,"\n");
/*
 * Body of the version printer.  Its signature line is not part of this
 * excerpt; presumably version -- TODO confirm.
 *
 * Writes "Network Kanji Filter Version <NKF_VERSION> (<NKF_RELEASE_DATE>) "
 * to stderr; the format string is continued by a platform-specific literal
 * chosen by the #if tests below (those literals are in lines not visible
 * here).  The text in CopyRight follows on a line of its own.
 */
6066 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6067 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6070 #if defined(MSDOS) && defined(__WIN16__)
6073 #if defined(MSDOS) && defined(__WIN32__)
6079 ,NKF_VERSION,NKF_RELEASE_DATE);
6080 fprintf(stderr,"\n%s\n",CopyRight);
6085 ** Patch authors:
6086 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
6087 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
6088 ** ohta@src.ricoh.co.jp (Junn Ohta)
6089 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
6090 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
6091 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
6092 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
6093 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
6094 ** GHG00637@nifty-serve.or.jp (COW)