1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8
\e$B%5%]!<%H$K$D$$$F
\e(B
31 **
\e$B=>Mh$N
\e(B nkf
\e$B$HF~$l$+$($F$=$N$^$^;H$($k$h$&$K$J$C$F$$$^$9
\e(B
32 ** nkf -e
\e$B$J$I$H$7$F5/F0$9$k$H!"<+F0H=JL$G
\e(B UTF-8
\e$B$HH=Dj$5$l$l$P!"
\e(B
33 **
\e$B$=$N$^$^
\e(B euc-jp
\e$B$KJQ49$5$l$^$9
\e(B
35 **
\e$B$^$@%P%0$,$"$k2DG=@-$,9b$$$G$9!#
\e(B
36 ** (
\e$BFC$K<+F0H=JL!"%3!<%I:.:_!"%(%i!<=hM}7O
\e(B)
38 **
\e$B2?$+LdBj$r8+$D$1$?$i!"
\e(B
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
\e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#
\e(B
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.126 2007/07/09 09:11:57 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2007-07-09"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * ASCII-only character classification and conversion macros.
 *
 * They compare against character literals directly instead of calling the
 * <ctype.h> routines, so the result never depends on the current locale.
 *
 * Every use of a macro parameter is parenthesized so that an expression
 * argument (for example a conditional expression) is grouped as the caller
 * intended.  The arguments are still evaluated more than once, so do not
 * pass expressions with side effects such as `*p++`.
 *
 * nkf_isblank()/nkf_isspace() rely on SPACE, TAB, CR and NL, and is_eucg3()
 * relies on SS3; those constants are not defined in this group and must be
 * visible wherever the macros are expanded.
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (' ' <= (c) && (c) <= '~')
#define nkf_isgraph(c)  ('!' <= (c) && (c) <= '~')
/* Value of one hexadecimal digit; any non-hex character yields 0. */
#define hex2bin(c) (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
                    ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
                    ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/* True when the second-lowest byte of c2 (after truncation to 16 bits) is SS3. */
#define is_eucg3(c2)    (((unsigned short)(c2) >> 8) == SS3)
/*
 * Inclusive byte ranges that bound the CP932 conversion tables.
 * NOTE(review): presumably these are Shift_JIS lead-byte values for the
 * IBM-extended (0xFA-0xFC) and NEC-selected (0xED-0xEE) rows; confirm
 * against the table definitions.
 */
#define CP932_TABLE_BEGIN    0xFA
#define CP932_TABLE_END      0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/*
 * True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END].
 * The parameter is parenthesized so expression arguments group correctly;
 * it is evaluated twice, so avoid arguments with side effects.
 */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
356 #define UCS_MAP_ASCII 0
358 #define UCS_MAP_CP932 2
359 #define UCS_MAP_CP10001 3
360 static int ms_ucs_map_f = UCS_MAP_ASCII;
362 #ifdef UTF8_INPUT_ENABLE
363 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
364 static int no_cp932ext_f = FALSE;
365 /* ignore ZERO WIDTH NO-BREAK SPACE */
366 static int no_best_fit_chars_f = FALSE;
367 static int input_endian = ENDIAN_BIG;
368 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
369 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
370 static void encode_fallback_html(nkf_char c);
371 static void encode_fallback_xml(nkf_char c);
372 static void encode_fallback_java(nkf_char c);
373 static void encode_fallback_perl(nkf_char c);
374 static void encode_fallback_subchar(nkf_char c);
375 static void (*encode_fallback)(nkf_char c) = NULL;
376 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
377 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
379 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
380 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
381 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
382 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
383 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
384 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
385 static void w_status(struct input_code *, nkf_char);
387 #ifdef UTF8_OUTPUT_ENABLE
388 static int output_bom_f = FALSE;
389 static int output_endian = ENDIAN_BIG;
390 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
391 static void w_oconv(nkf_char c2,nkf_char c1);
392 static void w_oconv16(nkf_char c2,nkf_char c1);
393 static void w_oconv32(nkf_char c2,nkf_char c1);
395 static void e_oconv(nkf_char c2,nkf_char c1);
396 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
397 static void s_oconv(nkf_char c2,nkf_char c1);
398 static void j_oconv(nkf_char c2,nkf_char c1);
399 static void fold_conv(nkf_char c2,nkf_char c1);
400 static void cr_conv(nkf_char c2,nkf_char c1);
401 static void z_conv(nkf_char c2,nkf_char c1);
402 static void rot_conv(nkf_char c2,nkf_char c1);
403 static void hira_conv(nkf_char c2,nkf_char c1);
404 static void base64_conv(nkf_char c2,nkf_char c1);
405 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
406 static void no_connection(nkf_char c2,nkf_char c1);
407 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
409 static void code_score(struct input_code *ptr);
410 static void code_status(nkf_char c);
412 static void std_putc(nkf_char c);
413 static nkf_char std_getc(FILE *f);
414 static nkf_char std_ungetc(nkf_char c,FILE *f);
416 static nkf_char broken_getc(FILE *f);
417 static nkf_char broken_ungetc(nkf_char c,FILE *f);
419 static nkf_char mime_begin(FILE *f);
420 static nkf_char mime_getc(FILE *f);
421 static nkf_char mime_ungetc(nkf_char c,FILE *f);
423 static void switch_mime_getc(void);
424 static void unswitch_mime_getc(void);
425 static nkf_char mime_begin_strict(FILE *f);
426 static nkf_char mime_getc_buf(FILE *f);
427 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
428 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
430 static nkf_char base64decode(nkf_char c);
431 static void mime_prechar(nkf_char c2, nkf_char c1);
432 static void mime_putc(nkf_char c);
433 static void open_mime(nkf_char c);
434 static void close_mime(void);
435 static void eof_mime(void);
436 static void mimeout_addchar(nkf_char c);
438 static void usage(void);
439 static void version(void);
441 static void options(unsigned char *c);
442 #if defined(PERL_XS) || defined(WIN32DLL)
443 static void reinit(void);
448 #if !defined(PERL_XS) && !defined(WIN32DLL)
449 static unsigned char stdibuf[IOBUF_SIZE];
450 static unsigned char stdobuf[IOBUF_SIZE];
452 static unsigned char hold_buf[HOLD_SIZE*2];
453 static int hold_count = 0;
455 /* MIME preprocessor fifo */
457 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
458 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
459 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
460 static unsigned char mime_buf[MIME_BUF_SIZE];
461 static unsigned int mime_top = 0;
462 static unsigned int mime_last = 0; /* decoded */
463 static unsigned int mime_input = 0; /* undecoded */
464 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
467 static int unbuf_f = FALSE;
468 static int estab_f = FALSE;
469 static int nop_f = FALSE;
470 static int binmode_f = TRUE; /* binary mode */
471 static int rot_f = FALSE; /* rot13/47 mode */
472 static int hira_f = FALSE; /* hira/kata henkan */
473 static int input_f = FALSE; /* non fixed input code */
474 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
475 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
476 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
477 static int mimebuf_f = FALSE; /* MIME buffered input */
478 static int broken_f = FALSE; /* convert ESC-less broken JIS */
479 static int iso8859_f = FALSE; /* ISO8859 through */
480 static int mimeout_f = FALSE; /* base64 mode */
481 #if defined(MSDOS) || defined(__OS2__)
482 static int x0201_f = TRUE; /* Assume JISX0201 kana */
484 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
486 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
488 #ifdef UNICODE_NORMALIZATION
489 static int nfc_f = FALSE;
490 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
491 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
492 static nkf_char nfc_getc(FILE *f);
493 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
497 static int cap_f = FALSE;
498 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
499 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
500 static nkf_char cap_getc(FILE *f);
501 static nkf_char cap_ungetc(nkf_char c,FILE *f);
503 static int url_f = FALSE;
504 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
505 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
506 static nkf_char url_getc(FILE *f);
507 static nkf_char url_ungetc(nkf_char c,FILE *f);
510 #if defined(INT_IS_SHORT)
511 #define NKF_INT32_C(n) (n##L)
513 #define NKF_INT32_C(n) (n)
/*
 * Bit layout helpers for wide character values.
 *
 * CLASS_MASK selects the top byte of a value and VALUE_MASK the low 24 bits.
 * A value whose top byte equals CLASS_UNICODE is treated as a "Unicode
 * capsule"; its low 24 bits are then compared against 0xFFFF by
 * is_unicode_bmp().  NKF_INT32_C() (defined just before this group) adds
 * the suffix needed for 32-bit constants when int is 16 bits wide.
 * NOTE(review): the low 24 bits presumably hold a Unicode code point
 * (UNICODE_MAX is 0x10FFFF); confirm against the converters that build
 * these values.
 *
 * Macro parameters are parenthesized so that expression arguments are
 * masked as a whole rather than only their last operand.
 */
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF)
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define is_unicode_bmp(c)     (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
523 #ifdef NUMCHAR_OPTION
524 static int numchar_f = FALSE;
525 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
526 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
527 static nkf_char numchar_getc(FILE *f);
528 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
532 static int noout_f = FALSE;
533 static void no_putc(nkf_char c);
534 static nkf_char debug_f = FALSE;
535 static void debug(const char *str);
536 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
539 static int guess_f = FALSE;
541 static void print_guessed_code(char *filename);
543 static void set_input_codename(char *codename);
544 static int is_inputcode_mixed = FALSE;
545 static int is_inputcode_set = FALSE;
548 static int exec_f = 0;
551 #ifdef SHIFTJIS_CP932
552 /* invert IBM extended characters to others */
553 static int cp51932_f = FALSE;
555 /* invert NEC-selected IBM extended characters to IBM extended characters */
556 static int cp932inv_f = TRUE;
558 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
559 #endif /* SHIFTJIS_CP932 */
562 static int x0212_f = FALSE;
563 static nkf_char x0212_shift(nkf_char c);
564 static nkf_char x0212_unshift(nkf_char c);
566 static int x0213_f = FALSE;
568 static unsigned char prefix_table[256];
570 static void set_code_score(struct input_code *ptr, nkf_char score);
571 static void clr_code_score(struct input_code *ptr, nkf_char score);
572 static void status_disable(struct input_code *ptr);
573 static void status_push_ch(struct input_code *ptr, nkf_char c);
574 static void status_clear(struct input_code *ptr);
575 static void status_reset(struct input_code *ptr);
576 static void status_reinit(struct input_code *ptr);
577 static void status_check(struct input_code *ptr, nkf_char c);
578 static void e_status(struct input_code *, nkf_char);
579 static void s_status(struct input_code *, nkf_char);
581 struct input_code input_code_list[] = {
582 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
583 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
584 #ifdef UTF8_INPUT_ENABLE
585 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
586 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
587 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
592 static int mimeout_mode = 0;
593 static int base64_count = 0;
595 /* X0208 -> ASCII converter */
598 static int f_line = 0; /* chars in line */
599 static int f_prev = 0;
600 static int fold_preserve_f = FALSE; /* preserve new lines */
601 static int fold_f = FALSE;
602 static int fold_len = 0;
605 static unsigned char kanji_intro = DEFAULT_J;
606 static unsigned char ascii_intro = DEFAULT_R;
610 #define FOLD_MARGIN 10
611 #define DEFAULT_FOLD 60
613 static int fold_margin = FOLD_MARGIN;
617 #ifdef DEFAULT_CODE_JIS
618 # define DEFAULT_CONV j_oconv
620 #ifdef DEFAULT_CODE_SJIS
621 # define DEFAULT_CONV s_oconv
623 #ifdef DEFAULT_CODE_EUC
624 # define DEFAULT_CONV e_oconv
626 #ifdef DEFAULT_CODE_UTF8
627 # define DEFAULT_CONV w_oconv
630 /* process default */
631 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
633 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
634 /* s_iconv or oconv */
635 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
637 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
642 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
643 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
645 /* static redirections */
647 static void (*o_putc)(nkf_char c) = std_putc;
649 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
650 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
652 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
653 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
655 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
657 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
658 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
660 /* for strict mime */
661 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
662 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
665 static int output_mode = ASCII, /* output kanji mode */
666 input_mode = ASCII, /* input kanji mode */
667 shift_mode = FALSE; /* TRUE shift out, or X0201 */
668 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
670 /* X0201 / X0208 conversion tables */
672 /* X0201 kana conversion table */
675 unsigned char cv[]= {
676 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
677 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
678 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
679 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
680 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
681 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
682 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
683 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
684 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
685 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
686 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
687 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
688 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
689 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
690 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
691 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
695 /* X0201 kana conversion table for dakuten */
698 unsigned char dv[]= {
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
702 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
703 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
704 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
705 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
706 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
707 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
708 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
710 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
713 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
714 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
717 /* X0201 kana conversion table for han-dakuten */
720 unsigned char ev[]= {
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
730 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
732 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
735 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
736 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
740 /* X0208 kigou conversion table */
741 /* 0x8140 - 0x819e */
743 unsigned char fv[] = {
745 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
746 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
747 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
748 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
749 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
750 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
751 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
752 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
753 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
754 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
755 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
756 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
762 static int file_out_f = FALSE;
764 static int overwrite_f = FALSE;
765 static int preserve_time_f = FALSE;
766 static int backup_f = FALSE;
767 static char *backup_suffix = "";
768 static char *get_backup_filename(const char *suffix, const char *filename);
771 static int crmode_f = 0; /* CR, NL, CRLF */
772 #ifdef EASYWIN /*Easy Win */
773 static int end_check;
776 #define STD_GC_BUFSIZE (256)
777 nkf_char std_gc_buf[STD_GC_BUFSIZE];
781 #include "nkf32dll.c"
782 #elif defined(PERL_XS)
784 int main(int argc, char **argv)
789 char *outfname = NULL;
792 #ifdef EASYWIN /*Easy Win */
793 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
796 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
797 cp = (unsigned char *)*argv;
802 if (pipe(fds) < 0 || (pid = fork()) < 0){
813 execvp(argv[1], &argv[1]);
827 if(x0201_f == WISH_TRUE)
828 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
830 if (binmode_f == TRUE)
831 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
832 if (freopen("","wb",stdout) == NULL)
839 setbuf(stdout, (char *) NULL);
841 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
844 if (binmode_f == TRUE)
845 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
846 if (freopen("","rb",stdin) == NULL) return (-1);
850 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
854 kanji_convert(stdin);
855 if (guess_f) print_guessed_code(NULL);
859 int is_argument_error = FALSE;
861 is_inputcode_mixed = FALSE;
862 is_inputcode_set = FALSE;
867 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
870 is_argument_error = TRUE;
878 /* reopen file for stdout */
879 if (file_out_f == TRUE) {
882 outfname = malloc(strlen(origfname)
883 + strlen(".nkftmpXXXXXX")
889 strcpy(outfname, origfname);
893 for (i = strlen(outfname); i; --i){
894 if (outfname[i - 1] == '/'
895 || outfname[i - 1] == '\\'){
901 strcat(outfname, "ntXXXXXX");
903 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
906 strcat(outfname, ".nkftmpXXXXXX");
907 fd = mkstemp(outfname);
910 || (fd_backup = dup(fileno(stdout))) < 0
911 || dup2(fd, fileno(stdout)) < 0
922 outfname = "nkf.out";
925 if(freopen(outfname, "w", stdout) == NULL) {
929 if (binmode_f == TRUE) {
930 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
931 if (freopen("","wb",stdout) == NULL)
938 if (binmode_f == TRUE)
939 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
940 if (freopen("","rb",fin) == NULL)
945 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
949 char *filename = NULL;
951 if (nfiles > 1) filename = origfname;
952 if (guess_f) print_guessed_code(filename);
958 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
966 if (dup2(fd_backup, fileno(stdout)) < 0){
969 if (stat(origfname, &sb)) {
970 fprintf(stderr, "Can't stat %s\n", origfname);
972 /* restore the original permissions */
973 if (chmod(outfname, sb.st_mode)) {
974 fprintf(stderr, "Can't set permission %s\n", outfname);
977 /* restore the original timestamps */
979 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
980 tb[0] = tb[1] = sb.st_mtime;
981 if (utime(outfname, tb)) {
982 fprintf(stderr, "Can't set timestamp %s\n", outfname);
985 tb.actime = sb.st_atime;
986 tb.modtime = sb.st_mtime;
987 if (utime(outfname, &tb)) {
988 fprintf(stderr, "Can't set timestamp %s\n", outfname);
993 char *backup_filename = get_backup_filename(backup_suffix, origfname);
995 unlink(backup_filename);
997 if (rename(origfname, backup_filename)) {
998 perror(backup_filename);
999 fprintf(stderr, "Can't rename %s to %s\n",
1000 origfname, backup_filename);
1004 if (unlink(origfname)){
1009 if (rename(outfname, origfname)) {
1011 fprintf(stderr, "Can't rename %s to %s\n",
1012 outfname, origfname);
1019 if (is_argument_error)
1022 #ifdef EASYWIN /*Easy Win */
1023 if (file_out_f == FALSE)
1024 scanf("%d",&end_check);
1027 #else /* for Other OS */
1028 if (file_out_f == TRUE)
1030 #endif /*Easy Win */
1033 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename` from the pattern `suffix`.
 *
 * - If `suffix` contains one or more '*' characters, every '*' is replaced
 *   by `filename` and all other characters are copied literally
 *   (suffix "old/*~" with filename "a.txt" gives "old/a.txt~").
 * - Otherwise `suffix` is simply appended to `filename`
 *   (suffix ".bak" with filename "a.txt" gives "a.txt.bak").
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller, or NULL
 * after reporting through perror() when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t total_length;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each one-byte '*' is replaced by filename_length bytes */
        total_length = suffix_length - asterisk_count
                     + asterisk_count * filename_length;
    } else {
        total_length = filename_length + suffix_length;
    }

    /* + 1 for the terminating NUL; both branches share this checked allocation */
    backup_filename = malloc(total_length + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
        backup_filename[j] = '\0';
    } else {
        memcpy(backup_filename, filename, filename_length);
        /* copies the suffix together with its terminating NUL */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
    }
    return backup_filename;
}
1101 {"katakana-hiragana","h3"},
1108 #ifdef UTF8_OUTPUT_ENABLE
1118 {"fb-subchar=", ""},
1120 #ifdef UTF8_INPUT_ENABLE
1121 {"utf8-input", "W"},
1122 {"utf16-input", "W16"},
1123 {"no-cp932ext", ""},
1124 {"no-best-fit-chars",""},
1126 #ifdef UNICODE_NORMALIZATION
1127 {"utf8mac-input", ""},
1139 #ifdef NUMCHAR_OPTION
1140 {"numchar-input", ""},
1146 #ifdef SHIFTJIS_CP932
1156 static int option_mode = 0;
1158 void options(unsigned char *cp)
1162 unsigned char *cp_back = NULL;
1167 while(*cp && *cp++!='-');
1168 while (*cp || cp_back) {
1176 case '-': /* literal options */
1177 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1181 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1182 p = (unsigned char *)long_option[i].name;
1183 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1184 if (*p == cp[j] || cp[j] == ' '){
1191 while(*cp && *cp != SPACE && cp++);
1192 if (long_option[i].alias[0]){
1194 cp = (unsigned char *)long_option[i].alias;
1196 if (strcmp(long_option[i].name, "ic=") == 0){
1197 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1198 codeset[i] = nkf_toupper(p[i]);
1201 if(strcmp(codeset, "ISO-2022-JP") == 0){
1202 input_f = JIS_INPUT;
1203 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1204 strcmp(codeset, "CP50220") == 0 ||
1205 strcmp(codeset, "CP50221") == 0 ||
1206 strcmp(codeset, "CP50222") == 0){
1207 input_f = JIS_INPUT;
1208 #ifdef SHIFTJIS_CP932
1211 #ifdef UTF8_OUTPUT_ENABLE
1212 ms_ucs_map_f = UCS_MAP_CP932;
1214 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1215 input_f = JIS_INPUT;
1219 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1220 input_f = JIS_INPUT;
1225 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1226 input_f = SJIS_INPUT;
1227 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1228 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1229 strcmp(codeset, "CP932") == 0 ||
1230 strcmp(codeset, "MS932") == 0){
1231 input_f = SJIS_INPUT;
1232 #ifdef SHIFTJIS_CP932
1235 #ifdef UTF8_OUTPUT_ENABLE
1236 ms_ucs_map_f = UCS_MAP_CP932;
1238 }else if(strcmp(codeset, "CP10001") == 0){
1239 input_f = SJIS_INPUT;
1240 #ifdef SHIFTJIS_CP932
1243 #ifdef UTF8_OUTPUT_ENABLE
1244 ms_ucs_map_f = UCS_MAP_CP10001;
1246 }else if(strcmp(codeset, "EUCJP") == 0 ||
1247 strcmp(codeset, "EUC-JP") == 0){
1248 input_f = EUC_INPUT;
1249 }else if(strcmp(codeset, "CP51932") == 0){
1250 input_f = EUC_INPUT;
1251 #ifdef SHIFTJIS_CP932
1254 #ifdef UTF8_OUTPUT_ENABLE
1255 ms_ucs_map_f = UCS_MAP_CP932;
1257 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1258 strcmp(codeset, "EUCJP-MS") == 0 ||
1259 strcmp(codeset, "EUCJPMS") == 0){
1260 input_f = EUC_INPUT;
1261 #ifdef SHIFTJIS_CP932
1264 #ifdef UTF8_OUTPUT_ENABLE
1265 ms_ucs_map_f = UCS_MAP_MS;
1267 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1268 strcmp(codeset, "EUCJP-ASCII") == 0){
1269 input_f = EUC_INPUT;
1270 #ifdef SHIFTJIS_CP932
1273 #ifdef UTF8_OUTPUT_ENABLE
1274 ms_ucs_map_f = UCS_MAP_ASCII;
1276 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1277 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1278 input_f = SJIS_INPUT;
1280 #ifdef SHIFTJIS_CP932
1283 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1284 strcmp(codeset, "EUC-JIS-2004") == 0){
1285 input_f = EUC_INPUT;
1287 #ifdef SHIFTJIS_CP932
1290 #ifdef UTF8_INPUT_ENABLE
1291 }else if(strcmp(codeset, "UTF-8") == 0 ||
1292 strcmp(codeset, "UTF-8N") == 0 ||
1293 strcmp(codeset, "UTF-8-BOM") == 0){
1294 input_f = UTF8_INPUT;
1295 #ifdef UNICODE_NORMALIZATION
1296 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1297 strcmp(codeset, "UTF-8-MAC") == 0){
1298 input_f = UTF8_INPUT;
1301 }else if(strcmp(codeset, "UTF-16") == 0 ||
1302 strcmp(codeset, "UTF-16BE") == 0 ||
1303 strcmp(codeset, "UTF-16BE-BOM") == 0){
1304 input_f = UTF16_INPUT;
1305 input_endian = ENDIAN_BIG;
1306 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1307 strcmp(codeset, "UTF-16LE-BOM") == 0){
1308 input_f = UTF16_INPUT;
1309 input_endian = ENDIAN_LITTLE;
1310 }else if(strcmp(codeset, "UTF-32") == 0 ||
1311 strcmp(codeset, "UTF-32BE") == 0 ||
1312 strcmp(codeset, "UTF-32BE-BOM") == 0){
1313 input_f = UTF32_INPUT;
1314 input_endian = ENDIAN_BIG;
1315 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1316 strcmp(codeset, "UTF-32LE-BOM") == 0){
1317 input_f = UTF32_INPUT;
1318 input_endian = ENDIAN_LITTLE;
1323 if (strcmp(long_option[i].name, "oc=") == 0){
1325 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1326 codeset[i] = nkf_toupper(p[i]);
1329 if(strcmp(codeset, "ISO-2022-JP") == 0){
1330 output_conv = j_oconv;
1331 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1332 output_conv = j_oconv;
1333 no_cp932ext_f = TRUE;
1334 #ifdef SHIFTJIS_CP932
1337 #ifdef UTF8_OUTPUT_ENABLE
1338 ms_ucs_map_f = UCS_MAP_CP932;
1340 }else if(strcmp(codeset, "CP50220") == 0){
1341 output_conv = j_oconv;
1343 #ifdef SHIFTJIS_CP932
1346 #ifdef UTF8_OUTPUT_ENABLE
1347 ms_ucs_map_f = UCS_MAP_CP932;
1349 }else if(strcmp(codeset, "CP50221") == 0){
1350 output_conv = j_oconv;
1351 #ifdef SHIFTJIS_CP932
1354 #ifdef UTF8_OUTPUT_ENABLE
1355 ms_ucs_map_f = UCS_MAP_CP932;
1357 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1358 output_conv = j_oconv;
1362 #ifdef SHIFTJIS_CP932
1365 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1366 output_conv = j_oconv;
1371 #ifdef SHIFTJIS_CP932
1374 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1375 output_conv = s_oconv;
1376 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1377 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1378 strcmp(codeset, "CP932") == 0 ||
1379 strcmp(codeset, "MS932") == 0){
1380 output_conv = s_oconv;
1381 #ifdef UTF8_OUTPUT_ENABLE
1382 ms_ucs_map_f = UCS_MAP_CP932;
1384 }else if(strcmp(codeset, "CP10001") == 0){
1385 output_conv = s_oconv;
1386 #ifdef UTF8_OUTPUT_ENABLE
1387 ms_ucs_map_f = UCS_MAP_CP10001;
1389 }else if(strcmp(codeset, "EUCJP") == 0 ||
1390 strcmp(codeset, "EUC-JP") == 0){
1391 output_conv = e_oconv;
1392 }else if(strcmp(codeset, "CP51932") == 0){
1393 output_conv = e_oconv;
1394 #ifdef SHIFTJIS_CP932
1397 #ifdef UTF8_OUTPUT_ENABLE
1398 ms_ucs_map_f = UCS_MAP_CP932;
1400 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1401 strcmp(codeset, "EUCJP-MS") == 0 ||
1402 strcmp(codeset, "EUCJPMS") == 0){
1403 output_conv = e_oconv;
1407 #ifdef UTF8_OUTPUT_ENABLE
1408 ms_ucs_map_f = UCS_MAP_MS;
1410 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1411 strcmp(codeset, "EUCJP-ASCII") == 0){
1412 output_conv = e_oconv;
1416 #ifdef UTF8_OUTPUT_ENABLE
1417 ms_ucs_map_f = UCS_MAP_ASCII;
1419 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1420 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1421 output_conv = s_oconv;
1423 #ifdef SHIFTJIS_CP932
1426 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1427 strcmp(codeset, "EUC-JIS-2004") == 0){
1428 output_conv = e_oconv;
1433 #ifdef SHIFTJIS_CP932
1436 #ifdef UTF8_OUTPUT_ENABLE
1437 }else if(strcmp(codeset, "UTF-8") == 0){
1438 output_conv = w_oconv;
1439 }else if(strcmp(codeset, "UTF-8N") == 0){
1440 output_conv = w_oconv;
1441 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1442 output_conv = w_oconv;
1443 output_bom_f = TRUE;
1444 }else if(strcmp(codeset, "UTF-16BE") == 0){
1445 output_conv = w_oconv16;
1446 }else if(strcmp(codeset, "UTF-16") == 0 ||
1447 strcmp(codeset, "UTF-16BE-BOM") == 0){
1448 output_conv = w_oconv16;
1449 output_bom_f = TRUE;
1450 }else if(strcmp(codeset, "UTF-16LE") == 0){
1451 output_conv = w_oconv16;
1452 output_endian = ENDIAN_LITTLE;
1453 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1454 output_conv = w_oconv16;
1455 output_endian = ENDIAN_LITTLE;
1456 output_bom_f = TRUE;
1457 }else if(strcmp(codeset, "UTF-32") == 0 ||
1458 strcmp(codeset, "UTF-32BE") == 0){
1459 output_conv = w_oconv32;
1460 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1461 output_conv = w_oconv32;
1462 output_bom_f = TRUE;
1463 }else if(strcmp(codeset, "UTF-32LE") == 0){
1464 output_conv = w_oconv32;
1465 output_endian = ENDIAN_LITTLE;
1466 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1467 output_conv = w_oconv32;
1468 output_endian = ENDIAN_LITTLE;
1469 output_bom_f = TRUE;
1475 if (strcmp(long_option[i].name, "overwrite") == 0){
1478 preserve_time_f = TRUE;
1481 if (strcmp(long_option[i].name, "overwrite=") == 0){
1484 preserve_time_f = TRUE;
1486 backup_suffix = malloc(strlen((char *) p) + 1);
1487 strcpy(backup_suffix, (char *) p);
1490 if (strcmp(long_option[i].name, "in-place") == 0){
1493 preserve_time_f = FALSE;
1496 if (strcmp(long_option[i].name, "in-place=") == 0){
1499 preserve_time_f = FALSE;
1501 backup_suffix = malloc(strlen((char *) p) + 1);
1502 strcpy(backup_suffix, (char *) p);
1507 if (strcmp(long_option[i].name, "cap-input") == 0){
1511 if (strcmp(long_option[i].name, "url-input") == 0){
1516 #ifdef NUMCHAR_OPTION
1517 if (strcmp(long_option[i].name, "numchar-input") == 0){
1523 if (strcmp(long_option[i].name, "no-output") == 0){
1527 if (strcmp(long_option[i].name, "debug") == 0){
1532 if (strcmp(long_option[i].name, "cp932") == 0){
1533 #ifdef SHIFTJIS_CP932
1537 #ifdef UTF8_OUTPUT_ENABLE
1538 ms_ucs_map_f = UCS_MAP_CP932;
1542 if (strcmp(long_option[i].name, "no-cp932") == 0){
1543 #ifdef SHIFTJIS_CP932
1547 #ifdef UTF8_OUTPUT_ENABLE
1548 ms_ucs_map_f = UCS_MAP_ASCII;
1552 #ifdef SHIFTJIS_CP932
1553 if (strcmp(long_option[i].name, "cp932inv") == 0){
1560 if (strcmp(long_option[i].name, "x0212") == 0){
1567 if (strcmp(long_option[i].name, "exec-in") == 0){
1571 if (strcmp(long_option[i].name, "exec-out") == 0){
1576 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1577 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1578 no_cp932ext_f = TRUE;
1581 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1582 no_best_fit_chars_f = TRUE;
1585 if (strcmp(long_option[i].name, "fb-skip") == 0){
1586 encode_fallback = NULL;
1589 if (strcmp(long_option[i].name, "fb-html") == 0){
1590 encode_fallback = encode_fallback_html;
1593 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1594 encode_fallback = encode_fallback_xml;
1597 if (strcmp(long_option[i].name, "fb-java") == 0){
1598 encode_fallback = encode_fallback_java;
1601 if (strcmp(long_option[i].name, "fb-perl") == 0){
1602 encode_fallback = encode_fallback_perl;
1605 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1606 encode_fallback = encode_fallback_subchar;
1609 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1610 encode_fallback = encode_fallback_subchar;
1611 unicode_subchar = 0;
1613 /* decimal number */
1614 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1615 unicode_subchar *= 10;
1616 unicode_subchar += hex2bin(p[i]);
1618 }else if(p[1] == 'x' || p[1] == 'X'){
1619 /* hexadecimal number */
1620 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1621 unicode_subchar <<= 4;
1622 unicode_subchar |= hex2bin(p[i]);
1626 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1627 unicode_subchar *= 8;
1628 unicode_subchar += hex2bin(p[i]);
1631 w16e_conv(unicode_subchar, &i, &j);
1632 unicode_subchar = i<<8 | j;
1636 #ifdef UTF8_OUTPUT_ENABLE
1637 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1638 ms_ucs_map_f = UCS_MAP_MS;
1642 #ifdef UNICODE_NORMALIZATION
1643 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1644 input_f = UTF8_INPUT;
1649 if (strcmp(long_option[i].name, "prefix=") == 0){
1650 if (nkf_isgraph(p[0])){
1651 for (i = 1; nkf_isgraph(p[i]); i++){
1652 prefix_table[p[i]] = p[0];
1659 case 'b': /* buffered mode */
1662 case 'u': /* non bufferd mode */
1665 case 't': /* transparent mode */
1670 } else if (*cp=='2') {
1674 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1682 case 'j': /* JIS output */
1684 output_conv = j_oconv;
1686 case 'e': /* AT&T EUC output */
1687 output_conv = e_oconv;
1690 case 's': /* SJIS output */
1691 output_conv = s_oconv;
1693 case 'l': /* ISO8859 Latin-1 support, no conversion */
1694 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1695 input_f = LATIN1_INPUT;
1697 case 'i': /* Kanji IN ESC-$-@/B */
1698 if (*cp=='@'||*cp=='B')
1699 kanji_intro = *cp++;
1701 case 'o': /* ASCII IN ESC-(-J/B */
1702 if (*cp=='J'||*cp=='B'||*cp=='H')
1703 ascii_intro = *cp++;
1707 bit:1 katakana->hiragana
1708 bit:2 hiragana->katakana
1710 if ('9'>= *cp && *cp>='0')
1711 hira_f |= (*cp++ -'0');
1718 #if defined(MSDOS) || defined(__OS2__)
1733 #ifdef UTF8_OUTPUT_ENABLE
1734 case 'w': /* UTF-8 output */
1736 output_conv = w_oconv; cp++;
1740 output_bom_f = TRUE;
1743 if ('1'== cp[0] && '6'==cp[1]) {
1744 output_conv = w_oconv16; cp+=2;
1745 } else if ('3'== cp[0] && '2'==cp[1]) {
1746 output_conv = w_oconv32; cp+=2;
1748 output_conv = w_oconv;
1753 output_endian = ENDIAN_LITTLE;
1754 } else if (cp[0] == 'B') {
1762 output_bom_f = TRUE;
1767 #ifdef UTF8_INPUT_ENABLE
1768 case 'W': /* UTF input */
1771 input_f = UTF8_INPUT;
1773 if ('1'== cp[0] && '6'==cp[1]) {
1775 input_f = UTF16_INPUT;
1776 input_endian = ENDIAN_BIG;
1777 } else if ('3'== cp[0] && '2'==cp[1]) {
1779 input_f = UTF32_INPUT;
1780 input_endian = ENDIAN_BIG;
1782 input_f = UTF8_INPUT;
1787 input_endian = ENDIAN_LITTLE;
1788 } else if (cp[0] == 'B') {
1794 /* Input code assumption */
1795 case 'J': /* JIS input */
1796 input_f = JIS_INPUT;
1798 case 'E': /* AT&T EUC input */
1799 input_f = EUC_INPUT;
1801 case 'S': /* MS Kanji input */
1802 input_f = SJIS_INPUT;
1803 if (x0201_f==NO_X0201) x0201_f=TRUE;
1805 case 'Z': /* Convert X0208 alphabet to asii */
1806 /* bit:0 Convert X0208
1807 bit:1 Convert Kankaku to one space
1808 bit:2 Convert Kankaku to two spaces
1809 bit:3 Convert HTML Entity
1811 if ('9'>= *cp && *cp>='0')
1812 alpha_f |= 1<<(*cp++ -'0');
1816 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1817 x0201_f = FALSE; /* No X0201->X0208 conversion */
1819 ESC-(-I in JIS, EUC, MS Kanji
1820 SI/SO in JIS, EUC, MS Kanji
1821 SSO in EUC, JIS, not in MS Kanji
1822 MS Kanji (0xa0-0xdf)
1824 ESC-(-I in JIS (0x20-0x5f)
1825 SSO in EUC (0xa0-0xdf)
1826 0xa0-0xd in MS Kanji (0xa0-0xdf)
1829 case 'X': /* Assume X0201 kana */
1830 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1833 case 'F': /* prserve new lines */
1834 fold_preserve_f = TRUE;
1835 case 'f': /* folding -f60 or -f */
1838 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1840 fold_len += *cp++ - '0';
1842 if (!(0<fold_len && fold_len<BUFSIZ))
1843 fold_len = DEFAULT_FOLD;
1847 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1849 fold_margin += *cp++ - '0';
1853 case 'm': /* MIME support */
1854 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1855 if (*cp=='B'||*cp=='Q') {
1856 mime_decode_mode = *cp++;
1857 mimebuf_f = FIXED_MIME;
1858 } else if (*cp=='N') {
1859 mime_f = TRUE; cp++;
1860 } else if (*cp=='S') {
1861 mime_f = STRICT_MIME; cp++;
1862 } else if (*cp=='0') {
1863 mime_decode_f = FALSE;
1864 mime_f = FALSE; cp++;
1867 case 'M': /* MIME output */
1870 mimeout_f = FIXED_MIME; cp++;
1871 } else if (*cp=='Q') {
1873 mimeout_f = FIXED_MIME; cp++;
1878 case 'B': /* Broken JIS support */
1880 bit:1 allow any x on ESC-(-x or ESC-$-x
1881 bit:2 reset to ascii on NL
1883 if ('9'>= *cp && *cp>='0')
1884 broken_f |= 1<<(*cp++ -'0');
1889 case 'O':/* for Output file */
1893 case 'c':/* add cr code */
1896 case 'd':/* delete cr code */
1899 case 'I': /* ISO-2022-JP output */
1902 case 'L': /* line mode */
1903 if (*cp=='u') { /* unix */
1904 crmode_f = NL; cp++;
1905 } else if (*cp=='m') { /* mac */
1906 crmode_f = CR; cp++;
1907 } else if (*cp=='w') { /* windows */
1908 crmode_f = CRLF; cp++;
1909 } else if (*cp=='0') { /* no conversion */
1919 /* multiple options in one string are allowed for the Perl module */
1920 while(*cp && *cp++!='-');
1923 /* bogus option but ignored */
/*
 * Find the input_code entry that uses the given input converter.
 * The search starts at input_code_list and matches on the iconv_func member.
 */
1929 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1932 struct input_code *p = input_code_list;
1934 if (iconv_func == p->iconv_func){
/*
 * Select the input converter.
 *   f          - callers pass TRUE, FALSE or -TRUE; -TRUE means "force".
 *   iconv_func - converter to install (callers may pass 0).
 * When a code is established (estab_f) and the current converter differs
 * from iconv_for_check, the matching input_code entry is looked up, its name
 * is recorded with set_input_codename() and reported through debug(), and
 * iconv_for_check is brought up to date.
 */
1943 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1945 #ifdef INPUT_CODE_FIX
1953 #ifdef INPUT_CODE_FIX
1954 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1960 if (estab_f && iconv_for_check != iconv){
1961 struct input_code *p = find_inputcode_byfunc(iconv);
1963 set_input_codename(p->name);
1964 debug(input_codename);
1966 iconv_for_check = iconv;
/*
 * Score bits OR-ed into input_code.score during code detection.
 * Each flag is the previous one shifted left by one bit; when candidates
 * are compared, the one with the smaller score is chosen.
 */
1971 #define SCORE_L2 (1) /* JIS level-2 kanji */
1972 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1973 #define SCORE_DEPEND (SCORE_KANA << 1) /* machine-dependent characters */
1974 #ifdef SHIFTJIS_CP932
1975 #define SCORE_CP932 (SCORE_DEPEND << 1) /* character re-mapped through CP932 */
1976 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* nonexistent character */
1978 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* nonexistent character */
1980 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1981 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
1983 #define SCORE_INIT (SCORE_iMIME)
/* Scores used by code_score() when (c2 & 0x70) == 0x20; indexed by (c2 & 0x0f). */
1985 const nkf_char score_table_A0[] = {
1988 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1989 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/* Scores used by code_score() when (c2 & 0x70) == 0x70; indexed by (c2 & 0x0f). */
1992 const nkf_char score_table_F0[] = {
1993 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1994 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1995 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1996 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* Set the given SCORE_* bit(s) in ptr->score (bitwise OR). */
1999 void set_code_score(struct input_code *ptr, nkf_char score)
2002 ptr->score |= score;
/* Clear the given SCORE_* bit(s) from ptr->score (AND with the complement). */
2006 void clr_code_score(struct input_code *ptr, nkf_char score)
2009 ptr->score &= ~score;
/*
 * Grade the character currently held in ptr->buf (buf[0] is used as c2 and,
 * with UTF8_OUTPUT_ENABLE, buf[1] as c1) and add the resulting flags to the
 * candidate's score:
 *   - SCORE_ERROR for the first (error) case,
 *   - SCORE_KANA when c2 is SSO,
 *   - SCORE_NO_EXIST when e2w_conv() yields 0 for the pair,
 *   - a score_table_A0 / score_table_F0 entry selected by the low nibble of
 *     c2 when (c2 & 0x70) is 0x20 / 0x70,
 *   - SCORE_L2 when (c2 & 0x70) >= 0x50.
 */
2013 void code_score(struct input_code *ptr)
2015 nkf_char c2 = ptr->buf[0];
2016 #ifdef UTF8_OUTPUT_ENABLE
2017 nkf_char c1 = ptr->buf[1];
2020 set_code_score(ptr, SCORE_ERROR);
2021 }else if (c2 == SSO){
2022 set_code_score(ptr, SCORE_KANA);
2023 #ifdef UTF8_OUTPUT_ENABLE
2024 }else if (!e2w_conv(c2, c1)){
2025 set_code_score(ptr, SCORE_NO_EXIST);
2027 }else if ((c2 & 0x70) == 0x20){
2028 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2029 }else if ((c2 & 0x70) == 0x70){
2030 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2031 }else if ((c2 & 0x70) >= 0x50){
2032 set_code_score(ptr, SCORE_L2);
/*
 * Rule out a detection candidate.  If the candidate's converter is the one
 * currently installed as iconv, the selection is dropped with
 * set_iconv(FALSE, 0).
 */
2036 void status_disable(struct input_code *ptr)
2041 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append c to the candidate's byte buffer and advance ptr->index. */
2044 void status_push_ch(struct input_code *ptr, nkf_char c)
2046 ptr->buf[ptr->index++] = c;
2049 void status_clear(struct input_code *ptr)
/* Reset a detection candidate; its score goes back to SCORE_INIT. */
2055 void status_reset(struct input_code *ptr)
2058 ptr->score = SCORE_INIT;
/* Re-initialise a detection candidate; clears its _file_stat field. */
2061 void status_reinit(struct input_code *ptr)
2064 ptr->_file_stat = 0;
/*
 * Common pre-check used by the *_status detectors.  It only acts when c is
 * a 7-bit value (c <= DEL) and an input code is already established.
 */
2067 void status_check(struct input_code *ptr, nkf_char c)
2069 if (c <= DEL && estab_f){
/*
 * Shift_JIS detector: feed one input byte c into the candidate state ptr.
 *   - 0xa1-0xdf is buffered as SSO followed by c (single-byte kana).
 *   - 0x81-0x9f and 0xe0-0xef are buffered as lead bytes; with
 *     SHIFTJIS_CP932 the IBM extension lead bytes (is_ibmext_in_sjis) and,
 *     when x0212_f is set, 0xf0-0xfc are buffered as well.
 *   - A trail byte must lie in 0x40-0x7e or 0x80-0xfc; the buffered pair is
 *     then converted in place with s2e_conv().  In the CP932 extension path
 *     a zero return from s2e_conv() adds SCORE_CP932.
 *   - status_disable() is called on the remaining paths (presumably for
 *     bytes that fit none of the ranges above).
 */
2074 void s_status(struct input_code *ptr, nkf_char c)
2078 status_check(ptr, c);
2083 #ifdef NUMCHAR_OPTION
2084 }else if (is_unicode_capsule(c)){
2087 }else if (0xa1 <= c && c <= 0xdf){
2088 status_push_ch(ptr, SSO);
2089 status_push_ch(ptr, c);
2092 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2094 status_push_ch(ptr, c);
2095 #ifdef SHIFTJIS_CP932
2097 && is_ibmext_in_sjis(c)){
2099 status_push_ch(ptr, c);
2100 #endif /* SHIFTJIS_CP932 */
2102 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2104 status_push_ch(ptr, c);
2105 #endif /* X0212_ENABLE */
2107 status_disable(ptr);
2111 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2112 status_push_ch(ptr, c);
2113 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2117 status_disable(ptr);
2121 #ifdef SHIFTJIS_CP932
2122 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2123 status_push_ch(ptr, c);
2124 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2125 set_code_score(ptr, SCORE_CP932);
2130 #endif /* SHIFTJIS_CP932 */
2131 #ifndef X0212_ENABLE
2132 status_disable(ptr);
/*
 * EUC-JP detector: feed one input byte c into the candidate state ptr.
 *   - SSO or 0xa1-0xfe is buffered as a first byte.
 *   - 0x8f is buffered as a prefix (X0212_ENABLE section).
 *   - Following bytes must lie in 0xa1-0xfe to be buffered.
 *   - status_disable() is called on the remaining paths (presumably for
 *     bytes that fit none of the ranges above).
 */
2138 void e_status(struct input_code *ptr, nkf_char c)
2142 status_check(ptr, c);
2147 #ifdef NUMCHAR_OPTION
2148 }else if (is_unicode_capsule(c)){
2151 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2153 status_push_ch(ptr, c);
2155 }else if (0x8f == c){
2157 status_push_ch(ptr, c);
2158 #endif /* X0212_ENABLE */
2160 status_disable(ptr);
2164 if (0xa1 <= c && c <= 0xfe){
2165 status_push_ch(ptr, c);
2169 status_disable(ptr);
2174 if (0xa1 <= c && c <= 0xfe){
2176 status_push_ch(ptr, c);
2178 status_disable(ptr);
2180 #endif /* X0212_ENABLE */
2184 #ifdef UTF8_INPUT_ENABLE
/*
 * UTF-8 detector: feed one input byte c into the candidate state ptr.
 *   - Lead bytes 0xc0-0xdf, 0xe0-0xef and 0xf0-0xf4 are buffered.
 *   - Continuation bytes must lie in 0x80-0xbf.  Once ptr->index exceeds
 *     ptr->stat the buffered bytes are checked for the BOM (EF BB BF) and
 *     converted in place with w2e_conv().
 *   - status_disable() is called on the remaining paths (presumably for
 *     bytes that fit none of the ranges above).
 */
2185 void w_status(struct input_code *ptr, nkf_char c)
2189 status_check(ptr, c);
2194 #ifdef NUMCHAR_OPTION
2195 }else if (is_unicode_capsule(c)){
2198 }else if (0xc0 <= c && c <= 0xdf){
2200 status_push_ch(ptr, c);
2201 }else if (0xe0 <= c && c <= 0xef){
2203 status_push_ch(ptr, c);
2204 }else if (0xf0 <= c && c <= 0xf4){
2206 status_push_ch(ptr, c);
2208 status_disable(ptr);
2213 if (0x80 <= c && c <= 0xbf){
2214 status_push_ch(ptr, c);
2215 if (ptr->index > ptr->stat){
2216 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2217 && ptr->buf[2] == 0xbf);
2218 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2219 &ptr->buf[0], &ptr->buf[1]);
2226 status_disable(ptr);
2230 if (0x80 <= c && c <= 0xbf){
2231 if (ptr->index < ptr->stat){
2232 status_push_ch(ptr, c);
2237 status_disable(ptr);
/*
 * Feed one input byte to the detection candidates in input_code_list by
 * calling each entry's status_func.  When a result candidate has been
 * picked and no code is established yet, its converter is installed with
 * set_iconv(TRUE, ...); otherwise, for a 7-bit byte (c <= DEL), the list
 * is walked again from input_code_list.
 */
2244 void code_status(nkf_char c)
2246 int action_flag = 1;
2247 struct input_code *result = 0;
2248 struct input_code *p = input_code_list;
2250 if (!p->status_func) {
2254 if (!p->status_func)
2256 (p->status_func)(p, c);
2259 }else if(p->stat == 0){
2270 if (result && !estab_f){
2271 set_iconv(TRUE, result->iconv_func);
2272 }else if (c <= DEL){
2273 struct input_code *ptr = input_code_list;
/*
 * Basic input function.  Characters pushed back by std_ungetc() are
 * returned from std_gc_buf in last-in, first-out order (the index is
 * decremented before the read).
 */
2283 nkf_char std_getc(FILE *f)
2286 return std_gc_buf[--std_gc_ndx];
/*
 * Push c back onto the std_gc_buf stack.  The full condition
 * (std_gc_ndx == STD_GC_BUFSIZE) is checked before the character is stored.
 */
2292 nkf_char std_ungetc(nkf_char c, FILE *f)
2294 if (std_gc_ndx == STD_GC_BUFSIZE){
2297 std_gc_buf[std_gc_ndx++] = c;
2302 void std_putc(nkf_char c)
2309 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Pass-through mode (not built for PERL_XS or WIN32DLL): connects the
 * modules, then reads characters with i_getc until EOF.
 */
2310 nkf_char noconvert(FILE *f)
2315 module_connection();
2316 while ((c = (*i_getc)(f)) != EOF)
/*
 * Build the conversion pipeline from the current option flags.
 *
 * Output side: oconv starts as output_conv and is then wrapped by optional
 * filter stages; each stage stores the previous oconv in its own o_* pointer
 * and installs itself as the new oconv.
 *
 * Input side: i_getc / i_ungetc are wrapped the same way, each stage saving
 * the previous pair in its own i_* pointers.
 *
 * Finally the input converter is chosen from input_f with set_iconv(); an
 * explicitly requested input code is forced with -TRUE, and e_iconv is
 * installed unforced otherwise.
 */
2323 void module_connection(void)
2325 oconv = output_conv;
2328 /* replace continuation module, from output side */
2330 /* output redirection */
2332 if (noout_f || guess_f){
2339 if (mimeout_f == TRUE) {
2340 o_base64conv = oconv; oconv = base64_conv;
2342 /* base64_count = 0; */
2346 o_crconv = oconv; oconv = cr_conv;
2349 o_rot_conv = oconv; oconv = rot_conv;
2352 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2355 o_hira_conv = oconv; oconv = hira_conv;
2358 o_fconv = oconv; oconv = fold_conv;
2361 if (alpha_f || x0201_f) {
2362 o_zconv = oconv; oconv = z_conv;
2366 i_ungetc = std_ungetc;
2367 /* input redirection */
2370 i_cgetc = i_getc; i_getc = cap_getc;
2371 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2374 i_ugetc = i_getc; i_getc = url_getc;
2375 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2378 #ifdef NUMCHAR_OPTION
2380 i_ngetc = i_getc; i_getc = numchar_getc;
2381 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2384 #ifdef UNICODE_NORMALIZATION
2385 if (nfc_f && input_f == UTF8_INPUT){
2386 i_nfc_getc = i_getc; i_getc = nfc_getc;
2387 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2390 if (mime_f && mimebuf_f==FIXED_MIME) {
2391 i_mgetc = i_getc; i_getc = mime_getc;
2392 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2395 i_bgetc = i_getc; i_getc = broken_getc;
2396 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2398 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2399 set_iconv(-TRUE, e_iconv);
2400 } else if (input_f == SJIS_INPUT) {
2401 set_iconv(-TRUE, s_iconv);
2402 #ifdef UTF8_INPUT_ENABLE
2403 } else if (input_f == UTF8_INPUT) {
2404 set_iconv(-TRUE, w_iconv);
2405 } else if (input_f == UTF16_INPUT) {
2406 set_iconv(-TRUE, w_iconv16);
2407 } else if (input_f == UTF32_INPUT) {
2408 set_iconv(-TRUE, w_iconv32);
2411 set_iconv(FALSE, e_iconv);
2415 struct input_code *p = input_code_list;
2423 * Check and Ignore BOM
/*
 * Inspect the first bytes of f for a Unicode byte order mark.
 *
 * Recognised sequences:
 *   00 00 FE FF  UTF-32, ENDIAN_BIG       00 00 FF FE  UTF-32, ENDIAN_2143
 *   FE FF 00 00  UTF-32, ENDIAN_3412      FF FE 00 00  UTF-32, ENDIAN_LITTLE
 *   EF BB BF     UTF-8
 *   FE FF        UTF-16, ENDIAN_BIG       FF FE        UTF-16, ENDIAN_LITTLE
 *
 * On a match the corresponding converter is selected with
 * set_iconv(TRUE, ...), and input_endian is set when that converter is the
 * active iconv.  Bytes that were read but do not complete a BOM are pushed
 * back with i_ungetc, last-read byte first, so the stream is unchanged.
 */
2425 void check_bom(FILE *f)
2428 switch(c2 = (*i_getc)(f)){
2430 if((c2 = (*i_getc)(f)) == 0x00){
2431 if((c2 = (*i_getc)(f)) == 0xFE){
2432 if((c2 = (*i_getc)(f)) == 0xFF){
2434 set_iconv(TRUE, w_iconv32);
2436 if (iconv == w_iconv32) {
2437 input_endian = ENDIAN_BIG;
2440 (*i_ungetc)(0xFF,f);
2441 }else (*i_ungetc)(c2,f);
2442 (*i_ungetc)(0xFE,f);
2443 }else if(c2 == 0xFF){
2444 if((c2 = (*i_getc)(f)) == 0xFE){
2446 set_iconv(TRUE, w_iconv32);
2448 if (iconv == w_iconv32) {
2449 input_endian = ENDIAN_2143;
2452 (*i_ungetc)(0xFF,f);
2453 }else (*i_ungetc)(c2,f);
2454 (*i_ungetc)(0xFF,f);
2455 }else (*i_ungetc)(c2,f);
2456 (*i_ungetc)(0x00,f);
2457 }else (*i_ungetc)(c2,f);
2458 (*i_ungetc)(0x00,f);
2461 if((c2 = (*i_getc)(f)) == 0xBB){
2462 if((c2 = (*i_getc)(f)) == 0xBF){
2464 set_iconv(TRUE, w_iconv);
2466 if (iconv == w_iconv) {
2469 (*i_ungetc)(0xBF,f);
2470 }else (*i_ungetc)(c2,f);
2471 (*i_ungetc)(0xBB,f);
2472 }else (*i_ungetc)(c2,f);
2473 (*i_ungetc)(0xEF,f);
2476 if((c2 = (*i_getc)(f)) == 0xFF){
2477 if((c2 = (*i_getc)(f)) == 0x00){
2478 if((c2 = (*i_getc)(f)) == 0x00){
2480 set_iconv(TRUE, w_iconv32);
2482 if (iconv == w_iconv32) {
2483 input_endian = ENDIAN_3412;
2486 (*i_ungetc)(0x00,f);
2487 }else (*i_ungetc)(c2,f);
2488 (*i_ungetc)(0x00,f);
2489 }else (*i_ungetc)(c2,f);
2491 set_iconv(TRUE, w_iconv16);
2493 if (iconv == w_iconv16) {
2494 input_endian = ENDIAN_BIG;
2497 (*i_ungetc)(0xFF,f);
2498 }else (*i_ungetc)(c2,f);
2499 (*i_ungetc)(0xFE,f);
2502 if((c2 = (*i_getc)(f)) == 0xFE){
2503 if((c2 = (*i_getc)(f)) == 0x00){
2504 if((c2 = (*i_getc)(f)) == 0x00){
2506 set_iconv(TRUE, w_iconv32);
2508 if (iconv == w_iconv32) {
2509 input_endian = ENDIAN_LITTLE;
2512 (*i_ungetc)(0x00,f);
2513 }else (*i_ungetc)(c2,f);
2514 (*i_ungetc)(0x00,f);
2515 }else (*i_ungetc)(c2,f);
2517 set_iconv(TRUE, w_iconv16);
2519 if (iconv == w_iconv16) {
2520 input_endian = ENDIAN_LITTLE;
2523 (*i_ungetc)(0xFE,f);
2524 }else (*i_ungetc)(c2,f);
2525 (*i_ungetc)(0xFF,f);
2534 Conversion main loop. Code detection only.
/*
 * Main conversion loop for one input stream.
 *
 * After module_connection() has wired up the pipeline, characters are read
 * with i_getc until EOF.  The loop tracks the first byte of a pending
 * multi-byte character in c2, handles 8-bit input (deferring to h_conv()
 * while the input code is not yet established), UTF-16/UTF-32 input units,
 * ISO-2022 escape sequences and SI/SO shifts, MIME "=?" starts and line
 * ends, and finally passes each character through iconv or oconv.
 * The NEXT / SEND / LAST macros are shorthands for continue / fall through /
 * break inside that loop.
 *
 * At the end iconv is called with EOF, and when no input code name has been
 * set the candidate with the lowest score supplies the reported name.
 */
2537 nkf_char kanji_convert(FILE *f)
2539 nkf_char c3, c2=0, c1, c0=0;
2540 int is_8bit = FALSE;
2542 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2543 #ifdef UTF8_INPUT_ENABLE
2544 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2551 output_mode = ASCII;
2554 #define NEXT continue /* no output, get next */
2555 #define SEND ; /* output c1 and c2, get next */
2556 #define LAST break /* end of loop, go closing */
2558 module_connection();
2561 while ((c1 = (*i_getc)(f)) != EOF) {
2562 #ifdef INPUT_CODE_FIX
2568 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2569 /* in case of 8th bit is on */
2570 if (!estab_f&&!mime_decode_mode) {
2571 /* in case of not established yet */
2572 /* It is still ambiguous */
2573 if (h_conv(f, c2, c1)==EOF)
2579 /* in case of already established */
2581 /* ignore bogus code and not CP5022x UCD */
2589 /* second byte, 7 bit code */
2590 /* it might be kanji shifted */
2591 if ((c1 == DEL) || (c1 <= SPACE)) {
2592 /* ignore bogus first code */
2599 #ifdef UTF8_INPUT_ENABLE
2600 if (iconv == w_iconv16) {
2601 if (input_endian == ENDIAN_BIG) {
2603 if ((c1 = (*i_getc)(f)) != EOF) {
2604 if (0xD8 <= c2 && c2 <= 0xDB) {
2605 if ((c0 = (*i_getc)(f)) != EOF) {
2607 if ((c3 = (*i_getc)(f)) != EOF) {
2614 if ((c2 = (*i_getc)(f)) != EOF) {
2615 if (0xD8 <= c2 && c2 <= 0xDB) {
2616 if ((c3 = (*i_getc)(f)) != EOF) {
2617 if ((c0 = (*i_getc)(f)) != EOF) {
2626 } else if(iconv == w_iconv32){
2628 if((c2 = (*i_getc)(f)) != EOF &&
2629 (c1 = (*i_getc)(f)) != EOF &&
2630 (c0 = (*i_getc)(f)) != EOF){
2631 switch(input_endian){
2633 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2636 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2639 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2642 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2652 #ifdef NUMCHAR_OPTION
2653 if (is_unicode_capsule(c1)){
2657 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2659 if (!estab_f && !iso8859_f) {
2660 /* not established yet */
2663 } else { /* estab_f==TRUE */
2668 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2669 /* SJIS X0201 Case... */
2670 if(iso2022jp_f && x0201_f==NO_X0201) {
2671 (*oconv)(GETA1, GETA2);
2678 } else if (c1==SSO && iconv != s_iconv) {
2679 /* EUC X0201 Case */
2680 c1 = (*i_getc)(f); /* skip SSO */
2682 if (SSP<=c1 && c1<0xe0) {
2683 if(iso2022jp_f && x0201_f==NO_X0201) {
2684 (*oconv)(GETA1, GETA2);
2691 } else { /* bogus code, skip SSO and one byte */
2694 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2695 (c1 == 0xFD || c1 == 0xFE)) {
2701 /* already established */
2706 } else if ((c1 > SPACE) && (c1 != DEL)) {
2707 /* in case of Roman characters */
2709 /* output 1 shifted byte */
2713 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2714 /* output 1 shifted byte */
2715 if(iso2022jp_f && x0201_f==NO_X0201) {
2716 (*oconv)(GETA1, GETA2);
2723 /* look like bogus code */
2726 } else if (input_mode == X0208 || input_mode == X0212 ||
2727 input_mode == X0213_1 || input_mode == X0213_2) {
2728 /* in case of Kanji shifted */
2731 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2732 /* Check MIME code */
2733 if ((c1 = (*i_getc)(f)) == EOF) {
2736 } else if (c1 == '?') {
2737 /* =? is mime conversion start sequence */
2738 if(mime_f == STRICT_MIME) {
2739 /* check in real detail */
2740 if (mime_begin_strict(f) == EOF)
2744 } else if (mime_begin(f) == EOF)
2754 /* normal ASCII code */
2757 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
\r
2760 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
\r
2763 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
\r
2764 if ((c1 = (*i_getc)(f)) == EOF) {
2765 /* (*oconv)(0, ESC); don't send bogus code */
2767 } else if (c1 == '$') {
2768 if ((c1 = (*i_getc)(f)) == EOF) {
2770 (*oconv)(0, ESC); don't send bogus code
2771 (*oconv)(0, '$'); */
2773 } else if (c1 == '@'|| c1 == 'B') {
2774 /* This is kanji introduction */
2777 set_input_codename("ISO-2022-JP");
2779 debug(input_codename);
2782 } else if (c1 == '(') {
2783 if ((c1 = (*i_getc)(f)) == EOF) {
2784 /* don't send bogus code
2790 } else if (c1 == '@'|| c1 == 'B') {
2791 /* This is kanji introduction */
2796 } else if (c1 == 'D'){
2800 #endif /* X0212_ENABLE */
2801 } else if (c1 == (X0213_1&0x7F)){
2802 input_mode = X0213_1;
2805 } else if (c1 == (X0213_2&0x7F)){
2806 input_mode = X0213_2;
2810 /* could be some special code */
2817 } else if (broken_f&0x2) {
2818 /* accept any ESC-(-x as broken code ... */
2828 } else if (c1 == '(') {
2829 if ((c1 = (*i_getc)(f)) == EOF) {
2830 /* don't send bogus code
2832 (*oconv)(0, '('); */
2836 /* This is X0201 kana introduction */
2837 input_mode = X0201; shift_mode = X0201;
2839 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2840 /* This is X0208 kanji introduction */
2841 input_mode = ASCII; shift_mode = FALSE;
2843 } else if (broken_f&0x2) {
2844 input_mode = ASCII; shift_mode = FALSE;
2849 /* maintain various input_mode here */
2853 } else if ( c1 == 'N' || c1 == 'n' ){
2855 c3 = (*i_getc)(f); /* skip SS2 */
2856 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2871 } else if (c1 == ESC && iconv == s_iconv) {
2872 /* ESC in Shift_JIS */
2873 if ((c1 = (*i_getc)(f)) == EOF) {
2874 /* (*oconv)(0, ESC); don't send bogus code */
2876 } else if (c1 == '$') {
2878 if ((c1 = (*i_getc)(f)) == EOF) {
2880 (*oconv)(0, ESC); don't send bogus code
2881 (*oconv)(0, '$'); */
2884 if (('E' <= c1 && c1 <= 'G') ||
2885 ('O' <= c1 && c1 <= 'Q')) {
2893 static const int jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2894 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SPACE + 0xE000 + CLASS_UNICODE;
2895 while ((c1 = (*i_getc)(f)) != EOF) {
2896 if (SPACE <= c1 && c1 <= 'z') {
2897 (*oconv)(0, c1 + c0);
2898 } else break; /* c1 == SO */
2902 if (c1 == EOF) LAST;
2909 } else if (c1 == NL || c1 == CR) {
2911 input_mode = ASCII; set_iconv(FALSE, 0);
2913 } else if (mime_decode_f && !mime_decode_mode){
2915 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2923 } else { /* if (c1 == CR)*/
2924 if ((c1=(*i_getc)(f))!=EOF) {
2928 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2942 if (crmode_f == CR && c1 == NL) crmode_f = CRLF;
2944 } else if (c1 == DEL && input_mode == X0208 ) {
2954 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2957 if ((c0 = (*i_getc)(f)) != EOF) {
2960 if ((c3 = (*i_getc)(f)) != EOF) {
2962 (*iconv)(c2, c1, c0|c3);
2967 /* 3 bytes EUC or UTF-8 */
2968 if ((c0 = (*i_getc)(f)) != EOF) {
2970 (*iconv)(c2, c1, c0);
2978 0x7F <= c2 && c2 <= 0x92 &&
2979 0x21 <= c1 && c1 <= 0x7E) {
2981 if(c1 == 0x7F) return 0;
2982 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2985 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2989 (*oconv)(PREFIX_EUCG3 | c2, c1);
2991 #endif /* X0212_ENABLE */
2993 (*oconv)(PREFIX_EUCG3 | c2, c1);
2996 (*oconv)(input_mode, c1); /* other special case */
3002 /* goto next_word */
3006 (*iconv)(EOF, 0, 0);
3007 if (!is_inputcode_set)
3010 struct input_code *p = input_code_list;
3011 struct input_code *result = p;
3013 if (p->score < result->score) result = p;
3016 set_input_codename(result->name);
/*
 * Hold-and-decide conversion for 8-bit input whose code is still ambiguous.
 *
 * Input is read with i_getc and pushed into hold_buf until push_hold_buf()
 * reports EOF (buffer full) or a code becomes established.  The candidate
 * with the lowest score that has a status_func is then installed with
 * set_iconv(TRUE, ...), and the held bytes are replayed through iconv.
 * Extra bytes needed by a multi-byte character are taken from hold_buf
 * first and from the stream once the buffer is exhausted.
 */
3023 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3025 nkf_char ret, c3, c0;
3029 /** it must NOT be in the kanji shift sequence */
3030 /** it must NOT be written in JIS7 */
3031 /** and it must be after 2 byte 8bit code */
3037 while ((c1 = (*i_getc)(f)) != EOF) {
3043 if (push_hold_buf(c1) == EOF || estab_f){
3049 struct input_code *p = input_code_list;
3050 struct input_code *result = p;
3055 if (p->status_func && p->score < result->score){
3060 set_iconv(TRUE, result->iconv_func);
3065 ** 1) EOF is detected, or
3066 ** 2) Code is established, or
3067 ** 3) Buffer is FULL (but last word is pushed)
3069 ** in 1) and 3) cases, we continue to use
3070 ** Kanji codes by oconv and leave estab_f unchanged.
3075 while (hold_index < hold_count){
3076 c2 = hold_buf[hold_index++];
3078 #ifdef NUMCHAR_OPTION
3079 || is_unicode_capsule(c2)
3084 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3085 (*iconv)(X0201, c2, 0);
3088 if (hold_index < hold_count){
3089 c1 = hold_buf[hold_index++];
3099 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3102 if (hold_index < hold_count){
3103 c0 = hold_buf[hold_index++];
3104 } else if ((c0 = (*i_getc)(f)) == EOF) {
3110 if (hold_index < hold_count){
3111 c3 = hold_buf[hold_index++];
3112 } else if ((c3 = (*i_getc)(f)) == EOF) {
3117 (*iconv)(c2, c1, c0|c3);
3122 /* 3 bytes EUC or UTF-8 */
3123 if (hold_index < hold_count){
3124 c0 = hold_buf[hold_index++];
3125 } else if ((c0 = (*i_getc)(f)) == EOF) {
3131 (*iconv)(c2, c1, c0);
3134 if (c0 == EOF) break;
/*
 * Append one byte to hold_buf.  The value is stored truncated to
 * unsigned char.  Returns the new hold_count, or EOF once the buffer holds
 * HOLD_SIZE*2 entries; the capacity is also checked before storing.
 */
3139 nkf_char push_hold_buf(nkf_char c2)
3141 if (hold_count >= HOLD_SIZE*2)
3143 hold_buf[hold_count++] = (unsigned char)c2;
3144 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair (c2 = lead, c1 = trail) towards the
 * internal EUC/JIS form; results are presumably delivered through p2 / p1.
 *
 * Visible steps:
 *   - SHIFTJIS_CP932: IBM extension lead bytes are looked up in
 *     shiftjis_cp932 (unless cp932inv_f is set), and the cp932inv table
 *     covers CP932INV_TABLE_BEGIN..CP932INV_TABLE_END.
 *   - Without x0213_f, IBM extension lead bytes are looked up in
 *     shiftjis_x0212 and tagged with PREFIX_EUCG3.
 *   - With x0213_f and c2 >= 0xF0, the row is derived from
 *     shift_jisx0213_s1a3_table or by arithmetic and tagged PREFIX_EUCG3.
 *   - Otherwise the usual arithmetic applies: the row is c2 doubled minus
 *     SJ0162 / SJ6394, plus one when c1 > 0x9E, and c1 is rebased by SPACE
 *     or 0x1F.
 */
3147 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3149 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3152 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3153 #ifdef SHIFTJIS_CP932
3154 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3155 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3162 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3163 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3169 #endif /* SHIFTJIS_CP932 */
3171 if (!x0213_f && is_ibmext_in_sjis(c2)){
3172 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3175 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3188 if(x0213_f && c2 >= 0xF0){
3189 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3190 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3191 }else{ /* 78<=k<=94 */
3192 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3193 if (0x9E < c1) c2++;
3196 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3197 if (0x9E < c1) c2++;
3200 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3207 c2 = x0212_unshift(c2);
/*
 * s_iconv: input converter taking (c2, c1, c0).
 * Branches visible in this listing:
 *  - c2 that is EOF, 0 or below SPACE has its own branch;
 *  - without x0213_f, lead bytes 0xF0..0xF9 with trail bytes 0x40..0xFC are
 *    mapped linearly (188 cells per lead byte; a trail byte of 0x7F makes
 *    the function return 0) onto code points starting at 0xE000 and tagged
 *    with CLASS_UNICODE;
 *  - other pairs go through s2e_conv(); a non-zero result from it is
 *    returned to the caller.
 */
3214 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3218 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3220 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3222 if(c1 == 0x7F) return 0;
3223 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3226 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3227 if (ret) return ret;
/*
 * e_iconv: input converter for EUC-style byte sequences (c2, c1, c0).
 * Branches visible in this listing:
 *  - c2 == 0x8f (three-byte form): unless cp51932_f or x0213_f is set,
 *    c1 in 0xF5..0xFE with c0 in 0xA1..0xFE is mapped linearly (94 cells per
 *    row) onto code points starting at 0xE3AC and tagged with CLASS_UNICODE.
 *    Elsewhere in this branch c2 and c1 are packed as (c2 << 8) | (c1 & 0x7f),
 *    and with SHIFTJIS_CP932 the pair is round-tripped through e2s_conv()
 *    and s2e_conv().
 *  - c2 == SSO, and c2 that is EOF, 0 or below SPACE, have their own branches.
 *  - two-byte form: with ms_ucs_map_f and without cp51932_f, c2 in 0xF5..0xFE
 *    with c1 in 0xA1..0xFE is mapped the same way starting at 0xE000; with
 *    cp51932_f, rows 0x79..0x7c are round-tripped through e2s_conv() and
 *    s2e_conv().
 */
3233 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3238 }else if (c2 == 0x8f){
3242 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3243 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3244 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3247 c2 = (c2 << 8) | (c1 & 0x7f);
3249 #ifdef SHIFTJIS_CP932
3252 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3253 s2e_conv(s2, s1, &c2, &c1);
3260 #endif /* SHIFTJIS_CP932 */
3262 #endif /* X0212_ENABLE */
3263 } else if (c2 == SSO){
3266 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3269 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3270 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3271 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3276 #ifdef SHIFTJIS_CP932
3277 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3279 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3280 s2e_conv(s2, s1, &c2, &c1);
3287 #endif /* SHIFTJIS_CP932 */
3294 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv: map UTF-8 bytes (c2 lead, c1, c0) to a pair returned via p2/p1.
 * For lead bytes 0xc0..0xef the work is delegated to
 * unicode_to_jis_common().  With NUMCHAR_OPTION, *p1 can instead receive
 * the code point computed by ww16_conv() tagged with CLASS_UNICODE (the
 * condition selecting that path is not visible in this listing).
 */
3295 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3302 }else if (0xc0 <= c2 && c2 <= 0xef) {
3303 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3304 #ifdef NUMCHAR_OPTION
3307 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * w_iconv: input converter for UTF-8 sequences (c2 lead byte, c1 second
 * byte, c0 remaining byte(s)).
 * The lead byte is classified through w_iconv_utf8_1st_byte[c2 - 0xC0] and
 * the continuation bytes are range-checked per class.  Where c0 is still 0
 * the function returns -1 (checks that test one trailing byte) or -2
 * (checks that test two trailing bytes packed in c0 with mask 0xc0c0).
 * Four-byte sequences ((c2 & 0xf8) == 0xf0) are turned into a CLASS_UNICODE
 * value with ww16_conv(); other sequences go through w2e_conv().
 * NOTE(review): the case labels and the return values of the failed range
 * checks are not visible in this listing.
 */
3315 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * Class per lead byte 0xC0..0xFF:
 *   0xC0-0xC1: 20   0xC2-0xDF: 21
 *   0xE0: 30   0xE1-0xEC: 31   0xED: 32   0xEE-0xEF: 33
 *   0xF0: 40   0xF1-0xF3: 41   0xF4: 42   0xF5-0xF7: 43
 *   0xF8-0xFB: 50   0xFC-0xFD: 60   0xFE-0xFF: 70
 */
3318 static const int w_iconv_utf8_1st_byte[] =
3320 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3321 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3322 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3323 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3325 if (c2 < 0 || 0xff < c2) {
3326 }else if (c2 == 0) { /* 0 : 1 byte*/
3328 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3331 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3333 if (c1 < 0x80 || 0xBF < c1) return 0;
3336 if (c0 == 0) return -1;
/* second byte restricted to 0xA0..0xBF */
3337 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3342 if (c0 == 0) return -1;
3343 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3347 if (c0 == 0) return -1;
/* second byte restricted to 0x80..0x9F */
3348 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3352 if (c0 == 0) return -2;
/* second byte restricted to 0x90..0xBF; c0 holds two trailing bytes */
3353 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3357 if (c0 == 0) return -2;
3358 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3362 if (c0 == 0) return -2;
/* second byte restricted to 0x80..0x8F */
3363 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3371 if (c2 == 0 || c2 == EOF){
3372 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3373 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3376 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3385 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the code point val as UTF-8.
 *   *p2 - lead byte
 *   *p1 - second byte
 *   *p0 - third byte; for a four-byte sequence the third and fourth bytes
 *         are packed as (third << 8) | fourth
 * NOTE(review): the branch for val below 0x80 is not visible in this
 * listing.
 */
3386 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3393 }else if (val < 0x800){
3394 *p2 = 0xc0 | (val >> 6);
3395 *p1 = 0x80 | (val & 0x3f);
3397 } else if (val <= NKF_INT32_C(0xFFFF)) {
3398 *p2 = 0xe0 | (val >> 12);
3399 *p1 = 0x80 | ((val >> 6) & 0x3f);
3400 *p0 = 0x80 | (val & 0x3f);
3401 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/* A four-byte lead byte is 11110xxx and carries bits 18..20 of val. */
3402 *p2 = 0xf0 | (val >> 18);
3403 *p1 = 0x80 | ((val >> 12) & 0x3f);
3404 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3413 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: decode UTF-8 bytes into a code point accumulated in val.
 * The lead byte c2 selects the form: >= 0xf0 (four bytes, with c0 carrying
 * the third byte in bits 8..13 and the fourth in the low bits), >= 0xe0
 * (three bytes) or >= 0xc0 (two bytes).
 * NOTE(review): the statements that fold in the lowest six bits and the
 * return statement are not visible in this listing.
 */
3414 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3419 } else if (c2 >= 0xf0){
3420 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3421 val = (c2 & 0x0f) << 18;
3422 val |= (c1 & 0x3f) << 12;
3423 val |= (c0 & 0x3f00) >> 2;
3425 }else if (c2 >= 0xe0){
3426 val = (c2 & 0x0f) << 12;
3427 val |= (c1 & 0x3f) << 6;
3429 }else if (c2 >= 0xc0){
3430 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv: map the code point val to a pair returned through p2/p1.
 * val is first encoded as UTF-8 with w16w_conv() and then looked up with
 * unicode_to_jis_common().  With NUMCHAR_OPTION, *p1 can be set to
 * CLASS_UNICODE | val (the condition for that is not visible in this
 * listing).
 */
3438 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3440 nkf_char c2, c1, c0;
3447 w16w_conv(val, &c2, &c1, &c0);
3448 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3449 #ifdef NUMCHAR_OPTION
3452 *p1 = CLASS_UNICODE | val;
3461 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16: input converter for UTF-16 code units.
 * c2 and c1 are the high and low byte of a 16-bit unit; c0 holds a second
 * unit when c2 is in 0xD8..0xDB (high surrogate).
 *  - (c2 == 0 and c1 < 0x80) or c2 == EOF has its own branch;
 *  - a high surrogate requires c0 in 0xDC00..0xDFFF, and the pair is
 *    combined into one code point tagged with CLASS_UNICODE;
 *  - (c2 >> 3) == 27 that was not handled above is an unpaired surrogate;
 *  - everything else is converted by w16e_conv(), whose non-zero result is
 *    returned.
 */
3462 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3465 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3468 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3469 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/* 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000 */
3471 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3473 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3478 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3479 if (ret) return ret;
/*
 * w_iconv32: input converter where c1 carries a whole code point.
 * Code points for which is_unicode_bmp() holds are converted by
 * w16e_conv(); elsewhere c1 is tagged with CLASS_UNICODE.  A non-zero ret
 * is returned to the caller.
 */
3484 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3488 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3489 } else if (is_unicode_bmp(c1)) {
3490 ret = w16e_conv(c1, &c2, &c1);
3493 c1 = CLASS_UNICODE | c1;
3495 if (ret) return ret;
/*
 * unicode_to_jis_common: look up UTF-8 bytes (c2, c1, c0) in the
 * utf8_to_euc_* tables and return the result through p2/p1.
 *  - Two-byte sequences (c2 < 0xe0) use a table chosen by ms_ucs_map_f and
 *    are resolved by w_iconv_common(c2, c1, ...).
 *  - Three-byte sequences use ppp[c2 - 0xE0], again chosen by ms_ucs_map_f,
 *    and are resolved by w_iconv_common(c1, c0, ...).
 *  - When no_best_fit_chars_f is set, selected characters are rejected
 *    first (return 1), depending on ms_ucs_map_f and cp932inv_f.
 *  - With SHIFTJIS_CP932, a successful G3 result is round-tripped through
 *    e2s_conv() and s2e_conv() unless cp932inv_f is set.
 * NOTE(review): the branch for c2 below 0x80, several guarding conditions
 * and the final return are not visible in this listing.
 */
3500 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3502 const unsigned short *const *pp;
3503 const unsigned short *const *const *ppp;
/* The four tables below are indexed by (c1 & 0x3F); a non-zero entry makes the lookup return 1. */
3504 static const int no_best_fit_chars_table_C2[] =
3505 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3506 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3507 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3508 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3509 static const int no_best_fit_chars_table_C2_ms[] =
3510 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3511 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3512 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3513 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3514 static const int no_best_fit_chars_table_932_C2[] =
3515 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3516 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3517 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3518 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3519 static const int no_best_fit_chars_table_932_C3[] =
3520 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3521 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3522 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3523 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3529 }else if(c2 < 0xe0){
3530 if(no_best_fit_chars_f){
3531 if(ms_ucs_map_f == UCS_MAP_CP932){
3534 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3537 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3540 }else if(!cp932inv_f){
3543 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3546 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3549 }else if(ms_ucs_map_f == UCS_MAP_MS){
3550 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3551 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3569 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3570 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3571 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3573 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3574 }else if(c0 < 0xF0){
3575 if(no_best_fit_chars_f){
3576 if(ms_ucs_map_f == UCS_MAP_CP932){
3577 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3578 }else if(ms_ucs_map_f == UCS_MAP_MS){
3583 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3586 if(c0 == 0x92) return 1;
3591 if(c1 == 0x80 || c0 == 0x9C) return 1;
3594 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3599 if(c0 == 0x94) return 1;
3602 if(c0 == 0xBB) return 1;
3612 if(c0 == 0x95) return 1;
3615 if(c0 == 0xA5) return 1;
3622 if(c0 == 0x8D) return 1;
3625 if(c0 == 0x9E && !cp932inv_f) return 1;
3628 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3636 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3637 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3638 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3640 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3642 #ifdef SHIFTJIS_CP932
3643 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3645 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3646 s2e_conv(s2, s1, p2, p1);
/*
 * w_iconv_common: two-level table lookup shared by the UTF-8 converters.
 * Returns 1 (no mapping) when the table pp is null, when c1 is outside
 * 0..psize-1, when the selected row p is null, when c0 is outside
 * 0..sizeof_utf8_to_euc_C2-1, or when the table entry val is 0.
 * With no_cp932ext_f, entries whose high byte is 0x2D or whose value is
 * above 0xF300 are also filtered.  A resulting c2 equal to SO is reported
 * as X0201.
 * NOTE(review): the index arithmetic and the stores through p2/p1 are not
 * visible in this listing.
 */
3655 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3658 const unsigned short *p;
3661 if (pp == 0) return 1;
3664 if (c1 < 0 || psize <= c1) return 1;
3666 if (p == 0) return 1;
3669 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3671 if (val == 0) return 1;
3672 if (no_cp932ext_f && (
3673 (val>>8) == 0x2D || /* NEC special characters */
3674 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3682 if (c2 == SO) c2 = X0201;
/*
 * nkf_each_char_to_hex: pass upper-case hexadecimal digits of c to the
 * callback f, one call f(0, digit) per nibble selected by shift.
 * NOTE(review): the loop that drives shift is not visible in this listing.
 */
3689 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3691 const char *hex = "0123456789ABCDEF";
3697 (*f)(0, hex[(c>>shift)&0xF]);
/*
 * encode_fallback_html: emit the decimal digits of c through oconv, most
 * significant digit first (0x30 + digit).  The guards visible here suppress
 * the leading digits when c is below the corresponding power of ten.
 * NOTE(review): the guards for the lower digits and any surrounding
 * prefix/suffix output are not visible in this listing.
 */
3707 void encode_fallback_html(nkf_char c)
3712 if(c >= NKF_INT32_C(1000000))
3713 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3714 if(c >= NKF_INT32_C(100000))
3715 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3717 (*oconv)(0, 0x30+(c/10000 )%10);
3719 (*oconv)(0, 0x30+(c/1000 )%10);
3721 (*oconv)(0, 0x30+(c/100 )%10);
3723 (*oconv)(0, 0x30+(c/10 )%10);
3725 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml: emit c in hexadecimal through oconv by way of
 * nkf_each_char_to_hex().
 * NOTE(review): any prefix/suffix output is not visible in this listing.
 */
3730 void encode_fallback_xml(nkf_char c)
3735 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: emit c as upper-case hexadecimal digits through
 * oconv.  Code points for which is_unicode_bmp() fails get two extra
 * leading digits (bits 20..23 and 16..19); the four low nibbles follow.
 * NOTE(review): the escape prefix written before the digits is not visible
 * in this listing.
 */
3740 void encode_fallback_java(nkf_char c)
3742 const char *hex = "0123456789ABCDEF";
3745 if(!is_unicode_bmp(c)){
3749 (*oconv)(0, hex[(c>>20)&0xF]);
3750 (*oconv)(0, hex[(c>>16)&0xF]);
3754 (*oconv)(0, hex[(c>>12)&0xF]);
3755 (*oconv)(0, hex[(c>> 8)&0xF]);
3756 (*oconv)(0, hex[(c>> 4)&0xF]);
3757 (*oconv)(0, hex[ c &0xF]);
/*
 * encode_fallback_perl: emit c in hexadecimal through oconv by way of
 * nkf_each_char_to_hex().
 * NOTE(review): any prefix/suffix output is not visible in this listing.
 */
3761 void encode_fallback_perl(nkf_char c)
3766 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: ignore the incoming value and emit
 * unicode_subchar instead, split into its second-lowest and lowest byte
 * for oconv.
 */
3771 void encode_fallback_subchar(nkf_char c)
3773 c = unicode_subchar;
3774 (*oconv)((c>>8)&0xFF, c&0xFF);
3779 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the pair (c2, c1) in the euc_to_utf8_* tables.
 * A row pointer p is selected first:
 *  - euc_to_utf8_1byte in one branch (its condition is not visible here);
 *  - for G3 rows (is_eucg3(c2)), x0212_to_utf8_2bytes[(c2 & 0x7f) - 0x21],
 *    with a special case for 0x8F22/0x43 under UCS_MAP_ASCII;
 *  - otherwise a row of euc_to_utf8_2bytes, euc_to_utf8_2bytes_mac or
 *    euc_to_utf8_2bytes_ms chosen by ms_ucs_map_f.
 * The column is (c1 & 0x7f) - 0x21, bounds-checked against
 * sizeof_euc_to_utf8_1byte.
 * NOTE(review): the return statements are not visible in this listing.
 */
3780 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3782 const unsigned short *p;
3785 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3793 p = euc_to_utf8_1byte;
3795 } else if (is_eucg3(c2)){
3796 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3799 c2 = (c2&0x7f) - 0x21;
3800 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3801 p = x0212_to_utf8_2bytes[c2];
3807 c2 = (c2&0x7f) - 0x21;
3808 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3810 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3811 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3812 euc_to_utf8_2bytes_ms[c2];
3817 c1 = (c1 & 0x7f) - 0x21;
3818 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv: UTF-8 output converter.
 *  - output_bom_f is cleared once it has been acted upon.
 *  - With NUMCHAR_OPTION, a CLASS_UNICODE capsule (c2 == 0) is unwrapped
 *    with VALUE_MASK and written directly as a 2-, 3- or 4-byte UTF-8
 *    sequence through o_putc.
 *  - c2 == ISO8859_1 writes c1 with the high bit set.
 *  - Other pairs are mapped by e2w_conv() and encoded by w16w_conv(); the
 *    third byte is written only when it is non-zero.
 */
3823 void w_oconv(nkf_char c2, nkf_char c1)
3829 output_bom_f = FALSE;
3840 #ifdef NUMCHAR_OPTION
3841 if (c2 == 0 && is_unicode_capsule(c1)){
3842 val = c1 & VALUE_MASK;
3845 }else if (val < 0x800){
3846 (*o_putc)(0xC0 | (val >> 6));
3847 (*o_putc)(0x80 | (val & 0x3f));
3848 } else if (val <= NKF_INT32_C(0xFFFF)) {
3849 (*o_putc)(0xE0 | (val >> 12));
3850 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3851 (*o_putc)(0x80 | (val & 0x3f));
3852 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3853 (*o_putc)(0xF0 | ( val>>18));
3854 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3855 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3856 (*o_putc)(0x80 | ( val & 0x3f));
3863 output_mode = ASCII;
3865 } else if (c2 == ISO8859_1) {
3866 output_mode = ISO8859_1;
3867 (*o_putc)(c1 | 0x080);
3870 val = e2w_conv(c2, c1);
3872 w16w_conv(val, &c2, &c1, &c0);
3876 if (c0) (*o_putc)(c0);
/*
 * w_oconv16: UTF-16 output converter.
 *  - output_bom_f is cleared and byte-order-mark bytes are written in the
 *    order selected by output_endian (only the 0xFF byte of each order is
 *    visible in this listing).
 *  - With NUMCHAR_OPTION, a CLASS_UNICODE capsule is written either as one
 *    unit (when is_unicode_bmp() holds) or, for values up to UNICODE_MAX,
 *    as a surrogate pair in the byte order given by output_endian.
 *  - Other pairs are mapped by e2w_conv() and split into high/low byte.
 */
3882 void w_oconv16(nkf_char c2, nkf_char c1)
3885 output_bom_f = FALSE;
3886 if (output_endian == ENDIAN_LITTLE){
3887 (*o_putc)((unsigned char)'\377');
3891 (*o_putc)((unsigned char)'\377');
3900 if (c2 == ISO8859_1) {
3903 #ifdef NUMCHAR_OPTION
3904 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3905 if (is_unicode_bmp(c1)) {
3906 c2 = (c1 >> 8) & 0xff;
3910 if (c1 <= UNICODE_MAX) {
/* 0xD7C0 == 0xD800 - (0x10000 >> 10) */
3911 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3912 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3913 if (output_endian == ENDIAN_LITTLE){
3914 (*o_putc)(c2 & 0xff);
3915 (*o_putc)((c2 >> 8) & 0xff);
3916 (*o_putc)(c1 & 0xff);
3917 (*o_putc)((c1 >> 8) & 0xff);
3919 (*o_putc)((c2 >> 8) & 0xff);
3920 (*o_putc)(c2 & 0xff);
3921 (*o_putc)((c1 >> 8) & 0xff);
3922 (*o_putc)(c1 & 0xff);
3929 nkf_char val = e2w_conv(c2, c1);
3930 c2 = (val >> 8) & 0xff;
3934 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: UTF-32 output converter.
 *  - output_bom_f is cleared and byte-order-mark bytes are written in the
 *    order selected by output_endian (only the 0xFF byte of each order is
 *    visible in this listing).
 *  - c2 == ISO8859_1 and CLASS_UNICODE capsules (NUMCHAR_OPTION) have their
 *    own branches; other pairs are mapped by e2w_conv().
 *  - The resulting value in c1 is written byte by byte, low byte first for
 *    ENDIAN_LITTLE and low byte last otherwise.
 */
3943 void w_oconv32(nkf_char c2, nkf_char c1)
3946 output_bom_f = FALSE;
3947 if (output_endian == ENDIAN_LITTLE){
3948 (*o_putc)((unsigned char)'\377');
3956 (*o_putc)((unsigned char)'\377');
3965 if (c2 == ISO8859_1) {
3967 #ifdef NUMCHAR_OPTION
3968 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3972 c1 = e2w_conv(c2, c1);
3975 if (output_endian == ENDIAN_LITTLE){
3976 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3977 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3978 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3982 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3983 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3984 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * e_oconv: EUC-JP output converter.
 *  - With NUMCHAR_OPTION, a CLASS_UNICODE capsule is first mapped by
 *    w16e_conv().  If it is still a capsule, values 0xE000..0xE757 are
 *    written as two bytes with the high bit set when x0212_f is set; other
 *    values are handed to encode_fallback when one is installed.
 *  - c2 == 0 selects ASCII mode; c2 == X0201 writes SSO followed by
 *    c1 | 0x80; c2 == ISO8859_1 writes c1 | 0x80.
 *  - G3 rows (is_eucg3(c2)) are, with SHIFTJIS_CP932, round-tripped through
 *    e2s_conv() and s2e_conv() before being written.
 *  - Any other pair must satisfy nkf_isgraph() for both bytes; otherwise
 *    set_iconv(FALSE, 0) is called and the character is dropped.  Valid
 *    pairs are written with the high bit set on both bytes.
 */
3989 void e_oconv(nkf_char c2, nkf_char c1)
3991 #ifdef NUMCHAR_OPTION
3992 if (c2 == 0 && is_unicode_capsule(c1)){
3993 w16e_conv(c1, &c2, &c1);
3994 if (c2 == 0 && is_unicode_capsule(c1)){
3995 c2 = c1 & VALUE_MASK;
3996 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4000 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4001 c1 = 0x21 + c1 % 94;
4004 (*o_putc)((c2 & 0x7f) | 0x080);
4005 (*o_putc)(c1 | 0x080);
4007 (*o_putc)((c2 & 0x7f) | 0x080);
4008 (*o_putc)(c1 | 0x080);
4012 if (encode_fallback) (*encode_fallback)(c1);
4021 } else if (c2 == 0) {
4022 output_mode = ASCII;
4024 } else if (c2 == X0201) {
4025 output_mode = JAPANESE_EUC;
4026 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4027 } else if (c2 == ISO8859_1) {
4028 output_mode = ISO8859_1;
4029 (*o_putc)(c1 | 0x080);
4031 } else if (is_eucg3(c2)){
4032 output_mode = JAPANESE_EUC;
4033 #ifdef SHIFTJIS_CP932
4036 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4037 s2e_conv(s2, s1, &c2, &c1);
4042 output_mode = ASCII;
4044 }else if (is_eucg3(c2)){
4047 (*o_putc)((c2 & 0x7f) | 0x080);
4048 (*o_putc)(c1 | 0x080);
4051 (*o_putc)((c2 & 0x7f) | 0x080);
4052 (*o_putc)(c1 | 0x080);
4056 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4057 set_iconv(FALSE, 0);
4058 return; /* too late to rescue this char */
4060 output_mode = JAPANESE_EUC;
4061 (*o_putc)(c2 | 0x080);
4062 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift: relocate row numbers 0x75..0x7f.
 * Two branches are visible: one moves the range to start at 0x109, the
 * other to start at 0x113.
 * NOTE(review): the conditions choosing between the two branches and the
 * return statement are not visible in this listing.
 */
4067 nkf_char x0212_shift(nkf_char c)
4072 if (0x75 <= c && c <= 0x7f){
4073 ret = c + (0x109 - 0x75);
4076 if (0x75 <= c && c <= 0x7f){
4077 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map shifted row numbers back to the 0x75.. range.
 * Rows 0x7f..0x88 become 0x75..0x7e; rows 0x89..0x92 become 0x75..0x7e
 * tagged with PREFIX_EUCG3 | 0x80.
 * NOTE(review): the default value of ret and the return statement are not
 * visible in this listing.
 */
4084 nkf_char x0212_unshift(nkf_char c)
4087 if (0x7f <= c && c <= 0x88){
4088 ret = c + (0x75 - 0x7f);
4089 }else if (0x89 <= c && c <= 0x92){
4090 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4094 #endif /* X0212_ENABLE */
/*
 * e2s_conv: convert the pair (c2, c1) to Shift_JIS bytes, stored through
 * p2/p1 when those pointers are non-null.
 *  - Rows ndx in 0x21..0x2F and 0x6E..0x7E have dedicated lead-byte
 *    formulas.
 *  - Other rows satisfying nkf_isgraph(ndx) are looked up in
 *    x0212_shiftjis[ndx - 0x21].
 *  - c2 may be remapped with x0212_shift() (X0212_ENABLE).
 *  - Rows above 0x7F are rejected with return value 1; otherwise the lead
 *    byte is ((c2 - 1) >> 1) plus 0x71 or 0xb1 and the trail byte offset
 *    depends on the parity of the row.
 * NOTE(review): how ndx is derived from c2 is not visible in this listing.
 */
4096 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4102 if((0x21 <= ndx && ndx <= 0x2F)){
4103 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4104 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4106 }else if(0x6E <= ndx && ndx <= 0x7E){
4107 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4108 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4114 else if(nkf_isgraph(ndx)){
4116 const unsigned short *ptr;
4117 ptr = x0212_shiftjis[ndx - 0x21];
4119 val = ptr[(c1 & 0x7f) - 0x21];
4128 c2 = x0212_shift(c2);
4130 #endif /* X0212_ENABLE */
4132 if(0x7F < c2) return 1;
4133 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4134 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s_oconv: Shift_JIS output converter.
 *  - With NUMCHAR_OPTION, a CLASS_UNICODE capsule is first mapped by
 *    w16e_conv().  If it is still a capsule, values 0xE000..0xE757 are
 *    turned into lead bytes from 0xF0 (188 cells each) unless x0213_f is
 *    set; other values are handed to encode_fallback when one is installed.
 *  - c2 == 0 selects ASCII mode, c2 == X0201 selects SHIFT_JIS mode and
 *    c2 == ISO8859_1 writes c1 | 0x80.
 *  - G3 rows go through e2s_conv().
 *  - Any other pair must satisfy nkf_isprint() for both bytes; otherwise
 *    set_iconv(FALSE, 0) is called and the character is dropped.  Valid
 *    pairs are converted by e2s_conv() and, with SHIFTJIS_CP932, may be
 *    remapped through cp932inv[].
 *  - When prefix_table has an entry for the trail byte, that entry is
 *    written through o_putc.
 */
4138 void s_oconv(nkf_char c2, nkf_char c1)
4140 #ifdef NUMCHAR_OPTION
4141 if (c2 == 0 && is_unicode_capsule(c1)){
4142 w16e_conv(c1, &c2, &c1);
4143 if (c2 == 0 && is_unicode_capsule(c1)){
4144 c2 = c1 & VALUE_MASK;
4145 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4148 c2 = c1 / 188 + 0xF0;
4150 c1 += 0x40 + (c1 > 0x3e);
4155 if(encode_fallback)(*encode_fallback)(c1);
4164 } else if (c2 == 0) {
4165 output_mode = ASCII;
4167 } else if (c2 == X0201) {
4168 output_mode = SHIFT_JIS;
4170 } else if (c2 == ISO8859_1) {
4171 output_mode = ISO8859_1;
4172 (*o_putc)(c1 | 0x080);
4174 } else if (is_eucg3(c2)){
4175 output_mode = SHIFT_JIS;
4176 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4182 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4183 set_iconv(FALSE, 0);
4184 return; /* too late to rescue this char */
4186 output_mode = SHIFT_JIS;
4187 e2s_conv(c2, c1, &c2, &c1);
4189 #ifdef SHIFTJIS_CP932
4191 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4192 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4198 #endif /* SHIFTJIS_CP932 */
4201 if (prefix_table[(unsigned char)c1]){
4202 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * j_oconv: 7-bit JIS output converter driven by output_mode.
 *  - With NUMCHAR_OPTION, a CLASS_UNICODE capsule is first mapped by
 *    w16e_conv().  If it is still a capsule, values 0xE000..0xE757 are
 *    turned into rows starting at 0x7F (94 cells each) when ms_ucs_map_f is
 *    set; other values are handed to encode_fallback when one is installed.
 *  - Each character class (G3 rows as X0213_2 or X0212, X0201, ISO8859_1,
 *    ASCII for c2 == 0, and X0213_1 or X0208 for two-byte rows) switches
 *    output_mode when it differs from the current one and writes the
 *    matching introducer bytes through o_putc.
 *  - Two-byte pairs outside the accepted row/cell range are dropped
 *    (plain return).
 * NOTE(review): many statements, including most escape-sequence output,
 * are not visible in this listing.
 */
4208 void j_oconv(nkf_char c2, nkf_char c1)
4210 #ifdef NUMCHAR_OPTION
4211 if (c2 == 0 && is_unicode_capsule(c1)){
4212 w16e_conv(c1, &c2, &c1);
4213 if (c2 == 0 && is_unicode_capsule(c1)){
4214 c2 = c1 & VALUE_MASK;
4215 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4218 c2 = 0x7F + c1 / 94;
4219 c1 = 0x21 + c1 % 94;
4221 if (encode_fallback) (*encode_fallback)(c1);
4228 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4231 (*o_putc)(ascii_intro);
4232 output_mode = ASCII;
4236 } else if (is_eucg3(c2)){
4238 if(output_mode!=X0213_2){
4239 output_mode = X0213_2;
4243 (*o_putc)(X0213_2&0x7F);
4246 if(output_mode!=X0212){
4247 output_mode = X0212;
4251 (*o_putc)(X0212&0x7F);
4254 (*o_putc)(c2 & 0x7f);
4257 } else if (c2==X0201) {
4258 if (output_mode!=X0201) {
4259 output_mode = X0201;
4265 } else if (c2==ISO8859_1) {
4266 /* iso8859 introduction, or 8th bit on */
4267 /* Can we convert in 7bit form using ESC-'-'-A ?
4269 output_mode = ISO8859_1;
4271 } else if (c2 == 0) {
4272 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4275 (*o_putc)(ascii_intro);
4276 output_mode = ASCII;
4281 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4282 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4284 if (output_mode!=X0213_1) {
4285 output_mode = X0213_1;
4289 (*o_putc)(X0213_1&0x7F);
4291 }else if (output_mode != X0208) {
4292 output_mode = X0208;
4295 (*o_putc)(kanji_intro);
4302 void base64_conv(nkf_char c2, nkf_char c1)
4304 mime_prechar(c2, c1);
4305 (*o_base64conv)(c2,c1);
4309 static nkf_char broken_buf[3];
4310 static int broken_counter = 0;
4311 static int broken_last = 0;
4312 nkf_char broken_getc(FILE *f)
4316 if (broken_counter>0) {
4317 return broken_buf[--broken_counter];
4320 if (c=='$' && broken_last != ESC
4321 && (input_mode==ASCII || input_mode==X0201)) {
4324 if (c1=='@'|| c1=='B') {
4325 broken_buf[0]=c1; broken_buf[1]=c;
4332 } else if (c=='(' && broken_last != ESC
4333 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4336 if (c1=='J'|| c1=='B') {
4337 broken_buf[0]=c1; broken_buf[1]=c;
4350 nkf_char broken_ungetc(nkf_char c, FILE *f)
4352 if (broken_counter<2)
4353 broken_buf[broken_counter++]=c;
4357 static nkf_char prev_cr = 0;
4359 void cr_conv(nkf_char c2, nkf_char c1)
4363 if (! (c2==0&&c1==NL) ) {
4369 } else if (c1=='\r') {
4371 } else if (c1=='\n') {
4372 if (crmode_f==CRLF) {
4373 (*o_crconv)(0,'\r');
4374 } else if (crmode_f==CR) {
4375 (*o_crconv)(0,'\r');
4379 } else if (c1!='\032' || crmode_f!=NL){
4385 Return value of fold_conv()
4387 \n add newline and output char
4388 \r add newline and output nothing
4391 1 (or else) normal output
4393 fold state in prev (previous character)
4395 >0x80 Japanese (X0208/X0201)
4400 This fold algorithm does not preserve leading space in a line.
4401 This is the main difference from fmt.
4404 #define char_size(c2,c1) (c2?2:1)
4406 void fold_conv(nkf_char c2, nkf_char c1)
4409 nkf_char fold_state;
4411 if (c1== '\r' && !fold_preserve_f) {
4412 fold_state=0; /* ignore cr */
4413 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4415 fold_state=0; /* ignore cr */
4416 } else if (c1== BS) {
4417 if (f_line>0) f_line--;
4419 } else if (c2==EOF && f_line != 0) { /* close open last line */
4421 } else if ((c1=='\n' && !fold_preserve_f)
4422 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4423 && fold_preserve_f)) {
4425 if (fold_preserve_f) {
4429 } else if ((f_prev == c1 && !fold_preserve_f)
4430 || (f_prev == '\n' && fold_preserve_f)
4431 ) { /* duplicate newline */
4434 fold_state = '\n'; /* output two newline */
4440 if (f_prev&0x80) { /* Japanese? */
4442 fold_state = 0; /* ignore given single newline */
4443 } else if (f_prev==' ') {
4447 if (++f_line<=fold_len)
4451 fold_state = '\r'; /* fold and output nothing */
4455 } else if (c1=='\f') {
4458 fold_state = '\n'; /* output newline and clear */
4459 } else if ( (c2==0 && c1==' ')||
4460 (c2==0 && c1=='\t')||
4461 (c2=='!'&& c1=='!')) {
4462 /* X0208 kankaku or ascii space */
4463 if (f_prev == ' ') {
4464 fold_state = 0; /* remove duplicate spaces */
4467 if (++f_line<=fold_len)
4468 fold_state = ' '; /* output ASCII space only */
4470 f_prev = ' '; f_line = 0;
4471 fold_state = '\r'; /* fold and output nothing */
4475 prev0 = f_prev; /* we still need this one... , but almost done */
4477 if (c2 || c2==X0201)
4478 f_prev |= 0x80; /* this is Japanese */
4479 f_line += char_size(c2,c1);
4480 if (f_line<=fold_len) { /* normal case */
4483 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4484 f_line = char_size(c2,c1);
4485 fold_state = '\n'; /* We can't wait, do fold now */
4486 } else if (c2==X0201) {
4487 /* simple kinsoku rules return 1 means no folding */
4488 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4489 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4490 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4491 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4492 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4493 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4494 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4496 fold_state = '\n';/* add one new f_line before this character */
4499 fold_state = '\n';/* add one new f_line before this character */
4502 /* kinsoku point in ASCII */
4503 if ( c1==')'|| /* { [ ( */
4514 /* just after special */
4515 } else if (!is_alnum(prev0)) {
4516 f_line = char_size(c2,c1);
4518 } else if ((prev0==' ') || /* ignored new f_line */
4519 (prev0=='\n')|| /* ignored new f_line */
4520 (prev0&0x80)) { /* X0208 - ASCII */
4521 f_line = char_size(c2,c1);
4522 fold_state = '\n';/* add one new f_line before this character */
4524 fold_state = 1; /* default no fold in ASCII */
4528 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4529 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4530 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4531 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4532 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4533 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4534 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4535 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4536 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4537 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4538 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4539 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4540 /* default no fold in kinsoku */
4543 f_line = char_size(c2,c1);
4544 /* add one new f_line before this character */
4547 f_line = char_size(c2,c1);
4549 /* add one new f_line before this character */
4554 /* terminator process */
4555 switch(fold_state) {
4574 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: width/character-class conversion filter in front of o_zconv.
 *  - With x0201_f, an X0201 character that has a voiced (dv) or semi-voiced
 *    (ev) combination is held in z_prev1/z_prev2 until the next character
 *    shows whether a dakuten (0xde) or handakuten (0xdf) follows; the
 *    combined, or plain (cv), replacement is then sent to o_zconv.
 *  - With alpha_f, JIS X 0208 row 0x23 (alphabet) and row 0x21 (symbols,
 *    looked up through fv) are converted; the '>', '<', '"' and '&' results
 *    are written as the HTML character references &gt; &lt; &quot; &amp;,
 *    one character at a time through o_zconv.
 * NOTE(review): several statements and conditions are not visible in this
 * listing.
 */
4576 void z_conv(nkf_char c2, nkf_char c1)
4579 /* if (c2) c1 &= 0x7f; assertion */
4581 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4585 if (x0201_f && z_prev2==X0201) { /* X0201 */
4586 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark) */
4588 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4590 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) */
4592 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4596 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4605 if (x0201_f && c2==X0201) {
4606 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4607 /* wait for dakuten or handakuten */
4608 z_prev1 = c1; z_prev2 = c2;
4611 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4616 /* JISX0208 Alphabet */
4617 if (alpha_f && c2 == 0x23 ) {
4619 } else if (alpha_f && c2 == 0x21 ) {
4620 /* JISX0208 Kigou */
4625 } else if (alpha_f&0x4) {
4630 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4636 case '>': entity = "&gt;"; break;
4637 case '<': entity = "&lt;"; break;
4638 case '\"': entity = "&quot;"; break;
4639 case '&': entity = "&amp;"; break;
4642 while (*entity) (*o_zconv)(0, *entity++);
/*
 * rot13(c): letters up to 'M' / 'm' are moved forward by 13, letters up to
 * 'Z' / 'z' are moved back by 13.
 * NOTE(review): the lower-bound tests and the fall-through value are not
 * visible in this listing; the argument is evaluated more than once, so it
 * must be free of side effects.
 */
4652 #define rot13(c) ( \
4654 (c <= 'M') ? (c + 13): \
4655 (c <= 'Z') ? (c - 13): \
4657 (c <= 'm') ? (c + 13): \
4658 (c <= 'z') ? (c - 13): \
/*
 * rot47(c): values up to 'O' are moved forward by 47, values up to '~' are
 * moved back by 47.
 * NOTE(review): the lower-bound test and the fall-through value are not
 * visible in this listing; the argument is evaluated more than once, so it
 * must be free of side effects.
 */
4662 #define rot47(c) ( \
4664 ( c <= 'O' ) ? (c + 47) : \
4665 ( c <= '~' ) ? (c - 47) : \
/*
 * rot_conv: rotation filter in front of o_rot_conv.
 * Single-byte classes (c2 == 0, X0201, ISO8859_1) take one branch, other
 * characters the other; the result is forwarded to o_rot_conv.
 * NOTE(review): the statements applying the rotation are not visible in
 * this listing.
 */
4669 void rot_conv(nkf_char c2, nkf_char c1)
4671 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4677 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: kana conversion filter in front of o_hira_conv.
 * Visible cases:
 *  - cells 0x21..0x73 of one kana row are converted and forwarded;
 *  - cell 0x74 is replaced by CLASS_UNICODE | 0x3094 when output_conv is
 *    w_oconv or w_oconv16;
 *  - row 0x21 cells 0x33/0x34, and in the opposite direction 0x35/0x36,
 *    are handled as well;
 *  - in the opposite direction, CLASS_UNICODE | 0x3094 and row 0x24 cells
 *    0x21..0x73 are converted.
 * Everything else is forwarded unchanged by the final call.
 * NOTE(review): the option tests that select the direction and the actual
 * row rewrites are not visible in this listing.
 */
4680 void hira_conv(nkf_char c2, nkf_char c1)
4684 if (0x20 < c1 && c1 < 0x74) {
4686 (*o_hira_conv)(c2,c1);
4688 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4690 c1 = CLASS_UNICODE | 0x3094;
4691 (*o_hira_conv)(c2,c1);
4694 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4696 (*o_hira_conv)(c2,c1);
4701 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4704 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4706 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4710 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: filter in front of o_iso2022jp_check_conv.
 * Three kinds of input are singled out before forwarding:
 *  - c2 in 0x00..0x20 together with c1 in 0x7f..0xff;
 *  - rows 0x29..0x2f and 0x75..0x7e;
 *  - codes c that fall inside one of the [start, end] pairs of range[].
 * NOTE(review): what is done with the singled-out characters, and how c
 * and end are computed, is not visible in this listing.
 */
4714 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4716 static const nkf_char range[RANGE_NUM_MAX][2] = {
4737 nkf_char start, end, c;
4739 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4743 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4748 for (i = 0; i < RANGE_NUM_MAX; i++) {
4749 start = range[i][0];
4752 if (c >= start && c <= end) {
4757 (*o_iso2022jp_check_conv)(c2,c1);
4761 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * mime_pattern: encoded-word prefixes recognised by mime_begin_strict().
 * "\075" is '='.  mime_begin_strict() uses the same index for
 * mime_pattern[] and mime_priority_func[], so the order of the entries
 * matters.
 */
4763 const unsigned char *mime_pattern[] = {
4764 (const unsigned char *)"\075?EUC-JP?B?",
4765 (const unsigned char *)"\075?SHIFT_JIS?B?",
4766 (const unsigned char *)"\075?ISO-8859-1?Q?",
4767 (const unsigned char *)"\075?ISO-8859-1?B?",
4768 (const unsigned char *)"\075?ISO-2022-JP?B?",
4769 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4770 #if defined(UTF8_INPUT_ENABLE)
4771 (const unsigned char *)"\075?UTF-8?B?",
4772 (const unsigned char *)"\075?UTF-8?Q?",
4774 (const unsigned char *)"\075?US-ASCII?Q?",
4779 /* marker used to raise the priority of the matching input code */
/* Indexed like mime_pattern[]; a 0 entry means no converter is associated. */
4780 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4781 e_iconv, s_iconv, 0, 0, 0, 0,
4782 #if defined(UTF8_INPUT_ENABLE)
/*
 * mime_encode: one code constant per entry.
 * NOTE(review): the entries line up one-to-one with mime_pattern[]
 * (EUC-JP, SHIFT_JIS, ISO-8859-1 x2, ISO-2022-JP x2, ...) -- presumably a
 * parallel table; confirm against the users of this array.
 */
4788 const nkf_char mime_encode[] = {
4789 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4790 #if defined(UTF8_INPUT_ENABLE)
/*
 * mime_encode_method: 'B' or 'Q' per entry.
 * NOTE(review): the letters match the encoding letter of the corresponding
 * mime_pattern[] entry -- presumably a parallel table; confirm against the
 * users of this array.
 */
4797 const nkf_char mime_encode_method[] = {
4798 'B', 'B','Q', 'B', 'B', 'Q',
4799 #if defined(UTF8_INPUT_ENABLE)
4807 #define MAXRECOVER 20
/*
 * switch_mime_getc: install mime_getc/mime_ungetc as the current input
 * functions, saving the previous ones in i_mgetc/i_mungetc.  Nothing is
 * done when mime_getc is already installed.  In STRICT_MIME mode a second
 * layer is inserted: the saved functions move to i_mgetc_buf/i_mungetc_buf
 * and mime_getc_buf/mime_ungetc_buf take their place.
 */
4809 void switch_mime_getc(void)
4811 if (i_getc!=mime_getc) {
4812 i_mgetc = i_getc; i_getc = mime_getc;
4813 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4814 if(mime_f==STRICT_MIME) {
4815 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4816 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc: undo switch_mime_getc().
 * In STRICT_MIME mode the buffered layer is removed first; then the saved
 * unget function is restored, and the converter remembered in
 * mime_iconv_back (if any) is re-installed with set_iconv() before the
 * pointer is cleared.
 * NOTE(review): the line restoring i_getc is not visible in this listing.
 */
4821 void unswitch_mime_getc(void)
4823 if(mime_f==STRICT_MIME) {
4824 i_mgetc = i_mgetc_buf;
4825 i_mungetc = i_mungetc_buf;
4828 i_ungetc = i_mungetc;
4829 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4830 mime_iconv_back = NULL;
/*
 * mime_begin_strict: called after "=?" has been read; matches the rest of
 * the encoded-word prefix against mime_pattern[].
 * Characters are read with i_getc, remembered in r[] and compared
 * case-insensitively (nkf_toupper) with the current pattern p.  On a
 * mismatch the following patterns are scanned for one that shares the
 * prefix read so far and accepts the current character.  When a pattern
 * matches completely, mime_decode_mode is taken from the pattern (its
 * encoding letter), the current converter is saved in mime_iconv_back,
 * the converter from mime_priority_func[j] is installed, and for 'B'
 * encoding the result of mime_integrity() is returned.
 * NOTE(review): the recovery path taken when every pattern fails and the
 * remaining return statements are not visible in this listing.
 */
4833 nkf_char mime_begin_strict(FILE *f)
4837 const unsigned char *p,*q;
4838 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4840 mime_decode_mode = FALSE;
4841 /* =? has been checked */
4843 p = mime_pattern[j];
4846 for(i=2;p[i]>' ';i++) { /* start at =? */
4847 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4848 /* pattern fails, try next one */
4850 while (mime_pattern[++j]) {
4851 p = mime_pattern[j];
4852 for(k=2;k<i;k++) /* assume length(p) > i */
4853 if (p[k]!=q[k]) break;
4854 if (k==i && nkf_toupper(c1)==p[k]) break;
4856 p = mime_pattern[j];
4857 if (p) continue; /* found next one, continue */
4858 /* all fails, output from recovery buffer */
/* p[i-2] is the encoding letter that precedes the final '?' of the pattern */
4866 mime_decode_mode = p[i-2];
4868 mime_iconv_back = iconv;
4869 set_iconv(FALSE, mime_priority_func[j]);
4870 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4872 if (mime_decode_mode=='B') {
4873 mimebuf_f = unbuf_f;
4875 /* do MIME integrity check */
4876 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: read one character either from the underlying input
 * (i_mgetc_buf, when mimebuf_f is set) or from the MIME FIFO at
 * mime_input, which is advanced.
 */
4884 nkf_char mime_getc_buf(FILE *f)
4886 /* we don't keep eof of Fifo, because it contains ?= as
4887 a terminator. It was checked in mime_integrity. */
4888 return ((mimebuf_f)?
4889 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf: push c back either to the underlying input
 * (i_mungetc_buf) or into the MIME FIFO by stepping mime_input back and
 * storing the byte there.
 * NOTE(review): the condition choosing between the two (presumably
 * mimebuf_f, as in mime_getc_buf) and the return statement are not visible
 * in this listing.
 */
4892 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4895 (*i_mungetc_buf)(c,f);
4897 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict detection of an encoded-word header after "=?".
 * Every character read is also appended to the MIME FIFO so the text can
 * be replayed if the header turns out not to be MIME.  Up to MAXRECOVER
 * characters are scanned: charset characters (alphanumerics, '-', '_',
 * spaces and line breaks) are skipped, then the encoding letter selects
 * mime_decode_mode 'B' or 'Q'.  If no valid header is found,
 * mime_decode_mode is set to 1 so that input is re-read from the buffer
 * without decoding.  Returns the last character read (used by the caller
 * only to detect EOF).
 */
4901 nkf_char mime_begin(FILE *f)
4906 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4907 /* re-read and convert again from mime_buffer. */
4909 /* =? has been checked */
4911 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4912 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4913 /* We accept any character type even if it is broken by new lines */
4914 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4915 if (c1=='\n'||c1==' '||c1=='\r'||
4916 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4918 /* Failed. But this could be another MIME preamble */
4926 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4927 if (!(++i<MAXRECOVER) || c1==EOF) break;
4928 if (c1=='b'||c1=='B') {
4929 mime_decode_mode = 'B';
4930 } else if (c1=='q'||c1=='Q') {
4931 mime_decode_mode = 'Q';
4935 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4936 if (!(++i<MAXRECOVER) || c1==EOF) break;
4938 mime_decode_mode = FALSE;
4944 if (!mime_decode_mode) {
4945 /* false MIME preamble, restart from mime_buffer */
4946 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4947 /* Since we are in MIME mode until buffer becomes empty, */
4948 /* we never go into mime_begin again for a while. */
4951 /* discard mime preamble, and goto MIME mode */
4953 /* do no MIME integrity check */
4954 return c1; /* used only for checking EOF */
/*
 * no_putc: output function with the same signature as the other putc
 * hooks.
 * NOTE(review): the body is not visible in this listing -- presumably it
 * discards c; confirm.
 */
4958 void no_putc(nkf_char c)
/*
 * debug: write str followed by a newline to stderr.
 * NOTE(review): any condition guarding the output is not visible in this
 * listing.
 */
4963 void debug(const char *str)
4966 fprintf(stderr, "%s\n", str);
/*
 * set_input_codename: record codename as the detected input code name.
 * When codename is non-empty and differs from the name recorded before,
 * is_inputcode_mixed is set (the first part of that condition is not
 * visible in this listing).  input_codename keeps the pointer itself, not
 * a copy, and is_inputcode_set is raised.
 */
4971 void set_input_codename(char *codename)
4975 strcmp(codename, "") != 0 &&
4976 strcmp(codename, input_codename) != 0)
4978 is_inputcode_mixed = TRUE;
4980 input_codename = codename;
4981 is_inputcode_set = TRUE;
4984 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code: print the guessed input code name to stdout,
 * prefixed by "filename:" when filename is non-null and followed by the
 * line-ending style ("CR", "LF" or "CRLF", from crmode_f) in parentheses
 * when one is known.  The name stays "BINARY" when the input codes were
 * mixed; otherwise input_codename is used unless it is empty (the
 * replacement for the empty case is not visible in this listing).
 */
4985 void print_guessed_code(char *filename)
4987 char *codename = "BINARY";
4988 char *str_crmode = NULL;
4989 if (!is_inputcode_mixed) {
4990 if (strcmp(input_codename, "") == 0) {
4993 codename = input_codename;
4995 if (crmode_f == CR) str_crmode = "CR";
4996 else if (crmode_f == NL) str_crmode = "LF";
4997 else if (crmode_f == CRLF) str_crmode = "CRLF";
4999 if (filename != NULL) printf("%s:", filename);
5000 if (str_crmode != NULL) printf("%s (%s)\n", codename, str_crmode);
5001 else printf("%s\n", codename);
/*
 * hex_getc: shared reader for two-digit hexadecimal escapes.
 *   ch - introducer character (':' from cap_getc, '%' from url_getc)
 *   g  - underlying getc function
 *   u  - underlying ungetc function
 * When both characters after the introducer are hexadecimal digits, the
 * decoded byte (hex2bin(c2) << 4) | hex2bin(c3) is returned.
 * NOTE(review): the reads, the comparison with ch and the push-back done
 * when a digit check fails are not visible in this listing.
 */
5007 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5009 nkf_char c1, c2, c3;
5015 if (!nkf_isxdigit(c2)){
5020 if (!nkf_isxdigit(c3)){
5025 return (hex2bin(c2) << 4) | hex2bin(c3);
/* cap_getc: read through hex_getc() with ':' as the escape introducer, on top of i_cgetc/i_cungetc. */
5028 nkf_char cap_getc(FILE *f)
5030 return hex_getc(':', f, i_cgetc, i_cungetc);
/* cap_ungetc: forward the push-back of c to the underlying i_cungetc. */
5033 nkf_char cap_ungetc(nkf_char c, FILE *f)
5035 return (*i_cungetc)(c, f);
/* url_getc: read through hex_getc() with '%' as the escape introducer, on top of i_ugetc/i_uungetc. */
5038 nkf_char url_getc(FILE *f)
5040 return hex_getc('%', f, i_ugetc, i_uungetc);
/* url_ungetc: forward the push-back of c to the underlying i_uungetc. */
5043 nkf_char url_ungetc(nkf_char c, FILE *f)
5045 return (*i_uungetc)(c, f);
5049 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: decode a numeric character reference read through
 * i_ngetc/i_nungetc and return it tagged with CLASS_UNICODE.
 * After an 'x' or 'X' up to 7 hexadecimal digits are accepted; otherwise
 * up to 8 decimal digits.  Digits are collected in buf[] and accumulated
 * into c with hex2bin().
 * NOTE(review): the introducer check, the shift/multiply applied to c
 * between digits, the terminator check and the push-back path for
 * malformed input are not visible in this listing.
 */
5050 nkf_char numchar_getc(FILE *f)
5052 nkf_char (*g)(FILE *) = i_ngetc;
5053 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5064 if (buf[i] == 'x' || buf[i] == 'X'){
5065 for (j = 0; j < 7; j++){
5067 if (!nkf_isxdigit(buf[i])){
5074 c |= hex2bin(buf[i]);
5077 for (j = 0; j < 8; j++){
5081 if (!nkf_isdigit(buf[i])){
5088 c += hex2bin(buf[i]);
5094 return CLASS_UNICODE | c;
/* Push c back by delegating to the i_nungetc hook; returns its result. */
5103 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5105 return (*i_nungetc)(c, f);
5109 #ifdef UNICODE_NORMALIZATION
5111 /* Normalization Form C */
/*
 * Reader that composes decomposed sequences, reading through the
 * i_nfc_getc / i_nfc_ungetc hooks.
 *
 * The bytes collected in buf are looked up in normalization_table with a
 * binary search over the .nfd sequences (lower/upper/j); each comparison
 * walks the candidate .nfd array against buf and narrows the range on the
 * first differing element.  When an entry matches, buf is overwritten
 * with that entry's .nfc sequence.
 * The outer loop only runs while buf[i] is not a 10xxxxxx byte
 * ((buf[i] & 0xc0) != 0x80).
 * NOTE(review): the lines that fill buf from the stream, detect a full
 * match, and return or unget characters are missing from this listing.
 */
5112 nkf_char nfc_getc(FILE *f)
5114 nkf_char (*g)(FILE *f) = i_nfc_getc;
5115 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5116 int i=0, j, k=1, lower, upper;
5118 const nkf_nfchar *array;
5121 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5122 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5123 while (upper >= lower) {
5124 j = (lower+upper) / 2;
5125 array = normalization_table[j].nfd;
5126 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5127 if (array[k] != buf[k]){
5128 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5135 array = normalization_table[j].nfc;
5136 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5137 buf[i] = (nkf_char)(array[i]);
/* Push c back by delegating to the i_nfc_ungetc hook; returns its result. */
5148 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5150 return (*i_nfc_ungetc)(c, f);
5152 #endif /* UNICODE_NORMALIZATION */
/*
 * Body of the MIME-decoding reader.
 * NOTE(review): the signature line is missing from this listing;
 * presumably this is mime_getc(FILE *f) -- confirm.
 *
 * Visible behaviour, in order:
 *  - If the FIFO holds data (mime_top != mime_last) the next buffered
 *    byte is returned.
 *  - If mime_decode_mode is 1 or FALSE, decoding is switched off
 *    (unswitch_mime_getc) and a raw character from i_getc is returned.
 *  - 'Q' mode: '_' is returned as a space outside FIXED_MIME; "?=" ends
 *    the encoded word, after which following whitespace is collected in
 *    a malloc'ed lwsp_buf and may be pushed back with i_ungetc; '='
 *    followed by a control character is treated as a soft line wrap;
 *    otherwise "=XY" is returned as (hex2bin(X)<<4) + hex2bin(Y).
 *  - Any mode other than 'B' falls back to returning a raw character.
 *  - 'B' mode: four characters c1..c4 are read (characters <= ' ' are
 *    handled specially, with a retry outside STRICT_MIME), "?=" ends the
 *    encoded word as above, and the decoded bytes are appended to the
 *    FIFO, from which the first one is returned.
 * NOTE(review): many lines (labels, braces, else-branches, frees and
 * returns) are missing from this listing, so the control flow between
 * the visible statements cannot be fully established here.
 */
5158 nkf_char c1, c2, c3, c4, cc;
5159 nkf_char t1, t2, t3, t4, mode, exit_mode;
5160 nkf_char lwsp_count;
5163 nkf_char lwsp_size = 128;
5165 if (mime_top != mime_last) { /* Something is in FIFO */
5166 return Fifo(mime_top++);
5168 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5169 mime_decode_mode=FALSE;
5170 unswitch_mime_getc();
5171 return (*i_getc)(f);
5174 if (mimebuf_f == FIXED_MIME)
5175 exit_mode = mime_decode_mode;
5178 if (mime_decode_mode == 'Q') {
5179 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5181 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
5182 if (c1<=' ' || DEL<=c1) {
5183 mime_decode_mode = exit_mode; /* prepare for quit */
5186 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5190 mime_decode_mode = exit_mode; /* prepare for quit */
5191 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5192 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5193 /* end Q encoding */
5194 input_mode = exit_mode;
5196 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5197 if (lwsp_buf==NULL) {
5198 perror("can't malloc");
5201 while ((c1=(*i_getc)(f))!=EOF) {
5206 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5214 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5215 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5230 lwsp_buf[lwsp_count] = (unsigned char)c1;
5231 if (lwsp_count++>lwsp_size){
5233 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5234 if (lwsp_buf_new==NULL) {
5236 perror("can't realloc");
5239 lwsp_buf = lwsp_buf_new;
5245 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5247 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5248 i_ungetc(lwsp_buf[lwsp_count],f);
5254 if (c1=='='&&c2<' ') { /* this is soft wrap */
/*
 * NOTE(review): the loop condition reads a character into c1 and the loop
 * body immediately reads another one, so the character fetched by the
 * condition is never tested for EOF and is overwritten.  Because EOF is
 * negative, an EOF returned to the condition also satisfies "<= ' '" and
 * re-enters the loop.  Testing the already-read c1 for EOF in the body
 * looks like the intent -- confirm before changing.
 */
5255 while((c1 = (*i_mgetc)(f)) <=' ') {
5256 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5258 mime_decode_mode = 'Q'; /* still in MIME */
5259 goto restart_mime_q;
5262 mime_decode_mode = 'Q'; /* still in MIME */
5266 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5267 if (c2<=' ') return c2;
5268 mime_decode_mode = 'Q'; /* still in MIME */
5269 return ((hex2bin(c2)<<4) + hex2bin(c3));
5272 if (mime_decode_mode != 'B') {
5273 mime_decode_mode = FALSE;
5274 return (*i_mgetc)(f);
5278 /* Base64 encoding */
5280 MIME allows line break in the middle of
5281 Base64, but we are very pessimistic in decoding
5282 in unbuf mode because MIME encoded code may broken by
5283 less or editor's control sequence (such as ESC-[-K in unbuffered
5284 mode. ignore incomplete MIME.
5286 mode = mime_decode_mode;
5287 mime_decode_mode = exit_mode; /* prepare for quit */
5289 while ((c1 = (*i_mgetc)(f))<=' ') {
5294 if ((c2 = (*i_mgetc)(f))<=' ') {
5297 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5298 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5301 if ((c1 == '?') && (c2 == '=')) {
5304 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5305 if (lwsp_buf==NULL) {
5306 perror("can't malloc");
5309 while ((c1=(*i_getc)(f))!=EOF) {
5314 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5322 if ((c1=(*i_getc)(f))!=EOF) {
5326 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5341 lwsp_buf[lwsp_count] = (unsigned char)c1;
5342 if (lwsp_count++>lwsp_size){
5344 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5345 if (lwsp_buf_new==NULL) {
5347 perror("can't realloc");
5350 lwsp_buf = lwsp_buf_new;
5356 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5358 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5359 i_ungetc(lwsp_buf[lwsp_count],f);
5366 if ((c3 = (*i_mgetc)(f))<=' ') {
5369 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5370 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5374 if ((c4 = (*i_mgetc)(f))<=' ') {
5377 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5378 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5382 mime_decode_mode = mode; /* still in MIME sigh... */
5384 /* BASE 64 decoding */
5386 t1 = 0x3f & base64decode(c1);
5387 t2 = 0x3f & base64decode(c2);
5388 t3 = 0x3f & base64decode(c3);
5389 t4 = 0x3f & base64decode(c4);
5390 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5392 Fifo(mime_last++) = (unsigned char)cc;
5393 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5395 Fifo(mime_last++) = (unsigned char)cc;
5396 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5398 Fifo(mime_last++) = (unsigned char)cc;
5403 return Fifo(mime_top++);
/*
 * Push c back onto the MIME FIFO: mime_top is decremented first and the
 * byte is stored at the new front position.
 * NOTE(review): the return statement is missing from this listing; f is
 * not used by the visible line.
 */
5406 nkf_char mime_ungetc(nkf_char c, FILE *f)
5408 Fifo(--mime_top) = (unsigned char)c;
/*
 * Pre-read an encoded word into the MIME FIFO so that its completeness
 * can be checked before decoding starts.
 *
 * f: input stream, read through the i_getc hook.
 * p: NUL-terminated header pattern that has already been consumed; it is
 *    copied into the FIFO first, starting at mime_top.
 *
 * Characters are then buffered until one of the following: the buffer
 * is full ((mime_input - mime_top) & MIME_BUF_MASK becomes 0), the pair
 * tested by (c == '=' && d == '?') is seen, a character outside
 * '+', '/', '=', '?' and alphanumerics arrives, or EOF.  On the
 * incomplete path the buffered bytes are exposed through mime_last,
 * mime_decode_mode is set to 1 (no decoding) and switch_mime_getc() is
 * called so that the bytes are still delivered from the FIFO.
 * NOTE(review): d presumably holds the previously read character; its
 * update, the return statements and several branch bodies are missing
 * from this listing.
 */
5412 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5416 /* In buffered mode, read until =? or NL or buffer full
5418 mime_input = mime_top;
5419 mime_last = mime_top;
5421 while(*p) Fifo(mime_input++) = *p++;
5424 while((c=(*i_getc)(f))!=EOF) {
5425 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5426 break; /* buffer full */
5428 if (c=='=' && d=='?') {
5429 /* checked. skip header, start decode */
5430 Fifo(mime_input++) = (unsigned char)c;
5431 /* mime_last_input = mime_input; */
5436 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5438 /* Should we check length mod 4? */
5439 Fifo(mime_input++) = (unsigned char)c;
5442 /* In case of Incomplete MIME, no MIME decode */
5443 Fifo(mime_input++) = (unsigned char)c;
5444 mime_last = mime_input; /* point undecoded buffer */
5445 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5446 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Map one base64 alphabet character to its 6-bit value.
 *
 * The offsets are written as character constants whose numeric values,
 * per the inline comments, equal the intended numbers: 'G' stands for
 * 'a' - 26, '4' for 52, '>' for 62 and '?' for 63.  This relies on an
 * ASCII-compatible execution character set.
 * NOTE(review): the range tests selecting the first two branches, the
 * final else, and the return statement are missing from this listing.
 */
5450 nkf_char base64decode(nkf_char c)
5455 i = c - 'A'; /* A..Z 0-25 */
5457 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5459 } else if (c > '/') {
5460 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5461 } else if (c == '+') {
5462 i = '>' /* 62 */ ; /* + 62 */
5464 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet, indexed by 6-bit value (0..63). */
5469 static const char basis_64[] =
5470 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/*
 * Carry byte of the base64 encoder: its low bits are combined with the
 * next input byte in mimeout_addchar and flushed in close_mime.
 */
5472 static nkf_char b64c;
/* Capacity of the MIME output look-ahead buffer (one extra slot is allocated). */
5473 #define MIMEOUT_BUF_LENGTH (60)
5474 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5475 int mimeout_buf_count = 0;
/*
 * When non-zero, open_mime does not emit the buffered leading whitespace
 * raw; open_mime resets the flag to FALSE.
 */
5476 int mimeout_preserve_space = 0;
/*
 * Convert a value 0..15 to an uppercase hex digit character.
 * NOTE(review): the argument is not parenthesized in the expansion and is
 * evaluated more than once; pass a fully parenthesized, side-effect-free
 * expression (as the visible call sites do).
 */
5477 #define itoh4(c) (c>=10?c+'A'-10:c+'0')
/*
 * Begin a MIME encoded word for the output encoding given by mode.
 *
 * The header pattern is chosen by scanning mime_pattern for the index
 * whose mime_encode entry equals mode (mime_pattern[0] is the default),
 * and mimeout_mode is set from mime_encode_method at that index.
 * Whitespace held at the front of mimeout_buf (SPACE, TAB, CR, NL) is
 * written out raw through o_mputc unless mimeout_preserve_space is set;
 * the flag is then cleared.  Finally the buffered characters are fed
 * back through mime_putc after mimeout_buf_count has been reset to 0.
 * The base64_count > 45 test guards a branch that emits a leading blank
 * from the buffer.
 * NOTE(review): the lines that write the pattern itself, fold the line
 * and advance i/base64_count are missing from this listing.
 */
5479 void open_mime(nkf_char mode)
5481 const unsigned char *p;
5484 p = mime_pattern[0];
5485 for(i=0;mime_pattern[i];i++) {
5486 if (mode == mime_encode[i]) {
5487 p = mime_pattern[i];
5491 mimeout_mode = mime_encode_method[i];
5494 if (base64_count>45) {
5495 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5496 (*o_mputc)(mimeout_buf[i]);
5502 if (!mimeout_preserve_space && mimeout_buf_count>0
5503 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5504 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5508 if (!mimeout_preserve_space) {
5509 for (;i<mimeout_buf_count;i++) {
5510 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5511 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5512 (*o_mputc)(mimeout_buf[i]);
5519 mimeout_preserve_space = FALSE;
5525 j = mimeout_buf_count;
5526 mimeout_buf_count = 0;
5528 mime_putc(mimeout_buf[i]);
/*
 * Finish the current MIME encoded word.
 *
 * Depending on mimeout_mode, the bits still pending in b64c are flushed
 * as one final base64 symbol: either the low 2 bits shifted left by 4,
 * or the low 4 bits shifted left by 2.
 * NOTE(review): the case labels, padding characters, the trailer output
 * and the bodies of the FIXED_MIME / 'Q' tests are missing from this
 * listing.
 */
5532 void close_mime(void)
5542 switch(mimeout_mode) {
5547 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5553 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5559 if (mimeout_f!=FIXED_MIME) {
5561 } else if (mimeout_mode != 'Q')
/*
 * Emit one character c through o_mputc in the encoding selected by
 * mimeout_mode.
 *
 * Visible paths:
 *  - a non-alphanumeric character is written as two uppercase hex digits
 *    (high nibble, then low nibble) via itoh4;
 *  - base64 output writes basis_64 symbols: the top 6 bits of c; then the
 *    low 2 bits of b64c joined with the high 4 bits of c; then the low
 *    4 bits of b64c joined with the top 2 bits of c, followed by the low
 *    6 bits of c.
 * NOTE(review): the case labels, the escape prefix written before the
 * hex digits, and the updates of b64c, mimeout_mode and base64_count are
 * missing from this listing.
 */
5566 void mimeout_addchar(nkf_char c)
5568 switch(mimeout_mode) {
5573 } else if(!nkf_isalnum(c)) {
5575 (*o_mputc)(itoh4(((c>>4)&0xf)));
5576 (*o_mputc)(itoh4((c&0xf)));
5585 (*o_mputc)(basis_64[c>>2]);
5590 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5596 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5597 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_lastchar2 / mime_lastchar1 remember the (c2, c1) pair passed to
 * the previous mime_prechar call.
 *
 * mime_prechar(c2, c1) is called with the next character pair before it
 * is encoded.  When base64_count plus the encoded size of the buffered
 * characters (mimeout_buf_count/3*4) exceeds 66, the encoded word is
 * ended by sending EOF, then NL and SPACE, through o_base64conv.
 * Separately, a SPACE is sent when c2 is non-zero while the previous
 * pair had c2 == 0 and a non-space c1.  The pair is then stored for the
 * next call.
 * NOTE(review): the conditions enclosing these statements and the end of
 * the commented-out else-branch are missing from this listing.
 */
5608 nkf_char mime_lastchar2, mime_lastchar1;
5610 void mime_prechar(nkf_char c2, nkf_char c1)
5614 if (base64_count + mimeout_buf_count/3*4> 66){
5615 (*o_base64conv)(EOF,0);
5616 (*o_base64conv)(0,NL);
5617 (*o_base64conv)(0,SPACE);
5619 }/*else if (mime_lastchar2){
5620 if (c1 <=DEL && !nkf_isspace(c1)){
5621 (*o_base64conv)(0,SPACE);
5625 if (c2 && mime_lastchar2 == 0
5626 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5627 (*o_base64conv)(0,SPACE);
5630 mime_lastchar2 = c2;
5631 mime_lastchar1 = c1;
/*
 * Output-side MIME encoder entry point: receives one output character c
 * (or EOF) and decides whether it is written raw through o_mputc, held
 * in mimeout_buf, or encoded through mimeout_addchar, opening an encoded
 * word with open_mime(output_mode) when needed.
 *
 * Visible structure:
 *  - mimeout_f == FIXED_MIME: line length is checked against
 *    base64_count > 71, separately for 'Q' mode and otherwise; EOF and
 *    non-EOF are handled in separate branches.
 *  - otherwise, on EOF: the buffer is drained (mimeout_buf_count is
 *    zeroed first) and its contents passed to mimeout_addchar.
 *  - mimeout_mode == 'Q': characters <= DEL in ASCII / ISO8859_1 output
 *    are classified as line break, other control/space, or printable.
 *  - !mimeout_mode (no encoded word open): ASCII characters are buffered;
 *    the buffer is flushed raw on line breaks, when the line would exceed
 *    76 columns, or before open_mime is called on overflow of
 *    MIMEOUT_BUF_LENGTH; a non-ASCII character flushes the buffer
 *    according to lastchar and then opens an encoded word.
 *  - mimeout_mode 'B', 1 or 2: ASCII characters are buffered or flushed
 *    depending on lastchar and on whether the buffer holds printable
 *    characters; other characters drain the buffer through
 *    mimeout_addchar.
 * NOTE(review): a large share of this function's lines (declarations,
 * braces, else-branches, returns, close_mime calls) is missing from this
 * listing, so the exact control flow cannot be established here.
 */
5634 void mime_putc(nkf_char c)
5639 if (mimeout_f == FIXED_MIME){
5640 if (mimeout_mode == 'Q'){
5641 if (base64_count > 71){
5642 if (c!=CR && c!=NL) {
5649 if (base64_count > 71){
5654 if (c == EOF) { /* c==EOF */
5658 if (c != EOF) { /* c!=EOF */
5664 /* mimeout_f != FIXED_MIME */
5666 if (c == EOF) { /* c==EOF */
5667 j = mimeout_buf_count;
5668 mimeout_buf_count = 0;
5672 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5675 mimeout_addchar(mimeout_buf[i]);
5679 mimeout_addchar(mimeout_buf[i]);
5683 mimeout_addchar(mimeout_buf[i]);
5689 if (mimeout_mode=='Q') {
5690 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5691 if (c == CR || c == NL) {
5696 } else if (c <= SPACE) {
5698 if (base64_count > 70) {
5702 if (!nkf_isblank(c)) {
5713 if (mimeout_buf_count > 0){
5714 lastchar = mimeout_buf[mimeout_buf_count - 1];
5719 if (!mimeout_mode) {
5720 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5721 if (nkf_isspace(c)) {
5722 if (c==CR || c==NL) {
5725 for (i=0;i<mimeout_buf_count;i++) {
5726 (*o_mputc)(mimeout_buf[i]);
5727 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5733 mimeout_buf[0] = (char)c;
5734 mimeout_buf_count = 1;
5736 if (base64_count > 1
5737 && base64_count + mimeout_buf_count > 76
5738 && mimeout_buf[0] != CR && mimeout_buf[0] != NL){
5741 if (!nkf_isspace(mimeout_buf[0])){
5746 mimeout_buf[mimeout_buf_count++] = (char)c;
5747 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5748 open_mime(output_mode);
5753 if (lastchar==CR || lastchar == NL){
5754 for (i=0;i<mimeout_buf_count;i++) {
5755 (*o_mputc)(mimeout_buf[i]);
5758 mimeout_buf_count = 0;
5760 if (lastchar==SPACE) {
5761 for (i=0;i<mimeout_buf_count-1;i++) {
5762 (*o_mputc)(mimeout_buf[i]);
5765 mimeout_buf[0] = SPACE;
5766 mimeout_buf_count = 1;
5768 open_mime(output_mode);
5771 /* mimeout_mode == 'B', 1, 2 */
5772 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5773 if (lastchar == CR || lastchar == NL){
5774 if (nkf_isblank(c)) {
5775 for (i=0;i<mimeout_buf_count;i++) {
5776 mimeout_addchar(mimeout_buf[i]);
5778 mimeout_buf_count = 0;
5779 } else if (SPACE<c && c<DEL) {
5781 for (i=0;i<mimeout_buf_count;i++) {
5782 (*o_mputc)(mimeout_buf[i]);
5785 mimeout_buf_count = 0;
5788 if (c==SPACE || c==TAB || c==CR || c==NL) {
5789 for (i=0;i<mimeout_buf_count;i++) {
5790 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5792 for (i=0;i<mimeout_buf_count;i++) {
5793 (*o_mputc)(mimeout_buf[i]);
5796 mimeout_buf_count = 0;
5799 mimeout_buf[mimeout_buf_count++] = (char)c;
5800 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5802 for (i=0;i<mimeout_buf_count;i++) {
5803 (*o_mputc)(mimeout_buf[i]);
5806 mimeout_buf_count = 0;
5810 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5811 mimeout_buf[mimeout_buf_count++] = (char)c;
5812 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5813 j = mimeout_buf_count;
5814 mimeout_buf_count = 0;
5816 mimeout_addchar(mimeout_buf[i]);
5823 if (mimeout_buf_count>0) {
5824 j = mimeout_buf_count;
5825 mimeout_buf_count = 0;
5827 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5829 mimeout_addchar(mimeout_buf[i]);
5835 (*o_mputc)(mimeout_buf[i]);
5837 open_mime(output_mode);
5844 #if defined(PERL_XS) || defined(WIN32DLL)
5848 struct input_code *p = input_code_list;
5861 mime_f = STRICT_MIME;
5862 mime_decode_f = FALSE;
5867 #if defined(MSDOS) || defined(__OS2__)
5872 iso2022jp_f = FALSE;
5873 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5874 ms_ucs_map_f = UCS_MAP_ASCII;
5876 #ifdef UTF8_INPUT_ENABLE
5877 no_cp932ext_f = FALSE;
5878 no_best_fit_chars_f = FALSE;
5879 encode_fallback = NULL;
5880 unicode_subchar = '?';
5881 input_endian = ENDIAN_BIG;
5883 #ifdef UTF8_OUTPUT_ENABLE
5884 output_bom_f = FALSE;
5885 output_endian = ENDIAN_BIG;
5887 #ifdef UNICODE_NORMALIZATION
5900 is_inputcode_mixed = FALSE;
5901 is_inputcode_set = FALSE;
5905 #ifdef SHIFTJIS_CP932
5915 for (i = 0; i < 256; i++){
5916 prefix_table[i] = 0;
5920 mimeout_buf_count = 0;
5925 fold_preserve_f = FALSE;
5928 kanji_intro = DEFAULT_J;
5929 ascii_intro = DEFAULT_R;
5930 fold_margin = FOLD_MARGIN;
5931 output_conv = DEFAULT_CONV;
5932 oconv = DEFAULT_CONV;
5933 o_zconv = no_connection;
5934 o_fconv = no_connection;
5935 o_crconv = no_connection;
5936 o_rot_conv = no_connection;
5937 o_hira_conv = no_connection;
5938 o_base64conv = no_connection;
5939 o_iso2022jp_check_conv = no_connection;
5942 i_ungetc = std_ungetc;
5944 i_bungetc = std_ungetc;
5947 i_mungetc = std_ungetc;
5948 i_mgetc_buf = std_getc;
5949 i_mungetc_buf = std_ungetc;
5950 output_mode = ASCII;
5953 mime_decode_mode = FALSE;
5959 z_prev2=0,z_prev1=0;
5961 iconv_for_check = 0;
5963 input_codename = "";
/*
 * Placeholder for an unconnected two-argument conversion hook: forwards
 * to no_connection2 with c0 = 0, which reports the failure.
 */
5970 void no_connection(nkf_char c2, nkf_char c1)
5972 no_connection2(c2,c1,0);
/*
 * Placeholder for an unconnected three-argument conversion hook: prints
 * an internal "module connection failure" message to stderr.  The
 * arguments are not used by the visible lines.
 * NOTE(review): a line between the fprintf and the return is missing
 * from this listing; the "return 0" is marked as being there for lint.
 */
5975 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5977 fprintf(stderr,"nkf internal module connection failure.\n");
5979 return 0; /* LINT */
5984 #define fprintf dllprintf
5988 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5989 fprintf(stderr,"Flags:\n");
5990 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5991 #ifdef DEFAULT_CODE_SJIS
5992 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5994 #ifdef DEFAULT_CODE_JIS
5995 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5997 #ifdef DEFAULT_CODE_EUC
5998 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6000 #ifdef DEFAULT_CODE_UTF8
6001 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6003 #ifdef UTF8_OUTPUT_ENABLE
6004 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6006 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6007 #ifdef UTF8_INPUT_ENABLE
6008 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6010 fprintf(stderr,"t no conversion\n");
6011 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6012 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6013 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6014 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6015 fprintf(stderr,"v Show this usage. V: show version\n");
6016 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6017 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6018 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6019 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6020 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
6021 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
6022 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6023 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6025 fprintf(stderr,"T Text mode output\n");
6027 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6028 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6029 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6030 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6031 fprintf(stderr,"\n");
6032 fprintf(stderr,"Long name options\n");
6033 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6034 fprintf(stderr," Specify the input or output codeset\n");
6035 fprintf(stderr," --fj --unix --mac --windows\n");
6036 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6037 fprintf(stderr," Convert for the system or code\n");
6038 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6039 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6040 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6042 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6044 #ifdef NUMCHAR_OPTION
6045 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6047 #ifdef UTF8_INPUT_ENABLE
6048 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6049 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6052 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6053 fprintf(stderr," Overwrite original listed files by filtered result\n");
6054 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6056 fprintf(stderr," -g --guess Guess the input code\n");
6057 fprintf(stderr," --help --version Show this help/the version\n");
6058 fprintf(stderr," For more information, see also man nkf\n");
6059 fprintf(stderr,"\n");
6065 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6066 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6069 #if defined(MSDOS) && defined(__WIN16__)
6072 #if defined(MSDOS) && defined(__WIN32__)
6078 ,NKF_VERSION,NKF_RELEASE_DATE);
6079 fprintf(stderr,"\n%s\n",CopyRight);
6084 ** Patch authors (パッチ制作者)
6085 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
6086 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
6087 ** ohta@src.ricoh.co.jp (Junn Ohta)
6088 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
6089 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
6090 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
6091 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
6092 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
6093 ** GHG00637@nifty-serve.or.jp (COW)