1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** About UTF-8 support
31 ** This version can be used as a drop-in replacement for the previous nkf.
32 ** When started as, e.g., nkf -e, input that auto-detection judges to be UTF-8
33 ** is converted to euc-jp as is.
35 ** It is still quite likely that bugs remain
36 ** (especially in auto-detection, mixed encodings and error handling).
38 ** If you find any problem, please report it to
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 **
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.121 2007/03/13 18:52:16 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2007-03-14"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2006 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
69 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * Locale-independent ASCII character classification and conversion helpers.
 *
 * Every use of a macro parameter is parenthesized so that an expression
 * argument (e.g. `c & 0x7f`) is classified as a whole instead of being torn
 * apart by operator precedence.  The arguments are still evaluated more than
 * once, so do not pass expressions with side effects (e.g. `*p++`).
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (' '<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit character; any non-hex character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0 )
/* True when the upper byte of the low 16 bits of c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/*
 * Inclusive bounds used by the CP932 conversion tables.
 * NOTE(review): these look like Shift_JIS lead-byte ranges (IBM extended
 * characters and their NEC-selected duplicates) -- confirm against the
 * table definitions.
 */
#define CP932_TABLE_BEGIN 0xFA
#define CP932_TABLE_END 0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END 0xEE
/*
 * True when c2 lies in CP932_TABLE_BEGIN..CP932_TABLE_END (inclusive).
 * The parameter is parenthesized so expression arguments are tested as a
 * whole; it is evaluated twice, so avoid arguments with side effects.
 */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
289 #define HOLD_SIZE 1024
290 #if defined(INT_IS_SHORT)
291 #define IOBUF_SIZE 2048
293 #define IOBUF_SIZE 16384
296 #define DEFAULT_J 'B'
297 #define DEFAULT_R 'B'
299 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
300 #define SJ6394 0x0161 /* 63 - 94 ku offset */
302 #define RANGE_NUM_MAX 18
307 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
308 #define sizeof_euc_to_utf8_1byte 94
309 #define sizeof_euc_to_utf8_2bytes 94
310 #define sizeof_utf8_to_euc_C2 64
311 #define sizeof_utf8_to_euc_E5B8 64
312 #define sizeof_utf8_to_euc_2bytes 112
313 #define sizeof_utf8_to_euc_3bytes 16
316 /* MIME preprocessor */
318 #ifdef EASYWIN /*Easy Win */
319 extern POINT _BufferSize;
328 void (*status_func)(struct input_code *, nkf_char);
329 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 static char *input_codename = "";
336 static const char *CopyRight = COPY_RIGHT;
338 #if !defined(PERL_XS) && !defined(WIN32DLL)
339 static nkf_char noconvert(FILE *f);
341 static void module_connection(void);
342 static nkf_char kanji_convert(FILE *f);
343 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
344 static nkf_char push_hold_buf(nkf_char c2);
345 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
346 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
347 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
348 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
349 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
351 * 0: Shift_JIS, eucJP-ascii
355 #define UCS_MAP_ASCII 0
357 #define UCS_MAP_CP932 2
358 static int ms_ucs_map_f = UCS_MAP_ASCII;
360 #ifdef UTF8_INPUT_ENABLE
361 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
362 static int no_cp932ext_f = FALSE;
363 /* ignore ZERO WIDTH NO-BREAK SPACE */
364 static int no_best_fit_chars_f = FALSE;
365 static int input_endian = ENDIAN_BIG;
366 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
367 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
368 static void encode_fallback_html(nkf_char c);
369 static void encode_fallback_xml(nkf_char c);
370 static void encode_fallback_java(nkf_char c);
371 static void encode_fallback_perl(nkf_char c);
372 static void encode_fallback_subchar(nkf_char c);
373 static void (*encode_fallback)(nkf_char c) = NULL;
374 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
375 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
376 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
377 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
378 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
379 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
380 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
381 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
382 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
383 static void w_status(struct input_code *, nkf_char);
385 #ifdef UTF8_OUTPUT_ENABLE
386 static int output_bom_f = FALSE;
387 static int output_endian = ENDIAN_BIG;
388 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
389 static void w_oconv(nkf_char c2,nkf_char c1);
390 static void w_oconv16(nkf_char c2,nkf_char c1);
391 static void w_oconv32(nkf_char c2,nkf_char c1);
393 static void e_oconv(nkf_char c2,nkf_char c1);
394 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
395 static void s_oconv(nkf_char c2,nkf_char c1);
396 static void j_oconv(nkf_char c2,nkf_char c1);
397 static void fold_conv(nkf_char c2,nkf_char c1);
398 static void cr_conv(nkf_char c2,nkf_char c1);
399 static void z_conv(nkf_char c2,nkf_char c1);
400 static void rot_conv(nkf_char c2,nkf_char c1);
401 static void hira_conv(nkf_char c2,nkf_char c1);
402 static void base64_conv(nkf_char c2,nkf_char c1);
403 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
404 static void no_connection(nkf_char c2,nkf_char c1);
405 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
407 static void code_score(struct input_code *ptr);
408 static void code_status(nkf_char c);
410 static void std_putc(nkf_char c);
411 static nkf_char std_getc(FILE *f);
412 static nkf_char std_ungetc(nkf_char c,FILE *f);
414 static nkf_char broken_getc(FILE *f);
415 static nkf_char broken_ungetc(nkf_char c,FILE *f);
417 static nkf_char mime_begin(FILE *f);
418 static nkf_char mime_getc(FILE *f);
419 static nkf_char mime_ungetc(nkf_char c,FILE *f);
421 static void switch_mime_getc(void);
422 static void unswitch_mime_getc(void);
423 static nkf_char mime_begin_strict(FILE *f);
424 static nkf_char mime_getc_buf(FILE *f);
425 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
426 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
428 static nkf_char base64decode(nkf_char c);
429 static void mime_prechar(nkf_char c2, nkf_char c1);
430 static void mime_putc(nkf_char c);
431 static void open_mime(nkf_char c);
432 static void close_mime(void);
433 static void eof_mime(void);
434 static void mimeout_addchar(nkf_char c);
436 static void usage(void);
437 static void version(void);
439 static void options(unsigned char *c);
440 #if defined(PERL_XS) || defined(WIN32DLL)
441 static void reinit(void);
446 #if !defined(PERL_XS) && !defined(WIN32DLL)
447 static unsigned char stdibuf[IOBUF_SIZE];
448 static unsigned char stdobuf[IOBUF_SIZE];
450 static unsigned char hold_buf[HOLD_SIZE*2];
451 static int hold_count = 0;
453 /* MIME preprocessor fifo */
455 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
456 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
457 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
458 static unsigned char mime_buf[MIME_BUF_SIZE];
459 static unsigned int mime_top = 0;
460 static unsigned int mime_last = 0; /* decoded */
461 static unsigned int mime_input = 0; /* undecoded */
462 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
465 static int unbuf_f = FALSE;
466 static int estab_f = FALSE;
467 static int nop_f = FALSE;
468 static int binmode_f = TRUE; /* binary mode */
469 static int rot_f = FALSE; /* ROT13/47 mode */
470 static int hira_f = FALSE; /* hiragana/katakana conversion */
471 static int input_f = FALSE; /* non fixed input code */
472 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
473 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
474 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
475 static int mimebuf_f = FALSE; /* MIME buffered input */
476 static int broken_f = FALSE; /* convert ESC-less broken JIS */
477 static int iso8859_f = FALSE; /* ISO8859 through */
478 static int mimeout_f = FALSE; /* base64 mode */
479 #if defined(MSDOS) || defined(__OS2__)
480 static int x0201_f = TRUE; /* Assume JISX0201 kana */
482 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
484 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
486 #ifdef UNICODE_NORMALIZATION
487 static int nfc_f = FALSE;
488 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
489 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
490 static nkf_char nfc_getc(FILE *f);
491 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
495 static int cap_f = FALSE;
496 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
497 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
498 static nkf_char cap_getc(FILE *f);
499 static nkf_char cap_ungetc(nkf_char c,FILE *f);
501 static int url_f = FALSE;
502 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
503 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
504 static nkf_char url_getc(FILE *f);
505 static nkf_char url_ungetc(nkf_char c,FILE *f);
508 #if defined(INT_IS_SHORT)
509 #define NKF_INT32_C(n) (n##L)
511 #define NKF_INT32_C(n) (n)
513 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
514 #define CLASS_MASK NKF_INT32_C(0xFF000000)
515 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
516 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
517 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
518 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
519 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
521 #ifdef NUMCHAR_OPTION
522 static int numchar_f = FALSE;
523 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
524 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
525 static nkf_char numchar_getc(FILE *f);
526 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
530 static int noout_f = FALSE;
531 static void no_putc(nkf_char c);
532 static nkf_char debug_f = FALSE;
533 static void debug(const char *str);
534 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
537 static int guess_f = FALSE;
539 static void print_guessed_code(char *filename);
541 static void set_input_codename(char *codename);
542 static int is_inputcode_mixed = FALSE;
543 static int is_inputcode_set = FALSE;
546 static int exec_f = 0;
549 #ifdef SHIFTJIS_CP932
550 /* invert IBM extended characters to others */
551 static int cp51932_f = FALSE;
553 /* invert NEC-selected IBM extended characters to IBM extended characters */
554 static int cp932inv_f = TRUE;
556 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
557 #endif /* SHIFTJIS_CP932 */
560 static int x0212_f = FALSE;
561 static nkf_char x0212_shift(nkf_char c);
562 static nkf_char x0212_unshift(nkf_char c);
564 static int x0213_f = FALSE;
566 static unsigned char prefix_table[256];
568 static void set_code_score(struct input_code *ptr, nkf_char score);
569 static void clr_code_score(struct input_code *ptr, nkf_char score);
570 static void status_disable(struct input_code *ptr);
571 static void status_push_ch(struct input_code *ptr, nkf_char c);
572 static void status_clear(struct input_code *ptr);
573 static void status_reset(struct input_code *ptr);
574 static void status_reinit(struct input_code *ptr);
575 static void status_check(struct input_code *ptr, nkf_char c);
576 static void e_status(struct input_code *, nkf_char);
577 static void s_status(struct input_code *, nkf_char);
579 struct input_code input_code_list[] = {
580 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
581 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
582 #ifdef UTF8_INPUT_ENABLE
583 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
584 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
585 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
590 static int mimeout_mode = 0;
591 static int base64_count = 0;
593 /* X0208 -> ASCII converter */
596 static int f_line = 0; /* chars in line */
597 static int f_prev = 0;
598 static int fold_preserve_f = FALSE; /* preserve new lines */
599 static int fold_f = FALSE;
600 static int fold_len = 0;
603 static unsigned char kanji_intro = DEFAULT_J;
604 static unsigned char ascii_intro = DEFAULT_R;
608 #define FOLD_MARGIN 10
609 #define DEFAULT_FOLD 60
611 static int fold_margin = FOLD_MARGIN;
615 #ifdef DEFAULT_CODE_JIS
616 # define DEFAULT_CONV j_oconv
618 #ifdef DEFAULT_CODE_SJIS
619 # define DEFAULT_CONV s_oconv
621 #ifdef DEFAULT_CODE_EUC
622 # define DEFAULT_CONV e_oconv
624 #ifdef DEFAULT_CODE_UTF8
625 # define DEFAULT_CONV w_oconv
628 /* process default */
629 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
631 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
632 /* s_iconv or oconv */
633 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
635 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
636 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
637 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
638 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
643 /* static redirections */
645 static void (*o_putc)(nkf_char c) = std_putc;
647 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
648 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
650 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
651 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
653 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
655 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
656 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
658 /* for strict mime */
659 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
660 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
663 static int output_mode = ASCII, /* output kanji mode */
664 input_mode = ASCII, /* input kanji mode */
665 shift_mode = FALSE; /* TRUE shift out, or X0201 */
666 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
668 /* X0201 / X0208 conversion tables */
670 /* X0201 kana conversion table */
673 unsigned char cv[]= {
674 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
675 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
676 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
677 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
678 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
679 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
680 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
681 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
682 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
683 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
684 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
685 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
686 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
687 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
688 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
689 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
693 /* X0201 kana conversion table for dakuten (voiced sound mark) */
696 unsigned char dv[]= {
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
702 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
703 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
704 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
705 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
706 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
707 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
708 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
709 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
715 /* X0201 kana conversion table for han-dakuten (semi-voiced sound mark) */
718 unsigned char ev[]= {
719 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
720 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
730 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
732 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
738 /* X0208 kigou conversion table */
739 /* 0x8140 - 0x819e */
741 unsigned char fv[] = {
743 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
744 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
745 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
746 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
747 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
748 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
749 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
750 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
751 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
752 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
753 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
754 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
760 static int file_out_f = FALSE;
762 static int overwrite_f = FALSE;
763 static int preserve_time_f = FALSE;
764 static int backup_f = FALSE;
765 static char *backup_suffix = "";
766 static char *get_backup_filename(const char *suffix, const char *filename);
769 static int crmode_f = 0; /* CR, NL, CRLF */
770 #ifdef EASYWIN /*Easy Win */
771 static int end_check;
774 #define STD_GC_BUFSIZE (256)
775 nkf_char std_gc_buf[STD_GC_BUFSIZE];
779 #include "nkf32dll.c"
780 #elif defined(PERL_XS)
782 int main(int argc, char **argv)
787 char *outfname = NULL;
790 #ifdef EASYWIN /*Easy Win */
791 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
794 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
795 cp = (unsigned char *)*argv;
800 if (pipe(fds) < 0 || (pid = fork()) < 0){
811 execvp(argv[1], &argv[1]);
825 if(x0201_f == WISH_TRUE)
826 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
828 if (binmode_f == TRUE)
829 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
830 if (freopen("","wb",stdout) == NULL)
837 setbuf(stdout, (char *) NULL);
839 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
842 if (binmode_f == TRUE)
843 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
844 if (freopen("","rb",stdin) == NULL) return (-1);
848 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
852 kanji_convert(stdin);
853 if (guess_f) print_guessed_code(NULL);
857 int is_argument_error = FALSE;
859 is_inputcode_mixed = FALSE;
860 is_inputcode_set = FALSE;
865 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
868 is_argument_error = TRUE;
876 /* reopen file for stdout */
877 if (file_out_f == TRUE) {
880 outfname = malloc(strlen(origfname)
881 + strlen(".nkftmpXXXXXX")
887 strcpy(outfname, origfname);
891 for (i = strlen(outfname); i; --i){
892 if (outfname[i - 1] == '/'
893 || outfname[i - 1] == '\\'){
899 strcat(outfname, "ntXXXXXX");
901 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
904 strcat(outfname, ".nkftmpXXXXXX");
905 fd = mkstemp(outfname);
908 || (fd_backup = dup(fileno(stdout))) < 0
909 || dup2(fd, fileno(stdout)) < 0
920 outfname = "nkf.out";
923 if(freopen(outfname, "w", stdout) == NULL) {
927 if (binmode_f == TRUE) {
928 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
929 if (freopen("","wb",stdout) == NULL)
936 if (binmode_f == TRUE)
937 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
938 if (freopen("","rb",fin) == NULL)
943 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
947 char *filename = NULL;
949 if (nfiles > 1) filename = origfname;
950 if (guess_f) print_guessed_code(filename);
956 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
964 if (dup2(fd_backup, fileno(stdout)) < 0){
967 if (stat(origfname, &sb)) {
968 fprintf(stderr, "Can't stat %s\n", origfname);
970 /* Restore the original file's permissions on the output file. */
971 if (chmod(outfname, sb.st_mode)) {
972 fprintf(stderr, "Can't set permission %s\n", outfname);
975 /* Restore the original file's timestamps on the output file. */
977 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
978 tb[0] = tb[1] = sb.st_mtime;
979 if (utime(outfname, tb)) {
980 fprintf(stderr, "Can't set timestamp %s\n", outfname);
983 tb.actime = sb.st_atime;
984 tb.modtime = sb.st_mtime;
985 if (utime(outfname, &tb)) {
986 fprintf(stderr, "Can't set timestamp %s\n", outfname);
991 char *backup_filename = get_backup_filename(backup_suffix, origfname);
993 unlink(backup_filename);
995 if (rename(origfname, backup_filename)) {
996 perror(backup_filename);
997 fprintf(stderr, "Can't rename %s to %s\n",
998 origfname, backup_filename);
1002 if (unlink(origfname)){
1007 if (rename(outfname, origfname)) {
1009 fprintf(stderr, "Can't rename %s to %s\n",
1010 outfname, origfname);
1017 if (is_argument_error)
1020 #ifdef EASYWIN /*Easy Win */
1021 if (file_out_f == FALSE)
1022 scanf("%d",&end_check);
1025 #else /* for Other OS */
1026 if (file_out_f == TRUE)
1028 #endif /*Easy Win */
1031 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from the pattern `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters are copied verbatim (suffix "*.bak"
 * with filename "a.txt" gives "a.txt.bak").  Otherwise `suffix` is simply
 * appended to `filename`.
 *
 * Returns a newly malloc'ed, NUL-terminated string that the caller must
 * free, or NULL (after reporting through perror) if allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    int asterisk_count = 0;
    int i, j;
    int filename_length = strlen(filename);

    for (i = 0; suffix[i]; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* Each '*' occupies one byte in suffix and expands to filename_length bytes. */
        backup_filename = malloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }

        for (i = 0, j = 0; suffix[i];) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                i++;
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i++];
            }
        }
        backup_filename[j] = '\0';
    } else {
        j = strlen(suffix) + filename_length;
        /* The buffer must hold filename + suffix + NUL; the former
         * `malloc( + 1)` allocated a single byte and overflowed the heap. */
        backup_filename = malloc(j + 1);
        if (!backup_filename) {
            perror("Can't malloc backup filename.");
            return NULL;
        }
        strcpy(backup_filename, filename);
        strcat(backup_filename, suffix);
        backup_filename[j] = '\0';
    }
    return backup_filename;
}
1099 {"katakana-hiragana","h3"},
1106 #ifdef UTF8_OUTPUT_ENABLE
1116 {"fb-subchar=", ""},
1118 #ifdef UTF8_INPUT_ENABLE
1119 {"utf8-input", "W"},
1120 {"utf16-input", "W16"},
1121 {"no-cp932ext", ""},
1122 {"no-best-fit-chars",""},
1124 #ifdef UNICODE_NORMALIZATION
1125 {"utf8mac-input", ""},
1137 #ifdef NUMCHAR_OPTION
1138 {"numchar-input", ""},
1144 #ifdef SHIFTJIS_CP932
1154 static int option_mode = 0;
1156 void options(unsigned char *cp)
1160 unsigned char *cp_back = NULL;
1165 while(*cp && *cp++!='-');
1166 while (*cp || cp_back) {
1174 case '-': /* literal options */
1175 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1179 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1180 p = (unsigned char *)long_option[i].name;
1181 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1182 if (*p == cp[j] || cp[j] == ' '){
1189 while(*cp && *cp != SPACE && cp++);
1190 if (long_option[i].alias[0]){
1192 cp = (unsigned char *)long_option[i].alias;
1194 if (strcmp(long_option[i].name, "ic=") == 0){
1195 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1196 codeset[i] = nkf_toupper(p[i]);
1199 if(strcmp(codeset, "ISO-2022-JP") == 0){
1200 input_f = JIS_INPUT;
1201 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1202 strcmp(codeset, "CP50220") == 0 ||
1203 strcmp(codeset, "CP50221") == 0 ||
1204 strcmp(codeset, "CP50222") == 0){
1205 input_f = JIS_INPUT;
1206 #ifdef SHIFTJIS_CP932
1209 #ifdef UTF8_OUTPUT_ENABLE
1210 ms_ucs_map_f = UCS_MAP_CP932;
1212 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1213 input_f = JIS_INPUT;
1217 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1218 input_f = JIS_INPUT;
1223 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1224 input_f = SJIS_INPUT;
1225 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1226 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1227 strcmp(codeset, "CP932") == 0 ||
1228 strcmp(codeset, "MS932") == 0){
1229 input_f = SJIS_INPUT;
1230 #ifdef SHIFTJIS_CP932
1233 #ifdef UTF8_OUTPUT_ENABLE
1234 ms_ucs_map_f = UCS_MAP_CP932;
1236 }else if(strcmp(codeset, "EUCJP") == 0 ||
1237 strcmp(codeset, "EUC-JP") == 0){
1238 input_f = EUC_INPUT;
1239 }else if(strcmp(codeset, "CP51932") == 0){
1240 input_f = EUC_INPUT;
1241 #ifdef SHIFTJIS_CP932
1244 #ifdef UTF8_OUTPUT_ENABLE
1245 ms_ucs_map_f = UCS_MAP_CP932;
1247 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1248 strcmp(codeset, "EUCJP-MS") == 0 ||
1249 strcmp(codeset, "EUCJPMS") == 0){
1250 input_f = EUC_INPUT;
1251 #ifdef SHIFTJIS_CP932
1254 #ifdef UTF8_OUTPUT_ENABLE
1255 ms_ucs_map_f = UCS_MAP_MS;
1257 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1258 strcmp(codeset, "EUCJP-ASCII") == 0){
1259 input_f = EUC_INPUT;
1260 #ifdef SHIFTJIS_CP932
1263 #ifdef UTF8_OUTPUT_ENABLE
1264 ms_ucs_map_f = UCS_MAP_ASCII;
1266 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1267 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1268 input_f = SJIS_INPUT;
1270 #ifdef SHIFTJIS_CP932
1273 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1274 strcmp(codeset, "EUC-JIS-2004") == 0){
1275 input_f = EUC_INPUT;
1277 #ifdef SHIFTJIS_CP932
1280 #ifdef UTF8_INPUT_ENABLE
1281 }else if(strcmp(codeset, "UTF-8") == 0 ||
1282 strcmp(codeset, "UTF-8N") == 0 ||
1283 strcmp(codeset, "UTF-8-BOM") == 0){
1284 input_f = UTF8_INPUT;
1285 #ifdef UNICODE_NORMALIZATION
1286 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1287 strcmp(codeset, "UTF-8-MAC") == 0){
1288 input_f = UTF8_INPUT;
1291 }else if(strcmp(codeset, "UTF-16") == 0 ||
1292 strcmp(codeset, "UTF-16BE") == 0 ||
1293 strcmp(codeset, "UTF-16BE-BOM") == 0){
1294 input_f = UTF16_INPUT;
1295 input_endian = ENDIAN_BIG;
1296 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1297 strcmp(codeset, "UTF-16LE-BOM") == 0){
1298 input_f = UTF16_INPUT;
1299 input_endian = ENDIAN_LITTLE;
1300 }else if(strcmp(codeset, "UTF-32") == 0 ||
1301 strcmp(codeset, "UTF-32BE") == 0 ||
1302 strcmp(codeset, "UTF-32BE-BOM") == 0){
1303 input_f = UTF32_INPUT;
1304 input_endian = ENDIAN_BIG;
1305 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1306 strcmp(codeset, "UTF-32LE-BOM") == 0){
1307 input_f = UTF32_INPUT;
1308 input_endian = ENDIAN_LITTLE;
1313 if (strcmp(long_option[i].name, "oc=") == 0){
1315 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1316 codeset[i] = nkf_toupper(p[i]);
1319 if(strcmp(codeset, "ISO-2022-JP") == 0){
1320 output_conv = j_oconv;
1321 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1322 output_conv = j_oconv;
1323 no_cp932ext_f = TRUE;
1324 #ifdef SHIFTJIS_CP932
1327 #ifdef UTF8_OUTPUT_ENABLE
1328 ms_ucs_map_f = UCS_MAP_CP932;
1330 }else if(strcmp(codeset, "CP50220") == 0){
1331 output_conv = j_oconv;
1333 #ifdef SHIFTJIS_CP932
1336 #ifdef UTF8_OUTPUT_ENABLE
1337 ms_ucs_map_f = UCS_MAP_CP932;
1339 }else if(strcmp(codeset, "CP50221") == 0){
1340 output_conv = j_oconv;
1341 #ifdef SHIFTJIS_CP932
1344 #ifdef UTF8_OUTPUT_ENABLE
1345 ms_ucs_map_f = UCS_MAP_CP932;
1347 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1348 output_conv = j_oconv;
1352 #ifdef SHIFTJIS_CP932
1355 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1356 output_conv = j_oconv;
1361 #ifdef SHIFTJIS_CP932
1364 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1365 output_conv = s_oconv;
1366 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1367 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1368 strcmp(codeset, "CP932") == 0 ||
1369 strcmp(codeset, "MS932") == 0){
1370 output_conv = s_oconv;
1371 #ifdef UTF8_OUTPUT_ENABLE
1372 ms_ucs_map_f = UCS_MAP_CP932;
1374 }else if(strcmp(codeset, "EUCJP") == 0 ||
1375 strcmp(codeset, "EUC-JP") == 0){
1376 output_conv = e_oconv;
1377 }else if(strcmp(codeset, "CP51932") == 0){
1378 output_conv = e_oconv;
1379 #ifdef SHIFTJIS_CP932
1382 #ifdef UTF8_OUTPUT_ENABLE
1383 ms_ucs_map_f = UCS_MAP_CP932;
1385 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1386 strcmp(codeset, "EUCJP-MS") == 0 ||
1387 strcmp(codeset, "EUCJPMS") == 0){
1388 output_conv = e_oconv;
1392 #ifdef UTF8_OUTPUT_ENABLE
1393 ms_ucs_map_f = UCS_MAP_MS;
1395 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1396 strcmp(codeset, "EUCJP-ASCII") == 0){
1397 output_conv = e_oconv;
1401 #ifdef UTF8_OUTPUT_ENABLE
1402 ms_ucs_map_f = UCS_MAP_ASCII;
1404 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1405 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1406 output_conv = s_oconv;
1408 #ifdef SHIFTJIS_CP932
1411 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1412 strcmp(codeset, "EUC-JIS-2004") == 0){
1413 output_conv = e_oconv;
1418 #ifdef SHIFTJIS_CP932
1421 #ifdef UTF8_OUTPUT_ENABLE
1422 }else if(strcmp(codeset, "UTF-8") == 0){
1423 output_conv = w_oconv;
1424 }else if(strcmp(codeset, "UTF-8N") == 0){
1425 output_conv = w_oconv;
1426 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1427 output_conv = w_oconv;
1428 output_bom_f = TRUE;
1429 }else if(strcmp(codeset, "UTF-16BE") == 0){
1430 output_conv = w_oconv16;
1431 }else if(strcmp(codeset, "UTF-16") == 0 ||
1432 strcmp(codeset, "UTF-16BE-BOM") == 0){
1433 output_conv = w_oconv16;
1434 output_bom_f = TRUE;
1435 }else if(strcmp(codeset, "UTF-16LE") == 0){
1436 output_conv = w_oconv16;
1437 output_endian = ENDIAN_LITTLE;
1438 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1439 output_conv = w_oconv16;
1440 output_endian = ENDIAN_LITTLE;
1441 output_bom_f = TRUE;
1442 }else if(strcmp(codeset, "UTF-32") == 0 ||
1443 strcmp(codeset, "UTF-32BE") == 0){
1444 output_conv = w_oconv32;
1445 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1446 output_conv = w_oconv32;
1447 output_bom_f = TRUE;
1448 }else if(strcmp(codeset, "UTF-32LE") == 0){
1449 output_conv = w_oconv32;
1450 output_endian = ENDIAN_LITTLE;
1451 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1452 output_conv = w_oconv32;
1453 output_endian = ENDIAN_LITTLE;
1454 output_bom_f = TRUE;
1460 if (strcmp(long_option[i].name, "overwrite") == 0){
1463 preserve_time_f = TRUE;
1466 if (strcmp(long_option[i].name, "overwrite=") == 0){
1469 preserve_time_f = TRUE;
1471 backup_suffix = malloc(strlen((char *) p) + 1);
1472 strcpy(backup_suffix, (char *) p);
1475 if (strcmp(long_option[i].name, "in-place") == 0){
1478 preserve_time_f = FALSE;
1481 if (strcmp(long_option[i].name, "in-place=") == 0){
1484 preserve_time_f = FALSE;
1486 backup_suffix = malloc(strlen((char *) p) + 1);
1487 strcpy(backup_suffix, (char *) p);
1492 if (strcmp(long_option[i].name, "cap-input") == 0){
1496 if (strcmp(long_option[i].name, "url-input") == 0){
1501 #ifdef NUMCHAR_OPTION
1502 if (strcmp(long_option[i].name, "numchar-input") == 0){
1508 if (strcmp(long_option[i].name, "no-output") == 0){
1512 if (strcmp(long_option[i].name, "debug") == 0){
1517 if (strcmp(long_option[i].name, "cp932") == 0){
1518 #ifdef SHIFTJIS_CP932
1522 #ifdef UTF8_OUTPUT_ENABLE
1523 ms_ucs_map_f = UCS_MAP_CP932;
1527 if (strcmp(long_option[i].name, "no-cp932") == 0){
1528 #ifdef SHIFTJIS_CP932
1532 #ifdef UTF8_OUTPUT_ENABLE
1533 ms_ucs_map_f = UCS_MAP_ASCII;
1537 #ifdef SHIFTJIS_CP932
1538 if (strcmp(long_option[i].name, "cp932inv") == 0){
1545 if (strcmp(long_option[i].name, "x0212") == 0){
1552 if (strcmp(long_option[i].name, "exec-in") == 0){
1556 if (strcmp(long_option[i].name, "exec-out") == 0){
1561 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1562 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1563 no_cp932ext_f = TRUE;
1566 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1567 no_best_fit_chars_f = TRUE;
1570 if (strcmp(long_option[i].name, "fb-skip") == 0){
1571 encode_fallback = NULL;
1574 if (strcmp(long_option[i].name, "fb-html") == 0){
1575 encode_fallback = encode_fallback_html;
1578 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1579 encode_fallback = encode_fallback_xml;
1582 if (strcmp(long_option[i].name, "fb-java") == 0){
1583 encode_fallback = encode_fallback_java;
1586 if (strcmp(long_option[i].name, "fb-perl") == 0){
1587 encode_fallback = encode_fallback_perl;
1590 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1591 encode_fallback = encode_fallback_subchar;
1594 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1595 encode_fallback = encode_fallback_subchar;
1596 unicode_subchar = 0;
1598 /* decimal number */
1599 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1600 unicode_subchar *= 10;
1601 unicode_subchar += hex2bin(p[i]);
1603 }else if(p[1] == 'x' || p[1] == 'X'){
1604 /* hexadecimal number */
1605 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1606 unicode_subchar <<= 4;
1607 unicode_subchar |= hex2bin(p[i]);
1611 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1612 unicode_subchar *= 8;
1613 unicode_subchar += hex2bin(p[i]);
1616 w16e_conv(unicode_subchar, &i, &j);
1617 unicode_subchar = i<<8 | j;
1621 #ifdef UTF8_OUTPUT_ENABLE
1622 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1623 ms_ucs_map_f = UCS_MAP_MS;
1627 #ifdef UNICODE_NORMALIZATION
1628 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1629 input_f = UTF8_INPUT;
1634 if (strcmp(long_option[i].name, "prefix=") == 0){
1635 if (nkf_isgraph(p[0])){
1636 for (i = 1; nkf_isgraph(p[i]); i++){
1637 prefix_table[p[i]] = p[0];
1644 case 'b': /* buffered mode */
1647 case 'u': /* non bufferd mode */
1650 case 't': /* transparent mode */
1655 } else if (*cp=='2') {
1659 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1667 case 'j': /* JIS output */
1669 output_conv = j_oconv;
1671 case 'e': /* AT&T EUC output */
1672 output_conv = e_oconv;
1675 case 's': /* SJIS output */
1676 output_conv = s_oconv;
1678 case 'l': /* ISO8859 Latin-1 support, no conversion */
1679 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1680 input_f = LATIN1_INPUT;
1682 case 'i': /* Kanji IN ESC-$-@/B */
1683 if (*cp=='@'||*cp=='B')
1684 kanji_intro = *cp++;
1686 case 'o': /* ASCII IN ESC-(-J/B */
1687 if (*cp=='J'||*cp=='B'||*cp=='H')
1688 ascii_intro = *cp++;
1692 bit:1 katakana->hiragana
1693 bit:2 hiragana->katakana
1695 if ('9'>= *cp && *cp>='0')
1696 hira_f |= (*cp++ -'0');
1703 #if defined(MSDOS) || defined(__OS2__)
1718 #ifdef UTF8_OUTPUT_ENABLE
1719 case 'w': /* UTF-8 output */
1721 output_conv = w_oconv; cp++;
1725 output_bom_f = TRUE;
1728 if ('1'== cp[0] && '6'==cp[1]) {
1729 output_conv = w_oconv16; cp+=2;
1730 } else if ('3'== cp[0] && '2'==cp[1]) {
1731 output_conv = w_oconv32; cp+=2;
1733 output_conv = w_oconv;
1738 output_endian = ENDIAN_LITTLE;
1739 } else if (cp[0] == 'B') {
1747 output_bom_f = TRUE;
1752 #ifdef UTF8_INPUT_ENABLE
1753 case 'W': /* UTF input */
1756 input_f = UTF8_INPUT;
1758 if ('1'== cp[0] && '6'==cp[1]) {
1760 input_f = UTF16_INPUT;
1761 input_endian = ENDIAN_BIG;
1762 } else if ('3'== cp[0] && '2'==cp[1]) {
1764 input_f = UTF32_INPUT;
1765 input_endian = ENDIAN_BIG;
1767 input_f = UTF8_INPUT;
1772 input_endian = ENDIAN_LITTLE;
1773 } else if (cp[0] == 'B') {
1779 /* Input code assumption */
1780 case 'J': /* JIS input */
1781 input_f = JIS_INPUT;
1783 case 'E': /* AT&T EUC input */
1784 input_f = EUC_INPUT;
1786 case 'S': /* MS Kanji input */
1787 input_f = SJIS_INPUT;
1788 if (x0201_f==NO_X0201) x0201_f=TRUE;
1790 case 'Z': /* Convert X0208 alphabet to asii */
1791 /* bit:0 Convert X0208
1792 bit:1 Convert Kankaku to one space
1793 bit:2 Convert Kankaku to two spaces
1794 bit:3 Convert HTML Entity
1796 if ('9'>= *cp && *cp>='0')
1797 alpha_f |= 1<<(*cp++ -'0');
1801 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1802 x0201_f = FALSE; /* No X0201->X0208 conversion */
1804 ESC-(-I in JIS, EUC, MS Kanji
1805 SI/SO in JIS, EUC, MS Kanji
1806 SSO in EUC, JIS, not in MS Kanji
1807 MS Kanji (0xa0-0xdf)
1809 ESC-(-I in JIS (0x20-0x5f)
1810 SSO in EUC (0xa0-0xdf)
1811 0xa0-0xd in MS Kanji (0xa0-0xdf)
1814 case 'X': /* Assume X0201 kana */
1815 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1818 case 'F': /* prserve new lines */
1819 fold_preserve_f = TRUE;
1820 case 'f': /* folding -f60 or -f */
1823 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1825 fold_len += *cp++ - '0';
1827 if (!(0<fold_len && fold_len<BUFSIZ))
1828 fold_len = DEFAULT_FOLD;
1832 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1834 fold_margin += *cp++ - '0';
1838 case 'm': /* MIME support */
1839 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1840 if (*cp=='B'||*cp=='Q') {
1841 mime_decode_mode = *cp++;
1842 mimebuf_f = FIXED_MIME;
1843 } else if (*cp=='N') {
1844 mime_f = TRUE; cp++;
1845 } else if (*cp=='S') {
1846 mime_f = STRICT_MIME; cp++;
1847 } else if (*cp=='0') {
1848 mime_decode_f = FALSE;
1849 mime_f = FALSE; cp++;
1852 case 'M': /* MIME output */
1855 mimeout_f = FIXED_MIME; cp++;
1856 } else if (*cp=='Q') {
1858 mimeout_f = FIXED_MIME; cp++;
1863 case 'B': /* Broken JIS support */
1865 bit:1 allow any x on ESC-(-x or ESC-$-x
1866 bit:2 reset to ascii on NL
1868 if ('9'>= *cp && *cp>='0')
1869 broken_f |= 1<<(*cp++ -'0');
1874 case 'O':/* for Output file */
1878 case 'c':/* add cr code */
1881 case 'd':/* delete cr code */
1884 case 'I': /* ISO-2022-JP output */
1887 case 'L': /* line mode */
1888 if (*cp=='u') { /* unix */
1889 crmode_f = NL; cp++;
1890 } else if (*cp=='m') { /* mac */
1891 crmode_f = CR; cp++;
1892 } else if (*cp=='w') { /* windows */
1893 crmode_f = CRLF; cp++;
1894 } else if (*cp=='0') { /* no conversion */
1904 /* module muliple options in a string are allowed for Perl moudle */
1905 while(*cp && *cp++!='-');
1908 /* bogus option but ignored */
/*
 * Find the entry of input_code_list whose iconv_func pointer is identical
 * to the given converter.
 *
 *   iconv_func - converter function pointer to search for.
 *
 * The scan starts at input_code_list and compares function pointers for
 * equality.
 * NOTE(review): the loop advance and both return paths are not visible
 * here; presumably the matching entry is returned and a null pointer on a
 * miss - confirm, because set_iconv() dereferences the result.
 */
1914 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1917 struct input_code *p = input_code_list;
1919 if (iconv_func == p->iconv_func){
/*
 * Select the input converter and report the detected encoding name.
 *
 *   f          - flag from the caller; under INPUT_CODE_FIX the value
 *                -TRUE means "FORCE" (see the condition below).
 *   iconv_func - converter the caller wants to use.
 *
 * When estab_f is set and the active converter (iconv) differs from
 * iconv_for_check, the input_code entry for iconv is looked up, its name
 * is stored with set_input_codename(), input_codename is passed to
 * debug(), and iconv_for_check is then set to the active converter.
 * NOTE(review): the assignments to estab_f and iconv are not visible
 * here - confirm the exact conditions.
 */
1928 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1930 #ifdef INPUT_CODE_FIX
1938 #ifdef INPUT_CODE_FIX
1939 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1945 if (estab_f && iconv_for_check != iconv){
1946 struct input_code *p = find_inputcode_byfunc(iconv);
1948 set_input_codename(p->name);
1949 debug(input_codename);
1951 iconv_for_check = iconv;
/*
 * Score bits collected per candidate encoding during auto-detection.
 * Each constant is the previous one shifted left by one bit, so every
 * flag has its own bit and later flags weigh more; the candidate
 * comparisons in this file keep the entry with the smaller score.
 */
1956 #define SCORE_L2 (1) /* JIS level-2 kanji */
1957 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1958 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1959 #ifdef SHIFTJIS_CP932
1960 #define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932 remapping was applied */
1961 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* character does not exist */
1963 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* character does not exist */
1965 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* encoding specified by MIME */
1966 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Initial score given to a candidate by status_reset(). */
1968 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score bits for a first byte c2 with (c2 & 0x70) == 0x20; code_score()
 * indexes this table with c2 & 0x0f.
 */
1970 const nkf_char score_table_A0[] = {
1973 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1974 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score bits for a first byte c2 with (c2 & 0x70) == 0x70; code_score()
 * indexes this table with c2 & 0x0f.
 */
1977 const nkf_char score_table_F0[] = {
1978 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1979 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1980 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1981 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * Add score bits to a candidate encoding.
 *
 *   ptr   - detector state of the candidate.
 *   score - SCORE_* bit mask that is ORed into ptr->score.
 */
1984 void set_code_score(struct input_code *ptr, nkf_char score)
1987 ptr->score |= score;
/*
 * Remove score bits from a candidate encoding.
 *
 *   ptr   - detector state of the candidate.
 *   score - SCORE_* bit mask that is cleared in ptr->score.
 */
1991 void clr_code_score(struct input_code *ptr, nkf_char score)
1994 ptr->score &= ~score;
/*
 * Rate the character currently held in ptr->buf and add the matching
 * SCORE_* bits to the candidate via set_code_score().
 *
 *   ptr - detector state; buf[0] is read as c2 and, when
 *         UTF8_OUTPUT_ENABLE is defined, buf[1] as c1.
 *
 * Visible rules, tested in this order:
 *   - an error condition (its test is not visible here) -> SCORE_ERROR
 *   - c2 == SSO                                          -> SCORE_KANA
 *   - e2w_conv(c2, c1) yields 0 (UTF8_OUTPUT_ENABLE)     -> SCORE_NO_EXIST
 *   - (c2 & 0x70) == 0x20        -> score_table_A0[c2 & 0x0f]
 *   - (c2 & 0x70) == 0x70        -> score_table_F0[c2 & 0x0f]
 *   - (c2 & 0x70) >= 0x50        -> SCORE_L2
 */
1998 void code_score(struct input_code *ptr)
2000 nkf_char c2 = ptr->buf[0];
2001 #ifdef UTF8_OUTPUT_ENABLE
2002 nkf_char c1 = ptr->buf[1];
2005 set_code_score(ptr, SCORE_ERROR);
2006 }else if (c2 == SSO){
2007 set_code_score(ptr, SCORE_KANA);
2008 #ifdef UTF8_OUTPUT_ENABLE
2009 }else if (!e2w_conv(c2, c1)){
2010 set_code_score(ptr, SCORE_NO_EXIST);
2012 }else if ((c2 & 0x70) == 0x20){
2013 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2014 }else if ((c2 & 0x70) == 0x70){
2015 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2016 }else if ((c2 & 0x70) >= 0x50){
2017 set_code_score(ptr, SCORE_L2);
/*
 * Rule out one candidate encoding.
 *
 *   ptr - detector state of the candidate being rejected.
 *
 * If the currently selected converter (iconv) is the converter of this
 * candidate, the selection is withdrawn with set_iconv(FALSE, 0).
 * NOTE(review): the updates made to ptr itself are not visible here.
 */
2021 void status_disable(struct input_code *ptr)
2026 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * Append one byte to the candidate buffer.
 *
 *   ptr - detector state; c is stored at ptr->buf[ptr->index] and
 *         ptr->index is advanced by one.
 *   c   - byte to store.
 *
 * No bounds check is visible; callers are expected to keep the index
 * within buf.
 */
2029 void status_push_ch(struct input_code *ptr, nkf_char c)
2031 ptr->buf[ptr->index++] = c;
/*
 * NOTE(review): only the signature is visible here; presumably this
 * resets the per-character fields of ptr (stat, index) - confirm against
 * the body.
 */
2034 void status_clear(struct input_code *ptr)
/*
 * Return a candidate to its starting state.
 *
 *   ptr - detector state; its score is set back to SCORE_INIT.
 *
 * NOTE(review): any further resets done before the score assignment are
 * not visible here.
 */
2040 void status_reset(struct input_code *ptr)
2043 ptr->score = SCORE_INIT;
/*
 * Reinitialise a candidate.
 *
 *   ptr - detector state; its _file_stat field is set to 0.
 *
 * NOTE(review): other steps of the body are not visible here.
 */
2046 void status_reinit(struct input_code *ptr)
2049 ptr->_file_stat = 0;
/*
 * Common pre-check run by the per-encoding status functions.
 *
 *   ptr - detector state of the candidate.
 *   c   - current input byte.
 *
 * Acts only when c is at most DEL and the input encoding is already
 * established (estab_f).
 * NOTE(review): the action taken in that case is not visible here.
 */
2052 void status_check(struct input_code *ptr, nkf_char c)
2054 if (c <= DEL && estab_f){
/*
 * Shift_JIS detector: advance the candidate state by one input byte.
 *
 *   ptr - detector state of the Shift_JIS candidate.
 *   c   - current input byte.
 *
 * Visible behaviour:
 *   - bytes 0xa1..0xdf are buffered as SSO followed by the byte;
 *   - lead bytes 0x81..0x9f and 0xe0..0xef are buffered, as are IBM
 *     extension lead bytes (SHIFTJIS_CP932) and, when x0212_f is set,
 *     0xf0..0xfc;
 *   - a trail byte must be in 0x40..0x7e or 0x80..0xfc; the buffered pair
 *     is then converted in place with s2e_conv();
 *   - in the CP932 state a successful s2e_conv() (return 0) adds
 *     SCORE_CP932;
 *   - anything else rejects the candidate with status_disable().
 * NOTE(review): the switch on ptr->stat and the state transitions are not
 * visible here.
 */
2059 void s_status(struct input_code *ptr, nkf_char c)
2063 status_check(ptr, c);
2068 #ifdef NUMCHAR_OPTION
2069 }else if (is_unicode_capsule(c)){
2072 }else if (0xa1 <= c && c <= 0xdf){
2073 status_push_ch(ptr, SSO);
2074 status_push_ch(ptr, c);
2077 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2079 status_push_ch(ptr, c);
2080 #ifdef SHIFTJIS_CP932
2082 && is_ibmext_in_sjis(c)){
2084 status_push_ch(ptr, c);
2085 #endif /* SHIFTJIS_CP932 */
2087 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2089 status_push_ch(ptr, c);
2090 #endif /* X0212_ENABLE */
2092 status_disable(ptr);
2096 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2097 status_push_ch(ptr, c);
2098 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2102 status_disable(ptr);
2106 #ifdef SHIFTJIS_CP932
2107 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2108 status_push_ch(ptr, c);
2109 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2110 set_code_score(ptr, SCORE_CP932);
2115 #endif /* SHIFTJIS_CP932 */
2116 #ifndef X0212_ENABLE
2117 status_disable(ptr);
/*
 * EUC detector: advance the candidate state by one input byte.
 *
 *   ptr - detector state of the EUC candidate.
 *   c   - current input byte.
 *
 * Visible behaviour:
 *   - SSO or a byte in 0xa1..0xfe starts a character and is buffered;
 *   - 0x8f starts a three-byte character (X0212_ENABLE) and is buffered;
 *   - every following byte must be in 0xa1..0xfe and is buffered;
 *   - anything else rejects the candidate with status_disable().
 * NOTE(review): the switch on ptr->stat and the state transitions are not
 * visible here.
 */
2123 void e_status(struct input_code *ptr, nkf_char c)
2127 status_check(ptr, c);
2132 #ifdef NUMCHAR_OPTION
2133 }else if (is_unicode_capsule(c)){
2136 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2138 status_push_ch(ptr, c);
2140 }else if (0x8f == c){
2142 status_push_ch(ptr, c);
2143 #endif /* X0212_ENABLE */
2145 status_disable(ptr);
2149 if (0xa1 <= c && c <= 0xfe){
2150 status_push_ch(ptr, c);
2154 status_disable(ptr);
2159 if (0xa1 <= c && c <= 0xfe){
2161 status_push_ch(ptr, c);
2163 status_disable(ptr);
2165 #endif /* X0212_ENABLE */
2169 #ifdef UTF8_INPUT_ENABLE
/*
 * UTF-8 detector: advance the candidate state by one input byte.
 *
 *   ptr - detector state of the UTF-8 candidate.
 *   c   - current input byte.
 *
 * Visible behaviour:
 *   - lead bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 are buffered;
 *   - continuation bytes must be in 0x80..0xbf and are buffered;
 *   - once the index exceeds ptr->stat the buffered bytes are checked for
 *     the sequence EF BB BF (kept in the local bom) and converted in
 *     place with w2e_conv();
 *   - anything else rejects the candidate with status_disable().
 * NOTE(review): how bom is used after the conversion, and the state
 * transitions, are not visible here.
 */
2170 void w_status(struct input_code *ptr, nkf_char c)
2174 status_check(ptr, c);
2179 #ifdef NUMCHAR_OPTION
2180 }else if (is_unicode_capsule(c)){
2183 }else if (0xc0 <= c && c <= 0xdf){
2185 status_push_ch(ptr, c);
2186 }else if (0xe0 <= c && c <= 0xef){
2188 status_push_ch(ptr, c);
2189 }else if (0xf0 <= c && c <= 0xf4){
2191 status_push_ch(ptr, c);
2193 status_disable(ptr);
2198 if (0x80 <= c && c <= 0xbf){
2199 status_push_ch(ptr, c);
2200 if (ptr->index > ptr->stat){
2201 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2202 && ptr->buf[2] == 0xbf);
2203 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2204 &ptr->buf[0], &ptr->buf[1]);
2211 status_disable(ptr);
2215 if (0x80 <= c && c <= 0xbf){
2216 if (ptr->index < ptr->stat){
2217 status_push_ch(ptr, c);
2222 status_disable(ptr);
/*
 * Feed one input byte to the candidate detectors in input_code_list.
 *
 *   c - current input byte.
 *
 * Each entry that has a status_func gets the byte through
 * (p->status_func)(p, c); entries with p->stat == 0 are examined further
 * to pick a result. If a result was picked and the encoding is not yet
 * established (estab_f clear), its converter is installed with
 * set_iconv(TRUE, ...). Otherwise, for c at most DEL, the list is walked
 * again from its head.
 * NOTE(review): the use of action_flag and the body of the second walk
 * are not visible here.
 */
2229 void code_status(nkf_char c)
2231 int action_flag = 1;
2232 struct input_code *result = 0;
2233 struct input_code *p = input_code_list;
2235 if (!p->status_func) {
2239 if (!p->status_func)
2241 (p->status_func)(p, c);
2244 }else if(p->stat == 0){
2255 if (result && !estab_f){
2256 set_iconv(TRUE, result->iconv_func);
2257 }else if (c <= DEL){
2258 struct input_code *ptr = input_code_list;
/*
 * Read one character for the input chain.
 *
 *   f - input stream.
 *
 * Characters pushed back with std_ungetc() are returned first, most
 * recent first, by popping std_gc_buf.
 * NOTE(review): the guard on std_gc_ndx and the read from f are not
 * visible here.
 */
2268 nkf_char std_getc(FILE *f)
2271 return std_gc_buf[--std_gc_ndx];
/*
 * Push one character back so that std_getc() returns it next.
 *
 *   c - character to push back.
 *   f - input stream; not used by the visible lines.
 *
 * A full push-back buffer (std_gc_ndx == STD_GC_BUFSIZE) is handled
 * separately; otherwise c is stored on top of std_gc_buf.
 * NOTE(review): the return values are not visible here.
 */
2277 nkf_char std_ungetc(nkf_char c, FILE *f)
2279 if (std_gc_ndx == STD_GC_BUFSIZE){
2282 std_gc_buf[std_gc_ndx++] = c;
/*
 * NOTE(review): only the signature is visible here; presumably this
 * writes c to the output unless c is EOF - confirm against the body.
 */
2287 void std_putc(nkf_char c)
2294 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Pass input through without code conversion.
 *
 *   f - input stream.
 *
 * Sets up the filter chains with module_connection() and then reads
 * characters through i_getc until EOF.
 * NOTE(review): the per-character action and the return value are not
 * visible here.
 */
2295 nkf_char noconvert(FILE *f)
2300 module_connection();
2301 while ((c = (*i_getc)(f)) != EOF)
/*
 * Wire up the output filter chain and the input reader chain, then choose
 * the input converter.
 *
 * Output side: oconv starts as output_conv. Each enabled stage saves the
 * current oconv in its own o_* slot and installs itself as oconv, so the
 * stage connected last runs first.
 * Input side: i_ungetc starts as std_ungetc. Each enabled stage saves the
 * current i_getc / i_ungetc in its own slots and installs its own pair.
 * Converter: an explicit input_f forces the matching converter with
 * set_iconv(-TRUE, ...); otherwise e_iconv is set with FALSE.
 * NOTE(review): most of the enabling conditions are not visible here.
 */
2308 void module_connection(void)
2310 oconv = output_conv;
2313 /* replace continuation module, from output side */
2315 /* output redirection */
2317 if (noout_f || guess_f){
2324 if (mimeout_f == TRUE) {
2325 o_base64conv = oconv; oconv = base64_conv;
2327 /* base64_count = 0; */
2331 o_crconv = oconv; oconv = cr_conv;
2334 o_rot_conv = oconv; oconv = rot_conv;
2337 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2340 o_hira_conv = oconv; oconv = hira_conv;
2343 o_fconv = oconv; oconv = fold_conv;
2346 if (alpha_f || x0201_f) {
2347 o_zconv = oconv; oconv = z_conv;
2351 i_ungetc = std_ungetc;
2352 /* input redirection */
2355 i_cgetc = i_getc; i_getc = cap_getc;
2356 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2359 i_ugetc = i_getc; i_getc = url_getc;
2360 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2363 #ifdef NUMCHAR_OPTION
2365 i_ngetc = i_getc; i_getc = numchar_getc;
2366 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2369 #ifdef UNICODE_NORMALIZATION
2370 if (nfc_f && input_f == UTF8_INPUT){
2371 i_nfc_getc = i_getc; i_getc = nfc_getc;
2372 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2375 if (mime_f && mimebuf_f==FIXED_MIME) {
2376 i_mgetc = i_getc; i_getc = mime_getc;
2377 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2380 i_bgetc = i_getc; i_getc = broken_getc;
2381 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* An explicitly requested input encoding forces its converter. */
2383 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2384 set_iconv(-TRUE, e_iconv);
2385 } else if (input_f == SJIS_INPUT) {
2386 set_iconv(-TRUE, s_iconv);
2387 #ifdef UTF8_INPUT_ENABLE
2388 } else if (input_f == UTF8_INPUT) {
2389 set_iconv(-TRUE, w_iconv);
2390 } else if (input_f == UTF16_INPUT) {
2391 set_iconv(-TRUE, w_iconv16);
2392 } else if (input_f == UTF32_INPUT) {
2393 set_iconv(-TRUE, w_iconv32);
2396 set_iconv(FALSE, e_iconv);
2400 struct input_code *p = input_code_list;
2408 * Check and Ignore BOM
/*
 * Look for a byte order mark at the start of the input.
 *
 *   f - input stream, read through i_getc.
 *
 * Byte sequences recognised by the visible comparisons:
 *   00 00 FE FF -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, ENDIAN_2143
 *   EF BB BF    -> w_iconv
 *   FE FF 00 00 -> w_iconv32, ENDIAN_3412
 *   FE FF       -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, ENDIAN_LITTLE
 * The converter is proposed with set_iconv(TRUE, ...), and input_endian
 * is only changed when that converter is the active one afterwards.
 * Bytes that do not complete a mark are pushed back through i_ungetc in
 * reverse reading order so the stream is unchanged.
 * NOTE(review): the case labels, the guards around set_iconv() and the
 * return statements are not visible here; the first byte of each group is
 * taken from the value pushed back at the end of that group.
 */
2410 void check_bom(FILE *f)
2413 switch(c2 = (*i_getc)(f)){
/* Group ending in ungetc(0x00): UTF-32 marks starting with 00 00. */
2415 if((c2 = (*i_getc)(f)) == 0x00){
2416 if((c2 = (*i_getc)(f)) == 0xFE){
2417 if((c2 = (*i_getc)(f)) == 0xFF){
2419 set_iconv(TRUE, w_iconv32);
2421 if (iconv == w_iconv32) {
2422 input_endian = ENDIAN_BIG;
2425 (*i_ungetc)(0xFF,f);
2426 }else (*i_ungetc)(c2,f);
2427 (*i_ungetc)(0xFE,f);
2428 }else if(c2 == 0xFF){
2429 if((c2 = (*i_getc)(f)) == 0xFE){
2431 set_iconv(TRUE, w_iconv32);
2433 if (iconv == w_iconv32) {
2434 input_endian = ENDIAN_2143;
2437 (*i_ungetc)(0xFF,f);
2438 }else (*i_ungetc)(c2,f);
2439 (*i_ungetc)(0xFF,f);
2440 }else (*i_ungetc)(c2,f);
2441 (*i_ungetc)(0x00,f);
2442 }else (*i_ungetc)(c2,f);
2443 (*i_ungetc)(0x00,f);
/* Group ending in ungetc(0xEF): UTF-8 mark EF BB BF. */
2446 if((c2 = (*i_getc)(f)) == 0xBB){
2447 if((c2 = (*i_getc)(f)) == 0xBF){
2449 set_iconv(TRUE, w_iconv);
2451 if (iconv == w_iconv) {
2454 (*i_ungetc)(0xBF,f);
2455 }else (*i_ungetc)(c2,f);
2456 (*i_ungetc)(0xBB,f);
2457 }else (*i_ungetc)(c2,f);
2458 (*i_ungetc)(0xEF,f);
/* Group ending in ungetc(0xFE): FE FF, optionally followed by 00 00. */
2461 if((c2 = (*i_getc)(f)) == 0xFF){
2462 if((c2 = (*i_getc)(f)) == 0x00){
2463 if((c2 = (*i_getc)(f)) == 0x00){
2465 set_iconv(TRUE, w_iconv32);
2467 if (iconv == w_iconv32) {
2468 input_endian = ENDIAN_3412;
2471 (*i_ungetc)(0x00,f);
2472 }else (*i_ungetc)(c2,f);
2473 (*i_ungetc)(0x00,f);
2474 }else (*i_ungetc)(c2,f);
2476 set_iconv(TRUE, w_iconv16);
2478 if (iconv == w_iconv16) {
2479 input_endian = ENDIAN_BIG;
2482 (*i_ungetc)(0xFF,f);
2483 }else (*i_ungetc)(c2,f);
2484 (*i_ungetc)(0xFE,f);
/* Group ending in ungetc(0xFF): FF FE, optionally followed by 00 00. */
2487 if((c2 = (*i_getc)(f)) == 0xFE){
2488 if((c2 = (*i_getc)(f)) == 0x00){
2489 if((c2 = (*i_getc)(f)) == 0x00){
2491 set_iconv(TRUE, w_iconv32);
2493 if (iconv == w_iconv32) {
2494 input_endian = ENDIAN_LITTLE;
2497 (*i_ungetc)(0x00,f);
2498 }else (*i_ungetc)(c2,f);
2499 (*i_ungetc)(0x00,f);
2500 }else (*i_ungetc)(c2,f);
2502 set_iconv(TRUE, w_iconv16);
2504 if (iconv == w_iconv16) {
2505 input_endian = ENDIAN_LITTLE;
2508 (*i_ungetc)(0xFE,f);
2509 }else (*i_ungetc)(c2,f);
2510 (*i_ungetc)(0xFF,f);
2519 Conversion main loop. Code detection only.
/*
 * Conversion main loop.
 *
 *   f - input stream, read through i_getc.
 *
 * Connects the filter chains with module_connection(), then reads
 * characters until EOF. Visible stages inside the loop:
 *   - 8-bit input before the encoding is established is handed to
 *     h_conv(), which buffers bytes until detection settles;
 *   - UTF-16 and UTF-32 input is assembled from 2 or 4 bytes according
 *     to input_endian;
 *   - ESC sequences ("ESC $ ...", "ESC ( ...") switch input_mode and
 *     shift_mode; SI, SO and ESC are only honoured when no 8-bit
 *     character was seen (is_8bit clear) or inside MIME decoding;
 *   - "=?" starts MIME decoding when mime_f is set;
 *   - complete characters go to the active converter (*iconv)(c2, c1, c0)
 *     or directly to (*oconv).
 * After the loop the converter is flushed with (*iconv)(EOF, 0, 0) and,
 * unless is_inputcode_set, the candidate with the smallest score names
 * the input encoding through set_input_codename().
 * The local macros NEXT, SEND and LAST stand for continue, an empty
 * statement and break.
 * NOTE(review): many statements of this body, including the return
 * value, are not visible here.
 */
2522 nkf_char kanji_convert(FILE *f)
2524 nkf_char c3, c2=0, c1, c0=0;
2525 int is_8bit = FALSE;
2527 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2528 #ifdef UTF8_INPUT_ENABLE
2529 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2536 output_mode = ASCII;
2539 #define NEXT continue /* no output, get next */
2540 #define SEND ; /* output c1 and c2, get next */
2541 #define LAST break /* end of loop, go closing */
2543 module_connection();
2546 while ((c1 = (*i_getc)(f)) != EOF) {
2547 #ifdef INPUT_CODE_FIX
2553 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2554 /* in case of 8th bit is on */
2555 if (!estab_f&&!mime_decode_mode) {
2556 /* in case of not established yet */
2557 /* It is still ambiguous */
2558 if (h_conv(f, c2, c1)==EOF)
2564 /* in case of already established */
2566 /* ignore bogus code and not CP5022x UCD */
2574 /* second byte, 7 bit code */
2575 /* it might be kanji shifted */
2576 if ((c1 == DEL) || (c1 <= SPACE)) {
2577 /* ignore bogus first code */
2584 #ifdef UTF8_INPUT_ENABLE
2585 if (iconv == w_iconv16) {
2586 if (input_endian == ENDIAN_BIG) {
2588 if ((c1 = (*i_getc)(f)) != EOF) {
2589 if (0xD8 <= c2 && c2 <= 0xDB) {
2590 if ((c0 = (*i_getc)(f)) != EOF) {
2592 if ((c3 = (*i_getc)(f)) != EOF) {
2599 if ((c2 = (*i_getc)(f)) != EOF) {
2600 if (0xD8 <= c2 && c2 <= 0xDB) {
2601 if ((c3 = (*i_getc)(f)) != EOF) {
2602 if ((c0 = (*i_getc)(f)) != EOF) {
2611 } else if(iconv == w_iconv32){
2613 if((c2 = (*i_getc)(f)) != EOF &&
2614 (c1 = (*i_getc)(f)) != EOF &&
2615 (c0 = (*i_getc)(f)) != EOF){
2616 switch(input_endian){
2618 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2621 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2624 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2627 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2637 #ifdef NUMCHAR_OPTION
2638 if (is_unicode_capsule(c1)){
2642 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2644 if (!estab_f && !iso8859_f) {
2645 /* not established yet */
2648 } else { /* estab_f==TRUE */
2653 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2654 /* SJIS X0201 Case... */
2655 if(iso2022jp_f && x0201_f==NO_X0201) {
2656 (*oconv)(GETA1, GETA2);
2663 } else if (c1==SSO && iconv != s_iconv) {
2664 /* EUC X0201 Case */
2665 c1 = (*i_getc)(f); /* skip SSO */
2667 if (SSP<=c1 && c1<0xe0) {
2668 if(iso2022jp_f && x0201_f==NO_X0201) {
2669 (*oconv)(GETA1, GETA2);
2676 } else { /* bogus code, skip SSO and one byte */
2680 /* already established */
2685 } else if ((c1 > SPACE) && (c1 != DEL)) {
2686 /* in case of Roman characters */
2688 /* output 1 shifted byte */
2692 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2693 /* output 1 shifted byte */
2694 if(iso2022jp_f && x0201_f==NO_X0201) {
2695 (*oconv)(GETA1, GETA2);
2702 /* look like bogus code */
2705 } else if (input_mode == X0208 || input_mode == X0212 ||
2706 input_mode == X0213_1 || input_mode == X0213_2) {
2707 /* in case of Kanji shifted */
2710 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2711 /* Check MIME code */
2712 if ((c1 = (*i_getc)(f)) == EOF) {
2715 } else if (c1 == '?') {
2716 /* =? is mime conversion start sequence */
2717 if(mime_f == STRICT_MIME) {
2718 /* check in real detail */
2719 if (mime_begin_strict(f) == EOF)
2723 } else if (mime_begin(f) == EOF)
2733 /* normal ASCII code */
2736 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
\r
2739 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
\r
2742 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
\r
2743 if ((c1 = (*i_getc)(f)) == EOF) {
2744 /* (*oconv)(0, ESC); don't send bogus code */
2746 } else if (c1 == '$') {
2747 if ((c1 = (*i_getc)(f)) == EOF) {
2749 (*oconv)(0, ESC); don't send bogus code
2750 (*oconv)(0, '$'); */
2752 } else if (c1 == '@'|| c1 == 'B') {
2753 /* This is kanji introduction */
2756 set_input_codename("ISO-2022-JP");
2758 debug(input_codename);
2761 } else if (c1 == '(') {
2762 if ((c1 = (*i_getc)(f)) == EOF) {
2763 /* don't send bogus code
2769 } else if (c1 == '@'|| c1 == 'B') {
2770 /* This is kanji introduction */
2775 } else if (c1 == 'D'){
2779 #endif /* X0212_ENABLE */
2780 } else if (c1 == (X0213_1&0x7F)){
2781 input_mode = X0213_1;
2784 } else if (c1 == (X0213_2&0x7F)){
2785 input_mode = X0213_2;
2789 /* could be some special code */
2796 } else if (broken_f&0x2) {
2797 /* accept any ESC-(-x as broken code ... */
2807 } else if (c1 == '(') {
2808 if ((c1 = (*i_getc)(f)) == EOF) {
2809 /* don't send bogus code
2811 (*oconv)(0, '('); */
2815 /* This is X0201 kana introduction */
2816 input_mode = X0201; shift_mode = X0201;
2818 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2819 /* This is X0208 kanji introduction */
2820 input_mode = ASCII; shift_mode = FALSE;
2822 } else if (broken_f&0x2) {
2823 input_mode = ASCII; shift_mode = FALSE;
2828 /* maintain various input_mode here */
2832 } else if ( c1 == 'N' || c1 == 'n' ){
2834 c3 = (*i_getc)(f); /* skip SS2 */
2835 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2850 } else if (c1 == ESC && iconv == s_iconv) {
2851 /* ESC in Shift_JIS */
2852 if ((c1 = (*i_getc)(f)) == EOF) {
2853 /* (*oconv)(0, ESC); don't send bogus code */
2855 } else if (c1 == '$') {
2857 if ((c1 = (*i_getc)(f)) == EOF) {
2859 (*oconv)(0, ESC); don't send bogus code
2860 (*oconv)(0, '$'); */
2863 if (('E' <= c1 && c1 <= 'G') ||
2864 ('O' <= c1 && c1 <= 'Q')) {
2872 static const int jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2873 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SPACE + 0xE000 + CLASS_UNICODE;
2874 while ((c1 = (*i_getc)(f)) != EOF) {
2875 if (SPACE <= c1 && c1 <= 'z') {
2876 (*oconv)(0, c1 + c0);
2877 } else break; /* c1 == SO */
2881 if (c1 == EOF) LAST;
2888 } else if ((c1 == NL || c1 == CR) && broken_f&4) {
2889 input_mode = ASCII; set_iconv(FALSE, 0);
2891 } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
2892 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2900 } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
2901 if ((c1=(*i_getc)(f))!=EOF) {
2905 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2917 } else if (c1 == DEL && input_mode == X0208 ) {
2927 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2930 if ((c0 = (*i_getc)(f)) != EOF) {
2933 if ((c3 = (*i_getc)(f)) != EOF) {
2935 (*iconv)(c2, c1, c0|c3);
2940 /* 3 bytes EUC or UTF-8 */
2941 if ((c0 = (*i_getc)(f)) != EOF) {
2943 (*iconv)(c2, c1, c0);
2951 0x7F <= c2 && c2 <= 0x92 &&
2952 0x21 <= c1 && c1 <= 0x7E) {
2954 if(c1 == 0x7F) return 0;
2955 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2958 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2962 (*oconv)(PREFIX_EUCG3 | c2, c1);
2964 #endif /* X0212_ENABLE */
2966 (*oconv)(PREFIX_EUCG3 | c2, c1);
2969 (*oconv)(input_mode, c1); /* other special case */
2975 /* goto next_word */
2979 (*iconv)(EOF, 0, 0);
2980 if (!is_inputcode_set)
2983 struct input_code *p = input_code_list;
2984 struct input_code *result = p;
2986 if (p->score < result->score) result = p;
2989 set_input_codename(result->name);
/*
 * Hold-and-decide conversion used while the input encoding is still
 * ambiguous.
 *
 *   f  - input stream, read through i_getc.
 *   c2 - first byte of the pending 8-bit pair.
 *   c1 - second byte of the pending pair.
 *
 * Phase 1: bytes are read and stored with push_hold_buf() until the
 * buffer reports EOF (full) or the encoding becomes established. If it
 * is still open afterwards, the candidate with the smallest score is
 * installed with set_iconv(FALSE, ...).
 * Phase 2: the held bytes are replayed through the chosen converter.
 * Bytes 0xa1..0xdf under s_iconv are sent as X0201; when the converter
 * asks for more bytes they are taken from the hold buffer first and from
 * the stream once it is exhausted.
 * NOTE(review): the return type, the return value and the meaning of the
 * converter return codes are not visible here.
 */
2996 h_conv(FILE *f, nkf_char c2, nkf_char c1)
2998 nkf_char ret, c3, c0;
3002 /** it must NOT be in the kanji shifted sequence */
3003 /** it must NOT be written in JIS7 */
3004 /** and it must be after 2 byte 8bit code */
3010 while ((c1 = (*i_getc)(f)) != EOF) {
3016 if (push_hold_buf(c1) == EOF || estab_f){
3022 struct input_code *p = input_code_list;
3023 struct input_code *result = p;
3028 if (p->score < result->score){
3033 set_iconv(FALSE, result->iconv_func);
3038 ** 1) EOF is detected, or
3039 ** 2) Code is established, or
3040 ** 3) Buffer is FULL (but last word is pushed)
3042 ** in 1) and 3) cases, we continue to use
3043 ** Kanji codes by oconv and leave estab_f unchanged.
3048 while (hold_index < hold_count){
3049 c2 = hold_buf[hold_index++];
3051 #ifdef NUMCHAR_OPTION
3052 || is_unicode_capsule(c2)
3057 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3058 (*iconv)(X0201, c2, 0);
3061 if (hold_index < hold_count){
3062 c1 = hold_buf[hold_index++];
3072 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3075 if (hold_index < hold_count){
3076 c0 = hold_buf[hold_index++];
3077 } else if ((c0 = (*i_getc)(f)) == EOF) {
3083 if (hold_index < hold_count){
3084 c3 = hold_buf[hold_index++];
3085 } else if ((c3 = (*i_getc)(f)) == EOF) {
3090 (*iconv)(c2, c1, c0|c3);
3095 /* 3 bytes EUC or UTF-8 */
3096 if (hold_index < hold_count){
3097 c0 = hold_buf[hold_index++];
3098 } else if ((c0 = (*i_getc)(f)) == EOF) {
3104 (*iconv)(c2, c1, c0);
3107 if (c0 == EOF) break;
/*
 * Append one byte to the hold buffer used during encoding detection.
 *
 *   c2 - byte to store; it is narrowed to unsigned char.
 *
 * Returns EOF when the store fills the buffer (hold_count reaches
 * HOLD_SIZE*2), otherwise the new hold_count. A buffer that is already
 * full on entry is handled by the first test.
 * NOTE(review): the action of that first test is not visible here.
 */
3112 nkf_char push_hold_buf(nkf_char c2)
3114 if (hold_count >= HOLD_SIZE*2)
3116 hold_buf[hold_count++] = (unsigned char)c2;
3117 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair to the internal EUC-style code.
 *
 *   c2, c1 - Shift_JIS lead and trail byte.
 *   p2, p1 - output locations for the converted pair.
 *
 * Visible steps:
 *   - SHIFTJIS_CP932: IBM extension lead bytes are remapped through the
 *     shiftjis_cp932 table unless cp932inv_f is set; with the inverse
 *     option the cp932inv table is consulted for lead bytes in
 *     CP932INV_TABLE_BEGIN..CP932INV_TABLE_END;
 *   - without x0213_f, IBM extension lead bytes are looked up in
 *     shiftjis_x0212 and tagged with PREFIX_EUCG3;
 *   - with x0213_f, lead bytes from 0xF0 are mapped to PREFIX_EUCG3 rows,
 *     using shift_jisx0213_s1a3_table for 0xF0..0xF4;
 *   - otherwise the generic arithmetic applies: the row is doubled and
 *     offset by SJ0162 or SJ6394, incremented when c1 is above 0x9E, and
 *     c1 is rebased by SPACE or 0x1F.
 * Callers in this file treat a return value of 0 as success.
 * NOTE(review): the stores through p2 and p1 and the return statements
 * are not visible here.
 */
3120 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3122 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3125 static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3126 #ifdef SHIFTJIS_CP932
3127 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3129 extern const unsigned short shiftjis_cp932[3][189];
3131 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3138 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3140 extern const unsigned short cp932inv[2][189];
3142 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3148 #endif /* SHIFTJIS_CP932 */
3150 if (!x0213_f && is_ibmext_in_sjis(c2)){
3152 extern const unsigned short shiftjis_x0212[3][189];
3154 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3157 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3170 if(x0213_f && c2 >= 0xF0){
3171 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3172 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3173 }else{ /* 78<=k<=94 */
3174 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3175 if (0x9E < c1) c2++;
3178 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3179 if (0x9E < c1) c2++;
3182 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3189 c2 = x0212_unshift(c2);
/*
 * Input converter for Shift_JIS.
 *
 *   c2, c1 - lead and trail byte of the character.
 *   c0     - not used by the visible lines.
 *
 * Visible cases:
 *   - c2 equal to EOF, 0 or below SPACE is treated separately;
 *   - without x0213_f, lead bytes 0xF0..0xF9 with a trail byte in
 *     0x40..0xFC are mapped to code points starting at 0xE000, 188 per
 *     lead byte, tagged with CLASS_UNICODE; a trail byte of 0x7F returns
 *     0 without producing output;
 *   - all other pairs go through s2e_conv(), and a nonzero result of
 *     that call is returned to the caller unchanged.
 * NOTE(review): the call that forwards the result to oconv and the final
 * return are not visible here.
 */
3196 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3200 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3202 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3204 if(c1 == 0x7F) return 0;
3205 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3208 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3209 if (ret) return ret;
3215 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3220 }else if (c2 == 0x8f){
3224 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3225 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3226 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3229 c2 = (c2 << 8) | (c1 & 0x7f);
3231 #ifdef SHIFTJIS_CP932
3234 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3235 s2e_conv(s2, s1, &c2, &c1);
3242 #endif /* SHIFTJIS_CP932 */
3244 #endif /* X0212_ENABLE */
3245 } else if (c2 == SSO){
3248 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3251 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3252 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3253 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3258 #ifdef SHIFTJIS_CP932
3259 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3261 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3262 s2e_conv(s2, s1, &c2, &c1);
3269 #endif /* SHIFTJIS_CP932 */
3276 #ifdef UTF8_INPUT_ENABLE
3277 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3284 }else if (0xc0 <= c2 && c2 <= 0xef) {
3285 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3286 #ifdef NUMCHAR_OPTION
3289 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3297 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3300 static const int w_iconv_utf8_1st_byte[] =
3302 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3303 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3304 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3305 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3307 if (c2 < 0 || 0xff < c2) {
3308 }else if (c2 == 0) { /* 0 : 1 byte*/
3310 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3313 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3315 if (c1 < 0x80 || 0xBF < c1) return 0;
3318 if (c0 == 0) return -1;
3319 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3324 if (c0 == 0) return -1;
3325 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3329 if (c0 == 0) return -1;
3330 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3334 if (c0 == 0) return -2;
3335 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3339 if (c0 == 0) return -2;
3340 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3344 if (c0 == 0) return -2;
3345 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3353 if (c2 == 0 || c2 == EOF){
3354 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3355 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3358 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3367 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv: encode the Unicode value val as UTF-8.
 *
 *   *p2  receives the lead byte,
 *   *p1  receives the second byte,
 *   *p0  receives the remaining byte(s).  For a four-byte sequence the
 *        third and fourth bytes are packed into one value as
 *        (third << 8) | fourth, i.e. 0x8080 plus the two 6-bit payloads.
 *
 * The four-byte branch used to build the lead byte as 0xe0 | (val >> 16),
 * which is not a valid four-byte lead.  It now uses 0xf0 | (val >> 18),
 * matching the encoder in w_oconv and the decoder ww16_conv, which reads
 * the lead byte's payload as << 18.
 */
3368 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3375 }else if (val < 0x800){
/* two-byte sequence: 110xxxxx 10xxxxxx */
3376 *p2 = 0xc0 | (val >> 6);
3377 *p1 = 0x80 | (val & 0x3f);
3379 } else if (val <= NKF_INT32_C(0xFFFF)) {
/* three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
3380 *p2 = 0xe0 | (val >> 12);
3381 *p1 = 0x80 | ((val >> 6) & 0x3f);
3382 *p0 = 0x80 | (val & 0x3f);
3383 } else if (val <= NKF_INT32_C(0x10FFFF)) {
/* four-byte sequence: 11110xxx 10xxxxxx, then bytes 3 and 4 packed in *p0 */
3384 *p2 = 0xf0 | (val >> 18);
3385 *p1 = 0x80 | ((val >> 12) & 0x3f);
3386 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3395 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv: assemble a Unicode value from UTF-8 bytes.
 *
 * c2 is the lead byte and c1 the second byte.  For a lead byte >= 0xf0,
 * c0 carries the third and fourth bytes packed into one value, with the
 * third byte's 6 payload bits taken from bits 8-13 of c0.
 * NOTE(review): the lines that merge the final low 6 bits and return val
 * are not visible in this listing -- confirm against the full source.
 */
3396 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3401 } else if (c2 >= 0xf0){
3402 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3403 val = (c2 & 0x0f) << 18;
3404 val |= (c1 & 0x3f) << 12;
3405 val |= (c0 & 0x3f00) >> 2;
3407 }else if (c2 >= 0xe0){
/* three-byte lead: 4 payload bits, then 6 from the second byte */
3408 val = (c2 & 0x0f) << 12;
3409 val |= (c1 & 0x3f) << 6;
3411 }else if (c2 >= 0xc0){
/* two-byte lead: 5 payload bits */
3412 val = (c2 & 0x1f) << 6;
3420 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3422 nkf_char c2, c1, c0;
3429 w16w_conv(val, &c2, &c1, &c0);
3430 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3431 #ifdef NUMCHAR_OPTION
3434 *p1 = CLASS_UNICODE | val;
3443 #ifdef UTF8_INPUT_ENABLE
3444 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3447 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3450 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3451 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3453 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3455 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3460 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3461 if (ret) return ret;
3466 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3470 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3471 } else if (is_unicode_bmp(c1)) {
3472 ret = w16e_conv(c1, &c2, &c1);
3475 c1 = CLASS_UNICODE | c1;
3477 if (ret) return ret;
3482 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3485 extern const unsigned short *const utf8_to_euc_2bytes[];
3486 extern const unsigned short *const utf8_to_euc_2bytes_ms[];
3487 extern const unsigned short *const utf8_to_euc_2bytes_932[];
3488 extern const unsigned short *const *const utf8_to_euc_3bytes[];
3489 extern const unsigned short *const *const utf8_to_euc_3bytes_ms[];
3490 extern const unsigned short *const *const utf8_to_euc_3bytes_932[];
3492 const unsigned short *const *pp;
3493 const unsigned short *const *const *ppp;
3494 static const int no_best_fit_chars_table_C2[] =
3495 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3496 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3497 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3498 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3499 static const int no_best_fit_chars_table_C2_ms[] =
3500 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3501 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3502 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3503 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3504 static const int no_best_fit_chars_table_932_C2[] =
3505 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3506 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3507 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3508 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3509 static const int no_best_fit_chars_table_932_C3[] =
3510 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3511 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3512 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3513 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3519 }else if(c2 < 0xe0){
3520 if(no_best_fit_chars_f){
3521 if(ms_ucs_map_f == UCS_MAP_CP932){
3524 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3527 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3530 }else if(!cp932inv_f){
3533 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3536 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3539 }else if(ms_ucs_map_f == UCS_MAP_MS){
3540 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3544 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3545 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3547 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3548 }else if(c0 < 0xF0){
3549 if(no_best_fit_chars_f){
3550 if(ms_ucs_map_f == UCS_MAP_CP932){
3551 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3552 }else if(ms_ucs_map_f == UCS_MAP_MS){
3557 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3560 if(c0 == 0x92) return 1;
3565 if(c1 == 0x80 || c0 == 0x9C) return 1;
3573 if(c0 == 0x95) return 1;
3576 if(c0 == 0xA5) return 1;
3583 if(c0 == 0x8D) return 1;
3586 if(c0 == 0x9E && !cp932inv_f) return 1;
3589 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3597 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3598 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3600 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3602 #ifdef SHIFTJIS_CP932
3603 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3605 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3606 s2e_conv(s2, s1, p2, p1);
3615 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3618 const unsigned short *p;
3621 if (pp == 0) return 1;
3624 if (c1 < 0 || psize <= c1) return 1;
3626 if (p == 0) return 1;
3629 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3631 if (val == 0) return 1;
3632 if (no_cp932ext_f && (
3633 (val>>8) == 0x2D || /* NEC special characters */
3634 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3642 if (c2 == SO) c2 = X0201;
/*
 * nkf_each_char_to_hex: feed hexadecimal digits of c to the callback f.
 *
 * Each digit is looked up in an upper-case digit table and passed as
 * f(0, digit), one call per 4-bit nibble selected by `shift`.
 * NOTE(review): the loop that steps `shift` (and any leading-zero
 * suppression) is not visible in this listing -- confirm.
 */
3649 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3651 const char *hex = "0123456789ABCDEF";
3657 (*f)(0, hex[(c>>shift)&0xF]);
3667 void encode_fallback_html(nkf_char c)
3672 if(c >= NKF_INT32_C(1000000))
3673 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3674 if(c >= NKF_INT32_C(100000))
3675 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3677 (*oconv)(0, 0x30+(c/10000 )%10);
3679 (*oconv)(0, 0x30+(c/1000 )%10);
3681 (*oconv)(0, 0x30+(c/100 )%10);
3683 (*oconv)(0, 0x30+(c/10 )%10);
3685 (*oconv)(0, 0x30+ c %10);
3690 void encode_fallback_xml(nkf_char c)
3695 nkf_each_char_to_hex(oconv, c);
3700 void encode_fallback_java(nkf_char c)
3702 const char *hex = "0123456789ABCDEF";
3705 if(!is_unicode_bmp(c)){
3709 (*oconv)(0, hex[(c>>20)&0xF]);
3710 (*oconv)(0, hex[(c>>16)&0xF]);
3714 (*oconv)(0, hex[(c>>12)&0xF]);
3715 (*oconv)(0, hex[(c>> 8)&0xF]);
3716 (*oconv)(0, hex[(c>> 4)&0xF]);
3717 (*oconv)(0, hex[ c &0xF]);
3721 void encode_fallback_perl(nkf_char c)
3726 nkf_each_char_to_hex(oconv, c);
3731 void encode_fallback_subchar(nkf_char c)
3733 c = unicode_subchar;
3734 (*oconv)((c>>8)&0xFF, c&0xFF);
3739 #ifdef UTF8_OUTPUT_ENABLE
3740 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3743 extern const unsigned short euc_to_utf8_1byte[];
3744 extern const unsigned short *const euc_to_utf8_2bytes[];
3745 extern const unsigned short *const euc_to_utf8_2bytes_ms[];
3746 extern const unsigned short *const x0212_to_utf8_2bytes[];
3748 const unsigned short *p;
3751 p = euc_to_utf8_1byte;
3753 } else if (is_eucg3(c2)){
3754 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3757 c2 = (c2&0x7f) - 0x21;
3758 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3759 p = x0212_to_utf8_2bytes[c2];
3765 c2 = (c2&0x7f) - 0x21;
3766 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3767 p = ms_ucs_map_f != UCS_MAP_ASCII ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
3772 c1 = (c1 & 0x7f) - 0x21;
3773 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3778 void w_oconv(nkf_char c2, nkf_char c1)
3784 output_bom_f = FALSE;
3795 #ifdef NUMCHAR_OPTION
3796 if (c2 == 0 && is_unicode_capsule(c1)){
3797 val = c1 & VALUE_MASK;
3800 }else if (val < 0x800){
3801 (*o_putc)(0xC0 | (val >> 6));
3802 (*o_putc)(0x80 | (val & 0x3f));
3803 } else if (val <= NKF_INT32_C(0xFFFF)) {
3804 (*o_putc)(0xE0 | (val >> 12));
3805 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3806 (*o_putc)(0x80 | (val & 0x3f));
3807 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3808 (*o_putc)(0xF0 | ( val>>18));
3809 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3810 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3811 (*o_putc)(0x80 | ( val & 0x3f));
3818 output_mode = ASCII;
3820 } else if (c2 == ISO8859_1) {
3821 output_mode = ISO8859_1;
3822 (*o_putc)(c1 | 0x080);
3825 val = e2w_conv(c2, c1);
3827 w16w_conv(val, &c2, &c1, &c0);
3831 if (c0) (*o_putc)(c0);
3837 void w_oconv16(nkf_char c2, nkf_char c1)
3840 output_bom_f = FALSE;
3841 if (output_endian == ENDIAN_LITTLE){
3842 (*o_putc)((unsigned char)'\377');
3846 (*o_putc)((unsigned char)'\377');
3855 if (c2 == ISO8859_1) {
3858 #ifdef NUMCHAR_OPTION
3859 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3860 if (is_unicode_bmp(c1)) {
3861 c2 = (c1 >> 8) & 0xff;
3865 if (c1 <= UNICODE_MAX) {
3866 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3867 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3868 if (output_endian == ENDIAN_LITTLE){
3869 (*o_putc)(c2 & 0xff);
3870 (*o_putc)((c2 >> 8) & 0xff);
3871 (*o_putc)(c1 & 0xff);
3872 (*o_putc)((c1 >> 8) & 0xff);
3874 (*o_putc)((c2 >> 8) & 0xff);
3875 (*o_putc)(c2 & 0xff);
3876 (*o_putc)((c1 >> 8) & 0xff);
3877 (*o_putc)(c1 & 0xff);
3884 nkf_char val = e2w_conv(c2, c1);
3885 c2 = (val >> 8) & 0xff;
3889 if (output_endian == ENDIAN_LITTLE){
3898 void w_oconv32(nkf_char c2, nkf_char c1)
3901 output_bom_f = FALSE;
3902 if (output_endian == ENDIAN_LITTLE){
3903 (*o_putc)((unsigned char)'\377');
3911 (*o_putc)((unsigned char)'\377');
3920 if (c2 == ISO8859_1) {
3922 #ifdef NUMCHAR_OPTION
3923 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3927 c1 = e2w_conv(c2, c1);
3930 if (output_endian == ENDIAN_LITTLE){
3931 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3932 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3933 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3937 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3938 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3939 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3944 void e_oconv(nkf_char c2, nkf_char c1)
3946 #ifdef NUMCHAR_OPTION
3947 if (c2 == 0 && is_unicode_capsule(c1)){
3948 w16e_conv(c1, &c2, &c1);
3949 if (c2 == 0 && is_unicode_capsule(c1)){
3950 c2 = c1 & VALUE_MASK;
3951 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
3955 c2 += c2 < 10 ? 0x75 : 0x8FEB;
3956 c1 = 0x21 + c1 % 94;
3959 (*o_putc)((c2 & 0x7f) | 0x080);
3960 (*o_putc)(c1 | 0x080);
3962 (*o_putc)((c2 & 0x7f) | 0x080);
3963 (*o_putc)(c1 | 0x080);
3967 if (encode_fallback) (*encode_fallback)(c1);
3976 } else if (c2 == 0) {
3977 output_mode = ASCII;
3979 } else if (c2 == X0201) {
3980 output_mode = JAPANESE_EUC;
3981 (*o_putc)(SSO); (*o_putc)(c1|0x80);
3982 } else if (c2 == ISO8859_1) {
3983 output_mode = ISO8859_1;
3984 (*o_putc)(c1 | 0x080);
3986 } else if (is_eucg3(c2)){
3987 output_mode = JAPANESE_EUC;
3988 #ifdef SHIFTJIS_CP932
3991 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3992 s2e_conv(s2, s1, &c2, &c1);
3997 output_mode = ASCII;
3999 }else if (is_eucg3(c2)){
4002 (*o_putc)((c2 & 0x7f) | 0x080);
4003 (*o_putc)(c1 | 0x080);
4006 (*o_putc)((c2 & 0x7f) | 0x080);
4007 (*o_putc)(c1 | 0x080);
4011 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4012 set_iconv(FALSE, 0);
4013 return; /* too late to rescue this char */
4015 output_mode = JAPANESE_EUC;
4016 (*o_putc)(c2 | 0x080);
4017 (*o_putc)(c1 | 0x080);
4022 nkf_char x0212_shift(nkf_char c)
4027 if (0x75 <= c && c <= 0x7f){
4028 ret = c + (0x109 - 0x75);
4031 if (0x75 <= c && c <= 0x7f){
4032 ret = c + (0x113 - 0x75);
4039 nkf_char x0212_unshift(nkf_char c)
4042 if (0x7f <= c && c <= 0x88){
4043 ret = c + (0x75 - 0x7f);
4044 }else if (0x89 <= c && c <= 0x92){
4045 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4049 #endif /* X0212_ENABLE */
4051 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4057 if((0x21 <= ndx && ndx <= 0x2F)){
4058 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4059 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4061 }else if(0x6E <= ndx && ndx <= 0x7E){
4062 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4063 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4069 else if(nkf_isgraph(ndx)){
4071 const unsigned short *ptr;
4073 extern const unsigned short *const x0212_shiftjis[];
4075 ptr = x0212_shiftjis[ndx - 0x21];
4077 val = ptr[(c1 & 0x7f) - 0x21];
4086 c2 = x0212_shift(c2);
4088 #endif /* X0212_ENABLE */
4090 if(0x7F < c2) return 1;
4091 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4092 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4096 void s_oconv(nkf_char c2, nkf_char c1)
4098 #ifdef NUMCHAR_OPTION
4099 if (c2 == 0 && is_unicode_capsule(c1)){
4100 w16e_conv(c1, &c2, &c1);
4101 if (c2 == 0 && is_unicode_capsule(c1)){
4102 c2 = c1 & VALUE_MASK;
4103 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4106 c2 = c1 / 188 + 0xF0;
4108 c1 += 0x40 + (c1 > 0x3e);
4113 if(encode_fallback)(*encode_fallback)(c1);
4122 } else if (c2 == 0) {
4123 output_mode = ASCII;
4125 } else if (c2 == X0201) {
4126 output_mode = SHIFT_JIS;
4128 } else if (c2 == ISO8859_1) {
4129 output_mode = ISO8859_1;
4130 (*o_putc)(c1 | 0x080);
4132 } else if (is_eucg3(c2)){
4133 output_mode = SHIFT_JIS;
4134 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4140 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4141 set_iconv(FALSE, 0);
4142 return; /* too late to rescue this char */
4144 output_mode = SHIFT_JIS;
4145 e2s_conv(c2, c1, &c2, &c1);
4147 #ifdef SHIFTJIS_CP932
4149 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4151 extern const unsigned short cp932inv[2][189];
4153 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4159 #endif /* SHIFTJIS_CP932 */
4162 if (prefix_table[(unsigned char)c1]){
4163 (*o_putc)(prefix_table[(unsigned char)c1]);
4169 void j_oconv(nkf_char c2, nkf_char c1)
4171 #ifdef NUMCHAR_OPTION
4172 if (c2 == 0 && is_unicode_capsule(c1)){
4173 w16e_conv(c1, &c2, &c1);
4174 if (c2 == 0 && is_unicode_capsule(c1)){
4175 c2 = c1 & VALUE_MASK;
4176 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4179 c2 = 0x7F + c1 / 94;
4180 c1 = 0x21 + c1 % 94;
4182 if (encode_fallback) (*encode_fallback)(c1);
4189 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4192 (*o_putc)(ascii_intro);
4193 output_mode = ASCII;
4197 } else if (is_eucg3(c2)){
4199 if(output_mode!=X0213_2){
4200 output_mode = X0213_2;
4204 (*o_putc)(X0213_2&0x7F);
4207 if(output_mode!=X0212){
4208 output_mode = X0212;
4212 (*o_putc)(X0212&0x7F);
4215 (*o_putc)(c2 & 0x7f);
4218 } else if (c2==X0201) {
4219 if (output_mode!=X0201) {
4220 output_mode = X0201;
4226 } else if (c2==ISO8859_1) {
4227 /* iso8859 introduction, or 8th bit on */
4228 /* Can we convert in 7bit form using ESC-'-'-A ?
4230 output_mode = ISO8859_1;
4232 } else if (c2 == 0) {
4233 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4236 (*o_putc)(ascii_intro);
4237 output_mode = ASCII;
4242 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4243 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4245 if (output_mode!=X0213_1) {
4246 output_mode = X0213_1;
4250 (*o_putc)(X0213_1&0x7F);
4252 }else if (output_mode != X0208) {
4253 output_mode = X0208;
4256 (*o_putc)(kanji_intro);
4263 void base64_conv(nkf_char c2, nkf_char c1)
4265 mime_prechar(c2, c1);
4266 (*o_base64conv)(c2,c1);
4270 static nkf_char broken_buf[3];
4271 static int broken_counter = 0;
4272 static int broken_last = 0;
4273 nkf_char broken_getc(FILE *f)
4277 if (broken_counter>0) {
4278 return broken_buf[--broken_counter];
4281 if (c=='$' && broken_last != ESC
4282 && (input_mode==ASCII || input_mode==X0201)) {
4285 if (c1=='@'|| c1=='B') {
4286 broken_buf[0]=c1; broken_buf[1]=c;
4293 } else if (c=='(' && broken_last != ESC
4294 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4297 if (c1=='J'|| c1=='B') {
4298 broken_buf[0]=c1; broken_buf[1]=c;
4311 nkf_char broken_ungetc(nkf_char c, FILE *f)
4313 if (broken_counter<2)
4314 broken_buf[broken_counter++]=c;
4318 static nkf_char prev_cr = 0;
4320 void cr_conv(nkf_char c2, nkf_char c1)
4324 if (! (c2==0&&c1==NL) ) {
4330 } else if (c1=='\r') {
4332 } else if (c1=='\n') {
4333 if (crmode_f==CRLF) {
4334 (*o_crconv)(0,'\r');
4335 } else if (crmode_f==CR) {
4336 (*o_crconv)(0,'\r');
4340 } else if (c1!='\032' || crmode_f!=NL){
4346 Return value of fold_conv()
4348 \n add newline and output char
4349 \r add newline and output nothing
4352 1 (or else) normal output
4354 fold state in prev (previous character)
4356 >0x80 Japanese (X0208/X0201)
4361 This fold algorithm does not preserve leading spaces in a line.
4362 This is the main difference from fmt.
/* Line-length contribution of one character as counted by fold_conv:
   2 when c2 is non-zero, otherwise 1.  c1 is accepted so call sites can
   pass the (c2, c1) pair, but it is not evaluated.  The argument is
   parenthesized so that an expression argument cannot re-associate with
   the conditional operator. */
#define char_size(c2,c1) ((c2)?2:1)
4367 void fold_conv(nkf_char c2, nkf_char c1)
4370 nkf_char fold_state;
4372 if (c1== '\r' && !fold_preserve_f) {
4373 fold_state=0; /* ignore cr */
4374 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4376 fold_state=0; /* ignore cr */
4377 } else if (c1== BS) {
4378 if (f_line>0) f_line--;
4380 } else if (c2==EOF && f_line != 0) { /* close open last line */
4382 } else if ((c1=='\n' && !fold_preserve_f)
4383 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4384 && fold_preserve_f)) {
4386 if (fold_preserve_f) {
4390 } else if ((f_prev == c1 && !fold_preserve_f)
4391 || (f_prev == '\n' && fold_preserve_f)
4392 ) { /* duplicate newline */
4395 fold_state = '\n'; /* output two newline */
4401 if (f_prev&0x80) { /* Japanese? */
4403 fold_state = 0; /* ignore given single newline */
4404 } else if (f_prev==' ') {
4408 if (++f_line<=fold_len)
4412 fold_state = '\r'; /* fold and output nothing */
4416 } else if (c1=='\f') {
4419 fold_state = '\n'; /* output newline and clear */
4420 } else if ( (c2==0 && c1==' ')||
4421 (c2==0 && c1=='\t')||
4422 (c2=='!'&& c1=='!')) {
4423 /* X0208 kankaku or ascii space */
4424 if (f_prev == ' ') {
4425 fold_state = 0; /* remove duplicate spaces */
4428 if (++f_line<=fold_len)
4429 fold_state = ' '; /* output ASCII space only */
4431 f_prev = ' '; f_line = 0;
4432 fold_state = '\r'; /* fold and output nothing */
4436 prev0 = f_prev; /* we still need this one... , but almost done */
4438 if (c2 || c2==X0201)
4439 f_prev |= 0x80; /* this is Japanese */
4440 f_line += char_size(c2,c1);
4441 if (f_line<=fold_len) { /* normal case */
4444 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4445 f_line = char_size(c2,c1);
4446 fold_state = '\n'; /* We can't wait, do fold now */
4447 } else if (c2==X0201) {
4448 /* simple kinsoku rules return 1 means no folding */
4449 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4450 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4451 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4452 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4453 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4454 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4455 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4457 fold_state = '\n';/* add one new f_line before this character */
4460 fold_state = '\n';/* add one new f_line before this character */
4463 /* kinsoku point in ASCII */
4464 if ( c1==')'|| /* { [ ( */
4475 /* just after special */
4476 } else if (!is_alnum(prev0)) {
4477 f_line = char_size(c2,c1);
4479 } else if ((prev0==' ') || /* ignored new f_line */
4480 (prev0=='\n')|| /* ignored new f_line */
4481 (prev0&0x80)) { /* X0208 - ASCII */
4482 f_line = char_size(c2,c1);
4483 fold_state = '\n';/* add one new f_line before this character */
4485 fold_state = 1; /* default no fold in ASCII */
4489 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4490 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4491 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4492 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4493 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4494 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4495 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4496 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4497 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4498 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4499 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4500 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4501 /* default no fold in kinsoku */
4504 f_line = char_size(c2,c1);
4505 /* add one new f_line before this character */
4508 f_line = char_size(c2,c1);
4510 /* add one new f_line before this character */
4515 /* terminator process */
4516 switch(fold_state) {
4535 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: output filter that rewrites JIS X 0201 kana and JIS X 0208
 * alphabet/symbol characters before handing them to o_zconv.
 *
 * An X0201 character that has an entry in dv or ev is held in
 * z_prev1/z_prev2.  If the next character is a dakuten (0xde) or a
 * handakuten (0xdf), the pair is emitted as one combined character taken
 * from dv/ev; otherwise the held character is emitted through cv.  Each
 * table stores (c2, c1) pairs indexed by (code - SPACE) * 2.
 *
 * The entity strings near the end are the character references for the
 * four markup-significant characters and are emitted one byte at a time.
 * They had been reduced to the bare characters (and an unterminated
 * literal for the double quote); the references are restored here.
 */
4537 void z_conv(nkf_char c2, nkf_char c1)
4540 /* if (c2) c1 &= 0x7f; assertion */
4542 if (x0201_f && z_prev2==X0201) { /* X0201 */
4543 if (c1==(0xde&0x7f)) { /* dakuten (voiced sound mark) */
4545 (*o_zconv)(dv[(z_prev1-SPACE)*2],dv[(z_prev1-SPACE)*2+1]);
4547 } else if (c1==(0xdf&0x7f)&&ev[(z_prev1-SPACE)*2]) { /* handakuten (semi-voiced sound mark) */
4549 (*o_zconv)(ev[(z_prev1-SPACE)*2],ev[(z_prev1-SPACE)*2+1]);
4553 (*o_zconv)(cv[(z_prev1-SPACE)*2],cv[(z_prev1-SPACE)*2+1]);
4562 if (x0201_f && c2==X0201) {
4563 if (dv[(c1-SPACE)*2]||ev[(c1-SPACE)*2]) {
4564 /* wait for dakuten or handakuten */
4565 z_prev1 = c1; z_prev2 = c2;
4568 (*o_zconv)(cv[(c1-SPACE)*2],cv[(c1-SPACE)*2+1]);
4573 /* JISX0208 Alphabet */
4574 if (alpha_f && c2 == 0x23 ) {
4576 } else if (alpha_f && c2 == 0x21 ) {
4577 /* JISX0208 Kigou (symbols) */
4582 } else if (alpha_f&0x4) {
4587 } else if (0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4593 case '>': entity = "&gt;"; break;
4594 case '<': entity = "&lt;"; break;
4595 case '\"': entity = "&quot;"; break;
4596 case '&': entity = "&amp;"; break;
4599 while (*entity) (*o_zconv)(0, *entity++);
4609 #define rot13(c) ( \
4611 (c <= 'M') ? (c + 13): \
4612 (c <= 'Z') ? (c - 13): \
4614 (c <= 'm') ? (c + 13): \
4615 (c <= 'z') ? (c - 13): \
4619 #define rot47(c) ( \
4621 ( c <= 'O' ) ? (c + 47) : \
4622 ( c <= '~' ) ? (c - 47) : \
4626 void rot_conv(nkf_char c2, nkf_char c1)
4628 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4634 (*o_rot_conv)(c2,c1);
4637 void hira_conv(nkf_char c2, nkf_char c1)
4641 if (0x20 < c1 && c1 < 0x74) {
4643 (*o_hira_conv)(c2,c1);
4645 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4647 c1 = CLASS_UNICODE | 0x3094;
4648 (*o_hira_conv)(c2,c1);
4651 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4653 (*o_hira_conv)(c2,c1);
4658 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4661 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4663 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4667 (*o_hira_conv)(c2,c1);
4671 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4673 static const nkf_char range[RANGE_NUM_MAX][2] = {
4694 nkf_char start, end, c;
4696 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4700 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4705 for (i = 0; i < RANGE_NUM_MAX; i++) {
4706 start = range[i][0];
4709 if (c >= start && c <= end) {
4714 (*o_iso2022jp_check_conv)(c2,c1);
4718 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4720 const unsigned char *mime_pattern[] = {
4721 (const unsigned char *)"\075?EUC-JP?B?",
4722 (const unsigned char *)"\075?SHIFT_JIS?B?",
4723 (const unsigned char *)"\075?ISO-8859-1?Q?",
4724 (const unsigned char *)"\075?ISO-8859-1?B?",
4725 (const unsigned char *)"\075?ISO-2022-JP?B?",
4726 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4727 #if defined(UTF8_INPUT_ENABLE)
4728 (const unsigned char *)"\075?UTF-8?B?",
4729 (const unsigned char *)"\075?UTF-8?Q?",
4731 (const unsigned char *)"\075?US-ASCII?Q?",
4736 /* Markers for raising the priority of the corresponding code */
4737 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4738 e_iconv, s_iconv, 0, 0, 0, 0,
4739 #if defined(UTF8_INPUT_ENABLE)
4745 const nkf_char mime_encode[] = {
4746 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4747 #if defined(UTF8_INPUT_ENABLE)
4754 const nkf_char mime_encode_method[] = {
4755 'B', 'B','Q', 'B', 'B', 'Q',
4756 #if defined(UTF8_INPUT_ENABLE)
/* Size of the recovery buffer in mime_begin_strict() and the upper bound on
   the number of preamble characters scanned by mime_begin(). */
4764 #define MAXRECOVER 20
4766 void switch_mime_getc(void)
4768 if (i_getc!=mime_getc) {
4769 i_mgetc = i_getc; i_getc = mime_getc;
4770 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4771 if(mime_f==STRICT_MIME) {
4772 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4773 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4778 void unswitch_mime_getc(void)
4780 if(mime_f==STRICT_MIME) {
4781 i_mgetc = i_mgetc_buf;
4782 i_mungetc = i_mungetc_buf;
4785 i_ungetc = i_mungetc;
4786 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4787 mime_iconv_back = NULL;
4790 nkf_char mime_begin_strict(FILE *f)
4794 const unsigned char *p,*q;
4795 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4797 mime_decode_mode = FALSE;
4798 /* =? has been checked */
4800 p = mime_pattern[j];
4803 for(i=2;p[i]>' ';i++) { /* start at =? */
4804 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4805 /* pattern fails, try next one */
4807 while (mime_pattern[++j]) {
4808 p = mime_pattern[j];
4809 for(k=2;k<i;k++) /* assume length(p) > i */
4810 if (p[k]!=q[k]) break;
4811 if (k==i && nkf_toupper(c1)==p[k]) break;
4813 p = mime_pattern[j];
4814 if (p) continue; /* found next one, continue */
4815 /* all fails, output from recovery buffer */
4823 mime_decode_mode = p[i-2];
4825 mime_iconv_back = iconv;
4826 set_iconv(FALSE, mime_priority_func[j]);
4827 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4829 if (mime_decode_mode=='B') {
4830 mimebuf_f = unbuf_f;
4832 /* do MIME integrity check */
4833 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf: fetch the next character for the MIME decoder.
 *
 * When mimebuf_f is set the character is read through i_mgetc_buf;
 * otherwise it is taken from the Fifo at mime_input, which is then
 * advanced by one.
 */
4841 nkf_char mime_getc_buf(FILE *f)
4843 /* We don't track EOF of the Fifo, because it contains ?= as
4844 a terminator. It was checked in mime_integrity. */
4845 return ((mimebuf_f)?
4846 (*i_mgetc_buf)(f):Fifo(mime_input++));
4849 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4852 (*i_mungetc_buf)(c,f);
4854 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin: non-strict detection of a MIME encoded-word start.
 *
 * Called after "=?" has been seen.  Every character read is also appended
 * to the Fifo so the text can be replayed if this turns out not to be a
 * MIME preamble.  At most MAXRECOVER characters are scanned.  The encoding
 * letter selects mime_decode_mode: 'B' for b/B, 'Q' for q/Q.  If no mode
 * was recognised, mime_decode_mode is set to 1 so the buffered text is
 * read back undecoded.
 *
 * Returns the last character read; the caller uses it only to detect EOF.
 */
4858 nkf_char mime_begin(FILE *f)
4863 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4864 /* re-read and convert again from mime_buffer. */
4866 /* =? has been checked */
4868 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4869 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4870 /* We accept any character type even if it is broken by new lines */
4871 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4872 if (c1=='\n'||c1==' '||c1=='\r'||
4873 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4875 /* Failed. But this could be another MIME preamble */
4883 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4884 if (!(++i<MAXRECOVER) || c1==EOF) break;
4885 if (c1=='b'||c1=='B') {
4886 mime_decode_mode = 'B';
4887 } else if (c1=='q'||c1=='Q') {
4888 mime_decode_mode = 'Q';
4892 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4893 if (!(++i<MAXRECOVER) || c1==EOF) break;
4895 mime_decode_mode = FALSE;
4901 if (!mime_decode_mode) {
4902 /* false MIME preamble, restart from mime_buffer */
4903 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4904 /* Since we are in MIME mode until buffer becomes empty, */
4905 /* we never go into mime_begin again for a while. */
4908 /* discard mime preamble, and goto MIME mode */
4910 /* do no MIME integrity check */
4911 return c1; /* used only for checking EOF */
4915 void no_putc(nkf_char c)
4920 void debug(const char *str)
4923 fprintf(stderr, "%s\n", str);
4928 void set_input_codename(char *codename)
4932 strcmp(codename, "") != 0 &&
4933 strcmp(codename, input_codename) != 0)
4935 is_inputcode_mixed = TRUE;
4937 input_codename = codename;
4938 is_inputcode_set = TRUE;
4941 #if !defined(PERL_XS) && !defined(WIN32DLL)
4942 void print_guessed_code(char *filename)
4944 char *codename = "BINARY";
4945 if (!is_inputcode_mixed) {
4946 if (strcmp(input_codename, "") == 0) {
4949 codename = input_codename;
4952 if (filename != NULL) printf("%s:", filename);
4953 printf("%s\n", codename);
/*
 * hex_getc: shared reader for two-hex-digit escapes.
 *
 *   ch  escape lead-in character (':' from cap_getc, '%' from url_getc)
 *   f   input stream, passed through to g and u
 *   g   function used to read a character
 *   u   function used to push a character back
 *
 * When both characters checked below (c2, c3) are hexadecimal digits, the
 * decoded byte (hex2bin(c2) << 4) | hex2bin(c3) is returned.
 * NOTE(review): the read calls and the paths taken when a digit check
 * fails are not visible in this listing; presumably the characters are
 * pushed back through u and the lead-in is returned unchanged -- confirm.
 */
4959 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4961 nkf_char c1, c2, c3;
4967 if (!nkf_isxdigit(c2)){
4972 if (!nkf_isxdigit(c3)){
4977 return (hex2bin(c2) << 4) | hex2bin(c3);
/* cap_getc: input reader that runs hex_getc with ':' as the escape lead-in,
   reading through i_cgetc and pushing back through i_cungetc. */
4980 nkf_char cap_getc(FILE *f)
4982 return hex_getc(':', f, i_cgetc, i_cungetc);
4985 nkf_char cap_ungetc(nkf_char c, FILE *f)
4987 return (*i_cungetc)(c, f);
/* url_getc: input reader that runs hex_getc with '%' as the escape lead-in,
   reading through i_ugetc and pushing back through i_uungetc. */
4990 nkf_char url_getc(FILE *f)
4992 return hex_getc('%', f, i_ugetc, i_uungetc);
4995 nkf_char url_ungetc(nkf_char c, FILE *f)
4997 return (*i_uungetc)(c, f);
5001 #ifdef NUMCHAR_OPTION
5002 nkf_char numchar_getc(FILE *f)
5004 nkf_char (*g)(FILE *) = i_ngetc;
5005 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5016 if (buf[i] == 'x' || buf[i] == 'X'){
5017 for (j = 0; j < 7; j++){
5019 if (!nkf_isxdigit(buf[i])){
5026 c |= hex2bin(buf[i]);
5029 for (j = 0; j < 8; j++){
5033 if (!nkf_isdigit(buf[i])){
5040 c += hex2bin(buf[i]);
5046 return CLASS_UNICODE | c;
5055 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5057 return (*i_nungetc)(c, f);
5061 #ifdef UNICODE_NORMALIZATION
5063 /* Normalization Form C */
/*
 * nfc_getc: read a character, replacing a byte sequence found in
 * normalization_table (matched on its .nfd member) with the corresponding
 * .nfc sequence.
 *
 * Input is read and pushed back through the i_nfc_getc / i_nfc_ungetc
 * callbacks.  The table is searched by bisection between `lower` and
 * `upper`; the comparison of array[k] against buf[k] decides which half to
 * keep.
 *
 * NOTE(review): how buf is filled, what ends the search on a match, and
 * what is returned or pushed back afterwards are on lines not visible in
 * this excerpt.
 */
5064 nkf_char nfc_getc(FILE *f)
5066 nkf_char (*g)(FILE *f) = i_nfc_getc;
5067 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5068 int i=0, j, k=1, lower, upper;
5070 const nkf_nfchar *array;
5072 extern const struct normalization_pair normalization_table[];
5076 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
/* binary search over normalization_table, keyed on the .nfd sequence */
5077 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5078 while (upper >= lower) {
5079 j = (lower+upper) / 2;
5080 array = normalization_table[j].nfd;
5081 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5082 if (array[k] != buf[k]){
5083 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* copy entry j's .nfc sequence over the start of buf */
5090 array = normalization_table[j].nfc;
5091 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5092 buf[i] = (nkf_char)(array[i]);
/*
 * nfc_ungetc: push c back onto f by forwarding to the i_nfc_ungetc
 * callback; returns that callback's result.
 */
5103 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5105 return (*i_nfc_ungetc)(c, f);
5107 #endif /* UNICODE_NORMALIZATION */
5113 nkf_char c1, c2, c3, c4, cc;
5114 nkf_char t1, t2, t3, t4, mode, exit_mode;
5115 nkf_char lwsp_count;
5118 nkf_char lwsp_size = 128;
5120 if (mime_top != mime_last) { /* Something is in FIFO */
5121 return Fifo(mime_top++);
5123 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5124 mime_decode_mode=FALSE;
5125 unswitch_mime_getc();
5126 return (*i_getc)(f);
5129 if (mimebuf_f == FIXED_MIME)
5130 exit_mode = mime_decode_mode;
5133 if (mime_decode_mode == 'Q') {
5134 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5136 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
5137 if (c1<=' ' || DEL<=c1) {
5138 mime_decode_mode = exit_mode; /* prepare for quit */
5141 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5145 mime_decode_mode = exit_mode; /* prepare for quit */
5146 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5147 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5148 /* end Q encoding */
5149 input_mode = exit_mode;
5151 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5152 if (lwsp_buf==NULL) {
5153 perror("can't malloc");
5156 while ((c1=(*i_getc)(f))!=EOF) {
5161 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5169 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5170 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5185 lwsp_buf[lwsp_count] = (unsigned char)c1;
5186 if (lwsp_count++>lwsp_size){
5188 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5189 if (lwsp_buf_new==NULL) {
5191 perror("can't realloc");
5194 lwsp_buf = lwsp_buf_new;
5200 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5202 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5203 i_ungetc(lwsp_buf[lwsp_count],f);
5209 if (c1=='='&&c2<' ') { /* this is soft wrap */
5210 while((c1 = (*i_mgetc)(f)) <=' ') {
5211 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5213 mime_decode_mode = 'Q'; /* still in MIME */
5214 goto restart_mime_q;
5217 mime_decode_mode = 'Q'; /* still in MIME */
5221 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5222 if (c2<=' ') return c2;
5223 mime_decode_mode = 'Q'; /* still in MIME */
5224 return ((hex2bin(c2)<<4) + hex2bin(c3));
5227 if (mime_decode_mode != 'B') {
5228 mime_decode_mode = FALSE;
5229 return (*i_mgetc)(f);
5233 /* Base64 encoding */
5235 MIME allows line break in the middle of
5236 Base64, but we are very pessimistic in decoding
5237 in unbuf mode because MIME encoded code may be broken by
5238 less or editor's control sequence (such as ESC-[-K) in unbuffered
5239 mode. Ignore incomplete MIME.
5241 mode = mime_decode_mode;
5242 mime_decode_mode = exit_mode; /* prepare for quit */
5244 while ((c1 = (*i_mgetc)(f))<=' ') {
5249 if ((c2 = (*i_mgetc)(f))<=' ') {
5252 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5253 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5256 if ((c1 == '?') && (c2 == '=')) {
5259 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5260 if (lwsp_buf==NULL) {
5261 perror("can't malloc");
5264 while ((c1=(*i_getc)(f))!=EOF) {
5269 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5277 if ((c1=(*i_getc)(f))!=EOF) {
5281 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5296 lwsp_buf[lwsp_count] = (unsigned char)c1;
5297 if (lwsp_count++>lwsp_size){
5299 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5300 if (lwsp_buf_new==NULL) {
5302 perror("can't realloc");
5305 lwsp_buf = lwsp_buf_new;
5311 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5313 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5314 i_ungetc(lwsp_buf[lwsp_count],f);
5321 if ((c3 = (*i_mgetc)(f))<=' ') {
5324 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5325 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5329 if ((c4 = (*i_mgetc)(f))<=' ') {
5332 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5333 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5337 mime_decode_mode = mode; /* still in MIME sigh... */
5339 /* BASE 64 decoding */
5341 t1 = 0x3f & base64decode(c1);
5342 t2 = 0x3f & base64decode(c2);
5343 t3 = 0x3f & base64decode(c3);
5344 t4 = 0x3f & base64decode(c4);
5345 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5347 Fifo(mime_last++) = (unsigned char)cc;
5348 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5350 Fifo(mime_last++) = (unsigned char)cc;
5351 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5353 Fifo(mime_last++) = (unsigned char)cc;
5358 return Fifo(mime_top++);
/*
 * mime_ungetc: push c back into the MIME FIFO.
 *
 * mime_top is moved back by one and c, narrowed to unsigned char, is stored
 * in that slot.  The stream argument f is not used on the visible lines.
 *
 * NOTE(review): the return statement is not visible in this excerpt.
 */
5361 nkf_char mime_ungetc(nkf_char c, FILE *f)
5363 Fifo(--mime_top) = (unsigned char)c;
/*
 * mime_integrity: pre-read the input into the MIME FIFO and check that it
 * looks like a complete encoded word before decoding starts.
 *
 * f: input stream, read through the i_getc callback.
 * p: NUL-terminated byte string copied into the FIFO first.
 *
 * Visible behaviour:
 *  - mime_input and mime_last are both reset to mime_top, then the bytes of
 *    p are appended to the FIFO.
 *  - Characters are read until EOF; reading stops when
 *    (mime_input - mime_top) & MIME_BUF_MASK becomes 0 ("buffer full").
 *  - When c is '=' and d is '?', the terminator is stored and the input is
 *    treated as checked ("skip header, start decode").
 *  - A character other than '+', '/', '=', '?' or an alphanumeric is tested
 *    for separately; accepted characters are appended to the FIFO.
 *  - For incomplete MIME, c is appended, mime_last is set to mime_input,
 *    mime_decode_mode is set to 1 (no decoding) and switch_mime_getc() is
 *    called so that the buffered getc is used.
 *
 * NOTE(review): how d is maintained, the action taken on the character
 * test, and all return values are on lines not visible in this excerpt.
 * The comment that opens on the first body line has also lost its closing
 * line here.
 */
5367 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5371 /* In buffered mode, read until =? or NL or buffer full
5373 mime_input = mime_top;
5374 mime_last = mime_top;
5376 while(*p) Fifo(mime_input++) = *p++;
5379 while((c=(*i_getc)(f))!=EOF) {
5380 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5381 break; /* buffer full */
5383 if (c=='=' && d=='?') {
5384 /* checked. skip header, start decode */
5385 Fifo(mime_input++) = (unsigned char)c;
5386 /* mime_last_input = mime_input; */
5391 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5393 /* Should we check length mod 4? */
5394 Fifo(mime_input++) = (unsigned char)c;
5397 /* In case of Incomplete MIME, no MIME decode */
5398 Fifo(mime_input++) = (unsigned char)c;
5399 mime_last = mime_input; /* point undecoded buffer */
5400 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5401 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * base64decode: map one Base64 alphabet character to its 6-bit value.
 *
 *   'A'..'Z' ->  0..25
 *   'a'..'z' -> 26..51
 *   '0'..'9' -> 52..61
 *   '+'      -> 62
 *   final branch (per its inline comment, '/') -> 63
 *
 * The offsets are written as character constants and rely on ASCII codes:
 * 'G' is 'a' - 26, '4' is 52, '>' is 62 and '?' is 63.
 *
 * NOTE(review): the conditions selecting the first two branches and the
 * return statement are on lines not visible in this excerpt.
 */
5405 nkf_char base64decode(nkf_char c)
5410 i = c - 'A'; /* A..Z 0-25 */
5412 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5414 } else if (c > '/') {
5415 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5416 } else if (c == '+') {
5417 i = '>' /* 62 */ ; /* + 62 */
5419 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet; index i holds the character that encodes the 6-bit value i. */
5424 static const char basis_64[] =
5425 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/*
 * Byte whose low bits are combined with the next input byte when Base64
 * output is produced (read by mimeout_addchar and close_mime).
 * NOTE(review): the assignment to b64c is not visible in this excerpt.
 */
5427 static nkf_char b64c;
/* Capacity of the MIME output look-ahead buffer; the array has one extra byte. */
5428 #define MIMEOUT_BUF_LENGTH (60)
5429 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5430 int mimeout_buf_count = 0;
/* Flag tested by open_mime before it handles buffered whitespace; open_mime resets it to FALSE. */
5431 int mimeout_preserve_space = 0;
/*
 * itoh4: convert a 4-bit value (0..15) to its uppercase hexadecimal digit
 * character ('0'..'9', 'A'..'F').
 *
 * Every use of the argument and the whole expansion are parenthesized so
 * that arguments built from low-precedence operators (for example
 * `x & 0xf`) and uses inside larger expressions expand correctly.  The
 * argument is evaluated more than once, so it must not have side effects.
 */
#define itoh4(c) ((c)>=10?(c)+'A'-10:(c)+'0')
/*
 * open_mime: switch the MIME output stage into an encoding mode chosen from
 * the output code `mode`.
 *
 * Visible behaviour:
 *  - p defaults to mime_pattern[0]; the table is scanned and the entry whose
 *    mime_encode[i] equals `mode` is selected.
 *  - mimeout_mode is set from mime_encode_method[i].
 *  - Buffered whitespace in mimeout_buf is written out raw through o_mputc:
 *    a blank when base64_count exceeds 45, and SPACE / TAB / CR / NL
 *    characters when mimeout_preserve_space is not set.
 *  - mimeout_preserve_space is reset to FALSE.
 *  - The buffered count is saved in j, mimeout_buf_count is cleared, and
 *    buffered characters are fed back through mime_putc.
 *
 * NOTE(review): the output of the selected pattern p, the updates to
 * base64_count and the exact loop bounds are on lines not visible in this
 * excerpt.
 */
5434 void open_mime(nkf_char mode)
5436 const unsigned char *p;
5439 p = mime_pattern[0];
5440 for(i=0;mime_pattern[i];i++) {
5441 if (mode == mime_encode[i]) {
5442 p = mime_pattern[i];
5446 mimeout_mode = mime_encode_method[i];
5449 if (base64_count>45) {
5450 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5451 (*o_mputc)(mimeout_buf[i]);
5457 if (!mimeout_preserve_space && mimeout_buf_count>0
5458 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5459 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5463 if (!mimeout_preserve_space) {
5464 for (;i<mimeout_buf_count;i++) {
5465 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5466 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5467 (*o_mputc)(mimeout_buf[i]);
5474 mimeout_preserve_space = FALSE;
5480 j = mimeout_buf_count;
5481 mimeout_buf_count = 0;
5483 mime_putc(mimeout_buf[i]);
/*
 * close_mime: finish the current MIME output state.
 *
 * Depending on mimeout_mode, the bits still held in b64c are flushed as one
 * Base64 character through o_mputc: either the low 2 bits shifted left by 4,
 * or the low 4 bits shifted left by 2.  Afterwards the function branches on
 * mimeout_f != FIXED_MIME and on mimeout_mode != 'Q'.
 *
 * NOTE(review): the case labels, any padding or terminator output, and the
 * bodies of the final branches are on lines not visible in this excerpt.
 */
5487 void close_mime(void)
5497 switch(mimeout_mode) {
5502 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5508 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5514 if (mimeout_f!=FIXED_MIME) {
5516 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte c according to mimeout_mode and write the
 * result through o_mputc.
 *
 * Visible behaviour:
 *  - A non-alphanumeric c is written as two uppercase hex digits (high
 *    nibble, then low nibble) via itoh4.
 *  - Base64 output is produced in three steps, one per input byte:
 *      1. the top 6 bits of c;
 *      2. the low 2 bits of b64c joined with the high 4 bits of c;
 *      3. the low 4 bits of b64c joined with the top 2 bits of c, followed
 *         by the low 6 bits of c.
 *
 * NOTE(review): the case labels, the updates of b64c, mimeout_mode and
 * base64_count, and any prefix written before the hex digits are on lines
 * not visible in this excerpt.
 */
5521 void mimeout_addchar(nkf_char c)
5523 switch(mimeout_mode) {
5528 } else if(!nkf_isalnum(c)) {
5530 (*o_mputc)(itoh4(((c>>4)&0xf)));
5531 (*o_mputc)(itoh4((c&0xf)));
5540 (*o_mputc)(basis_64[c>>2]);
5545 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5551 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5552 (*o_mputc)(basis_64[c & 0x3F]);
5563 nkf_char mime_lastchar2, mime_lastchar1;
/*
 * mime_prechar: hook run with the next character pair (c2, c1) before it is
 * MIME-encoded.
 *
 * Visible behaviour:
 *  - When base64_count + mimeout_buf_count/3*4 exceeds 66, EOF, NL and SPACE
 *    are sent through o_base64conv, in that order.
 *  - When c2 is non-zero, the previous c2 (mime_lastchar2) was 0, and the
 *    previous c1 (mime_lastchar1) is non-zero and not whitespace, a SPACE is
 *    sent through o_base64conv.
 *  - c2 and c1 are then saved in mime_lastchar2 and mime_lastchar1.
 *
 * The `else if (mime_lastchar2)` branch is commented out in the source.
 *
 * NOTE(review): the conditions enclosing these statements are on lines not
 * visible in this excerpt, and the closing line of the commented-out branch
 * is missing here.
 */
5565 void mime_prechar(nkf_char c2, nkf_char c1)
5569 if (base64_count + mimeout_buf_count/3*4> 66){
5570 (*o_base64conv)(EOF,0);
5571 (*o_base64conv)(0,NL);
5572 (*o_base64conv)(0,SPACE);
5574 }/*else if (mime_lastchar2){
5575 if (c1 <=DEL && !nkf_isspace(c1)){
5576 (*o_base64conv)(0,SPACE);
5580 if (c2 && mime_lastchar2 == 0
5581 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5582 (*o_base64conv)(0,SPACE);
5585 mime_lastchar2 = c2;
5586 mime_lastchar1 = c1;
/*
 * mime_putc: MIME-encoding output stage; receives the next output character
 * c, or EOF.
 *
 * Structure visible in this excerpt:
 *  - mimeout_f == FIXED_MIME: line length is tracked with base64_count
 *    (threshold 71), separately for 'Q' mode and the other modes, with
 *    distinct handling for c == EOF and c != EOF.
 *  - Otherwise, on EOF the characters held in mimeout_buf are drained
 *    through mimeout_addchar.
 *  - mimeout_mode == 'Q': characters <= DEL under ASCII / ISO8859_1 output
 *    are classified as CR/NL, other characters <= SPACE, or the rest.
 *  - mimeout_mode == 0: characters <= DEL under ASCII / ISO8859_1 output are
 *    collected in mimeout_buf and written raw through o_mputc around line
 *    breaks; open_mime(output_mode) is called when the buffer exceeds
 *    MIMEOUT_BUF_LENGTH, and again further down after buffered text has
 *    been written out.
 *  - Remaining modes ('B', 1, 2 per the inline comment): buffered text is
 *    either encoded through mimeout_addchar or written raw through o_mputc,
 *    depending on c, lastchar and the buffer contents.
 *
 * NOTE(review): many interior lines (braces, else branches, returns, counter
 * updates) are not visible in this excerpt, so this describes only what the
 * visible statements do; confirm the control flow against the full file.
 */
5589 void mime_putc(nkf_char c)
5594 if (mimeout_f == FIXED_MIME){
5595 if (mimeout_mode == 'Q'){
5596 if (base64_count > 71){
5597 if (c!=CR && c!=NL) {
5604 if (base64_count > 71){
5609 if (c == EOF) { /* c==EOF */
5613 if (c != EOF) { /* c!=EOF */
5619 /* mimeout_f != FIXED_MIME */
5621 if (c == EOF) { /* c==EOF */
5622 j = mimeout_buf_count;
5623 mimeout_buf_count = 0;
5627 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5630 mimeout_addchar(mimeout_buf[i]);
5634 mimeout_addchar(mimeout_buf[i]);
5638 mimeout_addchar(mimeout_buf[i]);
5644 if (mimeout_mode=='Q') {
5645 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5646 if (c == CR || c == NL) {
5651 } else if (c <= SPACE) {
5662 if (mimeout_buf_count > 0){
5663 lastchar = mimeout_buf[mimeout_buf_count - 1];
5668 if (!mimeout_mode) {
5669 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5670 if (nkf_isspace(c)) {
5671 if (c==CR || c==NL) {
5674 for (i=0;i<mimeout_buf_count;i++) {
5675 (*o_mputc)(mimeout_buf[i]);
5676 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
5682 mimeout_buf[0] = (char)c;
5683 mimeout_buf_count = 1;
5685 if (base64_count > 1
5686 && base64_count + mimeout_buf_count > 76
5687 && mimeout_buf[0] != CR && mimeout_buf[0] != NL){
5690 if (!nkf_isspace(mimeout_buf[0])){
5695 mimeout_buf[mimeout_buf_count++] = (char)c;
5696 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5697 open_mime(output_mode);
5702 if (lastchar==CR || lastchar == NL){
5703 for (i=0;i<mimeout_buf_count;i++) {
5704 (*o_mputc)(mimeout_buf[i]);
5707 mimeout_buf_count = 0;
5709 if (lastchar==SPACE) {
5710 for (i=0;i<mimeout_buf_count-1;i++) {
5711 (*o_mputc)(mimeout_buf[i]);
5714 mimeout_buf[0] = SPACE;
5715 mimeout_buf_count = 1;
5717 open_mime(output_mode);
5720 /* mimeout_mode == 'B', 1, 2 */
5721 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5722 if (lastchar == CR || lastchar == NL){
5723 if (nkf_isblank(c)) {
5724 for (i=0;i<mimeout_buf_count;i++) {
5725 mimeout_addchar(mimeout_buf[i]);
5727 mimeout_buf_count = 0;
5728 } else if (SPACE<c && c<DEL) {
5730 for (i=0;i<mimeout_buf_count;i++) {
5731 (*o_mputc)(mimeout_buf[i]);
5734 mimeout_buf_count = 0;
5737 if (c==SPACE || c==TAB || c==CR || c==NL) {
5738 for (i=0;i<mimeout_buf_count;i++) {
5739 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5741 for (i=0;i<mimeout_buf_count;i++) {
5742 (*o_mputc)(mimeout_buf[i]);
5745 mimeout_buf_count = 0;
5748 mimeout_buf[mimeout_buf_count++] = (char)c;
5749 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5751 for (i=0;i<mimeout_buf_count;i++) {
5752 (*o_mputc)(mimeout_buf[i]);
5755 mimeout_buf_count = 0;
5759 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5760 mimeout_buf[mimeout_buf_count++] = (char)c;
5761 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5762 j = mimeout_buf_count;
5763 mimeout_buf_count = 0;
5765 mimeout_addchar(mimeout_buf[i]);
5772 if (mimeout_buf_count>0) {
5773 j = mimeout_buf_count;
5774 mimeout_buf_count = 0;
5776 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5778 mimeout_addchar(mimeout_buf[i]);
5784 (*o_mputc)(mimeout_buf[i]);
5786 open_mime(output_mode);
5793 #if defined(PERL_XS) || defined(WIN32DLL)
5797 struct input_code *p = input_code_list;
5810 mime_f = STRICT_MIME;
5811 mime_decode_f = FALSE;
5816 #if defined(MSDOS) || defined(__OS2__)
5821 iso2022jp_f = FALSE;
5822 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5823 ms_ucs_map_f = UCS_MAP_ASCII;
5825 #ifdef UTF8_INPUT_ENABLE
5826 no_cp932ext_f = FALSE;
5827 no_best_fit_chars_f = FALSE;
5828 encode_fallback = NULL;
5829 unicode_subchar = '?';
5830 input_endian = ENDIAN_BIG;
5832 #ifdef UTF8_OUTPUT_ENABLE
5833 output_bom_f = FALSE;
5834 output_endian = ENDIAN_BIG;
5836 #ifdef UNICODE_NORMALIZATION
5849 is_inputcode_mixed = FALSE;
5850 is_inputcode_set = FALSE;
5854 #ifdef SHIFTJIS_CP932
5864 for (i = 0; i < 256; i++){
5865 prefix_table[i] = 0;
5869 mimeout_buf_count = 0;
5874 fold_preserve_f = FALSE;
5877 kanji_intro = DEFAULT_J;
5878 ascii_intro = DEFAULT_R;
5879 fold_margin = FOLD_MARGIN;
5880 output_conv = DEFAULT_CONV;
5881 oconv = DEFAULT_CONV;
5882 o_zconv = no_connection;
5883 o_fconv = no_connection;
5884 o_crconv = no_connection;
5885 o_rot_conv = no_connection;
5886 o_hira_conv = no_connection;
5887 o_base64conv = no_connection;
5888 o_iso2022jp_check_conv = no_connection;
5891 i_ungetc = std_ungetc;
5893 i_bungetc = std_ungetc;
5896 i_mungetc = std_ungetc;
5897 i_mgetc_buf = std_getc;
5898 i_mungetc_buf = std_ungetc;
5899 output_mode = ASCII;
5902 mime_decode_mode = FALSE;
5908 z_prev2=0,z_prev1=0;
5910 iconv_for_check = 0;
5912 input_codename = "";
/*
 * no_connection: two-argument form of no_connection2; forwards c2 and c1
 * with 0 as the third argument and discards the result.
 */
5919 void no_connection(nkf_char c2, nkf_char c1)
5921 no_connection2(c2,c1,0);
/*
 * no_connection2: report an internal wiring error.
 *
 * Writes "nkf internal module connection failure." to stderr.  The
 * arguments c2, c1 and c0 are not used on the visible lines.  The trailing
 * `return 0` is marked as being there for lint.
 *
 * NOTE(review): one line between the fprintf and the return is not visible
 * in this excerpt; it may terminate the program -- confirm.
 */
5924 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
5926 fprintf(stderr,"nkf internal module connection failure.\n");
5928 return 0; /* LINT */
5933 #define fprintf dllprintf
5937 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
5938 fprintf(stderr,"Flags:\n");
5939 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
5940 #ifdef DEFAULT_CODE_SJIS
5941 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
5943 #ifdef DEFAULT_CODE_JIS
5944 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
5946 #ifdef DEFAULT_CODE_EUC
5947 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
5949 #ifdef DEFAULT_CODE_UTF8
5950 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
5952 #ifdef UTF8_OUTPUT_ENABLE
5953 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
5955 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
5956 #ifdef UTF8_INPUT_ENABLE
5957 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
5959 fprintf(stderr,"t no conversion\n");
5960 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
5961 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
5962 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
5963 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
5964 fprintf(stderr,"v Show this usage. V: show version\n");
5965 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
5966 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
5967 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
5968 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
5969 fprintf(stderr,"Z[0-3] Convert X0208 alphabet to ASCII\n");
5970 fprintf(stderr," 1: Kankaku to 1 space 2: to 2 spaces 3: Convert to HTML Entity\n");
5971 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
5972 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
5974 fprintf(stderr,"T Text mode output\n");
5976 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
5977 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
5978 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
5979 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
5980 fprintf(stderr,"\n");
5981 fprintf(stderr,"Long name options\n");
5982 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
5983 fprintf(stderr," Specify the input or output codeset\n");
5984 fprintf(stderr," --fj --unix --mac --windows\n");
5985 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
5986 fprintf(stderr," Convert for the system or code\n");
5987 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
5988 fprintf(stderr," To Hiragana/Katakana Conversion\n");
5989 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
5991 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
5993 #ifdef NUMCHAR_OPTION
5994 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
5996 #ifdef UTF8_INPUT_ENABLE
5997 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
5998 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6001 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6002 fprintf(stderr," Overwrite original listed files by filtered result\n");
6003 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6005 fprintf(stderr," -g --guess Guess the input code\n");
6006 fprintf(stderr," --help --version Show this help/the version\n");
6007 fprintf(stderr," For more information, see also man nkf\n");
6008 fprintf(stderr,"\n");
6014 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6015 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6018 #if defined(MSDOS) && defined(__WIN16__)
6021 #if defined(MSDOS) && defined(__WIN32__)
6027 ,NKF_VERSION,NKF_RELEASE_DATE);
6028 fprintf(stderr,"\n%s\n",CopyRight);
6033 ** パッチ制作者 (patch authors)
6034 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
6035 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
6036 ** ohta@src.ricoh.co.jp (Junn Ohta)
6037 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
6038 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
6039 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
6040 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
6041 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
6042 ** GHG00637@nifty-serve.or.jp (COW)