1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 ** UTF-8 サポートについて
31 ** 従来の nkf と入れかえてそのまま使えるようになっています
32 ** nkf -e などとして起動すると、自動判別で UTF-8 と判定されれば、
33 ** そのまま euc-jp に変換されます
35 ** まだバグがある可能性が高いです。
36 ** (特に自動判別、コード混在、エラー処理系)
38 ** 何か問題を見つけたら、
39 ** E-Mail: furukawa@tcp-ip.or.jp
40 ** まで御連絡をお願いします。
41 ***********************************************************************/
42 /* $Id: nkf.c,v 1.133 2007/09/19 13:03:15 naruse Exp $ */
43 #define NKF_VERSION "2.0.8"
44 #define NKF_RELEASE_DATE "2007-09-19"
49 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
50 "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon"
57 ** USAGE: nkf [flags] [file]
60 ** b Output is buffered (DEFAULT)
61 ** u Output is unbuffered
65 ** j Output code is JIS 7 bit (DEFAULT SELECT)
66 ** s Output code is MS Kanji (DEFAULT SELECT)
67 ** e Output code is AT&T JIS (DEFAULT SELECT)
68 ** w Output code is UTF-8 (DEFAULT SELECT)
69 ** l Output code is JIS 7bit and ISO8859-1 Latin-1
71 ** m MIME conversion for ISO-2022-JP
72 ** I Convert non ISO-2022-JP character to GETA by Pekoe <pekoe@lair.net>
73 ** i_ Output sequence to designate JIS-kanji (DEFAULT_J)
74 ** o_ Output sequence to designate single-byte roman characters (DEFAULT_R)
75 ** M MIME output conversion
77 ** r {de/en}crypt ROT13/47
81 ** T Text mode output (for MS-DOS)
83 ** x Do not convert X0201 kana into X0208
84 ** Z Convert X0208 alphabet to ASCII
89 ** B try to fix broken JIS, missing Escape
90 ** B[1-9] broken level
92 ** O Output to 'nkf.out' file or last file name
93 ** d Delete \r in line feed
94 ** c Add \r in line feed
95 ** -- other long option
96 ** -- ignore following option (don't use with -O )
100 #if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS)
102 #if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__)
118 #if defined(MSDOS) || defined(__OS2__)
121 #if defined(_MSC_VER) || defined(__WATCOMC__)
122 #define mktemp _mktemp
128 #define setbinmode(fp) fsetbin(fp)
129 #elif defined(__DJGPP__)
130 #include <libc/dosio.h>
131 #define setbinmode(fp) djgpp_setbinmode(fp)
132 #else /* Microsoft C, Turbo C */
133 #define setbinmode(fp) setmode(fileno(fp), O_BINARY)
136 #define setbinmode(fp)
139 #if defined(__DJGPP__)
140 void djgpp_setbinmode(FILE *fp)
142 /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */
145 m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY;
146 __file_handle_set(fd, m);
150 #ifdef _IOFBF /* SysV and MSDOS, Windows */
151 #define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size)
153 #define setvbuffer(fp, buf, size) setbuffer(fp, buf, size)
156 /*Borland C++ 4.5 EasyWin*/
157 #if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */
166 /* added by satoru@isoternet.org */
168 #include <sys/types.h>
170 #include <sys/stat.h>
171 #if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */
173 #if defined(__WATCOMC__)
174 #include <sys/utime.h>
178 #else /* defined(MSDOS) */
180 #ifdef __BORLANDC__ /* BCC32 */
182 #else /* !defined(__BORLANDC__) */
183 #include <sys/utime.h>
184 #endif /* (__BORLANDC__) */
185 #else /* !defined(__WIN32__) */
186 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */
187 #include <sys/utime.h>
188 #elif defined(__TURBOC__) /* BCC */
190 #elif defined(LSI_C) /* LSI C */
191 #endif /* (__WIN32__) */
199 /* state of output_mode and input_mode
216 #define X0213_1 0x284F
217 #define X0213_2 0x2850
219 /* Input Assumption */
224 #define LATIN1_INPUT 6
226 #define STRICT_MIME 8
231 #define JAPANESE_EUC 10
235 #define UTF8_INPUT 13
236 #define UTF16_INPUT 1015
237 #define UTF32_INPUT 1017
241 #define ENDIAN_BIG 1234
242 #define ENDIAN_LITTLE 4321
243 #define ENDIAN_2143 2143
244 #define ENDIAN_3412 3412
/*
 * Locale-independent, ASCII-only character classification and conversion
 * macros.
 *
 * Every parameter is parenthesized so that a compound argument such as
 * `v & 0xFF` or `a | b` is classified as a whole instead of being torn apart
 * by operator precedence.  The arguments are still evaluated more than once,
 * so never pass an expression with side effects (e.g. `*p++`).
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SPACE || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == NL)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (' ' <= (c) && (c) <= '~')
#define nkf_isgraph(c)  ('!' <= (c) && (c) <= '~')
/* Value of a hexadecimal digit character; any non-hex character yields 0. */
#define hex2bin(c)      (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
                         ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
                         ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c)      ("0123456789ABCDEF"[(c) & 15])
/* True when the second-lowest byte of c2 (taken as unsigned short) is SS3. */
#define is_eucg3(c2)    (((unsigned short)(c2) >> 8) == SS3)
/*
 * Inclusive byte ranges used by the CP932 conversion code.
 * NOTE(review): presumably Shift_JIS lead-byte ranges of the IBM extended
 * (0xFA-0xFC) and NEC-selected IBM extended (0xED-0xEE) characters -- confirm
 * against the conversion tables.
 */
#define CP932_TABLE_BEGIN    0xFA
#define CP932_TABLE_END      0xFC
#define CP932INV_TABLE_BEGIN 0xED
#define CP932INV_TABLE_END   0xEE
/*
 * True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END].  The parameter
 * is parenthesized so compound arguments (e.g. `b & 0xFF`) are tested as a
 * whole; it is still evaluated twice.
 */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
290 #define HOLD_SIZE 1024
291 #if defined(INT_IS_SHORT)
292 #define IOBUF_SIZE 2048
294 #define IOBUF_SIZE 16384
297 #define DEFAULT_J 'B'
298 #define DEFAULT_R 'B'
300 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
301 #define SJ6394 0x0161 /* 63 - 94 ku offset */
303 #define RANGE_NUM_MAX 18
308 #if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE)
309 #define sizeof_euc_to_utf8_1byte 94
310 #define sizeof_euc_to_utf8_2bytes 94
311 #define sizeof_utf8_to_euc_C2 64
312 #define sizeof_utf8_to_euc_E5B8 64
313 #define sizeof_utf8_to_euc_2bytes 112
314 #define sizeof_utf8_to_euc_3bytes 16
317 /* MIME preprocessor */
319 #ifdef EASYWIN /*Easy Win */
320 extern POINT _BufferSize;
329 void (*status_func)(struct input_code *, nkf_char);
330 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
334 static char *input_codename = "";
337 static const char *CopyRight = COPY_RIGHT;
339 #if !defined(PERL_XS) && !defined(WIN32DLL)
340 static nkf_char noconvert(FILE *f);
342 static void module_connection(void);
343 static nkf_char kanji_convert(FILE *f);
344 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
345 static nkf_char push_hold_buf(nkf_char c2);
346 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
347 static nkf_char s_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
348 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
349 static nkf_char e_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
350 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
352 * 0: Shift_JIS, eucJP-ascii
357 #define UCS_MAP_ASCII 0
359 #define UCS_MAP_CP932 2
360 #define UCS_MAP_CP10001 3
361 static int ms_ucs_map_f = UCS_MAP_ASCII;
363 #ifdef UTF8_INPUT_ENABLE
364 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
365 static int no_cp932ext_f = FALSE;
366 /* ignore ZERO WIDTH NO-BREAK SPACE */
367 static int no_best_fit_chars_f = FALSE;
368 static int input_endian = ENDIAN_BIG;
369 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
370 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
371 static void encode_fallback_html(nkf_char c);
372 static void encode_fallback_xml(nkf_char c);
373 static void encode_fallback_java(nkf_char c);
374 static void encode_fallback_perl(nkf_char c);
375 static void encode_fallback_subchar(nkf_char c);
376 static void (*encode_fallback)(nkf_char c) = NULL;
377 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
378 static nkf_char w_iconv(nkf_char c2,nkf_char c1,nkf_char c0);
379 static nkf_char w_iconv16(nkf_char c2,nkf_char c1,nkf_char c0);
380 static nkf_char w_iconv32(nkf_char c2,nkf_char c1,nkf_char c0);
381 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
382 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
383 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
384 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
385 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
386 static void w_status(struct input_code *, nkf_char);
388 #ifdef UTF8_OUTPUT_ENABLE
389 static int output_bom_f = FALSE;
390 static int output_endian = ENDIAN_BIG;
391 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
392 static void w_oconv(nkf_char c2,nkf_char c1);
393 static void w_oconv16(nkf_char c2,nkf_char c1);
394 static void w_oconv32(nkf_char c2,nkf_char c1);
396 static void e_oconv(nkf_char c2,nkf_char c1);
397 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
398 static void s_oconv(nkf_char c2,nkf_char c1);
399 static void j_oconv(nkf_char c2,nkf_char c1);
400 static void fold_conv(nkf_char c2,nkf_char c1);
401 static void cr_conv(nkf_char c2,nkf_char c1);
402 static void z_conv(nkf_char c2,nkf_char c1);
403 static void rot_conv(nkf_char c2,nkf_char c1);
404 static void hira_conv(nkf_char c2,nkf_char c1);
405 static void base64_conv(nkf_char c2,nkf_char c1);
406 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
407 static void no_connection(nkf_char c2,nkf_char c1);
408 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
410 static void code_score(struct input_code *ptr);
411 static void code_status(nkf_char c);
413 static void std_putc(nkf_char c);
414 static nkf_char std_getc(FILE *f);
415 static nkf_char std_ungetc(nkf_char c,FILE *f);
417 static nkf_char broken_getc(FILE *f);
418 static nkf_char broken_ungetc(nkf_char c,FILE *f);
420 static nkf_char mime_begin(FILE *f);
421 static nkf_char mime_getc(FILE *f);
422 static nkf_char mime_ungetc(nkf_char c,FILE *f);
424 static void switch_mime_getc(void);
425 static void unswitch_mime_getc(void);
426 static nkf_char mime_begin_strict(FILE *f);
427 static nkf_char mime_getc_buf(FILE *f);
428 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
429 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
431 static nkf_char base64decode(nkf_char c);
432 static void mime_prechar(nkf_char c2, nkf_char c1);
433 static void mime_putc(nkf_char c);
434 static void open_mime(nkf_char c);
435 static void close_mime(void);
436 static void eof_mime(void);
437 static void mimeout_addchar(nkf_char c);
439 static void usage(void);
440 static void version(void);
442 static void options(unsigned char *c);
443 #if defined(PERL_XS) || defined(WIN32DLL)
444 static void reinit(void);
449 #if !defined(PERL_XS) && !defined(WIN32DLL)
450 static unsigned char stdibuf[IOBUF_SIZE];
451 static unsigned char stdobuf[IOBUF_SIZE];
453 static unsigned char hold_buf[HOLD_SIZE*2];
454 static int hold_count = 0;
456 /* MIME preprocessor fifo */
458 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
459 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
460 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
461 static unsigned char mime_buf[MIME_BUF_SIZE];
462 static unsigned int mime_top = 0;
463 static unsigned int mime_last = 0; /* decoded */
464 static unsigned int mime_input = 0; /* undecoded */
465 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
468 static int unbuf_f = FALSE; /* NOTE(review): presumably the -u "output is unbuffered" flag -- confirm */
469 static int estab_f = FALSE;
470 static int nop_f = FALSE;
471 static int binmode_f = TRUE; /* binary mode */
472 static int rot_f = FALSE; /* rot13/47 mode */
473 static int hira_f = FALSE; /* hiragana/katakana conversion */
474 static int input_f = FALSE; /* non fixed input code */
475 static int alpha_f = FALSE; /* convert JIS X0208 alphabet to ASCII */
476 static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
477 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
478 static int mimebuf_f = FALSE; /* MIME buffered input */
479 static int broken_f = FALSE; /* convert ESC-less broken JIS */
480 static int iso8859_f = FALSE; /* ISO8859 through */
481 static int mimeout_f = FALSE; /* base64 mode */
482 #if defined(MSDOS) || defined(__OS2__)
483 static int x0201_f = TRUE; /* Assume JISX0201 kana */
485 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
487 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
489 #ifdef UNICODE_NORMALIZATION
490 static int nfc_f = FALSE;
491 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
492 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
493 static nkf_char nfc_getc(FILE *f);
494 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
498 static int cap_f = FALSE;
499 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
500 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
501 static nkf_char cap_getc(FILE *f);
502 static nkf_char cap_ungetc(nkf_char c,FILE *f);
504 static int url_f = FALSE;
505 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
506 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
507 static nkf_char url_getc(FILE *f);
508 static nkf_char url_ungetc(nkf_char c,FILE *f);
511 #if defined(INT_IS_SHORT)
512 #define NKF_INT32_C(n) (n##L)
514 #define NKF_INT32_C(n) (n)
/*
 * Layout of values that carry a raw Unicode code point: the top 8 bits
 * (CLASS_MASK) hold a class tag and the low 24 bits (VALUE_MASK) hold the
 * value.  CLASS_UNICODE is the tag tested by is_unicode_capsule().
 * NOTE(review): PREFIX_EUCG3 looks like the marker for EUC code set 3
 * (SS3 = 0x8F in the high byte) -- confirm against is_eucg3().
 */
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF) /* highest Unicode code point */
/*
 * The parameter is parenthesized so that a compound argument such as
 * `tag | value` is masked as a whole rather than having only its right-hand
 * operand masked.
 */
/* True when the class tag of c is CLASS_UNICODE. */
#define is_unicode_capsule(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the 24-bit value part of c is at most 0xFFFF. */
#define is_unicode_bmp(c) (((c) & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
524 #ifdef NUMCHAR_OPTION
525 static int numchar_f = FALSE;
526 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
527 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
528 static nkf_char numchar_getc(FILE *f);
529 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
533 static int noout_f = FALSE;
534 static void no_putc(nkf_char c);
535 static nkf_char debug_f = FALSE;
536 static void debug(const char *str);
537 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
540 static int guess_f = FALSE;
542 static void print_guessed_code(char *filename);
544 static void set_input_codename(char *codename);
545 static int is_inputcode_mixed = FALSE;
546 static int is_inputcode_set = FALSE;
549 static int exec_f = 0;
552 #ifdef SHIFTJIS_CP932
553 /* invert IBM extended characters to others */
554 static int cp51932_f = FALSE;
556 /* invert NEC-selected IBM extended characters to IBM extended characters */
557 static int cp932inv_f = TRUE;
559 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
560 #endif /* SHIFTJIS_CP932 */
563 static int x0212_f = FALSE;
564 static nkf_char x0212_shift(nkf_char c);
565 static nkf_char x0212_unshift(nkf_char c);
567 static int x0213_f = FALSE;
569 static unsigned char prefix_table[256];
571 static void set_code_score(struct input_code *ptr, nkf_char score);
572 static void clr_code_score(struct input_code *ptr, nkf_char score);
573 static void status_disable(struct input_code *ptr);
574 static void status_push_ch(struct input_code *ptr, nkf_char c);
575 static void status_clear(struct input_code *ptr);
576 static void status_reset(struct input_code *ptr);
577 static void status_reinit(struct input_code *ptr);
578 static void status_check(struct input_code *ptr, nkf_char c);
579 static void e_status(struct input_code *, nkf_char);
580 static void s_status(struct input_code *, nkf_char);
582 struct input_code input_code_list[] = {
583 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
584 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
585 #ifdef UTF8_INPUT_ENABLE
586 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
587 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
588 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
593 static int mimeout_mode = 0;
594 static int base64_count = 0;
596 /* X0208 -> ASCII converter */
599 static int f_line = 0; /* chars in line */
600 static int f_prev = 0;
601 static int fold_preserve_f = FALSE; /* preserve new lines */
602 static int fold_f = FALSE;
603 static int fold_len = 0;
606 static unsigned char kanji_intro = DEFAULT_J;
607 static unsigned char ascii_intro = DEFAULT_R;
611 #define FOLD_MARGIN 10
612 #define DEFAULT_FOLD 60
614 static int fold_margin = FOLD_MARGIN;
618 #ifdef DEFAULT_CODE_JIS
619 # define DEFAULT_CONV j_oconv
621 #ifdef DEFAULT_CODE_SJIS
622 # define DEFAULT_CONV s_oconv
624 #ifdef DEFAULT_CODE_EUC
625 # define DEFAULT_CONV e_oconv
627 #ifdef DEFAULT_CODE_UTF8
628 # define DEFAULT_CONV w_oconv
631 /* process default */
632 static void (*output_conv)(nkf_char c2,nkf_char c1) = DEFAULT_CONV;
634 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
635 /* s_iconv or oconv */
636 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
638 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
639 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
640 static void (*o_crconv)(nkf_char c2,nkf_char c1) = no_connection;
641 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
642 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
643 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
644 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
646 /* static redirections */
648 static void (*o_putc)(nkf_char c) = std_putc;
650 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
651 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
653 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
654 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
656 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
658 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
659 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
661 /* for strict mime */
662 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
663 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
666 static int output_mode = ASCII, /* output kanji mode */
667 input_mode = ASCII, /* input kanji mode */
668 shift_mode = FALSE; /* TRUE shift out, or X0201 */
669 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
671 /* X0201 / X0208 conversion tables */
673 /* X0201 kana conversion table */
675 static const unsigned char cv[]= {
676 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
677 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
678 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
679 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
680 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
681 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
682 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
683 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
684 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
685 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
686 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
687 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
688 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
689 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
690 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
691 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
695 /* X0201 kana conversion table for dakuten */
697 static const unsigned char dv[]= {
698 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
699 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
700 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
701 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
702 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
703 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
704 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
705 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
706 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
707 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
708 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
709 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
710 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
711 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
712 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
713 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
716 /* X0201 kana conversion table for han-dakuten */
718 static const unsigned char ev[]= {
719 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
720 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
721 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
722 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
723 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
724 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
725 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
726 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
727 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
728 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
729 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
730 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
731 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
732 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
733 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
734 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
738 /* X0208 kigou conversion table */
739 /* 0x8140 - 0x819e */
740 static const unsigned char fv[] = {
742 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
743 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
744 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
745 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
746 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
747 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
748 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
749 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
750 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
751 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
752 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
753 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
759 static int file_out_f = FALSE;
761 static int overwrite_f = FALSE;
762 static int preserve_time_f = FALSE;
763 static int backup_f = FALSE;
764 static char *backup_suffix = "";
765 static char *get_backup_filename(const char *suffix, const char *filename);
768 static int crmode_f = 0; /* CR, NL, CRLF */
769 static nkf_char prev_cr = 0;
770 #ifdef EASYWIN /*Easy Win */
771 static int end_check;
774 #define STD_GC_BUFSIZE (256)
775 nkf_char std_gc_buf[STD_GC_BUFSIZE];
779 #include "nkf32dll.c"
780 #elif defined(PERL_XS)
782 int main(int argc, char **argv)
787 char *outfname = NULL;
790 #ifdef EASYWIN /*Easy Win */
791 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
794 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
795 cp = (unsigned char *)*argv;
800 if (pipe(fds) < 0 || (pid = fork()) < 0){
811 execvp(argv[1], &argv[1]);
825 if(x0201_f == WISH_TRUE)
826 x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
828 if (binmode_f == TRUE)
829 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
830 if (freopen("","wb",stdout) == NULL)
837 setbuf(stdout, (char *) NULL);
839 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
842 if (binmode_f == TRUE)
843 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
844 if (freopen("","rb",stdin) == NULL) return (-1);
848 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
852 kanji_convert(stdin);
853 if (guess_f) print_guessed_code(NULL);
857 int is_argument_error = FALSE;
859 is_inputcode_mixed = FALSE;
860 is_inputcode_set = FALSE;
865 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
868 is_argument_error = TRUE;
876 /* reopen file for stdout */
877 if (file_out_f == TRUE) {
880 outfname = malloc(strlen(origfname)
881 + strlen(".nkftmpXXXXXX")
887 strcpy(outfname, origfname);
891 for (i = strlen(outfname); i; --i){
892 if (outfname[i - 1] == '/'
893 || outfname[i - 1] == '\\'){
899 strcat(outfname, "ntXXXXXX");
901 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
904 strcat(outfname, ".nkftmpXXXXXX");
905 fd = mkstemp(outfname);
908 || (fd_backup = dup(fileno(stdout))) < 0
909 || dup2(fd, fileno(stdout)) < 0
920 outfname = "nkf.out";
923 if(freopen(outfname, "w", stdout) == NULL) {
927 if (binmode_f == TRUE) {
928 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
929 if (freopen("","wb",stdout) == NULL)
936 if (binmode_f == TRUE)
937 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
938 if (freopen("","rb",fin) == NULL)
943 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
947 char *filename = NULL;
949 if (nfiles > 1) filename = origfname;
950 if (guess_f) print_guessed_code(filename);
956 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
964 if (dup2(fd_backup, fileno(stdout)) < 0){
967 if (stat(origfname, &sb)) {
968 fprintf(stderr, "Can't stat %s\n", origfname);
970 /* Restore the original file's permissions. */
971 if (chmod(outfname, sb.st_mode)) {
972 fprintf(stderr, "Can't set permission %s\n", outfname);
975 /* Restore the original file's timestamps. */
977 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
978 tb[0] = tb[1] = sb.st_mtime;
979 if (utime(outfname, tb)) {
980 fprintf(stderr, "Can't set timestamp %s\n", outfname);
983 tb.actime = sb.st_atime;
984 tb.modtime = sb.st_mtime;
985 if (utime(outfname, &tb)) {
986 fprintf(stderr, "Can't set timestamp %s\n", outfname);
991 char *backup_filename = get_backup_filename(backup_suffix, origfname);
993 unlink(backup_filename);
995 if (rename(origfname, backup_filename)) {
996 perror(backup_filename);
997 fprintf(stderr, "Can't rename %s to %s\n",
998 origfname, backup_filename);
1002 if (unlink(origfname)){
1007 if (rename(outfname, origfname)) {
1009 fprintf(stderr, "Can't rename %s to %s\n",
1010 outfname, origfname);
1017 if (is_argument_error)
1020 #ifdef EASYWIN /*Easy Win */
1021 if (file_out_f == FALSE)
1022 scanf("%d",&end_check);
1025 #else /* for Other OS */
1026 if (file_out_f == TRUE)
1028 #endif /*Easy Win */
1031 #endif /* WIN32DLL */
/*
 * Build the name of the backup file for `filename` from the user-supplied
 * `suffix`.
 *
 * If `suffix` contains one or more '*' characters, every '*' is replaced by
 * `filename` and all other characters of `suffix` are copied literally
 * ("*.bak" applied to "a.txt" gives "a.txt.bak").  If it contains no '*',
 * `suffix` is simply appended to `filename`.
 *
 * Returns a malloc()ed, NUL-terminated string owned by the caller, or NULL
 * (after reporting through perror) when the allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t total_length;
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each one-byte '*' expands to a full copy of filename */
        total_length = suffix_length - asterisk_count
                     + asterisk_count * filename_length;
    } else {
        total_length = filename_length + suffix_length;
    }

    /* The no-asterisk path used to allocate a single byte (`malloc( + 1)`)
     * and then copy the whole name into it; size the buffer for the result
     * and check the allocation on both paths. */
    backup_filename = malloc(total_length + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (asterisk_count) {
        for (i = 0, j = 0; i < suffix_length; i++) {
            if (suffix[i] == '*') {
                memcpy(backup_filename + j, filename, filename_length);
                j += filename_length;
            } else {
                backup_filename[j++] = suffix[i];
            }
        }
    } else {
        memcpy(backup_filename, filename, filename_length);
        memcpy(backup_filename + filename_length, suffix, suffix_length);
    }
    backup_filename[total_length] = '\0';

    return backup_filename;
}
1074 static const struct {
1098 {"katakana-hiragana","h3"},
1105 #ifdef UTF8_OUTPUT_ENABLE
1115 {"fb-subchar=", ""},
1117 #ifdef UTF8_INPUT_ENABLE
1118 {"utf8-input", "W"},
1119 {"utf16-input", "W16"},
1120 {"no-cp932ext", ""},
1121 {"no-best-fit-chars",""},
1123 #ifdef UNICODE_NORMALIZATION
1124 {"utf8mac-input", ""},
1136 #ifdef NUMCHAR_OPTION
1137 {"numchar-input", ""},
1143 #ifdef SHIFTJIS_CP932
1153 static int option_mode = 0;
1155 void options(unsigned char *cp)
1159 unsigned char *cp_back = NULL;
1164 while(*cp && *cp++!='-');
1165 while (*cp || cp_back) {
1173 case '-': /* literal options */
1174 if (!*cp || *cp == SPACE) { /* ignore the rest of arguments */
1178 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1179 p = (unsigned char *)long_option[i].name;
1180 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1181 if (*p == cp[j] || cp[j] == ' '){
1188 while(*cp && *cp != SPACE && cp++);
1189 if (long_option[i].alias[0]){
1191 cp = (unsigned char *)long_option[i].alias;
1193 if (strcmp(long_option[i].name, "ic=") == 0){
1194 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1195 codeset[i] = nkf_toupper(p[i]);
1198 if(strcmp(codeset, "ISO-2022-JP") == 0){
1199 input_f = JIS_INPUT;
1200 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
1201 strcmp(codeset, "CP50220") == 0 ||
1202 strcmp(codeset, "CP50221") == 0 ||
1203 strcmp(codeset, "CP50222") == 0){
1204 input_f = JIS_INPUT;
1205 #ifdef SHIFTJIS_CP932
1208 #ifdef UTF8_OUTPUT_ENABLE
1209 ms_ucs_map_f = UCS_MAP_CP932;
1211 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1212 input_f = JIS_INPUT;
1216 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1217 input_f = JIS_INPUT;
1222 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1223 input_f = SJIS_INPUT;
1224 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1225 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1226 strcmp(codeset, "CP932") == 0 ||
1227 strcmp(codeset, "MS932") == 0){
1228 input_f = SJIS_INPUT;
1229 #ifdef SHIFTJIS_CP932
1232 #ifdef UTF8_OUTPUT_ENABLE
1233 ms_ucs_map_f = UCS_MAP_CP932;
1235 }else if(strcmp(codeset, "CP10001") == 0){
1236 input_f = SJIS_INPUT;
1237 #ifdef SHIFTJIS_CP932
1240 #ifdef UTF8_OUTPUT_ENABLE
1241 ms_ucs_map_f = UCS_MAP_CP10001;
1243 }else if(strcmp(codeset, "EUCJP") == 0 ||
1244 strcmp(codeset, "EUC-JP") == 0){
1245 input_f = EUC_INPUT;
1246 }else if(strcmp(codeset, "CP51932") == 0){
1247 input_f = EUC_INPUT;
1248 #ifdef SHIFTJIS_CP932
1251 #ifdef UTF8_OUTPUT_ENABLE
1252 ms_ucs_map_f = UCS_MAP_CP932;
1254 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1255 strcmp(codeset, "EUCJP-MS") == 0 ||
1256 strcmp(codeset, "EUCJPMS") == 0){
1257 input_f = EUC_INPUT;
1258 #ifdef SHIFTJIS_CP932
1261 #ifdef UTF8_OUTPUT_ENABLE
1262 ms_ucs_map_f = UCS_MAP_MS;
1264 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1265 strcmp(codeset, "EUCJP-ASCII") == 0){
1266 input_f = EUC_INPUT;
1267 #ifdef SHIFTJIS_CP932
1270 #ifdef UTF8_OUTPUT_ENABLE
1271 ms_ucs_map_f = UCS_MAP_ASCII;
1273 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1274 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1275 input_f = SJIS_INPUT;
1277 #ifdef SHIFTJIS_CP932
1280 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1281 strcmp(codeset, "EUC-JIS-2004") == 0){
1282 input_f = EUC_INPUT;
1284 #ifdef SHIFTJIS_CP932
1287 #ifdef UTF8_INPUT_ENABLE
1288 }else if(strcmp(codeset, "UTF-8") == 0 ||
1289 strcmp(codeset, "UTF-8N") == 0 ||
1290 strcmp(codeset, "UTF-8-BOM") == 0){
1291 input_f = UTF8_INPUT;
1292 #ifdef UNICODE_NORMALIZATION
1293 }else if(strcmp(codeset, "UTF8-MAC") == 0 ||
1294 strcmp(codeset, "UTF-8-MAC") == 0){
1295 input_f = UTF8_INPUT;
1298 }else if(strcmp(codeset, "UTF-16") == 0 ||
1299 strcmp(codeset, "UTF-16BE") == 0 ||
1300 strcmp(codeset, "UTF-16BE-BOM") == 0){
1301 input_f = UTF16_INPUT;
1302 input_endian = ENDIAN_BIG;
1303 }else if(strcmp(codeset, "UTF-16LE") == 0 ||
1304 strcmp(codeset, "UTF-16LE-BOM") == 0){
1305 input_f = UTF16_INPUT;
1306 input_endian = ENDIAN_LITTLE;
1307 }else if(strcmp(codeset, "UTF-32") == 0 ||
1308 strcmp(codeset, "UTF-32BE") == 0 ||
1309 strcmp(codeset, "UTF-32BE-BOM") == 0){
1310 input_f = UTF32_INPUT;
1311 input_endian = ENDIAN_BIG;
1312 }else if(strcmp(codeset, "UTF-32LE") == 0 ||
1313 strcmp(codeset, "UTF-32LE-BOM") == 0){
1314 input_f = UTF32_INPUT;
1315 input_endian = ENDIAN_LITTLE;
1320 if (strcmp(long_option[i].name, "oc=") == 0){
1322 for (i=0; i < 16 && SPACE < p[i] && p[i] < DEL; i++){
1323 codeset[i] = nkf_toupper(p[i]);
1326 if(strcmp(codeset, "ISO-2022-JP") == 0){
1327 output_conv = j_oconv;
1328 }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
1329 output_conv = j_oconv;
1330 no_cp932ext_f = TRUE;
1331 #ifdef SHIFTJIS_CP932
1334 #ifdef UTF8_OUTPUT_ENABLE
1335 ms_ucs_map_f = UCS_MAP_CP932;
1337 }else if(strcmp(codeset, "CP50220") == 0){
1338 output_conv = j_oconv;
1340 #ifdef SHIFTJIS_CP932
1343 #ifdef UTF8_OUTPUT_ENABLE
1344 ms_ucs_map_f = UCS_MAP_CP932;
1346 }else if(strcmp(codeset, "CP50221") == 0){
1347 output_conv = j_oconv;
1348 #ifdef SHIFTJIS_CP932
1351 #ifdef UTF8_OUTPUT_ENABLE
1352 ms_ucs_map_f = UCS_MAP_CP932;
1354 }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
1355 output_conv = j_oconv;
1359 #ifdef SHIFTJIS_CP932
1362 }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
1363 output_conv = j_oconv;
1368 #ifdef SHIFTJIS_CP932
1371 }else if(strcmp(codeset, "SHIFT_JIS") == 0){
1372 output_conv = s_oconv;
1373 }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
1374 strcmp(codeset, "CSWINDOWS31J") == 0 ||
1375 strcmp(codeset, "CP932") == 0 ||
1376 strcmp(codeset, "MS932") == 0){
1377 output_conv = s_oconv;
1378 #ifdef UTF8_OUTPUT_ENABLE
1379 ms_ucs_map_f = UCS_MAP_CP932;
1381 }else if(strcmp(codeset, "CP10001") == 0){
1382 output_conv = s_oconv;
1383 #ifdef UTF8_OUTPUT_ENABLE
1384 ms_ucs_map_f = UCS_MAP_CP10001;
1386 }else if(strcmp(codeset, "EUCJP") == 0 ||
1387 strcmp(codeset, "EUC-JP") == 0){
1388 output_conv = e_oconv;
1389 }else if(strcmp(codeset, "CP51932") == 0){
1390 output_conv = e_oconv;
1391 #ifdef SHIFTJIS_CP932
1394 #ifdef UTF8_OUTPUT_ENABLE
1395 ms_ucs_map_f = UCS_MAP_CP932;
1397 }else if(strcmp(codeset, "EUC-JP-MS") == 0 ||
1398 strcmp(codeset, "EUCJP-MS") == 0 ||
1399 strcmp(codeset, "EUCJPMS") == 0){
1400 output_conv = e_oconv;
1404 #ifdef UTF8_OUTPUT_ENABLE
1405 ms_ucs_map_f = UCS_MAP_MS;
1407 }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
1408 strcmp(codeset, "EUCJP-ASCII") == 0){
1409 output_conv = e_oconv;
1413 #ifdef UTF8_OUTPUT_ENABLE
1414 ms_ucs_map_f = UCS_MAP_ASCII;
1416 }else if(strcmp(codeset, "SHIFT_JISX0213") == 0 ||
1417 strcmp(codeset, "SHIFT_JIS-2004") == 0){
1418 output_conv = s_oconv;
1420 #ifdef SHIFTJIS_CP932
1423 }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
1424 strcmp(codeset, "EUC-JIS-2004") == 0){
1425 output_conv = e_oconv;
1430 #ifdef SHIFTJIS_CP932
1433 #ifdef UTF8_OUTPUT_ENABLE
1434 }else if(strcmp(codeset, "UTF-8") == 0){
1435 output_conv = w_oconv;
1436 }else if(strcmp(codeset, "UTF-8N") == 0){
1437 output_conv = w_oconv;
1438 }else if(strcmp(codeset, "UTF-8-BOM") == 0){
1439 output_conv = w_oconv;
1440 output_bom_f = TRUE;
1441 }else if(strcmp(codeset, "UTF-16BE") == 0){
1442 output_conv = w_oconv16;
1443 }else if(strcmp(codeset, "UTF-16") == 0 ||
1444 strcmp(codeset, "UTF-16BE-BOM") == 0){
1445 output_conv = w_oconv16;
1446 output_bom_f = TRUE;
1447 }else if(strcmp(codeset, "UTF-16LE") == 0){
1448 output_conv = w_oconv16;
1449 output_endian = ENDIAN_LITTLE;
1450 }else if(strcmp(codeset, "UTF-16LE-BOM") == 0){
1451 output_conv = w_oconv16;
1452 output_endian = ENDIAN_LITTLE;
1453 output_bom_f = TRUE;
1454 }else if(strcmp(codeset, "UTF-32") == 0 ||
1455 strcmp(codeset, "UTF-32BE") == 0){
1456 output_conv = w_oconv32;
1457 }else if(strcmp(codeset, "UTF-32BE-BOM") == 0){
1458 output_conv = w_oconv32;
1459 output_bom_f = TRUE;
1460 }else if(strcmp(codeset, "UTF-32LE") == 0){
1461 output_conv = w_oconv32;
1462 output_endian = ENDIAN_LITTLE;
1463 }else if(strcmp(codeset, "UTF-32LE-BOM") == 0){
1464 output_conv = w_oconv32;
1465 output_endian = ENDIAN_LITTLE;
1466 output_bom_f = TRUE;
1472 if (strcmp(long_option[i].name, "overwrite") == 0){
1475 preserve_time_f = TRUE;
1478 if (strcmp(long_option[i].name, "overwrite=") == 0){
1481 preserve_time_f = TRUE;
1483 backup_suffix = malloc(strlen((char *) p) + 1);
1484 strcpy(backup_suffix, (char *) p);
1487 if (strcmp(long_option[i].name, "in-place") == 0){
1490 preserve_time_f = FALSE;
1493 if (strcmp(long_option[i].name, "in-place=") == 0){
1496 preserve_time_f = FALSE;
1498 backup_suffix = malloc(strlen((char *) p) + 1);
1499 strcpy(backup_suffix, (char *) p);
1504 if (strcmp(long_option[i].name, "cap-input") == 0){
1508 if (strcmp(long_option[i].name, "url-input") == 0){
1513 #ifdef NUMCHAR_OPTION
1514 if (strcmp(long_option[i].name, "numchar-input") == 0){
1520 if (strcmp(long_option[i].name, "no-output") == 0){
1524 if (strcmp(long_option[i].name, "debug") == 0){
1529 if (strcmp(long_option[i].name, "cp932") == 0){
1530 #ifdef SHIFTJIS_CP932
1534 #ifdef UTF8_OUTPUT_ENABLE
1535 ms_ucs_map_f = UCS_MAP_CP932;
1539 if (strcmp(long_option[i].name, "no-cp932") == 0){
1540 #ifdef SHIFTJIS_CP932
1544 #ifdef UTF8_OUTPUT_ENABLE
1545 ms_ucs_map_f = UCS_MAP_ASCII;
1549 #ifdef SHIFTJIS_CP932
1550 if (strcmp(long_option[i].name, "cp932inv") == 0){
1557 if (strcmp(long_option[i].name, "x0212") == 0){
1564 if (strcmp(long_option[i].name, "exec-in") == 0){
1568 if (strcmp(long_option[i].name, "exec-out") == 0){
1573 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1574 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1575 no_cp932ext_f = TRUE;
1578 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1579 no_best_fit_chars_f = TRUE;
1582 if (strcmp(long_option[i].name, "fb-skip") == 0){
1583 encode_fallback = NULL;
1586 if (strcmp(long_option[i].name, "fb-html") == 0){
1587 encode_fallback = encode_fallback_html;
1590 if (strcmp(long_option[i].name, "fb-xml" ) == 0){
1591 encode_fallback = encode_fallback_xml;
1594 if (strcmp(long_option[i].name, "fb-java") == 0){
1595 encode_fallback = encode_fallback_java;
1598 if (strcmp(long_option[i].name, "fb-perl") == 0){
1599 encode_fallback = encode_fallback_perl;
1602 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1603 encode_fallback = encode_fallback_subchar;
1606 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1607 encode_fallback = encode_fallback_subchar;
1608 unicode_subchar = 0;
1610 /* decimal number */
1611 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1612 unicode_subchar *= 10;
1613 unicode_subchar += hex2bin(p[i]);
1615 }else if(p[1] == 'x' || p[1] == 'X'){
1616 /* hexadecimal number */
1617 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1618 unicode_subchar <<= 4;
1619 unicode_subchar |= hex2bin(p[i]);
1623 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1624 unicode_subchar *= 8;
1625 unicode_subchar += hex2bin(p[i]);
1628 w16e_conv(unicode_subchar, &i, &j);
1629 unicode_subchar = i<<8 | j;
1633 #ifdef UTF8_OUTPUT_ENABLE
1634 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1635 ms_ucs_map_f = UCS_MAP_MS;
1639 #ifdef UNICODE_NORMALIZATION
1640 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1641 input_f = UTF8_INPUT;
1646 if (strcmp(long_option[i].name, "prefix=") == 0){
1647 if (nkf_isgraph(p[0])){
1648 for (i = 1; nkf_isgraph(p[i]); i++){
1649 prefix_table[p[i]] = p[0];
1656 case 'b': /* buffered mode */
1659 case 'u': /* non bufferd mode */
1662 case 't': /* transparent mode */
1667 } else if (*cp=='2') {
1671 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1679 case 'j': /* JIS output */
1681 output_conv = j_oconv;
1683 case 'e': /* AT&T EUC output */
1684 output_conv = e_oconv;
1687 case 's': /* SJIS output */
1688 output_conv = s_oconv;
1690 case 'l': /* ISO8859 Latin-1 support, no conversion */
1691 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1692 input_f = LATIN1_INPUT;
1694 case 'i': /* Kanji IN ESC-$-@/B */
1695 if (*cp=='@'||*cp=='B')
1696 kanji_intro = *cp++;
1698 case 'o': /* ASCII IN ESC-(-J/B */
1699 if (*cp=='J'||*cp=='B'||*cp=='H')
1700 ascii_intro = *cp++;
1704 bit:1 katakana->hiragana
1705 bit:2 hiragana->katakana
1707 if ('9'>= *cp && *cp>='0')
1708 hira_f |= (*cp++ -'0');
1715 #if defined(MSDOS) || defined(__OS2__)
1730 #ifdef UTF8_OUTPUT_ENABLE
1731 case 'w': /* UTF-8 output */
1733 output_conv = w_oconv; cp++;
1737 output_bom_f = TRUE;
1740 if ('1'== cp[0] && '6'==cp[1]) {
1741 output_conv = w_oconv16; cp+=2;
1742 } else if ('3'== cp[0] && '2'==cp[1]) {
1743 output_conv = w_oconv32; cp+=2;
1745 output_conv = w_oconv;
1750 output_endian = ENDIAN_LITTLE;
1751 } else if (cp[0] == 'B') {
1759 output_bom_f = TRUE;
1764 #ifdef UTF8_INPUT_ENABLE
1765 case 'W': /* UTF input */
1768 input_f = UTF8_INPUT;
1770 if ('1'== cp[0] && '6'==cp[1]) {
1772 input_f = UTF16_INPUT;
1773 input_endian = ENDIAN_BIG;
1774 } else if ('3'== cp[0] && '2'==cp[1]) {
1776 input_f = UTF32_INPUT;
1777 input_endian = ENDIAN_BIG;
1779 input_f = UTF8_INPUT;
1784 input_endian = ENDIAN_LITTLE;
1785 } else if (cp[0] == 'B') {
1791 /* Input code assumption */
1792 case 'J': /* JIS input */
1793 input_f = JIS_INPUT;
1795 case 'E': /* AT&T EUC input */
1796 input_f = EUC_INPUT;
1798 case 'S': /* MS Kanji input */
1799 input_f = SJIS_INPUT;
1800 if (x0201_f==NO_X0201) x0201_f=TRUE;
1802 case 'Z': /* Convert X0208 alphabet to asii */
1804 bit:0 Convert JIS X 0208 Alphabet to ASCII
1805 bit:1 Convert Kankaku to one space
1806 bit:2 Convert Kankaku to two spaces
1807 bit:3 Convert HTML Entity
1808 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1810 while ('0'<= *cp && *cp <='9') {
1811 alpha_f |= 1 << (*cp++ - '0');
1813 if (!alpha_f) alpha_f = 1;
1815 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1816 x0201_f = FALSE; /* No X0201->X0208 conversion */
1818 ESC-(-I in JIS, EUC, MS Kanji
1819 SI/SO in JIS, EUC, MS Kanji
1820 SSO in EUC, JIS, not in MS Kanji
1821 MS Kanji (0xa0-0xdf)
1823 ESC-(-I in JIS (0x20-0x5f)
1824 SSO in EUC (0xa0-0xdf)
1825 0xa0-0xd in MS Kanji (0xa0-0xdf)
1828 case 'X': /* Assume X0201 kana */
1829 /* Default value is NO_X0201 for EUC/MS-Kanji mix */
1832 case 'F': /* prserve new lines */
1833 fold_preserve_f = TRUE;
1834 case 'f': /* folding -f60 or -f */
1837 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1839 fold_len += *cp++ - '0';
1841 if (!(0<fold_len && fold_len<BUFSIZ))
1842 fold_len = DEFAULT_FOLD;
1846 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1848 fold_margin += *cp++ - '0';
1852 case 'm': /* MIME support */
1853 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1854 if (*cp=='B'||*cp=='Q') {
1855 mime_decode_mode = *cp++;
1856 mimebuf_f = FIXED_MIME;
1857 } else if (*cp=='N') {
1858 mime_f = TRUE; cp++;
1859 } else if (*cp=='S') {
1860 mime_f = STRICT_MIME; cp++;
1861 } else if (*cp=='0') {
1862 mime_decode_f = FALSE;
1863 mime_f = FALSE; cp++;
1866 case 'M': /* MIME output */
1869 mimeout_f = FIXED_MIME; cp++;
1870 } else if (*cp=='Q') {
1872 mimeout_f = FIXED_MIME; cp++;
1877 case 'B': /* Broken JIS support */
1879 bit:1 allow any x on ESC-(-x or ESC-$-x
1880 bit:2 reset to ascii on NL
1882 if ('9'>= *cp && *cp>='0')
1883 broken_f |= 1<<(*cp++ -'0');
1888 case 'O':/* for Output file */
1892 case 'c':/* add cr code */
1895 case 'd':/* delete cr code */
1898 case 'I': /* ISO-2022-JP output */
1901 case 'L': /* line mode */
1902 if (*cp=='u') { /* unix */
1903 crmode_f = NL; cp++;
1904 } else if (*cp=='m') { /* mac */
1905 crmode_f = CR; cp++;
1906 } else if (*cp=='w') { /* windows */
1907 crmode_f = CRLF; cp++;
1908 } else if (*cp=='0') { /* no conversion */
1918 /* multiple options in a single string are allowed for the Perl module */
1919 while(*cp && *cp++!='-');
1922 /* bogus option but ignored */
/*
 * Find the input_code_list entry that uses the given converter.
 *
 * iconv_func: converter function pointer to look for; an entry matches when
 *             its iconv_func member compares equal to it.
 * Returns a pointer to the matching struct input_code.
 * NOTE(review): the result when nothing matches is presumably a null
 * pointer -- confirm.
 */
1928 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1931 struct input_code *p = input_code_list;
1933 if (iconv_func == p->iconv_func){
/*
 * Update the input-converter selection.
 * NOTE(review): presumably stores iconv_func in the global iconv and f in
 * estab_f -- confirm against the assignments.
 *
 * f:          establishment flag; callers pass FALSE, TRUE or -TRUE.
 *             Under INPUT_CODE_FIX one condition additionally requires
 *             f == -TRUE ("FORCE") or input_f being unset.
 * iconv_func: converter function (callers also pass 0).
 *
 * When a converter is established (estab_f) and iconv differs from
 * iconv_for_check, the input_code entry for iconv is looked up, its name is
 * recorded with set_input_codename(), reported through debug(), and
 * iconv_for_check is set to iconv.
 */
1942 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1944 #ifdef INPUT_CODE_FIX
1952 #ifdef INPUT_CODE_FIX
1953 && (f == -TRUE || !input_f) /* -TRUE means "FORCE" */
1959 if (estab_f && iconv_for_check != iconv){
1960 struct input_code *p = find_inputcode_byfunc(iconv);
1962 set_input_codename(p->name);
1963 debug(input_codename);
1965 iconv_for_check = iconv;
/*
 * Penalty bits OR-ed into input_code.score by the encoding detectors.
 * Each flag is the previous one shifted left by one bit; when candidates are
 * compared, the entry with the smaller score is preferred.
 */
1970 #define SCORE_L2 (1) /* JIS level-2 kanji */
1971 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
1972 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
1973 #ifdef SHIFTJIS_CP932
1974 #define SCORE_CP932 (SCORE_DEPEND << 1) /* replaced through the CP932 mapping */
1975 #define SCORE_NO_EXIST (SCORE_CP932 << 1) /* character that does not exist */
1977 #define SCORE_NO_EXIST (SCORE_DEPEND << 1) /* character that does not exist */
1979 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
1980 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Initial score assigned by status_reset(). */
1982 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score flags indexed by the low nibble of the lead byte; code_score() uses
 * this table when (c2 & 0x70) == 0x20.
 */
1984 static const char score_table_A0[] = {
1987 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1988 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score flags indexed by the low nibble of the lead byte; code_score() uses
 * this table when (c2 & 0x70) == 0x70.
 */
1991 static const char score_table_F0[] = {
1992 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
1993 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
1994 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
1995 SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * Set the given SCORE_* bit(s) in ptr->score (bitwise OR).
 *
 * ptr:   detector entry whose score is updated.
 * score: flag bits to add.
 */
1998 void set_code_score(struct input_code *ptr, nkf_char score)
2001 ptr->score |= score;
/*
 * Clear the given SCORE_* bit(s) in ptr->score (AND with the complement).
 *
 * ptr:   detector entry whose score is updated.
 * score: flag bits to remove.
 */
2005 void clr_code_score(struct input_code *ptr, nkf_char score)
2008 ptr->score &= ~score;
/*
 * Add penalty bits to ptr->score for the character held in ptr->buf
 * (buf[0] is used as the lead byte c2 and, under UTF8_OUTPUT_ENABLE,
 * buf[1] as c1).
 *
 * Branches, in order:
 *   - an initial check records SCORE_ERROR
 *   - c2 == SSO                          -> SCORE_KANA
 *   - e2w_conv(c2, c1) returns 0         -> SCORE_NO_EXIST
 *     (only when UTF8_OUTPUT_ENABLE is defined)
 *   - (c2 & 0x70) == 0x20                -> score_table_A0[c2 & 0x0f]
 *   - (c2 & 0x70) == 0x70                -> score_table_F0[c2 & 0x0f]
 *   - (c2 & 0x70) >= 0x50                -> SCORE_L2
 */
2012 void code_score(struct input_code *ptr)
2014 nkf_char c2 = ptr->buf[0];
2015 #ifdef UTF8_OUTPUT_ENABLE
2016 nkf_char c1 = ptr->buf[1];
2019 set_code_score(ptr, SCORE_ERROR);
2020 }else if (c2 == SSO){
2021 set_code_score(ptr, SCORE_KANA);
2022 #ifdef UTF8_OUTPUT_ENABLE
2023 }else if (!e2w_conv(c2, c1)){
2024 set_code_score(ptr, SCORE_NO_EXIST);
2026 }else if ((c2 & 0x70) == 0x20){
2027 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2028 }else if ((c2 & 0x70) == 0x70){
2029 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2030 }else if ((c2 & 0x70) >= 0x50){
2031 set_code_score(ptr, SCORE_L2);
/*
 * Take the detector ptr out of consideration.
 * If its converter is the currently selected iconv, the selection is reset
 * with set_iconv(FALSE, 0).
 * NOTE(review): presumably also marks ptr itself as disabled -- confirm.
 */
2035 void status_disable(struct input_code *ptr)
2040 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * Append byte c to the detector's buffer: stores it at ptr->buf[ptr->index]
 * and advances ptr->index.
 */
2043 void status_push_ch(struct input_code *ptr, nkf_char c)
2045 ptr->buf[ptr->index++] = c;
/*
 * NOTE(review): judging by the name and the sibling helpers, this presumably
 * clears the per-character state (stat/index) of detector ptr -- confirm.
 */
2048 void status_clear(struct input_code *ptr)
/*
 * Reset detector ptr for a fresh evaluation; its score is set back to
 * SCORE_INIT.
 */
2054 void status_reset(struct input_code *ptr)
2057 ptr->score = SCORE_INIT;
/*
 * Re-initialise detector ptr; the per-file field ptr->_file_stat is zeroed.
 * module_connection() is the place that walks input_code_list before input
 * is processed (NOTE(review): presumably calling this for each entry --
 * confirm).
 */
2060 void status_reinit(struct input_code *ptr)
2063 ptr->_file_stat = 0;
/*
 * Common check used by the s_status/e_status/w_status detectors.
 * Acts only when c is a 7-bit byte (c <= DEL) and an encoding has already
 * been established (estab_f).
 * NOTE(review): the action taken is presumably a reset of ptr -- confirm.
 */
2066 void status_check(struct input_code *ptr, nkf_char c)
2068 if (c <= DEL && estab_f){
/*
 * Shift_JIS detector: feed one input byte c into the state kept in ptr.
 *
 * First-byte handling visible here:
 *   - 0xa1..0xdf is buffered as SSO followed by c (single-byte kana)
 *   - 0x81..0x9f and 0xe0..0xef are buffered as a lead byte
 *   - under SHIFTJIS_CP932, bytes accepted by is_ibmext_in_sjis() are
 *     buffered as a lead byte
 *   - with x0212_f set, 0xf0..0xfc is buffered as a lead byte
 *   - anything else disables this detector (status_disable)
 * Second-byte handling:
 *   - a trail byte in 0x40..0x7e or 0x80..0xfc is buffered and the pair in
 *     ptr->buf is converted in place with s2e_conv()
 *   - in the SHIFTJIS_CP932 state, a successful s2e_conv() (return 0) also
 *     records SCORE_CP932
 *   - an invalid trail byte disables this detector
 */
2073 void s_status(struct input_code *ptr, nkf_char c)
2077 status_check(ptr, c);
2082 #ifdef NUMCHAR_OPTION
2083 }else if (is_unicode_capsule(c)){
2086 }else if (0xa1 <= c && c <= 0xdf){
2087 status_push_ch(ptr, SSO);
2088 status_push_ch(ptr, c);
2091 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xef)){
2093 status_push_ch(ptr, c);
2094 #ifdef SHIFTJIS_CP932
2096 && is_ibmext_in_sjis(c)){
2098 status_push_ch(ptr, c);
2099 #endif /* SHIFTJIS_CP932 */
2101 }else if (x0212_f && 0xf0 <= c && c <= 0xfc){
2103 status_push_ch(ptr, c);
2104 #endif /* X0212_ENABLE */
2106 status_disable(ptr);
2110 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2111 status_push_ch(ptr, c);
2112 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2116 status_disable(ptr);
2120 #ifdef SHIFTJIS_CP932
2121 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2122 status_push_ch(ptr, c);
2123 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0){
2124 set_code_score(ptr, SCORE_CP932);
2129 #endif /* SHIFTJIS_CP932 */
2130 #ifndef X0212_ENABLE
2131 status_disable(ptr);
/*
 * EUC detector: feed one input byte c into the state kept in ptr.
 *
 * First-byte handling visible here:
 *   - SSO or 0xa1..0xfe is buffered as a lead byte
 *   - 0x8f is buffered as a lead byte (X0212_ENABLE section)
 *   - anything else disables this detector (status_disable)
 * Following bytes must lie in 0xa1..0xfe to be buffered; any other value
 * disables this detector.
 */
2137 void e_status(struct input_code *ptr, nkf_char c)
2141 status_check(ptr, c);
2146 #ifdef NUMCHAR_OPTION
2147 }else if (is_unicode_capsule(c)){
2150 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2152 status_push_ch(ptr, c);
2154 }else if (0x8f == c){
2156 status_push_ch(ptr, c);
2157 #endif /* X0212_ENABLE */
2159 status_disable(ptr);
2163 if (0xa1 <= c && c <= 0xfe){
2164 status_push_ch(ptr, c);
2168 status_disable(ptr);
2173 if (0xa1 <= c && c <= 0xfe){
2175 status_push_ch(ptr, c);
2177 status_disable(ptr);
2179 #endif /* X0212_ENABLE */
2183 #ifdef UTF8_INPUT_ENABLE
/*
 * UTF-8 detector: feed one input byte c into the state kept in ptr.
 *
 * First-byte handling visible here:
 *   - 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 are buffered as lead bytes
 *   - anything else disables this detector (status_disable)
 * Continuation bytes must lie in 0x80..0xbf; any other value disables this
 * detector.  In the first continuation state, once ptr->index exceeds
 * ptr->stat the buffered bytes are checked for the sequence EF BB BF (kept in
 * the local "bom") and converted in place with w2e_conv().
 */
2184 void w_status(struct input_code *ptr, nkf_char c)
2188 status_check(ptr, c);
2193 #ifdef NUMCHAR_OPTION
2194 }else if (is_unicode_capsule(c)){
2197 }else if (0xc0 <= c && c <= 0xdf){
2199 status_push_ch(ptr, c);
2200 }else if (0xe0 <= c && c <= 0xef){
2202 status_push_ch(ptr, c);
2203 }else if (0xf0 <= c && c <= 0xf4){
2205 status_push_ch(ptr, c);
2207 status_disable(ptr);
2212 if (0x80 <= c && c <= 0xbf){
2213 status_push_ch(ptr, c);
2214 if (ptr->index > ptr->stat){
2215 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2216 && ptr->buf[2] == 0xbf);
2217 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2218 &ptr->buf[0], &ptr->buf[1]);
2225 status_disable(ptr);
2229 if (0x80 <= c && c <= 0xbf){
2230 if (ptr->index < ptr->stat){
2231 status_push_ch(ptr, c);
2236 status_disable(ptr);
/*
 * Feed byte c to the detectors in input_code_list by calling their
 * status_func; entries without a status_func are treated specially.
 *
 * If a single candidate ("result") remains and no encoding is established
 * yet (!estab_f), its converter is installed with set_iconv(TRUE, ...).
 * Otherwise, for a 7-bit byte (c <= DEL), input_code_list is walked again.
 * NOTE(review): that second walk presumably resets each detector -- confirm.
 */
2243 void code_status(nkf_char c)
2245 int action_flag = 1;
2246 struct input_code *result = 0;
2247 struct input_code *p = input_code_list;
2249 if (!p->status_func) {
2253 if (!p->status_func)
2255 (p->status_func)(p, c);
2258 }else if(p->stat == 0){
2269 if (result && !estab_f){
2270 set_iconv(TRUE, result->iconv_func);
2271 }else if (c <= DEL){
2272 struct input_code *ptr = input_code_list;
/*
 * Lowest-level input function.  Bytes pushed back with std_ungetc() are
 * returned first, most recently pushed byte first (std_gc_buf is used as a
 * stack indexed by std_gc_ndx).
 * NOTE(review): with nothing pushed back this presumably reads from f --
 * confirm.
 */
2282 nkf_char std_getc(FILE *f)
2285 return std_gc_buf[--std_gc_ndx];
/*
 * Push byte c back so that the next std_getc() returns it.
 * The byte is stored on the std_gc_buf stack; a full stack
 * (std_gc_ndx == STD_GC_BUFSIZE) is checked for before storing.
 */
2291 nkf_char std_ungetc(nkf_char c, FILE *f)
2293 if (std_gc_ndx == STD_GC_BUFSIZE){
2296 std_gc_buf[std_gc_ndx++] = c;
/*
 * Lowest-level output function for a single byte c.
 * NOTE(review): presumably writes c to standard output unless c is EOF --
 * confirm against the body.
 */
2301 void std_putc(nkf_char c)
2308 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Pass-through mode: set up the filter chain with module_connection() and
 * then read f through i_getc until EOF.
 * NOTE(review): each byte is presumably forwarded unchanged to the output
 * function -- confirm.
 */
2309 nkf_char noconvert(FILE *f)
2314 module_connection();
2315 while ((c = (*i_getc)(f)) != EOF)
/*
 * Build the processing pipeline from the current option flags.
 *
 * Output side: starts from output_conv and wraps oconv with each enabled
 * filter in turn (base64, cr, rot, iso2022jp check, hira, fold, z); every
 * wrapper saves the previous oconv in its own o_* pointer before replacing
 * it.
 * Input side: i_ungetc starts as std_ungetc, and the enabled input filters
 * (cap, url, numchar, nfc, mime, broken) are stacked over i_getc/i_ungetc
 * the same way, each saving the previous pair.
 * Finally the converter is chosen from input_f: a fixed input encoding is
 * forced with set_iconv(-TRUE, ...); otherwise e_iconv is installed with
 * set_iconv(FALSE, ...), i.e. not established.
 */
2322 void module_connection(void)
2324 oconv = output_conv;
2327 /* replace continuation module, from output side */
2329 /* output redirection */
2331 if (noout_f || guess_f){
2338 if (mimeout_f == TRUE) {
2339 o_base64conv = oconv; oconv = base64_conv;
2341 /* base64_count = 0; */
2345 o_crconv = oconv; oconv = cr_conv;
2348 o_rot_conv = oconv; oconv = rot_conv;
2351 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2354 o_hira_conv = oconv; oconv = hira_conv;
2357 o_fconv = oconv; oconv = fold_conv;
2360 if (alpha_f || x0201_f) {
2361 o_zconv = oconv; oconv = z_conv;
2365 i_ungetc = std_ungetc;
2366 /* input redirection */
2369 i_cgetc = i_getc; i_getc = cap_getc;
2370 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2373 i_ugetc = i_getc; i_getc = url_getc;
2374 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2377 #ifdef NUMCHAR_OPTION
2379 i_ngetc = i_getc; i_getc = numchar_getc;
2380 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2383 #ifdef UNICODE_NORMALIZATION
2384 if (nfc_f && input_f == UTF8_INPUT){
2385 i_nfc_getc = i_getc; i_getc = nfc_getc;
2386 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2389 if (mime_f && mimebuf_f==FIXED_MIME) {
2390 i_mgetc = i_getc; i_getc = mime_getc;
2391 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2394 i_bgetc = i_getc; i_getc = broken_getc;
2395 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2397 if (input_f == JIS_INPUT || input_f == EUC_INPUT || input_f == LATIN1_INPUT) {
2398 set_iconv(-TRUE, e_iconv);
2399 } else if (input_f == SJIS_INPUT) {
2400 set_iconv(-TRUE, s_iconv);
2401 #ifdef UTF8_INPUT_ENABLE
2402 } else if (input_f == UTF8_INPUT) {
2403 set_iconv(-TRUE, w_iconv);
2404 } else if (input_f == UTF16_INPUT) {
2405 set_iconv(-TRUE, w_iconv16);
2406 } else if (input_f == UTF32_INPUT) {
2407 set_iconv(-TRUE, w_iconv32);
2410 set_iconv(FALSE, e_iconv);
2414 struct input_code *p = input_code_list;
2422 * Check and Ignore BOM
/*
 * Look at the first bytes of f for a byte-order mark and select the matching
 * Unicode converter and byte order.
 *
 * Byte sequences recognised (read through i_getc):
 *   00 00 FE FF  -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, ENDIAN_2143
 *   EF BB BF     -> w_iconv
 *   FE FF 00 00  -> w_iconv32, ENDIAN_3412
 *   FE FF        -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, ENDIAN_LITTLE
 * input_endian is only set when the corresponding converter is the active
 * iconv.  Bytes that were read but not consumed as a BOM are handed back
 * with i_ungetc in reverse order of reading, so the stream is unchanged.
 *
 * NOTE(review): on the 00 00 FF FE path the fourth byte read is 0xFE, but
 * the push-back that follows uses 0xFF -- looks like a typo; confirm.
 */
2424 void check_bom(FILE *f)
2427 switch(c2 = (*i_getc)(f)){
2429 if((c2 = (*i_getc)(f)) == 0x00){
2430 if((c2 = (*i_getc)(f)) == 0xFE){
2431 if((c2 = (*i_getc)(f)) == 0xFF){
2433 set_iconv(TRUE, w_iconv32);
2435 if (iconv == w_iconv32) {
2436 input_endian = ENDIAN_BIG;
2439 (*i_ungetc)(0xFF,f);
2440 }else (*i_ungetc)(c2,f);
2441 (*i_ungetc)(0xFE,f);
2442 }else if(c2 == 0xFF){
2443 if((c2 = (*i_getc)(f)) == 0xFE){
2445 set_iconv(TRUE, w_iconv32);
2447 if (iconv == w_iconv32) {
2448 input_endian = ENDIAN_2143;
2451 (*i_ungetc)(0xFF,f);
2452 }else (*i_ungetc)(c2,f);
2453 (*i_ungetc)(0xFF,f);
2454 }else (*i_ungetc)(c2,f);
2455 (*i_ungetc)(0x00,f);
2456 }else (*i_ungetc)(c2,f);
2457 (*i_ungetc)(0x00,f);
2460 if((c2 = (*i_getc)(f)) == 0xBB){
2461 if((c2 = (*i_getc)(f)) == 0xBF){
2463 set_iconv(TRUE, w_iconv);
2465 if (iconv == w_iconv) {
2468 (*i_ungetc)(0xBF,f);
2469 }else (*i_ungetc)(c2,f);
2470 (*i_ungetc)(0xBB,f);
2471 }else (*i_ungetc)(c2,f);
2472 (*i_ungetc)(0xEF,f);
2475 if((c2 = (*i_getc)(f)) == 0xFF){
2476 if((c2 = (*i_getc)(f)) == 0x00){
2477 if((c2 = (*i_getc)(f)) == 0x00){
2479 set_iconv(TRUE, w_iconv32);
2481 if (iconv == w_iconv32) {
2482 input_endian = ENDIAN_3412;
2485 (*i_ungetc)(0x00,f);
2486 }else (*i_ungetc)(c2,f);
2487 (*i_ungetc)(0x00,f);
2488 }else (*i_ungetc)(c2,f);
2490 set_iconv(TRUE, w_iconv16);
2492 if (iconv == w_iconv16) {
2493 input_endian = ENDIAN_BIG;
2496 (*i_ungetc)(0xFF,f);
2497 }else (*i_ungetc)(c2,f);
2498 (*i_ungetc)(0xFE,f);
2501 if((c2 = (*i_getc)(f)) == 0xFE){
2502 if((c2 = (*i_getc)(f)) == 0x00){
2503 if((c2 = (*i_getc)(f)) == 0x00){
2505 set_iconv(TRUE, w_iconv32);
2507 if (iconv == w_iconv32) {
2508 input_endian = ENDIAN_LITTLE;
2511 (*i_ungetc)(0x00,f);
2512 }else (*i_ungetc)(c2,f);
2513 (*i_ungetc)(0x00,f);
2514 }else (*i_ungetc)(c2,f);
2516 set_iconv(TRUE, w_iconv16);
2518 if (iconv == w_iconv16) {
2519 input_endian = ENDIAN_LITTLE;
2522 (*i_ungetc)(0xFE,f);
2523 }else (*i_ungetc)(c2,f);
2524 (*i_ungetc)(0xFF,f);
2533 Conversion main loop. Code detection only.
/*
 * Main conversion loop for one input stream f.
 *
 * After module_connection() has built the filter chain, bytes are read
 * through i_getc until EOF.  c2 holds a pending first byte and c1 the
 * current byte.  The loop
 *   - hands 8-bit data to h_conv() while the encoding is not established,
 *   - assembles UTF-16 / UTF-32 code units according to input_endian when
 *     the active converter is w_iconv16 / w_iconv32,
 *   - tracks SI/SO and ESC sequences to maintain input_mode and shift_mode
 *     (these are honoured only while no 8-bit byte has been seen or while
 *     inside MIME decoding),
 *   - starts MIME decoding on "=?" when mime_f is set,
 *   - and passes completed characters to (*iconv)() or (*oconv)().
 * The NEXT / SEND / LAST macros name the three ways an iteration ends.
 * After the loop (*iconv)(EOF, 0, 0) is called; if no input code name was
 * set, the input_code_list entry with the lowest score supplies it.
 */
2536 nkf_char kanji_convert(FILE *f)
2538 nkf_char c3, c2=0, c1, c0=0;
2539 int is_8bit = FALSE;
2541 if(input_f == SJIS_INPUT || input_f == EUC_INPUT
2542 #ifdef UTF8_INPUT_ENABLE
2543 || input_f == UTF8_INPUT || input_f == UTF16_INPUT
2550 output_mode = ASCII;
2553 #define NEXT continue /* no output, get next */
2554 #define SEND ; /* output c1 and c2, get next */
2555 #define LAST break /* end of loop, go closing */
2557 module_connection();
2560 while ((c1 = (*i_getc)(f)) != EOF) {
2561 #ifdef INPUT_CODE_FIX
2567 if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2568 /* in case of 8th bit is on */
2569 if (!estab_f&&!mime_decode_mode) {
2570 /* in case of not established yet */
2571 /* It is still ambiguous */
2572 if (h_conv(f, c2, c1)==EOF)
2578 /* in case of already established */
2580 /* ignore bogus code and not CP5022x UCD */
2588 /* second byte, 7 bit code */
2589 /* it might be kanji shifted */
2590 if ((c1 == DEL) || (c1 <= SPACE)) {
2591 /* ignore bogus first code */
2598 #ifdef UTF8_INPUT_ENABLE
2599 if (iconv == w_iconv16) {
2600 if (input_endian == ENDIAN_BIG) {
2602 if ((c1 = (*i_getc)(f)) != EOF) {
2603 if (0xD8 <= c2 && c2 <= 0xDB) {
2604 if ((c0 = (*i_getc)(f)) != EOF) {
2606 if ((c3 = (*i_getc)(f)) != EOF) {
2613 if ((c2 = (*i_getc)(f)) != EOF) {
2614 if (0xD8 <= c2 && c2 <= 0xDB) {
2615 if ((c3 = (*i_getc)(f)) != EOF) {
2616 if ((c0 = (*i_getc)(f)) != EOF) {
2625 } else if(iconv == w_iconv32){
2627 if((c2 = (*i_getc)(f)) != EOF &&
2628 (c1 = (*i_getc)(f)) != EOF &&
2629 (c0 = (*i_getc)(f)) != EOF){
2630 switch(input_endian){
2632 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2635 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2638 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2641 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2651 #ifdef NUMCHAR_OPTION
2652 if (is_unicode_capsule(c1)){
2656 if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
2658 if (!estab_f && !iso8859_f) {
2659 /* not established yet */
2662 } else { /* estab_f==TRUE */
2667 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2668 /* SJIS X0201 Case... */
2669 if(iso2022jp_f && x0201_f==NO_X0201) {
2670 (*oconv)(GETA1, GETA2);
2677 } else if (c1==SSO && iconv != s_iconv) {
2678 /* EUC X0201 Case */
2679 c1 = (*i_getc)(f); /* skip SSO */
2681 if (SSP<=c1 && c1<0xe0) {
2682 if(iso2022jp_f && x0201_f==NO_X0201) {
2683 (*oconv)(GETA1, GETA2);
2690 } else { /* bogus code, skip SSO and one byte */
2693 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2694 (c1 == 0xFD || c1 == 0xFE)) {
2700 /* already established */
2705 } else if ((c1 > SPACE) && (c1 != DEL)) {
2706 /* in case of Roman characters */
2708 /* output 1 shifted byte */
2712 } else if (SPACE<=c1 && c1<(0xe0&0x7f) ){
2713 /* output 1 shifted byte */
2714 if(iso2022jp_f && x0201_f==NO_X0201) {
2715 (*oconv)(GETA1, GETA2);
2722 /* look like bogus code */
2725 } else if (input_mode == X0208 || input_mode == X0212 ||
2726 input_mode == X0213_1 || input_mode == X0213_2) {
2727 /* in case of Kanji shifted */
2730 } else if (c1 == '=' && mime_f && !mime_decode_mode ) {
2731 /* Check MIME code */
2732 if ((c1 = (*i_getc)(f)) == EOF) {
2735 } else if (c1 == '?') {
2736 /* =? is mime conversion start sequence */
2737 if(mime_f == STRICT_MIME) {
2738 /* check in real detail */
2739 if (mime_begin_strict(f) == EOF)
2743 } else if (mime_begin(f) == EOF)
2753 /* normal ASCII code */
2756 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
\r
2759 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
\r
2762 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
\r
2763 if ((c1 = (*i_getc)(f)) == EOF) {
2764 /* (*oconv)(0, ESC); don't send bogus code */
2766 } else if (c1 == '$') {
2767 if ((c1 = (*i_getc)(f)) == EOF) {
2769 (*oconv)(0, ESC); don't send bogus code
2770 (*oconv)(0, '$'); */
2772 } else if (c1 == '@'|| c1 == 'B') {
2773 /* This is kanji introduction */
2776 set_input_codename("ISO-2022-JP");
2778 debug(input_codename);
2781 } else if (c1 == '(') {
2782 if ((c1 = (*i_getc)(f)) == EOF) {
2783 /* don't send bogus code
2789 } else if (c1 == '@'|| c1 == 'B') {
2790 /* This is kanji introduction */
2795 } else if (c1 == 'D'){
2799 #endif /* X0212_ENABLE */
2800 } else if (c1 == (X0213_1&0x7F)){
2801 input_mode = X0213_1;
2804 } else if (c1 == (X0213_2&0x7F)){
2805 input_mode = X0213_2;
2809 /* could be some special code */
2816 } else if (broken_f&0x2) {
2817 /* accept any ESC-(-x as broken code ... */
2827 } else if (c1 == '(') {
2828 if ((c1 = (*i_getc)(f)) == EOF) {
2829 /* don't send bogus code
2831 (*oconv)(0, '('); */
2835 /* This is X0201 kana introduction */
2836 input_mode = X0201; shift_mode = X0201;
2838 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2839 /* This is X0208 kanji introduction */
2840 input_mode = ASCII; shift_mode = FALSE;
2842 } else if (broken_f&0x2) {
2843 input_mode = ASCII; shift_mode = FALSE;
2848 /* maintain various input_mode here */
2852 } else if ( c1 == 'N' || c1 == 'n' ){
2854 c3 = (*i_getc)(f); /* skip SS2 */
2855 if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2870 } else if (c1 == ESC && iconv == s_iconv) {
2871 /* ESC in Shift_JIS */
2872 if ((c1 = (*i_getc)(f)) == EOF) {
2873 /* (*oconv)(0, ESC); don't send bogus code */
2875 } else if (c1 == '$') {
2877 if ((c1 = (*i_getc)(f)) == EOF) {
2879 (*oconv)(0, ESC); don't send bogus code
2880 (*oconv)(0, '$'); */
2883 if (('E' <= c1 && c1 <= 'G') ||
2884 ('O' <= c1 && c1 <= 'Q')) {
2892 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2893 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SPACE + 0xE000 + CLASS_UNICODE;
2894 while ((c1 = (*i_getc)(f)) != EOF) {
2895 if (SPACE <= c1 && c1 <= 'z') {
2896 (*oconv)(0, c1 + c0);
2897 } else break; /* c1 == SO */
2901 if (c1 == EOF) LAST;
2908 } else if (c1 == NL || c1 == CR) {
2910 input_mode = ASCII; set_iconv(FALSE, 0);
2912 } else if (mime_decode_f && !mime_decode_mode){
2914 if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2922 } else { /* if (c1 == CR)*/
2923 if ((c1=(*i_getc)(f))!=EOF) {
2927 } else if (c1 == NL && (c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
2942 if (prev_cr && c1 == NL) crmode_f = CRLF;
2945 } else if (c1 == DEL && input_mode == X0208 ) {
2955 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
2958 if ((c0 = (*i_getc)(f)) != EOF) {
2961 if ((c3 = (*i_getc)(f)) != EOF) {
2963 (*iconv)(c2, c1, c0|c3);
2968 /* 3 bytes EUC or UTF-8 */
2969 if ((c0 = (*i_getc)(f)) != EOF) {
2971 (*iconv)(c2, c1, c0);
2979 0x7F <= c2 && c2 <= 0x92 &&
2980 0x21 <= c1 && c1 <= 0x7E) {
2982 if(c1 == 0x7F) return 0;
2983 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
2986 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
2990 (*oconv)(PREFIX_EUCG3 | c2, c1);
2992 #endif /* X0212_ENABLE */
2994 (*oconv)(PREFIX_EUCG3 | c2, c1);
2997 (*oconv)(input_mode, c1); /* other special case */
3003 /* goto next_word */
3007 (*iconv)(EOF, 0, 0);
3008 if (!is_inputcode_set)
3011 struct input_code *p = input_code_list;
3012 struct input_code *result = p;
3014 if (p->score < result->score) result = p;
3017 set_input_codename(result->name);
/*
 * Hold-and-decide conversion, called from kanji_convert() when 8-bit input
 * arrives before the encoding is established.
 *
 * f:      input stream (read through i_getc).
 * c2, c1: the two bytes that triggered the call.
 *
 * Phase 1: bytes are read and stored with push_hold_buf() until EOF, until
 * the encoding becomes established (estab_f), or until the hold buffer is
 * full.  If the encoding is still undecided, the input_code_list entry that
 * has a status_func and the lowest score is installed with
 * set_iconv(TRUE, ...).
 * Phase 2: the held bytes are replayed from hold_buf through (*iconv)();
 * when a character needs more bytes than are held, the remainder is read
 * directly from f.  Under s_iconv, bytes 0xa1..0xdf are passed on as X0201
 * single-byte characters.
 */
3024 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3026 nkf_char ret, c3, c0;
3030 /** it must NOT be in the kanji shifted sequence */
3031 /** it must NOT be written in JIS7 */
3032 /** and it must be after 2 byte 8bit code */
3038 while ((c1 = (*i_getc)(f)) != EOF) {
3044 if (push_hold_buf(c1) == EOF || estab_f){
3050 struct input_code *p = input_code_list;
3051 struct input_code *result = p;
3056 if (p->status_func && p->score < result->score){
3061 set_iconv(TRUE, result->iconv_func);
3066 ** 1) EOF is detected, or
3067 ** 2) Code is established, or
3068 ** 3) Buffer is FULL (but last word is pushed)
3070 ** in 1) and 3) cases, we continue to use
3071 ** Kanji codes by oconv and leave estab_f unchanged.
3076 while (hold_index < hold_count){
3077 c2 = hold_buf[hold_index++];
3079 #ifdef NUMCHAR_OPTION
3080 || is_unicode_capsule(c2)
3085 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3086 (*iconv)(X0201, c2, 0);
3089 if (hold_index < hold_count){
3090 c1 = hold_buf[hold_index++];
3100 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3103 if (hold_index < hold_count){
3104 c0 = hold_buf[hold_index++];
3105 } else if ((c0 = (*i_getc)(f)) == EOF) {
3111 if (hold_index < hold_count){
3112 c3 = hold_buf[hold_index++];
3113 } else if ((c3 = (*i_getc)(f)) == EOF) {
3118 (*iconv)(c2, c1, c0|c3);
3123 /* 3 bytes EUC or UTF-8 */
3124 if (hold_index < hold_count){
3125 c0 = hold_buf[hold_index++];
3126 } else if ((c0 = (*i_getc)(f)) == EOF) {
3132 (*iconv)(c2, c1, c0);
3135 if (c0 == EOF) break;
/*
 * Append one byte to the hold buffer used by h_conv().
 *
 * c2: byte to store (truncated to unsigned char).
 * Returns the new hold_count, or EOF once the buffer has reached its
 * capacity of HOLD_SIZE*2 bytes; the byte that fills the last slot is still
 * stored.  A buffer that is already full is checked for before storing.
 */
3140 nkf_char push_hold_buf(nkf_char c2)
3142 if (hold_count >= HOLD_SIZE*2)
3144 hold_buf[hold_count++] = (unsigned char)c2;
3145 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Convert a Shift_JIS byte pair into the internal EUC-style pair.
 *
 * c2, c1: Shift_JIS lead and trail byte.
 * p2, p1: receive the converted pair (callers pass the addresses of the
 *         values to overwrite).
 * Callers treat a return value of 0 as success and anything else as an
 * error code.
 *
 * Special cases visible here:
 *   - SHIFTJIS_CP932: IBM extension lead bytes are remapped through the
 *     shiftjis_cp932 table unless cp932inv_f is set; the cp932inv table
 *     covers lead bytes CP932INV_TABLE_BEGIN..CP932INV_TABLE_END.
 *   - without x0213_f, IBM extension lead bytes are looked up in
 *     shiftjis_x0212 and tagged with PREFIX_EUCG3.
 *   - with x0213_f and c2 >= 0xF0, the pair is mapped to a PREFIX_EUCG3 row
 *     using shift_jisx0213_s1a3_table or a linear formula.
 * Otherwise the standard arithmetic applies: the row is derived from c2
 * (offset SJ0162 or SJ6394, plus one when c1 > 0x9E) and the cell from c1.
 */
3148 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3150 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3153 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3154 #ifdef SHIFTJIS_CP932
3155 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3156 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3163 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3164 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3170 #endif /* SHIFTJIS_CP932 */
3172 if (!x0213_f && is_ibmext_in_sjis(c2)){
3173 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3176 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3189 if(x0213_f && c2 >= 0xF0){
3190 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3191 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3192 }else{ /* 78<=k<=94 */
3193 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3194 if (0x9E < c1) c2++;
3197 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3198 if (0x9E < c1) c2++;
3201 c1 = c1 - ((c1 > DEL) ? SPACE : 0x1F);
3208 c2 = x0212_unshift(c2);
3215 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3219 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3221 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3223 if(c1 == 0x7F) return 0;
3224 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3227 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3228 if (ret) return ret;
3234 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3239 }else if (c2 == 0x8f){
3243 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3244 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3245 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3248 c2 = (c2 << 8) | (c1 & 0x7f);
3250 #ifdef SHIFTJIS_CP932
3253 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3254 s2e_conv(s2, s1, &c2, &c1);
3261 #endif /* SHIFTJIS_CP932 */
3263 #endif /* X0212_ENABLE */
3264 } else if (c2 == SSO){
3267 } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
3270 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3271 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3272 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3277 #ifdef SHIFTJIS_CP932
3278 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3280 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3281 s2e_conv(s2, s1, &c2, &c1);
3288 #endif /* SHIFTJIS_CP932 */
3295 #ifdef UTF8_INPUT_ENABLE
3296 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3303 }else if (0xc0 <= c2 && c2 <= 0xef) {
3304 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3305 #ifdef NUMCHAR_OPTION
3308 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3316 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3319 static const char w_iconv_utf8_1st_byte[] =
3321 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3322 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3323 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3324 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3326 if (c2 < 0 || 0xff < c2) {
3327 }else if (c2 == 0) { /* 0 : 1 byte*/
3329 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3332 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3334 if (c1 < 0x80 || 0xBF < c1) return 0;
3337 if (c0 == 0) return -1;
3338 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3343 if (c0 == 0) return -1;
3344 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3348 if (c0 == 0) return -1;
3349 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3353 if (c0 == 0) return -2;
3354 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3358 if (c0 == 0) return -2;
3359 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3363 if (c0 == 0) return -2;
3364 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3372 if (c2 == 0 || c2 == EOF){
3373 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3374 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3377 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3386 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
3387 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3394 }else if (val < 0x800){
3395 *p2 = 0xc0 | (val >> 6);
3396 *p1 = 0x80 | (val & 0x3f);
3398 } else if (val <= NKF_INT32_C(0xFFFF)) {
3399 *p2 = 0xe0 | (val >> 12);
3400 *p1 = 0x80 | ((val >> 6) & 0x3f);
3401 *p0 = 0x80 | (val & 0x3f);
3402 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3403 *p2 = 0xe0 | (val >> 16);
3404 *p1 = 0x80 | ((val >> 12) & 0x3f);
3405 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3414 #ifdef UTF8_INPUT_ENABLE
3415 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3420 } else if (c2 >= 0xf0){
3421 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3422 val = (c2 & 0x0f) << 18;
3423 val |= (c1 & 0x3f) << 12;
3424 val |= (c0 & 0x3f00) >> 2;
3426 }else if (c2 >= 0xe0){
3427 val = (c2 & 0x0f) << 12;
3428 val |= (c1 & 0x3f) << 6;
3430 }else if (c2 >= 0xc0){
3431 val = (c2 & 0x1f) << 6;
3439 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3441 nkf_char c2, c1, c0;
3448 w16w_conv(val, &c2, &c1, &c0);
3449 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3450 #ifdef NUMCHAR_OPTION
3453 *p1 = CLASS_UNICODE | val;
3462 #ifdef UTF8_INPUT_ENABLE
3463 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3466 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3469 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3470 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
3472 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3474 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3479 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3480 if (ret) return ret;
3485 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3489 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3490 } else if (is_unicode_bmp(c1)) {
3491 ret = w16e_conv(c1, &c2, &c1);
3494 c1 = CLASS_UNICODE | c1;
3496 if (ret) return ret;
3501 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3503 const unsigned short *const *pp;
3504 const unsigned short *const *const *ppp;
3505 static const char no_best_fit_chars_table_C2[] =
3506 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3507 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3508 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3509 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3510 static const char no_best_fit_chars_table_C2_ms[] =
3511 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3512 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3513 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3514 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3515 static const char no_best_fit_chars_table_932_C2[] =
3516 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3517 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3518 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3519 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3520 static const char no_best_fit_chars_table_932_C3[] =
3521 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3522 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3523 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3524 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
3530 }else if(c2 < 0xe0){
3531 if(no_best_fit_chars_f){
3532 if(ms_ucs_map_f == UCS_MAP_CP932){
3535 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3538 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3541 }else if(!cp932inv_f){
3544 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3547 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3550 }else if(ms_ucs_map_f == UCS_MAP_MS){
3551 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3552 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3570 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3571 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3572 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3574 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
3575 }else if(c0 < 0xF0){
3576 if(no_best_fit_chars_f){
3577 if(ms_ucs_map_f == UCS_MAP_CP932){
3578 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3579 }else if(ms_ucs_map_f == UCS_MAP_MS){
3584 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3587 if(c0 == 0x92) return 1;
3592 if(c1 == 0x80 || c0 == 0x9C) return 1;
3595 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3600 if(c0 == 0x94) return 1;
3603 if(c0 == 0xBB) return 1;
3613 if(c0 == 0x95) return 1;
3616 if(c0 == 0xA5) return 1;
3623 if(c0 == 0x8D) return 1;
3626 if(c0 == 0x9E && !cp932inv_f) return 1;
3629 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3637 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3638 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3639 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
3641 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3643 #ifdef SHIFTJIS_CP932
3644 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3646 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3647 s2e_conv(s2, s1, p2, p1);
3656 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3659 const unsigned short *p;
3662 if (pp == 0) return 1;
3665 if (c1 < 0 || psize <= c1) return 1;
3667 if (p == 0) return 1;
3670 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3672 if (val == 0) return 1;
3673 if (no_cp932ext_f && (
3674 (val>>8) == 0x2D || /* NEC special characters */
3675 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3683 if (c2 == SO) c2 = X0201;
3690 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3697 (*f)(0, bin2hex(c>>shift));
3707 void encode_fallback_html(nkf_char c)
3712 if(c >= NKF_INT32_C(1000000))
3713 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3714 if(c >= NKF_INT32_C(100000))
3715 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3717 (*oconv)(0, 0x30+(c/10000 )%10);
3719 (*oconv)(0, 0x30+(c/1000 )%10);
3721 (*oconv)(0, 0x30+(c/100 )%10);
3723 (*oconv)(0, 0x30+(c/10 )%10);
3725 (*oconv)(0, 0x30+ c %10);
3730 void encode_fallback_xml(nkf_char c)
3735 nkf_each_char_to_hex(oconv, c);
3740 void encode_fallback_java(nkf_char c)
3744 if(!is_unicode_bmp(c)){
3748 (*oconv)(0, bin2hex(c>>20));
3749 (*oconv)(0, bin2hex(c>>16));
3753 (*oconv)(0, bin2hex(c>>12));
3754 (*oconv)(0, bin2hex(c>> 8));
3755 (*oconv)(0, bin2hex(c>> 4));
3756 (*oconv)(0, bin2hex(c ));
3760 void encode_fallback_perl(nkf_char c)
3765 nkf_each_char_to_hex(oconv, c);
3770 void encode_fallback_subchar(nkf_char c)
3772 c = unicode_subchar;
3773 (*oconv)((c>>8)&0xFF, c&0xFF);
3778 #ifdef UTF8_OUTPUT_ENABLE
3779 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3781 const unsigned short *p;
3784 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3792 p = euc_to_utf8_1byte;
3794 } else if (is_eucg3(c2)){
3795 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3798 c2 = (c2&0x7f) - 0x21;
3799 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3800 p = x0212_to_utf8_2bytes[c2];
3806 c2 = (c2&0x7f) - 0x21;
3807 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3809 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3810 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3811 euc_to_utf8_2bytes_ms[c2];
3816 c1 = (c1 & 0x7f) - 0x21;
3817 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
3822 void w_oconv(nkf_char c2, nkf_char c1)
3828 output_bom_f = FALSE;
3839 #ifdef NUMCHAR_OPTION
3840 if (c2 == 0 && is_unicode_capsule(c1)){
3841 val = c1 & VALUE_MASK;
3844 }else if (val < 0x800){
3845 (*o_putc)(0xC0 | (val >> 6));
3846 (*o_putc)(0x80 | (val & 0x3f));
3847 } else if (val <= NKF_INT32_C(0xFFFF)) {
3848 (*o_putc)(0xE0 | (val >> 12));
3849 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3850 (*o_putc)(0x80 | (val & 0x3f));
3851 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3852 (*o_putc)(0xF0 | ( val>>18));
3853 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3854 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3855 (*o_putc)(0x80 | ( val & 0x3f));
3862 output_mode = ASCII;
3864 } else if (c2 == ISO8859_1) {
3865 output_mode = ISO8859_1;
3866 (*o_putc)(c1 | 0x080);
3869 val = e2w_conv(c2, c1);
3871 w16w_conv(val, &c2, &c1, &c0);
3875 if (c0) (*o_putc)(c0);
3881 void w_oconv16(nkf_char c2, nkf_char c1)
3884 output_bom_f = FALSE;
3885 if (output_endian == ENDIAN_LITTLE){
3886 (*o_putc)((unsigned char)'\377');
3890 (*o_putc)((unsigned char)'\377');
3899 if (c2 == ISO8859_1) {
3902 #ifdef NUMCHAR_OPTION
3903 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3904 if (is_unicode_bmp(c1)) {
3905 c2 = (c1 >> 8) & 0xff;
3909 if (c1 <= UNICODE_MAX) {
3910 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3911 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3912 if (output_endian == ENDIAN_LITTLE){
3913 (*o_putc)(c2 & 0xff);
3914 (*o_putc)((c2 >> 8) & 0xff);
3915 (*o_putc)(c1 & 0xff);
3916 (*o_putc)((c1 >> 8) & 0xff);
3918 (*o_putc)((c2 >> 8) & 0xff);
3919 (*o_putc)(c2 & 0xff);
3920 (*o_putc)((c1 >> 8) & 0xff);
3921 (*o_putc)(c1 & 0xff);
3928 nkf_char val = e2w_conv(c2, c1);
3929 c2 = (val >> 8) & 0xff;
3933 if (output_endian == ENDIAN_LITTLE){
3942 void w_oconv32(nkf_char c2, nkf_char c1)
3945 output_bom_f = FALSE;
3946 if (output_endian == ENDIAN_LITTLE){
3947 (*o_putc)((unsigned char)'\377');
3955 (*o_putc)((unsigned char)'\377');
3964 if (c2 == ISO8859_1) {
3966 #ifdef NUMCHAR_OPTION
3967 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3971 c1 = e2w_conv(c2, c1);
3974 if (output_endian == ENDIAN_LITTLE){
3975 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3976 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3977 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3981 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
3982 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
3983 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
3988 void e_oconv(nkf_char c2, nkf_char c1)
3990 #ifdef NUMCHAR_OPTION
3991 if (c2 == 0 && is_unicode_capsule(c1)){
3992 w16e_conv(c1, &c2, &c1);
3993 if (c2 == 0 && is_unicode_capsule(c1)){
3994 c2 = c1 & VALUE_MASK;
3995 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
3999 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4000 c1 = 0x21 + c1 % 94;
4003 (*o_putc)((c2 & 0x7f) | 0x080);
4004 (*o_putc)(c1 | 0x080);
4006 (*o_putc)((c2 & 0x7f) | 0x080);
4007 (*o_putc)(c1 | 0x080);
4011 if (encode_fallback) (*encode_fallback)(c1);
4020 } else if (c2 == 0) {
4021 output_mode = ASCII;
4023 } else if (c2 == X0201) {
4024 output_mode = JAPANESE_EUC;
4025 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4026 } else if (c2 == ISO8859_1) {
4027 output_mode = ISO8859_1;
4028 (*o_putc)(c1 | 0x080);
4030 } else if (is_eucg3(c2)){
4031 output_mode = JAPANESE_EUC;
4032 #ifdef SHIFTJIS_CP932
4035 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4036 s2e_conv(s2, s1, &c2, &c1);
4041 output_mode = ASCII;
4043 }else if (is_eucg3(c2)){
4046 (*o_putc)((c2 & 0x7f) | 0x080);
4047 (*o_putc)(c1 | 0x080);
4050 (*o_putc)((c2 & 0x7f) | 0x080);
4051 (*o_putc)(c1 | 0x080);
4055 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4056 set_iconv(FALSE, 0);
4057 return; /* too late to rescue this char */
4059 output_mode = JAPANESE_EUC;
4060 (*o_putc)(c2 | 0x080);
4061 (*o_putc)(c1 | 0x080);
4066 nkf_char x0212_shift(nkf_char c)
4071 if (0x75 <= c && c <= 0x7f){
4072 ret = c + (0x109 - 0x75);
4075 if (0x75 <= c && c <= 0x7f){
4076 ret = c + (0x113 - 0x75);
4083 nkf_char x0212_unshift(nkf_char c)
4086 if (0x7f <= c && c <= 0x88){
4087 ret = c + (0x75 - 0x7f);
4088 }else if (0x89 <= c && c <= 0x92){
4089 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4093 #endif /* X0212_ENABLE */
4095 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4101 if((0x21 <= ndx && ndx <= 0x2F)){
4102 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4103 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4105 }else if(0x6E <= ndx && ndx <= 0x7E){
4106 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4107 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4113 else if(nkf_isgraph(ndx)){
4115 const unsigned short *ptr;
4116 ptr = x0212_shiftjis[ndx - 0x21];
4118 val = ptr[(c1 & 0x7f) - 0x21];
4127 c2 = x0212_shift(c2);
4129 #endif /* X0212_ENABLE */
4131 if(0x7F < c2) return 1;
4132 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
4133 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4137 void s_oconv(nkf_char c2, nkf_char c1)
4139 #ifdef NUMCHAR_OPTION
4140 if (c2 == 0 && is_unicode_capsule(c1)){
4141 w16e_conv(c1, &c2, &c1);
4142 if (c2 == 0 && is_unicode_capsule(c1)){
4143 c2 = c1 & VALUE_MASK;
4144 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4147 c2 = c1 / 188 + 0xF0;
4149 c1 += 0x40 + (c1 > 0x3e);
4154 if(encode_fallback)(*encode_fallback)(c1);
4163 } else if (c2 == 0) {
4164 output_mode = ASCII;
4166 } else if (c2 == X0201) {
4167 output_mode = SHIFT_JIS;
4169 } else if (c2 == ISO8859_1) {
4170 output_mode = ISO8859_1;
4171 (*o_putc)(c1 | 0x080);
4173 } else if (is_eucg3(c2)){
4174 output_mode = SHIFT_JIS;
4175 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4181 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4182 set_iconv(FALSE, 0);
4183 return; /* too late to rescue this char */
4185 output_mode = SHIFT_JIS;
4186 e2s_conv(c2, c1, &c2, &c1);
4188 #ifdef SHIFTJIS_CP932
4190 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4191 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4197 #endif /* SHIFTJIS_CP932 */
4200 if (prefix_table[(unsigned char)c1]){
4201 (*o_putc)(prefix_table[(unsigned char)c1]);
4207 void j_oconv(nkf_char c2, nkf_char c1)
4209 #ifdef NUMCHAR_OPTION
4210 if (c2 == 0 && is_unicode_capsule(c1)){
4211 w16e_conv(c1, &c2, &c1);
4212 if (c2 == 0 && is_unicode_capsule(c1)){
4213 c2 = c1 & VALUE_MASK;
4214 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4217 c2 = 0x7F + c1 / 94;
4218 c1 = 0x21 + c1 % 94;
4220 if (encode_fallback) (*encode_fallback)(c1);
4227 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4230 (*o_putc)(ascii_intro);
4231 output_mode = ASCII;
4235 } else if (is_eucg3(c2)){
4237 if(output_mode!=X0213_2){
4238 output_mode = X0213_2;
4242 (*o_putc)(X0213_2&0x7F);
4245 if(output_mode!=X0212){
4246 output_mode = X0212;
4250 (*o_putc)(X0212&0x7F);
4253 (*o_putc)(c2 & 0x7f);
4256 } else if (c2==X0201) {
4257 if (output_mode!=X0201) {
4258 output_mode = X0201;
4264 } else if (c2==ISO8859_1) {
4265 /* iso8859 introduction, or 8th bit on */
4266 /* Can we convert in 7bit form using ESC-'-'-A ?
4268 output_mode = ISO8859_1;
4270 } else if (c2 == 0) {
4271 if (output_mode !=ASCII && output_mode!=ISO8859_1) {
4274 (*o_putc)(ascii_intro);
4275 output_mode = ASCII;
4280 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4281 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4283 if (output_mode!=X0213_1) {
4284 output_mode = X0213_1;
4288 (*o_putc)(X0213_1&0x7F);
4290 }else if (output_mode != X0208) {
4291 output_mode = X0208;
4294 (*o_putc)(kanji_intro);
4301 void base64_conv(nkf_char c2, nkf_char c1)
4303 mime_prechar(c2, c1);
4304 (*o_base64conv)(c2,c1);
4308 static nkf_char broken_buf[3];
4309 static int broken_counter = 0;
4310 static int broken_last = 0;
4311 nkf_char broken_getc(FILE *f)
4315 if (broken_counter>0) {
4316 return broken_buf[--broken_counter];
4319 if (c=='$' && broken_last != ESC
4320 && (input_mode==ASCII || input_mode==X0201)) {
4323 if (c1=='@'|| c1=='B') {
4324 broken_buf[0]=c1; broken_buf[1]=c;
4331 } else if (c=='(' && broken_last != ESC
4332 && (input_mode==X0208 || input_mode==X0201)) { /* ) */
4335 if (c1=='J'|| c1=='B') {
4336 broken_buf[0]=c1; broken_buf[1]=c;
4349 nkf_char broken_ungetc(nkf_char c, FILE *f)
4351 if (broken_counter<2)
4352 broken_buf[broken_counter++]=c;
4356 void cr_conv(nkf_char c2, nkf_char c1)
4360 if (! (c2==0&&c1==NL) ) {
4366 } else if (c1=='\r') {
4368 } else if (c1=='\n') {
4369 if (crmode_f==CRLF) {
4370 (*o_crconv)(0,'\r');
4371 } else if (crmode_f==CR) {
4372 (*o_crconv)(0,'\r');
4376 } else if (c1!='\032' || crmode_f!=NL){
4382 Return value of fold_conv()
4384 \n add newline and output char
4385 \r add newline and output nothing
4388 1 (or else) normal output
4390 fold state in prev (previous character)
4392 >0x80 Japanese (X0208/X0201)
4397 This fold algorithm does not preserve leading spaces in a line.
4398 This is the main difference from fmt.
4401 #define char_size(c2,c1) (c2?2:1)
4403 void fold_conv(nkf_char c2, nkf_char c1)
4406 nkf_char fold_state;
4408 if (c1== '\r' && !fold_preserve_f) {
4409 fold_state=0; /* ignore cr */
4410 }else if (c1== '\n'&&f_prev=='\r' && fold_preserve_f) {
4412 fold_state=0; /* ignore cr */
4413 } else if (c1== BS) {
4414 if (f_line>0) f_line--;
4416 } else if (c2==EOF && f_line != 0) { /* close open last line */
4418 } else if ((c1=='\n' && !fold_preserve_f)
4419 || ((c1=='\r'||(c1=='\n'&&f_prev!='\r'))
4420 && fold_preserve_f)) {
4422 if (fold_preserve_f) {
4426 } else if ((f_prev == c1 && !fold_preserve_f)
4427 || (f_prev == '\n' && fold_preserve_f)
4428 ) { /* duplicate newline */
4431 fold_state = '\n'; /* output two newline */
4437 if (f_prev&0x80) { /* Japanese? */
4439 fold_state = 0; /* ignore given single newline */
4440 } else if (f_prev==' ') {
4444 if (++f_line<=fold_len)
4448 fold_state = '\r'; /* fold and output nothing */
4452 } else if (c1=='\f') {
4455 fold_state = '\n'; /* output newline and clear */
4456 } else if ( (c2==0 && c1==' ')||
4457 (c2==0 && c1=='\t')||
4458 (c2=='!'&& c1=='!')) {
4459 /* X0208 kankaku or ascii space */
4460 if (f_prev == ' ') {
4461 fold_state = 0; /* remove duplicate spaces */
4464 if (++f_line<=fold_len)
4465 fold_state = ' '; /* output ASCII space only */
4467 f_prev = ' '; f_line = 0;
4468 fold_state = '\r'; /* fold and output nothing */
4472 prev0 = f_prev; /* we still need this one... , but almost done */
4474 if (c2 || c2==X0201)
4475 f_prev |= 0x80; /* this is Japanese */
4476 f_line += char_size(c2,c1);
4477 if (f_line<=fold_len) { /* normal case */
4480 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4481 f_line = char_size(c2,c1);
4482 fold_state = '\n'; /* We can't wait, do fold now */
4483 } else if (c2==X0201) {
4484 /* simple kinsoku rules return 1 means no folding */
4485 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4486 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4487 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4488 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4489 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4490 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4491 else if (SPACE<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4493 fold_state = '\n';/* add one new f_line before this character */
4496 fold_state = '\n';/* add one new f_line before this character */
4499 /* kinsoku point in ASCII */
4500 if ( c1==')'|| /* { [ ( */
4511 /* just after special */
4512 } else if (!is_alnum(prev0)) {
4513 f_line = char_size(c2,c1);
4515 } else if ((prev0==' ') || /* ignored new f_line */
4516 (prev0=='\n')|| /* ignored new f_line */
4517 (prev0&0x80)) { /* X0208 - ASCII */
4518 f_line = char_size(c2,c1);
4519 fold_state = '\n';/* add one new f_line before this character */
4521 fold_state = 1; /* default no fold in ASCII */
4525 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4526 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4527 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4528 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4529 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4530 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4531 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4532 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4533 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4534 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4535 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4536 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4537 /* default no fold in kinsoku */
4540 f_line = char_size(c2,c1);
4541 /* add one new f_line before this character */
4544 f_line = char_size(c2,c1);
4546 /* add one new f_line before this character */
4551 /* terminator process */
4552 switch(fold_state) {
4571 nkf_char z_prev2=0,z_prev1=0;
4573 void z_conv(nkf_char c2, nkf_char c1)
4576 /* if (c2) c1 &= 0x7f; assertion */
4578 if (c2 == X0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4584 if (z_prev2 == X0201) {
4586 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4588 (*o_zconv)(dv[(z_prev1-SPACE)*2], dv[(z_prev1-SPACE)*2+1]);
4590 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SPACE)*2]) { /*
\e$BH>ByE@
\e(B */
4592 (*o_zconv)(ev[(z_prev1-SPACE)*2], ev[(z_prev1-SPACE)*2+1]);
4597 (*o_zconv)(cv[(z_prev1-SPACE)*2], cv[(z_prev1-SPACE)*2+1]);
4600 if (dv[(c1-SPACE)*2] || ev[(c1-SPACE)*2]) {
4601 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4606 (*o_zconv)(cv[(c1-SPACE)*2], cv[(c1-SPACE)*2+1]);
4617 if (alpha_f&1 && c2 == 0x23 ) {
4618 /* JISX0208 Alphabet */
4620 } else if (c2 == 0x21) {
4621 /* JISX0208 Kigou */
4626 } else if (alpha_f&4) {
4631 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4637 if (alpha_f&8 && c2 == 0) {
4641 case '>': entity = ">"; break;
4642 case '<': entity = "<"; break;
4643 case '\"': entity = """; break;
4644 case '&': entity = "&"; break;
4647 while (*entity) (*o_zconv)(0, *entity++);
4653 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4658 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4662 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4666 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4670 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4674 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4678 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4682 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4686 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4691 (*o_zconv)(X0201, c);
4694 } else if (c2 == 0x25) {
4695 /* JISX0208 Katakana */
4696 static const int fullwidth_to_halfwidth[] =
4698 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4699 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4700 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4701 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4702 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4703 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4704 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4705 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4706 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4707 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4708 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4709 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4711 if (fullwidth_to_halfwidth[c1-0x20]){
4712 c2 = fullwidth_to_halfwidth[c1-0x20];
4713 (*o_zconv)(X0201, c2>>8);
4715 (*o_zconv)(X0201, c2&0xFF);
4725 #define rot13(c) ( \
4727 (c <= 'M') ? (c + 13): \
4728 (c <= 'Z') ? (c - 13): \
4730 (c <= 'm') ? (c + 13): \
4731 (c <= 'z') ? (c - 13): \
4735 #define rot47(c) ( \
4737 ( c <= 'O' ) ? (c + 47) : \
4738 ( c <= '~' ) ? (c - 47) : \
4742 void rot_conv(nkf_char c2, nkf_char c1)
4744 if (c2==0 || c2==X0201 || c2==ISO8859_1) {
4750 (*o_rot_conv)(c2,c1);
4753 void hira_conv(nkf_char c2, nkf_char c1)
4757 if (0x20 < c1 && c1 < 0x74) {
4759 (*o_hira_conv)(c2,c1);
4761 } else if (c1 == 0x74 && (output_conv == w_oconv || output_conv == w_oconv16)) {
4763 c1 = CLASS_UNICODE | 0x3094;
4764 (*o_hira_conv)(c2,c1);
4767 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4769 (*o_hira_conv)(c2,c1);
4774 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4777 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4779 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4783 (*o_hira_conv)(c2,c1);
4787 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4789 static const nkf_char range[RANGE_NUM_MAX][2] = {
4810 nkf_char start, end, c;
4812 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4816 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4821 for (i = 0; i < RANGE_NUM_MAX; i++) {
4822 start = range[i][0];
4825 if (c >= start && c <= end) {
4830 (*o_iso2022jp_check_conv)(c2,c1);
4834 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4836 static const unsigned char *mime_pattern[] = {
4837 (const unsigned char *)"\075?EUC-JP?B?",
4838 (const unsigned char *)"\075?SHIFT_JIS?B?",
4839 (const unsigned char *)"\075?ISO-8859-1?Q?",
4840 (const unsigned char *)"\075?ISO-8859-1?B?",
4841 (const unsigned char *)"\075?ISO-2022-JP?B?",
4842 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4843 #if defined(UTF8_INPUT_ENABLE)
4844 (const unsigned char *)"\075?UTF-8?B?",
4845 (const unsigned char *)"\075?UTF-8?Q?",
4847 (const unsigned char *)"\075?US-ASCII?Q?",
4852 /*
\e$B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u
\e(B */
4853 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4854 e_iconv, s_iconv, 0, 0, 0, 0,
4855 #if defined(UTF8_INPUT_ENABLE)
4861 static const nkf_char mime_encode[] = {
4862 JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
4863 #if defined(UTF8_INPUT_ENABLE)
4870 static const nkf_char mime_encode_method[] = {
4871 'B', 'B','Q', 'B', 'B', 'Q',
4872 #if defined(UTF8_INPUT_ENABLE)
4880 #define MAXRECOVER 20
4882 void switch_mime_getc(void)
4884 if (i_getc!=mime_getc) {
4885 i_mgetc = i_getc; i_getc = mime_getc;
4886 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4887 if(mime_f==STRICT_MIME) {
4888 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4889 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4894 void unswitch_mime_getc(void)
4896 if(mime_f==STRICT_MIME) {
4897 i_mgetc = i_mgetc_buf;
4898 i_mungetc = i_mungetc_buf;
4901 i_ungetc = i_mungetc;
4902 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4903 mime_iconv_back = NULL;
4906 nkf_char mime_begin_strict(FILE *f)
4910 const unsigned char *p,*q;
4911 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4913 mime_decode_mode = FALSE;
4914 /* =? has been checked */
4916 p = mime_pattern[j];
4919 for(i=2;p[i]>' ';i++) { /* start at =? */
4920 if ( ((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i] ) {
4921 /* pattern fails, try next one */
4923 while (mime_pattern[++j]) {
4924 p = mime_pattern[j];
4925 for(k=2;k<i;k++) /* assume length(p) > i */
4926 if (p[k]!=q[k]) break;
4927 if (k==i && nkf_toupper(c1)==p[k]) break;
4929 p = mime_pattern[j];
4930 if (p) continue; /* found next one, continue */
4931 /* all fails, output from recovery buffer */
4939 mime_decode_mode = p[i-2];
4941 mime_iconv_back = iconv;
4942 set_iconv(FALSE, mime_priority_func[j]);
4943 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4945 if (mime_decode_mode=='B') {
4946 mimebuf_f = unbuf_f;
4948 /* do MIME integrity check */
4949 return mime_integrity(f,mime_pattern[j]);
4957 nkf_char mime_getc_buf(FILE *f)
4959 /* we don't keep eof of Fifo, because it contains ?= as
4960 a terminator. It was checked in mime_integrity. */
4961 return ((mimebuf_f)?
4962 (*i_mgetc_buf)(f):Fifo(mime_input++));
4965 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
4968 (*i_mungetc_buf)(c,f);
4970 Fifo(--mime_input) = (unsigned char)c;
4974 nkf_char mime_begin(FILE *f)
4979 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4980 /* re-read and convert again from mime_buffer. */
4982 /* =? has been checked */
4984 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
4985 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4986 /* We accept any character type even if it is broken by new lines */
4987 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
4988 if (c1=='\n'||c1==' '||c1=='\r'||
4989 c1=='-'||c1=='_'||is_alnum(c1) ) continue;
4991 /* Failed. But this could be another MIME preamble */
4999 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5000 if (!(++i<MAXRECOVER) || c1==EOF) break;
5001 if (c1=='b'||c1=='B') {
5002 mime_decode_mode = 'B';
5003 } else if (c1=='q'||c1=='Q') {
5004 mime_decode_mode = 'Q';
5008 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5009 if (!(++i<MAXRECOVER) || c1==EOF) break;
5011 mime_decode_mode = FALSE;
5017 if (!mime_decode_mode) {
5018 /* false MIME premble, restart from mime_buffer */
5019 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5020 /* Since we are in MIME mode until buffer becomes empty, */
5021 /* we never go into mime_begin again for a while. */
5024 /* discard mime preemble, and goto MIME mode */
5026 /* do no MIME integrity check */
5027 return c1; /* used only for checking EOF */
/*
 * Output hook taking a single character.
 * NOTE(review): the body is not visible in this extract; the name suggests
 * the character is discarded -- confirm.
 */
5031 void no_putc(nkf_char c)
/*
 * Write str followed by a newline to stderr.
 * NOTE(review): any condition guarding the output is not visible in this
 * extract.
 */
5036 void debug(const char *str)
5039 fprintf(stderr, "%s\n", str);
/*
 * Record codename as the name of the detected input encoding.
 * is_inputcode_mixed is raised when codename is non-empty and differs from
 * the name already held in input_codename (the leading part of that
 * condition is not visible in this extract).  The pointer itself is stored,
 * not a copy, so the caller must keep the string alive.
 */
5044 void set_input_codename(char *codename)
5048 strcmp(codename, "") != 0 &&
5049 strcmp(codename, input_codename) != 0)
5051 is_inputcode_mixed = TRUE;
5053 input_codename = codename;
5054 is_inputcode_set = TRUE;
5057 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Print the guessed input encoding on stdout.
 * Output form: "[filename:]CODENAME[ (CR|LF|CRLF)]" and a newline.
 * codename starts as "BINARY"; input_codename is consulted only when the
 * input was not flagged as mixed.  The line-ending label is derived from
 * crmode_f and is omitted when crmode_f matches none of CR, NL, CRLF.
 * filename may be NULL, in which case no prefix is printed.
 */
5058 void print_guessed_code(char *filename)
5060 char *codename = "BINARY";
5061 char *str_crmode = NULL;
5062 if (!is_inputcode_mixed) {
5063 if (strcmp(input_codename, "") == 0) {
5066 codename = input_codename;
5068 if (crmode_f == CR) str_crmode = "CR";
5069 else if (crmode_f == NL) str_crmode = "LF";
5070 else if (crmode_f == CRLF) str_crmode = "CRLF";
5072 if (filename != NULL) printf("%s:", filename);
5073 if (str_crmode != NULL) printf("%s (%s)\n", codename, str_crmode);
5074 else printf("%s\n", codename);
/*
 * Shared reader for two-hex-digit escapes.
 *   ch : escape introducer (cap_getc passes ':', url_getc passes '%')
 *   f  : input stream handed to the hooks
 *   g  : getc hook, u : ungetc hook
 * When both following characters are hex digits the decoded byte
 * (hex2bin(c2) << 4) | hex2bin(c3) is returned.
 * NOTE(review): the reads and the fallback taken when a digit check fails
 * are not visible in this extract.
 */
5080 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5082 nkf_char c1, c2, c3;
5088 if (!nkf_isxdigit(c2)){
5093 if (!nkf_isxdigit(c3)){
5098 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Read one character, decoding ":XX" hex escapes via hex_getc on the i_cgetc / i_cungetc hooks. */
5101 nkf_char cap_getc(FILE *f)
5103 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Push c back by forwarding to the i_cungetc hook; returns that hook's result. */
5106 nkf_char cap_ungetc(nkf_char c, FILE *f)
5108 return (*i_cungetc)(c, f);
/* Read one character, decoding "%XX" hex escapes via hex_getc on the i_ugetc / i_uungetc hooks. */
5111 nkf_char url_getc(FILE *f)
5113 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Push c back by forwarding to the i_uungetc hook; returns that hook's result. */
5116 nkf_char url_ungetc(nkf_char c, FILE *f)
5118 return (*i_uungetc)(c, f);
5122 #ifdef NUMCHAR_OPTION
/*
 * Reader that decodes numeric character references using the i_ngetc /
 * i_nungetc hooks.  An 'x' or 'X' selects the hexadecimal form (at most 7
 * digits, checked with nkf_isxdigit); otherwise the decimal form is read (at
 * most 8 digits, checked with nkf_isdigit).  A decoded value is returned
 * tagged with CLASS_UNICODE.
 * NOTE(review): the prefix matching, the shifting/multiplying of c between
 * digits and the fallback that un-reads buf are not visible in this extract.
 */
5123 nkf_char numchar_getc(FILE *f)
5125 nkf_char (*g)(FILE *) = i_ngetc;
5126 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5137 if (buf[i] == 'x' || buf[i] == 'X'){
5138 for (j = 0; j < 7; j++){
5140 if (!nkf_isxdigit(buf[i])){
5147 c |= hex2bin(buf[i]);
5150 for (j = 0; j < 8; j++){
5154 if (!nkf_isdigit(buf[i])){
/* hex2bin is also used for decimal digits here */
5161 c += hex2bin(buf[i]);
5167 return CLASS_UNICODE | c;
/* Push c back by forwarding to the i_nungetc hook; returns that hook's result. */
5176 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5178 return (*i_nungetc)(c, f);
5182 #ifdef UNICODE_NORMALIZATION
5184 /* Normalization Form C */
/*
 * Reader that replaces a decomposed sequence by its composed form, using the
 * i_nfc_getc / i_nfc_ungetc hooks.  The bytes gathered in buf are looked up
 * by binary search over normalization_table (NORMALIZATION_TABLE_LENGTH
 * entries), comparing against each entry's nfd sequence; on a match the
 * entry's nfc sequence is copied over the start of buf.
 * NOTE(review): filling buf, the final un-read of surplus bytes and the
 * return value are not visible in this extract.
 */
5185 nkf_char nfc_getc(FILE *f)
5187 nkf_char (*g)(FILE *f) = i_nfc_getc;
5188 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5189 int i=0, j, k=1, lower, upper;
5191 const nkf_nfchar *array;
/* keep extending the candidate while the newest byte is not a 10xxxxxx byte */
5194 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5195 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5196 while (upper >= lower) {
5197 j = (lower+upper) / 2;
5198 array = normalization_table[j].nfd;
5199 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5200 if (array[k] != buf[k]){
/* first differing byte decides which half of the table to search next */
5201 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
5208 array = normalization_table[j].nfc;
5209 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5210 buf[i] = (nkf_char)(array[i]);
/* Push c back by forwarding to the i_nfc_ungetc hook; returns that hook's result. */
5221 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5223 return (*i_nfc_ungetc)(c, f);
5225 #endif /* UNICODE_NORMALIZATION */
/*
 * Body of the MIME decoding reader.
 * NOTE(review): the function header is not visible in this extract; the call
 * to unswitch_mime_getc() suggests this is mime_getc(FILE *f) -- confirm.
 *
 * Order of work as far as visible here:
 *  1. bytes already decoded into the Fifo (mime_top != mime_last) are
 *     returned first;
 *  2. with mime_decode_mode 1 or FALSE, decoding is switched off and input
 *     is read straight from i_getc;
 *  3. mode Q decodes "=XX" hex pairs and, outside FIXED_MIME, maps '_' to a
 *     space and treats "?=" as the end of the encoded word;
 *  4. mode B reads four characters, decodes them to three bytes which are
 *     appended to the Fifo, and returns the first of them.
 * After an encoded word ends, following white space is collected in
 * lwsp_buf and pushed back through i_ungetc unless the visible condition
 * (next character '=' after a trailing SPACE or TAB) holds.
 */
5231 nkf_char c1, c2, c3, c4, cc;
5232 nkf_char t1, t2, t3, t4, mode, exit_mode;
5233 nkf_char lwsp_count;
5236 nkf_char lwsp_size = 128;
5238 if (mime_top != mime_last) { /* Something is in FIFO */
5239 return Fifo(mime_top++);
5241 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5242 mime_decode_mode=FALSE;
5243 unswitch_mime_getc();
5244 return (*i_getc)(f);
5247 if (mimebuf_f == FIXED_MIME)
5248 exit_mode = mime_decode_mode;
/* ---- Q (quoted-printable style) decoding ---- */
5251 if (mime_decode_mode == 'Q') {
5252 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5254 if (c1=='_' && mimebuf_f != FIXED_MIME) return ' ';
5255 if (c1<=' ' || DEL<=c1) {
5256 mime_decode_mode = exit_mode; /* prepare for quit */
5259 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5263 mime_decode_mode = exit_mode; /* prepare for quit */
5264 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5265 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5266 /* end Q encoding */
5267 input_mode = exit_mode;
/* buffer is allocated 5 bytes larger than lwsp_size */
5269 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5270 if (lwsp_buf==NULL) {
5271 perror("can't malloc");
5274 while ((c1=(*i_getc)(f))!=EOF) {
5279 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5287 if ((c1=(*i_getc)(f))!=EOF && c1 == NL) {
5288 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5303 lwsp_buf[lwsp_count] = (unsigned char)c1;
5304 if (lwsp_count++>lwsp_size){
/* result goes to a temporary so lwsp_buf survives a failed realloc */
5306 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5307 if (lwsp_buf_new==NULL) {
5309 perror("can't realloc");
5312 lwsp_buf = lwsp_buf_new;
5318 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
/* push back in reverse order; the loop stops before index 0 */
5320 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5321 i_ungetc(lwsp_buf[lwsp_count],f);
5327 if (c1=='='&&c2<' ') { /* this is soft wrap */
/* NOTE(review): the loop condition reads one character and the statement
   inside reads another, so only every second character is tested against
   EOF and the ones in between are dropped unchecked -- confirm intent. */
5328 while((c1 = (*i_mgetc)(f)) <=' ') {
5329 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5331 mime_decode_mode = 'Q'; /* still in MIME */
5332 goto restart_mime_q;
5335 mime_decode_mode = 'Q'; /* still in MIME */
5339 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5340 if (c2<=' ') return c2;
5341 mime_decode_mode = 'Q'; /* still in MIME */
/* c2 and c3 are the two hex digits following the equals sign */
5342 return ((hex2bin(c2)<<4) + hex2bin(c3));
/* ---- neither Q nor B: leave MIME mode and pass input through ---- */
5345 if (mime_decode_mode != 'B') {
5346 mime_decode_mode = FALSE;
5347 return (*i_mgetc)(f);
5351 /* Base64 encoding */
5353 MIME allows line break in the middle of
5354 Base64, but we are very pessimistic in decoding
5355 in unbuf mode because MIME encoded code may broken by
5356 less or editor's control sequence (such as ESC-[-K in unbuffered
5357 mode. ignore incomplete MIME.
5359 mode = mime_decode_mode;
5360 mime_decode_mode = exit_mode; /* prepare for quit */
5362 while ((c1 = (*i_mgetc)(f))<=' ') {
5367 if ((c2 = (*i_mgetc)(f))<=' ') {
5370 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5371 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5374 if ((c1 == '?') && (c2 == '=')) {
5377 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5378 if (lwsp_buf==NULL) {
5379 perror("can't malloc");
5382 while ((c1=(*i_getc)(f))!=EOF) {
5387 if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5395 if ((c1=(*i_getc)(f))!=EOF) {
5399 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SPACE||c1==TAB)) {
5414 lwsp_buf[lwsp_count] = (unsigned char)c1;
5415 if (lwsp_count++>lwsp_size){
5417 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5418 if (lwsp_buf_new==NULL) {
5420 perror("can't realloc");
5423 lwsp_buf = lwsp_buf_new;
5429 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SPACE && lwsp_buf[lwsp_count-1] != TAB))) {
5431 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5432 i_ungetc(lwsp_buf[lwsp_count],f);
5439 if ((c3 = (*i_mgetc)(f))<=' ') {
5442 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5443 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5447 if ((c4 = (*i_mgetc)(f))<=' ') {
5450 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5451 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5455 mime_decode_mode = mode; /* still in MIME sigh... */
5457 /* BASE 64 decoding */
/* t1..t4 are the 6-bit values of the four input characters; each cc below
   is one of the three output bytes, appended to the Fifo at mime_last */
5459 t1 = 0x3f & base64decode(c1);
5460 t2 = 0x3f & base64decode(c2);
5461 t3 = 0x3f & base64decode(c3);
5462 t4 = 0x3f & base64decode(c4);
5463 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5465 Fifo(mime_last++) = (unsigned char)cc;
5466 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5468 Fifo(mime_last++) = (unsigned char)cc;
5469 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5471 Fifo(mime_last++) = (unsigned char)cc;
5476 return Fifo(mime_top++);
/*
 * Push c back into the decoded-byte Fifo: mime_top is stepped back by one
 * and c is stored there as an unsigned char.  f is not used by the visible
 * statement.
 * NOTE(review): the return statement is not visible in this extract.
 */
5479 nkf_char mime_ungetc(nkf_char c, FILE *f)
5481 Fifo(--mime_top) = (unsigned char)c;
/*
 * Pre-scan an encoded word before committing to decode it.
 *   f : input stream read through the i_getc hook
 *   p : NUL-terminated preamble pattern; it is copied into the Fifo first
 * Input is then buffered in the Fifo until the two-character terminator is
 * seen (current character '=' with d equal to '?'), the buffer is full, or
 * a character outside '+', '/', '=', '?' and alphanumerics appears.
 * When the word is incomplete, mime_last is moved to mime_input,
 * mime_decode_mode is set to 1 and switch_mime_getc() is called, so the
 * buffered bytes are re-read without decoding.
 * NOTE(review): d presumably holds the previously read character; its
 * update and the return statements are not visible in this extract.
 */
5485 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5489 /* In buffered mode, read until =? or NL or buffer full
5491 mime_input = mime_top;
5492 mime_last = mime_top;
5494 while(*p) Fifo(mime_input++) = *p++;
5497 while((c=(*i_getc)(f))!=EOF) {
5498 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5499 break; /* buffer full */
5501 if (c=='=' && d=='?') {
5502 /* checked. skip header, start decode */
5503 Fifo(mime_input++) = (unsigned char)c;
5504 /* mime_last_input = mime_input; */
5509 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5511 /* Should we check length mod 4? */
5512 Fifo(mime_input++) = (unsigned char)c;
5515 /* In case of Incomplete MIME, no MIME decode */
5516 Fifo(mime_input++) = (unsigned char)c;
5517 mime_last = mime_input; /* point undecoded buffer */
5518 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5519 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Map one Base64 alphabet character to its 6-bit value, following the
 * per-branch comments: A..Z give 0-25, a..z give 26-51, 0..9 give 52-61,
 * '+' gives 62 and the final branch gives 63.  The character constants
 * '4', '>' and '?' are used for the numbers noted in the adjacent comments
 * (52, 62 and 63).
 * NOTE(review): the range tests selecting the first two branches and the
 * return statement are not visible in this extract.
 */
5523 nkf_char base64decode(nkf_char c)
5528 i = c - 'A'; /* A..Z 0-25 */
5530 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5532 } else if (c > '/') {
5533 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5534 } else if (c == '+') {
5535 i = '>' /* 62 */ ; /* + 62 */
5537 i = '?' /* 63 */ ; /* / 63 */
/* Base64 alphabet, indexed by 6-bit value, used by the MIME encoder. */
5542 static const char basis_64[] =
5543 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Previous input byte; its leftover bits are combined with the next byte in mimeout_addchar and flushed in close_mime. */
5545 static nkf_char b64c;
/* Capacity of the pending-output buffer; the array has one extra slot. */
5546 #define MIMEOUT_BUF_LENGTH (60)
5547 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of characters currently held in mimeout_buf. */
5548 int mimeout_buf_count = 0;
/* Non-zero makes open_mime skip its leading white-space handling; open_mime resets it to FALSE. */
5549 int mimeout_preserve_space = 0;
/*
 * Map a 4-bit value (0..15) to its upper-case hexadecimal digit character.
 * The argument is parenthesised at every use so that an expression such as
 * itoh4(c & 0xf) expands with the intended precedence.  The argument is
 * still evaluated more than once, so it must not have side effects.
 */
#define itoh4(c) ((c)>=10?(c)+'A'-10:(c)+'0')
/*
 * Start a MIME encoded word for output mode `mode`.
 * The preamble defaults to mime_pattern[0]; the mime_encode table is
 * searched for `mode` and, where it matches, the corresponding mime_pattern
 * entry is used.  mimeout_mode is taken from mime_encode_method[i].
 * Leading SPACE/TAB/CR/NL characters held in mimeout_buf are written
 * through o_mputc unless mimeout_preserve_space is set; that flag is then
 * cleared.  Finally the buffer count is reset to 0 and buffered characters
 * are fed back through mime_putc.
 * NOTE(review): emission of the preamble itself and the loop bounds of the
 * final replay are not visible in this extract.
 */
5552 void open_mime(nkf_char mode)
5554 const unsigned char *p;
5557 p = mime_pattern[0];
5558 for(i=0;mime_pattern[i];i++) {
5559 if (mode == mime_encode[i]) {
5560 p = mime_pattern[i];
5564 mimeout_mode = mime_encode_method[i];
5567 if (base64_count>45) {
5568 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5569 (*o_mputc)(mimeout_buf[i]);
5575 if (!mimeout_preserve_space && mimeout_buf_count>0
5576 && (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5577 || mimeout_buf[i]==CR || mimeout_buf[i]==NL )) {
5581 if (!mimeout_preserve_space) {
5582 for (;i<mimeout_buf_count;i++) {
5583 if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
5584 || mimeout_buf[i]==CR || mimeout_buf[i]==NL ) {
5585 (*o_mputc)(mimeout_buf[i]);
5592 mimeout_preserve_space = FALSE;
/* the count is saved in j and zeroed before replaying the buffer */
5598 j = mimeout_buf_count;
5599 mimeout_buf_count = 0;
5601 mime_putc(mimeout_buf[i]);
/*
 * Finish the current MIME encoded word.
 * Depending on mimeout_mode, the bits still pending in b64c are flushed as
 * a final Base64 character: the low 2 bits shifted left by 4, or the low 4
 * bits shifted left by 2.
 * NOTE(review): the case labels, padding characters, terminator output and
 * the mode reset are not visible in this extract.
 */
5605 void close_mime(void)
5615 switch(mimeout_mode) {
5620 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5626 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5632 if (mimeout_f!=FIXED_MIME) {
5634 } else if (mimeout_mode != 'Q')
/*
 * Encode one byte c in the current mimeout_mode and write the result
 * through o_mputc.
 * In one branch a non-alphanumeric c is written as two upper-case hex
 * digits via itoh4 (high nibble first).  The Base64 branches write
 * basis_64 characters built from c and from the bits of the previous byte
 * kept in b64c.
 * NOTE(review): the case labels, the updates of b64c / mimeout_mode and the
 * column counting are not visible in this extract.
 */
5639 void mimeout_addchar(nkf_char c)
5641 switch(mimeout_mode) {
5646 } else if(!nkf_isalnum(c)) {
5648 (*o_mputc)(itoh4(((c>>4)&0xf)));
5649 (*o_mputc)(itoh4((c&0xf)));
/* top 6 bits of c */
5658 (*o_mputc)(basis_64[c>>2]);
/* low 2 bits of the previous byte + top 4 bits of c */
5663 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
/* low 4 bits of the previous byte + top 2 bits of c, then low 6 bits of c */
5669 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5670 (*o_mputc)(basis_64[c & 0x3F]);
5681 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * Line-folding check made before a character (c2, c1) is MIME-encoded.
 * The projected column is base64_count plus the encoded size of the pending
 * buffer (mimeout_buf_count/3*4).  When it exceeds the limit (73 in one
 * branch, 66 in the other) the sequence (EOF,0), (0,NL), (0,SPACE) is sent
 * through o_base64conv.
 * NOTE(review): the conditions selecting between the two limits are not
 * visible in this extract.  The trailing code dealing with mime_lastchar1
 * and mime_lastchar2 is commented out.
 */
5683 void mime_prechar(nkf_char c2, nkf_char c1)
5687 if (base64_count + mimeout_buf_count/3*4> 73){
5688 (*o_base64conv)(EOF,0);
5689 (*o_base64conv)(0,NL);
5690 (*o_base64conv)(0,SPACE);
5693 if (base64_count + mimeout_buf_count/3*4> 66){
5694 (*o_base64conv)(EOF,0);
5695 (*o_base64conv)(0,NL);
5696 (*o_base64conv)(0,SPACE);
5698 }/*else if (mime_lastchar2){
5699 if (c1 <=DEL && !nkf_isspace(c1)){
5700 (*o_base64conv)(0,SPACE);
5704 if (c2 && mime_lastchar2 == 0
5705 && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
5706 (*o_base64conv)(0,SPACE);
5709 /*mime_lastchar2 = c2;
5710 mime_lastchar1 = c1;*/
/*
 * Output one character c through the MIME encoder; c == EOF requests a
 * flush.
 * Structure as far as visible here:
 *  - mimeout_f == FIXED_MIME: column limit checks (base64_count > 71) with
 *    separate handling for EOF and non-EOF;
 *  - otherwise, on EOF the pending mimeout_buf is drained through
 *    mimeout_addchar;
 *  - mimeout_mode 'Q': ASCII / ISO8859_1 characters up to DEL get special
 *    treatment for CR, NL and other characters up to SPACE;
 *  - no encoded word open (!mimeout_mode): plain characters are collected
 *    in mimeout_buf and written with o_mputc; open_mime(output_mode) is
 *    called when the buffer exceeds MIMEOUT_BUF_LENGTH or an encoded word
 *    has to be started;
 *  - inside an encoded word (per the existing comment: mode 'B', 1, 2): the
 *    buffer is either encoded with mimeout_addchar or written raw with
 *    o_mputc, depending on lastchar and c.
 * NOTE(review): many branch bodies, the close_mime calls and the loop
 * headers using j are not visible in this extract.
 */
5713 void mime_putc(nkf_char c)
5718 if (mimeout_f == FIXED_MIME){
5719 if (mimeout_mode == 'Q'){
5720 if (base64_count > 71){
5721 if (c!=CR && c!=NL) {
5728 if (base64_count > 71){
5733 if (c == EOF) { /* c==EOF */
5737 if (c != EOF) { /* c!=EOF */
5743 /* mimeout_f != FIXED_MIME */
5745 if (c == EOF) { /* c==EOF */
5746 j = mimeout_buf_count;
5747 mimeout_buf_count = 0;
5750 if (!nkf_isblank(mimeout_buf[j-1])) {
5752 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5755 mimeout_addchar(mimeout_buf[i]);
5759 mimeout_addchar(mimeout_buf[i]);
5763 mimeout_addchar(mimeout_buf[i]);
5769 mimeout_addchar(mimeout_buf[i]);
5775 if (mimeout_mode=='Q') {
5776 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5777 if (c == CR || c == NL) {
5782 } else if (c <= SPACE) {
5784 if (base64_count > 70) {
5788 if (!nkf_isblank(c)) {
/* lastchar is the most recently buffered character, when there is one */
5799 if (mimeout_buf_count > 0){
5800 lastchar = mimeout_buf[mimeout_buf_count - 1];
5805 if (!mimeout_mode) {
5806 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
5807 if (nkf_isspace(c)) {
5808 if (c==CR || c==NL) {
5811 for (i=0;i<mimeout_buf_count;i++) {
5812 (*o_mputc)(mimeout_buf[i]);
5813 if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
/* the buffer restarts with c as its only content */
5819 mimeout_buf[0] = (char)c;
5820 mimeout_buf_count = 1;
5822 if (base64_count > 1
5823 && base64_count + mimeout_buf_count > 76
5824 && mimeout_buf[0] != CR && mimeout_buf[0] != NL){
5827 if (!nkf_isspace(mimeout_buf[0])){
5832 mimeout_buf[mimeout_buf_count++] = (char)c;
5833 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5834 open_mime(output_mode);
5839 if (lastchar==CR || lastchar == NL){
5840 for (i=0;i<mimeout_buf_count;i++) {
5841 (*o_mputc)(mimeout_buf[i]);
5844 mimeout_buf_count = 0;
5846 if (lastchar==SPACE) {
/* everything except the trailing SPACE is written; one SPACE stays buffered */
5847 for (i=0;i<mimeout_buf_count-1;i++) {
5848 (*o_mputc)(mimeout_buf[i]);
5851 mimeout_buf[0] = SPACE;
5852 mimeout_buf_count = 1;
5854 open_mime(output_mode);
5857 /* mimeout_mode == 'B', 1, 2 */
5858 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
5859 if (lastchar == CR || lastchar == NL){
5860 if (nkf_isblank(c)) {
5861 for (i=0;i<mimeout_buf_count;i++) {
5862 mimeout_addchar(mimeout_buf[i]);
5864 mimeout_buf_count = 0;
5865 } else if (SPACE<c && c<DEL) {
5867 for (i=0;i<mimeout_buf_count;i++) {
5868 (*o_mputc)(mimeout_buf[i]);
5871 mimeout_buf_count = 0;
5874 if (c==SPACE || c==TAB || c==CR || c==NL) {
5875 for (i=0;i<mimeout_buf_count;i++) {
5876 if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
5878 for (i=0;i<mimeout_buf_count;i++) {
5879 (*o_mputc)(mimeout_buf[i]);
5882 mimeout_buf_count = 0;
5885 mimeout_buf[mimeout_buf_count++] = (char)c;
5886 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5888 for (i=0;i<mimeout_buf_count;i++) {
5889 (*o_mputc)(mimeout_buf[i]);
5892 mimeout_buf_count = 0;
5896 if (mimeout_buf_count>0 && SPACE<c && c!='=') {
5897 mimeout_buf[mimeout_buf_count++] = (char)c;
5898 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5899 j = mimeout_buf_count;
5900 mimeout_buf_count = 0;
5902 mimeout_addchar(mimeout_buf[i]);
5909 if (mimeout_buf_count>0) {
5910 j = mimeout_buf_count;
5911 mimeout_buf_count = 0;
5913 if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
5915 mimeout_addchar(mimeout_buf[i]);
5921 (*o_mputc)(mimeout_buf[i]);
5923 open_mime(output_mode);
5930 #if defined(PERL_XS) || defined(WIN32DLL)
/*
 * Reset of option flags, converter hooks and decoder state to their
 * defaults.
 * NOTE(review): the function header is not visible in this extract
 * (presumably the re-initialisation routine used by the PERL_XS / WIN32DLL
 * builds selected by the #if above -- confirm), and many assignments
 * between the visible ones are missing.
 */
5934 struct input_code *p = input_code_list;
5947 mime_f = STRICT_MIME;
5948 mime_decode_f = FALSE;
5953 #if defined(MSDOS) || defined(__OS2__)
5958 iso2022jp_f = FALSE;
5959 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5960 ms_ucs_map_f = UCS_MAP_ASCII;
5962 #ifdef UTF8_INPUT_ENABLE
5963 no_cp932ext_f = FALSE;
5964 no_best_fit_chars_f = FALSE;
5965 encode_fallback = NULL;
5966 unicode_subchar = '?';
5967 input_endian = ENDIAN_BIG;
5969 #ifdef UTF8_OUTPUT_ENABLE
5970 output_bom_f = FALSE;
5971 output_endian = ENDIAN_BIG;
5973 #ifdef UNICODE_NORMALIZATION
5986 is_inputcode_mixed = FALSE;
5987 is_inputcode_set = FALSE;
5991 #ifdef SHIFTJIS_CP932
/* clear all 256 entries of prefix_table */
6001 for (i = 0; i < 256; i++){
6002 prefix_table[i] = 0;
6006 mimeout_buf_count = 0;
6011 fold_preserve_f = FALSE;
6014 kanji_intro = DEFAULT_J;
6015 ascii_intro = DEFAULT_R;
6016 fold_margin = FOLD_MARGIN;
6017 output_conv = DEFAULT_CONV;
6018 oconv = DEFAULT_CONV;
/* the o_* converter hooks below are all pointed at no_connection */
6019 o_zconv = no_connection;
6020 o_fconv = no_connection;
6021 o_crconv = no_connection;
6022 o_rot_conv = no_connection;
6023 o_hira_conv = no_connection;
6024 o_base64conv = no_connection;
6025 o_iso2022jp_check_conv = no_connection;
/* input hooks fall back to the std_getc / std_ungetc readers */
6028 i_ungetc = std_ungetc;
6030 i_bungetc = std_ungetc;
6033 i_mungetc = std_ungetc;
6034 i_mgetc_buf = std_getc;
6035 i_mungetc_buf = std_ungetc;
6036 output_mode = ASCII;
6039 mime_decode_mode = FALSE;
6045 z_prev2=0,z_prev1=0;
6047 iconv_for_check = 0;
6049 input_codename = "";
/* Two-argument converter hook that forwards to no_connection2 with 0 as the third argument. */
6056 void no_connection(nkf_char c2, nkf_char c1)
6058 no_connection2(c2,c1,0);
/*
 * Reports "nkf internal module connection failure." on stderr.  None of the
 * three arguments is used by the visible statements.  The existing LINT
 * comment marks the return value 0 as being there for lint.
 * NOTE(review): a statement between the message and the return is not
 * visible in this extract.
 */
6061 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6063 fprintf(stderr,"nkf internal module connection failure.\n");
6065 return 0; /* LINT */
/*
 * Command-line help text, written line by line to stderr.  Exactly one of
 * the four "j,s,e,w" lines is compiled in, chosen by the DEFAULT_CODE_*
 * macro, and several option lines depend on feature macros.
 * NOTE(review): the function header and the #endif / #ifdef lines
 * surrounding these statements are not visible in this extract; the
 * redefinition of fprintf to dllprintf on the first line is presumably
 * limited to the DLL build -- confirm.
 * NOTE(review): the "I" option text spells "charactor"; left untouched here
 * because it is runtime output.
 */
6070 #define fprintf dllprintf
6074 fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
6075 fprintf(stderr,"Flags:\n");
6076 fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
6077 #ifdef DEFAULT_CODE_SJIS
6078 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n");
6080 #ifdef DEFAULT_CODE_JIS
6081 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n");
6083 #ifdef DEFAULT_CODE_EUC
6084 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n");
6086 #ifdef DEFAULT_CODE_UTF8
6087 fprintf(stderr,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n");
6089 #ifdef UTF8_OUTPUT_ENABLE
6090 fprintf(stderr," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n");
6092 fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n");
6093 #ifdef UTF8_INPUT_ENABLE
6094 fprintf(stderr," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n");
6096 fprintf(stderr,"t no conversion\n");
6097 fprintf(stderr,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n");
6098 fprintf(stderr,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n");
6099 fprintf(stderr,"r {de/en}crypt ROT13/47\n");
6100 fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
6101 fprintf(stderr,"v Show this usage. V: show version\n");
6102 fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
6103 fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
6104 fprintf(stderr,"l ISO8859-1 (Latin-1) support\n");
6105 fprintf(stderr,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n");
6106 fprintf(stderr,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n");
6107 fprintf(stderr," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n");
6108 fprintf(stderr," 4: JISX0208 Katakana to JISX0201 Katakana\n");
6109 fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n");
6110 fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n");
6112 fprintf(stderr,"T Text mode output\n");
6114 fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n");
6115 fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
6116 fprintf(stderr,"d,c Convert line breaks -d: LF -c: CRLF\n");
6117 fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
6118 fprintf(stderr,"\n");
6119 fprintf(stderr,"Long name options\n");
6120 fprintf(stderr," --ic=<input codeset> --oc=<output codeset>\n");
6121 fprintf(stderr," Specify the input or output codeset\n");
6122 fprintf(stderr," --fj --unix --mac --windows\n");
6123 fprintf(stderr," --jis --euc --sjis --utf8 --utf16 --mime --base64\n");
6124 fprintf(stderr," Convert for the system or code\n");
6125 fprintf(stderr," --hiragana --katakana --katakana-hiragana\n");
6126 fprintf(stderr," To Hiragana/Katakana Conversion\n");
6127 fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
6129 fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
6131 #ifdef NUMCHAR_OPTION
6132 fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
6134 #ifdef UTF8_INPUT_ENABLE
6135 fprintf(stderr," --fb-{skip, html, xml, perl, java, subchar}\n");
6136 fprintf(stderr," Specify how nkf handles unassigned characters\n");
6139 fprintf(stderr," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n");
6140 fprintf(stderr," Overwrite original listed files by filtered result\n");
6141 fprintf(stderr," --overwrite preserves timestamp of original files\n");
6143 fprintf(stderr," -g --guess Guess the input code\n");
6144 fprintf(stderr," --help --version Show this help/the version\n");
6145 fprintf(stderr," For more information, see also man nkf\n");
6146 fprintf(stderr,"\n");
/*
 * Version banner written to stderr: "Network Kanji Filter Version
 * <NKF_VERSION> (<NKF_RELEASE_DATE>) ", then a second line with CopyRight.
 * The format string of the first call is continued by platform-specific
 * pieces selected by the MSDOS / __WIN16__ / __WIN32__ / __OS2__ tests.
 * NOTE(review): the function header, those string pieces and the matching
 * #endif lines are not visible in this extract.
 */
6152 fprintf(stderr,"Network Kanji Filter Version %s (%s) "
6153 #if defined(MSDOS) && !defined(__WIN32__) && !defined(__WIN16__) && !defined(__OS2__)
6156 #if defined(MSDOS) && defined(__WIN16__)
6159 #if defined(MSDOS) && defined(__WIN32__)
6165 ,NKF_VERSION,NKF_RELEASE_DATE);
6166 fprintf(stderr,"\n%s\n",CopyRight);
6171 ** Patch authors:
6172 ** void@merope.pleiades.or.jp (Kusakabe Youichi)
6173 ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp>
6174 ** ohta@src.ricoh.co.jp (Junn Ohta)
6175 ** inouet@strl.nhk.or.jp (Tomoyuki Inoue)
6176 ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama)
6177 ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp>
6178 ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe)
6179 ** kono@ie.u-ryukyu.ac.jp (Shinji Kono)
6180 ** GHG00637@nifty-serve.or.jp (COW)