1 /** Network Kanji Filter. (PDS Version)
2 ** -*- coding: ISO-2022-JP -*-
3 ************************************************************************
4 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
5 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
6 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
7 ** Copyright (C) 1996,1998
9 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
10 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
11 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
12 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
14 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
15 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
16 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
17 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
18 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
19 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
21 ** Everyone is permitted to do anything on this program
22 ** including copying, modifying, improving,
23 ** as long as you don't try to pretend that you wrote it.
24 ** i.e., the above copyright notice has to appear in all copies.
25 ** Binary distribution requires original version messages.
26 ** You don't have to ask before copying, redistribution or publishing.
27 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
28 ***********************************************************************/
30 /***********************************************************************
31 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SorceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
32 * http://sourceforge.jp/projects/nkf/
33 ***********************************************************************/
34 #define NKF_IDENT "$Id: nkf.c,v 1.173 2008/02/06 22:14:13 naruse Exp $"
35 #define NKF_VERSION "2.0.8"
36 #define NKF_RELEASE_DATE "2008-02-07"
38 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
39 "Copyright (C) 2002-2008 Kono, Furukawa, Naruse, mastodon"
45 /* state of output_mode and input_mode
121 NKF_ENCODING_TABLE_SIZE,
122 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
123 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
124 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
125 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
126 JIS_X_0208 = 0x1168, /* @B */
127 JIS_X_0212 = 0x1159, /* D */
128 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
129 JIS_X_0213_2 = 0x1229, /* P */
130 JIS_X_0213_1 = 0x1233, /* Q */
134 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
135 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
138 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
139 void j_oconv(nkf_char c2, nkf_char c1);
140 void s_oconv(nkf_char c2, nkf_char c1);
141 void e_oconv(nkf_char c2, nkf_char c1);
142 void w_oconv(nkf_char c2, nkf_char c1);
143 void w_oconv16(nkf_char c2, nkf_char c1);
144 void w_oconv32(nkf_char c2, nkf_char c1);
148 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
149 void (*oconv)(nkf_char c2, nkf_char c1);
150 } nkf_native_encoding;
152 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
153 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
154 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
155 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
156 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
157 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
158 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
163 const nkf_native_encoding *base_encoding;
166 nkf_encoding nkf_encoding_table[] = {
167 {ASCII, "US-ASCII", &NkfEncodingASCII},
168 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
169 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
170 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
171 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
172 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
173 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
175 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
176 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
177 {CP10001, "CP10001", &NkfEncodingShift_JIS},
178 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
179 {CP51932, "CP51932", &NkfEncodingEUC_JP},
180 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
181 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
182 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
183 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
184 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
185 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
186 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
187 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
188 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
189 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
190 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
191 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
192 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
193 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
194 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
195 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
196 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
197 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
198 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
199 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
200 {BINARY, "BINARY", &NkfEncodingASCII},
207 } encoding_name_to_id_table[] = {
210 {"ISO-2022-JP", ISO_2022_JP},
211 {"ISO2022JP-CP932", CP50220},
212 {"CP50220", CP50220},
213 {"CP50221", CP50221},
214 {"CP50222", CP50222},
215 {"ISO-2022-JP-1", ISO_2022_JP_1},
216 {"ISO-2022-JP-3", ISO_2022_JP_3},
217 {"SHIFT_JIS", SHIFT_JIS},
219 {"WINDOWS-31J", WINDOWS_31J},
220 {"CSWINDOWS31J", WINDOWS_31J},
221 {"CP932", WINDOWS_31J},
222 {"MS932", WINDOWS_31J},
223 {"CP10001", CP10001},
226 {"CP51932", CP51932},
227 {"EUC-JP-MS", EUCJP_MS},
228 {"EUCJP-MS", EUCJP_MS},
229 {"EUCJPMS", EUCJP_MS},
230 {"EUC-JP-ASCII", EUCJP_ASCII},
231 {"EUCJP-ASCII", EUCJP_ASCII},
232 {"SHIFT_JISX0213", SHIFT_JISX0213},
233 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
234 {"EUC-JISX0213", EUC_JISX0213},
235 {"EUC-JIS-2004", EUC_JIS_2004},
238 {"UTF-8-BOM", UTF_8_BOM},
239 {"UTF8-MAC", UTF8_MAC},
240 {"UTF-8-MAC", UTF8_MAC},
242 {"UTF-16BE", UTF_16BE},
243 {"UTF-16BE-BOM", UTF_16BE_BOM},
244 {"UTF-16LE", UTF_16LE},
245 {"UTF-16LE-BOM", UTF_16LE_BOM},
247 {"UTF-32BE", UTF_32BE},
248 {"UTF-32BE-BOM", UTF_32BE_BOM},
249 {"UTF-32LE", UTF_32LE},
250 {"UTF-32LE-BOM", UTF_32LE_BOM},
255 #if defined(DEFAULT_CODE_JIS)
256 #define DEFAULT_ENCIDX ISO_2022_JP
257 #elif defined(DEFAULT_CODE_SJIS)
258 #define DEFAULT_ENCIDX SHIFT_JIS
259 #elif defined(DEFAULT_CODE_EUC)
260 #define DEFAULT_ENCIDX EUC_JP
261 #elif defined(DEFAULT_CODE_UTF8)
262 #define DEFAULT_ENCIDX UTF_8
/*
 * Locale-independent ASCII character classification and conversion helpers.
 *
 * Every macro parameter is parenthesized so that expression arguments
 * (e.g. `x ? a : b`, `a + b`, `v >> 4`) are grouped as the caller intended.
 * Arguments are still evaluated more than once: do not pass expressions
 * with side effects such as `*p++`.
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP<=(c) && (c)<='~')
#define nkf_isgraph(c)  ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; any non-hex character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
		    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
		    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low 4 bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the second-lowest byte of c2 (after truncation to 16 bits) is SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, and for characters strictly between SP and DEL other
 * than  ? = _ ( ) . "  */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

/* True when c2 lies in the inclusive range CP932_TABLE_BEGIN..CP932_TABLE_END. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
291 #define HOLD_SIZE 1024
292 #if defined(INT_IS_SHORT)
293 #define IOBUF_SIZE 2048
295 #define IOBUF_SIZE 16384
298 #define DEFAULT_J 'B'
299 #define DEFAULT_R 'B'
306 /* MIME preprocessor */
308 #ifdef EASYWIN /*Easy Win */
309 extern POINT _BufferSize;
318 void (*status_func)(struct input_code *, nkf_char);
319 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
323 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
324 static nkf_encoding *input_encoding = NULL;
325 static nkf_encoding *output_encoding = NULL;
327 static int kanji_convert(FILE *f);
328 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
330 * 0: Shift_JIS, eucJP-ascii
335 #define UCS_MAP_ASCII 0
337 #define UCS_MAP_CP932 2
338 #define UCS_MAP_CP10001 3
339 static int ms_ucs_map_f = UCS_MAP_ASCII;
341 #ifdef UTF8_INPUT_ENABLE
342 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
343 static int no_cp932ext_f = FALSE;
344 /* ignore ZERO WIDTH NO-BREAK SPACE */
345 static int no_best_fit_chars_f = FALSE;
346 static int input_endian = ENDIAN_BIG;
347 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
348 static void (*encode_fallback)(nkf_char c) = NULL;
349 static void w_status(struct input_code *, nkf_char);
351 #ifdef UTF8_OUTPUT_ENABLE
352 static int output_bom_f = FALSE;
353 static int output_endian = ENDIAN_BIG;
356 static void std_putc(nkf_char c);
357 static nkf_char std_getc(FILE *f);
358 static nkf_char std_ungetc(nkf_char c,FILE *f);
360 static nkf_char broken_getc(FILE *f);
361 static nkf_char broken_ungetc(nkf_char c,FILE *f);
363 static nkf_char mime_getc(FILE *f);
365 static void mime_putc(nkf_char c);
369 #if !defined(PERL_XS) && !defined(WIN32DLL)
370 static unsigned char stdibuf[IOBUF_SIZE];
371 static unsigned char stdobuf[IOBUF_SIZE];
375 static int unbuf_f = FALSE;
376 static int estab_f = FALSE;
377 static int nop_f = FALSE;
378 static int binmode_f = TRUE; /* binary mode */
379 static int rot_f = FALSE; /* rot13/47 mode */
380 static int hira_f = FALSE; /* hira/kata henkan */
381 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
382 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
383 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
384 static int mimebuf_f = FALSE; /* MIME buffered input */
385 static int broken_f = FALSE; /* convert ESC-less broken JIS */
386 static int iso8859_f = FALSE; /* ISO8859 through */
387 static int mimeout_f = FALSE; /* base64 mode */
388 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
389 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
391 #ifdef UNICODE_NORMALIZATION
392 static int nfc_f = FALSE;
393 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc */
394 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
398 static int cap_f = FALSE;
399 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
400 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
402 static int url_f = FALSE;
403 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
404 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Bit layout helpers for nkf_char values.
 *
 * CLASS_MASK selects the top byte and VALUE_MASK the low 24 bits; a value
 * whose top byte equals CLASS_UNICODE is treated as a Unicode code point.
 * PREFIX_EUCG3 is OR-ed into a value by nkf_char_euc3_new().
 *
 * Macro arguments are parenthesized so expression arguments group
 * correctly.  The range constants are already wrapped in NKF_INT32_C()
 * where they are defined, so the predicates use them directly instead of
 * wrapping them a second time.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX	NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/* True when the class byte of c is CLASS_UNICODE. */
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the 24-bit value of c is at most UNICODE_BMP_MAX. */
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* True when the 24-bit value of c is at most UNICODE_MAX. */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
419 #ifdef NUMCHAR_OPTION
420 static int numchar_f = FALSE;
421 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ngetc */
422 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
426 static int noout_f = FALSE;
427 static void no_putc(nkf_char c);
428 static int debug_f = FALSE;
429 static void debug(const char *str);
430 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
433 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
434 static void set_input_codename(char *codename);
437 static int exec_f = 0;
440 #ifdef SHIFTJIS_CP932
441 /* invert IBM extended characters to others */
442 static int cp51932_f = FALSE;
444 /* invert NEC-selected IBM extended characters to IBM extended characters */
445 static int cp932inv_f = TRUE;
447 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
448 #endif /* SHIFTJIS_CP932 */
451 static int x0212_f = FALSE;
453 static int x0213_f = FALSE;
455 static unsigned char prefix_table[256];
457 static void e_status(struct input_code *, nkf_char);
458 static void s_status(struct input_code *, nkf_char);
460 struct input_code input_code_list[] = {
461 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
462 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
463 #ifdef UTF8_INPUT_ENABLE
464 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
469 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
470 static int base64_count = 0;
472 /* X0208 -> ASCII converter */
475 static int f_line = 0; /* chars in line */
476 static int f_prev = 0;
477 static int fold_preserve_f = FALSE; /* preserve new lines */
478 static int fold_f = FALSE;
479 static int fold_len = 0;
482 static unsigned char kanji_intro = DEFAULT_J;
483 static unsigned char ascii_intro = DEFAULT_R;
487 #define FOLD_MARGIN 10
488 #define DEFAULT_FOLD 60
490 static int fold_margin = FOLD_MARGIN;
492 /* process default */
494 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
496 fprintf(stderr,"nkf internal module connection failure.\n");
501 void no_connection(nkf_char c2, nkf_char c1)
503 no_connection2(c2,c1,0);
506 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
507 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
509 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
510 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
511 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
512 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
513 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
514 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
515 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
517 /* static redirections */
519 static void (*o_putc)(nkf_char c) = std_putc;
521 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
522 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
524 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of bgetc */
525 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
527 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
529 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
530 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
532 /* for strict mime */
533 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
534 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
537 static int output_mode = ASCII; /* output kanji mode */
538 static int input_mode = ASCII; /* input kanji mode */
539 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
541 /* X0201 / X0208 conversion tables */
543 /* X0201 kana conversion table */
545 static const unsigned char cv[]= {
546 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
547 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
548 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
549 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
550 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
551 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
552 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
553 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
554 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
555 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
556 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
557 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
558 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
559 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
560 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
561 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
565 /* X0201 kana conversion table for dakuten */
567 static const unsigned char dv[]= {
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
572 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
573 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
574 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
575 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
576 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
577 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
578 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
579 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
586 /* X0201 kana conversion table for han-dakuten */
588 static const unsigned char ev[]= {
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
600 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 /* X0208 kigou conversion table */
609 /* 0x8140 - 0x819e */
610 static const unsigned char fv[] = {
612 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
613 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
614 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
616 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
617 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
618 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
619 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
620 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
622 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
628 static int option_mode = 0;
629 static int file_out_f = FALSE;
631 static int overwrite_f = FALSE;
632 static int preserve_time_f = FALSE;
633 static int backup_f = FALSE;
634 static char *backup_suffix = "";
637 static int eolmode_f = 0; /* CR, LF, CRLF */
638 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
639 static nkf_char prev_cr = 0; /* CR or 0 */
640 #ifdef EASYWIN /*Easy Win */
641 static int end_check;
644 #define STD_GC_BUFSIZE (256)
645 nkf_char std_gc_buf[STD_GC_BUFSIZE];
648 char* nkf_strcpy(const char *str)
650 char* result = malloc(strlen(str) + 1);
659 static void nkf_str_upcase(const char *src, char *dest, size_t length)
662 for (; i < length && src[i]; i++) {
663 dest[i] = nkf_toupper(src[i]);
668 static nkf_encoding *nkf_enc_from_index(int idx)
670 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
673 return &nkf_encoding_table[idx];
676 static int nkf_enc_find_index(const char *name)
679 if (*name == 'X' && *(name+1) == '-') name += 2;
680 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
681 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
682 return encoding_name_to_id_table[i].id;
688 static nkf_encoding *nkf_enc_find(const char *name)
691 idx = nkf_enc_find_index(name);
692 if (idx < 0) return 0;
693 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates over a pointer `enc` to an encoding descriptor.
 * All of them are macros; the predicates evaluate `enc` more than once.
 */
/* The `name` member of the descriptor. */
696 #define nkf_enc_name(enc) (enc)->name
/* The `id` member of the descriptor. */
697 #define nkf_enc_to_index(enc) (enc)->id
/* The `base_encoding` member (pointer to the native encoding record). */
698 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* Input / output converter function pointers of the base encoding. */
699 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
700 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP. */
701 #define nkf_enc_asciicompat(enc) (\
702 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
703 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is one of the UTF-8 / UTF-16 / UTF-32 records. */
704 #define nkf_enc_unicode_p(enc) (\
705 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
706 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
707 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the descriptor id is CP50220, CP50221 or CP50222. */
708 #define nkf_enc_cp5022x_p(enc) (\
709 nkf_enc_to_index(enc) == CP50220 ||\
710 nkf_enc_to_index(enc) == CP50221 ||\
711 nkf_enc_to_index(enc) == CP50222)
713 #ifdef DEFAULT_CODE_LOCALE
/*
 * Return the name of the character set used by the current environment.
 *
 *  - With HAVE_LANGINFO_H: whatever nl_langinfo(CODESET) reports.
 *  - On __WIN32__: "CP<n>" where <n> is the value returned by GetACP().
 *    The string lives in a static buffer that is overwritten by the next
 *    call; the caller must not free it.
 *  - Otherwise: NULL (no way to query the locale).
 *
 * The previous Win32 branch called sprintf() with the format string as the
 * destination buffer and returned sprintf's int result as a pointer.
 */
static char* nkf_locale_charmap()
{
#ifdef HAVE_LANGINFO_H
    return nl_langinfo(CODESET);
#elif defined(__WIN32__)
    /* "CP" + at most 10 decimal digits + NUL fits comfortably in 16 bytes */
    static char codepage_name[16];
    sprintf(codepage_name, "CP%u", (unsigned int)GetACP());
    return codepage_name;
#else
    return NULL;
#endif
}
725 static nkf_encoding* nkf_locale_encoding()
727 nkf_encoding *enc = 0;
728 char *encname = nkf_locale_charmap();
730 enc = nkf_enc_find(encname);
731 if (enc < 0) enc = 0;
734 #endif /* DEFAULT_CODE_LOCALE */
736 static nkf_encoding* nkf_default_encoding()
738 nkf_encoding *enc = 0;
739 #ifdef DEFAULT_CODE_LOCALE
740 enc = nkf_locale_encoding();
742 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
749 #define fprintf dllprintf
754 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
760 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
762 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
763 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
764 #ifdef UTF8_OUTPUT_ENABLE
765 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
767 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
768 #ifdef UTF8_INPUT_ENABLE
769 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
772 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
773 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
774 "r {de/en}crypt ROT13/47\n"
775 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
776 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
777 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
778 "l ISO8859-1 (Latin-1) support\n"
779 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
780 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
781 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
782 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
783 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
784 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
786 "T Text mode output\n"
788 "O Output to File (DEFAULT 'nkf.out')\n"
789 "I Convert non ISO-2022-JP charactor to GETA\n"
790 "d,c Convert line breaks -d: LF -c: CRLF\n"
791 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
792 "v, V Show this usage. V: show configuration\n"
794 "Long name options\n"
795 " --ic=<input codeset> --oc=<output codeset>\n"
796 " Specify the input or output codeset\n"
797 " --fj --unix --mac --windows\n"
798 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
799 " Convert for the system or code\n"
800 " --hiragana --katakana --katakana-hiragana\n"
801 " To Hiragana/Katakana Conversion\n"
802 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
804 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
806 #ifdef NUMCHAR_OPTION
807 " --numchar-input Convert Unicode Character Reference\n"
809 #ifdef UTF8_INPUT_ENABLE
810 " --fb-{skip, html, xml, perl, java, subchar}\n"
811 " Specify how nkf handles unassigned characters\n"
814 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
815 " Overwrite original listed files by filtered result\n"
816 " --overwrite preserves timestamp of original files\n"
818 " -g --guess Guess the input code\n"
819 " --help --version Show this help/the version\n"
820 " For more information, see also man nkf\n"
825 void show_configuration(void)
828 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
831 " Compile-time options:\n"
832 " Compiled at: " __DATE__ " " __TIME__ "\n"
835 " Default output encoding: "
836 #ifdef DEFAULT_CODE_LOCALE
837 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
839 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
845 " Default output end of line: "
846 #if DEFAULT_NEWLINE == CR
848 #elif DEFAULT_NEWLINE == CRLF
854 " Decode MIME encoded string: "
855 #if MIME_DECODE_DEFAULT
861 " Convert JIS X 0201 Katakana: "
868 " --help, --version output: "
869 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build the backup file name for `filename` from the pattern `suffix`.
 *
 *  - If `suffix` contains one or more '*', the result is `suffix` with
 *    every '*' replaced by `filename` (e.g. "old/*~" + "x" -> "old/x~").
 *  - Otherwise the result is `filename` followed by `suffix`.
 *
 * Returns a newly malloc'ed string that the caller must free, or NULL
 * (after reporting via perror) when the allocation fails.
 *
 * The no-asterisk branch used to allocate a single byte (`malloc( + 1)`)
 * and then copy the whole name into it; it now allocates the full length
 * and checks the allocation like the other branch.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    size_t filename_length = strlen(filename);
    size_t suffix_length = strlen(suffix);
    size_t asterisk_count = 0;
    size_t i, j;
    char *backup_filename;

    for (i = 0; suffix[i]; i++) {
	if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
	/* every '*' (one byte) is replaced by filename_length bytes */
	backup_filename = malloc(suffix_length - asterisk_count
				 + asterisk_count * filename_length + 1);
	if (!backup_filename) {
	    perror("Can't malloc backup filename.");
	    return NULL;
	}

	for (i = 0, j = 0; suffix[i]; i++) {
	    if (suffix[i] == '*') {
		strcpy(backup_filename + j, filename);
		j += filename_length;
	    } else {
		backup_filename[j++] = suffix[i];
	    }
	}
	backup_filename[j] = '\0';
    } else {
	backup_filename = malloc(filename_length + suffix_length + 1);
	if (!backup_filename) {
	    perror("Can't malloc backup filename.");
	    return NULL;
	}
	strcpy(backup_filename, filename);
	strcpy(backup_filename + filename_length, suffix);
    }
    return backup_filename;
}
919 #ifdef UTF8_INPUT_ENABLE
920 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
927 (*f)(0, bin2hex(c>>shift));
937 void encode_fallback_html(nkf_char c)
942 if(c >= NKF_INT32_C(1000000))
943 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
944 if(c >= NKF_INT32_C(100000))
945 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
947 (*oconv)(0, 0x30+(c/10000 )%10);
949 (*oconv)(0, 0x30+(c/1000 )%10);
951 (*oconv)(0, 0x30+(c/100 )%10);
953 (*oconv)(0, 0x30+(c/10 )%10);
955 (*oconv)(0, 0x30+ c %10);
960 void encode_fallback_xml(nkf_char c)
965 nkf_each_char_to_hex(oconv, c);
970 void encode_fallback_java(nkf_char c)
974 if(!nkf_char_unicode_bmp_p(c)){
978 (*oconv)(0, bin2hex(c>>20));
979 (*oconv)(0, bin2hex(c>>16));
983 (*oconv)(0, bin2hex(c>>12));
984 (*oconv)(0, bin2hex(c>> 8));
985 (*oconv)(0, bin2hex(c>> 4));
986 (*oconv)(0, bin2hex(c ));
990 void encode_fallback_perl(nkf_char c)
995 nkf_each_char_to_hex(oconv, c);
1000 void encode_fallback_subchar(nkf_char c)
1002 c = unicode_subchar;
1003 (*oconv)((c>>8)&0xFF, c&0xFF);
1008 static const struct {
1032 {"katakana-hiragana","h3"},
1040 #ifdef UTF8_OUTPUT_ENABLE
1050 {"fb-subchar=", ""},
1052 #ifdef UTF8_INPUT_ENABLE
1053 {"utf8-input", "W"},
1054 {"utf16-input", "W16"},
1055 {"no-cp932ext", ""},
1056 {"no-best-fit-chars",""},
1058 #ifdef UNICODE_NORMALIZATION
1059 {"utf8mac-input", ""},
1071 #ifdef NUMCHAR_OPTION
1072 {"numchar-input", ""},
1078 #ifdef SHIFTJIS_CP932
1088 static void set_input_encoding(nkf_encoding *enc)
1090 switch (nkf_enc_to_index(enc)) {
1096 #ifdef SHIFTJIS_CP932
1099 #ifdef UTF8_OUTPUT_ENABLE
1100 ms_ucs_map_f = UCS_MAP_CP932;
1117 #ifdef SHIFTJIS_CP932
1120 #ifdef UTF8_OUTPUT_ENABLE
1121 ms_ucs_map_f = UCS_MAP_CP932;
1127 #ifdef SHIFTJIS_CP932
1130 #ifdef UTF8_OUTPUT_ENABLE
1131 ms_ucs_map_f = UCS_MAP_CP10001;
1135 #ifdef SHIFTJIS_CP932
1138 #ifdef UTF8_OUTPUT_ENABLE
1139 ms_ucs_map_f = UCS_MAP_CP932;
1143 #ifdef SHIFTJIS_CP932
1146 #ifdef UTF8_OUTPUT_ENABLE
1147 ms_ucs_map_f = UCS_MAP_MS;
1151 #ifdef SHIFTJIS_CP932
1154 #ifdef UTF8_OUTPUT_ENABLE
1155 ms_ucs_map_f = UCS_MAP_ASCII;
1158 case SHIFT_JISX0213:
1159 case SHIFT_JIS_2004:
1161 #ifdef SHIFTJIS_CP932
1168 #ifdef SHIFTJIS_CP932
1172 #ifdef UTF8_INPUT_ENABLE
1173 #ifdef UNICODE_NORMALIZATION
1181 input_endian = ENDIAN_BIG;
1185 input_endian = ENDIAN_LITTLE;
1190 input_endian = ENDIAN_BIG;
1194 input_endian = ENDIAN_LITTLE;
1200 static void set_output_encoding(nkf_encoding *enc)
1202 switch (nkf_enc_to_index(enc)) {
1205 #ifdef SHIFTJIS_CP932
1206 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1208 #ifdef UTF8_OUTPUT_ENABLE
1209 ms_ucs_map_f = UCS_MAP_CP932;
1213 #ifdef SHIFTJIS_CP932
1214 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1216 #ifdef UTF8_OUTPUT_ENABLE
1217 ms_ucs_map_f = UCS_MAP_CP932;
1224 #ifdef SHIFTJIS_CP932
1225 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1233 #ifdef SHIFTJIS_CP932
1234 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1240 #ifdef UTF8_OUTPUT_ENABLE
1241 ms_ucs_map_f = UCS_MAP_CP932;
1245 #ifdef UTF8_OUTPUT_ENABLE
1246 ms_ucs_map_f = UCS_MAP_CP10001;
1251 #ifdef SHIFTJIS_CP932
1252 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1254 #ifdef UTF8_OUTPUT_ENABLE
1255 ms_ucs_map_f = UCS_MAP_CP932;
1259 #ifdef SHIFTJIS_CP932
1260 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1262 #ifdef UTF8_OUTPUT_ENABLE
1263 ms_ucs_map_f = UCS_MAP_CP932;
1270 #ifdef UTF8_OUTPUT_ENABLE
1271 ms_ucs_map_f = UCS_MAP_MS;
1278 #ifdef UTF8_OUTPUT_ENABLE
1279 ms_ucs_map_f = UCS_MAP_ASCII;
1282 case SHIFT_JISX0213:
1283 case SHIFT_JIS_2004:
1285 #ifdef SHIFTJIS_CP932
1286 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1295 #ifdef SHIFTJIS_CP932
1296 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1299 #ifdef UTF8_OUTPUT_ENABLE
1301 output_bom_f = TRUE;
1305 output_bom_f = TRUE;
1308 output_endian = ENDIAN_LITTLE;
1309 output_bom_f = FALSE;
1312 output_endian = ENDIAN_LITTLE;
1313 output_bom_f = TRUE;
1316 output_bom_f = TRUE;
1319 output_endian = ENDIAN_LITTLE;
1320 output_bom_f = FALSE;
1323 output_endian = ENDIAN_LITTLE;
1324 output_bom_f = TRUE;
1330 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1333 struct input_code *p = input_code_list;
1335 if (iconv_func == p->iconv_func){
1344 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1346 #ifdef INPUT_CODE_FIX
1347 if (f || !input_encoding)
1354 #ifdef INPUT_CODE_FIX
1355 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1361 if (estab_f && iconv_for_check != iconv){
1362 struct input_code *p = find_inputcode_byfunc(iconv);
1364 set_input_codename(p->name);
1367 iconv_for_check = iconv;
1373 nkf_char x0212_shift(nkf_char c)
1378 if (0x75 <= c && c <= 0x7f){
1379 ret = c + (0x109 - 0x75);
1382 if (0x75 <= c && c <= 0x7f){
1383 ret = c + (0x113 - 0x75);
1390 nkf_char x0212_unshift(nkf_char c)
1393 if (0x7f <= c && c <= 0x88){
1394 ret = c + (0x75 - 0x7f);
1395 }else if (0x89 <= c && c <= 0x92){
1396 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1400 #endif /* X0212_ENABLE */
1402 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1408 if((0x21 <= ndx && ndx <= 0x2F)){
1409 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1410 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1412 }else if(0x6E <= ndx && ndx <= 0x7E){
1413 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1414 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1420 else if(nkf_isgraph(ndx)){
1422 const unsigned short *ptr;
1423 ptr = x0212_shiftjis[ndx - 0x21];
1425 val = ptr[(c1 & 0x7f) - 0x21];
1434 c2 = x0212_shift(c2);
1436 #endif /* X0212_ENABLE */
1438 if(0x7F < c2) return 1;
1439 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1440 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s2e_conv: convert the two-byte input (c2, c1) into a row/cell pair,
 * consulting the shiftjis_cp932 / cp932inv / shiftjis_x0212 tables for the
 * extension ranges and the arithmetic mapping (SJ0162 / SJ6394 offsets)
 * otherwise.
 * NOTE(review): the stores through p2/p1, the return statements and several
 * guards are missing from this listing.
 */
1444 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1446 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1449 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1450 #ifdef SHIFTJIS_CP932
1451 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1452 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1459 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1460 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1466 #endif /* SHIFTJIS_CP932 */
1468 if (!x0213_f && is_ibmext_in_sjis(c2)){
1469 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1472 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1485 if(x0213_f && c2 >= 0xF0){
1486 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1487 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1488 }else{ /* 78<=k<=94 */
1489 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1490 if (0x9E < c1) c2++;
1493 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1494 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1495 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1496 if (0x9E < c1) c2++;
1499 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1506 c2 = x0212_unshift(c2);
1513 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * nkf_unicode_to_utf8: encode the code point val as UTF-8.
 *   val     code point to encode
 *   p1..p4  receive the lead byte and up to three trail bytes
 * Fix: a code point above the BMP needs the four-byte lead 11110xxx that
 * carries bits 18..20, i.e. 0xf0 | (val >> 18).  The previous expression
 * 0xe0 | (val >> 16) emitted a three-byte lead with the wrong payload, so
 * every supplementary-plane character was written as malformed UTF-8.
 * NOTE(review): the one-byte branch and the trailing zero stores are not
 * visible in this listing.
 */
1514 void nkf_unicode_to_utf8(nkf_char val, int *p1, int *p2, int *p3, int *p4)
1522 }else if (val < 0x800){
1523 *p1 = 0xc0 | (val >> 6);
1524 *p2 = 0x80 | (val & 0x3f);
1527 } else if (nkf_char_unicode_bmp_p(val)) {
1528 *p1 = 0xe0 | (val >> 12);
1529 *p2 = 0x80 | ((val >> 6) & 0x3f);
1530 *p3 = 0x80 | ( val & 0x3f);
1532 } else if (nkf_char_unicode_value_p(val)) {
1533 *p1 = 0xf0 | (val >> 18); /* 11110xxx: bits 18-20 */
1534 *p2 = 0x80 | ((val >> 12) & 0x3f);
1535 *p3 = 0x80 | ((val >> 6) & 0x3f);
1536 *p4 = 0x80 | ( val & 0x3f);
/*
 * nkf_utf8_to_unicode: assemble a code point from the UTF-8 bytes c1..c4,
 * dispatching on the lead byte c1.
 * Fixes:
 *  - the four-byte branch tested c2 (a trail byte) instead of the lead
 *    byte c1, so lead bytes above 0xF4 were accepted as four-byte leads;
 *  - the reject range ended at 0xC3, which discarded the valid two-byte
 *    leads 0xC2 and 0xC3 (U+0080..U+00FF); only 0x80..0xC1 are trail bytes
 *    or overlong leads.
 * NOTE(review): the one-byte branch, the final trail-byte merges and the
 * return are not visible in this listing.
 */
1545 nkf_char nkf_utf8_to_unicode(int c1, int c2, int c3, int c4)
1552 else if (c1 <= 0xC1) {
1553 /* trail byte or invalid */
1556 else if (c1 <= 0xDF) {
1558 wc = (c1 & 0x1F) << 6;
1561 else if (c1 <= 0xEF) {
1563 wc = (c1 & 0x0F) << 12;
1564 wc |= (c2 & 0x3F) << 6;
1567 else if (c1 <= 0xF4) {
1569 wc = (c1 & 0x0F) << 18;
1570 wc |= (c2 & 0x3F) << 12;
1571 wc |= (c3 & 0x3F) << 6;
1581 #ifdef UTF8_INPUT_ENABLE
/*
 * unicode_to_jis_common2: second-level table lookup.
 *   c1, c0   indices into the table set pp and the row selected from it
 *   pp       table of row pointers (null -> returns 1)
 *   psize    number of rows in pp
 *   p2, p1   output pair
 * Returns 1 when a table or row is missing, an index is out of range, or
 * the looked-up value is 0.  With no_cp932ext_f set, values in row 0x2D
 * (NEC special characters) or above 0xF300 (IBM extended characters) are
 * filtered as well.
 * NOTE(review): the actual row fetch, the output stores and the success
 * return are missing from this listing.
 */
1582 static int unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1583 const unsigned short *const *pp, nkf_char psize,
1584 nkf_char *p2, nkf_char *p1)
1587 const unsigned short *p;
1590 if (pp == 0) return 1;
1593 if (c1 < 0 || psize <= c1) return 1;
1595 if (p == 0) return 1;
1598 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1600 if (val == 0) return 1;
1601 if (no_cp932ext_f && (
1602 (val>>8) == 0x2D || /* NEC special characters */
1603 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1611 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * unicode_to_jis_common: convert a UTF-8 sequence whose bytes are c2
 * (lead), c1 and c0 into a pair stored through p2 / p1.
 * The no_best_fit_chars_* tables flag, per trail byte (c1 & 0x3F), the
 * characters to reject (return 1) when no_best_fit_chars_f is set; the
 * table in use depends on ms_ucs_map_f and cp932inv_f.
 * Fix: the guard of the three-byte branch tested c0 < 0xF0.  c0 is a trail
 * byte, so that test is always true and a lead byte of 0xF0 or above would
 * reach ppp[c2 - 0xE0] with an out-of-range index.  The guard now tests the
 * lead byte c2, matching the c2 < 0xe0 test of the two-byte branch.
 * NOTE(review): several lines of the body are missing from this listing.
 */
1618 static nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1620 const unsigned short *const *pp;
1621 const unsigned short *const *const *ppp;
1622 static const char no_best_fit_chars_table_C2[] =
1623 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1624 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1625 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1626 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1627 static const char no_best_fit_chars_table_C2_ms[] =
1628 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1629 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1630 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1631 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1632 static const char no_best_fit_chars_table_932_C2[] =
1633 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1634 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1635 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1636 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1637 static const char no_best_fit_chars_table_932_C3[] =
1638 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1639 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1640 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1641 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1647 }else if(c2 < 0xe0){
1648 if(no_best_fit_chars_f){
1649 if(ms_ucs_map_f == UCS_MAP_CP932){
1652 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1655 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1658 }else if(!cp932inv_f){
1661 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1664 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1667 }else if(ms_ucs_map_f == UCS_MAP_MS){
1668 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1669 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1687 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1688 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1689 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1691 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* Three-byte sequences: the lead byte c2 must be below 0xF0. */
1692 }else if(c2 < 0xF0){
1693 if(no_best_fit_chars_f){
1694 if(ms_ucs_map_f == UCS_MAP_CP932){
1695 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1696 }else if(ms_ucs_map_f == UCS_MAP_MS){
1701 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1704 if(c0 == 0x92) return 1;
1709 if(c1 == 0x80 || c0 == 0x9C) return 1;
1712 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1717 if(c0 == 0x94) return 1;
1720 if(c0 == 0xBB) return 1;
1730 if(c0 == 0x95) return 1;
1733 if(c0 == 0xA5) return 1;
1740 if(c0 == 0x8D) return 1;
1743 if(c0 == 0x9E && !cp932inv_f) return 1;
1746 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1754 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1755 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1756 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1758 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1760 #ifdef SHIFTJIS_CP932
/* Round-trip G3 results through e2s_conv / s2e_conv. */
1761 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1763 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1764 s2e_conv(s2, s1, p2, p1);
1773 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the value for the pair (c2, c1) in the euc_to_utf8_*
 * tables.  The table row is picked from c2: euc_to_utf8_1byte for
 * JIS_X_0201_1976_K, x0212_to_utf8_2bytes for G3 codes, and one of the
 * euc_to_utf8_2bytes* tables (chosen by ms_ucs_map_f) otherwise; c1 then
 * indexes the row after range checking.
 * NOTE(review): the returns and some branch lines are missing from this
 * listing.
 */
1774 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
1776 const unsigned short *p;
1778 if (c2 == JIS_X_0201_1976_K) {
1779 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1787 p = euc_to_utf8_1byte;
1789 } else if (is_eucg3(c2)){
1790 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1793 c2 = (c2&0x7f) - 0x21;
1794 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1795 p = x0212_to_utf8_2bytes[c2];
1801 c2 = (c2&0x7f) - 0x21;
1802 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1804 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1805 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1806 euc_to_utf8_2bytes_ms[c2];
1811 c1 = (c1 & 0x7f) - 0x21;
1812 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w2e_conv: convert the UTF-8 bytes (c2, c1, c0) into a pair stored through
 * p2 / p1.  Lead bytes 0xc0..0xef are passed to unicode_to_jis_common();
 * under NUMCHAR_OPTION the raw code point is stored in *p1 via
 * nkf_char_unicode_new().
 * NOTE(review): the condition guarding the NUMCHAR_OPTION store and the
 * return are missing from this listing.
 */
1818 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1825 }else if (0xc0 <= c2 && c2 <= 0xef) {
1826 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1827 #ifdef NUMCHAR_OPTION
1830 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1838 #ifdef UTF8_INPUT_ENABLE
/*
 * w16e_conv: convert the code point val into a pair stored through p2/p1.
 * BMP values are first expanded to UTF-8 bytes and run through
 * unicode_to_jis_common(); two visible paths store the code point itself
 * in *p1 via nkf_char_unicode_new().
 * NOTE(review): the conditions selecting those stores are not visible.
 */
1839 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1848 else if (nkf_char_unicode_bmp_p(val)){
1849 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1850 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1853 *p1 = nkf_char_unicode_new(val);
1859 *p1 = nkf_char_unicode_new(val);
/*
 * e_iconv: input converter taking bytes (c2, c1, c0).
 * Visible cases:
 *  - c2 is JIS_X_0201_1976_K or SS2: replaced by GETA1/GETA2 when
 *    iso2022jp_f is set without x0201_f;
 *  - c2 == 0x8f: three-byte code; rows 0xF5..0xFE map to code points
 *    starting at 0xE3AC, other values are packed as (c2 << 8) | (c1 & 0x7f);
 *  - control/EOF/ISO_8859_1 values;
 *  - two-byte rows 0xF5..0xFE map to code points starting at 0xE000 when
 *    ms_ucs_map_f is set and cp51932_f is not.
 * NOTE(review): the hand-off to the output stage and the return are missing
 * from this listing.
 */
1865 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1867 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1868 if (iso2022jp_f && !x0201_f) {
1869 c2 = GETA1; c1 = GETA2;
1871 c2 = JIS_X_0201_1976_K;
1875 }else if (c2 == 0x8f){
1879 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
1880 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1881 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
1884 c2 = (c2 << 8) | (c1 & 0x7f);
1886 #ifdef SHIFTJIS_CP932
1889 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1890 s2e_conv(s2, s1, &c2, &c1);
1897 #endif /* SHIFTJIS_CP932 */
1899 #endif /* X0212_ENABLE */
1900 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
1903 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
1904 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1905 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
1910 #ifdef SHIFTJIS_CP932
1911 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
1913 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1914 s2e_conv(s2, s1, &c2, &c1);
1921 #endif /* SHIFTJIS_CP932 */
/*
 * s_iconv: input converter taking bytes (c2, c1, c0).
 * Visible cases:
 *  - JIS_X_0201_1976_K or a byte in 0xA1..0xDF: replaced by GETA1/GETA2
 *    when iso2022jp_f is set without x0201_f;
 *  - EOF, 0 and values below SP;
 *  - lead bytes 0xF0..0xF9 (when x0213_f is off): mapped to code points
 *    starting at 0xE000, 188 cells per lead byte, skipping c1 == 0x7F;
 *  - everything else goes through s2e_conv(), whose non-zero result is
 *    returned to the caller.
 */
1928 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1930 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
1931 if (iso2022jp_f && !x0201_f) {
1932 c2 = GETA1; c1 = GETA2;
1936 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
1938 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
1940 if(c1 == 0x7F) return 0;
1941 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
1944 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
1945 if (ret) return ret;
/*
 * w_iconv: UTF-8 input converter; c1 is the lead byte, c2 and c3 follow.
 * w_iconv_utf8_1st_byte classifies lead bytes 0xC0..0xFF into groups; each
 * group applies its own range check to c2 and requires the later bytes to
 * be trail bytes ((x & 0xC0) == 0x80).
 * Visible return values: 0 for a rejected sequence, -1 / -2 when c3 == 0
 * in the three- and four-byte groups.
 * NOTE(review): the case labels, the first table row and the return after
 * the conversion are missing from this listing.
 */
1951 nkf_char w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
1953 nkf_char ret = 0, c4 = 0;
1954 static const char w_iconv_utf8_1st_byte[] =
1956 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1957 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1958 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
1959 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
1966 if (c1 < 0 || 0xff < c1) {
1967 }else if (c1 == 0) { /* 0 : 1 byte*/
1969 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
1972 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
1974 if (c2 < 0x80 || 0xBF < c2) return 0;
1977 if (c3 == 0) return -1;
1978 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
1983 if (c3 == 0) return -1;
1984 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
1988 if (c3 == 0) return -1;
1989 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
1993 if (c3 == 0) return -2;
1994 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
1998 if (c3 == 0) return -2;
1999 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2003 if (c3 == 0) return -2;
2004 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2012 if (c1 == 0 || c1 == EOF){
2013 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2014 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2017 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/*
 * unicode_iconv: feed one code point wc into the conversion pipeline.
 * Values with (wc >> 11) == 27 (0xD800..0xDFFF) are unpaired surrogates and
 * yield NKF_ICONV_INVALID_CODE_RANGE; values below 0xFFFF go through
 * w16e_conv(); values below 0x10FFFF are wrapped by nkf_char_unicode_new();
 * anything else is out of range.
 * NOTE(review): the declared return type is size_t while the error code is
 * negative -- confirm how callers compare the result.
 * NOTE(review): both upper bounds are strict, so 0xFFFF and 0x10FFFF take
 * the next branch -- confirm this is intended.
 */
2025 #define NKF_ICONV_INVALID_CODE_RANGE -13
2026 static size_t unicode_iconv(nkf_char wc)
2034 }else if ((wc>>11) == 27) {
2035 /* unpaired surrogate */
2036 return NKF_ICONV_INVALID_CODE_RANGE;
2037 }else if (wc < 0xFFFF) {
2038 ret = w16e_conv(wc, &c2, &c1);
2039 if (ret) return ret;
2040 }else if (wc < 0x10FFFF) {
2042 c1 = nkf_char_unicode_new(wc);
2044 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * nkf_iconv_utf_16: decode one UTF-16 unit (or surrogate pair) from the
 * bytes c1..c4 according to input_endian and pass the resulting code point
 * to unicode_iconv().
 * A high surrogate (high byte 0xD8..0xDB) must be followed by a low
 * surrogate (high byte 0xDC..0xDF); otherwise
 * NKF_ICONV_NEED_TWO_MORE_BYTES is returned.
 * UTF16_TO_UTF32 combines a lead/trail surrogate pair into a code point.
 * NOTE(review): the non-surrogate paths are missing from this listing.
 */
2050 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2051 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
2052 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
2053 size_t nkf_iconv_utf_16(int c1, int c2, int c3, int c4)
2062 if (input_endian == ENDIAN_BIG) {
2063 if (0xD8 <= c1 && c1 <= 0xDB) {
2064 if (0xDC <= c3 && c3 <= 0xDF) {
2065 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2066 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2071 if (0xD8 <= c2 && c2 <= 0xDB) {
2072 if (0xDC <= c4 && c4 <= 0xDF) {
2073 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2074 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2080 return (*unicode_iconv)(wc);
/*
 * w_iconv16: converter entry with the common (c2, c1, c0) signature.
 * NOTE(review): the body is missing from this listing; check_bom() installs
 * this function for UTF-16 input.
 */
2083 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * w_iconv32: converter entry with the common (c2, c1, c0) signature.
 * NOTE(review): the body is missing from this listing; check_bom() installs
 * this function for UTF-32 input.
 */
2088 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * nkf_iconv_utf_32: assemble a code point from the bytes c1..c4 in the
 * byte order selected by input_endian and pass it to unicode_iconv().
 * Each visible assembly uses three of the four bytes; the remaining one is
 * the most significant byte for that order.
 * An unrecognised input_endian yields NKF_ICONV_INVALID_CODE_RANGE.
 * NOTE(review): the case labels are missing from this listing.
 */
2093 size_t nkf_iconv_utf_32(int c1, int c2, int c3, int c4)
2102 switch(input_endian){
2104 wc = c2 << 16 | c3 << 8 | c4;
2107 wc = c3 << 16 | c2 << 8 | c1;
2110 wc = c1 << 16 | c4 << 8 | c3;
2113 wc = c4 << 16 | c1 << 8 | c2;
2116 return NKF_ICONV_INVALID_CODE_RANGE;
2119 return (*unicode_iconv)(wc);
/*
 * output_ascii_escape_sequence(mode): when output_mode is neither ASCII nor
 * ISO_8859_1, emit an escape sequence ending in ascii_intro through o_putc
 * and record mode in output_mode.
 * NOTE(review): the first bytes of the sequence and the macro tail are
 * missing from this listing.
 */
2123 #define output_ascii_escape_sequence(mode) do { \
2124 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2127 (*o_putc)(ascii_intro); \
2128 output_mode = mode; \
/*
 * output_escape_sequence: emit the escape sequence that switches the output
 * to mode; nothing is needed when output_mode already equals mode.
 * NOTE(review): most case bodies are missing from this listing; the 'O'
 * byte is marked TODO in the original.
 */
2132 void output_escape_sequence(int mode)
2134 if (output_mode == mode)
2142 case JIS_X_0201_1976_K:
2150 (*o_putc)(kanji_intro);
2162 (*o_putc)('O'); /* TODO */
/*
 * j_oconv: output converter for the pair (c2, c1) that switches character
 * sets with escape sequences.
 * Under NUMCHAR_OPTION a Unicode-tagged c1 is first run through
 * w16e_conv(); if it is still unconverted, values 0xE000..0xE757 are mapped
 * to rows starting at 0x7F (94 cells per row) when ms_ucs_map_f is set,
 * otherwise encode_fallback is invoked.
 * The remaining branches pick the escape sequence from c2: ASCII,
 * ISO_8859_1, JIS_X_0201_1976_K, G3 codes (JIS_X_0213_2 or JIS_X_0212) and
 * the default two-byte set (JIS_X_0213_1 or JIS_X_0208).
 * NOTE(review): the byte output calls are mostly missing from this listing.
 */
2174 void j_oconv(nkf_char c2, nkf_char c1)
2176 #ifdef NUMCHAR_OPTION
2177 if (c2 == 0 && nkf_char_unicode_p(c1)){
2178 w16e_conv(c1, &c2, &c1);
2179 if (c2 == 0 && nkf_char_unicode_p(c1)){
2180 c2 = c1 & VALUE_MASK;
2181 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2184 c2 = 0x7F + c1 / 94;
2185 c1 = 0x21 + c1 % 94;
2187 if (encode_fallback) (*encode_fallback)(c1);
2194 output_ascii_escape_sequence(ASCII);
2197 else if (c2 == EOF) {
2198 output_ascii_escape_sequence(ASCII);
2201 else if (c2 == ISO_8859_1) {
2202 output_ascii_escape_sequence(ISO_8859_1);
2205 else if (c2 == JIS_X_0201_1976_K) {
2206 output_escape_sequence(JIS_X_0201_1976_K);
2209 } else if (is_eucg3(c2)){
2210 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2211 (*o_putc)(c2 & 0x7f);
2216 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2217 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2218 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
/*
 * e_oconv: output converter for the pair (c2, c1) that writes bytes with
 * the high bit set.
 * Under NUMCHAR_OPTION a Unicode-tagged c1 is first run through
 * w16e_conv(); values 0xE000..0xE757 that stay unconverted are remapped
 * when both x0212_f and ms_ucs_map_f are set, otherwise encode_fallback is
 * invoked.
 * Later branches set output_mode and emit: SS2 + byte for
 * JIS_X_0201_1976_K, a single byte for ISO_8859_1, and two bytes for
 * two-byte codes.  Pairs failing nkf_isgraph() reset the converter with
 * set_iconv(FALSE, 0) and are dropped.
 * NOTE(review): several branch lines are missing from this listing.
 */
2224 void e_oconv(nkf_char c2, nkf_char c1)
2226 #ifdef NUMCHAR_OPTION
2227 if (c2 == 0 && nkf_char_unicode_p(c1)){
2228 w16e_conv(c1, &c2, &c1);
2229 if (c2 == 0 && nkf_char_unicode_p(c1)){
2230 c2 = c1 & VALUE_MASK;
2231 if (x0212_f && ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2235 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2236 c1 = 0x21 + c1 % 94;
2239 (*o_putc)((c2 & 0x7f) | 0x080);
2240 (*o_putc)(c1 | 0x080);
2242 (*o_putc)((c2 & 0x7f) | 0x080);
2243 (*o_putc)(c1 | 0x080);
2247 if (encode_fallback) (*encode_fallback)(c1);
2255 } else if (c2 == 0) {
2256 output_mode = ASCII;
2258 } else if (c2 == JIS_X_0201_1976_K) {
2259 output_mode = EUC_JP;
2260 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2261 } else if (c2 == ISO_8859_1) {
2262 output_mode = ISO_8859_1;
2263 (*o_putc)(c1 | 0x080);
2265 } else if (is_eucg3(c2)){
2266 output_mode = EUC_JP;
2267 #ifdef SHIFTJIS_CP932
2270 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2271 s2e_conv(s2, s1, &c2, &c1);
2276 output_mode = ASCII;
2278 }else if (is_eucg3(c2)){
2281 (*o_putc)((c2 & 0x7f) | 0x080);
2282 (*o_putc)(c1 | 0x080);
2285 (*o_putc)((c2 & 0x7f) | 0x080);
2286 (*o_putc)(c1 | 0x080);
2290 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2291 set_iconv(FALSE, 0);
2292 return; /* too late to rescue this char */
2294 output_mode = EUC_JP;
2295 (*o_putc)(c2 | 0x080);
2296 (*o_putc)(c1 | 0x080);
/*
 * s_oconv: output converter for the pair (c2, c1) that records SHIFT_JIS
 * as the output mode for two-byte codes.
 * Under NUMCHAR_OPTION a Unicode-tagged c1 is first run through
 * w16e_conv(); values 0xE000..0xE757 that stay unconverted are mapped to
 * lead bytes starting at 0xF0 (cp932inv_f) or 0xEB, 188 cells per lead
 * byte, unless x0213_f is set; otherwise encode_fallback is invoked.
 * Two-byte codes go through e2s_conv(); pairs failing nkf_isprint() reset
 * the converter with set_iconv(FALSE, 0) and are dropped.  The trailing
 * prefix_table lookup emits an extra byte before certain second bytes.
 * NOTE(review): several branch lines are missing from this listing.
 */
2300 void s_oconv(nkf_char c2, nkf_char c1)
2302 #ifdef NUMCHAR_OPTION
2303 if (c2 == 0 && nkf_char_unicode_p(c1)){
2304 w16e_conv(c1, &c2, &c1);
2305 if (c2 == 0 && nkf_char_unicode_p(c1)){
2306 c2 = c1 & VALUE_MASK;
2307 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2310 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2312 c1 += 0x40 + (c1 > 0x3e);
2317 if(encode_fallback)(*encode_fallback)(c1);
2326 } else if (c2 == 0) {
2327 output_mode = ASCII;
2329 } else if (c2 == JIS_X_0201_1976_K) {
2330 output_mode = SHIFT_JIS;
2332 } else if (c2 == ISO_8859_1) {
2333 output_mode = ISO_8859_1;
2334 (*o_putc)(c1 | 0x080);
2336 } else if (is_eucg3(c2)){
2337 output_mode = SHIFT_JIS;
2338 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2344 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2345 set_iconv(FALSE, 0);
2346 return; /* too late to rescue this char */
2348 output_mode = SHIFT_JIS;
2349 e2s_conv(c2, c1, &c2, &c1);
2351 #ifdef SHIFTJIS_CP932
2353 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2354 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2360 #endif /* SHIFTJIS_CP932 */
2363 if (prefix_table[(unsigned char)c1]){
2364 (*o_putc)(prefix_table[(unsigned char)c1]);
2370 #ifdef UTF8_OUTPUT_ENABLE
/*
 * w_oconv: UTF-8 output converter for the pair (c2, c1).
 * A Unicode-tagged c1 is unwrapped with VALUE_MASK; any other pair is
 * translated by e2w_conv().  The value is then split by
 * nkf_unicode_to_utf8() and the non-zero trail bytes are written through
 * o_putc.  output_bom_f is cleared on the way.
 * NOTE(review): the BOM output, the lead-byte output and the special c2
 * cases are missing from this listing.
 */
2371 void w_oconv(nkf_char c2, nkf_char c1)
2377 output_bom_f = FALSE;
2388 if (c2 == 0 && nkf_char_unicode_p(c1)){
2389 val = c1 & VALUE_MASK;
2390 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2392 if (c2) (*o_putc)(c2);
2393 if (c3) (*o_putc)(c3);
2394 if (c4) (*o_putc)(c4);
2401 val = e2w_conv(c2, c1);
2403 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2405 if (c2) (*o_putc)(c2);
2406 if (c3) (*o_putc)(c3);
2407 if (c4) (*o_putc)(c4);
/*
 * w_oconv16: UTF-16 output converter for the pair (c2, c1); byte order
 * follows output_endian.
 * A Unicode-tagged c1 inside the BMP is split into high and low byte;
 * larger values up to UNICODE_MAX are written as a surrogate pair
 * (high = (c1 >> 10) + 0xD7C0, low = (c1 & 0x3FF) + 0xDC00).  Other pairs
 * are translated by e2w_conv() first.
 * NOTE(review): the BOM bytes and the final two-byte output are missing
 * from this listing.
 */
2412 void w_oconv16(nkf_char c2, nkf_char c1)
2415 output_bom_f = FALSE;
2416 if (output_endian == ENDIAN_LITTLE){
2430 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2431 if (nkf_char_unicode_bmp_p(c1)) {
2432 c2 = (c1 >> 8) & 0xff;
2436 if (c1 <= UNICODE_MAX) {
2437 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2438 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2439 if (output_endian == ENDIAN_LITTLE){
2440 (*o_putc)(c2 & 0xff);
2441 (*o_putc)((c2 >> 8) & 0xff);
2442 (*o_putc)(c1 & 0xff);
2443 (*o_putc)((c1 >> 8) & 0xff);
2445 (*o_putc)((c2 >> 8) & 0xff);
2446 (*o_putc)(c2 & 0xff);
2447 (*o_putc)((c1 >> 8) & 0xff);
2448 (*o_putc)(c1 & 0xff);
2454 nkf_char val = e2w_conv(c2, c1);
2455 c2 = (val >> 8) & 0xff;
2459 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: UTF-32 output converter for the pair (c2, c1); byte order
 * follows output_endian.  Pairs that are neither ISO_8859_1 nor
 * Unicode-tagged are translated by e2w_conv(); the three low bytes of the
 * value are then written in little- or big-endian order.
 * NOTE(review): the BOM bytes and the most significant output byte are
 * missing from this listing.
 */
2468 void w_oconv32(nkf_char c2, nkf_char c1)
2471 output_bom_f = FALSE;
2472 if (output_endian == ENDIAN_LITTLE){
2490 if (c2 == ISO_8859_1) {
2492 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2495 c1 = e2w_conv(c2, c1);
2498 if (output_endian == ENDIAN_LITTLE){
2499 (*o_putc)( c1 & 0xFF);
2500 (*o_putc)((c1 >> 8) & 0xFF);
2501 (*o_putc)((c1 >> 16) & 0xFF);
2505 (*o_putc)((c1 >> 16) & 0xFF);
2506 (*o_putc)((c1 >> 8) & 0xFF);
2507 (*o_putc)( c1 & 0xFF);
/*
 * Score bits accumulated per candidate input encoding.  Each flag is the
 * previous one shifted left by one bit.  The inline notes are English
 * translations of the original ISO-2022-JP comments.
 */
2512 #define SCORE_L2 (1) /* JIS level-2 kanji */
2513 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
2514 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
2515 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped through CP932 (IBM extended characters) */
2516 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2517 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* character that does not exist */
2518 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified through MIME */
2519 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Initial score assigned by status_reset(). */
2521 #define SCORE_INIT (SCORE_iMIME)
/*
 * score_table_A0: score flags indexed by the low nibble of the lead byte;
 * code_score() uses it when (c2 & 0x70) == 0x20.
 * NOTE(review): only part of the initializer is visible in this listing.
 */
2523 static const char score_table_A0[] = {
2526 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2527 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * score_table_F0: score flags indexed by the low nibble of the lead byte;
 * code_score() uses it when (c2 & 0x70) == 0x70.
 */
2530 static const char score_table_F0[] = {
2531 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2532 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2533 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2534 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* set_code_score: OR the given score bits into ptr->score. */
2537 void set_code_score(struct input_code *ptr, nkf_char score)
2540 ptr->score |= score;
/* clr_code_score: clear the given score bits in ptr->score. */
2544 void clr_code_score(struct input_code *ptr, nkf_char score)
2547 ptr->score &= ~score;
/*
 * code_score: classify the character held in ptr->buf and add the matching
 * SCORE_* flag to the candidate's score.
 * The lead byte buf[0] decides: SS2 -> SCORE_KANA, 0x8f -> SCORE_X0212, a
 * pair that e2w_conv() maps to 0 -> SCORE_NO_EXIST (UTF8_OUTPUT_ENABLE
 * only); otherwise the row group (c2 & 0x70) selects score_table_A0,
 * score_table_F0 or SCORE_L2.
 * NOTE(review): the condition of the first branch (SCORE_ERROR) is not
 * visible in this listing.
 */
2551 void code_score(struct input_code *ptr)
2553 nkf_char c2 = ptr->buf[0];
2554 #ifdef UTF8_OUTPUT_ENABLE
2555 nkf_char c1 = ptr->buf[1];
2558 set_code_score(ptr, SCORE_ERROR);
2559 }else if (c2 == SS2){
2560 set_code_score(ptr, SCORE_KANA);
2561 }else if (c2 == 0x8f){
2562 set_code_score(ptr, SCORE_X0212);
2563 #ifdef UTF8_OUTPUT_ENABLE
2564 }else if (!e2w_conv(c2, c1)){
2565 set_code_score(ptr, SCORE_NO_EXIST);
2567 }else if ((c2 & 0x70) == 0x20){
2568 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2569 }else if ((c2 & 0x70) == 0x70){
2570 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2571 }else if ((c2 & 0x70) >= 0x50){
2572 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: take the candidate ptr out of consideration.  If its
 * converter is the one currently installed, the converter is reset with
 * set_iconv(FALSE, 0).
 * NOTE(review): the state updates on ptr itself are missing from this
 * listing.
 */
2576 void status_disable(struct input_code *ptr)
2581 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: append c to ptr->buf and advance ptr->index.
 * No bounds check is visible here; callers must keep index inside buf.
 */
2584 void status_push_ch(struct input_code *ptr, nkf_char c)
2586 ptr->buf[ptr->index++] = c;
/*
 * status_clear: NOTE(review): the body is missing from this listing;
 * presumably it resets the per-character state of ptr -- confirm.
 */
2589 void status_clear(struct input_code *ptr)
/*
 * status_reset: put the candidate back to its initial score SCORE_INIT.
 * NOTE(review): other statements of the body are missing from this listing.
 */
2595 void status_reset(struct input_code *ptr)
2598 ptr->score = SCORE_INIT;
/*
 * status_reinit: clear the candidate's _file_stat member.
 * NOTE(review): other statements of the body are missing from this listing.
 */
2601 void status_reinit(struct input_code *ptr)
2604 ptr->_file_stat = 0;
/*
 * status_check: act when c is at most DEL and the input code is already
 * established (estab_f).
 * NOTE(review): the action taken is missing from this listing.
 */
2607 void status_check(struct input_code *ptr, nkf_char c)
2609 if (c <= DEL && estab_f){
/*
 * s_status: per-byte state machine for one candidate input encoding.
 * First byte: 0xa1..0xdf is stored as SS2 + byte; 0x81..0x9f, 0xe0..0xea,
 * 0xed..0xee, the is_ibmext_in_sjis() range and 0xf0..0xfc are pushed as
 * lead bytes; anything else disables the candidate.
 * Second byte: must lie in 0x40..0x7e or 0x80..0xfc; the pair is then
 * converted in place by s2e_conv(), and the extension ranges add
 * SCORE_CP932.  Any other byte disables the candidate.
 * NOTE(review): the switch on ptr->stat, its case labels and the state
 * transitions are missing from this listing.
 */
2614 void s_status(struct input_code *ptr, nkf_char c)
2618 status_check(ptr, c);
2623 }else if (nkf_char_unicode_p(c)){
2625 }else if (0xa1 <= c && c <= 0xdf){
2626 status_push_ch(ptr, SS2);
2627 status_push_ch(ptr, c);
2630 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2632 status_push_ch(ptr, c);
2633 }else if (0xed <= c && c <= 0xee){
2635 status_push_ch(ptr, c);
2636 #ifdef SHIFTJIS_CP932
2637 }else if (is_ibmext_in_sjis(c)){
2639 status_push_ch(ptr, c);
2640 #endif /* SHIFTJIS_CP932 */
2642 }else if (0xf0 <= c && c <= 0xfc){
2644 status_push_ch(ptr, c);
2645 #endif /* X0212_ENABLE */
2647 status_disable(ptr);
2651 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2652 status_push_ch(ptr, c);
2653 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2657 status_disable(ptr);
2661 #ifdef SHIFTJIS_CP932
2662 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2663 status_push_ch(ptr, c);
2664 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2665 set_code_score(ptr, SCORE_CP932);
2670 #endif /* SHIFTJIS_CP932 */
2671 status_disable(ptr);
2674 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2675 status_push_ch(ptr, c);
2676 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2677 set_code_score(ptr, SCORE_CP932);
2680 status_disable(ptr);
/*
 * e_status: per-byte state machine for one candidate input encoding.
 * First byte: SS2 or 0xa1..0xfe starts a two-byte character, 0x8f starts a
 * three-byte one (X0212_ENABLE); anything else disables the candidate.
 * Following bytes must lie in 0xa1..0xfe, otherwise the candidate is
 * disabled.
 * NOTE(review): the switch on ptr->stat, its case labels and the state
 * transitions are missing from this listing.
 */
2686 void e_status(struct input_code *ptr, nkf_char c)
2690 status_check(ptr, c);
2695 }else if (nkf_char_unicode_p(c)){
2697 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2699 status_push_ch(ptr, c);
2701 }else if (0x8f == c){
2703 status_push_ch(ptr, c);
2704 #endif /* X0212_ENABLE */
2706 status_disable(ptr);
2710 if (0xa1 <= c && c <= 0xfe){
2711 status_push_ch(ptr, c);
2715 status_disable(ptr);
2720 if (0xa1 <= c && c <= 0xfe){
2722 status_push_ch(ptr, c);
2724 status_disable(ptr);
2726 #endif /* X0212_ENABLE */
2730 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: per-byte state machine for the UTF-8 candidate.
 * Lead bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 are pushed; any other
 * first byte disables the candidate.  Trail bytes must lie in 0x80..0xbf.
 * Once more bytes than ptr->stat have been collected, the sequence is
 * checked for the byte order mark EF BB BF and converted in place by
 * w2e_conv().
 * NOTE(review): the switch on ptr->stat, its case labels and the use of the
 * bom flag are missing from this listing.
 */
2731 void w_status(struct input_code *ptr, nkf_char c)
2735 status_check(ptr, c);
2740 }else if (nkf_char_unicode_p(c)){
2742 }else if (0xc0 <= c && c <= 0xdf){
2744 status_push_ch(ptr, c);
2745 }else if (0xe0 <= c && c <= 0xef){
2747 status_push_ch(ptr, c);
2748 }else if (0xf0 <= c && c <= 0xf4){
2750 status_push_ch(ptr, c);
2752 status_disable(ptr);
2757 if (0x80 <= c && c <= 0xbf){
2758 status_push_ch(ptr, c);
2759 if (ptr->index > ptr->stat){
2760 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2761 && ptr->buf[2] == 0xbf);
2762 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2763 &ptr->buf[0], &ptr->buf[1]);
2770 status_disable(ptr);
2774 if (0x80 <= c && c <= 0xbf){
2775 if (ptr->index < ptr->stat){
2776 status_push_ch(ptr, c);
2781 status_disable(ptr);
/*
 * code_status: feed the byte c to the status_func of every entry in
 * input_code_list.  When a single candidate remains (result) and no code is
 * established yet, its converter is installed with set_iconv(TRUE, ...).
 * Otherwise, for c <= DEL, the list is walked again from its head.
 * NOTE(review): the loop structure, the bookkeeping of action_flag/result
 * and the second walk's body are missing from this listing.
 */
2788 void code_status(nkf_char c)
2790 int action_flag = 1;
2791 struct input_code *result = 0;
2792 struct input_code *p = input_code_list;
2794 if (!p->status_func) {
2798 if (!p->status_func)
2800 (p->status_func)(p, c);
2803 }else if(p->stat == 0){
2814 if (result && !estab_f){
2815 set_iconv(TRUE, result->iconv_func);
2816 }else if (c <= DEL){
2817 struct input_code *ptr = input_code_list;
/*
 * std_getc: return the most recently pushed-back character from std_gc_buf
 * when one is pending.
 * NOTE(review): the guard on std_gc_ndx and the read from f are missing
 * from this listing.
 */
2827 nkf_char std_getc(FILE *f)
2830 return std_gc_buf[--std_gc_ndx];
/*
 * std_ungetc: push c back onto std_gc_buf so that std_getc() returns it
 * next.  A full buffer (std_gc_ndx == STD_GC_BUFSIZE) is handled before the
 * store.
 * NOTE(review): the full-buffer action and the return are missing from this
 * listing.
 */
2836 nkf_char std_ungetc(nkf_char c, FILE *f)
2838 if (std_gc_ndx == STD_GC_BUFSIZE){
2841 std_gc_buf[std_gc_ndx++] = c;
/*
 * std_putc: output routine for a single character c.
 * NOTE(review): the body is missing from this listing.
 */
2846 void std_putc(nkf_char c)
/*
 * hold_buf / hold_count: look-ahead buffer of HOLD_SIZE*2 bytes used while
 * the input code is still undecided.
 * push_hold_buf: append c2 (truncated to unsigned char) to hold_buf.
 * Returns EOF once the buffer has become full, otherwise the new count.
 * NOTE(review): the action taken when the buffer is already full on entry
 * is missing from this listing.
 */
2853 static unsigned char hold_buf[HOLD_SIZE*2];
2854 static int hold_count = 0;
2855 nkf_char push_hold_buf(nkf_char c2)
2857 if (hold_count >= HOLD_SIZE*2)
2859 hold_buf[hold_count++] = (unsigned char)c2;
2860 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * h_conv: buffer input bytes in hold_buf until the input code is
 * established, the buffer is full or EOF is reached; then pick the
 * candidate with the lowest score (if none is established) and replay the
 * held bytes through the selected converter.
 * During replay the converter's return value decides whether one or two
 * further bytes (c3, c4) are fetched, from the hold buffer first and from
 * the input stream afterwards.
 * NOTE(review): many lines of the body are missing from this listing.
 */
2863 static int h_conv(FILE *f, int c1, int c2)
2869 /** it must NOT be in the kanji shift sequence */
2870 /** it must NOT be written in JIS7 */
2871 /** and it must be after 2 byte 8bit code */
2877 while ((c2 = (*i_getc)(f)) != EOF) {
2883 if (push_hold_buf(c2) == EOF || estab_f) {
2889 struct input_code *p = input_code_list;
2890 struct input_code *result = p;
2895 if (p->status_func && p->score < result->score) {
2900 set_iconv(TRUE, result->iconv_func);
2905 ** 1) EOF is detected, or
2906 ** 2) Code is established, or
2907 ** 3) Buffer is FULL (but last word is pushed)
2909 ** in 1) and 3) cases, we continue to use
2910 ** Kanji codes by oconv and leave estab_f unchanged.
2915 while (hold_index < hold_count){
2916 c1 = hold_buf[hold_index++];
2920 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
2921 (*iconv)(JIS_X_0201_1976_K, c1, 0);
2924 if (hold_index < hold_count){
2925 c2 = hold_buf[hold_index++];
2935 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
2938 if (hold_index < hold_count){
2939 c3 = hold_buf[hold_index++];
2940 } else if ((c3 = (*i_getc)(f)) == EOF) {
2945 if (hold_index < hold_count){
2946 c4 = hold_buf[hold_index++];
2947 } else if ((c4 = (*i_getc)(f)) == EOF) {
2952 (*iconv)(c1, c2, (c3<<8)|c4);
2957 /* 3 bytes EUC or UTF-8 */
2958 if (hold_index < hold_count){
2959 c3 = hold_buf[hold_index++];
2960 } else if ((c3 = (*i_getc)(f)) == EOF) {
2966 (*iconv)(c1, c2, c3);
2969 if (c3 == EOF) break;
2975 * Check and Ignore BOM
/*
 * check_bom: read the first bytes of f and look for a byte order mark.
 * On a match the matching converter is installed (unless input_encoding is
 * already set) and input_endian is recorded when that converter is active.
 * Bytes that do not complete a mark are pushed back with i_ungetc in
 * reverse order.
 * NOTE(review): the case labels are missing from this listing; the first
 * byte of each group below is inferred from the value pushed back at the
 * end of the group.
 */
2977 void check_bom(FILE *f)
2980 switch(c2 = (*i_getc)(f)){
/* first byte presumably 0x00: 00 00 FE FF (ENDIAN_BIG) or 00 00 FF FE (ENDIAN_2143) */
2982 if((c2 = (*i_getc)(f)) == 0x00){
2983 if((c2 = (*i_getc)(f)) == 0xFE){
2984 if((c2 = (*i_getc)(f)) == 0xFF){
2985 if(!input_encoding){
2986 set_iconv(TRUE, w_iconv32);
2988 if (iconv == w_iconv32) {
2989 input_endian = ENDIAN_BIG;
2992 (*i_ungetc)(0xFF,f);
2993 }else (*i_ungetc)(c2,f);
2994 (*i_ungetc)(0xFE,f);
2995 }else if(c2 == 0xFF){
2996 if((c2 = (*i_getc)(f)) == 0xFE){
2997 if(!input_encoding){
2998 set_iconv(TRUE, w_iconv32);
3000 if (iconv == w_iconv32) {
3001 input_endian = ENDIAN_2143;
3004 (*i_ungetc)(0xFF,f);
3005 }else (*i_ungetc)(c2,f);
3006 (*i_ungetc)(0xFF,f);
3007 }else (*i_ungetc)(c2,f);
3008 (*i_ungetc)(0x00,f);
3009 }else (*i_ungetc)(c2,f);
3010 (*i_ungetc)(0x00,f);
/* first byte presumably 0xEF: EF BB BF selects w_iconv */
3013 if((c2 = (*i_getc)(f)) == 0xBB){
3014 if((c2 = (*i_getc)(f)) == 0xBF){
3015 if(!input_encoding){
3016 set_iconv(TRUE, w_iconv);
3018 if (iconv == w_iconv) {
3021 (*i_ungetc)(0xBF,f);
3022 }else (*i_ungetc)(c2,f);
3023 (*i_ungetc)(0xBB,f);
3024 }else (*i_ungetc)(c2,f);
3025 (*i_ungetc)(0xEF,f);
/* first byte presumably 0xFE: FE FF 00 00 (w_iconv32, ENDIAN_3412) or FE FF (w_iconv16, ENDIAN_BIG) */
3028 if((c2 = (*i_getc)(f)) == 0xFF){
3029 if((c2 = (*i_getc)(f)) == 0x00){
3030 if((c2 = (*i_getc)(f)) == 0x00){
3031 if(!input_encoding){
3032 set_iconv(TRUE, w_iconv32);
3034 if (iconv == w_iconv32) {
3035 input_endian = ENDIAN_3412;
3038 (*i_ungetc)(0x00,f);
3039 }else (*i_ungetc)(c2,f);
3040 (*i_ungetc)(0x00,f);
3041 }else (*i_ungetc)(c2,f);
3042 if(!input_encoding){
3043 set_iconv(TRUE, w_iconv16);
3045 if (iconv == w_iconv16) {
3046 input_endian = ENDIAN_BIG;
3049 (*i_ungetc)(0xFF,f);
3050 }else (*i_ungetc)(c2,f);
3051 (*i_ungetc)(0xFE,f);
/* first byte presumably 0xFF: FF FE 00 00 (w_iconv32, ENDIAN_LITTLE) or FF FE (w_iconv16, ENDIAN_LITTLE) */
3054 if((c2 = (*i_getc)(f)) == 0xFE){
3055 if((c2 = (*i_getc)(f)) == 0x00){
3056 if((c2 = (*i_getc)(f)) == 0x00){
3057 if(!input_encoding){
3058 set_iconv(TRUE, w_iconv32);
3060 if (iconv == w_iconv32) {
3061 input_endian = ENDIAN_LITTLE;
3064 (*i_ungetc)(0x00,f);
3065 }else (*i_ungetc)(c2,f);
3066 (*i_ungetc)(0x00,f);
3067 }else (*i_ungetc)(c2,f);
3068 if(!input_encoding){
3069 set_iconv(TRUE, w_iconv16);
3071 if (iconv == w_iconv16) {
3072 input_endian = ENDIAN_LITTLE;
3075 (*i_ungetc)(0xFE,f);
3076 }else (*i_ungetc)(c2,f);
3077 (*i_ungetc)(0xFF,f);
/* init_broken_state: zero every field of the file-level broken_state. */
3091 static void init_broken_state(void)
3093 memset(&broken_state, 0, sizeof(broken_state));
/*
 * push_broken_buf: push the character c onto broken_state.buf and bump
 * broken_state.count.
 * Fix: the parameter was declared K&R style as a bare identifier list
 * "(c)", relying on implicit int, which has been invalid since C99.  It is
 * now typed as nkf_char, the type pop_broken_buf() returns from the same
 * buffer.
 * No bounds check is visible here; callers must keep count inside buf.
 */
3096 static void push_broken_buf(nkf_char c)
3098 broken_state.buf[broken_state.count++] = c;
/*
 * pop_broken_buf: pop and return the most recently pushed character from
 * broken_state.buf.  The caller must ensure broken_state.count > 0.
 */
3101 static nkf_char pop_broken_buf(void)
3103 return broken_state.buf[--broken_state.count];
/*
 * broken_getc: input filter that inspects '$' and '(' characters which are
 * not preceded by ESC.
 * Pending characters in broken_state.buf are returned first.  A '$' seen in
 * ASCII or JIS_X_0201_1976_K mode followed by '@' or 'B', or a '(' seen in
 * JIS_X_0208 or JIS_X_0201_1976_K mode followed by 'J' or 'B', pushes the
 * follower back onto the buffer.  broken_state.status remembers the
 * previous character.
 * NOTE(review): the reads from f and the return values are missing from
 * this listing; presumably an ESC is returned in place of the dropped one
 * -- confirm.
 */
3106 nkf_char broken_getc(FILE *f)
3110 if (broken_state.count > 0) {
3111 return pop_broken_buf();
3114 if (c=='$' && broken_state.status != ESC
3115 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3117 broken_state.status = 0;
3118 if (c1=='@'|| c1=='B') {
3119 push_broken_buf(c1);
3126 } else if (c=='(' && broken_state.status != ESC
3127 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3129 broken_state.status = 0;
3130 if (c1=='J'|| c1=='B') {
3131 push_broken_buf(c1);
3139 broken_state.status = c;
/*
 * broken_ungetc: push-back counterpart of broken_getc(); it acts only while
 * fewer than two characters are pending in broken_state.
 * NOTE(review): the push and the return are missing from this listing.
 */
3144 nkf_char broken_ungetc(nkf_char c, FILE *f)
3146 if (broken_state.count < 2)
/*
 * eol_conv: end-of-line conversion stage for the pair (c2, c1).
 * With guess_f set, input_eol tracks the line-ending style seen so far
 * (LF, CRLF or CR) and becomes EOF once styles are mixed.
 * A pending CR (prev_cr) or an LF triggers output of the configured line
 * ending: CR unless eolmode_f is LF, LF unless eolmode_f is CR.  A CR is
 * remembered in prev_cr instead of being forwarded; every other character
 * except LF is passed on to o_eol_conv.
 * NOTE(review): some lines of the body are missing from this listing.
 */
3151 void eol_conv(nkf_char c2, nkf_char c1)
3153 if (guess_f && input_eol != EOF) {
3154 if (c2 == 0 && c1 == LF) {
3155 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3156 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3157 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3159 else if (!input_eol) input_eol = CR;
3160 else if (input_eol != CR) input_eol = EOF;
3162 if (prev_cr || (c2 == 0 && c1 == LF)) {
3164 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3165 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3167 if (c2 == 0 && c1 == CR) prev_cr = CR;
3168 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3172 Return value of fold_conv()
3174 LF add newline and output char
3175 CR add newline and output nothing
3178 1 (or else) normal output
3180 fold state in prev (previous character)
3182 >0x80 Japanese (X0208/X0201)
3187 This fold algorithm does not preserve leading space in a line.
3188 This is the main difference from fmt.
/*
 * char_size(c2,c1): column width of a character -- 2 when c2 is non-zero,
 * 1 otherwise; c1 is unused.
 * NOTE(review): c2 is not parenthesised in the expansion, so only simple
 * expressions should be passed.
 */
3191 #define char_size(c2,c1) (c2?2:1)
/*
 * fold_conv: line-folding stage for the pair (c2, c1).
 * f_line is the current column, f_prev the previous character (with 0x80
 * set for Japanese), fold_len the target width and fold_margin the extra
 * room granted to characters that must not start a line (kinsoku).
 * fold_state selects the action in the terminator switch at the end.
 * The inline notes in the kinsoku tables are English translations of the
 * original ISO-2022-JP comments; each names the JIS X 0208 character the
 * original comment showed.
 * NOTE(review): many lines of the body are missing from this listing.
 */
3193 void fold_conv(nkf_char c2, nkf_char c1)
3196 nkf_char fold_state;
3198 if (c1== CR && !fold_preserve_f) {
3199 fold_state=0; /* ignore cr */
3200 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3202 fold_state=0; /* ignore cr */
3203 } else if (c1== BS) {
3204 if (f_line>0) f_line--;
3206 } else if (c2==EOF && f_line != 0) { /* close open last line */
3208 } else if ((c1==LF && !fold_preserve_f)
3209 || ((c1==CR||(c1==LF&&f_prev!=CR))
3210 && fold_preserve_f)) {
3212 if (fold_preserve_f) {
3216 } else if ((f_prev == c1 && !fold_preserve_f)
3217 || (f_prev == LF && fold_preserve_f)
3218 ) { /* duplicate newline */
3221 fold_state = LF; /* output two newline */
3227 if (f_prev&0x80) { /* Japanese? */
3229 fold_state = 0; /* ignore given single newline */
3230 } else if (f_prev==SP) {
3234 if (++f_line<=fold_len)
3238 fold_state = CR; /* fold and output nothing */
3242 } else if (c1=='\f') {
3245 fold_state = LF; /* output newline and clear */
3246 } else if ( (c2==0 && c1==SP)||
3247 (c2==0 && c1==TAB)||
3248 (c2=='!'&& c1=='!')) {
3249 /* X0208 kankaku or ascii space */
3251 fold_state = 0; /* remove duplicate spaces */
3254 if (++f_line<=fold_len)
3255 fold_state = SP; /* output ASCII space only */
3257 f_prev = SP; f_line = 0;
3258 fold_state = CR; /* fold and output nothing */
3262 prev0 = f_prev; /* we still need this one... , but almost done */
3264 if (c2 || c2 == JIS_X_0201_1976_K)
3265 f_prev |= 0x80; /* this is Japanese */
3266 f_line += char_size(c2,c1);
3267 if (f_line<=fold_len) { /* normal case */
3270 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3271 f_line = char_size(c2,c1);
3272 fold_state = LF; /* We can't wait, do fold now */
3273 } else if (c2 == JIS_X_0201_1976_K) {
3274 /* simple kinsoku rules return 1 means no folding */
3275 if (c1==(0xde&0x7f)) fold_state = 1; /* original note: JIS 0x212B, voiced sound mark */
3276 else if (c1==(0xdf&0x7f)) fold_state = 1; /* original note: JIS 0x212C, semi-voiced sound mark */
3277 else if (c1==(0xa4&0x7f)) fold_state = 1; /* original note: JIS 0x2123, ideographic full stop */
3278 else if (c1==(0xa3&0x7f)) fold_state = 1; /* original note: JIS 0x2124, fullwidth comma */
3279 else if (c1==(0xa1&0x7f)) fold_state = 1; /* original note: JIS 0x2157, right corner bracket */
3280 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3281 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3283 fold_state = LF;/* add one new f_line before this character */
3286 fold_state = LF;/* add one new f_line before this character */
3289 /* kinsoku point in ASCII */
3290 if ( c1==')'|| /* { [ ( */
3301 /* just after special */
3302 } else if (!is_alnum(prev0)) {
3303 f_line = char_size(c2,c1);
3305 } else if ((prev0==SP) || /* ignored new f_line */
3306 (prev0==LF)|| /* ignored new f_line */
3307 (prev0&0x80)) { /* X0208 - ASCII */
3308 f_line = char_size(c2,c1);
3309 fold_state = LF;/* add one new f_line before this character */
3311 fold_state = 1; /* default no fold in ASCII */
3315 if (c1=='"') fold_state = 1; /* original note: JIS 0x2122, ideographic comma */
3316 else if (c1=='#') fold_state = 1; /* original note: JIS 0x2123, ideographic full stop */
3317 else if (c1=='W') fold_state = 1; /* original note: JIS 0x2157, right corner bracket */
3318 else if (c1=='K') fold_state = 1; /* original note: JIS 0x214B, fullwidth right parenthesis */
3319 else if (c1=='$') fold_state = 1; /* original note: JIS 0x2124, fullwidth comma */
3320 else if (c1=='%') fold_state = 1; /* original note: JIS 0x2125, fullwidth full stop */
3321 else if (c1=='\'') fold_state = 1; /* original note: JIS 0x215C, fullwidth plus sign -- NOTE(review): c1 here is 0x27, confirm the intended character */
3322 else if (c1=='(') fold_state = 1; /* original note: JIS 0x2128, fullwidth semicolon */
3323 else if (c1==')') fold_state = 1; /* original note: JIS 0x2129, fullwidth question mark */
3324 else if (c1=='*') fold_state = 1; /* original note: JIS 0x212A, fullwidth exclamation mark */
3325 else if (c1=='+') fold_state = 1; /* original note: JIS 0x212B, voiced sound mark */
3326 else if (c1==',') fold_state = 1; /* original note: JIS 0x212C, semi-voiced sound mark */
3327 /* default no fold in kinsoku */
3330 f_line = char_size(c2,c1);
3331 /* add one new f_line before this character */
3334 f_line = char_size(c2,c1);
3336 /* add one new f_line before this character */
3341 /* terminator process */
3342 switch(fold_state) {
3344 OCONV_NEWLINE((*o_fconv));
3350 OCONV_NEWLINE((*o_fconv));
/* Previous (c2, c1) pair remembered between z_conv() calls; tested below to
 * see whether a JIS X 0201 kana is pending. */
3361 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: output filter stage that converts between JIS X 0201 and
 * JIS X 0208 forms and forwards the result through o_zconv.
 * NOTE(review): this listing omits some source lines, so only the visible
 * branches are described.
 *
 *  - A JIS X 0201 kana whose dv (voiced) or ev (semi-voiced) table entry is
 *    non-zero waits for the next character. If a sound mark follows, the
 *    combined dv/ev pair is emitted, otherwise the plain cv pair.
 *  - alpha_f bit 0 handles the JIS X 0208 alphabet row (c2 == 0x23) and the
 *    fv table lookup; alpha_f bit 2 is tested for the symbol row.
 *  - alpha_f bit 3 (alpha_f&8) replaces the ASCII characters > < " & with
 *    their HTML entity strings, emitted one byte at a time.
 *  - Row 0x25 (katakana) is mapped through fullwidth_to_halfwidth.
 */
3363 void z_conv(nkf_char c2, nkf_char c1)
3366 /* if (c2) c1 &= 0x7f; assertion */
3368 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3374 if (z_prev2 == JIS_X_0201_1976_K) {
3375 if (c2 == JIS_X_0201_1976_K) {
3376 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
3378 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3380 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
3382 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
/* no sound mark followed: emit the pending kana in its plain form */
3387 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3389 if (c2 == JIS_X_0201_1976_K) {
3390 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3391 /* wait for dakuten or handakuten */
3396 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3407 if (alpha_f&1 && c2 == 0x23) {
3408 /* JISX0208 Alphabet */
3410 } else if (c2 == 0x21) {
3411 /* JISX0208 Kigou (symbols) */
3416 } else if (alpha_f&4) {
3421 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3427 if (alpha_f&8 && c2 == 0) {
/* The replacement text must be the entity name, not the bare character:
 * emitting the character itself would make this option a no-op. */
3431 case '>': entity = "&gt;"; break;
3432 case '<': entity = "&lt;"; break;
3433 case '\"': entity = "&quot;"; break;
3434 case '&': entity = "&amp;"; break;
3437 while (*entity) (*o_zconv)(0, *entity++);
3443 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3448 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3452 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3456 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3460 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3464 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3468 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3472 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3476 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3481 (*o_zconv)(JIS_X_0201_1976_K, c);
3484 } else if (c2 == 0x25) {
3485 /* JISX0208 Katakana */
/* Indexed by c1-0x20. High byte: JIS X 0201 kana code; low byte: optional
 * trailing sound mark (0x5E / 0x5F), 0 when none. A zero entry means the
 * character has no halfwidth form. */
3486 static const int fullwidth_to_halfwidth[] =
3488 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3489 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3490 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3491 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3492 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3493 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3494 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3495 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3496 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3497 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3498 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3499 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3501 if (fullwidth_to_halfwidth[c1-0x20]){
3502 c2 = fullwidth_to_halfwidth[c1-0x20];
3503 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
/* NOTE(review): the guard for the second byte is not visible in this
 * listing; presumably it is emitted only when the low byte is non-zero. */
3505 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
/*
 * rot13(c) / rot47(c): character rotation helpers used by rot_conv().
 * Only some arms of each conditional chain are visible in this listing:
 *   rot13: add 13 up to 'M' / 'm', subtract 13 up to 'Z' / 'z'.
 *   rot47: add 47 up to 'O', subtract 47 up to '~'.
 * The argument is expanded several times, so pass an expression without
 * side effects.
 */
3515 #define rot13(c) ( \
3517 (c <= 'M') ? (c + 13): \
3518 (c <= 'Z') ? (c - 13): \
3520 (c <= 'm') ? (c + 13): \
3521 (c <= 'z') ? (c - 13): \
3525 #define rot47(c) ( \
3527 ( c <= 'O') ? (c + 47) : \
3528 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv: output filter stage for character rotation.
 * Single-byte input (ASCII with c2 == 0, JIS X 0201 kana, ISO-8859-1) is
 * handled by the guarded branch; the result is forwarded to the next stage
 * through o_rot_conv.
 * NOTE(review): the branch bodies are not visible in this listing;
 * presumably they apply rot13 to c1 and rot47 to double-byte pairs.
 */
3532 void rot_conv(nkf_char c2, nkf_char c1)
3534 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3540 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: output filter stage that maps between the hiragana and katakana
 * rows of JIS X 0208 and forwards the result through o_hira_conv.
 * NOTE(review): the outer conditions (presumably tests on hira_f and on the
 * row byte c2) and the row rewrites are not visible in this listing.
 *
 * Visible cases:
 *  - cells 0x21..0x73 are forwarded after conversion;
 *  - cell 0x74 has no counterpart in the other row, so it is only converted
 *    when the output encoding is Unicode, using U+3094;
 *  - row 0x21 cells 0x33/0x34 and 0x35/0x36 (iteration marks) are converted
 *    as pairs.
 */
3543 void hira_conv(nkf_char c2, nkf_char c1)
3547 if (0x20 < c1 && c1 < 0x74) {
3549 (*o_hira_conv)(c2,c1);
3551 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
/* U+3094 is HIRAGANA LETTER VU, which has no JIS X 0208 code */
3553 c1 = nkf_char_unicode_new(0x3094);
3554 (*o_hira_conv)(c2,c1);
3557 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3559 (*o_hira_conv)(c2,c1);
/* reverse direction: the Unicode-only VU, then row 0x24 cells */
3564 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3567 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3569 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3573 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: output filter stage that inspects each (c2, c1) pair
 * before forwarding it through o_iso2022jp_check_conv.
 * Three tests are visible:
 *  - c2 in 0x00..0x20 with c1 in 0x7f..0xff;
 *  - c2 in rows 0x29..0x2f or 0x75..0x7e;
 *  - a scan of the range[] table (RANGE_NUM_MAX start/end pairs) against c.
 * NOTE(review): the table contents, the computation of c, and the action
 * taken on a match are not visible in this listing.
 */
3577 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3579 #define RANGE_NUM_MAX 18
3580 static const nkf_char range[RANGE_NUM_MAX][2] = {
3601 nkf_char start, end, c;
3603 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3607 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3612 for (i = 0; i < RANGE_NUM_MAX; i++) {
3613 start = range[i][0];
3616 if (c >= start && c <= end) {
3621 (*o_iso2022jp_check_conv)(c2,c1);
3625 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * Recognised encoded-word headers. \075 is the octal escape for the equals
 * sign. The four tables below are parallel: entry j of each one describes
 * the same header (they are indexed together in mime_begin_strict and
 * open_mime).
 */
3627 static const unsigned char *mime_pattern[] = {
3628 (const unsigned char *)"\075?EUC-JP?B?",
3629 (const unsigned char *)"\075?SHIFT_JIS?B?",
3630 (const unsigned char *)"\075?ISO-8859-1?Q?",
3631 (const unsigned char *)"\075?ISO-8859-1?B?",
3632 (const unsigned char *)"\075?ISO-2022-JP?B?",
3633 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3634 #if defined(UTF8_INPUT_ENABLE)
3635 (const unsigned char *)"\075?UTF-8?B?",
3636 (const unsigned char *)"\075?UTF-8?Q?",
3638 (const unsigned char *)"\075?US-ASCII?Q?",
3643 /* Marker used to raise the priority of the matching input code */
3644 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3645 e_iconv, s_iconv, 0, 0, 0, 0,
3646 #if defined(UTF8_INPUT_ENABLE)
/* Output mode that selects each header when encoding (see open_mime) */
3652 static const nkf_char mime_encode[] = {
3653 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3654 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding letter of each header: B (Base64) or Q (quoted) */
3661 static const nkf_char mime_encode_method[] = {
3662 'B', 'B','Q', 'B', 'B', 'Q',
3663 #if defined(UTF8_INPUT_ENABLE)
3671 /* MIME preprocessor fifo */
/*
 * The FIFO is a ring buffer whose size is a power of two, so an index is
 * wrapped by masking instead of a modulo. The counters are plain unsigned
 * integers and are only masked when the buffer is accessed through
 * mime_input_buf().
 */
3673 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3674 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3675 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3677 unsigned char buf[MIME_BUF_SIZE];
3679 unsigned int last; /* decoded */
3680 unsigned int input; /* undecoded */
/* Input converter saved by mime_begin_strict and restored by
 * unswitch_mime_getc; NULL when nothing is saved. */
3682 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Longest header prefix that the MIME start detection will look at */
3684 #define MAXRECOVER 20
/* Push one byte back onto the front of the MIME FIFO by stepping the top
 * index backwards; the index wraps through the mime_input_buf() mask. */
3686 static void mime_input_buf_unshift(nkf_char c)
3688 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
/* ungetc counterpart of mime_getc: returns the character to the MIME FIFO.
 * The stream argument f is not used on the visible line. */
3691 nkf_char mime_ungetc(nkf_char c, FILE *f)
3693 mime_input_buf_unshift(c);
/*
 * ungetc counterpart of mime_getc_buf. Either hands the character to the
 * saved underlying ungetc (i_mungetc_buf) or stores it back in the FIFO by
 * stepping the undecoded index backwards.
 * NOTE(review): the condition choosing between the two paths is not visible
 * in this listing; mime_getc_buf selects on mimebuf_f, presumably this does
 * the same.
 */
3697 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
3700 (*i_mungetc_buf)(c,f);
3702 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
/* Fetch the next undecoded byte: from the underlying stream when mimebuf_f
 * is set, otherwise from the FIFO at the undecoded index. */
3706 nkf_char mime_getc_buf(FILE *f)
3708 /* we do not keep eof of mime_input_buf, because it contains ?= as
3709 a terminator. It was checked in mime_integrity. */
3710 return ((mimebuf_f)?
3711 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
/*
 * Insert the MIME decoder into the input chain. The current i_getc/i_ungetc
 * are saved in i_mgetc/i_mungetc and replaced by mime_getc/mime_ungetc.
 * In strict mode a second layer is added: the saved functions move to the
 * *_buf slots and the decoder reads through mime_getc_buf/mime_ungetc_buf.
 * Does nothing when mime_getc is already installed.
 */
3714 void switch_mime_getc(void)
3716 if (i_getc!=mime_getc) {
3717 i_mgetc = i_getc; i_getc = mime_getc;
3718 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3719 if(mime_f==STRICT_MIME) {
3720 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3721 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * Undo switch_mime_getc: in strict mode first peel off the buffered layer,
 * then restore the saved unget function, and finally reinstall the input
 * converter saved in mime_iconv_back (if any) and clear it.
 * NOTE(review): the matching restore of i_getc is not visible in this
 * listing.
 */
3726 void unswitch_mime_getc(void)
3728 if(mime_f==STRICT_MIME) {
3729 i_mgetc = i_mgetc_buf;
3730 i_mungetc = i_mungetc_buf;
3733 i_ungetc = i_mungetc;
3734 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3735 mime_iconv_back = NULL;
/*
 * mime_integrity: pre-read an encoded word into the FIFO and check that it
 * is well formed before any of it is decoded.
 *
 * f: input stream, read through i_getc.
 * p: the header pattern that was just matched; it is copied into the FIFO
 *    first so the raw text can be replayed if the check fails.
 *
 * Steps visible here: reset the input and last indexes to top, copy p,
 * remember the position just after the header in q, then read characters
 * until the buffer is full or the terminator is seen. On success the input
 * index is moved back to q so decoding starts after the header. On failure
 * the whole buffered text is marked as already decoded (last = input), the
 * mode is set to 1 (replay without decoding) and the buffered getc is
 * installed.
 * NOTE(review): the return statements and the update of d are not visible
 * in this listing; d is presumably the previously read character.
 */
3738 nkf_char mime_integrity(FILE *f, const unsigned char *p)
3742 /* In buffered mode, read until =? or NL or buffer full
3744 mime_input_state.input = mime_input_state.top;
3745 mime_input_state.last = mime_input_state.top;
3747 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3749 q = mime_input_state.input;
3750 while((c=(*i_getc)(f))!=EOF) {
3751 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3752 break; /* buffer full */
/* terminator: a question mark followed by an equals sign */
3754 if (c=='=' && d=='?') {
3755 /* checked. skip header, start decode */
3756 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3757 /* mime_last_input = mime_input_state.input; */
3758 mime_input_state.input = q;
/* anything outside the Base64 alphabet, = and ? ends the scan */
3762 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3764 /* Should we check length mod 4? */
3765 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3768 /* In case of Incomplete MIME, no MIME decode */
3769 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3770 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3771 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3772 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * mime_begin_strict: called after the first two header characters have been
 * seen; matches the rest of the input against mime_pattern[] (compared
 * through nkf_toupper) and, on a match, sets up decoding.
 *
 * Each character read is kept in r[] so it can be replayed. When the current
 * pattern stops matching at position i, later patterns are tried: one
 * qualifies if it shares the first i characters with the pattern matched so
 * far (q) and accepts the current character.
 *
 * On success the mode letter is taken from the pattern (two positions before
 * its end, i.e. B or Q), the current input converter is saved in
 * mime_iconv_back and replaced by the one listed for the pattern, and the
 * function finishes with mime_integrity().
 * NOTE(review): initialisation of j and q, and the failure path that
 * replays r[], are not visible in this listing.
 */
3776 nkf_char mime_begin_strict(FILE *f)
3780 const unsigned char *p,*q;
3781 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3783 mime_decode_mode = FALSE;
3784 /* =? has been checked */
3786 p = mime_pattern[j];
3789 for(i=2;p[i]>SP;i++) { /* start at =? */
3790 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3791 /* pattern fails, try next one */
3793 while (mime_pattern[++j]) {
3794 p = mime_pattern[j];
3795 for(k=2;k<i;k++) /* assume length(p) > i */
3796 if (p[k]!=q[k]) break;
3797 if (k==i && nkf_toupper(c1)==p[k]) break;
3799 p = mime_pattern[j];
3800 if (p) continue; /* found next one, continue */
3801 /* all fails, output from recovery buffer */
3809 mime_decode_mode = p[i-2];
3811 mime_iconv_back = iconv;
3812 set_iconv(FALSE, mime_priority_func[j]);
3813 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3815 if (mime_decode_mode=='B') {
3816 mimebuf_f = unbuf_f;
3818 /* do MIME integrity check */
3819 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_begin: non-strict start detection. The charset name is not checked
 * against a table; the function only looks for the shape
 * charset, question mark, B or Q, question mark, within MAXRECOVER
 * characters. Every character read is also appended to the FIFO so that the
 * text can be replayed unchanged if this turns out not to be an encoded
 * word.
 * Returns the last character read; the caller uses it only to detect EOF.
 * NOTE(review): several control-flow lines are not visible in this listing.
 */
3827 nkf_char mime_begin(FILE *f)
3832 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
3833 /* re-read and convert again from mime_buffer. */
3835 /* =? has been checked */
/* k remembers where the preamble starts so it can be dropped on success */
3836 k = mime_input_state.last;
3837 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
3838 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
3839 /* We accept any character type even if it is broken by new lines */
3840 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3841 if (c1==LF||c1==SP||c1==CR||
3842 c1=='-'||c1=='_'||is_alnum(c1)) continue;
3844 /* Failed. But this could be another MIME preamble */
3846 mime_input_state.last--;
3852 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3853 if (!(++i<MAXRECOVER) || c1==EOF) break;
3854 if (c1=='b'||c1=='B') {
3855 mime_decode_mode = 'B';
3856 } else if (c1=='q'||c1=='Q') {
3857 mime_decode_mode = 'Q';
3861 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3862 if (!(++i<MAXRECOVER) || c1==EOF) break;
3864 mime_decode_mode = FALSE;
3870 if (!mime_decode_mode) {
3871 /* false MIME preamble, restart from mime_buffer */
3872 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
3873 /* Since we are in MIME mode until buffer becomes empty, */
3874 /* we never go into mime_begin again for a while. */
3877 /* discard mime preamble, and goto MIME mode */
3878 mime_input_state.last = k;
3879 /* do no MIME integrity check */
3880 return c1; /* used only for checking EOF */
/* NOTE(review): only the signature is visible in this listing; presumably an
 * output sink that discards its argument - confirm against the full file. */
3884 void no_putc(nkf_char c)
/* Write a diagnostic line to stderr; a null pointer is printed as the text
 * NULL instead of being dereferenced.
 * NOTE(review): any guard around the fprintf is not visible in this listing. */
3889 void debug(const char *str)
3892 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Record the name of the detected input encoding. The first call stores the
 * pointer as is (no copy is made). A later call with a different name
 * replaces it with the empty string, which get_guessed_code and
 * print_guessed_code treat as a special case.
 */
3897 void set_input_codename(char *codename)
3899 if (!input_codename) {
3900 input_codename = codename;
3901 } else if (strcmp(codename, input_codename) != 0) {
3902 input_codename = "";
/*
 * Turn the recorded input_codename into the name reported to the user and
 * return it (input_codename itself is updated).
 *  - empty string  -> BINARY
 *  - not set       -> ASCII
 *  - otherwise the base name is refined into a vendor code page name using
 *    the score flags of the active input converter.
 */
3906 static char* get_guessed_code(void)
3908 if (input_codename && !*input_codename) {
3909 input_codename = "BINARY";
3911 struct input_code *p = find_inputcode_byfunc(iconv);
3912 if (!input_codename) {
3913 input_codename = "ASCII";
3914 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
3915 if (p->score & (SCORE_DEPEND|SCORE_CP932))
3916 input_codename = "CP932";
3917 } else if (strcmp(input_codename, "EUC-JP") == 0) {
/* the X0212 flag is checked first and wins over the CP932 flags */
3918 if (p->score & (SCORE_X0212))
3919 input_codename = "EUCJP-MS";
3920 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3921 input_codename = "CP51932";
3922 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
/* the KANA flag is checked first and wins over the CP932 flags */
3923 if (p->score & (SCORE_KANA))
3924 input_codename = "CP50221";
3925 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3926 input_codename = "CP50220";
3929 return input_codename;
3932 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Print the guessed input encoding on stdout, prefixed with the file name
 * and a colon when filename is not NULL. The end-of-line style found in the
 * input is rendered as a suffix chosen from input_eol.
 * NOTE(review): the statement that uses the suffix chain, and the branch
 * taken when input_codename is the empty string, are not visible in this
 * listing.
 */
3933 void print_guessed_code(char *filename)
3935 if (filename != NULL) printf("%s: ", filename);
3936 if (input_codename && !*input_codename) {
3939 input_codename = get_guessed_code();
3941 printf("%s\n", input_codename);
3945 input_eol == CR ? " (CR)" :
3946 input_eol == LF ? " (LF)" :
3947 input_eol == CRLF ? " (CRLF)" :
3948 input_eol == EOF ? " (MIXED NL)" :
/*
 * hex_getc: shared reader for escapes of the form ch followed by two
 * hexadecimal digits (used by cap_getc with a colon and url_getc with a
 * percent sign).
 *
 * ch: escape introducer.
 * f:  input stream.
 * g:  underlying getc.
 * u:  underlying ungetc.
 *
 * When both following characters are hex digits the decoded byte is
 * returned.
 * NOTE(review): the reads and the fallback paths are not visible in this
 * listing; presumably characters that do not form an escape are pushed back
 * through u and returned unchanged.
 */
3957 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
3959 nkf_char c1, c2, c3;
3965 if (!nkf_isxdigit(c2)){
3970 if (!nkf_isxdigit(c3)){
/* high nibble from the first digit, low nibble from the second */
3975 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Input filter: decode colon-introduced hex escapes, reading through the
 * saved i_cgetc/i_cungetc pair. */
3978 nkf_char cap_getc(FILE *f)
3980 return hex_getc(':', f, i_cgetc, i_cungetc);
/* ungetc counterpart of cap_getc: passes the character straight to the
 * saved underlying ungetc. */
3983 nkf_char cap_ungetc(nkf_char c, FILE *f)
3985 return (*i_cungetc)(c, f);
/* Input filter: decode percent-introduced hex escapes, reading through the
 * saved i_ugetc/i_uungetc pair. */
3988 nkf_char url_getc(FILE *f)
3990 return hex_getc('%', f, i_ugetc, i_uungetc);
/* ungetc counterpart of url_getc: passes the character straight to the
 * saved underlying ungetc. */
3993 nkf_char url_ungetc(nkf_char c, FILE *f)
3995 return (*i_uungetc)(c, f);
3999 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc: input filter that decodes numeric character references.
 * An x or X after the introducer selects hexadecimal (up to 7 digits are
 * scanned), otherwise decimal (up to 8 digits). The accumulated code point
 * is returned wrapped by nkf_char_unicode_new().
 * NOTE(review): the reads into buf, the introducer and terminator checks and
 * the fallback path are not visible in this listing; presumably unconsumed
 * characters are pushed back through u.
 */
4000 nkf_char numchar_getc(FILE *f)
4002 nkf_char (*g)(FILE *) = i_ngetc;
4003 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4014 if (buf[i] == 'x' || buf[i] == 'X'){
4015 for (j = 0; j < 7; j++){
4017 if (!nkf_isxdigit(buf[i])){
4024 c |= hex2bin(buf[i]);
4027 for (j = 0; j < 8; j++){
4031 if (!nkf_isdigit(buf[i])){
/* hex2bin is reused for the decimal digit value */
4038 c += hex2bin(buf[i]);
4044 return nkf_char_unicode_new(c);
/* ungetc counterpart of numchar_getc: passes the character straight to the
 * saved underlying ungetc. */
4053 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4055 return (*i_nungetc)(c, f);
4059 #ifdef UNICODE_NORMALIZATION
4061 /* Normalization Form C */
/*
 * nfc_getc: input filter that replaces a decomposed byte sequence with its
 * composed form. The bytes read so far (buf) are looked up by binary search
 * in normalization_table, comparing against each nfd entry byte by byte;
 * on a hit buf is overwritten with the nfc entry of the same row.
 * The outer loop only starts a lookup when the current byte is not a UTF-8
 * continuation byte (top two bits 10).
 * NOTE(review): the declaration and filling of buf, the hit detection and
 * the final return are not visible in this listing.
 */
4062 nkf_char nfc_getc(FILE *f)
4064 nkf_char (*g)(FILE *f) = i_nfc_getc;
4065 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4066 int i=0, j, k=1, lower, upper;
4068 const unsigned char *array;
4071 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4072 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4073 while (upper >= lower) {
4074 j = (lower+upper) / 2;
4075 array = normalization_table[j].nfd;
4076 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4077 if (array[k] != buf[k]){
/* first differing byte decides which half of the table to keep */
4078 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4085 array = normalization_table[j].nfc;
4086 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4087 buf[i] = (nkf_char)(array[i]);
/* ungetc counterpart of nfc_getc: passes the character straight to the
 * saved underlying ungetc. */
4098 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4100 return (*i_nfc_ungetc)(c, f);
4102 #endif /* UNICODE_NORMALIZATION */
/*
 * base64decode: map one Base64 alphabet character to its 6-bit value.
 * The arithmetic uses ASCII character constants as numbers: the question
 * mark is 63, the greater-than sign is 62, the digit 4 is 52 and the letter
 * G is 71 (lower-case a minus 26). Besides the standard alphabet, the minus
 * sign is accepted for 62 and the underscore for 63.
 * NOTE(review): the range tests for the letter branches and the return are
 * not visible in this listing.
 */
4105 static nkf_char base64decode(nkf_char c)
4110 i = c - 'A'; /* A..Z 0-25 */
4111 } else if (c == '_') {
4112 i = '?' /* 63 */ ; /* _ 63 */
4114 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4116 } else if (c > '/') {
4117 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4118 } else if (c == '+' || c == '-') {
4119 i = '>' /* 62 */ ; /* + and - 62 */
4121 i = '?' /* 63 */ ; /* / 63 */
/*
 * NOTE(review): the function header is not visible in this listing. The body
 * reads from the MIME FIFO and the wrapped stream and is presumably
 * mime_getc, the function installed as i_getc by switch_mime_getc.
 *
 * Behaviour visible here, in order:
 *  1. If decoded bytes are waiting in the FIFO, return the next one.
 *  2. If the mode is 1 (replay) or FALSE, leave MIME mode, remove the
 *     decoder from the input chain and read from the plain stream.
 *  3. Q mode: an underscore stands for a space, control and 8-bit bytes end
 *     the word, and an equals sign introduces two hex digits or a soft line
 *     break. At the end marker the following white space is collected in
 *     lwsp_buf so that it can be dropped when another encoded word follows
 *     and pushed back with i_ungetc otherwise.
 *  4. Any mode other than B falls back to the plain stream.
 *  5. B mode: four characters are read (white space is skipped, or ends the
 *     word in strict mode), the end marker is handled as in Q mode, and the
 *     four 6-bit values are unpacked into up to three bytes appended to the
 *     FIFO; the first of them is returned.
 *
 * The white-space buffer starts at lwsp_size+5 bytes and is grown with
 * realloc once lwsp_count passes lwsp_size.
 * NOTE(review): many control-flow lines, including the labels targeted by
 * the goto statements and the release of lwsp_buf, are not visible here.
 */
4129 nkf_char c1, c2, c3, c4, cc;
4130 nkf_char t1, t2, t3, t4, mode, exit_mode;
4131 nkf_char lwsp_count;
4134 nkf_char lwsp_size = 128;
4136 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4137 return mime_input_buf(mime_input_state.top++);
4139 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4140 mime_decode_mode=FALSE;
4141 unswitch_mime_getc();
4142 return (*i_getc)(f);
/* with a fixed MIME stream the mode is kept when a word ends */
4145 if (mimebuf_f == FIXED_MIME)
4146 exit_mode = mime_decode_mode;
4149 if (mime_decode_mode == 'Q') {
4150 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4152 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4153 if (c1<=SP || DEL<=c1) {
4154 mime_decode_mode = exit_mode; /* prepare for quit */
4157 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4161 mime_decode_mode = exit_mode; /* prepare for quit */
4162 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4163 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4164 /* end Q encoding */
4165 input_mode = exit_mode;
4167 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4168 if (lwsp_buf==NULL) {
4169 perror("can't malloc");
4172 while ((c1=(*i_getc)(f))!=EOF) {
4177 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4185 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4186 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4201 lwsp_buf[lwsp_count] = (unsigned char)c1;
4202 if (lwsp_count++>lwsp_size){
4204 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4205 if (lwsp_buf_new==NULL) {
4207 perror("can't realloc");
4210 lwsp_buf = lwsp_buf_new;
/* no encoded word follows: give the collected white space back */
4216 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4218 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4219 i_ungetc(lwsp_buf[lwsp_count],f);
4225 if (c1=='='&&c2<SP) { /* this is soft wrap */
4226 while((c1 = (*i_mgetc)(f)) <=SP) {
4227 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4229 mime_decode_mode = 'Q'; /* still in MIME */
4230 goto restart_mime_q;
4233 mime_decode_mode = 'Q'; /* still in MIME */
4237 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4238 if (c2<=SP) return c2;
4239 mime_decode_mode = 'Q'; /* still in MIME */
4240 return ((hex2bin(c2)<<4) + hex2bin(c3));
4243 if (mime_decode_mode != 'B') {
4244 mime_decode_mode = FALSE;
4245 return (*i_mgetc)(f);
4249 /* Base64 encoding */
4251 MIME allows line break in the middle of
4252 Base64, but we are very pessimistic in decoding
4253 in unbuf mode because MIME encoded code may broken by
4254 less or editor's control sequence (such as ESC-[-K in unbuffered
4255 mode. ignore incomplete MIME.
4257 mode = mime_decode_mode;
4258 mime_decode_mode = exit_mode; /* prepare for quit */
4260 while ((c1 = (*i_mgetc)(f))<=SP) {
4265 if ((c2 = (*i_mgetc)(f))<=SP) {
4268 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4269 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4272 if ((c1 == '?') && (c2 == '=')) {
4275 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4276 if (lwsp_buf==NULL) {
4277 perror("can't malloc");
4280 while ((c1=(*i_getc)(f))!=EOF) {
4285 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4293 if ((c1=(*i_getc)(f))!=EOF) {
4297 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4312 lwsp_buf[lwsp_count] = (unsigned char)c1;
4313 if (lwsp_count++>lwsp_size){
4315 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4316 if (lwsp_buf_new==NULL) {
4318 perror("can't realloc");
4321 lwsp_buf = lwsp_buf_new;
4327 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4329 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4330 i_ungetc(lwsp_buf[lwsp_count],f);
4337 if ((c3 = (*i_mgetc)(f))<=SP) {
4340 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4341 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4345 if ((c4 = (*i_mgetc)(f))<=SP) {
4348 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4349 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4353 mime_decode_mode = mode; /* still in MIME sigh... */
4355 /* BASE 64 decoding */
4357 t1 = 0x3f & base64decode(c1);
4358 t2 = 0x3f & base64decode(c2);
4359 t3 = 0x3f & base64decode(c3);
4360 t4 = 0x3f & base64decode(c4);
4361 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4363 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4364 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4366 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4367 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4369 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4374 return mime_input_buf(mime_input_state.top++);
/* Base64 alphabet used by the encoder: index 0..63 maps to the output
 * character. */
4377 static const char basis_64[] =
4378 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Capacity of the pending-output buffer below; one extra byte is reserved
 * because callers append first and test for overflow afterwards. */
4380 #define MIMEOUT_BUF_LENGTH (60)
4382 char buf[MIMEOUT_BUF_LENGTH+1];
4387 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * open_mime: start an encoded word for the given output mode.
 * The header is chosen by scanning mime_encode[] for mode (the first table
 * entry is the default when nothing matches) and mimeout_mode is set from
 * the matching mime_encode_method[] entry. When the current line is already
 * long (base64_count above 45) a line break is emitted first. Leading white
 * space held in mimeout_state.buf is written out unencoded, and the
 * remaining buffered characters are fed back through mime_putc.
 * NOTE(review): the emission of the header text itself and several loop
 * headers are not visible in this listing.
 */
4389 static void open_mime(nkf_char mode)
4391 const unsigned char *p;
4394 p = mime_pattern[0];
4395 for(i=0;mime_pattern[i];i++) {
4396 if (mode == mime_encode[i]) {
4397 p = mime_pattern[i];
4401 mimeout_mode = mime_encode_method[i];
4403 if (base64_count>45) {
4404 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4405 (*o_mputc)(mimeout_state.buf[i]);
4408 PUT_NEWLINE((*o_mputc));
4411 if (mimeout_state.count>0
4412 && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4413 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) {
4417 for (;i<mimeout_state.count;i++) {
4418 if (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4419 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) {
4420 (*o_mputc)(mimeout_state.buf[i]);
/* the count is cleared before replaying so that mime_putc starts from an
 * empty buffer */
4430 j = mimeout_state.count;
4431 mimeout_state.count = 0;
4433 mime_putc(mimeout_state.buf[i]);
/*
 * mime_prechar: called before each character is encoded to decide whether
 * the output line must be broken first. The projected width is the current
 * column plus the Base64 size of the pending bytes (count/3*4). When it
 * exceeds the limit, the current word is closed by sending EOF downstream,
 * a line break and a continuation space are written, and in the last case a
 * new word is opened for the current output mode.
 * NOTE(review): the conditions that select between the 73 and 66 limits are
 * not visible in this listing.
 */
4437 static void mime_prechar(nkf_char c2, nkf_char c1)
4439 if (mimeout_mode > 0){
4441 if (base64_count + mimeout_state.count/3*4> 73){
4442 (*o_base64conv)(EOF,0);
4443 OCONV_NEWLINE((*o_base64conv));
4444 (*o_base64conv)(0,SP);
4448 if (base64_count + mimeout_state.count/3*4> 66) {
4449 (*o_base64conv)(EOF,0);
4450 OCONV_NEWLINE((*o_base64conv));
4451 (*o_base64conv)(0,SP);
4457 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
/* Q for single-byte output modes, B otherwise */
4458 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4459 open_mime(output_mode);
4460 (*o_base64conv)(EOF,0);
4461 OCONV_NEWLINE((*o_base64conv));
4462 (*o_base64conv)(0,SP);
/* NOTE(review): only the signature is visible in this listing; presumably it
 * writes the encoded-word terminator - confirm against the full file. */
4469 static void close_mime(void)
/*
 * eof_mime: flush the Base64 encoder at the end of a word. The two visible
 * output lines emit the final partial sextet from the bits left in
 * mimeout_state.state: two leftover bits shifted into the top of a sextet,
 * or four leftover bits likewise.
 * NOTE(review): the case labels, the padding characters and the calls made
 * in the trailing branches are not visible in this listing.
 */
4477 static void eof_mime(void)
4479 switch(mimeout_mode) {
4484 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]);
4490 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]);
4495 if (mimeout_mode > 0) {
4496 if (mimeout_f!=FIXED_MIME) {
4498 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte in the current transfer encoding and
 * write it through o_mputc.
 *  - Q: a byte that is not alphanumeric is written as two hex digits.
 *  - B: a three-step cycle. Each step emits the sextet completed by the new
 *    byte and keeps the byte in mimeout_state.state so the next step can use
 *    its low bits; the third step emits two sextets.
 * NOTE(review): the case labels, the escape introducer, the handling of
 * other Q characters and the mode transitions are not visible in this
 * listing.
 */
4503 static void mimeout_addchar(nkf_char c)
4505 switch(mimeout_mode) {
4510 } else if(!nkf_isalnum(c)) {
4512 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4513 (*o_mputc)(bin2hex((c&0xf)));
/* first byte of a triple: top six bits */
4521 mimeout_state.state=c;
4522 (*o_mputc)(basis_64[c>>2]);
/* second byte: low two bits of the previous byte plus top four of this one */
4527 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4528 mimeout_state.state=c;
/* third byte: low four bits of the previous byte plus top two of this one,
 * then the remaining six bits */
4533 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4534 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_putc: output stage that decides, character by character, whether
 * text is written as is or inside an encoded word.
 *
 * Visible structure:
 *  - mimeout_f == FIXED_MIME: the whole stream is encoded; only line length
 *    (base64_count above 71) is managed here.
 *  - Otherwise, EOF flushes whatever is pending in mimeout_state.buf.
 *  - In Q mode and in the not-yet-encoding modes (mimeout_mode <= 0), 7-bit
 *    characters in single-byte output modes are buffered or passed through,
 *    and a word is opened with open_mime when encoding becomes necessary.
 *  - In the remaining modes (B and its intermediate states) white space and
 *    printable 7-bit characters are held in the buffer so that the word can
 *    be closed before them, or encoded together with what follows.
 * The buffer is flushed whenever its count passes MIMEOUT_BUF_LENGTH.
 * NOTE(review): a large part of the control flow (loop headers, else
 * branches, calls to eof_mime/close_mime) is not visible in this listing.
 */
4545 static void mime_putc(nkf_char c)
4550 if (mimeout_f == FIXED_MIME){
4551 if (mimeout_mode == 'Q'){
4552 if (base64_count > 71){
4553 if (c!=CR && c!=LF) {
4555 PUT_NEWLINE((*o_mputc));
4560 if (base64_count > 71){
4562 PUT_NEWLINE((*o_mputc));
4565 if (c == EOF) { /* c==EOF */
4569 if (c != EOF) { /* c!=EOF */
4575 /* mimeout_f != FIXED_MIME */
4577 if (c == EOF) { /* c==EOF */
4578 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4579 j = mimeout_state.count;
4580 mimeout_state.count = 0;
4582 if (mimeout_mode > 0) {
4583 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4585 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4588 mimeout_addchar(mimeout_state.buf[i]);
4592 mimeout_addchar(mimeout_state.buf[i]);
4596 mimeout_addchar(mimeout_state.buf[i]);
4602 mimeout_addchar(mimeout_state.buf[i]);
/* remember the most recently buffered character for the tests below */
4608 if (mimeout_state.count > 0){
4609 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4614 if (mimeout_mode=='Q') {
4615 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4616 if (c == CR || c == LF) {
4621 } else if (c <= SP) {
4623 if (base64_count > 70) {
4624 PUT_NEWLINE((*o_mputc));
4627 if (!nkf_isblank(c)) {
4632 if (base64_count > 70) {
4634 PUT_NEWLINE((*o_mputc));
4637 open_mime(output_mode);
4639 if (!nkf_noescape_mime(c)) {
4650 if (mimeout_mode <= 0) {
4651 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4652 if (nkf_isspace(c)) {
4654 if (mimeout_mode == -1) {
4657 if (c==CR || c==LF) {
4659 open_mime(output_mode);
4665 for (i=0;i<mimeout_state.count;i++) {
4666 (*o_mputc)(mimeout_state.buf[i]);
4667 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4678 mimeout_state.buf[0] = (char)c;
4679 mimeout_state.count = 1;
4681 if (base64_count > 1
4682 && base64_count + mimeout_state.count > 76
4683 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4684 PUT_NEWLINE((*o_mputc));
4686 if (!nkf_isspace(mimeout_state.buf[0])){
4691 mimeout_state.buf[mimeout_state.count++] = (char)c;
4692 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4693 open_mime(output_mode);
4698 if (lastchar==CR || lastchar == LF){
4699 for (i=0;i<mimeout_state.count;i++) {
4700 (*o_mputc)(mimeout_state.buf[i]);
4703 mimeout_state.count = 0;
4706 for (i=0;i<mimeout_state.count-1;i++) {
4707 (*o_mputc)(mimeout_state.buf[i]);
4710 mimeout_state.buf[0] = SP;
4711 mimeout_state.count = 1;
4713 open_mime(output_mode);
4716 /* mimeout_mode == 'B', 1, 2 */
4717 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4718 if (lastchar == CR || lastchar == LF){
4719 if (nkf_isblank(c)) {
4720 for (i=0;i<mimeout_state.count;i++) {
4721 mimeout_addchar(mimeout_state.buf[i]);
4723 mimeout_state.count = 0;
4724 } else if (SP<c && c<DEL) {
4726 for (i=0;i<mimeout_state.count;i++) {
4727 (*o_mputc)(mimeout_state.buf[i]);
4730 mimeout_state.count = 0;
4732 mimeout_state.buf[mimeout_state.count++] = (char)c;
4735 if (c==SP || c==TAB || c==CR || c==LF) {
4736 for (i=0;i<mimeout_state.count;i++) {
4737 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4739 for (i=0;i<mimeout_state.count;i++) {
4740 (*o_mputc)(mimeout_state.buf[i]);
4743 mimeout_state.count = 0;
4746 mimeout_state.buf[mimeout_state.count++] = (char)c;
4747 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4749 for (i=0;i<mimeout_state.count;i++) {
4750 (*o_mputc)(mimeout_state.buf[i]);
4753 mimeout_state.count = 0;
4757 if (mimeout_state.count>0 && SP<c && c!='=') {
4758 mimeout_state.buf[mimeout_state.count++] = (char)c;
4759 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4760 j = mimeout_state.count;
4761 mimeout_state.count = 0;
4763 mimeout_addchar(mimeout_state.buf[i]);
4770 if (mimeout_state.count>0) {
4771 j = mimeout_state.count;
4772 mimeout_state.count = 0;
4774 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4776 mimeout_addchar(mimeout_state.buf[i]);
4782 (*o_mputc)(mimeout_state.buf[i]);
4784 open_mime(output_mode);
/* Output filter stage for MIME encoding: lets mime_prechar handle line
 * folding for the pair, then forwards it to the next stage. */
4790 void base64_conv(nkf_char c2, nkf_char c1)
4792 mime_prechar(c2, c1);
4793 (*o_base64conv)(c2,c1);
/*
 * State for the iconv-based converter: a conversion descriptor plus an input
 * and an output buffer with their sizes.
 * NOTE(review): the cd and input_buffer members are used by the functions
 * below but their declarations are not visible in this listing.
 */
4797 typedef struct nkf_iconv_t {
4800 size_t input_buffer_size;
4801 char *output_buffer;
4802 size_t output_buffer_size;
/*
 * nkf_iconv_new: create a converter from fromcode to tocode.
 * Allocates an input buffer of IOBUF_SIZE bytes and an output buffer twice
 * that size, then opens the iconv descriptor.
 *
 * tocode:   target encoding name passed to iconv_open.
 * fromcode: source encoding name passed to iconv_open.
 *
 * Fix: the unsupported-conversion message was written as
 * perror(fprintf(...)), which passes an int to perror and a format string
 * where fprintf expects a stream. It now goes to stderr directly.
 *
 * NOTE(review): converter is declared as an object but used through the
 * arrow operator; the allocation, the errno switch and the return
 * statements are not visible in this listing, so they are left untouched.
 */
4805 nkf_iconv_t nkf_iconv_new(char *tocode, char *fromcode)
4807 nkf_iconv_t converter;
4809 converter->input_buffer_size = IOBUF_SIZE;
4810 converter->input_buffer = malloc(converter->input_buffer_size);
4811 if (converter->input_buffer == NULL)
4812 perror("can't malloc");
4814 converter->output_buffer_size = IOBUF_SIZE * 2;
4815 converter->output_buffer = malloc(converter->output_buffer_size);
4816 if (converter->output_buffer == NULL)
4817 perror("can't malloc");
4819 converter->cd = iconv_open(tocode, fromcode);
4820 if (converter->cd == (iconv_t)-1)
4824 fprintf(stderr, "iconv doesn't support %s to %s conversion.\n", fromcode, tocode);
4827 perror("can't iconv_open");
/*
 * nkf_iconv_convert: read bytes from input, run them through iconv and write
 * the result with o_putc.
 *
 * converter: state created by nkf_iconv_new.
 * input:     stream to read, through i_getc.
 *
 * Fixes relative to the listing:
 *  - the read loop used an undeclared name f; it now reads from the input
 *    parameter;
 *  - the fill loop broke out while there was still room and kept reading
 *    once the buffer was full, writing past input_buffer; the test is now
 *    input_length >= input_buffer_size;
 *  - realloc was applied to a non-existent member outbuf; it now uses
 *    output_buffer, the member that is allocated in nkf_iconv_new.
 *
 * On an incomplete trailing sequence the unread bytes are moved to the front
 * of the input buffer; when the output buffer is too small it is doubled.
 *
 * NOTE(review): the output loop indexes output_buffer after iconv has
 * advanced it and treats the remaining space as the byte count, which does
 * not look right. The surrounding loop, the errno switch and the returns are
 * not visible in this listing, so that part is left as is.
 */
4832 size_t nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
4834 size_t invalid = (size_t)0;
4835 char *input_buffer = converter->input_buffer;
4836 size_t input_length = (size_t)0;
4837 char *output_buffer = converter->output_buffer;
4838 size_t output_length = converter->output_buffer_size;
4843 while ((c = (*i_getc)(input)) != EOF) {
4844 input_buffer[input_length++] = c;
4845 if (input_length >= converter->input_buffer_size) break;
4849 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
4850 while (output_length-- > 0) {
4851 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
4853 if (ret == (size_t) - 1) {
4856 if (input_buffer != converter->input_buffer)
4857 memmove(converter->input_buffer, input_buffer, input_length);
4860 converter->output_buffer_size *= 2;
4861 output_buffer = realloc(converter->output_buffer, converter->output_buffer_size);
4862 if (output_buffer == NULL) {
4863 perror("can't realloc");
4866 converter->output_buffer = output_buffer;
4869 perror("can't iconv");
/*
 * nkf_iconv_close: release the buffers and the iconv descriptor of a
 * converter.
 *
 * Fix: the body referred to converter, which is not the parameter name, and
 * to members inbuf/outbuf, which the structure does not have. It now uses
 * the convert parameter and the input_buffer/output_buffer members that
 * nkf_iconv_new allocates.
 */
4881 void nkf_iconv_close(nkf_iconv_t *convert)
4883 free(convert->input_buffer);
4884 free(convert->output_buffer);
4885 iconv_close(convert->cd);
/*
 * NOTE(review): the function header is not visible in this listing;
 * presumably this is the routine that resets all option flags and filter
 * hooks to their defaults before a new conversion.
 *
 * Visible effects: option flags return to their compile-time defaults, the
 * 256-entry prefix_table is cleared, every output filter hook is set to
 * no_connection, the unget hooks are set to std_ungetc, pending MIME output
 * and the z_conv lookbehind are cleared, and the recorded input/output
 * encodings are forgotten.
 */
4893 struct input_code *p = input_code_list;
4905 mime_f = MIME_DECODE_DEFAULT;
4906 mime_decode_f = FALSE;
4911 x0201_f = X0201_DEFAULT;
4912 iso2022jp_f = FALSE;
4913 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
4914 ms_ucs_map_f = UCS_MAP_ASCII;
4916 #ifdef UTF8_INPUT_ENABLE
4917 no_cp932ext_f = FALSE;
4918 no_best_fit_chars_f = FALSE;
4919 encode_fallback = NULL;
4920 unicode_subchar = '?';
4921 input_endian = ENDIAN_BIG;
4923 #ifdef UTF8_OUTPUT_ENABLE
4924 output_bom_f = FALSE;
4925 output_endian = ENDIAN_BIG;
4927 #ifdef UNICODE_NORMALIZATION
4943 #ifdef SHIFTJIS_CP932
4953 for (i = 0; i < 256; i++){
4954 prefix_table[i] = 0;
4958 mimeout_state.count = 0;
4963 fold_preserve_f = FALSE;
4966 kanji_intro = DEFAULT_J;
4967 ascii_intro = DEFAULT_R;
4968 fold_margin = FOLD_MARGIN;
/* disconnect every optional output stage; module_connection links the ones
 * that are enabled */
4969 o_zconv = no_connection;
4970 o_fconv = no_connection;
4971 o_eol_conv = no_connection;
4972 o_rot_conv = no_connection;
4973 o_hira_conv = no_connection;
4974 o_base64conv = no_connection;
4975 o_iso2022jp_check_conv = no_connection;
4978 i_ungetc = std_ungetc;
4980 i_bungetc = std_ungetc;
4983 i_mungetc = std_ungetc;
4984 i_mgetc_buf = std_getc;
4985 i_mungetc_buf = std_ungetc;
4986 output_mode = ASCII;
4988 mime_decode_mode = FALSE;
4994 init_broken_state();
4995 z_prev2=0,z_prev1=0;
4997 iconv_for_check = 0;
4999 input_codename = NULL;
5000 input_encoding = NULL;
5001 output_encoding = NULL;
/*
 * module_connection: build the input and output filter chains from the
 * current option flags.
 *
 * Output side: oconv starts as the converter for the output encoding. Each
 * enabled stage saves the current oconv in its own o_* hook and installs
 * itself as the new oconv, so the stage linked last runs first.
 * Input side: the same scheme is applied to i_getc/i_ungetc, with each
 * enabled reader saving the previous pair in its own i_* hooks.
 * Finally the input converter is chosen: the one for the given input
 * encoding, or e_iconv as the starting point when none was given.
 *
 * kanji_convert treats a negative return value as an error.
 * NOTE(review): most of the option tests guarding the individual stages, and
 * the return statements, are not visible in this listing.
 */
5007 int module_connection(void)
5009 if (input_encoding) set_input_encoding(input_encoding);
5010 if (!output_encoding) {
5011 output_encoding = nkf_default_encoding();
5013 if (!output_encoding) {
5014 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5017 set_output_encoding(output_encoding);
5018 oconv = nkf_enc_to_oconv(output_encoding);
5021 /* replace continuation module, from output side */
5023 /* output redirection */
5025 if (noout_f || guess_f){
5032 if (mimeout_f == TRUE) {
5033 o_base64conv = oconv; oconv = base64_conv;
5035 /* base64_count = 0; */
5038 if (eolmode_f || guess_f) {
5039 o_eol_conv = oconv; oconv = eol_conv;
5042 o_rot_conv = oconv; oconv = rot_conv;
5045 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5048 o_hira_conv = oconv; oconv = hira_conv;
5051 o_fconv = oconv; oconv = fold_conv;
5054 if (alpha_f || x0201_f) {
5055 o_zconv = oconv; oconv = z_conv;
5059 i_ungetc = std_ungetc;
5060 /* input redirection */
5063 i_cgetc = i_getc; i_getc = cap_getc;
5064 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5067 i_ugetc = i_getc; i_getc = url_getc;
5068 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5071 #ifdef NUMCHAR_OPTION
5073 i_ngetc = i_getc; i_getc = numchar_getc;
5074 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5077 #ifdef UNICODE_NORMALIZATION
5079 i_nfc_getc = i_getc; i_getc = nfc_getc;
5080 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
/* with a fixed MIME stream the decoder is installed permanently instead of
 * on demand through switch_mime_getc */
5083 if (mime_f && mimebuf_f==FIXED_MIME) {
5084 i_mgetc = i_getc; i_getc = mime_getc;
5085 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5088 i_bgetc = i_getc; i_getc = broken_getc;
5089 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5091 if (input_encoding) {
5092 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5094 set_iconv(FALSE, e_iconv);
5098 struct input_code *p = input_code_list;
5107 Conversion main loop. Code detection only.
5110 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * noconvert: set up the filter chains with module_connection() and then
 * read characters from f through i_getc until EOF.
 * Compiled only when neither PERL_XS nor WIN32DLL is defined.
 */
5111 nkf_char noconvert(FILE *f)
5116 module_connection();
5117 while ((c = (*i_getc)(f)) != EOF)
/*
 * kanji_convert: main conversion loop for one input stream f.
 *
 * After module_connection() has built the filter chains, bytes are read
 * through i_getc.  UTF-32 and UTF-16 input are handled by dedicated
 * loops; otherwise each byte is classified (8-bit data, ESC / SO / SI
 * sequences, MIME start, line ends) while input_mode and shift_mode
 * track the current character set, and completed characters are handed
 * to iconv or oconv.  At the end iconv is called with EOF and, if no
 * input code name has been set, the input_code_list entry with the
 * lowest score is reported via set_input_codename().
 */
5124 int kanji_convert(FILE *f)
5126 nkf_char c1=0, c2=0, c3=0, c4=0;
5127 int shift_mode = FALSE; /* TRUE or FALSE or JIS_X_0201_1976_K */
5128 int is_8bit = FALSE;
5130 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5135 output_mode = ASCII;
5137 #define NEXT continue /* no output, get next */
5138 #define SKIP c2=0;continue /* drop pending c2, no output, get next */
5139 #define MORE c2=c1;continue /* need one more byte */
5140 #define SEND ; /* output c1 and c2, get next */
5141 #define LAST break /* end of loop, go closing */
5143 if (module_connection() < 0) {
5144 #if !defined(PERL_XS) && !defined(WIN32DLL)
5145 fprintf(stderr, "no output encoding given\n");
5151 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: consume four bytes per call */
5152 if(iconv == w_iconv32){
5153 while ((c1 = (*i_getc)(f)) != EOF &&
5154 (c2 = (*i_getc)(f)) != EOF &&
5155 (c3 = (*i_getc)(f)) != EOF &&
5156 (c4 = (*i_getc)(f)) != EOF) {
5157 nkf_iconv_utf_32(c1, c2, c3, c4);
5159 (*i_ungetc)(EOF, f);
/* UTF-16 input: two bytes per call; a -2 result requests two more bytes (presumably a surrogate pair -- confirm) */
5161 else if (iconv == w_iconv16) {
5162 while ((c1 = (*i_getc)(f)) != EOF &&
5163 (c2 = (*i_getc)(f)) != EOF) {
5164 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5165 (c3 = (*i_getc)(f)) != EOF &&
5166 (c4 = (*i_getc)(f)) != EOF) {
5167 nkf_iconv_utf_16(c1, c2, c3, c4);
5170 (*i_ungetc)(EOF, f);
/* all other input: byte-at-a-time state machine */
5174 while ((c1 = (*i_getc)(f)) != EOF) {
5175 #ifdef INPUT_CODE_FIX
5176 if (!input_encoding)
5182 /* in case of 8th bit is on */
5183 if (!estab_f&&!mime_decode_mode) {
5184 /* in case of not established yet */
5185 /* It is still ambiguous */
5186 if (h_conv(f, c2, c1)==EOF)
5191 /* in case of already established */
5193 /* ignore bogus code */
5200 /* 2nd byte of 7 bit code or SJIS */
5205 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5208 } else if (c1 > DEL) {
5210 if (!estab_f && !iso8859_f) {
5211 /* not established yet */
5213 } else { /* estab_f==TRUE */
5219 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5220 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5222 c2 = JIS_X_0201_1976_K;
5227 /* already established */
5231 } else if (SP < c1 && c1 < DEL) {
5232 /* in case of Roman characters */
5234 /* output 1 shifted byte */
5238 } else if (SP <= c1 && c1 < (0xE0&0x7F)){
5239 /* output 1 shifted byte */
5240 c2 = JIS_X_0201_1976_K;
5243 /* looks like bogus code */
5246 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5247 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5248 /* in case of Kanji shifted */
5250 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5251 /* Check MIME code */
5252 if ((c1 = (*i_getc)(f)) == EOF) {
5255 } else if (c1 == '?') {
5256 /* =? is mime conversion start sequence */
5257 if(mime_f == STRICT_MIME) {
5258 /* check in real detail */
5259 if (mime_begin_strict(f) == EOF)
5262 } else if (mime_begin(f) == EOF)
5271 /* normal ASCII code */
5274 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5277 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5280 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5281 if ((c1 = (*i_getc)(f)) == EOF) {
5282 /* (*oconv)(0, ESC); don't send bogus code */
5284 } else if (c1 == '$') {
5285 if ((c1 = (*i_getc)(f)) == EOF) {
5287 (*oconv)(0, ESC); don't send bogus code
5288 (*oconv)(0, '$'); */
5290 } else if (c1 == '@'|| c1 == 'B') {
5291 /* This is kanji introduction */
5292 input_mode = JIS_X_0208;
5294 set_input_codename("ISO-2022-JP");
5296 debug("ISO-2022-JP");
5299 } else if (c1 == '(') {
5300 if ((c1 = (*i_getc)(f)) == EOF) {
5301 /* don't send bogus code
5307 } else if (c1 == '@'|| c1 == 'B') {
5308 /* This is kanji introduction */
5309 input_mode = JIS_X_0208;
5313 } else if (c1 == 'D'){
5314 input_mode = JIS_X_0212;
5317 #endif /* X0212_ENABLE */
5318 } else if (c1 == 'O' || c1 == 'Q'){
5319 input_mode = JIS_X_0213_1;
5322 } else if (c1 == 'P'){
5323 input_mode = JIS_X_0213_2;
5327 /* could be some special code */
5334 } else if (broken_f&0x2) {
5335 /* accept any ESC-(-x as broken code ... */
5336 input_mode = JIS_X_0208;
5345 } else if (c1 == '(') {
5346 if ((c1 = (*i_getc)(f)) == EOF) {
5347 /* don't send bogus code
5349 (*oconv)(0, '('); */
5353 /* This is X0201 kana introduction */
5354 input_mode = JIS_X_0201_1976_K; shift_mode = JIS_X_0201_1976_K;
5356 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5357 /* ESC ( B/J/H: back to single-byte ASCII input mode (not a kanji introduction) */
5358 input_mode = ASCII; shift_mode = FALSE;
5360 } else if (broken_f&0x2) {
5361 input_mode = ASCII; shift_mode = FALSE;
5366 /* maintain various input_mode here */
5370 } else if ( c1 == 'N' || c1 == 'n'){
5372 c4 = (*i_getc)(f); /* skip SS2 */
5373 if ( (SP<=c4 && c4 < 0x60) || (0xa0<=c4 && c4 < 0xe0)){
5375 c2 = JIS_X_0201_1976_K;
5388 } else if (c1 == ESC && iconv == s_iconv) {
5389 /* ESC in Shift_JIS */
5390 if ((c1 = (*i_getc)(f)) == EOF) {
5391 /* (*oconv)(0, ESC); don't send bogus code */
5393 } else if (c1 == '$') {
5395 if ((c1 = (*i_getc)(f)) == EOF) {
5397 (*oconv)(0, ESC); don't send bogus code
5398 (*oconv)(0, '$'); */
5401 if (('E' <= c1 && c1 <= 'G') ||
5402 ('O' <= c1 && c1 <= 'Q')) {
5410 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
5411 c3 = nkf_char_unicode_new((jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000);
5412 while ((c1 = (*i_getc)(f)) != EOF) {
5413 if (SP <= c1 && c1 <= 'z') {
5414 (*oconv)(0, c1 + c3);
5415 } else break; /* c1 == SO */
5419 if (c1 == EOF) LAST;
5426 } else if (c1 == LF || c1 == CR) {
5428 input_mode = ASCII; set_iconv(FALSE, 0);
5430 } else if (mime_decode_f && !mime_decode_mode){
5432 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5440 } else { /* if (c1 == CR)*/
5441 if ((c1=(*i_getc)(f))!=EOF) {
5445 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5465 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5468 if ((c3 = (*i_getc)(f)) != EOF) {
5471 if ((c4 = (*i_getc)(f)) != EOF) {
5473 (*iconv)(c2, c1, c3|c4);
5478 /* 3 bytes EUC or UTF-8 */
5479 if ((c3 = (*i_getc)(f)) != EOF) {
5481 (*iconv)(c2, c1, c3);
5489 0x7F <= c2 && c2 <= 0x92 &&
5490 0x21 <= c1 && c1 <= 0x7E) {
5494 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5497 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5501 (*oconv)(PREFIX_EUCG3 | c2, c1);
5503 #endif /* X0212_ENABLE */
5505 (*oconv)(PREFIX_EUCG3 | c2, c1);
5508 (*oconv)(input_mode, c1); /* other special case */
5514 /* goto next_word */
5518 (*iconv)(EOF, 0, 0);
5519 if (!input_codename)
5522 struct input_code *p = input_code_list;
5523 struct input_code *result = p;
5525 if (p->score < result->score) result = p;
5528 set_input_codename(result->name);
5530 debug(result->name);
5538 * int options(unsigned char *cp)
/*
 * options: parse one option string and update the global option state.
 *
 * cp is advanced to just past the first hyphen, then each option
 * character is dispatched through a switch.  A second hyphen introduces
 * a long option, which is looked up in long_option[]; when the entry has
 * an alias, cp is redirected to the alias text so that it is parsed as
 * short options, otherwise the long option is handled by the chain of
 * name comparisons below, with p pointing at the text after the matched
 * name.
 *
 * Returns an int (the return statements are outside the lines shown
 * here, so the exact values are not documented).
 */
5544 int options(unsigned char *cp)
5548 unsigned char *cp_back = NULL;
5554 while(*cp && *cp++!='-');
5555 while (*cp || cp_back) {
5563 case '-': /* literal options */
5564 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
5568 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5569 p = (unsigned char *)long_option[i].name;
5570 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5571 if (*p == cp[j] || cp[j] == SP){
5578 #if !defined(PERL_XS) && !defined(WIN32DLL)
5579 fprintf(stderr, "unknown long option: --%s\n", cp);
5583 while(*cp && *cp != SP && cp++);
5584 if (long_option[i].alias[0]){
5586 cp = (unsigned char *)long_option[i].alias;
5588 if (strcmp(long_option[i].name, "ic=") == 0){
5589 nkf_str_upcase((char *)p, codeset, 32);
5590 enc = nkf_enc_find(codeset);
5592 input_encoding = enc;
5595 if (strcmp(long_option[i].name, "oc=") == 0){
5596 nkf_str_upcase((char *)p, codeset, 32);
5597 enc = nkf_enc_find(codeset);
/* NOTE(review): enc is stored in output_encoding, which is tested like a pointer elsewhere; an ordered comparison with 0 is an unusual null test -- confirm */
5598 if (enc <= 0) continue;
5599 output_encoding = enc;
5602 if (strcmp(long_option[i].name, "guess=") == 0){
5603 if (p[0] == '0' || p[0] == '1') {
5611 if (strcmp(long_option[i].name, "overwrite") == 0){
5614 preserve_time_f = TRUE;
5617 if (strcmp(long_option[i].name, "overwrite=") == 0){
5620 preserve_time_f = TRUE;
/* NOTE(review): the malloc result is passed straight to strcpy with no NULL check in between */
5622 backup_suffix = malloc(strlen((char *) p) + 1);
5623 strcpy(backup_suffix, (char *) p);
5626 if (strcmp(long_option[i].name, "in-place") == 0){
5629 preserve_time_f = FALSE;
5632 if (strcmp(long_option[i].name, "in-place=") == 0){
5635 preserve_time_f = FALSE;
/* NOTE(review): the malloc result is passed straight to strcpy with no NULL check in between */
5637 backup_suffix = malloc(strlen((char *) p) + 1);
5638 strcpy(backup_suffix, (char *) p);
5643 if (strcmp(long_option[i].name, "cap-input") == 0){
5647 if (strcmp(long_option[i].name, "url-input") == 0){
5652 #ifdef NUMCHAR_OPTION
5653 if (strcmp(long_option[i].name, "numchar-input") == 0){
5659 if (strcmp(long_option[i].name, "no-output") == 0){
5663 if (strcmp(long_option[i].name, "debug") == 0){
5668 if (strcmp(long_option[i].name, "cp932") == 0){
5669 #ifdef SHIFTJIS_CP932
5673 #ifdef UTF8_OUTPUT_ENABLE
5674 ms_ucs_map_f = UCS_MAP_CP932;
5678 if (strcmp(long_option[i].name, "no-cp932") == 0){
5679 #ifdef SHIFTJIS_CP932
5683 #ifdef UTF8_OUTPUT_ENABLE
5684 ms_ucs_map_f = UCS_MAP_ASCII;
5688 #ifdef SHIFTJIS_CP932
5689 if (strcmp(long_option[i].name, "cp932inv") == 0){
5696 if (strcmp(long_option[i].name, "x0212") == 0){
5703 if (strcmp(long_option[i].name, "exec-in") == 0){
5707 if (strcmp(long_option[i].name, "exec-out") == 0){
5712 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5713 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5714 no_cp932ext_f = TRUE;
5717 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5718 no_best_fit_chars_f = TRUE;
5721 if (strcmp(long_option[i].name, "fb-skip") == 0){
5722 encode_fallback = NULL;
5725 if (strcmp(long_option[i].name, "fb-html") == 0){
5726 encode_fallback = encode_fallback_html;
5729 if (strcmp(long_option[i].name, "fb-xml") == 0){
5730 encode_fallback = encode_fallback_xml;
5733 if (strcmp(long_option[i].name, "fb-java") == 0){
5734 encode_fallback = encode_fallback_java;
5737 if (strcmp(long_option[i].name, "fb-perl") == 0){
5738 encode_fallback = encode_fallback_perl;
5741 if (strcmp(long_option[i].name, "fb-subchar") == 0){
5742 encode_fallback = encode_fallback_subchar;
5745 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
5746 encode_fallback = encode_fallback_subchar;
5747 unicode_subchar = 0;
/* the substitute character may be given in decimal, hexadecimal or octal; i is reused as the digit index here */
5749 /* decimal number */
5750 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
5751 unicode_subchar *= 10;
5752 unicode_subchar += hex2bin(p[i]);
5754 }else if(p[1] == 'x' || p[1] == 'X'){
5755 /* hexadecimal number */
5756 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
5757 unicode_subchar <<= 4;
5758 unicode_subchar |= hex2bin(p[i]);
5762 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
5763 unicode_subchar *= 8;
5764 unicode_subchar += hex2bin(p[i]);
5767 w16e_conv(unicode_subchar, &i, &j);
5768 unicode_subchar = i<<8 | j;
5772 #ifdef UTF8_OUTPUT_ENABLE
5773 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
5774 ms_ucs_map_f = UCS_MAP_MS;
5778 #ifdef UNICODE_NORMALIZATION
5779 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
5784 if (strcmp(long_option[i].name, "prefix=") == 0){
/* p[0] is the prefix character; every following graphic character gets it in prefix_table */
5785 if (nkf_isgraph(p[0])){
5786 for (i = 1; nkf_isgraph(p[i]); i++){
5787 prefix_table[p[i]] = p[0];
5792 #if !defined(PERL_XS) && !defined(WIN32DLL)
5793 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
5798 case 'b': /* buffered mode */
5801 case 'u': /* non buffered mode */
5804 case 't': /* transparent mode */
5809 } else if (*cp=='2') {
5813 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
5821 case 'j': /* JIS output */
5823 output_encoding = nkf_enc_from_index(ISO_2022_JP);
5825 case 'e': /* AT&T EUC output */
5826 output_encoding = nkf_enc_from_index(EUC_JP);
5828 case 's': /* SJIS output */
5829 output_encoding = nkf_enc_from_index(WINDOWS_31J);
5831 case 'l': /* ISO8859 Latin-1 support, no conversion */
5832 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
5833 input_encoding = nkf_enc_from_index(ISO_8859_1);
5835 case 'i': /* Kanji IN ESC-$-@/B */
5836 if (*cp=='@'||*cp=='B')
5837 kanji_intro = *cp++;
5839 case 'o': /* ASCII IN ESC-(-J/B */
5840 if (*cp=='J'||*cp=='B'||*cp=='H')
5841 ascii_intro = *cp++;
5845 bit:1 katakana->hiragana
5846 bit:2 hiragana->katakana
5848 if ('9'>= *cp && *cp>='0')
5849 hira_f |= (*cp++ -'0');
5856 #if defined(MSDOS) || defined(__OS2__)
5863 show_configuration();
5871 #ifdef UTF8_OUTPUT_ENABLE
5872 case 'w': /* UTF-8 output */
5877 output_encoding = nkf_enc_from_index(UTF_8N);
5879 output_bom_f = TRUE;
5880 output_encoding = nkf_enc_from_index(UTF_8_BOM);
5884 if ('1'== cp[0] && '6'==cp[1]) {
5887 } else if ('3'== cp[0] && '2'==cp[1]) {
5891 output_encoding = nkf_enc_from_index(UTF_8);
5896 output_endian = ENDIAN_LITTLE;
5897 } else if (cp[0] == 'B') {
5900 output_encoding = nkf_enc_from_index(enc_idx);
5905 enc_idx = enc_idx == UTF_16
5906 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5907 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5909 output_bom_f = TRUE;
5910 enc_idx = enc_idx == UTF_16
5911 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
5912 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
5914 output_encoding = nkf_enc_from_index(enc_idx);
5918 #ifdef UTF8_INPUT_ENABLE
5919 case 'W': /* UTF input */
5922 input_encoding = nkf_enc_from_index(UTF_8);
5925 if ('1'== cp[0] && '6'==cp[1]) {
5927 input_endian = ENDIAN_BIG;
5929 } else if ('3'== cp[0] && '2'==cp[1]) {
5931 input_endian = ENDIAN_BIG;
5934 input_encoding = nkf_enc_from_index(UTF_8);
5939 input_endian = ENDIAN_LITTLE;
5940 } else if (cp[0] == 'B') {
5942 input_endian = ENDIAN_BIG;
/* NOTE(review): this picks the INPUT encoding but tests output_endian; input_endian, set just above, looks like the intended variable -- confirm */
5944 enc_idx = enc_idx == UTF_16
5945 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5946 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5947 input_encoding = nkf_enc_from_index(enc_idx);
5951 /* Input code assumption */
5952 case 'J': /* ISO-2022-JP input */
5953 input_encoding = nkf_enc_from_index(ISO_2022_JP);
5955 case 'E': /* EUC-JP input */
5956 input_encoding = nkf_enc_from_index(EUC_JP);
5958 case 'S': /* Windows-31J input */
5959 input_encoding = nkf_enc_from_index(WINDOWS_31J);
5961 case 'Z': /* Convert X0208 alphabet to ASCII */
5963 bit:0 Convert JIS X 0208 Alphabet to ASCII
5964 bit:1 Convert Kankaku to one space
5965 bit:2 Convert Kankaku to two spaces
5966 bit:3 Convert HTML Entity
5967 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
5969 while ('0'<= *cp && *cp <='9') {
5970 alpha_f |= 1 << (*cp++ - '0');
5972 if (!alpha_f) alpha_f = 1;
5974 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
5975 x0201_f = FALSE; /* No X0201->X0208 conversion */
5977 ESC-(-I in JIS, EUC, MS Kanji
5978 SI/SO in JIS, EUC, MS Kanji
5979 SS2 in EUC, JIS, not in MS Kanji
5980 MS Kanji (0xa0-0xdf)
5982 ESC-(-I in JIS (0x20-0x5f)
5983 SS2 in EUC (0xa0-0xdf)
5984 0xa0-0xd in MS Kanji (0xa0-0xdf)
5987 case 'X': /* Convert X0201 kana to X0208 */
5990 case 'F': /* preserve new lines */
5991 fold_preserve_f = TRUE;
/* no break: F continues into the f case below to parse the fold length */
5992 case 'f': /* folding -f60 or -f */
5995 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
5997 fold_len += *cp++ - '0';
5999 if (!(0<fold_len && fold_len<BUFSIZ))
6000 fold_len = DEFAULT_FOLD;
6004 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6006 fold_margin += *cp++ - '0';
6010 case 'm': /* MIME support */
6011 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6012 if (*cp=='B'||*cp=='Q') {
6013 mime_decode_mode = *cp++;
6014 mimebuf_f = FIXED_MIME;
6015 } else if (*cp=='N') {
6016 mime_f = TRUE; cp++;
6017 } else if (*cp=='S') {
6018 mime_f = STRICT_MIME; cp++;
6019 } else if (*cp=='0') {
6020 mime_decode_f = FALSE;
6021 mime_f = FALSE; cp++;
6023 mime_f = STRICT_MIME;
6026 case 'M': /* MIME output */
6029 mimeout_f = FIXED_MIME; cp++;
6030 } else if (*cp=='Q') {
6032 mimeout_f = FIXED_MIME; cp++;
6037 case 'B': /* Broken JIS support */
6039 bit:1 allow any x on ESC-(-x or ESC-$-x
6040 bit:2 reset to ascii on NL
6042 if ('9'>= *cp && *cp>='0')
6043 broken_f |= 1<<(*cp++ -'0');
6048 case 'O':/* for Output file */
6052 case 'c':/* add cr code */
6055 case 'd':/* delete cr code */
6058 case 'I': /* ISO-2022-JP output */
6061 case 'L': /* line mode */
6062 if (*cp=='u') { /* unix */
6063 eolmode_f = LF; cp++;
6064 } else if (*cp=='m') { /* mac */
6065 eolmode_f = CR; cp++;
6066 } else if (*cp=='w') { /* windows */
6067 eolmode_f = CRLF; cp++;
6068 } else if (*cp=='0') { /* no conversion */
6069 eolmode_f = 0; cp++;
6074 if ('2' <= *cp && *cp <= '9') {
6077 } else if (*cp == '0' || *cp == '1') {
6086 /* multiple options in a string are allowed for Perl module */
6087 while(*cp && *cp++!='-');
6090 #if !defined(PERL_XS) && !defined(WIN32DLL)
6091 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6093 /* bogus option but ignored */
6101 #include "nkf32dll.c"
6102 #elif defined(PERL_XS)
6103 #else /* WIN32DLL */
/*
 * main: command-line entry point (built only when neither WIN32DLL nor
 * PERL_XS is defined).
 *
 * Leading arguments that start with a hyphen are taken as options.
 * stdin is converted with kanji_convert(), or each remaining argument is
 * opened as an input file.  When file_out_f is TRUE the output goes to a
 * file: either a temporary file next to the original, which is later
 * renamed over it (optionally keeping a backup and restoring permission
 * bits and timestamps), or the fixed name nkf.out.  With guess_f the
 * guessed input code is printed.
 */
6104 int main(int argc, char **argv)
6109 char *outfname = NULL;
6112 #ifdef EASYWIN /*Easy Win */
6113 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6115 #ifdef DEFAULT_CODE_LOCALE
6116 setlocale(LC_CTYPE, "");
/* consume every leading argument that begins with a hyphen */
6118 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6119 cp = (unsigned char *)*argv;
6124 if (pipe(fds) < 0 || (pid = fork()) < 0){
6135 execvp(argv[1], &argv[1]);
/* these flags are saved and restored below */
6152 int debug_f_back = debug_f;
6155 int exec_f_back = exec_f;
6158 int x0212_f_back = x0212_f;
6160 int x0213_f_back = x0213_f;
6161 int guess_f_back = guess_f;
6163 guess_f = guess_f_back;
6166 debug_f = debug_f_back;
6169 exec_f = exec_f_back;
6172 x0212_f = x0212_f_back;
6174 x0213_f = x0213_f_back;
6177 if (binmode_f == TRUE)
6178 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6179 if (freopen("","wb",stdout) == NULL)
6186 setbuf(stdout, (char *) NULL);
6188 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6191 if (binmode_f == TRUE)
6192 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6193 if (freopen("","rb",stdin) == NULL) return (-1);
6197 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
6201 kanji_convert(stdin);
6202 if (guess_f) print_guessed_code(NULL);
6206 int is_argument_error = FALSE;
6208 input_codename = NULL;
6211 iconv_for_check = 0;
/* an input file that cannot be opened is recorded in is_argument_error */
6213 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6215 is_argument_error = TRUE;
6223 /* reopen file for stdout */
6224 if (file_out_f == TRUE) {
/* the temporary output name is built from the original file name */
6227 outfname = malloc(strlen(origfname)
6228 + strlen(".nkftmpXXXXXX")
6234 strcpy(outfname, origfname);
6238 for (i = strlen(outfname); i; --i){
6239 if (outfname[i - 1] == '/'
6240 || outfname[i - 1] == '\\'){
6246 strcat(outfname, "ntXXXXXX");
6248 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6249 S_IREAD | S_IWRITE);
6251 strcat(outfname, ".nkftmpXXXXXX");
6252 fd = mkstemp(outfname);
/* the current stdout is kept in fd_backup and stdout is pointed at the temporary file */
6255 || (fd_backup = dup(fileno(stdout))) < 0
6256 || dup2(fd, fileno(stdout)) < 0
6267 outfname = "nkf.out";
6270 if(freopen(outfname, "w", stdout) == NULL) {
6274 if (binmode_f == TRUE) {
6275 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6276 if (freopen("","wb",stdout) == NULL)
6283 if (binmode_f == TRUE)
6284 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6285 if (freopen("","rb",fin) == NULL)
6290 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6294 char *filename = NULL;
/* the file name is passed to print_guessed_code only when several files are processed */
6296 if (nfiles > 1) filename = origfname;
6297 if (guess_f) print_guessed_code(filename);
6303 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* put the saved stdout descriptor back */
6311 if (dup2(fd_backup, fileno(stdout)) < 0){
6314 if (stat(origfname, &sb)) {
6315 fprintf(stderr, "Can't stat %s\n", origfname);
6317 /* restore the permission bits of the original file */
6318 if (chmod(outfname, sb.st_mode)) {
6319 fprintf(stderr, "Can't set permission %s\n", outfname);
6322 /* restore the timestamps of the original file */
6323 if(preserve_time_f){
6324 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6325 tb[0] = tb[1] = sb.st_mtime;
6326 if (utime(outfname, tb)) {
6327 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6330 tb.actime = sb.st_atime;
6331 tb.modtime = sb.st_mtime;
6332 if (utime(outfname, &tb)) {
6333 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* keep the original under its backup name, then move the converted file into place */
6338 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6340 unlink(backup_filename);
6342 if (rename(origfname, backup_filename)) {
6343 perror(backup_filename);
6344 fprintf(stderr, "Can't rename %s to %s\n",
6345 origfname, backup_filename);
6349 if (unlink(origfname)){
6354 if (rename(outfname, origfname)) {
6356 fprintf(stderr, "Can't rename %s to %s\n",
6357 outfname, origfname);
6364 if (is_argument_error)
6367 #ifdef EASYWIN /*Easy Win */
6368 if (file_out_f == FALSE)
6369 scanf("%d",&end_check);
6372 #else /* for Other OS */
6373 if (file_out_f == TRUE)
6375 #endif /*Easy Win */
6378 #endif /* WIN32DLL */