1 /** Network Kanji Filter. (PDS Version)
2 ** -*- coding: ISO-2022-JP -*-
3 ************************************************************************
4 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
5 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
6 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
7 ** Copyright (C) 1996,1998
9 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
10 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
11 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
12 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
14 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
15 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
16 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
17 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
18 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
19 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
21 ** Everyone is permitted to do anything on this program
22 ** including copying, modifying, improving,
23 ** as long as you don't try to pretend that you wrote it.
24 ** i.e., the above copyright notice has to appear in all copies.
25 ** Binary distribution requires original version messages.
26 ** You don't have to ask before copying, redistribution or publishing.
27 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
28 ***********************************************************************/
30 /***********************************************************************
31 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
32 * http://sourceforge.jp/projects/nkf/
33 ***********************************************************************/
34 #define NKF_IDENT "$Id: nkf.c,v 1.172 2008/02/06 20:46:39 naruse Exp $"
35 #define NKF_VERSION "2.0.8"
36 #define NKF_RELEASE_DATE "2008-02-07"
38 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
39 "Copyright (C) 2002-2008 Kono, Furukawa, Naruse, mastodon"
45 /* state of output_mode and input_mode
121 NKF_ENCODING_TABLE_SIZE,
122 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
123 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
124 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
125 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
126 JIS_X_0208 = 0x1168, /* @B */
127 JIS_X_0212 = 0x1159, /* D */
128 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
129 JIS_X_0213_2 = 0x1229, /* P */
130 JIS_X_0213_1 = 0x1233, /* Q */
134 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
135 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
138 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
139 void j_oconv(nkf_char c2, nkf_char c1);
140 void s_oconv(nkf_char c2, nkf_char c1);
141 void e_oconv(nkf_char c2, nkf_char c1);
142 void w_oconv(nkf_char c2, nkf_char c1);
143 void w_oconv16(nkf_char c2, nkf_char c1);
144 void w_oconv32(nkf_char c2, nkf_char c1);
148 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
149 void (*oconv)(nkf_char c2, nkf_char c1);
150 } nkf_native_encoding;
152 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
153 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
154 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
155 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
156 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
157 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
158 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
163 const nkf_native_encoding *base_encoding;
166 nkf_encoding nkf_encoding_table[] = {
167 {ASCII, "US-ASCII", &NkfEncodingASCII},
168 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
169 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
170 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
171 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
172 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
173 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
175 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
176 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
177 {CP10001, "CP10001", &NkfEncodingShift_JIS},
178 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
179 {CP51932, "CP51932", &NkfEncodingEUC_JP},
180 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
181 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
182 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
183 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
184 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
185 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
186 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
187 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
188 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
189 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
190 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
191 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
192 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
193 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
194 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
195 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
196 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
197 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
198 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
199 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
200 {BINARY, "BINARY", &NkfEncodingASCII},
207 } encoding_name_to_id_table[] = {
210 {"ISO-2022-JP", ISO_2022_JP},
211 {"ISO2022JP-CP932", CP50220},
212 {"CP50220", CP50220},
213 {"CP50221", CP50221},
214 {"CP50222", CP50222},
215 {"ISO-2022-JP-1", ISO_2022_JP_1},
216 {"ISO-2022-JP-3", ISO_2022_JP_3},
217 {"SHIFT_JIS", SHIFT_JIS},
219 {"WINDOWS-31J", WINDOWS_31J},
220 {"CSWINDOWS31J", WINDOWS_31J},
221 {"CP932", WINDOWS_31J},
222 {"MS932", WINDOWS_31J},
223 {"CP10001", CP10001},
226 {"CP51932", CP51932},
227 {"EUC-JP-MS", EUCJP_MS},
228 {"EUCJP-MS", EUCJP_MS},
229 {"EUCJPMS", EUCJP_MS},
230 {"EUC-JP-ASCII", EUCJP_ASCII},
231 {"EUCJP-ASCII", EUCJP_ASCII},
232 {"SHIFT_JISX0213", SHIFT_JISX0213},
233 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
234 {"EUC-JISX0213", EUC_JISX0213},
235 {"EUC-JIS-2004", EUC_JIS_2004},
238 {"UTF-8-BOM", UTF_8_BOM},
239 {"UTF8-MAC", UTF8_MAC},
240 {"UTF-8-MAC", UTF8_MAC},
242 {"UTF-16BE", UTF_16BE},
243 {"UTF-16BE-BOM", UTF_16BE_BOM},
244 {"UTF-16LE", UTF_16LE},
245 {"UTF-16LE-BOM", UTF_16LE_BOM},
247 {"UTF-32BE", UTF_32BE},
248 {"UTF-32BE-BOM", UTF_32BE_BOM},
249 {"UTF-32LE", UTF_32LE},
250 {"UTF-32LE-BOM", UTF_32LE_BOM},
255 #if defined(DEFAULT_CODE_JIS)
256 #define DEFAULT_ENCIDX ISO_2022_JP
257 #elif defined(DEFAULT_CODE_SJIS)
258 #define DEFAULT_ENCIDX SHIFT_JIS
259 #elif defined(DEFAULT_CODE_EUC)
260 #define DEFAULT_ENCIDX EUC_JP
261 #elif defined(DEFAULT_CODE_UTF8)
262 #define DEFAULT_ENCIDX UTF_8
/* True when c is an ASCII letter or digit (locale independent).
 * The argument is parenthesized so that expression arguments such as
 * `cond ? a : b` are classified as a whole; note c is evaluated more
 * than once, so it must not have side effects. */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))
/* I don't trust portability of toupper */
/*
 * Locale-independent ASCII character classification and conversion.
 * Every macro parenthesizes its argument so that expression arguments
 * (e.g. `a | b`, `cond ? x : y`) are treated as a single value.
 * Arguments may be evaluated more than once: do not pass expressions
 * with side effects.
 */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c) <= 'F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; 0 for anything that is not one. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
		    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
		    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the high byte of c2 (taken as an unsigned short) equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, and for characters strictly between SP and DEL other
 * than '?', '=', '_', '(', ')', '.' and '"' (0x22).
 * The argument is evaluated several times; avoid side effects. */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

/* True when c2 lies within [CP932_TABLE_BEGIN, CP932_TABLE_END]. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
291 #define HOLD_SIZE 1024
292 #if defined(INT_IS_SHORT)
293 #define IOBUF_SIZE 2048
295 #define IOBUF_SIZE 16384
298 #define DEFAULT_J 'B'
299 #define DEFAULT_R 'B'
306 /* MIME preprocessor */
308 #ifdef EASYWIN /*Easy Win */
309 extern POINT _BufferSize;
318 void (*status_func)(struct input_code *, nkf_char);
319 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
323 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
324 static nkf_encoding *input_encoding = NULL;
325 static nkf_encoding *output_encoding = NULL;
327 static int kanji_convert(FILE *f);
328 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
330 * 0: Shift_JIS, eucJP-ascii
335 #define UCS_MAP_ASCII 0
337 #define UCS_MAP_CP932 2
338 #define UCS_MAP_CP10001 3
339 static int ms_ucs_map_f = UCS_MAP_ASCII;
341 #ifdef UTF8_INPUT_ENABLE
342 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
343 static int no_cp932ext_f = FALSE;
344 /* ignore ZERO WIDTH NO-BREAK SPACE */
345 static int no_best_fit_chars_f = FALSE;
346 static int input_endian = ENDIAN_BIG;
347 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
348 static void (*encode_fallback)(nkf_char c) = NULL;
349 static void w_status(struct input_code *, nkf_char);
351 #ifdef UTF8_OUTPUT_ENABLE
352 static int output_bom_f = FALSE;
353 static int output_endian = ENDIAN_BIG;
356 static void std_putc(nkf_char c);
357 static nkf_char std_getc(FILE *f);
358 static nkf_char std_ungetc(nkf_char c,FILE *f);
360 static nkf_char broken_getc(FILE *f);
361 static nkf_char broken_ungetc(nkf_char c,FILE *f);
363 static nkf_char mime_getc(FILE *f);
365 static void mime_putc(nkf_char c);
369 #if !defined(PERL_XS) && !defined(WIN32DLL)
370 static unsigned char stdibuf[IOBUF_SIZE];
371 static unsigned char stdobuf[IOBUF_SIZE];
/* File-scope option flags and their initial values. */
375 static int unbuf_f = FALSE;
376 static int estab_f = FALSE;
377 static int nop_f = FALSE;
378 static int binmode_f = TRUE; /* binary mode */
379 static int rot_f = FALSE; /* ROT13/47 mode */
380 static int hira_f = FALSE; /* hiragana/katakana conversion */
381 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
382 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
383 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
384 static int mimebuf_f = FALSE; /* MIME buffered input */
385 static int broken_f = FALSE; /* convert ESC-less broken JIS */
386 static int iso8859_f = FALSE; /* ISO8859 through */
387 static int mimeout_f = FALSE; /* base64 mode */
388 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
389 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
391 #ifdef UNICODE_NORMALIZATION
392 static int nfc_f = FALSE;
393 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
394 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
398 static int cap_f = FALSE;
399 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
400 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
402 static int url_f = FALSE;
403 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
404 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Bit layout helpers for nkf_char values.
 *   PREFIX_EUCG3  - OR-ed into a value by nkf_char_euc3_new().
 *   CLASS_MASK    - top eight bits, holding the class tag.
 *   CLASS_UNICODE - class tag OR-ed in by nkf_char_unicode_new().
 *   VALUE_MASK    - low 24 bits, holding the value itself.
 * NKF_INT32_C is defined outside this view; the limits below are wrapped
 * in it exactly once, so the predicates compare against UNICODE_BMP_MAX /
 * UNICODE_MAX directly instead of wrapping them a second time (which would
 * break if NKF_INT32_C appends a suffix by token pasting).
 * Macro arguments are parenthesized so expression arguments work.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX	NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
419 #ifdef NUMCHAR_OPTION
420 static int numchar_f = FALSE;
421 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
422 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
426 static int noout_f = FALSE;
427 static void no_putc(nkf_char c);
428 static int debug_f = FALSE;
429 static void debug(const char *str);
430 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
433 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
434 static void set_input_codename(char *codename);
437 static int exec_f = 0;
440 #ifdef SHIFTJIS_CP932
441 /* invert IBM extended characters to others */
442 static int cp51932_f = FALSE;
444 /* invert NEC-selected IBM extended characters to IBM extended characters */
445 static int cp932inv_f = TRUE;
447 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
448 #endif /* SHIFTJIS_CP932 */
451 static int x0212_f = FALSE;
453 static int x0213_f = FALSE;
455 static unsigned char prefix_table[256];
457 static void e_status(struct input_code *, nkf_char);
458 static void s_status(struct input_code *, nkf_char);
460 struct input_code input_code_list[] = {
461 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
462 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
463 #ifdef UTF8_INPUT_ENABLE
464 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
469 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
470 static int base64_count = 0;
472 /* X0208 -> ASCII converter */
475 static int f_line = 0; /* chars in line */
476 static int f_prev = 0;
477 static int fold_preserve_f = FALSE; /* preserve new lines */
478 static int fold_f = FALSE;
479 static int fold_len = 0;
482 static unsigned char kanji_intro = DEFAULT_J;
483 static unsigned char ascii_intro = DEFAULT_R;
487 #define FOLD_MARGIN 10
488 #define DEFAULT_FOLD 60
490 static int fold_margin = FOLD_MARGIN;
492 /* process default */
494 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
496 fprintf(stderr,"nkf internal module connection failure.\n");
501 void no_connection(nkf_char c2, nkf_char c1)
503 no_connection2(c2,c1,0);
506 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
507 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
509 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
510 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
511 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
512 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
513 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
514 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
515 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
517 /* static redirections */
519 static void (*o_putc)(nkf_char c) = std_putc;
521 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
522 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
524 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
525 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
527 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
529 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
530 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
532 /* for strict mime */
533 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
534 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
537 static int output_mode = ASCII; /* output kanji mode */
538 static int input_mode = ASCII; /* input kanji mode */
539 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
541 /* X0201 / X0208 conversion tables */
543 /* X0201 kana conversion table */
545 static const unsigned char cv[]= {
546 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
547 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
548 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
549 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
550 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
551 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
552 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
553 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
554 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
555 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
556 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
557 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
558 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
559 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
560 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
561 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
565 /* X0201 kana conversion table for dakuten */
567 static const unsigned char dv[]= {
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
572 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
573 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
574 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
575 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
576 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
577 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
578 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
579 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
586 /* X0201 kana conversion table for han-dakuten */
588 static const unsigned char ev[]= {
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
600 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 /* X0208 kigou conversion table */
609 /* 0x8140 - 0x819e */
610 static const unsigned char fv[] = {
612 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
613 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
614 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
616 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
617 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
618 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
619 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
620 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
622 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
628 static int option_mode = 0;
629 static int file_out_f = FALSE;
631 static int overwrite_f = FALSE;
632 static int preserve_time_f = FALSE;
633 static int backup_f = FALSE;
634 static char *backup_suffix = "";
637 static int eolmode_f = 0; /* CR, LF, CRLF */
638 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
639 static nkf_char prev_cr = 0; /* CR or 0 */
640 #ifdef EASYWIN /*Easy Win */
641 static int end_check;
644 #define STD_GC_BUFSIZE (256)
645 nkf_char std_gc_buf[STD_GC_BUFSIZE];
648 char* nkf_strcpy(const char *str)
650 char* result = malloc(strlen(str) + 1);
659 static void nkf_str_upcase(const char *src, char *dest, size_t length)
662 for (; i < length && src[i]; i++) {
663 dest[i] = nkf_toupper(src[i]);
668 static nkf_encoding *nkf_enc_from_index(int idx)
670 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
673 return &nkf_encoding_table[idx];
676 static int nkf_enc_find_index(const char *name)
679 if (*name == 'X' && *(name+1) == '-') name += 2;
680 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
681 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
682 return encoding_name_to_id_table[i].id;
688 static nkf_encoding *nkf_enc_find(const char *name)
691 idx = nkf_enc_find_index(name);
692 if (idx < 0) return 0;
693 return nkf_enc_from_index(idx);
696 #define nkf_enc_name(enc) (enc)->name
697 #define nkf_enc_to_index(enc) (enc)->id
698 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
699 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
700 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
701 #define nkf_enc_asciicompat(enc) (\
702 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
703 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
704 #define nkf_enc_unicode_p(enc) (\
705 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
706 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
707 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
708 #define nkf_enc_cp5022x_p(enc) (\
709 nkf_enc_to_index(enc) == CP50220 ||\
710 nkf_enc_to_index(enc) == CP50221 ||\
711 nkf_enc_to_index(enc) == CP50222)
713 #ifdef DEFAULT_CODE_LOCALE
/*
 * Return the name of the locale's character map, or NULL when it cannot
 * be determined on this platform.
 *
 * With HAVE_LANGINFO_H the result comes from nl_langinfo(CODESET).
 * On __WIN32__ the name is "CP<n>" built from GetACP(); it is formatted
 * into a static buffer, so the returned pointer is overwritten by the
 * next call and must not be freed.
 */
static char* nkf_locale_charmap(void)
{
#ifdef HAVE_LANGINFO_H
    return nl_langinfo(CODESET);
#elif defined(__WIN32__)
    /* "CP" + at most 11 characters of a formatted int + NUL fits in 16 bytes */
    static char buf[16];
    sprintf(buf, "CP%d", (int)GetACP());
    return buf;
#else
    return NULL;
#endif
}
725 static nkf_encoding* nkf_locale_encoding()
727 nkf_encoding *enc = 0;
728 char *encname = nkf_locale_charmap();
730 enc = nkf_enc_find(encname);
731 if (enc < 0) enc = 0;
734 #endif /* DEFAULT_CODE_LOCALE */
736 static nkf_encoding* nkf_default_encoding()
738 nkf_encoding *enc = 0;
739 #ifdef DEFAULT_CODE_LOCALE
740 enc = nkf_locale_encoding();
742 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
749 #define fprintf dllprintf
754 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
760 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
762 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
763 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
764 #ifdef UTF8_OUTPUT_ENABLE
765 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
767 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
768 #ifdef UTF8_INPUT_ENABLE
769 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
772 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
773 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
774 "r {de/en}crypt ROT13/47\n"
775 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
776 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
777 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
778 "l ISO8859-1 (Latin-1) support\n"
779 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
780 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
781 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
782 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
783 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
784 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
786 "T Text mode output\n"
788 "O Output to File (DEFAULT 'nkf.out')\n"
789 "I Convert non ISO-2022-JP charactor to GETA\n"
790 "d,c Convert line breaks -d: LF -c: CRLF\n"
791 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
792 "v, V Show this usage. V: show configuration\n"
794 "Long name options\n"
795 " --ic=<input codeset> --oc=<output codeset>\n"
796 " Specify the input or output codeset\n"
797 " --fj --unix --mac --windows\n"
798 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
799 " Convert for the system or code\n"
800 " --hiragana --katakana --katakana-hiragana\n"
801 " To Hiragana/Katakana Conversion\n"
802 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
804 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
806 #ifdef NUMCHAR_OPTION
807 " --numchar-input Convert Unicode Character Reference\n"
809 #ifdef UTF8_INPUT_ENABLE
810 " --fb-{skip, html, xml, perl, java, subchar}\n"
811 " Specify how nkf handles unassigned characters\n"
814 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
815 " Overwrite original listed files by filtered result\n"
816 " --overwrite preserves timestamp of original files\n"
818 " -g --guess Guess the input code\n"
819 " --help --version Show this help/the version\n"
820 " For more information, see also man nkf\n"
825 void show_configuration(void)
828 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
831 " Compile-time options:\n"
832 " Compiled at: " __DATE__ " " __TIME__ "\n"
835 " Default output encoding: "
836 #ifdef DEFAULT_CODE_LOCALE
837 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
839 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
845 " Default output end of line: "
846 #if DEFAULT_NEWLINE == CR
848 #elif DEFAULT_NEWLINE == CRLF
854 " Decode MIME encoded string: "
855 #if MIME_DECODE_DEFAULT
861 " Convert JIS X 0201 Katakana: "
868 " --help, --version output: "
869 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build the backup file name for `filename` from `suffix`.
 *
 * If `suffix` contains one or more '*' characters it is used as a
 * template: every '*' is replaced by `filename` and all other characters
 * are copied as they are.  Otherwise the result is `filename` followed by
 * `suffix`.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller must
 * free(), or NULL (after reporting through perror) when memory cannot be
 * allocated.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t asterisk_count = 0;
    size_t total;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
        /* each '*' disappears and is replaced by one copy of filename */
        total = (suffix_length - asterisk_count) + asterisk_count * filename_length;
    } else {
        total = filename_length + suffix_length;
    }

    backup_filename = malloc(total + 1);
    if (!backup_filename) {
        perror("Can't malloc backup filename.");
        return NULL;
    }

    if (!asterisk_count) {
        memcpy(backup_filename, filename, filename_length);
        /* suffix_length + 1 also copies the terminating NUL */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
        return backup_filename;
    }

    for (i = 0, j = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') {
            memcpy(backup_filename + j, filename, filename_length);
            j += filename_length;
        } else {
            backup_filename[j++] = suffix[i];
        }
    }
    backup_filename[j] = '\0';
    return backup_filename;
}
919 #ifdef UTF8_INPUT_ENABLE
920 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
927 (*f)(0, bin2hex(c>>shift));
937 void encode_fallback_html(nkf_char c)
942 if(c >= NKF_INT32_C(1000000))
943 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
944 if(c >= NKF_INT32_C(100000))
945 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
947 (*oconv)(0, 0x30+(c/10000 )%10);
949 (*oconv)(0, 0x30+(c/1000 )%10);
951 (*oconv)(0, 0x30+(c/100 )%10);
953 (*oconv)(0, 0x30+(c/10 )%10);
955 (*oconv)(0, 0x30+ c %10);
960 void encode_fallback_xml(nkf_char c)
965 nkf_each_char_to_hex(oconv, c);
970 void encode_fallback_java(nkf_char c)
974 if(!nkf_char_unicode_bmp_p(c)){
978 (*oconv)(0, bin2hex(c>>20));
979 (*oconv)(0, bin2hex(c>>16));
983 (*oconv)(0, bin2hex(c>>12));
984 (*oconv)(0, bin2hex(c>> 8));
985 (*oconv)(0, bin2hex(c>> 4));
986 (*oconv)(0, bin2hex(c ));
990 void encode_fallback_perl(nkf_char c)
995 nkf_each_char_to_hex(oconv, c);
1000 void encode_fallback_subchar(nkf_char c)
1002 c = unicode_subchar;
1003 (*oconv)((c>>8)&0xFF, c&0xFF);
1008 static const struct {
1032 {"katakana-hiragana","h3"},
1040 #ifdef UTF8_OUTPUT_ENABLE
1050 {"fb-subchar=", ""},
1052 #ifdef UTF8_INPUT_ENABLE
1053 {"utf8-input", "W"},
1054 {"utf16-input", "W16"},
1055 {"no-cp932ext", ""},
1056 {"no-best-fit-chars",""},
1058 #ifdef UNICODE_NORMALIZATION
1059 {"utf8mac-input", ""},
1071 #ifdef NUMCHAR_OPTION
1072 {"numchar-input", ""},
1078 #ifdef SHIFTJIS_CP932
1088 static void set_input_encoding(nkf_encoding *enc)
1090 switch (nkf_enc_to_index(enc)) {
1096 #ifdef SHIFTJIS_CP932
1099 #ifdef UTF8_OUTPUT_ENABLE
1100 ms_ucs_map_f = UCS_MAP_CP932;
1117 #ifdef SHIFTJIS_CP932
1120 #ifdef UTF8_OUTPUT_ENABLE
1121 ms_ucs_map_f = UCS_MAP_CP932;
1127 #ifdef SHIFTJIS_CP932
1130 #ifdef UTF8_OUTPUT_ENABLE
1131 ms_ucs_map_f = UCS_MAP_CP10001;
1135 #ifdef SHIFTJIS_CP932
1138 #ifdef UTF8_OUTPUT_ENABLE
1139 ms_ucs_map_f = UCS_MAP_CP932;
1143 #ifdef SHIFTJIS_CP932
1146 #ifdef UTF8_OUTPUT_ENABLE
1147 ms_ucs_map_f = UCS_MAP_MS;
1151 #ifdef SHIFTJIS_CP932
1154 #ifdef UTF8_OUTPUT_ENABLE
1155 ms_ucs_map_f = UCS_MAP_ASCII;
1158 case SHIFT_JISX0213:
1159 case SHIFT_JIS_2004:
1161 #ifdef SHIFTJIS_CP932
1168 #ifdef SHIFTJIS_CP932
1172 #ifdef UTF8_INPUT_ENABLE
1173 #ifdef UNICODE_NORMALIZATION
1181 input_endian = ENDIAN_BIG;
1185 input_endian = ENDIAN_LITTLE;
1190 input_endian = ENDIAN_BIG;
1194 input_endian = ENDIAN_LITTLE;
1200 static void set_output_encoding(nkf_encoding *enc)
1202 switch (nkf_enc_to_index(enc)) {
1205 #ifdef SHIFTJIS_CP932
1206 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1208 #ifdef UTF8_OUTPUT_ENABLE
1209 ms_ucs_map_f = UCS_MAP_CP932;
1213 #ifdef SHIFTJIS_CP932
1214 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1216 #ifdef UTF8_OUTPUT_ENABLE
1217 ms_ucs_map_f = UCS_MAP_CP932;
1224 #ifdef SHIFTJIS_CP932
1225 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1233 #ifdef SHIFTJIS_CP932
1234 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1240 #ifdef UTF8_OUTPUT_ENABLE
1241 ms_ucs_map_f = UCS_MAP_CP932;
1245 #ifdef UTF8_OUTPUT_ENABLE
1246 ms_ucs_map_f = UCS_MAP_CP10001;
1251 #ifdef SHIFTJIS_CP932
1252 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1254 #ifdef UTF8_OUTPUT_ENABLE
1255 ms_ucs_map_f = UCS_MAP_CP932;
1259 #ifdef SHIFTJIS_CP932
1260 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1262 #ifdef UTF8_OUTPUT_ENABLE
1263 ms_ucs_map_f = UCS_MAP_CP932;
1270 #ifdef UTF8_OUTPUT_ENABLE
1271 ms_ucs_map_f = UCS_MAP_MS;
1278 #ifdef UTF8_OUTPUT_ENABLE
1279 ms_ucs_map_f = UCS_MAP_ASCII;
1282 case SHIFT_JISX0213:
1283 case SHIFT_JIS_2004:
1285 #ifdef SHIFTJIS_CP932
1286 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1295 #ifdef SHIFTJIS_CP932
1296 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1299 #ifdef UTF8_OUTPUT_ENABLE
1301 output_bom_f = TRUE;
1305 output_bom_f = TRUE;
1308 output_endian = ENDIAN_LITTLE;
1309 output_bom_f = FALSE;
1312 output_endian = ENDIAN_LITTLE;
1313 output_bom_f = TRUE;
1316 output_bom_f = TRUE;
1319 output_endian = ENDIAN_LITTLE;
1320 output_bom_f = FALSE;
1323 output_endian = ENDIAN_LITTLE;
1324 output_bom_f = TRUE;
1330 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1333 struct input_code *p = input_code_list;
1335 if (iconv_func == p->iconv_func){
1344 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1346 #ifdef INPUT_CODE_FIX
1347 if (f || !input_encoding)
1354 #ifdef INPUT_CODE_FIX
1355 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1361 if (estab_f && iconv_for_check != iconv){
1362 struct input_code *p = find_inputcode_byfunc(iconv);
1364 set_input_codename(p->name);
1367 iconv_for_check = iconv;
1373 nkf_char x0212_shift(nkf_char c)
1378 if (0x75 <= c && c <= 0x7f){
1379 ret = c + (0x109 - 0x75);
1382 if (0x75 <= c && c <= 0x7f){
1383 ret = c + (0x113 - 0x75);
1390 nkf_char x0212_unshift(nkf_char c)
1393 if (0x7f <= c && c <= 0x88){
1394 ret = c + (0x75 - 0x7f);
1395 }else if (0x89 <= c && c <= 0x92){
1396 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1400 #endif /* X0212_ENABLE */
1402 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1408 if((0x21 <= ndx && ndx <= 0x2F)){
1409 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1410 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1412 }else if(0x6E <= ndx && ndx <= 0x7E){
1413 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1414 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1420 else if(nkf_isgraph(ndx)){
1422 const unsigned short *ptr;
1423 ptr = x0212_shiftjis[ndx - 0x21];
1425 val = ptr[(c1 & 0x7f) - 0x21];
1434 c2 = x0212_shift(c2);
1436 #endif /* X0212_ENABLE */
1438 if(0x7F < c2) return 1;
1439 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1440 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1444 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1446 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1449 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1450 #ifdef SHIFTJIS_CP932
1451 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1452 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1459 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1460 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1466 #endif /* SHIFTJIS_CP932 */
1468 if (!x0213_f && is_ibmext_in_sjis(c2)){
1469 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1472 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1485 if(x0213_f && c2 >= 0xF0){
1486 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1487 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1488 }else{ /* 78<=k<=94 */
1489 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1490 if (0x9E < c1) c2++;
1493 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1494 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1495 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1496 if (0x9E < c1) c2++;
1499 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1506 c2 = x0212_unshift(c2);
1513 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * Encode the code point `val` as UTF-8.  Each output byte value is stored
 * through p1..p4 in order; the branches below select the two-, three- or
 * four-byte form from the magnitude of `val`.
 */
1514 void nkf_unicode_to_utf8(nkf_char val, int *p1, int *p2, int *p3, int *p4)
/* two-byte form: 110xxxxx 10xxxxxx */
1522 }else if (val < 0x800){
1523 *p1 = 0xc0 | (val >> 6);
1524 *p2 = 0x80 | (val & 0x3f);
/* three-byte form: 1110xxxx 10xxxxxx 10xxxxxx */
1527 } else if (nkf_char_unicode_bmp_p(val)) {
1528 *p1 = 0xe0 | (val >> 12);
1529 *p2 = 0x80 | ((val >> 6) & 0x3f);
1530 *p3 = 0x80 | ( val & 0x3f);
/* four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
1532 } else if (nkf_char_unicode_value_p(val)) {
/*
 * NOTE(review): RFC 3629 defines the four-byte lead byte as
 * 0xf0 | (val >> 18).  The statement below uses the three-byte marker 0xe0
 * and a shift of 16, while the next byte already takes bits 12..17, so bits
 * 16..17 are emitted twice and the sequence is not valid UTF-8.  Looks like
 * it should read `*p1 = 0xf0 | (val >> 18);` -- confirm and fix.
 */
1533 *p1 = 0xe0 | (val >> 16);
1534 *p2 = 0x80 | ((val >> 12) & 0x3f);
1535 *p3 = 0x80 | ((val >> 6) & 0x3f);
1536 *p4 = 0x80 | ( val & 0x3f);
/*
 * Decode one UTF-8 sequence (lead byte c1, following bytes c2..c4) into a
 * code point accumulated in `wc`.  The branch taken is chosen by comparing
 * the lead byte against ascending upper bounds.
 */
1545 nkf_char nkf_utf8_to_unicode(int c1, int c2, int c3, int c4)
/*
 * NOTE(review): 0xC2 and 0xC3 are valid two-byte lead bytes (U+0080..U+00FF)
 * in UTF-8; only 0x80..0xC1 are trail bytes or overlong leads.  The bound
 * 0xC3 classes them as "trail byte or invalid" -- presumably 0xC1 was
 * intended; confirm.
 */
1552 else if (c1 <= 0xC3) {
1553 /* trail byte or invalid */
/* two-byte sequence: five payload bits from the lead byte */
1556 else if (c1 <= 0xDF) {
1558 wc = (c1 & 0x1F) << 6;
/* three-byte sequence: four payload bits from the lead byte */
1561 else if (c1 <= 0xEF) {
1563 wc = (c1 & 0x0F) << 12;
1564 wc |= (c2 & 0x3F) << 6;
/*
 * NOTE(review): this guard tests c2, whereas every other branch of the chain
 * tests the lead byte c1.  A well-formed continuation byte is <= 0xBF, so the
 * test does not reject lead bytes above 0xF4 -- presumably `c1 <= 0xF4` was
 * intended; confirm.
 */
1567 else if (c2 <= 0xF4) {
1569 wc = (c1 & 0x0F) << 18;
1570 wc |= (c2 & 0x3F) << 12;
1571 wc |= (c3 & 0x3F) << 6;
1581 #ifdef UTF8_INPUT_ENABLE
1582 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
1585 const unsigned short *p;
1588 if (pp == 0) return 1;
1591 if (c1 < 0 || psize <= c1) return 1;
1593 if (p == 0) return 1;
1596 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1598 if (val == 0) return 1;
1599 if (no_cp932ext_f && (
1600 (val>>8) == 0x2D || /* NEC special characters */
1601 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1609 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1616 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1618 const unsigned short *const *pp;
1619 const unsigned short *const *const *ppp;
1620 static const char no_best_fit_chars_table_C2[] =
1621 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1622 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1623 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1624 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1625 static const char no_best_fit_chars_table_C2_ms[] =
1626 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1627 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1628 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1629 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1630 static const char no_best_fit_chars_table_932_C2[] =
1631 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1632 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1633 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1634 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1635 static const char no_best_fit_chars_table_932_C3[] =
1636 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1637 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1638 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1639 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1645 }else if(c2 < 0xe0){
1646 if(no_best_fit_chars_f){
1647 if(ms_ucs_map_f == UCS_MAP_CP932){
1650 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1653 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1656 }else if(!cp932inv_f){
1659 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1662 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1665 }else if(ms_ucs_map_f == UCS_MAP_MS){
1666 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1667 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1685 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1686 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1687 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1689 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1690 }else if(c0 < 0xF0){
1691 if(no_best_fit_chars_f){
1692 if(ms_ucs_map_f == UCS_MAP_CP932){
1693 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1694 }else if(ms_ucs_map_f == UCS_MAP_MS){
1699 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1702 if(c0 == 0x92) return 1;
1707 if(c1 == 0x80 || c0 == 0x9C) return 1;
1710 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1715 if(c0 == 0x94) return 1;
1718 if(c0 == 0xBB) return 1;
1728 if(c0 == 0x95) return 1;
1731 if(c0 == 0xA5) return 1;
1738 if(c0 == 0x8D) return 1;
1741 if(c0 == 0x9E && !cp932inv_f) return 1;
1744 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1752 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1753 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1754 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1756 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1758 #ifdef SHIFTJIS_CP932
1759 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1761 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1762 s2e_conv(s2, s1, p2, p1);
1771 #ifdef UTF8_OUTPUT_ENABLE
1772 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
1774 const unsigned short *p;
1776 if (c2 == JIS_X_0201_1976_K) {
1777 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1785 p = euc_to_utf8_1byte;
1787 } else if (is_eucg3(c2)){
1788 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1791 c2 = (c2&0x7f) - 0x21;
1792 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1793 p = x0212_to_utf8_2bytes[c2];
1799 c2 = (c2&0x7f) - 0x21;
1800 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1802 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1803 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1804 euc_to_utf8_2bytes_ms[c2];
1809 c1 = (c1 & 0x7f) - 0x21;
1810 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
1816 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1823 }else if (0xc0 <= c2 && c2 <= 0xef) {
1824 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1825 #ifdef NUMCHAR_OPTION
1828 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1836 #ifdef UTF8_INPUT_ENABLE
1837 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1846 else if (nkf_char_unicode_bmp_p(val)){
1847 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1848 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1851 *p1 = nkf_char_unicode_new(val);
1857 *p1 = nkf_char_unicode_new(val);
1863 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1865 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1866 if (iso2022jp_f && !x0201_f) {
1867 c2 = GETA1; c1 = GETA2;
1869 c2 = JIS_X_0201_1976_K;
1873 }else if (c2 == 0x8f){
1877 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
1878 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1879 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
1882 c2 = (c2 << 8) | (c1 & 0x7f);
1884 #ifdef SHIFTJIS_CP932
1887 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1888 s2e_conv(s2, s1, &c2, &c1);
1895 #endif /* SHIFTJIS_CP932 */
1897 #endif /* X0212_ENABLE */
1898 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
1901 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
1902 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1903 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
1908 #ifdef SHIFTJIS_CP932
1909 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
1911 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1912 s2e_conv(s2, s1, &c2, &c1);
1919 #endif /* SHIFTJIS_CP932 */
1926 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1928 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
1929 if (iso2022jp_f && !x0201_f) {
1930 c2 = GETA1; c1 = GETA2;
1934 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
1936 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
1938 if(c1 == 0x7F) return 0;
1939 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
1942 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
1943 if (ret) return ret;
1949 nkf_char w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
1951 nkf_char ret = 0, c4 = 0;
1952 static const char w_iconv_utf8_1st_byte[] =
1954 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1955 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1956 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
1957 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
1964 if (c1 < 0 || 0xff < c1) {
1965 }else if (c1 == 0) { /* 0 : 1 byte*/
1967 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
1970 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
1972 if (c2 < 0x80 || 0xBF < c2) return 0;
1975 if (c3 == 0) return -1;
1976 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
1981 if (c3 == 0) return -1;
1982 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
1986 if (c3 == 0) return -1;
1987 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
1991 if (c3 == 0) return -2;
1992 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
1996 if (c3 == 0) return -2;
1997 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2001 if (c3 == 0) return -2;
2002 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2010 if (c1 == 0 || c1 == EOF){
2011 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2012 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2015 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2023 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * Classify the code point `wc` and convert it: values below 0xFFFF go through
 * w16e_conv(), larger values below 0x10FFFF are wrapped with
 * nkf_char_unicode_new(), and rejected values yield
 * NKF_ICONV_INVALID_CODE_RANGE.
 *
 * NOTE(review): the return type is size_t but NKF_ICONV_INVALID_CODE_RANGE is
 * a negative constant, so it is converted to a large unsigned value on
 * return; callers must compare against the same converted constant or cast
 * back to a signed type -- verify against the callers.
 */
2024 static size_t unicode_iconv(nkf_char wc)
/*
 * NOTE(review): (wc>>3) == 27 holds exactly for wc in 0xD8..0xDF, which are
 * ordinary Latin-1 code points.  The UTF-16 surrogate range U+D800..U+DFFF
 * corresponds to (wc>>11) == 27.  As written, U+00D8..U+00DF are rejected and
 * real surrogates are not caught here -- confirm and fix the shift count.
 */
2032 }else if ((wc>>3) == 27) {
2033 /* unpaired surrogate */
2034 return NKF_ICONV_INVALID_CODE_RANGE;
/* NOTE(review): strict `<` sends U+FFFF to the next branch rather than w16e_conv() -- confirm intent. */
2035 }else if (wc < 0xFFFF) {
2036 ret = w16e_conv(wc, &c2, &c1);
2037 if (ret) return ret;
/* NOTE(review): strict `<` excludes U+10FFFF, the last valid code point, which is then reported as out of range -- confirm intent. */
2038 }else if (wc < 0x10FFFF) {
2040 c1 = nkf_char_unicode_new(wc);
2042 return NKF_ICONV_INVALID_CODE_RANGE;
2048 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2049 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
2050 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
2051 size_t nkf_iconv_utf_16(int c1, int c2, int c3, int c4)
2060 if (input_endian == ENDIAN_BIG) {
2061 if (0xD8 <= c1 && c1 <= 0xDB) {
2062 if (0xDC <= c3 && c3 <= 0xDF) {
2063 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2064 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2069 if (0xD8 <= c2 && c2 <= 0xDB) {
2070 if (0xDC <= c4 && c4 <= 0xDF) {
2071 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2072 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2078 return (*unicode_iconv)(wc);
2081 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2086 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
2091 size_t nkf_iconv_utf_32(int c1, int c2, int c3, int c4)
2100 switch(input_endian){
2102 wc = c2 << 16 | c3 << 8 | c4;
2105 wc = c3 << 16 | c2 << 8 | c1;
2108 wc = c1 << 16 | c4 << 8 | c3;
2111 wc = c4 << 16 | c1 << 8 | c2;
2114 return NKF_ICONV_INVALID_CODE_RANGE;
2117 return (*unicode_iconv)(wc);
2121 #define output_ascii_escape_sequence(mode) do { \
2122 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2125 (*o_putc)(ascii_intro); \
2126 output_mode = mode; \
2130 void output_escape_sequence(int mode)
2132 if (output_mode == mode)
2140 case JIS_X_0201_1976_K:
2148 (*o_putc)(kanji_intro);
2160 (*o_putc)('O'); /* TODO */
2172 void j_oconv(nkf_char c2, nkf_char c1)
2174 #ifdef NUMCHAR_OPTION
2175 if (c2 == 0 && nkf_char_unicode_p(c1)){
2176 w16e_conv(c1, &c2, &c1);
2177 if (c2 == 0 && nkf_char_unicode_p(c1)){
2178 c2 = c1 & VALUE_MASK;
2179 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2182 c2 = 0x7F + c1 / 94;
2183 c1 = 0x21 + c1 % 94;
2185 if (encode_fallback) (*encode_fallback)(c1);
2192 output_ascii_escape_sequence(ASCII);
2195 else if (c2 == EOF) {
2196 output_ascii_escape_sequence(ASCII);
2199 else if (c2 == ISO_8859_1) {
2200 output_ascii_escape_sequence(ISO_8859_1);
2203 else if (c2 == JIS_X_0201_1976_K) {
2204 output_escape_sequence(JIS_X_0201_1976_K);
2207 } else if (is_eucg3(c2)){
2208 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2209 (*o_putc)(c2 & 0x7f);
2214 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2215 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2216 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2222 void e_oconv(nkf_char c2, nkf_char c1)
2224 #ifdef NUMCHAR_OPTION
2225 if (c2 == 0 && nkf_char_unicode_p(c1)){
2226 w16e_conv(c1, &c2, &c1);
2227 if (c2 == 0 && nkf_char_unicode_p(c1)){
2228 c2 = c1 & VALUE_MASK;
2229 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2233 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2234 c1 = 0x21 + c1 % 94;
2237 (*o_putc)((c2 & 0x7f) | 0x080);
2238 (*o_putc)(c1 | 0x080);
2240 (*o_putc)((c2 & 0x7f) | 0x080);
2241 (*o_putc)(c1 | 0x080);
2245 if (encode_fallback) (*encode_fallback)(c1);
2253 } else if (c2 == 0) {
2254 output_mode = ASCII;
2256 } else if (c2 == JIS_X_0201_1976_K) {
2257 output_mode = EUC_JP;
2258 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2259 } else if (c2 == ISO_8859_1) {
2260 output_mode = ISO_8859_1;
2261 (*o_putc)(c1 | 0x080);
2263 } else if (is_eucg3(c2)){
2264 output_mode = EUC_JP;
2265 #ifdef SHIFTJIS_CP932
2268 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2269 s2e_conv(s2, s1, &c2, &c1);
2274 output_mode = ASCII;
2276 }else if (is_eucg3(c2)){
2279 (*o_putc)((c2 & 0x7f) | 0x080);
2280 (*o_putc)(c1 | 0x080);
2283 (*o_putc)((c2 & 0x7f) | 0x080);
2284 (*o_putc)(c1 | 0x080);
2288 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2289 set_iconv(FALSE, 0);
2290 return; /* too late to rescue this char */
2292 output_mode = EUC_JP;
2293 (*o_putc)(c2 | 0x080);
2294 (*o_putc)(c1 | 0x080);
2298 void s_oconv(nkf_char c2, nkf_char c1)
2300 #ifdef NUMCHAR_OPTION
2301 if (c2 == 0 && nkf_char_unicode_p(c1)){
2302 w16e_conv(c1, &c2, &c1);
2303 if (c2 == 0 && nkf_char_unicode_p(c1)){
2304 c2 = c1 & VALUE_MASK;
2305 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2308 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2310 c1 += 0x40 + (c1 > 0x3e);
2315 if(encode_fallback)(*encode_fallback)(c1);
2324 } else if (c2 == 0) {
2325 output_mode = ASCII;
2327 } else if (c2 == JIS_X_0201_1976_K) {
2328 output_mode = SHIFT_JIS;
2330 } else if (c2 == ISO_8859_1) {
2331 output_mode = ISO_8859_1;
2332 (*o_putc)(c1 | 0x080);
2334 } else if (is_eucg3(c2)){
2335 output_mode = SHIFT_JIS;
2336 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2342 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2343 set_iconv(FALSE, 0);
2344 return; /* too late to rescue this char */
2346 output_mode = SHIFT_JIS;
2347 e2s_conv(c2, c1, &c2, &c1);
2349 #ifdef SHIFTJIS_CP932
2351 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2352 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2358 #endif /* SHIFTJIS_CP932 */
2361 if (prefix_table[(unsigned char)c1]){
2362 (*o_putc)(prefix_table[(unsigned char)c1]);
2368 #ifdef UTF8_OUTPUT_ENABLE
2369 void w_oconv(nkf_char c2, nkf_char c1)
2375 output_bom_f = FALSE;
2386 if (c2 == 0 && nkf_char_unicode_p(c1)){
2387 val = c1 & VALUE_MASK;
2388 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2390 if (c2) (*o_putc)(c2);
2391 if (c3) (*o_putc)(c3);
2392 if (c4) (*o_putc)(c4);
2398 } else if (c2 == ISO_8859_1) {
2399 (*o_putc)(0xC2 + (c1 >= 0x40));
2400 (*o_putc)(c1 + 0x40);
2402 val = e2w_conv(c2, c1);
2404 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2406 if (c2) (*o_putc)(c2);
2407 if (c3) (*o_putc)(c3);
2408 if (c4) (*o_putc)(c4);
2413 void w_oconv16(nkf_char c2, nkf_char c1)
2416 output_bom_f = FALSE;
2417 if (output_endian == ENDIAN_LITTLE){
2431 if (c2 == ISO_8859_1) {
2434 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2435 if (nkf_char_unicode_bmp_p(c1)) {
2436 c2 = (c1 >> 8) & 0xff;
2440 if (c1 <= UNICODE_MAX) {
2441 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2442 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2443 if (output_endian == ENDIAN_LITTLE){
2444 (*o_putc)(c2 & 0xff);
2445 (*o_putc)((c2 >> 8) & 0xff);
2446 (*o_putc)(c1 & 0xff);
2447 (*o_putc)((c1 >> 8) & 0xff);
2449 (*o_putc)((c2 >> 8) & 0xff);
2450 (*o_putc)(c2 & 0xff);
2451 (*o_putc)((c1 >> 8) & 0xff);
2452 (*o_putc)(c1 & 0xff);
2458 nkf_char val = e2w_conv(c2, c1);
2459 c2 = (val >> 8) & 0xff;
2463 if (output_endian == ENDIAN_LITTLE){
2472 void w_oconv32(nkf_char c2, nkf_char c1)
2475 output_bom_f = FALSE;
2476 if (output_endian == ENDIAN_LITTLE){
2494 if (c2 == ISO_8859_1) {
2496 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2499 c1 = e2w_conv(c2, c1);
2502 if (output_endian == ENDIAN_LITTLE){
2503 (*o_putc)( c1 & 0xFF);
2504 (*o_putc)((c1 >> 8) & 0xFF);
2505 (*o_putc)((c1 >> 16) & 0xFF);
2509 (*o_putc)((c1 >> 16) & 0xFF);
2510 (*o_putc)((c1 >> 8) & 0xFF);
2511 (*o_putc)( c1 & 0xFF);
2516 #define SCORE_L2 (1) /*
JIS level-2 kanji */
2517 #define SCORE_KANA (SCORE_L2 << 1) /*
so-called half-width kana */
2518 #define SCORE_DEPEND (SCORE_KANA << 1) /*
platform-dependent characters */
2519 #define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932
remapping (IBM extended characters) */
2520 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2521 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /*
nonexistent character */
2522 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME
designation */
2523 #define SCORE_ERROR (SCORE_iMIME << 1) /*
error */
2525 #define SCORE_INIT (SCORE_iMIME)
2527 static const char score_table_A0[] = {
2530 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2531 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score flags indexed by the low nibble of a lead byte; code_score() looks
 * this table up with `c2 & 0x0f` when (c2 & 0x70) == 0x70.
 *
 * NOTE(review): SCORE_ERROR is SCORE_L2 shifted left seven times, i.e. 128.
 * The element type is plain `char`; where `char` is a signed 8-bit type that
 * value does not fit and is typically stored as -128, which then widens to a
 * negative nkf_char when passed to set_code_score() and ORs in every higher
 * bit of the score.  Consider an element type that can hold 128 (for example
 * `unsigned char` or `nkf_char`) -- confirm against the type of ptr->score.
 */
2534 static const char score_table_F0[] = {
2535 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2536 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2537 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2538 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2541 void set_code_score(struct input_code *ptr, nkf_char score)
2544 ptr->score |= score;
2548 void clr_code_score(struct input_code *ptr, nkf_char score)
2551 ptr->score &= ~score;
2555 void code_score(struct input_code *ptr)
2557 nkf_char c2 = ptr->buf[0];
2558 #ifdef UTF8_OUTPUT_ENABLE
2559 nkf_char c1 = ptr->buf[1];
2562 set_code_score(ptr, SCORE_ERROR);
2563 }else if (c2 == SS2){
2564 set_code_score(ptr, SCORE_KANA);
2565 }else if (c2 == 0x8f){
2566 set_code_score(ptr, SCORE_X0212);
2567 #ifdef UTF8_OUTPUT_ENABLE
2568 }else if (!e2w_conv(c2, c1)){
2569 set_code_score(ptr, SCORE_NO_EXIST);
2571 }else if ((c2 & 0x70) == 0x20){
2572 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2573 }else if ((c2 & 0x70) == 0x70){
2574 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2575 }else if ((c2 & 0x70) >= 0x50){
2576 set_code_score(ptr, SCORE_L2);
2580 void status_disable(struct input_code *ptr)
2585 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2588 void status_push_ch(struct input_code *ptr, nkf_char c)
2590 ptr->buf[ptr->index++] = c;
2593 void status_clear(struct input_code *ptr)
2599 void status_reset(struct input_code *ptr)
2602 ptr->score = SCORE_INIT;
2605 void status_reinit(struct input_code *ptr)
2608 ptr->_file_stat = 0;
2611 void status_check(struct input_code *ptr, nkf_char c)
2613 if (c <= DEL && estab_f){
2618 void s_status(struct input_code *ptr, nkf_char c)
2622 status_check(ptr, c);
2627 }else if (nkf_char_unicode_p(c)){
2629 }else if (0xa1 <= c && c <= 0xdf){
2630 status_push_ch(ptr, SS2);
2631 status_push_ch(ptr, c);
2634 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2636 status_push_ch(ptr, c);
2637 }else if (0xed <= c && c <= 0xee){
2639 status_push_ch(ptr, c);
2640 #ifdef SHIFTJIS_CP932
2641 }else if (is_ibmext_in_sjis(c)){
2643 status_push_ch(ptr, c);
2644 #endif /* SHIFTJIS_CP932 */
2646 }else if (0xf0 <= c && c <= 0xfc){
2648 status_push_ch(ptr, c);
2649 #endif /* X0212_ENABLE */
2651 status_disable(ptr);
2655 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2656 status_push_ch(ptr, c);
2657 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2661 status_disable(ptr);
2665 #ifdef SHIFTJIS_CP932
2666 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2667 status_push_ch(ptr, c);
2668 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2669 set_code_score(ptr, SCORE_CP932);
2674 #endif /* SHIFTJIS_CP932 */
2675 status_disable(ptr);
2678 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2679 status_push_ch(ptr, c);
2680 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2681 set_code_score(ptr, SCORE_CP932);
2684 status_disable(ptr);
2690 void e_status(struct input_code *ptr, nkf_char c)
2694 status_check(ptr, c);
2699 }else if (nkf_char_unicode_p(c)){
2701 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2703 status_push_ch(ptr, c);
2705 }else if (0x8f == c){
2707 status_push_ch(ptr, c);
2708 #endif /* X0212_ENABLE */
2710 status_disable(ptr);
2714 if (0xa1 <= c && c <= 0xfe){
2715 status_push_ch(ptr, c);
2719 status_disable(ptr);
2724 if (0xa1 <= c && c <= 0xfe){
2726 status_push_ch(ptr, c);
2728 status_disable(ptr);
2730 #endif /* X0212_ENABLE */
2734 #ifdef UTF8_INPUT_ENABLE
2735 void w_status(struct input_code *ptr, nkf_char c)
2739 status_check(ptr, c);
2744 }else if (nkf_char_unicode_p(c)){
2746 }else if (0xc0 <= c && c <= 0xdf){
2748 status_push_ch(ptr, c);
2749 }else if (0xe0 <= c && c <= 0xef){
2751 status_push_ch(ptr, c);
2752 }else if (0xf0 <= c && c <= 0xf4){
2754 status_push_ch(ptr, c);
2756 status_disable(ptr);
2761 if (0x80 <= c && c <= 0xbf){
2762 status_push_ch(ptr, c);
2763 if (ptr->index > ptr->stat){
2764 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2765 && ptr->buf[2] == 0xbf);
2766 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2767 &ptr->buf[0], &ptr->buf[1]);
2774 status_disable(ptr);
2778 if (0x80 <= c && c <= 0xbf){
2779 if (ptr->index < ptr->stat){
2780 status_push_ch(ptr, c);
2785 status_disable(ptr);
2792 void code_status(nkf_char c)
2794 int action_flag = 1;
2795 struct input_code *result = 0;
2796 struct input_code *p = input_code_list;
2798 if (!p->status_func) {
2802 if (!p->status_func)
2804 (p->status_func)(p, c);
2807 }else if(p->stat == 0){
2818 if (result && !estab_f){
2819 set_iconv(TRUE, result->iconv_func);
2820 }else if (c <= DEL){
2821 struct input_code *ptr = input_code_list;
2831 nkf_char std_getc(FILE *f)
2834 return std_gc_buf[--std_gc_ndx];
2840 nkf_char std_ungetc(nkf_char c, FILE *f)
2842 if (std_gc_ndx == STD_GC_BUFSIZE){
2845 std_gc_buf[std_gc_ndx++] = c;
2850 void std_putc(nkf_char c)
2857 static unsigned char hold_buf[HOLD_SIZE*2];
2858 static int hold_count = 0;
2859 nkf_char push_hold_buf(nkf_char c2)
2861 if (hold_count >= HOLD_SIZE*2)
2863 hold_buf[hold_count++] = (unsigned char)c2;
2864 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
2867 static int h_conv(FILE *f, int c1, int c2)
2873 /** it must NOT be in the kanji shift sequence */
2874 /** it must NOT be written in JIS7 */
2875 /** and it must be after 2 byte 8bit code */
2881 while ((c2 = (*i_getc)(f)) != EOF) {
2887 if (push_hold_buf(c2) == EOF || estab_f) {
2893 struct input_code *p = input_code_list;
2894 struct input_code *result = p;
2899 if (p->status_func && p->score < result->score) {
2904 set_iconv(TRUE, result->iconv_func);
2909 ** 1) EOF is detected, or
2910 ** 2) Code is established, or
2911 ** 3) Buffer is FULL (but last word is pushed)
2913 ** in 1) and 3) cases, we continue to use
2914 ** Kanji codes by oconv and leave estab_f unchanged.
2919 while (hold_index < hold_count){
2920 c1 = hold_buf[hold_index++];
2924 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
2925 (*iconv)(JIS_X_0201_1976_K, c1, 0);
2928 if (hold_index < hold_count){
2929 c2 = hold_buf[hold_index++];
2939 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
2942 if (hold_index < hold_count){
2943 c3 = hold_buf[hold_index++];
2944 } else if ((c3 = (*i_getc)(f)) == EOF) {
2949 if (hold_index < hold_count){
2950 c4 = hold_buf[hold_index++];
2951 } else if ((c4 = (*i_getc)(f)) == EOF) {
2956 (*iconv)(c1, c2, (c3<<8)|c4);
2961 /* 3 bytes EUC or UTF-8 */
2962 if (hold_index < hold_count){
2963 c3 = hold_buf[hold_index++];
2964 } else if ((c3 = (*i_getc)(f)) == EOF) {
2970 (*iconv)(c1, c2, c3);
2973 if (c3 == EOF) break;
2979 * Check and Ignore BOM
2981 void check_bom(FILE *f)
2984 switch(c2 = (*i_getc)(f)){
2986 if((c2 = (*i_getc)(f)) == 0x00){
2987 if((c2 = (*i_getc)(f)) == 0xFE){
2988 if((c2 = (*i_getc)(f)) == 0xFF){
2989 if(!input_encoding){
2990 set_iconv(TRUE, w_iconv32);
2992 if (iconv == w_iconv32) {
2993 input_endian = ENDIAN_BIG;
2996 (*i_ungetc)(0xFF,f);
2997 }else (*i_ungetc)(c2,f);
2998 (*i_ungetc)(0xFE,f);
2999 }else if(c2 == 0xFF){
3000 if((c2 = (*i_getc)(f)) == 0xFE){
3001 if(!input_encoding){
3002 set_iconv(TRUE, w_iconv32);
3004 if (iconv == w_iconv32) {
3005 input_endian = ENDIAN_2143;
3008 (*i_ungetc)(0xFF,f);
3009 }else (*i_ungetc)(c2,f);
3010 (*i_ungetc)(0xFF,f);
3011 }else (*i_ungetc)(c2,f);
3012 (*i_ungetc)(0x00,f);
3013 }else (*i_ungetc)(c2,f);
3014 (*i_ungetc)(0x00,f);
3017 if((c2 = (*i_getc)(f)) == 0xBB){
3018 if((c2 = (*i_getc)(f)) == 0xBF){
3019 if(!input_encoding){
3020 set_iconv(TRUE, w_iconv);
3022 if (iconv == w_iconv) {
3025 (*i_ungetc)(0xBF,f);
3026 }else (*i_ungetc)(c2,f);
3027 (*i_ungetc)(0xBB,f);
3028 }else (*i_ungetc)(c2,f);
3029 (*i_ungetc)(0xEF,f);
3032 if((c2 = (*i_getc)(f)) == 0xFF){
3033 if((c2 = (*i_getc)(f)) == 0x00){
3034 if((c2 = (*i_getc)(f)) == 0x00){
3035 if(!input_encoding){
3036 set_iconv(TRUE, w_iconv32);
3038 if (iconv == w_iconv32) {
3039 input_endian = ENDIAN_3412;
3042 (*i_ungetc)(0x00,f);
3043 }else (*i_ungetc)(c2,f);
3044 (*i_ungetc)(0x00,f);
3045 }else (*i_ungetc)(c2,f);
3046 if(!input_encoding){
3047 set_iconv(TRUE, w_iconv16);
3049 if (iconv == w_iconv16) {
3050 input_endian = ENDIAN_BIG;
3053 (*i_ungetc)(0xFF,f);
3054 }else (*i_ungetc)(c2,f);
3055 (*i_ungetc)(0xFE,f);
3058 if((c2 = (*i_getc)(f)) == 0xFE){
3059 if((c2 = (*i_getc)(f)) == 0x00){
3060 if((c2 = (*i_getc)(f)) == 0x00){
3061 if(!input_encoding){
3062 set_iconv(TRUE, w_iconv32);
3064 if (iconv == w_iconv32) {
3065 input_endian = ENDIAN_LITTLE;
3068 (*i_ungetc)(0x00,f);
3069 }else (*i_ungetc)(c2,f);
3070 (*i_ungetc)(0x00,f);
3071 }else (*i_ungetc)(c2,f);
3072 if(!input_encoding){
3073 set_iconv(TRUE, w_iconv16);
3075 if (iconv == w_iconv16) {
3076 input_endian = ENDIAN_LITTLE;
3079 (*i_ungetc)(0xFE,f);
3080 }else (*i_ungetc)(c2,f);
3081 (*i_ungetc)(0xFF,f);
3095 static void init_broken_state(void)
3097 memset(&broken_state, 0, sizeof(broken_state));
/*
 * Append `c` to broken_state.buf and advance broken_state.count.
 *
 * NOTE(review): the parameter is declared K&R-style with no type, so it is an
 * implicit `int` under C89 and a constraint violation from C99 onward.  The
 * neighbouring broken_ungetc() declares its character as `nkf_char c`;
 * `static void push_broken_buf(nkf_char c)` would match it.
 * NOTE(review): no bounds check on broken_state.buf is made here --
 * presumably callers limit the count; verify.
 */
3100 static void push_broken_buf(c)
3102 broken_state.buf[broken_state.count++] = c;
3105 static nkf_char pop_broken_buf(void)
3107 return broken_state.buf[--broken_state.count];
3110 nkf_char broken_getc(FILE *f)
3114 if (broken_state.count > 0) {
3115 return pop_broken_buf();
3118 if (c=='$' && broken_state.status != ESC
3119 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3121 broken_state.status = 0;
3122 if (c1=='@'|| c1=='B') {
3123 push_broken_buf(c1);
3130 } else if (c=='(' && broken_state.status != ESC
3131 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3133 broken_state.status = 0;
3134 if (c1=='J'|| c1=='B') {
3135 push_broken_buf(c1);
3143 broken_state.status = c;
3148 nkf_char broken_ungetc(nkf_char c, FILE *f)
3150 if (broken_state.count < 2)
3155 void eol_conv(nkf_char c2, nkf_char c1)
3157 if (guess_f && input_eol != EOF) {
3158 if (c2 == 0 && c1 == LF) {
3159 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3160 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3161 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3163 else if (!input_eol) input_eol = CR;
3164 else if (input_eol != CR) input_eol = EOF;
3166 if (prev_cr || (c2 == 0 && c1 == LF)) {
3168 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3169 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3171 if (c2 == 0 && c1 == CR) prev_cr = CR;
3172 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3176 Return value of fold_conv()
3178 LF add newline and output char
3179 CR add newline and output nothing
3182 1 (or else) normal output
3184 fold state in prev (previous character)
3186 >0x80 Japanese (X0208/X0201)
3191 This fold algorithm does not preserve leading space in a line.
3192 This is the main difference from fmt.
3195 #define char_size(c2,c1) (c2?2:1)
3197 void fold_conv(nkf_char c2, nkf_char c1)
3200 nkf_char fold_state;
3202 if (c1== CR && !fold_preserve_f) {
3203 fold_state=0; /* ignore cr */
3204 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3206 fold_state=0; /* ignore cr */
3207 } else if (c1== BS) {
3208 if (f_line>0) f_line--;
3210 } else if (c2==EOF && f_line != 0) { /* close open last line */
3212 } else if ((c1==LF && !fold_preserve_f)
3213 || ((c1==CR||(c1==LF&&f_prev!=CR))
3214 && fold_preserve_f)) {
3216 if (fold_preserve_f) {
3220 } else if ((f_prev == c1 && !fold_preserve_f)
3221 || (f_prev == LF && fold_preserve_f)
3222 ) { /* duplicate newline */
3225 fold_state = LF; /* output two newline */
3231 if (f_prev&0x80) { /* Japanese? */
3233 fold_state = 0; /* ignore given single newline */
3234 } else if (f_prev==SP) {
3238 if (++f_line<=fold_len)
3242 fold_state = CR; /* fold and output nothing */
3246 } else if (c1=='\f') {
3249 fold_state = LF; /* output newline and clear */
3250 } else if ( (c2==0 && c1==SP)||
3251 (c2==0 && c1==TAB)||
3252 (c2=='!'&& c1=='!')) {
3253 /* X0208 kankaku or ascii space */
3255 fold_state = 0; /* remove duplicate spaces */
3258 if (++f_line<=fold_len)
3259 fold_state = SP; /* output ASCII space only */
3261 f_prev = SP; f_line = 0;
3262 fold_state = CR; /* fold and output nothing */
3266 prev0 = f_prev; /* we still need this one... , but almost done */
3268 if (c2 || c2 == JIS_X_0201_1976_K)
3269 f_prev |= 0x80; /* this is Japanese */
3270 f_line += char_size(c2,c1);
3271 if (f_line<=fold_len) { /* normal case */
3274 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3275 f_line = char_size(c2,c1);
3276 fold_state = LF; /* We can't wait, do fold now */
3277 } else if (c2 == JIS_X_0201_1976_K) {
3278 /* simple kinsoku rules return 1 means no folding */
3279 if (c1==(0xde&0x7f)) fold_state = 1; /*
dakuten (voiced sound mark) */
3280 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
handakuten (semi-voiced sound mark) */
3281 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
ideographic comma (JIS X 0201 0xA4) */
3282 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
closing corner bracket (JIS X 0201 0xA3) */
3283 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
ideographic full stop (JIS X 0201 0xA1) */
3284 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3285 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3287 fold_state = LF;/* add one new f_line before this character */
3290 fold_state = LF;/* add one new f_line before this character */
3293 /* kinsoku point in ASCII */
3294 if ( c1==')'|| /* { [ ( */
3305 /* just after special */
3306 } else if (!is_alnum(prev0)) {
3307 f_line = char_size(c2,c1);
3309 } else if ((prev0==SP) || /* ignored new f_line */
3310 (prev0==LF)|| /* ignored new f_line */
3311 (prev0&0x80)) { /* X0208 - ASCII */
3312 f_line = char_size(c2,c1);
3313 fold_state = LF;/* add one new f_line before this character */
3315 fold_state = 1; /* default no fold in ASCII */
3319 if (c1=='"') fold_state = 1; /*
ideographic comma */
3320 else if (c1=='#') fold_state = 1; /*
ideographic full stop */
3321 else if (c1=='W') fold_state = 1; /*
closing corner bracket */
3322 else if (c1=='K') fold_state = 1; /*
fullwidth closing parenthesis */
3323 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3324 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3325 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3326 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3327 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3328 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3329 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3330 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3331 /* default no fold in kinsoku */
3334 f_line = char_size(c2,c1);
3335 /* add one new f_line before this character */
3338 f_line = char_size(c2,c1);
3340 /* add one new f_line before this character */
3345 /* terminator process */
3346 switch(fold_state) {
3348 OCONV_NEWLINE((*o_fconv));
3354 OCONV_NEWLINE((*o_fconv));
3365 nkf_char z_prev2=0,z_prev1=0;
3367 void z_conv(nkf_char c2, nkf_char c1)
3370 /* if (c2) c1 &= 0x7f; assertion */
3372 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3378 if (z_prev2 == JIS_X_0201_1976_K) {
3379 if (c2 == JIS_X_0201_1976_K) {
3380 if (c1 == (0xde&0x7f)) { /*
dakuten (voiced sound mark) */
3382 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3384 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
handakuten (semi-voiced sound mark) */
3386 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3391 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3393 if (c2 == JIS_X_0201_1976_K) {
3394 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3395 /* wait for dakuten (voiced mark) or handakuten (semi-voiced mark) */
3400 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3411 if (alpha_f&1 && c2 == 0x23) {
3412 /* JISX0208 Alphabet */
3414 } else if (c2 == 0x21) {
3415 /* JISX0208 Kigou */
3420 } else if (alpha_f&4) {
3425 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3431 if (alpha_f&8 && c2 == 0) {
3435 case '>': entity = ">"; break;
3436 case '<': entity = "<"; break;
3437 case '\"': entity = """; break;
3438 case '&': entity = "&"; break;
3441 while (*entity) (*o_zconv)(0, *entity++);
3447 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3452 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3456 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3460 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3464 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3468 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3472 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3476 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3480 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3485 (*o_zconv)(JIS_X_0201_1976_K, c);
3488 } else if (c2 == 0x25) {
3489 /* JISX0208 Katakana */
3490 static const int fullwidth_to_halfwidth[] =
3492 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3493 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3494 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3495 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3496 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3497 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3498 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3499 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3500 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3501 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3502 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3503 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3505 if (fullwidth_to_halfwidth[c1-0x20]){
3506 c2 = fullwidth_to_halfwidth[c1-0x20];
3507 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3509 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
3519 #define rot13(c) ( \
3521 (c <= 'M') ? (c + 13): \
3522 (c <= 'Z') ? (c - 13): \
3524 (c <= 'm') ? (c + 13): \
3525 (c <= 'z') ? (c - 13): \
3529 #define rot47(c) ( \
3531 ( c <= 'O') ? (c + 47) : \
3532 ( c <= '~') ? (c - 47) : \
3536 void rot_conv(nkf_char c2, nkf_char c1)
3538 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3544 (*o_rot_conv)(c2,c1);
3547 void hira_conv(nkf_char c2, nkf_char c1)
3551 if (0x20 < c1 && c1 < 0x74) {
3553 (*o_hira_conv)(c2,c1);
3555 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3557 c1 = nkf_char_unicode_new(0x3094);
3558 (*o_hira_conv)(c2,c1);
3561 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3563 (*o_hira_conv)(c2,c1);
3568 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3571 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3573 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3577 (*o_hira_conv)(c2,c1);
3581 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3583 #define RANGE_NUM_MAX 18
3584 static const nkf_char range[RANGE_NUM_MAX][2] = {
3605 nkf_char start, end, c;
3607 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3611 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3616 for (i = 0; i < RANGE_NUM_MAX; i++) {
3617 start = range[i][0];
3620 if (c >= start && c <= end) {
3625 (*o_iso2022jp_check_conv)(c2,c1);
3629 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3631 static const unsigned char *mime_pattern[] = {
3632 (const unsigned char *)"\075?EUC-JP?B?",
3633 (const unsigned char *)"\075?SHIFT_JIS?B?",
3634 (const unsigned char *)"\075?ISO-8859-1?Q?",
3635 (const unsigned char *)"\075?ISO-8859-1?B?",
3636 (const unsigned char *)"\075?ISO-2022-JP?B?",
3637 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3638 #if defined(UTF8_INPUT_ENABLE)
3639 (const unsigned char *)"\075?UTF-8?B?",
3640 (const unsigned char *)"\075?UTF-8?Q?",
3642 (const unsigned char *)"\075?US-ASCII?Q?",
3647 /* Marker used to raise the priority of the matching input code */
3648 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3649 e_iconv, s_iconv, 0, 0, 0, 0,
3650 #if defined(UTF8_INPUT_ENABLE)
3656 static const nkf_char mime_encode[] = {
3657 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3658 #if defined(UTF8_INPUT_ENABLE)
3665 static const nkf_char mime_encode_method[] = {
3666 'B', 'B','Q', 'B', 'B', 'Q',
3667 #if defined(UTF8_INPUT_ENABLE)
3675 /* MIME preprocessor fifo */
3677 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3678 #define MIME_BUF_MASK (MIME_BUF_SIZE-1) /* index mask; works because the size is a power of two */
3679 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK] /* buffer slot for index n, wrapped with the mask */
3681 unsigned char buf[MIME_BUF_SIZE];
3683 unsigned int last; /* decoded */
3684 unsigned int input; /* undecoded */
3686 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
3688 #define MAXRECOVER 20
3690 static void mime_input_buf_unshift(nkf_char c)
3692 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
3695 nkf_char mime_ungetc(nkf_char c, FILE *f)
3697 mime_input_buf_unshift(c);
3701 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
3704 (*i_mungetc_buf)(c,f);
3706 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
3710 nkf_char mime_getc_buf(FILE *f)
3712 /* We don't keep EOF in mime_input_buf, because it contains ?= as
3713 a terminator. It was checked in mime_integrity. */
3714 return ((mimebuf_f)?
3715 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
3718 void switch_mime_getc(void)
3720 if (i_getc!=mime_getc) {
3721 i_mgetc = i_getc; i_getc = mime_getc;
3722 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3723 if(mime_f==STRICT_MIME) {
3724 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3725 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3730 void unswitch_mime_getc(void)
3732 if(mime_f==STRICT_MIME) {
3733 i_mgetc = i_mgetc_buf;
3734 i_mungetc = i_mungetc_buf;
3737 i_ungetc = i_mungetc;
3738 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3739 mime_iconv_back = NULL;
3742 nkf_char mime_integrity(FILE *f, const unsigned char *p)
3746 /* In buffered mode, read until =? or NL or buffer full
3748 mime_input_state.input = mime_input_state.top;
3749 mime_input_state.last = mime_input_state.top;
3751 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3753 q = mime_input_state.input;
3754 while((c=(*i_getc)(f))!=EOF) {
3755 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3756 break; /* buffer full */
3758 if (c=='=' && d=='?') {
3759 /* checked. skip header, start decode */
3760 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3761 /* mime_last_input = mime_input_state.input; */
3762 mime_input_state.input = q;
3766 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3768 /* Should we check length mod 4? */
3769 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3772 /* In case of Incomplete MIME, no MIME decode */
3773 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3774 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3775 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3776 switch_mime_getc(); /* anyway we need buffered getc */
3780 nkf_char mime_begin_strict(FILE *f)
3784 const unsigned char *p,*q;
3785 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3787 mime_decode_mode = FALSE;
3788 /* =? has been checked */
3790 p = mime_pattern[j];
3793 for(i=2;p[i]>SP;i++) { /* start at =? */
3794 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3795 /* pattern fails, try next one */
3797 while (mime_pattern[++j]) {
3798 p = mime_pattern[j];
3799 for(k=2;k<i;k++) /* assume length(p) > i */
3800 if (p[k]!=q[k]) break;
3801 if (k==i && nkf_toupper(c1)==p[k]) break;
3803 p = mime_pattern[j];
3804 if (p) continue; /* found next one, continue */
3805 /* all fails, output from recovery buffer */
3813 mime_decode_mode = p[i-2];
3815 mime_iconv_back = iconv;
3816 set_iconv(FALSE, mime_priority_func[j]);
3817 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3819 if (mime_decode_mode=='B') {
3820 mimebuf_f = unbuf_f;
3822 /* do MIME integrity check */
3823 return mime_integrity(f,mime_pattern[j]);
3831 nkf_char mime_begin(FILE *f)
3836 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
3837 /* re-read and convert again from mime_buffer. */
3839 /* =? has been checked */
3840 k = mime_input_state.last;
3841 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
3842 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
3843 /* We accept any character type even if it is broken by new lines */
3844 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3845 if (c1==LF||c1==SP||c1==CR||
3846 c1=='-'||c1=='_'||is_alnum(c1)) continue;
3848 /* Failed. But this could be another MIME preamble */
3850 mime_input_state.last--;
3856 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3857 if (!(++i<MAXRECOVER) || c1==EOF) break;
3858 if (c1=='b'||c1=='B') {
3859 mime_decode_mode = 'B';
3860 } else if (c1=='q'||c1=='Q') {
3861 mime_decode_mode = 'Q';
3865 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3866 if (!(++i<MAXRECOVER) || c1==EOF) break;
3868 mime_decode_mode = FALSE;
3874 if (!mime_decode_mode) {
3875 /* false MIME preamble, restart from mime_buffer */
3876 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
3877 /* Since we are in MIME mode until buffer becomes empty, */
3878 /* we never go into mime_begin again for a while. */
3881 /* discard MIME preamble, and go to MIME mode */
3882 mime_input_state.last = k;
3883 /* do no MIME integrity check */
3884 return c1; /* used only for checking EOF */
3888 void no_putc(nkf_char c)
3893 void debug(const char *str)
3896 fprintf(stderr, "%s\n", str ? str : "NULL");
3901 void set_input_codename(char *codename)
3903 if (!input_codename) {
3904 input_codename = codename;
3905 } else if (strcmp(codename, input_codename) != 0) {
3906 input_codename = "";
3910 static char* get_guessed_code(void)
3912 if (input_codename && !*input_codename) {
3913 input_codename = "BINARY";
3915 struct input_code *p = find_inputcode_byfunc(iconv);
3916 if (!input_codename) {
3917 input_codename = "ASCII";
3918 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
3919 if (p->score & (SCORE_DEPEND|SCORE_CP932))
3920 input_codename = "CP932";
3921 } else if (strcmp(input_codename, "EUC-JP") == 0) {
3922 if (p->score & (SCORE_X0212))
3923 input_codename = "EUCJP-MS";
3924 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3925 input_codename = "CP51932";
3926 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
3927 if (p->score & (SCORE_KANA))
3928 input_codename = "CP50221";
3929 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3930 input_codename = "CP50220";
3933 return input_codename;
3936 #if !defined(PERL_XS) && !defined(WIN32DLL)
3937 void print_guessed_code(char *filename)
3939 if (filename != NULL) printf("%s: ", filename);
3940 if (input_codename && !*input_codename) {
3943 input_codename = get_guessed_code();
3945 printf("%s\n", input_codename);
3949 input_eol == CR ? " (CR)" :
3950 input_eol == LF ? " (LF)" :
3951 input_eol == CRLF ? " (CRLF)" :
3952 input_eol == EOF ? " (MIXED NL)" :
3961 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
3963 nkf_char c1, c2, c3;
3969 if (!nkf_isxdigit(c2)){
3974 if (!nkf_isxdigit(c3)){
3979 return (hex2bin(c2) << 4) | hex2bin(c3);
3982 nkf_char cap_getc(FILE *f)
3984 return hex_getc(':', f, i_cgetc, i_cungetc);
3987 nkf_char cap_ungetc(nkf_char c, FILE *f)
3989 return (*i_cungetc)(c, f);
3992 nkf_char url_getc(FILE *f)
3994 return hex_getc('%', f, i_ugetc, i_uungetc);
3997 nkf_char url_ungetc(nkf_char c, FILE *f)
3999 return (*i_uungetc)(c, f);
4003 #ifdef NUMCHAR_OPTION
4004 nkf_char numchar_getc(FILE *f)
4006 nkf_char (*g)(FILE *) = i_ngetc;
4007 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4018 if (buf[i] == 'x' || buf[i] == 'X'){
4019 for (j = 0; j < 7; j++){
4021 if (!nkf_isxdigit(buf[i])){
4028 c |= hex2bin(buf[i]);
4031 for (j = 0; j < 8; j++){
4035 if (!nkf_isdigit(buf[i])){
4042 c += hex2bin(buf[i]);
4048 return nkf_char_unicode_new(c);
4057 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4059 return (*i_nungetc)(c, f);
4063 #ifdef UNICODE_NORMALIZATION
4065 /* Normalization Form C */
4066 nkf_char nfc_getc(FILE *f)
4068 nkf_char (*g)(FILE *f) = i_nfc_getc;
4069 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4070 int i=0, j, k=1, lower, upper;
4072 const unsigned char *array;
4075 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4076 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4077 while (upper >= lower) {
4078 j = (lower+upper) / 2;
4079 array = normalization_table[j].nfd;
4080 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4081 if (array[k] != buf[k]){
4082 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4089 array = normalization_table[j].nfc;
4090 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4091 buf[i] = (nkf_char)(array[i]);
4102 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4104 return (*i_nfc_ungetc)(c, f);
4106 #endif /* UNICODE_NORMALIZATION */
4109 static nkf_char base64decode(nkf_char c)
4114 i = c - 'A'; /* A..Z 0-25 */
4115 } else if (c == '_') {
4116 i = '?' /* 63 */ ; /* _ 63 */
4118 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4120 } else if (c > '/') {
4121 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4122 } else if (c == '+' || c == '-') {
4123 i = '>' /* 62 */ ; /* + and - 62 */
4125 i = '?' /* 63 */ ; /* / 63 */
4133 nkf_char c1, c2, c3, c4, cc;
4134 nkf_char t1, t2, t3, t4, mode, exit_mode;
4135 nkf_char lwsp_count;
4138 nkf_char lwsp_size = 128;
4140 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4141 return mime_input_buf(mime_input_state.top++);
4143 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4144 mime_decode_mode=FALSE;
4145 unswitch_mime_getc();
4146 return (*i_getc)(f);
4149 if (mimebuf_f == FIXED_MIME)
4150 exit_mode = mime_decode_mode;
4153 if (mime_decode_mode == 'Q') {
4154 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4156 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4157 if (c1<=SP || DEL<=c1) {
4158 mime_decode_mode = exit_mode; /* prepare for quit */
4161 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4165 mime_decode_mode = exit_mode; /* prepare for quit */
4166 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4167 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4168 /* end Q encoding */
4169 input_mode = exit_mode;
4171 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4172 if (lwsp_buf==NULL) {
4173 perror("can't malloc");
4176 while ((c1=(*i_getc)(f))!=EOF) {
4181 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4189 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4190 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4205 lwsp_buf[lwsp_count] = (unsigned char)c1;
4206 if (lwsp_count++>lwsp_size){
4208 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4209 if (lwsp_buf_new==NULL) {
4211 perror("can't realloc");
4214 lwsp_buf = lwsp_buf_new;
4220 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4222 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4223 i_ungetc(lwsp_buf[lwsp_count],f);
4229 if (c1=='='&&c2<SP) { /* this is soft wrap */
4230 while((c1 = (*i_mgetc)(f)) <=SP) {
4231 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4233 mime_decode_mode = 'Q'; /* still in MIME */
4234 goto restart_mime_q;
4237 mime_decode_mode = 'Q'; /* still in MIME */
4241 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4242 if (c2<=SP) return c2;
4243 mime_decode_mode = 'Q'; /* still in MIME */
4244 return ((hex2bin(c2)<<4) + hex2bin(c3));
4247 if (mime_decode_mode != 'B') {
4248 mime_decode_mode = FALSE;
4249 return (*i_mgetc)(f);
4253 /* Base64 encoding */
4255 MIME allows line break in the middle of
4256 Base64, but we are very pessimistic in decoding
4257 in unbuf mode because MIME encoded code may be broken by
4258 less or editor's control sequence (such as ESC-[-K) in unbuffered
4259 mode. Ignore incomplete MIME.
4261 mode = mime_decode_mode;
4262 mime_decode_mode = exit_mode; /* prepare for quit */
4264 while ((c1 = (*i_mgetc)(f))<=SP) {
4269 if ((c2 = (*i_mgetc)(f))<=SP) {
4272 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4273 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4276 if ((c1 == '?') && (c2 == '=')) {
4279 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4280 if (lwsp_buf==NULL) {
4281 perror("can't malloc");
4284 while ((c1=(*i_getc)(f))!=EOF) {
4289 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4297 if ((c1=(*i_getc)(f))!=EOF) {
4301 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4316 lwsp_buf[lwsp_count] = (unsigned char)c1;
4317 if (lwsp_count++>lwsp_size){
4319 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4320 if (lwsp_buf_new==NULL) {
4322 perror("can't realloc");
4325 lwsp_buf = lwsp_buf_new;
4331 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4333 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4334 i_ungetc(lwsp_buf[lwsp_count],f);
4341 if ((c3 = (*i_mgetc)(f))<=SP) {
4344 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4345 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4349 if ((c4 = (*i_mgetc)(f))<=SP) {
4352 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4353 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4357 mime_decode_mode = mode; /* still in MIME sigh... */
4359 /* BASE 64 decoding */
4361 t1 = 0x3f & base64decode(c1);
4362 t2 = 0x3f & base64decode(c2);
4363 t3 = 0x3f & base64decode(c3);
4364 t4 = 0x3f & base64decode(c4);
4365 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4367 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4368 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4370 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4371 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4373 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4378 return mime_input_buf(mime_input_state.top++);
4381 static const char basis_64[] =
4382 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4384 #define MIMEOUT_BUF_LENGTH (60)
4386 char buf[MIMEOUT_BUF_LENGTH+1];
4391 /*nkf_char mime_lastchar2, mime_lastchar1;*/
4393 static void open_mime(nkf_char mode)
4395 const unsigned char *p;
4398 p = mime_pattern[0];
4399 for(i=0;mime_pattern[i];i++) {
4400 if (mode == mime_encode[i]) {
4401 p = mime_pattern[i];
4405 mimeout_mode = mime_encode_method[i];
4407 if (base64_count>45) {
4408 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4409 (*o_mputc)(mimeout_state.buf[i]);
4412 PUT_NEWLINE((*o_mputc));
4415 if (mimeout_state.count>0
4416 && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4417 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) {
4421 for (;i<mimeout_state.count;i++) {
4422 if (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4423 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) {
4424 (*o_mputc)(mimeout_state.buf[i]);
4434 j = mimeout_state.count;
4435 mimeout_state.count = 0;
4437 mime_putc(mimeout_state.buf[i]);
4441 static void mime_prechar(nkf_char c2, nkf_char c1)
4443 if (mimeout_mode > 0){
4445 if (base64_count + mimeout_state.count/3*4> 73){
4446 (*o_base64conv)(EOF,0);
4447 OCONV_NEWLINE((*o_base64conv));
4448 (*o_base64conv)(0,SP);
4452 if (base64_count + mimeout_state.count/3*4> 66) {
4453 (*o_base64conv)(EOF,0);
4454 OCONV_NEWLINE((*o_base64conv));
4455 (*o_base64conv)(0,SP);
4461 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4462 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4463 open_mime(output_mode);
4464 (*o_base64conv)(EOF,0);
4465 OCONV_NEWLINE((*o_base64conv));
4466 (*o_base64conv)(0,SP);
4473 static void close_mime(void)
4481 static void eof_mime(void)
4483 switch(mimeout_mode) {
4488 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]);
4494 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]);
4499 if (mimeout_mode > 0) {
4500 if (mimeout_f!=FIXED_MIME) {
4502 } else if (mimeout_mode != 'Q')
4507 static void mimeout_addchar(nkf_char c)
4509 switch(mimeout_mode) {
4514 } else if(!nkf_isalnum(c)) {
4516 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4517 (*o_mputc)(bin2hex((c&0xf)));
4525 mimeout_state.state=c;
4526 (*o_mputc)(basis_64[c>>2]);
4531 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4532 mimeout_state.state=c;
4537 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4538 (*o_mputc)(basis_64[c & 0x3F]);
4549 static void mime_putc(nkf_char c)
4554 if (mimeout_f == FIXED_MIME){
4555 if (mimeout_mode == 'Q'){
4556 if (base64_count > 71){
4557 if (c!=CR && c!=LF) {
4559 PUT_NEWLINE((*o_mputc));
4564 if (base64_count > 71){
4566 PUT_NEWLINE((*o_mputc));
4569 if (c == EOF) { /* c==EOF */
4573 if (c != EOF) { /* c==EOF */
4579 /* mimeout_f != FIXED_MIME */
4581 if (c == EOF) { /* c==EOF */
4582 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4583 j = mimeout_state.count;
4584 mimeout_state.count = 0;
4586 if (mimeout_mode > 0) {
4587 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4589 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4592 mimeout_addchar(mimeout_state.buf[i]);
4596 mimeout_addchar(mimeout_state.buf[i]);
4600 mimeout_addchar(mimeout_state.buf[i]);
4606 mimeout_addchar(mimeout_state.buf[i]);
4612 if (mimeout_state.count > 0){
4613 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4618 if (mimeout_mode=='Q') {
4619 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4620 if (c == CR || c == LF) {
4625 } else if (c <= SP) {
4627 if (base64_count > 70) {
4628 PUT_NEWLINE((*o_mputc));
4631 if (!nkf_isblank(c)) {
4636 if (base64_count > 70) {
4638 PUT_NEWLINE((*o_mputc));
4641 open_mime(output_mode);
4643 if (!nkf_noescape_mime(c)) {
4654 if (mimeout_mode <= 0) {
4655 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4656 if (nkf_isspace(c)) {
4658 if (mimeout_mode == -1) {
4661 if (c==CR || c==LF) {
4663 open_mime(output_mode);
4669 for (i=0;i<mimeout_state.count;i++) {
4670 (*o_mputc)(mimeout_state.buf[i]);
4671 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4682 mimeout_state.buf[0] = (char)c;
4683 mimeout_state.count = 1;
4685 if (base64_count > 1
4686 && base64_count + mimeout_state.count > 76
4687 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4688 PUT_NEWLINE((*o_mputc));
4690 if (!nkf_isspace(mimeout_state.buf[0])){
4695 mimeout_state.buf[mimeout_state.count++] = (char)c;
4696 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4697 open_mime(output_mode);
4702 if (lastchar==CR || lastchar == LF){
4703 for (i=0;i<mimeout_state.count;i++) {
4704 (*o_mputc)(mimeout_state.buf[i]);
4707 mimeout_state.count = 0;
4710 for (i=0;i<mimeout_state.count-1;i++) {
4711 (*o_mputc)(mimeout_state.buf[i]);
4714 mimeout_state.buf[0] = SP;
4715 mimeout_state.count = 1;
4717 open_mime(output_mode);
4720 /* mimeout_mode == 'B', 1, 2 */
4721 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4722 if (lastchar == CR || lastchar == LF){
4723 if (nkf_isblank(c)) {
4724 for (i=0;i<mimeout_state.count;i++) {
4725 mimeout_addchar(mimeout_state.buf[i]);
4727 mimeout_state.count = 0;
4728 } else if (SP<c && c<DEL) {
4730 for (i=0;i<mimeout_state.count;i++) {
4731 (*o_mputc)(mimeout_state.buf[i]);
4734 mimeout_state.count = 0;
4736 mimeout_state.buf[mimeout_state.count++] = (char)c;
4739 if (c==SP || c==TAB || c==CR || c==LF) {
4740 for (i=0;i<mimeout_state.count;i++) {
4741 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4743 for (i=0;i<mimeout_state.count;i++) {
4744 (*o_mputc)(mimeout_state.buf[i]);
4747 mimeout_state.count = 0;
4750 mimeout_state.buf[mimeout_state.count++] = (char)c;
4751 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4753 for (i=0;i<mimeout_state.count;i++) {
4754 (*o_mputc)(mimeout_state.buf[i]);
4757 mimeout_state.count = 0;
4761 if (mimeout_state.count>0 && SP<c && c!='=') {
4762 mimeout_state.buf[mimeout_state.count++] = (char)c;
4763 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4764 j = mimeout_state.count;
4765 mimeout_state.count = 0;
4767 mimeout_addchar(mimeout_state.buf[i]);
4774 if (mimeout_state.count>0) {
4775 j = mimeout_state.count;
4776 mimeout_state.count = 0;
4778 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4780 mimeout_addchar(mimeout_state.buf[i]);
4786 (*o_mputc)(mimeout_state.buf[i]);
4788 open_mime(output_mode);
4794 void base64_conv(nkf_char c2, nkf_char c1)
4796 mime_prechar(c2, c1);
4797 (*o_base64conv)(c2,c1);
4801 typedef struct nkf_iconv_t {
4804 size_t input_buffer_size;
4805 char *output_buffer;
4806 size_t output_buffer_size;
4809 nkf_iconv_t nkf_iconv_new(char *tocode, char *fromcode)
4811 nkf_iconv_t converter;
4813 converter->input_buffer_size = IOBUF_SIZE;
4814 converter->input_buffer = malloc(converter->input_buffer_size);
4815 if (converter->input_buffer == NULL)
4816 perror("can't malloc");
4818 converter->output_buffer_size = IOBUF_SIZE * 2;
4819 converter->output_buffer = malloc(converter->output_buffer_size);
4820 if (converter->output_buffer == NULL)
4821 perror("can't malloc");
4823 converter->cd = iconv_open(tocode, fromcode);
4824 if (converter->cd == (iconv_t)-1)
4828 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
4831 perror("can't iconv_open");
4836 size_t nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
4838 size_t invalid = (size_t)0;
4839 char *input_buffer = converter->input_buffer;
4840 size_t input_length = (size_t)0;
4841 char *output_buffer = converter->output_buffer;
4842 size_t output_length = converter->output_buffer_size;
4847 while ((c = (*i_getc)(f)) != EOF) {
4848 input_buffer[input_length++] = c;
4849 if (input_length < converter->input_buffer_size) break;
4853 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
4854 while (output_length-- > 0) {
4855 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
4857 if (ret == (size_t) - 1) {
4860 if (input_buffer != converter->input_buffer)
4861 memmove(converter->input_buffer, input_buffer, input_length);
4864 converter->output_buffer_size *= 2;
4865 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
4866 if (output_buffer == NULL) {
4867 perror("can't realloc");
4870 converter->output_buffer = output_buffer;
4873 perror("can't iconv");
4885 void nkf_iconv_close(nkf_iconv_t *convert)
4887 free(converter->inbuf);
4888 free(converter->outbuf);
4889 iconv_close(converter->cd);
4897 struct input_code *p = input_code_list;
4909 mime_f = MIME_DECODE_DEFAULT;
4910 mime_decode_f = FALSE;
4915 x0201_f = X0201_DEFAULT;
4916 iso2022jp_f = FALSE;
4917 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
4918 ms_ucs_map_f = UCS_MAP_ASCII;
4920 #ifdef UTF8_INPUT_ENABLE
4921 no_cp932ext_f = FALSE;
4922 no_best_fit_chars_f = FALSE;
4923 encode_fallback = NULL;
4924 unicode_subchar = '?';
4925 input_endian = ENDIAN_BIG;
4927 #ifdef UTF8_OUTPUT_ENABLE
4928 output_bom_f = FALSE;
4929 output_endian = ENDIAN_BIG;
4931 #ifdef UNICODE_NORMALIZATION
4947 #ifdef SHIFTJIS_CP932
4957 for (i = 0; i < 256; i++){
4958 prefix_table[i] = 0;
4962 mimeout_state.count = 0;
4967 fold_preserve_f = FALSE;
4970 kanji_intro = DEFAULT_J;
4971 ascii_intro = DEFAULT_R;
4972 fold_margin = FOLD_MARGIN;
4973 o_zconv = no_connection;
4974 o_fconv = no_connection;
4975 o_eol_conv = no_connection;
4976 o_rot_conv = no_connection;
4977 o_hira_conv = no_connection;
4978 o_base64conv = no_connection;
4979 o_iso2022jp_check_conv = no_connection;
4982 i_ungetc = std_ungetc;
4984 i_bungetc = std_ungetc;
4987 i_mungetc = std_ungetc;
4988 i_mgetc_buf = std_getc;
4989 i_mungetc_buf = std_ungetc;
4990 output_mode = ASCII;
4992 mime_decode_mode = FALSE;
4998 init_broken_state();
4999 z_prev2=0,z_prev1=0;
5001 iconv_for_check = 0;
5003 input_codename = NULL;
5004 input_encoding = NULL;
5005 output_encoding = NULL;
5011 int module_connection(void)
5013 if (input_encoding) set_input_encoding(input_encoding);
5014 if (!output_encoding) {
5015 output_encoding = nkf_default_encoding();
5017 if (!output_encoding) {
5018 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5021 set_output_encoding(output_encoding);
5022 oconv = nkf_enc_to_oconv(output_encoding);
5025 /* replace continuation module, from output side */
5027 /* output redirection */
5029 if (noout_f || guess_f){
5036 if (mimeout_f == TRUE) {
5037 o_base64conv = oconv; oconv = base64_conv;
5039 /* base64_count = 0; */
5042 if (eolmode_f || guess_f) {
5043 o_eol_conv = oconv; oconv = eol_conv;
5046 o_rot_conv = oconv; oconv = rot_conv;
5049 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5052 o_hira_conv = oconv; oconv = hira_conv;
5055 o_fconv = oconv; oconv = fold_conv;
5058 if (alpha_f || x0201_f) {
5059 o_zconv = oconv; oconv = z_conv;
5063 i_ungetc = std_ungetc;
5064 /* input redirection */
5067 i_cgetc = i_getc; i_getc = cap_getc;
5068 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5071 i_ugetc = i_getc; i_getc = url_getc;
5072 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5075 #ifdef NUMCHAR_OPTION
5077 i_ngetc = i_getc; i_getc = numchar_getc;
5078 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5081 #ifdef UNICODE_NORMALIZATION
5083 i_nfc_getc = i_getc; i_getc = nfc_getc;
5084 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5087 if (mime_f && mimebuf_f==FIXED_MIME) {
5088 i_mgetc = i_getc; i_getc = mime_getc;
5089 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5092 i_bgetc = i_getc; i_getc = broken_getc;
5093 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5095 if (input_encoding) {
5096 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5098 set_iconv(FALSE, e_iconv);
5102 struct input_code *p = input_code_list;
5111 Conversion main loop. Code detection only.
5114 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * noconvert -- connect the modules, then read characters from f through
 * i_getc until EOF.
 * NOTE(review): the loop body and the return statement are not visible in
 * this listing; what is done with each character must be confirmed there.
 */
5115 nkf_char noconvert(FILE *f)
5120 module_connection();
5121 while ((c = (*i_getc)(f)) != EOF)
/*
 * kanji_convert -- main conversion loop: read characters from f through
 * i_getc and hand them to the selected iconv / oconv.
 *
 * f: input stream, only ever accessed through (*i_getc) and (*i_ungetc).
 *
 * module_connection() is called first; a negative result is reported as
 * "no output encoding given" on stderr (non PERL_XS / WIN32DLL builds).
 * When iconv is w_iconv32 or w_iconv16 the input is consumed in groups of
 * four or two bytes by dedicated loops; all other input goes through the
 * byte-at-a-time loop, which tracks escape sequences, shift state and
 * MIME starts. After the loop, iconv is called with EOF and, when no
 * input_codename has been set, the entry of input_code_list with the
 * lowest score is used as the input code name.
 *
 * NOTE(review): this listing omits many lines (braces, else arms, the
 * return statements); read the comments below as describing only the
 * visible statements.
 */
5128 int kanji_convert(FILE *f)
5130 nkf_char c1=0, c2=0, c3=0, c4=0;
5131 int shift_mode = FALSE; /* TRUE or FALSE or JIS_X_0201_1976_K */
5132 int is_8bit = FALSE;
5134 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5139 output_mode = ASCII;
5141 #define NEXT continue /* no output, get next */
5142 #define SKIP c2=0;continue /* no output, get next */
5143 #define MORE c2=c1;continue /* need one more byte */
5144 #define SEND ; /* output c1 and c2, get next */
5145 #define LAST break /* end of loop, go closing */
5147 if (module_connection() < 0) {
5148 #if !defined(PERL_XS) && !defined(WIN32DLL)
5149 fprintf(stderr, "no output encoding given\n");
5155 #ifdef UTF8_INPUT_ENABLE
5156 if(iconv == w_iconv32){
5157 while ((c1 = (*i_getc)(f)) != EOF &&
5158 (c2 = (*i_getc)(f)) != EOF &&
5159 (c3 = (*i_getc)(f)) != EOF &&
5160 (c4 = (*i_getc)(f)) != EOF) {
5161 nkf_iconv_utf_32(c1, c2, c3, c4);
5163 (*i_ungetc)(EOF, f);
5165 else if (iconv == w_iconv16) {
5166 while ((c1 = (*i_getc)(f)) != EOF &&
5167 (c2 = (*i_getc)(f)) != EOF) {
5168 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5169 (c3 = (*i_getc)(f)) != EOF &&
5170 (c4 = (*i_getc)(f)) != EOF) {
5171 nkf_iconv_utf_16(c1, c2, c3, c4);
5174 (*i_ungetc)(EOF, f);
5178 while ((c1 = (*i_getc)(f)) != EOF) {
5179 #ifdef INPUT_CODE_FIX
5180 if (!input_encoding)
5186 /* in case of 8th bit is on */
5187 if (!estab_f&&!mime_decode_mode) {
5188 /* in case of not established yet */
5189 /* It is still ambiguous */
5190 if (h_conv(f, c2, c1)==EOF)
5195 /* in case of already established */
5197 /* ignore bogus code */
5204 /* 2nd byte of 7 bit code or SJIS */
5209 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5212 } else if (c1 > DEL) {
5214 if (!estab_f && !iso8859_f) {
5215 /* not established yet */
5217 } else { /* estab_f==TRUE */
5223 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5224 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5226 c2 = JIS_X_0201_1976_K;
5231 /* already established */
5235 } else if (SP < c1 && c1 < DEL) {
5236 /* in case of Roman characters */
5238 /* output 1 shifted byte */
5242 } else if (SP <= c1 && c1 < (0xE0&0x7F)){
5243 /* output 1 shifted byte */
5244 c2 = JIS_X_0201_1976_K;
5247 /* look like bogus code */
5250 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5251 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5252 /* in case of Kanji shifted */
5254 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5255 /* Check MIME code */
5256 if ((c1 = (*i_getc)(f)) == EOF) {
5259 } else if (c1 == '?') {
5260 /* =? is mime conversion start sequence */
5261 if(mime_f == STRICT_MIME) {
5262 /* check in real detail */
5263 if (mime_begin_strict(f) == EOF)
5266 } else if (mime_begin(f) == EOF)
5275 /* normal ASCII code */
5278 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5281 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5284 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5285 if ((c1 = (*i_getc)(f)) == EOF) {
5286 /* (*oconv)(0, ESC); don't send bogus code */
5288 } else if (c1 == '$') {
5289 if ((c1 = (*i_getc)(f)) == EOF) {
5291 (*oconv)(0, ESC); don't send bogus code
5292 (*oconv)(0, '$'); */
5294 } else if (c1 == '@'|| c1 == 'B') {
5295 /* This is kanji introduction */
5296 input_mode = JIS_X_0208;
5298 set_input_codename("ISO-2022-JP");
5300 debug("ISO-2022-JP");
5303 } else if (c1 == '(') {
5304 if ((c1 = (*i_getc)(f)) == EOF) {
5305 /* don't send bogus code
5311 } else if (c1 == '@'|| c1 == 'B') {
5312 /* This is kanji introduction */
5313 input_mode = JIS_X_0208;
5317 } else if (c1 == 'D'){
5318 input_mode = JIS_X_0212;
5321 #endif /* X0212_ENABLE */
5322 } else if (c1 == 'O' || c1 == 'Q'){
5323 input_mode = JIS_X_0213_1;
5326 } else if (c1 == 'P'){
5327 input_mode = JIS_X_0213_2;
5331 /* could be some special code */
5338 } else if (broken_f&0x2) {
5339 /* accept any ESC-(-x as broken code ... */
5340 input_mode = JIS_X_0208;
5349 } else if (c1 == '(') {
5350 if ((c1 = (*i_getc)(f)) == EOF) {
5351 /* don't send bogus code
5353 (*oconv)(0, '('); */
5357 /* This is X0201 kana introduction */
5358 input_mode = JIS_X_0201_1976_K; shift_mode = JIS_X_0201_1976_K;
5360 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5361 /* ESC ( B/J/H: back to single-byte ASCII mode, shift cleared */
5362 input_mode = ASCII; shift_mode = FALSE;
5364 } else if (broken_f&0x2) {
5365 input_mode = ASCII; shift_mode = FALSE;
5370 /* maintain various input_mode here */
5374 } else if ( c1 == 'N' || c1 == 'n'){
5376 c4 = (*i_getc)(f); /* skip SS2 */
5377 if ( (SP<=c4 && c4 < 0x60) || (0xa0<=c4 && c4 < 0xe0)){
5379 c2 = JIS_X_0201_1976_K;
5392 } else if (c1 == ESC && iconv == s_iconv) {
5393 /* ESC in Shift_JIS */
5394 if ((c1 = (*i_getc)(f)) == EOF) {
5395 /* (*oconv)(0, ESC); don't send bogus code */
5397 } else if (c1 == '$') {
5399 if ((c1 = (*i_getc)(f)) == EOF) {
5401 (*oconv)(0, ESC); don't send bogus code
5402 (*oconv)(0, '$'); */
5405 if (('E' <= c1 && c1 <= 'G') ||
5406 ('O' <= c1 && c1 <= 'Q')) {
5414 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
5415 c3 = nkf_char_unicode_new((jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000);
5416 while ((c1 = (*i_getc)(f)) != EOF) {
5417 if (SP <= c1 && c1 <= 'z') {
5418 (*oconv)(0, c1 + c3);
5419 } else break; /* c1 == SO */
5423 if (c1 == EOF) LAST;
5430 } else if (c1 == LF || c1 == CR) {
5432 input_mode = ASCII; set_iconv(FALSE, 0);
5434 } else if (mime_decode_f && !mime_decode_mode){
5436 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5444 } else { /* if (c1 == CR)*/
5445 if ((c1=(*i_getc)(f))!=EOF) {
5449 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5469 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5472 if ((c3 = (*i_getc)(f)) != EOF) {
5475 if ((c4 = (*i_getc)(f)) != EOF) {
5477 (*iconv)(c2, c1, c3|c4);
5482 /* 3 bytes EUC or UTF-8 */
5483 if ((c3 = (*i_getc)(f)) != EOF) {
5485 (*iconv)(c2, c1, c3);
5493 0x7F <= c2 && c2 <= 0x92 &&
5494 0x21 <= c1 && c1 <= 0x7E) {
5498 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5501 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5505 (*oconv)(PREFIX_EUCG3 | c2, c1);
5507 #endif /* X0212_ENABLE */
5509 (*oconv)(PREFIX_EUCG3 | c2, c1);
5512 (*oconv)(input_mode, c1); /* other special case */
5518 /* goto next_word */
5522 (*iconv)(EOF, 0, 0);
5523 if (!input_codename)
5526 struct input_code *p = input_code_list;
5527 struct input_code *result = p;
5529 if (p->score < result->score) result = p;
5532 set_input_codename(result->name);
5534 debug(result->name);
5542 * int options(unsigned char *cp)
/*
 * options -- parse one option string and update the global option state.
 *
 * cp: NUL-terminated option text. Parsing first skips up to and including
 *     the first '-' and then consumes option letters one at a time; a
 *     leading "-" inside the text introduces a long option that is looked
 *     up in long_option[] and either expanded through its alias or handled
 *     by name below. Several options may be packed into one string.
 *
 * Returns an int (the return statements are not visible in this listing).
 * Unknown options are reported on stderr in non PERL_XS / WIN32DLL builds.
 *
 * Fixes relative to the previous revision:
 *  - "-W16L/-W16B/-W32L/-W32B" chose the input encoding from output_endian
 *    instead of the input_endian that the same branch had just set.
 *  - "--oc=" tested the encoding pointer with "<= 0"; it is now a plain
 *    null test.
 *
 * NOTE(review): this listing omits many lines (braces, breaks, continues),
 * so only the visible statements are described.
 */
5548 int options(unsigned char *cp)
5552 unsigned char *cp_back = NULL;
5558 while(*cp && *cp++!='-');
5559 while (*cp || cp_back) {
5567 case '-': /* literal options */
5568 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
5572 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5573 p = (unsigned char *)long_option[i].name;
5574 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5575 if (*p == cp[j] || cp[j] == SP){
5582 #if !defined(PERL_XS) && !defined(WIN32DLL)
5583 fprintf(stderr, "unknown long option: --%s\n", cp);
5587 while(*cp && *cp != SP && cp++);
5588 if (long_option[i].alias[0]){
5590 cp = (unsigned char *)long_option[i].alias;
5592 if (strcmp(long_option[i].name, "ic=") == 0){
5593 nkf_str_upcase((char *)p, codeset, 32);
5594 enc = nkf_enc_find(codeset);
5596 input_encoding = enc;
5599 if (strcmp(long_option[i].name, "oc=") == 0){
5600 nkf_str_upcase((char *)p, codeset, 32);
5601 enc = nkf_enc_find(codeset);
/* enc is a pointer: test it for null instead of ordering it against 0 */
5602 if (!enc) continue;
5603 output_encoding = enc;
5606 if (strcmp(long_option[i].name, "guess=") == 0){
5607 if (p[0] == '0' || p[0] == '1') {
5615 if (strcmp(long_option[i].name, "overwrite") == 0){
5618 preserve_time_f = TRUE;
5621 if (strcmp(long_option[i].name, "overwrite=") == 0){
5624 preserve_time_f = TRUE;
5626 backup_suffix = malloc(strlen((char *) p) + 1);
5627 strcpy(backup_suffix, (char *) p);
5630 if (strcmp(long_option[i].name, "in-place") == 0){
5633 preserve_time_f = FALSE;
5636 if (strcmp(long_option[i].name, "in-place=") == 0){
5639 preserve_time_f = FALSE;
5641 backup_suffix = malloc(strlen((char *) p) + 1);
5642 strcpy(backup_suffix, (char *) p);
5647 if (strcmp(long_option[i].name, "cap-input") == 0){
5651 if (strcmp(long_option[i].name, "url-input") == 0){
5656 #ifdef NUMCHAR_OPTION
5657 if (strcmp(long_option[i].name, "numchar-input") == 0){
5663 if (strcmp(long_option[i].name, "no-output") == 0){
5667 if (strcmp(long_option[i].name, "debug") == 0){
5672 if (strcmp(long_option[i].name, "cp932") == 0){
5673 #ifdef SHIFTJIS_CP932
5677 #ifdef UTF8_OUTPUT_ENABLE
5678 ms_ucs_map_f = UCS_MAP_CP932;
5682 if (strcmp(long_option[i].name, "no-cp932") == 0){
5683 #ifdef SHIFTJIS_CP932
5687 #ifdef UTF8_OUTPUT_ENABLE
5688 ms_ucs_map_f = UCS_MAP_ASCII;
5692 #ifdef SHIFTJIS_CP932
5693 if (strcmp(long_option[i].name, "cp932inv") == 0){
5700 if (strcmp(long_option[i].name, "x0212") == 0){
5707 if (strcmp(long_option[i].name, "exec-in") == 0){
5711 if (strcmp(long_option[i].name, "exec-out") == 0){
5716 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5717 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5718 no_cp932ext_f = TRUE;
5721 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5722 no_best_fit_chars_f = TRUE;
5725 if (strcmp(long_option[i].name, "fb-skip") == 0){
5726 encode_fallback = NULL;
5729 if (strcmp(long_option[i].name, "fb-html") == 0){
5730 encode_fallback = encode_fallback_html;
5733 if (strcmp(long_option[i].name, "fb-xml") == 0){
5734 encode_fallback = encode_fallback_xml;
5737 if (strcmp(long_option[i].name, "fb-java") == 0){
5738 encode_fallback = encode_fallback_java;
5741 if (strcmp(long_option[i].name, "fb-perl") == 0){
5742 encode_fallback = encode_fallback_perl;
5745 if (strcmp(long_option[i].name, "fb-subchar") == 0){
5746 encode_fallback = encode_fallback_subchar;
5749 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
5750 encode_fallback = encode_fallback_subchar;
5751 unicode_subchar = 0;
5753 /* decimal number */
5754 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
5755 unicode_subchar *= 10;
5756 unicode_subchar += hex2bin(p[i]);
5758 }else if(p[1] == 'x' || p[1] == 'X'){
5759 /* hexadecimal number */
5760 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
5761 unicode_subchar <<= 4;
5762 unicode_subchar |= hex2bin(p[i]);
5766 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
5767 unicode_subchar *= 8;
5768 unicode_subchar += hex2bin(p[i]);
5771 w16e_conv(unicode_subchar, &i, &j);
5772 unicode_subchar = i<<8 | j;
5776 #ifdef UTF8_OUTPUT_ENABLE
5777 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
5778 ms_ucs_map_f = UCS_MAP_MS;
5782 #ifdef UNICODE_NORMALIZATION
5783 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
5788 if (strcmp(long_option[i].name, "prefix=") == 0){
5789 if (nkf_isgraph(p[0])){
5790 for (i = 1; nkf_isgraph(p[i]); i++){
5791 prefix_table[p[i]] = p[0];
5796 #if !defined(PERL_XS) && !defined(WIN32DLL)
5797 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
5802 case 'b': /* buffered mode */
5805 case 'u': /* non-buffered mode */
5808 case 't': /* transparent mode */
5813 } else if (*cp=='2') {
5817 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
5825 case 'j': /* JIS output */
5827 output_encoding = nkf_enc_from_index(ISO_2022_JP);
5829 case 'e': /* AT&T EUC output */
5830 output_encoding = nkf_enc_from_index(EUC_JP);
5832 case 's': /* SJIS output */
5833 output_encoding = nkf_enc_from_index(WINDOWS_31J);
5835 case 'l': /* ISO8859 Latin-1 support, no conversion */
5836 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
5837 input_encoding = nkf_enc_from_index(ISO_8859_1);
5839 case 'i': /* Kanji IN ESC-$-@/B */
5840 if (*cp=='@'||*cp=='B')
5841 kanji_intro = *cp++;
5843 case 'o': /* ASCII IN ESC-(-J/B */
5844 if (*cp=='J'||*cp=='B'||*cp=='H')
5845 ascii_intro = *cp++;
5849 bit:1 katakana->hiragana
5850 bit:2 hiragana->katakana
5852 if ('9'>= *cp && *cp>='0')
5853 hira_f |= (*cp++ -'0');
5860 #if defined(MSDOS) || defined(__OS2__)
5867 show_configuration();
5875 #ifdef UTF8_OUTPUT_ENABLE
5876 case 'w': /* UTF-8 output */
5881 output_encoding = nkf_enc_from_index(UTF_8N);
5883 output_bom_f = TRUE;
5884 output_encoding = nkf_enc_from_index(UTF_8_BOM);
5888 if ('1'== cp[0] && '6'==cp[1]) {
5891 } else if ('3'== cp[0] && '2'==cp[1]) {
5895 output_encoding = nkf_enc_from_index(UTF_8);
5900 output_endian = ENDIAN_LITTLE;
5901 } else if (cp[0] == 'B') {
5904 output_encoding = nkf_enc_from_index(enc_idx);
5909 enc_idx = enc_idx == UTF_16
5910 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5911 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5913 output_bom_f = TRUE;
5914 enc_idx = enc_idx == UTF_16
5915 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
5916 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
5918 output_encoding = nkf_enc_from_index(enc_idx);
5922 #ifdef UTF8_INPUT_ENABLE
5923 case 'W': /* UTF input */
5926 input_encoding = nkf_enc_from_index(UTF_8);
5929 if ('1'== cp[0] && '6'==cp[1]) {
5931 input_endian = ENDIAN_BIG;
5933 } else if ('3'== cp[0] && '2'==cp[1]) {
5935 input_endian = ENDIAN_BIG;
5938 input_encoding = nkf_enc_from_index(UTF_8);
5943 input_endian = ENDIAN_LITTLE;
5944 } else if (cp[0] == 'B') {
5946 input_endian = ENDIAN_BIG;
/* The input byte order must come from input_endian (set just above);
 * using output_endian here made the result depend on unrelated -w flags. */
5948 enc_idx = enc_idx == UTF_16
5949 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5950 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5951 input_encoding = nkf_enc_from_index(enc_idx);
5955 /* Input code assumption */
5956 case 'J': /* ISO-2022-JP input */
5957 input_encoding = nkf_enc_from_index(ISO_2022_JP);
5959 case 'E': /* EUC-JP input */
5960 input_encoding = nkf_enc_from_index(EUC_JP);
5962 case 'S': /* Windows-31J input */
5963 input_encoding = nkf_enc_from_index(WINDOWS_31J);
5965 case 'Z': /* Convert X0208 alphabet to ASCII */
5967 bit:0 Convert JIS X 0208 Alphabet to ASCII
5968 bit:1 Convert Kankaku to one space
5969 bit:2 Convert Kankaku to two spaces
5970 bit:3 Convert HTML Entity
5971 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
5973 while ('0'<= *cp && *cp <='9') {
5974 alpha_f |= 1 << (*cp++ - '0');
5976 if (!alpha_f) alpha_f = 1;
5978 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
5979 x0201_f = FALSE; /* No X0201->X0208 conversion */
5981 ESC-(-I in JIS, EUC, MS Kanji
5982 SI/SO in JIS, EUC, MS Kanji
5983 SS2 in EUC, JIS, not in MS Kanji
5984 MS Kanji (0xa0-0xdf)
5986 ESC-(-I in JIS (0x20-0x5f)
5987 SS2 in EUC (0xa0-0xdf)
5988 0xa0-0xd in MS Kanji (0xa0-0xdf)
5991 case 'X': /* Convert X0201 kana to X0208 */
5994 case 'F': /* preserve new lines */
5995 fold_preserve_f = TRUE;
/* fallthrough: -F also takes the folding arguments handled by -f */
5996 case 'f': /* folding -f60 or -f */
5999 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6001 fold_len += *cp++ - '0';
6003 if (!(0<fold_len && fold_len<BUFSIZ))
6004 fold_len = DEFAULT_FOLD;
6008 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6010 fold_margin += *cp++ - '0';
6014 case 'm': /* MIME support */
6015 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6016 if (*cp=='B'||*cp=='Q') {
6017 mime_decode_mode = *cp++;
6018 mimebuf_f = FIXED_MIME;
6019 } else if (*cp=='N') {
6020 mime_f = TRUE; cp++;
6021 } else if (*cp=='S') {
6022 mime_f = STRICT_MIME; cp++;
6023 } else if (*cp=='0') {
6024 mime_decode_f = FALSE;
6025 mime_f = FALSE; cp++;
6027 mime_f = STRICT_MIME;
6030 case 'M': /* MIME output */
6033 mimeout_f = FIXED_MIME; cp++;
6034 } else if (*cp=='Q') {
6036 mimeout_f = FIXED_MIME; cp++;
6041 case 'B': /* Broken JIS support */
6043 bit:1 allow any x on ESC-(-x or ESC-$-x
6044 bit:2 reset to ascii on NL
6046 if ('9'>= *cp && *cp>='0')
6047 broken_f |= 1<<(*cp++ -'0');
6052 case 'O':/* for Output file */
6056 case 'c':/* add cr code */
6059 case 'd':/* delete cr code */
6062 case 'I': /* ISO-2022-JP output */
6065 case 'L': /* line mode */
6066 if (*cp=='u') { /* unix */
6067 eolmode_f = LF; cp++;
6068 } else if (*cp=='m') { /* mac */
6069 eolmode_f = CR; cp++;
6070 } else if (*cp=='w') { /* windows */
6071 eolmode_f = CRLF; cp++;
6072 } else if (*cp=='0') { /* no conversion */
6073 eolmode_f = 0; cp++;
6078 if ('2' <= *cp && *cp <= '9') {
6081 } else if (*cp == '0' || *cp == '1') {
6090 /* multiple options in a string are allowed for the Perl module */
6091 while(*cp && *cp++!='-');
6094 #if !defined(PERL_XS) && !defined(WIN32DLL)
6095 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6097 /* bogus option but ignored */
6105 #include "nkf32dll.c"
6106 #elif defined(PERL_XS)
6107 #else /* WIN32DLL */
/*
 * main -- command-line entry point; it sits in the #else arm that follows
 * the WIN32DLL and PERL_XS cases.
 *
 * Leading arguments that start with '-' are consumed as options. With no
 * file arguments the visible code converts stdin with kanji_convert();
 * otherwise each named file is opened with fopen() and converted. When
 * file_out_f is TRUE, stdout is redirected to an output file (a temporary
 * name built from the original file name, or "nkf.out"); afterwards the
 * original file mode and, if preserve_time_f is set, its timestamps are
 * copied to the new file, an optional backup is made, and the new file is
 * renamed over the original.
 *
 * NOTE(review): this listing omits many lines (braces, #else/#endif arms,
 * the return statements), so which branch each statement belongs to must
 * be confirmed against the full file.
 */
6108 int main(int argc, char **argv)
6113 char *outfname = NULL;
6116 #ifdef EASYWIN /*Easy Win */
6117 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6119 #ifdef DEFAULT_CODE_LOCALE
6120 setlocale(LC_CTYPE, "");
6122 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6123 cp = (unsigned char *)*argv;
6128 if (pipe(fds) < 0 || (pid = fork()) < 0){
6139 execvp(argv[1], &argv[1]);
6156 int debug_f_back = debug_f;
6159 int exec_f_back = exec_f;
6162 int x0212_f_back = x0212_f;
6164 int x0213_f_back = x0213_f;
6165 int guess_f_back = guess_f;
6167 guess_f = guess_f_back;
6170 debug_f = debug_f_back;
6173 exec_f = exec_f_back;
6176 x0212_f = x0212_f_back;
6178 x0213_f = x0213_f_back;
6181 if (binmode_f == TRUE)
6182 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6183 if (freopen("","wb",stdout) == NULL)
6190 setbuf(stdout, (char *) NULL);
6192 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6195 if (binmode_f == TRUE)
6196 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6197 if (freopen("","rb",stdin) == NULL) return (-1);
6201 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
6205 kanji_convert(stdin);
6206 if (guess_f) print_guessed_code(NULL);
6210 int is_argument_error = FALSE;
6212 input_codename = NULL;
6215 iconv_for_check = 0;
6217 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6219 is_argument_error = TRUE;
6227 /* reopen file for stdout */
6228 if (file_out_f == TRUE) {
6231 outfname = malloc(strlen(origfname)
6232 + strlen(".nkftmpXXXXXX")
6238 strcpy(outfname, origfname);
6242 for (i = strlen(outfname); i; --i){
6243 if (outfname[i - 1] == '/'
6244 || outfname[i - 1] == '\\'){
6250 strcat(outfname, "ntXXXXXX");
6252 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6253 S_IREAD | S_IWRITE);
6255 strcat(outfname, ".nkftmpXXXXXX");
6256 fd = mkstemp(outfname);
6259 || (fd_backup = dup(fileno(stdout))) < 0
6260 || dup2(fd, fileno(stdout)) < 0
6271 outfname = "nkf.out";
6274 if(freopen(outfname, "w", stdout) == NULL) {
6278 if (binmode_f == TRUE) {
6279 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6280 if (freopen("","wb",stdout) == NULL)
6287 if (binmode_f == TRUE)
6288 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6289 if (freopen("","rb",fin) == NULL)
6294 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6298 char *filename = NULL;
6300 if (nfiles > 1) filename = origfname;
6301 if (guess_f) print_guessed_code(filename);
6307 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6315 if (dup2(fd_backup, fileno(stdout)) < 0){
6318 if (stat(origfname, &sb)) {
6319 fprintf(stderr, "Can't stat %s\n", origfname);
6321 /* restore the permissions of the original file */
6322 if (chmod(outfname, sb.st_mode)) {
6323 fprintf(stderr, "Can't set permission %s\n", outfname);
6326 /* restore the timestamps of the original file */
6327 if(preserve_time_f){
6328 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6329 tb[0] = tb[1] = sb.st_mtime;
6330 if (utime(outfname, tb)) {
6331 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6334 tb.actime = sb.st_atime;
6335 tb.modtime = sb.st_mtime;
6336 if (utime(outfname, &tb)) {
6337 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6342 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6344 unlink(backup_filename);
6346 if (rename(origfname, backup_filename)) {
6347 perror(backup_filename);
6348 fprintf(stderr, "Can't rename %s to %s\n",
6349 origfname, backup_filename);
6353 if (unlink(origfname)){
6358 if (rename(outfname, origfname)) {
6360 fprintf(stderr, "Can't rename %s to %s\n",
6361 outfname, origfname);
6368 if (is_argument_error)
6371 #ifdef EASYWIN /*Easy Win */
6372 if (file_out_f == FALSE)
6373 scanf("%d",&end_check);
6376 #else /* for Other OS */
6377 if (file_out_f == TRUE)
6379 #endif /*Easy Win */
6382 #endif /* WIN32DLL */