1 /** Network Kanji Filter. (PDS Version)
2 ** -*- coding: ISO-2022-JP -*-
3 ************************************************************************
4 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
5 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
6 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
7 ** Copyright (C) 1996,1998
9 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
10 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
11 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
12 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
14 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
15 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
16 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
17 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
18 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
19 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
21 ** Everyone is permitted to do anything on this program
22 ** including copying, modifying, improving,
23 ** as long as you don't try to pretend that you wrote it.
24 ** i.e., the above copyright notice has to appear in all copies.
25 ** Binary distribution requires original version messages.
26 ** You don't have to ask before copying, redistribution or publishing.
27 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
28 ***********************************************************************/
30 /***********************************************************************
31 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
32 * http://sourceforge.jp/projects/nkf/
33 ***********************************************************************/
34 #define NKF_IDENT "$Id: nkf.c,v 1.185 2008/10/28 13:42:25 naruse Exp $"
35 #define NKF_VERSION "2.0.8"
36 #define NKF_RELEASE_DATE "2008-10-28"
38 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
39 "Copyright (C) 2002-2008 Kono, Furukawa, Naruse, mastodon"
49 /* state of output_mode and input_mode
128 NKF_ENCODING_TABLE_SIZE,
129 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
130 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
131 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
132 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
133 JIS_X_0208 = 0x1168, /* @B */
134 JIS_X_0212 = 0x1159, /* D */
135 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
136 JIS_X_0213_2 = 0x1229, /* P */
137 JIS_X_0213_1 = 0x1233, /* Q */
140 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
141 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
142 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
143 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
144 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
145 static void j_oconv(nkf_char c2, nkf_char c1);
146 static void s_oconv(nkf_char c2, nkf_char c1);
147 static void e_oconv(nkf_char c2, nkf_char c1);
148 static void w_oconv(nkf_char c2, nkf_char c1);
149 static void w_oconv16(nkf_char c2, nkf_char c1);
150 static void w_oconv32(nkf_char c2, nkf_char c1);
154 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
155 void (*oconv)(nkf_char c2, nkf_char c1);
156 } nkf_native_encoding;
158 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
159 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
160 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
161 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
162 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
163 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
164 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
169 const nkf_native_encoding *base_encoding;
172 nkf_encoding nkf_encoding_table[] = {
173 {ASCII, "US-ASCII", &NkfEncodingASCII},
174 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
175 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
176 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
177 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
178 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
179 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
180 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
181 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
182 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
183 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
184 {CP10001, "CP10001", &NkfEncodingShift_JIS},
185 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
186 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
187 {CP51932, "CP51932", &NkfEncodingEUC_JP},
188 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
189 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
190 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
191 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
192 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
193 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
194 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
195 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
196 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
197 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
198 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
199 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
200 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
201 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
202 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
203 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
204 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
205 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
206 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
207 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
208 {BINARY, "BINARY", &NkfEncodingASCII},
215 } encoding_name_to_id_table[] = {
218 {"ISO-2022-JP", ISO_2022_JP},
219 {"ISO2022JP-CP932", CP50220},
220 {"CP50220", CP50220},
221 {"CP50221", CP50221},
222 {"CSISO2022JP", CP50221},
223 {"CP50222", CP50222},
224 {"ISO-2022-JP-1", ISO_2022_JP_1},
225 {"ISO-2022-JP-3", ISO_2022_JP_3},
226 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
227 {"SHIFT_JIS", SHIFT_JIS},
229 {"WINDOWS-31J", WINDOWS_31J},
230 {"CSWINDOWS31J", WINDOWS_31J},
231 {"CP932", WINDOWS_31J},
232 {"MS932", WINDOWS_31J},
233 {"CP10001", CP10001},
236 {"EUCJP-NKF", EUCJP_NKF},
237 {"CP51932", CP51932},
238 {"EUC-JP-MS", EUCJP_MS},
239 {"EUCJP-MS", EUCJP_MS},
240 {"EUCJPMS", EUCJP_MS},
241 {"EUC-JP-ASCII", EUCJP_ASCII},
242 {"EUCJP-ASCII", EUCJP_ASCII},
243 {"SHIFT_JISX0213", SHIFT_JISX0213},
244 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
245 {"EUC-JISX0213", EUC_JISX0213},
246 {"EUC-JIS-2004", EUC_JIS_2004},
249 {"UTF-8-BOM", UTF_8_BOM},
250 {"UTF8-MAC", UTF8_MAC},
251 {"UTF-8-MAC", UTF8_MAC},
253 {"UTF-16BE", UTF_16BE},
254 {"UTF-16BE-BOM", UTF_16BE_BOM},
255 {"UTF-16LE", UTF_16LE},
256 {"UTF-16LE-BOM", UTF_16LE_BOM},
258 {"UTF-32BE", UTF_32BE},
259 {"UTF-32BE-BOM", UTF_32BE_BOM},
260 {"UTF-32LE", UTF_32LE},
261 {"UTF-32LE-BOM", UTF_32LE_BOM},
266 #if defined(DEFAULT_CODE_JIS)
267 #define DEFAULT_ENCIDX ISO_2022_JP
268 #elif defined(DEFAULT_CODE_SJIS)
269 #define DEFAULT_ENCIDX SHIFT_JIS
270 #elif defined(DEFAULT_CODE_EUC)
271 #define DEFAULT_ENCIDX EUC_JP
272 #elif defined(DEFAULT_CODE_UTF8)
273 #define DEFAULT_ENCIDX UTF_8
/*
 * Hand-rolled ASCII character classification and conversion helpers.
 * They compare against character literals only and never call <ctype.h>.
 *
 * These are macros, so an argument may be evaluated more than once: never
 * pass an expression with side effects (e.g. nkf_toupper(*p++)).  Every use
 * of a parameter is parenthesized so that an argument containing a
 * low-precedence operator (|, &, ?:, ...) is treated as one value.
 *
 * SP, TAB, CR, LF, DEL, SS3, CP932_TABLE_BEGIN and CP932_TABLE_END are
 * defined outside this section.
 */

/* Non-zero if c is an ASCII letter or decimal digit (same test as nkf_isalnum). */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
/* Maps 'a'..'z' to 'A'..'Z'; every other value is returned unchanged. */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
/* Non-zero if c is one of '0'..'7'. */
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
/* Non-zero if c is one of '0'..'9'. */
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
/* Non-zero if c is a hexadecimal digit in either case. */
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
/* Non-zero if c equals SP or TAB. */
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
/* Non-zero if c equals SP, TAB, CR or LF. */
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
/* Non-zero if c is an ASCII letter. */
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
/* Non-zero if c is an ASCII letter or decimal digit. */
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
/* Non-zero if c lies in SP..'~'. */
#define nkf_isprint(c) (SP <= (c) && (c) <= '~')
/* Non-zero if c lies in '!'..'~'. */
#define nkf_isgraph(c) ('!' <= (c) && (c) <= '~')
/* Numeric value (0-15) of the hexadecimal digit c; any other character yields 0. */
#define hex2bin(c) (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
    ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
    ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c) & 15])
/* Non-zero if bits 8-15 of c2 (after truncation to unsigned short) equal SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/*
 * Non-zero for CR, LF, and for characters strictly between SP and DEL other
 * than '?', '=', '_', '(', ')', '.' and 0x22 (double quote).
 * NOTE(review): presumably the characters MIME output may emit unescaped;
 * confirm against the MIME encoder.
 */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

/* Non-zero if c2 lies in CP932_TABLE_BEGIN..CP932_TABLE_END (inclusive). */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
/* Non-zero if SP <= c < 0x60 (0xE0 with the high bit stripped). */
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) < (0xE0 & 0x7F))
303 #define HOLD_SIZE 1024
304 #if defined(INT_IS_SHORT)
305 #define IOBUF_SIZE 2048
307 #define IOBUF_SIZE 16384
310 #define DEFAULT_J 'B'
311 #define DEFAULT_R 'B'
318 /* MIME preprocessor */
320 #ifdef EASYWIN /*Easy Win */
321 extern POINT _BufferSize;
330 void (*status_func)(struct input_code *, nkf_char);
331 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
335 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
336 static nkf_encoding *input_encoding = NULL;
337 static nkf_encoding *output_encoding = NULL;
339 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
341 * 0: Shift_JIS, eucJP-ascii
346 #define UCS_MAP_ASCII 0
348 #define UCS_MAP_CP932 2
349 #define UCS_MAP_CP10001 3
350 static int ms_ucs_map_f = UCS_MAP_ASCII;
352 #ifdef UTF8_INPUT_ENABLE
353 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
354 static int no_cp932ext_f = FALSE;
355 /* ignore ZERO WIDTH NO-BREAK SPACE */
356 static int no_best_fit_chars_f = FALSE;
357 static int input_endian = ENDIAN_BIG;
358 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
359 static void (*encode_fallback)(nkf_char c) = NULL;
360 static void w_status(struct input_code *, nkf_char);
362 #ifdef UTF8_OUTPUT_ENABLE
363 static int output_bom_f = FALSE;
364 static int output_endian = ENDIAN_BIG;
367 static void std_putc(nkf_char c);
368 static nkf_char std_getc(FILE *f);
369 static nkf_char std_ungetc(nkf_char c,FILE *f);
371 static nkf_char broken_getc(FILE *f);
372 static nkf_char broken_ungetc(nkf_char c,FILE *f);
374 static nkf_char mime_getc(FILE *f);
376 static void mime_putc(nkf_char c);
380 #if !defined(PERL_XS) && !defined(WIN32DLL)
381 static unsigned char stdibuf[IOBUF_SIZE];
382 static unsigned char stdobuf[IOBUF_SIZE];
/*
 * Global option flags.  Each starts at the default shown here;
 * MIME_DECODE_DEFAULT and X0201_DEFAULT are defined outside this section.
 */
/* NOTE(review): presumably unbuffered output (help text: "u" = output is unbuffered) -- confirm */
386 static int unbuf_f = FALSE;
/* NOTE(review): presumably "input encoding established"; set_iconv() tests it before recording the input code name -- confirm */
387 static int estab_f = FALSE;
/* NOTE(review): presumably a no-conversion / pass-through switch -- TODO confirm */
388 static int nop_f = FALSE;
389 static int binmode_f = TRUE; /* binary mode */
390 static int rot_f = FALSE; /* ROT13/47 mode (cf. the usage text for -r) */
391 static int hira_f = FALSE; /* hiragana/katakana conversion */
392 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
393 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
394 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
395 static int mimebuf_f = FALSE; /* MIME buffered input */
396 static int broken_f = FALSE; /* convert ESC-less broken JIS */
397 static int iso8859_f = FALSE; /* pass ISO 8859 through */
398 static int mimeout_f = FALSE; /* base64 mode */
399 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
400 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
402 #ifdef UNICODE_NORMALIZATION
403 static int nfc_f = FALSE;
404 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
405 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
409 static int cap_f = FALSE;
410 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
411 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
413 static int url_f = FALSE;
414 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
415 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Bit layout helpers for nkf_char values.
 *
 * The top byte (CLASS_MASK) holds a class tag and the low 24 bits
 * (VALUE_MASK) hold the value; CLASS_UNICODE tags a value as a Unicode code
 * point.  PREFIX_EUCG3 (0x8F00) is OR-ed into a value by nkf_char_euc3_new().
 *
 * NKF_INT32_C is defined outside this section.  The limit constants below
 * are already wrapped in it, so the predicates compare against them directly.
 * Re-wrapping a macro name would be redundant and would break if NKF_INT32_C
 * is implemented by pasting a suffix onto its argument.
 *
 * The parameter is parenthesized in every expansion so that an argument such
 * as (hi | lo) is masked as a whole.  It is evaluated once per macro.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX	NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
/* c with the PREFIX_EUCG3 bits set. */
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
/* c tagged with the CLASS_UNICODE class bits. */
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/* Non-zero if the class byte of c is exactly CLASS_UNICODE. */
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* Non-zero if the 24-bit value of c is at most UNICODE_BMP_MAX (0xFFFF). */
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* Non-zero if the 24-bit value of c is at most UNICODE_MAX (0x10FFFF). */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
430 #ifdef NUMCHAR_OPTION
431 static int numchar_f = FALSE;
432 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
433 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
437 static int noout_f = FALSE;
438 static void no_putc(nkf_char c);
439 static int debug_f = FALSE;
440 static void debug(const char *str);
441 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
444 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
445 static void set_input_codename(const char *codename);
448 static int exec_f = 0;
451 #ifdef SHIFTJIS_CP932
452 /* invert IBM extended characters to others */
453 static int cp51932_f = FALSE;
455 /* invert NEC-selected IBM extended characters to IBM extended characters */
456 static int cp932inv_f = TRUE;
458 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
459 #endif /* SHIFTJIS_CP932 */
461 static int x0212_f = FALSE;
462 static int x0213_f = FALSE;
464 static unsigned char prefix_table[256];
466 static void e_status(struct input_code *, nkf_char);
467 static void s_status(struct input_code *, nkf_char);
469 struct input_code input_code_list[] = {
470 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
471 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
472 #ifdef UTF8_INPUT_ENABLE
473 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
478 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
479 static int base64_count = 0;
481 /* X0208 -> ASCII converter */
484 static int f_line = 0; /* chars in line */
485 static int f_prev = 0;
486 static int fold_preserve_f = FALSE; /* preserve new lines */
487 static int fold_f = FALSE;
488 static int fold_len = 0;
491 static unsigned char kanji_intro = DEFAULT_J;
492 static unsigned char ascii_intro = DEFAULT_R;
496 #define FOLD_MARGIN 10
497 #define DEFAULT_FOLD 60
499 static int fold_margin = FOLD_MARGIN;
501 /* process default */
504 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
506 fprintf(stderr,"nkf internal module connection failure.\n");
512 no_connection(nkf_char c2, nkf_char c1)
514 no_connection2(c2,c1,0);
517 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
518 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
520 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
521 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
522 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
523 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
524 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
525 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
526 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
528 /* static redirections */
530 static void (*o_putc)(nkf_char c) = std_putc;
532 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
533 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
535 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
536 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
538 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
540 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
541 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
543 /* for strict mime */
544 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
545 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
548 static int output_mode = ASCII; /* output kanji mode */
549 static int input_mode = ASCII; /* input kanji mode */
550 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
552 /* X0201 / X0208 conversion tables */
554 /* X0201 kana conversion table */
556 static const unsigned char cv[]= {
557 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
558 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
559 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
560 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
561 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
562 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
563 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
564 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
565 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
566 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
567 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
568 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
569 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
570 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
571 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
572 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
576 /* X0201 kana conversion table for dakuten (voiced sound mark) */
578 static const unsigned char dv[]= {
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
583 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
584 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
585 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
586 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
587 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
588 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
590 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
597 /* X0201 kana conversion table for han-dakuten (semi-voiced sound mark) */
599 static const unsigned char ev[]= {
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
611 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
619 /* X0208 kigou conversion table */
620 /* 0x8140 - 0x819e */
621 static const unsigned char fv[] = {
623 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
624 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
625 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
626 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
627 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
628 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
629 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
630 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
631 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
633 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
639 static int option_mode = 0;
640 static int file_out_f = FALSE;
642 static int overwrite_f = FALSE;
643 static int preserve_time_f = FALSE;
644 static int backup_f = FALSE;
645 static char *backup_suffix = "";
648 static int eolmode_f = 0; /* CR, LF, CRLF */
649 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
650 static nkf_char prev_cr = 0; /* CR or 0 */
651 #ifdef EASYWIN /*Easy Win */
652 static int end_check;
655 #define STD_GC_BUFSIZE (256)
656 nkf_char std_gc_buf[STD_GC_BUFSIZE];
660 nkf_str_caseeql(const char *src, const char *target)
663 for (i = 0; src[i] && target[i]; i++) {
664 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
666 if (src[i] || target[i]) return FALSE;
671 nkf_enc_from_index(int idx)
673 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
676 return &nkf_encoding_table[idx];
680 nkf_enc_find_index(const char *name)
683 if (name[0] == 'X' && *(name+1) == '-') name += 2;
684 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
685 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
686 return encoding_name_to_id_table[i].id;
693 nkf_enc_find(const char *name)
696 idx = nkf_enc_find_index(name);
697 if (idx < 0) return 0;
698 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for a pointer to an nkf_encoding entry
 * (the element type of nkf_encoding_table).
 */
/* The entry's name member. */
701 #define nkf_enc_name(enc) (enc)->name
/* The entry's id member. */
702 #define nkf_enc_to_index(enc) (enc)->id
/* The entry's base_encoding member (pointer to an nkf_native_encoding). */
703 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* The iconv / oconv function pointers stored in the entry's base encoding. */
704 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
705 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP (pointer comparison). */
706 #define nkf_enc_asciicompat(enc) (\
707 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
708 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is NkfEncodingUTF_8, NkfEncodingUTF_16 or NkfEncodingUTF_32. */
709 #define nkf_enc_unicode_p(enc) (\
710 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
711 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
712 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the entry's id is CP50220, CP50221 or CP50222. */
713 #define nkf_enc_cp5022x_p(enc) (\
714 nkf_enc_to_index(enc) == CP50220 ||\
715 nkf_enc_to_index(enc) == CP50221 ||\
716 nkf_enc_to_index(enc) == CP50222)
718 #ifdef DEFAULT_CODE_LOCALE
722 #ifdef HAVE_LANGINFO_H
723 return nl_langinfo(CODESET);
724 #elif defined(__WIN32__)
727 int len = sprintf(buf, "CP%d", GetACP());
729 str = malloc(len + 1);
741 nkf_locale_encoding()
743 nkf_encoding *enc = 0;
744 char *encname = nkf_locale_charmap();
746 enc = nkf_enc_find(encname);
747 if (enc < 0) enc = 0;
750 #endif /* DEFAULT_CODE_LOCALE */
753 nkf_default_encoding()
755 nkf_encoding *enc = 0;
756 #ifdef DEFAULT_CODE_LOCALE
757 enc = nkf_locale_encoding();
759 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
766 #define fprintf dllprintf
772 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
779 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
781 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
782 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
783 #ifdef UTF8_OUTPUT_ENABLE
784 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
786 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
787 #ifdef UTF8_INPUT_ENABLE
788 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
791 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
792 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
793 "r {de/en}crypt ROT13/47\n"
794 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
795 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
796 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
797 "l ISO8859-1 (Latin-1) support\n"
798 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
799 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
800 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
801 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
802 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
803 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
805 "T Text mode output\n"
807 "O Output to File (DEFAULT 'nkf.out')\n"
808 "I Convert non ISO-2022-JP charactor to GETA\n"
809 "d,c Convert line breaks -d: LF -c: CRLF\n"
810 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
811 "v, V Show this usage. V: show configuration\n"
813 "Long name options\n"
814 " --ic=<input codeset> --oc=<output codeset>\n"
815 " Specify the input or output codeset\n"
816 " --fj --unix --mac --windows\n"
817 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
818 " Convert for the system or code\n"
819 " --hiragana --katakana --katakana-hiragana\n"
820 " To Hiragana/Katakana Conversion\n"
821 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
823 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
825 #ifdef NUMCHAR_OPTION
826 " --numchar-input Convert Unicode Character Reference\n"
828 #ifdef UTF8_INPUT_ENABLE
829 " --fb-{skip, html, xml, perl, java, subchar}\n"
830 " Specify how nkf handles unassigned characters\n"
833 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
834 " Overwrite original listed files by filtered result\n"
835 " --overwrite preserves timestamp of original files\n"
837 " -g --guess Guess the input code\n"
838 " --help --version Show this help/the version\n"
839 " For more information, see also man nkf\n"
845 show_configuration(void)
848 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
851 " Compile-time options:\n"
852 " Compiled at: " __DATE__ " " __TIME__ "\n"
855 " Default output encoding: "
856 #ifdef DEFAULT_CODE_LOCALE
857 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
859 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
865 " Default output end of line: "
866 #if DEFAULT_NEWLINE == CR
868 #elif DEFAULT_NEWLINE == CRLF
874 " Decode MIME encoded string: "
875 #if MIME_DECODE_DEFAULT
881 " Convert JIS X 0201 Katakana: "
888 " --help, --version output: "
889 #if HELP_OUTPUT_HELP_OUTPUT
900 get_backup_filename(const char *suffix, const char *filename)
902 char *backup_filename;
903 int asterisk_count = 0;
905 int filename_length = strlen(filename);
907 for(i = 0; suffix[i]; i++){
908 if(suffix[i] == '*') asterisk_count++;
912 backup_filename = malloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
913 if (!backup_filename){
914 perror("Can't malloc backup filename.");
918 for(i = 0, j = 0; suffix[i];){
919 if(suffix[i] == '*'){
920 backup_filename[j] = '\0';
921 strncat(backup_filename, filename, filename_length);
923 j += filename_length;
925 backup_filename[j++] = suffix[i++];
928 backup_filename[j] = '\0';
930 j = filename_length + strlen(suffix);
931 backup_filename = malloc(j + 1);
932 strcpy(backup_filename, filename);
933 strcat(backup_filename, suffix);
934 backup_filename[j] = '\0';
936 return backup_filename;
940 #ifdef UTF8_INPUT_ENABLE
942 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
949 (*f)(0, bin2hex(c>>shift));
960 encode_fallback_html(nkf_char c)
965 if(c >= NKF_INT32_C(1000000))
966 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
967 if(c >= NKF_INT32_C(100000))
968 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
970 (*oconv)(0, 0x30+(c/10000 )%10);
972 (*oconv)(0, 0x30+(c/1000 )%10);
974 (*oconv)(0, 0x30+(c/100 )%10);
976 (*oconv)(0, 0x30+(c/10 )%10);
978 (*oconv)(0, 0x30+ c %10);
984 encode_fallback_xml(nkf_char c)
989 nkf_each_char_to_hex(oconv, c);
995 encode_fallback_java(nkf_char c)
999 if(!nkf_char_unicode_bmp_p(c)){
1003 (*oconv)(0, bin2hex(c>>20));
1004 (*oconv)(0, bin2hex(c>>16));
1008 (*oconv)(0, bin2hex(c>>12));
1009 (*oconv)(0, bin2hex(c>> 8));
1010 (*oconv)(0, bin2hex(c>> 4));
1011 (*oconv)(0, bin2hex(c ));
1016 encode_fallback_perl(nkf_char c)
1021 nkf_each_char_to_hex(oconv, c);
1027 encode_fallback_subchar(nkf_char c)
1029 c = unicode_subchar;
1030 (*oconv)((c>>8)&0xFF, c&0xFF);
1035 static const struct {
1059 {"katakana-hiragana","h3"},
1067 #ifdef UTF8_OUTPUT_ENABLE
1077 {"fb-subchar=", ""},
1079 #ifdef UTF8_INPUT_ENABLE
1080 {"utf8-input", "W"},
1081 {"utf16-input", "W16"},
1082 {"no-cp932ext", ""},
1083 {"no-best-fit-chars",""},
1085 #ifdef UNICODE_NORMALIZATION
1086 {"utf8mac-input", ""},
1098 #ifdef NUMCHAR_OPTION
1099 {"numchar-input", ""},
1105 #ifdef SHIFTJIS_CP932
1116 set_input_encoding(nkf_encoding *enc)
1118 switch (nkf_enc_to_index(enc)) {
1125 #ifdef SHIFTJIS_CP932
1128 #ifdef UTF8_OUTPUT_ENABLE
1129 ms_ucs_map_f = UCS_MAP_CP932;
1139 case ISO_2022_JP_2004:
1146 #ifdef SHIFTJIS_CP932
1149 #ifdef UTF8_OUTPUT_ENABLE
1150 ms_ucs_map_f = UCS_MAP_CP932;
1155 #ifdef SHIFTJIS_CP932
1158 #ifdef UTF8_OUTPUT_ENABLE
1159 ms_ucs_map_f = UCS_MAP_CP10001;
1167 #ifdef SHIFTJIS_CP932
1170 #ifdef UTF8_OUTPUT_ENABLE
1171 ms_ucs_map_f = UCS_MAP_CP932;
1175 #ifdef SHIFTJIS_CP932
1178 #ifdef UTF8_OUTPUT_ENABLE
1179 ms_ucs_map_f = UCS_MAP_MS;
1183 #ifdef SHIFTJIS_CP932
1186 #ifdef UTF8_OUTPUT_ENABLE
1187 ms_ucs_map_f = UCS_MAP_ASCII;
1190 case SHIFT_JISX0213:
1191 case SHIFT_JIS_2004:
1193 #ifdef SHIFTJIS_CP932
1200 #ifdef SHIFTJIS_CP932
1204 #ifdef UTF8_INPUT_ENABLE
1205 #ifdef UNICODE_NORMALIZATION
1213 input_endian = ENDIAN_BIG;
1217 input_endian = ENDIAN_LITTLE;
1222 input_endian = ENDIAN_BIG;
1226 input_endian = ENDIAN_LITTLE;
1233 set_output_encoding(nkf_encoding *enc)
1235 switch (nkf_enc_to_index(enc)) {
1238 #ifdef SHIFTJIS_CP932
1239 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1241 #ifdef UTF8_OUTPUT_ENABLE
1242 ms_ucs_map_f = UCS_MAP_CP932;
1246 #ifdef SHIFTJIS_CP932
1247 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1249 #ifdef UTF8_OUTPUT_ENABLE
1250 ms_ucs_map_f = UCS_MAP_CP932;
1255 #ifdef SHIFTJIS_CP932
1256 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1262 #ifdef SHIFTJIS_CP932
1263 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1269 #ifdef UTF8_OUTPUT_ENABLE
1270 ms_ucs_map_f = UCS_MAP_CP932;
1274 #ifdef UTF8_OUTPUT_ENABLE
1275 ms_ucs_map_f = UCS_MAP_CP10001;
1280 #ifdef SHIFTJIS_CP932
1281 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1283 #ifdef UTF8_OUTPUT_ENABLE
1284 ms_ucs_map_f = UCS_MAP_ASCII;
1289 #ifdef SHIFTJIS_CP932
1290 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1292 #ifdef UTF8_OUTPUT_ENABLE
1293 ms_ucs_map_f = UCS_MAP_ASCII;
1297 #ifdef SHIFTJIS_CP932
1298 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1300 #ifdef UTF8_OUTPUT_ENABLE
1301 ms_ucs_map_f = UCS_MAP_CP932;
1306 #ifdef UTF8_OUTPUT_ENABLE
1307 ms_ucs_map_f = UCS_MAP_MS;
1312 #ifdef UTF8_OUTPUT_ENABLE
1313 ms_ucs_map_f = UCS_MAP_ASCII;
1316 case SHIFT_JISX0213:
1317 case SHIFT_JIS_2004:
1319 #ifdef SHIFTJIS_CP932
1320 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1327 #ifdef SHIFTJIS_CP932
1328 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1331 #ifdef UTF8_OUTPUT_ENABLE
1333 output_bom_f = TRUE;
1337 output_bom_f = TRUE;
1340 output_endian = ENDIAN_LITTLE;
1341 output_bom_f = FALSE;
1344 output_endian = ENDIAN_LITTLE;
1345 output_bom_f = TRUE;
1348 output_bom_f = TRUE;
1351 output_endian = ENDIAN_LITTLE;
1352 output_bom_f = FALSE;
1355 output_endian = ENDIAN_LITTLE;
1356 output_bom_f = TRUE;
1362 static struct input_code*
1363 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1366 struct input_code *p = input_code_list;
1368 if (iconv_func == p->iconv_func){
1378 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1380 #ifdef INPUT_CODE_FIX
1381 if (f || !input_encoding)
1388 #ifdef INPUT_CODE_FIX
1389 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1395 if (estab_f && iconv_for_check != iconv){
1396 struct input_code *p = find_inputcode_byfunc(iconv);
1398 set_input_codename(p->name);
1401 iconv_for_check = iconv;
1408 x0212_shift(nkf_char c)
1413 if (0x75 <= c && c <= 0x7f){
1414 ret = c + (0x109 - 0x75);
1417 if (0x75 <= c && c <= 0x7f){
1418 ret = c + (0x113 - 0x75);
1426 x0212_unshift(nkf_char c)
1429 if (0x7f <= c && c <= 0x88){
1430 ret = c + (0x75 - 0x7f);
1431 }else if (0x89 <= c && c <= 0x92){
1432 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1436 #endif /* X0212_ENABLE */
/*
 * Convert a code pair (c2, c1) into a Shift_JIS style byte pair.
 * Results are stored through p2 and p1; each pointer is tested before it is
 * written, so either may be null.
 * Returns 1 when the row value is above 0x7F after the optional
 * x0212_shift() step.
 * NOTE(review): the remaining return paths are not visible here; callers in
 * this file compare the result with 0.
 */
1439 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
/* Rows 0x21..0x2F and 0x6E..0x7E (held in ndx) use dedicated formulas. */
1445 if((0x21 <= ndx && ndx <= 0x2F)){
1446 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1447 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1449 }else if(0x6E <= ndx && ndx <= 0x7E){
1450 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1451 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/* Other graphic rows are looked up in the x0212_shiftjis table, indexed by
 * row - 0x21 and then by the 7-bit cell - 0x21. */
1457 else if(nkf_isgraph(ndx)){
1459 const unsigned short *ptr;
1460 ptr = x0212_shiftjis[ndx - 0x21];
1462 val = ptr[(c1 & 0x7f) - 0x21];
1471 c2 = x0212_shift(c2);
1473 #endif /* X0212_ENABLE */
/* Generic path: odd rows and even rows share a lead byte and differ in the
 * offset added to c1. */
1475 if(0x7F < c2) return 1;
1476 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1477 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * Convert a Shift_JIS byte pair (c2 lead, c1 trail) into the internal
 * two-value code written through p2 / p1.
 * Returns 1 when c1 is above 0xFC.
 * NOTE(review): the other return paths and the final stores through p2/p1
 * are not visible here.
 */
1482 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1484 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
/* Row pairs for lead bytes 0xF0..0xF4, indexed by [c2 - 0xF0][0x9E < c1]. */
1487 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1488 if (0xFC < c1) return 1;
1489 #ifdef SHIFTJIS_CP932
/* IBM extension area: remap through shiftjis_cp932 unless cp932inv_f. */
1490 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1491 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1498 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1499 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1505 #endif /* SHIFTJIS_CP932 */
/* Without x0213_f, IBM extension lead bytes are looked up in
 * shiftjis_x0212 and the result is tagged with PREFIX_EUCG3. */
1507 if (!x0213_f && is_ibmext_in_sjis(c2)){
1508 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1511 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1524 if(x0213_f && c2 >= 0xF0){
1525 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1526 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1527 }else{ /* 78<=k<=94 */
1528 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1529 if (0x9E < c1) c2++;
1532 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1533 #define SJ6394 0x0161 /* 63 - 94 ku offset */
/* Ordinary case: double the lead byte, subtract the block offset, and add
 * one when the trail byte is above 0x9E. */
1534 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1535 if (0x9E < c1) c2++;
1538 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1545 c2 = x0212_unshift(c2);
1552 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
1554 nkf_unicode_to_utf8(nkf_char val, int *p1, int *p2, int *p3, int *p4)
1562 }else if (val < 0x800){
1563 *p1 = 0xc0 | (val >> 6);
1564 *p2 = 0x80 | (val & 0x3f);
1567 } else if (nkf_char_unicode_bmp_p(val)) {
1568 *p1 = 0xe0 | (val >> 12);
1569 *p2 = 0x80 | ((val >> 6) & 0x3f);
1570 *p3 = 0x80 | ( val & 0x3f);
1572 } else if (nkf_char_unicode_value_p(val)) {
1573 *p1 = 0xe0 | (val >> 16);
1574 *p2 = 0x80 | ((val >> 12) & 0x3f);
1575 *p3 = 0x80 | ((val >> 6) & 0x3f);
1576 *p4 = 0x80 | ( val & 0x3f);
1586 nkf_utf8_to_unicode(int c1, int c2, int c3, int c4)
1593 else if (c1 <= 0xC3) {
1594 /* trail byte or invalid */
1597 else if (c1 <= 0xDF) {
1599 wc = (c1 & 0x1F) << 6;
1602 else if (c1 <= 0xEF) {
1604 wc = (c1 & 0x0F) << 12;
1605 wc |= (c2 & 0x3F) << 6;
1608 else if (c2 <= 0xF4) {
1610 wc = (c1 & 0x0F) << 18;
1611 wc |= (c2 & 0x3F) << 12;
1612 wc |= (c3 & 0x3F) << 6;
1622 #ifdef UTF8_INPUT_ENABLE
/*
 * Second-stage table lookup shared by the Unicode-to-JIS converters.
 *   c1, c0 - indices into the two table levels
 *   pp     - first-level table of psize row pointers
 *   p2, p1 - outputs
 * Returns 1 when the table is missing, an index is out of range, the row is
 * absent or the stored value is 0.
 * NOTE(review): how c1/c0 are adjusted before indexing and how val is split
 * into *p2 / *p1 is not visible here.
 */
1624 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1625 const unsigned short *const *pp, nkf_char psize,
1626 nkf_char *p2, nkf_char *p1)
1629 const unsigned short *p;
1632 if (pp == 0) return 1;
1635 if (c1 < 0 || psize <= c1) return 1;
1637 if (p == 0) return 1;
1640 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1642 if (val == 0) return 1;
/* With no_cp932ext_f, results in the NEC special row or in the IBM
 * extended range are filtered. */
1643 if (no_cp932ext_f && (
1644 (val>>8) == 0x2D || /* NEC special characters */
1645 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1653 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * Convert a UTF-8 sequence (c2, c1, c0) to a JIS code via the mapping
 * tables selected by ms_ucs_map_f, storing the result through p2 / p1.
 * When no_best_fit_chars_f is set, selected code points are rejected with
 * return value 1 before any table lookup.
 * NOTE(review): several branch conditions are not visible here; the
 * comments below only describe the lines shown.
 */
1661 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1663 const unsigned short *const *pp;
1664 const unsigned short *const *const *ppp;
/* Rejection tables indexed by (c1 & 0x3F); a non-zero entry makes the
 * caller return 1. */
1665 static const char no_best_fit_chars_table_C2[] =
1666 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1667 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1668 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1669 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1670 static const char no_best_fit_chars_table_C2_ms[] =
1671 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1672 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1673 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1674 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1675 static const char no_best_fit_chars_table_932_C2[] =
1676 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1677 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1678 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1679 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1680 static const char no_best_fit_chars_table_932_C3[] =
1681 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1682 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1683 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1684 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Two-byte sequences (lead byte below 0xe0). */
1690 }else if(c2 < 0xe0){
1691 if(no_best_fit_chars_f){
1692 if(ms_ucs_map_f == UCS_MAP_CP932){
1695 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1698 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1701 }else if(!cp932inv_f){
1704 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1707 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1710 }else if(ms_ucs_map_f == UCS_MAP_MS){
1711 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1712 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
/* Choose the two-byte table that matches the active mapping flavour. */
1730 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1731 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1732 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1734 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* Three-byte sequences. */
1735 }else if(c0 < 0xF0){
1736 if(no_best_fit_chars_f){
1737 if(ms_ucs_map_f == UCS_MAP_CP932){
1738 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1739 }else if(ms_ucs_map_f == UCS_MAP_MS){
1744 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1747 if(c0 == 0x92) return 1;
1752 if(c1 == 0x80 || c0 == 0x9C) return 1;
1755 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1760 if(c0 == 0x94) return 1;
1763 if(c0 == 0xBB) return 1;
1773 if(c0 == 0x95) return 1;
1776 if(c0 == 0xA5) return 1;
1783 if(c0 == 0x8D) return 1;
1786 if(c0 == 0x9E && !cp932inv_f) return 1;
1789 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
/* Choose the three-byte table set, then index it by the lead byte. */
1797 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1798 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1799 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1801 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1803 #ifdef SHIFTJIS_CP932
/* A successful G3 result is round-tripped through Shift_JIS unless
 * cp932inv_f is set. */
1804 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1806 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1807 s2e_conv(s2, s1, p2, p1);
1816 #ifdef UTF8_OUTPUT_ENABLE
/*
 * Look up the Unicode value for an internal code pair (c2, c1).
 * The table row p is chosen by the kind of c2 (JIS X 0201 kana, a G3 code,
 * or an ordinary two-byte row) and by ms_ucs_map_f.
 * NOTE(review): the final table read and the return statements are not
 * visible here; a caller in this file treats a zero result as "no such
 * character".
 */
1818 e2w_conv(nkf_char c2, nkf_char c1)
1820 const unsigned short *p;
1822 if (c2 == JIS_X_0201_1976_K) {
1823 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1831 p = euc_to_utf8_1byte;
1833 } else if (is_eucg3(c2)){
/* Special case for code 0x8F22/0x43 under the ASCII mapping. */
1834 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
/* Reduce c2 to a zero-based row index and bounds-check it. */
1837 c2 = (c2&0x7f) - 0x21;
1838 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1839 p = x0212_to_utf8_2bytes[c2];
1845 c2 = (c2&0x7f) - 0x21;
1846 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1848 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1849 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1850 euc_to_utf8_2bytes_ms[c2];
/* Reduce c1 to a zero-based cell index and bounds-check it. */
1855 c1 = (c1 & 0x7f) - 0x21;
1856 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * Convert a UTF-8 sequence (c2 lead, c1, c0) to the internal code written
 * through p2 / p1.  Lead bytes 0xc0..0xef go through
 * unicode_to_jis_common().
 * NOTE(review): the handling of other lead bytes and the condition guarding
 * the NUMCHAR_OPTION store are not visible here.
 */
1863 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1870 }else if (0xc0 <= c2 && c2 <= 0xef) {
1871 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1872 #ifdef NUMCHAR_OPTION
/* Store the raw code point, wrapped by nkf_char_unicode_new(), in *p1. */
1875 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1883 #ifdef UTF8_INPUT_ENABLE
/*
 * Convert a Unicode value to the internal code written through p2 / p1.
 * BMP values are re-encoded as UTF-8 and handed to
 * unicode_to_jis_common(); two visible paths store the value itself,
 * wrapped by nkf_char_unicode_new(), in *p1.
 * NOTE(review): the conditions selecting those two paths are not visible
 * here.
 */
1885 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1894 else if (nkf_char_unicode_bmp_p(val)){
1895 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1896 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1899 *p1 = nkf_char_unicode_new(val);
1905 *p1 = nkf_char_unicode_new(val);
/*
 * Input converter for EUC-JP style data: normalises (c2, c1, c0) into the
 * internal code.
 * NOTE(review): the hand-off to the output stage and the return value are
 * not visible here.
 */
1912 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* Half-width kana (SS2 prefix): replaced by GETA1/GETA2 when iso2022jp_f is
 * set and x0201_f is not. */
1914 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1915 if (iso2022jp_f && !x0201_f) {
1916 c2 = GETA1; c1 = GETA2;
1918 c2 = JIS_X_0201_1976_K;
/* Three-byte form introduced by 0x8f. */
1922 }else if (c2 == 0x8f){
1926 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
1927 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1928 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
1931 c2 = (c2 << 8) | (c1 & 0x7f);
1933 #ifdef SHIFTJIS_CP932
/* Round-trip through Shift_JIS. */
1936 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1937 s2e_conv(s2, s1, &c2, &c1);
1944 #endif /* SHIFTJIS_CP932 */
1946 #endif /* X0212_ENABLE */
1947 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
/* Two-byte form: rows 0xF5..0xFE map to the private use area from U+E000. */
1950 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
1951 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1952 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
1957 #ifdef SHIFTJIS_CP932
1958 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
1960 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1961 s2e_conv(s2, s1, &c2, &c1);
1968 #endif /* SHIFTJIS_CP932 */
/*
 * Input converter for Shift_JIS style data: normalises (c2, c1) into the
 * internal code.  A non-zero result from s2e_conv() is returned as is.
 * NOTE(review): the hand-off to the output stage is not visible here.
 */
1976 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* Half-width kana: replaced by GETA1/GETA2 when iso2022jp_f is set and
 * x0201_f is not. */
1978 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
1979 if (iso2022jp_f && !x0201_f) {
1980 c2 = GETA1; c1 = GETA2;
1984 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
/* Lead bytes 0xF0..0xF9 (without x0213_f) map to the private use area
 * starting at U+E000, 188 cells per lead byte; trail byte 0x7F is unused. */
1986 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
1988 if(c1 == 0x7F) return 0;
1989 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
1992 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
1993 if (ret) return ret;
/*
 * Input converter for UTF-8: validates the sequence that starts with lead
 * byte c1 and converts it.
 * Visible return values: 0 for a rejected second byte, -1 or -2 when c3 is
 * still 0 (these match the NKF_ICONV_NEED_ONE_MORE_BYTE / _TWO_MORE_BYTES
 * values defined later in this file).
 * NOTE(review): the case labels of the switch, the results of the
 * range-check failures and the way c4 is obtained are not visible here.
 */
2000 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2002 nkf_char ret = 0, c4 = 0;
/* Class of each lead byte, indexed by (c1 - 0xC0); the switch below
 * dispatches on it. */
2003 static const char w_iconv_utf8_1st_byte[] =
2005 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2006 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2007 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2008 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2015 if (c1 < 0 || 0xff < c1) {
2016 }else if (c1 == 0) { /* 0 : 1 byte*/
2018 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2021 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2023 if (c2 < 0x80 || 0xBF < c2) return 0;
/* Three-byte classes: each restricts c2 to a different range. */
2026 if (c3 == 0) return -1;
2027 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2032 if (c3 == 0) return -1;
2033 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2037 if (c3 == 0) return -1;
2038 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
/* Four-byte classes. */
2042 if (c3 == 0) return -2;
2043 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2047 if (c3 == 0) return -2;
2048 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2052 if (c3 == 0) return -2;
2053 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
/* Conversion: four-byte sequences become a wrapped Unicode value, the rest
 * go through w2e_conv(). */
2061 if (c1 == 0 || c1 == EOF){
2062 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2063 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2066 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/* Result code for a value that is not a usable Unicode code point. */
2074 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * Convert one Unicode code point wc to the internal code.
 * Surrogate values (wc >> 11 == 27, i.e. 0xD800..0xDFFF) and values beyond
 * the last range test yield NKF_ICONV_INVALID_CODE_RANGE; a non-zero result
 * from w16e_conv() is returned as is.
 * NOTE(review): both range tests use '<', so 0xFFFF is handled by the
 * second range rather than by w16e_conv(), and 0x10FFFF is reported as
 * invalid -- confirm whether '<=' was intended.
 */
2076 unicode_iconv(nkf_char wc)
2084 }else if ((wc>>11) == 27) {
2085 /* unpaired surrogate */
2086 return NKF_ICONV_INVALID_CODE_RANGE;
2087 }else if (wc < 0xFFFF) {
2088 ret = w16e_conv(wc, &c2, &c1);
2089 if (ret) return ret;
2090 }else if (wc < 0x10FFFF) {
2092 c1 = nkf_char_unicode_new(wc);
2094 return NKF_ICONV_INVALID_CODE_RANGE;
/* Result codes asking the caller to supply more input bytes. */
2100 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2101 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
/* Combine a surrogate pair:
 * 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00), folded into a
 * single constant. */
2102 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * Decode one UTF-16 unit, or surrogate pair, from up to four bytes in the
 * order given by input_endian and pass the code point to unicode_iconv.
 * When the first unit is a high surrogate and the second is not a low
 * surrogate, NKF_ICONV_NEED_TWO_MORE_BYTES is returned.
 * NOTE(review): the non-surrogate paths are not visible here.
 */
2104 nkf_iconv_utf_16(int c1, int c2, int c3, int c4)
2113 if (input_endian == ENDIAN_BIG) {
2114 if (0xD8 <= c1 && c1 <= 0xDB) {
2115 if (0xDC <= c3 && c3 <= 0xDF) {
2116 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2117 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* Same logic with the bytes of each unit swapped. */
2122 if (0xD8 <= c2 && c2 <= 0xDB) {
2123 if (0xDC <= c4 && c4 <= 0xDF) {
2124 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2125 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2131 return (*unicode_iconv)(wc);
2135 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2141 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * Decode one UTF-32 unit from four bytes according to input_endian and pass
 * the code point to unicode_iconv.  Each visible assignment assembles wc
 * from three of the four bytes in a different byte order; an unhandled
 * input_endian yields NKF_ICONV_INVALID_CODE_RANGE.
 * NOTE(review): the case labels, and therefore which byte order belongs to
 * which endian constant, are not visible here.
 */
2147 nkf_iconv_utf_32(int c1, int c2, int c3, int c4)
2156 switch(input_endian){
2158 wc = c2 << 16 | c3 << 8 | c4;
2161 wc = c3 << 16 | c2 << 8 | c1;
2164 wc = c1 << 16 | c4 << 8 | c3;
2167 wc = c4 << 16 | c1 << 8 | c2;
2170 return NKF_ICONV_INVALID_CODE_RANGE;
2173 return (*unicode_iconv)(wc);
/*
 * Switch the output to the given single-byte mode.  The body only acts when
 * output_mode is neither ASCII nor ISO_8859_1; it then emits ascii_intro
 * through o_putc and records the new mode.
 * NOTE(review): the bytes emitted before ascii_intro are not visible here.
 */
2177 #define output_ascii_escape_sequence(mode) do { \
2178 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2181 (*o_putc)(ascii_intro); \
2182 output_mode = mode; \
/*
 * Emit the escape sequence that switches the output to the given mode.
 * Nothing is needed when output_mode already equals mode.
 * NOTE(review): only the JIS_X_0201_1976_K label and one kanji_intro
 * emission are visible here; the remaining cases are not.
 */
2187 output_escape_sequence(int mode)
2189 if (output_mode == mode)
2197 case JIS_X_0201_1976_K:
2205 (*o_putc)(kanji_intro);
/*
 * Output converter for ISO-2022-JP style output: emits the escape sequence
 * required by the character's set, followed by the character.
 * NOTE(review): most of the byte emission is not visible here.
 */
2230 j_oconv(nkf_char c2, nkf_char c1)
2232 #ifdef NUMCHAR_OPTION
/* A wrapped Unicode value is first converted with w16e_conv(); if it is
 * still unconverted, private-use values 0xE000..0xE757 are mapped to rows
 * from 0x7F on (94 cells per row) and anything else goes to
 * encode_fallback when one is set. */
2233 if (c2 == 0 && nkf_char_unicode_p(c1)){
2234 w16e_conv(c1, &c2, &c1);
2235 if (c2 == 0 && nkf_char_unicode_p(c1)){
2236 c2 = c1 & VALUE_MASK;
2237 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2240 c2 = 0x7F + c1 / 94;
2241 c1 = 0x21 + c1 % 94;
2243 if (encode_fallback) (*encode_fallback)(c1);
2250 output_ascii_escape_sequence(ASCII);
2253 else if (c2 == EOF) {
2254 output_ascii_escape_sequence(ASCII);
2257 else if (c2 == ISO_8859_1) {
2258 output_ascii_escape_sequence(ISO_8859_1);
2261 else if (c2 == JIS_X_0201_1976_K) {
2262 output_escape_sequence(JIS_X_0201_1976_K);
/* G3 codes select JIS X 0213 plane 2 or JIS X 0212 depending on x0213_f. */
2265 } else if (is_eucg3(c2)){
2266 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2267 (*o_putc)(c2 & 0x7f);
/* Out-of-range pairs are dropped silently. */
2272 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2273 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2274 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
/*
 * Output converter for EUC-JP style output: writes (c2, c1) with the high
 * bit set on each byte and keeps output_mode up to date.
 * NOTE(review): several branch conditions and prefix bytes are not visible
 * here.
 */
2281 e_oconv(nkf_char c2, nkf_char c1)
/* A wrapped Unicode value is first converted with w16e_conv(); if it is
 * still unconverted, private-use values 0xE000..0xE757 are mapped to user
 * rows when x0212_f is set, and anything else goes to encode_fallback when
 * one is set. */
2283 if (c2 == 0 && nkf_char_unicode_p(c1)){
2284 w16e_conv(c1, &c2, &c1);
2285 if (c2 == 0 && nkf_char_unicode_p(c1)){
2286 c2 = c1 & VALUE_MASK;
2287 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2291 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2292 c1 = 0x21 + c1 % 94;
2295 (*o_putc)((c2 & 0x7f) | 0x080);
2296 (*o_putc)(c1 | 0x080);
2298 (*o_putc)((c2 & 0x7f) | 0x080);
2299 (*o_putc)(c1 | 0x080);
2303 if (encode_fallback) (*encode_fallback)(c1);
2311 } else if (c2 == 0) {
2312 output_mode = ASCII;
/* Half-width kana are written as SS2 followed by the byte with bit 7 set. */
2314 } else if (c2 == JIS_X_0201_1976_K) {
2315 output_mode = EUC_JP;
2316 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2317 } else if (c2 == ISO_8859_1) {
2318 output_mode = ISO_8859_1;
2319 (*o_putc)(c1 | 0x080);
2321 } else if (is_eucg3(c2)){
2322 output_mode = EUC_JP;
2323 #ifdef SHIFTJIS_CP932
/* Round-trip the G3 code through Shift_JIS. */
2326 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2327 s2e_conv(s2, s1, &c2, &c1);
2332 output_mode = ASCII;
2334 }else if (is_eucg3(c2)){
2337 (*o_putc)((c2 & 0x7f) | 0x080);
2338 (*o_putc)(c1 | 0x080);
2341 (*o_putc)((c2 & 0x7f) | 0x080);
2342 (*o_putc)(c1 | 0x080);
/* Ordinary two-byte character: a non-graphic byte means the input code was
 * guessed wrongly, so the converter choice is cleared. */
2346 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2347 set_iconv(FALSE, 0);
2348 return; /* too late to rescue this char */
2350 output_mode = EUC_JP;
2351 (*o_putc)(c2 | 0x080);
2352 (*o_putc)(c1 | 0x080);
/*
 * Output converter for Shift_JIS style output: converts (c2, c1) with
 * e2s_conv() and keeps output_mode up to date.
 * NOTE(review): most of the byte emission is not visible here.
 */
2357 s_oconv(nkf_char c2, nkf_char c1)
2359 #ifdef NUMCHAR_OPTION
/* A wrapped Unicode value is first converted with w16e_conv(); if it is
 * still unconverted, private-use values 0xE000..0xE757 (without x0213_f)
 * are mapped to lead bytes from 0xF0 (cp932inv_f) or 0xEB, 188 cells each,
 * and anything else goes to encode_fallback when one is set. */
2360 if (c2 == 0 && nkf_char_unicode_p(c1)){
2361 w16e_conv(c1, &c2, &c1);
2362 if (c2 == 0 && nkf_char_unicode_p(c1)){
2363 c2 = c1 & VALUE_MASK;
2364 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2367 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
/* Trail bytes start at 0x40 and skip 0x7F. */
2369 c1 += 0x40 + (c1 > 0x3e);
2374 if(encode_fallback)(*encode_fallback)(c1);
2383 } else if (c2 == 0) {
2384 output_mode = ASCII;
2386 } else if (c2 == JIS_X_0201_1976_K) {
2387 output_mode = SHIFT_JIS;
2389 } else if (c2 == ISO_8859_1) {
2390 output_mode = ISO_8859_1;
2391 (*o_putc)(c1 | 0x080);
2393 } else if (is_eucg3(c2)){
2394 output_mode = SHIFT_JIS;
2395 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* Ordinary two-byte character: a non-printable byte means the input code
 * was guessed wrongly, so the converter choice is cleared. */
2401 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2402 set_iconv(FALSE, 0);
2403 return; /* too late to rescue this char */
2405 output_mode = SHIFT_JIS;
2406 e2s_conv(c2, c1, &c2, &c1);
2408 #ifdef SHIFTJIS_CP932
/* Optional inverse CP932 mapping through the cp932inv table. */
2410 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2411 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2417 #endif /* SHIFTJIS_CP932 */
/* Trail bytes listed in prefix_table are preceded by their prefix byte. */
2420 if (prefix_table[(unsigned char)c1]){
2421 (*o_putc)(prefix_table[(unsigned char)c1]);
2427 #ifdef UTF8_OUTPUT_ENABLE
/*
 * Output converter for UTF-8.  A wrapped Unicode value is encoded directly;
 * any other code is mapped with e2w_conv() first.  Bytes c2..c4 from
 * nkf_unicode_to_utf8() are only written when non-zero.
 * NOTE(review): the BOM emission guarded by output_bom_f and the write of
 * the first byte are not visible here.
 */
2429 w_oconv(nkf_char c2, nkf_char c1)
2435 output_bom_f = FALSE;
2446 if (c2 == 0 && nkf_char_unicode_p(c1)){
2447 val = c1 & VALUE_MASK;
2448 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2450 if (c2) (*o_putc)(c2);
2451 if (c3) (*o_putc)(c3);
2452 if (c4) (*o_putc)(c4);
2459 val = e2w_conv(c2, c1);
2461 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2463 if (c2) (*o_putc)(c2);
2464 if (c3) (*o_putc)(c3);
2465 if (c4) (*o_putc)(c4);
/*
 * Output converter for UTF-16 in the byte order given by output_endian.
 * NOTE(review): the BOM bytes and the write of ordinary BMP units are not
 * visible here.
 */
2471 w_oconv16(nkf_char c2, nkf_char c1)
2474 output_bom_f = FALSE;
2475 if (output_endian == ENDIAN_LITTLE){
2489 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2490 if (nkf_char_unicode_bmp_p(c1)) {
2491 c2 = (c1 >> 8) & 0xff;
/* Supplementary code points are split into a surrogate pair; 0xD7C0 is
 * 0xD800 - (0x10000 >> 10). */
2495 if (c1 <= UNICODE_MAX) {
2496 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2497 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2498 if (output_endian == ENDIAN_LITTLE){
2499 (*o_putc)(c2 & 0xff);
2500 (*o_putc)((c2 >> 8) & 0xff);
2501 (*o_putc)(c1 & 0xff);
2502 (*o_putc)((c1 >> 8) & 0xff);
2504 (*o_putc)((c2 >> 8) & 0xff);
2505 (*o_putc)(c2 & 0xff);
2506 (*o_putc)((c1 >> 8) & 0xff);
2507 (*o_putc)(c1 & 0xff);
/* Any other code is mapped to Unicode with e2w_conv(). */
2513 nkf_char val = e2w_conv(c2, c1);
2514 c2 = (val >> 8) & 0xff;
2519 if (output_endian == ENDIAN_LITTLE){
/*
 * Output converter for UTF-32 in the byte order given by output_endian.
 * Codes that are neither ISO_8859_1 nor a wrapped Unicode value are mapped
 * with e2w_conv().
 * NOTE(review): the BOM bytes and the fourth byte of each unit are not
 * visible here.
 */
2529 w_oconv32(nkf_char c2, nkf_char c1)
2532 output_bom_f = FALSE;
2533 if (output_endian == ENDIAN_LITTLE){
2551 if (c2 == ISO_8859_1) {
2553 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2556 c1 = e2w_conv(c2, c1);
/* Little endian writes the low byte first, otherwise the high bytes lead. */
2559 if (output_endian == ENDIAN_LITTLE){
2560 (*o_putc)( c1 & 0xFF);
2561 (*o_putc)((c1 >> 8) & 0xFF);
2562 (*o_putc)((c1 >> 16) & 0xFF);
2566 (*o_putc)((c1 >> 16) & 0xFF);
2567 (*o_putc)((c1 >> 8) & 0xFF);
2568 (*o_putc)( c1 & 0xFF);
/* Score bits used for input-code guessing.  Each constant is the previous
 * one shifted left by one, so they can be OR-ed into a single score. */
2573 #define SCORE_L2 (1) /* JIS level-2 kanji */
2574 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
2575 #define SCORE_DEPEND (SCORE_KANA << 1) /* machine-dependent characters */
2576 #define SCORE_CP932 (SCORE_DEPEND << 1) /* re-mapping by CP932 (IBM extended characters) */
2577 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2578 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* character that does not exist */
2579 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* designation by MIME */
2580 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* Initial score of every candidate. */
2582 #define SCORE_INIT (SCORE_iMIME)
/* Score bits looked up by (c2 & 0x0f) in code_score() when
 * (c2 & 0x70) == 0x20.  NOTE(review): not every row is visible here. */
2584 static const char score_table_A0[] = {
2587 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2588 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/* Score bits looked up by (c2 & 0x0f) in code_score() when
 * (c2 & 0x70) == 0x70. */
2591 static const char score_table_F0[] = {
2592 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2593 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2594 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2595 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* OR the given score bits into ptr->score.
 * NOTE(review): any guard around the update is not visible here. */
2599 set_code_score(struct input_code *ptr, nkf_char score)
2602 ptr->score |= score;
/* Clear the given score bits in ptr->score.
 * NOTE(review): any guard around the update is not visible here. */
2607 clr_code_score(struct input_code *ptr, nkf_char score)
2610 ptr->score &= ~score;
/*
 * Add score bits to a candidate according to the character held in
 * ptr->buf[0] (c2) and ptr->buf[1] (c1).  The first matching test decides
 * which bit is set.
 * NOTE(review): the condition that leads to SCORE_ERROR is not visible
 * here.
 */
2615 code_score(struct input_code *ptr)
2617 nkf_char c2 = ptr->buf[0];
2618 #ifdef UTF8_OUTPUT_ENABLE
2619 nkf_char c1 = ptr->buf[1];
2622 set_code_score(ptr, SCORE_ERROR);
2623 }else if (c2 == SS2){
2624 set_code_score(ptr, SCORE_KANA);
2625 }else if (c2 == 0x8f){
2626 set_code_score(ptr, SCORE_X0212);
2627 #ifdef UTF8_OUTPUT_ENABLE
/* A pair for which e2w_conv() returns 0 counts as non-existent. */
2628 }else if (!e2w_conv(c2, c1)){
2629 set_code_score(ptr, SCORE_NO_EXIST);
/* Otherwise classify by bits 4..6 of c2. */
2631 }else if ((c2 & 0x70) == 0x20){
2632 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2633 }else if ((c2 & 0x70) == 0x70){
2634 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2635 }else if ((c2 & 0x70) >= 0x50){
2636 set_code_score(ptr, SCORE_L2);
/* Rule a candidate out.  If it is the currently selected converter, the
 * selection is withdrawn via set_iconv(FALSE, 0).
 * NOTE(review): the state changes made on ptr are not visible here. */
2641 status_disable(struct input_code *ptr)
2646 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append byte c to the candidate's buffer and advance its index.  No bounds
 * check is visible here. */
2650 status_push_ch(struct input_code *ptr, nkf_char c)
2652 ptr->buf[ptr->index++] = c;
2656 status_clear(struct input_code *ptr)
/* Put the candidate's score back to SCORE_INIT.
 * NOTE(review): other fields reset here are not visible. */
2663 status_reset(struct input_code *ptr)
2666 ptr->score = SCORE_INIT;
/* Clear the candidate's _file_stat field.
 * NOTE(review): other fields reset here are not visible. */
2670 status_reinit(struct input_code *ptr)
2673 ptr->_file_stat = 0;
/* Act when c is at most DEL and a code is already established.
 * NOTE(review): the action taken is not visible here. */
2677 status_check(struct input_code *ptr, nkf_char c)
2679 if (c <= DEL && estab_f){
/*
 * Feed one input byte to the Shift_JIS candidate.  Bytes that fit are
 * pushed into ptr->buf; a byte that cannot occur disables the candidate.
 * NOTE(review): the switch on the candidate's state and its case labels
 * are not visible here; the grouping below follows the visible tests.
 */
2685 s_status(struct input_code *ptr, nkf_char c)
2689 status_check(ptr, c);
2694 }else if (nkf_char_unicode_p(c)){
/* 0xa1..0xdf is a single-byte character; it is recorded as SS2 + byte. */
2696 }else if (0xa1 <= c && c <= 0xdf){
2697 status_push_ch(ptr, SS2);
2698 status_push_ch(ptr, c);
/* Lead bytes of two-byte characters, by range. */
2701 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2703 status_push_ch(ptr, c);
2704 }else if (0xed <= c && c <= 0xee){
2706 status_push_ch(ptr, c);
2707 #ifdef SHIFTJIS_CP932
2708 }else if (is_ibmext_in_sjis(c)){
2710 status_push_ch(ptr, c);
2711 #endif /* SHIFTJIS_CP932 */
2713 }else if (0xf0 <= c && c <= 0xfc){
2715 status_push_ch(ptr, c);
2716 #endif /* X0212_ENABLE */
2718 status_disable(ptr);
/* Trail byte: must be 0x40..0x7e or 0x80..0xfc; the pair is then converted
 * in place with s2e_conv(). */
2722 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2723 status_push_ch(ptr, c);
2724 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2728 status_disable(ptr);
2732 #ifdef SHIFTJIS_CP932
/* Trail byte after an IBM extension lead byte: a successful conversion is
 * scored as SCORE_CP932. */
2733 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2734 status_push_ch(ptr, c);
2735 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2736 set_code_score(ptr, SCORE_CP932);
2741 #endif /* SHIFTJIS_CP932 */
2742 status_disable(ptr);
2745 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2746 status_push_ch(ptr, c);
2747 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2748 set_code_score(ptr, SCORE_CP932);
2751 status_disable(ptr);
/*
 * Feed one input byte to the EUC-JP candidate.  Bytes that fit are pushed
 * into ptr->buf; a byte that cannot occur disables the candidate.
 * NOTE(review): the switch on the candidate's state and its case labels
 * are not visible here.
 */
2758 e_status(struct input_code *ptr, nkf_char c)
2762 status_check(ptr, c);
2767 }else if (nkf_char_unicode_p(c)){
/* First byte: SS2 or 0xa1..0xfe starts a two-byte character, 0x8f starts a
 * three-byte one. */
2769 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2771 status_push_ch(ptr, c);
2773 }else if (0x8f == c){
2775 status_push_ch(ptr, c);
2776 #endif /* X0212_ENABLE */
2778 status_disable(ptr);
/* Following bytes must lie in 0xa1..0xfe. */
2782 if (0xa1 <= c && c <= 0xfe){
2783 status_push_ch(ptr, c);
2787 status_disable(ptr);
2792 if (0xa1 <= c && c <= 0xfe){
2794 status_push_ch(ptr, c);
2796 status_disable(ptr);
2798 #endif /* X0212_ENABLE */
2802 #ifdef UTF8_INPUT_ENABLE
/*
 * Feed one input byte to the UTF-8 candidate.  Bytes that fit are pushed
 * into ptr->buf; a byte that cannot occur disables the candidate.
 * NOTE(review): the switch on the candidate's state and its case labels
 * are not visible here.
 */
2804 w_status(struct input_code *ptr, nkf_char c)
2808 status_check(ptr, c);
2813 }else if (nkf_char_unicode_p(c)){
/* Lead bytes, grouped by range: 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4. */
2815 }else if (0xc0 <= c && c <= 0xdf){
2817 status_push_ch(ptr, c);
2818 }else if (0xe0 <= c && c <= 0xef){
2820 status_push_ch(ptr, c);
2821 }else if (0xf0 <= c && c <= 0xf4){
2823 status_push_ch(ptr, c);
2825 status_disable(ptr);
/* Trail bytes must be 0x80..0xbf.  Once the buffer holds more bytes than
 * ptr->stat, the sequence is converted in place with w2e_conv(); the
 * sequence EF BB BF is noted in the local flag bom. */
2830 if (0x80 <= c && c <= 0xbf){
2831 status_push_ch(ptr, c);
2832 if (ptr->index > ptr->stat){
2833 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2834 && ptr->buf[2] == 0xbf);
2835 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2836 &ptr->buf[0], &ptr->buf[1]);
2843 status_disable(ptr);
2847 if (0x80 <= c && c <= 0xbf){
2848 if (ptr->index < ptr->stat){
2849 status_push_ch(ptr, c);
2854 status_disable(ptr);
/*
 * Feed one input byte to every candidate in input_code_list by calling its
 * status_func.  When a result candidate has been chosen and no code is
 * established yet, its converter is installed with set_iconv(TRUE, ...).
 * NOTE(review): how result and action_flag are updated inside the loop, and
 * what happens in the (c <= DEL) branch, is not visible here.
 */
2862 code_status(nkf_char c)
2864 int action_flag = 1;
2865 struct input_code *result = 0;
2866 struct input_code *p = input_code_list;
2868 if (!p->status_func) {
2872 if (!p->status_func)
2874 (p->status_func)(p, c);
2877 }else if(p->stat == 0){
2888 if (result && !estab_f){
2889 set_iconv(TRUE, result->iconv_func);
2890 }else if (c <= DEL){
2891 struct input_code *ptr = input_code_list;
2905 return std_gc_buf[--std_gc_ndx];
/* Push c back onto the std_gc_buf stack (f is not used by the visible
 * lines).  NOTE(review): the action taken when the stack already holds
 * STD_GC_BUFSIZE entries is not visible here. */
2912 std_ungetc(nkf_char c, FILE *f)
2914 if (std_gc_ndx == STD_GC_BUFSIZE){
2917 std_gc_buf[std_gc_ndx++] = c;
2923 std_putc(nkf_char c)
/* Bytes read ahead while the input code is still undecided. */
2930 static unsigned char hold_buf[HOLD_SIZE*2];
2931 static int hold_count = 0;
/*
 * Append one byte to hold_buf.
 * Returns EOF when the buffer has just become full (HOLD_SIZE*2 bytes),
 * otherwise the number of bytes now held.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not visible here.
 */
2933 push_hold_buf(nkf_char c2)
2935 if (hold_count >= HOLD_SIZE*2)
2937 hold_buf[hold_count++] = (unsigned char)c2;
2938 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Read ahead while the input code is undecided: bytes are collected in
 * hold_buf until EOF, until a code is established, or until the buffer is
 * full; then the held bytes are replayed through the chosen converter.
 * NOTE(review): the return value and several loop exits are not visible
 * here.
 */
2942 h_conv(FILE *f, int c1, int c2)
2948 /** it must NOT be in the kanji shift sequence */
2949 /** it must NOT be written in JIS7 */
2950 /** and it must be after 2 byte 8bit code */
2956 while ((c2 = (*i_getc)(f)) != EOF) {
2962 if (push_hold_buf(c2) == EOF || estab_f) {
/* Pick the candidate with the lowest score and install its converter. */
2968 struct input_code *p = input_code_list;
2969 struct input_code *result = p;
2974 if (p->status_func && p->score < result->score) {
2979 set_iconv(TRUE, result->iconv_func);
2984 ** 1) EOF is detected, or
2985 ** 2) Code is established, or
2986 ** 3) Buffer is FULL (but last word is pushed)
2988 ** in 1) and 3) cases, we continue to use
2989 ** Kanji codes by oconv and leave estab_f unchanged.
/* Replay the held bytes. */
2994 while (hold_index < hold_count){
2995 c1 = hold_buf[hold_index++];
2999 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3000 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3003 if (hold_index < hold_count){
3004 c2 = hold_buf[hold_index++];
/* The converter's result selects how many further bytes are fetched, first
 * from hold_buf and then from the input. */
3014 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3017 if (hold_index < hold_count){
3018 c3 = hold_buf[hold_index++];
3019 } else if ((c3 = (*i_getc)(f)) == EOF) {
3024 if (hold_index < hold_count){
3025 c4 = hold_buf[hold_index++];
3026 } else if ((c4 = (*i_getc)(f)) == EOF) {
/* Four-byte case: the last two bytes are packed into one argument. */
3031 (*iconv)(c1, c2, (c3<<8)|c4);
3036 /* 3 bytes EUC or UTF-8 */
3037 if (hold_index < hold_count){
3038 c3 = hold_buf[hold_index++];
3039 } else if ((c3 = (*i_getc)(f)) == EOF) {
3045 (*iconv)(c1, c2, c3);
3048 if (c3 == EOF) break;
3054 * Check and Ignore BOM
/* Byte-order-mark detection on the first bytes of the input.  Bytes are
 * read with i_getc; when a full BOM is recognised and no input encoding was
 * requested, the matching converter is installed, and when that converter
 * is the active one input_endian is set.  Otherwise the bytes read so far
 * are pushed back with i_ungetc in reverse order.
 * NOTE(review): the case labels of the switch and the lines that decide
 * whether a recognised BOM is consumed are not visible here. */
3060 switch(c2 = (*i_getc)(f)){
/* 00 00 FE FF: UTF-32, big endian.  00 00 FF FE: UTF-32, order 2143. */
3062 if((c2 = (*i_getc)(f)) == 0x00){
3063 if((c2 = (*i_getc)(f)) == 0xFE){
3064 if((c2 = (*i_getc)(f)) == 0xFF){
3065 if(!input_encoding){
3066 set_iconv(TRUE, w_iconv32);
3068 if (iconv == w_iconv32) {
3069 input_endian = ENDIAN_BIG;
3072 (*i_ungetc)(0xFF,f);
3073 }else (*i_ungetc)(c2,f);
3074 (*i_ungetc)(0xFE,f);
3075 }else if(c2 == 0xFF){
3076 if((c2 = (*i_getc)(f)) == 0xFE){
3077 if(!input_encoding){
3078 set_iconv(TRUE, w_iconv32);
3080 if (iconv == w_iconv32) {
3081 input_endian = ENDIAN_2143;
3084 (*i_ungetc)(0xFF,f);
3085 }else (*i_ungetc)(c2,f);
3086 (*i_ungetc)(0xFF,f);
3087 }else (*i_ungetc)(c2,f);
3088 (*i_ungetc)(0x00,f);
3089 }else (*i_ungetc)(c2,f);
3090 (*i_ungetc)(0x00,f);
/* EF BB BF: UTF-8. */
3093 if((c2 = (*i_getc)(f)) == 0xBB){
3094 if((c2 = (*i_getc)(f)) == 0xBF){
3095 if(!input_encoding){
3096 set_iconv(TRUE, w_iconv);
3098 if (iconv == w_iconv) {
3101 (*i_ungetc)(0xBF,f);
3102 }else (*i_ungetc)(c2,f);
3103 (*i_ungetc)(0xBB,f);
3104 }else (*i_ungetc)(c2,f);
3105 (*i_ungetc)(0xEF,f);
/* FE FF 00 00: UTF-32, order 3412.  FE FF alone: UTF-16, big endian. */
3108 if((c2 = (*i_getc)(f)) == 0xFF){
3109 if((c2 = (*i_getc)(f)) == 0x00){
3110 if((c2 = (*i_getc)(f)) == 0x00){
3111 if(!input_encoding){
3112 set_iconv(TRUE, w_iconv32);
3114 if (iconv == w_iconv32) {
3115 input_endian = ENDIAN_3412;
3118 (*i_ungetc)(0x00,f);
3119 }else (*i_ungetc)(c2,f);
3120 (*i_ungetc)(0x00,f);
3121 }else (*i_ungetc)(c2,f);
3122 if(!input_encoding){
3123 set_iconv(TRUE, w_iconv16);
3125 if (iconv == w_iconv16) {
3126 input_endian = ENDIAN_BIG;
3129 (*i_ungetc)(0xFF,f);
3130 }else (*i_ungetc)(c2,f);
3131 (*i_ungetc)(0xFE,f);
/* FF FE 00 00: UTF-32, little endian.  FF FE alone: UTF-16, little
 * endian. */
3134 if((c2 = (*i_getc)(f)) == 0xFE){
3135 if((c2 = (*i_getc)(f)) == 0x00){
3136 if((c2 = (*i_getc)(f)) == 0x00){
3137 if(!input_encoding){
3138 set_iconv(TRUE, w_iconv32);
3140 if (iconv == w_iconv32) {
3141 input_endian = ENDIAN_LITTLE;
3144 (*i_ungetc)(0x00,f);
3145 }else (*i_ungetc)(c2,f);
3146 (*i_ungetc)(0x00,f);
3147 }else (*i_ungetc)(c2,f);
3148 if(!input_encoding){
3149 set_iconv(TRUE, w_iconv16);
3151 if (iconv == w_iconv16) {
3152 input_endian = ENDIAN_LITTLE;
3155 (*i_ungetc)(0xFE,f);
3156 }else (*i_ungetc)(c2,f);
3157 (*i_ungetc)(0xFF,f);
/* Zero the whole broken_state structure. */
3172 init_broken_state(void)
3174 memset(&broken_state, 0, sizeof(broken_state));
3180 broken_state.buf[broken_state.count++] = c;
/* Remove and return the most recently pushed entry of broken_state.buf.
 * The caller must make sure that broken_state.count is positive. */
3184 pop_broken_buf(void)
3186 return broken_state.buf[--broken_state.count];
/*
 * Input filter that looks for "$@" / "$B" and "(J" / "(B" appearing
 * without a preceding ESC; the lines shown only match when the previous
 * byte recorded in broken_state.status was not ESC.
 * Pending bytes in broken_state.buf are returned first.
 * NOTE(review): what is returned for a recognised sequence, presumably a
 * re-inserted ESC, is not visible here -- confirm.
 */
3190 broken_getc(FILE *f)
3194 if (broken_state.count > 0) {
3195 return pop_broken_buf();
/* '$' followed by '@' or 'B' while in ASCII or JIS X 0201 kana mode. */
3198 if (c=='$' && broken_state.status != ESC
3199 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3201 broken_state.status = 0;
3202 if (c1=='@'|| c1=='B') {
3203 push_broken_buf(c1);
/* '(' followed by 'J' or 'B' while in JIS X 0208 or JIS X 0201 kana mode. */
3210 } else if (c=='(' && broken_state.status != ESC
3211 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3213 broken_state.status = 0;
3214 if (c1=='J'|| c1=='B') {
3215 push_broken_buf(c1);
/* Remember the byte so the next call can tell whether an ESC preceded it. */
3223 broken_state.status = c;
/* Push a byte back for broken_getc(); acts only while fewer than two bytes
 * are pending.  NOTE(review): the guarded statement and the return value
 * are not visible here. */
3229 broken_ungetc(nkf_char c, FILE *f)
3231 if (broken_state.count < 2)
/*
 * End-of-line stage: records which line ending the input uses (when
 * guess_f is set) and rewrites line endings according to eolmode_f before
 * passing characters on through o_eol_conv.
 */
3237 eol_conv(nkf_char c2, nkf_char c1)
/* Detection: input_eol becomes CRLF, LF or CR on first sight and EOF as
 * soon as a different style shows up. */
3239 if (guess_f && input_eol != EOF) {
3240 if (c2 == 0 && c1 == LF) {
3241 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3242 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3243 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3245 else if (!input_eol) input_eol = CR;
3246 else if (input_eol != CR) input_eol = EOF;
/* Conversion: a pending CR or a bare LF ends a line; CR is written unless
 * eolmode_f is LF, and LF unless eolmode_f is CR. */
3248 if (prev_cr || (c2 == 0 && c1 == LF)) {
3250 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3251 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
/* A CR is held back in prev_cr; everything except a bare LF is passed on. */
3253 if (c2 == 0 && c1 == CR) prev_cr = CR;
3254 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3258 Return value of fold_conv()
3260 LF add newline and output char
3261 CR add newline and output nothing
3264 1 (or else) normal output
3266 fold state in prev (previous character)
3268 >0x80 Japanese (X0208/X0201)
3273 This fold algorithm does not preserve leading spaces in a line.
3274 This is the main difference from fmt.
3277 #define char_size(c2,c1) (c2?2:1)
/*
 * Line-folding stage.  For each character (c2, c1) it updates the running
 * column f_line and the previous-character state f_prev, decides a
 * fold_state, and in the final switch emits newlines through o_fconv.
 * Kinsoku rules keep certain punctuation from starting a line.
 * NOTE(review): many assignments to f_prev / f_line and the case labels of
 * the final switch are not visible here.
 */
3280 fold_conv(nkf_char c2, nkf_char c1)
3283 nkf_char fold_state;
3285 if (c1== CR && !fold_preserve_f) {
3286 fold_state=0; /* ignore cr */
3287 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3289 fold_state=0; /* ignore cr */
3290 } else if (c1== BS) {
3291 if (f_line>0) f_line--;
3293 } else if (c2==EOF && f_line != 0) { /* close open last line */
/* Newline handling; which characters count depends on fold_preserve_f. */
3295 } else if ((c1==LF && !fold_preserve_f)
3296 || ((c1==CR||(c1==LF&&f_prev!=CR))
3297 && fold_preserve_f)) {
3299 if (fold_preserve_f) {
3303 } else if ((f_prev == c1 && !fold_preserve_f)
3304 || (f_prev == LF && fold_preserve_f)
3305 ) { /* duplicate newline */
3308 fold_state = LF; /* output two newline */
3314 if (f_prev&0x80) { /* Japanese? */
3316 fold_state = 0; /* ignore given single newline */
3317 } else if (f_prev==SP) {
3321 if (++f_line<=fold_len)
3325 fold_state = CR; /* fold and output nothing */
3329 } else if (c1=='\f') {
3332 fold_state = LF; /* output newline and clear */
3333 } else if ( (c2==0 && c1==SP)||
3334 (c2==0 && c1==TAB)||
3335 (c2=='!'&& c1=='!')) {
3336 /* X0208 kankaku or ascii space */
3338 fold_state = 0; /* remove duplicate spaces */
3341 if (++f_line<=fold_len)
3342 fold_state = SP; /* output ASCII space only */
3344 f_prev = SP; f_line = 0;
3345 fold_state = CR; /* fold and output nothing */
/* Ordinary character. */
3349 prev0 = f_prev; /* we still need this one... , but almost done */
3351 if (c2 || c2 == JIS_X_0201_1976_K)
3352 f_prev |= 0x80; /* this is Japanese */
3353 f_line += char_size(c2,c1);
3354 if (f_line<=fold_len) { /* normal case */
3357 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3358 f_line = char_size(c2,c1);
3359 fold_state = LF; /* We can't wait, do fold now */
3360 } else if (c2 == JIS_X_0201_1976_K) {
3361 /* simple kinsoku rules return 1 means no folding */
3362 if (c1==(0xde&0x7f)) fold_state = 1; /* voiced sound mark (dakuten) */
3363 else if (c1==(0xdf&0x7f)) fold_state = 1; /* semi-voiced sound mark (handakuten) */
3364 else if (c1==(0xa4&0x7f)) fold_state = 1; /* ideographic full stop */
3365 else if (c1==(0xa3&0x7f)) fold_state = 1; /* fullwidth comma */
3366 else if (c1==(0xa1&0x7f)) fold_state = 1; /* right corner bracket */
3367 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3368 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3370 fold_state = LF;/* add one new f_line before this character */
3373 fold_state = LF;/* add one new f_line before this character */
3376 /* kinsoku point in ASCII */
3377 if ( c1==')'|| /* { [ ( */
3388 /* just after special */
3389 } else if (!is_alnum(prev0)) {
3390 f_line = char_size(c2,c1);
3392 } else if ((prev0==SP) || /* ignored new f_line */
3393 (prev0==LF)|| /* ignored new f_line */
3394 (prev0&0x80)) { /* X0208 - ASCII */
3395 f_line = char_size(c2,c1);
3396 fold_state = LF;/* add one new f_line before this character */
3398 fold_state = 1; /* default no fold in ASCII */
/* Kinsoku list for two-byte punctuation; each comment names the character
 * written in the original comment. */
3402 if (c1=='"') fold_state = 1; /* ideographic comma */
3403 else if (c1=='#') fold_state = 1; /* ideographic full stop */
3404 else if (c1=='W') fold_state = 1; /* right corner bracket */
3405 else if (c1=='K') fold_state = 1; /* fullwidth right parenthesis */
3406 else if (c1=='$') fold_state = 1; /* fullwidth comma */
3407 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
3408 else if (c1=='\'') fold_state = 1; /* presumably fullwidth colon */
3409 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
3410 else if (c1==')') fold_state = 1; /* fullwidth question mark */
3411 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
3412 else if (c1=='+') fold_state = 1; /* voiced sound mark (dakuten) */
3413 else if (c1==',') fold_state = 1; /* semi-voiced sound mark (handakuten) */
3414 /* default no fold in kinsoku */
3417 f_line = char_size(c2,c1);
3418 /* add one new f_line before this character */
3421 f_line = char_size(c2,c1);
3423 /* add one new f_line before this character */
3428 /* terminator process */
3429 switch(fold_state) {
3431 OCONV_NEWLINE((*o_fconv));
3437 OCONV_NEWLINE((*o_fconv));
3448 static nkf_char z_prev2=0,z_prev1=0;
3451 z_conv(nkf_char c2, nkf_char c1)
3454 /* if (c2) c1 &= 0x7f; assertion */
3456 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3462 if (z_prev2 == JIS_X_0201_1976_K) {
3463 if (c2 == JIS_X_0201_1976_K) {
3464 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
3466 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3468 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
3470 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3475 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3477 if (c2 == JIS_X_0201_1976_K) {
3478 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3479 /* wait for a following dakuten (voiced sound mark) or handakuten (semi-voiced sound mark) */
3484 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3495 if (alpha_f&1 && c2 == 0x23) {
3496 /* JISX0208 Alphabet */
3498 } else if (c2 == 0x21) {
3499 /* JISX0208 Kigou */
3504 } else if (alpha_f&4) {
3509 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3515 if (alpha_f&8 && c2 == 0) {
3517 const char *entity = 0;
3519 case '>': entity = ">"; break;
3520 case '<': entity = "<"; break;
3521 case '\"': entity = """; break;
3522 case '&': entity = "&"; break;
3525 while (*entity) (*o_zconv)(0, *entity++);
3531 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3536 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3540 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3544 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3548 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3552 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3556 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3560 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3564 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3569 (*o_zconv)(JIS_X_0201_1976_K, c);
3572 } else if (c2 == 0x25) {
3573 /* JISX0208 Katakana */
3574 static const int fullwidth_to_halfwidth[] =
3576 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3577 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3578 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3579 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3580 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3581 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3582 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3583 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3584 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3585 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3586 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3587 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3589 if (fullwidth_to_halfwidth[c1-0x20]){
3590 c2 = fullwidth_to_halfwidth[c1-0x20];
3591 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3593 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
3603 #define rot13(c) ( \
3605 (c <= 'M') ? (c + 13): \
3606 (c <= 'Z') ? (c - 13): \
3608 (c <= 'm') ? (c + 13): \
3609 (c <= 'z') ? (c - 13): \
3613 #define rot47(c) ( \
3615 ( c <= 'O') ? (c + 47) : \
3616 ( c <= '~') ? (c - 47) : \
3621 rot_conv(nkf_char c2, nkf_char c1)
3623 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3629 (*o_rot_conv)(c2,c1);
3633 hira_conv(nkf_char c2, nkf_char c1)
3637 if (0x20 < c1 && c1 < 0x74) {
3639 (*o_hira_conv)(c2,c1);
3641 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3643 c1 = nkf_char_unicode_new(0x3094);
3644 (*o_hira_conv)(c2,c1);
3647 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3649 (*o_hira_conv)(c2,c1);
3654 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3657 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3659 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3663 (*o_hira_conv)(c2,c1);
3668 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3670 #define RANGE_NUM_MAX 18
3671 static const nkf_char range[RANGE_NUM_MAX][2] = {
3692 nkf_char start, end, c;
3694 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3698 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3703 for (i = 0; i < RANGE_NUM_MAX; i++) {
3704 start = range[i][0];
3707 if (c >= start && c <= end) {
3712 (*o_iso2022jp_check_conv)(c2,c1);
3716 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3718 static const unsigned char *mime_pattern[] = {
3719 (const unsigned char *)"\075?EUC-JP?B?",
3720 (const unsigned char *)"\075?SHIFT_JIS?B?",
3721 (const unsigned char *)"\075?ISO-8859-1?Q?",
3722 (const unsigned char *)"\075?ISO-8859-1?B?",
3723 (const unsigned char *)"\075?ISO-2022-JP?B?",
3724 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3725 #if defined(UTF8_INPUT_ENABLE)
3726 (const unsigned char *)"\075?UTF-8?B?",
3727 (const unsigned char *)"\075?UTF-8?Q?",
3729 (const unsigned char *)"\075?US-ASCII?Q?",
3734 /* Marker used to raise the priority of the matching input code */
3735 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3736 e_iconv, s_iconv, 0, 0, 0, 0,
3737 #if defined(UTF8_INPUT_ENABLE)
3743 static const nkf_char mime_encode[] = {
3744 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3745 #if defined(UTF8_INPUT_ENABLE)
3752 static const nkf_char mime_encode_method[] = {
3753 'B', 'B','Q', 'B', 'B', 'Q',
3754 #if defined(UTF8_INPUT_ENABLE)
3762 /* MIME preprocessor fifo */
3764 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3765 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3766 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3768 unsigned char buf[MIME_BUF_SIZE];
3770 unsigned int last; /* decoded */
3771 unsigned int input; /* undecoded */
3773 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
3775 #define MAXRECOVER 20
3778 mime_input_buf_unshift(nkf_char c)
3780 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
3784 mime_ungetc(nkf_char c, FILE *f)
3786 mime_input_buf_unshift(c);
3791 mime_ungetc_buf(nkf_char c, FILE *f)
3794 (*i_mungetc_buf)(c,f);
3796 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
3801 mime_getc_buf(FILE *f)
3803 /* we don't keep eof of mime_input_buf, because it contains ?= as
3804 a terminator. It was checked in mime_integrity. */
3805 return ((mimebuf_f)?
3806 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
3810 switch_mime_getc(void)
3812 if (i_getc!=mime_getc) {
3813 i_mgetc = i_getc; i_getc = mime_getc;
3814 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3815 if(mime_f==STRICT_MIME) {
3816 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3817 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3823 unswitch_mime_getc(void)
3825 if(mime_f==STRICT_MIME) {
3826 i_mgetc = i_mgetc_buf;
3827 i_mungetc = i_mungetc_buf;
3830 i_ungetc = i_mungetc;
3831 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3832 mime_iconv_back = NULL;
3836 mime_integrity(FILE *f, const unsigned char *p)
3840 /* In buffered mode, read until =? or NL or buffer full
3842 mime_input_state.input = mime_input_state.top;
3843 mime_input_state.last = mime_input_state.top;
3845 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3847 q = mime_input_state.input;
3848 while((c=(*i_getc)(f))!=EOF) {
3849 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3850 break; /* buffer full */
3852 if (c=='=' && d=='?') {
3853 /* checked. skip header, start decode */
3854 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3855 /* mime_last_input = mime_input_state.input; */
3856 mime_input_state.input = q;
3860 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3862 /* Should we check length mod 4? */
3863 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3866 /* In case of Incomplete MIME, no MIME decode */
3867 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3868 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3869 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3870 switch_mime_getc(); /* anyway we need buffered getc */
3875 mime_begin_strict(FILE *f)
3879 const unsigned char *p,*q;
3880 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3882 mime_decode_mode = FALSE;
3883 /* =? has been checked */
3885 p = mime_pattern[j];
3888 for(i=2;p[i]>SP;i++) { /* start at =? */
3889 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3890 /* pattern fails, try next one */
3892 while (mime_pattern[++j]) {
3893 p = mime_pattern[j];
3894 for(k=2;k<i;k++) /* assume length(p) > i */
3895 if (p[k]!=q[k]) break;
3896 if (k==i && nkf_toupper(c1)==p[k]) break;
3898 p = mime_pattern[j];
3899 if (p) continue; /* found next one, continue */
3900 /* all fails, output from recovery buffer */
3908 mime_decode_mode = p[i-2];
3910 mime_iconv_back = iconv;
3911 set_iconv(FALSE, mime_priority_func[j]);
3912 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3914 if (mime_decode_mode=='B') {
3915 mimebuf_f = unbuf_f;
3917 /* do MIME integrity check */
3918 return mime_integrity(f,mime_pattern[j]);
3932 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
3933 /* re-read and convert again from mime_buffer. */
3935 /* =? has been checked */
3936 k = mime_input_state.last;
3937 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
3938 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
3939 /* We accept any character type even if it is broken by new lines */
3940 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3941 if (c1==LF||c1==SP||c1==CR||
3942 c1=='-'||c1=='_'||is_alnum(c1)) continue;
3944 /* Failed. But this could be another MIME preamble */
3946 mime_input_state.last--;
3952 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3953 if (!(++i<MAXRECOVER) || c1==EOF) break;
3954 if (c1=='b'||c1=='B') {
3955 mime_decode_mode = 'B';
3956 } else if (c1=='q'||c1=='Q') {
3957 mime_decode_mode = 'Q';
3961 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3962 if (!(++i<MAXRECOVER) || c1==EOF) break;
3964 mime_decode_mode = FALSE;
3970 if (!mime_decode_mode) {
3971 /* false MIME preamble, restart from mime_buffer */
3972 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
3973 /* Since we are in MIME mode until buffer becomes empty, */
3974 /* we never go into mime_begin again for a while. */
3977 /* discard mime preamble, and go to MIME mode */
3978 mime_input_state.last = k;
3979 /* do no MIME integrity check */
3980 return c1; /* used only for checking EOF */
3991 debug(const char *str)
3994 fprintf(stderr, "%s\n", str ? str : "NULL");
4000 set_input_codename(const char *codename)
4002 if (!input_codename) {
4003 input_codename = codename;
4004 } else if (strcmp(codename, input_codename) != 0) {
4005 input_codename = "";
4010 get_guessed_code(void)
4012 if (input_codename && !*input_codename) {
4013 input_codename = "BINARY";
4015 struct input_code *p = find_inputcode_byfunc(iconv);
4016 if (!input_codename) {
4017 input_codename = "ASCII";
4018 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4019 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4020 input_codename = "CP932";
4021 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4022 if (p->score & (SCORE_X0212))
4023 input_codename = "EUCJP-MS";
4024 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4025 input_codename = "CP51932";
4026 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4027 if (p->score & (SCORE_KANA))
4028 input_codename = "CP50221";
4029 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4030 input_codename = "CP50220";
4033 return input_codename;
4036 #if !defined(PERL_XS) && !defined(WIN32DLL)
4038 print_guessed_code(char *filename)
4040 if (filename != NULL) printf("%s: ", filename);
4041 if (input_codename && !*input_codename) {
4044 input_codename = get_guessed_code();
4046 printf("%s\n", input_codename);
4050 input_eol == CR ? " (CR)" :
4051 input_eol == LF ? " (LF)" :
4052 input_eol == CRLF ? " (CRLF)" :
4053 input_eol == EOF ? " (MIXED NL)" :
4063 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4065 nkf_char c1, c2, c3;
4071 if (!nkf_isxdigit(c2)){
4076 if (!nkf_isxdigit(c3)){
4081 return (hex2bin(c2) << 4) | hex2bin(c3);
4087 return hex_getc(':', f, i_cgetc, i_cungetc);
4091 cap_ungetc(nkf_char c, FILE *f)
4093 return (*i_cungetc)(c, f);
4099 return hex_getc('%', f, i_ugetc, i_uungetc);
4103 url_ungetc(nkf_char c, FILE *f)
4105 return (*i_uungetc)(c, f);
4109 #ifdef NUMCHAR_OPTION
4111 numchar_getc(FILE *f)
4113 nkf_char (*g)(FILE *) = i_ngetc;
4114 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4125 if (buf[i] == 'x' || buf[i] == 'X'){
4126 for (j = 0; j < 7; j++){
4128 if (!nkf_isxdigit(buf[i])){
4135 c |= hex2bin(buf[i]);
4138 for (j = 0; j < 8; j++){
4142 if (!nkf_isdigit(buf[i])){
4149 c += hex2bin(buf[i]);
4155 return nkf_char_unicode_new(c);
4165 numchar_ungetc(nkf_char c, FILE *f)
4167 return (*i_nungetc)(c, f);
4171 #ifdef UNICODE_NORMALIZATION
4173 /* Normalization Form C */
4177 nkf_char (*g)(FILE *f) = i_nfc_getc;
4178 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4179 int i=0, j, k=1, lower, upper;
4181 const unsigned char *array;
4184 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4185 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4186 while (upper >= lower) {
4187 j = (lower+upper) / 2;
4188 array = normalization_table[j].nfd;
4189 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4190 if (array[k] != buf[k]){
4191 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
4198 array = normalization_table[j].nfc;
4199 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4200 buf[i] = (nkf_char)(array[i]);
4212 nfc_ungetc(nkf_char c, FILE *f)
4214 return (*i_nfc_ungetc)(c, f);
4216 #endif /* UNICODE_NORMALIZATION */
4220 base64decode(nkf_char c)
4225 i = c - 'A'; /* A..Z 0-25 */
4226 } else if (c == '_') {
4227 i = '?' /* 63 */ ; /* _ 63 */
4229 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4231 } else if (c > '/') {
4232 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4233 } else if (c == '+' || c == '-') {
4234 i = '>' /* 62 */ ; /* + and - 62 */
4236 i = '?' /* 63 */ ; /* / 63 */
4244 nkf_char c1, c2, c3, c4, cc;
4245 nkf_char t1, t2, t3, t4, mode, exit_mode;
4246 nkf_char lwsp_count;
4249 nkf_char lwsp_size = 128;
4251 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4252 return mime_input_buf(mime_input_state.top++);
4254 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4255 mime_decode_mode=FALSE;
4256 unswitch_mime_getc();
4257 return (*i_getc)(f);
4260 if (mimebuf_f == FIXED_MIME)
4261 exit_mode = mime_decode_mode;
4264 if (mime_decode_mode == 'Q') {
4265 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4267 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4268 if (c1<=SP || DEL<=c1) {
4269 mime_decode_mode = exit_mode; /* prepare for quit */
4272 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4276 mime_decode_mode = exit_mode; /* prepare for quit */
4277 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4278 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4279 /* end Q encoding */
4280 input_mode = exit_mode;
4282 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4283 if (lwsp_buf==NULL) {
4284 perror("can't malloc");
4287 while ((c1=(*i_getc)(f))!=EOF) {
4292 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4300 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4301 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4316 lwsp_buf[lwsp_count] = (unsigned char)c1;
4317 if (lwsp_count++>lwsp_size){
4319 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4320 if (lwsp_buf_new==NULL) {
4322 perror("can't realloc");
4325 lwsp_buf = lwsp_buf_new;
4331 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4333 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4334 i_ungetc(lwsp_buf[lwsp_count],f);
4340 if (c1=='='&&c2<SP) { /* this is soft wrap */
4341 while((c1 = (*i_mgetc)(f)) <=SP) {
4342 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4344 mime_decode_mode = 'Q'; /* still in MIME */
4345 goto restart_mime_q;
4348 mime_decode_mode = 'Q'; /* still in MIME */
4352 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4353 if (c2<=SP) return c2;
4354 mime_decode_mode = 'Q'; /* still in MIME */
4355 return ((hex2bin(c2)<<4) + hex2bin(c3));
4358 if (mime_decode_mode != 'B') {
4359 mime_decode_mode = FALSE;
4360 return (*i_mgetc)(f);
4364 /* Base64 encoding */
4366 MIME allows line break in the middle of
4367 Base64, but we are very pessimistic in decoding
4368 in unbuf mode because MIME encoded code may be broken by
4369 less or editor's control sequence (such as ESC-[-K) in unbuffered
4370 mode. Ignore incomplete MIME.
4372 mode = mime_decode_mode;
4373 mime_decode_mode = exit_mode; /* prepare for quit */
4375 while ((c1 = (*i_mgetc)(f))<=SP) {
4380 if ((c2 = (*i_mgetc)(f))<=SP) {
4383 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4384 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4387 if ((c1 == '?') && (c2 == '=')) {
4390 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4391 if (lwsp_buf==NULL) {
4392 perror("can't malloc");
4395 while ((c1=(*i_getc)(f))!=EOF) {
4400 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4408 if ((c1=(*i_getc)(f))!=EOF) {
4412 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4427 lwsp_buf[lwsp_count] = (unsigned char)c1;
4428 if (lwsp_count++>lwsp_size){
4430 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4431 if (lwsp_buf_new==NULL) {
4433 perror("can't realloc");
4436 lwsp_buf = lwsp_buf_new;
4442 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4444 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4445 i_ungetc(lwsp_buf[lwsp_count],f);
4452 if ((c3 = (*i_mgetc)(f))<=SP) {
4455 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4456 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4460 if ((c4 = (*i_mgetc)(f))<=SP) {
4463 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4464 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4468 mime_decode_mode = mode; /* still in MIME sigh... */
4470 /* BASE 64 decoding */
4472 t1 = 0x3f & base64decode(c1);
4473 t2 = 0x3f & base64decode(c2);
4474 t3 = 0x3f & base64decode(c3);
4475 t4 = 0x3f & base64decode(c4);
4476 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4478 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4479 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4481 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4482 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4484 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4489 return mime_input_buf(mime_input_state.top++);
4492 static const char basis_64[] =
4493 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4495 #define MIMEOUT_BUF_LENGTH (60)
4497 char buf[MIMEOUT_BUF_LENGTH+1];
4502 /*nkf_char mime_lastchar2, mime_lastchar1;*/
4505 open_mime(nkf_char mode)
4507 const unsigned char *p;
4510 p = mime_pattern[0];
4511 for(i=0;mime_pattern[i];i++) {
4512 if (mode == mime_encode[i]) {
4513 p = mime_pattern[i];
4517 mimeout_mode = mime_encode_method[i];
4519 if (base64_count>45) {
4520 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4521 (*o_mputc)(mimeout_state.buf[i]);
4524 PUT_NEWLINE((*o_mputc));
4527 if (mimeout_state.count>0
4528 && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4529 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) {
4533 for (;i<mimeout_state.count;i++) {
4534 if (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4535 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) {
4536 (*o_mputc)(mimeout_state.buf[i]);
4546 j = mimeout_state.count;
4547 mimeout_state.count = 0;
4549 mime_putc(mimeout_state.buf[i]);
4554 mime_prechar(nkf_char c2, nkf_char c1)
4556 if (mimeout_mode > 0){
4558 if (base64_count + mimeout_state.count/3*4> 73){
4559 (*o_base64conv)(EOF,0);
4560 OCONV_NEWLINE((*o_base64conv));
4561 (*o_base64conv)(0,SP);
4565 if (base64_count + mimeout_state.count/3*4> 66) {
4566 (*o_base64conv)(EOF,0);
4567 OCONV_NEWLINE((*o_base64conv));
4568 (*o_base64conv)(0,SP);
4574 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4575 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4576 open_mime(output_mode);
4577 (*o_base64conv)(EOF,0);
4578 OCONV_NEWLINE((*o_base64conv));
4579 (*o_base64conv)(0,SP);
4598 switch(mimeout_mode) {
4603 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]);
4609 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]);
4614 if (mimeout_mode > 0) {
4615 if (mimeout_f!=FIXED_MIME) {
4617 } else if (mimeout_mode != 'Q')
4623 mimeout_addchar(nkf_char c)
4625 switch(mimeout_mode) {
4630 } else if(!nkf_isalnum(c)) {
4632 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4633 (*o_mputc)(bin2hex((c&0xf)));
4641 mimeout_state.state=c;
4642 (*o_mputc)(basis_64[c>>2]);
4647 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4648 mimeout_state.state=c;
4653 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4654 (*o_mputc)(basis_64[c & 0x3F]);
4666 mime_putc(nkf_char c)
4671 if (mimeout_f == FIXED_MIME){
4672 if (mimeout_mode == 'Q'){
4673 if (base64_count > 71){
4674 if (c!=CR && c!=LF) {
4676 PUT_NEWLINE((*o_mputc));
4681 if (base64_count > 71){
4683 PUT_NEWLINE((*o_mputc));
4686 if (c == EOF) { /* c==EOF */
4690 if (c != EOF) { /* c==EOF */
4696 /* mimeout_f != FIXED_MIME */
4698 if (c == EOF) { /* c==EOF */
4699 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4700 j = mimeout_state.count;
4701 mimeout_state.count = 0;
4703 if (mimeout_mode > 0) {
4704 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4706 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4709 mimeout_addchar(mimeout_state.buf[i]);
4713 mimeout_addchar(mimeout_state.buf[i]);
4717 mimeout_addchar(mimeout_state.buf[i]);
4723 mimeout_addchar(mimeout_state.buf[i]);
4729 if (mimeout_state.count > 0){
4730 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4735 if (mimeout_mode=='Q') {
4736 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4737 if (c == CR || c == LF) {
4742 } else if (c <= SP) {
4744 if (base64_count > 70) {
4745 PUT_NEWLINE((*o_mputc));
4748 if (!nkf_isblank(c)) {
4753 if (base64_count > 70) {
4755 PUT_NEWLINE((*o_mputc));
4758 open_mime(output_mode);
4760 if (!nkf_noescape_mime(c)) {
4771 if (mimeout_mode <= 0) {
4772 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4773 if (nkf_isspace(c)) {
4775 if (mimeout_mode == -1) {
4778 if (c==CR || c==LF) {
4780 open_mime(output_mode);
4786 for (i=0;i<mimeout_state.count;i++) {
4787 (*o_mputc)(mimeout_state.buf[i]);
4788 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4799 mimeout_state.buf[0] = (char)c;
4800 mimeout_state.count = 1;
4802 if (base64_count > 1
4803 && base64_count + mimeout_state.count > 76
4804 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4805 PUT_NEWLINE((*o_mputc));
4807 if (!nkf_isspace(mimeout_state.buf[0])){
4812 mimeout_state.buf[mimeout_state.count++] = (char)c;
4813 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4814 open_mime(output_mode);
4819 if (lastchar==CR || lastchar == LF){
4820 for (i=0;i<mimeout_state.count;i++) {
4821 (*o_mputc)(mimeout_state.buf[i]);
4824 mimeout_state.count = 0;
4827 for (i=0;i<mimeout_state.count-1;i++) {
4828 (*o_mputc)(mimeout_state.buf[i]);
4831 mimeout_state.buf[0] = SP;
4832 mimeout_state.count = 1;
4834 open_mime(output_mode);
4837 /* mimeout_mode == 'B', 1, 2 */
4838 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4839 if (lastchar == CR || lastchar == LF){
4840 if (nkf_isblank(c)) {
4841 for (i=0;i<mimeout_state.count;i++) {
4842 mimeout_addchar(mimeout_state.buf[i]);
4844 mimeout_state.count = 0;
4845 } else if (SP<c && c<DEL) {
4847 for (i=0;i<mimeout_state.count;i++) {
4848 (*o_mputc)(mimeout_state.buf[i]);
4851 mimeout_state.count = 0;
4853 mimeout_state.buf[mimeout_state.count++] = (char)c;
4856 if (c==SP || c==TAB || c==CR || c==LF) {
4857 for (i=0;i<mimeout_state.count;i++) {
4858 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4860 for (i=0;i<mimeout_state.count;i++) {
4861 (*o_mputc)(mimeout_state.buf[i]);
4864 mimeout_state.count = 0;
4867 mimeout_state.buf[mimeout_state.count++] = (char)c;
4868 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4870 for (i=0;i<mimeout_state.count;i++) {
4871 (*o_mputc)(mimeout_state.buf[i]);
4874 mimeout_state.count = 0;
4878 if (mimeout_state.count>0 && SP<c && c!='=') {
4879 mimeout_state.buf[mimeout_state.count++] = (char)c;
4880 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4881 j = mimeout_state.count;
4882 mimeout_state.count = 0;
4884 mimeout_addchar(mimeout_state.buf[i]);
4891 if (mimeout_state.count>0) {
4892 j = mimeout_state.count;
4893 mimeout_state.count = 0;
4895 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4897 mimeout_addchar(mimeout_state.buf[i]);
4903 (*o_mputc)(mimeout_state.buf[i]);
4905 open_mime(output_mode);
4912 base64_conv(nkf_char c2, nkf_char c1)
4914 mime_prechar(c2, c1);
4915 (*o_base64conv)(c2,c1);
4919 typedef struct nkf_iconv_t {
4922 size_t input_buffer_size;
4923 char *output_buffer;
4924 size_t output_buffer_size;
4928 nkf_iconv_new(char *tocode, char *fromcode)
4930 nkf_iconv_t converter;
4932 converter->input_buffer_size = IOBUF_SIZE;
4933 converter->input_buffer = malloc(converter->input_buffer_size);
4934 if (converter->input_buffer == NULL)
4935 perror("can't malloc");
4937 converter->output_buffer_size = IOBUF_SIZE * 2;
4938 converter->output_buffer = malloc(converter->output_buffer_size);
4939 if (converter->output_buffer == NULL)
4940 perror("can't malloc");
4942 converter->cd = iconv_open(tocode, fromcode);
4943 if (converter->cd == (iconv_t)-1)
4947 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
4950 perror("can't iconv_open");
4956 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
4958 size_t invalid = (size_t)0;
4959 char *input_buffer = converter->input_buffer;
4960 size_t input_length = (size_t)0;
4961 char *output_buffer = converter->output_buffer;
4962 size_t output_length = converter->output_buffer_size;
4967 while ((c = (*i_getc)(f)) != EOF) {
4968 input_buffer[input_length++] = c;
4969 if (input_length < converter->input_buffer_size) break;
4973 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
4974 while (output_length-- > 0) {
4975 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
4977 if (ret == (size_t) - 1) {
4980 if (input_buffer != converter->input_buffer)
4981 memmove(converter->input_buffer, input_buffer, input_length);
4984 converter->output_buffer_size *= 2;
4985 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
4986 if (output_buffer == NULL) {
4987 perror("can't realloc");
4990 converter->output_buffer = output_buffer;
4993 perror("can't iconv");
5006 nkf_iconv_close(nkf_iconv_t *convert)
5008 free(converter->inbuf);
5009 free(converter->outbuf);
5010 iconv_close(converter->cd);
5019 struct input_code *p = input_code_list;
5031 mime_f = MIME_DECODE_DEFAULT;
5032 mime_decode_f = FALSE;
5037 x0201_f = X0201_DEFAULT;
5038 iso2022jp_f = FALSE;
5039 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5040 ms_ucs_map_f = UCS_MAP_ASCII;
5042 #ifdef UTF8_INPUT_ENABLE
5043 no_cp932ext_f = FALSE;
5044 no_best_fit_chars_f = FALSE;
5045 encode_fallback = NULL;
5046 unicode_subchar = '?';
5047 input_endian = ENDIAN_BIG;
5049 #ifdef UTF8_OUTPUT_ENABLE
5050 output_bom_f = FALSE;
5051 output_endian = ENDIAN_BIG;
5053 #ifdef UNICODE_NORMALIZATION
5069 #ifdef SHIFTJIS_CP932
5079 for (i = 0; i < 256; i++){
5080 prefix_table[i] = 0;
5084 mimeout_state.count = 0;
5089 fold_preserve_f = FALSE;
5092 kanji_intro = DEFAULT_J;
5093 ascii_intro = DEFAULT_R;
5094 fold_margin = FOLD_MARGIN;
5095 o_zconv = no_connection;
5096 o_fconv = no_connection;
5097 o_eol_conv = no_connection;
5098 o_rot_conv = no_connection;
5099 o_hira_conv = no_connection;
5100 o_base64conv = no_connection;
5101 o_iso2022jp_check_conv = no_connection;
5104 i_ungetc = std_ungetc;
5106 i_bungetc = std_ungetc;
5109 i_mungetc = std_ungetc;
5110 i_mgetc_buf = std_getc;
5111 i_mungetc_buf = std_ungetc;
5112 output_mode = ASCII;
5114 mime_decode_mode = FALSE;
5120 init_broken_state();
5121 z_prev2=0,z_prev1=0;
5123 iconv_for_check = 0;
5125 input_codename = NULL;
5126 input_encoding = NULL;
5127 output_encoding = NULL;
5134 module_connection(void)
5136 if (input_encoding) set_input_encoding(input_encoding);
5137 if (!output_encoding) {
5138 output_encoding = nkf_default_encoding();
5140 if (!output_encoding) {
5141 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5144 set_output_encoding(output_encoding);
5145 oconv = nkf_enc_to_oconv(output_encoding);
5148 /* replace continuation module, from output side */
5150 /* output redirection */
5152 if (noout_f || guess_f){
5159 if (mimeout_f == TRUE) {
5160 o_base64conv = oconv; oconv = base64_conv;
5162 /* base64_count = 0; */
5165 if (eolmode_f || guess_f) {
5166 o_eol_conv = oconv; oconv = eol_conv;
5169 o_rot_conv = oconv; oconv = rot_conv;
5172 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5175 o_hira_conv = oconv; oconv = hira_conv;
5178 o_fconv = oconv; oconv = fold_conv;
5181 if (alpha_f || x0201_f) {
5182 o_zconv = oconv; oconv = z_conv;
5186 i_ungetc = std_ungetc;
5187 /* input redirection */
5190 i_cgetc = i_getc; i_getc = cap_getc;
5191 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5194 i_ugetc = i_getc; i_getc = url_getc;
5195 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5198 #ifdef NUMCHAR_OPTION
5200 i_ngetc = i_getc; i_getc = numchar_getc;
5201 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5204 #ifdef UNICODE_NORMALIZATION
5206 i_nfc_getc = i_getc; i_getc = nfc_getc;
5207 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5210 if (mime_f && mimebuf_f==FIXED_MIME) {
5211 i_mgetc = i_getc; i_getc = mime_getc;
5212 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5215 i_bgetc = i_getc; i_getc = broken_getc;
5216 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5218 if (input_encoding) {
5219 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5221 set_iconv(FALSE, e_iconv);
5225 struct input_code *p = input_code_list;
5234 Conversion main loop. Code detection only.
5237 #if !defined(PERL_XS) && !defined(WIN32DLL)
5244 module_connection();
5245 while ((c = (*i_getc)(f)) != EOF)
5252 #define NEXT continue /* no output, get next */
5253 #define SKIP c2=0;continue /* no output, get next */
5254 #define MORE c2=c1;continue /* need one more byte */
5255 #define SEND ; /* output c1 and c2, get next */
5256 #define LAST break /* end of loop, go closing */
5257 #define set_input_mode(mode) do { \
5258 input_mode = mode; \
5260 set_input_codename("ISO-2022-JP"); \
5261 debug("ISO-2022-JP"); \
/*
 * kanji_convert(f): main conversion loop over the input stream f.
 *
 * Visible flow: output_mode is reset to ASCII and the filter chain is wired
 * up through module_connection().  Input is then pulled through (*i_getc):
 *   - iconv == w_iconv32: bytes are consumed four at a time;
 *   - iconv == w_iconv16: bytes are consumed two (or four) at a time;
 *   - otherwise: a byte-at-a-time state machine that classifies 8-bit data,
 *     SI/SO/ESC designations, MIME "=?" starts and line ends, and hands
 *     pairs to (*iconv) or (*oconv).
 * After the loop (*iconv)(EOF, 0, 0) is called and, when no input code name
 * has been recorded, the input_code_list entry with the lowest score is
 * used as the input code name.
 */
5265 kanji_convert(FILE *f)
5267 nkf_char c1=0, c2=0, c3=0, c4=0;
5268 int shift_mode = 0; /* 0, 1, 2, 3 */
5270 int is_8bit = FALSE;
5272 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5277 output_mode = ASCII;
/* A negative result from module_connection() is reported on stderr
   (standalone builds only, see the #if below). */
5279 if (module_connection() < 0) {
5280 #if !defined(PERL_XS) && !defined(WIN32DLL)
5281 fprintf(stderr, "no output encoding given\n");
5287 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: four bytes per call; the loop stops at the first EOF. */
5288 if(iconv == w_iconv32){
5289 while ((c1 = (*i_getc)(f)) != EOF &&
5290 (c2 = (*i_getc)(f)) != EOF &&
5291 (c3 = (*i_getc)(f)) != EOF &&
5292 (c4 = (*i_getc)(f)) != EOF) {
5293 nkf_iconv_utf_32(c1, c2, c3, c4);
5295 (*i_ungetc)(EOF, f);
/* UTF-16 input: two bytes per call; when the first call returns -2, two more
   bytes are read and the call is repeated with all four (presumably a
   surrogate pair -- confirm in nkf_iconv_utf_16). */
5297 else if (iconv == w_iconv16) {
5298 while ((c1 = (*i_getc)(f)) != EOF &&
5299 (c2 = (*i_getc)(f)) != EOF) {
5300 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5301 (c3 = (*i_getc)(f)) != EOF &&
5302 (c4 = (*i_getc)(f)) != EOF) {
5303 nkf_iconv_utf_16(c1, c2, c3, c4);
5306 (*i_ungetc)(EOF, f);
/* Every other input encoding: byte-at-a-time classification. */
5310 while ((c1 = (*i_getc)(f)) != EOF) {
5311 #ifdef INPUT_CODE_FIX
5312 if (!input_encoding)
5318 /* in case of 8th bit is on */
5319 if (!estab_f&&!mime_decode_mode) {
5320 /* in case of not established yet */
5321 /* It is still ambiguous */
5322 if (h_conv(f, c2, c1)==EOF) {
5330 /* in case of already established */
5332 /* ignore bogus code */
5340 /* 2nd byte of 7 bit code or SJIS */
5346 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5349 } else if (c1 > DEL) {
5351 if (!estab_f && !iso8859_f) {
5352 /* not established yet */
5354 } else { /* estab_f==TRUE */
/* Shift_JIS bytes 0xA0..0xDF, or 0xFD/0xFE under the CP10001 map, are tagged
   as JIS X 0201 katakana via c2. */
5360 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5361 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5363 c2 = JIS_X_0201_1976_K;
5368 /* already established */
5372 } else if (SP < c1 && c1 < DEL) {
5373 /* in case of Roman characters */
5375 /* output 1 shifted byte */
5379 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5380 /* output 1 shifted byte */
5381 c2 = JIS_X_0201_1976_K;
5384 /* look like bogus code */
5387 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5388 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5389 /* in case of Kanji shifted */
5391 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5392 /* Check MIME code */
5393 if ((c1 = (*i_getc)(f)) == EOF) {
5396 } else if (c1 == '?') {
5397 /* =? is mime conversion start sequence */
5398 if(mime_f == STRICT_MIME) {
5399 /* check in real detail */
5400 if (mime_begin_strict(f) == EOF)
5403 } else if (mime_begin(f) == EOF)
5412 /* normal ASCII code */
/* SI, SO and ESC are handled as controls only when is_8bit is false or
   mime_decode_mode is set. */
5415 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5418 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5421 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5422 if ((c1 = (*i_getc)(f)) == EOF) {
5423 /* (*oconv)(0, ESC); don't send bogus code */
5426 else if (c1 == '&') {
5428 if ((c1 = (*i_getc)(f)) == EOF) {
/* ESC $ ...: multi-byte set designation. */
5434 else if (c1 == '$') {
5436 if ((c1 = (*i_getc)(f)) == EOF) {
5437 /* don't send bogus code
5439 (*oconv)(0, '$'); */
5441 } else if (c1 == '@' || c1 == 'B') {
5443 set_input_mode(JIS_X_0208);
/* ESC $ ( F: the final byte F selects the set below --
   '@'/'B' -> JIS_X_0208, 'D' -> JIS_X_0212, 'O'/'Q' -> JIS_X_0213_1,
   'P' -> JIS_X_0213_2; with broken_f bit 0x2 any other F is taken as
   JIS_X_0208. */
5445 } else if (c1 == '(') {
5447 if ((c1 = (*i_getc)(f)) == EOF) {
5448 /* don't send bogus code
5454 } else if (c1 == '@'|| c1 == 'B') {
5456 set_input_mode(JIS_X_0208);
5459 } else if (c1 == 'D'){
5460 set_input_mode(JIS_X_0212);
5462 #endif /* X0212_ENABLE */
5463 } else if (c1 == 'O' || c1 == 'Q'){
5464 set_input_mode(JIS_X_0213_1);
5466 } else if (c1 == 'P'){
5467 set_input_mode(JIS_X_0213_2);
5470 /* could be some special code */
5477 } else if (broken_f&0x2) {
5478 /* accept any ESC-(-x as broken code ... */
5479 input_mode = JIS_X_0208;
/* ESC ( F: single-byte set designation. */
5488 } else if (c1 == '(') {
5490 if ((c1 = (*i_getc)(f)) == EOF) {
5491 /* don't send bogus code
5493 (*oconv)(0, '('); */
5496 else if (c1 == 'I') {
5497 /* JIS X 0201 Katakana */
5498 set_input_mode(JIS_X_0201_1976_K);
5501 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5502 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5503 set_input_mode(ASCII);
/* With broken_f bit 0x2 any other final byte also switches to ASCII. */
5506 else if (broken_f&0x2) {
5507 set_input_mode(ASCII);
5516 else if (c1 == '.') {
5518 if ((c1 = (*i_getc)(f)) == EOF) {
5521 else if (c1 == 'A') {
5532 else if (c1 == 'N') {
5535 if (g2 == ISO_8859_1) {
/* Reached only when the ESC branch above did not match (same else-if
   chain) and the input converter is s_iconv. */
5550 } else if (c1 == ESC && iconv == s_iconv) {
5551 /* ESC in Shift_JIS */
5552 if ((c1 = (*i_getc)(f)) == EOF) {
5553 /* (*oconv)(0, ESC); don't send bogus code */
5555 } else if (c1 == '$') {
5557 if ((c1 = (*i_getc)(f)) == EOF) {
5559 } else if (('E' <= c1 && c1 <= 'G') ||
5560 ('O' <= c1 && c1 <= 'Q')) {
/* c1 % 7 maps 'E','F','G','O','P','Q' to indices 6,0,1,2,3,4; index 5 is
   never selected by those six letters. */
5568 static const int jphone_emoji_first_table[7] =
5569 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5570 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5571 if ((c1 = (*i_getc)(f)) == EOF) LAST;
/* Each following byte in SP..'z' is emitted as (base c3 + byte). */
5572 while (SP <= c1 && c1 <= 'z') {
5573 (*oconv)(0, c1 + c3);
5574 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5589 } else if (c1 == LF || c1 == CR) {
5591 input_mode = ASCII; set_iconv(FALSE, 0);
/* Look ahead past the line end; a following SP is tested for here
   (presumably a folded header line -- confirm against the missing lines). */
5593 } else if (mime_decode_f && !mime_decode_mode){
5595 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5603 } else { /* if (c1 == CR)*/
5604 if ((c1=(*i_getc)(f))!=EOF) {
5608 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
/* Hand the pair to the input converter; the visible cases read one or two
   more bytes and call (*iconv) again with them. */
5628 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5631 if ((c3 = (*i_getc)(f)) != EOF) {
5634 if ((c4 = (*i_getc)(f)) != EOF) {
5636 (*iconv)(c2, c1, c3|c4);
5641 /* 3 bytes EUC or UTF-8 */
5642 if ((c3 = (*i_getc)(f)) != EOF) {
5644 (*iconv)(c2, c1, c3);
5652 0x7F <= c2 && c2 <= 0x92 &&
5653 0x21 <= c1 && c1 <= 0x7E) {
/* Row (c2 - 0x7F) of 94 cells, column (c1 - 0x21), offset from 0xE000. */
5655 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5658 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5662 (*oconv)(PREFIX_EUCG3 | c2, c1);
5664 #endif /* X0212_ENABLE */
5666 (*oconv)(PREFIX_EUCG3 | c2, c1);
5669 (*oconv)(input_mode, c1); /* other special case */
5675 /* goto next_word */
/* End of input: notify the input converter with EOF. */
5679 (*iconv)(EOF, 0, 0);
/* No input code name recorded yet: pick the input_code_list entry with the
   lowest score and record/log its name. */
5680 if (!input_codename)
5683 struct input_code *p = input_code_list;
5684 struct input_code *result = p;
5686 if (p->score < result->score) result = p;
5689 set_input_codename(result->name);
5691 debug(result->name);
5699 * int options(unsigned char *cp)
/*
 * options(cp): parse one option string.
 *
 * The scan first skips to just past the first '-' in cp, then consumes option
 * characters while *cp is non-NUL or cp_back is set.  A second '-' introduces
 * a long option, which is matched against long_option[]; an entry with a
 * non-empty alias redirects cp to the alias string, otherwise the entry is
 * handled by name in the strcmp chain below.  Any other character is a
 * single-letter switch, optionally followed by modifier characters that are
 * consumed with cp++.
 */
5706 options(unsigned char *cp)
5710 unsigned char *cp_back = NULL;
5715 while(*cp && *cp++!='-');
5716 while (*cp || cp_back) {
5724 case '-': /* literal options */
5725 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/* Find the long_option entry whose name matches cp; the inner loop compares
   up to (not including) '=' in the table name, leaving p at the stop point. */
5729 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5730 p = (unsigned char *)long_option[i].name;
5731 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5732 if (*p == cp[j] || cp[j] == SP){
5739 #if !defined(PERL_XS) && !defined(WIN32DLL)
5740 fprintf(stderr, "unknown long option: --%s\n", cp);
/* Advance cp to the end of this option word (NUL or SP). */
5744 while(*cp && *cp != SP && cp++);
/* An alias is parsed in place of the long option text. */
5745 if (long_option[i].alias[0]){
5747 cp = (unsigned char *)long_option[i].alias;
5749 if (strcmp(long_option[i].name, "ic=") == 0){
5750 enc = nkf_enc_find((char *)p);
5752 input_encoding = enc;
5755 if (strcmp(long_option[i].name, "oc=") == 0){
5756 enc = nkf_enc_find((char *)p);
/* NOTE(review): if enc is a pointer, `<= 0` is an unusual NULL test --
   confirm the declared type of enc. */
5757 if (enc <= 0) continue;
5758 output_encoding = enc;
5761 if (strcmp(long_option[i].name, "guess=") == 0){
5762 if (p[0] == '0' || p[0] == '1') {
5770 if (strcmp(long_option[i].name, "overwrite") == 0){
5773 preserve_time_f = TRUE;
5776 if (strcmp(long_option[i].name, "overwrite=") == 0){
5779 preserve_time_f = TRUE;
/* NOTE(review): the malloc result is passed to strcpy on the next line
   without a NULL check. */
5781 backup_suffix = malloc(strlen((char *) p) + 1);
5782 strcpy(backup_suffix, (char *) p);
5785 if (strcmp(long_option[i].name, "in-place") == 0){
5788 preserve_time_f = FALSE;
5791 if (strcmp(long_option[i].name, "in-place=") == 0){
5794 preserve_time_f = FALSE;
/* NOTE(review): the malloc result is passed to strcpy on the next line
   without a NULL check. */
5796 backup_suffix = malloc(strlen((char *) p) + 1);
5797 strcpy(backup_suffix, (char *) p);
5802 if (strcmp(long_option[i].name, "cap-input") == 0){
5806 if (strcmp(long_option[i].name, "url-input") == 0){
5811 #ifdef NUMCHAR_OPTION
5812 if (strcmp(long_option[i].name, "numchar-input") == 0){
5818 if (strcmp(long_option[i].name, "no-output") == 0){
5822 if (strcmp(long_option[i].name, "debug") == 0){
5827 if (strcmp(long_option[i].name, "cp932") == 0){
5828 #ifdef SHIFTJIS_CP932
5832 #ifdef UTF8_OUTPUT_ENABLE
5833 ms_ucs_map_f = UCS_MAP_CP932;
5837 if (strcmp(long_option[i].name, "no-cp932") == 0){
5838 #ifdef SHIFTJIS_CP932
5842 #ifdef UTF8_OUTPUT_ENABLE
5843 ms_ucs_map_f = UCS_MAP_ASCII;
5847 #ifdef SHIFTJIS_CP932
5848 if (strcmp(long_option[i].name, "cp932inv") == 0){
5855 if (strcmp(long_option[i].name, "x0212") == 0){
5862 if (strcmp(long_option[i].name, "exec-in") == 0){
5866 if (strcmp(long_option[i].name, "exec-out") == 0){
5871 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5872 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5873 no_cp932ext_f = TRUE;
5876 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5877 no_best_fit_chars_f = TRUE;
/* --fb-*: choose how unencodable characters are handled; "fb-skip" clears
   the fallback handler. */
5880 if (strcmp(long_option[i].name, "fb-skip") == 0){
5881 encode_fallback = NULL;
5884 if (strcmp(long_option[i].name, "fb-html") == 0){
5885 encode_fallback = encode_fallback_html;
5888 if (strcmp(long_option[i].name, "fb-xml") == 0){
5889 encode_fallback = encode_fallback_xml;
5892 if (strcmp(long_option[i].name, "fb-java") == 0){
5893 encode_fallback = encode_fallback_java;
5896 if (strcmp(long_option[i].name, "fb-perl") == 0){
5897 encode_fallback = encode_fallback_perl;
5900 if (strcmp(long_option[i].name, "fb-subchar") == 0){
5901 encode_fallback = encode_fallback_subchar;
/* --fb-subchar=N: N is parsed as decimal, hexadecimal (second character
   'x'/'X') or octal.
   NOTE(review): i and j are the long_option scan variables from above and
   are overwritten by the loops and the w16e_conv() call below; confirm
   nothing after this handler still indexes long_option[i]. */
5904 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
5905 encode_fallback = encode_fallback_subchar;
5906 unicode_subchar = 0;
5908 /* decimal number */
5909 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
5910 unicode_subchar *= 10;
5911 unicode_subchar += hex2bin(p[i]);
5913 }else if(p[1] == 'x' || p[1] == 'X'){
5914 /* hexadecimal number */
5915 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
5916 unicode_subchar <<= 4;
5917 unicode_subchar |= hex2bin(p[i]);
5921 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
5922 unicode_subchar *= 8;
5923 unicode_subchar += hex2bin(p[i]);
5926 w16e_conv(unicode_subchar, &i, &j);
5927 unicode_subchar = i<<8 | j;
5931 #ifdef UTF8_OUTPUT_ENABLE
5932 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
5933 ms_ucs_map_f = UCS_MAP_MS;
5937 #ifdef UNICODE_NORMALIZATION
5938 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/* --prefix=XYZ...: every graphic character after the first (Y, Z, ...) gets
   the first character X recorded in prefix_table.
   NOTE(review): i is reused as the character index here as well. */
5943 if (strcmp(long_option[i].name, "prefix=") == 0){
5944 if (nkf_isgraph(p[0])){
5945 for (i = 1; nkf_isgraph(p[i]); i++){
5946 prefix_table[p[i]] = p[0];
5951 #if !defined(PERL_XS) && !defined(WIN32DLL)
5952 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
5957 case 'b': /* buffered mode */
5960 case 'u': /* non buffered mode */
5963 case 't': /* transparent mode */
5968 } else if (*cp=='2') {
5972 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
5980 case 'j': /* JIS output */
5982 output_encoding = nkf_enc_from_index(ISO_2022_JP);
5984 case 'e': /* AT&T EUC output */
5985 output_encoding = nkf_enc_from_index(EUCJP_NKF);
5987 case 's': /* SJIS output */
5988 output_encoding = nkf_enc_from_index(WINDOWS_31J);
5990 case 'l': /* ISO8859 Latin-1 support, no conversion */
5991 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
5992 input_encoding = nkf_enc_from_index(ISO_8859_1);
5994 case 'i': /* Kanji IN ESC-$-@/B */
5995 if (*cp=='@'||*cp=='B')
5996 kanji_intro = *cp++;
5998 case 'o': /* ASCII IN ESC-(-J/B */
5999 if (*cp=='J'||*cp=='B'||*cp=='H')
6000 ascii_intro = *cp++;
6004 bit:1 katakana->hiragana
6005 bit:2 hiragana->katakana
6007 if ('9'>= *cp && *cp>='0')
6008 hira_f |= (*cp++ -'0');
6015 #if defined(MSDOS) || defined(__OS2__)
6022 show_configuration();
6030 #ifdef UTF8_OUTPUT_ENABLE
6031 case 'w': /* UTF-8 output */
6036 output_encoding = nkf_enc_from_index(UTF_8N);
6038 output_bom_f = TRUE;
6039 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6043 if ('1'== cp[0] && '6'==cp[1]) {
6046 } else if ('3'== cp[0] && '2'==cp[1]) {
6050 output_encoding = nkf_enc_from_index(UTF_8);
6055 output_endian = ENDIAN_LITTLE;
6056 } else if (cp[0] == 'B') {
6059 output_encoding = nkf_enc_from_index(enc_idx);
/* Resolve the generic UTF-16/UTF-32 index to the endian-specific one,
   first without and then (below) with a BOM. */
6064 enc_idx = enc_idx == UTF_16
6065 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6066 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6068 output_bom_f = TRUE;
6069 enc_idx = enc_idx == UTF_16
6070 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6071 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6073 output_encoding = nkf_enc_from_index(enc_idx);
6077 #ifdef UTF8_INPUT_ENABLE
6078 case 'W': /* UTF input */
6081 input_encoding = nkf_enc_from_index(UTF_8);
6084 if ('1'== cp[0] && '6'==cp[1]) {
6086 input_endian = ENDIAN_BIG;
6088 } else if ('3'== cp[0] && '2'==cp[1]) {
6090 input_endian = ENDIAN_BIG;
6093 input_encoding = nkf_enc_from_index(UTF_8);
6098 input_endian = ENDIAN_LITTLE;
6099 } else if (cp[0] == 'B') {
6101 input_endian = ENDIAN_BIG;
6103 enc_idx = (enc_idx == UTF_16
6104 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6105 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6106 input_encoding = nkf_enc_from_index(enc_idx);
6110 /* Input code assumption */
6111 case 'J': /* ISO-2022-JP input */
6112 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6114 case 'E': /* EUC-JP input */
6115 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6117 case 'S': /* Windows-31J input */
6118 input_encoding = nkf_enc_from_index(WINDOWS_31J);
6120 case 'Z': /* Convert X0208 alphabet to ascii */
6122 bit:0 Convert JIS X 0208 Alphabet to ASCII
6123 bit:1 Convert Kankaku to one space
6124 bit:2 Convert Kankaku to two spaces
6125 bit:3 Convert HTML Entity
6126 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6128 while ('0'<= *cp && *cp <='9') {
6129 alpha_f |= 1 << (*cp++ - '0');
6131 if (!alpha_f) alpha_f = 1;
6133 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6134 x0201_f = FALSE; /* No X0201->X0208 conversion */
6136 ESC-(-I in JIS, EUC, MS Kanji
6137 SI/SO in JIS, EUC, MS Kanji
6138 SS2 in EUC, JIS, not in MS Kanji
6139 MS Kanji (0xa0-0xdf)
6141 ESC-(-I in JIS (0x20-0x5f)
6142 SS2 in EUC (0xa0-0xdf)
6143 0xa0-0xd in MS Kanji (0xa0-0xdf)
6146 case 'X': /* Convert X0201 kana to X0208 */
6149 case 'F': /* preserve new lines */
6150 fold_preserve_f = TRUE; /* FALLTHROUGH: -F continues into the -f handling */
6151 case 'f': /* folding -f60 or -f */
6154 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6156 fold_len += *cp++ - '0';
/* A width outside 1..BUFSIZ-1 falls back to DEFAULT_FOLD. */
6158 if (!(0<fold_len && fold_len<BUFSIZ))
6159 fold_len = DEFAULT_FOLD;
6163 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6165 fold_margin += *cp++ - '0';
6169 case 'm': /* MIME support */
6170 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6171 if (*cp=='B'||*cp=='Q') {
6172 mime_decode_mode = *cp++;
6173 mimebuf_f = FIXED_MIME;
6174 } else if (*cp=='N') {
6175 mime_f = TRUE; cp++;
6176 } else if (*cp=='S') {
6177 mime_f = STRICT_MIME; cp++;
6178 } else if (*cp=='0') {
6179 mime_decode_f = FALSE;
6180 mime_f = FALSE; cp++;
6182 mime_f = STRICT_MIME;
6185 case 'M': /* MIME output */
6188 mimeout_f = FIXED_MIME; cp++;
6189 } else if (*cp=='Q') {
6191 mimeout_f = FIXED_MIME; cp++;
6196 case 'B': /* Broken JIS support */
6198 bit:1 allow any x on ESC-(-x or ESC-$-x
6199 bit:2 reset to ascii on NL
6201 if ('9'>= *cp && *cp>='0')
6202 broken_f |= 1<<(*cp++ -'0');
6207 case 'O':/* for Output file */
6211 case 'c':/* add cr code */
6214 case 'd':/* delete cr code */
6217 case 'I': /* ISO-2022-JP output */
6220 case 'L': /* line mode */
6221 if (*cp=='u') { /* unix */
6222 eolmode_f = LF; cp++;
6223 } else if (*cp=='m') { /* mac */
6224 eolmode_f = CR; cp++;
6225 } else if (*cp=='w') { /* windows */
6226 eolmode_f = CRLF; cp++;
6227 } else if (*cp=='0') { /* no conversion */
6228 eolmode_f = 0; cp++;
6233 if ('2' <= *cp && *cp <= '9') {
6236 } else if (*cp == '0' || *cp == '1') {
6245 /* multiple options in a string are allowed for the Perl module */
6246 while(*cp && *cp++!='-');
6249 #if !defined(PERL_XS) && !defined(WIN32DLL)
6250 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6252 /* bogus option but ignored */
6260 #include "nkf32dll.c"
6261 #elif defined(PERL_XS)
6262 #else /* WIN32DLL */
/*
 * main: command-line entry point (compiled only when neither WIN32DLL nor
 * PERL_XS is defined, per the surrounding #include/#elif/#else).
 *
 * Visible flow: leading arguments that start with '-' are consumed as
 * options; stdin may be converted with kanji_convert(stdin); each named file
 * is opened with fopen() and, when file_out_f is TRUE, stdout is redirected
 * either to a temporary file next to the input (later renamed over it after
 * its mode and timestamps are restored) or to "nkf.out".  Argument errors
 * are tracked in is_argument_error.
 */
6264 main(int argc, char **argv)
6269 char *outfname = NULL;
6272 #ifdef EASYWIN /*Easy Win */
6273 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6275 #ifdef DEFAULT_CODE_LOCALE
6276 setlocale(LC_CTYPE, "");
/* Consume every leading argument that begins with '-'. */
6278 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6279 cp = (unsigned char *)*argv;
/* Set up a pipe and a child process that execs the remaining arguments. */
6284 if (pipe(fds) < 0 || (pid = fork()) < 0){
6295 execvp(argv[1], &argv[1]);
/* Snapshot these flags here; they are written back a few lines below
   (presumably around a re-initialisation that is not visible -- confirm). */
6312 int debug_f_back = debug_f;
6315 int exec_f_back = exec_f;
6318 int x0212_f_back = x0212_f;
6320 int x0213_f_back = x0213_f;
6321 int guess_f_back = guess_f;
6323 guess_f = guess_f_back;
6326 debug_f = debug_f_back;
6329 exec_f = exec_f_back;
6331 x0212_f = x0212_f_back;
6332 x0213_f = x0213_f_back;
6335 if (binmode_f == TRUE)
6336 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6337 if (freopen("","wb",stdout) == NULL)
6344 setbuf(stdout, (char *) NULL);
6346 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6349 if (binmode_f == TRUE)
6350 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6351 if (freopen("","rb",stdin) == NULL) return (-1);
6355 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
/* Convert standard input, then report the guessed code when guess_f is set. */
6359 kanji_convert(stdin);
6360 if (guess_f) print_guessed_code(NULL);
6364 int is_argument_error = FALSE;
6366 input_codename = NULL;
6369 iconv_for_check = 0;
/* Take the next file name from argv; a failed open marks an argument error. */
6371 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6373 is_argument_error = TRUE;
6381 /* reopen file for stdout */
6382 if (file_out_f == TRUE) {
/* Room for the original name plus the ".nkftmpXXXXXX" template. */
6385 outfname = malloc(strlen(origfname)
6386 + strlen(".nkftmpXXXXXX")
6392 strcpy(outfname, origfname);
/* Scan backwards for the last '/' or '\\' path separator. */
6396 for (i = strlen(outfname); i; --i){
6397 if (outfname[i - 1] == '/'
6398 || outfname[i - 1] == '\\'){
6404 strcat(outfname, "ntXXXXXX");
6406 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6407 S_IREAD | S_IWRITE);
6409 strcat(outfname, ".nkftmpXXXXXX");
6410 fd = mkstemp(outfname);
6413 || (fd_backup = dup(fileno(stdout))) < 0
6414 || dup2(fd, fileno(stdout)) < 0
6425 outfname = "nkf.out";
6428 if(freopen(outfname, "w", stdout) == NULL) {
6432 if (binmode_f == TRUE) {
6433 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6434 if (freopen("","wb",stdout) == NULL)
6441 if (binmode_f == TRUE)
6442 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6443 if (freopen("","rb",fin) == NULL)
6448 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6452 char *filename = NULL;
/* The file name is passed to print_guessed_code only when more than one
   file is being processed. */
6454 if (nfiles > 1) filename = origfname;
6455 if (guess_f) print_guessed_code(filename);
6461 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* Point stdout back at the descriptor saved in fd_backup. */
6469 if (dup2(fd_backup, fileno(stdout)) < 0){
6472 if (stat(origfname, &sb)) {
6473 fprintf(stderr, "Can't stat %s\n", origfname);
6475 /* restore permissions: copy origfname's mode onto outfname */
6476 if (chmod(outfname, sb.st_mode)) {
6477 fprintf(stderr, "Can't set permission %s\n", outfname);
6480 /* restore timestamps: copy origfname's times onto outfname */
6481 if(preserve_time_f){
6482 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6483 tb[0] = tb[1] = sb.st_mtime;
6484 if (utime(outfname, tb)) {
6485 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6488 tb.actime = sb.st_atime;
6489 tb.modtime = sb.st_mtime;
6490 if (utime(outfname, &tb)) {
6491 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* Move the original aside under its backup name; any existing file of that
   name is removed first. */
6496 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6498 unlink(backup_filename);
6500 if (rename(origfname, backup_filename)) {
6501 perror(backup_filename);
6502 fprintf(stderr, "Can't rename %s to %s\n",
6503 origfname, backup_filename);
6505 free(backup_filename);
6508 if (unlink(origfname)){
/* Finally move the converted output into the original's place. */
6513 if (rename(outfname, origfname)) {
6515 fprintf(stderr, "Can't rename %s to %s\n",
6516 outfname, origfname);
6523 if (is_argument_error)
6526 #ifdef EASYWIN /*Easy Win */
6527 if (file_out_f == FALSE)
6528 scanf("%d",&end_check);
6531 #else /* for Other OS */
6532 if (file_out_f == TRUE)
6534 #endif /*Easy Win */
6537 #endif /* WIN32DLL */